From e9bd39486b3ed0cfa7062be1798efae8316cda0d Mon Sep 17 00:00:00 2001
From: Sergio Bossa <sergio.bossa@gmail.com>
Date: Thu, 5 Oct 2017 16:51:06 +0100
Subject: [PATCH 001/151] STAR-567: Add test from DB-1208 (bug itself is not
 present)

(cherry picked from commit f4cca35aaa78780dbc344217c2ac0a70ca617679)
---
 .../io/sstable/ReducingKeyIteratorTest.java   | 103 ++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100644 test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java

diff --git a/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java
new file mode 100644
index 000000000000..aaf1a2aa779b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class ReducingKeyIteratorTest
+{
+    public static final String KEYSPACE1 = "ReducingKeyIteratorTest";
+    public static final String CF_STANDARD = "Standard1";
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SchemaLoader.prepareServer();
+        CompactionManager.instance.disableAutoCompaction();
+
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD));
+    }
+
+    @After
+    public void afterTest() throws Exception
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD);
+        store.truncateBlocking();
+    }
+
+    @Test
+    public void testTotalAndReadBytesOneSSTable() throws IOException
+    {
+        testTotalAndReadBytes(1, 1000);
+    }
+
+    @Test
+    public void testTotalAndReadBytesManySSTables() throws IOException
+    {
+        testTotalAndReadBytes(10, 100);
+    }
+
+    public void testTotalAndReadBytes(int tableCount, int rowCount) throws IOException
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD);
+        LoggerFactory.getLogger(getClass()).info("Compression {}", store.metadata().params.compression.asMap());
+
+        for (int t = 0; t < tableCount; ++t)
+        {
+            for (int i = 0; i < rowCount; i++)
+            {
+                new RowUpdateBuilder(store.metadata(), i, String.valueOf(i))
+                .clustering("0")
+                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                .build()
+                .applyUnsafe();
+            }
+            store.forceBlockingFlush();
+        }
+
+        Set<SSTableReader> sstables = store.getLiveSSTables();
+        ReducingKeyIterator reducingIterator = new ReducingKeyIterator(sstables);
+
+        while (reducingIterator.hasNext())
+        {
+            Assert.assertTrue(reducingIterator.getTotalBytes() >= reducingIterator.getBytesRead());
+            reducingIterator.next();
+        }
+        Assert.assertEquals(reducingIterator.getTotalBytes(), reducingIterator.getBytesRead());
+    }
+}

From 1d6eb628b3748f7f0742b62465ae20715249e165 Mon Sep 17 00:00:00 2001
From: Ruslan Fomkin <Ruslan.Fomkin@gmail.com>
Date: Mon, 31 May 2021 12:16:05 +0200
Subject: [PATCH 002/151] STAR-570 Synchronize schema pull handling (#175)

Synchronize schema pull handling with applying new schema changes

There's a race condition around pulling schema changes that can occur
when the schema change push/propagation mechanism is not immediately
effective (e.g. because of network delay, or because the pulling node
is down).

If schema changes happen on node 1 but do not reach node 2 immediately
through the SCHEMA.PUSH mechanism, and are instead first recognized
during gossiping, the corresponding SCHEMA.PULL request from node 2 can
catch the node 1 schema in the middle of being modified by another
schema change request. This can easily lead to problems (e.g. if a new
table is being added, and the node 2 request reads the changes that
need to be applied to system_schema.tables, but not the ones that need
to be applied to system_schema.columns).

This PR addresses that by synchronizing the handling of the SCHEMA.PULL
request on node 1 (the "RPC call" triggered by node 2) with the method
that applies schema changes on node 1.

It also adds debug level logging tracking SCHEMA.PUSH and SCHEMA.PULL
messages, as there were some unexpected findings around these that may
need further investigation.

E.g. during my investigations, seemingly redundant SCHEMA.PULL
messages were sent multiple times from node 1 to node 2, even though
no schema changes were made at node 2, and node 2 did not go offline.

Co-authored-by: Dimitar Dimitrov

Co-authored-by: Dimitar Dimitrov <30328539+dimitarndimitrov@users.noreply.github.com>
(cherry picked from commit ab2a669cb8b318e7014f6445d032e75f371e3da9)
---
 src/java/org/apache/cassandra/schema/SchemaKeyspace.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
index 65e9adbdae67..fdb209e06ca0 100644
--- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
+++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
@@ -393,7 +393,7 @@ private static ReadCommand getReadCommandForTableSchema(String schemaTableName)
         return PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
     }
 
-    static Collection<Mutation> convertSchemaToMutations()
+    static synchronized Collection<Mutation> convertSchemaToMutations()
     {
         Map<DecoratedKey, Mutation.PartitionUpdateCollector> mutationMap = new HashMap<>();
 

From 7abf8639965e317a73f1193fc5624ea968400a67 Mon Sep 17 00:00:00 2001
From: Ruslan Fomkin <Ruslan.Fomkin@gmail.com>
Date: Tue, 1 Jun 2021 09:37:12 +0200
Subject: [PATCH 003/151] STAR-572 Improve error message when altering MV
 (#176)

Improve error message when altering an MV with default ttl > 0.

Co-authored-by: Brandon Williams <brandon@datastax.com>
(cherry picked from commit 8296fe1319524480ae1910fde5f23c266c1642ed)
---
 .../cassandra/cql3/statements/schema/AlterViewStatement.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
index 1931bb489df3..3eba21561a48 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
@@ -66,8 +66,9 @@ public Keyspaces apply(Keyspaces schema)
         if (params.defaultTimeToLive > 0)
         {
             throw ire("Cannot set or alter default_time_to_live for a materialized view. " +
-                      "Data in a materialized view always expire at the same time than " +
-                      "the corresponding data in the parent table.");
+                      "Data in a materialized view always expires at the same time as " +
+                      "the corresponding data in the parent table. default_time_to_live " +
+                      "must be set to zero, see CASSANDRA-12868 for more information.");
         }
 
         ViewMetadata newView = view.copy(view.metadata.withSwapped(params));

From 03dc51051c1a2188edbb715128c08ad285aa8a0b Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Tue, 1 Jun 2021 09:39:25 +0200
Subject: [PATCH 004/151] STAR-565  Use the indexed item type as backing table
 key validator (#172)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

... of 2i on collections

Co-authored-by: Andrés de la Peña <andres.de_la_pena_garcia@datastax.com>
(cherry picked from commit 07edf0d25def34c1c8b2e7f06b93a817692784dd)
---
 .../index/internal/CassandraIndex.java        |   2 +-
 .../entities/SecondaryIndexTest.java          | 104 ++++++++++++++++++
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
index ea5f8e2f613c..06a6cd9cfd66 100644
--- a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
@@ -742,7 +742,7 @@ public static TableMetadata indexCfsMetadata(TableMetadata baseCfsMetadata, Inde
             TableMetadata.builder(baseCfsMetadata.keyspace, baseCfsMetadata.indexTableName(indexMetadata), baseCfsMetadata.id)
                          .kind(TableMetadata.Kind.INDEX)
                          .partitioner(new LocalPartitioner(indexedValueType))
-                         .addPartitionKeyColumn(indexedColumn.name, indexedColumn.type)
+                         .addPartitionKeyColumn(indexedColumn.name, indexedValueType)
                          .addClusteringColumn("partition_key", baseCfsMetadata.partitioner.partitionOrdering());
 
         // Adding clustering columns, which depends on the index type.
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
index 5c17fb85638b..3d97df931068 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
@@ -27,6 +27,12 @@
 import org.apache.commons.lang3.StringUtils;
 import org.junit.Test;
 
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
@@ -432,6 +438,104 @@ public void testIndexOnCollections() throws Throwable
         });
     }
 
+    private static void assertBackingTableKeyValidator(SecondaryIndexManager indexManager, String indexName, AbstractType expectedType)
+    {
+        assertEquals(expectedType, indexManager.getIndexByName(indexName)
+                                               .getBackingTable()
+                                               .map(ColumnFamilyStore::metadata)
+                                               .map(m -> m.partitionKeyType)
+                                               .orElseThrow(AssertionError::new));
+    }
+
+    /**
+     * Test for DB-1121
+     */
+    @Test
+    public void testIndexOnCollectionsBackingTableKeyValidator() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "k int PRIMARY KEY, " +
+                    "non_frozen_list list<int>, " +
+                    "non_frozen_set set<text>, " +
+                    "non_frozen_map map<text, int>," +
+                    "frozen_list frozen<list<int>>, " +
+                    "frozen_set frozen<set<text>>, " +
+                    "frozen_map frozen<map<text, int>>)");
+
+        createIndex("CREATE INDEX non_frozen_list_idx ON %s (non_frozen_list)");
+        createIndex("CREATE INDEX non_frozen_set_idx ON %s (non_frozen_set)");
+        createIndex("CREATE INDEX non_frozen_map_idx ON %s (non_frozen_map)");
+        createIndex("CREATE INDEX non_frozen_map_keys_idx ON %s (KEYS(non_frozen_map))");
+        createIndex("CREATE INDEX non_frozen_map_entries_idx ON %s (ENTRIES(non_frozen_map))");
+        createIndex("CREATE INDEX frozen_list_idx ON %s (FULL(frozen_list))");
+        createIndex("CREATE INDEX frozen_set_idx ON %s (FULL(frozen_set))");
+        createIndex("CREATE INDEX frozen_map_idx ON %s (FULL(frozen_map))");
+
+        SecondaryIndexManager indexManager = ColumnFamilyStore.getIfExists(keyspace(), currentTable()).indexManager;
+
+        assertBackingTableKeyValidator(indexManager, "non_frozen_list_idx", Int32Type.instance);
+        assertBackingTableKeyValidator(indexManager, "non_frozen_set_idx", UTF8Type.instance);
+        assertBackingTableKeyValidator(indexManager, "non_frozen_map_idx", Int32Type.instance);
+        assertBackingTableKeyValidator(indexManager, "non_frozen_map_keys_idx", UTF8Type.instance);
+        assertBackingTableKeyValidator(indexManager, "non_frozen_map_entries_idx", CompositeType.getInstance(UTF8Type.instance, Int32Type.instance));
+        assertBackingTableKeyValidator(indexManager, "frozen_list_idx", ListType.getInstance(Int32Type.instance, false));
+        assertBackingTableKeyValidator(indexManager, "frozen_set_idx", SetType.getInstance(UTF8Type.instance, false));
+        assertBackingTableKeyValidator(indexManager, "frozen_map_idx", MapType.getInstance(UTF8Type.instance, Int32Type.instance, false));
+
+        // Unsupported index types for non-frozen list
+        assertInvalidMessage("Cannot create index on keys of column non_frozen_list with non-map type",
+                             "CREATE INDEX ON %s (KEYS(non_frozen_list))");
+        assertInvalidMessage("Cannot create index on entries of column non_frozen_list with non-map type",
+                             "CREATE INDEX ON %s (ENTRIES(non_frozen_list))");
+        assertInvalidMessage("full() indexes can only be created on frozen collections",
+                             "CREATE INDEX ON %s (FULL(non_frozen_list))");
+
+        // Unsupported index types for non-frozen set
+        assertInvalidMessage("Cannot create index on keys of column non_frozen_set with non-map type",
+                             "CREATE INDEX ON %s (KEYS(non_frozen_set))");
+        assertInvalidMessage("Cannot create index on entries of column non_frozen_set with non-map type",
+                             "CREATE INDEX ON %s (ENTRIES(non_frozen_set))");
+        assertInvalidMessage("full() indexes can only be created on frozen collections",
+                             "CREATE INDEX ON %s (FULL(non_frozen_set))");
+
+        // Unsupported index types for non-frozen map
+        assertInvalidMessage("full() indexes can only be created on frozen collections",
+                             "CREATE INDEX ON %s (FULL(non_frozen_map))");
+
+        // Unsupported index types for frozen list
+        assertInvalidMessage("Cannot create keys() index on frozen column frozen_list. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier",
+                             "CREATE INDEX ON %s (KEYS(frozen_list))");
+        assertInvalidMessage("Cannot create entries() index on frozen column frozen_list. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier",
+                             "CREATE INDEX ON %s (ENTRIES(frozen_list))");
+        assertInvalidMessage("Cannot create values() index on frozen column frozen_list. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_list)' modifier",
+                             "CREATE INDEX ON %s (VALUES(frozen_list))");
+
+        // Unsupported index types for frozen set
+        assertInvalidMessage("Cannot create keys() index on frozen column frozen_set. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier",
+                             "CREATE INDEX ON %s (KEYS(frozen_set))");
+        assertInvalidMessage("Cannot create entries() index on frozen column frozen_set. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier",
+                             "CREATE INDEX ON %s (ENTRIES(frozen_set))");
+        assertInvalidMessage("Cannot create values() index on frozen column frozen_set. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_set)' modifier",
+                             "CREATE INDEX ON %s (VALUES(frozen_set))");
+
+        // Unsupported index types for frozen map
+        assertInvalidMessage("Cannot create keys() index on frozen column frozen_map. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier",
+                             "CREATE INDEX ON %s (KEYS(frozen_map))");
+        assertInvalidMessage("Cannot create entries() index on frozen column frozen_map. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier",
+                             "CREATE INDEX ON %s (ENTRIES(frozen_map))");
+        assertInvalidMessage("Cannot create values() index on frozen column frozen_map. Frozen collections " +
+                             "are immutable and must be fully indexed by using the 'full(frozen_map)' modifier",
+                             "CREATE INDEX ON %s (VALUES(frozen_map))");
+    }
+
     @Test
     public void testSelectOnMultiIndexOnCollectionsWithNull() throws Throwable
     {

From c1eefd1f5b8d2332fa14ec7a6c6db862dbcced88 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Tue, 1 Jun 2021 10:32:21 +0200
Subject: [PATCH 005/151] STAR-564 Check only MODIFY on base when updating
 table with MV (#171)

If a user has only MODIFY permission on a table and there is a
materialized view built on that table, an insert fails with the
following error:
Unauthorized: Error from server: code=2100 [Unauthorized]

With this change, only MODIFY permission on the base table is required
to update a base table that has materialized views.

Co-authored-by: Zhao Yang <zhao.yang@datastax.com>
(cherry picked from commit 53c5aa68693cf084bb36d56389c7ede485abbbf6)
---
 CHANGES.txt                                          |  3 +++
 .../cql3/statements/ModificationStatement.java       | 12 +-----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 7e0c1a98a71a..462c5f41d00e 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,6 @@
+Future version (tbd)
+ * Require only MODIFY permission on base when updating table with MV (STAR-564)
+
 4.0.1
  * Cleanup dependency scopes (CASSANDRA-16704)
  * Make JmxHistogram#getRecentValues() and JmxTimer#getRecentValues() thread-safe (CASSANDRA-16707)
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 087f3b0e7c4b..7ec3a69fcedd 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -244,17 +244,7 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho
         if (hasConditions())
             state.ensureTablePermission(metadata, Permission.SELECT);
 
-        // MV updates need to get the current state from the table, and might update the views
-        // Require Permission.SELECT on the base table, and Permission.MODIFY on the views
-        Iterator<ViewMetadata> views = View.findAll(keyspace(), columnFamily()).iterator();
-        if (views.hasNext())
-        {
-            state.ensureTablePermission(metadata, Permission.SELECT);
-            do
-            {
-                state.ensureTablePermission(views.next().metadata, Permission.MODIFY);
-            } while (views.hasNext());
-        }
+        // Modification on base table with MV should skip SELECT access control to base table and WRITE access control to view table.
 
         for (Function function : getFunctions())
             state.ensurePermission(Permission.EXECUTE, function);

From a484e25631d6d7ac2cadfdf2b1a6008d44794894 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Tue, 1 Jun 2021 14:55:09 +0200
Subject: [PATCH 006/151] STAR-571 fix nanoseconds overflowing in
 CommitLogService (#174)

* remove dead code

* add awaitSyncAt test

The test verifies two cases in which nanosecond overflow causes
awaitSyncAt to never return (as described in the original ticket).

* fix nanoseconds overflowing in CommitLogService
* Use clock consistently in all CommitLogService implementations
* Compare nanoseconds as a difference as advised in the docs:
https://docs.oracle.com/javase/8/docs/api/java/lang/System.html#nanoTime
* add abs values where the overflowed value may cause a negative
result

Co-authored-by: Sergio Bossa <sergio@datastax.com>
(cherry picked from commit 91db2d4c8490e6beff706e1ca7e7339d52cbd155)
---
 .../commitlog/AbstractCommitLogService.java   |  33 ++--
 .../db/commitlog/BatchCommitLogService.java   |   6 +-
 .../cassandra/db/commitlog/CommitLog.java     |  12 +-
 .../db/commitlog/GroupCommitLogService.java   |   5 +-
 .../commitlog/PeriodicCommitLogService.java   |   9 +-
 .../cassandra/utils/MonotonicClock.java       |   4 +-
 .../cassandra/utils/SlidingTimeRate.java      | 171 ------------------
 .../cassandra/utils/SystemTimeSource.java     |  54 ------
 .../apache/cassandra/utils/TimeSource.java    |  58 ------
 .../utils/concurrent/IntervalLock.java        |  69 -------
 .../AbstractCommitLogServiceTest.java         |   3 +-
 .../commitlog/CommitLogAwaitAsyncAtTest.java  | 110 +++++++++++
 .../cassandra/utils/MonotonicClockTest.java   |  10 +
 .../cassandra/utils/SlidingTimeRateTest.java  | 161 -----------------
 .../cassandra/utils/TestTimeSource.java       |  72 --------
 15 files changed, 165 insertions(+), 612 deletions(-)
 delete mode 100644 src/java/org/apache/cassandra/utils/SlidingTimeRate.java
 delete mode 100644 src/java/org/apache/cassandra/utils/SystemTimeSource.java
 delete mode 100644 src/java/org/apache/cassandra/utils/TimeSource.java
 delete mode 100644 src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java
 create mode 100644 test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java
 delete mode 100644 test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java
 delete mode 100644 test/unit/org/apache/cassandra/utils/TestTimeSource.java

diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
index a65ef00a11bd..326936cdb5d1 100644
--- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogService.java
@@ -45,7 +45,7 @@ public abstract class AbstractCommitLogService
     private volatile boolean shutdown = false;
 
     // all Allocations written before this time will be synced
-    protected volatile long lastSyncedAt = System.currentTimeMillis();
+    protected volatile long lastSyncedAt;
 
     // counts of total written, and pending, log messages
     private final AtomicLong written = new AtomicLong(0);
@@ -68,6 +68,11 @@ public abstract class AbstractCommitLogService
      */
     final long markerIntervalNanos;
 
+    /**
+     * Provides time related functions for commit log syncing scheduling.
+     */
+    protected final MonotonicClock clock;
+
     /**
      * A flag that callers outside of the sync thread can use to signal they want the commitlog segments
      * to be flushed to disk. Note: this flag is primarily to support commit log's batch mode, which requires
@@ -83,9 +88,9 @@ public abstract class AbstractCommitLogService
      *
      * Subclasses may be notified when a sync finishes by using the syncComplete WaitQueue.
      */
-    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis)
+    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, MonotonicClock clock)
     {
-        this (commitLog, name, syncIntervalMillis, false);
+        this (commitLog, name, syncIntervalMillis, clock, false);
     }
 
     /**
@@ -96,10 +101,12 @@ public abstract class AbstractCommitLogService
      *
      * @param markHeadersFaster true if the chained markers should be updated more frequently than on the disk sync bounds.
      */
-    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, boolean markHeadersFaster)
+    AbstractCommitLogService(final CommitLog commitLog, final String name, long syncIntervalMillis, MonotonicClock clock, boolean markHeadersFaster)
     {
         this.commitLog = commitLog;
         this.name = name;
+        this.clock = clock;
+        this.lastSyncedAt = clock.now();
 
         final long markerIntervalMillis;
         if (markHeadersFaster && syncIntervalMillis > DEFAULT_MARKER_INTERVAL_MILLIS)
@@ -132,7 +139,7 @@ void start()
             throw new IllegalArgumentException(String.format("Commit log flush interval must be positive: %fms",
                                                              syncIntervalNanos * 1e-6));
         shutdown = false;
-        thread = NamedThreadFactory.createThread(new SyncRunnable(MonotonicClock.preciseTime), name);
+        thread = NamedThreadFactory.createThread(new SyncRunnable(clock), name);
         thread.start();
     }
 
@@ -168,7 +175,7 @@ boolean sync()
             {
                 // sync and signal
                 long pollStarted = clock.now();
-                boolean flushToDisk = lastSyncedAt + syncIntervalNanos <= pollStarted || shutdownRequested || syncRequested;
+                boolean flushToDisk = lastSyncedAt + syncIntervalNanos - pollStarted <= 0 || shutdownRequested || syncRequested;
                 if (flushToDisk)
                 {
                     // in this branch, we want to flush the commit log to disk
@@ -192,7 +199,7 @@ boolean sync()
                     return false;
 
                 long wakeUpAt = pollStarted + markerIntervalNanos;
-                if (wakeUpAt > now)
+                if (wakeUpAt - now > 0)
                     LockSupport.parkNanos(wakeUpAt - now);
             }
             catch (Throwable t)
@@ -218,7 +225,7 @@ boolean maybeLogFlushLag(long pollStarted, long now)
 
             // this is the timestamp by which we should have completed the flush
             long maxFlushTimestamp = pollStarted + syncIntervalNanos;
-            if (maxFlushTimestamp > now)
+            if (maxFlushTimestamp - now > 0)
                 return false;
 
             // if we have lagged noticeably, update our lag counter
@@ -229,7 +236,7 @@ boolean maybeLogFlushLag(long pollStarted, long now)
                 syncCount = 1;
                 totalSyncDuration = flushDuration;
             }
-            syncExceededIntervalBy += now - maxFlushTimestamp;
+            syncExceededIntervalBy += Math.abs(now - maxFlushTimestamp);
             lagCount++;
 
             if (firstLagAt > 0)
@@ -241,7 +248,7 @@ boolean maybeLogFlushLag(long pollStarted, long now)
                                                   TimeUnit.MINUTES,
                                                   "Out of {} commit log syncs over the past {}s with average duration of {}ms, {} have exceeded the configured commit interval by an average of {}ms",
                                                   syncCount,
-                                                  String.format("%.2f", (now - firstLagAt) * 1e-9d),
+                                                  String.format("%.2f", Math.abs(now - firstLagAt) * 1e-9d),
                                                   String.format("%.2f", totalSyncDuration * 1e-6d / syncCount),
                                                   lagCount,
                                                   String.format("%.2f", syncExceededIntervalBy * 1e-6d / lagCount));
@@ -292,7 +299,7 @@ public void shutdown()
      */
     public void syncBlocking()
     {
-        long requestTime = System.nanoTime();
+        long requestTime = clock.now();
         requestExtraSync();
         awaitSyncAt(requestTime, null);
     }
@@ -302,12 +309,12 @@ void awaitSyncAt(long syncTime, Context context)
         do
         {
             WaitQueue.Signal signal = context != null ? syncComplete.register(context) : syncComplete.register();
-            if (lastSyncedAt < syncTime)
+            if (lastSyncedAt - syncTime < 0)
                 signal.awaitUninterruptibly();
             else
                 signal.cancel();
         }
-        while (lastSyncedAt < syncTime);
+        while (lastSyncedAt - syncTime < 0);
     }
 
     public void awaitTermination() throws InterruptedException
diff --git a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
index 78bf30c336c6..e354b925a036 100644
--- a/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/BatchCommitLogService.java
@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.db.commitlog;
 
+import org.apache.cassandra.utils.MonotonicClock;
+
 class BatchCommitLogService extends AbstractCommitLogService
 {
     /**
@@ -26,9 +28,9 @@ class BatchCommitLogService extends AbstractCommitLogService
      */
     private static final int POLL_TIME_MILLIS = 1000;
 
-    public BatchCommitLogService(CommitLog commitLog)
+    public BatchCommitLogService(CommitLog commitLog, MonotonicClock clock)
     {
-        super(commitLog, "COMMIT-LOG-WRITER", POLL_TIME_MILLIS);
+        super(commitLog, "COMMIT-LOG-WRITER", POLL_TIME_MILLIS, clock);
     }
 
     protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
index 7670c5e1a1f5..a32b8a1030e7 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
@@ -46,6 +46,7 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.MBeanWrapper;
+import org.apache.cassandra.utils.MonotonicClock;
 
 import static org.apache.cassandra.db.commitlog.CommitLogSegment.Allocation;
 import static org.apache.cassandra.db.commitlog.CommitLogSegment.CommitLogSegmentFileComparator;
@@ -72,6 +73,9 @@ public class CommitLog implements CommitLogMBean
     volatile Configuration configuration;
     private boolean started = false;
 
+    @VisibleForTesting
+    final MonotonicClock clock;
+
     private static CommitLog construct()
     {
         CommitLog log = new CommitLog(CommitLogArchiver.construct(), DatabaseDescriptor.getCommitLogSegmentMgrProvider());
@@ -96,16 +100,18 @@ private static CommitLog construct()
         this.archiver = archiver;
         metrics = new CommitLogMetrics();
 
+        this.clock = MonotonicClock.preciseTime;
+
         switch (DatabaseDescriptor.getCommitLogSync())
         {
             case periodic:
-                executor = new PeriodicCommitLogService(this);
+                executor = new PeriodicCommitLogService(this, clock);
                 break;
             case batch:
-                executor = new BatchCommitLogService(this);
+                executor = new BatchCommitLogService(this, clock);
                 break;
             case group:
-                executor = new GroupCommitLogService(this);
+                executor = new GroupCommitLogService(this, clock);
                 break;
             default:
                 throw new IllegalArgumentException("Unknown commitlog service type: " + DatabaseDescriptor.getCommitLogSync());
diff --git a/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java
index a76923e581e0..056bc6c88cbe 100644
--- a/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/GroupCommitLogService.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.db.commitlog;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.MonotonicClock;
 
 /**
  * A commitlog service that will block returning an ACK back to the a coordinator/client
@@ -26,9 +27,9 @@
  */
 public class GroupCommitLogService extends AbstractCommitLogService
 {
-    public GroupCommitLogService(CommitLog commitLog)
+    public GroupCommitLogService(CommitLog commitLog, MonotonicClock clock)
     {
-        super(commitLog, "GROUP-COMMIT-LOG-WRITER", (int) DatabaseDescriptor.getCommitLogSyncGroupWindow());
+        super(commitLog, "GROUP-COMMIT-LOG-WRITER", (int) DatabaseDescriptor.getCommitLogSyncGroupWindow(), clock);
     }
 
     protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
diff --git a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
index e94c616e444f..c33624cde41d 100644
--- a/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
+++ b/src/java/org/apache/cassandra/db/commitlog/PeriodicCommitLogService.java
@@ -20,21 +20,22 @@
 import java.util.concurrent.TimeUnit;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.MonotonicClock;
 
 class PeriodicCommitLogService extends AbstractCommitLogService
 {
     private static final long blockWhenSyncLagsNanos = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getPeriodicCommitLogSyncBlock());
 
-    public PeriodicCommitLogService(final CommitLog commitLog)
+    public PeriodicCommitLogService(final CommitLog commitLog, MonotonicClock clock)
     {
-        super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod(),
+        super(commitLog, "PERIODIC-COMMIT-LOG-SYNCER", DatabaseDescriptor.getCommitLogSyncPeriod(), clock,
               !(commitLog.configuration.useCompression() || commitLog.configuration.useEncryption()));
     }
 
     protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
     {
-        long expectedSyncTime = System.nanoTime() - blockWhenSyncLagsNanos;
-        if (lastSyncedAt < expectedSyncTime)
+        long expectedSyncTime = clock.now() - blockWhenSyncLagsNanos;
+        if (lastSyncedAt - expectedSyncTime < 0)
         {
             pending.incrementAndGet();
             awaitSyncAt(expectedSyncTime, commitLog.metrics.waitingOnCommit.time());
diff --git a/src/java/org/apache/cassandra/utils/MonotonicClock.java b/src/java/org/apache/cassandra/utils/MonotonicClock.java
index 5a1aa3c0361e..d641ec2abfd3 100644
--- a/src/java/org/apache/cassandra/utils/MonotonicClock.java
+++ b/src/java/org/apache/cassandra/utils/MonotonicClock.java
@@ -259,13 +259,13 @@ public long error()
         @Override
         public boolean isAfter(long instant)
         {
-            return now() > instant;
+            return instant - now() < 0;
         }
 
         @Override
         public boolean isAfter(long now, long instant)
         {
-            return now > instant;
+            return instant - now < 0;
         }
     }
 
diff --git a/src/java/org/apache/cassandra/utils/SlidingTimeRate.java b/src/java/org/apache/cassandra/utils/SlidingTimeRate.java
deleted file mode 100644
index 0e00054d0205..000000000000
--- a/src/java/org/apache/cassandra/utils/SlidingTimeRate.java
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.ConcurrentNavigableMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.locks.ReadWriteLock;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-/**
- * Concurrent rate computation over a sliding time window.
- *
- * Currently not used in the Cassandra 4.0 code base. If you decide to use it, please check CASSANDRA-16713.
- * There still might be a bug, flaky test to be fixed before using it again.
- */
-public class SlidingTimeRate
-{
-    private final ConcurrentSkipListMap<Long, AtomicInteger> counters = new ConcurrentSkipListMap<>();
-    private final AtomicLong lastCounterTimestamp = new AtomicLong(0);
-    private final ReadWriteLock pruneLock = new ReentrantReadWriteLock();
-    private final long sizeInMillis;
-    private final long precisionInMillis;
-    private final TimeSource timeSource;
-
-    /**
-     * Creates a sliding rate whose time window is of the given size, with the given precision and time unit.
-     * <p>
-     * The precision defines how accurate the rate computation is, as it will be computed over window size +/-
-     * precision.
-     * </p>
-     */
-    public SlidingTimeRate(TimeSource timeSource, long size, long precision, TimeUnit unit)
-    {
-        Preconditions.checkArgument(size > precision, "Size should be greater than precision.");
-        Preconditions.checkArgument(TimeUnit.MILLISECONDS.convert(precision, unit) >= 1, "Precision must be greater than or equal to 1 millisecond.");
-        this.sizeInMillis = TimeUnit.MILLISECONDS.convert(size, unit);
-        this.precisionInMillis = TimeUnit.MILLISECONDS.convert(precision, unit);
-        this.timeSource = timeSource;
-    }
-
-    /**
-     * Updates the rate.
-     */
-    public void update(int delta)
-    {
-        pruneLock.readLock().lock();
-        try
-        {
-            while (true)
-            {
-                long now = timeSource.currentTimeMillis();
-                long lastTimestamp = lastCounterTimestamp.get();
-                boolean isWithinPrecisionRange = (now - lastTimestamp) < precisionInMillis;
-                AtomicInteger lastCounter = counters.get(lastTimestamp);
-                // If there's a valid counter for the current last timestamp, and we're in the precision range,
-                // update such counter:
-                if (lastCounter != null && isWithinPrecisionRange)
-                {
-                    lastCounter.addAndGet(delta);
-
-                    break;
-                }
-                // Else if there's no counter or we're past the precision range, try to create a new counter,
-                // but only the thread updating the last timestamp will create a new counter:
-                else if (lastCounterTimestamp.compareAndSet(lastTimestamp, now))
-                {
-                    AtomicInteger existing = counters.putIfAbsent(now, new AtomicInteger(delta));
-                    if (existing != null)
-                    {
-                        existing.addAndGet(delta);
-                    }
-
-                    break;
-                }
-            }
-        }
-        finally
-        {
-            pruneLock.readLock().unlock();
-        }
-    }
-
-    /**
-     * Gets the current rate in the given time unit from the beginning of the time window to the
-     * provided point in time ago.
-     */
-    public double get(long toAgo, TimeUnit unit)
-    {
-        pruneLock.readLock().lock();
-        try
-        {
-            long toAgoInMillis = TimeUnit.MILLISECONDS.convert(toAgo, unit);
-            Preconditions.checkArgument(toAgoInMillis < sizeInMillis, "Cannot get rate in the past!");
-
-            long now = timeSource.currentTimeMillis();
-            long sum = 0;
-            ConcurrentNavigableMap<Long, AtomicInteger> tailCounters = counters
-                    .tailMap(now - sizeInMillis, true)
-                    .headMap(now - toAgoInMillis, true);
-            for (AtomicInteger i : tailCounters.values())
-            {
-                sum += i.get();
-            }
-
-            double rateInMillis = sum == 0
-                                  ? sum
-                                  : sum / (double) Math.max(1000, (now - toAgoInMillis) - tailCounters.firstKey());
-            double multiplier = TimeUnit.MILLISECONDS.convert(1, unit);
-            return rateInMillis * multiplier;
-        }
-        finally
-        {
-            pruneLock.readLock().unlock();
-        }
-    }
-
-    /**
-     * Gets the current rate in the given time unit.
-     */
-    public double get(TimeUnit unit)
-    {
-        return get(0, unit);
-    }
-
-    /**
-     * Prunes the time window of old unused updates.
-     */
-    public void prune()
-    {
-        pruneLock.writeLock().lock();
-        try
-        {
-            long now = timeSource.currentTimeMillis();
-            counters.headMap(now - sizeInMillis, false).clear();
-        }
-        finally
-        {
-            pruneLock.writeLock().unlock();
-        }
-    }
-
-    @VisibleForTesting
-    public int size()
-    {
-        return counters.values().stream().reduce(new AtomicInteger(), (v1, v2) -> {
-            v1.addAndGet(v2.get());
-            return v1;
-        }).get();
-    }
-}
diff --git a/src/java/org/apache/cassandra/utils/SystemTimeSource.java b/src/java/org/apache/cassandra/utils/SystemTimeSource.java
deleted file mode 100644
index fef525e39c17..000000000000
--- a/src/java/org/apache/cassandra/utils/SystemTimeSource.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.util.concurrent.Uninterruptibles;
-
-/**
- * Time source backed by JVM clock.
- */
-public class SystemTimeSource implements TimeSource
-{
-    @Override
-    public long currentTimeMillis()
-    {
-        return System.currentTimeMillis();
-    }
-
-    @Override
-    public long nanoTime()
-    {
-        return System.nanoTime();
-    }
-
-    @Override
-    public TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit)
-    {
-        Uninterruptibles.sleepUninterruptibly(sleepFor, unit);
-        return this;
-    }
-
-    @Override
-    public TimeSource sleep(long sleepFor, TimeUnit unit) throws InterruptedException
-    {
-        TimeUnit.NANOSECONDS.sleep(TimeUnit.NANOSECONDS.convert(sleepFor, unit));
-        return this;
-    }
-}
diff --git a/src/java/org/apache/cassandra/utils/TimeSource.java b/src/java/org/apache/cassandra/utils/TimeSource.java
deleted file mode 100644
index 5d8acec7031a..000000000000
--- a/src/java/org/apache/cassandra/utils/TimeSource.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.TimeUnit;
-
-public interface TimeSource
-{
-    /**
-     *
-     * @return the current time in milliseconds
-     */
-    long currentTimeMillis();
-
-    /**
-     *
-     * @return Returns the current time value in nanoseconds.
-     *
-     * <p>This method can only be used to measure elapsed time and is
-     * not related to any other notion of system or wall-clock time.
-     */
-    long nanoTime();
-
-    /**
-     * Sleep for the given amount of time uninterruptibly.
-     *
-     * @param  sleepFor given amout.
-     * @param  unit time unit
-     * @return The time source itself after the given sleep period.
-     */
-    TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit);
-
-    /**
-     * Sleep for the given amount of time. This operation could interrupted.
-     * Hence after returning from this method, it is not guaranteed
-     * that the request amount of time has passed.
-     *
-     * @param  sleepFor given amout.
-     * @param  unit time unit
-     * @return The time source itself after the given sleep period.
-     */
-    TimeSource sleep(long sleepFor, TimeUnit unit) throws InterruptedException;
-}
diff --git a/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java b/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java
deleted file mode 100644
index 382a2dc43669..000000000000
--- a/src/java/org/apache/cassandra/utils/concurrent/IntervalLock.java
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils.concurrent;
-
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.locks.ReentrantReadWriteLock;
-
-import com.google.common.annotations.VisibleForTesting;
-
-import org.apache.cassandra.utils.TimeSource;
-
-/**
- * This class extends ReentrantReadWriteLock to provide a write lock that can only be acquired at provided intervals.
- */
-public class IntervalLock extends ReentrantReadWriteLock
-{
-    private final AtomicLong lastAcquire = new AtomicLong();
-    private final TimeSource timeSource;
-
-    public IntervalLock(TimeSource timeSource)
-    {
-        this.timeSource = timeSource;
-    }
-
-    /**
-     * Try acquiring a write lock if the given interval is passed since the last call to this method.
-     *
-     * @param interval In millis.
-     * @return True if acquired and locked, false otherwise.
-     */
-    public boolean tryIntervalLock(long interval)
-    {
-        long now = timeSource.currentTimeMillis();
-        boolean acquired = (now - lastAcquire.get() >= interval) && writeLock().tryLock();
-        if (acquired)
-            lastAcquire.set(now);
-
-        return acquired;
-    }
-
-    /**
-     * Release the last acquired interval lock.
-     */
-    public void releaseIntervalLock()
-    {
-        writeLock().unlock();
-    }
-
-    @VisibleForTesting
-    public long getLastIntervalAcquire()
-    {
-        return lastAcquire.get();
-    }
-}
diff --git a/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java
index 741b1454b5c9..f91690cf5d5b 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/AbstractCommitLogServiceTest.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.commitlog.AbstractCommitLogService.SyncRunnable;
 import org.apache.cassandra.utils.FreeRunningClock;
+import org.apache.cassandra.utils.MonotonicClock;
 
 import static org.apache.cassandra.db.commitlog.AbstractCommitLogService.DEFAULT_MARKER_INTERVAL_MILLIS;
 
@@ -100,7 +101,7 @@ private static class FakeCommitLogService extends AbstractCommitLogService
     {
         FakeCommitLogService(long syncIntervalMillis)
         {
-            super(new FakeCommitLog(), "This is not a real commit log", syncIntervalMillis, true);
+            super(new FakeCommitLog(), "This is not a real commit log", syncIntervalMillis, MonotonicClock.preciseTime, true);
             lastSyncedAt = 0;
         }
 
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java
new file mode 100644
index 000000000000..e024954a7958
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogAwaitAsyncAtTest.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.commitlog;
+
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.FreeRunningClock;
+import org.apache.cassandra.utils.MonotonicClock;
+import org.mockito.Mockito;
+
+public class CommitLogAwaitAsyncAtTest
+{
+    @BeforeClass
+    public static void beforeClass() throws ConfigurationException
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    /**
+     * syncTime (the awaitSyncAt parameter) is in the past because the "now" value has overflowed, so
+     * awaitSyncAt should not block; the clock is not advanced.
+     */
+    @Test
+    public void notBlockIfSyncTimeIsInPast() throws InterruptedException
+    {
+        testResumingAwaitSyncAt(Long.MIN_VALUE + 10,
+                              Long.MAX_VALUE - 10,
+                                0);
+    }
+
+    /**
+     * syncTime (the awaitSyncAt parameter) is in the future, so awaitSyncAt should block until the flush unblocks it.
+     */
+    @Test
+    public void flushShouldUnblockAwaitSync() throws InterruptedException
+    {
+        testResumingAwaitSyncAt(Long.MAX_VALUE - 10,
+                              Long.MAX_VALUE - 5,
+                                1000);
+    }
+
+    /**
+     * Creates a commit log service and a new thread that calls awaitSyncAt, then waits at most one minute
+     * for that call to return and fails the test if the thread is still alive.
+     * Uses an artificial clock to progress through the commit flush. One clock advance is performed after the
+     * thread and the service are started.
+     *
+     * @param nowNanos test start time given to the artificial clock, in nanoseconds
+     * @param syncAtNanos the syncTime argument passed to awaitSyncAt, in nanoseconds
+     * @param advanceMillis the single clock step, in milliseconds
+     */
+    private void testResumingAwaitSyncAt(long nowNanos, long syncAtNanos, long advanceMillis) throws InterruptedException
+    {
+        FreeRunningClock clock = new FreeRunningClock(nowNanos);
+        AbstractCommitLogService service = getCommitLogService(clock);
+
+        Thread awaitForSync = new Thread(CommitLogAwaitAsyncAtTest.class.getSimpleName() + " commit log waiting thread")
+        {
+            @Override
+            public void run()
+            {
+                service.awaitSyncAt(syncAtNanos, null);
+            }
+        };
+        awaitForSync.start();
+
+        service.start();
+
+        // advance the clock once by advanceMillis
+        clock.advance(advanceMillis, TimeUnit.MILLISECONDS);
+
+        // wait at most 1 minute for awaitSyncAt to return
+        awaitForSync.join(60 * 1000);
+        if (awaitForSync.isAlive())
+            Assert.fail("awaitSyncAt should be unblocked by now, check commit log code for bugs in nanoseconds " +
+                        "comparisons");
+    }
+
+    private AbstractCommitLogService getCommitLogService(MonotonicClock clock) {
+        CommitLog commitLog = Mockito.mock(CommitLog.class);
+        return new AbstractCommitLogService(commitLog, "testService", 100, clock)
+        {
+            @Override
+            protected void maybeWaitForSync(CommitLogSegment.Allocation alloc)
+            {
+            }
+        };
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java b/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java
index b2891a9950e2..db12ff2df5cb 100644
--- a/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java
+++ b/test/unit/org/apache/cassandra/utils/MonotonicClockTest.java
@@ -20,6 +20,7 @@
 import static org.apache.cassandra.utils.MonotonicClock.approxTime;
 import static org.junit.Assert.*;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 public class MonotonicClockTest
@@ -53,4 +54,13 @@ public void testTimestampOrdering() throws Exception
             lastConverted = convertedNow;
         }
     }
+
+    @Test
+    public void testTimestampOverflowComparison()
+    {
+        MonotonicClock clock = MonotonicClock.preciseTime;
+
+        Assert.assertTrue("Overflown long (now) should be after long close to max",
+                          clock.isAfter(Long.MIN_VALUE + 1, Long.MAX_VALUE));
+    }
 }
diff --git a/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java b/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java
deleted file mode 100644
index 8dc4a14d6d76..000000000000
--- a/test/unit/org/apache/cassandra/utils/SlidingTimeRateTest.java
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-
-import org.junit.Assert;
-import org.junit.Ignore;
-import org.junit.Test;
-
-/**
- * No objects are created currently from SlidingTimeRate in Cassandra 4.0.
- * If you decide to use it, please check CASSANDRA-16713.
- * There still might be a bug, flaky test to be fixed before using it again.
- *
- * Skipping all tests for running now to clean he noise before 4.0 GA release.
- */
-public class SlidingTimeRateTest
-{
-    @Ignore
-    @Test
-    public void testUpdateAndGet()
-    {
-        SlidingTimeRate rate = new SlidingTimeRate(new TestTimeSource(), 10, 1, TimeUnit.SECONDS);
-        int updates = 100;
-        for (int i = 0; i < updates; i++)
-        {
-            rate.update(1);
-        }
-        Assert.assertEquals(updates, rate.get(TimeUnit.SECONDS), 0.0);
-    }
-
-    @Ignore
-    @Test
-    public void testUpdateAndGetBetweenWindows()
-    {
-        TestTimeSource time = new TestTimeSource();
-        SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-        int updates = 100;
-        for (int i = 0; i < updates; i++)
-        {
-            rate.update(1);
-            time.sleep(100, TimeUnit.MILLISECONDS);
-        }
-        Assert.assertEquals(10, rate.get(TimeUnit.SECONDS), 0.0);
-    }
-
-    @Ignore
-    @Test
-    public void testUpdateAndGetPastWindowSize()
-    {
-        TestTimeSource time = new TestTimeSource();
-        SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-        int updates = 100;
-        for (int i = 0; i < updates; i++)
-        {
-            rate.update(1);
-        }
-
-        time.sleep(6, TimeUnit.SECONDS);
-
-        Assert.assertEquals(0, rate.get(TimeUnit.SECONDS), 0.0);
-    }
-
-    @Ignore
-    @Test
-    public void testUpdateAndGetToPointInTime()
-    {
-        TestTimeSource time = new TestTimeSource();
-        SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-        int updates = 10;
-        for (int i = 0; i < updates; i++)
-        {
-            rate.update(1);
-            time.sleep(100, TimeUnit.MILLISECONDS);
-        }
-
-        time.sleep(1, TimeUnit.SECONDS);
-
-        Assert.assertEquals(5, rate.get(TimeUnit.SECONDS), 0.0);
-        Assert.assertEquals(10, rate.get(1, TimeUnit.SECONDS), 0.0);
-    }
-
-    @Ignore
-    @Test
-    public void testDecay() throws InterruptedException
-    {
-        TestTimeSource time = new TestTimeSource();
-        SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-        int updates = 10;
-        for (int i = 0; i < updates; i++)
-        {
-            rate.update(1);
-            time.sleep(100, TimeUnit.MILLISECONDS);
-        }
-        Assert.assertEquals(10, rate.get(TimeUnit.SECONDS), 0.0);
-
-        time.sleep(1, TimeUnit.SECONDS);
-
-        Assert.assertEquals(5, rate.get(TimeUnit.SECONDS), 0.0);
-
-        time.sleep(2, TimeUnit.SECONDS);
-
-        Assert.assertEquals(2.5, rate.get(TimeUnit.SECONDS), 0.0);
-    }
-
-    @Ignore
-    @Test
-    public void testPruning()
-    {
-        TestTimeSource time = new TestTimeSource();
-        SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-
-        rate.update(1);
-        Assert.assertEquals(1, rate.size());
-
-        time.sleep(6, TimeUnit.SECONDS);
-
-        rate.prune();
-        Assert.assertEquals(0, rate.size());
-    }
-
-    @Ignore
-    @Test
-    public void testConcurrentUpdateAndGet() throws InterruptedException
-    {
-        final ExecutorService executor = Executors.newFixedThreadPool(FBUtilities.getAvailableProcessors());
-        final TestTimeSource time = new TestTimeSource();
-        final SlidingTimeRate rate = new SlidingTimeRate(time, 5, 1, TimeUnit.SECONDS);
-        int updates = 100000;
-        for (int i = 0; i < updates; i++)
-        {
-            executor.submit(() -> {
-                time.sleep(1, TimeUnit.MILLISECONDS);
-                rate.update(1);
-            });
-        }
-
-        executor.shutdown();
-
-        Assert.assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES));
-        Assert.assertEquals(1000, rate.get(TimeUnit.SECONDS), 100.0);
-    }
-}
diff --git a/test/unit/org/apache/cassandra/utils/TestTimeSource.java b/test/unit/org/apache/cassandra/utils/TestTimeSource.java
deleted file mode 100644
index 4ecd086f38d5..000000000000
--- a/test/unit/org/apache/cassandra/utils/TestTimeSource.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.utils;
-
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicLong;
-
-public class TestTimeSource implements TimeSource
-{
-    private final AtomicLong timeInMillis = new AtomicLong(System.currentTimeMillis());
-
-    @Override
-    public long currentTimeMillis()
-    {
-        return timeInMillis.get();
-    }
-
-    @Override
-    public long nanoTime()
-    {
-        return timeInMillis.get() * 1_000_000;
-    }
-
-    @Override
-    public TimeSource sleep(long sleepFor, TimeUnit unit)
-    {
-        long current = timeInMillis.get();
-        long sleepInMillis = TimeUnit.MILLISECONDS.convert(sleepFor, unit);
-        boolean elapsed;
-        do
-        {
-            long newTime = current + sleepInMillis;
-            elapsed = timeInMillis.compareAndSet(current, newTime);
-            if (!elapsed)
-            {
-                long updated = timeInMillis.get();
-                if (updated - current >= sleepInMillis)
-                {
-                    elapsed = true;
-                }
-                else
-                {
-                    sleepInMillis -= updated - current;
-                    current = updated;
-                }
-            }
-        }
-        while (!elapsed);
-        return this;
-    }
-
-    @Override
-    public TimeSource sleepUninterruptibly(long sleepFor, TimeUnit unit)
-    {
-        return sleep(sleepFor, unit);
-    }
-}

From 68984d8f6932c3d5ea56ba5e834213c9cd5dcf11 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Wed, 2 Jun 2021 11:10:44 +0200
Subject: [PATCH 007/151] STAR-561 selectors should contain elements with same
 type (#169)

* selectors should contain elements with same type

Expressions like "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, WRITETIME(t))] FROM %s"
can't be deserialized. If we try to extract the result by calling
getRows we get:
org.apache.cassandra.serializers.MarshalException: Expected 8 or 0 byte long

This commit makes sure that all collection elements have the
same type.

Co-authored-by: Zhao Yang <zhao.yang@datastax.com>

* test for Maps.getExactMapTypeIfKnown

Maps are a bit different from Lists and Sets. Maps can't be selected
without a type hint, which makes it impossible to exercise the query
execution path with `getExactMapTypeIfKnown`. That is why tests for
"map SELECTs" are missing from TermSelectionTest.java.
The previous commit fixed `Maps.getExactMapTypeIfKnown`, but it can't be
tested in the same way as Lists and Sets. That's why this commit adds
separate unit tests for `Maps.getExactMapTypeIfKnown`.

Co-authored-by: Zhao Yang <zhao.yang@datastax.com>
(cherry picked from commit 78b0117ac1a233de739ed8d17f72d300a81a5fea)
---
 src/java/org/apache/cassandra/cql3/Lists.java | 26 ++++-
 src/java/org/apache/cassandra/cql3/Maps.java  | 27 ++++--
 src/java/org/apache/cassandra/cql3/Sets.java  |  4 +-
 .../cassandra/cql3/selection/Selectable.java  | 23 +++++
 .../org/apache/cassandra/cql3/CQLTester.java  |  6 +-
 .../org/apache/cassandra/cql3/MapsTest.java   | 75 +++++++++++++++
 .../cql3/selection/TermSelectionTest.java     | 95 ++++++++++++++++++-
 7 files changed, 241 insertions(+), 15 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/cql3/MapsTest.java

diff --git a/src/java/org/apache/cassandra/cql3/Lists.java b/src/java/org/apache/cassandra/cql3/Lists.java
index 1d94d697a50e..cd45095ea5c4 100644
--- a/src/java/org/apache/cassandra/cql3/Lists.java
+++ b/src/java/org/apache/cassandra/cql3/Lists.java
@@ -128,8 +128,30 @@ public static <T> String listToString(Iterable<T> items, java.util.function.Func
     public static <T> AbstractType<?> getExactListTypeIfKnown(List<T> items,
                                                               java.util.function.Function<T, AbstractType<?>> mapper)
     {
-        Optional<AbstractType<?>> type = items.stream().map(mapper).filter(Objects::nonNull).findFirst();
-        return type.isPresent() ? ListType.getInstance(type.get(), false) : null;
+        AbstractType<?> type = getElementType(items, mapper);
+        return type != null ? ListType.getInstance(type, false) : null;
+    }
+
+    protected static <T> AbstractType<?> getElementType(List<T> items,
+                                                        java.util.function.Function<T, AbstractType<?>> mapper)
+    {
+        AbstractType<?> type = null;
+        for (T item : items)
+        {
+            AbstractType<?> itemType = mapper.apply(item);
+            if (itemType == null)
+                continue;
+
+            if (type != null && !itemType.isCompatibleWith(type))
+            {
+                if (type.isCompatibleWith(itemType))
+                    continue;
+
+                throw new InvalidRequestException("Invalid collection literal: all selectors must have the same CQL type inside collection literals");
+            }
+            type = itemType;
+        }
+        return type;
     }
 
     public static class Literal extends Term.Raw
diff --git a/src/java/org/apache/cassandra/cql3/Maps.java b/src/java/org/apache/cassandra/cql3/Maps.java
index 6e7e07b57601..a4c213c98a3a 100644
--- a/src/java/org/apache/cassandra/cql3/Maps.java
+++ b/src/java/org/apache/cassandra/cql3/Maps.java
@@ -134,16 +134,31 @@ public static <T> AbstractType<?> getExactMapTypeIfKnown(List<Pair<T, T>> entrie
         AbstractType<?> valueType = null;
         for (Pair<T, T> entry : entries)
         {
-            if (keyType == null)
-                keyType = mapper.apply(entry.left);
-            if (valueType == null)
-                valueType = mapper.apply(entry.right);
-            if (keyType != null && valueType != null)
-                return MapType.getInstance(keyType, valueType, false);
+            keyType = selectType(keyType, mapper.apply(entry.left));
+            valueType = selectType(valueType, mapper.apply(entry.right));
         }
+
+        if (keyType != null && valueType != null)
+            return MapType.getInstance(keyType, valueType, false);
+
         return null;
     }
 
+    private static <T> AbstractType<?> selectType(AbstractType<?> type, AbstractType<?> otherType)
+    {
+        if (otherType == null)
+            return type;
+
+        if (type != null && !otherType.isCompatibleWith(type))
+        {
+            if (type.isCompatibleWith(otherType))
+                return type;
+
+            throw new InvalidRequestException("Invalid collection literal: all selectors must have the same CQL type inside collection literals");
+        }
+        return otherType;
+    }
+
     public static class Literal extends Term.Raw
     {
         public final List<Pair<Term.Raw, Term.Raw>> entries;
diff --git a/src/java/org/apache/cassandra/cql3/Sets.java b/src/java/org/apache/cassandra/cql3/Sets.java
index aab4192587fb..e31841a583cc 100644
--- a/src/java/org/apache/cassandra/cql3/Sets.java
+++ b/src/java/org/apache/cassandra/cql3/Sets.java
@@ -119,8 +119,8 @@ public static <T> String setToString(Iterable<T> items, java.util.function.Funct
     public static <T> AbstractType<?> getExactSetTypeIfKnown(List<T> items,
                                                              java.util.function.Function<T, AbstractType<?>> mapper)
     {
-        Optional<AbstractType<?>> type = items.stream().map(mapper).filter(Objects::nonNull).findFirst();
-        return type.isPresent() ? SetType.getInstance(type.get(), false) : null;
+        AbstractType<?> type = Lists.getElementType(items, mapper);
+        return type != null ? SetType.getInstance(type, false) : null;
     }
 
     public static class Literal extends Term.Raw
diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
index de5360f52529..66759381ef5a 100644
--- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java
+++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java
@@ -114,6 +114,23 @@ default ColumnSpecification specForElementOrSlice(Selectable selected, ColumnSpe
         }
     }
 
+    /**
+     * Checks that this {@code Selectable} is or can be converted into the specified type.
+     * @param table the table schema
+     * @param type the expected type
+     * @throws InvalidRequestException if the {@code Selectable} can not be converted into the specified type
+     */
+    default void validateType(TableMetadata table, AbstractType<?> type)
+    {
+        ColumnSpecification receiver = new ColumnSpecification(table.keyspace,
+                                                               table.name,
+                                                               new ColumnIdentifier(toString(), true),
+                                                               type);
+
+        if (!testAssignment(table.keyspace, receiver).isAssignable())
+            throw invalidRequest("%s is not of the expected type: %s", this, type.asCQL3Type());
+    }
+
     public interface Raw
     {
         public Selectable prepare(TableMetadata table);
@@ -175,6 +192,8 @@ public Selector.Factory newSelectorFactory(TableMetadata table, AbstractType<?>
                 type = expectedType;
                 if (type == null)
                     throw new InvalidRequestException("Cannot infer type for term " + this + " in selection clause (try using a cast to force a type)");
+
+                validateType(table, type);
             }
 
             // The fact we default the name to "[selection]" inconditionally means that any bind marker in a
@@ -624,6 +643,7 @@ public Factory newSelectorFactory(TableMetadata cfm,
                 if (type == null)
                     throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)",
                                          this);
+                validateType(cfm, type);
             }
 
             if (selectables.size() == 1 && !type.isTuple())
@@ -742,6 +762,7 @@ public Factory newSelectorFactory(TableMetadata cfm,
                 if (type == null)
                     throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)",
                                          this);
+                validateType(cfm, type);
             }
 
             ListType<?> listType = (ListType<?>) type;
@@ -827,6 +848,7 @@ public Factory newSelectorFactory(TableMetadata cfm,
                 if (type == null)
                     throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)",
                                          this);
+                validateType(cfm, type);
             }
 
             // The parser treats empty Maps as Sets so if the type is a MapType we know that the Map is empty
@@ -931,6 +953,7 @@ public Factory newSelectorFactory(TableMetadata cfm,
                 if (type == null)
                     throw invalidRequest("Cannot infer type for term %s in selection clause (try using a cast to force a type)",
                                          this);
+                validateType(cfm, type);
             }
 
             if (type.isUDT())
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 2f88f7c5dcb1..1c9150e20981 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -77,6 +77,7 @@
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
@@ -1137,10 +1138,13 @@ public static void assertRows(UntypedResultSet result, Object[]... rows)
         int i = 0;
         while (iter.hasNext() && i < rows.length)
         {
+            if (rows[i] == null)
+                throw new IllegalArgumentException(String.format("Invalid expected value for row: %d. A row cannot be null.", i));
+
             Object[] expected = rows[i];
             UntypedResultSet.Row actual = iter.next();
 
-            Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected == null ? 1 : expected.length, meta.size());
+            Assert.assertEquals(String.format("Invalid number of (expected) values provided for row %d", i), expected.length, meta.size());
 
             for (int j = 0; j < meta.size(); j++)
             {
diff --git a/test/unit/org/apache/cassandra/cql3/MapsTest.java b/test/unit/org/apache/cassandra/cql3/MapsTest.java
new file mode 100644
index 000000000000..118485ad9191
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/MapsTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.function.Function;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.NumberType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.utils.Pair;
+
+public class MapsTest extends CQLTester
+{
+    private final Function<NumberType<?>, AbstractType<?>> identityMapper = integerType -> integerType;
+
+    @Rule
+    public ExpectedException thrown = ExpectedException.none();
+
+    @Test
+    public void testGetExactMapTypeIfKnownWithDifferentTypes()
+    {
+        thrown.expect(InvalidRequestException.class);
+        thrown.expectMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals");
+
+        Maps.getExactMapTypeIfKnown(ImmutableList.of(
+            Pair.create(Int32Type.instance, Int32Type.instance),
+            Pair.create(Int32Type.instance, IntegerType.instance)
+        ), identityMapper);
+    }
+
+    @Test
+    public void testGetExactMapTypeIfKnownWithTheSameTypes()
+    {
+        AbstractType<?> exactType = Maps.getExactMapTypeIfKnown(ImmutableList.of(
+            Pair.create(Int32Type.instance, Int32Type.instance),
+            Pair.create(Int32Type.instance, Int32Type.instance)
+        ), identityMapper);
+
+        AbstractType<?> expected = MapType.getInstance(Int32Type.instance, Int32Type.instance, false).freeze();
+        Assert.assertEquals(expected, exactType);
+    }
+
+    @Test
+    public void testGetExactMapTypeIfKnownWithoutTypes()
+    {
+        AbstractType<?> exactType = Maps.getExactMapTypeIfKnown(ImmutableList.of(), identityMapper);
+
+        Assert.assertNull(exactType);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java b/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java
index fb46809ff48b..ae907cf55a7a 100644
--- a/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/selection/TermSelectionTest.java
@@ -134,6 +134,10 @@ public void testSelectLiteral() throws Throwable
                    row(list(set(1), set(3))),
                    row(list(set(1), set(2))),
                    row(list(set(1), set(1))));
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT [{pk, t}, {ck}] FROM %s WHERE pk = 1");
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT [{pk}, {t}] FROM %s WHERE pk = 1");
 
         // Test Maps nested within Lists
         assertRows(execute("SELECT [{}, (map<text, int>){'min' : min(ck), 'max' : max(ck)}] FROM %s"),
@@ -154,10 +158,50 @@ public void testSelectLiteral() throws Throwable
                    row(list(tuple(1, 3, timestampInMicros))));
         assertRows(execute("SELECT [(min(ck), max(ck))] FROM %s"),
                    row(list(tuple(1, 3))));
-        assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, WRITETIME(t))] FROM %s"),
-                   row(list(tuple(1L, 1L), tuple("one", timestampInMicros))),
-                   row(list(tuple(1L, 2L), tuple("two", timestampInMicros))),
-                   row(list(tuple(1L, 3L), tuple("three", timestampInMicros))));
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t, writetime(t))] FROM %s");
+
+        assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT), t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))] FROM %s"),
+                   row(list(tuple(1L, 1L, "one"), tuple(1L, 1L))),
+                   row(list(tuple(1L, 2L, "two"), tuple(1L, 2L))),
+                   row(list(tuple(1L, 3L, "three"), tuple(1L, 3L))));
+
+        assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT), t)] FROM %s"),
+                   row(list(tuple(1L, 1L), tuple(1L, 1L, "one"))),
+                   row(list(tuple(1L, 2L), tuple(1L, 2L, "two"))),
+                   row(list(tuple(1L, 3L), tuple(1L, 3L, "three"))));
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT [(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))] FROM %s");
+
+        // list of tuples of tuples
+        assertRows(execute("SELECT [((t,t, t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT)))] FROM %s"),
+                   row(list(tuple(tuple("one", "one", "one"), tuple("one", "one", 1L)),
+                            tuple(tuple("one", "one"), tuple("one", "one", 1L)))),
+                   row(list(tuple(tuple("two", "two", "two"), tuple("two", "two", 2L)),
+                            tuple(tuple("two", "two"), tuple("two", "two", 2L)))),
+                   row(list(tuple(tuple("three", "three", "three"), tuple("three", "three", 3L)),
+                            tuple(tuple("three", "three"), tuple("three", "three", 3L)))));
+
+        assertRows(execute("SELECT [((t,t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT),t))] FROM %s"),
+                   row(list(tuple(tuple("one", "one"), tuple("one", "one", 1L)),
+                            tuple(tuple("one", "one"), tuple("one", "one", 1L, "one")))),
+                   row(list(tuple(tuple("two", "two"), tuple("two", "two", 2L)),
+                            tuple(tuple("two", "two"), tuple("two", "two", 2L, "two")))),
+                   row(list(tuple(tuple("three", "three"), tuple("three", "three", 3L)),
+                            tuple(tuple("three", "three"), tuple("three", "three", 3L, "three")))));
+
+        // single element tuple: tuple(t) incompatible with tuple(long, long)
+        assertInvalidMessage("(t) is not of the expected type: frozen<tuple<bigint, bigint>>",
+                             "SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t)] FROM %s");
+
+        assertInvalidMessage("(cast(ck as bigint)) is not of the expected type: frozen<tuple<text, text>>",
+                             "SELECT [(t, t), (CAST(ck AS BIGINT))] FROM %s");
+
+        // single element tuple: tuple(long) compatible with tuple(long, long)
+        assertRows(execute("SELECT [(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(ck AS BIGINT))] FROM %s"),
+                   row(list(tuple(1L, 1L), tuple(1L))),
+                   row(list(tuple(1L, 2L), tuple(2L))),
+                   row(list(tuple(1L, 3L), tuple(3L))));
 
         // Test UDTs nested within Lists
         String type = createType("CREATE TYPE %s(a int, b int, c bigint)");
@@ -189,6 +233,10 @@ public void testSelectLiteral() throws Throwable
                    row(set(list(1), list(3))));
         assertRows(execute("SELECT {([min(ck)]), [max(ck)]} FROM %s"),
                    row(set(list(1), list(3))));
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT {[min(ck), writetime(t)], [max(ck)]} FROM %s");
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT {[writetime(t)], [max(ck)]} FROM %s");
 
         // Test Sets nested within Sets
         assertRows(execute("SELECT {{}, {min(ck), max(ck)}} FROM %s"),
@@ -223,6 +271,45 @@ public void testSelectLiteral() throws Throwable
                    row(set(tuple(1, 3, timestampInMicros))));
         assertRows(execute("SELECT {(min(ck), max(ck))} FROM %s"),
                    row(set(tuple(1, 3))));
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT {(min(ck), max(ck)), (t, writetime(t))} FROM %s");
+
+        assertRows(execute("SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT), t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s"),
+                   row(set(tuple(1L, 1L, "one"), tuple(1L, 1L))),
+                   row(set(tuple(1L, 2L, "two"), tuple(1L, 2L))),
+                   row(set(tuple(1L, 3L, "three"), tuple(1L, 3L))));
+
+        assertRows(execute("SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT), t)} FROM %s"),
+                   row(set(tuple(1L, 1L), tuple(1L, 1L, "one"))),
+                   row(set(tuple(1L, 2L), tuple(1L, 2L, "two"))),
+                   row(set(tuple(1L, 3L), tuple(1L, 3L, "three"))));
+
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT {(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s");
+
+        assertInvalidMessage("Invalid collection literal: all selectors must have the same CQL type inside collection literals",
+                             "SELECT {(CAST(pk AS BIGINT), t, CAST(ck AS BIGINT)), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s");
+
+        // set of tuples of tuples
+        assertRows(execute("SELECT {((t,t, t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT)))} FROM %s"),
+                   row(set(tuple(tuple("one", "one", "one"), tuple("one", "one", 1L)),
+                           tuple(tuple("one", "one"), tuple("one", "one", 1L)))),
+                   row(set(tuple(tuple("two", "two", "two"), tuple("two", "two", 2L)),
+                           tuple(tuple("two", "two"), tuple("two", "two", 2L)))),
+                   row(set(tuple(tuple("three", "three", "three"), tuple("three", "three", 3L)),
+                           tuple(tuple("three", "three"), tuple("three", "three", 3L)))));
+
+        assertRows(execute("SELECT {((t,t), (t,t,CAST(ck AS BIGINT))), ((t,t), (t,t,CAST(ck AS BIGINT),t))} FROM %s"),
+                   row(set(tuple(tuple("one", "one"), tuple("one", "one", 1L)),
+                           tuple(tuple("one", "one"), tuple("one", "one", 1L, "one")))),
+                   row(set(tuple(tuple("two", "two"), tuple("two", "two", 2L)),
+                           tuple(tuple("two", "two"), tuple("two", "two", 2L, "two")))),
+                   row(set(tuple(tuple("three", "three"), tuple("three", "three", 3L)),
+                           tuple(tuple("three", "three"), tuple("three", "three", 3L, "three")))));
+
+        // getExactType for (t) is null
+        assertInvalidMessage("(t) is not of the expected type: frozen<tuple<bigint, bigint>>",
+                             "SELECT {(CAST(pk AS BIGINT), CAST(ck AS BIGINT)), (t), (CAST(pk AS BIGINT), CAST(ck AS BIGINT))} FROM %s");
 
         // Test UDTs nested within Sets
         assertRows(execute("SELECT {(" + type + "){a : min(ck), b: max(ck)}} FROM %s"),

From 47d1561b532e41c6de8517b8267b0292348e6762 Mon Sep 17 00:00:00 2001
From: Ruslan Fomkin <Ruslan.Fomkin@gmail.com>
Date: Wed, 2 Jun 2021 12:20:28 +0200
Subject: [PATCH 008/151] STAR-573 Make assassinate more resilient to missing
 tokens (#178)

Co-authored-by: Robert Stupp <snazy@snazy.de>
(cherry picked from commit 4109c7377fe92e55cb8751d66b28b7c2499e4669)
---
 src/java/org/apache/cassandra/gms/Gossiper.java        | 10 ++++++----
 .../org/apache/cassandra/locator/TokenMetadata.java    |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index 39fe1fe7287c..5dd121831008 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -762,7 +762,6 @@ public void assassinateEndpoint(String address) throws UnknownHostException
         InetAddressAndPort endpoint = InetAddressAndPort.getByName(address);
         runInGossipStageBlocking(() -> {
             EndpointState epState = endpointStateMap.get(endpoint);
-            Collection<Token> tokens;
             logger.warn("Assassinating {} via gossip", endpoint);
 
             if (epState == null)
@@ -787,6 +786,7 @@ else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
                 epState.getHeartBeatState().forceNewerGenerationUnsafe();
             }
 
+            Collection<Token> tokens = null;
             try
             {
                 tokens = StorageService.instance.getTokenMetadata().getTokens(endpoint);
@@ -794,8 +794,10 @@ else if (newState.getHeartBeatState().getHeartBeatVersion() != heartbeat)
             catch (Throwable th)
             {
                 JVMStabilityInspector.inspectThrowable(th);
-                // TODO this is broken
-                logger.warn("Unable to calculate tokens for {}.  Will use a random one", address);
+            }
+            if (tokens == null || tokens.isEmpty())
+            {
+                logger.warn("Trying to assassinate an endpoint {} that does not have any tokens assigned. This should not have happened, trying to continue with a random token.", address);
                 tokens = Collections.singletonList(StorageService.instance.getTokenMetadata().partitioner.getRandomToken());
             }
 
@@ -1014,7 +1016,7 @@ void doStatusCheck()
                             // to make sure that the previous read data was correct
                             logger.info("Race condition marking {} as a FatClient; ignoring", endpoint);
                             return;
-                        }                        
+                        }
                         removeEndpoint(endpoint); // will put it in justRemovedEndpoints to respect quarantine delay
                         evictFromMembership(endpoint); // can get rid of the state immediately
                     });
diff --git a/src/java/org/apache/cassandra/locator/TokenMetadata.java b/src/java/org/apache/cassandra/locator/TokenMetadata.java
index f2bbb9fe71eb..ab210457f0dd 100644
--- a/src/java/org/apache/cassandra/locator/TokenMetadata.java
+++ b/src/java/org/apache/cassandra/locator/TokenMetadata.java
@@ -564,11 +564,11 @@ public void removeFromMoving(InetAddressAndPort endpoint)
     public Collection<Token> getTokens(InetAddressAndPort endpoint)
     {
         assert endpoint != null;
-        assert isMember(endpoint); // don't want to return nulls
 
         lock.readLock().lock();
         try
         {
+            assert isMember(endpoint); // don't want to return nulls
             return new ArrayList<>(tokenToEndpointMap.inverse().get(endpoint));
         }
         finally

From f90677c7649ea7288fc5bc58e747513f09f815af Mon Sep 17 00:00:00 2001
From: Stefania <stefania.alborghetti@datastax.com>
Date: Tue, 22 Aug 2017 17:41:39 +0800
Subject: [PATCH 009/151] STAR-563: Fix SIGSEGVs on aborted flush

Previously, when a flush was aborted, e.g. by an exception thrown by
flushAllNonCFSBackedIndexesBlocking(), the abort was performed by closing
the flush writer, potentially concurrently with operations on it. The
latter is unsafe and may cause writes to released memory.

Fixed by adding an abort mechanism to the flush runnables.

Port of DB-962 with an earlier commit.

patch by Stefania Alborghetti; reviewed by Alex Petrov
ported by Branimir Lambov; reviewed by Ruslan Fomkin

(cherry picked from commit 8df735dd3b558dddeb927ac1f8b917ba4cab164c)
---
 .../cassandra/db/ColumnFamilyStore.java       |  20 +-
 .../org/apache/cassandra/db/Memtable.java     | 150 +++++++++++----
 .../cassandra/io/util/SequentialWriter.java   |   7 +
 .../org/apache/cassandra/db/MemtableTest.java | 172 ++++++++++++++++++
 4 files changed, 310 insertions(+), 39 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/db/MemtableTest.java

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 113a9164b4d0..6b49855ec2fe 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -95,6 +95,7 @@
 
 import static org.apache.cassandra.utils.Throwables.maybeFail;
 import static org.apache.cassandra.utils.Throwables.merge;
+import static org.apache.cassandra.utils.Throwables.perform;
 
 public class ColumnFamilyStore implements ColumnFamilyStoreMBean
 {
@@ -1132,12 +1133,29 @@ public Collection<SSTableReader> flushMemtable(Memtable memtable, boolean flushN
                     if (flushNonCf2i)
                         indexManager.flushAllNonCFSBackedIndexesBlocking();
 
+                    // It may be worthwhile to add an early abort mechanism here if one of the futures throws.
+                    // In such a case this code will run the other threads to completion and only then abort the operation.
                     flushResults = Lists.newArrayList(FBUtilities.waitOnFutures(futures));
                 }
                 catch (Throwable t)
                 {
-                    t = memtable.abortRunnables(flushRunnables, t);
+                    logger.error("Flushing {} failed with error", memtable.toString(), t);
+                    if (flushRunnables != null)
+                    {
+                        for (Memtable.FlushRunnable runnable : flushRunnables)
+                            t = runnable.abort(t);
+                    }
+
+                    // Wait for any flush runnables that were submitted (after aborting they should complete immediately).
+                    // This ensures that the writers are aborted by FlushRunnable.writeSortedContents(). In the worst
+                    // case we'll repeat the same exception twice if the initial exception was thrown whilst waiting
+                    // on a future.
+                    t = perform(t, () -> FBUtilities.waitOnFutures(futures));
+
+                    //finally abort the transaction
                     t = txn.abort(t);
+
+                    // and re-throw
                     throw Throwables.propagate(t);
                 }
 
diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
index 73c64169f712..3186ffb92aeb 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -75,6 +75,8 @@
 import org.apache.cassandra.utils.memory.NativePool;
 import org.apache.cassandra.utils.memory.SlabPool;
 
+import static org.apache.cassandra.utils.Throwables.maybeFail;
+
 public class Memtable implements Comparable<Memtable>
 {
     private static final Logger logger = LoggerFactory.getLogger(Memtable.class);
@@ -319,6 +321,12 @@ private List<FlushRunnable> createFlushRunnables(LifecycleTransaction txn)
         if (boundaries == null)
             return Collections.singletonList(new FlushRunnable(txn));
 
+        return createFlushRunnables(boundaries, locations, txn);
+    }
+
+    @VisibleForTesting
+    List<FlushRunnable> createFlushRunnables(List<PartitionPosition> boundaries, List<Directories.DataDirectory> locations, LifecycleTransaction txn)
+    {
         List<FlushRunnable> runnables = new ArrayList<>(boundaries.size());
         PartitionPosition rangeStart = cfs.getPartitioner().getMinimumToken().minKeyBound();
         try
@@ -333,16 +341,11 @@ private List<FlushRunnable> createFlushRunnables(LifecycleTransaction txn)
         }
         catch (Throwable e)
         {
-            throw Throwables.propagate(abortRunnables(runnables, e));
-        }
-    }
+            for (Memtable.FlushRunnable runnable : runnables)
+                e = runnable.abort(e);
 
-    public Throwable abortRunnables(List<FlushRunnable> runnables, Throwable t)
-    {
-        if (runnables != null)
-            for (FlushRunnable runnable : runnables)
-                t = runnable.writer.abort(t);
-        return t;
+            throw Throwables.propagate(e);
+        }
     }
 
     public String toString()
@@ -411,6 +414,22 @@ public void makeUnflushable()
         liveDataSize.addAndGet((long) 1024 * 1024 * 1024 * 1024 * 1024);
     }
 
+    /**
+     * The valid states for {@link FlushRunnable} writers. The thread writing the contents
+     * will transition from IDLE -> RUNNING and back to IDLE when finished using the writer
+     * or from ABORTING -> ABORTED if another thread has transitioned from RUNNING -> ABORTING.
+     * We can also transition directly from IDLE -> ABORTED. Whichever thread transitions
+     * to ABORTED is responsible for aborting the writer.
+     */
+    @VisibleForTesting
+    enum FlushRunnableWriterState
+    {
+        IDLE, // the runnable is idle, either not yet started or completed but with the writer waiting to be committed
+        RUNNING, // the runnable is executing, therefore the writer cannot be aborted or else a SEGV may ensue
+        ABORTING, // an abort request has been issued, this only happens if abort() is called whilst RUNNING
+        ABORTED  // the writer has been aborted, no resources will be leaked
+    }
+
     class FlushRunnable implements Callable<SSTableMultiWriter>
     {
         private final long estimatedSize;
@@ -423,6 +442,8 @@ class FlushRunnable implements Callable<SSTableMultiWriter>
         private final PartitionPosition from;
         private final PartitionPosition to;
 
+        private final AtomicReference<FlushRunnableWriterState> state;
+
         FlushRunnable(PartitionPosition from, PartitionPosition to, Directories.DataDirectory flushLocation, LifecycleTransaction txn)
         {
             this(partitions.subMap(from, to), flushLocation, from, to, txn);
@@ -439,6 +460,8 @@ class FlushRunnable implements Callable<SSTableMultiWriter>
             this.from = from;
             this.to = to;
             long keySize = 0;
+            state = new AtomicReference<>(FlushRunnableWriterState.IDLE);
+
             for (PartitionPosition key : toFlush.keySet())
             {
                 //  make sure we don't write non-sensical keys
@@ -456,7 +479,6 @@ class FlushRunnable implements Callable<SSTableMultiWriter>
                 writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getWriteableLocationAsFile(estimatedSize)), columnsCollector.get(), statsCollector.get());
             else
                 writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(flushLocation)), columnsCollector.get(), statsCollector.get());
-
         }
 
         protected Directories getDirectories()
@@ -466,44 +488,96 @@ protected Directories getDirectories()
 
         private void writeSortedContents()
         {
-            logger.info("Writing {}, flushed range = ({}, {}]", Memtable.this.toString(), from, to);
+            if (!state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.RUNNING))
+            {
+                logger.debug("Failed to write {}, flushed range = ({}, {}], state: {}",
+                             Memtable.this.toString(), from, to, state);
+                return;
+            }
+
+            logger.info("Writing {}, flushed range = ({}, {}], state: {}",
+                        Memtable.this.toString(), from, to, state);
 
-            boolean trackContention = logger.isTraceEnabled();
             int heavilyContendedRowCount = 0;
-            // (we can't clear out the map as-we-go to free up memory,
-            //  since the memtable is being used for queries in the "pending flush" category)
-            for (AtomicBTreePartition partition : toFlush.values())
+            try
             {
-                // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
-                // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local,
-                // we don't need to preserve tombstones for repair. So if both operation are in this
-                // memtable (which will almost always be the case if there is no ongoing failure), we can
-                // just skip the entry (CASSANDRA-4667).
-                if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
-                    continue;
-
-                if (trackContention && partition.useLock())
-                    heavilyContendedRowCount++;
-
-                if (!partition.isEmpty())
+                boolean trackContention = logger.isTraceEnabled();
+                // (we can't clear out the map as-we-go to free up memory,
+                //  since the memtable is being used for queries in the "pending flush" category)
+                for (AtomicBTreePartition partition : toFlush.values())
                 {
-                    try (UnfilteredRowIterator iter = partition.unfilteredIterator())
+                    if (state.get() == FlushRunnableWriterState.ABORTING)
+                        break;
+
+                    // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
+                    // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local,
+                    // we don't need to preserve tombstones for repair. So if both operations are in this
+                    // memtable (which will almost always be the case if there is no ongoing failure), we can
+                    // just skip the entry (CASSANDRA-4667).
+                    if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
+                        continue;
+
+                    if (trackContention && partition.useLock())
+                        heavilyContendedRowCount++;
+
+                    if (!partition.isEmpty())
                     {
-                        writer.append(iter);
+                        try (UnfilteredRowIterator iter = partition.unfilteredIterator())
+                        {
+                            writer.append(iter);
+                        }
                     }
                 }
             }
+            finally
+            {
+                while (true)
+                {
+                    if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.IDLE))
+                    {
+                        long bytesFlushed = writer.getFilePointer();
+                        logger.info("Completed flushing {} ({}) for commitlog position {}",
+                                    writer.getFilename(),
+                                    FBUtilities.prettyPrintMemory(bytesFlushed),
+                                    commitLogUpperBound);
+                        // Update the metrics
+                        cfs.metric.bytesFlushed.inc(bytesFlushed);
+
+                        if (heavilyContendedRowCount > 0)
+                            logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), Memtable.this);
+                        break;
+                    }
+                    else if (state.compareAndSet(FlushRunnableWriterState.ABORTING, FlushRunnableWriterState.ABORTED))
+                    {
+                        logger.debug("Flushing of {} aborted", writer.getFilename());
+                        maybeFail(writer.abort(null));
+                        break;
+                    }
+                }
+            }
+        }
 
-            long bytesFlushed = writer.getFilePointer();
-            logger.info("Completed flushing {} ({}) for commitlog position {}",
-                         writer.getFilename(),
-                         FBUtilities.prettyPrintMemory(bytesFlushed),
-                         commitLogUpperBound);
-            // Update the metrics
-            cfs.metric.bytesFlushed.inc(bytesFlushed);
+        public Throwable abort(Throwable throwable)
+        {
+            while (true)
+            {
+                if (state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.ABORTED))
+                {
+                    logger.debug("Flushing of {} aborted", writer.getFilename());
+                    return writer.abort(throwable);
+                }
+                else if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.ABORTING))
+                {
+                    // thread currently executing writeSortedContents() will take care of aborting and throw any exceptions
+                    return throwable;
+                }
+            }
+        }
 
-            if (heavilyContendedRowCount > 0)
-                logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), Memtable.this);
+        @VisibleForTesting
+        FlushRunnableWriterState state()
+        {
+            return state.get();
         }
 
         public SSTableMultiWriter createFlushWriter(LifecycleTransaction txn,
diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index 9ad944be3bc0..a17135621786 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
@@ -384,6 +384,13 @@ public final Throwable commit(Throwable accumulate)
         return txnProxy.commit(accumulate);
     }
 
+    /**
+     * Stop the operation after errors, i.e. close and release all held resources.
+     *
+     * Do not use this to interrupt a write operation running in another thread.
+     * This is thread-unsafe, releasing and cleaning the buffer while it is being written can have disastrous
+     * consequences (e.g. SIGSEGV).
+     */
     public final Throwable abort(Throwable accumulate)
     {
         return txnProxy.abort(accumulate);
diff --git a/test/unit/org/apache/cassandra/db/MemtableTest.java b/test/unit/org/apache/cassandra/db/MemtableTest.java
new file mode 100644
index 000000000000..63b27ed19932
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/MemtableTest.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.Semaphore;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
+
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMUnitConfig;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+@RunWith(BMUnitRunner.class)
+@BMUnitConfig(debug = true)
+public class MemtableTest extends CQLTester
+{
+    List<PartitionPosition> ranges;
+    List<Directories.DataDirectory> locations;
+    ColumnFamilyStore cfs;
+    Memtable memtable;
+    ExecutorService executor;
+    int nThreads;
+
+    @Before
+    public void setup() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int PRIMARY KEY, value int)");
+
+        for (int i = 0; i < 10000; i++)
+            execute("INSERT INTO %s (pk, value) VALUES (?, ?)", i, i);
+
+        cfs = getCurrentColumnFamilyStore();
+        memtable = cfs.getTracker().getView().getCurrentMemtable();
+
+        OpOrder.Barrier barrier = cfs.keyspace.writeOrder.newBarrier();
+        memtable.setDiscarding(barrier, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
+        barrier.issue();
+
+        ranges = new ArrayList<>();
+        locations = new ArrayList<>();
+        // this determines the number of flush writers created, the FlushRunnable will convert a null location into an sstable location for us
+        int rangeCount = 24;
+        for (int i = 0; i < rangeCount; ++i)
+        {
+            // split the range to ensure there are partitions to write
+            ranges.add(cfs.getPartitioner().split(cfs.getPartitioner().getMinimumToken(),
+                                                  cfs.getPartitioner().getMaximumToken(),
+                                                  (i+1) * 1.0 / rangeCount).minKeyBound());
+            locations.add(null);
+        }
+        nThreads = locations.size() / 2;
+        executor = Executors.newFixedThreadPool(nThreads);
+    }
+
+    @Test
+    public void testAbortingFlushRunnablesWithoutStarting() throws Throwable
+    {
+        // abort without starting
+        try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
+        {
+            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+            assertNotNull(flushRunnables);
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Memtable.FlushRunnableWriterState.IDLE, flushRunnable.state());
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertNull(flushRunnable.abort(null));
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+        }
+    }
+
+    static Semaphore stopSignal = null;
+    static Semaphore continueSignal;
+
+    public static void stopAndWait() throws InterruptedException
+    {
+        if (stopSignal != null)
+        {
+            stopSignal.release();
+            continueSignal.acquire();
+        }
+    }
+
+    @Test
+    @BMRule(name = "Wait before loop",
+    targetClass = "Memtable$FlushRunnable",
+    targetMethod = "writeSortedContents",
+    targetLocation = "AT INVOKE Logger.isTraceEnabled()",
+    action = "org.apache.cassandra.db.MemtableTest.stopAndWait()")
+    public void testAbortingFlushRunnablesAfterStarting() throws Throwable
+    {
+        // abort after starting
+        try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
+        {
+            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+
+            stopSignal = new Semaphore(0);
+            continueSignal = new Semaphore(0);
+
+            List<Future<SSTableMultiWriter>> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList());
+
+            stopSignal.acquire(nThreads);
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertNull(flushRunnable.abort(null));
+            continueSignal.release(flushRunnables.size());  // release all, including the ones that have not started yet
+
+            FBUtilities.waitOnFutures(futures);
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+        }
+    }
+
+    @Test
+    public void testAbortingFlushRunnablesBeforeStarting() throws Throwable
+    {
+        // abort before starting
+        try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
+        {
+            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertNull(flushRunnable.abort(null));
+
+            List<Future<SSTableMultiWriter>> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList());
+
+            FBUtilities.waitOnFutures(futures);
+
+            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+        }
+    }
+}

From 12db5848f0477db8945c79ec23537dcdb9881180 Mon Sep 17 00:00:00 2001
From: Zhao Yang <zhaoyangsingapore@gmail.com>
Date: Wed, 4 Oct 2017 05:14:44 -0500
Subject: [PATCH 010/151] STAR-566: Test for wrap-around in
 estimatedKeysForRanges

This ports the up-to-date version of the test introduced by DB-1157

patch by Zhao Yang; reviewed by Branimir Lambov
ported by Branimir Lambov

(cherry picked from commit beec1105e96f5f025d61bbe35088e518820c6e00)
---
 .../sstable/format/big/BigTableScanner.java   |  2 +-
 .../io/sstable/SSTableReaderTest.java         | 86 ++++++++++++++++++-
 2 files changed, 86 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
index 20105cd1e14c..6644b3b8cff1 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
@@ -164,7 +164,7 @@ private static void addRange(SSTableReader sstable, AbstractBounds<PartitionPosi
         }
         else
         {
-            assert requested.left.compareTo(requested.right) <= 0 || requested.right.isMinimum();
+            assert !AbstractBounds.strictlyWrapsAround(requested.left, requested.right);
             Boundary<PartitionPosition> left, right;
             left = requested.leftBoundary();
             right = requested.rightBoundary();
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 0b64028c4d72..6ba942a7be1d 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -26,6 +26,7 @@
 import java.util.concurrent.*;
 
 import com.google.common.collect.Sets;
+import org.junit.After;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -44,6 +45,7 @@
 import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
 import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.LocalPartitioner.LocalToken;
 import org.apache.cassandra.dht.Range;
@@ -90,7 +92,9 @@ public static void defineSchema() throws Exception
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2)
+                                                .minIndexInterval(8)
+                                                .maxIndexInterval(8),  // ensure close key count estimation
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_COMPRESSED).compression(CompressionParams.DEFAULT),
                                     SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED, true),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL)
@@ -99,6 +103,12 @@ public static void defineSchema() throws Exception
                                                 .caching(CachingParams.CACHE_NOTHING));
     }
 
+    @After
+    public void Cleanup() {
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD).truncateBlocking();
+        Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2).truncateBlocking();
+    }
+
     @Test
     public void testGetPositionsForRanges()
     {
@@ -140,6 +150,80 @@ public void testGetPositionsForRanges()
         }
     }
 
+    @Test
+    public void testEstimatedKeysForRangesAndKeySamples()
+    {
+        // prepare data
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard2");
+        partitioner = store.getPartitioner();
+
+        Random random = new Random();
+        List<Token> tokens = new ArrayList<>();
+        tokens.add(partitioner.getMinimumToken());
+        if (partitioner.splitter().isPresent())
+            tokens.add(partitioner.getMaximumToken());
+
+        for (int j = 0; j < 100; j++)
+        {
+            Mutation mutation = new RowUpdateBuilder(store.metadata(), j, String.valueOf(random.nextInt())).clustering("0")
+                                                                                                           .add("val",
+                                                                                                                ByteBufferUtil.EMPTY_BYTE_BUFFER)
+                                                                                                           .build();
+            if (j % 4 != 0) // skip some keys
+                mutation.applyUnsafe();
+            tokens.add(mutation.key().getToken());
+        }
+
+        store.forceBlockingFlush();
+        assertEquals(1, store.getLiveSSTables().size());
+        SSTableReader sstable = store.getLiveSSTables().iterator().next();
+
+        // verify any combination of start and end point among the keys we have, which includes empty, full and
+        // wrap-around ranges
+        for (int i = 0; i < tokens.size(); i++)
+            for (int j = 0; j < tokens.size(); j++)
+            {
+                verifyEstimatedKeysAndKeySamples(sstable, new Range<Token>(tokens.get(i), tokens.get(j)));
+            }
+    }
+
+    private void verifyEstimatedKeysAndKeySamples(SSTableReader sstable, Range<Token> range)
+    {
+        List<DecoratedKey> expectedKeys = new ArrayList<>();
+        try (ISSTableScanner scanner = sstable.getScanner())
+        {
+            while (scanner.hasNext())
+            {
+                try (UnfilteredRowIterator rowIterator = scanner.next())
+                {
+                    if (range.contains(rowIterator.partitionKey().getToken()))
+                        expectedKeys.add(rowIterator.partitionKey());
+                }
+            }
+        }
+
+        // check estimated key
+        long estimated = sstable.estimatedKeysForRanges(Collections.singleton(range));
+        assertTrue("Range: " + range + " having " + expectedKeys.size() + " partitions, but estimated "
+                   + estimated, closeEstimation(expectedKeys.size(), estimated));
+
+        // check key samples
+        List<DecoratedKey> sampledKeys = new ArrayList<>();
+        sstable.getKeySamples(range).forEach(sampledKeys::add);
+
+        assertTrue("Range: " + range + " having " + expectedKeys + " keys, but keys sampled: "
+                   + sampledKeys, expectedKeys.containsAll(sampledKeys));
+        // no duplicate
+        assertEquals(expectedKeys.size(), expectedKeys.stream().distinct().count());
+        assertEquals(sampledKeys.size(), sampledKeys.stream().distinct().count());
+    }
+
+    private boolean closeEstimation(long expected, long estimated)
+    {
+        return expected <= estimated + 16 && expected >= estimated - 16;
+    }
+
     @Test
     public void testSpannedIndexPositions() throws IOException
     {

From b8eb62df7dbedb58ab4ef5624cb75e2cb507c373 Mon Sep 17 00:00:00 2001
From: Jakub Zytka <jakub.zytka@datastax.com>
Date: Wed, 2 Jun 2021 17:28:44 +0200
Subject: [PATCH 011/151] STAR-692: test exposing Date type overflow when using
 functions

(cherry picked from commit 3dcd047cb2f6cd0a39fbe547372a73142d80e71c)
---
 .../cassandra/cql3/functions/OperationFctsTest.java    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java b/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java
index c8ee9352e944..b8a8e37d75d7 100644
--- a/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/functions/OperationFctsTest.java
@@ -850,6 +850,16 @@ public void testOperationsWithDuration() throws Throwable
                              "SELECT time / 10m FROM %s WHERE pk = 1");
         assertInvalidMessage("the operation 'date - duration' failed: The duration must have a day precision. Was: 10m",
                              "SELECT * FROM %s WHERE pk = 1 AND time > ? - 10m", toDate("2016-10-04"));
+
+        // test overflow errors
+        assertInvalidMessage("is greater than max supported date",
+                             "INSERT INTO %s (pk, time, v) VALUES (2, '+5881581-01-01', 7)");
+        assertInvalidMessage("is greater than max supported date",
+                             "INSERT INTO %s (pk, time, v) VALUES (4, '+5881580-01-01' + 1y, 9)");
+        assertInvalidMessage("is less than min supported date",
+                             "INSERT INTO %s (pk, time, v) VALUES (3, '-5877642-01-01', 8)");
+        assertInvalidMessage("is less than min supported date",
+                             "INSERT INTO %s (pk, time, v) VALUES (5, '-5877640-01-01' - 2y, 10)");
     }
 
     private Date toTimestamp(String timestampAsString)

From 2a9a89b6e7e7da4a3653da78449daa14ef462e00 Mon Sep 17 00:00:00 2001
From: Jakub Zytka <jakub.zytka@datastax.com>
Date: Wed, 2 Jun 2021 17:42:24 +0200
Subject: [PATCH 012/151] STAR-692: protect Date type from overflows regardless
 of whether it is constructed from date string or from millis since epoch

Co-authored-by: Ulises Cervino Beresi <ulises.cervino@datastax.com>
(cherry picked from commit 7bcbf8444f853c812fc21fa9187867f17ddd5c4c)
---
 .../serializers/SimpleDateSerializer.java     | 26 +++++++++++++------
 .../serializers/SimpleDateSerializerTest.java | 13 ++++++++++
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java b/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java
index c367705fc7d3..4bcb2f94dbab 100644
--- a/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/SimpleDateSerializer.java
@@ -70,14 +70,7 @@ public static int dateStringToDays(String source) throws MarshalException
         {
             LocalDate parsed = formatter.parse(source, LocalDate::from);
             long millis = parsed.atStartOfDay(UTC).toInstant().toEpochMilli();
-            if (millis < minSupportedDateMillis)
-                throw new MarshalException(String.format("Input date %s is less than min supported date %s", source,
-                        ZonedDateTime.ofInstant(Instant.ofEpochMilli(minSupportedDateMillis), UTC).toString()));
-            if (millis > maxSupportedDateMillis)
-                throw new MarshalException(String.format("Input date %s is greater than max supported date %s", source,
-                        ZonedDateTime.ofInstant(Instant.ofEpochMilli(maxSupportedDateMillis), UTC).toString()));
-
-            return timeInMillisToDay(millis);
+            return timeInMillisToDay(source, millis);
         }
         catch (DateTimeParseException| ArithmeticException e1)
         {
@@ -107,6 +100,23 @@ private static int parseRaw(String source) {
 
     public static int timeInMillisToDay(long millis)
     {
+        return timeInMillisToDay(null, millis);
+    }
+
+    private static int timeInMillisToDay(String source, long millis)
+    {
+        if (millis < minSupportedDateMillis)
+        {
+            throw new MarshalException(String.format("Input date %s is less than min supported date %s",
+                                                     null == source ? ZonedDateTime.ofInstant(Instant.ofEpochMilli(millis), UTC).toLocalDate() : source,
+                                                     ZonedDateTime.ofInstant(Instant.ofEpochMilli(minSupportedDateMillis), UTC).toLocalDate()));
+        }
+        if (millis > maxSupportedDateMillis)
+        {
+            throw new MarshalException(String.format("Input date %s is greater than max supported date %s",
+                                                     null == source ? ZonedDateTime.ofInstant(Instant.ofEpochMilli(millis), UTC).toLocalDate() : source,
+                                                     ZonedDateTime.ofInstant(Instant.ofEpochMilli(maxSupportedDateMillis), UTC).toLocalDate()));
+        }
         return (int) (Duration.ofMillis(millis).toDays() - Integer.MIN_VALUE);
     }
 
diff --git a/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java b/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java
index 9c1ef886f96d..8502fbde7659 100644
--- a/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java
+++ b/test/unit/org/apache/cassandra/serializers/SimpleDateSerializerTest.java
@@ -26,6 +26,7 @@
 import java.text.SimpleDateFormat;
 import java.time.temporal.ChronoUnit;
 import java.util.*;
+import java.util.concurrent.TimeUnit;
 
 public class SimpleDateSerializerTest
 {
@@ -152,4 +153,16 @@ public void testBadDayToMonth()
     {
         Integer days = SimpleDateSerializer.dateStringToDays("1000-09-31");
     }
+
+    @Test(expected = MarshalException.class)
+    public void testOutOfBoundsHighMillis()
+    {
+        SimpleDateSerializer.timeInMillisToDay(TimeUnit.DAYS.toMillis(Integer.MAX_VALUE) + 1);
+    }
+
+    @Test(expected = MarshalException.class)
+    public void testOutOfBoundsLowMillis()
+    {
+        SimpleDateSerializer.timeInMillisToDay(TimeUnit.DAYS.toMillis(Integer.MIN_VALUE) - 1L);
+    }
 }

From 58d49ab93f70d22df6772c8f708803a782d368d5 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Mon, 7 Jun 2021 15:05:36 +0200
Subject: [PATCH 013/151] STAR-749: Fixed ArrayIndexOutOfBoundsException in
 FunctionResource#fromName (#168)

The problematic syntax was a function name with an empty argument list.

Co-authored-by: kamlesh ghoradkar <kamlesh_ghoradkar@persistent.com>
(cherry picked from commit 5d7df12d037759b7410fab182c49e60f6abde03e)
---
 .../cassandra/auth/FunctionResource.java      |   3 +-
 .../cassandra/auth/FunctionResourceTest.java  | 106 ++++++++++++++++++
 2 files changed, 108 insertions(+), 1 deletion(-)
 create mode 100644 test/unit/org/apache/cassandra/auth/FunctionResourceTest.java

diff --git a/src/java/org/apache/cassandra/auth/FunctionResource.java b/src/java/org/apache/cassandra/auth/FunctionResource.java
index 61c6a2966694..d47c019cf543 100644
--- a/src/java/org/apache/cassandra/auth/FunctionResource.java
+++ b/src/java/org/apache/cassandra/auth/FunctionResource.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.auth;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Optional;
 import java.util.Set;
@@ -187,7 +188,7 @@ public static FunctionResource fromName(String name)
             return keyspace(parts[1]);
 
         String[] nameAndArgs = StringUtils.split(parts[2], "[|]");
-        return function(parts[1], nameAndArgs[0], argsListFromString(nameAndArgs[1]));
+        return function(parts[1], nameAndArgs[0], nameAndArgs.length > 1 ? argsListFromString(nameAndArgs[1]) : Collections.emptyList());
     }
 
     /**
diff --git a/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java b/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java
new file mode 100644
index 000000000000..89863958b8f8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/auth/FunctionResourceTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.auth;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.TypeParser;
+
+import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
+import static org.junit.Assert.assertEquals;
+
+public class FunctionResourceTest
+{
+    private static final String ks = "fr_ks";
+    private static final String func = "functions";
+    private static final String name = "concat";
+    private static final String varType = "org.apache.cassandra.db.marshal.UTF8Type";
+
+    @Test
+    public void testFunction() throws Exception
+    {
+        FunctionResource expected = FunctionResource.root();
+        FunctionResource actual = FunctionResource.fromName(func);
+        assertEquals(expected, actual);
+        assertEquals(expected.getName(), actual.getName());
+    }
+
+    @Test
+    public void testFunctionKeyspace() throws Exception
+    {
+        FunctionResource expected = FunctionResource.keyspace(ks);
+        FunctionResource actual = FunctionResource.fromName(String.format("%s/%s", func, ks));
+        assertEquals(expected, actual);
+        assertEquals(expected.getKeyspace(), actual.getKeyspace());
+    }
+
+    @Test
+    public void testFunctionWithSingleInputParameter() throws Exception
+    {
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        argTypes.add(TypeParser.parse(varType));
+        FunctionResource expected = FunctionResource.function(ks, name, argTypes);
+        FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[%s]", func, ks, name, varType));
+        assertEquals(expected, actual);
+        assertEquals(expected.getKeyspace(), actual.getKeyspace());
+    }
+
+    @Test
+    public void testFunctionWithMultipleInputParameter() throws Exception
+    {
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        argTypes.add(TypeParser.parse(varType));
+        argTypes.add(TypeParser.parse(varType));
+        FunctionResource expected = FunctionResource.function(ks, name, argTypes);
+        FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[%s^%s]", func, ks, name, varType, varType));
+        assertEquals(expected, actual);
+        assertEquals(expected.getKeyspace(), actual.getKeyspace());
+    }
+
+    @Test
+    public void testFunctionWithoutInputParameter() throws Exception
+    {
+        List<AbstractType<?>> argTypes = new ArrayList<>();
+        FunctionResource expected = FunctionResource.function(ks, name, argTypes);
+        FunctionResource actual = FunctionResource.fromName(String.format("%s/%s/%s[]", func, ks, name));
+        assertEquals(expected, actual);
+        assertEquals(expected.getKeyspace(), actual.getKeyspace());
+    }
+
+    @Test
+    public void testInvalidFunctionName()
+    {
+        String expected = "functions_test is not a valid function resource name";
+        assertThatExceptionOfType(IllegalArgumentException.class)
+            .describedAs(expected)
+            .isThrownBy(() -> FunctionResource.fromName("functions_test"));
+    }
+
+    @Test
+    public void testFunctionWithInvalidInput()
+    {
+        String expected = String.format("%s/%s/%s[%s]/test is not a valid function resource name", func, ks, name, varType);
+        assertThatExceptionOfType(IllegalArgumentException.class)
+            .describedAs(expected)
+            .isThrownBy(() -> FunctionResource.fromName(String.format("%s/%s/%s[%s]/test", func, ks, name, varType)));
+    }
+}
\ No newline at end of file

From 90d17f0473d67a3d58fb558e0a731a656472770b Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 10 Jun 2021 17:27:39 -0700
Subject: [PATCH 014/151] STAR-578 avoid copying EMPTY_STATIC_ROW to heap with
 offheap memtable(#523) (#188)

patch by Zhao Yang; reviewed by Robert Stupp for DB-1375

Co-authored-by: Zhao Yang <zhaoyangsingapore@gmail.com>
(cherry picked from commit d09bc6624a657b3e0a37ce7f4c1b5e6a0fc82c91)
---
 src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
index 71c17934cbd6..6893fb0ac985 100644
--- a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
+++ b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
@@ -57,8 +57,9 @@ public DecoratedKey applyToPartitionKey(DecoratedKey key)
 
         public Row applyToRow(Row row)
         {
-            if (row == null)
-                return null;
+            // If current "row" is Rows.EMPTY_STATIC_ROW, don't copy it again, as "copied_empty_static_row" != EMPTY_STATIC_ROW
+            if (row == null || row == Rows.EMPTY_STATIC_ROW)
+                return row;
             return Rows.copy(row, HeapAllocator.instance.cloningBTreeRowBuilder()).build();
         }
 

From 18a8ce994d87ec242639fc46d3750198bd94b46a Mon Sep 17 00:00:00 2001
From: Ruslan Fomkin <Ruslan.Fomkin@gmail.com>
Date: Fri, 11 Jun 2021 11:10:57 +0200
Subject: [PATCH 015/151] STAR-582 avoid assertion when repairing 1 node
 cluster (#185)

Porting patch DB-1511, riptano/apollo#627

Co-authored-by: Zhao Yang <zhaoyangsingapore@gmail.com>
(cherry picked from commit 998eca09654c6f571c6479333d8bbef3a74a7707)
---
 src/java/org/apache/cassandra/repair/RepairRunnable.java  | 1 +
 src/java/org/apache/cassandra/service/StorageService.java | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/repair/RepairRunnable.java b/src/java/org/apache/cassandra/repair/RepairRunnable.java
index 5ada11661ede..fa64b85c0f2e 100644
--- a/src/java/org/apache/cassandra/repair/RepairRunnable.java
+++ b/src/java/org/apache/cassandra/repair/RepairRunnable.java
@@ -326,6 +326,7 @@ private NeighborsAndRanges getNeighborsAndRanges()
             EndpointsForRange neighbors = ActiveRepairService.getNeighbors(keyspace, keyspaceLocalRanges, range,
                                                                            options.getDataCenters(),
                                                                            options.getHosts());
+            // If the local RF is 1 or the given range is not part of the local ranges, neighbors will be empty.
             if (neighbors.isEmpty())
             {
                 if (options.ignoreUnreplicatedKeyspaces())
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index a8bbef76c595..92b57d9b186b 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -4001,8 +4001,11 @@ else if (option.isInLocalDCOnly())
                 Iterables.addAll(option.getRanges(), getLocalReplicas(keyspace).onlyFull().ranges());
             }
         }
-        if (option.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor().allReplicas < 2)
+        if (option.getRanges().isEmpty() || Keyspace.open(keyspace).getReplicationStrategy().getReplicationFactor().allReplicas < 2
+            || tokenMetadata.getAllEndpoints().size() < 2)
+        {
             return Pair.create(0, Futures.immediateFuture(null));
+        }
 
         int cmd = nextRepairCommand.incrementAndGet();
         return Pair.create(cmd, ActiveRepairService.repairCommandExecutor().submit(createRepairTask(cmd, keyspace, option, listeners)));

From 5d10d16c88458a1ae655e59575658b7cd74c3ea6 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Mon, 14 Jun 2021 11:12:38 +0200
Subject: [PATCH 016/151] STAR-571 fix *SnitchTests (#197)

Migrate to the race-free commit log initialization implemented by
ServerTestUtils. Previously, files were removed by the cleanup that
followed commit log initialization.

(cherry picked from commit 37e2e3d52aae414c521110efdf5bf1bb20c37fad)
---
 .../org/apache/cassandra/locator/AlibabaCloudSnitchTest.java | 5 ++---
 .../org/apache/cassandra/locator/CloudstackSnitchTest.java   | 5 ++---
 test/unit/org/apache/cassandra/locator/EC2SnitchTest.java    | 5 ++---
 .../org/apache/cassandra/locator/GoogleCloudSnitchTest.java  | 5 ++---
 4 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java
index fb85a23f4732..809037e283d9 100644
--- a/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/AlibabaCloudSnitchTest.java
@@ -25,6 +25,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
@@ -47,9 +48,7 @@ public static void setup() throws Exception
     {
         System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.daemonInitialization();
-        CommitLog.instance.start();
-        mkdirs();
-        cleanup();
+        ServerTestUtils.cleanupAndLeaveDirs();
         Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }
diff --git a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
index 9e39c48abde0..7f7a07e3f374 100644
--- a/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/CloudstackSnitchTest.java
@@ -26,6 +26,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
@@ -48,9 +49,7 @@ public static void setup() throws Exception
     {
         System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.daemonInitialization();
-        CommitLog.instance.start();
-        mkdirs();
-        cleanup();
+        ServerTestUtils.cleanupAndLeaveDirs();
         Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }
diff --git a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
index f05f4a274b20..0cc819c609d4 100644
--- a/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/EC2SnitchTest.java
@@ -31,6 +31,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
@@ -62,9 +63,7 @@ public static void setup() throws Exception
     {
         System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.daemonInitialization();
-        CommitLog.instance.start();
-        mkdirs();
-        cleanup();
+        ServerTestUtils.cleanupAndLeaveDirs();
         Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }
diff --git a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
index e524f3a531e6..5a8589022241 100644
--- a/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
+++ b/test/unit/org/apache/cassandra/locator/GoogleCloudSnitchTest.java
@@ -27,6 +27,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
@@ -49,9 +50,7 @@ public static void setup() throws Exception
     {
         System.setProperty(Gossiper.Props.DISABLE_THREAD_VALIDATION, "true");
         DatabaseDescriptor.daemonInitialization();
-        CommitLog.instance.start();
-        mkdirs();
-        cleanup();
+        ServerTestUtils.cleanupAndLeaveDirs();
         Keyspace.setInitialized();
         StorageService.instance.initServer(0);
     }

From a75b01a98120636d051f53301f82b4fff08c9fb1 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Tue, 15 Jun 2021 19:31:44 +0200
Subject: [PATCH 017/151] STAR-745: Recreate BF on fp chance change and limit
 the total memory usage (#177)

* STAR-745: Add BloomFilter global memory limit

Added a memory limiter which is a global instance (static field in BloomFilter) which monitors
the total memory used by all created (and deserialized) Bloom filters. When the limit is reached
we return a dummy always-true filter and log an error message.

* STAR-745: Recreate Bloom filter on startup

In certain situations BF will be recreated on startup (for example, if FP chance changed
more than the defined tolerance).

Reviewed by: Daniel Jatnieks <jatnieks@pobox.com>

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
(cherry picked from commit c1a2595e556cb325fe915febd6c57b77955a09d8)
---
 .../cassandra/db/compaction/Verifier.java     |   4 +-
 .../io/sstable/format/SSTableReader.java      |   2 +-
 .../sstable/format/SSTableReaderBuilder.java  | 109 +++++++----
 .../io/sstable/format/big/BigTableWriter.java |   2 +-
 .../sstable/metadata/IMetadataSerializer.java |   7 +
 .../sstable/metadata/MetadataSerializer.java  |   8 +
 .../apache/cassandra/utils/BloomFilter.java   |  46 ++++-
 .../utils/BloomFilterSerializer.java          |  37 +++-
 .../apache/cassandra/utils/FilterFactory.java |  43 +++--
 .../cassandra/utils/obs/MemoryLimiter.java    |  72 ++++++++
 .../cassandra/utils/obs/OffHeapBitSet.java    |  51 +++--
 .../BloomFilterSerializerBench.java           |   4 +-
 test/unit/org/apache/cassandra/Util.java      |  26 ++-
 .../io/sstable/SSTableReaderTest.java         | 148 ++++++++++++++-
 .../cassandra/utils/BloomFilterTest.java      | 174 ++++++++++++++----
 .../cassandra/utils/SerializationsTest.java   |   8 +-
 .../utils/obs/OffHeapBitSetTest.java          |  25 +--
 17 files changed, 634 insertions(+), 132 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java

diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java
index 30e74adb4b35..68d5163e4d85 100644
--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
+++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java
@@ -43,7 +43,7 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.BloomFilterSerializer;
+import org.apache.cassandra.utils.BloomFilter;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.IFilter;
@@ -448,7 +448,7 @@ private void deserializeBloomFilter(SSTableReader sstable) throws IOException
         if (Files.exists(bfPath))
         {
             try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(bfPath)));
-                 IFilter bf = BloomFilterSerializer.deserialize(stream, sstable.descriptor.version.hasOldBfFormat()))
+                 IFilter bf = BloomFilter.serializer.deserialize(stream, sstable.descriptor.version.hasOldBfFormat()))
             {
             }
         }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index ea40f34f265b..258b004871bf 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -740,7 +740,7 @@ public static void saveBloomFilter(Descriptor descriptor, IFilter filter)
         File filterFile = new File(descriptor.filenameFor(Component.FILTER));
         try (DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(new FileOutputStream(filterFile)))
         {
-            BloomFilterSerializer.serialize((BloomFilter) filter, stream);
+            BloomFilter.serializer.serialize((BloomFilter) filter, stream);
             stream.flush();
         }
         catch (IOException e)
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
index 8fe1deff9e4e..e5abcf834e48 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
 import org.apache.cassandra.io.util.DiskOptimizationStrategy;
@@ -43,10 +44,13 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
+import com.google.common.collect.ImmutableMap;
+
 public abstract class SSTableReaderBuilder
 {
     private static final Logger logger = LoggerFactory.getLogger(SSTableReaderBuilder.class);
@@ -89,30 +93,6 @@ public SSTableReaderBuilder(Descriptor descriptor,
 
     public abstract SSTableReader build();
 
-    public SSTableReaderBuilder dfile(FileHandle dfile)
-    {
-        this.dfile = dfile;
-        return this;
-    }
-
-    public SSTableReaderBuilder ifile(FileHandle ifile)
-    {
-        this.ifile = ifile;
-        return this;
-    }
-
-    public SSTableReaderBuilder bf(IFilter bf)
-    {
-        this.bf = bf;
-        return this;
-    }
-
-    public SSTableReaderBuilder summary(IndexSummary summary)
-    {
-        this.summary = summary;
-        return this;
-    }
-
     /**
      * Load index summary, first key and last key from Summary.db file if it exists.
      *
@@ -226,17 +206,29 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter,
         }
     }
 
-    /**
-     * Load bloom filter from Filter.db file.
-     *
-     * @throws IOException
-     */
-    IFilter loadBloomFilter() throws IOException
+    public static IFilter loadBloomFilter(Path path, boolean oldFormat)
     {
-        try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(descriptor.filenameFor(Component.FILTER))))))
+        if (Files.exists(path))
+        {
+            IFilter filter = null;
+            try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(path))))
+            {
+                filter = BloomFilter.serializer.deserialize(stream, oldFormat);
+                return filter;
+            }
+            catch (Throwable t)
+            {
+                JVMStabilityInspector.inspectThrowable(t);
+                logger.error("Failed to deserialize Bloom filter: {}", t.getMessage());
+                if (filter != null)
+                    filter.close();
+            }
+        }
+        else
         {
-            return BloomFilterSerializer.deserialize(stream, descriptor.version.hasOldBfFormat());
+            logger.error("Bloom filter {} not found", path);
         }
+        return null;
     }
 
     public static class ForWriter extends SSTableReaderBuilder
@@ -252,6 +244,30 @@ public ForWriter(Descriptor descriptor,
             super(descriptor, metadataRef, maxDataAge, components, statsMetadata, openReason, header);
         }
 
+        public SSTableReaderBuilder.ForWriter dfile(FileHandle dfile)
+        {
+            this.dfile = dfile;
+            return this;
+        }
+
+        public SSTableReaderBuilder.ForWriter ifile(FileHandle ifile)
+        {
+            this.ifile = ifile;
+            return this;
+        }
+
+        public SSTableReaderBuilder.ForWriter bf(IFilter bf)
+        {
+            this.bf = bf;
+            return this;
+        }
+
+        public SSTableReaderBuilder.ForWriter summary(IndexSummary summary)
+        {
+            this.summary = summary;
+            return this;
+        }
+
         @Override
         public SSTableReader build()
         {
@@ -276,6 +292,7 @@ public ForBatch(Descriptor descriptor,
         @Override
         public SSTableReader build()
         {
+            assert dfile == null && ifile == null && summary == null && bf == null;
             String dataFilePath = descriptor.filenameFor(Component.DATA);
             long fileLength = new File(dataFilePath).length();
             logger.info("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength));
@@ -346,6 +363,7 @@ public ForRead(Descriptor descriptor,
         @Override
         public SSTableReader build()
         {
+            assert dfile == null && ifile == null && summary == null && bf == null;
             String dataFilePath = descriptor.filenameFor(Component.DATA);
             long fileLength = new File(dataFilePath).length();
             logger.info("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength));
@@ -381,11 +399,10 @@ private void load(ValidationMetadata validation,
                           DiskOptimizationStrategy optimizationStrategy,
                           StatsMetadata statsMetadata) throws IOException
         {
-            if (metadata.params.bloomFilterFpChance == 1.0)
+            if (!BloomFilter.shouldUseBloomFilter(metadata.params.bloomFilterFpChance))
             {
                 // bf is disabled.
                 load(false, !isOffline, optimizationStrategy, statsMetadata, components);
-                bf = FilterFactory.AlwaysPresent;
             }
             else if (!components.contains(Component.PRIMARY_INDEX)) // What happens if filter component and primary index is missing?
             {
@@ -397,15 +414,21 @@ else if (!components.contains(Component.FILTER) || validation == null)
             {
                 // bf is enabled, but filter component is missing.
                 load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components);
-                if (isOffline)
-                    bf = FilterFactory.AlwaysPresent;
+            }
+            else if (!BloomFilter.isFPChanceDiffNeglectable(metadata.params.bloomFilterFpChance, validationMetadata.bloomFilterFPChance) && BloomFilter.recreateOnFPChanceChange)
+            {
+                // bf is enabled, but fp chance changed
+                load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components);
             }
             else
             {
                 // bf is enabled and fp chance matches the currently configured value.
-                load(false, !isOffline, optimizationStrategy, statsMetadata, components);
-                bf = loadBloomFilter();
+                bf = loadBloomFilter(Paths.get(descriptor.filenameFor(Component.FILTER)), descriptor.version.hasOldBfFormat());
+                load(bf == null, !isOffline, optimizationStrategy, statsMetadata, components);
             }
+            // if the filter was neither loaded nor created, or we encountered some problems, we fallback to pass-through filter
+            if (bf == null)
+                bf = FilterFactory.AlwaysPresent;
         }
 
         /**
@@ -448,7 +471,11 @@ void load(boolean recreateBloomFilter,
                     if (saveSummaryIfCreated)
                         SSTableReader.saveSummary(descriptor, first, last, summary);
                     if (recreateBloomFilter)
+                    {
                         SSTableReader.saveBloomFilter(descriptor, bf);
+                        ValidationMetadata updatedValidationMetadata = new ValidationMetadata(validationMetadata.partitioner, metadata.params.bloomFilterFpChance);
+                        descriptor.getMetadataSerializer().updateSSTableMetadata(descriptor, ImmutableMap.of(MetadataType.VALIDATION, updatedValidationMetadata));
+                    }
                 }
             }
             catch (Throwable t)
@@ -463,6 +490,12 @@ void load(boolean recreateBloomFilter,
                     dfile.close();
                 }
 
+                if (bf != null)
+                {
+                    bf.close();
+                    bf = null;
+                }
+
                 if (summary != null)
                 {
                     summary.close();
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index eeb9153826c5..806a05951118 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
@@ -539,7 +539,7 @@ void flushBf()
                      DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
                 {
                     // bloom filter
-                    BloomFilterSerializer.serialize((BloomFilter) bf, stream);
+                    BloomFilter.serializer.serialize((BloomFilter) bf, stream);
                     stream.flush();
                     SyncUtil.sync(fos);
                 }
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
index fc1ce422a40a..8c3adb811389 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/IMetadataSerializer.java
@@ -100,4 +100,11 @@ public interface IMetadataSerializer
      * Replace the sstable metadata file ({@code -Statistics.db}) with the given components.
      */
     void rewriteSSTableMetadata(Descriptor descriptor, Map<MetadataType, MetadataComponent> currentComponents) throws IOException;
+
+    /**
+     * Updates the sstable metadata components (works similarly to {@link #rewriteSSTableMetadata(Descriptor, Map)} but
+     * only updates the provided components rather than replacing the whole metadata map).
+     */
+    void updateSSTableMetadata(Descriptor descriptor, Map<MetadataType, MetadataComponent> updatedComponents) throws IOException;
+
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
index 042103e26798..ee9a24aca9f4 100644
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataSerializer.java
@@ -279,4 +279,12 @@ public void rewriteSSTableMetadata(Descriptor descriptor, Map<MetadataType, Meta
         FileUtils.renameWithConfirm(filePath, descriptor.filenameFor(Component.STATS));
 
     }
+
+    public void updateSSTableMetadata(Descriptor descriptor, Map<MetadataType, MetadataComponent> updatedComponents) throws IOException
+    {
+        Map<MetadataType, MetadataComponent> currentComponents = deserialize(descriptor, EnumSet.allOf(MetadataType.class)); // request all component types
+        currentComponents.putAll(updatedComponents); // overlay the updates; types absent from updatedComponents keep their current value
+        rewriteSSTableMetadata(descriptor, currentComponents); // write the merged map back through the full rewrite
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/utils/BloomFilter.java b/src/java/org/apache/cassandra/utils/BloomFilter.java
index bf48d4341894..a59a7fb14a9a 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilter.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilter.java
@@ -21,12 +21,45 @@
 
 import io.netty.util.concurrent.FastThreadLocal;
 import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.utils.concurrent.Ref;
 import org.apache.cassandra.utils.concurrent.WrappedSharedCloseable;
 import org.apache.cassandra.utils.obs.IBitSet;
+import org.apache.cassandra.utils.obs.MemoryLimiter;
 
 public class BloomFilter extends WrappedSharedCloseable implements IFilter
 {
+    /**
+     * The maximum memory to be used by all loaded bloom filters. If the limit is exceeded, pass-through filter will be
+     * used until some filters get unloaded.
+     */
+    public final static String MAX_MEMORY_MB_PROP = Config.PROPERTY_PREFIX + "bf.max_memory_mb";
+
+    /**
+     * A minimal relative change of the false-positive chance so that it is considered as a reason to recreate the bloom
+     * filter. If the change is smaller than this, it will be ignored.
+     */
+    public final static String FP_CHANCE_TOLERANCE_PROP = Config.PROPERTY_PREFIX + "bf.fp_chance_tolerance";
+
+    /**
+     * If the false-positive chance has changed since the last compaction (for example by alter table statement), and
+     * the node is restarted - the bloom filter can get rebuilt if this property is set to true.
+     */
+    public final static String RECREATE_ON_FP_CHANCE_CHANGE = Config.PROPERTY_PREFIX + "bf.recreate_on_fp_chance_change";
+
+    private static final long maxMemory = Long.getLong(MAX_MEMORY_MB_PROP, 0) << 20;
+
+    @VisibleForTesting
+    public static double fpChanceTolerance = Double.parseDouble(System.getProperty(FP_CHANCE_TOLERANCE_PROP, "0.000001"));
+
+    @VisibleForTesting
+    public static boolean recreateOnFPChanceChange = Boolean.getBoolean(RECREATE_ON_FP_CHANCE_CHANGE);
+
+    public static final MemoryLimiter memoryLimiter = new MemoryLimiter(maxMemory != 0 ? maxMemory : Long.MAX_VALUE,
+                                                                        "Allocating %s for Bloom filter would reach max of %s (current %s)");
+
+    public final static BloomFilterSerializer serializer = new BloomFilterSerializer(memoryLimiter);
+
     private final static FastThreadLocal<long[]> reusableIndexes = new FastThreadLocal<long[]>()
     {
         protected long[] initialValue()
@@ -54,7 +87,7 @@ private BloomFilter(BloomFilter copy)
 
     public long serializedSize()
     {
-        return BloomFilterSerializer.serializedSize(this);
+        return serializer.serializedSize(this);
     }
 
     // Murmur is faster than an SHA-based approach and provides as-good collision
@@ -149,4 +182,15 @@ public void addTo(Ref.IdentityCollection identities)
         super.addTo(identities);
         bitset.addTo(identities);
     }
+
+    public static boolean shouldUseBloomFilter(double fpChance)
+    {
+        return Math.abs(1 - fpChance) > BloomFilter.fpChanceTolerance; // true unless fpChance is within fpChanceTolerance of 1.0
+    }
+
+    public static boolean isFPChanceDiffNeglectable(double fpChance1, double fpChance2)
+    {
+        return Math.abs(fpChance1 - fpChance2) <= fpChanceTolerance; // true when the absolute difference does not exceed fpChanceTolerance
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
index d3c08b53cbed..c677bf26d84b 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilterSerializer.java
@@ -20,40 +20,61 @@
 import java.io.DataInputStream;
 import java.io.IOException;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.obs.IBitSet;
+import org.apache.cassandra.utils.obs.MemoryLimiter;
 import org.apache.cassandra.utils.obs.OffHeapBitSet;
 
+import static org.apache.cassandra.utils.FilterFactory.AlwaysPresent;
+
 public final class BloomFilterSerializer
 {
-    private BloomFilterSerializer()
+    private final static Logger logger = LoggerFactory.getLogger(BloomFilterSerializer.class);
+
+    private final MemoryLimiter memoryLimiter;
+
+    public BloomFilterSerializer(MemoryLimiter memoryLimiter)
     {
+        this.memoryLimiter = memoryLimiter;
     }
 
-    public static void serialize(BloomFilter bf, DataOutputPlus out) throws IOException
+    public void serialize(BloomFilter bf, DataOutputPlus out) throws IOException
     {
         out.writeInt(bf.hashCount);
         bf.bitset.serialize(out);
     }
 
     @SuppressWarnings("resource")
-    public static BloomFilter deserialize(DataInputStream in, boolean oldBfFormat) throws IOException
+    public IFilter deserialize(DataInputStream in, boolean oldBfFormat) throws IOException
     {
         int hashes = in.readInt();
-        IBitSet bs = OffHeapBitSet.deserialize(in, oldBfFormat);
-
+        IBitSet bs;
+        try
+        {
+            bs = OffHeapBitSet.deserialize(in, oldBfFormat, memoryLimiter); // allocation is accounted against this serializer's limiter
+        }
+        catch (MemoryLimiter.ReachedMemoryLimitException | OutOfMemoryError e)
+        {
+            logger.error("Failed to create Bloom filter during deserialization: ({}) - " +
+                         "continuing but this will have severe performance implications, consider increasing FP chance or " +
+                         "lowering number of sstables through compaction", e.getMessage());
+            return AlwaysPresent; // fall back to the always-present filter instead of failing the load
+        }
         return new BloomFilter(hashes, bs);
     }
 
     /**
      * Calculates a serialized size of the given Bloom Filter
-     * @param bf Bloom filter to calculate serialized size
-     * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus)
      *
+     * @param bf Bloom filter to calculate serialized size
      * @return serialized size of the given bloom filter
+     * @see org.apache.cassandra.io.ISerializer#serialize(Object, org.apache.cassandra.io.util.DataOutputPlus)
      */
-    public static long serializedSize(BloomFilter bf)
+    public long serializedSize(BloomFilter bf)
     {
         int size = TypeSizes.sizeof(bf.hashCount); // hash count
         size += bf.bitset.serializedSize();
diff --git a/src/java/org/apache/cassandra/utils/FilterFactory.java b/src/java/org/apache/cassandra/utils/FilterFactory.java
index 4cf0cbf74d19..63a54e11fe3c 100644
--- a/src/java/org/apache/cassandra/utils/FilterFactory.java
+++ b/src/java/org/apache/cassandra/utils/FilterFactory.java
@@ -21,6 +21,7 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.utils.obs.IBitSet;
+import org.apache.cassandra.utils.obs.MemoryLimiter;
 import org.apache.cassandra.utils.obs.OffHeapBitSet;
 
 public class FilterFactory
@@ -35,6 +36,11 @@ public class FilterFactory
      *         probability for the given number of elements.
      */
     public static IFilter getFilter(long numElements, int targetBucketsPerElem)
+    {
+        return getFilter(numElements, targetBucketsPerElem, BloomFilter.memoryLimiter);
+    }
+
+    public static IFilter getFilter(long numElements, int targetBucketsPerElem, MemoryLimiter memoryLimiter)
     {
         int maxBucketsPerElement = Math.max(1, BloomCalculations.maxBucketsPerElement(numElements));
         int bucketsPerElement = Math.min(targetBucketsPerElem, maxBucketsPerElement);
@@ -43,31 +49,46 @@ public static IFilter getFilter(long numElements, int targetBucketsPerElem)
             logger.warn("Cannot provide an optimal BloomFilter for {} elements ({}/{} buckets per element).", numElements, bucketsPerElement, targetBucketsPerElem);
         }
         BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement);
-        return createFilter(spec.K, numElements, spec.bucketsPerElement);
+        return createFilter(spec.K, numElements, spec.bucketsPerElement, memoryLimiter);
     }
 
     /**
      * @return The smallest BloomFilter that can provide the given false
-     *         positive probability rate for the given number of elements.
-     *
-     *         Asserts that the given probability can be satisfied using this
-     *         filter.
+     * positive probability rate for the given number of elements.
+     * <p>
+     * Asserts that the given probability can be satisfied using this
+     * filter.
      */
     public static IFilter getFilter(long numElements, double maxFalsePosProbability)
+    {
+        return getFilter(numElements, maxFalsePosProbability, BloomFilter.memoryLimiter);
+    }
+
+    public static IFilter getFilter(long numElements, double maxFalsePosProbability, MemoryLimiter memoryLimiter)
     {
         assert maxFalsePosProbability <= 1.0 : "Invalid probability";
         if (maxFalsePosProbability == 1.0)
-            return new AlwaysPresentFilter();
+            return AlwaysPresent;
         int bucketsPerElement = BloomCalculations.maxBucketsPerElement(numElements);
         BloomCalculations.BloomSpecification spec = BloomCalculations.computeBloomSpec(bucketsPerElement, maxFalsePosProbability);
-        return createFilter(spec.K, numElements, spec.bucketsPerElement);
+        return createFilter(spec.K, numElements, spec.bucketsPerElement, memoryLimiter);
     }
 
     @SuppressWarnings("resource")
-    private static IFilter createFilter(int hash, long numElements, int bucketsPer)
+    private static IFilter createFilter(int hash, long numElements, int bucketsPer, MemoryLimiter memoryLimiter)
     {
-        long numBits = (numElements * bucketsPer) + BITSET_EXCESS;
-        IBitSet bitset = new OffHeapBitSet(numBits);
-        return new BloomFilter(hash, bitset);
+        try
+        {
+            long numBits = (numElements * bucketsPer) + BITSET_EXCESS;
+            IBitSet bitset = new OffHeapBitSet(numBits, memoryLimiter); // allocation is accounted against the given limiter
+            return new BloomFilter(hash, bitset);
+        }
+        catch (MemoryLimiter.ReachedMemoryLimitException | OutOfMemoryError e)
+        {
+            logger.error("Failed to create new Bloom filter with {} elements: ({}) - " +
+                         "continuing but this will have severe performance implications, consider increasing FP chance or " +
+                         "lowering number of sstables through compaction", numElements, e.getMessage());
+            return AlwaysPresent; // fall back to the always-present filter instead of propagating the failure
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java b/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java
new file mode 100644
index 000000000000..bb2eb28a341d
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/obs/MemoryLimiter.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils.obs;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.cassandra.utils.FBUtilities;
+
+/** Tracks a running total of bytes against a fixed maximum; {@link #increment} fails once the total would reach it. */
+public class MemoryLimiter
+{
+    public final long maxMemory; // limit the running total is compared against in increment()
+    private final AtomicLong currentMemory; // running total of bytes currently accounted for
+    private final String exceptionFormat; // String.format pattern; receives requested size, limit and total before the request
+
+    public MemoryLimiter(long maxMemory, String exceptionFormat)
+    {
+        this.maxMemory = maxMemory;
+        this.currentMemory = new AtomicLong(); // starts at zero
+        this.exceptionFormat = exceptionFormat;
+    }
+
+    public void increment(long bytesCount) throws ReachedMemoryLimitException
+    {
+        assert bytesCount >= 0;
+        long bytesCountAfterAllocation = this.currentMemory.addAndGet(bytesCount); // add first, check afterwards
+        if (bytesCountAfterAllocation >= maxMemory) // reaching the limit exactly is rejected too
+        {
+            this.currentMemory.addAndGet(-bytesCount); // undo the addition before reporting the failure
+            throw new ReachedMemoryLimitException(String.format(exceptionFormat,
+                                                                FBUtilities.prettyPrintMemory(bytesCount),
+                                                                FBUtilities.prettyPrintMemory(maxMemory),
+                                                                FBUtilities.prettyPrintMemory(bytesCountAfterAllocation - bytesCount)));
+        }
+    }
+
+    public void decrement(long bytesCount)
+    {
+        assert bytesCount >= 0;
+        long result = this.currentMemory.addAndGet(-bytesCount);
+        assert result >= 0; // a negative total would mean more was decremented than incremented
+    }
+
+    public long memoryAllocated()
+    {
+        return currentMemory.get(); // current running total, in bytes
+    }
+
+    public static class ReachedMemoryLimitException extends Exception // checked; thrown by increment() when the limit is reached
+    {
+        public ReachedMemoryLimitException(String message)
+        {
+            super(message);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
index 486ec388d820..fa9dcd07ef0f 100644
--- a/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
+++ b/src/java/org/apache/cassandra/utils/obs/OffHeapBitSet.java
@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.utils.obs;
 
-import java.io.DataInput;
 import java.io.DataInputStream;
-import java.io.DataOutput;
 import java.io.IOException;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -37,18 +35,25 @@
  */
 public class OffHeapBitSet implements IBitSet
 {
+    /**
+     * Off-heap memory backing this bit set; its size is charged to the {@link MemoryLimiter} this bit set was created with.
+     * For bloom filters that limit is unlimited by default; a limit should only be set as a last resort measure.
+     */
+    @VisibleForTesting
     private final Memory bytes;
+    private final MemoryLimiter memoryLimiter;
 
-    public OffHeapBitSet(long numBits)
+    public OffHeapBitSet(long numBits, MemoryLimiter memoryLimiter) throws MemoryLimiter.ReachedMemoryLimitException
     {
-        /** returns the number of 64 bit words it would take to hold numBits */
+        this.memoryLimiter = memoryLimiter;
+        // returns the number of 64 bit words it would take to hold numBits
         long wordCount = (((numBits - 1) >>> 6) + 1);
         if (wordCount > Integer.MAX_VALUE)
             throw new UnsupportedOperationException("Bloom filter size is > 16GB, reduce the bloom_filter_fp_chance");
         try
         {
             long byteCount = wordCount * 8L;
-            bytes = Memory.allocate(byteCount);
+            bytes = allocate(byteCount, memoryLimiter);
         }
         catch (OutOfMemoryError e)
         {
@@ -58,11 +63,33 @@ public OffHeapBitSet(long numBits)
         clear();
     }
 
-    private OffHeapBitSet(Memory bytes)
+    private OffHeapBitSet(Memory bytes, MemoryLimiter memoryLimiter)
     {
+        this.memoryLimiter = memoryLimiter;
         this.bytes = bytes;
     }
 
+    private static Memory allocate(long byteCount, MemoryLimiter memoryLimiter) throws MemoryLimiter.ReachedMemoryLimitException
+    {
+        memoryLimiter.increment(byteCount); // account for the bytes first; throws if the limiter refuses them
+        try
+        {
+            return Memory.allocate(byteCount);
+        }
+        catch (OutOfMemoryError e)
+        {
+            memoryLimiter.decrement(byteCount); // allocation failed, so give the accounted bytes back
+            throw e;
+        }
+    }
+
+    private static void release(Memory memory, MemoryLimiter memoryLimiter)
+    {
+        long size = memory.size(); // captured before the memory is freed
+        memory.free();
+        memoryLimiter.decrement(size); // return the freed bytes to the limiter
+    }
+
     public long capacity()
     {
         return bytes.size() * 8;
@@ -145,10 +172,10 @@ public long serializedSize()
     }
 
     @SuppressWarnings("resource")
-    public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat) throws IOException
+    public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat, MemoryLimiter memoryLimiter) throws IOException, MemoryLimiter.ReachedMemoryLimitException
     {
         long byteCount = in.readInt() * 8L;
-        Memory memory = Memory.allocate(byteCount);
+        Memory memory = allocate(byteCount, memoryLimiter);
         if (oldBfFormat)
         {
             for (long i = 0; i < byteCount; )
@@ -168,12 +195,12 @@ public static OffHeapBitSet deserialize(DataInputStream in, boolean oldBfFormat)
         {
             FBUtilities.copy(in, new MemoryOutputStream(memory), byteCount);
         }
-        return new OffHeapBitSet(memory);
+        return new OffHeapBitSet(memory, memoryLimiter);
     }
 
     public void close()
     {
-        bytes.free();
+        release(bytes, memoryLimiter);
     }
 
     @Override
@@ -192,7 +219,7 @@ public int hashCode()
     {
         // Similar to open bitset.
         long h = 0;
-        for (long i = bytes.size(); --i >= 0;)
+        for (long i = bytes.size(); --i >= 0; )
         {
             h ^= bytes.getByte(i);
             h = (h << 1) | (h >>> 63); // rotate left
@@ -202,6 +229,6 @@ public int hashCode()
 
     public String toString()
     {
-        return "[OffHeapBitSet]";
+        return String.format("[OffHeapBitSet %s]", FBUtilities.prettyPrintMemory(serializedSize()));
     }
 }
diff --git a/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java b/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java
index 922281145f52..4cabf0633eba 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/BloomFilterSerializerBench.java
@@ -81,12 +81,12 @@ public void serializationTest() throws IOException
             if (oldBfFormat)
                 SerializationsTest.serializeOldBfFormat(filter, out);
             else
-                BloomFilterSerializer.serialize(filter, out);
+                BloomFilter.serializer.serialize(filter, out);
             out.close();
             filter.close();
 
             DataInputStream in = new DataInputStream(new FileInputStream(file));
-            BloomFilter filter2 = BloomFilterSerializer.deserialize(in, oldBfFormat);
+            IFilter filter2 = BloomFilter.serializer.deserialize(in, oldBfFormat);
             FileUtils.closeQuietly(in);
             filter2.close();
         }
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index 69c9f1ad8779..cd4e4f442f77 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -19,12 +19,12 @@
  *
  */
 
-import java.io.Closeable;
-import java.io.EOFException;
-import java.io.File;
-import java.io.IOError;
+import java.io.*;
+import java.lang.reflect.Field;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
+import java.nio.file.*;
+import java.nio.file.attribute.FileTime;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Future;
@@ -95,6 +95,8 @@ public class Util
 
     private static List<UUID> hostIdPool = new ArrayList<>();
 
+    public final static TimeUnit supportedMTimeGranularity = getSupportedMTimeGranularity();
+
     public static IPartitioner testPartitioner()
     {
         return DatabaseDescriptor.getPartitioner();
@@ -818,4 +820,20 @@ public static void setUpgradeFromVersion(String version)
                                                    VersionedValue.unsafeMakeVersionedValue(version, v + 1));
         Gossiper.instance.expireUpgradeFromVersion();
     }
+
+    private static TimeUnit getSupportedMTimeGranularity() {
+        try
+        {
+            Path p = Files.createTempFile(Util.class.getSimpleName(), "dummy-file"); // probe file, deleted again below
+            FileTime ft = Files.getLastModifiedTime(p);
+            Files.deleteIfExists(p);
+            Field f = FileTime.class.getDeclaredField("unit"); // NOTE(review): JDK-internal field; may be inaccessible on newer JDKs - confirm
+            f.setAccessible(true);
+            return (TimeUnit) f.get(ft); // the unit the file system reported the modification time in
+        }
+        catch (IOException | NoSuchFieldException | IllegalAccessException e)
+        {
+            throw new AssertionError("Failed to read supported file modification time granularity", e); // keep the root cause
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 6ba942a7be1d..80828e1b64e3 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -22,11 +22,16 @@
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.FileTime;
+import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.*;
 
 import com.google.common.collect.Sets;
+import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.After;
+import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -48,19 +53,28 @@
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.LocalPartitioner.LocalToken;
+import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.MmappedRegions;
 import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.service.CacheService;
+import org.apache.cassandra.utils.BloomCalculations;
+import org.apache.cassandra.utils.BloomFilter;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.IFilter;
 
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.junit.Assert.assertEquals;
@@ -77,6 +91,7 @@ public class SSTableReaderTest
     public static final String CF_COMPRESSED = "Compressed";
     public static final String CF_INDEXED = "Indexed1";
     public static final String CF_STANDARDLOWINDEXINTERVAL = "StandardLowIndexInterval";
+    public static final String CF_STANDARDNOBLOOMFILTER = "StandardNoBloomFilter";
 
     private IPartitioner partitioner;
 
@@ -100,13 +115,16 @@ public static void defineSchema() throws Exception
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL)
                                                 .minIndexInterval(8)
                                                 .maxIndexInterval(256)
-                                                .caching(CachingParams.CACHE_NOTHING));
+                                                .caching(CachingParams.CACHE_NOTHING),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDNOBLOOMFILTER)
+                                                .bloomFilterFpChance(1));
     }
 
     @After
     public void Cleanup() {
         Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD).truncateBlocking();
         Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2).truncateBlocking();
+        BloomFilter.recreateOnFPChanceChange = false;
     }
 
     @Test
@@ -834,13 +852,15 @@ public void testMoveAndOpenSSTable() throws IOException
         }
     }
 
-
-
     private SSTableReader getNewSSTable(ColumnFamilyStore cfs)
     {
+        return getNewSSTable(cfs, 100, 2);
+    }
 
+    private SSTableReader getNewSSTable(ColumnFamilyStore cfs, int numKeys, int step)
+    {
         Set<SSTableReader> before = cfs.getLiveSSTables();
-        for (int j = 0; j < 100; j += 2)
+        for (int j = 0; j < numKeys; j += step)
         {
             new RowUpdateBuilder(cfs.metadata(), j, String.valueOf(j))
             .clustering("0")
@@ -915,6 +935,126 @@ public void testVerifyCompressionInfoExistencePasses()
         SSTableReader.verifyCompressionInfoExistenceIfApplicable(desc, components);
     }
 
+    @Test
+    public void testBloomFilterIsCreatedOnLoad() throws IOException
+    {
+        BloomFilter.recreateOnFPChanceChange = true;
+
+        final int numKeys = 100;
+        final Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARDNOBLOOMFILTER);
+
+        SSTableReader sstable = getNewSSTable(cfs, numKeys, 1);
+        Assert.assertTrue(sstable.getBloomFilterSerializedSize() == 0);
+        Assert.assertSame(FilterFactory.AlwaysPresent, sstable.getBloomFilter());
+
+        // should do nothing
+        checkSSTableOpenedWithGivenFPChance(sstable, 1, false, numKeys, false);
+
+        // should create BF because the FP has changed
+        checkSSTableOpenedWithGivenFPChance(sstable, BloomCalculations.minSupportedBloomFilterFpChance(), true, numKeys, true);
+        checkSSTableOpenedWithGivenFPChance(sstable, 0.05, true, numKeys, true);
+        checkSSTableOpenedWithGivenFPChance(sstable, 0.1, true, numKeys, true);
+
+        // should deserialize the existing BF
+        checkSSTableOpenedWithGivenFPChance(sstable, 0.1, true, numKeys, false);
+        // should create BF because the FP has changed
+        checkSSTableOpenedWithGivenFPChance(sstable, 1 - BloomFilter.fpChanceTolerance, true, numKeys, true);
+        // should install empty filter without changing file or metadata
+        checkSSTableOpenedWithGivenFPChance(sstable, 1, false, numKeys, false);
+
+        // corrupted bf file should fail to deserialize and we should fall back to recreating it
+        Files.write(Paths.get(sstable.descriptor.filenameFor(Component.FILTER)), new byte[] { 0, 0, 0, 0});
+        checkSSTableOpenedWithGivenFPChance(sstable, 1 - BloomFilter.fpChanceTolerance, true, numKeys, true);
+
+        // missing primary index file should make BF fail to load and we should install the empty one
+        Assert.assertTrue(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete()); // the scenario is void if the delete silently fails
+        checkSSTableOpenedWithGivenFPChance(sstable, 0.05, false, numKeys, false);
+    }
+
+    /** Reopens the sstable with the given FP chance and verifies the resulting filter, the filter file and the validation metadata. */
+    private void checkSSTableOpenedWithGivenFPChance(SSTableReader sstable, double fpChance, boolean bfShouldExist, int numKeys, boolean expectRecreated) throws IOException
+    {
+        Descriptor desc = sstable.descriptor;
+        TableMetadata metadata = sstable.metadata.get().unbuild().bloomFilterFpChance(fpChance).build();
+        ValidationMetadata prevValidationMetadata = getValidationMetadata(desc);
+        Assert.assertNotNull(prevValidationMetadata);
+        File bfFile = new File(desc.filenameFor(Component.FILTER));
+
+        SSTableReader target = null;
+        try
+        {
+            FileTime bf0Time = bfFile.exists() ? Files.getLastModifiedTime(bfFile.toPath()) : FileTime.from(Instant.MIN);
+            boolean bfExistedBefore = bfFile.exists();
+
+            // make sure we wait enough - some JDK implementations use seconds granularity and we need to wait a bit to actually see the change
+            Uninterruptibles.sleepUninterruptibly(1, Util.supportedMTimeGranularity);
+
+            target = SSTableReader.open(desc,
+                                        SSTableReader.discoverComponentsFor(desc),
+                                        TableMetadataRef.forOfflineTools(metadata),
+                                        false,
+                                        false);
+            IFilter bloomFilter = target.getBloomFilter();
+            ValidationMetadata validationMetadata = getValidationMetadata(desc);
+            Assert.assertNotNull(validationMetadata);
+            FileTime bf1Time = bfFile.exists() ? Files.getLastModifiedTime(bfFile.toPath()) : FileTime.from(Instant.MIN);
+
+            // a recreated filter file must carry a newer mtime; otherwise the file must be untouched
+            if (expectRecreated)
+                Assert.assertTrue(bf0Time.compareTo(bf1Time) < 0);
+            else
+                assertEquals(bf0Time, bf1Time);
+
+            if (bfShouldExist)
+            {
+                Assert.assertNotEquals(FilterFactory.AlwaysPresent, bloomFilter);
+                Assert.assertTrue(bloomFilter.serializedSize() > 0);
+                Assert.assertEquals(fpChance, validationMetadata.bloomFilterFPChance, BloomFilter.fpChanceTolerance);
+                Assert.assertTrue(bfFile.exists());
+                Assert.assertEquals(bloomFilter.serializedSize(), bfFile.length());
+            }
+            else
+            {
+                // check the freshly opened reader, not the reader the caller passed in
+                Assert.assertEquals(FilterFactory.AlwaysPresent, bloomFilter);
+                Assert.assertTrue(target.getBloomFilterSerializedSize() == 0);
+                Assert.assertEquals(prevValidationMetadata.bloomFilterFPChance, validationMetadata.bloomFilterFPChance, BloomFilter.fpChanceTolerance);
+                Assert.assertEquals(bfExistedBefore, bfFile.exists());
+            }
+
+            // verify all keys are present according to the BF
+            Token token = new Murmur3Partitioner.LongToken(0L);
+            for (int i = 0; i < numKeys; i++)
+            {
+                DecoratedKey key = new BufferDecoratedKey(token, ByteBufferUtil.bytes(String.valueOf(i)));
+                Assert.assertTrue("Expected key to be in BF: " + i, bloomFilter.isPresent(key));
+            }
+        }
+        finally
+        {
+            if (target != null)
+                target.selfRef().release();
+        }
+    }
+
+    private static ValidationMetadata getValidationMetadata(Descriptor descriptor)
+    {
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION); // only the VALIDATION component is requested
+
+        Map<MetadataType, MetadataComponent> sstableMetadata;
+        try
+        {
+            sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor, types);
+        }
+        catch (Throwable t)
+        {
+            throw new CorruptSSTableException(t, descriptor.filenameFor(Component.STATS)); // any failure is reported against the STATS file
+        }
+
+        return (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION); // presumably null when the component is absent; callers assert non-null
+    }
+
     private Descriptor setUpForTestVerfiyCompressionInfoExistence()
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
diff --git a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
index 1c3afff2efab..f612a57c8d63 100644
--- a/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
+++ b/test/unit/org/apache/cassandra/utils/BloomFilterTest.java
@@ -1,31 +1,42 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.cassandra.utils;
 
-import java.io.*;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.text.NumberFormat;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Random;
 import java.util.Set;
 
-import org.junit.*;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
 
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
@@ -36,14 +47,17 @@
 import org.apache.cassandra.utils.IFilter.FilterKey;
 import org.apache.cassandra.utils.KeyGenerator.RandomStringGenerator;
 import org.apache.cassandra.utils.obs.IBitSet;
+import org.apache.cassandra.utils.obs.MemoryLimiter;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
 
 public class BloomFilterTest
 {
     public IFilter bfInvHashes;
-
-
+    public MemoryLimiter memoryLimiter;
 
     public static IFilter testSerialize(IFilter f, boolean oldBfFormat) throws IOException
     {
@@ -55,11 +69,11 @@ public static IFilter testSerialize(IFilter f, boolean oldBfFormat) throws IOExc
         }
         else
         {
-            BloomFilterSerializer.serialize((BloomFilter) f, out);
+            BloomFilter.serializer.serialize((BloomFilter) f, out);
         }
 
         ByteArrayInputStream in = new ByteArrayInputStream(out.getData(), 0, out.getLength());
-        IFilter f2 = BloomFilterSerializer.deserialize(new DataInputStream(in), oldBfFormat);
+        IFilter f2 = BloomFilter.serializer.deserialize(new DataInputStream(in), oldBfFormat);
 
         assert f2.isPresent(FilterTestHelper.bytes("a"));
         assert !f2.isPresent(FilterTestHelper.bytes("b"));
@@ -76,6 +90,10 @@ static void compare(IBitSet bs, IBitSet newbs)
     @Before
     public void setup()
     {
+        // Set a high limit so that normal tests won't reach it, but we don't want Long.MAX_VALUE because
+        // we want to test what happens when we reach it
+        System.setProperty(BloomFilter.MAX_MEMORY_MB_PROP, Long.toString(128 << 10));
+        memoryLimiter = new MemoryLimiter(128L << 30, "Allocating %s for bloom filter would reach max of %s (current %s)");
         bfInvHashes = FilterFactory.getFilter(10000L, FilterTestHelper.MAX_FAILURE_RATE);
     }
 
@@ -83,6 +101,7 @@ public void setup()
     public void destroy()
     {
         bfInvHashes.close();
+        assertEquals(0, memoryLimiter.memoryAllocated());
     }
 
     @Test(expected = UnsupportedOperationException.class)
@@ -164,13 +183,13 @@ private static void testManyRandom(Iterator<ByteBuffer> keys)
             collisions += (MAX_HASH_COUNT - hashes.size());
             bf.close();
         }
-        Assert.assertTrue("collisions=" + collisions, collisions <= 100);
+        assertTrue("collisions=" + collisions, collisions <= 100);
     }
 
     @Test(expected = UnsupportedOperationException.class)
     public void testOffHeapException()
     {
-        long numKeys = ((long)Integer.MAX_VALUE) * 64L + 1L; // approx 128 Billion
+        long numKeys = ((long) Integer.MAX_VALUE) * 64L + 1L; // approx 128 Billion
         FilterFactory.getFilter(numKeys, 0.01d).close();
     }
 
@@ -202,22 +221,21 @@ public void compareCachedKey()
     }
 
     @Test
-    @Ignore
-    public void testHugeBFSerialization() throws IOException
+    public void testHugeBFSerialization() throws Exception
     {
-        ByteBuffer test = ByteBuffer.wrap(new byte[] {0, 1});
+        ByteBuffer test = ByteBuffer.wrap(new byte[]{ 0, 1 });
 
         File file = FileUtils.createDeletableTempFile("bloomFilterTest-", ".dat");
         BloomFilter filter = (BloomFilter) FilterFactory.getFilter(((long) Integer.MAX_VALUE / 8) + 1, 0.01d);
         filter.add(FilterTestHelper.wrap(test));
         DataOutputStreamPlus out = new BufferedDataOutputStreamPlus(new FileOutputStream(file));
-        BloomFilterSerializer.serialize(filter, out);
+        BloomFilter.serializer.serialize(filter, out);
         out.close();
         filter.close();
 
         DataInputStream in = new DataInputStream(new FileInputStream(file));
-        BloomFilter filter2 = BloomFilterSerializer.deserialize(in, false);
-        Assert.assertTrue(filter2.isPresent(FilterTestHelper.wrap(test)));
+        IFilter filter2 = BloomFilter.serializer.deserialize(in, false);
+        assertTrue(filter2.isPresent(FilterTestHelper.wrap(test)));
         FileUtils.closeQuietly(in);
         filter2.close();
     }
@@ -243,4 +261,96 @@ public void testMurmur3FilterHash()
             Assert.assertArrayEquals(expected, actual);
         }
     }
-}
+
+    @Test
+    public void testMaxMemoryExceeded()
+    {
+        long allocSize = 2L * (1 << 20);
+        double fpChance = 0.01;
+        long size;
+
+        try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter))
+        {
+            size = filter.offHeapSize();
+        }
+        assertNotEquals(0, size);
+
+        memoryLimiter = new MemoryLimiter(3 * size / 2, "Allocating %s for bloom filter would reach max of %s (current %s)");
+
+        try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter))
+        {
+            assertNotNull(filter);
+            assertTrue(filter instanceof BloomFilter);
+
+            long memBefore = memoryLimiter.memoryAllocated();
+
+            try (IFilter blankFilter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter))
+            {
+                assertNotNull(blankFilter);
+                assertTrue(blankFilter instanceof AlwaysPresentFilter);
+
+                assertEquals(memBefore, memoryLimiter.memoryAllocated());
+            }
+        }
+    }
+
+    @Test
+    public void testMaxMemoryExceededOnDeserialize() throws IOException
+    {
+        long allocSize = 2L * (1 << 20);
+        double fpChance = 0.01;
+        long size;
+
+        DataOutputBuffer out = new DataOutputBuffer();
+        try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter))
+        {
+            size = filter.offHeapSize();
+            BloomFilter.serializer.serialize((BloomFilter) filter, out);
+        }
+        assertNotEquals(0, size);
+
+        memoryLimiter = new MemoryLimiter(3 * size / 2, "Allocating %s for bloom filter would reach max of %s (current %s)");
+
+        try (IFilter filter = FilterFactory.getFilter(allocSize, fpChance, memoryLimiter))
+        {
+            assertNotNull(filter);
+            assertTrue(filter instanceof BloomFilter);
+
+            long memBefore = memoryLimiter.memoryAllocated();
+
+            ByteArrayInputStream in = new ByteArrayInputStream(out.getData(), 0, out.getLength());
+            try (IFilter blankFilter = new BloomFilterSerializer(memoryLimiter).deserialize(new DataInputStream(in), false))
+            {
+                assertNotNull(blankFilter);
+                assertTrue(blankFilter instanceof AlwaysPresentFilter);
+                assertEquals(memBefore, memoryLimiter.memoryAllocated());
+            }
+        }
+    }
+
+    @Test
+    @Ignore // this is a test that can be used to print out the sizes of BFs
+    public void testBloomFilterSize()
+    {
+        int[] nks = new int[]{
+        100_000, 500_000,
+        1_000_000, 5_000_000,
+        10_000_000, 50_000_000,
+        100_000_000, 500_000_000 };
+
+        //double[] fps = new double[] { 0.01, 0.05, 0.1, 0.2, 0.25 };
+        double[] fps = new double[]{ 0.01, 0.1 };
+
+        for (int nk : nks)
+        {
+            for (double fp : fps)
+            {
+                IFilter filter = FilterFactory.getFilter(nk, fp);
+                System.out.println(String.format("%s keys %s FP chance => %s",
+                                                 NumberFormat.getNumberInstance(Locale.US).format(nk),
+                                                 NumberFormat.getNumberInstance(Locale.US).format(fp),
+                                                 FBUtilities.prettyPrintMemory(filter.serializedSize())));
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/SerializationsTest.java b/test/unit/org/apache/cassandra/utils/SerializationsTest.java
index 6597f3bb562d..ba84e5f79731 100644
--- a/test/unit/org/apache/cassandra/utils/SerializationsTest.java
+++ b/test/unit/org/apache/cassandra/utils/SerializationsTest.java
@@ -66,7 +66,7 @@ private static void testBloomFilterWrite1000(boolean oldBfFormat) throws IOExcep
                 if (oldBfFormat)
                     serializeOldBfFormat((BloomFilter) bf, out);
                 else
-                    BloomFilterSerializer.serialize((BloomFilter) bf, out);
+                    BloomFilter.serializer.serialize((BloomFilter) bf, out);
             }
         }
     }
@@ -81,7 +81,7 @@ public void testBloomFilterRead1000() throws IOException
         }
 
         try (DataInputStream in = getInput("4.0", "utils.BloomFilter1000.bin");
-             IFilter filter = BloomFilterSerializer.deserialize(in, false))
+             IFilter filter = BloomFilter.serializer.deserialize(in, false))
         {
             boolean present;
             for (int i = 0 ; i < 1000 ; i++)
@@ -97,7 +97,7 @@ public void testBloomFilterRead1000() throws IOException
         }
 
         try (DataInputStream in = getInput("3.0", "utils.BloomFilter1000.bin");
-             IFilter filter = BloomFilterSerializer.deserialize(in, true))
+             IFilter filter = BloomFilter.serializer.deserialize(in, true))
         {
             boolean present;
             for (int i = 0 ; i < 1000 ; i++)
@@ -124,7 +124,7 @@ private static void testBloomFilterTable(String file, boolean oldBfFormat) throw
         Murmur3Partitioner partitioner = new Murmur3Partitioner();
 
         try (DataInputStream in = new DataInputStream(new FileInputStream(new File(file)));
-             IFilter filter = BloomFilterSerializer.deserialize(in, oldBfFormat))
+             IFilter filter = BloomFilter.serializer.deserialize(in, oldBfFormat))
         {
             for (int i = 1; i <= 10; i++)
             {
diff --git a/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java b/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java
index 49b4c94dd387..87dbd192f7bf 100644
--- a/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java
+++ b/test/unit/org/apache/cassandra/utils/obs/OffHeapBitSetTest.java
@@ -20,15 +20,15 @@
 
 import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
-import java.io.IOException;
 import java.util.List;
 import java.util.Random;
 
 import com.google.common.collect.Lists;
-import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.junit.Assert;
 import org.junit.Test;
 
+import org.apache.cassandra.io.util.DataOutputBuffer;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
@@ -36,6 +36,7 @@
 public class OffHeapBitSetTest
 {
     private static final Random random = new Random();
+    private static final MemoryLimiter memoryLimiter = new MemoryLimiter(1L << 30, "Allocating %s for bloom filter would reach max of %s (current %s)");
 
     static void compare(IBitSet bs, IBitSet newbs)
     {
@@ -44,9 +45,9 @@ static void compare(IBitSet bs, IBitSet newbs)
             Assert.assertEquals(bs.get(i), newbs.get(i));
     }
 
-    private void testOffHeapSerialization(boolean oldBfFormat) throws IOException
+    private void testOffHeapSerialization(boolean oldBfFormat) throws Exception
     {
-        try (OffHeapBitSet bs = new OffHeapBitSet(100000))
+        try (OffHeapBitSet bs = new OffHeapBitSet(100000, memoryLimiter))
         {
             for (long i = 0; i < bs.capacity(); i++)
                 if (random.nextBoolean())
@@ -59,7 +60,7 @@ private void testOffHeapSerialization(boolean oldBfFormat) throws IOException
                 bs.serialize(out);
 
             DataInputStream in = new DataInputStream(new ByteArrayInputStream(out.getData()));
-            try (OffHeapBitSet newbs = OffHeapBitSet.deserialize(in, oldBfFormat))
+            try (OffHeapBitSet newbs = OffHeapBitSet.deserialize(in, oldBfFormat, memoryLimiter))
             {
                 compare(bs, newbs);
             }
@@ -67,17 +68,17 @@ private void testOffHeapSerialization(boolean oldBfFormat) throws IOException
     }
 
     @Test
-    public void testSerialization() throws IOException
+    public void testSerialization() throws Exception
     {
         testOffHeapSerialization(true);
         testOffHeapSerialization(false);
     }
 
     @Test
-    public void testBitSetGetClear()
+    public void testBitSetGetClear() throws Exception
     {
         int size = Integer.MAX_VALUE / 4000;
-        try (OffHeapBitSet bs = new OffHeapBitSet(size))
+        try (OffHeapBitSet bs = new OffHeapBitSet(size, memoryLimiter))
         {
             List<Integer> randomBits = Lists.newArrayList();
             for (int i = 0; i < 10; i++)
@@ -98,16 +99,16 @@ public void testBitSetGetClear()
     }
 
     @Test(expected = UnsupportedOperationException.class)
-    public void testUnsupportedLargeSize()
+    public void testUnsupportedLargeSize() throws Exception
     {
         long size = 64L * Integer.MAX_VALUE + 1; // Max size 16G * 8 bits
-        OffHeapBitSet bs = new OffHeapBitSet(size);
+        OffHeapBitSet bs = new OffHeapBitSet(size, memoryLimiter);
     }
 
     @Test
-    public void testInvalidIndex()
+    public void testInvalidIndex() throws Exception
     {
-        OffHeapBitSet bs = new OffHeapBitSet(10);
+        OffHeapBitSet bs = new OffHeapBitSet(10, memoryLimiter);
         int invalidIdx[] = {-1, 64, 1000};
 
         for (int i : invalidIdx)

From b777dbebfbdf3006141da5825acd0a0c1b542840 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Wed, 16 Jun 2021 10:56:25 +0200
Subject: [PATCH 018/151] STAR-748: Fix Scrubber so that it can work in case of
 broken index (#193)

Fix scrubber so that it can work with broken indexes.

If the index is broken but the data file is intact, scrubbing can continue and the index will be rebuilt. With this patch, this works even if the initial position cannot be read from the index. Also, the corrupted sstable is now obsoleted before the rewriter finishes, so the rewriter does not attempt to move its start (which would otherwise fail at the end, because moving starts requires a correct index).

This patch also adds comments explaining how the canonical set of sstables is constructed, and why.

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
(cherry picked from commit c0be3ae5cdd89fae0afcbb277322a466ece4de00)
---
 .../cassandra/db/compaction/Scrubber.java     |  50 +++--
 .../apache/cassandra/db/lifecycle/View.java   |  23 ++-
 .../cassandra/io/sstable/SSTableRewriter.java |  13 +-
 .../org/apache/cassandra/db/ScrubTest.java    | 171 ++++++++++++++----
 .../io/sstable/SSTableRewriterTest.java       |  88 ++++++---
 5 files changed, 252 insertions(+), 93 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index afbfe3d27a61..5884f989e008 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -167,12 +167,23 @@ public void scrub()
         try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge);
              Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable)))
         {
-            nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null;
-            if (indexAvailable())
+            try
+            {
+                nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null;
+                if (indexAvailable())
+                {
+                    // throw away variable so we don't have a side effect in the assert
+                    long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+                    assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
+                }
+            }
+            catch (Throwable ex)
             {
-                // throw away variable so we don't have a side effect in the assert
-                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
-                assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
+                throwIfFatal(ex);
+                nextIndexKey = null;
+                nextRowPositionFromIndex = dataFile.length();
+                if (indexFile != null)
+                    indexFile.seek(indexFile.length());
             }
 
             StatsMetadata metadata = sstable.getSSTableMetadata();
@@ -199,18 +210,22 @@ public void scrub()
                     // check for null key below
                 }
 
-                updateIndexKey();
-
-                long dataStart = dataFile.getFilePointer();
-
                 long dataStartFromIndex = -1;
                 long dataSizeFromIndex = -1;
-                if (currentIndexKey != null)
+
+                updateIndexKey();
+
+                if (indexAvailable())
                 {
-                    dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining();
-                    dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
+                    if (currentIndexKey != null)
+                    {
+                        dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining();
+                        dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
+                    }
                 }
 
+                long dataStart = dataFile.getFilePointer();
+
                 // avoid an NPE if key is null
                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex)));
@@ -295,13 +310,10 @@ public void scrub()
             }
 
             // finish obsoletes the old sstable
+            transaction.obsoleteOriginals();
             finished.addAll(writer.setRepairedAt(badRows > 0 ? ActiveRepairService.UNREPAIRED_SSTABLE : sstable.getSSTableMetadata().repairedAt).finish());
             completed = true;
         }
-        catch (IOException e)
-        {
-            throw Throwables.propagate(e);
-        }
         finally
         {
             if (transaction.isOffline())
@@ -379,8 +391,8 @@ private void updateIndexKey()
             nextIndexKey = !indexAvailable() ? null : ByteBufferUtil.readWithShortLength(indexFile);
 
             nextRowPositionFromIndex = !indexAvailable()
-                    ? dataFile.length()
-                    : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+                                       ? dataFile.length()
+                                       : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
         }
         catch (Throwable th)
         {
@@ -388,6 +400,8 @@ private void updateIndexKey()
             outputHandler.warn("Error reading index file", th);
             nextIndexKey = null;
             nextRowPositionFromIndex = dataFile.length();
+            if (indexFile != null)
+                indexFile.seek(indexFile.length());
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java
index b26426de63ae..e2f09b7791d4 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/View.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/View.java
@@ -136,13 +136,30 @@ public Iterable<SSTableReader> select(SSTableSet sstableSet)
             case NONCOMPACTING:
                 return filter(sstables, (s) -> !compacting.contains(s));
             case CANONICAL:
+                // When early open is not in play, the LIVE and CANONICAL sets are the same.
+                // However, when we do have early-open sstables, we will have some unfinished sources in the live set.
+                // For these sources we need to extract the originals, in their non-moved-start versions, from the
+                // compacting set.
+                // This creates a problem when the compaction completes, as then both:
+                // - the source is in the compacting set
+                // - the result is in the live set
+                // This currently causes the CANONICAL set to return both source and result when early-open is disabled,
+                // and is otherwise worked around by opening early the last sstable in the result set (which pushes it
+                // in the compacting set with EARLY openReason) and the !compacting.contains(sstable) check in the
+                // second loop below.
+                // Unfortunately there does not appear to be a way to avoid this workaround. Filtering the compacting
+                // set through having an early-open version in live does not work because sources are fully removed from
+                // the live set when they are completely exhausted.
+
+                // Add the compacting versions first because they will be the canonical versions of compaction sources.
                 Set<SSTableReader> canonicalSSTables = new HashSet<>();
                 for (SSTableReader sstable : compacting)
                     if (sstable.openReason != SSTableReader.OpenReason.EARLY)
                         canonicalSSTables.add(sstable);
-                // reason for checking if compacting contains the sstable is that if compacting has an EARLY version
-                // of a NORMAL sstable, we still have the canonical version of that sstable in sstables.
-                // note that the EARLY version is equal, but not == since it is a different instance of the same sstable.
+                // Add anything that is not compacting, removing any compaction result where we still have the
+                // compaction sources.
+                // note that the EARLY version is equal to the original, i.e. the set itself can guarantee early-open
+                // versions of sstables in compacting won't be added, but we also want to remove the results.
                 for (SSTableReader sstable : sstables)
                     if (!compacting.contains(sstable) && sstable.openReason != SSTableReader.OpenReason.EARLY)
                         canonicalSSTables.add(sstable);
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
index a3d5ae9a2bab..92548b26aea4 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
@@ -255,10 +255,13 @@ private void moveStarts(SSTableReader newReader, DecoratedKey lowerbound)
                 continue;
             }
 
-            DecoratedKey newStart = latest.firstKeyBeyond(lowerbound);
-            assert newStart != null;
-            SSTableReader replacement = latest.cloneWithNewStart(newStart, runOnClose);
-            transaction.update(replacement, true);
+            if (!transaction.isObsolete(latest))
+            {
+                DecoratedKey newStart = latest.firstKeyBeyond(lowerbound);
+                assert newStart != null;
+                SSTableReader replacement = latest.cloneWithNewStart(newStart, runOnClose);
+                transaction.update(replacement, true);
+            }
         }
     }
 
@@ -310,6 +313,8 @@ public void switchWriter(SSTableWriter newWriter)
             return;
         }
 
+        // Open fully completed sstables early. This is also required for the final sstable in a set (where newWriter
+        // is null) to permit the compilation of a canonical set of sstables (see View.select).
         if (preemptiveOpenInterval != Long.MAX_VALUE)
         {
             // we leave it as a tmp file, but we open it and add it to the Tracker
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 1b425051f859..e17d202a9835 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -28,22 +28,26 @@
 import java.nio.file.Paths;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.SortedSet;
 import java.util.UUID;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.atomic.AtomicInteger;
 
-import org.apache.commons.lang3.StringUtils;
-
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
+import org.apache.commons.lang3.ArrayUtils;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
+import net.openhft.chronicle.core.util.ThrowingBiConsumer;
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.UpdateBuilder;
@@ -59,7 +63,6 @@
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.marshal.UUIDType;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
@@ -199,7 +202,8 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
             scrubber.scrub();
             fail("Expected a CorruptSSTableException to be thrown");
         }
-        catch (IOError err) {
+        catch (IOError err)
+        {
             assertTrue(err.getCause() instanceof CorruptSSTableException);
         }
 
@@ -224,7 +228,7 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
         else
         {
             assertEquals(1, scrubResult.badRows);
-            assertEquals(numPartitions-1, scrubResult.goodRows);
+            assertEquals(numPartitions - 1, scrubResult.goodRows);
         }
         assertEquals(1, cfs.getLiveSSTables().size());
 
@@ -232,36 +236,98 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
     }
 
     @Test
-    public void testScrubCorruptedRowInSmallFile() throws IOException, WriteTimeoutException
+    public void testScrubCorruptedRowInSmallFile() throws Throwable
+    {
+        // overwrite one row with garbage
+        testCorruptionInSmallFile((sstable, keys) ->
+                                  overrideWithGarbage(sstable,
+                                                      ByteBufferUtil.bytes(keys[0]),
+                                                      ByteBufferUtil.bytes(keys[1]),
+                                                      (byte) 0x7A),
+                                  false,
+                                  4);
+    }
+
+
+    @Test
+    public void testScrubCorruptedIndex() throws Throwable
+    {
+        // overwrite a part of the index with garbage
+        testCorruptionInSmallFile((sstable, keys) ->
+                                  overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                                      5,
+                                                      6,
+                                                      (byte) 0x7A),
+                                  true,
+                                  5);
+    }
+
+    @Test
+    public void testScrubCorruptedIndexOnOpen() throws Throwable
+    {
+        // overwrite the whole index with garbage
+        testCorruptionInSmallFile((sstable, keys) ->
+                                  overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                                      0,
+                                                      60,
+                                                      (byte) 0x7A),
+                                  true,
+                                  5);
+    }
+
+    @Test
+    public void testScrubCorruptedRowCorruptedIndex() throws Throwable
+    {
+        // overwrite one row, and the index with garbage
+        testCorruptionInSmallFile((sstable, keys) ->
+                                  {
+                                      overrideWithGarbage(sstable,
+                                                          ByteBufferUtil.bytes(keys[2]),
+                                                          ByteBufferUtil.bytes(keys[3]),
+                                                          (byte) 0x7A);
+                                      overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                                          5,
+                                                          6,
+                                                          (byte) 0x7A);
+                                  },
+                                  false,
+                                  2);   // corrupt after the second partition, no way to resync
+    }
+
+    public void testCorruptionInSmallFile(ThrowingBiConsumer<SSTableReader, String[], IOException> corrupt, boolean isFullyRecoverable, int expectedPartitions) throws Throwable
     {
         // cannot test this with compression
         assumeTrue(!Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false")));
 
         CompactionManager.instance.disableAutoCompaction();
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(COUNTER_CF);
+        cfs.clearUnsafe();
 
-        fillCounterCF(cfs, 2);
+        String[] keys = fillCounterCF(cfs, 5);
 
-        assertOrderedAll(cfs, 2);
+        assertOrderedAll(cfs, 5);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         // overwrite one row with garbage
-        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
+        corrupt.accept(sstable, keys);
 
         // with skipCorrupted == false, the scrub is expected to fail
-        try (LifecycleTransaction txn = cfs.getTracker().tryModify(Collections.singletonList(sstable), OperationType.SCRUB);
-             Scrubber scrubber = new Scrubber(cfs, txn, false, true))
+        if (!isFullyRecoverable)
         {
-            // with skipCorrupted == true, the corrupt row will be skipped
-            scrubber.scrub();
-            fail("Expected a CorruptSSTableException to be thrown");
-        }
-        catch (IOError err) {
-            assertTrue(err.getCause() instanceof CorruptSSTableException);
+            try (LifecycleTransaction txn = cfs.getTracker().tryModify(Arrays.asList(sstable), OperationType.SCRUB);
+                 Scrubber scrubber = new Scrubber(cfs, txn, false, true))
+            {
+                // with skipCorrupted == false, scrub is expected to throw on the corruption
+                scrubber.scrub();
+                fail("Expected a CorruptSSTableException to be thrown");
+            }
+            catch (IOError err)
+            {
+            }
         }
 
-        try (LifecycleTransaction txn = cfs.getTracker().tryModify(Collections.singletonList(sstable), OperationType.SCRUB);
+        try (LifecycleTransaction txn = cfs.getTracker().tryModify(ImmutableList.of(sstable), OperationType.SCRUB);
              Scrubber scrubber = new Scrubber(cfs, txn, true, true))
         {
             // with skipCorrupted == true, the corrupt row will be skipped
@@ -269,8 +335,8 @@ public void testScrubCorruptedRowInSmallFile() throws IOException, WriteTimeoutE
         }
 
         assertEquals(1, cfs.getLiveSSTables().size());
-        // verify that we can read all of the rows, and there is now one less row
-        assertOrderedAll(cfs, 1);
+        // verify that we can read all of the rows, and there is now the expected number of rows
+        assertOrderedAll(cfs, expectedPartitions);
     }
 
     @Test
@@ -346,14 +412,14 @@ public void testScrubNoIndex() throws ExecutionException, InterruptedException,
     }
 
     @Test
-    public void testScrubOutOfOrder()
+    public void testScrubOutOfOrder() throws IOException
     {
         // This test assumes ByteOrderPartitioner to create out-of-order SSTable
         IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner();
         DatabaseDescriptor.setPartitionerUnsafe(new ByteOrderedPartitioner());
 
         // Create out-of-order SSTable
-        File tempDir = FileUtils.createTempFile("ScrubTest.testScrubOutOfOrder", "").getParentFile();
+        File tempDir = Files.createTempDirectory("ScrubTest.testScrubOutOfOrder").toFile();
         // create ks/cf directory
         File tempDataDir = new File(tempDir, String.join(File.separator, ksName, CF));
         assertTrue(tempDataDir.mkdirs());
@@ -420,6 +486,11 @@ public void testScrubOutOfOrder()
     }
 
     private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2) throws IOException
+    {
+        overrideWithGarbage(sstable, key1, key2, (byte) 'z');
+    }
+
+    private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2, byte junk) throws IOException
     {
         boolean compression = Boolean.parseBoolean(System.getProperty("cassandra.test.compression", "false"));
         long startPosition, endPosition;
@@ -429,9 +500,9 @@ private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuf
             CompressionMetadata compData = CompressionMetadata.create(sstable.getFilename());
 
             CompressionMetadata.Chunk chunk1 = compData.chunkFor(
-                    sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
+            sstable.getPosition(PartitionPosition.ForKey.get(key1, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
             CompressionMetadata.Chunk chunk2 = compData.chunkFor(
-                    sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
+            sstable.getPosition(PartitionPosition.ForKey.get(key2, sstable.getPartitioner()), SSTableReader.Operator.EQ).position);
 
             startPosition = Math.min(chunk1.offset, chunk2.offset);
             endPosition = Math.max(chunk1.offset + chunk1.length, chunk2.offset + chunk2.length);
@@ -446,18 +517,31 @@ private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuf
             endPosition = Math.max(row0Start, row1Start);
         }
 
-        overrideWithGarbage(sstable, startPosition, endPosition);
+        overrideWithGarbage(sstable, startPosition, endPosition, junk);
     }
 
     private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition) throws IOException
     {
-        try (RandomAccessFile file = new RandomAccessFile(sstable.getFilename(), "rw"))
+        overrideWithGarbage(sstable, startPosition, endPosition, (byte) 'z');
+    }
+
+    private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition, byte junk) throws IOException
+    {
+        overrideWithGarbage(sstable.getFilename(), startPosition, endPosition, junk);
+    }
+
+    private void overrideWithGarbage(String path, long startPosition, long endPosition, byte junk) throws IOException
+    {
+        try (RandomAccessFile file = new RandomAccessFile(path, "rw"))
         {
             file.seek(startPosition);
-            file.writeBytes(StringUtils.repeat('z', (int) (endPosition - startPosition)));
+            int length = (int) (endPosition - startPosition);
+            byte[] buff = new byte[length];
+            Arrays.fill(buff, junk);
+            file.write(buff, 0, length);
         }
         if (ChunkCache.instance != null)
-            ChunkCache.instance.invalidateFile(sstable.getFilename());
+            ChunkCache.instance.invalidateFile(path);
     }
 
     private static void assertOrderedAll(ColumnFamilyStore cfs, int expectedSize)
@@ -494,10 +578,10 @@ protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable)
         cfs.forceBlockingFlush();
     }
 
-    public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long ... values)
+    public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long... values)
     {
         assertEquals(0, values.length % 2);
-        for (int i = 0; i < values.length; i +=2)
+        for (int i = 0; i < values.length; i += 2)
         {
             UpdateBuilder builder = UpdateBuilder.create(cfs.metadata(), String.valueOf(i));
             if (composite)
@@ -518,17 +602,23 @@ public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long ..
         cfs.forceBlockingFlush();
     }
 
-    protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
+    protected String[] fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
     {
+        SortedSet<String> tokenSorted = Sets.newTreeSet(Comparator.comparing(a -> cfs.getPartitioner()
+                                                                                     .decorateKey(ByteBufferUtil.bytes(a))));
+
         for (int i = 0; i < partitionsPerSSTable; i++)
         {
             PartitionUpdate update = UpdateBuilder.create(cfs.metadata(), String.valueOf(i))
                                                   .newRow("r1").add("val", 100L)
                                                   .build();
+            tokenSorted.add(String.valueOf(i));
             new CounterMutation(new Mutation(update), ConsistencyLevel.ONE).apply();
         }
 
         cfs.forceBlockingFlush();
+
+        return tokenSorted.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
     }
 
     @Test
@@ -624,18 +714,18 @@ public void testScrubTwice() throws IOException, ExecutionException, Interrupted
     }
 
     @SuppressWarnings("SameParameterValue")
-    private void testScrubIndex(String cfName, String colName, boolean composite, boolean ... scrubs)
-            throws IOException, ExecutionException, InterruptedException
+    private void testScrubIndex(String cfName, String colName, boolean composite, boolean... scrubs)
+    throws IOException, ExecutionException, InterruptedException
     {
         CompactionManager.instance.disableAutoCompaction();
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfName);
 
         int numRows = 1000;
-        long[] colValues = new long [numRows * 2]; // each row has two columns
-        for (int i = 0; i < colValues.length; i+=2)
+        long[] colValues = new long[numRows * 2]; // each row has two columns
+        for (int i = 0; i < colValues.length; i += 2)
         {
             colValues[i] = (i % 4 == 0 ? 1L : 2L); // index column
-            colValues[i+1] = 3L; //other column
+            colValues[i + 1] = 3L; //other column
         }
         fillIndexCF(cfs, composite, colValues);
 
@@ -646,7 +736,7 @@ private void testScrubIndex(String cfName, String colName, boolean composite, bo
         // scrub index
         Set<ColumnFamilyStore> indexCfss = cfs.indexManager.getAllIndexColumnFamilyStores();
         assertEquals(1, indexCfss.size());
-        for(ColumnFamilyStore indexCfs : indexCfss)
+        for (ColumnFamilyStore indexCfs : indexCfss)
         {
             for (int i = 0; i < scrubs.length; i++)
             {
@@ -655,11 +745,11 @@ private void testScrubIndex(String cfName, String colName, boolean composite, bo
                 { //make sure the next scrub fails
                     overrideWithGarbage(indexCfs.getLiveSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L));
                 }
-                CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false,0);
+                CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false, 0);
                 assertEquals(failure ?
                              CompactionManager.AllSSTableOpStatus.ABORTED :
                              CompactionManager.AllSSTableOpStatus.SUCCESSFUL,
-                                result);
+                             result);
             }
         }
 
@@ -804,7 +894,8 @@ public void testNoSkipScrubCorruptedCounterRowWithTool() throws IOException, Wri
             ToolRunner.invokeClass(StandaloneScrubber.class, ksName, COUNTER_CF);
             fail("Expected a CorruptSSTableException to be thrown");
         }
-        catch (IOError err) {
+        catch (IOError err)
+        {
             assertTrue(err.getCause() instanceof CorruptSSTableException);
         }
     }
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
index 1895653ccd0b..30b1d5ca3221 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
@@ -23,14 +23,17 @@
 import java.util.*;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.UpdateBuilder;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.DeletionTime;
@@ -620,7 +623,7 @@ private void testAbortHelper(boolean earlyException, boolean offline)
              CompactionController controller = new CompactionController(cfs, compacting, 0);
              LifecycleTransaction txn = offline ? LifecycleTransaction.offline(OperationType.UNKNOWN, compacting)
                                        : cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(txn, 100, 10000000, false, true);
+             SSTableRewriter rewriter = new SSTableRewriter(txn, 100, 10000000, offline, true);
              CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, Collections.singletonList(scanner), controller, FBUtilities.nowInSeconds(), UUIDGen.getTimeUUID())
         )
         {
@@ -812,42 +815,71 @@ public void testTwoWriters()
     }
 
     @Test
-    public void testCanonicalSSTables() throws ExecutionException, InterruptedException
+    public void testCanonicalSSTablesWithEarlyOpen() throws ExecutionException, InterruptedException
     {
-        Keyspace keyspace = Keyspace.open(KEYSPACE);
-        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
-        truncate(cfs);
+        testCanonicalSSTables(1);
+    }
 
-        cfs.addSSTable(writeFile(cfs, 100));
-        Collection<SSTableReader> allSSTables = cfs.getLiveSSTables();
-        assertEquals(1, allSSTables.size());
-        final AtomicBoolean done = new AtomicBoolean(false);
-        final AtomicBoolean failed = new AtomicBoolean(false);
-        Runnable r = () -> {
-            while (!done.get())
-            {
-                Iterable<SSTableReader> sstables = cfs.getSSTables(SSTableSet.CANONICAL);
-                if (Iterables.size(sstables) != 1)
+    @Test
+    public void testCanonicalSSTablesWithFinalEarlyOpen() throws ExecutionException, InterruptedException
+    {
+        testCanonicalSSTables(1000000);
+    }
+
+    @Test
+    @Ignore // This does not currently work. See View.select.
+    public void testCanonicalSSTablesNoEarlyOpen() throws ExecutionException, InterruptedException
+    {
+        testCanonicalSSTables(-1);
+    }
+
+
+    public void testCanonicalSSTables(int preemptiveOpenInterval) throws ExecutionException, InterruptedException
+    {
+        int prevPreemptiveOpenInterval = DatabaseDescriptor.getSSTablePreemptiveOpenIntervalInMB();
+        try
+        {
+            DatabaseDescriptor.setSSTablePreemptiveOpenIntervalInMB(preemptiveOpenInterval);
+            Keyspace keyspace = Keyspace.open(KEYSPACE);
+            final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
+            truncate(cfs);
+
+            cfs.addSSTable(writeFile(cfs, 2000));
+            Collection<SSTableReader> allSSTables = cfs.getLiveSSTables();
+            assertEquals(1, allSSTables.size());
+            final AtomicBoolean done = new AtomicBoolean(false);
+            final AtomicBoolean gotZero = new AtomicBoolean(false);
+            final AtomicInteger maxValue = new AtomicInteger(0);
+            Runnable r = () -> {
+                while (!done.get())
                 {
-                    failed.set(true);
-                    return;
+                    Iterable<SSTableReader> sstables = cfs.getSSTables(SSTableSet.CANONICAL);
+                    int sstablesCount = Iterables.size(sstables);
+                    if (sstablesCount == 0)
+                        gotZero.set(true);
+                    else
+                        maxValue.updateAndGet(prev -> Math.max(prev, sstablesCount));
                 }
+            };
+            Thread t = NamedThreadFactory.createThread(r);
+            try
+            {
+                t.start();
+                cfs.forceMajorCompaction();
             }
-        };
-        Thread t = NamedThreadFactory.createThread(r);
-        try
-        {
-            t.start();
-            cfs.forceMajorCompaction();
+            finally
+            {
+                done.set(true);
+                t.join(20);
+            }
+            // Note: the checks below can falsely succeed. Flaky failures should be treated as genuine problems.
+            assertFalse("No sstables", gotZero.get());
+            assertEquals("Too many sstables", 1, maxValue.get());
         }
         finally
         {
-            done.set(true);
-            t.join(20);
+            DatabaseDescriptor.setSSTablePreemptiveOpenIntervalInMB(prevPreemptiveOpenInterval);
         }
-        assertFalse(failed.get());
-
-
     }
 
     /**

From 6e7c1287a80e2484f8c4b09dd9775ed39edd1dde Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 17 Jun 2021 03:03:36 -0700
Subject: [PATCH 019/151] =?UTF-8?q?STAR-583=20Fix=20infinite=20loop=20when?=
 =?UTF-8?q?=20replaying=20a=20truncated=20commit=20log=20file=E2=80=A6=20(?=
 =?UTF-8?q?#194)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

STAR-583 Fix infinite loop when replaying a truncated commit log file and truncation is tolerated

Co-authored-by: Massimiliano Tomassi <maxtomassi@users.noreply.github.com>
(cherry picked from commit 2bf6dd4e31c48a3567cb3cd4eb515da8005d8e10)
---
 .../cassandra/db/commitlog/CommitLog.java     |  6 +--
 .../db/commitlog/CommitLogSegmentReader.java  |  9 +++-
 .../cassandra/db/commitlog/CommitLogTest.java | 43 +++++++++++++++++--
 3 files changed, 50 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
index a32b8a1030e7..3fdfaf44c127 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java
@@ -200,10 +200,10 @@ public int recoverFiles(File... clogs) throws IOException
         return replayer.blockForWrites();
     }
 
-    public void recoverPath(String path) throws IOException
+    public void recoverPath(String path, boolean tolerateTruncation) throws IOException
     {
         CommitLogReplayer replayer = CommitLogReplayer.construct(this, getLocalHostId());
-        replayer.replayPath(new File(path), false);
+        replayer.replayPath(new File(path), tolerateTruncation);
         replayer.blockForWrites();
     }
 
@@ -217,7 +217,7 @@ private static UUID getLocalHostId()
      */
     public void recover(String path) throws IOException
     {
-        recoverPath(path);
+        recoverPath(path, false);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java
index e23a915ba355..de4f135583ac 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogSegmentReader.java
@@ -87,9 +87,9 @@ protected SyncSegment computeNext()
         {
             while (true)
             {
+                final int currentStart = end;
                 try
                 {
-                    final int currentStart = end;
                     end = readSyncMarker(descriptor, currentStart, reader);
                     if (end == -1)
                     {
@@ -133,6 +133,13 @@ protected SyncSegment computeNext()
                         throw new RuntimeException(ioe);
                     }
                 }
+
+                // if we've not been able to read the sync marker, or the file is truncated,
+                // then return end of data, otherwise continue the loop
+                if (currentStart == end)
+                {
+                    return endOfData();
+                }
             }
         }
     }
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
index da3b83ee6ce5..86a47febc74b 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
@@ -74,8 +74,10 @@
 import org.junit.After;
 
 import static org.apache.cassandra.db.commitlog.CommitLogSegment.ENTRY_OVERHEAD_SIZE;
+import static org.apache.cassandra.db.commitlog.CommitLogSegment.SYNC_MARKER_SIZE;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
 
+import static org.apache.cassandra.utils.FBUtilities.updateChecksumInt;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -262,6 +264,28 @@ public void testRecoveryWithShortSize() throws Exception
         }, CommitLogReplayException.class);
     }
 
+    @Test
+    public void testRecoveryWithTruncatedFileAndTruncationToleration() throws Exception
+    {
+        CommitLogDescriptor desc = new CommitLogDescriptor(CommitLogDescriptor.current_version,
+                                                           CommitLogSegment.getNextId(),
+                                                           DatabaseDescriptor.getCommitLogCompression(),
+                                                           DatabaseDescriptor.getEncryptionContext());
+
+        byte[] randomData = new byte[100];
+        (new java.util.Random()).nextBytes(randomData);
+
+        // Simulates a truncated log segment section by writing a segment section marker with a section end offset
+        // that is greater than the log file size.
+        // 
+        // This is achieved by using a data length greater than the actual data contents, which will be used when
+        // writing the segment marker.
+        int dataLength = randomData.length * 2;
+        
+        // Recovery should succeed when truncation toleration is specified
+        testRecovery(desc, randomData, dataLength, true);
+    }
+
     @Test
     public void testRecoveryWithShortMutationSize() throws Exception
     {
@@ -595,20 +619,31 @@ protected Void testRecovery(byte[] logData, int version) throws Exception
         return null;
     }
 
-    protected Void testRecovery(CommitLogDescriptor desc, byte[] logData) throws Exception
+    protected Void testRecovery(CommitLogDescriptor desc, byte[] logData, int dataLength, boolean tolerateTruncation) throws Exception
     {
         File logFile = tmpFile(desc.version);
         CommitLogDescriptor fromFile = CommitLogDescriptor.fromFileName(logFile.getName());
         // Change id to match file.
         desc = new CommitLogDescriptor(desc.version, fromFile.id, desc.compression, desc.getEncryptionContext());
+
         ByteBuffer buf = ByteBuffer.allocate(1024);
         CommitLogDescriptor.writeHeader(buf, desc, getAdditionalHeaders(desc.getEncryptionContext()));
+
+        // Write a section marker using the given data length
+        CommitLogSegment.writeSyncMarker(fromFile.id, buf, buf.position(), buf.position(), buf.position() + SYNC_MARKER_SIZE + dataLength);
+        
+        // Update buffer position for sync marker
+        buf.position(buf.position() + SYNC_MARKER_SIZE);
+
+        // Add data to byte buffer
+        buf.put(logData);
+        
         try (OutputStream lout = new FileOutputStream(logFile))
         {
             lout.write(buf.array(), 0, buf.position());
-            lout.write(logData);
+
             //statics make it annoying to test things correctly
-            CommitLog.instance.recover(logFile.getPath()); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure*/
+            CommitLog.instance.recoverPath(logFile.getPath(), tolerateTruncation); //CASSANDRA-1119 / CASSANDRA-1179 throw on failure
         }
         return null;
     }
@@ -636,7 +671,7 @@ public void testRecoveryWithBadCompressor() throws Exception
     {
         CommitLogDescriptor desc = new CommitLogDescriptor(4, new ParameterizedClass("UnknownCompressor", null), EncryptionContextGenerator.createDisabledContext());
         runExpecting(() -> {
-            testRecovery(desc, new byte[0]);
+            testRecovery(desc, new byte[0], 0, false);
             return null;
         }, CommitLogReplayException.class);
     }

From 5d5368c719ed39b47d49ca3ef1ef0810e80a7e15 Mon Sep 17 00:00:00 2001
From: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Date: Tue, 20 Feb 2018 11:41:22 +0800
Subject: [PATCH 020/151] STAR-593 Harden txn log files against exceptions

Harden txn log files against exceptions when adding records and improve log messages

Port of riptano/apollo@83b93bc434dc2f07371e848a9b24854403bf740e

(cherry picked from commit 8175eb82b5f7c31ee77dfe13ac57eb40a647c775)
---
 .../cassandra/db/lifecycle/LogReplicaSet.java | 21 ++++++++++++++++---
 .../db/lifecycle/LogTransaction.java          |  4 +++-
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java
index 0295357e8f0f..6a07392217b7 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogReplicaSet.java
@@ -222,13 +222,28 @@ void printContentsWithAnyErrors(StringBuilder str)
      */
     void append(LogRecord record)
     {
-        Throwable err = Throwables.perform(null, replicas().stream().map(r -> () -> r.append(record)));
+        Throwable err = null;
+        int failed = 0;
+        for (LogReplica replica : replicas())
+        {
+            try
+            {
+                replica.append(record);
+            }
+            catch (Throwable t)
+            {
+                logger.warn("Failed to add record to a replica: {}", t.getMessage());
+                err = Throwables.merge(err, t);
+                failed++;
+            }
+        }
+
         if (err != null)
         {
-            if (!record.isFinal() || err.getSuppressed().length == replicas().size() -1)
+            if (!record.isFinal() || failed == replicas().size())
                 Throwables.maybeFail(err);
 
-            logger.error("Failed to add record '{}' to some replicas '{}'", record, this);
+            logger.error("Failed to add record '{}' to some replicas '{}'", record, this, err);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
index 85df4d64e04f..fd916864a879 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
@@ -548,7 +548,9 @@ static boolean removeUnfinishedLeftovers(Map.Entry<String, List<File>> entry)
                 }
                 else
                 {
-                    logger.error("Unexpected disk state: failed to read transaction log {}", txn.toString(true));
+                    logger.error("Unexpected disk state: failed to read transaction log {}, " +
+                                 "check logs before last shutdown for any errors, and ensure txn log files were not edited manually.",
+                                 txn.toString(true));
                     return false;
                 }
             }

From 07666e00d0c9c76bae051a66c766592dc30f8d9b Mon Sep 17 00:00:00 2001
From: Ruslan Fomkin <ruslan.fomkin@datastax.com>
Date: Wed, 16 Jun 2021 15:19:53 +0200
Subject: [PATCH 021/151] STAR-593 test patched LogReplicaSet.append

The ported bug-fix patch changes the implementation of
LogReplicaSet.append in the error case; however, no tests exist for it.
This commit adds tests to cover the error path.

It also bumps JUnit from 4.12 to 4.13 in order to use Assert.assertThrows.

(cherry picked from commit ca66e38f764963d9bf4c1ea8018cb8e4cdad0cef)
---
 build.xml                                     |  2 +-
 relocate-dependencies.pom                     |  2 +-
 .../db/lifecycle/LogReplicationSetTest.java   | 78 +++++++++++++++++++
 3 files changed, 80 insertions(+), 2 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java

diff --git a/build.xml b/build.xml
index 95b237ee16c3..884d9cd60e51 100644
--- a/build.xml
+++ b/build.xml
@@ -523,7 +523,7 @@
           <dependency groupId="com.boundary" artifactId="high-scale-lib" version="1.0.6"/>
           <dependency groupId="com.github.jbellis" artifactId="jamm" version="${jamm.version}"/>
           <dependency groupId="org.yaml" artifactId="snakeyaml" version="1.26"/>
-          <dependency groupId="junit" artifactId="junit" version="4.12" scope="test">
+          <dependency groupId="junit" artifactId="junit" version="4.13" scope="test">
             <exclusion groupId="org.hamcrest" artifactId="hamcrest-core"/>
           </dependency>
           <dependency groupId="org.mockito" artifactId="mockito-core" version="3.2.4" scope="test"/>
diff --git a/relocate-dependencies.pom b/relocate-dependencies.pom
index 07728dd405ae..d7c9150dda7d 100644
--- a/relocate-dependencies.pom
+++ b/relocate-dependencies.pom
@@ -36,7 +36,7 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
 
-        <junit.version>4.12</junit.version>
+        <junit.version>4.13</junit.version>
         <maven.compiler.source>${java.version}</maven.compiler.source>
         <maven.compiler.target>${java.version}</maven.compiler.target>
         <dtest-local.version>4.0.0-SNAPSHOT</dtest-local.version>
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java
new file mode 100644
index 000000000000..6baccd6af1d1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.lifecycle;
+
+import java.io.File;
+import java.util.ArrayList;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.FileUtils;
+import org.mockito.Mockito;
+
+public class LogReplicationSetTest
+{
+    @Test
+    public void shouldThrowIfAppendFailedToAllReplicas() throws Throwable
+    {
+        int nrReplicas = 2;
+        LogReplicaSet replicas = new LogReplicaSet();
+        ArrayList<File> spyFiles = getSpyFiles("testAppendFailedToAll", nrReplicas);
+
+        replicas.addReplicas(spyFiles);
+        spyFiles.forEach(f -> Mockito.when(f.exists()).thenThrow(new RuntimeException()));
+
+        Assert.assertThrows(RuntimeException.class,
+                            () ->
+                            replicas.append(LogRecord.makeAbort(System.currentTimeMillis())));
+    }
+
+    @Test
+    public void shouldNotThrowIfAppendFailedToSomeReplicas() throws Throwable
+    {
+        int nrReplicas = 2;
+        LogReplicaSet replicas = new LogReplicaSet();
+        ArrayList<File> spyFiles = getSpyFiles("testAppendFailedToSome", nrReplicas);
+
+        replicas.addReplicas(spyFiles);
+        Mockito.when(spyFiles.get(0).exists()).thenThrow(new RuntimeException());
+    }
+
+    private ArrayList<File> getSpyFiles(String testName, int nrReplicas)
+    {
+        ArrayList<File> files = new ArrayList<>(nrReplicas);
+        for (int i = 0; i < nrReplicas; i++)
+        {
+            files.add(Mockito.spy(createTempFile(testName, i)));
+        }
+        return files;
+    }
+
+    private static File createTempFile(String testName, int id)
+    {
+        String prefix = String.format("%s_%d", testName, id);
+        File dir = new File(FileUtils.getTempDir(), prefix);
+
+        FileUtils.createDirectory(dir);
+        File file = FileUtils.createTempFile(prefix, "tmp", dir);
+
+        file.deleteOnExit();
+        dir.deleteOnExit();
+        return file;
+    }
+}

From 06d7704df6182355dce4b53f1f390f74804f77b9 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 14:52:05 +0100
Subject: [PATCH 022/151] CORE-92: Remove component number argument from
 getComparatorSize

That parameter was misleading: those methods do not use it at all.

Also refactored DynamicCompositeTypeTest a bit.

(cherry picked from commit b0dd03ebc8371f67a7098b46c5de60443e4db5fe)
(cherry picked from commit 0779d149067647f6e57edf83bdd192caeba11254)
---
 .../db/marshal/AbstractCompositeType.java     | 13 +++---
 .../cassandra/db/marshal/CompositeType.java   |  2 +-
 .../db/marshal/DynamicCompositeType.java      |  3 +-
 .../db/marshal/DynamicCompositeTypeTest.java  | 41 ++++++++-----------
 4 files changed, 26 insertions(+), 33 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
index 24d283457e99..86ac00f2fe73 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractCompositeType.java
@@ -62,8 +62,8 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         while (!accessorL.isEmptyFromOffset(left, offsetL) && !accessorR.isEmptyFromOffset(right, offsetL))
         {
             AbstractType<?> comparator = getComparator(i, left, accessorL, right, accessorR, offsetL, offsetR);
-            offsetL += getComparatorSize(i, left, accessorL, offsetL);
-            offsetR += getComparatorSize(i, right, accessorR, offsetR);
+            offsetL += getComparatorSize(left, accessorL, offsetL);
+            offsetR += getComparatorSize(right, accessorR, offsetR);
 
             VL value1 = accessorL.sliceWithShortLength(left, offsetL);
             offsetL += accessorL.sizeWithShortLength(value1);
@@ -106,10 +106,9 @@ public ByteBuffer[] split(ByteBuffer bb)
         boolean isStatic = readIsStatic(bb, ByteBufferAccessor.instance);
         int offset = startingOffset(isStatic);
 
-        int i = 0;
         while (!ByteBufferAccessor.instance.isEmptyFromOffset(bb, offset))
         {
-            offset += getComparatorSize(i++, bb, ByteBufferAccessor.instance, offset);
+            offset += getComparatorSize(bb, ByteBufferAccessor.instance, offset);
             ByteBuffer value = ByteBufferAccessor.instance.sliceWithShortLength(bb, offset);
             offset += ByteBufferAccessor.instance.sizeWithShortLength(value);
             l.add(value);
@@ -188,7 +187,7 @@ public <V> String getString(V input, ValueAccessor<V> accessor)
                 sb.append(":");
 
             AbstractType<?> comparator = getAndAppendComparator(i, input, accessor, sb, offset);
-            offset += getComparatorSize(i, input, accessor, offset);
+            offset += getComparatorSize(input, accessor, offset);
             V value = accessor.sliceWithShortLength(input, offset);
             offset += accessor.sizeWithShortLength(value);
 
@@ -285,7 +284,7 @@ public  <V> void validate(V input, ValueAccessor<V> accessor)
         while (!accessor.isEmptyFromOffset(input, offset))
         {
             AbstractType<?> comparator = validateComparator(i, input, accessor, offset);
-            offset += getComparatorSize(i, input, accessor, offset);
+            offset += getComparatorSize(input, accessor, offset);
 
             if (accessor.sizeFromOffset(input, offset) < 2)
                 throw new MarshalException("Not enough bytes to read value size of component " + i);
@@ -317,7 +316,7 @@ public TypeSerializer<ByteBuffer> getSerializer()
         return BytesSerializer.instance;
     }
 
-    abstract protected <V> int getComparatorSize(int i, V value, ValueAccessor<V> accessor, int offset);
+    abstract protected <V> int getComparatorSize(V value, ValueAccessor<V> accessor, int offset);
     /**
      * @return the comparator for the given component. static CompositeType will consult
      * @param i DynamicCompositeType will read the type information from @param bb
diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
index bf5e914a9d9e..d8e0ac7b79e2 100644
--- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
@@ -177,7 +177,7 @@ protected <V> AbstractType<?> validateComparator(int i, V value, ValueAccessor<V
         return types.get(i);
     }
 
-    protected <V> int getComparatorSize(int i, V value, ValueAccessor<V> accessor, int offset)
+    protected <V> int getComparatorSize(V value, ValueAccessor<V> accessor, int offset)
     {
         return 0;
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
index 5df36009956e..e0377fd5396b 100644
--- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
@@ -94,7 +94,7 @@ protected int startingOffset(boolean isStatic)
         return 0;
     }
 
-    protected <V> int getComparatorSize(int i, V value, ValueAccessor<V> accessor, int offset)
+    protected <V> int getComparatorSize(V value, ValueAccessor<V> accessor, int offset)
     {
         int header = accessor.getShort(value, offset);
         if ((header & 0x8000) == 0)
@@ -114,7 +114,6 @@ private <V> AbstractType<?> getComparator(V value, ValueAccessor<V> accessor, in
             int header = accessor.getShort(value, offset);
             if ((header & 0x8000) == 0)
             {
-
                 String name = accessor.toString(accessor.slice(value, offset + 2, header));
                 return TypeParser.parse(name);
             }
diff --git a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
index 9f8eec3c21c7..1de4f20f3c95 100644
--- a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java
@@ -20,11 +20,13 @@
 
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.UUID;
+import java.util.stream.Stream;
 
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableMap;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import static org.junit.Assert.fail;
@@ -45,25 +47,18 @@ public class DynamicCompositeTypeTest
 {
     private static final String KEYSPACE1 = "DynamicCompositeType";
     private static final String CF_STANDARDDYNCOMPOSITE = "StandardDynamicComposite";
-    private static Map<Byte, AbstractType<?>> aliases = new HashMap<>();
 
-    private static final DynamicCompositeType comparator;
-    static
-    {
-        aliases.put((byte)'b', BytesType.instance);
-        aliases.put((byte)'B', ReversedType.getInstance(BytesType.instance));
-        aliases.put((byte)'t', TimeUUIDType.instance);
-        aliases.put((byte)'T', ReversedType.getInstance(TimeUUIDType.instance));
-        comparator = DynamicCompositeType.getInstance(aliases);
-    }
+    public final static Map<Byte, AbstractType<?>> aliases = ImmutableMap.<Byte, AbstractType<?>>builder()
+                                                             .put((byte) 'b', BytesType.instance)
+                                                             .put((byte) 'B', ReversedType.getInstance(BytesType.instance))
+                                                             .put((byte) 't', TimeUUIDType.instance)
+                                                             .put((byte) 'T', ReversedType.getInstance(TimeUUIDType.instance))
+                                                             .build();
 
-    private static final int UUID_COUNT = 3;
-    private static final UUID[] uuids = new UUID[UUID_COUNT];
-    static
-    {
-        for (int i = 0; i < UUID_COUNT; ++i)
-            uuids[i] = UUIDGen.getTimeUUID();
-    }
+    public static final DynamicCompositeType comparator = DynamicCompositeType.getInstance(aliases);
+
+    public static final int UUID_COUNT = 3;
+    public static final UUID[] uuids = Stream.generate(UUIDGen::getTimeUUID).limit(UUID_COUNT).toArray(UUID[]::new);
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
@@ -323,13 +318,13 @@ private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean
         return createDynamicCompositeKey(s, uuid, i, lastIsOne, false);
     }
 
-    private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne,
-            final boolean reversed)
+    @VisibleForTesting
+    public static ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne, boolean reversed)
     {
         String intType = (reversed ? "ReversedType(IntegerType)" : "IntegerType");
-        ByteBuffer bytes = ByteBufferUtil.bytes(s);
+        ByteBuffer bytes = s != null ? ByteBufferUtil.bytes(s) : null;
         int totalSize = 0;
-        if (s != null)
+        if (bytes != null)
         {
             totalSize += 2 + 2 + bytes.remaining() + 1;
             if (uuid != null)
@@ -344,7 +339,7 @@ private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean
 
         ByteBuffer bb = ByteBuffer.allocate(totalSize);
 
-        if (s != null)
+        if (bytes != null)
         {
             bb.putShort((short)(0x8000 | (reversed ? 'B' : 'b')));
             bb.putShort((short) bytes.remaining());

From 9b9775bcc46bc243728c874db6d84a0ff9764739 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 14:59:56 +0100
Subject: [PATCH 023/151] CORE-92: Remove misleading method readCollectionSize
 from CollectionSerializer

That method is misleading because it accepts a byte buffer as an input source
but, unlike any other byte buffer reading method, it does not advance the buffer
position.

(cherry picked from commit 1ffd9284de0925e753c5a6ac45c4348699cf5f01)
(cherry picked from commit 2be1e6051d094cfc6ec307de5f6853ff70655a64)
---
 src/java/org/apache/cassandra/cql3/CQL3Type.java             | 2 +-
 src/java/org/apache/cassandra/db/marshal/ListType.java       | 2 +-
 src/java/org/apache/cassandra/db/marshal/MapType.java        | 2 +-
 .../apache/cassandra/serializers/CollectionSerializer.java   | 5 -----
 src/java/org/apache/cassandra/serializers/MapSerializer.java | 4 ++--
 src/java/org/apache/cassandra/serializers/SetSerializer.java | 4 ++--
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/CQL3Type.java b/src/java/org/apache/cassandra/cql3/CQL3Type.java
index 5059104446e5..421b44684e6d 100644
--- a/src/java/org/apache/cassandra/cql3/CQL3Type.java
+++ b/src/java/org/apache/cassandra/cql3/CQL3Type.java
@@ -196,7 +196,7 @@ public String toCQLLiteral(ByteBuffer buffer, ProtocolVersion version)
 
             StringBuilder target = new StringBuilder();
             buffer = buffer.duplicate();
-            int size = CollectionSerializer.readCollectionSize(buffer, version);
+            int size = CollectionSerializer.readCollectionSize(buffer, ByteBufferAccessor.instance, version);
             buffer.position(buffer.position() + CollectionSerializer.sizeOfCollectionSize(size, version));
 
             switch (type.kind)
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java
index cc6393751bcc..cee3cd2c4c77 100644
--- a/src/java/org/apache/cassandra/db/marshal/ListType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ListType.java
@@ -245,7 +245,7 @@ public static String setOrListToJsonString(ByteBuffer buffer, AbstractType eleme
     {
         ByteBuffer value = buffer.duplicate();
         StringBuilder sb = new StringBuilder("[");
-        int size = CollectionSerializer.readCollectionSize(value, protocolVersion);
+        int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion);
         int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion);
         for (int i = 0; i < size; i++)
         {
diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java
index 9473e2913618..991ae08048a6 100644
--- a/src/java/org/apache/cassandra/db/marshal/MapType.java
+++ b/src/java/org/apache/cassandra/db/marshal/MapType.java
@@ -286,7 +286,7 @@ public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion)
     {
         ByteBuffer value = buffer.duplicate();
         StringBuilder sb = new StringBuilder("{");
-        int size = CollectionSerializer.readCollectionSize(value, protocolVersion);
+        int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion);
         int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion);
         for (int i = 0; i < size; i++)
         {
diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
index eb2991b8d78c..204261d46fd7 100644
--- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java
@@ -91,11 +91,6 @@ protected static void writeCollectionSize(ByteBuffer output, int elements, Proto
         output.putInt(elements);
     }
 
-    public static int readCollectionSize(ByteBuffer input, ProtocolVersion version)
-    {
-        return readCollectionSize(input, ByteBufferAccessor.instance, version);
-    }
-
     public static <V> int readCollectionSize(V value, ValueAccessor<V> accessor, ProtocolVersion version)
     {
         return accessor.toInt(value);
diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java
index 9eae598003ba..867308404336 100644
--- a/src/java/org/apache/cassandra/serializers/MapSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java
@@ -149,7 +149,7 @@ public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer key, Abst
         try
         {
             ByteBuffer input = collection.duplicate();
-            int n = readCollectionSize(input, ProtocolVersion.V3);
+            int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3);
             int offset = sizeOfCollectionSize(n, ProtocolVersion.V3);
             for (int i = 0; i < n; i++)
             {
@@ -185,7 +185,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection,
         try
         {
             ByteBuffer input = collection.duplicate();
-            int n = readCollectionSize(input, ProtocolVersion.V3);
+            int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3);
             input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3));
             int startPos = input.position();
             int count = 0;
diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java
index 0b7a2a5fa2ec..aae78d861ec7 100644
--- a/src/java/org/apache/cassandra/serializers/SetSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java
@@ -157,7 +157,7 @@ public ByteBuffer getSerializedValue(ByteBuffer input, ByteBuffer key, AbstractT
     {
         try
         {
-            int n = readCollectionSize(input, ProtocolVersion.V3);
+            int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3);
             int offset = sizeOfCollectionSize(n, ProtocolVersion.V3);
 
             for (int i = 0; i < n; i++)
@@ -193,7 +193,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection,
         try
         {
             ByteBuffer input = collection.duplicate();
-            int n = readCollectionSize(input, ProtocolVersion.V3);
+            int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3);
             input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3));
             int startPos = input.position();
             int count = 0;

From 8b066116dc7591888859b436b2dd5b7621c05d71 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 13 Nov 2020 12:05:02 +0100
Subject: [PATCH 024/151] CORE-93: Add ByteComparable, ByteSource and related
 stuff

(cherry picked from commit c76194ed765811b14c8e85bb577a9945d41cc1bd)
(cherry picked from commit 3cdb4c0148db34587708f306e67bf9277054bd07)
---
 .../cassandra/utils/ByteComparable.java       | 166 +++++
 .../apache/cassandra/utils/ByteSource.java    | 699 ++++++++++++++++++
 2 files changed, 865 insertions(+)
 create mode 100644 src/java/org/apache/cassandra/utils/ByteComparable.java
 create mode 100644 src/java/org/apache/cassandra/utils/ByteSource.java

diff --git a/src/java/org/apache/cassandra/utils/ByteComparable.java b/src/java/org/apache/cassandra/utils/ByteComparable.java
new file mode 100644
index 000000000000..05e53e682460
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/ByteComparable.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.nio.ByteBuffer;
+
+import static org.apache.cassandra.utils.ByteSource.END_OF_STREAM;
+
+/**
+ * Interface indicating a value can be represented/identified by a comparable {@link ByteSource}.
+ */
+public interface ByteComparable
+{
+    /**
+     * Returns a source that generates the byte-comparable representation of the value byte by byte.
+     */
+    ByteSource asComparableBytes(Version version);
+
+    default ByteSource.Peekable asPeekableBytes(Version version)
+    {
+        return ByteSource.peekable(asComparableBytes(version)); // wraps the source from asComparableBytes
+    }
+
+    enum Version
+    {
+        LEGACY,
+        OSS41,  // CASSANDRA 4.1 encoding, used in trie-based indices
+    }
+
+    ByteComparable EMPTY = (Version version) -> ByteSource.EMPTY; // same empty source for every version
+
+    /**
+     * Construct a human-readable string from the byte-comparable representation. Used for debugging.
+     */
+    default String byteComparableAsString(Version version)
+    {
+        StringBuilder builder = new StringBuilder();
+        ByteSource stream = asComparableBytes(version);
+        if (stream == null)
+            return "null"; // a null source is rendered as the literal text "null"
+        for (int b = stream.next(); b != END_OF_STREAM; b = stream.next()) // two hex digits per byte, high nibble first
+            builder.append(Integer.toHexString((b >> 4) & 0xF)).append(Integer.toHexString(b & 0xF));
+        return builder.toString();
+    }
+
+    // Simple factories used for testing
+
+    static ByteComparable of(String s)
+    {
+        return v -> ByteSource.of(s, v);
+    }
+
+    static ByteComparable of(long value)
+    {
+        return v -> ByteSource.of(value); // the version argument is ignored here
+    }
+
+    static ByteComparable of(int value)
+    {
+        return v -> ByteSource.of(value); // the version argument is ignored here
+    }
+
+    static ByteComparable fixedLength(ByteBuffer bytes)
+    {
+        return v -> ByteSource.fixedLength(bytes);
+    }
+
+    static ByteComparable fixedLength(byte[] bytes)
+    {
+        return v -> ByteSource.fixedLength(bytes);
+    }
+
+    /**
+     * Returns a separator for two byte comparables, i.e. something that is definitely {@code > prevMax} and
+     * {@code <= currMin}, assuming {@code prevMax < currMin}.
+     * This returns the shortest prefix of currMin that is greater than prevMax.
+     */
+    static ByteComparable separatorPrefix(ByteComparable prevMax, ByteComparable currMin)
+    {
+        return version -> ByteSource.separatorPrefix(prevMax.asComparableBytes(version), currMin.asComparableBytes(version));
+    }
+
+    /**
+     * Returns a separator for two byte comparables, i.e. something that is definitely {@code > prevMax} and
+     * {@code <= currMin}, assuming {@code prevMax < currMin}.
+     * This is a stream of length 1 longer than the common prefix of the two streams, with last byte one higher than the
+     * prevMax stream.
+     */
+    static ByteComparable separatorGt(ByteComparable prevMax, ByteComparable currMin)
+    {
+        return version -> ByteSource.separatorGt(prevMax.asComparableBytes(version), currMin.asComparableBytes(version));
+    }
+
+    static ByteComparable cut(ByteComparable src, int cutoff)
+    {
+        return version -> ByteSource.cut(src.asComparableBytes(version), cutoff); // applied lazily, per requested version
+    }
+
+    /**
+     * Return the length of a byte comparable, not including the terminator byte.
+     */
+    static int length(ByteComparable src, Version version)
+    {
+        int l = 0;
+        ByteSource s = src.asComparableBytes(version); // NOTE(review): a null source is not handled here, unlike compare()
+        while (s.next() != END_OF_STREAM)
+            ++l;
+        return l;
+    }
+
+    /**
+     * Compare two byte-comparable values by their byte-comparable representation. Used for tests.
+     *
+     * @return the result of the lexicographic unsigned byte comparison of the byte-comparable representations of the
+     *         two arguments
+     */
+    static int compare(ByteComparable bytes1, ByteComparable bytes2, Version version)
+    {
+        ByteSource s1 = bytes1.asComparableBytes(version);
+        ByteSource s2 = bytes2.asComparableBytes(version);
+
+        if (s1 == null || s2 == null)
+            return Boolean.compare(s1 != null, s2 != null); // null sorts before non-null; two nulls are equal
+
+        while (true)
+        {
+            int b1 = s1.next();
+            int b2 = s2.next();
+            int cmp = Integer.compare(b1, b2); // END_OF_STREAM (-1) is below every byte, so a strict prefix is smaller
+            if (cmp != 0)
+                return cmp;
+            if (b1 == ByteSource.END_OF_STREAM)
+                return 0;
+        }
+    }
+
+    /**
+     * Returns the length of the minimum prefix that differentiates the two given byte-comparable representations.
+     */
+    static int diffPoint(ByteComparable bytes1, ByteComparable bytes2, Version version)
+    {
+        ByteSource s1 = bytes1.asComparableBytes(version);
+        ByteSource s2 = bytes2.asComparableBytes(version);
+        int pos = 1; // 1-based: sources that already differ in their first byte yield 1
+        int b;
+        while ((b = s1.next()) == s2.next() && b != END_OF_STREAM)
+            ++pos;
+        return pos;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/ByteSource.java b/src/java/org/apache/cassandra/utils/ByteSource.java
new file mode 100644
index 000000000000..6326861b28f4
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/ByteSource.java
@@ -0,0 +1,699 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+
+import org.apache.cassandra.utils.ByteComparable.Version;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * A stream of bytes, used for byte-order-comparable representations of data.
+ */
+public interface ByteSource
+{
+    /** Get the next byte, unsigned. Must be between 0 and 255, or END_OF_STREAM if there are no more bytes. */
+    int next();
+
+    /** Value returned if at the end of the stream. */
+    int END_OF_STREAM = -1;
+
+    ByteSource EMPTY = () -> END_OF_STREAM;
+
+    /**
+     * Escape value. Used, among other things, to mark the end of subcomponents (so that a shorter one compares before
+     * anything longer). Actual zeros in input need to be escaped if this is in use (see BufferReinterpreter).
+     */
+    int ESCAPE = 0x00;
+
+    // Zeros are encoded as a sequence of ESCAPE, 0 or more of ESCAPED_0_CONT, ESCAPED_0_DONE so zeroed spaces only grow by 1 byte
+    int ESCAPED_0_CONT = 0xFE;
+    int ESCAPED_0_DONE = 0xFF;
+
+    // All separators must be within these bounds
+    int MIN_SEPARATOR = 0x10;
+    int MAX_SEPARATOR = 0xEF;
+
+    // Next component marker.
+    int NEXT_COMPONENT = 0x40;
+    int NEXT_COMPONENT_NULL = 0x3F;
+    int NEXT_COMPONENT_NULL_REVERSED = 0x41;
+    // Default terminator byte in sequences. Smaller than NEXT_COMPONENT_NULL, but larger than LT_NEXT_COMPONENT to
+    // ensure lexicographic compares go in the correct direction
+    int TERMINATOR = 0x38;
+    // These are special endings, for exclusive/inclusive bounds (i.e. smaller than anything with more components, bigger than anything with more components)
+    int LT_NEXT_COMPONENT = 0x20;
+    int GT_NEXT_COMPONENT = 0x60;
+
+    /**
+     * Reinterprets a byte buffer as a byte-comparable source that has 0s escaped and finishes in an escape.
+     * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences.
+     * (See ByteSource.AbstractReinterpreter/Multi for explanation.)
+     */
+    static ByteSource of(ByteBuffer buf, Version version)
+    {
+        return new BufferReinterpreter(buf, version);
+    }
+
+    /**
+     * Reinterprets a byte array as a byte-comparable source that has 0s escaped and finishes in an escape.
+     * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences.
+     * (See ByteSource.AbstractReinterpreter/Multi for explanation.)
+     */
+    static ByteSource of(byte[] buf, Version version)
+    {
+        return new ReinterpreterArray(buf, version);
+    }
+
+    /**
+     * Combines a chain of sources, turning their weakly-prefix-free byte-comparable representation into the
+     * combination's prefix-free byte-comparable representation, with the included terminator character.
+     * For correctness, the terminator must be within MIN_SEPARATOR..MAX_SEPARATOR and different from
+     * NEXT_COMPONENT+/-1. Typically TERMINATOR, or LT/GT_NEXT_COMPONENT if used for partially specified bounds.
+     */
+    static ByteSource withTerminator(int terminator, ByteSource... srcs)
+    {
+        return new Multi(srcs, terminator);
+    }
+
+    static ByteSource of(String s, Version version)
+    {
+        return new ReinterpreterArray(s.getBytes(StandardCharsets.UTF_8), version); // as of(byte[]), on s's UTF-8 bytes
+    }
+
+    static ByteSource of(long value)
+    {
+        return new Number(value ^ (1L<<63), 8); // sign bit flipped; NOTE(review): 8 is presumably the byte count - confirm
+    }
+
+    static ByteSource of(int value)
+    {
+        return new Number(value ^ (1L<<31), 4); // bit 31 flipped after widening; NOTE(review): 4 presumably = byte count
+    }
+
+    /**
+     * Produce a source for a signed fixed-length number, or {@code null} for an empty buffer.
+     * The first byte has its sign bit inverted, and the rest are passed unchanged.
+     * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and
+     * ensures the representation is prefix-free.
+     */
+    static ByteSource optionalSignedFixedLengthNumber(ByteBuffer b)
+    {
+        return b.hasRemaining() ? signedFixedLengthNumber(b) : null;
+    }
+
+    /**
+     * Produce a source for a signed fixed-length number; unlike {@link #optionalSignedFixedLengthNumber}, an empty
+     * buffer is not mapped to {@code null}. The first byte has its sign bit inverted, and the rest are passed
+     * unchanged. Presumes that the length of the buffer is always constant for the type.
+     */
+    static ByteSource signedFixedLengthNumber(ByteBuffer b)
+    {
+        return new SignedFixedLengthNumber(b);
+    }
+
+    /**
+     * Produce a source for a signed fixed-length floating-point number, or {@code null} for an empty buffer.
+     * If sign bit is on, returns negated bytes. If not, add the sign bit value.
+     * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.)
+     * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and
+     * ensures the representation is prefix-free.
+     */
+    static ByteSource optionalSignedFixedLengthFloat(ByteBuffer b)
+    {
+        return b.hasRemaining() ? signedFixedLengthFloat(b) : null;
+    }
+
+    /**
+     * Produce a source for a signed fixed-length floating-point number; unlike
+     * {@link #optionalSignedFixedLengthFloat}, an empty buffer is not mapped to {@code null}.
+     * If sign bit is on, returns negated bytes. If not, add the sign bit value. (Sign of IEEE floats is the highest
+     * bit, the rest can be compared in magnitude by byte comparison.) Presumes a constant buffer length for the type.
+     */
+    static ByteSource signedFixedLengthFloat(ByteBuffer b)
+    {
+        return new SignedFixedLengthFloat(b);
+    }
+
+    /**
+     * Returns a separator for two byte sources, i.e. something that is definitely {@code > prevMax} and
+     * {@code <= currMin}, assuming {@code prevMax < currMin}.
+     * This returns the shortest prefix of currMin that is greater than prevMax.
+     */
+    public static ByteSource separatorPrefix(ByteSource prevMax, ByteSource currMin)
+    {
+        return new Separator(prevMax, currMin, true);
+    }
+
+    /**
+     * Returns a separator for two byte sources, i.e. something that is definitely {@code > prevMax} and
+     * {@code <= currMin}, assuming {@code prevMax < currMin}.
+     * This is a source of length 1 longer than the common prefix of the two sources, with last byte one higher than the
+     * prevMax source.
+     */
+    public static ByteSource separatorGt(ByteSource prevMax, ByteSource currMin)
+    {
+        return new Separator(prevMax, currMin, false);
+    }
+
+    public static ByteSource oneByte(int i)
+    {
+        assert 0 <= i && i <= 0xFF;
+        // Single-element source: hands out i on the first call and END_OF_STREAM on every later one.
+        return new ByteSource()
+        {
+            int pending = i; // switches to END_OF_STREAM once the byte has been handed out
+            public int next()
+            {
+                int result = pending;
+                pending = END_OF_STREAM;
+                return result;
+            }
+        };
+    }
+
+    public static ByteSource cut(ByteSource src, int cutoff)
+    {
+        return new ByteSource() // truncating view: forwards at most cutoff calls to src, then reports END_OF_STREAM
+        {
+            int consumed = 0;
+            @Override
+            public int next()
+            {
+                boolean withinLimit = consumed++ < cutoff;
+                return withinLimit ? src.next() : END_OF_STREAM;
+            }
+        };
+    }
+
+    /**
+     * Wrap a ByteSource in a length-fixing facade.
+     *
+     * If the length of {@code src} is less than {@code cutoff}, then pad it on the right with {@code padding} until
+     * the overall length equals {@code cutoff}.  If the length of {@code src} is greater than {@code cutoff}, then
+     * truncate {@code src} to that size.  Effectively a noop if {@code src} happens to have length {@code cutoff}.
+     *
+     * @param src the input source to wrap
+     * @param cutoff the size of the source returned
+     * @param padding a padding byte (an int subject to a 0xFF mask)
+     * @return a source that yields {@code cutoff} bytes (none if {@code cutoff <= 0}) and then END_OF_STREAM
+     */
+    public static ByteSource cutOrRightPad(ByteSource src, int cutoff, int padding)
+    {
+        return new ByteSource()
+        {
+            int pos = 0;
+            @Override
+            public int next()
+            {
+                if (pos++ >= cutoff)
+                    return END_OF_STREAM;
+                int next = src.next();
+                if (next != END_OF_STREAM)
+                    return next;
+                // Source exhausted: emit the pad, masked so it stays a valid byte and never reads as END_OF_STREAM.
+                return padding & 0xFF;
+            }
+        };
+    }
+
+
+    static ByteSource MAX = new ByteSource() // endless run of 0xFF; the shared instance returned by max()
+    {
+        public int next()
+        {
+            return 0xFF; // never END_OF_STREAM, so this source has no end
+        }
+
+        public String toString()
+        {
+            return "MAX";
+        }
+    };
+
+    /**
+     * Returns a maximal ByteSource, i.e. something that compares greater than any other byte source.
+     * This is an infinite sequence of 0xFF.
+     *
+     * Note that since the sequence is infinite, trying to calculate this item's length, copying it, trying
+     * to store it in a trie, or comparing it to another max will result in an infinite loop.
+     */
+    public static ByteSource max()
+    {
+        return MAX;
+    }
+
+    /**
+     * Variable-length encoding. Escapes 0s as ESCAPE + zero or more ESCAPED_0_CONT + ESCAPED_0_DONE.
+     * Finishes with an escape value (to which Multi will add non-zero component separator)
+     * E.g. A00B translates to 4100FEFF4200
+     *      A0B0               4100FF4200FE (+00 for {@link Version#LEGACY})
+     *      A0                 4100FE       (+00 for {@link Version#LEGACY})
+     *
+     * If in a single byte source, the bytes could be simply passed unchanged, but this would not allow us to
+     * combine components. This translation preserves order, and since the encoding for 0 is higher than the separator
+     * also makes sure shorter components are treated as smaller.
+     *
+     * The encoding is not prefix-free, since e.g. the encoding of "A" (4100) is a prefix of the encoding of "A0"
+     * (4100FE), but the byte following the prefix is guaranteed to be FE or FF, which makes the encoding weakly
+     * prefix-free. Additionally, any such prefix sequence will compare smaller than the value to which it is a prefix,
+     * because any permitted separator byte will be smaller than the byte following the prefix.
+     */
+    static abstract class AbstractReinterpreter implements ByteSource
+    {
+        final Version version;
+        int bufpos;
+        boolean escaped;
+
+        AbstractReinterpreter(int position, Version version)
+        {
+            this.bufpos = position;
+            this.version = version;
+        }
+
+        public final int next()
+        {
+            if (bufpos >= limit())
+            {
+                if (bufpos > limit())
+                    return END_OF_STREAM;
+
+                ++bufpos;
+                if (escaped)
+                {
+                    escaped = false;
+                    if (version == Version.LEGACY)
+                        --bufpos; // place an ESCAPE at the end of sequence ending in ESCAPE
+                    return ESCAPED_0_CONT;
+                }
+                return ESCAPE;
+            }
+
+            int index = bufpos++;
+            int b = get(index) & 0xFF;
+            if (!escaped)
+            {
+                if (b == ESCAPE)
+                    escaped = true;
+                return b;
+            }
+            else
+            {
+                if (b == ESCAPE)
+                    return ESCAPED_0_CONT;
+                --bufpos;
+                escaped = false;
+                return ESCAPED_0_DONE;
+            }
+        }
+
+        protected abstract byte get(int index);
+
+        protected abstract int limit();
+    }
+
+    static class BufferReinterpreter extends AbstractReinterpreter
+    {
+        final ByteBuffer buf;
+
+        private BufferReinterpreter(ByteBuffer buf, Version version)
+        {
+            super(buf.position(), version);
+            this.buf = buf;
+        }
+
+        protected int limit()
+        {
+            return buf.limit();
+        }
+
+        protected byte get(int index)
+        {
+            return buf.get(index);
+        }
+    }
+
+    static class ReinterpreterArray extends AbstractReinterpreter
+    {
+        final byte[] buf;
+
+        private ReinterpreterArray(byte[] buf, Version version)
+        {
+            super(0, version);
+            this.buf = buf;
+        }
+
+        @Override
+        protected byte get(int index)
+        {
+            return buf[index];
+        }
+
+        @Override
+        protected int limit()
+        {
+            return buf.length;
+        }
+    }
+
+    /**
+     * Fixed length signed number encoding. Inverts first bit (so that neg < pos), then just posts all bytes from the
+     * buffer. Assumes buffer is of correct length.
+     */
+    static class SignedFixedLengthNumber implements ByteSource
+    {
+        ByteBuffer buf;
+        int bufpos;
+
+        public SignedFixedLengthNumber(ByteBuffer buf)
+        {
+            this.buf = buf;
+            bufpos = buf.position();
+        }
+
+        public int next()
+        {
+            if (bufpos >= buf.limit())
+                return END_OF_STREAM;
+            int v = buf.get(bufpos) & 0xFF;
+            if (bufpos == buf.position())
+                v ^= 0x80;
+            ++bufpos;
+            return v;
+        }
+    }
+
+    static class Number implements ByteSource
+    {
+        final long value;
+        int pos;
+
+        public Number(long value, int length)
+        {
+            this.value = value;
+            this.pos = length;
+        }
+
+        public int next()
+        {
+            if (pos == 0)
+                return END_OF_STREAM;
+            return (int) ((value >> (--pos * 8)) & 0xFF);
+        }
+    }
+
+    /**
+     * Fixed length signed floating point number encoding. First bit is sign. If positive, add sign bit value to make
+     * greater than all negatives. If not, invert all content to make negatives with bigger magnitude smaller.
+     */
+    static class SignedFixedLengthFloat implements ByteSource
+    {
+        final ByteBuffer buf;
+        int bufpos;
+        boolean invert;
+
+        public SignedFixedLengthFloat(ByteBuffer buf)
+        {
+            this.buf = buf;
+            this.bufpos = buf.position();
+        }
+
+        public int next()
+        {
+            if (bufpos >= buf.limit())
+                return END_OF_STREAM;
+            int v = buf.get(bufpos) & 0xFF;
+            if (bufpos == buf.position())
+            {
+                invert = v >= 0x80;
+                v |= 0x80;
+            }
+            if (invert)
+                v = v ^ 0xFF;
+            ++bufpos;
+            return v;
+        }
+    }
+
+    /**
+     * Combination of multiple byte sources. Adds NEXT_COMPONENT before sources, or NEXT_COMPONENT_NULL if next is null.
+     */
+    static class Multi implements ByteSource
+    {
+        final ByteSource[] srcs;
+        int srcnum = -1;
+        int terminator;
+
+        Multi(ByteSource[] srcs, int terminator)
+        {
+            this.srcs = srcs;
+            this.terminator = terminator;
+        }
+
+        public int next()
+        {
+            if (srcnum == srcs.length)
+                return END_OF_STREAM;
+
+            int b = END_OF_STREAM;
+            if (srcnum >= 0 && srcs[srcnum] != null)
+                b = srcs[srcnum].next();
+            if (b > END_OF_STREAM)
+                return b;
+
+            ++srcnum;
+            if (srcnum == srcs.length)
+                return terminator;
+            if (srcs[srcnum] == null)
+                return NEXT_COMPONENT_NULL;
+            return NEXT_COMPONENT;
+        }
+    }
+
+    /**
+     * Construct the shortest common prefix of prevMax and currMin that separates those two byte streams.
+     * If {@code useCurr == true} the last byte of the returned stream comes from {@code currMin} and is the first
+     * byte which is greater than byte on the corresponding position of {@code prevMax}.
+     * Otherwise, the last byte of the returned stream comes from {@code prevMax} and is incremented by one, still
+     * guaranteeing that it is <= the byte on the corresponding position of {@code currMin}.
+     */
+    static class Separator implements ByteSource
+    {
+        final ByteSource prev;
+        final ByteSource curr;
+        boolean done = false;
+        final boolean useCurr;
+
+        Separator(ByteSource prevMax, ByteSource currMin, boolean useCurr)
+        {
+            this.prev = prevMax;
+            this.curr = currMin;
+            this.useCurr = useCurr;
+        }
+
+        public int next()
+        {
+            if (done)
+                return END_OF_STREAM;
+            int p = prev.next();
+            int c = curr.next();
+            assert p <= c : prev + " not less than " + curr;
+            if (p == c)
+                return c;
+            done = true;
+            return useCurr ? c : p + 1;
+        }
+    }
+
+    static ByteSource optionalFixedLength(ByteBuffer b)
+    {
+        return b.hasRemaining() ? fixedLength(b) : null;
+    }
+
+    /**
+     * A byte source of the given bytes without any encoding.
+     * The resulting source is only guaranteed to give correct comparison results and be prefix-free if the
+     * underlying type has a fixed length.
+     * In tests, this method is also used to generate non-escaped test cases.
+     */
+    public static ByteSource fixedLength(ByteBuffer b)
+    {
+        return new ByteSource()
+        {
+            int pos = b.position() - 1;
+
+            @Override
+            public int next()
+            {
+                return ++pos < b.limit() ? b.get(pos) & 0xFF : -1;
+            }
+        };
+    }
+
+    /**
+     * A byte source of the given bytes without any encoding.
+     * If used in a sequence, the resulting source is only guaranteed to give correct comparison results if the
+     * underlying type has a fixed length.
+     * In tests, this method is also used to generate non-escaped test cases.
+     */
+    public static ByteSource fixedLength(byte[] b)
+    {
+        return fixedLength(b, 0, b.length);
+    }
+
+    public static ByteSource fixedLength(byte[] b, int offset, int length)
+    {
+        checkArgument(offset >= 0 && offset <= b.length);
+        checkArgument(length >= 0 && offset + length <= b.length);
+
+        return new ByteSource()
+        {
+            int pos = offset - 1;
+
+            @Override
+            public int next()
+            {
+                return ++pos < offset + length ? b[pos] & 0xFF : END_OF_STREAM;
+            }
+        };
+    }
+
+    public static ByteSource fourBit(ByteSource s)
+    {
+        return new ByteSource()
+        {
+            int pos = 0;
+            int v = 0;
+
+            @Override
+            public int next()
+            {
+                if ((pos++ & 1) == 0)
+                {
+                    v = s.next();
+                    if (v == END_OF_STREAM)
+                        return END_OF_STREAM;
+                    return (v >> 4) & 0xF;
+                }
+                else
+                    return v & 0xF;
+            }
+        };
+    }
+
+    /**
+     * Splits each byte into portions of bitCount bits.
+     * @param s source
+     * @param bitCount number of bits to issue at a time, 1-4 make sense
+     */
+    public static ByteSource splitBytes(ByteSource s, int bitCount)
+    {
+        return new ByteSource()
+        {
+            int pos = 8;
+            int v = 0;
+            int mask = (1 << bitCount) - 1;
+
+            @Override
+            public int next()
+            {
+                if ((pos += bitCount) >= 8)
+                {
+                    pos = 0;
+                    v = s.next();
+                    if (v == END_OF_STREAM)
+                        return END_OF_STREAM;
+                }
+                v <<= bitCount;
+                return (v >> 8) & mask;
+            }
+        };
+    }
+
+    /**
+     * Returns the key that is immediately after src in the topology.
+     * @param src the source of the original key
+     * @return src with added 00 byte at the end
+     */
+    public static ByteSource nextKey(ByteSource src)
+    {
+        return new ByteSource()
+        {
+            boolean done = false;
+
+            @Override
+            public int next()
+            {
+                if (done)
+                    return END_OF_STREAM;
+                int n = src.next();
+                if (n != END_OF_STREAM)
+                    return n;
+
+                done = true;
+                return 0;
+            }
+        };
+    }
+
+    public class Peekable implements ByteSource
+    {
+        static final int NONE = Integer.MIN_VALUE;
+
+        final ByteSource wrapped;
+        int peeked = NONE;
+
+        public Peekable(ByteSource wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public int next()
+        {
+            if (peeked != NONE)
+            {
+                int val = peeked;
+                peeked = NONE;
+                return val;
+            }
+            else
+                return wrapped.next();
+        }
+
+        public int peek()
+        {
+            if (peeked == NONE)
+                peeked = wrapped.next();
+            return peeked;
+        }
+    }
+
+    public static Peekable peekable(ByteSource p)
+    {
+        // When given a null source, we're better off not wrapping it and just returning null. This way existing
+        // code that doesn't know about ByteSource.Peekable, but handles correctly null ByteSources won't be thrown
+        // off by a non-null instance that semantically should have been null.
+        if (p == null)
+            return null;
+        return (p instanceof Peekable)
+               ? (Peekable) p
+               : new Peekable(p);
+    }
+}

From 6b5704a982eeff7b5dde6ddfddc5b14dc184ead7 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:17:54 +0100
Subject: [PATCH 025/151] CORE-93: Add isValueLengthFixed method to
 AbstractType

(cherry picked from commit d4f1de75d0e65fd52323c70fa1f8e85fe4ff8e16)
(cherry picked from commit 3ea8c7a1f34775639b169d117c5459f10feb295d)
---
 .../cassandra/db/marshal/AbstractType.java    | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index 19cf849dba06..4c886f31ef53 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
@@ -54,6 +54,8 @@
 @Unmetered
 public abstract class AbstractType<T> implements Comparator<ByteBuffer>, AssignmentTestable
 {
+    private final static int VARIABLE_LENGTH = -1;
+
     public final Comparator<ByteBuffer> reverseComparator;
 
     public enum ComparisonType
@@ -419,11 +421,28 @@ public List<AbstractType<?>> getComponents()
     }
 
     /**
-     * The length of values for this type if all values are of fixed length, -1 otherwise.
+     * The length of values for this type if all values are of fixed length, -1 otherwise. This has an impact on
+     * serialization.
+     * <ul>
+     *  <li> see {@link #writeValue} </li>
+     *  <li> see {@link #read} </li>
+     *  <li> see {@link #writtenLength} </li>
+     *  <li> see {@link #skipValue} </li>
+     * </ul>
      */
     public int valueLengthIfFixed()
     {
-        return -1;
+        return VARIABLE_LENGTH;
+    }
+
+    /**
+     * Checks if all values are of fixed length.
+     *
+     * @return {@code true} if all values are of fixed length, {@code false} otherwise.
+     */
+    public final boolean isValueLengthFixed()
+    {
+        return valueLengthIfFixed() != VARIABLE_LENGTH;
     }
 
     // This assumes that no empty values are passed

From cfd23c81f94d253fbffe880fd8c643fc6cdd9dcd Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:26:06 +0100
Subject: [PATCH 026/151] CORE-93: Add implementation of asComparableBytes to
 types

(cherry picked from commit 783f1c1f1f5fb6739d61f099c19f571543701d19)
(cherry picked from commit 851bd6ac2abbd8c11353616b5027a8cd75c8687a)
---
 .../cassandra/db/marshal/AbstractType.java    |  34 ++++++
 .../cassandra/db/marshal/BooleanType.java     |  13 +++
 .../apache/cassandra/db/marshal/ByteType.java |  11 ++
 .../cassandra/db/marshal/CompositeType.java   |  59 ++++++++++
 .../apache/cassandra/db/marshal/DateType.java |   9 ++
 .../cassandra/db/marshal/DecimalType.java     | 109 ++++++++++++++++++
 .../cassandra/db/marshal/DoubleType.java      |   8 ++
 .../db/marshal/DynamicCompositeType.java      | 108 +++++++++++++++++
 .../cassandra/db/marshal/EmptyType.java       |   8 ++
 .../cassandra/db/marshal/FloatType.java       |   8 ++
 .../cassandra/db/marshal/Int32Type.java       |   8 ++
 .../cassandra/db/marshal/IntegerType.java     |  73 ++++++++++++
 .../cassandra/db/marshal/LexicalUUIDType.java |  28 +++++
 .../apache/cassandra/db/marshal/ListType.java |  33 +++++-
 .../apache/cassandra/db/marshal/LongType.java |   8 ++
 .../apache/cassandra/db/marshal/MapType.java  |  33 +++++-
 .../db/marshal/PartitionerDefinedOrder.java   |  19 +++
 .../cassandra/db/marshal/ReversedType.java    |  56 +++++++++
 .../apache/cassandra/db/marshal/SetType.java  |   8 ++
 .../cassandra/db/marshal/ShortType.java       |  10 ++
 .../cassandra/db/marshal/SimpleDateType.java  |  13 ++-
 .../apache/cassandra/db/marshal/TimeType.java |  13 ++-
 .../cassandra/db/marshal/TimeUUIDType.java    |  19 ++-
 .../cassandra/db/marshal/TimestampType.java   |   9 +-
 .../cassandra/db/marshal/TupleType.java       |  17 ++-
 .../apache/cassandra/db/marshal/UUIDType.java |  24 ++++
 .../cassandra/utils/ByteBufferUtil.java       |  20 ++++
 27 files changed, 751 insertions(+), 7 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index 4c886f31ef53..edd4de79e9d8 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
@@ -39,8 +39,11 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.github.jamm.Unmetered;
 
+import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.BYTE_ORDER;
 import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.CUSTOM;
 
 /**
@@ -587,6 +590,37 @@ public AssignmentTestable.TestResult testAssignment(AbstractType<?> receiverType
         return AssignmentTestable.TestResult.NOT_ASSIGNABLE;
     }
 
+    /**
+     * Produce a byte-comparable representation of the given value, i.e. a sequence of bytes that compares the same way
+     * using lexicographical unsigned byte comparison as the original value using the type's comparator.
+     *
+     * We use a slightly stronger requirement to be able to use the types in tuples. Precisely, for any pair x, y of
+     * non-equal valid values of this type and any bytes b1, b2 between 0x10 and 0xEF,
+     * (+ stands for concatenation)
+     *   compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x)+b1, asByteComparable(y)+b2)
+     * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and:
+     *   asByteComparable(x)+b1 is not a prefix of asByteComparable(y)      (weakly prefix free)
+     * (i.e. a valid representation of a value may be a prefix of another valid representation of a value only if the
+     * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if
+     * the encoding compares correctly and is prefix free, but also permits a little more freedom that enables somewhat
+     * more efficient encoding of arbitrary-length byte-comparable blobs.
+     *
+     * Depending on the type, this method can be called for null or empty input, in which case the output is allowed to
+     * be null (the clustering/tuple encoding will accept and handle it).
+     */
+    public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+    {
+        if (comparisonType == BYTE_ORDER)
+        {
+            // When a type is byte-ordered on its own, we only need to escape it, so that we can include it in
+            // multi-component types and make the encoding weakly-prefix-free.
+            return ByteSource.of(byteBuffer, version);
+        }
+        else
+            // default is only good for byte-comparables
+            throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement asComparableBytes");
+    }
+
     /**
      * This must be overriden by subclasses if necessary so that for any
      * AbstractType, this == TypeParser.parse(toString()).
diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
index 4ef5f95b0bfc..fff72203f657 100644
--- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
@@ -26,6 +26,8 @@
 import org.apache.cassandra.serializers.BooleanSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -54,6 +56,17 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return v1 - v2;
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        if (!buf.hasRemaining())
+            return null;
+        byte b = buf.get(buf.position());
+        if (b != 0)
+            b = 1;
+        return ByteSource.oneByte(b);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
 
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteType.java b/src/java/org/apache/cassandra/db/marshal/ByteType.java
index f94f4bb01cc5..e57b479ccb15 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteType.java
@@ -27,6 +27,9 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 public class ByteType extends NumberType<Byte>
 {
@@ -42,6 +45,14 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return accessorL.getByte(left, 0) - accessorR.getByte(right, 0);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    {
+        return version == Version.LEGACY
+               ? ByteSource.signedFixedLengthNumber(buf)
+               : ByteSource.optionalSignedFixedLengthNumber(buf);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
index d8e0ac7b79e2..dc4fdcc7112a 100644
--- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
@@ -31,6 +31,8 @@
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 import static com.google.common.collect.Iterables.any;
 import static com.google.common.collect.Iterables.transform;
@@ -165,6 +167,39 @@ protected <V> AbstractType<?> getAndAppendComparator(int i, V value, ValueAccess
         return types.get(i);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
+    {
+        if (byteBuffer == null || byteBuffer.remaining() == 0)
+            return null;
+
+        ByteSource[] srcs = new ByteSource[types.size() * 2 + 1];
+        ByteBuffer bb = byteBuffer.duplicate();
+
+        // statics go first
+        boolean isStatic = readStatic(bb);
+        srcs[0] = isStatic ? null : ByteSource.EMPTY;
+
+        int i = 0;
+        byte lastEoc = 0;
+        while (bb.remaining() > 0)
+        {
+            // Only the end-of-component byte of the last component of this composite can be non-zero, so the
+            // component before can't have a non-zero end-of-component byte.
+            assert lastEoc == 0 : lastEoc;
+
+            srcs[i * 2 + 1] = types.get(i).asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version);
+            lastEoc = bb.get();
+            srcs[i * 2 + 2] = ByteSource.oneByte(lastEoc & 0xFF ^ 0x80); // end-of-component also takes part in comparison as signed byte
+            ++i;
+        }
+        if (i * 2 + 1 < srcs.length)
+            srcs = Arrays.copyOfRange(srcs, 0, i * 2 + 1);
+
+        return ByteSource.withTerminator(version == Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR,
+                                         srcs);
+    }
+
     protected ParsedComparator parseComparator(int i, String part)
     {
         return new StaticParsedComparator(types.get(i), part);
@@ -389,4 +424,28 @@ public static <V> V build(ValueAccessor<V> accessor, boolean isStatic, V... valu
         out.flip();
         return accessor.valueOf(out);
     }
+
+    public static ByteBuffer build(boolean isStatic, ByteBuffer[] buffers, byte lastEoc)
+    {
+        int totalLength = isStatic ? 2 : 0;
+        for (ByteBuffer bb : buffers)
+            totalLength += 2 + bb.remaining() + 1;
+
+        ByteBuffer out = ByteBuffer.allocate(totalLength);
+
+        if (isStatic)
+            out.putShort((short)STATIC_MARKER);
+
+        for (int i = 0; i < buffers.length; ++i)
+        {
+            ByteBuffer bb = buffers[i];
+            ByteBufferUtil.writeShortLength(out, bb.remaining());
+            int toCopy = bb.remaining();
+            ByteBufferUtil.arrayCopy(bb, bb.position(), out, out.position(), toCopy);
+            out.position(out.position() + toCopy);
+            out.put(i != buffers.length - 1 ? (byte) 0 : lastEoc);
+        }
+        out.flip();
+        return out;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java
index 473cedf40795..4e6aa5a27704 100644
--- a/src/java/org/apache/cassandra/db/marshal/DateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DateType.java
@@ -31,6 +31,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 /**
  * This is the old version of TimestampType, but has been replaced as it wasn't comparing pre-epoch timestamps
@@ -50,6 +52,13 @@ public boolean isEmptyValueMeaningless()
         return true;
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
+        return ByteSource.optionalFixedLength(buf);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
       // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
index 5740fdcc0fcb..d6d47d8e0f59 100644
--- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
@@ -24,6 +24,8 @@
 import java.nio.ByteBuffer;
 import java.util.Objects;
 
+import com.google.common.primitives.Ints;
+
 import org.apache.cassandra.cql3.CQL3Type;
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Term;
@@ -32,6 +34,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class DecimalType extends NumberType<BigDecimal>
 {
@@ -41,6 +45,12 @@ public class DecimalType extends NumberType<BigDecimal>
     private static final int MAX_SCALE = 1000;
     private static final MathContext MAX_PRECISION = new MathContext(10000);
 
+    // Constants or escaping values needed to encode/decode variable-length floating point numbers (decimals) in our
+    // custom byte-ordered encoding scheme.
+    private static final int POSITIVE_DECIMAL_HEADER_MASK = 0x80;
+    private static final int NEGATIVE_DECIMAL_HEADER_MASK = 0x00;
+    private static final int DECIMAL_EXPONENT_LENGTH_HEADER_MASK = 0x40;
+
     DecimalType() {super(ComparisonType.CUSTOM);} // singleton
 
     public boolean isEmptyValueMeaningless()
@@ -59,6 +69,105 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return compareComposed(left, accessorL, right, accessorR, this);
     }
 
+    /**
+     * Constructs a byte-comparable representation.
+     * This is rather difficult and involves reconstructing the decimal.
+     *
+     * To compare, we need a normalized value, i.e. one with a sign, exponent and (0,1) mantissa. To avoid
+     * loss of precision, both exponent and mantissa need to be base-100.  We can't get this directly off the serialized
+     * bytes, as they have base-10 scale and base-256 unscaled part.
+     *
+     * We store:
+     *     - sign bit inverted * 0x80 + 0x40 + signed exponent length, where exponent is negated if value is negative
+     *     - zero or more exponent bytes (as given by length)
+     *     - 0x80 + first pair of decimal digits, negative if value is negative, rounded to -inf
+     *     - zero or more 0x80 + pair of decimal digits, always positive
+     *     - trailing 0x00
+     * Zero is special-cased as 0x80.
+     *
+     * Because the trailing 00 cannot be produced from a pair of decimal digits (positive or not), no value can be
+     * a prefix of another.
+     *
+     * Encoding examples:
+     *    1.1    as       c1 = 0x80 (positive number) + 0x40 + (positive exponent) 0x01 (exp length 1)
+     *                    01 = exponent 1 (100^1)
+     *                    81 = 0x80 + 01 (0.01)
+     *                    8a = 0x80 + 10 (....10)   0.0110e2
+     *                    00
+     *    -1     as       3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1)
+     *                    ff = exponent -1. negative number, thus 100^1
+     *                    7f = 0x80 - 01 (-0.01)    -0.01e2
+     *                    00
+     *    -99.9  as       3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1)
+     *                    ff = exponent -1. negative number, thus 100^1
+     *                    1c = 0x80 - 100 (-1.00)
+     *                    8a = 0x80 + 10  (+....10) -0.999e2
+     *                    00
+     *
+     */
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        BigDecimal value = compose(buf);
+        if (value == null)
+            return null;
+        if (value.compareTo(BigDecimal.ZERO) == 0)  // Note: 0.equals(0.0) returns false!
+            return ByteSource.oneByte(POSITIVE_DECIMAL_HEADER_MASK);
+        long scale = (((long) value.scale()) - value.precision()) & ~1;
+        boolean negative = value.signum() < 0;
+        final int negmul = negative ? -1 : 1;
+        // This should always fit into an int
+        final long exponent = (-scale * negmul) / 2;
+        // We should never have scale > Integer.MAX_VALUE, as we're always subtracting the non-negative precision of
+        // the encoded BigDecimal, and furthermore we're rounding to negative infinity.
+        if (scale > Integer.MAX_VALUE || scale < Integer.MIN_VALUE)
+        {
+            // We are practically out of range here, but let's handle that anyway
+            int mv = Long.signum(scale) * Integer.MAX_VALUE;
+            value = value.scaleByPowerOfTen(mv);
+            scale -= mv;
+        }
+        final BigDecimal mantissa = value.scaleByPowerOfTen(Ints.checkedCast(scale)).stripTrailingZeros();
+        assert mantissa.abs().compareTo(BigDecimal.ONE) < 0;
+
+        return new ByteSource()
+        {
+            int posInExp = 0;
+            BigDecimal current = mantissa;
+
+            @Override
+            public int next()
+            {
+                if (posInExp < 5)
+                {
+                    if (posInExp == 0)
+                    {
+                        int absexp = (int) (exponent < 0 ? -exponent : exponent);
+                        while (posInExp < 5 && absexp >> (32 - ++posInExp * 8) == 0) {}
+                        int explen = DECIMAL_EXPONENT_LENGTH_HEADER_MASK + (exponent < 0 ? -1 : 1) * (5 - posInExp);
+                        return explen + (negative ? NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK);
+                    }
+                    else
+                        return (int) ((exponent >> (32 - posInExp++ * 8))) & 0xFF;
+                }
+                if (current == null)
+                    return END_OF_STREAM;
+                if (current.compareTo(BigDecimal.ZERO) == 0)
+                {
+                    current = null;
+                    return 0x00;
+                }
+                else
+                {
+                    BigDecimal v = current.scaleByPowerOfTen(2);
+                    BigDecimal floor = v.setScale(0, BigDecimal.ROUND_FLOOR);
+                    current = v.subtract(floor);
+                    return floor.byteValueExact() + 0x80;
+                }
+            }
+        };
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
index 570d420a75bb..d68bc4ca90a4 100644
--- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
@@ -27,6 +27,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class DoubleType extends NumberType<Double>
 {
@@ -50,6 +52,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return compareComposed(left, accessorL, right, accessorR, this);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        return ByteSource.optionalSignedFixedLengthFloat(buf); // 'version' is not consulted here
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
       // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
index e0377fd5396b..a293f11b2ce1 100644
--- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
@@ -19,9 +19,13 @@
 
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Maps;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -34,6 +38,8 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 import static com.google.common.collect.Iterables.any;
 
@@ -60,6 +66,9 @@ public class DynamicCompositeType extends AbstractCompositeType
 {
     private static final Logger logger = LoggerFactory.getLogger(DynamicCompositeType.class);
 
+    private static final ByteSource[] EMPTY_BYTE_SOURCE_ARRAY = new ByteSource[0];
+    private static final String REVERSED_TYPE = ReversedType.class.getSimpleName();
+
     private final Map<Byte, AbstractType<?>> aliases;
 
     // interning instances
@@ -196,6 +205,105 @@ protected <V> AbstractType<?> getAndAppendComparator(int i, V value, ValueAccess
         }
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
+    {
+        List<ByteSource> srcs = new ArrayList<>();
+        ByteBuffer bb = byteBuffer.duplicate();
+
+        // Statics go first: a static prefix contributes a null source, anything else ByteSource.EMPTY.
+        boolean isStatic = readIsStatic(bb, ByteBufferAccessor.instance);
+        srcs.add(isStatic ? null : ByteSource.EMPTY);
+        bb.position(bb.position() + startingOffset(isStatic));
+
+        byte lastEoc = 0;
+        while (bb.remaining() > 0)
+        {
+            // Only the end-of-component byte of the last component of this composite can be non-zero, so the
+            // component before can't have a non-zero end-of-component byte.
+            assert lastEoc == 0 : lastEoc;
+
+            AbstractType<?> comp = getComparator(bb, ByteBufferAccessor.instance, 0);
+            bb.position(bb.position() + getComparatorSize(bb, ByteBufferAccessor.instance, 0));
+            // The comparable bytes for the component need to ensure comparisons consistent with
+            // AbstractCompositeType.compareCustom(ByteBuffer, ByteBuffer) and
+            // DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer):
+            if (version == Version.LEGACY || !(comp instanceof ReversedType))
+            {
+                // ...most often that means just adding the short name of the type, followed by the full name of the type.
+                srcs.add(ByteSource.of(comp.getClass().getSimpleName(), version));
+                srcs.add(ByteSource.of(comp.getClass().getName(), version));
+            }
+            else
+            {
+                // ...however sometimes the component uses a complex type (currently the only supported complex type
+                // is ReversedType - we can't have elements that are of MapType, CompositeType, TupleType, etc.)...
+                ReversedType<?> reversedComp = (ReversedType<?>) comp;
+                // ...in this case, we need to add the short name of ReversedType before the short name of the base
+                // type, to ensure consistency with DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer).
+                srcs.add(ByteSource.of(REVERSED_TYPE, version));
+                srcs.add(ByteSource.of(reversedComp.baseType.getClass().getSimpleName(), version));
+                srcs.add(ByteSource.of(reversedComp.baseType.getClass().getName(), version));
+            }
+            // Only then the payload of the component gets encoded.
+            srcs.add(comp.asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version));
+            // The end-of-component byte also takes part in the comparison, and therefore needs to be encoded.
+            lastEoc = bb.get();
+            srcs.add(ByteSource.oneByte(version == Version.LEGACY ? lastEoc : lastEoc & 0xFF ^ 0x80));
+        }
+
+        return ByteSource.withTerminator(version == Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR,
+                                         srcs.toArray(EMPTY_BYTE_SOURCE_ARRAY));
+    }
+
+    public static ByteBuffer build(List<String> types, List<ByteBuffer> values)
+    {
+        return build(types, values, (byte) 0); // 0 is passed as the end-of-component byte of the last component
+    }
+
+    @VisibleForTesting
+    public static ByteBuffer build(List<String> types, List<ByteBuffer> values, byte lastEoc)
+    {
+        assert types.size() == values.size();
+
+        final int count = types.size();
+        // First pass: check every component and add up the space the serialized form will take.
+        int capacity = 0;
+        for (int idx = 0; idx < count; ++idx)
+        {
+            int nameBytes = types.get(idx).getBytes(StandardCharsets.UTF_8).length;
+            // Types are written out by fully qualified name rather than by alias, so the 2-byte type header
+            // carries the name length in bytes, and that length has to fit in 15 bits (top bit clear).
+            assert nameBytes <= 0x7FFF;
+            int payloadBytes = values.get(idx).remaining();
+            // Payload lengths are stored as a signed 2-byte value (short) as well, which imposes the same
+            // 15-bit limit on them.
+            assert payloadBytes <= 0x7FFF;
+            // 2-byte name length + name + 2-byte payload length + payload + 1 end-of-component byte.
+            capacity += 2 + nameBytes + 2 + payloadBytes + 1;
+        }
+
+        ByteBuffer out = ByteBuffer.allocate(capacity);
+        for (int idx = 0; idx < count; ++idx)
+        {
+            // Type header: 2-byte length followed by the UTF-8 bytes of the fully qualified type name.
+            byte[] name = types.get(idx).getBytes(StandardCharsets.UTF_8);
+            ByteBufferUtil.writeShortLength(out, name.length);
+            out.put(name);
+
+            // Payload: 2-byte length followed by the remaining bytes of the value.
+            ByteBuffer payload = values.get(idx);
+            int payloadBytes = payload.remaining();
+            ByteBufferUtil.writeShortLength(out, payloadBytes);
+            ByteBufferUtil.arrayCopy(payload, payload.position(), out, out.position(), payloadBytes);
+            out.position(out.position() + payloadBytes);
+
+            // End-of-component byte: zero for every component except the last, which gets lastEoc.
+            out.put(idx == count - 1 ? lastEoc : (byte) 0);
+        }
+        return out;
+    }
+
     protected ParsedComparator parseComparator(int i, String part)
     {
         return new DynamicParsedComparator(part);
diff --git a/src/java/org/apache/cassandra/db/marshal/EmptyType.java b/src/java/org/apache/cassandra/db/marshal/EmptyType.java
index 357b6e85ad15..80f8950e7c59 100644
--- a/src/java/org/apache/cassandra/db/marshal/EmptyType.java
+++ b/src/java/org/apache/cassandra/db/marshal/EmptyType.java
@@ -33,6 +33,8 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.NoSpamLogger;
 
 /**
@@ -68,6 +70,12 @@ private static NonEmptyWriteBehavior parseNonEmptyWriteBehavior()
 
     private EmptyType() {super(ComparisonType.CUSTOM);} // singleton
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    {
+        return null; // always a null source, whatever the buffer content or version
+    }
+
     public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right, ValueAccessor<VR> accessorR)
     {
         return 0;
diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java
index 35abee0f98ed..8618325f1e45 100644
--- a/src/java/org/apache/cassandra/db/marshal/FloatType.java
+++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java
@@ -27,6 +27,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 
 public class FloatType extends NumberType<Float>
@@ -51,6 +53,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return compareComposed(left, accessorL, right, accessorR, this);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        return ByteSource.optionalSignedFixedLengthFloat(buf); // 'version' is not consulted here
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
       // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
index 98f4c83cf64c..7c644633270c 100644
--- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
@@ -28,6 +28,8 @@
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class Int32Type extends NumberType<Integer>
 {
@@ -55,6 +57,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return ValueAccessor.compare(left, accessorL, right, accessorR);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        return ByteSource.optionalSignedFixedLengthNumber(buf); // 'version' is not consulted here
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java
index 4c913d50afee..fed7e672c268 100644
--- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java
+++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java
@@ -30,11 +30,19 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public final class IntegerType extends NumberType<BigInteger>
 {
     public static final IntegerType instance = new IntegerType();
 
+    // Constants or escaping values needed to encode/decode variable-length integers in our custom byte-ordered
+    // encoding scheme.
+    private static final int POSITIVE_VARINT_HEADER = 0x80;
+    private static final int NEGATIVE_VARINT_LENGTH_HEADER = 0x00;
+    private static final int POSITIVE_VARINT_LENGTH_HEADER = 0xFF;
+
     private static <V> int findMostSignificantByte(V value, ValueAccessor<V> accessor)
     {
         int len = accessor.size(value) - 1;
@@ -131,6 +139,71 @@ public static <VL, VR> int compareIntegers(VL lhs, ValueAccessor<VL> accessorL,
         return 0;
     }
 
+    /**
+     * Constructs a byte-comparable representation of the number.
+     * We represent it as
+     *    <zero or more length_bytes where length = 128> <length_byte> <first_significant_byte> <zero or more bytes>
+     * where a length_byte is:
+     *    - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger)
+     *    - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller)
+     * we don't need to sign-invert the first significant byte as the order there is already determined by the length
+     * byte.
+     *
+     * The representations are prefix-free, because representations of different length always have length bytes that
+     * differ.
+     *
+     * Examples:
+     *    0             as 8000
+     *    1             as 8001
+     *    127           as 807F
+     *    255           as 80FF
+     *    2^31-1        as 837FFFFFFF
+     *    2^31          as 8380000000
+     *    2^32          as 840100000000
+     */
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        int p = buf.position();
+        final int limit = buf.limit();
+        if (p == limit)
+            return null;
+
+        // Skip padding: leading bytes equal to a 0x00 or 0xFF sign byte; at least one byte is always kept.
+        final byte signbyte = buf.get(p);
+        if (signbyte == (byte) POSITIVE_VARINT_LENGTH_HEADER || signbyte == (byte) NEGATIVE_VARINT_LENGTH_HEADER)
+            while (p + 1 < limit && buf.get(++p) == signbyte) {}
+        final int startpos = p;
+
+        return new ByteSource()
+        {
+            int pos = startpos;
+            int sizeToReport = limit - startpos;
+            boolean sizeReported = false;
+
+            public int next()
+            {
+                if (!sizeReported)
+                {
+                    int v = sizeToReport;
+                    if (v >= 128)
+                        v = 128;
+                    else
+                        sizeReported = true;
+
+                    sizeToReport -= v;
+                    return signbyte >= 0
+                           ? POSITIVE_VARINT_HEADER + (v - 1)
+                           : POSITIVE_VARINT_HEADER - v;
+                }
+                if (pos == limit)
+                    return END_OF_STREAM;
+
+                return buf.get(pos++) & 0xFF;
+            }
+        };
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
index 6dd41616f04d..c0d099dee174 100644
--- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
@@ -26,6 +26,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.UUIDSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class LexicalUUIDType extends AbstractType<UUID>
 {
@@ -48,6 +50,32 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return accessorL.toUUID(left).compareTo(accessorR.toUUID(right));
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        if (buf == null || !buf.hasRemaining())
+            return null;
+
+        // The representation has a fixed length, which makes it prefix-free as is; the only adjustment
+        // needed is flipping the sign bit of the highest byte of each of the two longs.
+        final int first = buf.position();
+        return new ByteSource()
+        {
+            int index = first;
+
+            public int next()
+            {
+                if (index >= buf.limit())
+                    return END_OF_STREAM;
+                final int offset = index - first;
+                final int unsigned = buf.get(index) & 0xFF;
+                ++index;
+                final boolean leadsLong = offset == 0 || offset == 8;
+                return leadsLong ? unsigned ^ 0x80 : unsigned;
+            }
+        };
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java
index cee3cd2c4c77..ada7bc198d4c 100644
--- a/src/java/org/apache/cassandra/db/marshal/ListType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ListType.java
@@ -18,7 +18,11 @@
 package org.apache.cassandra.db.marshal;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.UUID;
 import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.cassandra.cql3.Json;
@@ -32,6 +36,8 @@
 import org.apache.cassandra.serializers.ListSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 public class ListType<T> extends CollectionType<List<T>>
 {
@@ -195,6 +201,31 @@ static <VL, VR> int compareListOrSet(AbstractType<?> elementsComparator, VL left
         return sizeL == sizeR ? 0 : (sizeL < sizeR ? -1 : 1);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, Version version)
+    {
+        return asComparableBytesListOrSet(getElementsType(), b, version); // helper is also used by SetType
+    }
+
+    static ByteSource asComparableBytesListOrSet(AbstractType<?> elementsComparator, ByteBuffer b, Version version)
+    {
+        if (!b.hasRemaining())
+            return null;
+
+        // Work on a duplicate; 'cursor' tracks the offset of the next element, starting past the size header.
+        ByteBuffer data = b.duplicate();
+        int count = CollectionSerializer.readCollectionSize(data, ByteBufferAccessor.instance, ProtocolVersion.V3);
+        int cursor = CollectionSerializer.sizeOfCollectionSize(count, ProtocolVersion.V3);
+        ByteSource[] elements = new ByteSource[count];
+        for (int idx = 0; idx < count; ++idx)
+        {
+            ByteBuffer element = CollectionSerializer.readValue(data, ByteBufferAccessor.instance, cursor, ProtocolVersion.V3);
+            cursor += CollectionSerializer.sizeOfValue(element, ByteBufferAccessor.instance, ProtocolVersion.V3);
+            elements[idx] = elementsComparator.asComparableBytes(element, version);
+        }
+        return ByteSource.withTerminator(version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR, elements);
+    }
+
     @Override
     public String toString(boolean ignoreFreezing)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java
index ad539f70de70..e8db323731af 100644
--- a/src/java/org/apache/cassandra/db/marshal/LongType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LongType.java
@@ -28,6 +28,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class LongType extends NumberType<Long>
 {
@@ -57,6 +59,12 @@ public static <VL, VR> int compareLongs(VL left, ValueAccessor<VL> accessorL, VR
         return ValueAccessor.compare(left, accessorL, right, accessorR);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        return ByteSource.optionalSignedFixedLengthNumber(buf); // 'version' is not consulted here
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java
index 991ae08048a6..a6e59ad2fcc9 100644
--- a/src/java/org/apache/cassandra/db/marshal/MapType.java
+++ b/src/java/org/apache/cassandra/db/marshal/MapType.java
@@ -28,9 +28,11 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.CollectionSerializer;
-import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.MapSerializer;
+import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.Pair;
 
 public class MapType<K, V> extends CollectionType<Map<K, V>>
@@ -218,6 +220,35 @@ public static <TL, TR> int compareMaps(AbstractType<?> keysComparator, AbstractT
         return sizeL == sizeR ? 0 : (sizeL < sizeR ? -1 : 1);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, Version version)
+    {
+        return asComparableBytesMap(getKeysType(), getValuesType(), b, version); // encoding lives in the static helper
+    }
+
+    static ByteSource asComparableBytesMap(AbstractType<?> keysComparator, AbstractType<?> valuesComparator, ByteBuffer b, Version version)
+    {
+        if (!b.hasRemaining())
+            return null;
+
+        b = b.duplicate();
+        ProtocolVersion protocolVersion = ProtocolVersion.V3;
+        int offset = 0;
+        int size = CollectionSerializer.readCollectionSize(b, ByteBufferAccessor.instance, protocolVersion);
+        offset += CollectionSerializer.sizeOfCollectionSize(size, protocolVersion);
+        ByteSource[] srcs = new ByteSource[size * 2];
+        for (int i = 0; i < size; ++i)
+        {
+            ByteBuffer k = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion);
+            offset += CollectionSerializer.sizeOfValue(k, ByteBufferAccessor.instance, protocolVersion);
+            srcs[i * 2 + 0] = keysComparator.asComparableBytes(k, version);
+            ByteBuffer v = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion);
+            offset += CollectionSerializer.sizeOfValue(v, ByteBufferAccessor.instance, protocolVersion);
+            srcs[i * 2 + 1] = valuesComparator.asComparableBytes(v, version);
+        }
+        return ByteSource.withTerminator(version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR, srcs);
+    }
+
     @Override
     public MapSerializer<K, V> getSerializer()
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
index 89241b416bb4..d72969267ab1 100644
--- a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
+++ b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
@@ -27,6 +27,10 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 
 /** for sorting columns representing row keys in the row ordering as determined by a partitioner.
@@ -93,6 +97,21 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return PartitionPosition.ForKey.get(accessorL.toBuffer(left), partitioner).compareTo(PartitionPosition.ForKey.get(accessorR.toBuffer(right), partitioner));
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    {
+        if (version == Version.LEGACY)
+            return PartitionPosition.ForKey.get(buf, partitioner).asComparableBytes(version);
+        // For ByteComparable.Version.OSS41 and above, an empty key is encoded as a null byte source.
+        // That way decoding the byte source of such a key does not have to special-case a sentinel
+        // value. (For ByteComparable.Version.LEGACY, handled above, the minimum key bound of the
+        // partitioner's minimum token serves as the sentinel, which results in the need to go twice
+        // through the byte source that is being decoded.)
+        if (!buf.hasRemaining())
+            return null;
+        return partitioner.decorateKey(buf).asComparableBytes(version);
+    }
+
     @Override
     public void validate(ByteBuffer bytes) throws MarshalException
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
index 8a4b58dca297..4b753f528184 100644
--- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
@@ -28,6 +28,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class ReversedType<T> extends AbstractType<T>
 {
@@ -63,6 +65,26 @@ public boolean isEmptyValueMeaningless()
         return baseType.isEmptyValueMeaningless();
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    {
+        final ByteSource forward = baseType.asComparableBytes(b, version);
+        // A null source is passed through as is; note that it only compares correctly inside a sequence.
+        if (forward == null)
+            return null;
+        // Flip every bit of every byte, leaving the end-of-stream marker alone.
+        // Given the comparison requirements on the base encoding, the flipped stream compares correctly under
+        // the reversed comparator (specifically, prefixes of escaped byte-ordered types compare as larger).
+        // The weak prefix-freedom requirement on the base encoding makes the flipped one weakly prefix-free too.
+        return () ->
+        {
+            final int forwardByte = forward.next();
+            if (forwardByte != ByteSource.END_OF_STREAM)
+                return forwardByte ^ 0xFF;
+            return ByteSource.END_OF_STREAM;
+        };
+    }
+
     public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right, ValueAccessor<VR> accessorR)
     {
         return baseType.compare(right, accessorR, left, accessorL);
@@ -162,4 +184,38 @@ public String toString()
     {
         return getClass().getName() + "(" + baseType + ")";
     }
+
+    private static final class ReversedPeekableByteSource extends ByteSource.Peekable
+    {
+        private final ByteSource.Peekable original;
+
+        static ByteSource.Peekable of(ByteSource.Peekable original)
+        {
+            return original == null ? null : new ReversedPeekableByteSource(original);
+        }
+
+        private ReversedPeekableByteSource(ByteSource.Peekable original)
+        {
+            super(null);
+            this.original = original;
+        }
+
+        @Override
+        public int next()
+        {
+            return reverseByte(original.next());
+        }
+
+        @Override
+        public int peek()
+        {
+            return reverseByte(original.peek());
+        }
+
+        // Flips all eight bits of a byte value; the end-of-stream marker is returned unchanged.
+        private static int reverseByte(int v)
+        {
+            return v == END_OF_STREAM ? END_OF_STREAM : v ^ 0xFF;
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java
index e5bdadab25f8..cab4d55a2357 100644
--- a/src/java/org/apache/cassandra/db/marshal/SetType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SetType.java
@@ -30,6 +30,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.SetSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class SetType<T> extends CollectionType<Set<T>>
 {
@@ -157,6 +159,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return ListType.compareListOrSet(elements, left, accessorL, right, accessorR);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    {
+        return ListType.asComparableBytesListOrSet(getElementsType(), b, version); // same helper as ListType
+    }
+
     public SetSerializer<T> getSerializer()
     {
         return serializer;
diff --git a/src/java/org/apache/cassandra/db/marshal/ShortType.java b/src/java/org/apache/cassandra/db/marshal/ShortType.java
index 03dcf5d31446..83a3e054a23e 100644
--- a/src/java/org/apache/cassandra/db/marshal/ShortType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ShortType.java
@@ -28,6 +28,8 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public class ShortType extends NumberType<Short>
 {
@@ -46,6 +48,14 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return ValueAccessor.compare(left, accessorL, right, accessorR);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        if (version != ByteComparable.Version.LEGACY)
+            return ByteSource.optionalSignedFixedLengthNumber(buf);
+        return ByteSource.signedFixedLengthNumber(buf);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
index f883ccdc1c54..0f0546af7baa 100644
--- a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
@@ -23,12 +23,14 @@
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Duration;
 import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.cql3.statements.RequestValidations;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.SimpleDateSerializer;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -38,6 +40,15 @@ public class SimpleDateType extends TemporalType<Integer>
 
     SimpleDateType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    {
+        // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
+        return version == Version.LEGACY // the encoding is chosen per byte-comparable version
+               ? ByteSource.fixedLength(buf)          // LEGACY: fixed-length encoding
+               : ByteSource.optionalFixedLength(buf); // other versions: "optional" variant (presumably tolerates an empty buffer -- confirm in ByteSource)
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         return ByteBufferUtil.bytes(SimpleDateSerializer.dateStringToDays(source));
diff --git a/src/java/org/apache/cassandra/db/marshal/TimeType.java b/src/java/org/apache/cassandra/db/marshal/TimeType.java
index be20ba7a526e..58a2bdb69fa5 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeType.java
@@ -19,7 +19,6 @@
 
 import java.nio.ByteBuffer;
 import java.time.LocalTime;
-import java.time.ZoneId;
 import java.time.ZoneOffset;
 
 import org.apache.cassandra.cql3.Constants;
@@ -29,6 +28,9 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 /**
  * Nanosecond resolution time values
@@ -43,6 +45,15 @@ public ByteBuffer fromString(String source) throws MarshalException
         return decompose(TimeSerializer.timeStringToLong(source));
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    {
+        // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
+        return version == Version.LEGACY // the encoding is chosen per byte-comparable version
+               ? ByteSource.fixedLength(buf)          // LEGACY: fixed-length encoding
+               : ByteSource.optionalFixedLength(buf); // other versions: "optional" variant (presumably tolerates an empty buffer -- confirm in ByteSource)
+    }
+
     @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
index 6cf137596ccf..64bee6c430d6 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
@@ -21,10 +21,11 @@
 import java.util.UUID;
 
 import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.cql3.ColumnSpecification;
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Term;
 import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TimeUUIDSerializer;
@@ -74,6 +75,22 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return Long.compare(lsb1, lsb2);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    {
+        if (!b.hasRemaining())
+            return null; // empty value: no byte source is produced; the version argument is never consulted
+
+        int s = b.position(); // absolute reads below are offset from the buffer's current position, which is left untouched
+        long msb = b.getLong(s);
+        assert ((msb >>> 12) & 0xf) == 1; // bits 12-15 of the most significant long hold the UUID version; only version 1 is accepted
+        ByteBuffer swizzled = ByteBuffer.allocate(16); // 16 bytes: reordered msb followed by adjusted lsb
+        swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb));
+        swizzled.putLong(8, b.getLong(s + 8) ^ 0x8080808080808080L); // flips the top bit of each lsb byte (presumably so signed-byte order matches unsigned byte comparison -- confirm against compareCustom)
+
+        return ByteSource.fixedLength(swizzled);
+    }
+
     // takes as input 8 signed bytes in native machine order
     // returns the first byte unchanged, and the following 7 bytes converted to an unsigned representation
     // which is the same as a 2's complement long in native format
diff --git a/src/java/org/apache/cassandra/db/marshal/TimestampType.java b/src/java/org/apache/cassandra/db/marshal/TimestampType.java
index 0dac6b0394d2..310eafc50354 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimestampType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimestampType.java
@@ -23,7 +23,6 @@
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Duration;
 import org.apache.cassandra.cql3.Term;
-import org.apache.cassandra.cql3.statements.RequestValidations;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -33,6 +32,8 @@
 import org.apache.cassandra.serializers.TimestampSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -61,6 +62,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return LongType.compareLongs(left, accessorL, right, accessorR);
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    {
+        return ByteSource.optionalSignedFixedLengthNumber(buf); // same encoding for every version: the version argument is not consulted
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
       // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java
index 83fbb25d548a..59f9786d33f6 100644
--- a/src/java/org/apache/cassandra/db/marshal/TupleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java
@@ -35,6 +35,8 @@
 import org.apache.cassandra.serializers.*;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 import static com.google.common.collect.Iterables.any;
 import static com.google.common.collect.Iterables.transform;
@@ -194,12 +196,25 @@ private <T> boolean allRemainingComponentsAreNull(T v, ValueAccessor<T> accessor
         {
             int size = accessor.getInt(v, offset);
             offset += TypeSizes.INT_SIZE;
-            if (size >= 0)
+            if (size > 0)
                 return false;
         }
         return true;
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+    {
+        ByteBuffer[] bufs = split(byteBuffer);  // this may be shorter than types.size -- other srcs remain null in that case
+        ByteSource[] srcs = new ByteSource[types.size()]; // one slot per declared component type, initially all null
+        for (int i = 0; i < bufs.length; ++i)
+            srcs[i] = types.get(i).asComparableBytes(bufs[i], version); // NOTE(review): if split() can yield null entries, confirm every component type accepts a null buffer
+        // We always have a fixed number of sources, with the trailing ones possibly being nulls.
+        // This can only result in a prefix if the last type in the tuple allows prefixes. Since that type is required
+        // to be weakly prefix-free, so is the tuple.
+        return ByteSource.withTerminator(ByteSource.END_OF_STREAM, srcs); // END_OF_STREAM is passed as the terminator value
+    }
+
     /**
      * Split a tuple value into its component values.
      */
diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
index 55ce59dae798..1ff728402ae9 100644
--- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
@@ -30,6 +30,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.UUIDSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.UUIDGen;
 
 /**
@@ -99,6 +101,28 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
         return UnsignedLongs.compare(accessorL.getLong(left, 8), accessorR.getLong(right, 8));
     }
 
+    @Override
+    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version v)
+    {
+        if (!b.hasRemaining())
+            return null; // empty value: no byte source is produced; the encoding version v is never consulted
+
+        int s = b.position(); // absolute reads below are offset from the buffer's current position, which is left untouched
+        long msb = b.getLong(s);
+        long version = ((msb >>> 12) & 0xf); // the UUID version nibble (bits 12-15 of msb), not the byte-comparable version
+        ByteBuffer swizzled = ByteBuffer.allocate(16); // 16 bytes: rearranged msb followed by the unchanged lsb
+
+        if (version == 1)
+            swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb)); // version 1 uses the same msb reordering as TimeUUIDType
+        else
+            swizzled.putLong(0, (version << 60) | ((msb >>> 4) & 0x0FFFFFFFFFFFF000L) | (msb & 0xFFFL)); // version nibble moves to the top 4 bits, msb bits 16-63 shift down by 4, low 12 bits stay in place
+
+        swizzled.putLong(8, b.getLong(s + 8)); // least significant long is copied as is
+
+        // fixed-length thus prefix-free
+        return ByteSource.fixedLength(swizzled);
+    }
+
     @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index d0ab6b233653..26d9437f9f5a 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
@@ -296,6 +296,26 @@ public static void copyBytes(ByteBuffer src, int srcPos, ByteBuffer dst, int dst
         FastByteOperations.copy(src, srcPos, dst, dstPos, length);
     }
 
+    /**
+     * Transfer bytes from one ByteBuffer to another.
+     * This function acts as {@link System#arraycopy} but for ByteBuffers; it delegates to FastByteOperations.copy.
+     *
+     * @param src the source ByteBuffer
+     * @param srcPos starting position in the source ByteBuffer
+     * @param dst the destination ByteBuffer
+     * @param dstPos starting position in the destination ByteBuffer
+     * @param length the number of bytes to copy
+     */
+    public static void arrayCopy(ByteBuffer src, int srcPos, ByteBuffer dst, int dstPos, int length)
+    {
+        FastByteOperations.copy(src, srcPos, dst, dstPos, length);
+    }
+
+    public static void arrayCopy(ByteBuffer src, int srcPos, byte[] dst, int dstPos, int length)
+    {
+        FastByteOperations.copy(src, srcPos, dst, dstPos, length); // copies length bytes from src (starting at srcPos) into the byte array dst (starting at dstPos)
+    }
+
     public static int put(ByteBuffer src, ByteBuffer trg)
     {
         int length = Math.min(src.remaining(), trg.remaining());

From 7640c0494c9a8568c9468b87dfea2e30327b2a65 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:42:46 +0100
Subject: [PATCH 027/151] CORE-93: Add implementation of asComparableBytes to
 tokens

(cherry picked from commit 73c8be5ca94a5a02aa88f68b0ec03d753b9b89b0)
(cherry picked from commit fa557fd092ec459bd2eee28c8d56dac49bd478ce)
---
 .../cassandra/dht/ByteOrderedPartitioner.java    |  8 ++++++++
 .../apache/cassandra/dht/LocalPartitioner.java   |  8 ++++++++
 .../apache/cassandra/dht/Murmur3Partitioner.java |  8 ++++++++
 .../dht/OrderPreservingPartitioner.java          |  8 ++++++++
 .../apache/cassandra/dht/RandomPartitioner.java  |  8 ++++++++
 src/java/org/apache/cassandra/dht/Token.java     | 16 ++++++++++++++++
 .../apache/cassandra/dht/KeyCollisionTest.java   |  9 +++++++++
 7 files changed, 65 insertions(+)

diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
index a6314dcccc8e..13e2d9c2f44c 100644
--- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
@@ -26,6 +26,8 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Hex;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -102,6 +104,12 @@ public boolean equals(Object obj)
             return Arrays.equals(token, other.token);
         }
 
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return ByteSource.of(token, version); // token is the raw byte array of this token; ByteSource picks the encoding for the given version
+        }
+
         @Override
         public IPartitioner getPartitioner()
         {
diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
index 168601ca3ef8..fe9f12de432d 100644
--- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
@@ -27,6 +27,8 @@
 import org.apache.cassandra.db.CachedHashDecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.memory.HeapAllocator;
 
@@ -174,6 +176,12 @@ public boolean equals(Object obj)
             return token.equals(other.token);
         }
 
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return comparator.asComparableBytes(token, version); // encoding is delegated to the comparator (presumably the AbstractType this partitioner orders keys by -- confirm)
+        }
+
         @Override
         public IPartitioner getPartitioner()
         {
diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
index 2856f131f1ab..94ebb46cbdd7 100644
--- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
+++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
@@ -33,6 +33,8 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.MurmurHash;
 import org.apache.cassandra.utils.ObjectSizes;
 
@@ -176,6 +178,12 @@ public int compareTo(Token o)
             return Long.compare(token, ((LongToken) o).token);
         }
 
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return ByteSource.of(token); // token is a long; the version argument is not consulted
+        }
+
         @Override
         public IPartitioner getPartitioner()
         {
diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
index 16c5db17a448..d248e0c5ee87 100644
--- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
@@ -33,6 +33,8 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.Pair;
@@ -194,6 +196,12 @@ public long getHeapSize()
         {
             return EMPTY_SIZE + ObjectSizes.sizeOf(token);
         }
+
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return ByteSource.of((String) token, version); // the cast selects the String overload of ByteSource.of
+        }
     }
 
     public StringToken getToken(ByteBuffer key)
diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
index 241b7850fdf7..eb7eed8f15ad 100644
--- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
@@ -34,6 +34,8 @@
 import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.GuidGenerator;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -244,6 +246,12 @@ public BigIntegerToken(String token)
             this(new BigInteger(token));
         }
 
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return IntegerType.instance.asComparableBytes(ByteBuffer.wrap(token.toByteArray()), version); // BigInteger.toByteArray() gives big-endian two's-complement bytes, which IntegerType then encodes
+        }
+
         @Override
         public IPartitioner getPartitioner()
         {
diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index d8e82f82c510..bf89e1979ec8 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java
@@ -26,6 +26,8 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 public abstract class Token implements RingPosition<Token>, Serializable
 {
@@ -99,6 +101,20 @@ public long serializedSize(Token object, int version)
     abstract public long getHeapSize();
     abstract public Object getTokenValue();
 
+    /**
+     * Produce a weakly prefix-free byte-comparable representation of the token, i.e. a sequence of bytes such that for
+     * any pair x, y of valid tokens of this type and any bytes b1, b2 between 0x10 and 0xEF
+     * (+ stands for concatenation):
+     *   compare(x, y) == compareLexicographicallyUnsigned(asComparableBytes(x)+b1, asComparableBytes(y)+b2)
+     * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and:
+     *   asComparableBytes(x)+b1 is not a prefix of asComparableBytes(y)      (weakly prefix free)
+     * (i.e. a valid representation of a value may be a prefix of another valid representation of a value only if the
+     * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if
+     * the encoding compares correctly and is prefix free, but the weaker requirement also permits a little more freedom
+     * that enables somewhat more efficient encoding of arbitrary-length byte-comparable blobs.
+     */
+    abstract public ByteSource asComparableBytes(ByteComparable.Version version);
+
     /**
      * Returns a measure for the token space covered between this token and next.
      * Used by the token allocation algorithm (see CASSANDRA-7032).
diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
index 5b5365da099b..2881ab96c9ce 100644
--- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
+++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
@@ -27,6 +27,7 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.marshal.IntegerType;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
@@ -36,6 +37,8 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
@@ -124,5 +127,11 @@ public long getHeapSize()
         {
             return 0;
         }
+
+        @Override
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            return IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(token), version); // token is turned into a buffer by IntegerType.decompose and then encoded by the same type
+        }
     }
 }

From 7b67b6c0079408d55a205f89a654be4d847af9bc Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:45:31 +0100
Subject: [PATCH 028/151] CORE-93: Add implementation of asComparableBytes to
 partition position

(cherry picked from commit d29ec5b1198738e1a966b560acdc170dfef609bb)
(cherry picked from commit 59a4b3b46a7c9eb4709c4a2c38198ef386a31173)
---
 .../org/apache/cassandra/db/DecoratedKey.java | 19 ++++++++++++++++++-
 .../cassandra/db/PartitionPosition.java       | 18 +++++++++++++++++-
 src/java/org/apache/cassandra/dht/Token.java  |  7 +++++++
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java
index 92d641460e10..70ca5d1a6b65 100644
--- a/src/java/org/apache/cassandra/db/DecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/DecoratedKey.java
@@ -24,8 +24,9 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.dht.Token.KeyBound;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.MurmurHash;
+import org.apache.cassandra.utils.ByteSource;
 import org.apache.cassandra.utils.IFilter.FilterKey;
+import org.apache.cassandra.utils.MurmurHash;
 
 /**
  * Represents a decorated key, handy for certain operations
@@ -97,6 +98,22 @@ public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionP
         return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.getKey()) : cmp;
     }
 
+    @Override
+    public ByteSource asComparableBytes(Version version)
+    {
+        // Note: In the legacy version one encoding could be a prefix of another as the escaping is only weakly
+        // prefix-free (see ByteSourceTest.testDecoratedKeyPrefixes()).
+        // Every non-legacy version (e.g. OSS41) avoids this by ending the sequence with ByteSource.TERMINATOR.
+        return ByteSource.withTerminator(version == Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR,
+                                         token.asComparableBytes(version), // first component: the token
+                                         keyComparableBytes(version));     // second component: the partition key bytes
+    }
+
+    protected ByteSource keyComparableBytes(Version version) // protected: presumably so subclasses can supply the key encoding differently -- confirm
+    {
+        return ByteSource.of(getKey(), version); // byte-comparable form of the key buffer returned by getKey()
+    }
+
     public IPartitioner getPartitioner()
     {
         return getToken().getPartitioner();
diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java
index 3b45c6c0e2eb..578b109a835c 100644
--- a/src/java/org/apache/cassandra/db/PartitionPosition.java
+++ b/src/java/org/apache/cassandra/db/PartitionPosition.java
@@ -24,8 +24,10 @@
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
-public interface PartitionPosition extends RingPosition<PartitionPosition>
+public interface PartitionPosition extends RingPosition<PartitionPosition>, ByteComparable
 {
     public static enum Kind
     {
@@ -54,6 +56,20 @@ public static PartitionPosition get(ByteBuffer key, IPartitioner p)
     public Kind kind();
     public boolean isMinimum();
 
+    /**
+     * Produce a prefix-free byte-comparable representation of the key, i.e. a sequence of bytes such that for any pair
+     * x, y of valid positions (with the same key column types and partitioner),
+     *   x.compareTo(y) == compareLexicographicallyUnsigned(x.asComparableBytes(), y.asComparableBytes())
+     * and
+     *   x.asComparableBytes() is not a prefix of y.asComparableBytes()
+     *
+     * We use a two-component tuple for decorated keys, and a one-component tuple for key bounds, where the terminator
+     * byte is chosen to yield the correct comparison result. No decorated key can be a prefix of another (per the tuple
+     * encoding), and no key bound can be a prefix of one because it uses a terminator byte that is different from the
+     * tuple separator.
+     */
+    public abstract ByteSource asComparableBytes(Version version); // declared here to attach the contract above to partition positions
+
     public static class RowPositionSerializer implements IPartitionerDependentSerializer<PartitionPosition>
     {
         /*
diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index bf89e1979ec8..5dd3904a2e35 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java
@@ -206,6 +206,13 @@ public int compareTo(PartitionPosition pos)
                 return ((pos instanceof KeyBound) && !((KeyBound)pos).isMinimumBound) ? 0 : 1;
         }
 
+        @Override
+        public ByteSource asComparableBytes(Version version)
+        {
+            int terminator = isMinimumBound ? ByteSource.LT_NEXT_COMPONENT : ByteSource.GT_NEXT_COMPONENT; // minimum bound takes the "less than" terminator, any other bound the "greater than" one
+            return ByteSource.withTerminator(terminator, token.asComparableBytes(version)); // single component: the token, followed by the chosen terminator
+        }
+
         public IPartitioner getPartitioner()
         {
             return getToken().getPartitioner();

From 1540142bd2414fb830fb3d2dafabebb488326ac7 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:54:20 +0100
Subject: [PATCH 029/151] CORE-93: Add implementation of asComparableBytes to
 clustering

(cherry picked from commit 4187defa75496f64e0e743b0091431cd261fa87c)
(cherry picked from commit 19b701e06145976fbd254095ff5962bafcaf9014)
---
 .../cassandra/db/ClusteringComparator.java    | 85 +++++++++++++++++++
 .../apache/cassandra/db/ClusteringPrefix.java | 57 +++++++++++--
 2 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java
index fdc450813ff2..a23aa36017ae 100644
--- a/src/java/org/apache/cassandra/db/ClusteringComparator.java
+++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java
@@ -31,6 +31,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 
 import org.apache.cassandra.io.sstable.IndexInfo;
+import org.apache.cassandra.utils.ByteComparable;
+import org.apache.cassandra.utils.ByteSource;
 
 /**
  * A comparator of clustering prefixes (or more generally of {@link Clusterable}}.
@@ -232,6 +234,89 @@ public <T> void validate(ClusteringPrefix<T> clustering)
         }
     }
 
+    /**
+     * Produce a prefix-free byte-comparable representation of the given clustering prefix, i.e. a sequence of bytes
+     * such that for any pair x, y of valid prefixes for this comparator
+     *   compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x), asByteComparable(y))
+     * and
+     *   asByteComparable(x) is not a prefix of asByteComparable(y)
+     */
+    public <V> ByteComparable asByteComparable(ClusteringPrefix<V> clustering)
+    {
+        return new ByteComparableClustering<>(clustering); // generic method + diamond: keeps the prefix's value type instead of using raw types
+    }
+
+    /**
+     * A prefix-free byte-comparable representation for a clustering or prefix.
+     *
+     * Adds a NEXT_COMPONENT byte before each component (allowing inclusive/exclusive bounds over incomplete prefixes
+     * of that length) and finishes with a suitable byte for the clustering kind. Also deals with null entries.
+     *
+     * Since all types' encodings are weakly prefix-free, this is guaranteed to be prefix-free as long as the
+     * bound/ClusteringPrefix terminators are different from the separator byte. It is okay for the terminator for
+     * Clustering to be the same as the separator, as all Clusterings must be completely specified.
+     *
+     * See also {@link AbstractType#asComparableBytes}.
+     *
+     * Some examples:
+     *    "A", 0005, Clustering     -> 40 4100 40 0005 40
+     *    "B", 0006, InclusiveEnd   -> 40 4200 40 0006 60
+     *    "A", ExclusiveStart       -> 40 4100 60
+     *    "", null, Clustering      -> 40 00 3F 40
+     *    "", 0000, Clustering      -> 40 00 40 0000 40
+     *    BOTTOM                    -> 20
+     */
+    private class ByteComparableClustering<V> implements ByteComparable
+    {
+        private final ClusteringPrefix<V> src; // the clustering or prefix being encoded
+
+        ByteComparableClustering(ClusteringPrefix<V> src)
+        {
+            this.src = src;
+        }
+
+        @Override
+        public ByteSource asComparableBytes(Version version)
+        {
+            return new ByteSource() // each call returns a fresh source with its own iteration state
+            {
+                private ByteSource current = null; // bytes of the component being emitted, or null between components
+                private int srcnum = -1;           // index of the last component started; -1 before the first
+
+                @Override
+                public int next()
+                {
+                    if (current != null)
+                    {
+                        int b = current.next();
+                        if (b > END_OF_STREAM)
+                            return b; // still inside the current component
+                        current = null; // component exhausted: fall through to emit the next separator or the terminator
+                    }
+
+                    int sz = src.size();
+                    if (srcnum == sz)
+                        return END_OF_STREAM; // the kind terminator was already returned by an earlier call
+
+                    ++srcnum;
+                    if (srcnum == sz)
+                        return src.kind().asByteComparableValue(version); // all components done: emit the terminator for this prefix kind
+
+                    current = subtype(srcnum).asComparableBytes(src.accessor().toBuffer(src.get(srcnum)), version);
+                    if (current == null) // the type produced no bytes: emit a null marker in place of the separator
+                        return subtype(srcnum).isReversed() ? NEXT_COMPONENT_NULL_REVERSED : NEXT_COMPONENT_NULL;
+
+                    return NEXT_COMPONENT; // separator; the component's own bytes follow on subsequent calls
+                }
+            };
+        }
+
+        public String toString()
+        {
+            return src.clusteringString(subtypes()); // human-readable form built from this comparator's component types
+        }
+    }
+
     /**
      * A comparator for rows.
      *
diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
index a1291c889f1d..3cf814c6cfc7 100644
--- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.function.ToIntFunction;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.config.*;
@@ -34,6 +35,8 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteArrayUtil;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteSource;
 
 /**
  * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare.
@@ -62,14 +65,19 @@ public enum Kind
     {
         // WARNING: the ordering of that enum matters because we use ordinal() in the serialization
 
-        EXCL_END_BOUND              (0, -1),
-        INCL_START_BOUND            (0, -1),
-        EXCL_END_INCL_START_BOUNDARY(0, -1),
-        STATIC_CLUSTERING           (1, -1),
-        CLUSTERING                  (2,  0),
-        INCL_END_EXCL_START_BOUNDARY(3,  1),
-        INCL_END_BOUND              (3,  1),
-        EXCL_START_BOUND            (3,  1);
+        EXCL_END_BOUND              (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        INCL_START_BOUND            (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        EXCL_END_INCL_START_BOUNDARY(0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        STATIC_CLUSTERING           (1, -1, v -> v == Version.LEGACY
+                                                 ? ByteSource.LT_NEXT_COMPONENT + 1
+                                                 : ByteSource.TERMINATOR - 1),
+        CLUSTERING                  (2,  0, v -> v == Version.LEGACY
+                                                 ? ByteSource.NEXT_COMPONENT
+                                                 : ByteSource.TERMINATOR),
+        INCL_END_EXCL_START_BOUNDARY(3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        INCL_END_BOUND              (3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        EXCL_START_BOUND            (3,  1, v -> ByteSource.GT_NEXT_COMPONENT);
+
 
         private final int comparison;
 
@@ -79,10 +87,13 @@ public enum Kind
          */
         public final int comparedToClustering;
 
-        Kind(int comparison, int comparedToClustering)
+        public final ToIntFunction<Version> asByteComparable;
+
+        Kind(int comparison, int comparedToClustering, ToIntFunction<Version> asByteComparable)
         {
             this.comparison = comparison;
             this.comparedToClustering = comparedToClustering;
+            this.asByteComparable = asByteComparable;
         }
 
         /**
@@ -197,6 +208,16 @@ public Kind openBoundOfBoundary(boolean reversed)
                  ? (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND)
                  : (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND);
         }
+
+        /**
+         * Returns the terminator value of this clustering kind for byte comparison, i.e. the result of applying
+         * {@code asByteComparable} to {@code version}. Inclusive starts / exclusive ends need a lower value than
+         * ByteSource.NEXT_COMPONENT and the clustering byte, exclusive starts / inclusive ends a higher one.
+         */
+        public int asByteComparableValue(Version version)
+        {
+            return asByteComparable.applyAsInt(version);
+        }
     }
 
     default boolean isBottom()
@@ -308,6 +329,24 @@ default ByteBuffer serializeAsPartitionKey()
             values[i] = accessor().toBuffer(get(i));
         return CompositeType.build(ByteBufferAccessor.instance, values);
     }
+
+    /**
+     * Renders this prefix for debugging as {@code KIND(v0, v1, ...)}, formatting the i-th value with the i-th
+     * entry of {@code types}. Takes the types directly, which is easier to get hold of than the metadata.
+     */
+    public default String clusteringString(List<AbstractType<?>> types)
+    {
+        StringBuilder out = new StringBuilder();
+        out.append(kind()).append('(');
+        String separator = "";
+        for (int idx = 0; idx < size(); ++idx)
+        {
+            out.append(separator).append(types.get(idx).getString(get(idx), accessor()));
+            separator = ", ";
+        }
+        return out.append(')').toString();
+    }
+
     /**
      * The values of this prefix as an array.
      * <p>

From 49c3f29ad4e6b402c77903e62136cb6345c51daa Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 6 Nov 2020 16:55:26 +0100
Subject: [PATCH 030/151] CORE-93: Add some tests

(cherry picked from commit 9f6983dd580f0ab346aa9df13d69b38c9ecccec8)
(cherry picked from commit 819733f4304e16a69ffe85bc2f99c48b9244b628)
---
 test/unit/org/apache/cassandra/Util.java      |  195 +++-
 .../cassandra/utils/ByteSourceTest.java       | 1026 +++++++++++++++++
 .../73-819733f430 CORE-93: Add some tests     |   19 +
 3 files changed, 1239 insertions(+), 1 deletion(-)
 create mode 100644 test/unit/org/apache/cassandra/utils/ByteSourceTest.java
 create mode 100644 update-history/STAR-801/73-819733f430 CORE-93: Add some tests

diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index cd4e4f442f77..c26b339dc5b7 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -21,6 +21,7 @@
 
 import java.io.*;
 import java.lang.reflect.Field;
+import java.math.BigInteger;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.nio.file.*;
@@ -86,7 +87,6 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 
 public class Util
@@ -784,6 +784,199 @@ public static void assertOnDiskState(ColumnFamilyStore cfs, int expectedSSTableC
         assertEquals(expectedSSTableCount, fileCount);
     }
 
+    /**
+     * Builds a buffer that starts with the bytes of {@code original} followed by {@code bytesToAdd}, zero-padded to
+     * a 16-byte boundary and closed by a computed 16-byte block, such that the MurmurHash3 x64 128-bit mixing steps
+     * coded below (seed 0) enter the final mixing with the same state as they do for {@code original} alone.
+     * @param original   source bytes, read from position to limit; the buffer itself is not modified
+     * @param bytesToAdd bytes to place right after the original content
+     * @return a new buffer, positioned at 0, whose length is a multiple of 16
+     */
+    public static ByteBuffer generateMurmurCollision(ByteBuffer original, byte... bytesToAdd)
+    {
+        final int length = original.remaining();
+        // Round size up to 16, and add another 16 bytes
+        ByteBuffer collision = ByteBuffer.allocate((length + bytesToAdd.length + 31) & -16);
+        // Copy through a duplicate so that the caller's position and limit stay untouched
+        collision.put(original.duplicate());    // we can use this as a copy of original with 0s appended at the end
+
+        long c1 = 0x87c37b91114253d5L;
+        long c2 = 0x4cf5ad432745937fL;
+        long h1 = 0;
+        long h2 = 0;
+
+        // Get hash of original
+        int index = 0;
+        while (index <= length - 16)
+        {
+            long k1 = Long.reverseBytes(collision.getLong(index + 0));
+            long k2 = Long.reverseBytes(collision.getLong(index + 8));
+            // 16 bytes
+            k1 *= c1;
+            k1 = rotl64(k1, 31);
+            k1 *= c2;
+            h1 ^= k1;
+            h1 = rotl64(h1, 27);
+            h1 += h2;
+            h1 = h1 * 5 + 0x52dce729;
+            k2 *= c2;
+            k2 = rotl64(k2, 33);
+            k2 *= c1;
+            h2 ^= k2;
+            h2 = rotl64(h2, 31);
+            h2 += h1;
+            h2 = h2 * 5 + 0x38495ab5;
+            index += 16;
+        }
+
+        long oh1 = h1;
+        long oh2 = h2;
+
+        // Process final unfilled chunk, but only adjust the original hash value
+        if (index < length)
+        {
+            long k1 = Long.reverseBytes(collision.getLong(index + 0));
+            long k2 = Long.reverseBytes(collision.getLong(index + 8));
+            // 16 bytes
+            k1 *= c1;
+            k1 = rotl64(k1, 31);
+            k1 *= c2;
+            oh1 ^= k1;
+            k2 *= c2;
+            k2 = rotl64(k2, 33);
+            k2 *= c1;
+            oh2 ^= k2;
+        }
+
+        // These are the hashes the original would provide, before final mixing (xor with byte length, not capacity)
+        oh1 ^= length;
+        oh2 ^= length;
+
+        // Fill in the remaining bytes before the last 16 and get their hash
+        collision.put(bytesToAdd);
+        while ((collision.position() & 0x0f) != 0)
+            collision.put((byte) 0);
+
+        while (index < collision.position())
+        {
+            long k1 = Long.reverseBytes(collision.getLong(index + 0));
+            long k2 = Long.reverseBytes(collision.getLong(index + 8));
+            // 16 bytes
+            k1 *= c1;
+            k1 = rotl64(k1, 31);
+            k1 *= c2;
+            h1 ^= k1;
+            h1 = rotl64(h1, 27);
+            h1 += h2;
+            h1 = h1 * 5 + 0x52dce729;
+            k2 *= c2;
+            k2 = rotl64(k2, 33);
+            k2 *= c1;
+            h2 ^= k2;
+            h2 = rotl64(h2, 31);
+            h2 += h1;
+            h2 = h2 * 5 + 0x38495ab5;
+            index += 16;
+        }
+
+        // Working backwards, we must get this hash pair
+        long th1 = h1;
+        long th2 = h2;
+
+        // adjust ohx with length
+        h1 = oh1 ^ collision.capacity();
+        h2 = oh2 ^ collision.capacity();
+
+        // Get modulo-long inverses of the multipliers used in the computation
+        long i5i = inverse(5L);
+        long c1i = inverse(c1);
+        long c2i = inverse(c2);
+
+        // revert one step
+        h2 -= 0x38495ab5;
+        h2 *= i5i;
+        h2 -= h1;
+        h2 = rotl64(h2, 33);
+
+        h1 -= 0x52dce729;
+        h1 *= i5i;
+        h1 -= th2;  // use h2 before it's adjusted with k2
+        h1 = rotl64(h1, 37);
+
+        // extract the required modifiers and applies the inverse of their transformation
+        long k1 = h1 ^ th1;
+        k1 = c2i * k1;
+        k1 = rotl64(k1, 33);
+        k1 = c1i * k1;
+
+        long k2 = h2 ^ th2;
+        k2 = c1i * k2;
+        k2 = rotl64(k2, 31);
+        k2 = c2i * k2;
+
+        collision.putLong(Long.reverseBytes(k1));
+        collision.putLong(Long.reverseBytes(k2));
+        collision.flip();
+
+        return collision;
+    }
+
+    /**
+     * Extended Euclidean algorithm. Assumes a and b are positive.
+     *
+     * @return the array {g, s, t}, where g is the greatest common divisor of a and b and s*a + t*b == g
+     */
+    private static BigInteger[] xgcd(BigInteger a, BigInteger b) {
+        // Invariant: prevRem == prevS*a + prevT*b and curRem == curS*a + curT*b
+        BigInteger prevRem = a;
+        BigInteger prevS = BigInteger.ONE;
+        BigInteger prevT = BigInteger.ZERO;
+        BigInteger curRem = b;
+        BigInteger curS = BigInteger.ZERO;
+        BigInteger curT = BigInteger.ONE;
+        for (;;)
+        {
+            BigInteger[] quotRem = prevRem.divideAndRemainder(curRem);
+            BigInteger quotient = quotRem[0];
+            BigInteger nextRem = quotRem[1];
+            BigInteger nextS = prevS.subtract(curS.multiply(quotient));
+            BigInteger nextT = prevT.subtract(curT.multiply(quotient));
+
+            // The last non-zero remainder is the gcd; its coefficients are the Bezout pair
+            if (nextRem.signum() == 0)
+                return new BigInteger[] { curRem, curS, curT };
+
+            // Slide the window one step down the remainder sequence
+            prevRem = curRem;
+            prevS = curS;
+            prevT = curT;
+            curRem = nextRem;
+            curS = nextS;
+            curT = nextT;
+        }
+    }
+
+    /**
+     * Finds the multiplicative inverse of the given multiplier for long, i.e. the value
+     * such that x * inverse(x) = 1 where * is long multiplication.
+     * In other words, such an integer that x * inverse(x) == 1 (mod 2^64). Only odd multipliers have one.
+     */
+    public static long inverse(long multiplier)
+    {
+        final BigInteger modulus = BigInteger.ONE.shiftLeft(64);
+        // Add the modulus to the multiplier so that xgcd never sees a negative input (a + m == a (mod m))
+        BigInteger[] gcds = xgcd(BigInteger.valueOf(multiplier).add(modulus), modulus);
+        // xgcd gives g, a and b, such that ax + bm = g,
+        // i.e. ax = g (mod m). With g == 1 (asserted below) a is the inverse, so return it
+        assert gcds[0].equals(BigInteger.ONE) : "Even number " + multiplier + " has no long inverse";
+        return gcds[1].longValueExact();
+    }
+
+    public static long rotl64(long v, int n)
+    {
+        return Long.rotateLeft(v, n);    // 64-bit left rotation: (v << n) | (v >>> (64 - n))
+    }
+
     /**
      * Disable bloom filter on all sstables of given table
      */
diff --git a/test/unit/org/apache/cassandra/utils/ByteSourceTest.java b/test/unit/org/apache/cassandra/utils/ByteSourceTest.java
new file mode 100644
index 000000000000..fd1188f27858
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/ByteSourceTest.java
@@ -0,0 +1,1026 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Supplier;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.junit.Assert;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.db.marshal.ByteType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.DateType;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.DoubleType;
+import org.apache.cassandra.db.marshal.DynamicCompositeType;
+import org.apache.cassandra.db.marshal.DynamicCompositeTypeTest;
+import org.apache.cassandra.db.marshal.EmptyType;
+import org.apache.cassandra.db.marshal.FloatType;
+import org.apache.cassandra.db.marshal.InetAddressType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.LexicalUUIDType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.ShortType;
+import org.apache.cassandra.db.marshal.SimpleDateType;
+import org.apache.cassandra.db.marshal.TimeType;
+import org.apache.cassandra.db.marshal.TimeUUIDType;
+import org.apache.cassandra.db.marshal.TimestampType;
+import org.apache.cassandra.db.marshal.TupleType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.utils.ByteComparable.Version;
+
+import static org.junit.Assert.assertEquals;
+
+public class ByteSourceTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(ByteSourceTest.class);
+
+    @Rule
+    public final ExpectedException expectedException = ExpectedException.none();
+
+    String[] testStrings = new String[] { "", "\0", "\0\0", "\001", "A\0\0B", "A\0B\0", "0", "0\0", "00", "1", "\377" };
+    Integer[] testInts = new Integer[] { null, Integer.MIN_VALUE, Integer.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Integer.MAX_VALUE - 1, Integer.MAX_VALUE };
+    Byte[] testBytes = new Byte[] { -128, -127, -1, 0, 1, 127 };
+    Short[] testShorts = new Short[] { Short.MIN_VALUE, Short.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Short.MAX_VALUE - 1, Short.MAX_VALUE };
+    Long[] testLongs = new Long[] { null, Long.MIN_VALUE, Long.MIN_VALUE + 1, Integer.MIN_VALUE - 1L, -256L, -255L, -128L, -127L, -1L, 0L, 1L, 127L, 128L, 255L, 256L, Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, Long.MAX_VALUE };
+    Double[] testDoubles = new Double[] { null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE, -1e+200, -1e3, -1e0, -1e-3, -1e-200, -Double.MIN_VALUE, -0.0, 0.0, Double.MIN_VALUE, 1e-200, 1e-3, 1e0, 1e3, 1e+200, Double.MAX_VALUE, Double.POSITIVE_INFINITY, Double.NaN };
+    Float[] testFloats = new Float[] { null, Float.NEGATIVE_INFINITY, -Float.MAX_VALUE, -1e+30f, -1e3f, -1e0f, -1e-3f, -1e-30f, -Float.MIN_VALUE, -0.0f, 0.0f, Float.MIN_VALUE, 1e-30f, 1e-3f, 1e0f, 1e3f, 1e+30f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, Float.NaN };
+    Boolean[] testBools = new Boolean[] { null, false, true };
+    UUID[] testUUIDs = new UUID[] { null, UUIDGen.getTimeUUID(), UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID(),
+                                    UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123),
+                                    UUID.fromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
+                                    UUID.fromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
+                                    UUID.fromString("e902893a-9d22-3c7e-a7b8-d6e313b71d9f"),
+                                    UUID.fromString("74738ff5-5367-5958-9aee-98fffdcd1876"),
+                                    UUID.fromString("52df1bb0-6a2f-11e6-b6e4-a6dea7a01b67"),
+                                    UUID.fromString("52df1bb0-6a2f-11e6-362d-aff2143498ea"),
+                                    UUID.fromString("52df1bb0-6a2f-11e6-b62d-aff2143498ea")};
+    // Instant.MIN/MAX fail Date.from.
+    Date[] testDates = new Date[] { null,
+                                    Date.from(Instant.ofEpochSecond(Integer.MIN_VALUE)),
+                                    Date.from(Instant.ofEpochSecond(Short.MIN_VALUE)),
+                                    Date.from(Instant.ofEpochMilli(-2000)),
+                                    Date.from(Instant.EPOCH),
+                                    Date.from(Instant.ofEpochMilli(2000)),
+                                    Date.from(Instant.ofEpochSecond(Integer.MAX_VALUE)),
+                                    Date.from(Instant.now()) };
+    BigInteger[] testBigInts;
+
+    {
+        Set<BigInteger> bigs = new TreeSet<>();
+        for (Long l : testLongs)
+            if (l != null)
+                bigs.add(BigInteger.valueOf(l));
+        for (int i = 0; i < 11; ++i)
+        {
+            bigs.add(BigInteger.valueOf(i));
+            bigs.add(BigInteger.valueOf(-i));
+
+            bigs.add(BigInteger.valueOf((1L << 4 * i) - 1));
+            bigs.add(BigInteger.valueOf((1L << 4 * i)));
+            bigs.add(BigInteger.valueOf(-(1L << 4 * i) - 1));
+            bigs.add(BigInteger.valueOf(-(1L << 4 * i)));
+            String p = exp10(i);
+            bigs.add(new BigInteger(p));
+            bigs.add(new BigInteger("-" + p));
+            p = exp10(1 << i);
+            bigs.add(new BigInteger(p));
+            bigs.add(new BigInteger("-" + p));
+
+            BigInteger base = BigInteger.ONE.shiftLeft(512 * i);
+            bigs.add(base);
+            bigs.add(base.add(BigInteger.ONE));
+            bigs.add(base.subtract(BigInteger.ONE));
+            base = base.negate();
+            bigs.add(base);
+            bigs.add(base.add(BigInteger.ONE));
+            bigs.add(base.subtract(BigInteger.ONE));
+        }
+        testBigInts = bigs.toArray(new BigInteger[0]);
+    }
+    BigDecimal[] testBigDecimals;
+    {
+        String vals = "0, 1, 1.1, 21, 98.9, 99, 99.9, 100, 100.1, 101, 331, 0.4, 0.07, 0.0700, 0.005, " +
+                      "6e4, 7e200, 6e-300, 8.1e2000, 8.1e-2000, 9e2000, " +
+                      "123456789012.34567890e-1000, 123456.78901234, 1234.56789012e2, " +
+                      "1.0000, 0.01e2, 100e-2, 00, 0.000, 0E-18, 0E+18";
+        List<BigDecimal> decs = new ArrayList<>();
+        for (String s : vals.split(", "))
+        {
+            decs.add(new BigDecimal(s));
+            decs.add(new BigDecimal("-" + s));
+        }
+        testBigDecimals = decs.toArray(new BigDecimal[0]);
+    }
+
+    static String exp10(int pow)
+    {
+        // Decimal representation of 10^pow: a '1' followed by pow zeroes
+        StringBuilder digits = new StringBuilder("1");
+        for (int remaining = pow; remaining > 0; --remaining)
+            digits.append('0');
+        return digits.toString();
+    }
+
+    Object[][] testValues = new Object[][] { testStrings, testInts, testBools, testDoubles, testBigInts, testBigDecimals };
+    AbstractType[] testTypes = new AbstractType[] {
+                               AsciiType.instance,
+                               Int32Type.instance,
+                               BooleanType.instance,
+                               DoubleType.instance,
+                               IntegerType.instance,
+                               DecimalType.instance };
+
+    @Test
+    public void testStringsAscii()
+    {
+        testType(AsciiType.instance, testStrings);
+    }
+
+    @Test
+    public void testStringsUTF8()
+    {
+        testType(UTF8Type.instance, testStrings);
+    }
+
+    @Test
+    public void testBooleans()
+    {
+        testType(BooleanType.instance, testBools);
+    }
+
+    @Test
+    public void testInts()
+    {
+        testType(Int32Type.instance, testInts);
+        testDirect(x -> ByteSource.of(x), Integer::compare, testInts);
+    }
+
+    @Test
+    public void randomTestInts()
+    {
+        Random rand = new Random();
+        for (int i=0; i<10000; ++i)
+        {
+            int i1 = rand.nextInt();
+            int i2 = rand.nextInt();
+            assertComparesSame(Int32Type.instance, i1, i2);
+        }
+
+    }
+
+    @Test
+    public void testLongs()
+    {
+        testType(LongType.instance, testLongs);
+        testDirect(x -> ByteSource.of(x), Long::compare, testLongs);
+    }
+
+    @Test
+    public void testShorts()
+    {
+        testType(ShortType.instance, testShorts);
+    }
+
+    @Test
+    public void testBytes()
+    {
+        testType(ByteType.instance, testBytes);
+    }
+
+    @Test
+    public void testDoubles()
+    {
+        testType(DoubleType.instance, testDoubles);
+    }
+
+    @Test
+    public void testFloats()
+    {
+        testType(FloatType.instance, testFloats);
+    }
+
+    @Test
+    public void testBigInts()
+    {
+        testType(IntegerType.instance, testBigInts);
+    }
+
+    @Test
+    public void testBigDecimals()
+    {
+        testType(DecimalType.instance, testBigDecimals);
+    }
+
+    @Test
+    public void testBigDecimalInCombination()
+    {
+        BigDecimal b1 = new BigDecimal("123456.78901201");
+        BigDecimal b2 = new BigDecimal("123456.789012");
+        Boolean b = false;
+
+        assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b);
+        assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2);
+
+        b1 = b1.negate();
+        b2 = b2.negate();
+
+        assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b);
+        assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2);
+
+        b1 = new BigDecimal("-123456.78901289");
+        b2 = new BigDecimal("-123456.789012");
+
+        assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b);
+        assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2);
+
+        b1 = new BigDecimal("1");
+        b2 = new BigDecimal("1.1");
+
+        assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b);
+        assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2);
+
+        b1 = b1.negate();
+        b2 = b2.negate();
+
+        assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b);
+        assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2);
+    }
+
+    @Test
+    public void testUUIDs()
+    {
+        testType(UUIDType.instance, testUUIDs);
+    }
+
+    @Test
+    public void testTimeUUIDs()
+    {
+        testType(TimeUUIDType.instance, Arrays.stream(testUUIDs).filter(x -> x == null || x.version() == 1).toArray());
+    }
+
+    @Test
+    public void testLexicalUUIDs()
+    {
+        testType(LexicalUUIDType.instance, testUUIDs);
+    }
+
+    @Test
+    public void testSimpleDate()
+    {
+        testType(SimpleDateType.instance, Arrays.stream(testInts).filter(x -> x != null).toArray());
+    }
+
+    @Test
+    public void testTimeType()
+    {
+        testType(TimeType.instance, Arrays.stream(testLongs).filter(x -> x != null && x >= 0 && x <= 24L * 60 * 60 * 1000 * 1000 * 1000).toArray());
+    }
+
+    @SuppressWarnings("deprecation")
+    @Test
+    public void testDateType()
+    {
+        testType(DateType.instance, testDates);
+    }
+
+    @Test
+    public void testTimestampType()
+    {
+        testType(TimestampType.instance, testDates);
+    }
+
+    @Test
+    public void testBytesType()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i)
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+
+        testType(BytesType.instance, values.toArray());
+    }
+
+    @Test
+    public void testInetAddressType() throws UnknownHostException
+    {
+        InetAddress[] testInets = new InetAddress[] { null,
+                                                      InetAddress.getLocalHost(),
+                                                      InetAddress.getLoopbackAddress(),
+                                                      InetAddress.getByName("192.168.0.1"),
+                                                      InetAddress.getByName("fe80::428d:5cff:fe53:1dc9"),
+                                                      InetAddress.getByName("2001:610:3:200a:192:87:36:2"),
+                                                      InetAddress.getByName("10.0.0.1"),
+                                                      InetAddress.getByName("0a00:0001::"),
+                                                      InetAddress.getByName("::10.0.0.1") };
+        testType(InetAddressType.instance, testInets);
+    }
+
+    @Test
+    public void testEmptyType()
+    {
+        testType(EmptyType.instance, new Void[] { null });
+    }
+
+    @Test
+    public void testPatitionerDefinedOrder()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i)
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+
+        testBuffers(new PartitionerDefinedOrder(Murmur3Partitioner.instance), values);
+        testBuffers(new PartitionerDefinedOrder(RandomPartitioner.instance), values);
+        testBuffers(new PartitionerDefinedOrder(ByteOrderedPartitioner.instance), values);
+    }
+
+    @Test
+    public void testPatitionerOrder()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i)
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+
+        testDecoratedKeys(Murmur3Partitioner.instance, values);
+        testDecoratedKeys(RandomPartitioner.instance, values);
+        testDecoratedKeys(ByteOrderedPartitioner.instance, values);
+    }
+
+    @Test
+    public void testLocalPatitionerOrder()
+    {
+        for (int i = 0; i < testValues.length; ++i)
+        {
+            final AbstractType testType = testTypes[i];
+            testDecoratedKeys(new LocalPartitioner(testType), Lists.transform(Arrays.asList(testValues[i]),
+                                                                                            v -> testType.decompose(v)));
+        }
+    }
+
+    ClusteringPrefix.Kind[] kinds = new ClusteringPrefix.Kind[] {
+    ClusteringPrefix.Kind.INCL_START_BOUND,
+    ClusteringPrefix.Kind.CLUSTERING,
+    ClusteringPrefix.Kind.EXCL_START_BOUND,
+    };
+
+    interface PairTester
+    {
+        void test(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4);
+    }
+
+    void testCombinationSampling(Random rand, PairTester tester)
+    {
+        // For each ordered pair of types, draw three random samples per type (with replacement)
+        // and hand every (first, second, first, second) combination of those samples to the tester.
+        for (int first = 0; first < testTypes.length; ++first)
+        {
+            for (int second = 0; second < testTypes.length; ++second)
+            {
+                Object[] firstSamples = new Object[3];
+                Object[] secondSamples = new Object[3];
+                for (int s = 0; s < firstSamples.length; ++s)
+                {
+                    firstSamples[s] = testValues[first][rand.nextInt(testValues[first].length)];
+                    secondSamples[s] = testValues[second][rand.nextInt(testValues[second].length)];
+                }
+                for (Object o1 : firstSamples)
+                    for (Object o2 : secondSamples)
+                        for (Object o3 : firstSamples)
+                            for (Object o4 : secondSamples)
+                                tester.test(testTypes[first], testTypes[second], o1, o2, o3, o4);
+            }
+        }
+    }
+
+    @Test
+    public void testCombinations()
+    {
+        Random rand = new Random(0);
+        testCombinationSampling(rand, this::assertClusteringPairComparesSame);
+    }
+
+    void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
+    {
+        for (Version v : Version.values())
+            for (ClusteringPrefix.Kind k1 : kinds)
+                for (ClusteringPrefix.Kind k2 : kinds)
+                {
+                    ClusteringComparator comp = new ClusteringComparator(t1, t2);
+                    ByteBuffer[] b = new ByteBuffer[2];
+                    ByteBuffer[] d = new ByteBuffer[2];
+                    b[0] = t1.decompose(o1);
+                    b[1] = t2.decompose(o2);
+                    d[0] = t1.decompose(o3);
+                    d[1] = t2.decompose(o4);
+                    ClusteringPrefix<ByteBuffer> c = ByteBufferAccessor.instance.factory().bound(k1, b);
+                    ClusteringPrefix<ByteBuffer> e = ByteBufferAccessor.instance.factory().bound(k2, d);
+                    final ByteComparable bsc = comp.asByteComparable(c);
+                    final ByteComparable bse = comp.asByteComparable(e);
+                    int expected = Integer.signum(comp.compare(c, e));
+                    assertEquals(String.format("Failed comparing %s and %s, %s vs %s version %s",
+                                               safeStr(c.clusteringString(comp.subtypes())),
+                                               safeStr(e.clusteringString(comp.subtypes())), bsc, bse, v),
+                                 expected, Integer.signum(ByteComparable.compare(bsc, bse, v)));
+                    maybeCheck41Properties(expected, bsc, bse, v);
+                    maybeAssertNotPrefix(bsc, bse, v);
+
+                    ClusteringComparator compR = new ClusteringComparator(ReversedType.getInstance(t1), ReversedType.getInstance(t2));
+                    final ByteComparable bsrc = compR.asByteComparable(c);
+                    final ByteComparable bsre = compR.asByteComparable(e);
+                    int expectedR = Integer.signum(compR.compare(c, e));
+                    assertEquals(String.format("Failed comparing reversed %s and %s, %s vs %s version %s",
+                                               safeStr(c.clusteringString(comp.subtypes())),
+                                               safeStr(e.clusteringString(comp.subtypes())), bsrc, bsre, v),
+                                 expectedR, Integer.signum(ByteComparable.compare(bsrc, bsre, v)));
+                    maybeCheck41Properties(expectedR, bsrc, bsre, v);
+                    maybeAssertNotPrefix(bsrc, bsre, v);
+                }
+    }
+
+    @Test
+    public void testTupleType()
+    {
+        Random rand = ThreadLocalRandom.current();
+        testCombinationSampling(rand, this::assertTupleComparesSame);
+    }
+
+    @Test
+    public void testTupleTypeNonFull()
+    {
+        // Cases, in order: both components present, second component decomposed from null,
+        // second component missing altogether, and no components at all.
+        ByteBuffer both = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""),
+                                                                                              decomposeAndRandomPad(Int32Type.instance, 0)});
+        ByteBuffer nullSecond = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""),
+                                                                                                    decomposeAndRandomPad(Int32Type.instance, null)});
+        ByteBuffer firstOnly = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, "")});
+        ByteBuffer none = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[0]);
+
+        TupleType tupleType = new TupleType(ImmutableList.of(AsciiType.instance, Int32Type.instance));
+        testBuffers(tupleType, ImmutableList.of(both, nullSecond, firstOnly, none));
+    }
+
+    void assertTupleComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
+    {
+        // Pack (o1, o2) and (o3, o4) into tuple values and compare them through the tuple type of (t1, t2).
+        ByteBuffer left = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o1), t2.decompose(o2)});
+        ByteBuffer right = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o3), t2.decompose(o4)});
+        assertComparesSame(new TupleType(ImmutableList.of(t1, t2)), left, right);
+    }
+
+    @Test
+    public void testCompositeType()
+    {
+        // Seeded with 0, so the generator handed to the sampling helper yields the same sequence on every run.
+        testCombinationSampling(new Random(0), this::assertCompositeComparesSame);
+    }
+
+    @Test
+    public void testCompositeTypeNonFull()
+    {
+        final CompositeType composite = CompositeType.getInstance(AsciiType.instance, Int32Type.instance);
+        // Full, null-second, one-component and empty composites, then the last two again with the boolean argument true.
+        final List<ByteBuffer> cases = ImmutableList.of(
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, "")),
+            CompositeType.build(ByteBufferAccessor.instance),
+            CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(AsciiType.instance, "")),
+            CompositeType.build(ByteBufferAccessor.instance,true)
+        );
+        // Each case has to pass the type's own validation before it takes part in the comparison.
+        cases.forEach(composite::validate);
+        testBuffers(composite, cases);
+    }
+
+    void assertCompositeComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
+    {
+        // Build padded composites (o1, o2) and (o3, o4) and compare them through the composite type of (t1, t2).
+        ByteBuffer left = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2));
+        ByteBuffer right = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o3), decomposeAndRandomPad(t2, o4));
+        assertComparesSame(CompositeType.getInstance(t1, t2), left, right);
+    }
+
+    @Test
+    public void testDynamicComposite()
+    {
+        final DynamicCompositeType dynamicType = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases);
+        final UUID[] ids = DynamicCompositeTypeTest.uuids;
+        final List<ByteBuffer> keys = ImmutableList.of(
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", null, -1, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", ids[0], 24, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", ids[0], 42, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test2", ids[0], -1, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test2", ids[1], 42, false, true)
+        );
+
+        // Each key has to pass the type's own validation before it takes part in the comparison.
+        keys.forEach(dynamicType::validate);
+        testBuffers(dynamicType, keys);
+    }
+
+    @Test
+    public void testListTypeString()
+    {
+        testCollection(ListType.getInstance(AsciiType.instance, true), testStrings, ArrayList::new, new Random());
+    }
+
+    @Test
+    public void testListTypeLong()
+    {
+        testCollection(ListType.getInstance(LongType.instance, true), testLongs, ArrayList::new, new Random());
+    }
+
+    @Test
+    public void testSetTypeString()
+    {
+        testCollection(SetType.getInstance(AsciiType.instance, true), testStrings, HashSet::new, new Random());
+    }
+
+    @Test
+    public void testSetTypeLong()
+    {
+        testCollection(SetType.getInstance(LongType.instance, true), testLongs, HashSet::new, new Random());
+    }
+
+    <T, CT extends Collection<T>> void testCollection(CollectionType<CT> tt, T[] values, Supplier<CT> gen, Random rand)
+    {
+        // One empty collection, then five each built with 1, 2 and 3 insertions, cycling through values in order (rand is not used).
+        final List<CT> samples = new ArrayList<>();
+        samples.add(gen.get());
+        int next = 0;
+        for (int size = 1; size <= 3; ++size)
+            for (int repeat = 0; repeat < 5; ++repeat)
+            {
+                final CT sample = gen.get();
+                for (int added = 0; added < size; ++added, ++next)
+                    sample.add(values[next % values.length]);
+                samples.add(sample);
+            }
+        testType(tt, samples.toArray());
+    }
+
+    @Test
+    public void testMapTypeStringLong()
+    {
+        testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, HashMap::new, new Random());
+    }
+
+    @Test
+    public void testMapTypeStringLongTree()
+    {
+        testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, TreeMap::new, new Random());
+    }
+
+    @Test
+    public void testDecoratedKeyPrefixesVOSS41()
+    {
+        // The ordering and not-a-prefix checks on decorated keys are expected to hold under the OSS 4.1 encoding.
+        testDecoratedKeyPrefixes(Version.OSS41);
+    }
+
+    @Test
+    public void testDecoratedKeyPrefixesVLegacy()
+    {
+        // With the legacy encoding the checks are expected to trip, so an AssertionError is the passing outcome.
+        boolean failedAsExpected = false;
+        try
+        {
+            testDecoratedKeyPrefixes(Version.LEGACY);
+        }
+        catch (AssertionError expected)
+        {
+            failedAsExpected = true;
+        }
+        Assert.assertTrue("Test expected to fail.", failedAsExpected);
+    }
+
+    @Test
+    public void testFixedLengthWithOffset()
+    {
+        byte[] bytes = new byte[]{ 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+        // A single byte taken from the start of the array.
+        ByteSource source = ByteSource.fixedLength(bytes, 0, 1);
+        assertEquals(1, source.next());
+        assertEquals(ByteSource.END_OF_STREAM, source.next());
+        // Five bytes starting at offset 4, i.e. the tail of the array.
+        source = ByteSource.fixedLength(bytes, 4, 5);
+        assertEquals(5, source.next());
+        assertEquals(6, source.next());
+        assertEquals(7, source.next());
+        assertEquals(8, source.next());
+        assertEquals(9, source.next());
+        assertEquals(ByteSource.END_OF_STREAM, source.next());
+        // Zero-length source at the very end of the array. It has to be assigned: otherwise the assertion
+        source = ByteSource.fixedLength(bytes, 9, 0); // below would only re-read the exhausted source above.
+        assertEquals(ByteSource.END_OF_STREAM, source.next());
+    }
+
+    @Test
+    public void testFixedLengthNegativeLength()
+    {
+        byte[] bytes = new byte[]{ 1, 2, 3 };
+        // A negative length is expected to be rejected with IllegalArgumentException.
+        expectedException.expect(IllegalArgumentException.class);
+        ByteSource.fixedLength(bytes, 0, -1);
+    }
+
+    @Test
+    public void testFixedLengthNegativeOffset()
+    {
+        byte[] bytes = new byte[]{ 1, 2, 3 };
+        // A negative offset is expected to be rejected with IllegalArgumentException.
+        expectedException.expect(IllegalArgumentException.class);
+        ByteSource.fixedLength(bytes, -1, 1);
+    }
+
+    @Test
+    public void testFixedLengthOutOfBounds()
+    {
+        byte[] bytes = new byte[]{ 1, 2, 3 };
+        // Asking for four bytes of a three-byte array is expected to be rejected with IllegalArgumentException.
+        expectedException.expect(IllegalArgumentException.class);
+        ByteSource.fixedLength(bytes, 0, 4);
+    }
+
+    @Test
+    public void testFixedOffsetOutOfBounds()
+    {
+        byte[] bytes = new byte[]{ 1, 2, 3 };
+        // An offset of 4 into a three-byte array is expected to be rejected with IllegalArgumentException.
+        expectedException.expect(IllegalArgumentException.class);
+        ByteSource.fixedLength(bytes, 4, 1);
+    }
+
+    public void testDecoratedKeyPrefixes(Version version)
+    {
+        testDecoratedKeyPrefixes("012345678BCDE\0", "", version); // first group: keys that end in a NUL character
+        testDecoratedKeyPrefixes("012345678ABCDE\0", "ABC", version);
+        testDecoratedKeyPrefixes("0123456789ABCDE\0", "\0AB", version);
+        testDecoratedKeyPrefixes("0123456789ABCDEF\0", "\0", version);
+        // Second group: keys of various lengths that do not end in a NUL character.
+        testDecoratedKeyPrefixes("0123456789ABCDEF0", "ABC", version);
+        testDecoratedKeyPrefixes("0123456789ABCDEF", "", version);
+        testDecoratedKeyPrefixes("0123456789ABCDE", "", version);
+        testDecoratedKeyPrefixes("0123456789ABCD", "\0AB", version);
+        testDecoratedKeyPrefixes("0123456789ABC", "\0", version);
+
+    }
+
+    public void testDecoratedKeyPrefixes(String key, String append, Version version)
+    {
+        // Pairs key with the buffer returned by Util.generateMurmurCollision and checks ordering and prefix-freedom.
+        logger.info("Testing {} + {}", safeStr(key), safeStr(append));
+        final ByteBuffer original = ByteBufferUtil.bytes(key);
+        final ByteBuffer collision = Util.generateMurmurCollision(original, append.getBytes(StandardCharsets.UTF_8));
+
+        final long[] digest = new long[2];
+        MurmurHash.hash3_x64_128(original, 0, original.limit(), 0, digest);
+        logger.info(String.format("Original hash  %016x,%016x", digest[0], digest[1]));
+        MurmurHash.hash3_x64_128(collision, 0, collision.limit(), 0, digest);
+        logger.info(String.format("Collision hash %016x,%016x", digest[0], digest[1]));
+
+        final DecoratedKey originalKey = Murmur3Partitioner.instance.decorateKey(original);
+        final DecoratedKey collisionKey = Murmur3Partitioner.instance.decorateKey(collision);
+        logger.info("{}\n{}\n{}\n{}", originalKey, collisionKey, originalKey.byteComparableAsString(version), collisionKey.byteComparableAsString(version));
+
+        final ByteSource originalBytes = originalKey.asComparableBytes(version);
+        final ByteSource collisionBytes = collisionKey.asComparableBytes(version);
+        logger.info("{}\n{}", originalBytes, collisionBytes);
+
+        // The byte-comparable order has to agree with DecoratedKey.compareTo ...
+        Assert.assertEquals(Long.signum(originalKey.compareTo(collisionKey)), ByteComparable.compare(originalKey, collisionKey, version));
+        // ... and neither byte stream may end while the other one still continues.
+        assertNotPrefix(originalBytes, collisionBytes);
+    }
+
+    private void assertNotPrefix(ByteSource s1, ByteSource s2)
+    {
+        // Read both sources in lock-step until they differ or both have ended.
+        int left = s1.next();
+        int right = s2.next();
+        while (left == right && left != ByteSource.END_OF_STREAM)
+        {
+            left = s1.next();
+            right = s2.next();
+        }
+        // Identical streams are fine; otherwise neither side may have ended at the point where they differ.
+        if (left != right)
+        {
+            Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, left);
+            Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, right);
+        }
+    }
+
+    private int compare(ByteSource s1, ByteSource s2)
+    {
+        // Read both sources in lock-step and compare the first pair of values that differs (or the two end markers).
+        int left = s1.next();
+        int right = s2.next();
+        while (left == right && left != ByteSource.END_OF_STREAM)
+        {
+            left = s1.next();
+            right = s2.next();
+        }
+        return Integer.compare(left, right);
+    }
+
+    private void maybeAssertNotPrefix(ByteComparable s1, ByteComparable s2, Version version)
+    {
+        if (version == Version.OSS41) // the not-a-prefix check is applied to the OSS41 encoding only
+            assertNotPrefix(s1.asComparableBytes(version), s2.asComparableBytes(version));
+    }
+
+    private void maybeCheck41Properties(int expectedComparison, ByteComparable s1, ByteComparable s2, Version version)
+    {
+        // Runs only for OSS41, non-null inputs and unequal values; appends a random terminator in the separator range to each side.
+        if (version != Version.OSS41 || s1 == null || s2 == null || 0 == expectedComparison)
+            return;
+        final ThreadLocalRandom random = ThreadLocalRandom.current();
+        final int term1 = random.nextInt(ByteSource.MIN_SEPARATOR, ByteSource.MAX_SEPARATOR + 1);
+        final int term2 = random.nextInt(ByteSource.MIN_SEPARATOR, ByteSource.MAX_SEPARATOR + 1);
+        final String message = String.format("Comparison failed for %s(%s + %02x) and %s(%s + %02x)", s1, s1.byteComparableAsString(version), term1, s2, s2.byteComparableAsString(version), term2);
+        assertEquals(message, expectedComparison,
+                Integer.signum(compare(ByteSource.withTerminator(term1, s1.asComparableBytes(version)), ByteSource.withTerminator(term2, s2.asComparableBytes(version)))));
+        assertNotPrefix(ByteSource.withTerminator(term1, s1.asComparableBytes(version)), ByteSource.withTerminator(term2, s2.asComparableBytes(version)));
+    }
+
+    <K, V, M extends Map<K, V>> void testMap(MapType<K, V> tt, K[] keys, V[] values, Supplier<M> gen, Random rand)
+    {
+        // One empty map, then five each built with 1, 2 and 3 puts of randomly picked key/value pairs.
+        final List<M> samples = new ArrayList<>();
+        samples.add(gen.get());
+        for (int size = 1; size <= 3; ++size)
+            for (int repeat = 0; repeat < 5; ++repeat)
+            {
+                final M sample = gen.get();
+                for (int added = 0; added < size; ++added)
+                    sample.put(keys[rand.nextInt(keys.length)], values[rand.nextInt(values.length)]);
+                samples.add(sample);
+            }
+        testType(tt, samples.toArray());
+    }
+
+    /*
+     * Wraps a serialized value as a ByteComparable backed by type.asComparableBytes; toString() renders it via type.getString.
+     */
+    private ByteComparable typeToComparable(AbstractType type, ByteBuffer value)
+    {
+        return new ByteComparable()
+        {
+            @Override
+            public ByteSource asComparableBytes(Version v)
+            {
+                return type.asComparableBytes(value, v);
+            }
+
+            @Override
+            public String toString()
+            {
+                return type.getString(value);
+            }
+        };
+    }
+
+    public void testType(AbstractType type, Object[] values)
+    {
+        // Log each value in several renderings, check every ordered pair, then repeat once for the reversed type.
+        for (Object value : values)
+        {
+            final ByteBuffer padded = decomposeAndRandomPad(type, value);
+            logger.info("Value {} ({}) bytes {} ByteSource {}",
+                        safeStr(value), safeStr(type.getSerializer().toCQLLiteral(padded)), safeStr(ByteBufferUtil.bytesToHex(padded)),
+                        typeToComparable(type, padded).byteComparableAsString(Version.OSS41));
+        }
+        for (Object left : values)
+            for (Object right : values)
+                assertComparesSame(type, left, right);
+        if (!type.isReversed())
+            testType(ReversedType.getInstance(type), values);
+    }
+
+    public void testBuffers(AbstractType type, List<ByteBuffer> values)
+    {
+        // Listing the values is best-effort: an UnsupportedOperationException raised while logging is ignored.
+        try
+        {
+            for (Object value : values)
+            {
+                final ByteBuffer padded = decomposeAndRandomPad(type, value);
+                logger.info("Value {} bytes {} ByteSource {}",
+                            safeStr(type.getSerializer().toCQLLiteral(padded)), safeStr(ByteBufferUtil.bytesToHex(padded)),
+                            typeToComparable(type, padded).byteComparableAsString(Version.OSS41));
+            }
+        }
+        catch (UnsupportedOperationException ignored)
+        {
+            // Continue without listing values.
+        }
+        for (ByteBuffer left : values)
+            for (ByteBuffer right : values)
+                assertComparesSameBuffers(type, left, right);
+    }
+
+    void assertComparesSameBuffers(AbstractType type, ByteBuffer b1, ByteBuffer b2)
+    {
+        // For every encoding version, the byte-comparable order must have the same sign as type.compare.
+        final int expected = Integer.signum(type.compare(b1, b2));
+        final ByteComparable left = typeToComparable(type, b1);
+        final ByteComparable right = typeToComparable(type, b2);
+
+        for (Version version : Version.values())
+        {
+            final int actual = Integer.signum(ByteComparable.compare(left, right, version));
+            assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", ByteBufferUtil.bytesToHex(b1), left.byteComparableAsString(version), ByteBufferUtil.bytesToHex(b2), right.byteComparableAsString(version)),
+                         expected, actual);
+            maybeCheck41Properties(expected, left, right, version);
+        }
+    }
+
+    public void testDecoratedKeys(IPartitioner type, List<ByteBuffer> values)
+    {
+        // Every ordered pair is checked, including each buffer against itself.
+        for (ByteBuffer left : values)
+            values.forEach(right -> assertComparesSameDecoratedKeys(type, left, right));
+    }
+
+    void assertComparesSameDecoratedKeys(IPartitioner type, ByteBuffer b1, ByteBuffer b2)
+    {
+        // For every encoding version, the byte-comparable order of the decorated keys must match compareTo.
+        final DecoratedKey left = type.decorateKey(b1);
+        final DecoratedKey right = type.decorateKey(b2);
+        final int expected = Integer.signum(left.compareTo(right));
+
+        for (Version version : Version.values())
+        {
+            final int actual = Integer.signum(ByteComparable.compare(left, right, version));
+            final String message = String.format("Failed comparing %s[%s](%s) and %s[%s](%s)\npartitioner %s version %s",
+                                                 ByteBufferUtil.bytesToHex(b1),
+                                                 left,
+                                                 left.byteComparableAsString(version),
+                                                 ByteBufferUtil.bytesToHex(b2),
+                                                 right,
+                                                 right.byteComparableAsString(version),
+                                                 type,
+                                                 version);
+            assertEquals(message, expected, actual);
+            maybeAssertNotPrefix(left, right, version);
+        }
+    }
+
+    private Object safeStr(Object i)
+    {
+        // Null stays null; anything else is rendered with toString(), cut to 100 characters plus "..." when
+        // longer, and has its NUL characters replaced by the visible marker <0>.
+        if (i == null)
+            return null;
+        final String text = i.toString();
+        return (text.length() > 100 ? text.substring(0, 100) + "..." : text).replaceAll("\0", "<0>");
+    }
+
+    public <T> void testDirect(Function<T, ByteSource> convertor, BiFunction<T, T, Integer> comparator, T[] values)
+    {
+        // Null entries are skipped both when listing the values and when comparing them pairwise.
+        for (T value : values)
+            if (value != null)
+                logger.info("Value {} ByteSource {}\n", safeStr(value), convertor.apply(value));
+
+        for (T left : values)
+        {
+            if (left == null)
+                continue;
+            for (T right : values)
+                if (right != null)
+                    assertComparesSame(convertor, comparator, left, right);
+        }
+    }
+
+    <T> void assertComparesSame(Function<T, ByteSource> convertor, BiFunction<T, T, Integer> comparator, T v1, T v2)
+    {
+        final ByteComparable left = ignored -> convertor.apply(v1);   // the version argument is not used by the convertor,
+        final ByteComparable right = ignored -> convertor.apply(v2);  // which is why null can be passed as the version below
+        final int wanted = Integer.signum(comparator.apply(v1, v2));
+        final int got = Integer.signum(ByteComparable.compare(left, right, null));
+        assertEquals(String.format("Failed comparing %s and %s", v1, v2), wanted, got);
+    }
+
+    void assertComparesSame(AbstractType type, Object v1, Object v2)
+    {
+        ByteBuffer b1 = decomposeAndRandomPad(type, v1);
+        ByteBuffer b2 = decomposeAndRandomPad(type, v2);
+        int expected = Integer.signum(type.compare(b1, b2));
+        final ByteComparable bc1 = typeToComparable(type, b1);
+        final ByteComparable bc2 = typeToComparable(type, b2);
+        // Compare under every encoding version; a mismatch on a reversed type is re-checked as a single-component clustering.
+        for (Version version : Version.values())
+        {
+            int actual = Integer.signum(ByteComparable.compare(bc1, bc2, version));
+            if (expected != actual)
+            {
+                if (type.isReversed())
+                {
+                    // This can happen for reverse of nulls and prefixes. Check that it's ok within multi-component
+                    ClusteringComparator cc = new ClusteringComparator(type);
+                    ByteComparable c1 = cc.asByteComparable(Clustering.make(b1));
+                    ByteComparable c2 = cc.asByteComparable(Clustering.make(b2));
+                    int actualcc = Integer.signum(ByteComparable.compare(c1, c2, version));
+                    if (actualcc == expected)
+                        return; // NOTE(review): leaves the method, so versions not yet visited are never checked; confirm this is intended
+                    assertEquals(String.format("Failed comparing reversed %s(%s, %s) and %s(%s, %s) direct (%d) and as clustering", safeStr(v1), ByteBufferUtil.bytesToHex(b1), c1, safeStr(v2), ByteBufferUtil.bytesToHex(b2), c2, actual), expected, actualcc);
+                }
+                else
+                    assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", safeStr(v1), ByteBufferUtil.bytesToHex(b1), safeStr(v2), ByteBufferUtil.bytesToHex(b2)), expected, actual);
+            }
+            maybeCheck41Properties(expected, bc1, bc2, version);
+        }
+    }
+
+    ByteBuffer decomposeAndRandomPad(AbstractType type, Object v)
+    {
+        // Surrounds the serialized value with junk bytes and exposes only the value via position/limit; the generator is seeded with 0.
+        final ByteBuffer serialized = type.decompose(v);
+        final Random padding = new Random(0);
+        final int leading = padding.nextInt(16);
+        final int trailing = padding.nextInt(16);
+        final ByteBuffer padded = allocateBuffer(serialized.remaining() + leading + trailing);
+        padding.ints(leading).forEach(junk -> padded.put((byte) junk));
+        padded.put(serialized);
+        padding.ints(trailing).forEach(junk -> padded.put((byte) junk));
+        padded.clear().limit(padded.capacity() - trailing).position(leading);
+        return padded;
+    }
+
+    protected ByteBuffer allocateBuffer(int paddedCapacity)
+    {
+        return ByteBuffer.allocate(paddedCapacity); // heap-backed buffer; presumably protected so a subclass can substitute another kind
+    }
+}
diff --git a/update-history/STAR-801/73-819733f430 CORE-93: Add some tests b/update-history/STAR-801/73-819733f430 CORE-93: Add some tests
new file mode 100644
index 000000000000..8fbd81023924
--- /dev/null
+++ b/update-history/STAR-801/73-819733f430 CORE-93: Add some tests	
@@ -0,0 +1,19 @@
+--- a/test/unit/org/apache/cassandra/Util.java
++++ b/test/unit/org/apache/cassandra/Util.java
+@@ -19,16 +19,9 @@
+  *
+  */
+ 
+-<<<<<<<
+ import java.io.*;
+ import java.lang.reflect.Field;
+-=======
+-import java.io.Closeable;
+-import java.io.EOFException;
+-import java.io.File;
+-import java.io.IOError;
+ import java.math.BigInteger;
+->>>>>>>
+ import java.net.UnknownHostException;
+ import java.nio.ByteBuffer;
+ import java.nio.file.*;

From 7bb0ad043e80acfa4494a08c7404e3c44d2c1bf4 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Tue, 10 Nov 2020 13:54:32 +0100
Subject: [PATCH 031/151] STAR-15: Allow for other implementations of
 SSTableFormat

[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use PartitionIndexIterator abstract

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Test were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use PartitionIndexIterator abstract

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Test were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use PartitionIndexIterator abstract

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Test were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
the SSTableReader.Factory
:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use PartitionIndexIterator abstract

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Test were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
the SSTableReader.Factory

:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use PartitionIndexIterator abstract

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to a.o.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Test were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
the SSTableReader.Factory

Added methods to reset the iterator position and to set the position
to an exact value

:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use the PartitionIndexIterator abstraction.

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to o.a.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Tests were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
the SSTableReader.Factory

Added methods to reset the iterator position and to set the position
to an exact value

[402ce9f31313d3c963a845d04bd5e7ee93598119] STAR-15: Refactor explicit usages of BigTableRowIndexEntry.Serializer
Use generic PartitionIndexIterator or KeyIterator instead

:...skipping...
[288dd9e05f037674319b4891cae21c46afe2df2b] STAR-15: Add getType method to SSTableFormat iface

[ea53c0104fa704c28640b18ad0a977153269ff32] STAR-15: Remove getIndexSerializer from SSTableFormat
This includes introduction of PartitionIndexIterator.

Pull down serializer creation to the locations where it is needed.
In other locations, we use the PartitionIndexIterator abstraction.

[0b18884f7f547248c0fca9c9767d45e691695840] STAR-15: Just moved RowIndexEntry to o.a.c.io.sstable.format.big

[bfe955fc702eaba1cf2914642e42cc9d1fb10aef] STAR-15: Extract IndexState to a separate class

[044bb012176ccca7300643308e3181ea681c7d84] STAR-15: Move AbstractSSTableIterator to o.a.c.io.sstable.format

[9e946e674c6b33aec08174a41c7c7c27b64d052d] STAR-15: SSTableIterator and SSTableReversedIterator moved to o.a.c.io.sstable.format.big

[ff574f7e3681e3217125b28de04cf005875dcd77] STAR-15: Refactoring
1. Generic RowIndexEntry was extracted
2. AbstractSSTableIterator is now typed by row index entry
3. In SSTableReader some methods were squashed and some were pulled down
4. Tests were adjusted to just work (for now)

[a9493886af5d1b5c9d4e748b70b0dd3646713023] STAR-15: Moved IndexInfo to o.a.c.io.sstable.format.big

[fce4eea79084645c3b9b40c6d915dd88d8a29072] STAR-15: Move ColumnIndex to o.a.c.io.sstable.format.big

[91ab2a1b4b271094a8a069fa913401f38798d12c] STAR-15: Get rid of explicit BigTableIndexRowEntry usage in Verifier

[0e2ba2923c24c0e5a49b04ce9963d0f626da8f9e] STAR-15: Refactor KeyIterator to use PartitionIndexIterator
Also fixed a possible bug in ReducingKeyIterator - when an exception occurred during
instantiation of some SSTables passed to the constructor, the index files were left
unclosed

In SASIIndexBuilder exception transformation from IOE -> FSRE has been moved to the
enclosing block - it does not seem to change the semantics

In SSTableExport opening the SSTable was moved to the upper level as sstable instance
was needed to create PartitionIndexIterator

[82f88f6dfb95c3168884d5dc77df5374fb4c97d0] STAR-15: Refactor PartitionIndexIterator
Also make it possible to obtain PartitionIndexIterator directly from
the SSTableReader.Factory

Added methods to reset the iterator position and to set the position
to an exact value

[402ce9f31313d3c963a845d04bd5e7ee93598119] STAR-15: Refactor explicit usages of BigTableRowIndexEntry.Serializer
Use generic PartitionIndexIterator or KeyIterator instead

[ad4535715f117f349c99d98403b3f7c23454d9fd] Apply review comments

(cherry picked from commit 4520f47729752b78b5a83e71e29cb6ff45ceb599)
(cherry picked from commit 3aba2f97e3a64b125a840a1b244e1bd0fd81cd64)
---
 .../cassandra/db/ClusteringComparator.java    |   2 +-
 .../cassandra/db/ColumnFamilyStore.java       |   2 +-
 .../apache/cassandra/db/SSTableImporter.java  |  15 +-
 .../db/SinglePartitionReadCommand.java        |   1 +
 .../db/compaction/CompactionController.java   |   8 +-
 .../cassandra/db/compaction/Scrubber.java     | 122 +++-----
 .../cassandra/db/compaction/Verifier.java     |  46 ++-
 .../writers/MajorLeveledCompactionWriter.java |   4 +-
 .../writers/MaxSSTableSizeWriter.java         |   4 +-
 .../SplittingSizeTieredCompactionWriter.java  |   4 +-
 .../rows/UnfilteredRowIteratorSerializer.java |   1 +
 .../UnfilteredRowIteratorWithLowerBound.java  |   7 +-
 .../index/sasi/SASIIndexBuilder.java          |  37 ++-
 .../cassandra/io/sstable/KeyIterator.java     | 156 +++++-----
 .../io/sstable/ReducingKeyIterator.java       |  14 +-
 .../apache/cassandra/io/sstable/SSTable.java  |  29 +-
 .../io/sstable/SSTableIdentityIterator.java   |   1 +
 .../cassandra/io/sstable/SSTableRewriter.java |  15 +-
 .../io/sstable/SimpleSSTableMultiWriter.java  |   4 +-
 .../format}/AbstractSSTableIterator.java      | 270 ++----------------
 .../format/PartitionIndexIterator.java        |  77 +++++
 .../io/sstable/format/RowIndexEntry.java      |  48 ++++
 .../io/sstable/format/SSTableFormat.java      |  10 +-
 .../io/sstable/format/SSTableReader.java      | 126 +++-----
 .../sstable/format/SSTableReaderBuilder.java  |  65 ++---
 .../sstable/format/SSTableReadsListener.java  |   2 +-
 .../io/sstable/format/SSTableWriter.java      |   6 +-
 .../format/big/AbstractBigTableIterator.java  |  87 ++++++
 .../io/sstable/format/big/BigFormat.java      |  35 ++-
 .../big/BigTablePartitionIndexIterator.java   | 173 +++++++++++
 .../io/sstable/format/big/BigTableReader.java |  60 +++-
 .../format/big/BigTableRowIndexEntry.java}    |  84 +++---
 .../sstable/format/big/BigTableScanner.java   |  26 +-
 .../io/sstable/format/big/BigTableWriter.java |  46 ++-
 .../sstable/format/big}/ColumnIndex.java      |  13 +-
 .../sstable/{ => format/big}/IndexInfo.java   |   6 +-
 .../io/sstable/format/big/IndexState.java     | 228 +++++++++++++++
 .../sstable/format/big}/SSTableIterator.java  |  12 +-
 .../format/big}/SSTableReversedIterator.java  |  12 +-
 .../cassandra/service/CacheService.java       |  25 +-
 .../apache/cassandra/tools/SSTableExport.java |   6 +-
 .../apache/cassandra/utils/StatusLogger.java  |   4 +-
 .../distributed/test/FailingRepairTest.java   |   7 +
 ...yInspectorCorruptSSTableExceptionTest.java |   8 +-
 .../format/ForwardingSSTableReader.java       |  33 +--
 .../cassandra/cache/AutoSavingCacheTest.java  |   3 +-
 .../cql3/QueryWithIndexedSSTableTest.java     |   4 +-
 .../TombstonesWithIndexedSSTableTest.java     |   5 +-
 .../org/apache/cassandra/db/KeyCacheTest.java |  17 +-
 .../org/apache/cassandra/db/KeyspaceTest.java |   3 +-
 .../streaming/CassandraOutgoingFileTest.java  |  11 +-
 .../big/BigTableRowIndexEntryTest.java}       |  52 ++--
 .../SSTableReverseIteratorTest.java           |   6 +-
 ...for other implementations of SSTableFormat | 212 ++++++++++++++
 54 files changed, 1408 insertions(+), 846 deletions(-)
 rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format}/AbstractSSTableIterator.java (54%)
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
 rename src/java/org/apache/cassandra/{db/RowIndexEntry.java => io/sstable/format/big/BigTableRowIndexEntry.java} (91%)
 rename src/java/org/apache/cassandra/{db => io/sstable/format/big}/ColumnIndex.java (95%)
 rename src/java/org/apache/cassandra/io/sstable/{ => format/big}/IndexInfo.java (97%)
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
 rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format/big}/SSTableIterator.java (96%)
 rename src/java/org/apache/cassandra/{db/columniterator => io/sstable/format/big}/SSTableReversedIterator.java (97%)
 rename test/unit/org/apache/cassandra/{db/RowIndexEntryTest.java => io/sstable/format/big/BigTableRowIndexEntryTest.java} (93%)
 rename test/unit/org/apache/cassandra/{db => io/sstable/format}/columniterator/SSTableReverseIteratorTest.java (93%)
 create mode 100644 update-history/STAR-801/72-3aba2f97e3 STAR-15: Allow for other implementations of SSTableFormat

diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java
index a23aa36017ae..e5b63aa6a27a 100644
--- a/src/java/org/apache/cassandra/db/ClusteringComparator.java
+++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java
@@ -30,7 +30,7 @@
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.serializers.MarshalException;
 
-import org.apache.cassandra.io.sstable.IndexInfo;
+import org.apache.cassandra.io.sstable.format.big.IndexInfo;
 import org.apache.cassandra.utils.ByteComparable;
 import org.apache.cassandra.utils.ByteSource;
 
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 6b49855ec2fe..f80a04366238 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -1770,7 +1770,7 @@ public List<String> getSSTablesForKey(String key, boolean hexFormat)
             for (SSTableReader sstr : select(View.select(SSTableSet.LIVE, dk)).sstables)
             {
                 // check if the key actually exists in this sstable, without updating cache and stats
-                if (sstr.getPosition(dk, SSTableReader.Operator.EQ, false) != null)
+                if (sstr.checkEntryExists(dk, SSTableReader.Operator.EQ, false))
                     files.add(sstr.getFilename());
             }
             return files;
diff --git a/src/java/org/apache/cassandra/db/SSTableImporter.java b/src/java/org/apache/cassandra/db/SSTableImporter.java
index 989ff12297a7..5bcbd4c528cd 100644
--- a/src/java/org/apache/cassandra/db/SSTableImporter.java
+++ b/src/java/org/apache/cassandra/db/SSTableImporter.java
@@ -181,8 +181,15 @@ synchronized List<String> importNewSSTables(Options options)
             cfs.getTracker().addSSTables(newSSTables);
             for (SSTableReader reader : newSSTables)
             {
-                if (options.invalidateCaches && cfs.isRowCacheEnabled())
-                    invalidateCachesForSSTable(reader.descriptor);
+                try
+                {
+                    if (options.invalidateCaches && cfs.isRowCacheEnabled())
+                        invalidateCachesForSSTable(reader);
+                }
+                catch (IOException ex)
+                {
+                    throw new RuntimeException(ex);
+                }
             }
 
         }
@@ -311,9 +318,9 @@ private void removeCopiedSSTables(Set<MovedSSTable> movedSSTables)
      * Iterates over all keys in the sstable index and invalidates the row cache
      */
     @VisibleForTesting
-    void invalidateCachesForSSTable(Descriptor desc)
+    void invalidateCachesForSSTable(SSTableReader reader) throws IOException
     {
-        try (KeyIterator iter = new KeyIterator(desc, cfs.metadata()))
+        try (KeyIterator iter = KeyIterator.forSSTable(reader))
         {
             while (iter.hasNext())
             {
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 7dba4d88380e..df52130e8a25 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.db.transform.RTBoundValidator;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.util.DataInputPlus;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index e1b0f3258359..6078dabac2e6 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
@@ -257,7 +257,7 @@ public LongPredicate getPurgeEvaluator(DecoratedKey key)
         {
             // if we don't have bloom filter(bf_fp_chance=1.0 or filter file is missing),
             // we check index file instead.
-            if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.getPosition(key, SSTableReader.Operator.EQ, false) != null
+            if (sstable.getBloomFilter() instanceof AlwaysPresentFilter && sstable.checkEntryExists(key, SSTableReader.Operator.EQ, false)
                 || sstable.getBloomFilter().isPresent(key))
             {
                 minTimestampSeen = Math.min(minTimestampSeen, sstable.getMinTimestamp());
@@ -321,11 +321,7 @@ private UnfilteredRowIterator getShadowIterator(SSTableReader reader, DecoratedK
             reader.getMaxTimestamp() <= minTimestamp ||
             tombstoneOnly && !reader.mayHaveTombstones())
             return null;
-        RowIndexEntry<?> position = reader.getPosition(key, SSTableReader.Operator.EQ);
-        if (position == null)
-            return null;
-        FileDataInput dfile = openDataFiles.computeIfAbsent(reader, this::openDataFile);
-        return reader.simpleIterator(dfile, key, position, tombstoneOnly);
+        return reader.simpleIterator(() -> openDataFiles.computeIfAbsent(reader, this::openDataFile), key, tombstoneOnly);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index 5884f989e008..b0d601937a47 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -25,9 +25,9 @@
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Throwables;
 import com.google.common.collect.ImmutableSet;
 
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -60,19 +60,13 @@ public class Scrubber implements Closeable
 
     private final ReadWriteLock fileAccessLock;
     private final RandomAccessReader dataFile;
-    private final RandomAccessReader indexFile;
+    private final PartitionIndexIterator indexIterator;
     private final ScrubInfo scrubInfo;
-    private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
 
     private int goodRows;
     private int badRows;
     private int emptyRows;
 
-    private ByteBuffer currentIndexKey;
-    private ByteBuffer nextIndexKey;
-    long currentRowPositionFromIndex;
-    long nextRowPositionFromIndex;
-
     private NegativeLocalDeletionInfoMetrics negativeLocalDeletionInfoMetrics = new NegativeLocalDeletionInfoMetrics();
 
     private final OutputHandler outputHandler;
@@ -111,9 +105,6 @@ public Scrubber(ColumnFamilyStore cfs,
         this.outputHandler = outputHandler;
         this.skipCorrupted = skipCorrupted;
         this.reinsertOverflowedTTLRows = reinsertOverflowedTTLRows;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(cfs.metadata(),
-                                                                                                        sstable.descriptor.version,
-                                                                                                        sstable.header);
 
         List<SSTableReader> toScrub = Collections.singletonList(sstable);
 
@@ -141,19 +132,29 @@ public Scrubber(ColumnFamilyStore cfs,
                         ? sstable.openDataReader()
                         : sstable.openDataReader(CompactionManager.instance.getRateLimiter());
 
-        this.indexFile = hasIndexFile
-                ? RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)))
-                : null;
+        this.indexIterator = hasIndexFile
+                             ? openIndexIterator()
+                             : null;
 
         this.scrubInfo = new ScrubInfo(dataFile, sstable, fileAccessLock.readLock());
 
-        this.currentRowPositionFromIndex = 0;
-        this.nextRowPositionFromIndex = 0;
-
         if (reinsertOverflowedTTLRows)
             outputHandler.output("Starting scrub with reinsert overflowed TTL option");
     }
 
+    private PartitionIndexIterator openIndexIterator()
+    {
+        try
+        {
+            return sstable.allKeysIterator();
+        }
+        catch (IOException e)
+        {
+            outputHandler.warn("Index is unreadable.");
+        }
+        return null;
+    }
+
     private UnfilteredRowIterator withValidation(UnfilteredRowIterator iter, String filename)
     {
         return checkData ? UnfilteredRowIterators.withValidation(iter, filename) : iter;
@@ -167,24 +168,7 @@ public void scrub()
         try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge);
              Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable)))
         {
-            try
-            {
-                nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null;
-                if (indexAvailable())
-                {
-                    // throw away variable so we don't have a side effect in the assert
-                    long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
-                    assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
-                }
-            }
-            catch (Throwable ex)
-            {
-                throwIfFatal(ex);
-                nextIndexKey = null;
-                nextRowPositionFromIndex = dataFile.length();
-                if (indexFile != null)
-                    indexFile.seek(indexFile.length());
-            }
+            assert !indexAvailable() || indexIterator.dataPosition() == 0 : indexIterator.dataPosition();
 
             StatsMetadata metadata = sstable.getSSTableMetadata();
             writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, metadata.repairedAt, metadata.pendingRepair, metadata.isTransient, sstable, transaction));
@@ -210,22 +194,18 @@ public void scrub()
                     // check for null key below
                 }
 
+                long dataStart = dataFile.getFilePointer();
+
                 long dataStartFromIndex = -1;
                 long dataSizeFromIndex = -1;
-
-                updateIndexKey();
-
-                if (indexAvailable())
+                ByteBuffer currentIndexKey = indexIterator != null ? indexIterator.key() : null;
+                if (currentIndexKey != null)
                 {
-                    if (currentIndexKey != null)
-                    {
-                        dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining();
-                        dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
-                    }
+                    dataStartFromIndex = indexIterator.dataPosition() + TypeSizes.SHORT_SIZE + currentIndexKey.remaining();
+                    if (advanceIndexNoThrow())
+                        dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex;
                 }
 
-                long dataStart = dataFile.getFilePointer();
-
                 // avoid an NPE if key is null
                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex)));
@@ -244,10 +224,10 @@ public void scrub()
                                 "_too big_", ByteBufferUtil.bytesToHex(currentIndexKey))));
                     }
 
-                    if (indexFile != null && dataSizeFromIndex > dataFile.length())
+                    if (indexIterator != null && dataSizeFromIndex > dataFile.length())
                         throw new IOError(new IOException("Impossible row size (greater than file length): " + dataSizeFromIndex));
 
-                    if (indexFile != null && dataStart != dataStartFromIndex)
+                    if (indexIterator != null && dataStart != dataStartFromIndex)
                         outputHandler.warn(String.format("Data file row position %d differs from index file row position %d", dataStart, dataStartFromIndex));
 
                     if (tryAppend(prevKey, key, writer))
@@ -382,51 +362,41 @@ private UnfilteredRowIterator getIterator(DecoratedKey key)
                                                                                     negativeLocalDeletionInfoMetrics) : rowMergingIterator;
     }
 
-    private void updateIndexKey()
+    private boolean advanceIndexNoThrow()
     {
-        currentIndexKey = nextIndexKey;
-        currentRowPositionFromIndex = nextRowPositionFromIndex;
         try
         {
-            nextIndexKey = !indexAvailable() ? null : ByteBufferUtil.readWithShortLength(indexFile);
-
-            nextRowPositionFromIndex = !indexAvailable()
-                                       ? dataFile.length()
-                                       : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+            return indexAvailable() && indexIterator.advance();
         }
         catch (Throwable th)
         {
             JVMStabilityInspector.inspectThrowable(th);
             outputHandler.warn("Error reading index file", th);
-            nextIndexKey = null;
-            nextRowPositionFromIndex = dataFile.length();
-            if (indexFile != null)
-                indexFile.seek(indexFile.length());
+            indexIterator.close();
+            return false;
         }
     }
 
     private boolean indexAvailable()
     {
-        return indexFile != null && !indexFile.isEOF();
+        return indexIterator != null && !indexIterator.isExhausted();
     }
 
     private void seekToNextRow()
     {
-        while(nextRowPositionFromIndex < dataFile.length())
-        {
-            try
-            {
-                dataFile.seek(nextRowPositionFromIndex);
-                return;
-            }
-            catch (Throwable th)
-            {
-                throwIfFatal(th);
-                outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
-                badRows++;
-            }
+        long nextRowPositionFromIndex = indexIterator.isExhausted()
+                                        ? dataFile.length()
+                                        : indexIterator.dataPosition();
 
-            updateIndexKey();
+        try
+        {
+            dataFile.seek(nextRowPositionFromIndex);
+        }
+        catch (Throwable th)
+        {
+            throwIfFatal(th);
+            outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
+            badRows++;
         }
     }
 
@@ -469,7 +439,7 @@ public void close()
         try
         {
             FileUtils.closeQuietly(dataFile);
-            FileUtils.closeQuietly(indexFile);
+        FileUtils.closeQuietly(indexIterator);
         }
         finally
         {
diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java
index 68d5163e4d85..fb4a17f810f3 100644
--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
+++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java
@@ -32,6 +32,7 @@
 import org.apache.cassandra.io.sstable.IndexSummary;
 import org.apache.cassandra.io.sstable.KeyIterator;
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -76,9 +77,7 @@ public class Verifier implements Closeable
 
     private final ReadWriteLock fileAccessLock;
     private final RandomAccessReader dataFile;
-    private final RandomAccessReader indexFile;
     private final VerifyInfo verifyInfo;
-    private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
     private final Options options;
     private final boolean isOffline;
     /**
@@ -103,7 +102,6 @@ public Verifier(ColumnFamilyStore cfs, SSTableReader sstable, OutputHandler outp
         this.cfs = cfs;
         this.sstable = sstable;
         this.outputHandler = outputHandler;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(cfs.metadata(), sstable.descriptor.version, sstable.header);
 
         this.controller = new VerifyController(cfs);
 
@@ -111,7 +109,6 @@ public Verifier(ColumnFamilyStore cfs, SSTableReader sstable, OutputHandler outp
         this.dataFile = isOffline
                         ? sstable.openDataReader()
                         : sstable.openDataReader(CompactionManager.instance.getRateLimiter());
-        this.indexFile = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)));
         this.verifyInfo = new VerifyInfo(dataFile, sstable, fileAccessLock.readLock());
         this.options = options;
         this.isOffline = isOffline;
@@ -185,7 +182,7 @@ public void verify()
         if (options.checkOwnsTokens && !isOffline && !(cfs.getPartitioner() instanceof LocalPartitioner))
         {
             outputHandler.debug("Checking that all tokens are owned by the current node");
-            try (KeyIterator iter = new KeyIterator(sstable.descriptor, sstable.metadata()))
+            try (KeyIterator iter = KeyIterator.forSSTable(sstable))
             {
                 List<Range<Token>> ownedRanges = Range.normalize(tokenLookup.apply(cfs.metadata.keyspace));
                 if (ownedRanges.isEmpty())
@@ -239,14 +236,10 @@ public void verify()
 
         outputHandler.output("Extended Verify requested, proceeding to inspect values");
 
-        try
+        try(PartitionIndexIterator indexIterator = sstable.allKeysIterator())
         {
-            ByteBuffer nextIndexKey = ByteBufferUtil.readWithShortLength(indexFile);
-            {
-                long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
-                if (firstRowPositionFromIndex != 0)
-                    markAndThrow(new RuntimeException("firstRowPositionFromIndex != 0: "+firstRowPositionFromIndex));
-            }
+            if (indexIterator.dataPosition() != 0)
+                markAndThrow(new RuntimeException("First row position from index != 0: " + indexIterator.dataPosition()));
 
             List<Range<Token>> ownedRanges = isOffline ? Collections.emptyList() : Range.normalize(tokenLookup.apply(cfs.metadata().keyspace));
             RangeOwnHelper rangeOwnHelper = new RangeOwnHelper(ownedRanges);
@@ -285,14 +278,18 @@ public void verify()
                     }
                 }
 
-                ByteBuffer currentIndexKey = nextIndexKey;
+                ByteBuffer currentIndexKey = indexIterator.key();
                 long nextRowPositionFromIndex = 0;
                 try
                 {
-                    nextIndexKey = indexFile.isEOF() ? null : ByteBufferUtil.readWithShortLength(indexFile);
-                    nextRowPositionFromIndex = indexFile.isEOF()
-                                             ? dataFile.length()
-                                             : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+                    if (indexIterator.advance())
+                    {
+                        nextRowPositionFromIndex = indexIterator.dataPosition();
+                    }
+                    else
+                    {
+                        nextRowPositionFromIndex = dataFile.length();
+                    }
                 }
                 catch (Throwable th)
                 {
@@ -309,8 +306,6 @@ public void verify()
                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSize)));
 
-                assert currentIndexKey != null || indexFile.isEOF();
-
                 try
                 {
                     if (key == null || dataSize > dataFile.length())
@@ -413,15 +408,9 @@ public boolean check(DecoratedKey key)
 
     private void deserializeIndex(SSTableReader sstable) throws IOException
     {
-        try (RandomAccessReader primaryIndex = RandomAccessReader.open(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX))))
-        {
-            long indexSize = primaryIndex.length();
-
-            while ((primaryIndex.getFilePointer()) != indexSize)
-            {
-                ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex);
-                RowIndexEntry.Serializer.skip(primaryIndex, sstable.descriptor.version);
-            }
+        try (PartitionIndexIterator it = sstable.allKeysIterator()) {
+            //noinspection StatementWithEmptyBody
+            while (it.advance()); // no-op, just check if index is readable
         }
     }
 
@@ -460,7 +449,6 @@ public void close()
         try
         {
             FileUtils.closeQuietly(dataFile);
-            FileUtils.closeQuietly(indexFile);
         }
         finally
         {
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
index 1c5360020b49..93043913f39b 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
@@ -21,7 +21,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.LeveledManifest;
@@ -70,7 +70,7 @@ public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
     @SuppressWarnings("resource")
     public boolean realAppend(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
+        BigTableRowIndexEntry rie = sstableWriter.append(partition);
         partitionsWritten++;
         long totalWrittenInCurrentWriter = sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten();
         if (totalWrittenInCurrentWriter > maxSSTableSize)
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
index 915f96bfb431..af21e51ed4f3 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
@@ -21,7 +21,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -81,7 +81,7 @@ private static long getTotalWriteSize(Iterable<SSTableReader> nonExpiredSSTables
 
     protected boolean realAppend(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
+        BigTableRowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > maxSSTableSize)
         {
             switchCompactionLocation(sstableDirectory);
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
index d29061ca8630..f2d6fe91674a 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
@@ -25,7 +25,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -86,7 +86,7 @@ public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories di
     @Override
     public boolean realAppend(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
+        BigTableRowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect
         {
             currentRatioIndex++;
diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java
index 938a3eed114e..9b93c89f8454 100644
--- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorSerializer.java
@@ -26,6 +26,7 @@
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.io.sstable.format.big.ColumnIndex;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.TableMetadata;
diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
index b6f425458dec..d0ba98f7075f 100644
--- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
@@ -26,12 +26,13 @@
 import java.util.List;
 
 import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ClusteringIndexFilter;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.transform.RTBoundValidator;
-import org.apache.cassandra.io.sstable.IndexInfo;
+import org.apache.cassandra.io.sstable.format.big.IndexInfo;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
@@ -189,11 +190,11 @@ private ClusteringBound<?> getPartitionIndexLowerBound()
         if (!canUseMetadataLowerBound())
             maybeInit();
 
-        RowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false);
+        BigTableRowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false);
         if (rowIndexEntry == null || !rowIndexEntry.indexOnHeap())
             return null;
 
-        try (RowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null))
+        try (BigTableRowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null))
         {
             IndexInfo column = onHeapRetriever.columnsIndex(filter.isReversed() ? rowIndexEntry.columnsIndexCount() - 1 : 0);
             ClusteringPrefix<?> lowerBoundPrefix = filter.isReversed() ? column.lastName : column.firstName;
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
index bb42dc2d178a..c2b0aa19c9d3 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
@@ -24,10 +24,10 @@
 import java.io.IOException;
 import java.util.*;
 
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -78,7 +78,7 @@ public void build()
                 PerSSTableIndexWriter indexWriter = SASIIndex.newWriter(keyValidator, sstable.descriptor, indexes, OperationType.COMPACTION);
 
                 long previousKeyPosition = 0;
-                try (KeyIterator keys = new KeyIterator(sstable.descriptor, cfs.metadata()))
+                try (KeyIterator keys = KeyIterator.forSSTable(sstable))
                 {
                     while (keys.hasNext())
                     {
@@ -90,25 +90,18 @@ public void build()
 
                         indexWriter.startPartition(key, keyPosition);
 
-                        try
-                        {
-                            RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
-                            dataFile.seek(indexEntry.position);
-                            ByteBufferUtil.readWithShortLength(dataFile); // key
-
-                            try (SSTableIdentityIterator partition = SSTableIdentityIterator.create(sstable, dataFile, key))
-                            {
-                                // if the row has statics attached, it has to be indexed separately
-                                if (cfs.metadata().hasStaticColumns())
-                                    indexWriter.nextUnfilteredCluster(partition.staticRow());
-
-                                while (partition.hasNext())
-                                    indexWriter.nextUnfilteredCluster(partition.next());
-                            }
-                        }
-                        catch (IOException ex)
+                        RowIndexEntry<?> indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
+                        dataFile.seek(indexEntry.position);
+                        ByteBufferUtil.readWithShortLength(dataFile); // key
+
+                        try (SSTableIdentityIterator partition = SSTableIdentityIterator.create(sstable, dataFile, key))
                         {
-                            throw new FSReadError(ex, sstable.getFilename());
+                            // if the row has statics attached, it has to be indexed separately
+                            if (cfs.metadata().hasStaticColumns())
+                                indexWriter.nextUnfilteredCluster(partition.staticRow());
+
+                            while (partition.hasNext())
+                                indexWriter.nextUnfilteredCluster(partition.next());
                         }
 
                         bytesProcessed += keyPosition - previousKeyPosition;
@@ -117,6 +110,10 @@ public void build()
 
                     completeSSTable(indexWriter, sstable, indexes.values());
                 }
+                catch (IOException ex)
+                {
+                    throw new FSReadError(ex, sstable.getFilename());
+                }
             }
         }
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
index 1a5792c4fad2..c31af70d4c0d 100644
--- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
@@ -17,105 +17,70 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.io.File;
 import java.io.IOException;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.io.util.DataInputPlus;
-import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.AbstractIterator;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CloseableIterator;
 
 public class KeyIterator extends AbstractIterator<DecoratedKey> implements CloseableIterator<DecoratedKey>
 {
-    private final static class In
-    {
-        private final File path;
-        private volatile RandomAccessReader in;
-
-        public In(File path)
-        {
-            this.path = path;
-        }
-
-        private void maybeInit()
-        {
-            if (in != null)
-                return;
-
-            synchronized (this)
-            {
-                if (in == null)
-                {
-                    in = RandomAccessReader.open(path);
-                }
-            }
-        }
-
-        public DataInputPlus get()
-        {
-            maybeInit();
-            return in;
-        }
-
-        public boolean isEOF()
-        {
-            maybeInit();
-            return in.isEOF();
-        }
-
-        public void close()
-        {
-            if (in != null)
-                in.close();
-        }
+    private final IPartitioner partitioner;
+    private final PartitionIndexIterator it;
+    private final ReadWriteLock fileAccessLock;
+    private final long indexLength;
 
-        public long getFilePointer()
-        {
-            maybeInit();
-            return in.getFilePointer();
-        }
+    private long keyPosition = -1;
 
-        public long length()
-        {
-            maybeInit();
-            return in.length();
-        }
+    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner, ReadWriteLock fileAccessLock)
+    {
+        this.it = it;
+        this.partitioner = partitioner;
+        this.fileAccessLock = fileAccessLock;
+        this.indexLength = it.indexLength();
     }
 
-    private final Descriptor desc;
-    private final In in;
-    private final IPartitioner partitioner;
-    private final ReadWriteLock fileAccessLock;
+    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner)
+    {
+        this(it, partitioner, null);
+    }
 
-    private long keyPosition;
+    public static KeyIterator forSSTable(SSTableReader ssTableReader) throws IOException
+    {
+        return new KeyIterator(ssTableReader.allKeysIterator(), ssTableReader.getPartitioner(), new ReentrantReadWriteLock());
+    }
 
-    public KeyIterator(Descriptor desc, TableMetadata metadata)
+    public static KeyIterator create(SSTableReader.Factory factory, Descriptor descriptor, TableMetadata metadata)
     {
-        this.desc = desc;
-        in = new In(new File(desc.filenameFor(Component.PRIMARY_INDEX)));
-        partitioner = metadata.partitioner;
-        fileAccessLock = new ReentrantReadWriteLock();
+        return new KeyIterator(factory.indexIterator(descriptor, metadata), metadata.partitioner, new ReentrantReadWriteLock());
     }
 
     protected DecoratedKey computeNext()
     {
-        fileAccessLock.readLock().lock();
+        if (fileAccessLock != null)
+            fileAccessLock.readLock().lock();
         try
         {
-            if (in.isEOF())
-                return endOfData();
-
-            keyPosition = in.getFilePointer();
-            DecoratedKey key = partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in.get()));
-            RowIndexEntry.Serializer.skip(in.get(), desc.version); // skip remainder of the entry
-            return key;
+            if (keyPosition < 0)
+            {
+                keyPosition = 0;
+                return it.isExhausted()
+                       ? endOfData()
+                       : partitioner.decorateKey(it.key());
+            }
+            else
+            {
+                keyPosition = it.indexPosition();
+                return it.advance()
+                       ? partitioner.decorateKey(it.key())
+                       : endOfData();
+            }
         }
         catch (IOException e)
         {
@@ -123,45 +88,94 @@ protected DecoratedKey computeNext()
         }
         finally
         {
-            fileAccessLock.readLock().unlock();
+            if (fileAccessLock != null)
+                fileAccessLock.readLock().unlock();
         }
     }
 
+    /**
+     * Closes the underlying partition index iterator. If a lock was supplied at construction, its
+     * write lock is held while closing.
+     */
     public void close()
     {
-        fileAccessLock.writeLock().lock();
+        if (fileAccessLock != null)
+            fileAccessLock.writeLock().lock();
         try
         {
-            in.close();
+            it.close();
         }
         finally
         {
-            fileAccessLock.writeLock().unlock();
+            if (fileAccessLock != null)
+                fileAccessLock.writeLock().unlock();
         }
     }
 
+    /**
+     * @return the current position reported by the underlying index iterator; read under the read
+     * lock if a lock was supplied at construction
+     */
     public long getBytesRead()
     {
-        fileAccessLock.readLock().lock();
+        if (fileAccessLock != null)
+            fileAccessLock.readLock().lock();
         try
         {
-            return in.getFilePointer();
+            return it.indexPosition();
         }
         finally
         {
-            fileAccessLock.readLock().unlock();
+            if (fileAccessLock != null)
+                fileAccessLock.readLock().unlock();
         }
     }
 
+    /**
+     * @return the index length captured from the underlying index iterator in the constructor
+     */
     public long getTotalBytes()
     {
-        // length is final in the referenced object.
-        // no need to acquire the lock
-        return in.length();
+        return indexLength;
     }
 
+    /**
+     * @return {@code -1} until a key has been computed (and again after {@link #reset()}), {@code 0}
+     * for the first key, and otherwise the index position observed just before the underlying
+     * iterator was advanced to the most recently computed key
+     */
     public long getKeyPosition()
     {
         return keyPosition;
     }
+
+    /**
+     * Resets the underlying index iterator and sets the key position back to {@code -1}, so the next
+     * {@link #computeNext()} call takes the first-key branch again.
+     * NOTE(review): the look-ahead state kept by {@code AbstractIterator} is not touched here;
+     * presumably callers only reset when no element is buffered and the iterator is not yet
+     * exhausted - confirm against the callers.
+     *
+     * @throws RuntimeException wrapping any {@link IOException} thrown while resetting
+     */
+    public void reset()
+    {
+        // read lock, as in computeNext(): only close() takes the write lock
+        if (fileAccessLock != null)
+            fileAccessLock.readLock().lock();
+        try
+        {
+            it.reset();
+            keyPosition = -1;
+        }
+        catch (IOException ex)
+        {
+            throw new RuntimeException(ex);
+        }
+        finally
+        {
+            if (fileAccessLock != null)
+                fileAccessLock.readLock().unlock();
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
index 826b91d65257..9a231c932355 100644
--- a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.io.sstable;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
@@ -26,6 +27,7 @@
 import org.apache.cassandra.utils.CloseableIterator;
 import org.apache.cassandra.utils.IMergeIterator;
 import org.apache.cassandra.utils.MergeIterator;
+import org.apache.cassandra.utils.Throwables;
 
 /**
  * Caller must acquire and release references to the sstables used here.
@@ -38,8 +40,29 @@ public class ReducingKeyIterator implements CloseableIterator<DecoratedKey>
     public ReducingKeyIterator(Collection<SSTableReader> sstables)
     {
         iters = new ArrayList<>(sstables.size());
-        for (SSTableReader sstable : sstables)
-            iters.add(new KeyIterator(sstable.descriptor, sstable.metadata()));
+        try
+        {
+            for (SSTableReader sstable : sstables)
+                iters.add(KeyIterator.forSSTable(sstable));
+        }
+        catch (IOException | RuntimeException ex)
+        {
+            // Opening one of the key iterators failed: close those already opened. Each close is
+            // attempted independently so that a failing close() neither leaks the remaining
+            // iterators nor replaces the original failure, which is what gets rethrown.
+            for (KeyIterator opened : iters)
+            {
+                try
+                {
+                    opened.close();
+                }
+                catch (RuntimeException suppressed)
+                {
+                    ex.addSuppressed(suppressed);
+                }
+            }
+            throw Throwables.cleaned(ex);
+        }
     }
 
     private void maybeInit()
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index 0471be3238cf..ba4b323becaf 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -33,7 +33,7 @@
 
 import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
@@ -79,6 +79,8 @@ public abstract class SSTable
 
     protected final DiskOptimizationStrategy optimizationStrategy;
     protected final TableMetadataRef metadata;
+    private static final int SAMPLES_CAP = 10000; // estimateRowsFromIndex samples at most this many index entries
+    private static final int BYTES_CAP = 10000000; // ...and stops sampling once the index position reaches this many bytes
 
     protected SSTable(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata, DiskOptimizationStrategy optimizationStrategy)
     {
@@ -265,21 +267,24 @@ public static Set<Component> discoverComponentsFor(Descriptor desc)
     }
 
     /** @return An estimate of the number of keys contained in the given index file. */
-    public static long estimateRowsFromIndex(RandomAccessReader ifile, Descriptor descriptor) throws IOException
+    public static long estimateRowsFromIndex(PartitionIndexIterator iterator) throws IOException
     {
         // collect sizes for the first 10000 keys, or first 10 megabytes of data
-        final int SAMPLES_CAP = 10000, BYTES_CAP = (int)Math.min(10000000, ifile.length());
-        int keys = 0;
-        while (ifile.getFilePointer() < BYTES_CAP && keys < SAMPLES_CAP)
+        try
+        {
+            int keys = 0;
+            while (!iterator.isExhausted() && iterator.indexPosition() < BYTES_CAP && keys < SAMPLES_CAP)
+            {
+                iterator.advance();
+                keys++;
+            }
+            assert keys > 0 && iterator.indexPosition() > 0 && iterator.indexLength() > 0 : "Unexpected empty index file";
+            return iterator.indexLength() / (iterator.indexPosition() / keys);
+        }
+        finally
         {
-            ByteBufferUtil.skipShortLength(ifile);
-            RowIndexEntry.Serializer.skip(ifile, descriptor.version);
-            keys++;
+            iterator.reset();
         }
-        assert keys > 0 && ifile.getFilePointer() > 0 && ifile.length() > 0 : "Unexpected empty index file: " + ifile;
-        long estimatedRows = ifile.length() / (ifile.getFilePointer() / keys);
-        ifile.seek(0);
-        return estimatedRows;
     }
 
     public long bytesOnDisk()
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
index 76e12c891ada..cf19083c9f55 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
@@ -19,6 +19,7 @@
 
 import java.io.*;
 
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.*;
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
index 92548b26aea4..1b6336dfbef2 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
@@ -27,7 +27,8 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.lifecycle.ILifecycleTransaction;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -69,7 +70,7 @@ public class SSTableRewriter extends Transactional.AbstractTransactional impleme
     private final boolean eagerWriterMetaRelease; // true if the writer metadata should be released when switch is called
 
     private SSTableWriter writer;
-    private Map<DecoratedKey, RowIndexEntry> cachedKeys = new HashMap<>();
+    private Map<DecoratedKey, BigTableRowIndexEntry> cachedKeys = new HashMap<>();
 
     // for testing (TODO: remove when have byteman setup)
     private boolean throwEarly, throwLate;
@@ -117,12 +118,12 @@ public SSTableWriter currentWriter()
         return writer;
     }
 
-    public RowIndexEntry append(UnfilteredRowIterator partition)
+    public BigTableRowIndexEntry append(UnfilteredRowIterator partition)
     {
         // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
         DecoratedKey key = partition.partitionKey();
         maybeReopenEarly(key);
-        RowIndexEntry index = writer.append(partition);
+        BigTableRowIndexEntry index = writer.append(partition);
         if (DatabaseDescriptor.shouldMigrateKeycacheOnCompaction())
         {
             if (!transaction.isOffline() && index != null)
@@ -141,7 +142,7 @@ public RowIndexEntry append(UnfilteredRowIterator partition)
     }
 
     // attempts to append the row, if fails resets the writer position
-    public RowIndexEntry tryAppend(UnfilteredRowIterator partition)
+    public BigTableRowIndexEntry tryAppend(UnfilteredRowIterator partition)
     {
         writer.mark();
         try
@@ -163,7 +164,7 @@ private void maybeReopenEarly(DecoratedKey key)
             {
                 for (SSTableReader reader : transaction.originals())
                 {
-                    RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE);
+                    RowIndexEntry<?> index = reader.getPosition(key, SSTableReader.Operator.GE);
                     NativeLibrary.trySkipCache(reader.getFilename(), 0, index == null ? 0 : index.position);
                 }
             }
@@ -223,7 +224,7 @@ private void moveStarts(SSTableReader newReader, DecoratedKey lowerbound)
         if (!cachedKeys.isEmpty())
         {
             invalidateKeys = new ArrayList<>(cachedKeys.size());
-            for (Map.Entry<DecoratedKey, RowIndexEntry> cacheKey : cachedKeys.entrySet())
+            for (Map.Entry<DecoratedKey, BigTableRowIndexEntry> cacheKey : cachedKeys.entrySet())
             {
                 invalidateKeys.add(cacheKey.getKey());
                 newReader.cacheKey(cacheKey.getKey(), cacheKey.getValue());
diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
index a84f07e94971..d38d03292b9a 100644
--- a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
@@ -21,7 +21,7 @@
 import java.util.Collections;
 import java.util.UUID;
 
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -45,7 +45,7 @@ protected SimpleSSTableMultiWriter(SSTableWriter writer, LifecycleNewTracker lif
 
     public boolean append(UnfilteredRowIterator partition)
     {
-        RowIndexEntry<?> indexEntry = writer.append(partition);
+        BigTableRowIndexEntry indexEntry = writer.append(partition);
         return indexEntry != null;
     }
 
diff --git a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
similarity index 54%
rename from src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
rename to src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
index fee45c232fe2..2a1e67572426 100644
--- a/src/java/org/apache/cassandra/db/columniterator/AbstractSSTableIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
@@ -15,10 +15,10 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.io.sstable.format;
 
+import java.io.Closeable;
 import java.io.IOException;
-import java.util.Comparator;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 
@@ -26,15 +26,12 @@
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.*;
-import org.apache.cassandra.io.sstable.IndexInfo;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.DataPosition;
 import org.apache.cassandra.io.util.FileHandle;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public abstract class AbstractSSTableIterator implements UnfilteredRowIterator
+public abstract class AbstractSSTableIterator<E extends RowIndexEntry<?>> implements UnfilteredRowIterator
 {
     protected final SSTableReader sstable;
     // We could use sstable.metadata(), but that can change during execution so it's good hygiene to grab an immutable instance
@@ -59,7 +56,7 @@ public abstract class AbstractSSTableIterator implements UnfilteredRowIterator
     protected AbstractSSTableIterator(SSTableReader sstable,
                                       FileDataInput file,
                                       DecoratedKey key,
-                                      RowIndexEntry indexEntry,
+                                      E indexEntry,
                                       Slices slices,
                                       ColumnFilter columnFilter,
                                       FileHandle ifile)
@@ -176,9 +173,9 @@ private static Row readStaticRow(SSTableReader sstable,
         }
     }
 
-    protected abstract Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile);
+    protected abstract Reader createReaderInternal(E indexEntry, FileDataInput file, boolean shouldCloseFile);
 
-    private Reader createReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    private Reader createReader(E indexEntry, FileDataInput file, boolean shouldCloseFile)
     {
         return slices.isEmpty() ? new NoRowsReader(file, shouldCloseFile)
                                 : createReaderInternal(indexEntry, file, shouldCloseFile);
@@ -255,7 +252,7 @@ private void slice(Slice slice)
                 e.addSuppressed(suppressed);
             }
             sstable.markSuspect();
-            throw new CorruptSSTableException(e, reader.file.getPath());
+            throw new CorruptSSTableException(e, reader.toString());
         }
     }
 
@@ -286,53 +283,19 @@ public void close()
         catch (IOException e)
         {
             sstable.markSuspect();
-            throw new CorruptSSTableException(e, reader.file.getPath());
+            throw new CorruptSSTableException(e, reader.toString());
         }
     }
 
-    protected abstract class Reader implements Iterator<Unfiltered>
+    protected abstract class Reader implements Iterator<Unfiltered>, Closeable
     {
-        private final boolean shouldCloseFile;
         public FileDataInput file;
-
-        protected UnfilteredDeserializer deserializer;
-
-        // Records the currently open range tombstone (if any)
-        protected DeletionTime openMarker = null;
+        protected final boolean shouldCloseFile;
 
         protected Reader(FileDataInput file, boolean shouldCloseFile)
         {
             this.file = file;
             this.shouldCloseFile = shouldCloseFile;
-
-            if (file != null)
-                createDeserializer();
-        }
-
-        private void createDeserializer()
-        {
-            assert file != null && deserializer == null;
-            deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper);
-        }
-
-        protected void seekToPosition(long position) throws IOException
-        {
-            // This may be the first time we're actually looking into the file
-            if (file == null)
-            {
-                file = sstable.getFileDataInput(position);
-                createDeserializer();
-            }
-            else
-            {
-                file.seek(position);
-            }
-        }
-
-        protected void updateOpenMarker(RangeTombstoneMarker marker)
-        {
-            // Note that we always read index blocks in forward order so this method is always called in forward order
-            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
         }
 
         public boolean hasNext()
@@ -352,7 +315,7 @@ public boolean hasNext()
                     e.addSuppressed(suppressed);
                 }
                 sstable.markSuspect();
-                throw new CorruptSSTableException(e, reader.file.getPath());
+                throw new CorruptSSTableException(e, toString());
             }
         }
 
@@ -373,7 +336,7 @@ public Unfiltered next()
                     e.addSuppressed(suppressed);
                 }
                 sstable.markSuspect();
-                throw new CorruptSSTableException(e, reader.file.getPath());
+                throw new CorruptSSTableException(e, toString());
             }
         }
 
@@ -381,26 +344,34 @@ public Unfiltered next()
         public abstract void setForSlice(Slice slice) throws IOException;
 
         protected abstract boolean hasNextInternal() throws IOException;
+
         protected abstract Unfiltered nextInternal() throws IOException;
 
+        @Override
         public void close() throws IOException
         {
             if (shouldCloseFile && file != null)
                 file.close();
         }
+
+        @Override
+        public String toString()
+        {
+            return file != null ? file.getPath() : "null";
+        }
     }
 
     // Reader for when we have Slices.NONE but need to read static row or partition level deletion
-    private class NoRowsReader extends AbstractSSTableIterator.Reader
+    private class NoRowsReader extends Reader
     {
-        private NoRowsReader(FileDataInput file, boolean shouldCloseFile)
+        public NoRowsReader(FileDataInput file, boolean shouldCloseFile)
         {
             super(file, shouldCloseFile);
         }
 
         public void setForSlice(Slice slice) throws IOException
         {
-            return;
+            // no-op
         }
 
         protected boolean hasNextInternal() throws IOException
@@ -413,199 +384,4 @@ protected Unfiltered nextInternal() throws IOException
             throw new NoSuchElementException();
         }
     }
-
-    // Used by indexed readers to store where they are of the index.
-    public static class IndexState implements AutoCloseable
-    {
-        private final Reader reader;
-        private final ClusteringComparator comparator;
-
-        private final RowIndexEntry indexEntry;
-        private final RowIndexEntry.IndexInfoRetriever indexInfoRetriever;
-        private final boolean reversed;
-
-        private int currentIndexIdx;
-
-        // Marks the beginning of the block corresponding to currentIndexIdx.
-        private DataPosition mark;
-
-        public IndexState(Reader reader, ClusteringComparator comparator, RowIndexEntry indexEntry, boolean reversed, FileHandle indexFile)
-        {
-            this.reader = reader;
-            this.comparator = comparator;
-            this.indexEntry = indexEntry;
-            this.indexInfoRetriever = indexEntry.openWithIndex(indexFile);
-            this.reversed = reversed;
-            this.currentIndexIdx = reversed ? indexEntry.columnsIndexCount() : -1;
-        }
-
-        public boolean isDone()
-        {
-            return reversed ? currentIndexIdx < 0 : currentIndexIdx >= indexEntry.columnsIndexCount();
-        }
-
-        // Sets the reader to the beginning of blockIdx.
-        public void setToBlock(int blockIdx) throws IOException
-        {
-            if (blockIdx >= 0 && blockIdx < indexEntry.columnsIndexCount())
-            {
-                reader.seekToPosition(columnOffset(blockIdx));
-                mark = reader.file.mark();
-                reader.deserializer.clearState();
-            }
-
-            currentIndexIdx = blockIdx;
-            reader.openMarker = blockIdx > 0 ? index(blockIdx - 1).endOpenMarker : null;
-        }
-
-        private long columnOffset(int i) throws IOException
-        {
-            return indexEntry.position + index(i).offset;
-        }
-
-        public int blocksCount()
-        {
-            return indexEntry.columnsIndexCount();
-        }
-
-        // Update the block idx based on the current reader position if we're past the current block.
-        // This only makes sense for forward iteration (for reverse ones, when we reach the end of a block we
-        // should seek to the previous one, not update the index state and continue).
-        public void updateBlock() throws IOException
-        {
-            assert !reversed;
-
-            // If we get here with currentBlockIdx < 0, it means setToBlock() has never been called, so it means
-            // we're about to read from the beginning of the partition, but haven't "prepared" the IndexState yet.
-            // Do so by setting us on the first block.
-            if (currentIndexIdx < 0)
-            {
-                setToBlock(0);
-                return;
-            }
-
-            while (currentIndexIdx + 1 < indexEntry.columnsIndexCount() && isPastCurrentBlock())
-            {
-                reader.openMarker = currentIndex().endOpenMarker;
-                ++currentIndexIdx;
-
-                // We have to set the mark, and we have to set it at the beginning of the block. So if we're not at the beginning of the block, this forces us to a weird seek dance.
-                // This can only happen when reading old file however.
-                long startOfBlock = columnOffset(currentIndexIdx);
-                long currentFilePointer = reader.file.getFilePointer();
-                if (startOfBlock == currentFilePointer)
-                {
-                    mark = reader.file.mark();
-                }
-                else
-                {
-                    reader.seekToPosition(startOfBlock);
-                    mark = reader.file.mark();
-                    reader.seekToPosition(currentFilePointer);
-                }
-            }
-        }
-
-        // Check if we've crossed an index boundary (based on the mark on the beginning of the index block).
-        public boolean isPastCurrentBlock() throws IOException
-        {
-            assert reader.deserializer != null;
-            return reader.file.bytesPastMark(mark) >= currentIndex().width;
-        }
-
-        public int currentBlockIdx()
-        {
-            return currentIndexIdx;
-        }
-
-        public IndexInfo currentIndex() throws IOException
-        {
-            return index(currentIndexIdx);
-        }
-
-        public IndexInfo index(int i) throws IOException
-        {
-            return indexInfoRetriever.columnsIndex(i);
-        }
-
-        // Finds the index of the first block containing the provided bound, starting at the provided index.
-        // Will be -1 if the bound is before any block, and blocksCount() if it is after every block.
-        public int findBlockIndex(ClusteringBound<?> bound, int fromIdx) throws IOException
-        {
-            if (bound.isBottom())
-                return -1;
-            if (bound.isTop())
-                return blocksCount();
-
-            return indexFor(bound, fromIdx);
-        }
-
-        public int indexFor(ClusteringPrefix<?> name, int lastIndex) throws IOException
-        {
-            IndexInfo target = new IndexInfo(name, name, 0, 0, null);
-            /*
-            Take the example from the unit test, and say your index looks like this:
-            [0..5][10..15][20..25]
-            and you look for the slice [13..17].
-
-            When doing forward slice, we are doing a binary search comparing 13 (the start of the query)
-            to the lastName part of the index slot. You'll end up with the "first" slot, going from left to right,
-            that may contain the start.
-
-            When doing a reverse slice, we do the same thing, only using as a start column the end of the query,
-            i.e. 17 in this example, compared to the firstName part of the index slots.  bsearch will give us the
-            first slot where firstName > start ([20..25] here), so we subtract an extra one to get the slot just before.
-            */
-            int startIdx = 0;
-            int endIdx = indexEntry.columnsIndexCount() - 1;
-
-            if (reversed)
-            {
-                if (lastIndex < endIdx)
-                {
-                    endIdx = lastIndex;
-                }
-            }
-            else
-            {
-                if (lastIndex > 0)
-                {
-                    startIdx = lastIndex;
-                }
-            }
-
-            int index = binarySearch(target, comparator.indexComparator(reversed), startIdx, endIdx);
-            return (index < 0 ? -index - (reversed ? 2 : 1) : index);
-        }
-
-        private int binarySearch(IndexInfo key, Comparator<IndexInfo> c, int low, int high) throws IOException
-        {
-            while (low <= high)
-            {
-                int mid = (low + high) >>> 1;
-                IndexInfo midVal = index(mid);
-                int cmp = c.compare(midVal, key);
-
-                if (cmp < 0)
-                    low = mid + 1;
-                else if (cmp > 0)
-                    high = mid - 1;
-                else
-                    return mid;
-            }
-            return -(low + 1);
-        }
-
-        @Override
-        public String toString()
-        {
-            return String.format("IndexState(indexSize=%d, currentBlock=%d, reversed=%b)", indexEntry.columnsIndexCount(), currentIndexIdx, reversed);
-        }
-
-        @Override
-        public void close() throws IOException
-        {
-            indexInfoRetriever.close();
-        }
-    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
new file mode 100644
index 000000000000..616db43aec2f
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Iterator over the partitions of an sstable.
+ * <p>
+ * The index iterator starts with a key/position ready. advance() should be used to move to the next key; iteration
+ * completes when key() == null.
+ */
+public interface PartitionIndexIterator extends Closeable
+{
+    /**
+     * Current key
+     */
+    public ByteBuffer key();
+
+    /**
+     * Position in the data file where the associated content resides
+     */
+    public long dataPosition();
+
+    /**
+     * Moves the iterator forward. Returns false if we reach EOF and there is nothing more to read
+     */
+    public boolean advance() throws IOException;
+
+    /**
+     * Closes the iterator quietly
+     */
+    public void close();
+
+    /**
+     * Returns true if we reach EOF
+     */
+    boolean isExhausted();
+
+    /**
+     * Returns the current position in the index file (which, along with {@link #indexLength()},
+     * can be used to track iteration progress)
+     */
+    long indexPosition();
+
+    /**
+     * Sets the current position in index file
+     */
+    void indexPosition(long position) throws IOException;
+
+    /**
+     * Returns length of the index file
+     */
+    long indexLength();
+
+    /**
+     * Resets the iterator to the initial position
+     */
+    void reset() throws IOException;
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
new file mode 100644
index 000000000000..c00a37ae39bf
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable.format;
+
+import org.apache.cassandra.cache.IMeasurableMemory;
+import org.apache.cassandra.db.DeletionTime;
+
+public abstract class RowIndexEntry<T> implements IMeasurableMemory
+{
+    public final long position;
+
+    public RowIndexEntry(long position)
+    {
+        this.position = position;
+    }
+
+    /**
+     * @return true if this index entry contains the row-level tombstone and column summary.  Otherwise,
+     * caller should fetch these from the row header.
+     */
+    public boolean isIndexed()
+    {
+        return columnsIndexCount() > 1;
+    }
+
+    public abstract DeletionTime deletionTime();
+
+    public int columnsIndexCount()
+    {
+        return 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
index 14f660258fbe..2ecef6025ac3 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
@@ -19,9 +19,6 @@
 
 import com.google.common.base.CharMatcher;
 
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.db.RowIndexEntry;
-import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
 
 /**
@@ -31,6 +28,7 @@ public interface SSTableFormat
 {
     static boolean enableSSTableDevelopmentTestMode = Boolean.getBoolean("cassandra.test.sstableformatdevelopment");
 
+    Type getType();
 
     Version getLatestVersion();
     Version getVersion(String version);
@@ -38,9 +36,7 @@ public interface SSTableFormat
     SSTableWriter.Factory getWriterFactory();
     SSTableReader.Factory getReaderFactory();
 
-    RowIndexEntry.IndexSerializer<?> getIndexSerializer(TableMetadata metadata, Version version, SerializationHeader header);
-
-    public static enum Type
+    public enum Type
     {
         //The original sstable format
         BIG("big", BigFormat.instance);
@@ -53,7 +49,7 @@ public static Type current()
             return BIG;
         }
 
-        private Type(String name, SSTableFormat info)
+        Type(String name, SSTableFormat info)
         {
             //Since format comes right after generation
             //we disallow formats with numeric names
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index 258b004871bf..7aa38482169b 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -24,6 +24,7 @@
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Supplier;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
@@ -58,6 +59,7 @@
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.CompressionMetadata;
 import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.metadata.*;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.metrics.RestorableMeter;
@@ -65,6 +67,7 @@
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
@@ -205,9 +208,7 @@ public enum OpenReason
     protected final IFilter bf;
     public final IndexSummary indexSummary;
 
-    protected final RowIndexEntry.IndexSerializer<?> rowIndexEntrySerializer;
-
-    protected InstrumentingCache<KeyCacheKey, RowIndexEntry> keyCache;
+    protected InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache;
 
     protected final BloomFilterTracker bloomFilterTracker = new BloomFilterTracker();
 
@@ -659,7 +660,6 @@ protected SSTableReader(final Descriptor desc,
         this.bf = bf;
         this.maxDataAge = maxDataAge;
         this.openReason = openReason;
-        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata.get(), desc.version, header);
         tidy = new InstanceTidier(descriptor, metadata.id);
         selfRef = new Ref<>(this, tidy);
     }
@@ -681,6 +681,8 @@ public static long getTotalUncompressedBytes(Iterable<SSTableReader> sstables)
         return sum;
     }
 
+    public abstract PartitionIndexIterator allKeysIterator() throws IOException;
+
     public boolean equals(Object that)
     {
         return that instanceof SSTableReader && ((SSTableReader) that).descriptor.equals(this.descriptor);
@@ -701,7 +703,7 @@ public void setupOnline()
         // under normal operation we can do this at any time, but SSTR is also used outside C* proper,
         // e.g. by BulkLoader, which does not initialize the cache.  As a kludge, we set up the cache
         // here when we know we're being wired into the rest of the server infrastructure.
-        InstrumentingCache<KeyCacheKey, RowIndexEntry> maybeKeyCache = CacheService.instance.keyCache;
+        InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> maybeKeyCache = CacheService.instance.keyCache;
         if (maybeKeyCache.getCapacity() > 0)
             keyCache = maybeKeyCache;
 
@@ -995,25 +997,13 @@ else if (samplingLevel < indexSummary.getSamplingLevel())
     private IndexSummary buildSummaryAtLevel(int newSamplingLevel) throws IOException
     {
         // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
-        RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)));
-        try
+        try (KeyIterator iterator = KeyIterator.forSSTable(this);
+             IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata().params.minIndexInterval, newSamplingLevel))
         {
-            long indexSize = primaryIndex.length();
-            try (IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata().params.minIndexInterval, newSamplingLevel))
-            {
-                long indexPosition;
-                while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
-                {
-                    summaryBuilder.maybeAddEntry(decorateKey(ByteBufferUtil.readWithShortLength(primaryIndex)), indexPosition);
-                    RowIndexEntry.Serializer.skip(primaryIndex, descriptor.version);
-                }
+            while (iterator.hasNext())
+                summaryBuilder.maybeAddEntry(iterator.next(), iterator.getKeyPosition());
 
-                return summaryBuilder.build(getPartitioner());
-            }
-        }
-        finally
-        {
-            FileUtils.closeQuietly(primaryIndex);
+            return summaryBuilder.build(getPartitioner());
         }
     }
 
@@ -1314,7 +1304,7 @@ public KeyCacheKey getCacheKey(DecoratedKey key)
         return new KeyCacheKey(metadata(), descriptor, key.getKey());
     }
 
-    public void cacheKey(DecoratedKey key, RowIndexEntry info)
+    public void cacheKey(DecoratedKey key, BigTableRowIndexEntry info)
     {
         CachingParams caching = metadata().params.caching;
 
@@ -1326,20 +1316,20 @@ public void cacheKey(DecoratedKey key, RowIndexEntry info)
         keyCache.put(cacheKey, info);
     }
 
-    public RowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
+    public BigTableRowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
     {
         if (isKeyCacheEnabled())
             return getCachedPosition(new KeyCacheKey(metadata(), descriptor, key.getKey()), updateStats);
         return null;
     }
 
-    protected RowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
+    protected BigTableRowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
     {
         if (isKeyCacheEnabled())
         {
             if (updateStats)
             {
-                RowIndexEntry cachedEntry = keyCache.get(unifiedKey);
+                BigTableRowIndexEntry cachedEntry = keyCache.get(unifiedKey);
                 keyCacheRequest.incrementAndGet();
                 if (cachedEntry != null)
                 {
@@ -1367,28 +1357,16 @@ public boolean isKeyCacheEnabled()
      * allow key selection by token bounds but only if op != * EQ
      * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
      */
-    public final RowIndexEntry getPosition(PartitionPosition key, Operator op)
+    public final RowIndexEntry<?> getPosition(PartitionPosition key, Operator op)
     {
-        return getPosition(key, op, SSTableReadsListener.NOOP_LISTENER);
+        return getPosition(key, op, true, false, SSTableReadsListener.NOOP_LISTENER);
     }
 
-    /**
-     * Retrieves the position while updating the key cache and the stats.
-     * @param key The key to apply as the rhs to the given Operator. A 'fake' key is allowed to
-     * allow key selection by token bounds but only if op != * EQ
-     * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
-     * @param listener the {@code SSTableReaderListener} that must handle the notifications.
-     */
-    public final RowIndexEntry getPosition(PartitionPosition key, Operator op, SSTableReadsListener listener)
+    public final boolean checkEntryExists(PartitionPosition key,
+                                          Operator op,
+                                          boolean updateCacheAndStats)
     {
-        return getPosition(key, op, true, false, listener);
-    }
-
-    public final RowIndexEntry getPosition(PartitionPosition key,
-                                           Operator op,
-                                           boolean updateCacheAndStats)
-    {
-        return getPosition(key, op, updateCacheAndStats, false, SSTableReadsListener.NOOP_LISTENER);
+        return getPosition(key, op, updateCacheAndStats, false, SSTableReadsListener.NOOP_LISTENER) != null;
     }
 
     /**
@@ -1399,11 +1377,11 @@ public final RowIndexEntry getPosition(PartitionPosition key,
      * @param listener a listener used to handle internal events
      * @return The index entry corresponding to the key, or null if the key is not present
      */
-    protected abstract RowIndexEntry getPosition(PartitionPosition key,
-                                                 Operator op,
-                                                 boolean updateCacheAndStats,
-                                                 boolean permitMatchPastLast,
-                                                 SSTableReadsListener listener);
+    protected abstract RowIndexEntry<?> getPosition(PartitionPosition key,
+                                                    Operator op,
+                                                    boolean updateCacheAndStats,
+                                                    boolean permitMatchPastLast,
+                                                    SSTableReadsListener listener);
 
     public abstract UnfilteredRowIterator iterator(DecoratedKey key,
                                                    Slices slices,
@@ -1411,9 +1389,7 @@ public abstract UnfilteredRowIterator iterator(DecoratedKey key,
                                                    boolean reversed,
                                                    SSTableReadsListener listener);
 
-    public abstract UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed);
-
-    public abstract UnfilteredRowIterator simpleIterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, boolean tombstoneOnly);
+    public abstract UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly);
 
     /**
      * Finds and returns the first key beyond a given token in this SSTable or null if no such key exists.
@@ -1428,24 +1404,22 @@ public DecoratedKey firstKeyBeyond(PartitionPosition token)
         if (ifile == null)
             return null;
 
-        String path = null;
-        try (FileDataInput in = ifile.createReader(sampledPosition))
+        try (PartitionIndexIterator iterator = allKeysIterator())
         {
-            path = in.getPath();
-            while (!in.isEOF())
+            iterator.indexPosition(sampledPosition);
+            KeyIterator keyIterator = new KeyIterator(iterator, getPartitioner());
+
+            while (keyIterator.hasNext())
             {
-                ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
-                DecoratedKey indexDecoratedKey = decorateKey(indexKey);
+                DecoratedKey indexDecoratedKey = keyIterator.next();
                 if (indexDecoratedKey.compareTo(token) > 0)
                     return indexDecoratedKey;
-
-                RowIndexEntry.Serializer.skip(in, descriptor.version);
             }
         }
         catch (IOException e)
         {
             markSuspect();
-            throw new CorruptSSTableException(e, path);
+            throw new CorruptSSTableException(e, ifile.path());
         }
 
         return null;
@@ -1618,25 +1592,7 @@ public boolean isRepaired()
         return sstableMetadata.repairedAt != ActiveRepairService.UNREPAIRED_SSTABLE;
     }
 
-    public DecoratedKey keyAt(long indexPosition) throws IOException
-    {
-        DecoratedKey key;
-        try (FileDataInput in = ifile.createReader(indexPosition))
-        {
-            if (in.isEOF())
-                return null;
-
-            key = decorateKey(ByteBufferUtil.readWithShortLength(in));
-
-            // hint read path about key location if caching is enabled
-            // this saves index summary lookup and index file iteration which whould be pretty costly
-            // especially in presence of promoted column indexes
-            if (isKeyCacheEnabled())
-                cacheKey(key, rowIndexEntrySerializer.deserialize(in));
-        }
-
-        return key;
-    }
+    public abstract DecoratedKey keyAt(long indexPosition) throws IOException;
 
     public boolean isPendingRepair()
     {
@@ -1725,7 +1681,7 @@ public long getRecentBloomFilterTrueNegativeCount()
         return bloomFilterTracker.getRecentTrueNegativeCount();
     }
 
-    public InstrumentingCache<KeyCacheKey, RowIndexEntry> getKeyCache()
+    public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
     {
         return keyCache;
     }
@@ -1954,7 +1910,7 @@ public Ref<SSTableReader> ref()
         return selfRef.ref();
     }
 
-    void setup(boolean trackHotness)
+    protected void setup(boolean trackHotness)
     {
         tidy.setup(this, trackHotness);
         this.readMeter = tidy.global.readMeter;
@@ -2200,9 +2156,11 @@ public static void resetTidying()
         GlobalTidy.lookup.clear();
     }
 
-    public static abstract class Factory
+    public interface Factory
     {
-        public abstract SSTableReader open(SSTableReaderBuilder builder);
+        SSTableReader open(SSTableReaderBuilder builder);
+
+        PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata);
     }
 
     public static class PartitionPositionBounds
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
index e5abcf834e48..24edf70d1c24 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
@@ -22,7 +22,6 @@
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -42,7 +41,6 @@
 import java.io.DataInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -93,6 +91,20 @@ public SSTableReaderBuilder(Descriptor descriptor,
 
     public abstract SSTableReader build();
 
+    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor)
+    {
+        return new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
+                .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
+                .withChunkCache(ChunkCache.instance);
+    }
+
+    public static FileHandle.Builder defaultDataHandleBuilder(Descriptor descriptor)
+    {
+        return new FileHandle.Builder(descriptor.filenameFor(Component.DATA))
+                .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap)
+                .withChunkCache(ChunkCache.instance);
+    }
+
     /**
      * Load index summary, first key and last key from Summary.db file if it exists.
      *
@@ -151,47 +163,39 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter,
         if (!components.contains(Component.PRIMARY_INDEX))
             return;
 
+        if (!recreateBloomFilter && summaryLoaded)
+            return;
+
         if (logger.isDebugEnabled())
             logger.debug("Attempting to build summary for {}", descriptor);
 
-
-        // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
-        try (RandomAccessReader primaryIndex = RandomAccessReader.open(new File(descriptor.filenameFor(Component.PRIMARY_INDEX))))
-        {
-            long indexSize = primaryIndex.length();
+        try (PartitionIndexIterator indexIterator = readerFactory.indexIterator(descriptor, metadata)) {
             long histogramCount = statsMetadata.estimatedPartitionSize.count();
             long estimatedKeys = histogramCount > 0 && !statsMetadata.estimatedPartitionSize.isOverflowed()
                                  ? histogramCount
-                                 : SSTable.estimateRowsFromIndex(primaryIndex, descriptor); // statistics is supposed to be optional
-
+                                 : SSTable.estimateRowsFromIndex(indexIterator); // statistics is supposed to be optional
             if (recreateBloomFilter)
                 bf = FilterFactory.getFilter(estimatedKeys, metadata.params.bloomFilterFpChance);
 
-            try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL))
+            // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
+            try (KeyIterator keyIterator = new KeyIterator(indexIterator, metadata.partitioner);
+                 IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL))
             {
-                long indexPosition;
-
-                while ((indexPosition = primaryIndex.getFilePointer()) != indexSize)
+                while (keyIterator.hasNext())
                 {
-                    ByteBuffer key = ByteBufferUtil.readWithShortLength(primaryIndex);
-                    RowIndexEntry.Serializer.skip(primaryIndex, descriptor.version);
-                    DecoratedKey decoratedKey = metadata.partitioner.decorateKey(key);
+                    DecoratedKey decoratedKey = keyIterator.next();
 
                     if (!summaryLoaded)
                     {
                         if (first == null)
                             first = decoratedKey;
                         last = decoratedKey;
+
+                        summaryBuilder.maybeAddEntry(decoratedKey, keyIterator.getKeyPosition());
                     }
 
                     if (recreateBloomFilter)
                         bf.add(decoratedKey);
-
-                    // if summary was already read from disk we don't want to re-populate it using primary index
-                    if (!summaryLoaded)
-                    {
-                        summaryBuilder.maybeAddEntry(decoratedKey, indexPosition);
-                    }
                 }
 
                 if (!summaryLoaded)
@@ -300,12 +304,8 @@ public SSTableReader build()
             initSummary(dataFilePath, components, statsMetadata);
 
             boolean compression = components.contains(Component.COMPRESSION_INFO);
-            try (FileHandle.Builder ibuilder = new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
-                    .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
-                    .withChunkCache(ChunkCache.instance);
-                    FileHandle.Builder dbuilder = new FileHandle.Builder(descriptor.filenameFor(Component.DATA)).compressed(compression)
-                                                                                                                .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap)
-                                                                                                                .withChunkCache(ChunkCache.instance))
+            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor);
+                 FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression))
             {
                 long indexFileLength = new File(descriptor.filenameFor(Component.PRIMARY_INDEX)).length();
                 DiskOptimizationStrategy optimizationStrategy = DatabaseDescriptor.getDiskOptimizationStrategy();
@@ -443,12 +443,9 @@ void load(boolean recreateBloomFilter,
                   StatsMetadata statsMetadata,
                   Set<Component> components) throws IOException
         {
-            try(FileHandle.Builder ibuilder = new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
-                    .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
-                    .withChunkCache(ChunkCache.instance);
-                    FileHandle.Builder dbuilder = new FileHandle.Builder(descriptor.filenameFor(Component.DATA)).compressed(components.contains(Component.COMPRESSION_INFO))
-                                                                                                                .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap)
-                                                                                                                .withChunkCache(ChunkCache.instance))
+            boolean compression = components.contains(Component.COMPRESSION_INFO);
+            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor);
+                 FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression))
             {
                 loadSummary();
                 boolean buildSummary = summary == null || recreateBloomFilter;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
index 6d384bfb7233..0b34fa4b8d32 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.io.sstable.format;
 
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 
 /**
  * Listener for receiving notifications associated with reading SSTables.
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 43c50c56141a..52667bfe78ed 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -25,7 +25,6 @@
 import com.google.common.collect.Sets;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
@@ -35,6 +34,7 @@
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -59,7 +59,6 @@ public abstract class SSTableWriter extends SSTable implements Transactional
     protected long maxDataAge = -1;
     protected final long keyCount;
     protected final MetadataCollector metadataCollector;
-    protected final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
     protected final SerializationHeader header;
     protected final TransactionalProxy txnProxy = txnProxy();
     protected final Collection<SSTableFlushObserver> observers;
@@ -91,7 +90,6 @@ protected SSTableWriter(Descriptor descriptor,
         this.isTransient = isTransient;
         this.metadataCollector = metadataCollector;
         this.header = header;
-        this.rowIndexEntrySerializer = descriptor.version.getSSTableFormat().getIndexSerializer(metadata.get(), descriptor.version, header);
         this.observers = observers == null ? Collections.emptySet() : observers;
     }
 
@@ -209,7 +207,7 @@ private static Collection<SSTableFlushObserver> observers(Descriptor descriptor,
      *
      * @throws FSWriteError if a write to the dataFile fails
      */
-    public abstract RowIndexEntry append(UnfilteredRowIterator iterator);
+    public abstract BigTableRowIndexEntry append(UnfilteredRowIterator iterator);
 
     public abstract long getFilePointer();
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
new file mode 100644
index 000000000000..4ff6a727c12b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable.format.big;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.UnfilteredDeserializer;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+
+/** Specialization of {@link AbstractSSTableIterator} whose index entries are {@link BigTableRowIndexEntry} instances. */
+public abstract class AbstractBigTableIterator extends AbstractSSTableIterator<BigTableRowIndexEntry>
+{
+    protected AbstractBigTableIterator(SSTableReader sstable, FileDataInput file, DecoratedKey key,
+                                       BigTableRowIndexEntry indexEntry, Slices slices,
+                                       ColumnFilter columnFilter, FileHandle ifile)
+    {
+        super(sstable, file, key, indexEntry, slices, columnFilter, ifile);
+    }
+
+    /** Reader holding the {@link UnfilteredDeserializer} for the data file and the currently open range tombstone. */
+    protected abstract class RowReader extends Reader
+    {
+        protected UnfilteredDeserializer deserializer;
+
+        // Deletion time of the range tombstone that is currently open, or null when none is open
+        protected DeletionTime openMarker;
+
+        /** Creates the deserializer right away only when a data file is supplied; otherwise on the first seek. */
+        protected RowReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+            if (file != null)
+                createDeserializer();
+        }
+
+        /** Binds a new deserializer to the current file; asserts the file is set and no deserializer exists yet. */
+        private void createDeserializer()
+        {
+            assert file != null && deserializer == null;
+            deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper);
+        }
+
+        /** Moves to {@code position} in the data file, opening the file first if it was not opened yet. */
+        public void seekToPosition(long position) throws IOException
+        {
+            if (file != null)
+            {
+                file.seek(position);
+                return;
+            }
+            // First actual look into the data file: open it at the target position and set up the deserializer
+            file = sstable.getFileDataInput(position);
+            createDeserializer();
+        }
+
+        /** Remembers the deletion time opened by {@code marker}, or clears it when the marker opens nothing. */
+        protected void updateOpenMarker(RangeTombstoneMarker marker)
+        {
+            // Index blocks are always read in forward order, so this method is always called in forward order
+            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index ff0d7916672c..9c8b161386a9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -17,13 +17,15 @@
  */
 package org.apache.cassandra.io.sstable.format.big;
 
+import java.io.IOException;
 import java.util.Collection;
 import java.util.UUID;
 
 import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.util.FileHandle;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.io.sstable.Descriptor;
@@ -46,6 +48,12 @@ private BigFormat()
 
     }
 
+    @Override
+    public Type getType()
+    {
+        return Type.BIG;
+    }
+
     @Override
     public Version getLatestVersion()
     {
@@ -70,12 +78,6 @@ public SSTableReader.Factory getReaderFactory()
         return readerFactory;
     }
 
-    @Override
-    public RowIndexEntry.IndexSerializer getIndexSerializer(TableMetadata metadata, Version version, SerializationHeader header)
-    {
-        return new RowIndexEntry.Serializer(version, header);
-    }
-
     static class WriterFactory extends SSTableWriter.Factory
     {
         @Override
@@ -95,13 +97,30 @@ public SSTableWriter open(Descriptor descriptor,
         }
     }
 
-    static class ReaderFactory extends SSTableReader.Factory
+    static class ReaderFactory implements SSTableReader.Factory
     {
         @Override
         public SSTableReader open(SSTableReaderBuilder builder)
         {
             return new BigTableReader(builder);
         }
+
+        /** Opens the primary index of {@code descriptor} and iterates it with a serializer built from the stored header. */
+        @Override
+        public PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata)
+        {
+            // The iterator works on its own shared copy of the handle, so this one is released on exit
+            try (FileHandle indexHandle = SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor).complete())
+            {
+                SerializationHeader.Component stored = (SerializationHeader.Component) descriptor.getMetadataSerializer().deserialize(descriptor, MetadataType.HEADER);
+                SerializationHeader header = stored.toHeader(metadata);
+                return BigTablePartitionIndexIterator.create(indexHandle, new BigTableRowIndexEntry.Serializer(descriptor.version, header));
+            }
+            catch (IOException ioe)
+            {
+                throw new RuntimeException(ioe);
+            }
+        }
     }
 
     // versions are denoted as [major][minor].  Minor versions must be forward-compatible:
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
new file mode 100644
index 000000000000..846f00809cae
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.big;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry.IndexSerializer;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/** {@link PartitionIndexIterator} that walks the entries of a primary index file through a {@link RandomAccessReader}. */
+@NotThreadSafe
+public class BigTablePartitionIndexIterator implements PartitionIndexIterator
+{
+    private final FileHandle indexFile; // shared copy closed by this iterator; null when wrapping an external reader
+    private final RandomAccessReader reader;
+    private final IndexSerializer<IndexInfo> rowIndexEntrySerializer;
+    private final long initialPosition; // reader position at construction time; reset() seeks back to it
+
+    private ByteBuffer key; // key of the current entry; null once exhausted or closed
+    private long dataPosition; // data position of the current entry; -1 once exhausted or closed
+
+    private BigTablePartitionIndexIterator(FileHandle indexFile, RandomAccessReader reader,
+                                           IndexSerializer<IndexInfo> rowIndexEntrySerializer)
+    {
+        this.indexFile = indexFile;
+        this.reader = reader;
+        this.rowIndexEntrySerializer = rowIndexEntrySerializer;
+        this.initialPosition = reader.getFilePointer();
+    }
+
+    /** Wraps {@code reader} (closed together with the iterator, also on failure here) and reads the first entry. */
+    public static BigTablePartitionIndexIterator create(RandomAccessReader reader, IndexSerializer<IndexInfo> serializer) throws IOException
+    {
+        BigTablePartitionIndexIterator created = new BigTablePartitionIndexIterator(null, reader, serializer);
+        try
+        {
+            created.advance();
+        }
+        catch (IOException | RuntimeException failure)
+        {
+            created.close();
+            throw failure;
+        }
+        return created;
+    }
+
+    /** Iterates over a shared copy of {@code indexFile}, so the handle passed in is never closed by the iterator. */
+    @SuppressWarnings({ "resource" })
+    public static BigTablePartitionIndexIterator create(FileHandle indexFile, IndexSerializer<IndexInfo> serializer) throws IOException
+    {
+        FileHandle handleCopy = null;
+        RandomAccessReader indexReader = null;
+        BigTablePartitionIndexIterator created = null;
+        try
+        {
+            handleCopy = indexFile.sharedCopy();
+            indexReader = handleCopy.createReader();
+            created = new BigTablePartitionIndexIterator(handleCopy, indexReader, serializer);
+            created.advance();
+            return created;
+        }
+        catch (IOException | RuntimeException failure)
+        {
+            if (created == null)
+            {
+                FileUtils.closeQuietly(indexReader);
+                FileUtils.closeQuietly(handleCopy);
+            }
+            else
+            {
+                created.close();
+            }
+            throw failure;
+        }
+    }
+
+    /** Closes the reader and the index file copy (if any); afterwards {@link #isExhausted()} is true. */
+    @Override
+    public void close()
+    {
+        key = null;
+        dataPosition = -1;
+        FileUtils.closeQuietly(reader);
+        FileUtils.closeQuietly(indexFile);
+    }
+
+    /** Reads the next entry unless the reader is at EOF, in which case the iterator becomes exhausted. */
+    @Override
+    public boolean advance() throws IOException
+    {
+        boolean hasEntry = !reader.isEOF();
+        key = hasEntry ? ByteBufferUtil.readWithShortLength(reader) : null;
+        dataPosition = hasEntry ? rowIndexEntrySerializer.deserializePositionAndSkip(reader) : -1;
+        return hasEntry;
+    }
+
+    @Override
+    public boolean isExhausted()
+    {
+        return key == null && dataPosition < 0;
+    }
+
+    @Override
+    public ByteBuffer key()
+    {
+        return key;
+    }
+
+    @Override
+    public long dataPosition()
+    {
+        return dataPosition;
+    }
+
+    /** Current offset of the underlying reader within the index file. */
+    @Override
+    public long indexPosition()
+    {
+        return reader.getFilePointer();
+    }
+
+    /** Moves the reader to {@code position} and reads the entry found there. */
+    @Override
+    public void indexPosition(long position) throws IOException
+    {
+        if (position > indexLength())
+            throw new IndexOutOfBoundsException("The requested position exceeds the index length");
+        restartAt(position);
+    }
+
+    @Override
+    public long indexLength()
+    {
+        return reader.length();
+    }
+
+    /** Moves the reader back to where it was at construction time and reads the first entry again. */
+    @Override
+    public void reset() throws IOException
+    {
+        restartAt(initialPosition);
+    }
+
+    // Seeks the reader, clears the current entry and loads the one at the new position
+    private void restartAt(long position) throws IOException
+    {
+        reader.seek(position);
+        key = null;
+        dataPosition = 0;
+        advance();
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
index f60c9dfee473..708488873098 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
@@ -20,14 +20,14 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.function.Supplier;
 
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.columniterator.SSTableIterator;
-import org.apache.cassandra.db.columniterator.SSTableReversedIterator;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.Rows;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -52,9 +52,18 @@ public class BigTableReader extends SSTableReader
 {
     private static final Logger logger = LoggerFactory.getLogger(BigTableReader.class);
 
+    protected final BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer;
+
     BigTableReader(SSTableReaderBuilder builder)
     {
         super(builder);
+        this.rowIndexEntrySerializer = new BigTableRowIndexEntry.Serializer(descriptor.version, header);
+    }
+
+    @Override
+    public PartitionIndexIterator allKeysIterator() throws IOException
+    {
+        return BigTablePartitionIndexIterator.create(getIndexFile(), rowIndexEntrySerializer);
     }
 
     public UnfilteredRowIterator iterator(DecoratedKey key,
@@ -63,12 +72,12 @@ public UnfilteredRowIterator iterator(DecoratedKey key,
                                           boolean reversed,
                                           SSTableReadsListener listener)
     {
-        RowIndexEntry rie = getPosition(key, SSTableReader.Operator.EQ, listener);
+        BigTableRowIndexEntry rie = getPosition(key, SSTableReader.Operator.EQ, true, false, listener);
         return iterator(null, key, rie, slices, selectedColumns, reversed);
     }
 
     @SuppressWarnings("resource")
-    public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed)
+    public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, BigTableRowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed)
     {
         if (indexEntry == null)
             return UnfilteredRowIterators.noRowsIterator(metadata(), key, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE, reversed);
@@ -121,9 +130,12 @@ public ISSTableScanner getScanner(Collection<Range<Token>> ranges)
 
     @SuppressWarnings("resource") // caller to close
     @Override
-    public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, RowIndexEntry position, boolean tombstoneOnly)
+    public UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly)
     {
-        return SSTableIdentityIterator.create(this, dfile, position, key, tombstoneOnly);
+        BigTableRowIndexEntry position = getPosition(key, SSTableReader.Operator.EQ, true, false, SSTableReadsListener.NOOP_LISTENER);
+        if (position == null)
+            return null;
+        return SSTableIdentityIterator.create(this, dfile.get(), position, key, tombstoneOnly);
     }
 
     /**
@@ -133,11 +145,11 @@ public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey ke
      * @param updateCacheAndStats true if updating stats and cache
      * @return The index entry corresponding to the key, or null if the key is not present
      */
-    protected RowIndexEntry getPosition(PartitionPosition key,
-                                        Operator op,
-                                        boolean updateCacheAndStats,
-                                        boolean permitMatchPastLast,
-                                        SSTableReadsListener listener)
+    protected BigTableRowIndexEntry getPosition(PartitionPosition key,
+                                                Operator op,
+                                                boolean updateCacheAndStats,
+                                                boolean permitMatchPastLast,
+                                                SSTableReadsListener listener)
     {
         if (op == Operator.EQ)
         {
@@ -155,7 +167,7 @@ protected RowIndexEntry getPosition(PartitionPosition key,
         if ((op == Operator.EQ || op == Operator.GE) && (key instanceof DecoratedKey))
         {
             DecoratedKey decoratedKey = (DecoratedKey) key;
-            RowIndexEntry cachedPosition = getCachedPosition(decoratedKey, updateCacheAndStats);
+            BigTableRowIndexEntry cachedPosition = getCachedPosition(decoratedKey, updateCacheAndStats);
             if (cachedPosition != null)
             {
                 listener.onSSTableSelected(this, cachedPosition, SelectionReason.KEY_CACHE_HIT);
@@ -244,7 +256,7 @@ protected RowIndexEntry getPosition(PartitionPosition key,
                 if (opSatisfied)
                 {
                     // read data position from index entry
-                    RowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in);
+                    BigTableRowIndexEntry indexEntry = rowIndexEntrySerializer.deserialize(in);
                     if (exactMatch && updateCacheAndStats)
                     {
                         assert key instanceof DecoratedKey; // key can be == to the index key only if it's a true row key
@@ -271,7 +283,7 @@ protected RowIndexEntry getPosition(PartitionPosition key,
                     return indexEntry;
                 }
 
-                RowIndexEntry.Serializer.skip(in, descriptor.version);
+                BigTableRowIndexEntry.Serializer.skip(in, descriptor.version);
             }
         }
         catch (IOException e)
@@ -288,4 +300,24 @@ protected RowIndexEntry getPosition(PartitionPosition key,
     }
 
 
+    /** Reads the partition key stored at {@code indexPosition} in the index file; returns null if that position is at EOF. */
+    @Override
+    public DecoratedKey keyAt(long indexPosition) throws IOException
+    {
+        try (FileDataInput indexInput = ifile.createReader(indexPosition))
+        {
+            if (indexInput.isEOF())
+                return null;
+
+            DecoratedKey decorated = decorateKey(ByteBufferUtil.readWithShortLength(indexInput));
+            if (!isKeyCacheEnabled())
+                return decorated;
+            // Hint the read path about the key location when caching is enabled: this saves the index
+            // summary lookup and the index file iteration, which would be pretty costly, especially in
+            // the presence of promoted column indexes.
+            BigTableRowIndexEntry entry = rowIndexEntrySerializer.deserialize(indexInput);
+            cacheKey(decorated, entry);
+            return decorated;
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/RowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
similarity index 91%
rename from src/java/org/apache/cassandra/db/RowIndexEntry.java
rename to src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
index 215768bc8d3f..dc8e91ec93e3 100644
--- a/src/java/org/apache/cassandra/db/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -24,8 +24,11 @@
 import com.codahale.metrics.Histogram;
 import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.sstable.IndexInfo;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputBuffer;
@@ -103,7 +106,7 @@
  *     This results in these classes:
  * </p>
  * <ul>
- *     <li>{@link RowIndexEntry} just stores the offset in the data file.</li>
+ *     <li>{@link BigTableRowIndexEntry} just stores the offset in the data file.</li>
  *     <li>{@link IndexedEntry} is for index entries with index samples
  *     and used for both current and legacy sstables, which do not exceed
  *     {@link org.apache.cassandra.config.Config#column_index_cache_size_in_kb}.</li>
@@ -124,9 +127,9 @@
  * </p>
  *
  */
-public class RowIndexEntry<T> implements IMeasurableMemory
+public class BigTableRowIndexEntry extends RowIndexEntry<IndexInfo> implements IMeasurableMemory
 {
-    private static final long EMPTY_SIZE = ObjectSizes.measure(new RowIndexEntry(0));
+    public static final long EMPTY_SIZE = ObjectSizes.measure(new BigTableRowIndexEntry(0));
 
     // constants for type of row-index-entry as serialized for saved-cache
     static final int CACHE_NOT_INDEXED = 0;
@@ -146,20 +149,9 @@ public class RowIndexEntry<T> implements IMeasurableMemory
         indexInfoReadsHistogram = Metrics.histogram(factory.createMetricName("IndexInfoReads"), false);
     }
 
-    public final long position;
-
-    public RowIndexEntry(long position)
+    public BigTableRowIndexEntry(long position)
     {
-        this.position = position;
-    }
-
-    /**
-     * @return true if this index entry contains the row-level tombstone and column summary.  Otherwise,
-     * caller should fetch these from the row header.
-     */
-    public boolean isIndexed()
-    {
-        return columnsIndexCount() > 1;
+        super(position);
     }
 
     public boolean indexOnHeap()
@@ -167,16 +159,12 @@ public boolean indexOnHeap()
         return false;
     }
 
+    @Override
     public DeletionTime deletionTime()
     {
         throw new UnsupportedOperationException();
     }
 
-    public int columnsIndexCount()
-    {
-        return 0;
-    }
-
     public long unsharedHeapSize()
     {
         return EMPTY_SIZE;
@@ -184,20 +172,20 @@ public long unsharedHeapSize()
 
     /**
      * @param dataFilePosition  position of the partition in the {@link org.apache.cassandra.io.sstable.Component.Type#DATA} file
-     * @param indexFilePosition position in the {@link org.apache.cassandra.io.sstable.Component.Type#PRIMARY_INDEX} of the {@link RowIndexEntry}
-     * @param deletionTime      deletion time of {@link RowIndexEntry}
-     * @param headerLength      deletion time of {@link RowIndexEntry}
-     * @param columnIndexCount  number of {@link IndexInfo} entries in the {@link RowIndexEntry}
+     * @param indexFilePosition position in the {@link org.apache.cassandra.io.sstable.Component.Type#PRIMARY_INDEX} of the {@link BigTableRowIndexEntry}
+     * @param deletionTime      deletion time of {@link BigTableRowIndexEntry}
+     * @param headerLength      header length of {@link BigTableRowIndexEntry}
+     * @param columnIndexCount  number of {@link IndexInfo} entries in the {@link BigTableRowIndexEntry}
      * @param indexedPartSize   serialized size of all serialized {@link IndexInfo} objects and their offsets
      * @param indexSamples      list with IndexInfo offsets (if total serialized size is less than {@link org.apache.cassandra.config.Config#column_index_cache_size_in_kb}
      * @param offsets           offsets of IndexInfo offsets
      * @param idxInfoSerializer the {@link IndexInfo} serializer
      */
-    public static RowIndexEntry<IndexInfo> create(long dataFilePosition, long indexFilePosition,
-                                                  DeletionTime deletionTime, long headerLength, int columnIndexCount,
-                                                  int indexedPartSize,
-                                                  List<IndexInfo> indexSamples, int[] offsets,
-                                                  ISerializer<IndexInfo> idxInfoSerializer)
+    public static BigTableRowIndexEntry create(long dataFilePosition, long indexFilePosition,
+                                               DeletionTime deletionTime, long headerLength, int columnIndexCount,
+                                               int indexedPartSize,
+                                               List<IndexInfo> indexSamples, int[] offsets,
+                                               ISerializer<IndexInfo> idxInfoSerializer)
     {
         // If the "partition building code" in BigTableWriter.append() via ColumnIndex returns a list
         // of IndexInfo objects, which is the case if the serialized size is less than
@@ -215,7 +203,7 @@ public static RowIndexEntry<IndexInfo> create(long dataFilePosition, long indexF
                                            deletionTime, headerLength, columnIndexCount,
                                            indexedPartSize, idxInfoSerializer);
         // Last case is that there are no index samples.
-        return new RowIndexEntry<>(dataFilePosition);
+        return new BigTableRowIndexEntry(dataFilePosition);
     }
 
     public IndexInfoRetriever openWithIndex(FileHandle indexFile)
@@ -225,27 +213,27 @@ public IndexInfoRetriever openWithIndex(FileHandle indexFile)
 
     public interface IndexSerializer<T>
     {
-        void serialize(RowIndexEntry<T> rie, DataOutputPlus out, ByteBuffer indexInfo) throws IOException;
+        void serialize(BigTableRowIndexEntry rie, DataOutputPlus out, ByteBuffer indexInfo) throws IOException;
 
-        RowIndexEntry<T> deserialize(DataInputPlus in, long indexFilePosition) throws IOException;
-        default RowIndexEntry<T> deserialize(RandomAccessReader reader) throws IOException
+        BigTableRowIndexEntry deserialize(DataInputPlus in, long indexFilePosition) throws IOException;
+        default BigTableRowIndexEntry deserialize(RandomAccessReader reader) throws IOException
         {
             return deserialize(reader, reader.getFilePointer());
 
         }
 
-        default RowIndexEntry<T> deserialize(FileDataInput input) throws IOException
+        default BigTableRowIndexEntry deserialize(FileDataInput input) throws IOException
         {
             return deserialize(input, input.getFilePointer());
 
         }
 
-        void serializeForCache(RowIndexEntry<T> rie, DataOutputPlus out) throws IOException;
-        RowIndexEntry<T> deserializeForCache(DataInputPlus in) throws IOException;
+        void serializeForCache(BigTableRowIndexEntry rie, DataOutputPlus out) throws IOException;
+        BigTableRowIndexEntry deserializeForCache(DataInputPlus in) throws IOException;
 
         long deserializePositionAndSkip(DataInputPlus in) throws IOException;
 
-        ISerializer<T> indexInfoSerializer();
+        ISerializer<IndexInfo> indexInfoSerializer();
     }
 
     public static final class Serializer implements IndexSerializer<IndexInfo>
@@ -264,24 +252,24 @@ public IndexInfo.Serializer indexInfoSerializer()
             return idxInfoSerializer;
         }
 
-        public void serialize(RowIndexEntry<IndexInfo> rie, DataOutputPlus out, ByteBuffer indexInfo) throws IOException
+        public void serialize(BigTableRowIndexEntry rie, DataOutputPlus out, ByteBuffer indexInfo) throws IOException
         {
             rie.serialize(out, indexInfo);
         }
 
-        public void serializeForCache(RowIndexEntry<IndexInfo> rie, DataOutputPlus out) throws IOException
+        public void serializeForCache(BigTableRowIndexEntry rie, DataOutputPlus out) throws IOException
         {
             rie.serializeForCache(out);
         }
 
-        public RowIndexEntry<IndexInfo> deserializeForCache(DataInputPlus in) throws IOException
+        public BigTableRowIndexEntry deserializeForCache(DataInputPlus in) throws IOException
         {
             long position = in.readUnsignedVInt();
 
             switch (in.readByte())
             {
                 case CACHE_NOT_INDEXED:
-                    return new RowIndexEntry<>(position);
+                    return new BigTableRowIndexEntry(position);
                 case CACHE_INDEXED:
                     return new IndexedEntry(position, in, idxInfoSerializer);
                 case CACHE_INDEXED_SHALLOW:
@@ -309,14 +297,14 @@ public static void skipForCache(DataInputPlus in) throws IOException
             }
         }
 
-        public RowIndexEntry<IndexInfo> deserialize(DataInputPlus in, long indexFilePosition) throws IOException
+        public BigTableRowIndexEntry deserialize(DataInputPlus in, long indexFilePosition) throws IOException
         {
             long position = in.readUnsignedVInt();
 
             int size = (int)in.readUnsignedVInt();
             if (size == 0)
             {
-                return new RowIndexEntry<>(position);
+                return new BigTableRowIndexEntry(position);
             }
             else
             {
@@ -410,7 +398,7 @@ public void serializeForCache(DataOutputPlus out) throws IOException
     /**
      * An entry in the row index for a row whose columns are indexed - used for both legacy and current formats.
      */
-    private static final class IndexedEntry extends RowIndexEntry<IndexInfo>
+    private static final class IndexedEntry extends BigTableRowIndexEntry
     {
         private static final long BASE_SIZE;
 
@@ -587,7 +575,7 @@ static void skipForCache(DataInputPlus in) throws IOException
      * An entry in the row index for a row whose columns are indexed and the {@link IndexInfo} objects
      * are not read into the key cache.
      */
-    private static final class ShallowIndexedEntry extends RowIndexEntry<IndexInfo>
+    private static final class ShallowIndexedEntry extends BigTableRowIndexEntry
     {
         private static final long BASE_SIZE;
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
index 6644b3b8cff1..62e3ed7a40c3 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
@@ -54,14 +54,14 @@ public class BigTableScanner implements ISSTableScanner
     private final AtomicBoolean isClosed = new AtomicBoolean(false);
     protected final RandomAccessReader dfile;
     protected final RandomAccessReader ifile;
-    public final SSTableReader sstable;
+    public final BigTableReader sstable;
 
     private final Iterator<AbstractBounds<PartitionPosition>> rangeIterator;
     private AbstractBounds<PartitionPosition> currentRange;
 
     private final ColumnFilter columns;
     private final DataRange dataRange;
-    private final RowIndexEntry.IndexSerializer rowIndexEntrySerializer;
+    private final BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer;
     private final SSTableReadsListener listener;
     private long startScan = -1;
     private long bytesScanned = 0;
@@ -69,12 +69,12 @@ public class BigTableScanner implements ISSTableScanner
     protected Iterator<UnfilteredRowIterator> iterator;
 
     // Full scan of the sstables
-    public static ISSTableScanner getScanner(SSTableReader sstable)
+    public static ISSTableScanner getScanner(BigTableReader sstable)
     {
         return getScanner(sstable, Iterators.singletonIterator(fullRange(sstable)));
     }
 
-    public static ISSTableScanner getScanner(SSTableReader sstable,
+    public static ISSTableScanner getScanner(BigTableReader sstable,
                                              ColumnFilter columns,
                                              DataRange dataRange,
                                              SSTableReadsListener listener)
@@ -82,7 +82,7 @@ public static ISSTableScanner getScanner(SSTableReader sstable,
         return new BigTableScanner(sstable, columns, dataRange, makeBounds(sstable, dataRange).iterator(), listener);
     }
 
-    public static ISSTableScanner getScanner(SSTableReader sstable, Collection<Range<Token>> tokenRanges)
+    public static ISSTableScanner getScanner(BigTableReader sstable, Collection<Range<Token>> tokenRanges)
     {
         // We want to avoid allocating a SSTableScanner if the range don't overlap the sstable (#5249)
         List<SSTableReader.PartitionPositionBounds> positions = sstable.getPositionsForRanges(tokenRanges);
@@ -92,12 +92,12 @@ public static ISSTableScanner getScanner(SSTableReader sstable, Collection<Range
         return getScanner(sstable, makeBounds(sstable, tokenRanges).iterator());
     }
 
-    public static ISSTableScanner getScanner(SSTableReader sstable, Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
+    public static ISSTableScanner getScanner(BigTableReader sstable, Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
     {
         return new BigTableScanner(sstable, ColumnFilter.all(sstable.metadata()), null, rangeIterator, SSTableReadsListener.NOOP_LISTENER);
     }
 
-    private BigTableScanner(SSTableReader sstable,
+    private BigTableScanner(BigTableReader sstable,
                             ColumnFilter columns,
                             DataRange dataRange,
                             Iterator<AbstractBounds<PartitionPosition>> rangeIterator,
@@ -110,9 +110,7 @@ private BigTableScanner(SSTableReader sstable,
         this.sstable = sstable;
         this.columns = columns;
         this.dataRange = dataRange;
-        this.rowIndexEntrySerializer = sstable.descriptor.version.getSSTableFormat().getIndexSerializer(sstable.metadata(),
-                                                                                                        sstable.descriptor.version,
-                                                                                                        sstable.header);
+        this.rowIndexEntrySerializer = new BigTableRowIndexEntry.Serializer(sstable.descriptor.version, sstable.header);
         this.rangeIterator = rangeIterator;
         this.listener = listener;
     }
@@ -191,14 +189,14 @@ private void seekToCurrentRangeStart()
                 if (indexDecoratedKey.compareTo(currentRange.left) > 0 || currentRange.contains(indexDecoratedKey))
                 {
                     // Found, just read the dataPosition and seek into index and data files
-                    long dataPosition = RowIndexEntry.Serializer.readPosition(ifile);
+                    long dataPosition = BigTableRowIndexEntry.Serializer.readPosition(ifile);
                     ifile.seek(indexPosition);
                     dfile.seek(dataPosition);
                     break;
                 }
                 else
                 {
-                    RowIndexEntry.Serializer.skip(ifile, sstable.descriptor.version);
+                    BigTableRowIndexEntry.Serializer.skip(ifile, sstable.descriptor.version);
                 }
             }
         }
@@ -282,9 +280,9 @@ private Iterator<UnfilteredRowIterator> createIterator()
     protected class KeyScanningIterator extends AbstractIterator<UnfilteredRowIterator>
     {
         private DecoratedKey nextKey;
-        private RowIndexEntry nextEntry;
+        private BigTableRowIndexEntry nextEntry;
         private DecoratedKey currentKey;
-        private RowIndexEntry currentEntry;
+        private BigTableRowIndexEntry currentEntry;
 
         protected UnfilteredRowIterator computeNext()
         {
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index 806a05951118..286103598cd7 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.io.sstable.*;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
@@ -54,6 +55,7 @@ public class BigTableWriter extends SSTableWriter
 {
     private static final Logger logger = LoggerFactory.getLogger(BigTableWriter.class);
 
+    protected final BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer;
     private final ColumnIndex columnIndexWriter;
     private final IndexWriter iwriter;
     private final FileHandle.Builder dbuilder;
@@ -61,7 +63,6 @@ public class BigTableWriter extends SSTableWriter
     private DecoratedKey lastWrittenKey;
     private DataPosition dataMark;
     private long lastEarlyOpenLength = 0;
-    private final Optional<ChunkCache> chunkCache = Optional.ofNullable(ChunkCache.instance);
 
     private final SequentialWriterOption writerOption = SequentialWriterOption.newBuilder()
                                                         .trickleFsync(DatabaseDescriptor.getTrickleFsync())
@@ -100,12 +101,11 @@ public BigTableWriter(Descriptor descriptor,
                     new File(descriptor.filenameFor(Component.DIGEST)),
                     writerOption);
         }
-        dbuilder = new FileHandle.Builder(descriptor.filenameFor(Component.DATA)).compressed(compression)
-                                              .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap);
-        chunkCache.ifPresent(dbuilder::withChunkCache);
+        dbuilder = SSTableReaderBuilder.defaultDataHandleBuilder(descriptor).compressed(compression);
         iwriter = new IndexWriter(keyCount);
 
-        columnIndexWriter = new ColumnIndex(this.header, dataFile, descriptor.version, this.observers, getRowIndexEntrySerializer().indexInfoSerializer());
+        this.rowIndexEntrySerializer = new BigTableRowIndexEntry.Serializer(descriptor.version, header);
+        columnIndexWriter = new ColumnIndex(this.header, dataFile, descriptor.version, this.observers, rowIndexEntrySerializer.indexInfoSerializer());
     }
 
     /**
@@ -170,7 +170,7 @@ protected long beforeAppend(DecoratedKey decoratedKey)
         return (lastWrittenKey == null) ? 0 : dataFile.position();
     }
 
-    private void afterAppend(DecoratedKey decoratedKey, long dataEnd, RowIndexEntry index, ByteBuffer indexInfo) throws IOException
+    private void afterAppend(DecoratedKey decoratedKey, long dataEnd, BigTableRowIndexEntry index, ByteBuffer indexInfo) throws IOException
     {
         metadataCollector.addKey(decoratedKey.getKey());
         lastWrittenKey = decoratedKey;
@@ -192,7 +192,7 @@ private void afterAppend(DecoratedKey decoratedKey, long dataEnd, RowIndexEntry
      *
      * @throws FSWriteError if a write to the dataFile fails
      */
-    public RowIndexEntry append(UnfilteredRowIterator iterator)
+    public BigTableRowIndexEntry append(UnfilteredRowIterator iterator)
     {
         DecoratedKey key = iterator.partitionKey();
 
@@ -219,14 +219,14 @@ public RowIndexEntry append(UnfilteredRowIterator iterator)
             // serialized size to the index-writer position
             long indexFilePosition = ByteBufferUtil.serializedSizeWithShortLength(key.getKey()) + iwriter.indexFile.position();
 
-            RowIndexEntry entry = RowIndexEntry.create(startPosition, indexFilePosition,
-                                                       collecting.partitionLevelDeletion(),
-                                                       columnIndexWriter.headerLength,
-                                                       columnIndexWriter.columnIndexCount,
-                                                       columnIndexWriter.indexInfoSerializedSize(),
-                                                       columnIndexWriter.indexSamples(),
-                                                       columnIndexWriter.offsets(),
-                                                       getRowIndexEntrySerializer().indexInfoSerializer());
+            BigTableRowIndexEntry entry = BigTableRowIndexEntry.create(startPosition, indexFilePosition,
+                                                                       collecting.partitionLevelDeletion(),
+                                                                       columnIndexWriter.headerLength,
+                                                                       columnIndexWriter.columnIndexCount,
+                                                                       columnIndexWriter.indexInfoSerializedSize(),
+                                                                       columnIndexWriter.indexSamples(),
+                                                                       columnIndexWriter.offsets(),
+                                                                       rowIndexEntrySerializer.indexInfoSerializer());
 
             long endPosition = dataFile.position();
             long rowSize = endPosition - startPosition;
@@ -245,11 +245,6 @@ public RowIndexEntry append(UnfilteredRowIterator iterator)
         }
     }
 
-    private RowIndexEntry.IndexSerializer<IndexInfo> getRowIndexEntrySerializer()
-    {
-        return (RowIndexEntry.IndexSerializer<IndexInfo>) rowIndexEntrySerializer;
-    }
-
     private void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
     {
         if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
@@ -354,10 +349,8 @@ public SSTableReader openEarly()
 
     void invalidateCacheAtBoundary(FileHandle dfile)
     {
-        chunkCache.ifPresent(cache -> {
-            if (lastEarlyOpenLength != 0 && dfile.dataLength() > lastEarlyOpenLength)
-                cache.invalidatePosition(dfile, lastEarlyOpenLength);
-        });
+        if (ChunkCache.instance != null && lastEarlyOpenLength != 0 && dfile.dataLength() > lastEarlyOpenLength)
+            ChunkCache.instance.invalidatePosition(dfile, lastEarlyOpenLength);
         lastEarlyOpenLength = dfile.dataLength();
     }
 
@@ -491,8 +484,7 @@ class IndexWriter extends AbstractTransactional implements Transactional
         IndexWriter(long keyCount)
         {
             indexFile = new SequentialWriter(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)), writerOption);
-            builder = new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX)).mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap);
-            chunkCache.ifPresent(builder::withChunkCache);
+            builder = SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor);
             summary = new IndexSummaryBuilder(keyCount, metadata().params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL);
             bf = FilterFactory.getFilter(keyCount, metadata().params.bloomFilterFpChance);
             // register listeners to be alerted when the data files are flushed
@@ -506,7 +498,7 @@ IndexSummaryBuilder.ReadableBoundary getMaxReadable()
             return summary.getLastReadableBoundary();
         }
 
-        public void append(DecoratedKey key, RowIndexEntry indexEntry, long dataEnd, ByteBuffer indexInfo) throws IOException
+        public void append(DecoratedKey key, BigTableRowIndexEntry indexEntry, long dataEnd, ByteBuffer indexInfo) throws IOException
         {
             bf.add(key);
             long indexStart = indexFile.position();
diff --git a/src/java/org/apache/cassandra/db/ColumnIndex.java b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
similarity index 95%
rename from src/java/org/apache/cassandra/db/ColumnIndex.java
rename to src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
index b87230083d8d..680644d941ec 100644
--- a/src/java/org/apache/cassandra/db/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.db;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -25,9 +25,14 @@
 import com.google.common.primitives.Ints;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.ISerializer;
-import org.apache.cassandra.io.sstable.IndexInfo;
+import org.apache.cassandra.io.sstable.format.big.IndexInfo;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataOutputBuffer;
@@ -37,7 +42,7 @@
 /**
  * Column index builder used by {@link org.apache.cassandra.io.sstable.format.big.BigTableWriter}.
  * For index entries that exceed {@link org.apache.cassandra.config.Config#column_index_cache_size_in_kb},
- * this uses the serialization logic as in {@link RowIndexEntry}.
+ * this uses the serialization logic as in {@link BigTableRowIndexEntry}.
  */
 public class ColumnIndex
 {
@@ -288,7 +293,7 @@ private void finish() throws IOException
         // in indexOffsets[]. buffer is != null, if it exceeds Config.column_index_cache_size_in_kb.
         // In the other case, when buffer==null, the offsets are serialized in RowIndexEntry.IndexedEntry.serialize().
         if (buffer != null)
-            RowIndexEntry.Serializer.serializeOffsets(buffer, indexOffsets, columnIndexCount);
+            BigTableRowIndexEntry.Serializer.serializeOffsets(buffer, indexOffsets, columnIndexCount);
 
         // we should always have at least one computed index block, but we only write it out if there is more than that.
         assert columnIndexCount > 0 && headerLength >= 0;
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexInfo.java b/src/java/org/apache/cassandra/io/sstable/format/big/IndexInfo.java
similarity index 97%
rename from src/java/org/apache/cassandra/io/sstable/IndexInfo.java
rename to src/java/org/apache/cassandra/io/sstable/format/big/IndexInfo.java
index e24436d017ff..5ed622ddf97a 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexInfo.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/IndexInfo.java
@@ -16,18 +16,16 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.io.sstable;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
 import java.util.List;
 
 import org.apache.cassandra.db.ClusteringPrefix;
 import org.apache.cassandra.db.DeletionTime;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ByteArrayAccessor;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -35,7 +33,7 @@
 import org.apache.cassandra.utils.ObjectSizes;
 
 /**
- * {@code IndexInfo} is embedded in the indexed version of {@link RowIndexEntry}.
+ * {@code IndexInfo} is embedded in the indexed version of {@link BigTableRowIndexEntry}.
  * Each instance roughly covers a range of {@link org.apache.cassandra.config.Config#column_index_size_in_kb column_index_size_in_kb} kB
  * and contains the first and last clustering value (or slice bound), its offset in the data file and width in the data file.
  * <p>
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java b/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
new file mode 100644
index 000000000000..857b251b7a0c
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable.format.big;
+
+import java.io.IOException;
+import java.util.Comparator;
+
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.io.sstable.format.big.AbstractBigTableIterator.RowReader;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.FileHandle;
+
+// Used by indexed readers to keep track of their current position in the index.
+public class IndexState implements AutoCloseable
+{
+    private final RowReader reader;
+    private final ClusteringComparator comparator;
+
+    private final BigTableRowIndexEntry indexEntry;
+    private final BigTableRowIndexEntry.IndexInfoRetriever indexInfoRetriever;
+    private final boolean reversed;
+
+    private int currentIndexIdx;
+
+    // Marks the beginning of the block corresponding to currentIndexIdx.
+    private DataPosition mark;
+
+    public IndexState(RowReader reader,
+                      ClusteringComparator comparator,
+                      BigTableRowIndexEntry indexEntry,
+                      boolean reversed,
+                      FileHandle indexFile)
+    {
+        this.reader = reader;
+        this.comparator = comparator;
+        this.indexEntry = indexEntry;
+        this.indexInfoRetriever = indexEntry.openWithIndex(indexFile);
+        this.reversed = reversed;
+        this.currentIndexIdx = reversed ? indexEntry.columnsIndexCount() : -1;
+    }
+
+    public boolean isDone()
+    {
+        return reversed ? currentIndexIdx < 0 : currentIndexIdx >= indexEntry.columnsIndexCount();
+    }
+
+    // Sets the reader to the beginning of blockIdx.
+    public void setToBlock(int blockIdx) throws IOException
+    {
+        if (blockIdx >= 0 && blockIdx < indexEntry.columnsIndexCount())
+        {
+            reader.seekToPosition(columnOffset(blockIdx));
+            mark = reader.file.mark();
+            reader.deserializer.clearState();
+        }
+
+        currentIndexIdx = blockIdx;
+        reader.openMarker = blockIdx > 0 ? index(blockIdx - 1).endOpenMarker : null;
+    }
+
+    private long columnOffset(int i) throws IOException
+    {
+        return indexEntry.position + index(i).offset;
+    }
+
+    public int blocksCount()
+    {
+        return indexEntry.columnsIndexCount();
+    }
+
+    // Update the block idx based on the current reader position if we're past the current block.
+    // This only makes sense for forward iteration (for reverse ones, when we reach the end of a block we
+    // should seek to the previous one, not update the index state and continue).
+    public void updateBlock() throws IOException
+    {
+        assert !reversed;
+
+        // If we get here with currentIndexIdx < 0, it means setToBlock() has never been called, so it means
+        // we're about to read from the beginning of the partition, but haven't "prepared" the IndexState yet.
+        // Do so by setting us on the first block.
+        if (currentIndexIdx < 0)
+        {
+            setToBlock(0);
+            return;
+        }
+
+        while (currentIndexIdx + 1 < indexEntry.columnsIndexCount() && isPastCurrentBlock())
+        {
+            reader.openMarker = currentIndex().endOpenMarker;
+            ++currentIndexIdx;
+
+            // We have to set the mark, and we have to set it at the beginning of the block. So if we're not at the beginning of the block, this forces us into a weird seek dance.
+            // This can only happen when reading old files, however.
+            long startOfBlock = columnOffset(currentIndexIdx);
+            long currentFilePointer = reader.file.getFilePointer();
+            if (startOfBlock == currentFilePointer)
+            {
+                mark = reader.file.mark();
+            }
+            else
+            {
+                reader.seekToPosition(startOfBlock);
+                mark = reader.file.mark();
+                reader.seekToPosition(currentFilePointer);
+            }
+        }
+    }
+
+    // Check if we've crossed an index boundary (based on the mark on the beginning of the index block).
+    public boolean isPastCurrentBlock() throws IOException
+    {
+        assert reader.deserializer != null;
+        return reader.file.bytesPastMark(mark) >= currentIndex().width;
+    }
+
+    public int currentBlockIdx()
+    {
+        return currentIndexIdx;
+    }
+
+    public IndexInfo currentIndex() throws IOException
+    {
+        return index(currentIndexIdx);
+    }
+
+    public IndexInfo index(int i) throws IOException
+    {
+        return indexInfoRetriever.columnsIndex(i);
+    }
+
+    // Finds the index of the first block containing the provided bound, starting at the provided index.
+    // Will be -1 if the bound is before any block, and blocksCount() if it is after every block.
+    public int findBlockIndex(ClusteringBound<?> bound, int fromIdx) throws IOException
+    {
+        if (bound.isBottom())
+            return -1;
+        if (bound.isTop())
+            return blocksCount();
+
+        return indexFor(bound, fromIdx);
+    }
+
+    public int indexFor(ClusteringPrefix<?> name, int lastIndex) throws IOException
+    {
+        IndexInfo target = new IndexInfo(name, name, 0, 0, null);
+        /*
+        Take the example from the unit test, and say your index looks like this:
+        [0..5][10..15][20..25]
+        and you look for the slice [13..17].
+
+        When doing forward slice, we are doing a binary search comparing 13 (the start of the query)
+        to the lastName part of the index slot. You'll end up with the "first" slot, going from left to right,
+        that may contain the start.
+
+        When doing a reverse slice, we do the same thing, only using as a start column the end of the query,
+        i.e. 17 in this example, compared to the firstName part of the index slots.  bsearch will give us the
+        first slot where firstName > start ([20..25] here), so we subtract an extra one to get the slot just before.
+        */
+        int startIdx = 0;
+        int endIdx = indexEntry.columnsIndexCount() - 1;
+
+        if (reversed)
+        {
+            if (lastIndex < endIdx)
+            {
+                endIdx = lastIndex;
+            }
+        }
+        else
+        {
+            if (lastIndex > 0)
+            {
+                startIdx = lastIndex;
+            }
+        }
+
+        int index = binarySearch(target, comparator.indexComparator(reversed), startIdx, endIdx);
+        return (index < 0 ? -index - (reversed ? 2 : 1) : index);
+    }
+
+    private int binarySearch(IndexInfo key, Comparator<IndexInfo> c, int low, int high) throws IOException
+    {
+        while (low <= high)
+        {
+            int mid = (low + high) >>> 1;
+            IndexInfo midVal = index(mid);
+            int cmp = c.compare(midVal, key);
+
+            if (cmp < 0)
+                low = mid + 1;
+            else if (cmp > 0)
+                high = mid - 1;
+            else
+                return mid;
+        }
+        return -(low + 1);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("IndexState(indexSize=%d, currentBlock=%d, reversed=%b)", indexEntry.columnsIndexCount(), currentIndexIdx, reversed);
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        indexInfoRetriever.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
similarity index 96%
rename from src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
rename to src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
index d4362f775b99..30499075a277 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
 import java.util.NoSuchElementException;
@@ -30,7 +30,7 @@
 /**
  *  A Cell Iterator over SSTable
  */
-public class SSTableIterator extends AbstractSSTableIterator
+public class SSTableIterator extends AbstractBigTableIterator
 {
     /**
      * The index of the slice being processed.
@@ -40,7 +40,7 @@ public class SSTableIterator extends AbstractSSTableIterator
     public SSTableIterator(SSTableReader sstable,
                            FileDataInput file,
                            DecoratedKey key,
-                           RowIndexEntry indexEntry,
+                           BigTableRowIndexEntry indexEntry,
                            Slices slices,
                            ColumnFilter columns,
                            FileHandle ifile)
@@ -48,7 +48,7 @@ public SSTableIterator(SSTableReader sstable,
         super(sstable, file, key, indexEntry, slices, columns, ifile);
     }
 
-    protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    protected RowReader createReaderInternal(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
     {
         return indexEntry.isIndexed()
              ? new ForwardIndexedReader(indexEntry, file, shouldCloseFile)
@@ -72,7 +72,7 @@ public boolean isReverseOrder()
         return false;
     }
 
-    private class ForwardReader extends Reader
+    private class ForwardReader extends RowReader
     {
         // The start of the current slice. This will be null as soon as we know we've passed that bound.
         protected ClusteringBound<?> start;
@@ -211,7 +211,7 @@ private class ForwardIndexedReader extends ForwardReader
 
         private int lastBlockIdx; // the last index block that has data for the current query
 
-        private ForwardIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        private ForwardIndexedReader(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
         {
             super(file, shouldCloseFile);
             this.indexState = new IndexState(this, metadata.comparator, indexEntry, false, ifile);
diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
similarity index 97%
rename from src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
rename to src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
index a60aafa77181..0ceb269e9267 100644
--- a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
 import java.util.*;
@@ -34,7 +34,7 @@
 /**
  *  A Cell Iterator in reversed clustering order over SSTable
  */
-public class SSTableReversedIterator extends AbstractSSTableIterator
+public class SSTableReversedIterator extends AbstractBigTableIterator
 {
     /**
      * The index of the slice being processed.
@@ -44,7 +44,7 @@ public class SSTableReversedIterator extends AbstractSSTableIterator
     public SSTableReversedIterator(SSTableReader sstable,
                                    FileDataInput file,
                                    DecoratedKey key,
-                                   RowIndexEntry indexEntry,
+                                   BigTableRowIndexEntry indexEntry,
                                    Slices slices,
                                    ColumnFilter columns,
                                    FileHandle ifile)
@@ -52,7 +52,7 @@ public SSTableReversedIterator(SSTableReader sstable,
         super(sstable, file, key, indexEntry, slices, columns, ifile);
     }
 
-    protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    protected Reader createReaderInternal(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
     {
         return indexEntry.isIndexed()
              ? new ReverseIndexedReader(indexEntry, file, shouldCloseFile)
@@ -76,7 +76,7 @@ protected boolean hasMoreSlices()
         return slice < slices.size();
     }
 
-    private class ReverseReader extends Reader
+    private class ReverseReader extends RowReader
     {
         protected ReusablePartitionData buffer;
         protected Iterator<Unfiltered> iterator;
@@ -262,7 +262,7 @@ private class ReverseIndexedReader extends ReverseReader
         // The last index block to consider for the slice
         private int lastBlockIdx;
 
-        private ReverseIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        private ReverseIndexedReader(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
         {
             super(file, shouldCloseFile);
             this.indexState = new IndexState(this, metadata.comparator, indexEntry, true, ifile);
diff --git a/src/java/org/apache/cassandra/service/CacheService.java b/src/java/org/apache/cassandra/service/CacheService.java
index a1225fb5124e..7fa76d639578 100644
--- a/src/java/org/apache/cassandra/service/CacheService.java
+++ b/src/java/org/apache/cassandra/service/CacheService.java
@@ -42,6 +42,7 @@
 import org.apache.cassandra.db.partitions.CachedBTreePartition;
 import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
@@ -79,7 +80,7 @@ public String toString()
 
     public final static CacheService instance = new CacheService();
 
-    public final AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache;
+    public final AutoSavingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache;
     public final AutoSavingCache<RowCacheKey, IRowCacheEntry> rowCache;
     public final AutoSavingCache<CounterCacheKey, ClockAndCount> counterCache;
 
@@ -95,7 +96,7 @@ private CacheService()
     /**
      * @return auto saving cache object
      */
-    private AutoSavingCache<KeyCacheKey, RowIndexEntry> initKeyCache()
+    private AutoSavingCache<KeyCacheKey, BigTableRowIndexEntry> initKeyCache()
     {
         logger.info("Initializing key cache with capacity of {} MBs.", DatabaseDescriptor.getKeyCacheSizeInMB());
 
@@ -103,9 +104,9 @@ private AutoSavingCache<KeyCacheKey, RowIndexEntry> initKeyCache()
 
         // as values are constant size we can use singleton weigher
         // where 48 = 40 bytes (average size of the key) + 8 bytes (size of value)
-        ICache<KeyCacheKey, RowIndexEntry> kc;
+        ICache<KeyCacheKey, BigTableRowIndexEntry> kc;
         kc = CaffeineCache.create(keyCacheInMemoryCapacity);
-        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = new AutoSavingCache<>(kc, CacheType.KEY_CACHE, new KeyCacheSerializer());
+        AutoSavingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache = new AutoSavingCache<>(kc, CacheType.KEY_CACHE, new KeyCacheSerializer());
 
         int keyCacheKeysToSave = DatabaseDescriptor.getKeyCacheKeysToSave();
 
@@ -411,11 +412,11 @@ public Pair<RowCacheKey, IRowCacheEntry> call() throws Exception
         }
     }
 
-    public static class KeyCacheSerializer implements CacheSerializer<KeyCacheKey, RowIndexEntry>
+    public static class KeyCacheSerializer implements CacheSerializer<KeyCacheKey, BigTableRowIndexEntry>
     {
         public void serialize(KeyCacheKey key, DataOutputPlus out, ColumnFamilyStore cfs) throws IOException
         {
-            RowIndexEntry entry = CacheService.instance.keyCache.getInternal(key);
+            BigTableRowIndexEntry entry = CacheService.instance.keyCache.getInternal(key);
             if (entry == null)
                 return;
 
@@ -427,10 +428,10 @@ public void serialize(KeyCacheKey key, DataOutputPlus out, ColumnFamilyStore cfs
             out.writeBoolean(true);
 
             SerializationHeader header = new SerializationHeader(false, cfs.metadata(), cfs.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS);
-            key.desc.getFormat().getIndexSerializer(cfs.metadata(), key.desc.version, header).serializeForCache(entry, out);
+            new BigTableRowIndexEntry.Serializer(key.desc.version, header).serializeForCache(entry, out);
         }
 
-        public Future<Pair<KeyCacheKey, RowIndexEntry>> deserialize(DataInputPlus input, ColumnFamilyStore cfs) throws IOException
+        public Future<Pair<KeyCacheKey, BigTableRowIndexEntry>> deserialize(DataInputPlus input, ColumnFamilyStore cfs) throws IOException
         {
             //Keyspace and CF name are deserialized by AutoSaving cache and used to fetch the CFS provided as a
             //parameter so they aren't deserialized here, even though they are serialized by this serializer
@@ -450,13 +451,11 @@ public Future<Pair<KeyCacheKey, RowIndexEntry>> deserialize(DataInputPlus input,
                 // wrong is during upgrade, in which case we fail at deserialization. This is not a huge deal however since 1) this is unlikely enough that
                 // this won't affect many users (if any) and only once, 2) this doesn't prevent the node from starting and 3) CASSANDRA-10219 shows that this
                 // part of the code has been broken for a while without anyone noticing (it is, btw, still broken until CASSANDRA-10219 is fixed).
-                RowIndexEntry.Serializer.skipForCache(input);
+                BigTableRowIndexEntry.Serializer.skipForCache(input);
                 return null;
             }
-            RowIndexEntry.IndexSerializer<?> indexSerializer = reader.descriptor.getFormat().getIndexSerializer(reader.metadata(),
-                                                                                                                reader.descriptor.version,
-                                                                                                                reader.header);
-            RowIndexEntry<?> entry = indexSerializer.deserializeForCache(input);
+            BigTableRowIndexEntry.IndexSerializer<?> indexSerializer = new BigTableRowIndexEntry.Serializer(reader.descriptor.version, reader.header);
+            BigTableRowIndexEntry entry = indexSerializer.deserializeForCache(input);
             return Futures.immediateFuture(Pair.create(new KeyCacheKey(cfs.metadata(), reader.descriptor, key), entry));
         }
 
diff --git a/src/java/org/apache/cassandra/tools/SSTableExport.java b/src/java/org/apache/cassandra/tools/SSTableExport.java
index ca01cc34fd89..74af9fbbe49d 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExport.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExport.java
@@ -142,9 +142,11 @@ public static void main(String[] args) throws ConfigurationException
         try
         {
             TableMetadata metadata = Util.metadataFromSSTable(desc);
+            SSTableReader sstable = SSTableReader.openNoValidation(desc, TableMetadataRef.forOfflineTools(metadata));
+            IPartitioner partitioner = sstable.getPartitioner();
             if (cmd.hasOption(ENUMERATE_KEYS_OPTION))
             {
-                try (KeyIterator iter = new KeyIterator(desc, metadata))
+                try (KeyIterator iter = KeyIterator.forSSTable(sstable))
                 {
                     JsonTransformer.keysToJson(null, Util.iterToStream(iter),
                                                cmd.hasOption(RAW_TIMESTAMPS),
@@ -154,8 +156,6 @@ public static void main(String[] args) throws ConfigurationException
             }
             else
             {
-                SSTableReader sstable = SSTableReader.openNoValidation(desc, TableMetadataRef.forOfflineTools(metadata));
-                IPartitioner partitioner = sstable.getPartitioner();
                 final ISSTableScanner currentScanner;
                 if ((keys != null) && (keys.length > 0))
                 {
diff --git a/src/java/org/apache/cassandra/utils/StatusLogger.java b/src/java/org/apache/cassandra/utils/StatusLogger.java
index dcb1135bfccc..0c3f2e4034d6 100644
--- a/src/java/org/apache/cassandra/utils/StatusLogger.java
+++ b/src/java/org/apache/cassandra/utils/StatusLogger.java
@@ -28,7 +28,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.service.CacheService;
@@ -92,7 +92,7 @@ private static void logStatus()
                                   "MessagingService", "n/a", pendingLargeMessages + "/" + pendingSmallMessages));
 
         // Global key/row cache information
-        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = CacheService.instance.keyCache;
+        AutoSavingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache = CacheService.instance.keyCache;
         AutoSavingCache<RowCacheKey, IRowCacheEntry> rowCache = CacheService.instance.rowCache;
 
         int keyCacheKeysToSave = DatabaseDescriptor.getKeyCacheKeysToSave();
diff --git a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
index 7feefa301256..e6ccbc2b49d5 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
@@ -63,6 +63,7 @@
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.ForwardingSSTableReader;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.util.ChannelProxy;
@@ -273,6 +274,12 @@ private FailingSSTableReader(SSTableReader delegate)
             super(delegate);
         }
 
+        @Override
+        public PartitionIndexIterator allKeysIterator() throws IOException
+        {
+            throw new IOException("Fail"); // never returns an iterator: every call fails with an IOException
+        }
+
         public ISSTableScanner getScanner()
         {
             return new FailingISSTableScanner();
diff --git a/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java b/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
index c7abfc557131..6ee1b2d016dc 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
@@ -29,7 +29,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.Slices;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -192,12 +192,6 @@ public UnfilteredRowIterator iterator(DecoratedKey key, Slices slices, ColumnFil
             throw throwCorrupted();
         }
 
-        @Override
-        public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed)
-        {
-            throw throwCorrupted();
-        }
-
         private CorruptSSTableException throwCorrupted()
         {
             throw new CorruptSSTableException(new IOException("failed to get position"), descriptor.baseFilename());
diff --git a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
index 9a76661d37dd..30e78970d574 100644
--- a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
+++ b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
@@ -24,6 +24,7 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.UUID;
+import java.util.function.Supplier;
 
 import com.google.common.util.concurrent.RateLimiter;
 
@@ -33,7 +34,6 @@
 import org.apache.cassandra.db.DataRange;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.PartitionPosition;
-import org.apache.cassandra.db.RowIndexEntry;
 import org.apache.cassandra.db.Slices;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.EncodingStats;
@@ -46,6 +46,7 @@
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.util.ChannelProxy;
 import org.apache.cassandra.io.util.FileDataInput;
@@ -72,6 +73,12 @@ public ForwardingSSTableReader(SSTableReader delegate)
         this.last = delegate.last;
     }
 
+    @Override
+    public PartitionIndexIterator allKeysIterator() throws IOException
+    {
+        return delegate.allKeysIterator(); // pass-through: returns whatever the wrapped reader returns, including its IOException
+    }
+
     @Override
     public boolean equals(Object that)
     {
@@ -253,19 +260,19 @@ public KeyCacheKey getCacheKey(DecoratedKey key)
     }
 
     @Override
-    public void cacheKey(DecoratedKey key, RowIndexEntry info)
+    public void cacheKey(DecoratedKey key, BigTableRowIndexEntry info)
     {
         delegate.cacheKey(key, info);
     }
 
     @Override
-    public RowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
+    public BigTableRowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
     {
         return delegate.getCachedPosition(key, updateStats);
     }
 
     @Override
-    protected RowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
+    protected BigTableRowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
     {
         return delegate.getCachedPosition(unifiedKey, updateStats);
     }
@@ -277,9 +284,9 @@ public boolean isKeyCacheEnabled()
     }
 
     @Override
-    protected RowIndexEntry getPosition(PartitionPosition key, Operator op, boolean updateCacheAndStats, boolean permitMatchPastLast, SSTableReadsListener listener)
+    protected BigTableRowIndexEntry getPosition(PartitionPosition key, Operator op, boolean updateCacheAndStats, boolean permitMatchPastLast, SSTableReadsListener listener)
     {
-        return delegate.getPosition(key, op, updateCacheAndStats, permitMatchPastLast, listener);
+        return (BigTableRowIndexEntry) delegate.getPosition(key, op, updateCacheAndStats, permitMatchPastLast, listener);
     }
 
     @Override
@@ -289,15 +296,9 @@ public UnfilteredRowIterator iterator(DecoratedKey key, Slices slices, ColumnFil
     }
 
     @Override
-    public UnfilteredRowIterator iterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, Slices slices, ColumnFilter selectedColumns, boolean reversed)
-    {
-        return delegate.iterator(file, key, indexEntry, slices, selectedColumns, reversed);
-    }
-
-    @Override
-    public UnfilteredRowIterator simpleIterator(FileDataInput file, DecoratedKey key, RowIndexEntry indexEntry, boolean tombstoneOnly)
+    public UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly)
     {
-        return delegate.simpleIterator(file, key, indexEntry, tombstoneOnly);
+        return delegate.simpleIterator(dfile, key, tombstoneOnly);
     }
 
     @Override
@@ -475,7 +476,7 @@ public long getRecentBloomFilterTruePositiveCount()
     }
 
     @Override
-    public InstrumentingCache<KeyCacheKey, RowIndexEntry> getKeyCache()
+    public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
     {
         return delegate.getKeyCache();
     }
@@ -673,7 +674,7 @@ public Ref<SSTableReader> ref()
     }
 
     @Override
-    void setup(boolean trackHotness)
+    protected void setup(boolean trackHotness)
     {
         delegate.setup(trackHotness);
     }
diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
index bb5129af9a46..baec51a58ed0 100644
--- a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
+++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.cache;
 
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -83,7 +84,7 @@ private static void doTestSerializeAndLoadKeyCache() throws Exception
         for (SSTableReader sstable : cfs.getLiveSSTables())
             sstable.getPosition(Util.dk("key1"), SSTableReader.Operator.EQ);
 
-        AutoSavingCache<KeyCacheKey, RowIndexEntry> keyCache = CacheService.instance.keyCache;
+        AutoSavingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache = CacheService.instance.keyCache;
 
         // serialize to file
         keyCache.submitWrite(keyCache.size()).get();
diff --git a/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
index 01a2afd6ff38..ec34230d176f 100644
--- a/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
+++ b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
@@ -23,7 +23,7 @@
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -59,7 +59,7 @@ public void queryIndexedSSTableTest() throws Throwable
         boolean hasIndexed = false;
         for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
         {
-            RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+            BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
             hasIndexed |= indexEntry != null && indexEntry.isIndexed();
         }
         assert hasIndexed;
diff --git a/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
index f9ac8d16b0cf..15b4cca35535 100644
--- a/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
+++ b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
@@ -24,6 +24,7 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.utils.ByteBufferUtil;
@@ -76,12 +77,12 @@ public void testTombstoneBoundariesInIndex(String cacheKeys) throws Throwable
             {
                 // The line below failed with key caching off (CASSANDRA-11158)
                 @SuppressWarnings("unchecked")
-                RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+                BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
                 if (indexEntry != null && indexEntry.isIndexed())
                 {
                     try (FileDataInput reader = sstable.openIndexReader())
                     {
-                        RowIndexEntry.IndexInfoRetriever infoRetriever = indexEntry.openWithIndex(sstable.getIndexFile());
+                        BigTableRowIndexEntry.IndexInfoRetriever infoRetriever = indexEntry.openWithIndex(sstable.getIndexFile());
                         ClusteringPrefix<?> firstName = infoRetriever.columnsIndex(1).firstName;
                         if (firstName.kind().isBoundary())
                             break deletionLoop;
diff --git a/test/unit/org/apache/cassandra/db/KeyCacheTest.java b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
index 1819b1811844..31bdf83f3f9f 100644
--- a/test/unit/org/apache/cassandra/db/KeyCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.CacheService;
@@ -109,15 +110,15 @@ private void testKeyCacheLoad(String cf) throws Exception
         assertKeyCacheSize(100, KEYSPACE1, cf);
 
         // really? our caches don't implement the map interface? (hence no .addAll)
-        Map<KeyCacheKey, RowIndexEntry> savedMap = new HashMap<>();
-        Map<KeyCacheKey, RowIndexEntry.IndexInfoRetriever> savedInfoMap = new HashMap<>();
+        Map<KeyCacheKey, BigTableRowIndexEntry> savedMap = new HashMap<>();
+        Map<KeyCacheKey, BigTableRowIndexEntry.IndexInfoRetriever> savedInfoMap = new HashMap<>();
         for (Iterator<KeyCacheKey> iter = CacheService.instance.keyCache.keyIterator();
              iter.hasNext();)
         {
             KeyCacheKey k = iter.next();
             if (k.desc.ksname.equals(KEYSPACE1) && k.desc.cfname.equals(cf))
             {
-                RowIndexEntry rie = CacheService.instance.keyCache.get(k);
+                BigTableRowIndexEntry rie = CacheService.instance.keyCache.get(k);
                 savedMap.put(k, rie);
                 SSTableReader sstr = readerForKey(k);
                 savedInfoMap.put(k, rie.openWithIndex(sstr.getIndexFile()));
@@ -134,18 +135,18 @@ private void testKeyCacheLoad(String cf) throws Exception
         assertKeyCacheSize(savedMap.size(), KEYSPACE1, cf);
 
         // probably it's better to add equals/hashCode to RowIndexEntry...
-        for (Map.Entry<KeyCacheKey, RowIndexEntry> entry : savedMap.entrySet())
+        for (Map.Entry<KeyCacheKey, BigTableRowIndexEntry> entry : savedMap.entrySet())
         {
-            RowIndexEntry expected = entry.getValue();
-            RowIndexEntry actual = CacheService.instance.keyCache.get(entry.getKey());
+            BigTableRowIndexEntry expected = entry.getValue();
+            BigTableRowIndexEntry actual = CacheService.instance.keyCache.get(entry.getKey());
             assertEquals(expected.position, actual.position);
             assertEquals(expected.columnsIndexCount(), actual.columnsIndexCount());
             for (int i = 0; i < expected.columnsIndexCount(); i++)
             {
                 SSTableReader actualSstr = readerForKey(entry.getKey());
-                try (RowIndexEntry.IndexInfoRetriever actualIir = actual.openWithIndex(actualSstr.getIndexFile()))
+                try (BigTableRowIndexEntry.IndexInfoRetriever actualIir = actual.openWithIndex(actualSstr.getIndexFile()))
                 {
-                    RowIndexEntry.IndexInfoRetriever expectedIir = savedInfoMap.get(entry.getKey());
+                    BigTableRowIndexEntry.IndexInfoRetriever expectedIir = savedInfoMap.get(entry.getKey());
                     assertEquals(expectedIir.columnsIndex(i), actualIir.columnsIndex(i));
                 }
             }
diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index fd15366e567c..4aa8a32e1c05 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
@@ -21,6 +21,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.assertj.core.api.Assertions;
 import org.junit.Test;
 import org.mockito.Mockito;
@@ -408,7 +409,7 @@ public void testGetSliceFromLarge() throws Throwable
 
         // verify that we do indeed have multiple index entries
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
-        RowIndexEntry<?> indexEntry = sstable.getPosition(Util.dk("0"), SSTableReader.Operator.EQ);
+        BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(Util.dk("0"), SSTableReader.Operator.EQ);
         assert indexEntry.columnsIndexCount() > 2;
 
         validateSliceLarge(cfs);
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
index 9d663b58626b..5cf8c0bd0f6e 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.db.streaming;
 
+import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 
@@ -98,7 +99,7 @@ public void validateFullyContainedIn_SingleContiguousRange_Succeeds()
     }
 
     @Test
-    public void validateFullyContainedIn_PartialOverlap_Fails()
+    public void validateFullyContainedIn_PartialOverlap_Fails() throws IOException
     {
         List<Range<Token>> requestedRanges = Arrays.asList(new Range<>(store.getPartitioner().getMinimumToken(), getTokenAtIndex(2)));
 
@@ -111,7 +112,7 @@ public void validateFullyContainedIn_PartialOverlap_Fails()
     }
 
     @Test
-    public void validateFullyContainedIn_SplitRange_Succeeds()
+    public void validateFullyContainedIn_SplitRange_Succeeds() throws IOException
     {
         List<Range<Token>> requestedRanges = Arrays.asList(new Range<>(store.getPartitioner().getMinimumToken(), getTokenAtIndex(4)),
                                                          new Range<>(getTokenAtIndex(2), getTokenAtIndex(6)),
@@ -126,12 +127,12 @@ public void validateFullyContainedIn_SplitRange_Succeeds()
         assertTrue(cof.contained(sections, sstable));
     }
 
-    private DecoratedKey getKeyAtIndex(int i)
+    private DecoratedKey getKeyAtIndex(int i) throws IOException
     {
         int count = 0;
         DecoratedKey key;
 
-        try (KeyIterator iter = new KeyIterator(sstable.descriptor, sstable.metadata()))
+        try (KeyIterator iter = KeyIterator.forSSTable(sstable))
         {
             do
             {
@@ -142,7 +143,7 @@ private DecoratedKey getKeyAtIndex(int i)
         return key;
     }
 
-    private Token getTokenAtIndex(int i)
+    private Token getTokenAtIndex(int i) throws IOException
     {
         return getKeyAtIndex(i).getToken();
     }
diff --git a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java b/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntryTest.java
similarity index 93%
rename from test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
rename to test/unit/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntryTest.java
index 7b774eb03793..ec076b62aef1 100644
--- a/test/unit/org/apache/cassandra/db/RowIndexEntryTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntryTest.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.db;
+package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.File;
 import java.io.IOException;
@@ -36,7 +36,17 @@
 import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
-import org.apache.cassandra.db.columniterator.AbstractSSTableIterator;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.LivenessInfo;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
@@ -53,10 +63,8 @@
 import org.apache.cassandra.db.rows.UnfilteredSerializer;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.IndexInfo;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.serializers.LongSerializer;
@@ -68,7 +76,7 @@
 import static junit.framework.Assert.assertEquals;
 import static junit.framework.Assert.assertTrue;
 
-public class RowIndexEntryTest extends CQLTester
+public class BigTableRowIndexEntryTest extends CQLTester
 {
     private static final List<AbstractType<?>> clusterTypes = Collections.singletonList(LongType.instance);
     private static final ClusteringComparator comp = new ClusteringComparator(clusterTypes);
@@ -146,7 +154,7 @@ private static class DoubleSerializer implements AutoCloseable
         SerializationHeader header = new SerializationHeader(true, metadata, metadata.regularAndStaticColumns(), EncodingStats.NO_STATS);
 
         // create C-11206 + old serializer instances
-        RowIndexEntry.IndexSerializer rieSerializer = new RowIndexEntry.Serializer(version, header);
+        BigTableRowIndexEntry.IndexSerializer rieSerializer = new BigTableRowIndexEntry.Serializer(version, header);
         Pre_C_11206_RowIndexEntry.Serializer oldSerializer = new Pre_C_11206_RowIndexEntry.Serializer(metadata, version, header);
 
         @SuppressWarnings({ "resource", "IOResourceOpenedButNotSafelyClosed" })
@@ -156,9 +164,9 @@ private static class DoubleSerializer implements AutoCloseable
 
         final SequentialWriter dataWriterNew;
         final SequentialWriter dataWriterOld;
-        final org.apache.cassandra.db.ColumnIndex columnIndex;
+        final org.apache.cassandra.io.sstable.format.big.ColumnIndex columnIndex;
 
-        RowIndexEntry rieNew;
+        BigTableRowIndexEntry rieNew;
         ByteBuffer rieNewSerialized;
         Pre_C_11206_RowIndexEntry rieOld;
         ByteBuffer rieOldSerialized;
@@ -168,8 +176,8 @@ private static class DoubleSerializer implements AutoCloseable
             SequentialWriterOption option = SequentialWriterOption.newBuilder().bufferSize(1024).build();
             File f = FileUtils.createTempFile("RowIndexEntryTest-", "db");
             dataWriterNew = new SequentialWriter(f, option);
-            columnIndex = new org.apache.cassandra.db.ColumnIndex(header, dataWriterNew, version, Collections.emptyList(),
-                                                                  rieSerializer.indexInfoSerializer());
+            columnIndex = new org.apache.cassandra.io.sstable.format.big.ColumnIndex(header, dataWriterNew, version, Collections.emptyList(),
+                                                                                     rieSerializer.indexInfoSerializer());
 
             f = FileUtils.createTempFile("RowIndexEntryTest-", "db");
             dataWriterOld = new SequentialWriter(f, option);
@@ -187,17 +195,17 @@ void build(Row staticRow, DecoratedKey partitionKey,
 
             Iterator<Clustering<?>> clusteringIter = clusterings.iterator();
             columnIndex.buildRowIndex(makeRowIter(staticRow, partitionKey, clusteringIter, dataWriterNew));
-            rieNew = RowIndexEntry.create(startPosition, 0L,
-                                          deletionInfo, columnIndex.headerLength, columnIndex.columnIndexCount,
-                                          columnIndex.indexInfoSerializedSize(),
-                                          columnIndex.indexSamples(), columnIndex.offsets(),
-                                          rieSerializer.indexInfoSerializer());
+            rieNew = BigTableRowIndexEntry.create(startPosition, 0L,
+                                                  deletionInfo, columnIndex.headerLength, columnIndex.columnIndexCount,
+                                                  columnIndex.indexInfoSerializedSize(),
+                                                  columnIndex.indexSamples(), columnIndex.offsets(),
+                                                  rieSerializer.indexInfoSerializer());
             rieSerializer.serialize(rieNew, rieOutput, columnIndex.buffer());
             rieNewSerialized = rieOutput.buffer().duplicate();
 
             Iterator<Clustering<?>> clusteringIter2 = clusterings.iterator();
-            ColumnIndex columnIndex = RowIndexEntryTest.ColumnIndex.writeAndBuildIndex(makeRowIter(staticRow, partitionKey, clusteringIter2, dataWriterOld),
-                                                                                       dataWriterOld, header, Collections.emptySet(), BigFormat.latestVersion);
+            ColumnIndex columnIndex = BigTableRowIndexEntryTest.ColumnIndex.writeAndBuildIndex(makeRowIter(staticRow, partitionKey, clusteringIter2, dataWriterOld),
+                                                                                               dataWriterOld, header, Collections.emptySet(), BigFormat.latestVersion);
             rieOld = Pre_C_11206_RowIndexEntry.create(startPosition, deletionInfo, columnIndex);
             oldSerializer.serialize(rieOld, oldOutput);
             rieOldSerialized = oldOutput.buffer().duplicate();
@@ -389,7 +397,7 @@ private ColumnIndex close() throws IOException
 
                 // It's possible we add no rows, just a top level deletion
                 if (written == 0)
-                    return RowIndexEntryTest.ColumnIndex.EMPTY;
+                    return BigTableRowIndexEntryTest.ColumnIndex.EMPTY;
 
                 // the last column may have fallen on an index boundary already.  if not, index it explicitly.
                 if (firstClustering != null)
@@ -427,7 +435,7 @@ public void testSerializedSize() throws Throwable
         File tempFile = FileUtils.createTempFile("row_index_entry_test", null);
         tempFile.deleteOnExit();
         SequentialWriter writer = new SequentialWriter(tempFile);
-        ColumnIndex columnIndex = RowIndexEntryTest.ColumnIndex.writeAndBuildIndex(partition.unfilteredIterator(), writer, header, Collections.emptySet(), BigFormat.latestVersion);
+        ColumnIndex columnIndex = BigTableRowIndexEntryTest.ColumnIndex.writeAndBuildIndex(partition.unfilteredIterator(), writer, header, Collections.emptySet(), BigFormat.latestVersion);
         Pre_C_11206_RowIndexEntry withIndex = Pre_C_11206_RowIndexEntry.create(0xdeadbeef, DeletionTime.LIVE, columnIndex);
         IndexInfo.Serializer indexSerializer = IndexInfo.serializer(BigFormat.latestVersion, header);
 
@@ -774,7 +782,7 @@ public void testIndexFor() throws IOException
         indexes.add(new IndexInfo(cn(10L), cn(15L), 0, 0, deletionInfo));
         indexes.add(new IndexInfo(cn(20L), cn(25L), 0, 0, deletionInfo));
 
-        RowIndexEntry rie = new RowIndexEntry(0L)
+        BigTableRowIndexEntry rie = new BigTableRowIndexEntry(0L)
         {
             public IndexInfoRetriever openWithIndex(FileHandle indexFile)
             {
@@ -797,7 +805,7 @@ public int columnsIndexCount()
             }
         };
         
-        AbstractSSTableIterator.IndexState indexState = new AbstractSSTableIterator.IndexState(
+        IndexState indexState = new IndexState(
             null, comp, rie, false, null                                                                                              
         );
         
@@ -811,7 +819,7 @@ public int columnsIndexCount()
         assertEquals(3, indexState.indexFor(cn(100L), 2));
         assertEquals(3, indexState.indexFor(cn(100L), 3));
 
-        indexState = new AbstractSSTableIterator.IndexState(
+        indexState = new IndexState(
             null, comp, rie, true, null
         );
 
diff --git a/test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
similarity index 93%
rename from test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java
rename to test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
index 9040f1197cfa..58185782f8ec 100644
--- a/test/unit/org/apache/cassandra/db/columniterator/SSTableReverseIteratorTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.db.columniterator;
+package org.apache.cassandra.io.sstable.format.columniterator;
 
 import java.nio.ByteBuffer;
 import java.util.Random;
@@ -33,7 +33,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.KeyspaceParams;
@@ -84,7 +84,7 @@ public void emptyBlockTolerance()
         tbl.forceBlockingFlush();
         SSTableReader sstable = Iterables.getOnlyElement(tbl.getLiveSSTables());
         DecoratedKey dk = tbl.getPartitioner().decorateKey(Int32Type.instance.decompose(key));
-        RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
+        BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
         Assert.assertTrue(indexEntry.isIndexed());
         Assert.assertTrue(indexEntry.columnsIndexCount() > 2);
 
diff --git a/update-history/STAR-801/72-3aba2f97e3 STAR-15: Allow for other implementations of SSTableFormat b/update-history/STAR-801/72-3aba2f97e3 STAR-15: Allow for other implementations of SSTableFormat
new file mode 100644
index 000000000000..7ac4f0b46925
--- /dev/null
+++ b/update-history/STAR-801/72-3aba2f97e3 STAR-15: Allow for other implementations of SSTableFormat	
@@ -0,0 +1,212 @@
+--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
++++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+@@ -168,28 +168,7 @@
+         try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, false, sstable.maxDataAge);
+              Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable)))
+         {
+-<<<<<<<
+             assert !indexAvailable() || indexIterator.dataPosition() == 0 : indexIterator.dataPosition();
+-=======
+-            try
+-            {
+-                nextIndexKey = indexAvailable() ? ByteBufferUtil.readWithShortLength(indexFile) : null;
+-                if (indexAvailable())
+-                {
+-                    // throw away variable so we don't have a side effect in the assert
+-                    long firstRowPositionFromIndex = rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+-                    assert firstRowPositionFromIndex == 0 : firstRowPositionFromIndex;
+-                }
+-            }
+-            catch (Throwable ex)
+-            {
+-                throwIfFatal(ex);
+-                nextIndexKey = null;
+-                nextRowPositionFromIndex = dataFile.length();
+-                if (indexFile != null)
+-                    indexFile.seek(indexFile.length());
+-            }
+->>>>>>>
+ 
+             StatsMetadata metadata = sstable.getSSTableMetadata();
+             writer.switchWriter(CompactionManager.createWriter(cfs, destination, expectedBloomFilterSize, metadata.repairedAt, metadata.pendingRepair, metadata.isTransient, sstable, transaction));
+@@ -215,7 +194,6 @@
+                     // check for null key below
+                 }
+ 
+-<<<<<<<
+                 long dataStart = dataFile.getFilePointer();
+ 
+                 long dataStartFromIndex = -1;
+@@ -226,24 +204,8 @@
+                     dataStartFromIndex = indexIterator.dataPosition() + TypeSizes.SHORT_SIZE + currentIndexKey.remaining();
+                     if (advanceIndexNoThrow())
+                         dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex;
+-=======
+-                long dataStartFromIndex = -1;
+-                long dataSizeFromIndex = -1;
+-
+-                updateIndexKey();
+-
+-                if (indexAvailable())
+-                {
+-                    if (currentIndexKey != null)
+-                    {
+-                        dataStartFromIndex = currentRowPositionFromIndex + 2 + currentIndexKey.remaining();
+-                        dataSizeFromIndex = nextRowPositionFromIndex - dataStartFromIndex;
+-                    }
+->>>>>>>
+                 }
+ 
+-                long dataStart = dataFile.getFilePointer();
+-
+                 // avoid an NPE if key is null
+                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
+                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex)));
+@@ -404,29 +366,14 @@
+     {
+         try
+         {
+-<<<<<<<
+-            nextIndexKey = !indexAvailable() ? null : ByteBufferUtil.readWithShortLength(indexFile);
+-
+-            nextRowPositionFromIndex = !indexAvailable()
+-                                       ? dataFile.length()
+-                                       : rowIndexEntrySerializer.deserializePositionAndSkip(indexFile);
+-=======
+             return indexAvailable() && indexIterator.advance();
+->>>>>>>
+         }
+         catch (Throwable th)
+         {
+             JVMStabilityInspector.inspectThrowable(th);
+             outputHandler.warn("Error reading index file", th);
+-<<<<<<<
+             indexIterator.close();
+             return false;
+-=======
+-            nextIndexKey = null;
+-            nextRowPositionFromIndex = dataFile.length();
+-            if (indexFile != null)
+-                indexFile.seek(indexFile.length());
+->>>>>>>
+         }
+     }
+ 
+--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+@@ -1671,9 +1671,6 @@
+         return bloomFilterTracker.getRecentTruePositiveCount();
+     }
+ 
+-<<<<<<<
+-    public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
+-=======
+     public long getBloomFilterTrueNegativeCount()
+     {
+         return bloomFilterTracker.getTrueNegativeCount();
+@@ -1684,8 +1681,7 @@
+         return bloomFilterTracker.getRecentTrueNegativeCount();
+     }
+ 
+-    public InstrumentingCache<KeyCacheKey, RowIndexEntry> getKeyCache()
+->>>>>>>
++    public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
+     {
+         return keyCache;
+     }
+--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+@@ -24,11 +24,7 @@
+ import org.apache.cassandra.db.DecoratedKey;
+ import org.apache.cassandra.db.SerializationHeader;
+ import org.apache.cassandra.io.sstable.*;
+-<<<<<<<
+-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+-=======
+ import org.apache.cassandra.io.sstable.metadata.MetadataType;
+->>>>>>>
+ import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+ import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+ import org.apache.cassandra.io.util.DiskOptimizationStrategy;
+@@ -45,7 +41,6 @@
+ import java.io.DataInputStream;
+ import java.io.File;
+ import java.io.IOException;
+-import java.nio.ByteBuffer;
+ import java.nio.file.Files;
+ import java.nio.file.Path;
+ import java.nio.file.Paths;
+@@ -96,47 +91,20 @@
+ 
+     public abstract SSTableReader build();
+ 
+-<<<<<<<
+-=======
+-    public SSTableReaderBuilder dfile(FileHandle dfile)
+-    {
+-        this.dfile = dfile;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder ifile(FileHandle ifile)
+-    {
+-        this.ifile = ifile;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder bf(IFilter bf)
+-    {
+-        this.bf = bf;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder summary(IndexSummary summary)
+-    {
+-        this.summary = summary;
+-        return this;
+-    }
+-
+     public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor)
+     {
+         return new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
+-               .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
+-               .withChunkCache(ChunkCache.instance);
++                .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
++                .withChunkCache(ChunkCache.instance);
+     }
+ 
+     public static FileHandle.Builder defaultDataHandleBuilder(Descriptor descriptor)
+     {
+         return new FileHandle.Builder(descriptor.filenameFor(Component.DATA))
+-               .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap)
+-               .withChunkCache(ChunkCache.instance);
++                .mmapped(DatabaseDescriptor.getDiskAccessMode() == Config.DiskAccessMode.mmap)
++                .withChunkCache(ChunkCache.instance);
+     }
+ 
+->>>>>>>
+     /**
+      * Load index summary, first key and last key from Summary.db file if it exists.
+      *
+diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+index 26133dcb13..b0d601937a 100644
+--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
++++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+@@ -206,8 +206,6 @@ public class Scrubber implements Closeable
+                         dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex;
+                 }
+ 
+-                long dataStart = dataFile.getFilePointer();
+-
+                 // avoid an NPE if key is null
+                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
+                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex)));
+@@ -376,8 +374,6 @@ public class Scrubber implements Closeable
+             outputHandler.warn("Error reading index file", th);
+             indexIterator.close();
+             return false;
+-            if (indexFile != null)
+-                indexFile.seek(indexFile.length());
+         }
+     }
+ 

From 4b0e2387e03f1b8f6797f0f6c1a22fd1cb7373c5 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 16 Dec 2020 14:49:52 +0200
Subject: [PATCH 032/151] STAR-17: Implements inverse byte-comparable
 translation

Also adds ValueAccessor support, fixes minor TupleType and DynamicCompositeType problems and adds documentation.

patch by Branimir Lambov; reviewed by Dimitar Dimitrov and Jacek Lewandowski

(cherry picked from commit b54130cec96961330ee83cc52b6e6f439f466fa4)
(cherry picked from commit cc13730c2131b6be9b2b982b878f1bba31de2419)
---
 .../org/apache/cassandra/cql3/Tuples.java     |    7 +-
 .../org/apache/cassandra/cql3/UserTypes.java  |    2 +-
 .../cql3/conditions/ColumnCondition.java      |   10 +-
 .../cql3/selection/FieldSelector.java         |    3 +-
 .../cassandra/db/BufferDecoratedKey.java      |   26 +
 .../cassandra/db/ClusteringComparator.java    |  178 ++-
 .../apache/cassandra/db/ClusteringPrefix.java |    6 +-
 .../org/apache/cassandra/db/DecoratedKey.java |   34 +-
 .../cassandra/db/NativeDecoratedKey.java      |   62 +
 .../cassandra/db/PartitionPosition.java       |    4 +-
 .../cassandra/db/marshal/AbstractType.java    |   44 +-
 .../cassandra/db/marshal/BooleanType.java     |   19 +-
 .../db/marshal/ByteArrayAccessor.java         |    7 +
 .../db/marshal/ByteArrayObjectFactory.java    |   35 +-
 .../db/marshal/ByteBufferAccessor.java        |    7 +
 .../db/marshal/ByteBufferObjectFactory.java   |   24 +-
 .../apache/cassandra/db/marshal/ByteType.java |   20 +-
 .../cassandra/db/marshal/CompositeType.java   |   86 +-
 .../apache/cassandra/db/marshal/DateType.java |   15 +-
 .../cassandra/db/marshal/DecimalType.java     |  133 ++-
 .../cassandra/db/marshal/DoubleType.java      |   15 +-
 .../db/marshal/DynamicCompositeType.java      |  149 ++-
 .../cassandra/db/marshal/EmptyType.java       |   12 +-
 .../cassandra/db/marshal/FloatType.java       |   15 +-
 .../cassandra/db/marshal/Int32Type.java       |   15 +-
 .../cassandra/db/marshal/IntegerType.java     |  114 +-
 .../cassandra/db/marshal/LexicalUUIDType.java |   29 +-
 .../apache/cassandra/db/marshal/ListType.java |   55 +-
 .../apache/cassandra/db/marshal/LongType.java |   15 +-
 .../apache/cassandra/db/marshal/MapType.java  |   66 +-
 .../db/marshal/PartitionerDefinedOrder.java   |   22 +-
 .../cassandra/db/marshal/ReversedType.java    |   14 +-
 .../apache/cassandra/db/marshal/SetType.java  |   14 +-
 .../cassandra/db/marshal/ShortType.java       |   18 +-
 .../cassandra/db/marshal/SimpleDateType.java  |   20 +-
 .../apache/cassandra/db/marshal/TimeType.java |   20 +-
 .../cassandra/db/marshal/TimeUUIDType.java    |   67 +-
 .../cassandra/db/marshal/TimestampType.java   |   15 +-
 .../cassandra/db/marshal/TupleType.java       |   78 +-
 .../apache/cassandra/db/marshal/UUIDType.java |   55 +-
 .../apache/cassandra/db/marshal/UserType.java |    2 +-
 .../cassandra/db/marshal/ValueAccessor.java   |    8 +-
 .../cassandra/dht/ByteOrderedPartitioner.java |   11 +-
 .../cassandra/dht/LocalPartitioner.java       |   13 +-
 .../cassandra/dht/Murmur3Partitioner.java     |   11 +-
 .../dht/OrderPreservingPartitioner.java       |   10 +-
 .../cassandra/dht/RandomPartitioner.java      |   13 +-
 src/java/org/apache/cassandra/dht/Token.java  |   36 +-
 .../sstable/format/big/SSTableIterator.java   |    1 +
 .../format/big/SSTableReversedIterator.java   |    1 +
 .../serializers/BooleanSerializer.java        |    4 +-
 .../{ => bytecomparable}/ByteComparable.java  |   10 +-
 .../utils/bytecomparable/ByteComparable.md    |  590 ++++++++++
 .../{ => bytecomparable}/ByteSource.java      |  262 ++---
 .../bytecomparable/ByteSourceInverse.java     |  448 ++++++++
 .../AbstractTypeByteSourceDecodingBench.java  |  140 +++
 .../validation/entities/TupleTypeTest.java    |    5 +-
 .../validation/entities/UserTypesTest.java    |   50 +
 .../db/marshal/TypeValidationTest.java        |    2 +-
 .../cassandra/dht/KeyCollisionTest.java       |    4 +-
 .../cassandra/dht/LengthPartitioner.java      |    7 +
 .../cassandra/transport/SerDeserTest.java     |    2 +-
 .../AbstractTypeByteSourceTest.java           | 1018 +++++++++++++++++
 .../ByteSourceComparisonTest.java}            |  272 ++---
 .../ByteSourceConversionTest.java             |  718 ++++++++++++
 .../bytecomparable/ByteSourceInverseTest.java |  321 ++++++
 .../ByteSourceSequenceTest.java               |  781 +++++++++++++
 .../bytecomparable/ByteSourceTestBase.java    |  255 +++++
 .../DecoratedKeyByteSourceTest.java           |   83 ++
 69 files changed, 5977 insertions(+), 631 deletions(-)
 rename src/java/org/apache/cassandra/utils/{ => bytecomparable}/ByteComparable.java (94%)
 create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md
 rename src/java/org/apache/cassandra/utils/{ => bytecomparable}/ByteSource.java (79%)
 create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java
 rename test/unit/org/apache/cassandra/utils/{ByteSourceTest.java => bytecomparable/ByteSourceComparisonTest.java} (73%)
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java
 create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java

diff --git a/src/java/org/apache/cassandra/cql3/Tuples.java b/src/java/org/apache/cassandra/cql3/Tuples.java
index b8acd5954af4..6e028c274d31 100644
--- a/src/java/org/apache/cassandra/cql3/Tuples.java
+++ b/src/java/org/apache/cassandra/cql3/Tuples.java
@@ -154,14 +154,14 @@ public Value(ByteBuffer[] elements)
 
         public static Value fromSerialized(ByteBuffer bytes, TupleType type)
         {
-            ByteBuffer[] values = type.split(bytes);
+            ByteBuffer[] values = type.split(ByteBufferAccessor.instance, bytes);
             if (values.length > type.size())
             {
                 throw new InvalidRequestException(String.format(
                         "Tuple value contained too many fields (expected %s, got %s)", type.size(), values.length));
             }
 
-            return new Value(type.split(bytes));
+            return new Value(type.split(ByteBufferAccessor.instance, bytes));
         }
 
         public ByteBuffer get(ProtocolVersion protocolVersion)
@@ -272,7 +272,8 @@ public static InValue fromSerialized(ByteBuffer value, ListType type, QueryOptio
                 // type.split(bytes)
                 List<List<ByteBuffer>> elements = new ArrayList<>(l.size());
                 for (Object element : l)
-                    elements.add(Arrays.asList(tupleType.split(type.getElementsType().decompose(element))));
+                    elements.add(Arrays.asList(tupleType.split(ByteBufferAccessor.instance,
+                                                               type.getElementsType().decompose(element))));
                 return new InValue(elements);
             }
             catch (MarshalException e)
diff --git a/src/java/org/apache/cassandra/cql3/UserTypes.java b/src/java/org/apache/cassandra/cql3/UserTypes.java
index b023a8a0b8f6..a63420fca3cd 100644
--- a/src/java/org/apache/cassandra/cql3/UserTypes.java
+++ b/src/java/org/apache/cassandra/cql3/UserTypes.java
@@ -217,7 +217,7 @@ public Value(UserType type, ByteBuffer[] elements)
         public static Value fromSerialized(ByteBuffer bytes, UserType type)
         {
             type.validate(bytes);
-            return new Value(type, type.split(bytes));
+            return new Value(type, type.split(ByteBufferAccessor.instance, bytes));
         }
 
         public ByteBuffer get(ProtocolVersion protocolVersion)
diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
index 93ed6ae941bd..8e34f6ecc2a3 100644
--- a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
+++ b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java
@@ -612,10 +612,16 @@ private ByteBuffer rowValue(Row row)
                 return cell == null ? null : cell.buffer();
             }
 
-            Cell<?> cell = getCell(row, column);
+            // getCell returns Cell<?>; delegating to a generic method captures the wildcard as a concrete type V.
+            return getCellBuffer(getCell(row, column), userType);
+        }
+
+        private <V> ByteBuffer getCellBuffer(Cell<V> cell, UserType userType)
+        {
             return cell == null
                       ? null
-                      : userType.split(cell.buffer())[userType.fieldPosition(field)];
+                      : ByteBufferAccessor.instance.convert(userType.split(cell.accessor(), cell.value())[userType.fieldPosition(field)],
+                                                            cell.accessor());
         }
 
         private boolean isSatisfiedBy(ByteBuffer rowValue)
diff --git a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
index c67fc0330e95..403d3bed2e20 100644
--- a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
@@ -23,6 +23,7 @@
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.transport.ProtocolVersion;
@@ -89,7 +90,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) throws InvalidReque
         ByteBuffer value = selected.getOutput(protocolVersion);
         if (value == null)
             return null;
-        ByteBuffer[] buffers = type.split(value);
+        ByteBuffer[] buffers = type.split(ByteBufferAccessor.instance, value);
         return field < buffers.length ? buffers[field] : null;
     }
 
diff --git a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
index d375162240d7..ae3e9d44e08a 100644
--- a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
@@ -19,7 +19,9 @@
 
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
 public class BufferDecoratedKey extends DecoratedKey
 {
@@ -36,4 +38,28 @@ public ByteBuffer getKey()
     {
         return key;
     }
+
+    /**
+     * A factory method that translates the given byte-comparable representation to a {@link BufferDecoratedKey}
+     * instance. If the given byte comparable doesn't represent the encoding of a buffer decorated key, anything from a
+     * wide variety of throwables may be thrown (e.g. {@link AssertionError}, {@link IndexOutOfBoundsException},
+     * {@link IllegalStateException}, etc.).
+     *
+     * @param byteComparable A byte-comparable representation (presumably of a {@link BufferDecoratedKey} instance).
+     * @param version The encoding version used for the given byte comparable.
+     * @param partitioner The partitioner of the encoded decorated key. Needed in order to correctly decode the token
+     *                    bytes of the key.
+     * @return A new {@link BufferDecoratedKey} instance, corresponding to the given byte-comparable representation. If
+     * we were to call {@link #asComparableBytes(Version)} on the returned object, we should get a {@link ByteSource}
+     * equal to the one of the input byte comparable.
+     */
+    public static BufferDecoratedKey fromByteComparable(ByteComparable byteComparable,
+                                                        Version version,
+                                                        IPartitioner partitioner)
+    {
+        return DecoratedKey.fromByteComparable(byteComparable,
+                                               version,
+                                               partitioner,
+                                               (token, keyBytes) -> new BufferDecoratedKey(token, ByteBuffer.wrap(keyBytes)));
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java
index e5b63aa6a27a..6d67e6bc9590 100644
--- a/src/java/org/apache/cassandra/db/ClusteringComparator.java
+++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.db;
 
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 import java.util.Comparator;
 import java.util.List;
 import java.util.Objects;
@@ -31,8 +32,14 @@
 import org.apache.cassandra.serializers.MarshalException;
 
 import org.apache.cassandra.io.sstable.format.big.IndexInfo;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+import static org.apache.cassandra.utils.bytecomparable.ByteSource.EXCLUDED;
+import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT;
+import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT_NULL;
+import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT_NULL_REVERSED;
+import static org.apache.cassandra.utils.bytecomparable.ByteSource.TERMINATOR;
 
 /**
  * A comparator of clustering prefixes (or more generally of {@link Clusterable}}.
@@ -302,7 +309,7 @@ public int next()
                     if (srcnum == sz)
                         return src.kind().asByteComparableValue(version);
 
-                    current = subtype(srcnum).asComparableBytes(src.accessor().toBuffer(src.get(srcnum)), version);
+                    current = subtype(srcnum).asComparableBytes(src.accessor(), src.get(srcnum), version);
                     if (current == null)
                         return subtype(srcnum).isReversed() ? NEXT_COMPONENT_NULL_REVERSED : NEXT_COMPONENT_NULL;
 
@@ -317,6 +324,171 @@ public String toString()
         }
     }
 
+    /**
+     * Produces a clustering from the given byte-comparable value. The method will throw an exception if the value
+     * does not correctly encode a clustering of this type, including if it encodes a position before or after a
+     * clustering (i.e. a bound/boundary).
+     *
+     * @param accessor Accessor to use to construct components. Because this will be used to construct individual
+     *                 arrays/buffers for each component, it may be sensible to use an accessor that allocates larger
+     *                 buffers in advance.
+     * @param comparable The clustering encoded as a byte-comparable sequence.
+     */
+    public <V> Clustering<V> clusteringFromByteComparable(ValueAccessor<V> accessor,
+                                                          ByteComparable comparable)
+    {
+        ByteComparable.Version version = ByteComparable.Version.OSS41;
+        ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version));
+        if (orderedBytes == null)
+            return null;
+
+        // First check for special cases (partition key only, static clustering) that can do without buffers.
+        int sep = orderedBytes.next();
+        switch (sep)
+        {
+        case TERMINATOR:
+            assert size() == 0 : "Terminator should be after " + size() + " components, got 0";
+            return accessor.factory().clustering();
+        case EXCLUDED:
+            return accessor.factory().staticClustering();
+        default:
+            // continue with processing
+        }
+
+        int cc = 0;
+        V[] components = accessor.createArray(size());
+
+        while (true)
+        {
+            switch (sep)
+            {
+            case NEXT_COMPONENT_NULL:
+            case NEXT_COMPONENT_NULL_REVERSED:
+                components[cc] = accessor.empty();
+                break;
+            case NEXT_COMPONENT:
+                components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                break;
+            case TERMINATOR:
+                assert cc == size() : "Terminator should be after " + size() + " components, got " + cc;
+                return accessor.factory().clustering(components);
+            case EXCLUDED:
+                throw new AssertionError("Unexpected static terminator after the first component");
+            default:
+                throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in Clustering encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
+    /**
+     * Produces a clustering bound from the given byte-comparable value. The method will throw an exception if the value
+     * does not correctly encode a bound position of this type, including if it encodes an exact clustering.
+     *
+     * Note that the encoded clustering position cannot specify the type of bound (i.e. start/end/boundary) because to
+     * correctly compare clustering positions the encoding must be the same for the different types (e.g. the position
+     * for an exclusive end and an inclusive start is the same, before the exact clustering). The type must be supplied
+     * separately (in the bound... vs boundary... call and isEnd argument).
+     *
+     * @param accessor Accessor to use to construct components. Because this will be used to construct individual
+     *                 arrays/buffers for each component, it may be sensible to use an accessor that allocates larger
+     *                 buffers in advance.
+     * @param comparable The clustering position encoded as a byte-comparable sequence.
+     * @param isEnd true if the bound marks the end of a range, false if it marks the start.
+     */
+    public <V> ClusteringBound<V> boundFromByteComparable(ValueAccessor<V> accessor,
+                                                          ByteComparable comparable,
+                                                          boolean isEnd)
+    {
+        ByteComparable.Version version = ByteComparable.Version.OSS41;
+        ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version));
+        if (orderedBytes == null)
+            return null;
+
+        int sep = orderedBytes.next();
+        int cc = 0;
+        V[] components = accessor.createArray(size());
+
+        while (true)
+        {
+            switch (sep)
+            {
+            case NEXT_COMPONENT_NULL:
+            case NEXT_COMPONENT_NULL_REVERSED:
+                components[cc] = accessor.empty();
+                break;
+            case NEXT_COMPONENT:
+                components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                break;
+            case ByteSource.LT_NEXT_COMPONENT:
+                return accessor.factory().bound(isEnd ? ClusteringPrefix.Kind.EXCL_END_BOUND
+                                                      : ClusteringPrefix.Kind.INCL_START_BOUND,
+                                                Arrays.copyOf(components, cc));
+            case ByteSource.GT_NEXT_COMPONENT:
+                return accessor.factory().bound(isEnd ? ClusteringPrefix.Kind.INCL_END_BOUND
+                                                      : ClusteringPrefix.Kind.EXCL_START_BOUND,
+                                                Arrays.copyOf(components, cc));
+            default:
+                throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in ClusteringBound encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
+    /**
+     * Produces a clustering boundary from the given byte-comparable value. The method will throw an exception if the
+     * value does not correctly encode a bound position of this type, including if it encodes an exact clustering.
+     *
+     * Note that the encoded clustering position cannot specify the type of bound (i.e. start/end/boundary) because to
+     * correctly compare clustering positions the encoding must be the same for the different types (e.g. the position
+     * for an exclusive end and an inclusive start is the same, before the exact clustering). The type must be supplied
+     * separately (in the bound... vs boundary... call and isEnd argument).
+     *
+     * @param accessor Accessor to use to construct components. Because this will be used to construct individual
+     *                 arrays/buffers for each component, it may be sensible to use an accessor that allocates larger
+     *                 buffers in advance.
+     * @param comparable The clustering position encoded as a byte-comparable sequence.
+     */
+    public <V> ClusteringBoundary<V> boundaryFromByteComparable(ValueAccessor<V> accessor,
+                                                                ByteComparable comparable)
+    {
+        ByteComparable.Version version = ByteComparable.Version.OSS41;
+        ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version));
+        if (orderedBytes == null)
+            return null;
+
+        // Read the first separator, then decode components until an LT/GT_NEXT_COMPONENT separator ends the prefix.
+        int sep = orderedBytes.next();
+        int cc = 0;
+        V[] components = accessor.createArray(size());
+
+        while (true)
+        {
+            switch (sep)
+            {
+            case NEXT_COMPONENT_NULL:
+            case NEXT_COMPONENT_NULL_REVERSED:
+                components[cc] = accessor.empty();
+                break;
+            case NEXT_COMPONENT:
+                components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                break;
+            case ByteSource.LT_NEXT_COMPONENT:
+                return accessor.factory().boundary(ClusteringPrefix.Kind.EXCL_END_INCL_START_BOUNDARY,
+                                                   Arrays.copyOf(components, cc));
+            case ByteSource.GT_NEXT_COMPONENT:
+                return accessor.factory().boundary(ClusteringPrefix.Kind.INCL_END_EXCL_START_BOUNDARY,
+                                                   Arrays.copyOf(components, cc));
+            default:
+                throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in ClusteringBoundary encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
     /**
      * A comparator for rows.
      *
diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
index 3cf814c6cfc7..c7a2782ecef3 100644
--- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
@@ -35,8 +35,8 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteArrayUtil;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 /**
  * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare.
@@ -70,7 +70,7 @@ public enum Kind
         EXCL_END_INCL_START_BOUNDARY(0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
         STATIC_CLUSTERING           (1, -1, v -> v == Version.LEGACY
                                                  ? ByteSource.LT_NEXT_COMPONENT + 1
-                                                 : ByteSource.TERMINATOR - 1),
+                                                 : ByteSource.EXCLUDED),
         CLUSTERING                  (2,  0, v -> v == Version.LEGACY
                                                  ? ByteSource.NEXT_COMPONENT
                                                  : ByteSource.TERMINATOR),
diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java
index 70ca5d1a6b65..b21df8f17d33 100644
--- a/src/java/org/apache/cassandra/db/DecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/DecoratedKey.java
@@ -19,12 +19,15 @@
 
 import java.nio.ByteBuffer;
 import java.util.Comparator;
+import java.util.function.BiFunction;
 
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.dht.Token.KeyBound;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.IFilter.FilterKey;
 import org.apache.cassandra.utils.MurmurHash;
 
@@ -154,4 +157,33 @@ public void filterHash(long[] dest)
         ByteBuffer key = getKey();
         MurmurHash.hash3_x64_128(key, key.position(), key.remaining(), 0, dest);
     }
+
+    /**
+     * A template factory method for creating decorated keys from their byte-comparable representation.
+     */
+    static <T extends DecoratedKey> T fromByteComparable(ByteComparable byteComparable,
+                                                         Version version,
+                                                         IPartitioner partitioner,
+                                                         BiFunction<Token, byte[], T> decoratedKeyFactory)
+    {
+        ByteSource.Peekable peekable = byteComparable.asPeekableBytes(version);
+        // Decode the token from the first component of the multi-component sequence representing the whole decorated key.
+        Token token = partitioner.getTokenFactory().fromComparableBytes(ByteSourceInverse.nextComponentSource(peekable), version);
+        // Decode the key bytes from the second component.
+        byte[] keyBytes = ByteSourceInverse.getUnescapedBytes(ByteSourceInverse.nextComponentSource(peekable));
+        // Instantiate a decorated key from the decoded token and key bytes, using the provided factory method.
+        return decoratedKeyFactory.apply(token, keyBytes);
+    }
+
+    public static byte[] keyFromByteComparable(ByteComparable byteComparable,
+                                               Version version,
+                                               IPartitioner partitioner)
+    {
+        ByteSource.Peekable peekable = byteComparable.asPeekableBytes(version);
+        // Decode the token from the first component of the multi-component sequence representing the whole decorated key.
+        // We won't use it, but the decoding also positions the byte source after it.
+        partitioner.getTokenFactory().fromComparableBytes(ByteSourceInverse.nextComponentSource(peekable), version);
+        // Decode the key bytes from the second component.
+        return ByteSourceInverse.getUnescapedBytes(ByteSourceInverse.nextComponentSource(peekable));
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
index add52189776c..e5517ad48c6a 100644
--- a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
@@ -20,7 +20,11 @@
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 
+import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.memory.MemoryUtil;
 import org.apache.cassandra.utils.memory.NativeAllocator;
@@ -41,8 +45,66 @@ public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group
         MemoryUtil.setBytes(peer + 4, key);
     }
 
+    public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group writeOp, byte[] keyBytes)
+    {
+        super(token);
+        assert keyBytes != null;
+
+        int size = keyBytes.length;
+        this.peer = allocator.allocate(4 + size, writeOp);
+        MemoryUtil.setInt(peer, size);
+        MemoryUtil.setBytes(peer + 4, keyBytes, 0, size);
+    }
+
+    @Inline
+    int length()
+    {
+        return MemoryUtil.getInt(peer);
+    }
+
+    @Inline
+    long address()
+    {
+        return this.peer + 4;
+    }
+
+    @Override
     public ByteBuffer getKey()
     {
         return MemoryUtil.getByteBuffer(peer + 4, MemoryUtil.getInt(peer), ByteOrder.BIG_ENDIAN);
     }
+
+    @Override
+    protected ByteSource keyComparableBytes(Version version)
+    {
+        return ByteSource.of(address(), length(), version);
+    }
+
+    /**
+     * A factory method that translates the given byte-comparable representation to a {@link NativeDecoratedKey}
+     * instance. If the given byte comparable doesn't represent the encoding of a native decorated key, anything from a
+     * wide variety of throwables may be thrown (e.g. {@link AssertionError}, {@link IndexOutOfBoundsException},
+     * {@link IllegalStateException}, etc.).
+     *
+     * @param byteComparable A byte-comparable representation (presumably of a {@link NativeDecoratedKey} instance).
+     * @param version The encoding version used for the given byte comparable.
+     * @param partitioner The partitioner of the encoded decorated key. Needed in order to correctly decode the token
+     *                    bytes of the key.
+     * @param allocator The native allocator needed to copy the key contents to off-heap memory.
+     *
+     * @return A new {@link NativeDecoratedKey} instance, corresponding to the given byte-comparable representation. If
+     * we were to call {@link #asComparableBytes(Version)} on the returned object, we should get a {@link ByteSource}
+     * equal to the one of the input byte comparable.
+     */
+    public static NativeDecoratedKey fromByteComparable(ByteComparable byteComparable,
+                                                        ByteComparable.Version version,
+                                                        IPartitioner partitioner,
+                                                        NativeAllocator allocator,
+                                                        OpOrder.Group opGroup)
+    {
+        return DecoratedKey.fromByteComparable(byteComparable,
+                                               version,
+                                               partitioner,
+                                               (token, keyBytes) -> new NativeDecoratedKey(token, allocator, opGroup, keyBytes));
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java
index 578b109a835c..661dc77d0f26 100644
--- a/src/java/org/apache/cassandra/db/PartitionPosition.java
+++ b/src/java/org/apache/cassandra/db/PartitionPosition.java
@@ -24,8 +24,8 @@
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public interface PartitionPosition extends RingPosition<PartitionPosition>, ByteComparable
 {
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index edd4de79e9d8..bc6918118549 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
@@ -39,11 +39,11 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.github.jamm.Unmetered;
 
-import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.BYTE_ORDER;
 import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.CUSTOM;
 
 /**
@@ -608,19 +608,51 @@ public AssignmentTestable.TestResult testAssignment(AbstractType<?> receiverType
      * Depending on the type, this method can be called for null or empty input, in which case the output is allowed to
      * be null (the clustering/tuple encoding will accept and handle it).
      */
-    public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V value, ByteComparable.Version version)
     {
-        if (comparisonType == BYTE_ORDER)
+        if (isByteOrderComparable)
         {
             // When a type is byte-ordered on its own, we only need to escape it, so that we can include it in
             // multi-component types and make the encoding weakly-prefix-free.
-            return ByteSource.of(byteBuffer, version);
+            return ByteSource.of(accessor, value, version);
         }
         else
             // default is only good for byte-comparables
             throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement asComparableBytes");
     }
 
+    public final ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+    {
+        return asComparableBytes(ByteBufferAccessor.instance, byteBuffer, version);
+    }
+
+    /**
+     * Translates the given byte-ordered representation to the common, non-byte-ordered binary representation of a
+     * payload for this abstract type (the latter, common binary representation is what we mostly work with in the
+     * storage engine internals). If the given bytes don't correspond to the encoding of some payload value for this
+     * abstract type, an {@link IllegalArgumentException} may be thrown.
+     *
+     * @param accessor value accessor used to construct the value.
+     * @param comparableBytes A byte-ordered representation (presumably of a payload for this abstract type).
+     * @param version The byte-comparable version used to construct the representation.
+     * @return The binary representation of a payload for this abstract type, corresponding to the given byte-ordered
+     *         representation, constructed using the supplied value accessor.
+     *
+     * @see #asComparableBytes
+     */
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        if (isByteOrderComparable)
+            return accessor.valueOf(ByteSourceInverse.getUnescapedBytes(comparableBytes));
+        else
+            throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement fromComparableBytes");
+    }
+
+    public final ByteBuffer fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version);
+    }
+
     /**
      * This must be overriden by subclasses if necessary so that for any
      * AbstractType, this == TypeParser.parse(toString()).
diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
index fff72203f657..ef8d4fcf7b85 100644
--- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java
+++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java
@@ -26,8 +26,8 @@
 import org.apache.cassandra.serializers.BooleanSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -57,16 +57,25 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        if (!buf.hasRemaining())
+        if (accessor.isEmpty(data))
             return null;
-        byte b = buf.get(buf.position());
+        byte b = accessor.toByte(data);
         if (b != 0)
             b = 1;
         return ByteSource.oneByte(b);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        if (comparableBytes == null)
+            return accessor.empty();
+        int b = comparableBytes.next();
+        return b == 1 ? accessor.valueOf(true) : accessor.valueOf(false);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
 
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java
index 7d13844eb0ca..b13bb1596d95 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java
@@ -234,6 +234,13 @@ public UUID toUUID(byte[] value)
         return new UUID(getLong(value, 0), getLong(value, 8));
     }
 
+    @Override
+    public int putByte(byte[] dst, int offset, byte value)
+    {
+        dst[offset] = value;
+        return TypeSizes.BYTE_SIZE;
+    }
+
     @Override
     public int putShort(byte[] dst, int offset, short value)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
index ea9bf113833b..9b477aeeeace 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.db.marshal;
 
+import org.apache.cassandra.db.AbstractArrayClusteringPrefix;
 import org.apache.cassandra.db.ArrayClustering;
 import org.apache.cassandra.db.ArrayClusteringBound;
 import org.apache.cassandra.db.ArrayClusteringBoundary;
@@ -33,7 +34,7 @@
 
 class ByteArrayObjectFactory implements ValueAccessor.ObjectFactory<byte[]>
 {
-    private static final Clustering<byte[]> EMPTY_CLUSTERING = new ArrayClustering()
+    private static final Clustering<byte[]> EMPTY_CLUSTERING = new ArrayClustering(AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY)
     {
         public String toString(TableMetadata metadata)
         {
@@ -41,14 +42,37 @@ public String toString(TableMetadata metadata)
         }
     };
 
+    public static final Clustering<byte[]> STATIC_CLUSTERING = new ArrayClustering(AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY)
+    {
+        @Override
+        public Kind kind()
+        {
+            return Kind.STATIC_CLUSTERING;
+        }
+
+        @Override
+        public String toString()
+        {
+            return "STATIC";
+        }
+
+        @Override
+        public String toString(TableMetadata metadata)
+        {
+            return toString();
+        }
+    };
+
     static final ValueAccessor.ObjectFactory<byte[]> instance = new ByteArrayObjectFactory();
 
     private ByteArrayObjectFactory() {}
 
     /** The smallest start bound, i.e. the one that starts before any row. */
-    private static final ArrayClusteringBound BOTTOM_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, new byte[0][]);
+    private static final ArrayClusteringBound BOTTOM_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND,
+                                                                                      AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY);
     /** The biggest end bound, i.e. the one that ends after any row. */
-    private static final ArrayClusteringBound TOP_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, new byte[0][]);
+    private static final ArrayClusteringBound TOP_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND,
+                                                                                   AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY);
 
     public Cell<byte[]> cell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTime, byte[] value, CellPath path)
     {
@@ -65,6 +89,11 @@ public Clustering<byte[]> clustering()
         return EMPTY_CLUSTERING;
     }
 
+    public Clustering<byte[]> staticClustering()
+    {
+        return STATIC_CLUSTERING;
+    }
+
     public ClusteringBound<byte[]> bound(ClusteringPrefix.Kind kind, byte[]... values)
     {
         return new ArrayClusteringBound(kind, values);
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java
index a0f9c1d8d004..57a13735d303 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java
@@ -238,6 +238,13 @@ public UUID toUUID(ByteBuffer value)
         return UUIDGen.getUUID(value);
     }
 
+    @Override
+    public int putByte(ByteBuffer dst, int offset, byte value)
+    {
+        dst.put(dst.position() + offset, value);
+        return TypeSizes.BYTE_SIZE;
+    }
+
     @Override
     public int putShort(ByteBuffer dst, int offset, short value)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
index 00f4646341d9..2fae7003ca12 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
@@ -20,6 +20,7 @@
 
 import java.nio.ByteBuffer;
 
+import org.apache.cassandra.db.AbstractBufferClusteringPrefix;
 import org.apache.cassandra.db.BufferClustering;
 import org.apache.cassandra.db.BufferClusteringBound;
 import org.apache.cassandra.db.BufferClusteringBoundary;
@@ -31,24 +32,15 @@
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.CellPath;
 import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
 
 class ByteBufferObjectFactory implements ValueAccessor.ObjectFactory<ByteBuffer>
 {
-    /** Empty clustering for tables having no clustering columns. */
-    private static final Clustering<ByteBuffer> EMPTY_CLUSTERING = new BufferClustering()
-    {
-        @Override
-        public String toString(TableMetadata metadata)
-        {
-            return "EMPTY";
-        }
-    };
-
     /** The smallest start bound, i.e. the one that starts before any row. */
-    private static final BufferClusteringBound BOTTOM_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, new ByteBuffer[0]);
+    private static final BufferClusteringBound BOTTOM_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND,
+                                                                                        AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY);
     /** The biggest end bound, i.e. the one that ends after any row. */
-    private static final BufferClusteringBound TOP_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, new ByteBuffer[0]);
+    private static final BufferClusteringBound TOP_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND,
+                                                                                     AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY);
 
     static final ValueAccessor.ObjectFactory<ByteBuffer> instance = new ByteBufferObjectFactory();
 
@@ -66,7 +58,11 @@ public Clustering<ByteBuffer> clustering(ByteBuffer... values)
 
     public Clustering<ByteBuffer> clustering()
     {
-        return EMPTY_CLUSTERING;
+        return Clustering.EMPTY;
+    }
+
+    public Clustering<ByteBuffer> staticClustering() {
+        return Clustering.STATIC_CLUSTERING;
     }
 
     public ClusteringBound<ByteBuffer> bound(ClusteringPrefix.Kind kind, ByteBuffer... values)
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteType.java b/src/java/org/apache/cassandra/db/marshal/ByteType.java
index e57b479ccb15..a910fbba11a1 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteType.java
@@ -27,9 +27,10 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class ByteType extends NumberType<Byte>
 {
@@ -46,11 +47,16 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
-        return version == Version.LEGACY
-               ? ByteSource.signedFixedLengthNumber(buf)
-               : ByteSource.optionalSignedFixedLengthNumber(buf);
+        // This type does not allow non-present values, but we accept them anyway to avoid future complexity.
+        return ByteSource.optionalSignedFixedLengthNumber(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 1);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
index dc4fdcc7112a..e348c9a1c5fc 100644
--- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java
@@ -24,6 +24,7 @@
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 
@@ -31,8 +32,9 @@
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 import static com.google.common.collect.Iterables.any;
 import static com.google.common.collect.Iterables.transform;
@@ -168,28 +170,33 @@ protected <V> AbstractType<?> getAndAppendComparator(int i, V value, ValueAccess
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
-        if (byteBuffer == null || byteBuffer.remaining() == 0)
+        if (data == null || accessor.isEmpty(data))
             return null;
 
         ByteSource[] srcs = new ByteSource[types.size() * 2 + 1];
-        ByteBuffer bb = byteBuffer.duplicate();
+        int length = accessor.size(data);
 
         // statics go first
-        boolean isStatic = readStatic(bb);
+        boolean isStatic = readIsStaticInternal(data, accessor);
+        int offset = startingOffsetInternal(isStatic);
         srcs[0] = isStatic ? null : ByteSource.EMPTY;
 
         int i = 0;
         byte lastEoc = 0;
-        while (bb.remaining() > 0)
+        while (offset < length)
         {
             // Only the end-of-component byte of the last component of this composite can be non-zero, so the
             // component before can't have a non-zero end-of-component byte.
             assert lastEoc == 0 : lastEoc;
 
-            srcs[i * 2 + 1] = types.get(i).asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version);
-            lastEoc = bb.get();
+            int componentLength = accessor.getUnsignedShort(data, offset);
+            offset += 2;
+            srcs[i * 2 + 1] = types.get(i).asComparableBytes(accessor, accessor.slice(data, offset, componentLength), version);
+            offset += componentLength;
+            lastEoc = accessor.getByte(data, offset);
+            offset += 1;
             srcs[i * 2 + 2] = ByteSource.oneByte(lastEoc & 0xFF ^ 0x80); // end-of-component also takes part in comparison as signed byte
             ++i;
         }
@@ -200,6 +207,46 @@ public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
                                          srcs);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, Version version)
+    {
+        // For ByteComparable.Version.LEGACY the terminator byte is ByteSource.END_OF_STREAM. The latter means that it's
+        // indistinguishable from the END_OF_STREAM byte that gets returned _after_ the terminator byte has already
+        // been consumed, when the composite is part of a multi-component sequence. So if in such a scenario we consume
+        // the ByteSource.END_OF_STREAM terminator here, this will result in actually consuming the multi-component
+        // sequence separator after it and jumping directly into the bytes of the next component, when we try to
+        // consume the (already consumed) separator.
+        // Instead of trying to find a way around the situation, we can just take advantage of the fact that we don't
+        // need to decode from Version.LEGACY, assume that we never do that, and assert it here.
+        assert version != Version.LEGACY;
+
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        int separator = comparableBytes.next();
+        boolean isStatic = ByteSourceInverse.nextComponentNull(separator);
+        int i = 0;
+        V[] buffers = accessor.createArray(types.size());
+        byte lastEoc = 0;
+
+        while ((separator = comparableBytes.next()) != ByteSource.TERMINATOR && i < types.size())
+        {
+            // Only the end-of-component byte of the last component of this composite can be non-zero, so the
+            // component before can't have a non-zero end-of-component byte.
+            assert lastEoc == 0 : lastEoc;
+
+            // Get the next type and decode its payload.
+            AbstractType<?> type = types.get(i);
+            V decoded = type.fromComparableBytes(accessor,
+                                                 ByteSourceInverse.nextComponentSource(comparableBytes, separator),
+                                                 version);
+            buffers[i++] = decoded;
+
+            lastEoc = ByteSourceInverse.getSignedByte(ByteSourceInverse.nextComponentSource(comparableBytes));
+        }
+        return build(accessor, isStatic, Arrays.copyOf(buffers, i), lastEoc);
+    }
+
     protected ParsedComparator parseComparator(int i, String part)
     {
         return new StaticParsedComparator(types.get(i), part);
@@ -425,27 +472,26 @@ public static <V> V build(ValueAccessor<V> accessor, boolean isStatic, V... valu
         return accessor.valueOf(out);
     }
 
-    public static ByteBuffer build(boolean isStatic, ByteBuffer[] buffers, byte lastEoc)
+    @VisibleForTesting
+    public static <V> V build(ValueAccessor<V> accessor, boolean isStatic, V[] values, byte lastEoc)
     {
         int totalLength = isStatic ? 2 : 0;
-        for (ByteBuffer bb : buffers)
-            totalLength += 2 + bb.remaining() + 1;
+        for (V v : values)
+            totalLength += 2 + accessor.size(v) + 1;
 
         ByteBuffer out = ByteBuffer.allocate(totalLength);
 
         if (isStatic)
             out.putShort((short)STATIC_MARKER);
 
-        for (int i = 0; i < buffers.length; ++i)
+        for (int i = 0; i < values.length; ++i)
         {
-            ByteBuffer bb = buffers[i];
-            ByteBufferUtil.writeShortLength(out, bb.remaining());
-            int toCopy = bb.remaining();
-            ByteBufferUtil.arrayCopy(bb, bb.position(), out, out.position(), toCopy);
-            out.position(out.position() + toCopy);
-            out.put(i != buffers.length - 1 ? (byte) 0 : lastEoc);
+            V v = values[i];
+            ByteBufferUtil.writeShortLength(out, accessor.size(v));
+            accessor.write(v, out);
+            out.put(i != values.length - 1 ? (byte) 0 : lastEoc);
         }
         out.flip();
-        return out;
+        return accessor.valueOf(out);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java
index 4e6aa5a27704..595106d3d184 100644
--- a/src/java/org/apache/cassandra/db/marshal/DateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DateType.java
@@ -31,8 +31,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 /**
  * This is the old version of TimestampType, but has been replaced as it wasn't comparing pre-epoch timestamps
@@ -53,10 +54,16 @@ public boolean isEmptyValueMeaningless()
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
         // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
-        return ByteSource.optionalFixedLength(buf);
+        return ByteSource.optionalFixedLength(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 8);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
index d6d47d8e0f59..44782890ef0b 100644
--- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java
@@ -34,8 +34,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public class DecimalType extends NumberType<BigDecimal>
 {
@@ -50,6 +50,10 @@ public class DecimalType extends NumberType<BigDecimal>
     private static final int POSITIVE_DECIMAL_HEADER_MASK = 0x80;
     private static final int NEGATIVE_DECIMAL_HEADER_MASK = 0x00;
     private static final int DECIMAL_EXPONENT_LENGTH_HEADER_MASK = 0x40;
+    private static final byte DECIMAL_LAST_BYTE = (byte) 0x00;
+    private static final BigInteger HUNDRED = BigInteger.valueOf(100);
+
+    private static final ByteBuffer ZERO_BUFFER = instance.decompose(BigDecimal.ZERO);
 
     DecimalType() {super(ComparisonType.CUSTOM);} // singleton
 
@@ -106,53 +110,63 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
      *
      */
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        BigDecimal value = compose(buf);
+        BigDecimal value = compose(data, accessor);
         if (value == null)
             return null;
         if (value.compareTo(BigDecimal.ZERO) == 0)  // Note: 0.equals(0.0) returns false!
             return ByteSource.oneByte(POSITIVE_DECIMAL_HEADER_MASK);
+
         long scale = (((long) value.scale()) - value.precision()) & ~1;
         boolean negative = value.signum() < 0;
-        final int negmul = negative ? -1 : 1;
-        // This should always fit into an int
-        final long exponent = (-scale * negmul) / 2;
+        // Make a base-100 exponent (this will always fit in an int).
+        int exponent = Math.toIntExact(-scale >> 1);
+        // Flip the exponent sign for negative numbers, so that ones with larger magnitudes are properly treated as smaller.
+        final int modulatedExponent = negative ? -exponent : exponent;
         // We should never have scale > Integer.MAX_VALUE, as we're always subtracting the non-negative precision of
         // the encoded BigDecimal, and furthermore we're rounding to negative infinity.
-        if (scale > Integer.MAX_VALUE || scale < Integer.MIN_VALUE)
+        assert scale <= Integer.MAX_VALUE;
+        // However, we may end up overflowing on the negative side.
+        if (scale < Integer.MIN_VALUE)
         {
-            // We are practically out of range here, but let's handle that anyway
-            int mv = Long.signum(scale) * Integer.MAX_VALUE;
+            // As scaleByPowerOfTen needs an int scale, do the scaling in two steps.
+            int mv = Integer.MIN_VALUE;
             value = value.scaleByPowerOfTen(mv);
             scale -= mv;
         }
         final BigDecimal mantissa = value.scaleByPowerOfTen(Ints.checkedCast(scale)).stripTrailingZeros();
+        // We now have a smaller-than-one signed mantissa, and a signed and modulated base-100 exponent.
         assert mantissa.abs().compareTo(BigDecimal.ONE) < 0;
 
         return new ByteSource()
         {
-            int posInExp = 0;
+            // Start with up to 5 bytes for sign + exponent.
+            int exponentBytesLeft = 5;
             BigDecimal current = mantissa;
 
             @Override
             public int next()
             {
-                if (posInExp < 5)
+                if (exponentBytesLeft > 0)
                 {
-                    if (posInExp == 0)
+                    --exponentBytesLeft;
+                    if (exponentBytesLeft == 4)
                     {
-                        int absexp = (int) (exponent < 0 ? -exponent : exponent);
-                        while (posInExp < 5 && absexp >> (32 - ++posInExp * 8) == 0) {}
-                        int explen = DECIMAL_EXPONENT_LENGTH_HEADER_MASK + (exponent < 0 ? -1 : 1) * (5 - posInExp);
+                        // Skip leading zero bytes in the modulatedExponent.
+                        exponentBytesLeft -= Integer.numberOfLeadingZeros(Math.abs(modulatedExponent)) / 8;
+                        // Now prepare the leading byte which includes the sign of the number plus the sign and length of the modulatedExponent.
+                        int explen = DECIMAL_EXPONENT_LENGTH_HEADER_MASK + (modulatedExponent < 0 ? -exponentBytesLeft : exponentBytesLeft);
                         return explen + (negative ? NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK);
                     }
                     else
-                        return (int) ((exponent >> (32 - posInExp++ * 8))) & 0xFF;
+                        return (modulatedExponent >> (exponentBytesLeft * 8)) & 0xFF;
                 }
-                if (current == null)
+                else if (current == null)
+                {
                     return END_OF_STREAM;
-                if (current.compareTo(BigDecimal.ZERO) == 0)
+                }
+                else if (current.compareTo(BigDecimal.ZERO) == 0)
                 {
                     current = null;
                     return 0x00;
@@ -168,6 +182,87 @@ public int next()
         };
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        int headerBits = comparableBytes.next();
+        if (headerBits == POSITIVE_DECIMAL_HEADER_MASK)
+            return accessor.valueOf(ZERO_BUFFER);
+
+        // I. Extract the exponent.
+        // The sign of the decimal, and the sign and the length (in bytes) of the decimal exponent, are all encoded in
+        // the first byte.
+        // Get the sign of the decimal...
+        boolean isNegative = headerBits < POSITIVE_DECIMAL_HEADER_MASK;
+        headerBits -= isNegative ? NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK;
+        headerBits -= DECIMAL_EXPONENT_LENGTH_HEADER_MASK;
+        // Get the sign and the length of the exponent (the latter is encoded as its negative if the sign of the
+        // exponent is negative)...
+        boolean isExponentNegative = headerBits < 0;
+        headerBits = isExponentNegative ? -headerBits : headerBits;
+        // Now consume the exponent bytes. If the exponent is negative and uses less than 4 bytes, the remaining bytes
+        // should be padded with 1s, in order for the constructed int to contain the correct (negative) exponent value.
+        // So, if the exponent is negative, we can just start with all bits set to 1 (i.e. we can start with -1).
+        int exponent = isExponentNegative ? -1 : 0;
+        for (int i = 0; i < headerBits; ++i)
+            exponent = (exponent << 8) | comparableBytes.next();
+        // The encoded exponent also contains the decimal sign, in order to correctly compare exponents in case of
+        // negative decimals (e.g. x * 10^y > x * 10^z if x < 0 && y < z). After the decimal sign is "removed", what's
+        // left is a base-100 exponent following BigDecimal's convention for the exponent sign.
+        exponent = isNegative ? -exponent : exponent;
+
+        // II. Extract the mantissa as a BigInteger value. It was encoded as a BigDecimal value between 0 and 1, in
+        // order to be used for comparison (after the sign of the decimal and the sign and the value of the exponent),
+        // but when decoding we don't need that property on the transient mantissa value.
+        BigInteger mantissa = BigInteger.ZERO;
+        int curr = comparableBytes.next();
+        while (curr != DECIMAL_LAST_BYTE)
+        {
+            // The mantissa value is constructed by a standard positional notation value calculation.
+            // The value of the next digit is the next most-significant mantissa byte as an unsigned integer,
+            // offset by a predetermined value (in this case, 0x80)...
+            int currModified = curr - 0x80;
+            // ...multiply the current value by the base (in this case, 100)...
+            mantissa = mantissa.multiply(HUNDRED);
+            // ...then add the next digit to the modified current value...
+            mantissa = mantissa.add(BigInteger.valueOf(currModified));
+            // ...and finally, adjust the base-100, BigDecimal format exponent accordingly.
+            --exponent;
+            curr = comparableBytes.next();
+        }
+
+        // III. Construct the final BigDecimal value, by combining the mantissa and the exponent, guarding against
+        // underflow or overflow when exponents are close to their boundary values.
+        long base10NonBigDecimalFormatExp = 2L * exponent;
+        // When expressing a sufficiently big decimal, BigDecimal's internal scale value will be negative with very
+        // big absolute value. To compute the encoded exponent, this internal scale has the number of digits of the
+        // unscaled value subtracted from it, after which it's divided by 2, rounding down to negative infinity
+        // (before accounting for the decimal sign). When decoding, this exponent is converted to a base-10 exponent in
+        // non-BigDecimal format, which means that it can very well overflow Integer.MAX_VALUE.
+        // For example, see how <code>new BigDecimal(BigInteger.TEN, Integer.MIN_VALUE)</code> is encoded and decoded.
+        if (base10NonBigDecimalFormatExp > Integer.MAX_VALUE)
+        {
+            // If the base-10 exponent will result in an overflow, some of its powers of 10 need to be absorbed by the
+            // mantissa. How much exactly? As little as needed, in order to avoid complex BigInteger operations, which
+            // means exactly as much as to have a scale of -Integer.MAX_VALUE.
+            int exponentReduction = (int) (base10NonBigDecimalFormatExp - Integer.MAX_VALUE);
+            mantissa = mantissa.multiply(BigInteger.TEN.pow(exponentReduction));
+            base10NonBigDecimalFormatExp = Integer.MAX_VALUE;
+        }
+        assert base10NonBigDecimalFormatExp >= Integer.MIN_VALUE && base10NonBigDecimalFormatExp <= Integer.MAX_VALUE;
+        // Here we negate the exponent, as we are not passing it to BigDecimal.scaleByPowerOfTen, where a positive number
+        // means "multiplying by a positive power of 10", but writing it as BigDecimal's internal scale, where a positive
+        // number means "dividing by a positive power of 10".
+        byte[] mantissaBytes = mantissa.toByteArray();
+        V resultBuf = accessor.allocate(4 + mantissaBytes.length);
+        accessor.putInt(resultBuf, 0, (int) -base10NonBigDecimalFormatExp);
+        accessor.copyByteArrayTo(mantissaBytes, 0, resultBuf, 4, mantissaBytes.length);
+        return resultBuf;
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
index d68bc4ca90a4..56ae0131b3a9 100644
--- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java
@@ -27,8 +27,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class DoubleType extends NumberType<Double>
 {
@@ -53,9 +54,15 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ByteSource.optionalSignedFixedLengthFloat(buf);
+        return ByteSource.optionalSignedFixedLengthFloat(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLengthFloat(accessor, comparableBytes, 8);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
index a293f11b2ce1..a0f909c7fc96 100644
--- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java
@@ -21,11 +21,14 @@
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -38,8 +41,9 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 import static com.google.common.collect.Iterables.any;
 
@@ -70,6 +74,7 @@ public class DynamicCompositeType extends AbstractCompositeType
     private static final String REVERSED_TYPE = ReversedType.class.getSimpleName();
 
     private final Map<Byte, AbstractType<?>> aliases;
+    private final Map<AbstractType<?>, Byte> inverseMapping;
 
     // interning instances
     private static final ConcurrentHashMap<Map<Byte, AbstractType<?>>, DynamicCompositeType> instances = new ConcurrentHashMap<>();
@@ -90,6 +95,9 @@ public static DynamicCompositeType getInstance(Map<Byte, AbstractType<?>> aliase
     private DynamicCompositeType(Map<Byte, AbstractType<?>> aliases)
     {
         this.aliases = aliases;
+        this.inverseMapping = new HashMap<>();
+        for (Map.Entry<Byte, AbstractType<?>> en : aliases.entrySet())
+            this.inverseMapping.put(en.getValue(), en.getKey());
     }
 
     protected <V> boolean readIsStatic(V value, ValueAccessor<V> accessor)
@@ -206,25 +214,25 @@ protected <V> AbstractType<?> getAndAppendComparator(int i, V value, ValueAccess
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
         List<ByteSource> srcs = new ArrayList<>();
-        ByteBuffer bb = byteBuffer.duplicate();
+        int length = accessor.size(data);
 
         // statics go first
-        boolean isStatic = readIsStatic(bb, ByteBufferAccessor.instance);
+        boolean isStatic = readIsStatic(data, accessor);
+        int offset = startingOffset(isStatic);
         srcs.add(isStatic ? null : ByteSource.EMPTY);
-        bb.position(bb.position() + startingOffset(isStatic));
 
         byte lastEoc = 0;
-        while (bb.remaining() > 0)
+        while (offset < length)
         {
             // Only the end-of-component byte of the last component of this composite can be non-zero, so the
             // component before can't have a non-zero end-of-component byte.
             assert lastEoc == 0 : lastEoc;
 
-            AbstractType<?> comp = getComparator(bb, ByteBufferAccessor.instance, 0);
-            bb.position(bb.position() + getComparatorSize(bb, ByteBufferAccessor.instance, 0));
+            AbstractType<?> comp = getComparator(data, accessor, offset);
+            offset += getComparatorSize(data, accessor, offset);
             // The comparable bytes for the component need to ensure comparisons consistent with
             // AbstractCompositeType.compareCustom(ByteBuffer, ByteBuffer) and
             // DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer):
@@ -246,9 +254,13 @@ public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
                 srcs.add(ByteSource.of(reversedComp.baseType.getClass().getName(), version));
             }
             // Only then the payload of the component gets encoded.
-            srcs.add(comp.asComparableBytes(ByteBufferUtil.readBytesWithShortLength(bb), version));
+            int componentLength = accessor.getUnsignedShort(data, offset);
+            offset += 2;
+            srcs.add(comp.asComparableBytes(accessor, accessor.slice(data, offset, componentLength), version));
+            offset += componentLength;
             // The end-of-component byte also takes part in the comparison, and therefore needs to be encoded.
-            lastEoc = bb.get();
+            lastEoc = accessor.getByte(data, offset);
+            offset += 1;
             srcs.add(ByteSource.oneByte(version == Version.LEGACY ? lastEoc : lastEoc & 0xFF ^ 0x80));
         }
 
@@ -256,13 +268,79 @@ public ByteSource asComparableBytes(ByteBuffer byteBuffer, Version version)
                                          srcs.toArray(EMPTY_BYTE_SOURCE_ARRAY));
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, Version version)
+    {
+        // For ByteComparable.Version.LEGACY the terminator byte is ByteSource.END_OF_STREAM. Just like with
+        // CompositeType, this means that in multi-component sequences the terminator may be transformed to a regular
+        // component separator, but unlike CompositeType (where we have the expected number of types/components),
+        // this can make the end of the whole dynamic composite type indistinguishable from the end of a component
+        // somewhere in the middle of the dynamic composite type. Because of that, DynamicCompositeType elements
+        // cannot always be safely decoded using that encoding version.
+        // Even more so than with CompositeType, we just take advantage of the fact that we don't need to decode from
+        // Version.LEGACY, assume that we never do that, and assert it here.
+        assert version != Version.LEGACY;
+
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // The first byte is the isStatic flag which we don't need but must consume to continue past it.
+        comparableBytes.next();
+
+        List<AbstractType<?>> types = new ArrayList<>();
+        List<V> values = new ArrayList<>();
+        byte lastEoc = 0;
+
+        for (int separator = comparableBytes.next(); separator != ByteSource.TERMINATOR; separator = comparableBytes.next())
+        {
+            // Solely the end-of-component byte of the last component of this composite can be non-zero.
+            assert lastEoc == 0 : lastEoc;
+
+            boolean isReversed = false;
+            // Decode the next type's simple class name that is encoded before its fully qualified class name (in order
+            // for comparisons to work correctly).
+            String simpleClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes, separator));
+            if (REVERSED_TYPE.equals(simpleClassName))
+            {
+                // Special-handle if the type is reversed (and decode the actual base type simple class name).
+                isReversed = true;
+                simpleClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes));
+            }
+
+            // Decode the type's fully qualified class name and parse the actual type from it.
+            String fullClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes));
+            assert fullClassName.endsWith(simpleClassName);
+            AbstractType<?> type = isReversed ? ReversedType.getInstance(TypeParser.parse(fullClassName))
+                                              : TypeParser.parse(fullClassName);
+            assert type != null;
+            types.add(type);
+
+            // Decode the payload from this type.
+            V value = type.fromComparableBytes(accessor, ByteSourceInverse.nextComponentSource(comparableBytes), version);
+            values.add(value);
+
+            // Also decode the corresponding end-of-component byte - the last one we decode will be taken into
+            // account when we deserialize the decoded data into an object.
+            lastEoc = ByteSourceInverse.getSignedByte(ByteSourceInverse.nextComponentSource(comparableBytes));
+        }
+        return build(accessor, types, inverseMapping, values, lastEoc);
+    }
+
     public static ByteBuffer build(List<String> types, List<ByteBuffer> values)
     {
-        return build(types, values, (byte) 0);
+        return build(ByteBufferAccessor.instance,
+                     Lists.transform(types, TypeParser::parse),
+                     Collections.emptyMap(),
+                     values,
+                     (byte) 0);
     }
 
     @VisibleForTesting
-    public static ByteBuffer build(List<String> types, List<ByteBuffer> values, byte lastEoc)
+    public static <V> V build(ValueAccessor<V> accessor,
+                              List<AbstractType<?>> types,
+                              Map<AbstractType<?>, Byte> inverseMapping,
+                              List<V> values,
+                              byte lastEoc)
     {
         assert types.size() == values.size();
 
@@ -271,35 +349,54 @@ public static ByteBuffer build(List<String> types, List<ByteBuffer> values, byte
         int totalLength = 0;
         for (int i = 0; i < numComponents; ++i)
         {
-            int typeNameLength = types.get(i).getBytes(StandardCharsets.UTF_8).length;
+            AbstractType<?> type = types.get(i);
+            Byte alias = inverseMapping.get(type);
+            int typeNameLength = alias == null ? type.toString().getBytes(StandardCharsets.UTF_8).length : 0;
             // The type data will be stored by means of the type's fully qualified name, not by aliasing, so:
             //   1. The type data header should be the fully qualified name length in bytes.
             //   2. The length should be small enough so that it fits in 15 bits (2 bytes with the first bit zero).
             assert typeNameLength <= 0x7FFF;
-            int valueLength = values.get(i).remaining();
+            int valueLength = accessor.size(values.get(i));
             // The value length should also expect its first bit to be 0, as the length should be stored as a signed
             // 2-byte value (short).
             assert valueLength <= 0x7FFF;
             totalLength += 2 + typeNameLength + 2 + valueLength + 1;
         }
 
-        ByteBuffer result = ByteBuffer.allocate(totalLength);
+        V result = accessor.allocate(totalLength);
+        int offset = 0;
         for (int i = 0; i < numComponents; ++i)
         {
-            // Write the type data (2-byte length header + the fully qualified type name in UTF-8).
-            byte[] typeNameBytes = types.get(i).getBytes(StandardCharsets.UTF_8);
-            ByteBufferUtil.writeShortLength(result, typeNameBytes.length);
-            result.put(ByteBuffer.wrap(typeNameBytes));
+            AbstractType<?> type = types.get(i);
+            Byte alias = inverseMapping.get(type);
+            if (alias == null)
+            {
+                // Write the type data (2-byte length header + the fully qualified type name in UTF-8).
+                byte[] typeNameBytes = type.toString().getBytes(StandardCharsets.UTF_8);
+                accessor.putShort(result,
+                                  offset,
+                                  (short) typeNameBytes.length); // high bit stays clear: length is asserted <= 0x7FFF above (a set high bit is used for alias headers)
+                offset += 2;
+                accessor.copyByteArrayTo(typeNameBytes, 0, result, offset, typeNameBytes.length);
+                offset += typeNameBytes.length;
+            }
+            else
+            {
+                accessor.putShort(result, offset, (short) (alias | 0x8000));
+                offset += 2;
+            }
 
             // Write the type payload data (2-byte length header + the payload).
-            ByteBuffer value = values.get(i);
-            int bytesToCopy = value.remaining();
-            ByteBufferUtil.writeShortLength(result, bytesToCopy);
-            ByteBufferUtil.arrayCopy(value, value.position(), result, result.position(), bytesToCopy);
-            result.position(result.position() + bytesToCopy);
+            V value = values.get(i);
+            int bytesToCopy = accessor.size(value);
+            accessor.putShort(result, offset, (short) bytesToCopy);
+            offset += 2;
+            accessor.copyTo(value, 0, result, accessor, offset, bytesToCopy);
+            offset += bytesToCopy;
 
             // Write the end-of-component byte.
-            result.put(i != numComponents - 1 ? (byte) 0 : lastEoc);
+            accessor.putByte(result, offset, i != numComponents - 1 ? (byte) 0 : lastEoc);
+            offset += 1;
         }
         return result;
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/EmptyType.java b/src/java/org/apache/cassandra/db/marshal/EmptyType.java
index 80f8950e7c59..dcc57b7c4a75 100644
--- a/src/java/org/apache/cassandra/db/marshal/EmptyType.java
+++ b/src/java/org/apache/cassandra/db/marshal/EmptyType.java
@@ -33,8 +33,8 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.NoSpamLogger;
 
 /**
@@ -71,11 +71,17 @@ private static NonEmptyWriteBehavior parseNonEmptyWriteBehavior()
     private EmptyType() {super(ComparisonType.CUSTOM);} // singleton
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
         return null;
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return accessor.empty();
+    }
+
     public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right, ValueAccessor<VR> accessorR)
     {
         return 0;
diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java
index 8618325f1e45..2adb127d4194 100644
--- a/src/java/org/apache/cassandra/db/marshal/FloatType.java
+++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java
@@ -27,8 +27,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 
 public class FloatType extends NumberType<Float>
@@ -54,9 +55,15 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ByteSource.optionalSignedFixedLengthFloat(buf);
+        return ByteSource.optionalSignedFixedLengthFloat(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLengthFloat(accessor, comparableBytes, 4);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
index 7c644633270c..6dee26e22423 100644
--- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java
+++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java
@@ -28,8 +28,9 @@
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class Int32Type extends NumberType<Integer>
 {
@@ -58,9 +59,15 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ByteSource.optionalSignedFixedLengthNumber(buf);
+        return ByteSource.optionalSignedFixedLengthNumber(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 4);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java
index fed7e672c268..30a4fbea503d 100644
--- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java
+++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java
@@ -30,8 +30,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public final class IntegerType extends NumberType<BigInteger>
 {
@@ -42,6 +42,8 @@ public final class IntegerType extends NumberType<BigInteger>
     private static final int POSITIVE_VARINT_HEADER = 0x80;
     private static final int NEGATIVE_VARINT_LENGTH_HEADER = 0x00;
     private static final int POSITIVE_VARINT_LENGTH_HEADER = 0xFF;
+    private static final byte BIG_INTEGER_NEGATIVE_LEADING_ZERO = (byte) 0xFF;
+    private static final byte BIG_INTEGER_POSITIVE_LEADING_ZERO = (byte) 0x00;
 
     private static <V> int findMostSignificantByte(V value, ValueAccessor<V> accessor)
     {
@@ -146,8 +148,11 @@ public static <VL, VR> int compareIntegers(VL lhs, ValueAccessor<VL> accessorL,
      * where a length_byte is:
      *    - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger)
      *    - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller)
-     * we don't need to sign-invert the first significant byte as the order there is already determined by the length
-     * byte.
+     *
+     * Because we include the sign in the length byte:
+     * - unlike fixed-length ints, we don't need to sign-invert the first significant byte,
+     * - unlike BigInteger, we don't need to include 0x00 prefix for positive integers whose first byte is >= 0x80
+     *   or 0xFF prefix for negative integers whose first byte is < 0x80.
      *
      * The representations are prefix-free, because representations of different length always have length bytes that
      * differ.
@@ -162,17 +167,24 @@ public static <VL, VR> int compareIntegers(VL lhs, ValueAccessor<VL> accessorL,
      *    2^33          as 840100000000
      */
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        int p = buf.position();
-        final int limit = buf.limit();
+        int p = 0;
+        final int limit = accessor.size(data);
         if (p == limit)
             return null;
 
-        // skip padding
-        final byte signbyte = buf.get(p);
-        if (signbyte == (byte) POSITIVE_VARINT_LENGTH_HEADER || signbyte == (byte) NEGATIVE_VARINT_LENGTH_HEADER)
-            while (p + 1 < limit && buf.get(++p) == signbyte) {}
+        // skip any leading sign-only byte(s)
+        final byte signbyte = accessor.getByte(data, p);
+        if (signbyte == BIG_INTEGER_NEGATIVE_LEADING_ZERO || signbyte == BIG_INTEGER_POSITIVE_LEADING_ZERO)
+        {
+            while (p + 1 < limit)
+            {
+                if (accessor.getByte(data, ++p) != signbyte)
+                    break;
+            }
+        }
+
         final int startpos = p;
 
         return new ByteSource()
@@ -185,25 +197,87 @@ public int next()
             {
                 if (!sizeReported)
                 {
-                    int v = sizeToReport;
-                    if (v >= 128)
-                        v = 128;
+                    if (sizeToReport >= 128)
+                    {
+                        sizeToReport -= 128;
+                        return signbyte >= 0
+                               ? POSITIVE_VARINT_LENGTH_HEADER
+                               : NEGATIVE_VARINT_LENGTH_HEADER;
+                    }
                     else
+                    {
                         sizeReported = true;
-
-                    sizeToReport -= v;
-                    return signbyte >= 0
-                           ? POSITIVE_VARINT_HEADER + (v - 1)
-                           : POSITIVE_VARINT_HEADER - v;
+                        return signbyte >= 0
+                               ? POSITIVE_VARINT_HEADER + (sizeToReport - 1)
+                               : POSITIVE_VARINT_HEADER - sizeToReport;
+                    }
                 }
+
                 if (pos == limit)
                     return END_OF_STREAM;
 
-                return buf.get(pos++) & 0xFF;
+                return accessor.getByte(data, pos++) & 0xFF;
             }
         };
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        int valueBytes;
+        byte signedZero;
+        // Consume the first byte to determine whether the encoded number is positive and
+        // start iterating through the length header bytes and collecting the number of value bytes.
+        int curr = comparableBytes.next();
+        if (curr >= POSITIVE_VARINT_HEADER) // positive number
+        {
+            valueBytes = curr - POSITIVE_VARINT_HEADER + 1;
+            while (curr == POSITIVE_VARINT_LENGTH_HEADER)
+            {
+                curr = comparableBytes.next();
+                valueBytes += curr - POSITIVE_VARINT_HEADER + 1;
+            }
+            signedZero = 0;
+        }
+        else // negative number
+        {
+            valueBytes = POSITIVE_VARINT_HEADER - curr;
+            while (curr == NEGATIVE_VARINT_LENGTH_HEADER)
+            {
+                curr = comparableBytes.next();
+                valueBytes += POSITIVE_VARINT_HEADER - curr;
+            }
+            signedZero = -1;
+        }
+
+        int writtenBytes = 0;
+        V buf;
+        // Add "leading zero" if needed (i.e. in case the leading byte of a positive number corresponds to a negative
+        // value, or in case the leading byte of a negative number corresponds to a non-negative value).
+        // Size the array containing all the value bytes accordingly.
+        curr = comparableBytes.next();
+        if ((curr & 0x80) != (signedZero & 0x80))
+        {
+            ++valueBytes;
+            buf = accessor.allocate(valueBytes);
+            accessor.putByte(buf, writtenBytes++, signedZero);
+        }
+        else
+            buf = accessor.allocate(valueBytes);
+        // Don't forget to add the first consumed value byte after determining whether leading zero should be added
+        // and sizing the value bytes array.
+        accessor.putByte(buf, writtenBytes++, (byte) curr);
+
+        // Consume exactly the number of expected value bytes.
+        while (writtenBytes < valueBytes)
+            accessor.putByte(buf, writtenBytes++, (byte) comparableBytes.next());
+
+        return buf;
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
index c0d099dee174..81ec9d9a566e 100644
--- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java
@@ -26,8 +26,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.UUIDSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class LexicalUUIDType extends AbstractType<UUID>
 {
@@ -51,23 +52,22 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        if (buf == null || buf.remaining() == 0)
+        if (data == null || accessor.isEmpty(data))
             return null;
 
         // fixed-length (hence prefix-free) representation, but
         // we have to sign-flip the highest bytes of the two longs
-        final int bufstart = buf.position();
         return new ByteSource()
         {
             int bufpos = 0;
 
             public int next()
             {
-                if (bufpos + bufstart >= buf.limit())
+                if (bufpos >= accessor.size(data))
                     return END_OF_STREAM;
-                int v = buf.get(bufpos + bufstart) & 0xFF;
+                int v = accessor.getByte(data, bufpos) & 0xFF;
                 if (bufpos == 0 || bufpos == 8)
                     v ^= 0x80;
                 ++bufpos;
@@ -76,6 +76,21 @@ public int next()
         };
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        // Optional-style encoding of empty values as null sources
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        long hiBits = ByteSourceInverse.getSignedLong(comparableBytes);
+        long loBits = ByteSourceInverse.getSignedLong(comparableBytes);
+
+        // Lexical UUIDs are stored as just two signed longs. The decoding of these longs flips their sign bit back, so
+        // they can directly be used for constructing the original UUID.
+        return UUIDType.makeUuidBytes(accessor, hiBits, loBits);
+    }
+
     public ByteBuffer fromString(String source) throws MarshalException
     {
         // Return an empty ByteBuffer for an empty string.
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java
index ada7bc198d4c..8a48870dce63 100644
--- a/src/java/org/apache/cassandra/db/marshal/ListType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ListType.java
@@ -36,8 +36,9 @@
 import org.apache.cassandra.serializers.ListSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class ListType<T> extends CollectionType<List<T>>
 {
@@ -202,30 +203,62 @@ static <VL, VR> int compareListOrSet(AbstractType<?> elementsComparator, VL left
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
-        return asComparableBytesListOrSet(getElementsType(), b, version);
+        return asComparableBytesListOrSet(getElementsType(), accessor, data, version);
     }
 
-    static ByteSource asComparableBytesListOrSet(AbstractType<?> elementsComparator, ByteBuffer b, Version version)
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, Version version)
     {
-        if (!b.hasRemaining())
+        return fromComparableBytesListOrSet(accessor, comparableBytes, version, getElementsType());
+    }
+
+    static <V> ByteSource asComparableBytesListOrSet(AbstractType<?> elementsComparator,
+                                                     ValueAccessor<V> accessor,
+                                                     V data,
+                                                     Version version)
+    {
+        if (accessor.isEmpty(data))
             return null;
 
-        b = b.duplicate();
         int offset = 0;
-        int size = CollectionSerializer.readCollectionSize(b, ByteBufferAccessor.instance, ProtocolVersion.V3);
+        int size = CollectionSerializer.readCollectionSize(data, accessor, ProtocolVersion.V3);
         offset += CollectionSerializer.sizeOfCollectionSize(size, ProtocolVersion.V3);
         ByteSource[] srcs = new ByteSource[size];
         for (int i = 0; i < size; ++i)
         {
-            ByteBuffer v = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, ProtocolVersion.V3);
-            offset += CollectionSerializer.sizeOfValue(v, ByteBufferAccessor.instance, ProtocolVersion.V3);
-            srcs[i] = elementsComparator.asComparableBytes(v, version);
+            V v = CollectionSerializer.readValue(data, accessor, offset, ProtocolVersion.V3);
+            offset += CollectionSerializer.sizeOfValue(v, accessor, ProtocolVersion.V3);
+            srcs[i] = elementsComparator.asComparableBytes(accessor, v, version);
         }
         return ByteSource.withTerminator(version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR, srcs);
     }
 
+    static <V> V fromComparableBytesListOrSet(ValueAccessor<V> accessor,
+                                              ByteSource.Peekable comparableBytes,
+                                              Version version,
+                                              AbstractType<?> elementType)
+    {
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        List<V> buffers = new ArrayList<>();
+        int terminator = version == Version.LEGACY
+                         ? 0x00
+                         : ByteSource.TERMINATOR;
+        int separator = comparableBytes.next();
+        while (separator != terminator)
+        {
+            if (!ByteSourceInverse.nextComponentNull(separator))
+                buffers.add(elementType.fromComparableBytes(accessor, comparableBytes, version));
+            else
+                buffers.add(null);
+            separator = comparableBytes.next();
+        }
+        return CollectionSerializer.pack(buffers, accessor, buffers.size(), ProtocolVersion.V3);
+    }
+
     @Override
     public String toString(boolean ignoreFreezing)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java
index e8db323731af..9e0335733745 100644
--- a/src/java/org/apache/cassandra/db/marshal/LongType.java
+++ b/src/java/org/apache/cassandra/db/marshal/LongType.java
@@ -28,8 +28,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class LongType extends NumberType<Long>
 {
@@ -60,9 +61,15 @@ public static <VL, VR> int compareLongs(VL left, ValueAccessor<VL> accessorL, VR
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ByteSource.optionalSignedFixedLengthNumber(buf);
+        return ByteSource.optionalSignedFixedLengthNumber(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 8);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java
index a6e59ad2fcc9..5ae5ed936168 100644
--- a/src/java/org/apache/cassandra/db/marshal/MapType.java
+++ b/src/java/org/apache/cassandra/db/marshal/MapType.java
@@ -31,8 +31,9 @@
 import org.apache.cassandra.serializers.MapSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.Pair;
 
 public class MapType<K, V> extends CollectionType<Map<K, V>>
@@ -221,34 +222,71 @@ public static <TL, TR> int compareMaps(AbstractType<?> keysComparator, AbstractT
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
-        return asComparableBytesMap(getKeysType(), getValuesType(), b, version);
+        return asComparableBytesMap(getKeysType(), getValuesType(), accessor, data, version);
     }
 
-    static ByteSource asComparableBytesMap(AbstractType<?> keysComparator, AbstractType<?> valuesComparator, ByteBuffer b, Version version)
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, Version version)
     {
-        if (!b.hasRemaining())
+        return fromComparableBytesMap(accessor, comparableBytes, version, getKeysType(), getValuesType());
+    }
+
+    static <V> ByteSource asComparableBytesMap(AbstractType<?> keysComparator,
+                                               AbstractType<?> valuesComparator,
+                                               ValueAccessor<V> accessor,
+                                               V data,
+                                               Version version)
+    {
+        if (accessor.isEmpty(data))
             return null;
 
-        b = b.duplicate();
         ProtocolVersion protocolVersion = ProtocolVersion.V3;
         int offset = 0;
-        int size = CollectionSerializer.readCollectionSize(b, ByteBufferAccessor.instance, protocolVersion);
+        int size = CollectionSerializer.readCollectionSize(data, accessor, protocolVersion);
         offset += CollectionSerializer.sizeOfCollectionSize(size, protocolVersion);
         ByteSource[] srcs = new ByteSource[size * 2];
         for (int i = 0; i < size; ++i)
         {
-            ByteBuffer k = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion);
-            offset += CollectionSerializer.sizeOfValue(k, ByteBufferAccessor.instance, protocolVersion);
-            srcs[i * 2 + 0] = keysComparator.asComparableBytes(k, version);
-            ByteBuffer v = CollectionSerializer.readValue(b, ByteBufferAccessor.instance, offset, protocolVersion);
-            offset += CollectionSerializer.sizeOfValue(v, ByteBufferAccessor.instance, protocolVersion);
-            srcs[i * 2 + 1] = valuesComparator.asComparableBytes(v, version);
+            V k = CollectionSerializer.readValue(data, accessor, offset, protocolVersion);
+            offset += CollectionSerializer.sizeOfValue(k, accessor, protocolVersion);
+            srcs[i * 2 + 0] = keysComparator.asComparableBytes(accessor, k, version);
+            V v = CollectionSerializer.readValue(data, accessor, offset, protocolVersion);
+            offset += CollectionSerializer.sizeOfValue(v, accessor, protocolVersion);
+            srcs[i * 2 + 1] = valuesComparator.asComparableBytes(accessor, v, version);
         }
         return ByteSource.withTerminator(version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR, srcs);
     }
 
+    static <V> V fromComparableBytesMap(ValueAccessor<V> accessor,
+                                        ByteSource.Peekable comparableBytes,
+                                        Version version,
+                                        AbstractType<?> keysComparator,
+                                        AbstractType<?> valuesComparator)
+    {
+        // asComparableBytesMap encodes an empty value as a null source.
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // Same end marker that asComparableBytesMap passes to ByteSource.withTerminator for this version.
+        int endMarker = version == Version.LEGACY ? 0x00 : ByteSource.TERMINATOR;
+        // Decoded components in stream order: key, value, key, value, ...
+        List<V> flattened = new ArrayList<>();
+        int marker; // byte read ahead of each component: the end marker, or a separator that can flag a null
+        while ((marker = comparableBytes.next()) != endMarker)
+        {
+            flattened.add(ByteSourceInverse.nextComponentNull(marker)
+                          ? null
+                          : keysComparator.fromComparableBytes(accessor, comparableBytes, version));
+            marker = comparableBytes.next();
+            flattened.add(ByteSourceInverse.nextComponentNull(marker)
+                          ? null
+                          : valuesComparator.fromComparableBytes(accessor, comparableBytes, version));
+        }
+        return CollectionSerializer.pack(flattened, accessor, flattened.size() / 2, ProtocolVersion.V3);
+    }
+
     @Override
     public MapSerializer<K, V> getSerializer()
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
index d72969267ab1..91fe3e31059c 100644
--- a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
+++ b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java
@@ -22,15 +22,15 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 
 /** for sorting columns representing row keys in the row ordering as determined by a partitioner.
@@ -98,8 +98,10 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
+        // Partitioners work with ByteBuffers only.
+        ByteBuffer buf = ByteBufferAccessor.instance.convert(data, accessor);
         if (version != Version.LEGACY)
         {
             // For ByteComparable.Version.OSS41 and above we encode an empty key with a null byte source. This
@@ -112,6 +114,16 @@ public ByteSource asComparableBytes(ByteBuffer buf, Version version)
         return PartitionPosition.ForKey.get(buf, partitioner).asComparableBytes(version);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        assert version != Version.LEGACY;
+        // Non-legacy versions encode an empty key as a null byte source.
+        if (comparableBytes == null)
+            return accessor.empty();
+        return accessor.valueOf(DecoratedKey.keyFromByteComparable(ignored -> comparableBytes, version, partitioner));
+    }
+
     @Override
     public void validate(ByteBuffer bytes) throws MarshalException
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
index 4b753f528184..21f3340b0950 100644
--- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java
@@ -28,8 +28,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public class ReversedType<T> extends AbstractType<T>
 {
@@ -66,9 +66,9 @@ public boolean isEmptyValueMeaningless()
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        ByteSource src = baseType.asComparableBytes(b, version);
+        ByteSource src = baseType.asComparableBytes(accessor, data, version);
         if (src == null)    // Note: this will only compare correctly if used within a sequence
             return null;
         // Invert all bytes.
@@ -85,6 +85,12 @@ public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version
         };
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return baseType.fromComparableBytes(accessor, ReversedPeekableByteSource.of(comparableBytes), version);
+    }
+
     public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right, ValueAccessor<VR> accessorR)
     {
         return baseType.compare(right, accessorR, left, accessorL);
diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java
index cab4d55a2357..2b83afbbc05f 100644
--- a/src/java/org/apache/cassandra/db/marshal/SetType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SetType.java
@@ -30,8 +30,8 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.SetSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public class SetType<T> extends CollectionType<Set<T>>
 {
@@ -160,9 +160,15 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ListType.asComparableBytesListOrSet(getElementsType(), b, version);
+        return ListType.asComparableBytesListOrSet(getElementsType(), accessor, data, version);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ListType.fromComparableBytesListOrSet(accessor, comparableBytes, version, getElementsType());
     }
 
     public SetSerializer<T> getSerializer()
diff --git a/src/java/org/apache/cassandra/db/marshal/ShortType.java b/src/java/org/apache/cassandra/db/marshal/ShortType.java
index 83a3e054a23e..013fa959497a 100644
--- a/src/java/org/apache/cassandra/db/marshal/ShortType.java
+++ b/src/java/org/apache/cassandra/db/marshal/ShortType.java
@@ -28,8 +28,9 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class ShortType extends NumberType<Short>
 {
@@ -49,11 +50,16 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return version == ByteComparable.Version.LEGACY
-               ? ByteSource.signedFixedLengthNumber(buf)
-               : ByteSource.optionalSignedFixedLengthNumber(buf);
+        // This type does not allow non-present values, but we do just to avoid future complexity.
+        return ByteSource.optionalSignedFixedLengthNumber(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 2);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
index 0f0546af7baa..a0de2c20892e 100644
--- a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
+++ b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java
@@ -28,9 +28,10 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -41,12 +42,17 @@ public class SimpleDateType extends TemporalType<Integer>
     SimpleDateType() {super(ComparisonType.BYTE_ORDER);} // singleton
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
         // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
-        return version == Version.LEGACY
-               ? ByteSource.fixedLength(buf)
-               : ByteSource.optionalFixedLength(buf);
+        // This type does not allow non-present values, but we do just to avoid future complexity.
+        return ByteSource.optionalFixedLength(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 4);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/TimeType.java b/src/java/org/apache/cassandra/db/marshal/TimeType.java
index 58a2bdb69fa5..f029b8bb94a8 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeType.java
@@ -28,9 +28,10 @@
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteComparable.Version;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 /**
  * Nanosecond resolution time values
@@ -46,12 +47,17 @@ public ByteBuffer fromString(String source) throws MarshalException
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, Version version)
     {
         // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient.
-        return version == Version.LEGACY
-               ? ByteSource.fixedLength(buf)
-               : ByteSource.optionalFixedLength(buf);
+        // This type does not allow non-present values, but we do just to avoid future complexity.
+        return ByteSource.optionalFixedLength(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 8);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
index 64bee6c430d6..67f2d0bda9cc 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimeUUIDType.java
@@ -24,8 +24,9 @@
 import org.apache.cassandra.cql3.Constants;
 import org.apache.cassandra.cql3.Term;
 import org.apache.cassandra.serializers.TypeSerializer;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TimeUUIDSerializer;
@@ -58,12 +59,12 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
 
         long msb1 = accessorL.getLong(left, 0);
         long msb2 = accessorR.getLong(right, 0);
+        verifyVersion(msb1);
+        verifyVersion(msb2);
+
         msb1 = reorderTimestampBytes(msb1);
         msb2 = reorderTimestampBytes(msb2);
 
-        assert (msb1 & topbyte(0xf0L)) == topbyte(0x10L);
-        assert (msb2 & topbyte(0xf0L)) == topbyte(0x10L);
-
         int c = Long.compare(msb1, msb2);
         if (c != 0)
             return c;
@@ -76,21 +77,39 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        if (!b.hasRemaining())
+        if (accessor.isEmpty(data))
             return null;
 
-        int s = b.position();
-        long msb = b.getLong(s);
-        assert ((msb >>> 12) & 0xf) == 1;
+        long hiBits = accessor.getLong(data, 0);
+        verifyVersion(hiBits);
         ByteBuffer swizzled = ByteBuffer.allocate(16);
-        swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb));
-        swizzled.putLong(8, b.getLong(s + 8) ^ 0x8080808080808080L);
+        swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(hiBits));
+        swizzled.putLong(8, accessor.getLong(data, 8) ^ 0x8080808080808080L);
 
         return ByteSource.fixedLength(swizzled);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        // An empty value is encoded as a null source (see asComparableBytes).
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // The encoding is 16 bytes: the reordered high 8 bytes, then the low 8 bytes with each sign bit flipped.
+        // Both reads consume from the same source, so their order matters.
+        long encodedMsb = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8);
+        long encodedLsb = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8);
+
+        // Undo the timestamp reordering, then check that the decoded value really is a version 1 UUID.
+        long msb = reorderBackTimestampBytes(encodedMsb);
+        verifyVersion(msb);
+        // TimeUUIDType also touches the low bits of the UUID (see CASSANDRA-8730 and DB-1758); flip them back.
+        return UUIDType.makeUuidBytes(accessor, msb, encodedLsb ^ 0x8080808080808080L);
+    }
+
     // takes as input 8 signed bytes in native machine order
     // returns the first byte unchanged, and the following 7 bytes converted to an unsigned representation
     // which is the same as a 2's complement long in native format
@@ -99,16 +118,30 @@ private static long signedBytesToNativeLong(long signedBytes)
         return signedBytes ^ 0x0080808080808080L;
     }
 
-    private static long topbyte(long topbyte)
+    private void verifyVersion(long hiBits)
     {
-        return topbyte << 56;
+        long version = (hiBits >>> 12) & 0xf;
+        if (version != 1)
+            throw new MarshalException(String.format("Invalid UUID version %d for timeuuid",
+                                                     version));
     }
 
     protected static long reorderTimestampBytes(long input)
     {
-        return    (input <<  48)
-                  | ((input <<  16) & 0xFFFF00000000L)
-                  |  (input >>> 32);
+        return (input <<  48)
+               | ((input <<  16) & 0xFFFF00000000L)
+               |  (input >>> 32);
+    }
+
+    protected static long reorderBackTimestampBytes(long input)
+    {
+        // Inverse of reorderTimestampBytes: with [X] a 16-bit group, the encoded [a][b][c][d] goes back to [c][d][b][a].
+        // See the high bits layout of a Leach-Salz UUID in the UUID Javadoc for why time-based UUIDs are reordered
+        // for comparison in the first place.
+        long restoredTop = input << 32;                  // [c][d][0][0]
+        long restoredMid = (input >>> 16) & 0xFFFF0000L; // [0][0][b][0]
+        long restoredLow = input >>> 48;                 // [0][0][0][a]
+        return restoredTop | restoredMid | restoredLow;
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/TimestampType.java b/src/java/org/apache/cassandra/db/marshal/TimestampType.java
index 310eafc50354..5bca7b1f56db 100644
--- a/src/java/org/apache/cassandra/db/marshal/TimestampType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TimestampType.java
@@ -32,8 +32,9 @@
 import org.apache.cassandra.serializers.TimestampSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -63,9 +64,15 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer buf, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        return ByteSource.optionalSignedFixedLengthNumber(buf);
+        return ByteSource.optionalSignedFixedLengthNumber(accessor, data);
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 8);
     }
 
     public ByteBuffer fromString(String source) throws MarshalException
diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java
index 59f9786d33f6..c3b5ddbaac35 100644
--- a/src/java/org/apache/cassandra/db/marshal/TupleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java
@@ -30,13 +30,12 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.serializers.*;
 import org.apache.cassandra.transport.ProtocolVersion;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 import static com.google.common.collect.Iterables.any;
 import static com.google.common.collect.Iterables.transform;
@@ -196,58 +195,93 @@ private <T> boolean allRemainingComponentsAreNull(T v, ValueAccessor<T> accessor
         {
             int size = accessor.getInt(v, offset);
             offset += TypeSizes.INT_SIZE;
-            if (size > 0)
+            if (size >= 0)
                 return false;
         }
         return true;
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
     {
-        ByteBuffer[] bufs = split(byteBuffer);  // this may be shorter than types.size -- other srcs remain null in that case
+        if (accessor.isEmpty(data))
+            return null;
+
+        V[] bufs = split(accessor, data);  // this may be shorter than types.size -- other srcs remain null in that case
         ByteSource[] srcs = new ByteSource[types.size()];
         for (int i = 0; i < bufs.length; ++i)
-            srcs[i] = types.get(i).asComparableBytes(bufs[i], version);
+            srcs[i] = bufs[i] != null ? types.get(i).asComparableBytes(accessor, bufs[i], version) : null;
         // We always have a fixed number of sources, with the trailing ones possibly being nulls.
         // This can only result in a prefix if the last type in the tuple allows prefixes. Since that type is required
         // to be weakly prefix-free, so is the tuple.
         return ByteSource.withTerminator(ByteSource.END_OF_STREAM, srcs);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        // asComparableBytes encodes an empty tuple value as a null source.
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // One slot per declared component type, filled in declaration order.
+        int componentCount = types.size();
+        V[] decoded = accessor.createArray(componentCount);
+        for (int idx = 0; idx < componentCount; ++idx)
+        {
+            // A null component source is kept as a null slot in the rebuilt tuple.
+            ByteSource.Peekable part = ByteSourceInverse.nextComponentSource(comparableBytes);
+            decoded[idx] = part == null ? null : types.get(idx).fromComparableBytes(accessor, part, version);
+        }
+        return buildValue(accessor, decoded);
+    }
+
     /**
      * Split a tuple value into its component values.
      */
-    public ByteBuffer[] split(ByteBuffer value)
+    public <V> V[] split(ValueAccessor<V> accessor, V value)
     {
-        ByteBuffer[] components = new ByteBuffer[size()];
-        ByteBuffer input = value.duplicate();
+        V[] components = accessor.createArray(size());
+        int length = accessor.size(value);
+        int position = 0;
         for (int i = 0; i < size(); i++)
         {
-            if (!input.hasRemaining())
+            if (position == length)
                 return Arrays.copyOfRange(components, 0, i);
 
-            int size = input.getInt();
-
-            if (input.remaining() < size)
+            if (position + 4 > length)
                 throw new MarshalException(String.format("Not enough bytes to read %dth component", i));
 
+            int size = accessor.getInt(value, position);
+            position += 4;
+
             // size < 0 means null value
-            components[i] = size < 0 ? null : ByteBufferUtil.readBytes(input, size);
+            if (size >= 0)
+            {
+                // Compare against the remaining bytes: position + size could overflow int for a corrupt size.
+                if (size > length - position)
+                    throw new MarshalException(String.format("Not enough bytes to read %dth component", i));
+                components[i] = accessor.slice(value, position, size);
+                position += size;
+            }
+            else
+                components[i] = null;
         }
 
         // error out if we got more values in the tuple/UDT than we expected
-        if (input.hasRemaining())
+        if (position < length)
         {
-            throw new InvalidRequestException(String.format(
-            "Expected %s %s for %s column, but got more",
-            size(), size() == 1 ? "value" : "values", this.asCQL3Type()));
+            throw new MarshalException(String.format("Expected %s %s for %s column, but got more",
+                                                     size(),
+                                                     size() == 1 ? "value" : "values",
+                                                     this.asCQL3Type()));
         }
 
         return components;
     }
 
-    public static <V> V buildValue(ValueAccessor<V> accessor, V[] components)
+    @SafeVarargs
+    public static <V> V buildValue(ValueAccessor<V> accessor, V... components)
     {
         int totalLength = 0;
         for (V component : components)
@@ -271,7 +305,7 @@ public static <V> V buildValue(ValueAccessor<V> accessor, V[] components)
         return result;
     }
 
-    public static ByteBuffer buildValue(ByteBuffer[] components)
+    public static ByteBuffer buildValue(ByteBuffer... components)
     {
         return buildValue(ByteBufferAccessor.instance, components);
     }
diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
index 1ff728402ae9..7978ca469cda 100644
--- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java
@@ -30,8 +30,9 @@
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.UUIDSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.UUIDGen;
 
 /**
@@ -98,17 +99,20 @@ public <VL, VR> int compareCustom(VL left, ValueAccessor<VL> accessorL, VR right
                 return c;
         }
 
+        // Amusingly (or not so much), although UUIDType freely takes time UUIDs (UUIDs with version 1), it compares
+        // them differently than TimeUUIDType. This is evident in the least significant bytes comparison (the code
+        // below for UUIDType), where UUIDType treats them as unsigned bytes, while TimeUUIDType compares the bytes
+        // signed. See CASSANDRA-8730 for details around this discrepancy.
         return UnsignedLongs.compare(accessorL.getLong(left, 8), accessorR.getLong(right, 8));
     }
 
     @Override
-    public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version v)
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version v)
     {
-        if (!b.hasRemaining())
+        if (accessor.isEmpty(data))
             return null;
 
-        int s = b.position();
-        long msb = b.getLong(s);
+        long msb = accessor.getLong(data, 0);
         long version = ((msb >>> 12) & 0xf);
         ByteBuffer swizzled = ByteBuffer.allocate(16);
 
@@ -117,12 +121,49 @@ public ByteSource asComparableBytes(ByteBuffer b, ByteComparable.Version v)
         else
             swizzled.putLong(0, (version << 60) | ((msb >>> 4) & 0x0FFFFFFFFFFFF000L) | (msb & 0xFFFL));
 
-        swizzled.putLong(8, b.getLong(s + 8));
+        swizzled.putLong(8, accessor.getLong(data, 8));
 
         // fixed-length thus prefix-free
         return ByteSource.fixedLength(swizzled);
     }
 
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        // Optional-style encoding of empty values as null sources
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // The UUID bits are stored as an unsigned fixed-length 128-bit integer.
+        long hiBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8);
+        long loBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8);
+
+        long version1 = hiBits >>> 60 & 0xF;
+        if (version1 == 1)
+        {
+            // If the version bits are set to 1, this is a time-based UUID, and its high bits are significantly more
+            // shuffled than in other UUIDs. Revert the shuffle.
+            hiBits = TimeUUIDType.reorderBackTimestampBytes(hiBits);
+        }
+        else
+        {
+            // For non-time UUIDs, the only thing that's needed is to put the version bits back where they were originally.
+            hiBits = hiBits << 4 & 0xFFFFFFFFFFFF0000L
+                     | version1 << 12
+                     | hiBits & 0x0000000000000FFFL;
+        }
+
+        return makeUuidBytes(accessor, hiBits, loBits);
+    }
+
+    static <V> V makeUuidBytes(ValueAccessor<V> accessor, long high, long low)
+    {
+        V buffer = accessor.allocate(16);
+        accessor.putLong(buffer, 0, high);
+        accessor.putLong(buffer, 8, low);
+        return buffer;
+    }
+
     @Override
     public boolean isValueCompatibleWithInternal(AbstractType<?> otherType)
     {
diff --git a/src/java/org/apache/cassandra/db/marshal/UserType.java b/src/java/org/apache/cassandra/db/marshal/UserType.java
index 29afad9583d4..24c05e255967 100644
--- a/src/java/org/apache/cassandra/db/marshal/UserType.java
+++ b/src/java/org/apache/cassandra/db/marshal/UserType.java
@@ -258,7 +258,7 @@ public Term fromJSONObject(Object parsed) throws MarshalException
     @Override
     public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion)
     {
-        ByteBuffer[] buffers = split(buffer);
+        ByteBuffer[] buffers = split(ByteBufferAccessor.instance, buffer);
         StringBuilder sb = new StringBuilder("{");
         for (int i = 0; i < types.size(); i++)
         {
diff --git a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java
index 10532ff304bb..5fe0b3006538 100644
--- a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java
+++ b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java
@@ -66,6 +66,7 @@ public interface ObjectFactory<V>
         Cell<V> cell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTime, V value, CellPath path);
         Clustering<V> clustering(V... values);
         Clustering<V> clustering();
+        Clustering<V> staticClustering();
         ClusteringBound<V> bound(ClusteringPrefix.Kind kind, V... values);
         ClusteringBound<V> bound(ClusteringPrefix.Kind kind);
         ClusteringBoundary<V> boundary(ClusteringPrefix.Kind kind, V... values);
@@ -103,7 +104,6 @@ default ClusteringBoundary<V> exclusiveCloseInclusiveOpen(boolean reversed, V[]
         {
             return boundary(reversed ? INCL_END_EXCL_START_BOUNDARY : EXCL_END_INCL_START_BOUNDARY, boundValues);
         }
-
     }
     /**
      * @return the size of the given value
@@ -322,6 +322,12 @@ default boolean getBoolean(V value, int offset)
     /** returns a UUID from offset 0 */
     UUID toUUID(V value);
 
+    /**
+     * writes the byte value {@param value} to {@param dst} at offset {@param offset}
+     * @return the number of bytes written to {@param dst}
+     */
+    int putByte(V dst, int offset, byte value);
+
     /**
      * writes the short value {@param value} to {@param dst} at offset {@param offset}
      * @return the number of bytes written to {@param value}
diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
index 13e2d9c2f44c..2b0e2a286147 100644
--- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java
@@ -26,8 +26,9 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Hex;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -39,7 +40,6 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -231,6 +231,11 @@ public BytesToken getRandomToken(Random random)
 
     private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
     {
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            return new BytesToken(ByteSourceInverse.getUnescapedBytes(comparableBytes));
+        }
+
         public ByteBuffer toByteArray(Token token)
         {
             BytesToken bytesToken = (BytesToken) token;
diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
index fe9f12de432d..d69b8cd45493 100644
--- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java
@@ -26,9 +26,10 @@
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.CachedHashDecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.memory.HeapAllocator;
 
@@ -85,6 +86,12 @@ public Token.TokenFactory getTokenFactory()
 
     private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
     {
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            ByteBuffer tokenData = comparator.fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version);
+            return new LocalToken(tokenData);
+        }
+
         public ByteBuffer toByteArray(Token token)
         {
             return ((LocalToken)token).token;
@@ -179,7 +186,7 @@ public boolean equals(Object obj)
         @Override
         public ByteSource asComparableBytes(ByteComparable.Version version)
         {
-            return comparator.asComparableBytes(token, version);
+            return comparator.asComparableBytes(ByteBufferAccessor.instance, token, version);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
index 94ebb46cbdd7..7e41705853b7 100644
--- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
+++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
@@ -33,8 +33,9 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.MurmurHash;
 import org.apache.cassandra.utils.ObjectSizes;
 
@@ -324,6 +325,12 @@ public Token.TokenFactory getTokenFactory()
 
     private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
     {
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            long tokenData = ByteSourceInverse.getSignedLong(comparableBytes);
+            return new LongToken(tokenData);
+        }
+
         public ByteBuffer toByteArray(Token token)
         {
             LongToken longToken = (LongToken) token;
diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
index d248e0c5ee87..cc55b10de620 100644
--- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java
@@ -33,8 +33,9 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.Pair;
@@ -130,6 +131,11 @@ public StringToken getRandomToken(Random random)
 
     private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
     {
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            return new StringToken(ByteSourceInverse.getString(comparableBytes));
+        }
+
         public ByteBuffer toByteArray(Token token)
         {
             StringToken stringToken = (StringToken) token;
diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
index eb7eed8f15ad..d02cfd58adfd 100644
--- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java
+++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java
@@ -27,6 +27,8 @@
 import com.google.common.annotations.VisibleForTesting;
 
 import org.apache.cassandra.db.CachedHashDecoratedKey;
+import org.apache.cassandra.db.marshal.ByteArrayAccessor;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -34,8 +36,8 @@
 import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.GuidGenerator;
 import org.apache.cassandra.utils.ObjectSizes;
@@ -160,6 +162,11 @@ private boolean isValidToken(BigInteger token) {
 
     private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
     {
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            return fromByteArray(IntegerType.instance.fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version));
+        }
+
         public ByteBuffer toByteArray(Token token)
         {
             BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
@@ -249,7 +256,7 @@ public BigIntegerToken(String token)
         @Override
         public ByteSource asComparableBytes(ByteComparable.Version version)
         {
-            return IntegerType.instance.asComparableBytes(ByteBuffer.wrap(token.toByteArray()), version);
+            return IntegerType.instance.asComparableBytes(ByteArrayAccessor.instance, token.toByteArray(), version);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index 5dd3904a2e35..ec28dabfc357 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java
@@ -26,8 +26,8 @@
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public abstract class Token implements RingPosition<Token>, Serializable
 {
@@ -39,8 +39,38 @@ public static abstract class TokenFactory
     {
         public abstract ByteBuffer toByteArray(Token token);
         public abstract Token fromByteArray(ByteBuffer bytes);
+
+        /**
+         * Produce a weakly prefix-free byte-comparable representation of the token, i.e. such a sequence of bytes that any
+         * pair x, y of valid tokens of this type and any bytes b1, b2 between 0x10 and 0xEF,
+         * (+ stands for concatenation)
+         *   compare(x, y) == compareLexicographicallyUnsigned(asComparableBytes(x)+b1, asComparableBytes(y)+b2)
+         * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and:
+         *   asComparableBytes(x)+b1 is not a prefix of asComparableBytes(y)      (weakly prefix free)
+         * (i.e. a valid representation of a value may be a prefix of another valid representation of a value only if the
+         * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if
+         * the encoding compares correctly and is prefix free, but also permits a little more freedom that enables somewhat
+         * more efficient encoding of arbitrary-length byte-comparable blobs.
+         */
+        public ByteSource asComparableBytes(Token token, ByteComparable.Version version)
+        {
+            return token.asComparableBytes(version);
+        }
+
+        /**
+         * Translates the given byte-comparable representation to a token instance. If the given bytes don't correspond
+         * to the encoding of an instance of the expected token type, an {@link IllegalArgumentException} may be thrown.
+         *
+         * @param comparableBytes A byte-comparable representation (presumably of a token of some expected token type).
+         * @return A new {@link Token} instance, corresponding to the given byte-ordered representation. If we were
+         * to call {@link #asComparableBytes(ByteComparable.Version)} on the returned object, we should get a
+         * {@link ByteSource} equal to the input one as a result.
+         */
+        public abstract Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version);
+
         public abstract String toString(Token token); // serialize as string, not necessarily human-readable
         public abstract Token fromString(String string); // deserialize
+
         public abstract void validate(String token) throws ConfigurationException;
 
         public void serialize(Token token, DataOutputPlus out) throws IOException
@@ -144,7 +174,7 @@ public boolean isMinimum()
 
     /*
      * A token corresponds to the range of all the keys having this token.
-     * A token is thus no comparable directly to a key. But to be able to select
+     * A token is thus not comparable directly to a key. But to be able to select
      * keys given tokens, we introduce two "fake" keys for each token T:
      *   - lowerBoundKey: a "fake" key representing the lower bound T represents.
      *                    In other words, lowerBoundKey is the smallest key that
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
index 30499075a277..40d5d6fa9544 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
@@ -48,6 +48,7 @@ public SSTableIterator(SSTableReader sstable,
         super(sstable, file, key, indexEntry, slices, columns, ifile);
     }
 
+    @SuppressWarnings("resource") // caller to close
     protected RowReader createReaderInternal(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
     {
         return indexEntry.isIndexed()
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
index 0ceb269e9267..64f4325f981e 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
@@ -52,6 +52,7 @@ public SSTableReversedIterator(SSTableReader sstable,
         super(sstable, file, key, indexEntry, slices, columns, ifile);
     }
 
+    @SuppressWarnings("resource") // caller to close
     protected Reader createReaderInternal(BigTableRowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
     {
         return indexEntry.isIndexed()
diff --git a/src/java/org/apache/cassandra/serializers/BooleanSerializer.java b/src/java/org/apache/cassandra/serializers/BooleanSerializer.java
index d372a2ad7736..403e6b75b0f4 100644
--- a/src/java/org/apache/cassandra/serializers/BooleanSerializer.java
+++ b/src/java/org/apache/cassandra/serializers/BooleanSerializer.java
@@ -24,8 +24,8 @@
 
 public class BooleanSerializer extends TypeSerializer<Boolean>
 {
-    private static final ByteBuffer TRUE = ByteBuffer.wrap(new byte[] {1});
-    private static final ByteBuffer FALSE = ByteBuffer.wrap(new byte[] {0});
+    public static final ByteBuffer TRUE = ByteBuffer.wrap(new byte[] {1});
+    public static final ByteBuffer FALSE = ByteBuffer.wrap(new byte[] {0});
 
     public static final BooleanSerializer instance = new BooleanSerializer();
 
diff --git a/src/java/org/apache/cassandra/utils/ByteComparable.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
similarity index 94%
rename from src/java/org/apache/cassandra/utils/ByteComparable.java
rename to src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
index 05e53e682460..a27999566472 100644
--- a/src/java/org/apache/cassandra/utils/ByteComparable.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
@@ -16,12 +16,10 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.bytecomparable;
 
 import java.nio.ByteBuffer;
 
-import static org.apache.cassandra.utils.ByteSource.END_OF_STREAM;
-
 /**
  * Interface indicating a value can be represented/identified by a comparable {@link ByteSource}.
  */
@@ -54,7 +52,7 @@ default String byteComparableAsString(Version version)
         ByteSource stream = asComparableBytes(version);
         if (stream == null)
             return "null";
-        for (int b = stream.next(); b != END_OF_STREAM; b = stream.next())
+        for (int b = stream.next(); b != ByteSource.END_OF_STREAM; b = stream.next())
             builder.append(Integer.toHexString((b >> 4) & 0xF)).append(Integer.toHexString(b & 0xF));
         return builder.toString();
     }
@@ -119,7 +117,7 @@ static int length(ByteComparable src, Version version)
     {
         int l = 0;
         ByteSource s = src.asComparableBytes(version);
-        while (s.next() != END_OF_STREAM)
+        while (s.next() != ByteSource.END_OF_STREAM)
             ++l;
         return l;
     }
@@ -159,7 +157,7 @@ static int diffPoint(ByteComparable bytes1, ByteComparable bytes2, Version versi
         ByteSource s2 = bytes2.asComparableBytes(version);
         int pos = 1;
         int b;
-        while ((b = s1.next()) == s2.next() && b != END_OF_STREAM)
+        while ((b = s1.next()) == s2.next() && b != ByteSource.END_OF_STREAM)
             ++pos;
         return pos;
     }
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md
new file mode 100644
index 000000000000..a6732a5fece2
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md
@@ -0,0 +1,590 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Byte-comparable translation of types (ByteComparable/ByteSource)
+
+## Problem / Motivation
+
+Cassandra has a very heavy reliance on comparisons — they are used throughout read and write paths, coordination,
+compaction, etc. to be able to order and merge results. It also supports a range of types which often require the 
+compared object to be completely in memory to order correctly, which in turn has necessitated interfaces where 
+comparisons can only be applied if the compared objects are completely loaded.
+
+This has some rather painful implications on the performance of the database, both in terms of the time it takes to load,
+compare and garbage collect, as well as in terms of the space required to hold complete keys in on-disk indices and
+deserialized versions in in-memory data structures. In addition to this, the reliance on comparisons forces Cassandra to
+use only comparison-based structures, which aren’t the most efficient.
+
+There is no way to escape the need to compare and order objects in Cassandra, but the machinery for doing this can be
+made much smarter if we impose some simple structure on the objects we deal with — byte ordering.
+
+The term “byte order” as used in this document refers to the property of being ordered via lexicographic compare on the
+unsigned values of the byte contents. Some of the types in Cassandra already have this property (e.g. strings, blobs),
+but others, including some of the most heavily used ones (e.g. integers, uuids), don’t.
+
+When byte order is universally available for the types used for keys, several key advantages can be put to use:
+- Comparisons can be done using a single simple method, core machinery doesn’t need to know anything about types.
+- Prefix differences are enough to define order; unique prefixes can be used instead of complete keys.
+- Tries can be used to store, query and iterate over ranges of keys, providing fast lookup and prefix compression.
+- Merging can be performed by merging tries, significantly reducing the number of necessary comparisons.
+
+## Ordering the types
+
+As we want to keep all existing functionality in Cassandra, we need to be able to deal with existing
+non-byte-order-comparable types. This requires some form of conversion of each value to a sequence of bytes that can be 
+byte-order compared (also called "byte-comparable"), as well as the inverse conversion from byte-comparable to value.
+
+As one of the main advantages of byte order is the ability to decide comparisons early, without having to read the whole
+of the input sequence, byte-ordered interpretations of values are represented as sources of bytes with unknown length, 
+using the interface `ByteSource`. The interface declares one method, `next()` which produces the next byte of the
+stream, or `ByteSource.END_OF_STREAM` if the stream is exhausted.
+
+`END_OF_STREAM` is chosen as `-1` (`(int) -1`, which is outside the range of possible byte values), to make comparing 
+two byte sources as trivial (and thus fast) as possible.
+  
+To be able to completely abstract type information away from the storage machinery, we also flatten complex types into
+single byte sequences. To do this, we add separator bytes in front, between components, and at the end and do some 
+encoding of variable-length sequences.
+
+The other interface provided by this package, `ByteComparable`, is an entity whose byte-ordered interpretation can be
+requested. The interface is implemented by `DecoratedKey`, and can be requested for clustering keys and bounds using
+`ClusteringComparator.asByteComparable`. The inverse translation is provided by 
+`Buffer/NativeDecoratedKey.fromByteComparable` and `ClusteringComparator.clustering/bound/boundaryFromByteComparable`.
+
+The (rather technical) paragraphs below detail the encoding we have chosen for the various types. For simplicity we
+only discuss the bidirectional `OSS41` version of the translation. The implementations in code of the various mappings
+are in the relevant `AbstractType` subclass.
+
+### Desired properties
+
+Generally, we desire the following two properties from the byte-ordered translations of values we use in the database:
+- Comparison equivalence (1):  
+    <math xmlns="http://www.w3.org/1998/Math/MathML">
+      <semantics>
+        <mstyle displaystyle="true">
+          <mo>&#x2200;</mo>
+          <mi>x</mi>
+          <mo>,</mo>
+          <mi>y</mi>
+          <mo>&#x2208;</mo>
+          <mi>T</mi>
+          <mo>,</mo>
+          <mrow>
+            <mtext>compareBytesUnsigned</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>T</mi>
+            <mo>.</mo>
+            <mrow>
+              <mtext>byteOrdered</mtext>
+            </mrow>
+            <mrow>
+              <mo>(</mo>
+              <mi>x</mi>
+              <mo>)</mo>
+            </mrow>
+            <mo>,</mo>
+            <mi>T</mi>
+            <mo>.</mo>
+            <mrow>
+              <mtext>byteOrdered</mtext>
+            </mrow>
+            <mrow>
+              <mo>(</mo>
+              <mi>y</mi>
+              <mo>)</mo>
+            </mrow>
+            <mo>)</mo>
+          </mrow>
+          <mo>=</mo>
+          <mi>T</mi>
+          <mo>.</mo>
+          <mrow>
+            <mtext>compare</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>x</mi>
+            <mo>,</mo>
+            <mi>y</mi>
+            <mo>)</mo>
+          </mrow>
+        </mstyle>
+        <!-- <annotation encoding="text/x-asciimath">forall x,y in T, "compareBytesUnsigned"(T."byteOrdered"(x), T."byteOrdered"(y))=T."compare"(x, y)</annotation> -->
+      </semantics>
+    </math>
+- Prefix-freedom (2):  
+    <math xmlns="http://www.w3.org/1998/Math/MathML">
+      <semantics>
+        <mstyle displaystyle="true">
+          <mo>&#x2200;</mo>
+          <mi>x</mi>
+          <mo>,</mo>
+          <mi>y</mi>
+          <mo>&#x2208;</mo>
+          <mi>T</mi>
+          <mo>,</mo>
+          <mi>T</mi>
+          <mo>.</mo>
+          <mrow>
+            <mtext>byteOrdered</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>x</mi>
+            <mo>)</mo>
+          </mrow>
+          <mrow>
+            <mspace width="1ex" />
+            <mtext> is not a prefix of </mtext>
+            <mspace width="1ex" />
+          </mrow>
+          <mi>T</mi>
+          <mo>.</mo>
+          <mrow>
+            <mtext>byteOrdered</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>y</mi>
+            <mo>)</mo>
+          </mrow>
+        </mstyle>
+        <!-- <annotation encoding="text/x-asciimath">forall x,y in T, T."byteOrdered"(x) " is not a prefix of " T."byteOrdered"(y)</annotation> -->
+      </semantics>
+    </math>
+   
+The former is the essential requirement, and the latter allows construction of encodings of sequences of multiple
+values, as well as a little more efficiency in the data structures.
+
+To more efficiently encode byte-ordered blobs, however, we use a slightly tweaked version of the above requirements:
+
+- Comparison equivalence (3):  
+    <math xmlns="http://www.w3.org/1998/Math/MathML">
+      <semantics>
+        <mstyle displaystyle="true">
+          <mo>&#x2200;</mo>
+          <mi>x</mi>
+          <mo>,</mo>
+          <mi>y</mi>
+          <mo>&#x2208;</mo>
+          <mi>T</mi>
+          <mo>,</mo>
+          <mo>&#x2200;</mo>
+          <msub>
+            <mi>b</mi>
+            <mn>1</mn>
+          </msub>
+          <mo>,</mo>
+          <msub>
+            <mi>b</mi>
+            <mn>2</mn>
+          </msub>
+          <mo>&#x2208;</mo>
+          <mrow>
+            <mo>[</mo>
+            <mn>0x10</mn>
+            <mo>-</mo>
+            <mn>0xEF</mn>
+            <mo>]</mo>
+          </mrow>
+          <mo>,</mo>
+            <mtext></mtext>
+          <mrow>
+            <mtext>compareBytesUnsigned</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>T</mi>
+            <mo>.</mo>
+            <mrow>
+              <mtext>byteOrdered</mtext>
+            </mrow>
+            <mrow>
+              <mo>(</mo>
+              <mi>x</mi>
+              <mo>)</mo>
+            </mrow>
+            <mo>+</mo>
+            <msub>
+              <mi>b</mi>
+              <mn>1</mn>
+            </msub>
+            <mo>,</mo>
+            <mi>T</mi>
+            <mo>.</mo>
+            <mrow>
+              <mtext>byteOrdered</mtext>
+            </mrow>
+            <mrow>
+              <mo>(</mo>
+              <mi>y</mi>
+              <mo>)</mo>
+            </mrow>
+            <mo>+</mo>
+            <msub>
+              <mi>b</mi>
+              <mn>2</mn>
+            </msub>
+            <mo>)</mo>
+          </mrow>
+          <mo>=</mo>
+          <mi>T</mi>
+          <mo>.</mo>
+          <mrow>
+            <mtext>compare</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>x</mi>
+            <mo>,</mo>
+            <mi>y</mi>
+            <mo>)</mo>
+          </mrow>
+        </mstyle>
+        <!-- <annotation encoding="text/x-asciimath">forall x,y in T, forall b_1, b_2 in [0x10-0xEF],
+    "compareBytesUnsigned"(T."byteOrdered"(x)+b_1, T."byteOrdered"(y)+b_2)=T."compare"(x, y)</annotation> -->
+      </semantics>
+    </math>
+- Weak prefix-freedom (4):  
+    <math xmlns="http://www.w3.org/1998/Math/MathML">
+      <semantics>
+        <mstyle displaystyle="true">
+          <mo>&#x2200;</mo>
+          <mi>x</mi>
+          <mo>,</mo>
+          <mi>y</mi>
+          <mo>&#x2208;</mo>
+          <mi>T</mi>
+          <mo>,</mo>
+          <mo>&#x2200;</mo>
+          <mi>b</mi>
+          <mo>&#x2208;</mo>
+          <mrow>
+            <mo>[</mo>
+            <mn>0x10</mn>
+            <mo>-</mo>
+            <mn>0xEF</mn>
+            <mo>]</mo>
+          </mrow>
+          <mo>,</mo>
+            <mtext></mtext>
+          <mrow>
+            <mo>(</mo>
+            <mi>T</mi>
+            <mo>.</mo>
+            <mrow>
+              <mtext>byteOrdered</mtext>
+            </mrow>
+            <mrow>
+              <mo>(</mo>
+              <mi>x</mi>
+              <mo>)</mo>
+            </mrow>
+            <mo>+</mo>
+            <mi>b</mi>
+            <mo>)</mo>
+          </mrow>
+          <mrow>
+            <mspace width="1ex" />
+            <mtext> is not a prefix of </mtext>
+            <mspace width="1ex" />
+          </mrow>
+          <mi>T</mi>
+          <mo>.</mo>
+          <mrow>
+            <mtext>byteOrdered</mtext>
+          </mrow>
+          <mrow>
+            <mo>(</mo>
+            <mi>y</mi>
+            <mo>)</mo>
+          </mrow>
+        </mstyle>
+        <!-- <annotation encoding="text/x-asciimath">forall x,y in T, forall b in [0x10-0xEF],
+    (T."byteOrdered"(x)+b) " is not a prefix of " T."byteOrdered"(y)</annotation> -->
+      </semantics>
+    </math>
+
+These versions allow the addition of a separator byte after each value, and guarantee that the combination with 
+separator fulfills the original requirements. (3) is somewhat stronger than (1) but is necessarily true if (2) is also 
+in force, while (4) trivially follows from (2).
+
+## Fixed length unsigned integers (Murmur token, date/time)
+
+This is the trivial case, as we can simply use the input bytes in big-endian order. The comparison result is the same, 
+and fixed length values are trivially prefix free, i.e. (1) and (2) are satisfied, and thus (3) and (4) follow from the
+observation above.
+
+## Fixed-length signed integers (byte, short, int, bigint)
+
+As above, but we need to invert the sign bit of the number to put negative numbers before positives. This maps 
+`MIN_VALUE` to `0x00…`, `-1` to `0x7F…`, `0` to `0x80…`, and `MAX_VALUE` to `0xFF…`; comparing the resulting number 
+as an unsigned integer has the same effect as comparing the source signed.
+
+Examples:
+
+|Type and value|bytes|encodes as|
+|--------------|-----|----------|
+|int 1         |00 00 00 01|             80 00 00 01
+|short -1      |FF FF      |             7F FF
+|byte 0        |00         |             80
+|int MAX_VALUE |7F FF FF FF|             FF FF FF FF
+|long MIN_VALUE|80 00 00 00 00 00 00 00| 00 00 00 00 00 00 00 00
+
+## Fixed-size floating-point numbers (float, double)
+
+IEEE-754 was designed with byte-by-byte comparisons in mind, and provides an important guarantee about the bytes of a
+floating point number:  
+* If x and y are of the same sign, bytes(x) ≥ bytes(y) ⇔ |x| ≥ |y|.
+
+Thus, to be able to order floating point numbers as unsigned integers, we can:
+* Flip the sign bit so negatives are smaller than positive numbers.
+* If the number was negative, also flip all the other bits so larger magnitudes become smaller integers.
+
+This matches exactly the behaviour of `Double.compare`, which deliberately differs from numerical comparison (it orders 
+`-0.0` before `+0.0` and `NaN` above `+Inf`; see spec) in order to define a total order over the floating point numbers.
+
+Examples:
+
+|Type and value|bytes|encodes as|
+|---|---|---|
+|float +1.0|            3F 80 00 00|               BF 80 00 00|
+|float +0.0|            00 00 00 00|               80 00 00 00|
+|float -0.0|            80 00 00 00|               7F FF FF FF|
+|float -1.0|            BF 80 00 00|               40 7F FF FF|
+|double +1.0|           3F F0 00 00 00 00 00 00|   BF F0 00 00 00 00 00 00|
+|double +Inf|           7F F0 00 00 00 00 00 00|   FF F0 00 00 00 00 00 00|
+|double -Inf|           FF F0 00 00 00 00 00 00|   00 0F FF FF FF FF FF FF|
+|double NaN|            7F F8 00 00 00 00 00 00|   FF F8 00 00 00 00 00 00|
+
+## UUIDs
+UUIDs are fixed-length unsigned integers, where the UUID version/type is compared first, and where bits need to be 
+reordered for the time UUIDs. To create a byte-ordered representation, we reorder the bytes: pull the version digit 
+first, then the rest of the digits, using the special time order if the version is equal to one.
+
+Examples:
+
+|Type and value|bytes|encodes as|
+|---|---|---|
+|Random (v4)|    cc520882-9507-44fb-8fc9-b349ecdee658 |    4cc52088295074fb8fc9b349ecdee658
+|Time (v1)  |    2a92d750-d8dc-11e6-a2de-cf8ecd4cf053 |    11e6d8dc2a92d750a2decf8ecd4cf053
+
+## Multi-component sequences (Partition or Clustering keys, tuples), bounds and nulls
+
+As mentioned above, we encode sequences by adding separator bytes in front, between components, and a terminator at the
+end. The values we chose for the separator and terminator are `0x40` and `0x38`, and they serve several purposes:
+- Permits partially specified bounds, with strict/exclusive or non-strict/inclusive semantics. This is done by finishing
+  a bound with a terminator value that is smaller/greater than the separator and terminator. We can use `0x20` for </≥
+  and `0x60` for ≤/>.
+- Permits encoding of `null` values. We use `0x3F` as the separator in this case, followed by no value bytes. This is 
+  always smaller than a sequence with non-null value for this component, but not smaller than a sequence that ends in
+  this component.
+- Helps identify the ending of variable-length components (see below).
+
+Examples:
+
+|Types and values|bytes|encodes as|
+|---|---|---|
+|(short 1, float 1.0)    |    00 01, 3F 80 00 00    |   40·80 01·40·BF 80 00 00·38
+|(short -1, null)        |    FF FF, —              |   40·7F FF·3F·38
+|≥ (short 0, float -Inf) |    00 00, FF 80 00 00, >=|   40·80 00·40·00 7F FF FF·20
+|< (short MIN)           |    80 00, <              |   40·00 00·20
+|\> (null)               |                          |   3F·60
+|BOTTOM                  |                          |   20
+|TOP                     |                          |   60
+
+(The middle dot · doesn't exist in the encoding, it’s just a visualisation of the boundaries in the examples.)
+
+Since:
+- all separators in use are within `0x10`-`0xEF`,
+- we use the same separator for internal components, with the exception of nulls which we encode with a smaller 
+  separator, and
+- the sequence has a fixed number of components or we use a different trailing value whenever it can be shorter,
+
+the properties (3) and (4) guarantee that the byte comparison of the encoding goes in the same direction as the
+lexicographical comparison of the sequence. In combination with the third point above, (4) also ensures that no encoding 
+is a prefix of another. The sequence encoding thus satisfies (1) and (2), and hence (3) and (4) as well.
+
+Note that this means that the encodings of all partition and clustering keys used in the database will be prefix-free.
+
+## Variable-length byte comparables (ASCII, UTF-8 strings, blobs, InetAddress)
+
+In isolation, these can be compared directly without reinterpretation. However, once we place these inside a flattened
+sequence of values we need to clearly define the boundaries between values while maintaining order. To do this we use an
+end-of-value marker; since shorter values must be smaller than longer, this marker must be 0 and we need to find a way 
+to encode/escape actual 0s in the input sequence.
+
+The method we chose for this is the following:
+- If the input does not end on `00`, a `00` byte is appended at the end.
+- If the input contains a `00` byte, it is encoded as `00 FF`.
+- If the input contains a sequence of *n* `00` bytes, they are encoded as `00` `FE` (*n*-1 times) `FF`  
+  (so that we don’t double the size of `00` blobs).
+- If the input ends in `00`, the last `FF` is changed to `FE`  
+  (to ensure it’s smaller than the same value with `00` appended).
+
+Examples:
+
+|bytes/sequence|encodes as|
+|---|----|
+|22 00                |        22 00 FE
+|22 00 00 33          |        22 00 FE FF 33 00
+|22 00 11             |        22 00 FF 11 00
+|(blob 22, short 0)   |        40·22 00·40·80 00·38
+| ≥ (blob 22 00)      |        40·22 00 FE·20
+| ≤ (blob 22 00 00)   |        40·22 00 FE FE·60
+
+Within the encoding, a `00` byte can only be followed by a `FE` or `FF` byte, and hence if an encoding is a prefix of 
+another, the latter has to have a `FE` or `FF` as the next byte, which ensures both (4) (adding `10`-`EF` to the former 
+makes it no longer a prefix of the latter) and (3) (adding `10`-`EF` to the former makes it smaller than the latter; in
+this case the original value of the former is a prefix of the original value of the latter).
+
+## Variable-length integers (varint, RandomPartitioner token)
+
+If integers of unbounded length are guaranteed to start with a non-zero digit, to compare them we can first use a signed
+length, as numbers with longer representations have higher magnitudes. Only if the lengths match we need to compare the
+sequence of digits, which now has a known length.
+
+(Note: The meaning of “digit” here is not the same as “decimal digit”. We operate with numbers stored as bytes, thus it
+makes most sense to treat the numbers as encoded in base-256, where each digit is a byte.)
+
+This translates to the following encoding of varints:
+- Strip any leading zeros. Note that for negative numbers, `BigInteger` encodes leading 0 as `0xFF`.
+- If the length is 128 or greater, lead with a byte of `0xFF` (positive) or `0x00` (negative) for every 128 until
+  fewer than 128 bytes are left.
+- Encode the sign and (remaining) length of the number as a byte:
+  - `0x80 + (length - 1)` for positive numbers (so that greater magnitude is higher);
+  - `0x7F - (length - 1)` for negative numbers (so that greater magnitude is lower, and all negatives are lower than
+    positives).
+- Paste the bytes of the number, 2’s complement encoded for negative numbers (`BigInteger` already applies the 2’s
+  complement).
+
+Since when comparing two numbers we either have a difference in the length prefix, or the lengths are the same if we 
+need to compare the content bytes, there is no risk that a longer number can be confused with a shorter combined in a
+multi-component sequence. In other words, no value can be a prefix of another, thus we have (1) and (2) and thus (3) and (4)
+as well.
+
+Examples:
+
+|value|bytes|encodes as|
+|---:|---|---|
+|0      |    00              | 80·00
+|1      |    01              | 80·01
+|-1     |    FF              | 7F·FF
+|255    |    00 FF           | 80·FF
+|-256   |    FF 00           | 7F·00
+|2^16   |    01 00 00        | 82·01 00 00
+|-2^32  |    FF 00 00 00 00  | 7C·00 00 00 00
+|2^1024 |    01 00(128 times)| FF 80·01 00(128 times)
+|-2^2048|    FF 00(256 times)| 00 00 80·00(256 times)
+
+(Middle dot · shows the transition point between length and digits.)
+
+## Variable-length floating-point decimals (decimal)
+
+Variable-length floats are more complicated, but we can treat them similarly to IEEE-754 floating point numbers, by
+normalizing them by splitting them into sign, mantissa and signed exponent such that the mantissa is a number below 1 
+with a non-zero leading digit. We can then compare sign, exponent and mantissa in sequence (where the comparison of
+exponent and mantissa are with reversed meaning if the sign is negative) and that gives us the decimal ordering.
+
+A bit of extra care must be exercised when encoding decimals. Since fractions like `0.1` cannot be perfectly encoded in
+binary, decimals (and mantissas) cannot be encoded in binary or base-256 correctly. A decimal base must be used; since 
+we deal with bytes, it makes most sense to make things a little more efficient by using base-100. Floating-point 
+encoding and the comparison idea from the previous paragraph work in any number base.
+
+`BigDecimal` presents a further challenge, as it encodes decimals using a mixture of bases: numbers have a binary-
+encoded integer part and a decimal power-of-ten scale. The bytes produced by a `BigDecimal` are thus not suitable for 
+direct conversion to byte comparable and we must first instantiate the bytes as a `BigDecimal`, and then apply the 
+class’s methods to operate on it as a number.
+
+We then use the following encoding:
+- If the number is 0, the encoding is a single `0x80` byte.
+- Convert the input to signed mantissa and signed exponent in base-100. If the value is negative, invert the sign of the
+  exponent.
+- Output a byte encoding:
+  - the sign of the number encoded as `0x80` if positive and `0x00` if negative,
+  - the exponent length (stripping leading 0s) in bytes as `0x40 + exponent_length * exponent_sign`.
+- Output `exponent_length` bytes of exponent, 2’s complement encoded so that negative values are correctly ordered.
+- Output `0x80 + leading signed byte of mantissa`, which is obtained by multiplying the mantissa by 100 and rounding to
+  -∞. The rounding is done so that the remainder of the mantissa becomes positive, and thus every new byte adds some 
+  value to it, making shorter sequences lower in value.
+- Update the mantissa to be the remainder after the rounding above. The result is guaranteed to be 0 or greater.
+- While the mantissa is non-zero, output `0x80 + leading byte` as above and update the mantissa to be the remainder.
+- Output `0x00`.
+
+As a description of how this produces the correct ordering, consider the result of comparison in the first differing 
+byte:
+- Difference in the first byte can be caused by:
+  - Difference in sign of the number or being zero, which yields the correct ordering because
+    - Negative numbers start with `0x3c` - `0x44`
+    - Zero starts with `0x80`
+    - Positive numbers start with `0xbc` - `0xc4`
+  - Difference in sign of the exponent modulated with the sign of the number. In a positive number negative exponents 
+    mean smaller values, while in a negative number it’s the opposite, thus the modulation with the number’s sign 
+    ensures the correct ordering. 
+  - Difference in modulated length of the exponent: again, since we gave the length a sign that is formed from both 
+    the sign of the exponent and the sign of the number, smaller numbers mean smaller exponent in the positive number 
+    case, and bigger exponent in the negative number case. In either case this provides the correct ordering.
+- Difference in one of the bytes of the modulated exponent (whose length and sign are now equal for both compared
+  numbers):
+  - Smaller byte means a smaller modulated exponent. In the positive case this means a smaller exponent, thus a smaller 
+    number. In the negative case this means the exponent is bigger, the absolute value of the number as well, and thus 
+    the number is smaller.
+- It is not possible for the difference to mix one number’s exponent with another’s mantissa (as such numbers would have
+  different leading bytes).
+- Difference in a mantissa byte present in both inputs:
+  - Smaller byte means smaller signed mantissa and hence smaller number when the exponents are equal.
+- One mantissa ending before another:
+  - This will result in the shorter being treated as smaller (since the trailing byte is `00`).
+  - Since all mantissas have at least one byte, this can’t happen in the leading mantissa byte.
+  - Thus the other number’s bytes from here on are not negative, and at least one of them must be non-zero, which means 
+    its mantissa is bigger and thus it encodes a bigger number.
+    
+Examples:
+
+|value|mexp|mantissa|mantissa in bytes|encodes as|
+|---:|---:|---|---|---|
+|1.1        | 1    | 0.0110 |.  01 10  |    C1·01·81 8A·00
+|1          | 1    | 0.01   |.  01     |    C1·01·81·00
+|0.01       | 0    | 0.01   |.  01     |    C0·81·00
+|0          |      |        |          |    80
+|-0.01      | 0    | -0.01  |. -01     |    40·81·00
+|-1         | -1   | -0.01  |. -01     |    3F·FF·7F·00
+|-1.1       | -1   | -0.0110|. -02 90  |    3F·FF·7E DA·00
+|-98.9      | -1   | -0.9890|. -99 10  |    3F·FF·1D 8A·00
+|-99        | -1   | -0.99  |. -99     |    3F·FF·1D·00
+|-99.9      | -1   | -0.9990|.-100 10  |    3F·FF·1C 8A·00
+|-8.1e2000  | -1001| -0.0810|. -09 90  |    3E·FC 17·77 DA·00
+|-8.1e-2000 | 999  | -0.0810|. -09 90  |    42·03 E7·77 DA·00
+|8.1e-2000  | -999 | 0.0810 |.  08 10  |    BE·FC 19·88 8A·00
+|8.1e2000   | 1001 | 0.0810 |.  08 10  |    C2·03 E9·88 8A·00
+(mexp stands for “modulated exponent”, i.e. exponent * sign)
+
+The values are prefix-free, because no exponent’s encoding can be a prefix of another, and the mantissas can never have
+a `00` byte at any place other than the last byte, and thus all (1)-(4) are satisfied.
+
+## Reversed types
+
+Reversing a type is straightforward: flip all bits of the encoded byte sequence. Since the source type encoding must
+satisfy (3) and (4), the flipped bits also do for the reversed comparator. (It is also true that if the source type 
+satisfies (1)-(2), the reversed will satisfy these too.)
+
+In a sequence we also must correct the `null` encoding for a reversed type (since it must be greater than all values).
+Instead of `0x3F` we use `0x41` as the separator byte.
+
diff --git a/src/java/org/apache/cassandra/utils/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
similarity index 79%
rename from src/java/org/apache/cassandra/utils/ByteSource.java
rename to src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
index 6326861b28f4..4a541776d952 100644
--- a/src/java/org/apache/cassandra/utils/ByteSource.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
@@ -15,12 +15,14 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.bytecomparable;
 
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 
-import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.db.marshal.ValueAccessor;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.memory.MemoryUtil;
 
 import static com.google.common.base.Preconditions.checkArgument;
 
@@ -62,6 +64,19 @@ public interface ByteSource
     int LT_NEXT_COMPONENT = 0x20;
     int GT_NEXT_COMPONENT = 0x60;
 
+    // Special value for components that should be excluded from the normal min/max span. (static rows)
+    int EXCLUDED = 0x18;
+
+    /**
+     * Reinterprets an accessor-backed value as a byte-comparable source that has 0s escaped and finishes in an escape.
+     * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences.
+     * (See ByteSource.BufferReinterpreter/Multi for explanation.)
+     */
+    static <V> ByteSource of(ValueAccessor<V> accessor, V data, Version version)
+    {
+        return new AccessorReinterpreter<>(accessor, data, version);
+    }
+
     /**
      * Reinterprets a byte buffer as a byte-comparable source that has 0s escaped and finishes in an escape.
      * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences.
@@ -82,6 +97,16 @@ static ByteSource of(byte[] buf, Version version)
         return new ReinterpreterArray(buf, version);
     }
 
+    /**
+     * Reinterprets a memory range as a byte-comparable source that has 0s escaped and finishes in an escape.
+     * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences.
+     * (See ByteSource.BufferReinterpreter/Multi for explanation.)
+     */
+    static ByteSource of(long address, int length, ByteComparable.Version version)
+    {
+        return new MemoryReinterpreter(address, length, version);
+    }
+
     /**
      * Combines a chain of sources, turning their weak-prefix-free byte-comparable representation into the combination's
      * prefix-free byte-comparable representation, with the included terminator character.
@@ -114,9 +139,9 @@ static ByteSource of(int value)
      * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and
      * ensures the representation is prefix-free.
      */
-    static ByteSource optionalSignedFixedLengthNumber(ByteBuffer b)
+    static <V> ByteSource optionalSignedFixedLengthNumber(ValueAccessor<V> accessor, V data)
     {
-        return b.hasRemaining() ? signedFixedLengthNumber(b) : null;
+        return !accessor.isEmpty(data) ? signedFixedLengthNumber(accessor, data) : null;
     }
 
     /**
@@ -124,9 +149,9 @@ static ByteSource optionalSignedFixedLengthNumber(ByteBuffer b)
      * The first byte has its sign bit inverted, and the rest are passed unchanged.
      * Presumes that the length of the buffer is always constant for the type.
      */
-    static ByteSource signedFixedLengthNumber(ByteBuffer b)
+    static <V> ByteSource signedFixedLengthNumber(ValueAccessor<V> accessor, V data)
     {
-        return new SignedFixedLengthNumber(b);
+        return new SignedFixedLengthNumber<>(accessor, data);
     }
 
     /**
@@ -136,9 +161,9 @@ static ByteSource signedFixedLengthNumber(ByteBuffer b)
      * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and
      * ensures the representation is prefix-free.
      */
-    static ByteSource optionalSignedFixedLengthFloat(ByteBuffer b)
+    static <V> ByteSource optionalSignedFixedLengthFloat(ValueAccessor<V> accessor, V data)
     {
-        return b.hasRemaining() ? signedFixedLengthFloat(b) : null;
+        return !accessor.isEmpty(data) ? signedFixedLengthFloat(accessor, data) : null;
     }
 
     /**
@@ -147,9 +172,9 @@ static ByteSource optionalSignedFixedLengthFloat(ByteBuffer b)
      * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.)
      * Presumes that the length of the buffer is always constant for the type.
      */
-    static ByteSource signedFixedLengthFloat(ByteBuffer b)
+    static <V> ByteSource signedFixedLengthFloat(ValueAccessor<V> accessor, V data)
     {
-        return new SignedFixedLengthFloat(b);
+        return new SignedFixedLengthFloat<>(accessor, data);
     }
 
     /**
@@ -235,31 +260,6 @@ public int next()
     }
 
 
-    static ByteSource MAX = new ByteSource()
-    {
-        public int next()
-        {
-            return 0xFF;
-        }
-
-        public String toString()
-        {
-            return "MAX";
-        }
-    };
-
-    /**
-     * Returns a maximal ByteSource, i.e. something that compares greater to any other byte source.
-     * This is an infinite sequence of 0xFF.
-     *
-     * Note that since the sequence is infinite, trying to calculate this item's length, copying it, trying
-     * to store it in a trie, or comparing it to another max will result in an infinite loop.
-     */
-    public static ByteSource max()
-    {
-        return MAX;
-    }
-
     /**
      * Variable-length encoding. Escapes 0s as ESCAPE + zero or more ESCAPED_0_CONT + ESCAPED_0_DONE.
      * Finishes with an escape value (to which Multi will add non-zero component separator)
@@ -276,7 +276,7 @@ public static ByteSource max()
      * prefix-free. Additionally, any such prefix sequence will compare smaller than the value to which it is a prefix,
      * because any permitted separator byte will be smaller than the byte following the prefix.
      */
-    static abstract class AbstractReinterpreter implements ByteSource
+    abstract static class AbstractReinterpreter implements ByteSource
     {
         final Version version;
         int bufpos;
@@ -329,6 +329,29 @@ public final int next()
         protected abstract int limit();
     }
 
+    static class AccessorReinterpreter<V> extends AbstractReinterpreter
+    {
+        private final V data;
+        private final ValueAccessor<V> accessor;
+
+        private AccessorReinterpreter(ValueAccessor<V> accessor, V data, Version version)
+        {
+            super(0, version);
+            this.accessor = accessor;
+            this.data = data;
+        }
+
+        protected int limit()
+        {
+            return accessor.size(data);
+        }
+
+        protected byte get(int index)
+        {
+            return accessor.getByte(data, index);
+        }
+    }
+
     static class BufferReinterpreter extends AbstractReinterpreter
     {
         final ByteBuffer buf;
@@ -373,27 +396,52 @@ protected int limit()
         }
     }
 
+    static class MemoryReinterpreter extends AbstractReinterpreter
+    {
+        final long address;
+        final int length;
+
+        MemoryReinterpreter(long address, int length, ByteComparable.Version version)
+        {
+            super(0, version);
+            this.address = address;
+            this.length = length;
+        }
+
+        protected byte get(int index)
+        {
+            return MemoryUtil.getByte(address + index);
+        }
+
+        protected int limit()
+        {
+            return length;
+        }
+    }
+
     /**
      * Fixed length signed number encoding. Inverts first bit (so that neg < pos), then just posts all bytes from the
      * buffer. Assumes buffer is of correct length.
      */
-    static class SignedFixedLengthNumber implements ByteSource
+    static class SignedFixedLengthNumber<V> implements ByteSource
     {
-        ByteBuffer buf;
+        final ValueAccessor<V> accessor;
+        final V data;
         int bufpos;
 
-        public SignedFixedLengthNumber(ByteBuffer buf)
+        public SignedFixedLengthNumber(ValueAccessor<V> accessor, V data)
         {
-            this.buf = buf;
-            bufpos = buf.position();
+            this.accessor = accessor;
+            this.data = data;
+            this.bufpos = 0;
         }
 
         public int next()
         {
-            if (bufpos >= buf.limit())
+            if (bufpos >= accessor.size(data))
                 return END_OF_STREAM;
-            int v = buf.get(bufpos) & 0xFF;
-            if (bufpos == buf.position())
+            int v = accessor.getByte(data, bufpos) & 0xFF;
+            if (bufpos == 0)
                 v ^= 0x80;
             ++bufpos;
             return v;
@@ -423,24 +471,26 @@ public int next()
      * Fixed length signed floating point number encoding. First bit is sign. If positive, add sign bit value to make
      * greater than all negatives. If not, invert all content to make negatives with bigger magnitude smaller.
      */
-    static class SignedFixedLengthFloat implements ByteSource
+    static class SignedFixedLengthFloat<V> implements ByteSource
     {
-        final ByteBuffer buf;
+        final ValueAccessor<V> accessor;
+        final V data;
         int bufpos;
         boolean invert;
 
-        public SignedFixedLengthFloat(ByteBuffer buf)
+        public SignedFixedLengthFloat(ValueAccessor<V> accessor, V data)
         {
-            this.buf = buf;
-            this.bufpos = buf.position();
+            this.accessor = accessor;
+            this.data = data;
+            this.bufpos = 0;
         }
 
         public int next()
         {
-            if (bufpos >= buf.limit())
+            if (bufpos >= accessor.size(data))
                 return END_OF_STREAM;
-            int v = buf.get(bufpos) & 0xFF;
-            if (bufpos == buf.position())
+            int v = accessor.getByte(data, bufpos) & 0xFF;
+            if (bufpos == 0)
             {
                 invert = v >= 0x80;
                 v |= 0x80;
@@ -459,12 +509,12 @@ static class Multi implements ByteSource
     {
         final ByteSource[] srcs;
         int srcnum = -1;
-        int terminator;
+        int sequenceTerminator;
 
-        Multi(ByteSource[] srcs, int terminator)
+        Multi(ByteSource[] srcs, int sequenceTerminator)
         {
             this.srcs = srcs;
-            this.terminator = terminator;
+            this.sequenceTerminator = sequenceTerminator;
         }
 
         public int next()
@@ -480,7 +530,7 @@ public int next()
 
             ++srcnum;
             if (srcnum == srcs.length)
-                return terminator;
+                return sequenceTerminator;
             if (srcs[srcnum] == null)
                 return NEXT_COMPONENT_NULL;
             return NEXT_COMPONENT;
@@ -522,9 +572,9 @@ public int next()
         }
     }
 
-    static ByteSource optionalFixedLength(ByteBuffer b)
+    static <V> ByteSource optionalFixedLength(ValueAccessor<V> accessor, V data)
     {
-        return b.hasRemaining() ? fixedLength(b) : null;
+        return !accessor.isEmpty(data) ? fixedLength(accessor, data) : null;
     }
 
     /**
@@ -533,122 +583,64 @@ static ByteSource optionalFixedLength(ByteBuffer b)
      * underlying type has a fixed length.
      * In tests, this method is also used to generate non-escaped test cases.
      */
-    public static ByteSource fixedLength(ByteBuffer b)
+    public static <V> ByteSource fixedLength(ValueAccessor<V> accessor, V data)
     {
         return new ByteSource()
         {
-            int pos = b.position() - 1;
+            int pos = -1;
 
             @Override
             public int next()
             {
-                return ++pos < b.limit() ? b.get(pos) & 0xFF : -1;
+                return ++pos < accessor.size(data) ? accessor.getByte(data, pos) & 0xFF : -1;
             }
         };
     }
 
     /**
      * A byte source of the given bytes without any encoding.
-     * If used in a sequence, the resulting source is only guaranteed to give correct comparison results if the
+     * The resulting source is only guaranteed to give correct comparison results and be prefix-free if the
      * underlying type has a fixed length.
      * In tests, this method is also used to generate non-escaped test cases.
      */
-    public static ByteSource fixedLength(byte[] b)
-    {
-        return fixedLength(b, 0, b.length);
-    }
-
-    public static ByteSource fixedLength(byte[] b, int offset, int length)
-    {
-        checkArgument(offset >= 0 && offset <= b.length);
-        checkArgument(length >= 0 && offset + length <= b.length);
-
-        return new ByteSource()
-        {
-            int pos = offset - 1;
-
-            @Override
-            public int next()
-            {
-                return ++pos < offset + length ? b[pos] & 0xFF : END_OF_STREAM;
-            }
-        };
-    }
-
-    public static ByteSource fourBit(ByteSource s)
+    public static ByteSource fixedLength(ByteBuffer b)
     {
         return new ByteSource()
         {
-            int pos = 0;
-            int v = 0;
+            int pos = b.position() - 1;
 
             @Override
             public int next()
             {
-                if ((pos++ & 1) == 0)
-                {
-                    v = s.next();
-                    if (v == END_OF_STREAM)
-                        return END_OF_STREAM;
-                    return (v >> 4) & 0xF;
-                }
-                else
-                    return v & 0xF;
+                return ++pos < b.limit() ? b.get(pos) & 0xFF : -1;
             }
         };
     }
 
     /**
-     * Splits each byte into portions of bitCount bits.
-     * @param s source
-     * @param bitCount number of bits to issue at a time, 1-4 make sense
+     * A byte source of the given bytes without any encoding.
+     * If used in a sequence, the resulting source is only guaranteed to give correct comparison results if the
+     * underlying type has a fixed length.
+     * In tests, this method is also used to generate non-escaped test cases.
      */
-    public static ByteSource splitBytes(ByteSource s, int bitCount)
+    public static ByteSource fixedLength(byte[] b)
     {
-        return new ByteSource()
-        {
-            int pos = 8;
-            int v = 0;
-            int mask = (1 << bitCount) - 1;
-
-            @Override
-            public int next()
-            {
-                if ((pos += bitCount) >= 8)
-                {
-                    pos = 0;
-                    v = s.next();
-                    if (v == END_OF_STREAM)
-                        return END_OF_STREAM;
-                }
-                v <<= bitCount;
-                return (v >> 8) & mask;
-            }
-        };
+        return fixedLength(b, 0, b.length);
     }
 
-    /**
-     * Returns the key that is immediately after src in the topology.
-     * @param src
-     * @return src with added 00 byte at the end
-     */
-    public static ByteSource nextKey(ByteSource src)
+    public static ByteSource fixedLength(byte[] b, int offset, int length)
     {
+        checkArgument(offset >= 0 && offset <= b.length);
+        checkArgument(length >= 0 && offset + length <= b.length);
+
         return new ByteSource()
         {
-            boolean done = false;
+            int pos = offset - 1;
 
             @Override
             public int next()
             {
-                if (done)
-                    return END_OF_STREAM;
-                int n = src.next();
-                if (n != END_OF_STREAM)
-                    return n;
-
-                done = true;
-                return 0;
+                return ++pos < offset + length ? b[pos] & 0xFF : END_OF_STREAM;
             }
         };
     }
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java
new file mode 100644
index 000000000000..3cb0c5888c88
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java
@@ -0,0 +1,448 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import org.apache.cassandra.db.marshal.ValueAccessor;
+
+/**
+ * Contains inverse transformation utilities for {@link ByteSource}s.
+ *
+ * See ByteComparable.md for details about the encoding scheme.
+ */
+public final class ByteSourceInverse
+{
+    private static final int INITIAL_BUFFER_CAPACITY = 32;
+    private static final int BYTE_ALL_BITS = 0xFF;
+    private static final int BYTE_NO_BITS = 0x00;
+    private static final int BYTE_SIGN_BIT = 1 << 7;
+    private static final int SHORT_SIGN_BIT = 1 << 15;
+    private static final int INT_SIGN_BIT = 1 << 31;
+    private static final long LONG_SIGN_BIT = 1L << 63;
+
+    /**
+     * Get the given number of bytes and produce a long from them, effectively treating the bytes as a big-endian
+     * unsigned encoding of the number.
+     */
+    public static long getUnsignedFixedLengthAsLong(ByteSource byteSource, int length)
+    {
+        if (byteSource == null)
+            throw new IllegalArgumentException("Unexpected null ByteSource");
+        if (length < 1 || length > 8)
+            throw new IllegalArgumentException("Between 1 and 8 bytes can be read at a time");
+
+        long result = 0;
+        for (int i = 0; i < length; ++i)
+        {
+            int data = byteSource.next();
+            if (data == ByteSource.END_OF_STREAM)
+                throw new IllegalArgumentException(
+                        String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", i, length));
+            assertValidByte(data);
+            result = (result << 8) | data;
+        }
+        return result;
+    }
+
+    /**
+     * Produce the bytes for an encoded signed fixed-length number.
+     * The first byte has its sign bit inverted, and the rest are passed unchanged.
+     * @throws IllegalArgumentException if the source is null, length is below 1, or the source ends too early.
+     */
+    public static <V> V getSignedFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        if (byteSource == null)
+            throw new IllegalArgumentException("Unexpected null ByteSource");
+        if (length < 1)
+            throw new IllegalArgumentException("At least 1 byte should be read");
+
+        V result = accessor.allocate(length);
+        for (int i = 0; i < length; ++i)
+        {
+            int data = byteSource.next();
+            // Checked for every byte, the first included, so an exhausted source is never decoded as data.
+            if (data == ByteSource.END_OF_STREAM)
+                throw new IllegalArgumentException(
+                        String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", i, length));
+            assertValidByte(data);
+            // The first byte needs to have its sign flipped, the rest are stored unchanged.
+            accessor.putByte(result, i, (byte) (i == 0 ? data ^ BYTE_SIGN_BIT : data));
+        }
+        return result;
+    }
+
+    /**
+     * Produce the bytes for an encoded signed fixed-length number, also translating null to empty buffer.
+     * The first byte has its sign bit inverted, and the rest are passed unchanged.
+     */
+    public static <V> V getOptionalSignedFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        return byteSource == null ? accessor.empty() : getSignedFixedLength(accessor, byteSource, length);
+    }
+
+    /**
+     * Produce the bytes for an encoded signed fixed-length floating-point number.
+     * If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged.
+     * @param byteSource the source to read from; must not be null
+     * @param length the number of bytes to read from the source; must be at least 1
+     * @return a value of {@code length} bytes allocated through the given accessor
+     * @throws IllegalArgumentException if the source is null, length is below 1, or the source ends too early.
+     */
+    public static <V> V getSignedFixedLengthFloat(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        if (byteSource == null)
+            throw new IllegalArgumentException("Unexpected null ByteSource");
+        if (length < 1)
+            throw new IllegalArgumentException("At least 1 byte should be read");
+
+        V result = accessor.allocate(length);
+
+        int first = byteSource.next();
+        // An assert is a no-op when assertions are disabled, so an exhausted source is rejected explicitly.
+        if (first == ByteSource.END_OF_STREAM)
+            throw new IllegalArgumentException(
+                    String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", 0, length));
+        assertValidByte(first);
+
+        // A leading byte below 0x80 is a negative number: invert all bits, here and in the remaining bytes.
+        // Otherwise the number is positive: invert only the sign bit and leave the remaining bytes as they are.
+        boolean negative = first < 0x80;
+        int xor = negative ? BYTE_ALL_BITS : BYTE_NO_BITS;
+        accessor.putByte(result, 0, (byte) (first ^ (negative ? BYTE_ALL_BITS : BYTE_SIGN_BIT)));
+
+        // xor is now applied to the rest of the bytes to flip their bits if necessary.
+        for (int i = 1; i < length; ++i)
+        {
+            int data = byteSource.next();
+            if (data == ByteSource.END_OF_STREAM)
+                throw new IllegalArgumentException(
+                        String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", i, length));
+            assertValidByte(data);
+            data ^= xor;
+            accessor.putByte(result, i, (byte) data);
+        }
+        return result;
+    }
+
+    /**
+     * Produce the bytes for an encoded signed fixed-length floating-point number, also translating null to an empty
+     * buffer.
+     * If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged.
+     */
+    public static <V> V getOptionalSignedFixedLengthFloat(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        return byteSource == null ? accessor.empty() : getSignedFixedLengthFloat(accessor, byteSource, length);
+    }
+
+    /**
+     * Get the next length bytes from the source unchanged.
+     */
+    public static <V> V getFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        if (byteSource == null)
+            throw new IllegalArgumentException("Unexpected null ByteSource");
+        if (length < 1)
+            throw new IllegalArgumentException("At least 1 byte should be read");
+
+        V result = accessor.allocate(length);
+        for (int i = 0; i < length; ++i)
+        {
+            int data = byteSource.next();
+            if (data == ByteSource.END_OF_STREAM)
+                throw new IllegalArgumentException(
+                        String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", i, length));
+            assertValidByte(data);
+            accessor.putByte(result, i, (byte) data);
+        }
+        return result;
+    }
+
+    /**
+     * Get the next length bytes from the source unchanged, also translating null to an empty buffer.
+     */
+    public static <V> V getOptionalFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        return byteSource == null ? accessor.empty() : getFixedLength(accessor, byteSource, length);
+    }
+
+    /**
+     * Gets the next {@code int} from the current position of the given {@link ByteSource}. The source position is
+     * modified accordingly (moved 4 bytes forward).
+     * <p>
+     * The source is not strictly required to represent just the encoding of an {@code int} value, so theoretically
+     * this API could be used for reading data in 4-byte strides. Nevertheless its usage is fairly limited because:
+     * <ol>
+     *     <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
+     *     <li>...it decodes the data returned on each stride as an {@code int} (i.e. it inverts its leading bit)</li>
+     *     <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
+     *     bytes to read, in case a special escape value was not interpreted as such, etc.</li>
+     * </ol>
+     * </p>
+     *
+     * @param byteSource A non-null byte source, containing at least 4 bytes.
+     */
+    public static int getSignedInt(ByteSource byteSource)
+    {
+        return (int) getUnsignedFixedLengthAsLong(byteSource, 4) ^ INT_SIGN_BIT;
+    }
+
+    /**
+     * Gets the next {@code long} from the current position of the given {@link ByteSource}. The source position is
+     * modified accordingly (moved 8 bytes forward).
+     * <p>
+     * The source is not strictly required to represent just the encoding of a {@code long} value, so theoretically
+     * this API could be used for reading data in 8-byte strides. Nevertheless its usage is fairly limited because:
+     * <ol>
+     *     <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
+     *     <li>...it decodes the data returned on each stride as a {@code long} (i.e. it inverts its leading bit)</li>
+     *     <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
+     *     bytes to read, in case a special escape value was not interpreted as such, etc.</li>
+     * </ol>
+     * </p>
+     *
+     * @param byteSource A non-null byte source, containing at least 8 bytes.
+     */
+    public static long getSignedLong(ByteSource byteSource)
+    {
+        return getUnsignedFixedLengthAsLong(byteSource, 8) ^ LONG_SIGN_BIT;
+    }
+
+    /**
+     * Converts the given {@link ByteSource} to a {@code byte}.
+     *
+     * @param byteSource A non-null byte source, containing at least 1 byte.
+     */
+    public static byte getSignedByte(ByteSource byteSource)
+    {
+        if (byteSource == null)
+            throw new IllegalArgumentException("Unexpected null ByteSource");
+        int theByte = byteSource.next();
+        if (theByte == ByteSource.END_OF_STREAM)
+            throw new IllegalArgumentException("Unexpected ByteSource with length 0 instead of 1");
+
+        return (byte) (theByte ^ BYTE_SIGN_BIT);
+    }
+
+    /**
+     * Converts the given {@link ByteSource} to a {@code short}. All terms and conditions valid for
+     * {@link #getSignedInt(ByteSource)} and {@link #getSignedLong(ByteSource)} translate to this as well.
+     *
+     * @param byteSource A non-null byte source, containing at least 2 bytes.
+     *
+     * @see #getSignedInt(ByteSource)
+     * @see #getSignedLong(ByteSource)
+     */
+    public static short getSignedShort(ByteSource byteSource)
+    {
+        return (short) (getUnsignedFixedLengthAsLong(byteSource, 2) ^ SHORT_SIGN_BIT);
+    }
+
+    /**
+     * Reads a single variable-length byte sequence (blob, string, ...) encoded according to the scheme described
+     * in ByteComparable.md, decoding it back to its original, unescaped form.
+     *
+     * @param byteSource The source of the variable-length bytes sequence.
+     * @return A byte array containing the original, unescaped bytes of the given source. Unescaped here means
+     * not including any of the escape sequences of the encoding scheme used for variable-length byte sequences.
+     */
+    public static byte[] getUnescapedBytes(ByteSource.Peekable byteSource)
+    {
+        return byteSource == null ? null : readBytes(unescape(byteSource));
+    }
+
+    /**
+     * As above, but converts the result to a ByteSource.
+     */
+    public static ByteSource unescape(ByteSource.Peekable byteSource)
+    {
+        return new ByteSource() {
+            boolean escaped = false;
+
+            public int next()
+            {
+                if (!escaped)
+                {
+                    int data = byteSource.next(); // we consume this byte no matter what it is
+                    if (data > ByteSource.ESCAPE)
+                        return data;        // most used path leads here
+
+                    assert data != ByteSource.END_OF_STREAM : "Invalid escaped byte sequence";
+                    escaped = true;
+                }
+
+                int next = byteSource.peek();
+                switch (next)
+                {
+                    case END_OF_STREAM:
+                        // The end of a byte-comparable outside of a multi-component sequence. No matter what we have
+                        // seen or peeked before, we should stop now.
+                        byteSource.next();
+                        return END_OF_STREAM;
+                    case ESCAPED_0_DONE:
+                        // The end of 1 or more consecutive 0x00 value bytes.
+                        escaped = false;
+                        byteSource.next();
+                        return ESCAPE;
+                    case ESCAPED_0_CONT:
+                        // Escaped sequence continues
+                        byteSource.next();
+                        return ESCAPE;
+                    default:
+                        // An ESCAPE or ESCAPED_0_CONT won't be followed by either another ESCAPED_0_CONT, an
+                        // ESCAPED_0_DONE, or an END_OF_STREAM only when the byte-comparable is part of a multi-component
+                        // sequence and we have reached the end of the encoded byte-comparable. In this case, the byte
+                        // we have just peeked is the separator or terminator byte between or at the end of components
+                        // (which by contract must be 0x10 - 0xFE, which cannot conflict with our special bytes).
+                        assert next >= ByteSource.MIN_SEPARATOR && next <= ByteSource.MAX_SEPARATOR : next;
+                        // Unlike above, we don't consume this byte (the sequence decoding needs it).
+                        return END_OF_STREAM;
+                }
+            }
+        };
+    }
+
+    /**
+     * Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
+     * them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
+     * bytes, <strong>excluding the {@link ByteSource#END_OF_STREAM}</strong>.
+     * <p>
+     * This method sizes a tentative internal buffer array at {@code initialBufferCapacity}.  However, if
+     * {@code byteSource} exceeds this size, the buffer array is recreated with doubled capacity as many times as
+     * necessary.  If, after {@code byteSource} is fully exhausted, the number of bytes read from it does not exactly
+     * match the current size of the tentative buffer array, then it is copied into another array sized to fit the
+     * number of bytes read; otherwise, it is returned without that final copy step.
+     *
+     * @param byteSource The source which bytes we're interested in.
+     * @param initialBufferCapacity The initial size of the internal buffer.
+     * @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte
+     * array will be empty.
+     */
+    public static byte[] readBytes(ByteSource byteSource, final int initialBufferCapacity)
+    {
+        if (byteSource == null)
+            return new byte[0];
+
+        int readBytes = 0;
+        byte[] buf = new byte[initialBufferCapacity];
+        int data;
+        while ((data = byteSource.next()) != ByteSource.END_OF_STREAM)
+        {
+            buf = ensureCapacity(buf, readBytes);
+            buf[readBytes++] = (byte) data;
+        }
+
+        if (readBytes != buf.length)
+        {
+            buf = Arrays.copyOf(buf, readBytes);
+        }
+        return buf;
+    }
+
+    /**
+     * Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
+     * them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
+     * bytes, <strong>excluding the {@link ByteSource#END_OF_STREAM}</strong>.
+     * <p>
+     * This is equivalent to {@link #readBytes(ByteSource, int)} where the second actual parameter is
+     * {@linkplain #INITIAL_BUFFER_CAPACITY} ({@value INITIAL_BUFFER_CAPACITY}).
+     *
+     * @param byteSource The source which bytes we're interested in.
+     * @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte
+     * array will be empty.
+     */
+    public static byte[] readBytes(ByteSource byteSource)
+    {
+        return readBytes(byteSource, INITIAL_BUFFER_CAPACITY);
+    }
+
+    /**
+     * Ensures the given buffer has capacity for taking data with the given length - if it doesn't, it returns a copy
+     * of the buffer, but with double the capacity (or with capacity 1 if the buffer was empty).
+     */
+    private static byte[] ensureCapacity(byte[] buf, int dataLengthInBytes)
+    {
+        if (dataLengthInBytes == buf.length)
+            // Doubling an empty buffer would leave it empty, so a zero-length buffer grows to one byte instead.
+            // We won't gain much with guarding against overflow. We'll overflow when dataLengthInBytes >= 1 << 30,
+            // and if we do guard, we'll be able to extend the capacity to Integer.MAX_VALUE (which is (1 << 31) - 1).
+            // Controlling the exception that will be thrown shouldn't matter that much, and we almost surely won't
+            // be reading gigabytes of ByteSource data at once.
+            return Arrays.copyOf(buf, dataLengthInBytes == 0 ? 1 : dataLengthInBytes * 2);
+        else
+            return buf;
+    }
+
+    /**
+     * Converts the given {@link ByteSource} to a UTF-8 {@link String}.
+     *
+     * @param byteSource The source we're interested in.
+     * @return A UTF-8 string corresponding to the given source.
+     */
+    public static String getString(ByteSource.Peekable byteSource)
+    {
+        if (byteSource == null)
+            return null;
+
+        byte[] data = getUnescapedBytes(byteSource);
+
+        return new String(data, StandardCharsets.UTF_8);
+    }
+
+    /*
+     * Multi-component sequence utilities.
+     */
+
+    /**
+     * A utility for consuming components from a peekable multi-component sequence.
+     * It uses the component separators, so the given sequence needs to have its last component fully consumed, in
+     * order for the next consumable byte to be a separator. Identifying the end of the component that will then be
+     * consumed is the responsibility of the consumer (the user of this method).
+     * @param source A peekable multi-component sequence, which next byte is a component separator.
+     * @return the given multi-component sequence if its next component is not null, or {@code null} if it is.
+     */
+    public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source)
+    {
+        int separator = source.next();
+        return nextComponentNull(separator)
+               ? null
+               : source;
+    }
+
+    /**
+     * A utility for consuming components from a peekable multi-component sequence, very similar to
+     * {@link #nextComponentSource(ByteSource.Peekable)} - the difference being that here the separator can be passed
+     * in case it had to be consumed beforehand.
+     */
+    public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source, int separator)
+    {
+        return nextComponentNull(separator)
+               ? null
+               : source;
+    }
+
+    public static boolean nextComponentNull(int separator)
+    {
+        return separator == ByteSource.NEXT_COMPONENT_NULL || separator == ByteSource.NEXT_COMPONENT_NULL_REVERSED;
+    }
+
+    private static void assertValidByte(int data)
+    {
+        assert data >= BYTE_NO_BITS && data <= BYTE_ALL_BITS;
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java b/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java
new file mode 100644
index 000000000000..47b3e08b5385
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java
@@ -0,0 +1,140 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.function.BiFunction;
+
+import net.nicoulaj.compilecommand.annotations.Inline;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 5, time = 2)
+@Fork(value = 1,jvmArgsAppend = { "-Xmx4G", "-Xms4G", "-Djmh.executor=CUSTOM", "-Djmh.executor.class=org.apache.cassandra.test.microbench.FastThreadExecutor"})
+@Threads(1)
+@State(Scope.Benchmark)
+public class AbstractTypeByteSourceDecodingBench
+{
+
+    private static final ByteComparable.Version LATEST = ByteComparable.Version.OSS41;
+
+    private static final Map<AbstractType, BiFunction<Random, Integer, ByteSource.Peekable>> PEEKABLE_GENERATOR_BY_TYPE = new HashMap<>();
+    static
+    {
+        PEEKABLE_GENERATOR_BY_TYPE.put(UTF8Type.instance, (prng, length) ->
+        {
+            byte[] randomBytes = new byte[length];
+            prng.nextBytes(randomBytes);
+            return ByteSource.peekable(ByteSource.of(new String(randomBytes, StandardCharsets.UTF_8), LATEST));
+        });
+        PEEKABLE_GENERATOR_BY_TYPE.put(BytesType.instance, (prng, length) ->
+        {
+            byte[] randomBytes = new byte[length];
+            prng.nextBytes(randomBytes);
+            return ByteSource.peekable(ByteSource.of(randomBytes, LATEST));
+        });
+        PEEKABLE_GENERATOR_BY_TYPE.put(IntegerType.instance, (prng, length) ->
+        {
+            BigInteger randomVarint = BigInteger.valueOf(prng.nextLong());
+            for (int i = 1; i < length / 8; ++i)
+                randomVarint = randomVarint.multiply(BigInteger.valueOf(prng.nextLong()));
+            return ByteSource.peekable(IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(randomVarint), LATEST));
+        });
+        PEEKABLE_GENERATOR_BY_TYPE.put(DecimalType.instance, (prng, length) ->
+        {
+            BigInteger randomMantissa = BigInteger.valueOf(prng.nextLong());
+            for (int i = 1; i < length / 8; ++i)
+                randomMantissa = randomMantissa.multiply(BigInteger.valueOf(prng.nextLong()));
+            int randomScale = prng.nextInt(Integer.MAX_VALUE >> 1) + Integer.MAX_VALUE >> 1; // NOTE(review): parses as (nextInt(..) + MAX_VALUE) >> 1 and the sum can overflow int - confirm intended range
+            BigDecimal randomDecimal = new BigDecimal(randomMantissa, randomScale);
+            return ByteSource.peekable(DecimalType.instance.asComparableBytes(DecimalType.instance.decompose(randomDecimal), LATEST));
+        });
+    }
+
+    private Random prng = new Random();
+
+    @Param({"32", "128", "512"})
+    private int length;
+
+    @Param({"UTF8Type", "BytesType", "IntegerType", "DecimalType"})
+    private String abstractTypeName;
+
+    private AbstractType abstractType;
+    private BiFunction<Random, Integer, ByteSource.Peekable> peekableGenerator;
+
+    @Setup(Level.Trial)
+    public void setup()
+    {
+        abstractType = TypeParser.parse(abstractTypeName);
+        peekableGenerator = PEEKABLE_GENERATOR_BY_TYPE.get(abstractType);
+    }
+
+    @Inline
+    private ByteSource.Peekable randomPeekableBytes()
+    {
+        return peekableGenerator.apply(prng, length);
+    }
+
+    @Benchmark
+    public int baseline()
+    {
+        // Getting the source is not enough as its content is produced on next() calls.
+        ByteSource.Peekable source = randomPeekableBytes();
+        int count = 0;
+        while (source.next() != ByteSource.END_OF_STREAM)
+            ++count;
+        return count;
+    }
+
+    @Benchmark
+    public ByteBuffer fromComparableBytes()
+    {
+        ByteSource.Peekable peekableBytes = randomPeekableBytes();
+        return abstractType.fromComparableBytes(peekableBytes, ByteComparable.Version.OSS41);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java
index f9ef4cc253f2..4989cb8b2a40 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java
@@ -34,6 +34,7 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.SchemaCQLHelper;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.db.marshal.TupleType;
 import org.apache.cassandra.utils.AbstractTypeGenerators.TypeSupport;
 import org.quicktheories.core.Gen;
@@ -267,7 +268,7 @@ public void tuplePartitionReadWrite()
             for (ByteBuffer value : testcase.uniqueRows)
             {
                 map.put(value, count);
-                ByteBuffer[] tupleBuffers = tupleType.split(value);
+                ByteBuffer[] tupleBuffers = tupleType.split(ByteBufferAccessor.instance, value);
 
                 // use cast to avoid warning
                 execute("INSERT INTO %s (id, value) VALUES (?, ?)", tuple((Object[]) tupleBuffers), count);
@@ -306,7 +307,7 @@ private void tupleCkReadWrite(Order order)
             for (ByteBuffer value : testcase.uniqueRows)
             {
                 map.put(value, count);
-                ByteBuffer[] tupleBuffers = tupleType.split(value);
+                ByteBuffer[] tupleBuffers = tupleType.split(ByteBufferAccessor.instance, value);
 
                 // use cast to avoid warning
                 execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", 1, tuple((Object[]) tupleBuffers), count);
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java
index e39dd3517394..79e1648af2a6 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java
@@ -198,6 +198,56 @@ public void testAlterUDT() throws Throwable
         );
     }
 
+    @Test
+    public void testNullsInIntUDT() throws Throwable
+    {
+        String myType = KEYSPACE + '.' + createType("CREATE TYPE %s (a int)");
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)");
+        execute("INSERT INTO %s (a, b) VALUES (1, ?)", userType("a", 1));
+
+        assertRows(execute("SELECT b.a FROM %s"), row(1));
+
+        flush();
+
+        schemaChange("ALTER TYPE " + myType + " ADD b int");
+        execute("INSERT INTO %s (a, b) VALUES (2, {a: 2, b: 2})");
+        execute("INSERT INTO %s (a, b) VALUES (3, {b: 3})");
+        execute("INSERT INTO %s (a, b) VALUES (4, {a: null, b: 4})");
+
+        beforeAndAfterFlush(() ->
+                            assertRows(execute("SELECT b.a, b.b FROM %s"),
+                                       row(1, null),
+                                       row(2, 2),
+                                       row(null, 3),
+                                       row(null, 4))
+        );
+    }
+
+    @Test
+    public void testNullsInTextUDT() throws Throwable
+    {
+        String myType = KEYSPACE + '.' + createType("CREATE TYPE %s (a text)");
+        createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)");
+        execute("INSERT INTO %s (a, b) VALUES (1, {a: ''})");
+
+        assertRows(execute("SELECT b.a FROM %s"), row(""));
+
+        flush();
+
+        schemaChange("ALTER TYPE " + myType + " ADD b text");
+        execute("INSERT INTO %s (a, b) VALUES (2, {a: '', b: ''})");
+        execute("INSERT INTO %s (a, b) VALUES (3, {b: ''})");
+        execute("INSERT INTO %s (a, b) VALUES (4, {a: null, b: ''})");
+
+        beforeAndAfterFlush(() ->
+                            assertRows(execute("SELECT b.a, b.b FROM %s"),
+                                       row("", null),
+                                       row("", ""),
+                                       row(null, ""),
+                                       row(null, ""))
+        );
+    }
+
     @Test
     public void testAlterNonFrozenUDT() throws Throwable
     {
diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java
index 7c0c86309589..3dc37af1fdce 100644
--- a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java
+++ b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java
@@ -203,7 +203,7 @@ private static void buildAndSplit(Gen<? extends TupleType> baseGen)
         qt().forAll(tupleWithValueGen(baseGen)).checkAssert(pair -> {
             TupleType tuple = pair.left;
             ByteBuffer value = pair.right;
-            Assertions.assertThat(TupleType.buildValue(tuple.split(value)))
+            Assertions.assertThat(TupleType.buildValue(tuple.split(ByteBufferAccessor.instance, value)))
                       .as("TupleType.buildValue(split(value)) == value")
                       .isEqualTo(value);
         });
diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
index 2881ab96c9ce..c24690b8bf7a 100644
--- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
+++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
@@ -37,8 +37,8 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.ByteComparable;
-import org.apache.cassandra.utils.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
diff --git a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
index bd6f3d4e0eae..57cc23830c6e 100644
--- a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
+++ b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
@@ -34,6 +34,8 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 public class LengthPartitioner implements IPartitioner
 {
@@ -95,6 +97,11 @@ public Token fromByteArray(ByteBuffer bytes)
             return new BigIntegerToken(new BigInteger(ByteBufferUtil.getArray(bytes)));
         }
 
+        public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+        {
+            return fromByteArray(IntegerType.instance.fromComparableBytes(comparableBytes, version));
+        }
+
         public String toString(Token token)
         {
             BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index da76070b6f0a..75523e1587cd 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -238,7 +238,7 @@ public void udtSerDeserTest(ProtocolVersion version) throws Exception
 
         ByteBuffer serialized = t.bindAndGet(options);
 
-        ByteBuffer[] fields = udt.split(serialized);
+        ByteBuffer[] fields = udt.split(ByteBufferAccessor.instance, serialized);
 
         assertEquals(4, fields.length);
 
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java
new file mode 100644
index 000000000000..3a8960a3a464
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java
@@ -0,0 +1,1018 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.BiFunction;
+import java.util.function.Consumer;
+import java.util.function.Supplier;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.marshal.*;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.cql3.Duration;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LengthPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.serializers.SimpleDateSerializer;
+import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
+@RunWith(Parameterized.class)
+public class AbstractTypeByteSourceTest
+{
+    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()";
+
+    @Parameterized.Parameters(name = "version={0}")
+    public static Iterable<ByteComparable.Version> versions()
+    {
+        return ImmutableList.of(ByteComparable.Version.OSS41);
+    }
+
+    private final ByteComparable.Version version;
+
+    public AbstractTypeByteSourceTest(ByteComparable.Version version)
+    {
+        this.version = version;
+    }
+
+    private <T> void testValuesForType(AbstractType<T> type, T... values)
+    {
+        testValuesForType(type, Arrays.asList(values));
+    }
+
+    private <T> void testValuesForType(AbstractType<T> type, List<T> values)
+    {
+        for (T initial : values)
+            decodeAndAssertEquals(type, initial);
+        if (IntegerType.instance.equals(type))
+            // IntegerType tests go through A LOT of values, and the pairwise comparison below is quadratic in
+            // their number, so (short of sampling a bounded subset) we skip the comparison checks for this type.
+            return;
+        for (int i = 0; i < values.size(); ++i)
+        {
+            for (int j = i + 1; j < values.size(); ++j)
+            {
+                ByteBuffer left = type.decompose(values.get(i));
+                ByteBuffer right = type.decompose(values.get(j));
+                int compareBuffers = Integer.signum(type.compare(left, right));
+                ByteSource leftSource = type.asComparableBytes(left.duplicate(), version);
+                ByteSource rightSource = type.asComparableBytes(right.duplicate(), version);
+                int compareBytes = Integer.signum(ByteComparable.compare(v -> leftSource, v -> rightSource, version));
+                Assert.assertEquals(compareBuffers, compareBytes);
+            }
+        }
+    }
+
+    private <T> void testValuesForType(AbstractType<T> type, Stream<T> values)
+    {
+        values.forEach(initial -> decodeAndAssertEquals(type, initial));
+    }
+
+    private <T> void decodeAndAssertEquals(AbstractType<T> type, T initial)
+    {
+        ByteBuffer initialBuffer = type.decompose(initial);
+        // Assert that fromComparableBytes decodes correctly.
+        ByteSource.Peekable peekableBytes = ByteSource.peekable(type.asComparableBytes(initialBuffer, version));
+        ByteBuffer decodedBuffer = type.fromComparableBytes(peekableBytes, version);
+        Assert.assertEquals("For " + ByteSourceComparisonTest.safeStr(initial),
+                            ByteBufferUtil.bytesToHex(initialBuffer),
+                            ByteBufferUtil.bytesToHex(decodedBuffer));
+        // Assert that the value composed from fromComparableBytes is the correct one.
+        peekableBytes = ByteSource.peekable(type.asComparableBytes(initialBuffer, version));
+        T decoded = type.compose(type.fromComparableBytes(peekableBytes, version));
+        Assert.assertEquals(initial, decoded);
+    }
+
+    private static String newRandomAlphanumeric(Random prng, int length)
+    {
+        StringBuilder random = new StringBuilder(length);
+        for (int i = 0; i < length; ++i)
+            random.append(ALPHABET.charAt(prng.nextInt(ALPHABET.length())));
+        return random.toString();
+    }
+
+    @Test
+    public void testAsciiType()
+    {
+        String[] asciiStrings = new String[]
+        {
+                "",
+                "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
+                "!@#$%^&*()",
+        };
+        testValuesForType(AsciiType.instance, asciiStrings);
+
+        Random prng = new Random();
+        Stream<String> asciiStream = Stream.generate(() -> newRandomAlphanumeric(prng, 10)).limit(1000);
+        testValuesForType(AsciiType.instance, asciiStream);
+    }
+
+    @Test
+    public void testBooleanType()
+    {
+        testValuesForType(BooleanType.instance, Boolean.TRUE, Boolean.FALSE, null);
+    }
+
+    @Test
+    public void testBytesType()
+    {
+        List<ByteBuffer> byteBuffers = new ArrayList<>();
+        Random prng = new Random();
+        int[] arrayLengths = new int[] {1, 10, 100, 1000};
+        for (int length : arrayLengths)
+        {
+            for (int i = 0; i < 1000; ++i)
+            {
+                // Fresh array per value: ByteBuffer.wrap shares its backing array, so reuse would alias all buffers.
+                byte[] byteArray = new byte[length];
+                prng.nextBytes(byteArray);
+                byteBuffers.add(ByteBuffer.wrap(byteArray));
+            }
+        }
+        testValuesForType(BytesType.instance, byteBuffers.toArray(new ByteBuffer[0]));
+    }
+
+    @Test
+    public void testByteType()
+    {
+        testValuesForType(ByteType.instance, new Byte[] { null });
+
+        Stream<Byte> allBytes = IntStream.range(Byte.MIN_VALUE, Byte.MAX_VALUE + 1)
+                                         .mapToObj(value -> (byte) value);
+        testValuesForType(ByteType.instance, allBytes);
+    }
+
+    @Test
+    public void testCompositeType()
+    {
+        CompositeType compType = CompositeType.getInstance(UTF8Type.instance, TimeUUIDType.instance, IntegerType.instance);
+        List<ByteBuffer> byteBuffers = new ArrayList<>();
+        Random prng = new Random();
+        // Test with complete CompositeType rows
+        for (int i = 0; i < 1000; ++i)
+        {
+            String randomString = newRandomAlphanumeric(prng, 10);
+            UUID randomUuid = UUIDGen.getTimeUUID();
+            BigInteger randomVarint = BigInteger.probablePrime(80, prng);
+            byteBuffers.add(compType.decompose(randomString, randomUuid, randomVarint));
+        }
+        // Test with incomplete CompositeType rows, where only the first element is present
+        ByteBuffer[] incompleteComposite = new ByteBuffer[1];
+        incompleteComposite[0] = UTF8Type.instance.decompose(newRandomAlphanumeric(prng, 10));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite));
+        // ...and the last end-of-component byte is not 0.
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) 1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) 1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) -1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) -1));
+        // Test with incomplete CompositeType rows, where only the last element is not present
+        incompleteComposite = new ByteBuffer[2];
+        incompleteComposite[0] = UTF8Type.instance.decompose(newRandomAlphanumeric(prng, 10));
+        incompleteComposite[1] = TimeUUIDType.instance.decompose(UUIDGen.getTimeUUID());
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite));
+        // ...and the last end-of-component byte is not 0.
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) 1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) 1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) -1));
+        byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) -1));
+
+        testValuesForType(compType, byteBuffers.toArray(new ByteBuffer[0]));
+    }
+
+    @Test
+    public void testDateType()
+    {
+        Stream<Date> dates = Stream.of(null,
+                                       new Date(Long.MIN_VALUE),
+                                       new Date(Long.MAX_VALUE),
+                                       new Date());
+        testValuesForType(DateType.instance, dates);
+
+        dates = new Random().longs(1000).mapToObj(Date::new);
+        testValuesForType(DateType.instance, dates);
+    }
+
+    @Test
+    public void testDecimalType()
+    {
+        // We won't be using testValuesForType for DecimalType (i.e. we won't also be comparing the initial and decoded
+        // ByteBuffer values). That's because the same BigDecimal value can be represented by several different but
+        // equivalent <mantissa, scale> pairs (e.g. 0.1 is 1 * e-1, as well as 10 * e-2, as well as...).
+        // In practice it's easier to just convert to BigDecimals and compare those, instead of trying to manually
+        // decode and convert to canonical representations before comparing. For an example of generating canonical
+        // decimals in the first place, see testReversedType().
+        Consumer<BigDecimal> bigDecimalConsumer = initial ->
+        {
+            ByteSource byteSource = DecimalType.instance.asComparableBytes(DecimalType.instance.decompose(initial), version);
+            BigDecimal decoded = DecimalType.instance.compose(DecimalType.instance.fromComparableBytes(ByteSource.peekable(byteSource), version));
+            if (initial == null)
+                Assert.assertNull(decoded);
+            else
+                Assert.assertEquals(0, initial.compareTo(decoded));
+        };
+        // Test some interesting predefined BigDecimal values.
+        Stream.of(null,
+                  BigDecimal.ZERO,
+                  BigDecimal.ONE,
+                  BigDecimal.ONE.add(BigDecimal.ONE),
+                  BigDecimal.TEN,
+                  BigDecimal.valueOf(0.0000000000000000000000000000000001),
+                  BigDecimal.valueOf(-0.0000000000000000000000000000000001),
+                  BigDecimal.valueOf(0.0000000000000001234567891011121314),
+                  BigDecimal.valueOf(-0.0000000000000001234567891011121314),
+                  BigDecimal.valueOf(12345678910111213.141516171819202122),
+                  BigDecimal.valueOf(-12345678910111213.141516171819202122),
+                  new BigDecimal(BigInteger.TEN, Integer.MIN_VALUE),
+                  new BigDecimal(BigInteger.TEN.negate(), Integer.MIN_VALUE),
+                  new BigDecimal(BigInteger.TEN, Integer.MAX_VALUE),
+                  new BigDecimal(BigInteger.TEN.negate(), Integer.MAX_VALUE),
+                  new BigDecimal(BigInteger.TEN.pow(1000), Integer.MIN_VALUE),
+                  new BigDecimal(BigInteger.TEN.pow(1000).negate(), Integer.MIN_VALUE),
+                  new BigDecimal(BigInteger.TEN.pow(1000), Integer.MAX_VALUE),
+                  new BigDecimal(BigInteger.TEN.pow(1000).negate(), Integer.MAX_VALUE))
+              .forEach(bigDecimalConsumer);
+        // Test BigDecimals created from random double values with predefined range modifiers. Double.MIN_VALUE is
+        // the smallest positive double, so the most negative bound has to be spelled -Double.MAX_VALUE.
+        double[] bounds = {
+                -Double.MAX_VALUE,
+                -1_000_000_000.0,
+                -100_000.0,
+                -1.0,
+                Double.MIN_VALUE,
+                1.0,
+                100_000.0,
+                1_000_000_000.0,
+                Double.MAX_VALUE};
+        for (double bound : bounds)
+            new Random().doubles(1000)
+                        .mapToObj(initial -> BigDecimal.valueOf(initial * bound))
+                        .forEach(bigDecimalConsumer);
+    }
+
+    @Test
+    public void testDoubleType()
+    {
+        Stream<Double> doubles = Stream.of(null,
+                                           Double.NaN,
+                                           Double.POSITIVE_INFINITY,
+                                           Double.NEGATIVE_INFINITY,
+                                           Double.MAX_VALUE,
+                                           Double.MIN_VALUE,
+                                           +0.0,
+                                           -0.0,
+                                           +1.0,
+                                           -1.0,
+                                           +12345678910.111213141516,
+                                           -12345678910.111213141516);
+        testValuesForType(DoubleType.instance, doubles);
+
+        doubles = new Random().doubles(1000).boxed();
+        testValuesForType(DoubleType.instance, doubles);
+    }
+
+    @Test
+    public void testDurationType()
+    {
+        Random prng = new Random();
+        Stream<Duration> posDurations = Stream.generate(() ->
+                                                        {
+                                                            int months = prng.nextInt(12) + 1;
+                                                            int days = prng.nextInt(28) + 1;
+                                                            long nanos = (Math.abs(prng.nextLong() % 86_400_000_000_000L)) + 1;
+                                                            return Duration.newInstance(months, days, nanos);
+                                                        })
+                                              .limit(1000);
+        testValuesForType(DurationType.instance, posDurations);
+        Stream<Duration> negDurations = Stream.generate(() ->
+                                                        {
+                                                            int months = prng.nextInt(12) + 1;
+                                                            int days = prng.nextInt(28) + 1;
+                                                            long nanos = (Math.abs(prng.nextLong() % 86_400_000_000_000L)) + 1;
+                                                            return Duration.newInstance(-months, -days, -nanos);
+                                                        })
+                                              .limit(1000);
+        testValuesForType(DurationType.instance, negDurations);
+    }
+
+    @Test
+    public void testDynamicCompositeType()
+    {
+        DynamicCompositeType dynamicCompType = DynamicCompositeType.getInstance(new HashMap<>());
+        ImmutableList<String> allTypes = ImmutableList.of("org.apache.cassandra.db.marshal.BytesType",
+                                                          "org.apache.cassandra.db.marshal.TimeUUIDType",
+                                                          "org.apache.cassandra.db.marshal.IntegerType");
+        List<ByteBuffer> allValues = new ArrayList<>();
+        List<ByteBuffer> byteBuffers = new ArrayList<>();
+        Random prng = new Random();
+        for (int i = 0; i < 10; ++i)
+        {
+            String randomString = newRandomAlphanumeric(prng, 10);
+            allValues.add(ByteBufferUtil.bytes(randomString));
+            UUID randomUuid = UUIDGen.getTimeUUID();
+            allValues.add(ByteBuffer.wrap(UUIDGen.decompose(randomUuid)));
+            byte randomByte = (byte) prng.nextInt();
+            allValues.add(ByteBuffer.wrap(new byte[]{ randomByte })); // not allocate(1).put(b): that leaves 0 remaining
+
+            // Three-component key with aliased and non-aliased types and end-of-component byte varying (0, 1, -1).
+            byteBuffers.add(DynamicCompositeType.build(allTypes, allValues));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, randomByte, (byte) 1));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, randomByte, (byte) -1));
+
+            // Two-component key with aliased and non-aliased types and end-of-component byte varying (0, 1, -1).
+            byteBuffers.add(DynamicCompositeType.build(allTypes.subList(0, 2), allValues.subList(0, 2)));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, -1, (byte) 1));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, -1, (byte) -1));
+
+            // One-component key with aliased and non-aliased type and end-of-component byte varying (0, 1, -1).
+            byteBuffers.add(DynamicCompositeType.build(allTypes.subList(0, 1), allValues.subList(0, 1)));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, null, -1, (byte) 1));
+            byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, null, -1, (byte) -1));
+
+            allValues.clear();
+        }
+        testValuesForType(dynamicCompType, byteBuffers.toArray(new ByteBuffer[0]));
+    }
+
+    // Similar to DynamicCompositeTypeTest.createDynamicCompositeKey(string, uuid, i, true, false), but not using any
+    // aliased types, in order to do an exact comparison of the unmarshalled DynamicCompositeType payload with the
+    // input one. If aliased types are used, due to DynamicCompositeType.build(List<String>, List<ByteBuffer>)
+    // always including the full type info in the newly constructed payload, an exact comparison won't work.
+    private static ByteBuffer createStringUuidVarintDynamicCompositeKey(String string, UUID uuid, int i, byte lastEocByte)
+    {
+        // 1. Calculate how many bytes do we need for a key of this DynamicCompositeType
+        String bytesType = "org.apache.cassandra.db.marshal.BytesType";
+        String timeUuidType = "org.apache.cassandra.db.marshal.TimeUUIDType";
+        String varintType = "org.apache.cassandra.db.marshal.IntegerType";
+        ByteBuffer bytes = ByteBufferUtil.bytes(string);
+        int totalSize = 0;
+        if (string != null)
+        {
+            // String component: 2-byte name length + full BytesType class name + 2-byte value length + value + EOC byte.
+            totalSize += 2 + bytesType.length() + 2 + bytes.remaining() + 1;
+            if (uuid != null)
+            {
+                // UUID component: 2-byte name length + full TimeUUIDType class name + 2-byte length + 16 bytes + EOC.
+                totalSize += 2 + timeUuidType.length() + 2 + 16 + 1;
+                if (i != -1)
+                {
+                    // Varint component (also written with its full IntegerType class name); i == -1 means "absent".
+                    // Notice that we account for a single byte of varint data, so we'll downcast the int payload
+                    // to byte and use only that as the actual varint payload.
+                    totalSize += 2 + varintType.length() + 2 + 1 + 1;
+                }
+            }
+        }
+
+        // 2. Allocate a buffer with that many bytes
+        ByteBuffer bb = ByteBuffer.allocate(totalSize);
+
+        // 3. Write the key data for each component in the allocated buffer
+        if (string != null)
+        {
+            bb.putShort((short) bytesType.length());
+            bb.put(ByteBufferUtil.bytes(bytesType));
+            bb.putShort((short) bytes.remaining());
+            bb.put(bytes);
+            // Write the requested end-of-component byte if the string is the last component (uuid is null), else 0.
+            bb.put(uuid == null ? lastEocByte : (byte) 0);
+            if (uuid != null)
+            {
+                bb.putShort((short) timeUuidType.length());
+                bb.put(ByteBufferUtil.bytes(timeUuidType));
+                bb.putShort((short) 16);
+                bb.put(UUIDGen.decompose(uuid));
+                // Write the requested end-of-component byte if the UUID is the last component (i == -1), else 0.
+                bb.put(i == -1 ? lastEocByte : (byte) 0);
+                if (i != -1)
+                {
+                    bb.putShort((short) varintType.length());
+                    bb.put(ByteBufferUtil.bytes(varintType));
+                    bb.putShort((short) 1);
+                    bb.put((byte) i);
+                    bb.put(lastEocByte);
+                }
+            }
+        }
+        bb.rewind();
+        return bb;
+    }
+
+    @Test
+    public void testFloatType()
+    {
+        Stream<Float> floats = Stream.of(null,
+                                         Float.NaN,
+                                         Float.POSITIVE_INFINITY,
+                                         Float.NEGATIVE_INFINITY,
+                                         Float.MAX_VALUE,
+                                         Float.MIN_VALUE,
+                                         +0.0F,
+                                         -0.0F,
+                                         +1.0F,
+                                         -1.0F,
+                                         +123456.7891011F,
+                                         -123456.7891011F);
+        testValuesForType(FloatType.instance, floats);
+
+        floats = new Random().ints(1000).mapToObj(Float::intBitsToFloat);
+        testValuesForType(FloatType.instance, floats);
+    }
+
+    @Test
+    public void testInetAddressType() throws UnknownHostException
+    {
+        Stream<InetAddress> inetAddresses = Stream.of(null,
+                                                      InetAddress.getLocalHost(),
+                                                      InetAddress.getLoopbackAddress(),
+                                                      InetAddress.getByName("0.0.0.0"),
+                                                      InetAddress.getByName("10.0.0.1"),
+                                                      InetAddress.getByName("172.16.1.1"),
+                                                      InetAddress.getByName("192.168.2.2"),
+                                                      InetAddress.getByName("224.3.3.3"),
+                                                      InetAddress.getByName("255.255.255.255"),
+                                                      InetAddress.getByName("0000:0000:0000:0000:0000:0000:0000:0000"),
+                                                      InetAddress.getByName("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"),
+                                                      InetAddress.getByName("fe80:1:23:456:7890:1:23:456"));
+        testValuesForType(InetAddressType.instance, inetAddresses);
+
+        Random prng = new Random();
+        byte[] ipv4Bytes = new byte[4];
+        byte[] ipv6Bytes = new byte[16];
+        InetAddress[] addresses = new InetAddress[2000];
+        for (int i = 0; i < addresses.length / 2; ++i)
+        {
+            prng.nextBytes(ipv4Bytes);
+            prng.nextBytes(ipv6Bytes); // without this every generated IPv6 address would be all-zero (::)
+            addresses[2 * i] = InetAddress.getByAddress(ipv4Bytes);
+            addresses[2 * i + 1] = InetAddress.getByAddress(ipv6Bytes);
+        }
+        testValuesForType(InetAddressType.instance, addresses);
+    }
+
+    @Test
+    public void testInt32Type()
+    {
+        Stream<Integer> ints = Stream.of(null,
+                                         Integer.MIN_VALUE,
+                                         Integer.MIN_VALUE + 1,
+                                         -256, -255, -128, -127, -1,
+                                         0,
+                                         1, 127, 128, 255, 256,
+                                         Integer.MAX_VALUE - 1,
+                                         Integer.MAX_VALUE);
+        testValuesForType(Int32Type.instance, ints);
+
+        ints = new Random().ints(1000).boxed();
+        testValuesForType(Int32Type.instance, ints);
+    }
+
+    @Test
+    public void testIntegerType()
+    {
+        Stream<BigInteger> varints = IntStream.range(-1000000, 1000000).mapToObj(BigInteger::valueOf);
+        testValuesForType(IntegerType.instance, varints);
+
+        varints = Stream.of(null,
+                            BigInteger.valueOf(12345678910111213L),
+                            BigInteger.valueOf(12345678910111213L).negate(),
+                            BigInteger.valueOf(Long.MAX_VALUE),
+                            BigInteger.valueOf(Long.MAX_VALUE).negate(),
+                            BigInteger.valueOf(Long.MAX_VALUE - 1).multiply(BigInteger.valueOf(Long.MAX_VALUE - 1)),
+                            BigInteger.valueOf(Long.MAX_VALUE - 1).multiply(BigInteger.valueOf(Long.MAX_VALUE - 1)).negate());
+        testValuesForType(IntegerType.instance, varints);
+
+        List<BigInteger> varintList = new ArrayList<>();
+        for (int i = 0; i < 10000; ++i)
+        {
+            BigInteger initial = BigInteger.ONE.shiftLeft(i);
+            varintList.add(initial);
+            BigInteger plusOne = initial.add(BigInteger.ONE);
+            varintList.add(plusOne);
+            varintList.add(plusOne.negate());
+            BigInteger minusOne = initial.subtract(BigInteger.ONE);
+            varintList.add(minusOne);
+            varintList.add(minusOne.negate());
+        }
+        testValuesForType(IntegerType.instance, varintList.toArray(new BigInteger[0]));
+    }
+
+    @Test
+    public void testUuidTypes()
+    {
+        Random prng = new Random();
+        UUID[] testUuids = new UUID[3001];
+        for (int i = 0; i < testUuids.length / 3; ++i)
+        {
+            testUuids[3 * i] = UUID.randomUUID();
+            testUuids[3 * i + 1] = UUIDGen.getTimeUUID();
+            testUuids[3 * i + 2] = UUIDGen.getRandomTimeUUIDFromMicros(prng.nextLong());
+        }
+        testUuids[testUuids.length - 1] = null;
+        testValuesForType(UUIDType.instance, testUuids);
+        testValuesForType(LexicalUUIDType.instance, testUuids);
+        testValuesForType(TimeUUIDType.instance, Arrays.stream(testUuids).filter(u -> u == null || u.version() == 1));
+    }
+
+    private static <E, C extends Collection<E>> List<C> newRandomElementCollections(Supplier<? extends C> collectionProducer,
+                                                                                    Supplier<? extends E> elementProducer,
+                                                                                    int numCollections,
+                                                                                    int numElementsInCollection)
+    {
+        List<C> result = new ArrayList<>();
+        for (int i = 0; i < numCollections; ++i)
+        {
+            C coll = collectionProducer.get();
+            for (int j = 0; j < numElementsInCollection; ++j)
+            {
+                coll.add(elementProducer.get());
+            }
+            result.add(coll);
+        }
+        return result;
+    }
+
+    /** Checks randomly populated lists of strings and of UUIDs against both variants of the matching ListType. */
+    @Test
+    public void testListType()
+    {
+        Random random = new Random();
+        Supplier<String> randomString = () -> newRandomAlphanumeric(random, 10);
+        Supplier<UUID> randomUuid = UUID::randomUUID;
+
+        // Elements without a known/computable length (strings): 100 lists filled with 100 elements each,
+        // checked against the types obtained for both values of getInstance's boolean argument.
+        List<List<String>> listsOfStrings = newRandomElementCollections(ArrayList::new, randomString, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(ListType.getInstance(UTF8Type.instance, flag), listsOfStrings);
+
+        // Elements with a known/computable length (128-bit UUIDs), generated and checked the same way.
+        List<List<UUID>> listsOfUuids = newRandomElementCollections(ArrayList::new, randomUuid, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(ListType.getInstance(UUIDType.instance, flag), listsOfUuids);
+    }
+
+    @Test
+    public void testLongType()
+    {
+        // Hand-picked values: null, both extremes, values around the 8-bit limits and just outside the int range.
+        Stream<Long> boundaryValues = Stream.of(null,
+                                                Long.MIN_VALUE,
+                                                Long.MIN_VALUE + 1,
+                                                Integer.MIN_VALUE - 1L,
+                                                -256L, -255L, -128L, -127L, -1L,
+                                                0L,
+                                                1L, 127L, 128L, 255L, 256L,
+                                                Integer.MAX_VALUE + 1L,
+                                                Long.MAX_VALUE - 1,
+                                                Long.MAX_VALUE);
+        testValuesForType(LongType.instance, boundaryValues);
+        // Followed by 1000 pseudorandom longs.
+        testValuesForType(LongType.instance, new Random().longs(1000).boxed());
+    }
+
+    /**
+     * Creates {@code numMaps} hash maps and calls put() {@code numEntries} times on each of them, using a freshly
+     * produced key/value pair for every call.
+     */
+    private static <K, V> List<Map<K, V>> newRandomEntryMaps(Supplier<? extends K> keyProducer,
+                                                             Supplier<? extends V> valueProducer,
+                                                             int numMaps,
+                                                             int numEntries)
+    {
+        List<Map<K, V>> maps = new ArrayList<>();
+        for (int remaining = numMaps; remaining > 0; --remaining)
+        {
+            Map<K, V> current = new HashMap<>();
+            for (int entry = 0; entry < numEntries; ++entry)
+                current.put(keyProducer.get(), valueProducer.get()); // arguments evaluate left to right: key first
+            maps.add(current);
+        }
+        return maps;
+    }
+
+    /** Checks randomly populated maps, keyed by strings and by UUIDs, against both variants of the matching MapType. */
+    @Test
+    public void testMapType()
+    {
+        Random random = new Random();
+        Supplier<String> randomString = () -> newRandomAlphanumeric(random, 10);
+        Supplier<UUID> randomUuid = UUID::randomUUID;
+
+        // 100 maps, each built from 100 (string -> UUID) pairs, for both values of getInstance's boolean argument.
+        List<Map<String, UUID>> uuidsByString = newRandomEntryMaps(randomString, randomUuid, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(MapType.getInstance(UTF8Type.instance, UUIDType.instance, flag), uuidsByString);
+
+        // The mirrored layout: 100 maps, each built from 100 (UUID -> string) pairs.
+        List<Map<UUID, String>> stringsByUuid = newRandomEntryMaps(randomUuid, randomString, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(MapType.getInstance(UUIDType.instance, UTF8Type.instance, flag), stringsByUuid);
+    }
+
+    /**
+     * Converts serialized values of several types to comparable bytes and back using the partition ordering of
+     * a few partitioners, expecting the hex form of every decoded buffer to equal that of the original buffer.
+     */
+    @Test
+    public void testPartitionerDefinedOrder()
+    {
+        Random random = new Random();
+        // Inputs: the empty buffer, then 1000 rounds adding one string, int, double, varint and decimal value each.
+        List<ByteBuffer> inputs = new ArrayList<>();
+        inputs.add(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        for (int round = 0; round < 1000; ++round)
+        {
+            inputs.add(UTF8Type.instance.decompose(newRandomAlphanumeric(random, 10)));
+            inputs.add(Int32Type.instance.decompose(random.nextInt()));
+            inputs.add(DoubleType.instance.decompose(random.nextDouble()));
+            inputs.add(IntegerType.instance.decompose(BigInteger.probablePrime(100, random)));
+            long unscaled = random.nextLong();
+            int scale = random.nextInt(100) - 50;
+            inputs.add(DecimalType.instance.decompose(BigDecimal.valueOf(unscaled, scale)));
+        }
+
+        // 100 random bytes that are read right away; presumably this leaves the buffer with nothing remaining.
+        byte[] randomBytes = new byte[100];
+        random.nextBytes(randomBytes);
+        ByteBuffer exhausted = ByteBuffer.wrap(randomBytes);
+        ByteBufferUtil.readBytes(exhausted, 100);
+
+        // LocalPartitioner, OrderPreservingPartitioner and ByteOrderedPartitioner are left out on purpose, as they
+        // don't need a dedicated PartitionerDefinedOrder: they use their inner AbstractType, UTF8Type and
+        // BytesType respectively.
+        List<IPartitioner> partitioners = Arrays.asList(Murmur3Partitioner.instance,
+                                                        RandomPartitioner.instance,
+                                                        LengthPartitioner.instance);
+
+        for (IPartitioner partitioner : partitioners)
+        {
+            AbstractType<?> ordering = partitioner.partitionOrdering();
+            Assert.assertTrue(ordering instanceof PartitionerDefinedOrder);
+            String failureMessage = "For partitioner " + partitioner.getClass().getSimpleName();
+            for (ByteBuffer input : inputs)
+            {
+                ByteSource.Peekable encoded = ByteSource.peekable(ordering.asComparableBytes(input, version));
+                ByteBuffer decoded = ordering.fromComparableBytes(encoded, version);
+                Assert.assertEquals(failureMessage,
+                                    ByteBufferUtil.bytesToHex(input), ByteBufferUtil.bytesToHex(decoded));
+            }
+            // The already consumed buffer is expected to come back as the empty buffer.
+            ByteSource.Peekable encodedExhausted = ByteSource.peekable(ordering.asComparableBytes(exhausted, version));
+            Assert.assertEquals(ByteBufferUtil.EMPTY_BYTE_BUFFER, ordering.fromComparableBytes(encodedExhausted, version));
+        }
+    }
+
+    @Test
+    public void testReversedType()
+    {
+        final ByteComparable.Version oss41 = ByteComparable.Version.OSS41;
+        // A null ByteSource.Peekable has to decode to the empty buffer. The choice of base type is important here,
+        // as the base type should be able to handle a null ByteSource.Peekable too.
+        ReversedType<BigInteger> reversedVarint = (ReversedType<BigInteger>) ReversedType.getInstance(IntegerType.instance);
+        Assert.assertEquals(ByteBufferUtil.EMPTY_BYTE_BUFFER, reversedVarint.fromComparableBytes(null, oss41));
+
+        // Generators of random serialized values for some common and important base types.
+        BiFunction<Random, Integer, ByteBuffer> utf8 = (rnd, size) -> UTF8Type.instance.decompose(newRandomAlphanumeric(rnd, size));
+        BiFunction<Random, Integer, ByteBuffer> blob = (rnd, size) ->
+        {
+            byte[] content = new byte[size];
+            rnd.nextBytes(content);
+            return ByteBuffer.wrap(content);
+        };
+        BiFunction<Random, Integer, ByteBuffer> varint = (rnd, size) ->
+        {
+            BigInteger product = BigInteger.valueOf(rnd.nextLong());
+            for (int factor = 1; factor < size / 8; ++factor)
+                product = product.multiply(BigInteger.valueOf(rnd.nextLong()));
+            return IntegerType.instance.decompose(product);
+        };
+        BiFunction<Random, Integer, ByteBuffer> decimal = (rnd, size) ->
+        {
+            BigInteger unscaled = BigInteger.valueOf(rnd.nextLong());
+            for (int factor = 1; factor < size / 8; ++factor)
+                unscaled = unscaled.multiply(BigInteger.valueOf(rnd.nextLong()));
+            // Strip all trailing decimal zeros and use an even scale, so that the decimal is already "canonically
+            // represented" in the context of DecimalType's encoding, i.e. it wouldn't be re-scaled to conform
+            // with the "compacted mantissa between 0 and 1, scale as a power of 100" rule.
+            while (unscaled.remainder(BigInteger.TEN).signum() == 0)
+                unscaled = unscaled.divide(BigInteger.TEN);
+            int evenScale = rnd.nextInt() & ~1;
+            return DecimalType.instance.decompose(new BigDecimal(unscaled, evenScale));
+        };
+        Map<AbstractType, BiFunction<Random, Integer, ByteBuffer>> generators = new HashMap<>();
+        generators.put(UTF8Type.instance, utf8);
+        generators.put(BytesType.instance, blob);
+        generators.put(IntegerType.instance, varint);
+        generators.put(DecimalType.instance, decimal);
+        Random random = new Random();
+        for (Map.Entry<AbstractType, BiFunction<Random, Integer, ByteBuffer>> entry : generators.entrySet())
+        {
+            ReversedType reversed = (ReversedType) ReversedType.getInstance(entry.getKey());
+            for (int size : new int[]{ 32, 128, 512 })
+                for (int i = 0; i < 100; ++i)
+                {
+                    ByteBuffer initial = entry.getValue().apply(random, size);
+                    ByteSource.Peekable encoded = ByteSource.peekable(reversed.asComparableBytes(initial, oss41));
+                    Assert.assertEquals(initial, reversed.fromComparableBytes(encoded, oss41));
+                }
+        }
+    }
+
+    /** Checks randomly populated sets of strings and of UUIDs against both variants of the matching SetType. */
+    @Test
+    public void testSetType()
+    {
+        Random random = new Random();
+        Supplier<String> randomString = () -> newRandomAlphanumeric(random, 10);
+        Supplier<UUID> randomUuid = UUID::randomUUID;
+
+        // Elements without a known/computable length (strings): 100 hash sets, each offered 100 elements,
+        // checked against the types obtained for both values of getInstance's boolean argument.
+        List<Set<String>> setsOfStrings = newRandomElementCollections(HashSet::new, randomString, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(SetType.getInstance(UTF8Type.instance, flag), setsOfStrings);
+
+        // Elements with a known/computable length (128-bit UUIDs), generated and checked the same way.
+        List<Set<UUID>> setsOfUuids = newRandomElementCollections(HashSet::new, randomUuid, 100, 100);
+        for (boolean flag : new boolean[]{ false, true })
+            testValuesForType(SetType.getInstance(UUIDType.instance, flag), setsOfUuids);
+    }
+
+    @Test
+    public void testShortType()
+    {
+        // The null value is checked on its own.
+        testValuesForType(ShortType.instance, new Short[]{ null });
+        // Then every value a short can take, Short.MIN_VALUE through Short.MAX_VALUE inclusive.
+        Stream<Short> everyShort = IntStream.rangeClosed(Short.MIN_VALUE, Short.MAX_VALUE).mapToObj(i -> (short) i);
+        testValuesForType(ShortType.instance, everyShort);
+    }
+
+    @Test
+    public void testSimpleDateType()
+    {
+        testValuesForType(SimpleDateType.instance, new Integer[]{ null });
+        testValuesForType(SimpleDateType.instance, new Random().ints(1000).boxed());
+
+        // Manually create simple dates from random millis and manually interpret them, comparing the day numbers.
+        // Note that Math.abs(Integer.MIN_VALUE) is still negative, so a negative input remains possible here.
+        for (int drawn : new Random().ints(1000).toArray())
+        {
+            int millis = Math.abs(drawn);
+            Integer expectedDays = SimpleDateSerializer.timeInMillisToDay(millis);
+            ByteBuffer serialized = SimpleDateType.instance.fromTimeInMillis(millis);
+            ByteSource.Peekable encoded = ByteSource.peekable(SimpleDateType.instance.asComparableBytes(serialized, version));
+            Integer actualDays = SimpleDateType.instance.compose(SimpleDateType.instance.fromComparableBytes(encoded, version));
+            Assert.assertEquals(expectedDays, actualDays);
+        }
+
+        // Manually create simple dates from strings and compare the re-rendered text with the original text.
+        String[] dateStrings = {
+            "1970-01-01",
+            "1970-01-02",
+            "1969-12-31",
+            "-0001-01-02",
+            "-5877521-01-02",
+            "2014-01-01",
+            "+5881580-01-10",
+            "1920-12-01",
+            "1582-10-19"
+        };
+
+        for (String expectedText : dateStrings)
+        {
+            ByteBuffer serialized = SimpleDateType.instance.fromString(expectedText);
+            ByteSource.Peekable encoded = ByteSource.peekable(SimpleDateType.instance.asComparableBytes(serialized, version));
+            Integer days = SimpleDateType.instance.compose(SimpleDateType.instance.fromComparableBytes(encoded, version));
+            String decodedText = SimpleDateSerializer.instance.toString(days);
+            Assert.assertEquals(expectedText, decodedText);
+        }
+    }
+
+    @Test
+    public void testTimestampType()
+    {
+        // Fixed cases: null, the current time, millisecond 0, millisecond -1 and both extremes of the long range.
+        Date[] fixedDates = {
+            null,
+            new Date(),
+            new Date(0L),
+            new Date(-1L),
+            new Date(Long.MAX_VALUE),
+            new Date(Long.MIN_VALUE)
+        };
+        testValuesForType(TimestampType.instance, fixedDates);
+        testValuesForType(TimestampType.instance, new Random().longs(1000).mapToObj(Date::new)); // 1000 random dates
+    }
+
+    @Test
+    public void testTimeType()
+    {
+        Long[] onlyNull = { null };
+        testValuesForType(TimeType.instance, onlyNull);
+        testValuesForType(TimeType.instance, new Random().longs(1000).boxed()); // 1000 pseudorandom longs, boxed
+    }
+
+    /**
+     * Checks tuples made of a UTF8Type, a DecimalType, an IntegerType and a BytesType component, built from every
+     * combination of the sample values below (the decimal and varint components may be null).
+     */
+    @Test
+    public void testTupleType()
+    {
+        TupleType tupleType = new TupleType(Arrays.asList(UTF8Type.instance,
+                                                          DecimalType.instance,
+                                                          IntegerType.instance,
+                                                          BytesType.instance));
+
+        Random random = new Random();
+        String[] texts = {
+            "a",
+            "©",
+            newRandomAlphanumeric(random, 10),
+            newRandomAlphanumeric(random, 100)
+        };
+        BigDecimal largeDecimal = BigDecimal.valueOf(1234567891011121314L, 50);
+        BigDecimal[] decimals = {
+            null,
+            BigDecimal.ZERO,
+            BigDecimal.ONE,
+            largeDecimal,
+            largeDecimal.negate()
+        };
+        BigInteger largeVarint = BigInteger.TEN.pow(1000);
+        BigInteger[] varints = {
+            null,
+            BigInteger.ZERO,
+            largeVarint,
+            largeVarint.negate()
+        };
+        // Byte arrays of 0, 1, 10, 100 and 1000 bytes; all but the empty one get random content, in that order.
+        byte[][] blobs = {
+            new byte[0],
+            new byte[1],
+            new byte[10],
+            new byte[100],
+            new byte[1000]
+        };
+        for (int i = 1; i < blobs.length; ++i)
+            random.nextBytes(blobs[i]);
+
+        List<ByteBuffer> tuples = new ArrayList<>();
+        for (String text : texts)
+        {
+            for (BigDecimal decimal : decimals)
+            {
+                for (BigInteger varint : varints)
+                {
+                    for (byte[] blob : blobs)
+                    {
+                        ByteBuffer textPart = UTF8Type.instance.decompose(text);
+                        ByteBuffer decimalPart = decimal == null ? null : DecimalType.instance.decompose(decimal);
+                        ByteBuffer varintPart = varint == null ? null : IntegerType.instance.decompose(varint);
+                        // The wrapped bytes could also be used directly.
+                        ByteBuffer blobPart = BytesType.instance.decompose(ByteBuffer.wrap(blob));
+                        tuples.add(TupleType.buildValue(textPart, decimalPart, varintPart, blobPart));
+                    }
+                }
+            }
+        }
+
+        testValuesForType(tupleType, tuples.toArray(new ByteBuffer[0]));
+    }
+
+    @Test
+    public void testUtf8Type()
+    {
+        Random random = new Random(); // one source shared by all 1000 generated strings
+        testValuesForType(UTF8Type.instance, IntStream.range(0, 1000).mapToObj(i -> newRandomAlphanumeric(random, 100)));
+    }
+
+    @Test
+    public void testTypeWithByteOrderedComparison()
+    {
+        Random random = new Random();
+        byte[] one = { (byte) random.nextInt() };
+        byte[] ten = new byte[10];
+        byte[] hundred = new byte[100];
+        byte[] thousand = new byte[1000];
+        for (byte[] target : new byte[][]{ ten, hundred, thousand })
+            random.nextBytes(target);
+        // Null is deliberately absent: the default asComparableBytes(ByteBuffer, Version) implementation (more
+        // specifically the ByteSource.of(ByteBuffer, Version) encoding) would throw for it.
+        Stream<ByteBuffer> buffers = Stream.of(ByteBufferUtil.EMPTY_BYTE_BUFFER,
+                                               ByteBuffer.wrap(one),
+                                               ByteBuffer.wrap(ten),
+                                               ByteBuffer.wrap(hundred),
+                                               ByteBuffer.wrap(thousand));
+        testValuesForType(ByteOrderedType.instance, buffers);
+    }
+
+    /** Test-only type configured with ComparisonType.BYTE_ORDER, whose values are plain ByteBuffers. */
+    private static class ByteOrderedType extends AbstractType<ByteBuffer>
+    {
+        public static final ByteOrderedType instance = new ByteOrderedType();
+
+        private ByteOrderedType()
+        {
+            super(ComparisonType.BYTE_ORDER);
+        }
+
+        @Override
+        public TypeSerializer<ByteBuffer> getSerializer()
+        {
+            return ByteOrderedSerializer.instance;
+        }
+
+        @Override
+        public Term fromJSONObject(Object parsed) throws MarshalException
+        {
+            return null; // JSON input is not implemented for this type
+        }
+
+        @Override
+        public ByteBuffer fromString(String source) throws MarshalException
+        {
+            return null; // parsing from text is not implemented for this type
+        }
+
+        /** Serializer that duplicates buffers on serialize, converts via the accessor on deserialize, validates nothing. */
+        static class ByteOrderedSerializer extends TypeSerializer<ByteBuffer>
+        {
+            static final ByteOrderedSerializer instance = new ByteOrderedSerializer();
+            @Override
+            public Class<ByteBuffer> getType()
+            {
+                return ByteBuffer.class;
+            }
+
+            @Override
+            public ByteBuffer serialize(ByteBuffer value)
+            {
+                return value == null ? null : value.duplicate();
+            }
+
+            @Override
+            public <V> ByteBuffer deserialize(V bytes, ValueAccessor<V> accessor)
+            {
+                return accessor.toBuffer(bytes);
+            }
+
+            @Override
+            public <V> void validate(V bytes, ValueAccessor<V> accessor) throws MarshalException
+            {
+                // Nothing to validate: no content is ever rejected.
+            }
+
+            @Override
+            public String toString(ByteBuffer value)
+            {
+                return ByteBufferUtil.bytesToHex(value);
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/ByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
similarity index 73%
rename from test/unit/org/apache/cassandra/utils/ByteSourceTest.java
rename to test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
index fd1188f27858..7d39c724c29e 100644
--- a/test/unit/org/apache/cassandra/utils/ByteSourceTest.java
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
@@ -15,28 +15,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.utils;
+package org.apache.cassandra.utils.bytecomparable;
 
 import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
-import java.time.Instant;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.function.BiFunction;
 import java.util.function.Function;
@@ -44,6 +29,7 @@
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
+import com.google.common.collect.Ordering;
 import org.junit.Assert;
 import org.junit.Rule;
 import org.junit.Test;
@@ -56,148 +42,28 @@
 import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.ClusteringPrefix;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.db.marshal.BooleanType;
-import org.apache.cassandra.db.marshal.ByteBufferAccessor;
-import org.apache.cassandra.db.marshal.ByteType;
-import org.apache.cassandra.db.marshal.BytesType;
-import org.apache.cassandra.db.marshal.CollectionType;
-import org.apache.cassandra.db.marshal.CompositeType;
-import org.apache.cassandra.db.marshal.DateType;
-import org.apache.cassandra.db.marshal.DecimalType;
-import org.apache.cassandra.db.marshal.DoubleType;
-import org.apache.cassandra.db.marshal.DynamicCompositeType;
-import org.apache.cassandra.db.marshal.DynamicCompositeTypeTest;
-import org.apache.cassandra.db.marshal.EmptyType;
-import org.apache.cassandra.db.marshal.FloatType;
-import org.apache.cassandra.db.marshal.InetAddressType;
-import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.db.marshal.LexicalUUIDType;
-import org.apache.cassandra.db.marshal.ListType;
-import org.apache.cassandra.db.marshal.LongType;
-import org.apache.cassandra.db.marshal.MapType;
-import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
-import org.apache.cassandra.db.marshal.ReversedType;
-import org.apache.cassandra.db.marshal.SetType;
-import org.apache.cassandra.db.marshal.ShortType;
-import org.apache.cassandra.db.marshal.SimpleDateType;
-import org.apache.cassandra.db.marshal.TimeType;
-import org.apache.cassandra.db.marshal.TimeUUIDType;
-import org.apache.cassandra.db.marshal.TimestampType;
-import org.apache.cassandra.db.marshal.TupleType;
-import org.apache.cassandra.db.marshal.UTF8Type;
-import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.RandomPartitioner;
-import org.apache.cassandra.utils.ByteComparable.Version;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.MurmurHash;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
 
 import static org.junit.Assert.assertEquals;
 
-public class ByteSourceTest
+/**
+ * Tests forward conversion to ByteSource/ByteComparable and that the result compares correctly.
+ */
+public class ByteSourceComparisonTest extends ByteSourceTestBase
 {
-    private final static Logger logger = LoggerFactory.getLogger(ByteSourceTest.class);
+    private final static Logger logger = LoggerFactory.getLogger(ByteSourceComparisonTest.class);
 
     @Rule
     public final ExpectedException expectedException = ExpectedException.none();
 
-    String[] testStrings = new String[] { "", "\0", "\0\0", "\001", "A\0\0B", "A\0B\0", "0", "0\0", "00", "1", "\377" };
-    Integer[] testInts = new Integer[] { null, Integer.MIN_VALUE, Integer.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Integer.MAX_VALUE - 1, Integer.MAX_VALUE };
-    Byte[] testBytes = new Byte[] { -128, -127, -1, 0, 1, 127 };
-    Short[] testShorts = new Short[] { Short.MIN_VALUE, Short.MIN_VALUE + 1, -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, Short.MAX_VALUE - 1, Short.MAX_VALUE };
-    Long[] testLongs = new Long[] { null, Long.MIN_VALUE, Long.MIN_VALUE + 1, Integer.MIN_VALUE - 1L, -256L, -255L, -128L, -127L, -1L, 0L, 1L, 127L, 128L, 255L, 256L, Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, Long.MAX_VALUE };
-    Double[] testDoubles = new Double[] { null, Double.NEGATIVE_INFINITY, -Double.MAX_VALUE, -1e+200, -1e3, -1e0, -1e-3, -1e-200, -Double.MIN_VALUE, -0.0, 0.0, Double.MIN_VALUE, 1e-200, 1e-3, 1e0, 1e3, 1e+200, Double.MAX_VALUE, Double.POSITIVE_INFINITY, Double.NaN };
-    Float[] testFloats = new Float[] { null, Float.NEGATIVE_INFINITY, -Float.MAX_VALUE, -1e+30f, -1e3f, -1e0f, -1e-3f, -1e-30f, -Float.MIN_VALUE, -0.0f, 0.0f, Float.MIN_VALUE, 1e-30f, 1e-3f, 1e0f, 1e3f, 1e+30f, Float.MAX_VALUE, Float.POSITIVE_INFINITY, Float.NaN };
-    Boolean[] testBools = new Boolean[] { null, false, true };
-    UUID[] testUUIDs = new UUID[] { null, UUIDGen.getTimeUUID(), UUID.randomUUID(), UUID.randomUUID(), UUID.randomUUID(),
-                                    UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123, 234), UUIDGen.getTimeUUID(123),
-                                    UUID.fromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
-                                    UUID.fromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
-                                    UUID.fromString("e902893a-9d22-3c7e-a7b8-d6e313b71d9f"),
-                                    UUID.fromString("74738ff5-5367-5958-9aee-98fffdcd1876"),
-                                    UUID.fromString("52df1bb0-6a2f-11e6-b6e4-a6dea7a01b67"),
-                                    UUID.fromString("52df1bb0-6a2f-11e6-362d-aff2143498ea"),
-                                    UUID.fromString("52df1bb0-6a2f-11e6-b62d-aff2143498ea")};
-    // Instant.MIN/MAX fail Date.from.
-    Date[] testDates = new Date[] { null,
-                                    Date.from(Instant.ofEpochSecond(Integer.MIN_VALUE)),
-                                    Date.from(Instant.ofEpochSecond(Short.MIN_VALUE)),
-                                    Date.from(Instant.ofEpochMilli(-2000)),
-                                    Date.from(Instant.EPOCH),
-                                    Date.from(Instant.ofEpochMilli(2000)),
-                                    Date.from(Instant.ofEpochSecond(Integer.MAX_VALUE)),
-                                    Date.from(Instant.now()) };
-    BigInteger[] testBigInts;
-
-    {
-        Set<BigInteger> bigs = new TreeSet<>();
-        for (Long l : testLongs)
-            if (l != null)
-                bigs.add(BigInteger.valueOf(l));
-        for (int i = 0; i < 11; ++i)
-        {
-            bigs.add(BigInteger.valueOf(i));
-            bigs.add(BigInteger.valueOf(-i));
-
-            bigs.add(BigInteger.valueOf((1L << 4 * i) - 1));
-            bigs.add(BigInteger.valueOf((1L << 4 * i)));
-            bigs.add(BigInteger.valueOf(-(1L << 4 * i) - 1));
-            bigs.add(BigInteger.valueOf(-(1L << 4 * i)));
-            String p = exp10(i);
-            bigs.add(new BigInteger(p));
-            bigs.add(new BigInteger("-" + p));
-            p = exp10(1 << i);
-            bigs.add(new BigInteger(p));
-            bigs.add(new BigInteger("-" + p));
-
-            BigInteger base = BigInteger.ONE.shiftLeft(512 * i);
-            bigs.add(base);
-            bigs.add(base.add(BigInteger.ONE));
-            bigs.add(base.subtract(BigInteger.ONE));
-            base = base.negate();
-            bigs.add(base);
-            bigs.add(base.add(BigInteger.ONE));
-            bigs.add(base.subtract(BigInteger.ONE));
-        }
-        testBigInts = bigs.toArray(new BigInteger[0]);
-    }
-    BigDecimal[] testBigDecimals;
-    {
-        String vals = "0, 1, 1.1, 21, 98.9, 99, 99.9, 100, 100.1, 101, 331, 0.4, 0.07, 0.0700, 0.005, " +
-                      "6e4, 7e200, 6e-300, 8.1e2000, 8.1e-2000, 9e2000, " +
-                      "123456789012.34567890e-1000, 123456.78901234, 1234.56789012e2, " +
-                      "1.0000, 0.01e2, 100e-2, 00, 0.000, 0E-18, 0E+18";
-        List<BigDecimal> decs = new ArrayList<>();
-        for (String s : vals.split(", "))
-        {
-            decs.add(new BigDecimal(s));
-            decs.add(new BigDecimal("-" + s));
-        }
-        testBigDecimals = decs.toArray(new BigDecimal[0]);
-    }
-
-    static String exp10(int pow)
-    {
-        StringBuilder builder = new StringBuilder();
-        builder.append('1');
-        for (int i=0; i<pow; ++i)
-            builder.append('0');
-        return builder.toString();
-    }
-
-    Object[][] testValues = new Object[][] { testStrings, testInts, testBools, testDoubles, testBigInts, testBigDecimals };
-    AbstractType[] testTypes = new AbstractType[] {
-                               AsciiType.instance,
-                               Int32Type.instance,
-                               BooleanType.instance,
-                               DoubleType.instance,
-                               IntegerType.instance,
-                               DecimalType.instance };
-
     @Test
     public void testStringsAscii()
     {
@@ -208,6 +74,7 @@ public void testStringsAscii()
     public void testStringsUTF8()
     {
         testType(UTF8Type.instance, testStrings);
+        testDirect(x -> ByteSource.of(x, Version.OSS41), Ordering.<String>natural()::compare, testStrings);
     }
 
     @Test
@@ -371,15 +238,6 @@ public void testBytesType()
     @Test
     public void testInetAddressType() throws UnknownHostException
     {
-        InetAddress[] testInets = new InetAddress[] { null,
-                                                      InetAddress.getLocalHost(),
-                                                      InetAddress.getLoopbackAddress(),
-                                                      InetAddress.getByName("192.168.0.1"),
-                                                      InetAddress.getByName("fe80::428d:5cff:fe53:1dc9"),
-                                                      InetAddress.getByName("2001:610:3:200a:192:87:36:2"),
-                                                      InetAddress.getByName("10.0.0.1"),
-                                                      InetAddress.getByName("0a00:0001::"),
-                                                      InetAddress.getByName("::10.0.0.1") };
         testType(InetAddressType.instance, testInets);
     }
 
@@ -426,12 +284,6 @@ public void testLocalPatitionerOrder()
         }
     }
 
-    ClusteringPrefix.Kind[] kinds = new ClusteringPrefix.Kind[] {
-    ClusteringPrefix.Kind.INCL_START_BOUND,
-    ClusteringPrefix.Kind.CLUSTERING,
-    ClusteringPrefix.Kind.EXCL_START_BOUND,
-    };
-
     interface PairTester
     {
         void test(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4);
@@ -471,8 +323,8 @@ public void testCombinations()
     void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
     {
         for (Version v : Version.values())
-            for (ClusteringPrefix.Kind k1 : kinds)
-                for (ClusteringPrefix.Kind k2 : kinds)
+            for (ClusteringPrefix.Kind k1 : ClusteringPrefix.Kind.values())
+                for (ClusteringPrefix.Kind k2 : ClusteringPrefix.Kind.values())
                 {
                     ClusteringComparator comp = new ClusteringComparator(t1, t2);
                     ByteBuffer[] b = new ByteBuffer[2];
@@ -481,8 +333,8 @@ void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o
                     b[1] = t2.decompose(o2);
                     d[0] = t1.decompose(o3);
                     d[1] = t2.decompose(o4);
-                    ClusteringPrefix<ByteBuffer> c = ByteBufferAccessor.instance.factory().bound(k1, b);
-                    ClusteringPrefix<ByteBuffer> e = ByteBufferAccessor.instance.factory().bound(k2, d);
+                    ClusteringPrefix<ByteBuffer> c = makeBound(k1, b);
+                    ClusteringPrefix<ByteBuffer> e = makeBound(k2, d);
                     final ByteComparable bsc = comp.asByteComparable(c);
                     final ByteComparable bse = comp.asByteComparable(e);
                     int expected = Integer.signum(comp.compare(c, e));
@@ -506,6 +358,36 @@ void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o
                 }
     }
 
+    static ClusteringPrefix<ByteBuffer> makeBound(ClusteringPrefix.Kind k1, ByteBuffer[] b)
+    {
+        return makeBound(ByteBufferAccessor.instance.factory(), k1, b); // ByteBuffer convenience overload: delegates with the ByteBufferAccessor factory
+    }
+
+    /**
+     * Builds the prefix of kind {@code k1} over the values {@code b} using the factory method that matches the kind
+     * (boundary, bound, clustering or static clustering). {@code b} is not read for STATIC_CLUSTERING.
+     */
+    static <V> ClusteringPrefix<V> makeBound(ValueAccessor.ObjectFactory<V> factory, ClusteringPrefix.Kind k1, V[] b)
+    {
+        switch (k1)
+        {
+        case INCL_END_EXCL_START_BOUNDARY:
+        case EXCL_END_INCL_START_BOUNDARY:
+            return factory.boundary(k1, b);
+        case INCL_END_BOUND:
+        case EXCL_END_BOUND:
+        case INCL_START_BOUND:
+        case EXCL_START_BOUND:
+            return factory.bound(k1, b);
+        case CLUSTERING:
+            return factory.clustering(b);
+        case STATIC_CLUSTERING:
+            return factory.staticClustering();
+        default:
+            throw new AssertionError("Unexpected clustering kind: " + k1); // name the kind so a new enum value is easy to spot
+        }
+    }
+
     @Test
     public void testTupleType()
     {
@@ -516,14 +398,18 @@ public void testTupleType()
     @Test
     public void testTupleTypeNonFull()
     {
-        TupleType tt = new TupleType(ImmutableList.of(AsciiType.instance, Int32Type.instance));
+        TupleType tt = new TupleType(ImmutableList.of(UTF8Type.instance, Int32Type.instance));
         List<ByteBuffer> tests = ImmutableList.of
             (
-            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""),
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, ""),
+                                                                                decomposeAndRandomPad(Int32Type.instance, 0)}),
+            // Note: a decomposed null (e.g. decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, ""),
+                                                                                null}),
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {null,
                                                                                 decomposeAndRandomPad(Int32Type.instance, 0)}),
-            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, ""),
-                                                                                decomposeAndRandomPad(Int32Type.instance, null)}),
-            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(AsciiType.instance, "")}),
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, "")}),
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {null}),
             TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[0])
             );
         testBuffers(tt, tests);
@@ -532,9 +418,16 @@ public void testTupleTypeNonFull()
     void assertTupleComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
     {
         TupleType tt = new TupleType(ImmutableList.of(t1, t2));
-        ByteBuffer b1 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o1), t2.decompose(o2)});
-        ByteBuffer b2 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {t1.decompose(o3), t2.decompose(o4)});
-        assertComparesSame(tt, b1, b2);
+        ByteBuffer b1 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeForTuple(t1, o1),
+                                                                                            decomposeForTuple(t2, o2)});
+        ByteBuffer b2 = TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeForTuple(t1, o3),
+                                                                                            decomposeForTuple(t2, o4)});
+        assertComparesSameBuffers(tt, b1, b2);
+    }
+
+    static ByteBuffer decomposeForTuple(AbstractType t, Object o)
+    {
+        return o != null ? t.decompose(o) : null; // a null value stays a null buffer instead of being handed to decompose
     }
 
     @Test
@@ -547,14 +440,14 @@ public void testCompositeType()
     @Test
     public void testCompositeTypeNonFull()
     {
-        CompositeType tt = CompositeType.getInstance(AsciiType.instance, Int32Type.instance);
+        CompositeType tt = CompositeType.getInstance(UTF8Type.instance, Int32Type.instance);
         List<ByteBuffer> tests = ImmutableList.of
             (
-            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)),
-            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)),
-            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(AsciiType.instance, "")),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, "")),
             CompositeType.build(ByteBufferAccessor.instance),
-            CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(AsciiType.instance, "")),
+            CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(UTF8Type.instance, "")),
             CompositeType.build(ByteBufferAccessor.instance,true)
             );
         for (ByteBuffer b : tests)
@@ -567,7 +460,7 @@ void assertCompositeComparesSame(AbstractType t1, AbstractType t2, Object o1, Ob
         CompositeType tt = CompositeType.getInstance(t1, t2);
         ByteBuffer b1 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2));
         ByteBuffer b2 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o3), decomposeAndRandomPad(t2, o4));
-        assertComparesSame(tt, b1, b2);
+        assertComparesSameBuffers(tt, b1, b2);
     }
 
     @Test
@@ -591,7 +484,7 @@ public void testDynamicComposite()
     @Test
     public void testListTypeString()
     {
-        testCollection(ListType.getInstance(AsciiType.instance, true), testStrings, () -> new ArrayList<>(), new Random());
+        testCollection(ListType.getInstance(UTF8Type.instance, true), testStrings, () -> new ArrayList<>(), new Random());
     }
 
     @Test
@@ -603,7 +496,7 @@ public void testListTypeLong()
     @Test
     public void testSetTypeString()
     {
-        testCollection(SetType.getInstance(AsciiType.instance, true), testStrings, () -> new HashSet<>(), new Random());
+        testCollection(SetType.getInstance(UTF8Type.instance, true), testStrings, () -> new HashSet<>(), new Random());
     }
 
     @Test
@@ -632,13 +525,13 @@ <T, CT extends Collection<T>> void testCollection(CollectionType<CT> tt, T[] val
     @Test
     public void testMapTypeStringLong()
     {
-        testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, () -> new HashMap<>(), new Random());
+        testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new HashMap<>(), new Random());
     }
 
     @Test
     public void testMapTypeStringLongTree()
     {
-        testMap(MapType.getInstance(AsciiType.instance, LongType.instance, true), testStrings, testLongs, () -> new TreeMap<>(), new Random());
+        testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new TreeMap<>(), new Random());
     }
 
     @Test
@@ -872,12 +765,11 @@ public void testBuffers(AbstractType type, List<ByteBuffer> values)
     {
         try
         {
-            for (Object i : values) {
-                ByteBuffer b = decomposeAndRandomPad(type, i);
+            for (ByteBuffer b : values) {
                 logger.info("Value {} bytes {} ByteSource {}",
-                                  safeStr(type.getSerializer().toCQLLiteral(b)),
-                                  safeStr(ByteBufferUtil.bytesToHex(b)),
-                                  typeToComparable(type, b).byteComparableAsString(Version.OSS41));
+                            safeStr(type.getSerializer().toCQLLiteral(b)),
+                            safeStr(ByteBufferUtil.bytesToHex(b)),
+                            typeToComparable(type, b).byteComparableAsString(Version.OSS41));
             }
         }
         catch (UnsupportedOperationException e)
@@ -937,7 +829,7 @@ void assertComparesSameDecoratedKeys(IPartitioner type, ByteBuffer b1, ByteBuffe
         }
     }
 
-    private Object safeStr(Object i)
+    static Object safeStr(Object i)
     {
         if (i == null)
             return null;
@@ -1013,7 +905,7 @@ ByteBuffer decomposeAndRandomPad(AbstractType type, Object v)
         int paddedCapacity = b.remaining() + padBefore + padAfter;
         ByteBuffer padded = allocateBuffer(paddedCapacity);
         rand.ints(padBefore).forEach(x -> padded.put((byte) x));
-        padded.put(b);
+        padded.put(b.duplicate());
         rand.ints(padAfter).forEach(x -> padded.put((byte) x));
         padded.clear().limit(padded.capacity() - padAfter).position(padBefore);
         return padded;
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
new file mode 100644
index 000000000000..8e1371c59bcc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
@@ -0,0 +1,718 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.Function;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+
+import static org.apache.cassandra.utils.bytecomparable.ByteSourceComparisonTest.decomposeForTuple;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests that the result of forward + backward ByteSource translation is the same as the original.
+ */
+public class ByteSourceConversionTest extends ByteSourceTestBase
+{
+    private final static Logger logger = LoggerFactory.getLogger(ByteSourceConversionTest.class);
+    public static final Version VERSION = Version.OSS41; // byte-comparable version passed to the conversions in this class
+
+    @Rule
+    public final ExpectedException expectedException = ExpectedException.none(); // NOTE(review): no use of this rule is visible in the reviewed part of the file
+
+    @Test
+    public void testStringsAscii()
+    {
+        testType(AsciiType.instance, Arrays.stream(testStrings)
+                                           .filter(s -> s.equals(new String(s.getBytes(StandardCharsets.US_ASCII),
+                                                                            StandardCharsets.US_ASCII)))
+                                           .toArray()); // only strings left unchanged by a US-ASCII encode/decode are passed on
+    }
+
+    @Test
+    public void testStringsUTF8()
+    {
+        testType(UTF8Type.instance, testStrings); // via the UTF8Type AbstractType
+        testDirect(x -> ByteSource.of(x, VERSION), ByteSourceInverse::getString, testStrings); // via ByteSource.of and ByteSourceInverse.getString, no AbstractType involved
+    }
+
+    @Test
+    public void testBooleans()
+    {
+        testType(BooleanType.instance, testBools); // feeds the testBools fixtures to testType with BooleanType
+    }
+
+    @Test
+    public void testInts()
+    {
+        testType(Int32Type.instance, testInts); // via the Int32Type AbstractType
+        testDirect(ByteSource::of, ByteSourceInverse::getSignedInt, testInts); // via ByteSource.of and ByteSourceInverse.getSignedInt directly
+    }
+
+    @Test
+    public void randomTestInts()
+    {
+        // Random inputs make a failure hard to replay, so pick the seed explicitly and log it up front.
+        long seed = System.nanoTime();
+        logger.info("randomTestInts seed {}", seed);
+        Random rand = new Random(seed);
+        // Each drawn int is handed to assertConvertsSame together with Int32Type.
+        for (int i = 0; i < 10000; ++i)
+            assertConvertsSame(Int32Type.instance, rand.nextInt());
+    }
+
+    @Test
+    public void testLongs()
+    {
+        testType(LongType.instance, testLongs); // via the LongType AbstractType
+        testDirect(ByteSource::of, ByteSourceInverse::getSignedLong, testLongs); // via ByteSource.of and ByteSourceInverse.getSignedLong directly
+    }
+
+    @Test
+    public void testShorts()
+    {
+        testType(ShortType.instance, testShorts); // feeds the testShorts fixtures to testType with ShortType
+    }
+
+    @Test
+    public void testBytes()
+    {
+        testType(ByteType.instance, testBytes); // feeds the testBytes fixtures to testType with ByteType
+    }
+
+    @Test
+    public void testDoubles()
+    {
+        testType(DoubleType.instance, testDoubles); // feeds the testDoubles fixtures to testType with DoubleType
+    }
+
+    @Test
+    public void testFloats()
+    {
+        testType(FloatType.instance, testFloats); // feeds the testFloats fixtures to testType with FloatType
+    }
+
+    @Test
+    public void testBigInts()
+    {
+        testType(IntegerType.instance, testBigInts); // feeds the testBigInts fixtures to testType with IntegerType
+    }
+
+    @Test
+    public void testBigDecimals()
+    {
+        testTypeBuffers(DecimalType.instance, testBigDecimals); // note: uses testTypeBuffers, not testType, for DecimalType
+    }
+
+    @Test
+    public void testUUIDs()
+    {
+        testType(UUIDType.instance, testUUIDs); // feeds every testUUIDs fixture to testType with UUIDType
+    }
+
+    @Test
+    public void testTimeUUIDs()
+    {
+        testType(TimeUUIDType.instance, Arrays.stream(testUUIDs).filter(x -> x == null || x.version() == 1).toArray()); // keeps nulls and version-1 UUIDs only
+    }
+
+    @Test
+    public void testLexicalUUIDs()
+    {
+        testType(LexicalUUIDType.instance, testUUIDs); // same unfiltered UUID fixtures, this time with LexicalUUIDType
+    }
+
+    @Test
+    public void testSimpleDate()
+    {
+        testType(SimpleDateType.instance, Arrays.stream(testInts).filter(x -> x != null).toArray()); // int fixtures with nulls dropped
+    }
+
+    @Test
+    public void testTimeType()
+    {
+        testType(TimeType.instance, Arrays.stream(testLongs).filter(x -> x != null && x >= 0 && x <= 24L * 60 * 60 * 1000 * 1000 * 1000).toArray()); // non-null longs in [0, 86400e9]; presumably nanoseconds within one day - confirm against TimeType
+    }
+
+    @SuppressWarnings("deprecation")
+    @Test
+    public void testDateType()
+    {
+        testType(DateType.instance, testDates); // presumably DateType is the deprecated API the suppression above is for - confirm
+    }
+
+    @Test
+    public void testTimestampType()
+    {
+        testType(TimestampType.instance, testDates); // same testDates fixtures as testDateType, with TimestampType
+    }
+
+    @Test
+    public void testBytesType()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i) // testValues[i] holds the fixtures that belong to testTypes[i]
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+        // The decomposed buffers of all fixture types are themselves the values tested with BytesType.
+        testType(BytesType.instance, values.toArray());
+    }
+
+    @Test
+    public void testInetAddressType() throws UnknownHostException
+    {
+        testType(InetAddressType.instance, testInets); // feeds the testInets fixtures to testType with InetAddressType
+    }
+
+    @Test
+    public void testEmptyType()
+    {
+        testType(EmptyType.instance, new Void[] { null }); // the only value passed for EmptyType is a single null
+    }
+
+    @Test
+    public void testPatitionerDefinedOrder()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i) // testValues[i] holds the fixtures that belong to testTypes[i]
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+        // The same decomposed buffers are tested under a PartitionerDefinedOrder for each of three partitioners.
+        testBuffers(new PartitionerDefinedOrder(Murmur3Partitioner.instance), values);
+        testBuffers(new PartitionerDefinedOrder(RandomPartitioner.instance), values);
+        testBuffers(new PartitionerDefinedOrder(ByteOrderedPartitioner.instance), values);
+    }
+
+    @Test
+    public void testPatitionerOrder()
+    {
+        List<ByteBuffer> values = new ArrayList<>();
+        for (int i = 0; i < testValues.length; ++i) // testValues[i] holds the fixtures that belong to testTypes[i]
+            for (Object o : testValues[i])
+                values.add(testTypes[i].decompose(o));
+        // The same decomposed buffers are passed to testDecoratedKeys once per partitioner.
+        testDecoratedKeys(Murmur3Partitioner.instance, values);
+        testDecoratedKeys(RandomPartitioner.instance, values);
+        testDecoratedKeys(ByteOrderedPartitioner.instance, values);
+    }
+
+    @Test
+    public void testLocalPatitionerOrder()
+    {
+        for (int i = 0; i < testValues.length; ++i) // one LocalPartitioner per fixture type
+        {
+            final AbstractType testType = testTypes[i]; // final local so the lambda below can capture it
+            testDecoratedKeys(new LocalPartitioner(testType), Lists.transform(Arrays.asList(testValues[i]),
+                                                                                            v -> testType.decompose(v)));
+        }
+    }
+
+    interface PairTester // callback taking one pair of types and one value for each type
+    {
+        void test(AbstractType t1, AbstractType t2, Object o1, Object o2); // in testCombinationSampling o1 comes from t1's fixtures and o2 from t2's
+    }
+
+    /** Calls {@code tester} with 3 x 3 randomly drawn value pairs for every ordered pair of entries in testTypes. */
+    void testCombinationSampling(Random rand, PairTester tester)
+    {
+        for (int first = 0; first < testTypes.length; ++first)
+        {
+            for (int second = 0; second < testTypes.length; ++second)
+            {
+                Object[] firstPicks = new Object[3];
+                Object[] secondPicks = new Object[3];
+                // Draw one value for each side per slot, alternating between the two fixture rows.
+                for (int slot = 0; slot < firstPicks.length; ++slot)
+                {
+                    firstPicks[slot] = testValues[first][rand.nextInt(testValues[first].length)];
+                    secondPicks[slot] = testValues[second][rand.nextInt(testValues[second].length)];
+                }
+                for (Object left : firstPicks)
+                    for (Object right : secondPicks)
+                        tester.test(testTypes[first], testTypes[second], left, right);
+            }
+        }
+    }
+
+    @Test
+    public void testCombinations()
+    {
+        Random rand = new Random(0); // fixed seed: the sampled combinations are the same on every run
+        testCombinationSampling(rand, this::assertClusteringPairConvertsSame); // resolves to the four-argument overload
+    }
+
+    void assertClusteringPairConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2)
+    {
+        for (ValueAccessor<?> accessor : ValueAccessors.ACCESSORS) // repeat the check once per accessor listed in ValueAccessors.ACCESSORS
+            assertClusteringPairConvertsSame(accessor, t1, t2, o1, o2);
+    }
+
+    /**
+     * For each ClusteringPrefix.Kind, builds a prefix from (o1, o2), converts it to byte-comparable form and back, and
+     * asserts the result compares as 0 (and, unless a DecimalType is involved, equals) under normal and reversed types.
+     */
+    <V> void assertClusteringPairConvertsSame(ValueAccessor<V> accessor, AbstractType t1, AbstractType t2, Object o1, Object o2)
+    {
+        boolean exactEquality = t1 != DecimalType.instance && t2 != DecimalType.instance;
+        for (ClusteringPrefix.Kind kind : ClusteringPrefix.Kind.values())
+        {
+            ClusteringComparator forward = new ClusteringComparator(t1, t2);
+            V[] components = accessor.createArray(2);
+            components[0] = accessor.valueOf(t1.decompose(o1));
+            components[1] = accessor.valueOf(t2.decompose(o2));
+            ClusteringPrefix<V> original = ByteSourceComparisonTest.makeBound(accessor.factory(), kind, components);
+            final ByteComparable encoded = forward.asByteComparable(original);
+            logger.info("Clustering {} bytesource {}", original.clusteringString(forward.subtypes()), encoded.byteComparableAsString(VERSION));
+            ClusteringPrefix<V> restored = getClusteringPrefix(accessor, kind, forward, encoded);
+            assertEquals(String.format("Failed compare(%s, converted %s ByteSource %s) == 0\ntype %s",
+                                       safeStr(original.clusteringString(forward.subtypes())),
+                                       safeStr(restored.clusteringString(forward.subtypes())),
+                                       encoded.byteComparableAsString(VERSION), forward),
+                         0, forward.compare(original, restored));
+            if (exactEquality)
+                assertEquals(String.format("Failed equals %s, got %s ByteSource %s\ntype %s",
+                                           safeStr(original.clusteringString(forward.subtypes())),
+                                           safeStr(restored.clusteringString(forward.subtypes())),
+                                           encoded.byteComparableAsString(VERSION), forward),
+                             original, restored);
+            // Same round trip again with both component types wrapped in ReversedType.
+            ClusteringComparator reversed = new ClusteringComparator(ReversedType.getInstance(t1), ReversedType.getInstance(t2));
+            final ByteComparable encodedReversed = reversed.asByteComparable(original);
+            restored = getClusteringPrefix(accessor, kind, reversed, encodedReversed);
+            assertEquals(String.format("Failed reverse compare(%s, converted %s ByteSource %s) == 0\ntype %s",
+                                       safeStr(original.clusteringString(reversed.subtypes())),
+                                       safeStr(restored.clusteringString(reversed.subtypes())),
+                                       encodedReversed.byteComparableAsString(VERSION), reversed),
+                         0, reversed.compare(original, restored));
+            if (exactEquality)
+                assertEquals(String.format("Failed reverse equals %s, got %s ByteSource %s\ntype %s",
+                                           safeStr(original.clusteringString(reversed.subtypes())),
+                                           safeStr(restored.clusteringString(reversed.subtypes())),
+                                           encodedReversed.byteComparableAsString(VERSION), reversed),
+                             original, restored);
+        }
+    }
+
+    /** Decodes {@code bsc} with the {@code comp} method (clustering, bound or boundary) that matches kind {@code k1}. */
+    private static <V> ClusteringPrefix<V> getClusteringPrefix(ValueAccessor<V> accessor, ClusteringPrefix.Kind k1,
+                                                               ClusteringComparator comp, ByteComparable bsc)
+    {
+        switch (k1)
+        {
+        case STATIC_CLUSTERING:
+        case CLUSTERING:
+            return comp.clusteringFromByteComparable(accessor, bsc);
+        case EXCL_END_BOUND:
+        case INCL_END_BOUND:
+            return comp.boundFromByteComparable(accessor, bsc, true); // third argument is true for the two end-bound kinds
+        case INCL_START_BOUND:
+        case EXCL_START_BOUND:
+            return comp.boundFromByteComparable(accessor, bsc, false); // and false for the two start-bound kinds
+        case EXCL_END_INCL_START_BOUNDARY:
+        case INCL_END_EXCL_START_BOUNDARY:
+            return comp.boundaryFromByteComparable(accessor, bsc);
+        default:
+            // Reached only for a Kind value that has no case above; name it so the gap is easy to find.
+            throw new AssertionError("Unexpected clustering kind: " + k1);
+        }
+    }
+
+    /** Returns the comparable bytes of {@code bsc} at VERSION wrapped as a peekable source; a null argument yields null. */
+    private static ByteSource.Peekable source(ByteComparable bsc)
+    {
+        return bsc == null ? null
+                           : ByteSource.peekable(bsc.asComparableBytes(VERSION));
+    }
+
+    @Test
+    public void testTupleType()
+    {
+        Random rand = ThreadLocalRandom.current(); // NOTE(review): not seedable, unlike the new Random(0) in testCombinations - samples differ per run
+        testCombinationSampling(rand, this::assertTupleConvertsSame);
+    }
+
+    @Test
+    public void testTupleTypeNonFull()
+    {
+        TupleType tt = new TupleType(ImmutableList.of(UTF8Type.instance, Int32Type.instance));
+        List<ByteBuffer> tests = ImmutableList.of
+            (
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, ""),
+                                                                                decomposeAndRandomPad(Int32Type.instance, 0)}), // both components present
+            // Note: a decomposed null (e.g. decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, ""),
+                                                                                null}), // second component is a null buffer
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {null,
+                                                                                decomposeAndRandomPad(Int32Type.instance, 0)}), // first component is a null buffer
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {decomposeAndRandomPad(UTF8Type.instance, "")}), // only the first component supplied
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[] {null}), // a single null component
+            TupleType.buildValue(ByteBufferAccessor.instance, new ByteBuffer[0]) // no components at all
+            );
+        testBuffers(tt, tests);
+    }
+
+    /**
+     * Serializes (o1, o2) as a tuple of (t1, t2) and passes the buffer to assertConvertsSameBuffers.
+     * Null values are kept as null component buffers by decomposeForTuple rather than being decomposed.
+     */
+    void assertTupleConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2)
+    {
+        TupleType tupleType = new TupleType(ImmutableList.of(t1, t2));
+        ByteBuffer[] components = { decomposeForTuple(t1, o1), decomposeForTuple(t2, o2) };
+        ByteBuffer serialized = TupleType.buildValue(ByteBufferAccessor.instance, components);
+        assertConvertsSameBuffers(tupleType, serialized);
+    }
+
+    @Test
+    public void testCompositeType()
+    {
+        Random rand = new Random(0); // fixed seed: the sampled combinations are the same on every run
+        testCombinationSampling(rand, this::assertCompositeConvertsSame);
+    }
+
+    @Test
+    public void testCompositeTypeNonFull()
+    {
+        CompositeType tt = CompositeType.getInstance(UTF8Type.instance, Int32Type.instance);
+        List<ByteBuffer> tests = ImmutableList.of
+            (
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)),
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)), // second component is a decomposed null
+            CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, "")), // only the first component
+            CompositeType.build(ByteBufferAccessor.instance), // no components
+            CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(UTF8Type.instance, "")), // boolean overload; presumably marks a static composite - confirm
+            CompositeType.build(ByteBufferAccessor.instance,true)
+            );
+        for (ByteBuffer b : tests) // every buffer is validated against the type before the conversion test
+            tt.validate(b);
+        testBuffers(tt, tests);
+    }
+
+    /** Builds the composite (o1, o2) of type (t1, t2) from randomly padded buffers and passes it to assertConvertsSameBuffers. */
+    void assertCompositeConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2)
+    {
+        assertConvertsSameBuffers(CompositeType.getInstance(t1, t2),
+                                  CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2)));
+    }
+
+    @Test
+    public void testDynamicComposite()
+    {
+        DynamicCompositeType tt = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases); // aliases, uuids and the key builder are borrowed from DynamicCompositeTypeTest
+        UUID[] uuids = DynamicCompositeTypeTest.uuids;
+        List<ByteBuffer> tests = ImmutableList.of
+            (
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", null, -1, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 24, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 42, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[0], -1, false, true),
+            DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[1], 42, false, true)
+            );
+        for (ByteBuffer b : tests) // every key is validated against the type before the conversion test
+            tt.validate(b);
+        testBuffers(tt, tests);
+    }
+
+    @Test
+    public void testListTypeString()
+    {
+        testCollection(ListType.getInstance(UTF8Type.instance, true), testStrings, () -> new ArrayList<>(), new Random()); // the Random is accepted but not read by testCollection
+    }
+
+    @Test
+    public void testListTypeLong()
+    {
+        testCollection(ListType.getInstance(LongType.instance, true), testLongs, () -> new ArrayList<>(), new Random()); // the Random is accepted but not read by testCollection
+    }
+
+    @Test
+    public void testSetTypeString()
+    {
+        testCollection(SetType.getInstance(UTF8Type.instance, true), testStrings, () -> new HashSet<>(), new Random()); // the Random is accepted but not read by testCollection
+    }
+
+    @Test
+    public void testSetTypeLong()
+    {
+        testCollection(SetType.getInstance(LongType.instance, true), testLongs, () -> new HashSet<>(), new Random()); // the Random is accepted but not read by testCollection
+    }
+
+    /**
+     * Runs testType on an empty collection plus five collections for each target size 1..3, filled by cycling through
+     * {@code values}; null entries are skipped, so a collection may hold fewer elements. {@code rand} is not read.
+     */
+    <T, CT extends Collection<T>> void testCollection(CollectionType<CT> tt, T[] values, Supplier<CT> gen, Random rand)
+    {
+        List<CT> samples = new ArrayList<>();
+        samples.add(gen.get());
+        int next = 0;
+        for (int size = 1; size <= 3; ++size)
+            for (int repeat = 0; repeat < 5; ++repeat)
+            {
+                CT sample = gen.get();
+                for (int added = 0; added < size; ++added, ++next)
+                    if (values[next % values.length] != null)
+                        sample.add(values[next % values.length]);
+                samples.add(sample);
+            }
+        testType(tt, samples.toArray());
+    }
+
+    @Test
+    public void testMapTypeStringLong()
+    {
+        testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, HashMap::new, new Random());
+    }
+
+    @Test
+    public void testMapTypeStringLongTree()
+    {
+        testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, TreeMap::new, new Random());
+    }
+
+    <K, V, M extends Map<K, V>> void testMap(MapType<K, V> tt, K[] keys, V[] values, Supplier<M> gen, Random rand)
+    {
+        // One empty map, then five per target size 1..3; a null value is skipped before any key is drawn.
+        List<M> samples = new ArrayList<>();
+        samples.add(gen.get());
+        for (int size = 1; size <= 3; ++size)
+            for (int round = 0; round < 5; ++round)
+            {
+                M sample = gen.get();
+                for (int attempt = 0; attempt < size; ++attempt)
+                {
+                    V candidate = values[rand.nextInt(values.length)];
+                    if (candidate != null)
+                        sample.put(keys[rand.nextInt(keys.length)], candidate);
+                }
+                samples.add(sample);
+            }
+        testType(tt, samples.toArray());
+    }
+
+    /*
+     * Wraps a serialized value as a ByteComparable backed by type.asComparableBytes; toString() gives type.getString(value).
+     */
+    private ByteComparable typeToComparable(AbstractType type, ByteBuffer value)
+    {
+        return new ByteComparable() // anonymous class rather than a lambda, because toString() is overridden as well
+        {
+            @Override
+            public ByteSource asComparableBytes(Version v)
+            {
+                return type.asComparableBytes(value, v);
+            }
+
+            @Override
+            public String toString()
+            {
+                return type.getString(value);
+            }
+        };
+    }
+
+    public void testType(AbstractType type, Object[] values)
+    {
+        for (Object value : values) // log pass: CQL literal, serialized bytes and byte-comparable form of each value
+        {
+            ByteBuffer padded = decomposeAndRandomPad(type, value);
+            logger.info("Value {} ({}) bytes {} ByteSource {}",
+                        safeStr(value), safeStr(type.getSerializer().toCQLLiteral(padded)),
+                        safeStr(ByteBufferUtil.bytesToHex(padded)),
+                        typeToComparable(type, padded).byteComparableAsString(VERSION));
+        }
+        for (Object value : values)
+            assertConvertsSame(type, value);
+        if (!type.isReversed()) // run the same values once more against the ReversedType wrapper of this type
+            testType(ReversedType.getInstance(type), values);
+    }
+
+    public void testTypeBuffers(AbstractType type, Object[] values)
+    {
+        // Unlike testType, the round-tripped buffers are judged with type.compare instead of equals.
+        List<ByteBuffer> buffers = Arrays.stream(values)
+                                         .map(value -> decomposeAndRandomPad(type, value))
+                                         .collect(Collectors.toList());
+        testBuffers(type, buffers);
+    }
+    public void testBuffers(AbstractType type, List<ByteBuffer> values)
+    {
+        // Listing the values is best effort: an UnsupportedOperationException while rendering only ends the listing.
+        try
+        {
+            for (ByteBuffer buffer : values)
+                logger.info("Value {} bytes {} ByteSource {}",
+                            safeStr(type.getSerializer().toCQLLiteral(buffer)),
+                            safeStr(ByteBufferUtil.bytesToHex(buffer)),
+                            typeToComparable(type, buffer).byteComparableAsString(VERSION));
+        }
+        catch (UnsupportedOperationException ignored)
+        {
+            // Continue without listing values.
+        }
+
+        for (ByteBuffer buffer : values)
+            assertConvertsSameBuffers(type, buffer);
+    }
+
+    void assertConvertsSameBuffers(AbstractType type, ByteBuffer b1)
+    {
+        // Round-trip b1 through its byte-comparable form; the result must compare as equal under type.compare.
+        final ByteComparable comparable = typeToComparable(type, b1);
+        ByteBuffer roundTripped = type.fromComparableBytes(source(comparable), VERSION);
+        String failure = String.format("Failed compare(%s, converted %s (bytesource %s))",
+                                       ByteBufferUtil.bytesToHex(b1),
+                                       ByteBufferUtil.bytesToHex(roundTripped),
+                                       comparable.byteComparableAsString(VERSION));
+
+        assertEquals(failure, 0, type.compare(b1, roundTripped));
+    }
+
+    public void testDecoratedKeys(IPartitioner type, List<ByteBuffer> values)
+    {
+        // Each raw key is decorated with the given partitioner and round-tripped on its own.
+        values.forEach(key -> assertConvertsSameDecoratedKeys(type, key));
+    }
+
+    void assertConvertsSameDecoratedKeys(IPartitioner type, ByteBuffer b1)
+    {
+        // Decorate the raw key, rebuild a key from its byte-comparable form, and require the two keys to agree
+        // under both compareTo and equals.
+        DecoratedKey k1 = type.decorateKey(b1);
+        DecoratedKey actual = BufferDecoratedKey.fromByteComparable(k1, VERSION, type);
+
+        // Both failure messages describe the same operands; only the leading words differ.
+        String operands = String.format("(%s[%s bs %s], %s[%s bs %s])\npartitioner %s",
+                                        k1,
+                                        ByteBufferUtil.bytesToHex(b1),
+                                        k1.byteComparableAsString(VERSION),
+                                        actual,
+                                        ByteBufferUtil.bytesToHex(actual.getKey()),
+                                        actual.byteComparableAsString(VERSION),
+                                        type);
+
+        // Ordering check first ...
+        assertEquals("Failed compare" + operands,
+                     0,
+                     k1.compareTo(actual));
+        // ... then equality.
+        assertEquals("Failed equals" + operands,
+                     k1,
+                     actual);
+    }
+
+    static Object safeStr(Object i)
+    {
+        // Log-friendly rendering: a ByteBuffer is shown as hex, anything else through toString(); the text is cut
+        // after 100 characters (with "..." appended) and NUL characters are shown as <0>. Null stays null.
+        if (i == null)
+            return null;
+
+        String text = i instanceof ByteBuffer
+                      ? ByteBufferUtil.bytesToHex((ByteBuffer) i)
+                      : i.toString();
+        if (text.length() > 100)
+            text = text.substring(0, 100) + "...";
+        return text.replace("\0", "<0>");
+    }
+
+    public <T> void testDirect(Function<T, ByteSource> convertor, Function<ByteSource.Peekable, T> inverse, T[] values)
+    {
+        // First pass: log every non-null value next to the ByteSource produced for it.
+        for (T value : values)
+        {
+            if (value != null)
+                logger.info("Value {} ByteSource {}\n", safeStr(value), convertor.apply(value));
+        }
+        // Second pass: every non-null value must survive convertor followed by inverse.
+        for (T value : values)
+        {
+            if (value != null)
+                assertConvertsSame(convertor, inverse, value);
+        }
+    }
+
+    <T> void assertConvertsSame(Function<T, ByteSource> convertor, Function<ByteSource.Peekable, T> inverse, T v1)
+    {
+        ByteComparable comparable = ignoredVersion -> convertor.apply(v1); // converts afresh each time bytes are requested
+        T decoded = inverse.apply(source(comparable));
+        assertEquals("ByteSource " + comparable.byteComparableAsString(VERSION), v1, decoded);
+    }
+
+    void assertConvertsSame(AbstractType type, Object v1)
+    {
+        // Serialize with padding, convert to the byte-comparable form and back, compose, then compare by equals.
+        ByteBuffer serialized = decomposeAndRandomPad(type, v1);
+        final ByteComparable comparable = typeToComparable(type, serialized);
+        ByteBuffer roundTripped = type.fromComparableBytes(source(comparable), VERSION);
+        Object actual = type.compose(roundTripped);
+        String failure = String.format("Failed equals %s(%s bs %s), got %s",
+                                       safeStr(v1),
+                                       ByteBufferUtil.bytesToHex(serialized),
+                                       safeStr(comparable.byteComparableAsString(VERSION)),
+                                       safeStr(actual));
+
+        assertEquals(failure, v1, actual);
+    }
+
+    ByteBuffer decomposeAndRandomPad(AbstractType type, Object v)
+    {
+        ByteBuffer b = type.decompose(v);
+        Random rand = new Random(0); // re-seeded with 0 on every call, so pad sizes and pad bytes are the same each time
+        int padBefore = rand.nextInt(16);
+        int padAfter = rand.nextInt(16);
+        int paddedCapacity = b.remaining() + padBefore + padAfter;
+        ByteBuffer padded = allocateBuffer(paddedCapacity);
+        rand.ints(padBefore).forEach(x -> padded.put((byte) x)); // leading garbage
+        padded.put(b.duplicate()); // copy through a duplicate so the position of b itself is not moved
+        rand.ints(padAfter).forEach(x -> padded.put((byte) x)); // trailing garbage
+        padded.clear().limit(padded.capacity() - padAfter).position(padBefore); // position/limit bracket the payload, assuming allocateBuffer returned exactly paddedCapacity bytes
+        return padded;
+    }
+
+    protected ByteBuffer allocateBuffer(int paddedCapacity)
+    {
+        return ByteBuffer.allocate(paddedCapacity); // heap buffer here; protected, presumably so a subclass can supply another kind of buffer
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java
new file mode 100644
index 000000000000..7a20b07ac236
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java
@@ -0,0 +1,321 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.utils.*;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Consumer;
+import java.util.function.Function;
+import java.util.function.IntConsumer;
+import java.util.function.LongConsumer;
+import java.util.stream.*;
+
+import com.google.common.collect.ImmutableList;
+
+@RunWith(Parameterized.class)
+public class ByteSourceInverseTest
+{
+    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()";
+
+    @Parameterized.Parameters(name = "version={0}")
+    public static Iterable<ByteComparable.Version> versions()
+    {
+        return ImmutableList.of(ByteComparable.Version.OSS41); // the tests of this class run once per version listed here
+    }
+
+    private final ByteComparable.Version version;
+
+    public ByteSourceInverseTest(ByteComparable.Version version)
+    {
+        this.version = version; // one of the values returned by versions(), supplied by the Parameterized runner
+    }
+
+    @Test
+    public void testGetSignedInt()
+    {
+        // An int encoded with ByteSource.of(int) must decode to the same int with getSignedInt.
+        IntConsumer roundTrip =
+            original -> Assert.assertEquals(original, ByteSourceInverse.getSignedInt(ByteSource.of(original)));
+
+        // Hand-picked boundary values first ...
+        int[] boundaries = { Integer.MIN_VALUE, Integer.MIN_VALUE + 1,
+                             -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256,
+                             Integer.MAX_VALUE - 1, Integer.MAX_VALUE };
+        IntStream.of(boundaries).forEach(roundTrip);
+
+        // ... then 1000 random ints.
+        new Random().ints(1000)
+                    .forEach(roundTrip);
+    }
+
+    @Test
+    public void testNextInt()
+    {
+        // The high and low 32 bits of this long differ only in the first and last bit (in the high 32 bits they are
+        // both 0s instead of 1s). The first bit difference will be negated by the bit flipping when writing down a
+        // fixed length signed number, so the only remaining difference will be in the last bit.
+        int hi = 0b0001_0010_0011_0100_0101_0110_0111_1000;
+        int lo = hi | 1 | 1 << 31; // hi with its highest and lowest bits set
+        long l1 = Integer.toUnsignedLong(hi) << 32 | Integer.toUnsignedLong(lo); // hi in the upper 32 bits, lo in the lower 32 bits
+
+        ByteSource byteSource = ByteSource.of(l1);
+        int i1 = ByteSourceInverse.getSignedInt(byteSource);
+        int i2 = ByteSourceInverse.getSignedInt(byteSource); // second read continues on the same source
+        Assert.assertEquals(i1 + 1, i2);
+
+        try
+        {
+            ByteSourceInverse.getSignedInt(byteSource); // the source should be exhausted after two ints, so this read is expected to throw
+            Assert.fail();
+        }
+        catch (IllegalArgumentException e)
+        {
+            // Expected.
+        }
+
+        byteSource = ByteSource.of(l1); // a fresh source over the same long must replay the same two ints
+        int iFirst = ByteSourceInverse.getSignedInt(byteSource);
+        Assert.assertEquals(i1, iFirst);
+        int iNext = ByteSourceInverse.getSignedInt(byteSource);
+        Assert.assertEquals(i2, iNext);
+    }
+
+    @Test
+    public void testGetSignedLong()
+    {
+        // A long encoded with ByteSource.of(long) must decode to the same long with getSignedLong.
+        LongConsumer roundTrip =
+            original -> Assert.assertEquals(original, ByteSourceInverse.getSignedLong(ByteSource.of(original)));
+
+        // Hand-picked boundary values first ...
+        long[] boundaries = { Long.MIN_VALUE, Long.MIN_VALUE + 1, Integer.MIN_VALUE - 1L,
+                              -256L, -255L, -128L, -127L, -1L, 0L, 1L, 127L, 128L, 255L, 256L,
+                              Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, Long.MAX_VALUE };
+        LongStream.of(boundaries).forEach(roundTrip);
+
+        // ... then 1000 random longs.
+        new Random().longs(1000)
+                    .forEach(roundTrip);
+    }
+
+    @Test
+    public void testGetSignedByte()
+    {
+        // Exhaustive over all byte values: serialize with ByteType, convert to the byte-comparable form
+        // for the parameterized version, and decode again with getSignedByte. The decoded value must
+        // equal the one we started from.
+        for (int value = Byte.MIN_VALUE; value <= Byte.MAX_VALUE; ++value)
+        {
+            byte initial = (byte) value;
+            ByteBuffer serialized = ByteType.instance.decompose(initial);
+            ByteSource comparable = ByteType.instance.asComparableBytes(serialized, version);
+            byte decoded = ByteSourceInverse.getSignedByte(comparable);
+            Assert.assertEquals(initial, decoded);
+        }
+    }
+
+    @Test
+    public void testGetSignedShort()
+    {
+        // Exhaustive over all short values: serialize with ShortType, convert to the byte-comparable form
+        // for the parameterized version, and decode again with getSignedShort. The decoded value must
+        // equal the one we started from.
+        for (int value = Short.MIN_VALUE; value <= Short.MAX_VALUE; ++value)
+        {
+            short initial = (short) value;
+            ByteBuffer serialized = ShortType.instance.decompose(initial);
+            ByteSource comparable = ShortType.instance.asComparableBytes(serialized, version);
+            short decoded = ByteSourceInverse.getSignedShort(comparable);
+            Assert.assertEquals(initial, decoded);
+        }
+    }
+
+    @Test
+    public void testBadByteSourceForFixedLengthNumbers()
+    {
+        // Each fixed-length decoder, looked up reflectively, must reject a null and an empty source with an IllegalArgumentException.
+        Stream.of("getSignedInt", "getSignedLong", "getSignedByte", "getSignedShort")
+              .map(methodName ->
+                   {
+                       try
+                       {
+                           return ByteSourceInverse.class.getMethod(methodName, ByteSource.class);
+                       }
+                       catch (NoSuchMethodException e)
+                       {
+                           Assert.fail("Expected ByteSourceInverse to have method called " + methodName
+                                               + " with a single parameter of type ByteSource");
+                       }
+                       return null; // only here to satisfy the compiler: Assert.fail above always throws
+                   })
+              .forEach(fixedLengthNumberMethod ->
+                       {
+                           for (ByteSource badSource : Arrays.asList(null, ByteSource.EMPTY))
+                           {
+                               try
+                               {
+                                   fixedLengthNumberMethod.invoke(ByteSourceInverse.class, badSource);
+                                   Assert.fail("Expected IllegalArgumentException not thrown");
+                               }
+                               catch (Throwable maybe)
+                               {
+                                   if (maybe instanceof AssertionError)
+                                       throw (AssertionError) maybe; // the "not thrown" failure above must surface as-is
+                                   // The reflective call may wrap the exception, so the cause is accepted as well.
+                                   if (!(maybe instanceof IllegalArgumentException
+                                           || maybe.getCause() instanceof IllegalArgumentException))
+                                       Assert.fail("Unexpected throwable " + maybe + " with cause " + maybe.getCause());
+                               }
+                           }
+                       });
+    }
+
+    @Test
+    public void testGetString()
+    {
+        // A null string is represented by a null source and must decode back to null; anything else goes through
+        // ByteSource.of(String, version) and must decode to an equal string.
+        Consumer<String> roundTrip = original ->
+        {
+            ByteSource.Peekable encoded = null;
+            if (original != null)
+                encoded = ByteSource.peekable(ByteSource.of(original, version));
+            Assert.assertEquals(original, ByteSourceInverse.getString(encoded));
+        };
+
+        // Hand-picked cases: null, non-ASCII text, the empty string, and strings with control and NUL characters.
+        Stream.of(null, "© 2018 DataStax", "", "\n", "\0", "\0\0", "\001", "0", "0\0", "00", "1")
+              .forEach(roundTrip);
+
+        // 1000 random strings of length 10 built by newRandomAlphanumeric.
+        Random prng = new Random();
+        for (int i = 0; i < 1000; ++i)
+            roundTrip.accept(newRandomAlphanumeric(prng, 10));
+    }
+
+    private static String newRandomAlphanumeric(Random prng, int length)
+    {
+        char[] picked = new char[length]; // one independently drawn ALPHABET character per slot
+        for (int slot = 0; slot < length; ++slot)
+            picked[slot] = ALPHABET.charAt(prng.nextInt(ALPHABET.length()));
+        return new String(picked);
+    }
+
+    @Test
+    public void testGetByteBuffer()
+    {
+        // Arbitrary bytes escaped by ByteSource.of(ByteBuffer, version) must be restored exactly by getUnescapedBytes.
+        Consumer<ByteBuffer> byteBufferConsumer = initial ->
+        {
+            ByteSource.Peekable byteSource = ByteSource.peekable(ByteSource.of(initial, version));
+            byte[] decodedBytes = ByteSourceInverse.getUnescapedBytes(byteSource);
+            byte[] initialBytes = ByteBufferUtil.getArray(initial);
+            // assertArrayEquals reports where the arrays first differ, unlike assertTrue(Arrays.equals(...)).
+            Assert.assertArrayEquals(initialBytes, decodedBytes);
+        };
+
+        // Hand-picked inputs placing the bytes named in the comments below at the start, middle and end.
+        Arrays.asList(
+                // ESCAPE - leading, in the middle, trailing
+                new byte[] {0, 2, 3, 4, 5}, new byte[] {1, 2, 0, 4, 5}, new byte[] {1, 2, 3, 4, 0},
+                // END_OF_STREAM/ESCAPED_0_DONE - leading, in the middle, trailing
+                new byte[] {-1, 2, 3, 4, 5}, new byte[] {1, 2, -1, 4, 5}, new byte[] {1, 2, 3, 4, -1},
+                // ESCAPED_0_CONT - leading, in the middle, trailing
+                new byte[] {-2, 2, 3, 4, 5}, new byte[] {1, 2, -2, 4, 5}, new byte[] {1, 2, 3, 4, -2},
+                // ESCAPE + ESCAPED_0_DONE - leading, in the middle, trailing
+                new byte[] {0, -1, 3, 4, 5}, new byte[] {1, 0, -1, 4, 5}, new byte[] {1, 2, 3, 0, -1},
+                // ESCAPE + ESCAPED_0_CONT + ESCAPED_0_DONE - leading, in the middle, trailing
+                new byte[] {0, -2, -1, 4, 5}, new byte[] {1, 0, -2, -1, 5}, new byte[] {1, 2, 0, -2, -1})
+              .forEach(tricky -> byteBufferConsumer.accept(ByteBuffer.wrap(tricky)));
+
+        // 1000 rounds of 1000 random bytes; the same array is refilled and re-wrapped on each round.
+        byte[] bytes = new byte[1000];
+        Random prng = new Random();
+        for (int i = 0; i < 1000; ++i)
+        {
+            prng.nextBytes(bytes);
+            byteBufferConsumer.accept(ByteBuffer.wrap(bytes));
+        }
+
+        // 1000 random 10-character strings from newRandomAlphanumeric, converted with ByteBufferUtil.bytes.
+        for (int i = 0; i < 1000; ++i)
+            byteBufferConsumer.accept(ByteBufferUtil.bytes(newRandomAlphanumeric(prng, 10)));
+    }
+
+    @Test
+    public void testReadBytes()
+    {
+        Map<Class<?>, Function<Object, ByteSource>> generatorPerType = new HashMap<>(); // value class -> encoder producing a ByteSource for a value of that class
+        List<Object> originalValues = new ArrayList<>(); // 100 random values of each registered class
+        Random prng = new Random();
+
+        generatorPerType.put(String.class, s ->
+        {
+            String string = (String) s;
+            return ByteSource.of(string, version);
+        });
+        for (int i = 0; i < 100; ++i)
+            originalValues.add(newRandomAlphanumeric(prng, 10));
+
+        generatorPerType.put(Integer.class, i ->
+        {
+            Integer integer = (Integer) i;
+            return ByteSource.of(integer);
+        });
+        for (int i = 0; i < 100; ++i)
+            originalValues.add(prng.nextInt());
+
+        generatorPerType.put(Long.class, l ->
+        {
+            Long looong = (Long) l;
+            return ByteSource.of(looong);
+        });
+        for (int i = 0; i < 100; ++i)
+            originalValues.add(prng.nextLong());
+
+        generatorPerType.put(UUID.class, u ->
+        {
+            UUID uuid = (UUID) u;
+            ByteBuffer uuidBuffer = UUIDType.instance.decompose(uuid);
+            return UUIDType.instance.asComparableBytes(uuidBuffer, version);
+        });
+        for (int i = 0; i < 100; ++i)
+            originalValues.add(UUID.randomUUID());
+
+        for (Object value : originalValues)
+        {
+            Class<?> type = value.getClass(); // the runtime class selects the matching encoder
+            Function<Object, ByteSource> generator = generatorPerType.get(type);
+            ByteSource originalSource = generator.apply(value); // handed to readBytes
+            ByteSource originalSourceCopy = generator.apply(value); // second, separately generated source used as the reference below
+            byte[] bytes = ByteSourceInverse.readBytes(originalSource);
+            // The best way to test the read bytes seems to be to assert that just directly using them as a
+            // ByteSource (using ByteSource.fixedLength(byte[])) they compare as equal to another ByteSource obtained
+            // from the same original value.
+            int compare = ByteComparable.compare(v -> originalSourceCopy, v -> ByteSource.fixedLength(bytes), version);
+            Assert.assertEquals(0, compare);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java
new file mode 100644
index 000000000000..c7189debceec
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java
@@ -0,0 +1,781 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Function;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.BufferClusteringBound;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.utils.UUIDGen;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.CachedHashDecoratedKey;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+@RunWith(Parameterized.class)
+public class ByteSourceSequenceTest
+{
+
+    private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()";
+
+    @Parameterized.Parameters(name = "version={0}")
+    public static Iterable<ByteComparable.Version> versions()
+    {
+        return ImmutableList.of(ByteComparable.Version.OSS41); // the tests of this class run once per version listed here
+    }
+
+    private final ByteComparable.Version version;
+
+    public ByteSourceSequenceTest(ByteComparable.Version version)
+    {
+        this.version = version; // one of the values returned by versions(), supplied by the Parameterized runner
+    }
+
+    @Test
+    public void testNullsSequence()
+    {
+        ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, null, null
+                ByteSource.TERMINATOR,
+                null, null, null
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); // after the three components the next byte must be the terminator
+    }
+
+    @Test
+    public void testNullsAndUnknownLengthsSequence()
+    {
+        ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, "b", "c"
+                ByteSource.TERMINATOR,
+                null, ByteSource.of("b", version), ByteSource.of("c", version)
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "b");
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c");
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: "a", null, "c"
+                ByteSource.TERMINATOR,
+                ByteSource.of("a", version), null, ByteSource.of("c", version)
+        ));
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "a");
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c");
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: "a", "b", null
+                ByteSource.TERMINATOR,
+                ByteSource.of("a", version), ByteSource.of("b", version), null
+        ));
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "a");
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "b");
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: "a", null, null
+                ByteSource.TERMINATOR,
+                ByteSource.of("a", version), null, null
+        ));
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "a");
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, null, "c"
+                ByteSource.TERMINATOR,
+                null, null, ByteSource.of("c", version)
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c");
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+    }
+
+    private static void expectNextComponentNull(ByteSource.Peekable comparableBytes)
+    {
+        // The next separator is expected to signify null, in which case nextComponentSource hands back
+        // null instead of a component source.
+        assertNull(ByteSourceInverse.nextComponentSource(comparableBytes));
+    }
+
+    private static <T> void expectNextComponentValue(ByteSource.Peekable comparableBytes,
+                                                     Function<ByteSource.Peekable, T> decoder,
+                                                     T expected)
+    {
+        // The next separator is expected to introduce a real (non-null) component; decoding that component
+        // with the supplied decoder must give the expected value.
+        ByteSource.Peekable component = ByteSourceInverse.nextComponentSource(comparableBytes);
+        assertNotNull(component);
+        assertEquals(expected, decoder.apply(component));
+    }
+
+    @Test
+    public void testNullsAndKnownLengthsSequence()
+    {
+        int intValue = 42;
+        BigInteger varintValue = BigInteger.valueOf(2018L);
+        ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, int, varint
+                ByteSource.TERMINATOR,
+                null, ByteSource.of(intValue), varintToByteSource(varintValue)
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue);
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: int, null, varint
+                ByteSource.TERMINATOR,
+                ByteSource.of(intValue), null, varintToByteSource(varintValue)
+        ));
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: int, varint, null
+                ByteSource.TERMINATOR,
+                ByteSource.of(intValue), varintToByteSource(varintValue), null
+        ));
+        expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue);
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, null, varint
+                ByteSource.TERMINATOR,
+                null, null, varintToByteSource(varintValue)
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: null, varint, null
+                ByteSource.TERMINATOR,
+                null, varintToByteSource(varintValue), null
+        ));
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: varint, null, null
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), null, null
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        Boolean boolValue = new Random().nextBoolean(); // unseeded: either boolean value may be exercised on a given run
+        ByteSource boolSource = BooleanType.instance.asComparableBytes(BooleanType.instance.decompose(boolValue), version);
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: varint, boolean, null
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), boolSource, null
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentValue(comparableBytes, BooleanType.instance, boolValue);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        boolSource = BooleanType.instance.asComparableBytes(BooleanType.instance.decompose(boolValue), version); // rebuilt, presumably because the previous source was already consumed
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator( // sequence: varint, null, boolean
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), null, boolSource
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, BooleanType.instance, boolValue);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+    }
+
+    /** Decodes each signed fixed-length type, first as a value and then as null, from the middle of a sequence. */
+    @Test
+    public void testOptionalSignedFixedLengthTypesSequence()
+    {
+        Random prng = new Random();
+        String randomString = newRandomAlphanumeric(prng, 10);
+        byte randomByte = (byte) prng.nextInt();
+        short randomShort = (short) prng.nextInt();
+        int randomInt = prng.nextInt();
+        long randomLong = prng.nextLong();
+        BigInteger randomVarint = BigInteger.probablePrime(80, prng);
+
+        // Filled with plain puts: double-brace initialization would create an anonymous HashMap subclass.
+        Map<AbstractType<?>, ByteBuffer> valuesByType = new HashMap<>();
+        valuesByType.put(ByteType.instance, ByteType.instance.decompose(randomByte));
+        valuesByType.put(ShortType.instance, ShortType.instance.decompose(randomShort));
+        valuesByType.put(SimpleDateType.instance, SimpleDateType.instance.decompose(randomInt));
+        valuesByType.put(TimeType.instance, TimeType.instance.decompose(randomLong));
+
+        for (Map.Entry<AbstractType<?>, ByteBuffer> entry : valuesByType.entrySet())
+        {
+            AbstractType<?> type = entry.getKey();
+            ByteBuffer value = entry.getValue();
+            // The value under test is placed between a string component and a varint component.
+            ByteSource byteSource = type.asComparableBytes(value, version);
+            ByteSource.Peekable sequence = ByteSource.peekable(ByteSource.withTerminator(
+                    ByteSource.TERMINATOR,
+                    ByteSource.of(randomString, version), byteSource, varintToByteSource(randomVarint)
+            ));
+            expectNextComponentValue(sequence, ByteSourceInverse::getString, randomString);
+            expectNextComponentValue(sequence, type, value);
+            expectNextComponentValue(sequence, VARINT, randomVarint);
+            assertEquals(ByteSource.TERMINATOR, sequence.next());
+            // Same sequence again, with the middle component encoded from a null value of the same type.
+            byteSource = type.asComparableBytes(type.decompose(null), version);
+            sequence = ByteSource.peekable(ByteSource.withTerminator(
+                    ByteSource.TERMINATOR,
+                    ByteSource.of(randomString, version), byteSource, varintToByteSource(randomVarint)
+            ));
+            expectNextComponentValue(sequence, ByteSourceInverse::getString, randomString);
+            expectNextComponentNull(sequence);
+            expectNextComponentValue(sequence, VARINT, randomVarint);
+            assertEquals(ByteSource.TERMINATOR, sequence.next());
+        }
+    }
+
+    // Serializes the varint with VARINT and returns its byte-comparable source for the version under test.
+    private ByteSource varintToByteSource(BigInteger value)
+    {
+        return VARINT.asComparableBytes(VARINT.decompose(value), version);
+    }
+
+    private static final UTF8Type UTF8 = UTF8Type.instance; // short aliases for the type singletons used below
+    private static final DecimalType DECIMAL = DecimalType.instance;
+    private static final IntegerType VARINT = IntegerType.instance;
+
+    // (UTF8, DECIMAL, VARINT) comparator with no component reversed, i.e. natural ordering for all types.
+    private static final ClusteringComparator COMP = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            DECIMAL,
+            VARINT
+    ));
+    // Same components, reversing only UTF8 (referred to in these tests as the first unknown length type).
+    private static final ClusteringComparator COMP_REVERSED_UNKNOWN_LENGTH = new ClusteringComparator(Arrays.asList(
+            ReversedType.getInstance(UTF8),
+            DECIMAL,
+            VARINT
+    ));
+    // Same components, reversing only DECIMAL (referred to in these tests as the second unknown length type).
+    private static final ClusteringComparator COMP_REVERSED_UNKNOWN_LENGTH_2 = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            ReversedType.getInstance(DECIMAL),
+            VARINT
+    ));
+    // Same components, reversing only VARINT (referred to in these tests as the known/computable length type).
+    private static final ClusteringComparator COMP_REVERSED_KNOWN_LENGTH = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            DECIMAL,
+            ReversedType.getInstance(VARINT)
+    ));
+    // Same components, with every one of them wrapped in ReversedType.
+    private static final ClusteringComparator COMP_ALL_REVERSED = new ClusteringComparator(Arrays.asList(
+            ReversedType.getInstance(UTF8),
+            ReversedType.getInstance(DECIMAL),
+            ReversedType.getInstance(VARINT)
+    ));
+
+    @Test
+    public void testClusteringPrefixBoundNormalAndReversed()
+    {
+        String text = "Lorem ipsum dolor sit amet";
+        BigDecimal decimal = BigDecimal.valueOf(123456789, 20);
+        BigInteger varint = BigInteger.valueOf(2018L);
+
+        // Three non-null clustering components. Every non-boundary prefix kind built from them is encoded with each
+        // comparator below and must decode back, component by component, to the original values.
+        ByteBuffer[] components = new ByteBuffer[] {
+                UTF8.decompose(text),
+                DECIMAL.decompose(decimal),
+                VARINT.decompose(varint)
+        };
+
+        for (ClusteringPrefix.Kind kind : ClusteringPrefix.Kind.values())
+        {
+            if (kind.isBoundary())
+                continue;
+
+            // Natural ordering for every component.
+            ClusteringPrefix bound = BufferClusteringBound.create(kind, components);
+            ByteSource.Peekable encoded = COMP.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentValue(encoded, DECIMAL, decimal);
+            expectNextComponentValue(encoded, VARINT, varint);
+
+            // Only the UTF8 component reversed.
+            bound = BufferClusteringBound.create(kind, components);
+            encoded = COMP_REVERSED_UNKNOWN_LENGTH.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, ReversedType.getInstance(UTF8), text);
+            expectNextComponentValue(encoded, DECIMAL, decimal);
+            expectNextComponentValue(encoded, VARINT, varint);
+
+            // Only the DECIMAL component reversed.
+            bound = BufferClusteringBound.create(kind, components);
+            encoded = COMP_REVERSED_UNKNOWN_LENGTH_2.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentValue(encoded, ReversedType.getInstance(DECIMAL), decimal);
+            expectNextComponentValue(encoded, VARINT, varint);
+
+            // Only the VARINT component reversed.
+            bound = BufferClusteringBound.create(kind, components);
+            encoded = COMP_REVERSED_KNOWN_LENGTH.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentValue(encoded, DECIMAL, decimal);
+            expectNextComponentValue(encoded, ReversedType.getInstance(VARINT), varint);
+
+            // Every component reversed.
+            bound = BufferClusteringBound.create(kind, components);
+            encoded = COMP_ALL_REVERSED.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, ReversedType.getInstance(UTF8), text);
+            expectNextComponentValue(encoded, ReversedType.getInstance(DECIMAL), decimal);
+            expectNextComponentValue(encoded, ReversedType.getInstance(VARINT), varint);
+        }
+    }
+
+    @Test
+    public void testClusteringPrefixBoundNulls()
+    {
+        String text = "Lorem ipsum dolor sit amet";
+        BigDecimal decimal = BigDecimal.valueOf(123456789, 20);
+        BigInteger varint = BigInteger.valueOf(2018L);
+
+        // Components whose DECIMAL value (the unknown length type in these tests) is null.
+        ByteBuffer[] nullDecimal = new ByteBuffer[] {
+                UTF8.decompose(text),
+                DECIMAL.decompose(null),
+                VARINT.decompose(varint)
+        };
+        // Components whose VARINT value (the known/computable length type in these tests) is null.
+        ByteBuffer[] nullVarint = new ByteBuffer[] {
+                UTF8.decompose(text),
+                DECIMAL.decompose(decimal),
+                VARINT.decompose(null)
+        };
+
+        for (ClusteringPrefix.Kind kind : ClusteringPrefix.Kind.values())
+        {
+            if (kind.isBoundary())
+                continue;
+
+            // Null DECIMAL with natural ordering: the middle component must read back as null.
+            ClusteringPrefix bound = BufferClusteringBound.create(kind, nullDecimal);
+            ByteSource.Peekable encoded = COMP.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentNull(encoded);
+            expectNextComponentValue(encoded, VARINT, varint);
+            // Null DECIMAL again, this time with the DECIMAL component reversed.
+            bound = BufferClusteringBound.create(kind, nullDecimal);
+            encoded = COMP_REVERSED_UNKNOWN_LENGTH_2.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentNull(encoded);
+            expectNextComponentValue(encoded, VARINT, varint);
+
+            // Null VARINT with natural ordering: the last component must read back as null.
+            bound = BufferClusteringBound.create(kind, nullVarint);
+            encoded = COMP.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentValue(encoded, DECIMAL, decimal);
+            expectNextComponentNull(encoded);
+            // Null VARINT again, this time with the VARINT component reversed.
+            bound = BufferClusteringBound.create(kind, nullVarint);
+            encoded = COMP_REVERSED_KNOWN_LENGTH.asByteComparable(bound).asPeekableBytes(version);
+            expectNextComponentValue(encoded, UTF8, text);
+            expectNextComponentValue(encoded, DECIMAL, decimal);
+            expectNextComponentNull(encoded);
+        }
+    }
+
+    private <T> void expectNextComponentValue(ByteSource.Peekable comparableBytes,
+                                              AbstractType<T> type,
+                                              T expected)
+    {
+        // Take the next component (a regular separator is expected first), decode it and compare the composed value.
+        ByteSource.Peekable component = ByteSourceInverse.nextComponentSource(comparableBytes);
+        ByteBuffer serialized = type.fromComparableBytes(component, version);
+        assertEquals(expected, type.compose(serialized));
+    }
+
+    // Variant that compares the decoded bytes directly instead of composing them into a value first.
+    // A regular separator is expected before the component.
+    private void expectNextComponentValue(ByteSource.Peekable comparableBytes, AbstractType<?> type, ByteBuffer expected)
+    {
+        ByteSource.Peekable component = ByteSourceInverse.nextComponentSource(comparableBytes);
+        ByteBuffer actual = type.fromComparableBytes(component, version);
+        assertEquals(expected, actual);
+    }
+
+    @Test
+    public void testGetBoundFromPrefixTerminator()
+    {
+        String text = "Lorem ipsum dolor sit amet";
+        BigDecimal decimal = BigDecimal.valueOf(123456789, 20);
+        BigInteger varint = BigInteger.valueOf(2018L);
+
+        ByteBuffer[] allPresent = new ByteBuffer[] {
+                UTF8.decompose(text),
+                DECIMAL.decompose(decimal),
+                VARINT.decompose(varint)
+        };
+        ByteBuffer[] lastIsNull = new ByteBuffer[] {
+                UTF8.decompose(text),
+                DECIMAL.decompose(decimal),
+                VARINT.decompose(null)
+        };
+
+        for (ClusteringPrefix.Kind kind : ClusteringPrefix.Kind.values())
+        {
+            // Only non-boundary kinds are exercised. NOTE(review): an earlier remark here wondered why
+            // STATIC_CLUSTERING has its own terminator, but this check filters on isBoundary() - confirm the intent.
+            if (kind.isBoundary())
+                continue;
+
+            // All components present: the byte right after the varint must be this kind's encoded value.
+            ClusteringPrefix bound = BufferClusteringBound.create(kind, allPresent);
+            ByteSource.Peekable encoded = COMP.asByteComparable(bound).asPeekableBytes(version);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            ByteSourceInverse.getString(encoded);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            DECIMAL.fromComparableBytes(encoded, version);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            VARINT.fromComparableBytes(encoded, version);
+            // The closing separator (the terminator) is whatever this prefix kind encodes to.
+            assertEquals(kind.asByteComparableValue(version), encoded.next());
+
+            // Null last component: the terminator must still be this kind's encoded value when the component
+            // preceding it is null.
+            bound = BufferClusteringBound.create(kind, lastIsNull);
+            encoded = COMP.asByteComparable(bound).asPeekableBytes(version);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            ByteSourceInverse.getString(encoded);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            DECIMAL.fromComparableBytes(encoded, version);
+            // The separator in front of the null component is the null-signifying one.
+            assertEquals(ByteSource.NEXT_COMPONENT_NULL, encoded.next());
+            // A null component carries no varint bytes to read,
+            // so the terminator for this prefix kind comes next.
+            assertEquals(kind.asByteComparableValue(version), encoded.next());
+
+            // Null last component of a reversed type: same expectation for the terminator, but the null marker
+            // is the reversed one.
+            bound = BufferClusteringBound.create(kind, lastIsNull);
+            // COMP_REVERSED_KNOWN_LENGTH reverses the type of the last component, i.e. the one right before the
+            // terminator. Its name reflects what other tests care about (the known/computable length of that
+            // type) rather than its role here.
+            encoded = COMP_REVERSED_KNOWN_LENGTH.asByteComparable(bound).asPeekableBytes(version);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            ByteSourceInverse.getString(encoded);
+            assertEquals(ByteSource.NEXT_COMPONENT, encoded.next());
+            DECIMAL.fromComparableBytes(encoded, version);
+            // The separator in front of the null component is the reversed null-signifying one.
+            assertEquals(ByteSource.NEXT_COMPONENT_NULL_REVERSED, encoded.next());
+            // Again there is no varint to read,
+            // so the terminator for this prefix kind comes next.
+            assertEquals(kind.asByteComparableValue(version), encoded.next());
+        }
+    }
+
+    /** Checks that reversed and non-reversed components, in two orders, decode to the values they were built from. */
+    @Test
+    public void testReversedTypesInClusteringKey()
+    {
+        String stringValue = "Lorem ipsum dolor sit amet";
+        BigDecimal decimalValue = BigDecimal.valueOf(123456789, 20);
+        AbstractType<String> reversedStringType = ReversedType.getInstance(UTF8);
+        AbstractType<BigDecimal> reversedDecimalType = ReversedType.getInstance(DECIMAL);
+
+        final ClusteringComparator comparator = new ClusteringComparator(Arrays.asList(
+                // unknown length type
+                UTF8,
+                // known length type
+                DECIMAL,
+                // reversed unknown length type
+                reversedStringType,
+                // reversed known length type
+                reversedDecimalType
+        ));
+        ByteBuffer[] clusteringKeyValues = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue)
+        };
+
+        final ClusteringComparator comparator2 = new ClusteringComparator(Arrays.asList(
+                // known length type
+                DECIMAL,
+                // unknown length type
+                UTF8,
+                // reversed known length type
+                reversedDecimalType,
+                // reversed unknown length type
+                reversedStringType
+        ));
+        ByteBuffer[] clusteringKeyValues2 = new ByteBuffer[] {
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue)
+        };
+
+        for (ClusteringPrefix.Kind prefixKind : ClusteringPrefix.Kind.values())
+        {
+            if (prefixKind.isBoundary())
+                continue;
+
+            ClusteringPrefix prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            ByteSource.Peekable comparableBytes = comparator.asByteComparable(prefix).asPeekableBytes(version);
+            // Each component is a separator followed by a value decoded with the matching (possibly reversed) type.
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(stringValue, getComponentValue(UTF8, comparableBytes));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(decimalValue, getComponentValue(DECIMAL, comparableBytes));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(stringValue, getComponentValue(reversedStringType, comparableBytes));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(decimalValue, getComponentValue(reversedDecimalType, comparableBytes));
+            // The prefix kind's encoded value closes the sequence, followed by end of stream.
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes.next());
+            assertEquals(ByteSource.END_OF_STREAM, comparableBytes.next());
+
+            ClusteringPrefix prefix2 = BufferClusteringBound.create(prefixKind, clusteringKeyValues2);
+            ByteSource.Peekable comparableBytes2 = comparator2.asByteComparable(prefix2).asPeekableBytes(version);
+            // Same checks for the second comparator, whose components come in decimal-then-string order.
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(decimalValue, getComponentValue(DECIMAL, comparableBytes2));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(stringValue, getComponentValue(UTF8, comparableBytes2));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(decimalValue, getComponentValue(reversedDecimalType, comparableBytes2));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(stringValue, getComponentValue(reversedStringType, comparableBytes2));
+            // The prefix kind's encoded value closes the sequence, followed by end of stream.
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes2.next());
+            assertEquals(ByteSource.END_OF_STREAM, comparableBytes2.next());
+        }
+    }
+
+    private <T extends AbstractType<E>, E> E getComponentValue(T type, ByteSource.Peekable comparableBytes)
+    {
+        return type.compose(type.fromComparableBytes(comparableBytes, version)); // no separator handling here
+    }
+
+    /** Decodes maps nested in a terminated sequence: one map in every position, the other pairings in the middle. */
+    @Test
+    public void testReadingNestedSequence_Simple()
+    {
+        String padding1 = "A string";
+        String padding2 = "Another string";
+        BigInteger varint1 = BigInteger.valueOf(0b10000000);
+        BigInteger varint2 = BigInteger.valueOf(1 << 30); // 1 << 30, not 1 >> 30 (which is just 0)
+        BigInteger varint3 = BigInteger.valueOf(0x10000000L);
+        BigInteger varint4 = BigInteger.valueOf(Long.MAX_VALUE);
+
+        String string1 = "Testing byte sources";
+        String string2 = "is neither easy nor fun;";
+        String string3 = "But do it we must.";
+        String string4 = "— DataStax, 2018";
+
+        MapType<BigInteger, String> varintStringMapType = MapType.getInstance(VARINT, UTF8, false);
+        Map<BigInteger, String> varintStringMap = new TreeMap<>();
+        varintStringMap.put(varint1, string1);
+        varintStringMap.put(varint2, string2);
+        varintStringMap.put(varint3, string3);
+        varintStringMap.put(varint4, string4);
+        // The varint->string map is read from the middle, then the start, then the end of the sequence.
+        ByteSource sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version),
+                ByteSource.of(padding2, version)
+        );
+        ByteSource.Peekable comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(varintStringMap, getComponentValue(varintStringMapType, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version),
+                ByteSource.of(padding1, version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(varintStringMap, getComponentValue(varintStringMapType, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                ByteSource.of(padding2, version),
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(varintStringMap, getComponentValue(varintStringMapType, comparableBytes));
+        // The remaining key/value type pairings are each read with the map as the middle component.
+        MapType<String, BigInteger> stringVarintMapType = MapType.getInstance(UTF8, VARINT, false);
+        Map<String, BigInteger> stringVarintMap = new HashMap<>();
+        stringVarintMap.put(string1, varint1);
+        stringVarintMap.put(string2, varint2);
+        stringVarintMap.put(string3, varint3);
+        stringVarintMap.put(string4, varint4);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                stringVarintMapType.asComparableBytes(stringVarintMapType.decompose(stringVarintMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(stringVarintMap, getComponentValue(stringVarintMapType, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+
+        MapType<String, String> stringStringMapType = MapType.getInstance(UTF8, UTF8, false);
+        Map<String, String> stringStringMap = new HashMap<>();
+        stringStringMap.put(string1, string4);
+        stringStringMap.put(string2, string3);
+        stringStringMap.put(string3, string2);
+        stringStringMap.put(string4, string1);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                stringStringMapType.asComparableBytes(stringStringMapType.decompose(stringStringMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(stringStringMap, getComponentValue(stringStringMapType, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+
+        MapType<BigInteger, BigInteger> varintVarintMapType = MapType.getInstance(VARINT, VARINT, false);
+        Map<BigInteger, BigInteger> varintVarintMap = new HashMap<>();
+        varintVarintMap.put(varint1, varint4);
+        varintVarintMap.put(varint2, varint3);
+        varintVarintMap.put(varint3, varint2);
+        varintVarintMap.put(varint4, varint1);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                varintVarintMapType.asComparableBytes(varintVarintMapType.decompose(varintVarintMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding1, getComponentValue(UTF8, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(varintVarintMap, getComponentValue(varintVarintMapType, comparableBytes));
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(padding2, getComponentValue(UTF8, comparableBytes));
+    }
+
+    @Test
+    public void testReadingNestedSequence_DecoratedKey()
+    {
+        Random rng = new Random();
+        // Every key below is decorated by a LocalPartitioner over its type and rebuilt from byte-comparable form.
+        MapType<String, BigDecimal> stringToDecimalType = MapType.getInstance(UTF8, DECIMAL, false);
+        Map<String, BigDecimal> stringToDecimal = new HashMap<>();
+        for (int entry = 0; entry < 4; entry++)
+            stringToDecimal.put(newRandomAlphanumeric(rng, 10), BigDecimal.valueOf(rng.nextDouble()));
+        ByteBuffer key = stringToDecimalType.decompose(stringToDecimal);
+        testDecodingKeyWithLocalPartitionerForType(key, stringToDecimalType);
+
+        MapType<BigDecimal, String> decimalToStringType = MapType.getInstance(DECIMAL, UTF8, false);
+        Map<BigDecimal, String> decimalToString = new HashMap<>();
+        for (int entry = 0; entry < 4; entry++)
+            decimalToString.put(BigDecimal.valueOf(rng.nextDouble()), newRandomAlphanumeric(rng, 10));
+        key = decimalToStringType.decompose(decimalToString);
+        testDecodingKeyWithLocalPartitionerForType(key, decimalToStringType);
+
+        if (version == ByteComparable.Version.LEGACY)
+            return; // the composite and dynamic composite keys below are not exercised for LEGACY
+
+        CompositeType stringThenDecimal = CompositeType.getInstance(UTF8, DECIMAL);
+        key = stringThenDecimal.decompose(newRandomAlphanumeric(rng, 10), BigDecimal.valueOf(rng.nextDouble()));
+        testDecodingKeyWithLocalPartitionerForType(key, stringThenDecimal);
+
+        CompositeType decimalThenString = CompositeType.getInstance(DECIMAL, UTF8);
+        key = decimalThenString.decompose(BigDecimal.valueOf(rng.nextDouble()), newRandomAlphanumeric(rng, 10));
+        testDecodingKeyWithLocalPartitionerForType(key, decimalThenString);
+
+        DynamicCompositeType dynamicType = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases);
+        key = DynamicCompositeTypeTest.createDynamicCompositeKey(
+                newRandomAlphanumeric(rng, 10), UUIDGen.getTimeUUID(), 42, true, false);
+        testDecodingKeyWithLocalPartitionerForType(key, dynamicType);
+
+        key = DynamicCompositeTypeTest.createDynamicCompositeKey(
+                newRandomAlphanumeric(rng, 10), UUIDGen.getTimeUUID(), 42, true, true);
+        testDecodingKeyWithLocalPartitionerForType(key, dynamicType);
+    }
+
+    private static String newRandomAlphanumeric(Random prng, int length)
+    {
+        char[] picked = new char[length]; // one randomly drawn ALPHABET character per position
+        for (int pos = 0; pos < picked.length; ++pos)
+            picked[pos] = ALPHABET.charAt(prng.nextInt(ALPHABET.length()));
+        return new String(picked);
+    }
+
+    private <T> void testDecodingKeyWithLocalPartitionerForType(ByteBuffer key, AbstractType<T> type)
+    {
+        // Decorate the key, rebuild it from its byte-comparable form, and expect an equal decorated key back.
+        IPartitioner localPartitioner = new LocalPartitioner(type);
+        CachedHashDecoratedKey original = (CachedHashDecoratedKey) localPartitioner.decorateKey(key);
+        BufferDecoratedKey rebuilt = BufferDecoratedKey.fromByteComparable(original, version, localPartitioner);
+        Assert.assertEquals(original, new CachedHashDecoratedKey(rebuilt.getToken(), rebuilt.getKey()));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java
new file mode 100644
index 000000000000..72f439e8407e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.UUID;
+
+import com.google.common.base.Throwables;
+
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.DoubleType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.UUIDGen;
+
+/** Shared sample data for the byte-comparable tests: arrays of boundary-heavy values for a range of types. */
+public class ByteSourceTestBase
+{
+    String[] testStrings = new String[]{ "", "\0", "\0\0", "\001", "A\0\0B", "A\0B\0", "0", "0\0", "00", "1", "\377" };
+    Integer[] testInts = new Integer[]{ null,
+                                        Integer.MIN_VALUE,
+                                        Integer.MIN_VALUE + 1,
+                                        -256,
+                                        -255,
+                                        -128,
+                                        -127,
+                                        -1,
+                                        0,
+                                        1,
+                                        127,
+                                        128,
+                                        255,
+                                        256,
+                                        Integer.MAX_VALUE - 1,
+                                        Integer.MAX_VALUE };
+    Byte[] testBytes = new Byte[]{ -128, -127, -1, 0, 1, 127 };
+    Short[] testShorts = new Short[]{ Short.MIN_VALUE,
+                                      Short.MIN_VALUE + 1,
+                                      -256,
+                                      -255,
+                                      -128,
+                                      -127,
+                                      -1,
+                                      0,
+                                      1,
+                                      127,
+                                      128,
+                                      255,
+                                      256,
+                                      Short.MAX_VALUE - 1,
+                                      Short.MAX_VALUE };
+    Long[] testLongs = new Long[]{ null,
+                                   Long.MIN_VALUE,
+                                   Long.MIN_VALUE + 1,
+                                   Integer.MIN_VALUE - 1L,
+                                   -256L,
+                                   -255L,
+                                   -128L,
+                                   -127L,
+                                   -1L,
+                                   0L,
+                                   1L,
+                                   127L,
+                                   128L,
+                                   255L,
+                                   256L,
+                                   Integer.MAX_VALUE + 1L,
+                                   Long.MAX_VALUE - 1,
+                                   Long.MAX_VALUE };
+    Double[] testDoubles = new Double[]{ null,
+                                         Double.NEGATIVE_INFINITY,
+                                         -Double.MAX_VALUE,
+                                         -1e+200,
+                                         -1e3,
+                                         -1e0,
+                                         -1e-3,
+                                         -1e-200,
+                                         -Double.MIN_VALUE,
+                                         -0.0,
+                                         0.0,
+                                         Double.MIN_VALUE,
+                                         1e-200,
+                                         1e-3,
+                                         1e0,
+                                         1e3,
+                                         1e+200,
+                                         Double.MAX_VALUE,
+                                         Double.POSITIVE_INFINITY,
+                                         Double.NaN };
+    Float[] testFloats = new Float[]{ null,
+                                      Float.NEGATIVE_INFINITY,
+                                      -Float.MAX_VALUE,
+                                      -1e+30f,
+                                      -1e3f,
+                                      -1e0f,
+                                      -1e-3f,
+                                      -1e-30f,
+                                      -Float.MIN_VALUE,
+                                      -0.0f,
+                                      0.0f,
+                                      Float.MIN_VALUE,
+                                      1e-30f,
+                                      1e-3f,
+                                      1e0f,
+                                      1e3f,
+                                      1e+30f,
+                                      Float.MAX_VALUE,
+                                      Float.POSITIVE_INFINITY,
+                                      Float.NaN };
+    Boolean[] testBools = new Boolean[]{ null, false, true };
+    UUID[] testUUIDs = new UUID[]{ null,
+                                   UUIDGen.getTimeUUID(),
+                                   UUID.randomUUID(),
+                                   UUID.randomUUID(),
+                                   UUID.randomUUID(),
+                                   UUIDGen.getTimeUUID(123, 234),
+                                   UUIDGen.getTimeUUID(123, 234),
+                                   UUIDGen.getTimeUUID(123),
+                                   UUID.fromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"),
+                                   UUID.fromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"),
+                                   UUID.fromString("e902893a-9d22-3c7e-a7b8-d6e313b71d9f"),
+                                   UUID.fromString("74738ff5-5367-5958-9aee-98fffdcd1876"),
+                                   UUID.fromString("52df1bb0-6a2f-11e6-b6e4-a6dea7a01b67"),
+                                   UUID.fromString("52df1bb0-6a2f-11e6-362d-aff2143498ea"),
+                                   UUID.fromString("52df1bb0-6a2f-11e6-b62d-aff2143498ea") };
+    // Instant.MIN and Instant.MAX are left out because Date.from fails on them.
+    Date[] testDates = new Date[]{ null,
+                                   Date.from(Instant.ofEpochSecond(Integer.MIN_VALUE)),
+                                   Date.from(Instant.ofEpochSecond(Short.MIN_VALUE)),
+                                   Date.from(Instant.ofEpochMilli(-2000)),
+                                   Date.from(Instant.EPOCH),
+                                   Date.from(Instant.ofEpochMilli(2000)),
+                                   Date.from(Instant.ofEpochSecond(Integer.MAX_VALUE)),
+                                   Date.from(Instant.now()) };
+    InetAddress[] testInets;
+    {
+        try
+        {
+            testInets = new InetAddress[]{ null,
+                                           InetAddress.getLocalHost(),
+                                           InetAddress.getLoopbackAddress(),
+                                           InetAddress.getByName("192.168.0.1"),
+                                           InetAddress.getByName("fe80::428d:5cff:fe53:1dc9"),
+                                           InetAddress.getByName("2001:610:3:200a:192:87:36:2"),
+                                           InetAddress.getByName("10.0.0.1"),
+                                           InetAddress.getByName("0a00:0001::"),
+                                           InetAddress.getByName("::10.0.0.1") };
+        }
+        catch (UnknownHostException e)
+        {
+            throw new RuntimeException(e); // same wrapping as the deprecated Throwables.propagate
+        }
+    }
+
+    BigInteger[] testBigInts;
+    {
+        Set<BigInteger> bigs = new TreeSet<>(); // sorted, and duplicates produced below are dropped
+        for (Long l : testLongs)
+            if (l != null)
+                bigs.add(BigInteger.valueOf(l));
+        for (int i = 0; i < 11; ++i)
+        {
+            bigs.add(BigInteger.valueOf(i));
+            bigs.add(BigInteger.valueOf(-i));
+
+            bigs.add(BigInteger.valueOf((1L << 4 * i) - 1)); // values around +/- 2^(4i)
+            bigs.add(BigInteger.valueOf((1L << 4 * i)));
+            bigs.add(BigInteger.valueOf(-(1L << 4 * i) - 1));
+            bigs.add(BigInteger.valueOf(-(1L << 4 * i)));
+            String p = exp10(i); // +/- 10^i
+            bigs.add(new BigInteger(p));
+            bigs.add(new BigInteger("-" + p));
+            p = exp10(1 << i); // +/- 10^(2^i)
+            bigs.add(new BigInteger(p));
+            bigs.add(new BigInteger("-" + p));
+
+            BigInteger base = BigInteger.ONE.shiftLeft(512 * i); // +/- 2^(512i) and its direct neighbours
+            bigs.add(base);
+            bigs.add(base.add(BigInteger.ONE));
+            bigs.add(base.subtract(BigInteger.ONE));
+            base = base.negate();
+            bigs.add(base);
+            bigs.add(base.add(BigInteger.ONE));
+            bigs.add(base.subtract(BigInteger.ONE));
+        }
+        testBigInts = bigs.toArray(new BigInteger[0]);
+    }
+
+    /** Returns the decimal digits of 10^pow: a '1' followed by {@code pow} zeros (just "1" when pow <= 0). */
+    static String exp10(int pow)
+    {
+        StringBuilder builder = new StringBuilder("1");
+        for (int i = 0; i < pow; ++i)
+            builder.append('0');
+        return builder.toString();
+    }
+
+    BigDecimal[] testBigDecimals;
+    {
+        String vals = "0, 1, 1.1, 21, 98.9, 99, 99.9, 100, 100.1, 101, 331, 0.4, 0.07, 0.0700, 0.005, " +
+                      "6e4, 7e200, 6e-300, 8.1e2000, 8.1e-2000, 9e2000000000, " +
+                      "123456789012.34567890e-1000000000, 123456.78901234, 1234.56789012e2, " +
+                      "1.0000, 0.01e2, 100e-2, 00, 0.000, 0E-18, 0E+18";
+        List<BigDecimal> decs = new ArrayList<>();
+        for (String s : vals.split(", "))
+        {
+            decs.add(new BigDecimal(s));
+            decs.add(new BigDecimal("-" + s)); // every listed value is also added negated
+        }
+        testBigDecimals = decs.toArray(new BigDecimal[0]);
+    }
+
+    Object[][] testValues = new Object[][]{ testStrings, // index-aligned with testTypes below
+                                            testInts,
+                                            testBools,
+                                            testDoubles,
+                                            testBigInts,
+                                            testBigDecimals };
+
+    AbstractType[] testTypes = new AbstractType[]{ UTF8Type.instance, // index-aligned with testValues above
+                                                   Int32Type.instance,
+                                                   BooleanType.instance,
+                                                   DoubleType.instance,
+                                                   IntegerType.instance,
+                                                   DecimalType.instance };
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java
new file mode 100644
index 000000000000..15220bda6f7c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+
+@RunWith(Parameterized.class)
+public class DecoratedKeyByteSourceTest
+{
+    private static final int NUM_ITERATIONS = 100;      // random keys checked per test method
+    private static final int RANDOM_BYTES_LENGTH = 100; // size in bytes of each random key
+
+    @Parameterized.Parameters(name = "version={0}")
+    public static Iterable<ByteComparable.Version> versions()
+    {
+        return ImmutableList.of(ByteComparable.Version.OSS41);
+    }
+
+    private final ByteComparable.Version version;
+
+    public DecoratedKeyByteSourceTest(ByteComparable.Version version)
+    {
+        this.version = version;
+    }
+
+    @Test
+    public void testDecodeBufferDecoratedKey()
+    {
+        ByteOrderedPartitioner partitioner = ByteOrderedPartitioner.instance;
+        for (int round = NUM_ITERATIONS; round > 0; --round)
+        {
+            // A freshly decorated random key must come back equal after decoding its byte-comparable form.
+            BufferDecoratedKey expected = (BufferDecoratedKey) partitioner.decorateKey(newRandomBytesBuffer());
+            BufferDecoratedKey actual = BufferDecoratedKey.fromByteComparable(expected, version, partitioner);
+            Assert.assertEquals(expected, actual);
+        }
+    }
+
+    @Test
+    public void testDecodeKeyBytes()
+    {
+        ByteOrderedPartitioner partitioner = ByteOrderedPartitioner.instance;
+        for (int round = NUM_ITERATIONS; round > 0; --round)
+        {
+            BufferDecoratedKey expected = (BufferDecoratedKey) partitioner.decorateKey(newRandomBytesBuffer());
+            byte[] decodedBytes = DecoratedKey.keyFromByteComparable(expected, version, partitioner);
+            Assert.assertArrayEquals(expected.getKey().array(), decodedBytes);
+        }
+    }
+
+    private static ByteBuffer newRandomBytesBuffer()
+    {
+        ByteBuffer buffer = ByteBuffer.allocate(RANDOM_BYTES_LENGTH);
+        new Random().nextBytes(buffer.array()); // fills the buffer's backing array in place
+        return buffer;
+    }
+}

From ce17849dc0690a22a6b6688ca1cd899e590edb77 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Mon, 11 Jan 2021 16:02:12 +0200
Subject: [PATCH 033/151] STAR-55: Provides the Trie interface with
 MemtableTrie implementation

also includes functionality to merge, intersect and iterate on tries.

patch by Branimir Lambov; reviewed by Jason Rutherglen and Jacek Lewandowski

(cherry picked from commit 78d30623ff9e174ba3599380443eb86846416530)
(cherry picked from commit a7c1e38b7ad03eded53589b5ec6d57062cb5952e)
---
 build.xml                                     |   2 +
 .../db/tries/CollectionMergeTrie.java         | 313 ++++++
 .../cassandra/db/tries/MemtableReadTrie.java  | 826 ++++++++++++++++
 .../cassandra/db/tries/MemtableTrie.java      | 906 ++++++++++++++++++
 .../apache/cassandra/db/tries/MemtableTrie.md | 754 +++++++++++++++
 .../cassandra/db/tries/MemtableTrie.md.a1.svg | 599 ++++++++++++
 .../cassandra/db/tries/MemtableTrie.md.g1.svg |  76 ++
 .../cassandra/db/tries/MemtableTrie.md.g2.svg | 116 +++
 .../cassandra/db/tries/MemtableTrie.md.g3.svg | 253 +++++
 .../cassandra/db/tries/MemtableTrie.md.g4.svg | 290 ++++++
 .../cassandra/db/tries/MemtableTrie.md.m1.svg | 349 +++++++
 .../cassandra/db/tries/MemtableTrie.md.m2.svg | 430 +++++++++
 .../cassandra/db/tries/MemtableTrie.md.m3.svg | 500 ++++++++++
 .../cassandra/db/tries/MemtableTrie.md.p1.svg | 405 ++++++++
 .../cassandra/db/tries/MemtableTrie.md.w1.svg | 226 +++++
 .../cassandra/db/tries/MemtableTrie.md.w2.svg | 326 +++++++
 .../cassandra/db/tries/MemtableTrie.md.w3.svg | 276 ++++++
 .../cassandra/db/tries/MemtableTrie.md.w4.svg | 268 ++++++
 .../apache/cassandra/db/tries/MergeTrie.java  | 180 ++++
 .../cassandra/db/tries/RangeTrieSet.java      | 152 +++
 .../db/tries/SetIntersectionTrie.java         | 133 +++
 .../cassandra/db/tries/SingletonTrie.java     | 121 +++
 .../org/apache/cassandra/db/tries/Trie.java   | 453 +++++++++
 .../apache/cassandra/db/tries/TrieDumper.java |  76 ++
 .../db/tries/TrieEntriesIterator.java         |  61 ++
 .../cassandra/db/tries/TrieIterator.java      | 112 +++
 .../db/tries/TrieIteratorWithKey.java         | 126 +++
 .../apache/cassandra/db/tries/TrieSet.java    | 126 +++
 .../db/tries/TrieValuesIterator.java          |  57 ++
 .../apache/cassandra/db/tries/TrieWalker.java |  96 ++
 .../tries/MemtableTrieReadBench.java          | 120 +++
 .../tries/MemtableTrieUnionBench.java         | 146 +++
 .../tries/MemtableTrieWriteBench.java         | 104 ++
 .../db/tries/CollectionMergeTrieTest.java     | 175 ++++
 .../db/tries/MemtableTrieApplyTest.java       |  28 +
 .../db/tries/MemtableTriePutTest.java         | 123 +++
 .../db/tries/MemtableTrieTestBase.java        | 581 +++++++++++
 .../db/tries/MemtableTrieThreadedTest.java    | 176 ++++
 .../cassandra/db/tries/MergeTrieTest.java     |  99 ++
 .../db/tries/SetIntersectionTrieTest.java     | 208 ++++
 40 files changed, 10368 insertions(+)
 create mode 100644 src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.a1.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g1.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g2.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g3.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g4.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m1.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m2.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m3.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.p1.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w1.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w2.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w3.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w4.svg
 create mode 100644 src/java/org/apache/cassandra/db/tries/MergeTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/SingletonTrie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/Trie.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieDumper.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieIterator.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieSet.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
 create mode 100644 src/java/org/apache/cassandra/db/tries/TrieWalker.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieReadBench.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieUnionBench.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/MemtableTrieApplyTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/MemtableTriePutTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/MemtableTrieThreadedTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java

diff --git a/build.xml b/build.xml
index 884d9cd60e51..be54496ed643 100644
--- a/build.xml
+++ b/build.xml
@@ -655,6 +655,7 @@
             <exclusion groupId="org.hamcrest" artifactId="hamcrest"/>
           </dependency>
           <dependency groupId="org.hamcrest" artifactId="hamcrest" version="2.2" scope="test"/>
+          <dependency groupId="org.agrona" artifactId="agrona" version="0.9.26" />
         </dependencyManagement>
         <developer id="adelapena" name="Andres de la Peña"/>
         <developer id="alakshman" name="Avinash Lakshman"/>
@@ -812,6 +813,7 @@
         <dependency groupId="javax.inject" artifactId="javax.inject"/>
         <dependency groupId="com.google.j2objc" artifactId="j2objc-annotations"/>
         <dependency groupId="org.hdrhistogram" artifactId="HdrHistogram"/>
+        <dependency groupId="org.agrona" artifactId="agrona"/>
 
         <!-- sasi deps -->
         <dependency groupId="de.jflex" artifactId="jflex" />
diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
new file mode 100644
index 000000000000..c6e311bea3a9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+
+/**
+ * A merged view of multiple tries.
+ *
+ * Note: We use same input and output types to be able to switch to directly returning single-origin branches.
+ */
+class CollectionMergeTrie<T> extends Trie<T>
+{
+    private final CollectionMergeResolver<T> resolver;  // only called on more than one input
+    protected final Collection<? extends Trie<T>> inputs;
+
+    CollectionMergeTrie(Collection<? extends Trie<T>> inputs, CollectionMergeResolver<T> resolver)
+    {
+        this.inputs = inputs;     // kept by reference, not copied
+        this.resolver = resolver; // handed to makeMerge when root() builds the merged node
+    }
+
+    public <L> Node<T, L> root()
+    {
+        List<Node<T, L>> inputRoots = new ArrayList<>(inputs.size());
+        for (Trie<T> trie : inputs)
+        {
+            Node<T, L> trieRoot = trie.root();
+            if (trieRoot != null) // inputs that report no root are left out of the merge
+                inputRoots.add(trieRoot);
+        }
+        return makeMerge(resolver, inputRoots); // null, a single root, or a merge node over the collected roots
+    }
+
+    private static <T, L> Node<T, L> makeMerge(CollectionMergeResolver<T> resolver, List<Node<T, L>> nodes)
+    {
+        // Choose the representation by the number of participating nodes.
+        int participants = nodes.size();
+        if (participants == 0)
+            return null;         // nothing to merge
+        if (participants == 1)
+            return nodes.get(0); // single origin: hand the node back unwrapped
+        if (participants == 2)
+            return new MergeTrie.MergeNode<>(resolver, nodes.get(0), nodes.get(1));
+
+        // Three or more inputs go through the heap-based merge node of this class.
+        return new MergeNode<>(resolver, nodes);
+    }
+
+    static class MergeNode<T, L> extends Node<T, L>
+    {
+        private final CollectionMergeResolver<T> resolver;  // only called on more than one input
+        final List<Node<T, L>> nodes;
+        T content;
+        volatile boolean contentMerged = false;
+
+        MergeNode(CollectionMergeResolver<T> resolver, List<Node<T, L>> nodes)
+        {
+            // Every merged node was handed the same parent link in getCurrentChild, so adopt the first one's.
+            super(nodes.get(0).parentLink);
+            this.nodes = nodes;
+            this.resolver = resolver;
+        }
+
+        /*
+         * The merge node is effectively a merge iterator of children.
+         *
+         * The most straightforward way to implement merging of iterators is to use a {@code PriorityQueue},
+         * {@code poll} it to find the next item to consume, then {@code add} the iterator back after advancing.
+         * This is not very efficient as {@code poll} and {@code add} in all cases require at least
+         * {@code log(size)} comparisons and swaps (usually more than {@code 2*log(size)}) per consumed item, even
+         * if the input is suitable for fast iteration.
+         *
+         * The implementation below makes use of the fact that replacing the top element in a binary heap can be
+         * done much more efficiently than separately removing it and placing it back, especially in the cases where
+         * the top iterator is to be used again very soon (e.g. when there are large sections of the output where
+         * only a limited number of input iterators overlap, which is normally the case in many practically useful
+         * situations, e.g. levelled compaction).
+         *
+         * The implementation builds and maintains a binary heap of sources (stored in an array), where we do not
+         * add items after the initial construction. Instead we advance the smallest element (which is at the top
+         * of the heap) and push it down to find its place for its new transition character. Should this source
+         * be exhausted, we swap it with the last source in the heap and proceed by pushing that down in the
+         * heap.
+         *
+         * In the case where we have multiple sources with matching transition characters, the merging algorithm
+         * must be able to merge all equal values. To achieve this {@code getCurrentChild} walks the heap to
+         * find all equal items without advancing the sources, and separately {@code advanceIteration} advances
+         * all equal sources and restores the heap structure.
+         *
+         * The latter is done equivalently to the process of building the initial heap in {@code startIteration}
+         * using back-to-front heapification as done in the classic heapsort algorithm. It only needs to heapify
+         * subheaps whose top item is advanced (i.e. one whose transition character matches the current),
+         * and we can do that recursively from bottom to top. Should a source be exhausted when advancing, it can
+         * be thrown away by swapping in the last source in the heap (note: we must be careful to advance that
+         * source too if required).
+         *
+         * Note: This is a simplification of the MergeIterator code from CASSANDRA-8915, without the leading ordered
+         * section and equalParent flag since comparisons of transition characters are cheap.
+         */
+
+        public Remaining startIteration()
+        {
+            // Start every input; inputs without children are dropped by moving the last node into their slot.
+            int live = nodes.size();
+            int pos = 0;
+            while (pos < live)
+            {
+                if (nodes.get(pos).startIteration() != null)
+                {
+                    ++pos;
+                    continue;
+                }
+                // Childless input: overwrite it with the last node (a no-op if it is the last one) and shrink
+                // the list. pos stays put so that the node moved into this slot is started as well.
+                --live;
+                nodes.set(pos, nodes.get(live));
+                nodes.remove(live);
+            }
+
+            // Heapify back to front as in heapsort; this is linear in the number of remaining inputs.
+            for (int idx = live - 1; idx >= 0; --idx)
+                heapifyDown(idx);
+
+            if (nodes.isEmpty())
+                return null;
+            currentTransition = nodes.get(0).currentTransition;
+            return Remaining.MULTIPLE;
+        }
+
+        public Remaining advanceIteration()
+        {
+            // Move every input positioned at the current transition forward, starting from the heap top.
+            advance(currentTransition, 0);
+            if (nodes.isEmpty())
+                return null; // no input has children left
+            // The heap top now carries the next transition of the merge.
+            currentTransition = nodes.get(0).currentTransition;
+            return Remaining.MULTIPLE;
+        }
+
+        public Node<T, L> getCurrentChild(L parent)
+        {
+            // Collect the child of every input positioned at the current transition, then merge them.
+            List<Node<T, L>> matching = new ArrayList<>(nodes.size());
+            collectEqual(0, currentTransition, parent, matching);
+            return makeMerge(resolver, matching);
+        }
+
+        /**
+         * Adds to the given list the current child of every input in the subheap rooted at the given index
+         * that is positioned at the given transition. The input at the index itself is always consulted, so
+         * callers must only pass an index whose input is at that transition.
+         * Calls itself recursively; getCurrentChild starts it with index = 0 and transition = currentTransition
+         * to gather the children of all inputs positioned at the current minimal transition.
+         */
+        void collectEqual(int index, int transition, L parent, List<Node<T, L>> list)
+        {
+            Node<T, L> child = nodes.get(index).getCurrentChild(parent);
+            if (child != null)
+                list.add(child);
+
+            // The heap children of index live at 2 * index + 1 and 2 * index + 2. Recurse into each one that
+            // exists and is positioned at the same transition.
+            int firstChild = index * 2 + 1;
+            for (int heapChild = firstChild; heapChild <= firstChild + 1; ++heapChild)
+                if (heapChild < nodes.size() && nodes.get(heapChild).currentTransition == transition)
+                    collectEqual(heapChild, transition, parent, list);
+        }
+
+        /**
+         * Advances the input at the given index, and any of its heap descendants positioned at the same
+         * transition byte, then restores the heap invariant for the subheap rooted at the given index.
+         * Calls itself recursively; advanceIteration starts it with index = 0 and transition = currentTransition
+         * (the transition at the top of the heap) to advance the state of the whole merge.
+         */
+        private void advance(int transition, int index)
+        {
+            Node<T, L> n = nodes.get(index);
+            // Advance current node and remove it from active heap if it has no further children.
+            while (n.advanceIteration() == null)
+            {
+                // n has no further children, it needs to be removed from the active heap.
+                // Move the last to index'th position and continue processing with that node.
+                int nodeCount = nodes.size() - 1;
+                n = nodes.remove(nodeCount);
+                if (nodeCount == index)
+                    return; // done, n was at the end of the heap so the subheap to advance is now empty thus
+                            // the invariant is trivially true
+
+                nodes.set(index, n);
+                // A swapped-in node already past this transition needs no advancing; otherwise loop to advance it.
+                if (n.currentTransition > transition)
+                    break;
+            }
+
+            // If the children are at the same transition byte, they also need advancing and their subheap
+            // invariant to be restored.
+            int next = index * 2 + 1;
+            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
+                advance(transition, next);
+            ++next;
+            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
+                advance(transition, next);
+
+            // At this point the heaps at both children are advanced and well-formed. Place current node in its
+            // proper position.
+            heapifyDown(index);
+            // The heap rooted at index is now advanced and well-formed.
+        }
+
+        /**
+         * Push the given state down in the heap from the given index until it finds its proper place among
+         * the subheap rooted at that position.
+         */
+        private void heapifyDown(int index)
+        {
+            Node<T, L> node = nodes.get(index);
+
+            int transition = node.currentTransition;
+            while (true)
+            {
+                int next = index * 2 + 1;
+                if (next >= nodes.size())
+                    break;
+                // Select the smaller of the two children to push down to.
+                if (next + 1 < nodes.size() && nodes.get(next).currentTransition > nodes.get(next + 1).currentTransition)
+                    ++next;
+                // If the child is greater, the invariant has been restored.
+                if (nodes.get(next).currentTransition >= transition)
+                    break;
+                nodes.set(index, nodes.get(next));
+                index = next;
+            }
+            nodes.set(index, node);
+        }
+
+        public T content()
+        {
+            if (contentMerged)
+                return content; // already computed, reuse the cached result
+
+            // While at most one input has content we keep it in 'single', avoiding the allocation of the
+            // list until a second value shows up.
+            T single = null;
+            Collection<T> merged = null;
+            for (Node<T, L> input : nodes)
+            {
+                T value = input.content();
+                if (value == null)
+                    continue;
+
+                if (single == null)
+                {
+                    single = value;  // first value seen
+                }
+                else
+                {
+                    if (merged == null)
+                    {
+                        // second value seen, switch to collecting into a list
+                        merged = new ArrayList<>();
+                        merged.add(single);
+                    }
+                    merged.add(value);
+                }
+            }
+
+            content = (merged == null) ? single : resolver.resolve(merged);
+
+            // Remember that the merge is done to avoid repeating this costly operation.
+            contentMerged = true;
+            return content;
+        }
+    }
+
+    /**
+     * Special instance for sources that are guaranteed distinct. The main difference is that the unordered value
+     * list can be formed by simply concatenating the unordered values of the sources.
+     */
+    static class Distinct<T> extends CollectionMergeTrie<T>
+    {
+        Distinct(Collection<? extends Trie<T>> inputs)
+        {
+            super(inputs, throwingResolver()); // presumably a resolver that rejects any attempt to merge -- confirm
+        }
+
+        @Override
+        public Iterable<T> valuesUnordered()
+        {
+            return Iterables.concat(Iterables.transform(inputs, Trie::valuesUnordered));
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
new file mode 100644
index 000000000000..f4c8ec9820f8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -0,0 +1,826 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.concurrent.atomic.AtomicReferenceArray;
+import java.util.function.Function;
+
+import org.agrona.concurrent.UnsafeBuffer;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Memtable trie, i.e. an in-memory trie built for fast modification and reads executing concurrently with writes from
+ * a single mutator thread.
+ *
+ * This class provides the read-only functionality, expanded in {@link MemtableTrie} to writes.
+ */
+public class MemtableReadTrie<T> extends Trie<T>
+{
+    /*
+    TRIE FORMAT AND NODE TYPES
+
+    The memtable trie uses five different types of nodes:
+     - "leaf" nodes, which have content and no children;
+     - single-transition "chain" nodes, which have exactly one child; while each node is a single transition, they are
+       called "chain" because multiple such transitions are packed in a block.
+     - "sparse" nodes which have between two and six children;
+     - "split" nodes for anything above six children;
+     - "prefix" nodes that augment one of the other types (except leaf) with content.
+
+    The data for all nodes except leaf ones is stored in a contiguous 'node buffer' and laid out in blocks of 32 bytes.
+    A block only contains data for a single type of node, but there is no direct correspondence between block and node
+    in that:
+     - a single block can contain multiple "chain" nodes.
+     - a sparse node occupies exactly one block.
+     - a split node occupies a variable number of blocks.
+     - a prefix node can be placed in the same block as the node it augments, or in a separate block.
+
+    Nodes are referenced in that buffer by an integer position/pointer, the 'node pointer'. Note that node pointers are
+    not pointing at the beginning of blocks, and we call 'pointer offset' the offset of the node pointer to the block it
+    points into. The value of a 'node pointer' is used to decide what kind of node is pointed:
+
+     - If the pointer is negative, we have a leaf node. Since a leaf has no children, we need no data outside of its
+       content to represent it, and that content is stored in a 'content list', not in the nodes buffer. The content
+       of a particular leaf node is located at the ~pointer position in the content list (~ instead of - so that -1 can
+       correspond to position 0).
+
+     - If the 'pointer offset' is smaller than 28, we have a chain node with one transition. The transition character is
+       the byte at the position pointed in the 'node buffer', and the child is pointed by:
+       - the integer value at offset 28 of the block pointed if the 'pointer offset' is 27
+       - pointer + 1 (which is guaranteed to have offset smaller than 28, i.e. to be a chain node), otherwise
+       In other words, a chain block contains a sequence of characters that leads to the child whose address is at
+       offset 28. It may have between 1 and 28 characters depending on the pointer with which the block is entered.
+
+     - If the 'pointer offset' is 30, we have a sparse node. The data of a sparse node occupies a full block and is laid
+       out as:
+       - six pointers to children at offsets 0 to 24
+       - six transition characters at offsets 24 to 30
+       - an order word stored in the two bytes at offset 30
+       To enable in-place addition of children, the pointers and transition characters are not stored ordered.
+       Instead, we use an order encoding in the last 2 bytes of the node. The encoding is a base-6 number which
+       describes the order of the transitions (least significant digit being the smallest).
+       The node must have at least two transitions and the transition at position 0 is never the biggest (we can
+       enforce this by choosing for position 0 the smaller of the two transitions a sparse node starts with). This
+       allows iteration over the order word (which divides said word by 6 each step) to finish when the result becomes 0.
+
+     - If the 'pointer offset' is 28, the node is a split one. Split nodes are dense, meaning that there is a direct
+       mapping between a transition character and the address of the associated pointer, and new children can easily be
+       added in place.
+       Split nodes occupy multiple blocks, and a child is located by traversing 3 layers of pointers:
+       - the first pointer is within the top-level block (the one pointed by the pointer) and points to a "mid" block.
+         The top-level block has 4 such pointers to "mid" block, located between offset 16 and 32.
+       - the 2nd pointer is within the "mid" block and points to a "tail" block. A "mid" block has 8 such pointers
+         occupying the whole block.
+       - the 3rd pointer is within the "tail" block and is the actual child pointer. Like "mid" block, there are 8 such
+         pointers (so we finally address 4 * 8 * 8 = 256 children).
+       To find a child, we thus need to know the index of the pointer to follow within the top-level block, the index
+       of the one in the "mid" block and the index in the "tail" block. For that, we split the transition byte in a
+       sequence of 2-3-3 bits:
+       - the first 2 bits are the index in the top-level block;
+       - the next 3 bits, the index in the "mid" block;
+       - and the last 3 bits the index in the "tail" block.
+       This layout allows the node to use the smaller fixed-size blocks (instead of 256*4 bytes for the whole character
+       space) and also leaves some room in the head block (the 16 first bytes) for additional information (which we can
+       use to store prefix nodes containing things like deletion times).
+       One split node may need up to 1 + 4 + 4*8 blocks (1184 bytes) to store all its children.
+
+     - If the pointer offset is 31, we have a prefix node. There are two types:
+       -- Embedded prefix nodes occupy the free bytes in a chain or split node. The byte at offset 4 has the offset
+          within the 32-byte block for the augmented node.
+       -- Full prefix nodes have 0xFF at offset 4 and a pointer at 28, pointing to the augmented node.
+       Both types contain an index for content at offset 0. The augmented node cannot be a leaf or NONE -- in the former
+       case the leaf itself contains the content index, in the latter we use a leaf instead.
+       The term "node" when applied to these is a bit of a misnomer as they are not presented as separate nodes during
+       traversals. Instead, they augment a node, changing only its content. Internally we create a Node object for the
+       augmented node and wrap a PrefixNode around it, which changes the `content()` method and routes all other
+       calls to the augmented node's methods.
+
+     When building a trie we first allocate the content, then create a chain node leading to it. While we only have
+     single transitions leading to a chain node, we can expand that node (attaching a character and using pointer - 1)
+     instead of creating a new one. When a chain node already has a child and needs a new one added we change the type
+     (i.e. create a new node and remap the parent) to sparse with two children. When a six-child sparse node needs a new
+     child, we switch to split.
+
+     Blocks currently are not reused, because we do not yet have a mechanism to tell when readers are done with blocks
+     they are referencing. This currently causes a very low overhead (because we change data in place with the only
+     exception of nodes needing to change type) and is planned to be addressed later.
+
+     For an example of the evolution of the trie, see MemtableTrie.md.
+     */
+
+    static final int BLOCK_SIZE = 32;
+
+    // Biggest block offset that can contain a pointer.
+    static final int LAST_POINTER_OFFSET = BLOCK_SIZE - 4;
+
+    /*
+     Block offsets used to identify node types (by comparing them to the node 'pointer offset').
+     */
+
+    // split node (dense, 2-3-3 transitions), laid out as 4 pointers to "mid" blocks, each of which has 8 pointers to
+    // "tail" blocks, each of which has 8 pointers to children
+    static final int SPLIT_OFFSET = BLOCK_SIZE - 4;
+    // sparse node, unordered list of up to 6 transition, laid out as 6 transition pointers followed by 6 transition
+    // bytes. The last two bytes contain an ordering of the transitions (in base-6) which is used for iteration. On
+    // update the pointer is set last, i.e. during reads the node may show that a transition exists and list a character
+    // for it, but pointer may still be null.
+    static final int SPARSE_OFFSET = BLOCK_SIZE - 2;
+    // min and max offset for a chain node. A block of chain node is laid out as a pointer at LAST_POINTER_OFFSET,
+    // preceded by characters that lead to it. Thus a full chain block contains BLOCK_SIZE-4 transitions/chain nodes.
+    static final int CHAIN_MIN_OFFSET = 0;
+    static final int CHAIN_MAX_OFFSET = BLOCK_SIZE - 5;
+    // Prefix node, an intermediate node augmenting its child node with content.
+    static final int PREFIX_OFFSET = BLOCK_SIZE - 1;
+
+    /*
+     Offsets and values for navigating in a block for particular node type. Those offsets are 'from the node pointer'
+     (not the block start) and can be thus negative since node pointers points towards the end of blocks.
+     */
+
+    // Offset to the first pointer (to "mid" blocks) of a split node.
+    static final int SPLIT_POINTER_OFFSET = 16 - SPLIT_OFFSET;
+
+    static final int SPARSE_CHILD_COUNT = 6;
+    // Offset to the first child pointer of a sparse node (laid out from the start of the block)
+    static final int SPARSE_CHILDREN_OFFSET = 0 - SPARSE_OFFSET;
+    // Offset to the first transition byte of a sparse node (laid out after the child pointers)
+    static final int SPARSE_BYTES_OFFSET = SPARSE_CHILD_COUNT * 4 - SPARSE_OFFSET;
+    // Offset to the order word of a sparse node (laid out after the children (pointer + transition byte))
+    static final int SPARSE_ORDER_OFFSET = SPARSE_CHILD_COUNT * 5 - SPARSE_OFFSET;  // 0
+
+    // Offset of the flag byte in a prefix node. In shared blocks, this contains the offset of the next node.
+    static final int PREFIX_FLAGS_OFFSET = 4 - PREFIX_OFFSET;
+    // Offset of the content id
+    static final int PREFIX_CONTENT_OFFSET = 0 - PREFIX_OFFSET;
+    // Offset of the next pointer in a non-shared prefix node
+    static final int PREFIX_POINTER_OFFSET = LAST_POINTER_OFFSET - PREFIX_OFFSET;
+
+    // Initial capacity for the node data buffer.
+    static final int INITIAL_BUFFER_CAPACITY = 256;
+
+    /**
+     * Value used as null for node pointers.
+     * No node can use this address (we enforce this by not allowing chain nodes to grow to position 0).
+     * Do not change this as the code relies on there being a NONE placed in all bytes of the block that are not set.
+     */
+    static final int NONE = 0;
+
+    volatile int root;
+
+    final UnsafeBuffer buffer;
+
+    volatile AtomicReferenceArray<T> contentArray;
+
+    MemtableReadTrie(UnsafeBuffer buffer, AtomicReferenceArray<T> contentArray, int root)
+    {
+        this.buffer = buffer;               // node data, organized in blocks of BLOCK_SIZE bytes
+        this.contentArray = contentArray;   // content, indexed by ~pointer for leaves and by the index in prefix nodes
+        this.root = root;                   // pointer to the root node
+    }
+
+    /*
+     Buffer, content list and block management
+     */
+
+    /** Returns the 'pointer offset' of a node pointer, i.e. its position within its BLOCK_SIZE-byte block. */
+    int offset(int pos)
+    {
+        return pos - (pos & -BLOCK_SIZE);   // position minus the start of the block it falls into
+    }
+
+    final int getByte(int pos)
+    {
+        return Byte.toUnsignedInt(buffer.getByte(pos));     // unsigned value in the range 0..255
+    }
+
+    final int getShort(int pos)
+    {
+        return Short.toUnsignedInt(buffer.getShort(pos));   // unsigned value in the range 0..65535
+    }
+
+    final int getInt(int pos) { return buffer.getInt(pos); } // signed, unlike getByte and getShort
+
+    T getContent(int index)
+    {
+        return contentArray.get(index);     // index is a position in the content array, not a node pointer
+    }
+
+    /*
+     Reading node content
+     */
+
+    boolean isNull(int node)
+    {
+        return node == NONE;    // NONE is the value used as null for node pointers
+    }
+
+    boolean isLeaf(int node)
+    {
+        return node < NONE;     // negative pointers are leaves; their content is at position ~node
+    }
+
+    boolean isNullOrLeaf(int node)
+    {
+        return node <= NONE;    // neither null nor leaf pointers have children
+    }
+
+    /**
+     * Returns the buffer position holding the child pointer of the chain block that contains the given node, i.e.
+     * the position at LAST_POINTER_OFFSET within that block, where the child of the block's last chain node is stored.
+     */
+    private int chainBlockChildPointer(int node)
+    {
+        return (node & -BLOCK_SIZE) + LAST_POINTER_OFFSET;
+    }
+
+    /** Create a trie node for the given pointer */
+    <L> BaseNode<L> makeNode(int node, L parent)
+    {
+        if (isNull(node))
+            return null;
+
+        if (isLeaf(node))
+            return new LeafNode<>(node, parent);
+
+        switch (offset(node))
+        {
+            case SPARSE_OFFSET:
+                return new SparseNode<>(node, parent);
+            case SPLIT_OFFSET:
+                return new SplitNode<>(node, parent);
+            case PREFIX_OFFSET:
+                return new PrefixNode<>(node, parent);
+            default:
+                return new ChainNode<>(node, parent);
+        }
+    }
+
+    /** Get a node's child for the given transition character */
+    int getChild(int node, int trans)
+    {
+        if (isNullOrLeaf(node))
+            return NONE;
+
+        node = followContentTransition(node);
+
+        switch (offset(node))
+        {
+            case SPARSE_OFFSET:
+                return getSparseChild(node, trans);
+            case SPLIT_OFFSET:
+                return getSplitChild(node, trans);
+            case CHAIN_MAX_OFFSET:
+                if (trans != getByte(node))
+                    return NONE;
+                return getInt(node + 1);
+            default:
+                if (trans != getByte(node))
+                    return NONE;
+                return node + 1;
+        }
+    }
+
+    protected int followContentTransition(int node)
+    {
+        if (isNullOrLeaf(node))
+            return NONE;
+
+        if (offset(node) == PREFIX_OFFSET)
+        {
+            int b = getByte(node + PREFIX_FLAGS_OFFSET);
+            if (b < BLOCK_SIZE)
+                node = node - PREFIX_OFFSET + b;
+            else
+                node = getInt(node + PREFIX_POINTER_OFFSET);
+
+            assert node >= 0 && offset(node) != PREFIX_OFFSET;
+        }
+        return node;
+    }
+
+    /**
+     * Advance as long as the cell pointed to by the given pointer will let you.
+     *
+     * This is the same as getChild(node, first), except for chain nodes where it would walk the fill chain as long as
+     * the input source matches.
+     */
+    int advance(int node, int first, ByteSource rest)
+    {
+        if (isNullOrLeaf(node))
+            return NONE;
+
+        node = followContentTransition(node);
+
+        switch (offset(node))
+        {
+            case SPARSE_OFFSET:
+                return getSparseChild(node, first);
+            case SPLIT_OFFSET:
+                return getSplitChild(node, first);
+            default:
+                // Check the first byte matches the expected
+                if (getByte(node) != first)
+                    return NONE;
+                // Check the rest of the bytes provided by the chain node (limit - node - 1 many)
+                int limit = chainBlockChildPointer(node);
+                while (++node < limit)
+                {
+                    first = rest.next();
+                    if (getByte(node) != first)
+                        return NONE;
+                }
+                // All bytes matched, follow the pointer
+                return getInt(limit);
+        }
+    }
+
+    /** Get the child pointer for the given transition character, knowing that the node is sparse; NONE if absent */
+    int getSparseChild(int node, int trans)
+    {
+        for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
+        {
+            if (getByte(node + SPARSE_BYTES_OFFSET + i) == trans)
+            {
+                int child = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);
+
+                // We can't trust the transition character read above: it may have been fetched before a concurrent
+                // update, which may have also modified the pointer by now. Re-reading the character after loading
+                // the pointer is valid because writers set it before the pointer. NOTE(review): this reasoning
+                // assumes the pointer load is volatile, but getInt() is a plain buffer.getInt() here -- confirm.
+                if (child != NONE && getByte(node + SPARSE_BYTES_OFFSET + i) == trans)
+                    return child;
+            }
+        }
+        return NONE;
+    }
+
+    /** Given a transition, returns the corresponding index (within the node block) of the pointer to the mid block of
+     * a split node. */
+    int splitNodeMidIndex(int trans)
+    {
+        // top 2 bits of the 2-3-3 split
+        return (trans >> 6) & 0b11;
+    }
+
+    /** Given a transition, returns the corresponding index (within the mid block) of the pointer to the tail block of
+     * a split node. */
+    int splitNodeTailIndex(int trans)
+    {
+        // middle 3 bits of the 2-3-3 split
+        return (trans >> 3) & 0b111;
+    }
+
+    /** Given a transition, returns the corresponding index (within the tail block) of the pointer to the child of
+     * a split node. */
+    int splitNodeChildIndex(int trans)
+    {
+        // lowest 3 bits of the 2-3-3 split
+        return trans & 0b111;
+    }
+
+    /** Get the child for the given transition character, knowing that the node is split */
+    int getSplitChild(int node, int trans)
+    {
+        int mid = getInt(node + SPLIT_POINTER_OFFSET + splitNodeMidIndex(trans) * 4);
+        if (isNull(mid))
+            return NONE;
+
+        int tail = getInt(mid + splitNodeTailIndex(trans) * 4);
+        if (isNull(tail))
+            return NONE;
+        return getInt(tail + splitNodeChildIndex(trans) * 4);
+    }
+
+    /** Get the content for a given node */
+    T getNodeContent(int node)
+    {
+        if (isLeaf(node))
+            return getContent(~node);
+
+        if (offset(node) != PREFIX_OFFSET)
+            return null;
+
+        int index = getInt(node + PREFIX_CONTENT_OFFSET);
+        return (index >= 0)
+               ? getContent(index)
+               : null;
+    }
+
+    /*
+     Trie.Node implementations
+     */
+
+    abstract class BaseNode<L> extends Node<T, L>
+    {
+        final int node;     // the node pointer this object was created for
+
+        BaseNode(int node, L parent)
+        {
+            super(parent);
+            this.node = node;
+        }
+
+        // MemtableTrie nodes don't throw and always return MemtableTrie nodes, hence the narrowed return type.
+        @Override
+        public abstract BaseNode<L> getCurrentChild(L parent);
+
+        @Override
+        public T content()
+        {
+            return null;    // no content unless a subclass overrides this
+        }
+
+        abstract void dump(int indent, StringBuilder b, Function<T, String> contentToString);
+    }
+
+    class SplitNode<L> extends BaseNode<L>
+    {
+        SplitNode(int node, L parent)
+        {
+            super(node, parent);
+            assert offset(node) == SPLIT_OFFSET;
+        }
+
+        @Override
+        public BaseNode<L> getCurrentChild(L parent)
+        {
+            int child = getChild(currentTransition);
+            return makeNode(child, parent);
+        }
+
+        int getChild(int idx)
+        {
+            return getSplitChild(node, idx);    // idx is the transition byte
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            return nextValid(0);    // first existing child at or after transition 0
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            return nextValid(currentTransition + 1);
+        }
+
+        Remaining nextValid(int trans)
+        {
+            if (trans >= 0x100)
+                return null;    // past the last possible transition byte
+
+            // Split the transition into its 2-3-3 bit parts: indexes into the top-level, "mid" and "tail" blocks
+            int midIndex = splitNodeMidIndex(trans);
+            int tailIdx = splitNodeTailIndex(trans);
+            int childIdx = splitNodeChildIndex(trans);
+
+            while (midIndex < 4)
+            {
+                int mid = getInt(node + SPLIT_POINTER_OFFSET + midIndex * 4);
+                if (!isNull(mid))
+                {
+                    while (tailIdx < 8)
+                    {
+                        int tail = getInt(mid + tailIdx * 4);
+                        if (!isNull(tail))
+                        {
+                            while (childIdx < 8)
+                            {
+                                int child = getInt(tail + childIdx * 4);
+                                if (!isNull(child))
+                                {
+                                    currentTransition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
+                                    return Remaining.MULTIPLE;  // no need to be precise on the count
+                                }
+                                ++childIdx;
+                            }
+                        }
+                        childIdx = 0;   // subsequent tail blocks are scanned from their first entry
+                        ++tailIdx;
+                    }
+                }
+                tailIdx = 0;    // subsequent mid blocks are scanned from their first entry
+                ++midIndex;
+            }
+            return null;    // no further child exists
+        }
+
+        @Override
+        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
+        {
+            indent++;
+            b.append(" -> Split\n");
+            for (int idx = 0; idx < 256; ++idx)
+            {
+                BaseNode<L> child = makeNode(getChild(idx), null);
+                if (child != null)
+                {
+                    for (int i = 0; i < indent; ++i)
+                        b.append("  ");
+                    b.append(String.format("%02x", idx));   // transition byte in hex
+                    child.dump(indent, b, contentToString);
+                }
+            }
+        }
+    }
+
+    class SparseNode<L> extends BaseNode<L>
+    {
+        int iterationState;     // unconsumed part of the order word; its lowest base-6 digit is the current position
+
+        SparseNode(int node, L parent)
+        {
+            super(node, parent);
+            assert offset(node) == SPARSE_OFFSET;
+        }
+
+        @Override
+        public BaseNode<L> getCurrentChild(L parent)
+        {
+            int child = getInt(node + SPARSE_CHILDREN_OFFSET + 4 * (iterationState % SPARSE_CHILD_COUNT));
+            return makeNode(child, parent);
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            iterationState = getShort(node + SPARSE_ORDER_OFFSET);
+            currentTransition = getByte(node + SPARSE_BYTES_OFFSET + iterationState % SPARSE_CHILD_COUNT);
+            return Remaining.MULTIPLE;
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            iterationState /= SPARSE_CHILD_COUNT;   // drop the digit of the position just processed
+            // the last item is never in position 0, so a zero state means all transitions have been consumed
+            if (iterationState == 0)
+                return null;
+            currentTransition = getByte(node + SPARSE_BYTES_OFFSET + iterationState % SPARSE_CHILD_COUNT);
+            return iterationState >= SPARSE_CHILD_COUNT ? Remaining.MULTIPLE : Remaining.ONE;
+        }
+
+        @Override
+        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
+        {
+            indent++;
+            b.append(" -> Sparse\n");
+            for (int idx = 0; idx < SPARSE_CHILD_COUNT; ++idx)  // in storage order, not in transition order
+            {
+                BaseNode<L> child = makeNode(getInt(node + SPARSE_CHILDREN_OFFSET + idx * 4), null);
+                if (child != null)
+                {
+                    for (int i = 0; i < indent; ++i)
+                        b.append("  ");
+                    b.append(String.format("%02x", getByte(node + SPARSE_BYTES_OFFSET + idx)));
+                    child.dump(indent, b, contentToString);
+                }
+            }
+        }
+    }
+
+    class ChainNode<L> extends BaseNode<L>
+    {
+        // The node pointer refers to the byte holding this node's transition character. The bytes that follow it, up
+        // to the child pointer stored at LAST_POINTER_OFFSET of the block, are the transitions of the next chain nodes.
+        ChainNode(int node, L parent)
+        {
+            super(node, parent);
+            assert offset(node) >= CHAIN_MIN_OFFSET && offset(node) <= CHAIN_MAX_OFFSET;
+            currentTransition = getByte(node);
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            return Remaining.ONE;   // the single transition was already set by the constructor
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            return null;    // a chain node has no second transition
+        }
+
+        @Override
+        public BaseNode<L> getCurrentChild(L parent)
+        {
+            if (offset(node + 1) == LAST_POINTER_OFFSET)
+                return makeNode(getInt(node + 1), parent);  // last character of the block, follow the child pointer
+            return new ChainNode<>(node + 1, parent);
+        }
+
+        @Override
+        public BaseNode<L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
+        {
+            int child = node;
+            do
+            {
+                final int pointerPos =  chainBlockChildPointer(child);
+                if (receiver != null)
+                    receiver.add(buffer, child, pointerPos - child);    // the remaining characters of this block
+                // jump directly to the child at the end of the chain
+                child = getInt(pointerPos);
+                // and continue jumping as long as the resulting node is a chain
+            }
+            while (child > 0 && offset(child) <= CHAIN_MAX_OFFSET);
+
+            return makeNode(child, parentLink);
+        }
+
+        @Override
+        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
+        {
+            b.append(" -> Chain\n");
+            for (int i = 0; i < indent + 1; ++i)
+                b.append("  ");
+            int limit = chainBlockChildPointer(node);
+            for (int p = node; p < limit; ++p)
+            {
+                indent ++;  // one extra level per character printed
+                b.append(String.format("%02x", getByte(p)));
+            }
+            makeNode(getInt(limit), null).dump(indent, b, contentToString);
+        }
+    }
+
+    class PrefixNode<L> extends BaseNode<L>
+    {
+        /**
+         * The augmented node. A prefix node is not presented as a separate node, it only adds content to a node of
+         * another type. Rather than having prefix-augmented variants of split/sparse/chain, we wrap a node of that
+         * type: iteration and child access are delegated to it, only content() comes from the prefix node.
+         */
+        final BaseNode<L> augmentedNode;
+
+        PrefixNode(int node, L parent)
+        {
+            super(node, parent);
+            assert offset(node) == PREFIX_OFFSET;
+            this.augmentedNode = makeNode(followContentTransition(node), parent);
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            Remaining result = augmentedNode.startIteration();
+            currentTransition = augmentedNode.currentTransition;    // mirror the position of the augmented node
+            return result;
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            Remaining result = augmentedNode.advanceIteration();
+            currentTransition = augmentedNode.currentTransition;    // mirror the position of the augmented node
+            return result;
+        }
+
+        @Override
+        public T content()
+        {
+            return getNodeContent(node);    // uses the prefix pointer, not the augmented node's
+        }
+
+        @Override
+        public BaseNode<L> getCurrentChild(L parent)
+        {
+            return augmentedNode.getCurrentChild(parent);
+        }
+
+        // Note: we do not map getUniqueDescendant to the augmented node's method as we want consumers to pay
+        // attention to this node.
+
+        @Override
+        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
+        {
+            T content = content();
+            b.append(" -> ");
+            b.append(contentToString.apply(content));
+            b.append('\n');
+            for (int i = 0; i < indent + 1; ++i)
+                b.append("  ");
+            augmentedNode.dump(indent, b, contentToString);
+        }
+    }
+
+    class LeafNode<L> extends BaseNode<L>   // node that carries content only and has no children
+    {
+        LeafNode(int node, L parent)
+        {
+            super(node, parent);
+            assert node < NONE;     // leaf pointers encode the content index as ~index (see content())
+        }
+
+        IllegalStateException error()  // built (not thrown) here so that call sites can "throw error()"
+        {
+            return new IllegalStateException("Node has no children.");
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            return null;    // there are no transitions to iterate over
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            throw error();  // invalid on a leaf: iteration can never have been started
+        }
+
+        @Override
+        public BaseNode<L> getCurrentChild(L parent)
+        {
+            throw error();  // invalid on a leaf: there is no current child
+        }
+
+        @Override
+        public T content()
+        {
+            return getContent(~node);   // undo the complement to get the content index
+        }
+
+        @Override
+        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
+        {
+            // a leaf prints only its content; indent is unused as nothing is printed below it
+            b.append(" -> ").append(contentToString.apply(content())).append("\n");
+        }
+    }
+
+    /*
+     Direct read methods
+     */
+
+    public <L> BaseNode<L> root()
+    {
+        return makeNode(root, null);    // node object for the current root pointer, created with a null parent link
+    }
+
+    /**
+     * Look up the content mapped by the specified key; returns null if the path leads out of the trie.
+     * Fast implementation which walks the trie using integer node addresses only.
+     */
+    public T get(ByteComparable path)
+    {
+        int current = root;
+        final ByteSource bytes = path.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        while (true)
+        {
+            if (isNull(current))
+                return null;    // no node for this path
+            final int next = bytes.next();
+            if (next == ByteSource.END_OF_STREAM)
+                return getNodeContent(current);     // key fully consumed: the answer is whatever this node holds
+
+            current = advance(current, next, bytes);
+        }
+    }
+
+    public boolean isEmpty()
+    {
+        return isNull(root);    // the trie is empty when the root pointer is null
+    }
+
+
+    /**
+     * Override of dump which gives a more detailed printout, including the type of each node in the trie.
+     */
+    @Override
+    public String dump(Function<T, String> contentToString)
+    {
+        if (isNull(root))
+            return "empty";
+
+        StringBuilder out = new StringBuilder();
+        root().dump(0, out, contentToString);
+        return out.toString();
+    }
+
+    /**
+     * Dump the trie, printing content with {@code Object::toString}. Override as non-throwing.
+     */
+    @Override
+    public String dump()
+    {
+        return dump(Object::toString);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
new file mode 100644
index 000000000000..dde198313d07
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -0,0 +1,906 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+import java.util.concurrent.atomic.AtomicReferenceArray;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.slf4j.LoggerFactory;
+
+import org.agrona.concurrent.UnsafeBuffer;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.github.jamm.MemoryLayoutSpecification;
+
+/**
+ * Memtable trie, i.e. an in-memory trie built for fast modification and reads executing concurrently with writes from
+ * a single mutator thread.
+ *
+ * Writes to this should be atomic (i.e. reads should see either the content before the write, or the content after the
+ * write; if any read sees the write, then any subsequent (i.e. started after it completed) read should also see it).
+ * This implementation does not currently guarantee this, but we still get the desired result as `apply` is only used
+ * with singleton tries.
+ */
+public class MemtableTrie<T> extends MemtableReadTrie<T>
+{
+    // See the trie format description in MemtableReadTrie.
+
+    /**
+     * Trie size limit in bytes. This is not enforced, but users must check from time to time that it is not exceeded
+     * (using reachedAllocatedSizeThreshold()) and start switching to a new trie if it is.
+     * This must be done to avoid tries growing beyond their hard 2GB size limit (due to the 32-bit pointers).
+     */
+    private static final int ALLOCATED_SIZE_THRESHOLD;
+    static
+    {
+        String propertyName = "dse.trie_size_limit_mb";   // system property giving the limit in megabytes
+        // Default threshold + 10% == 1 GB. Adjusted slightly up to avoid a tiny final allocation for the 2G max.
+        int limitInMB = Integer.parseInt(System.getProperty(propertyName,
+                                                            Integer.toString(1024 * 10 / 11 + 1)));    // 931 by default
+        if (limitInMB < 1 || limitInMB > 2047)  // 2047 is the largest value for which the byte count fits in an int
+            throw new AssertionError(propertyName + " must be within 1 and 2047");
+        ALLOCATED_SIZE_THRESHOLD = 1024 * 1024 * limitInMB;
+    }
+
+    private int allocatedPos = 0;
+    private int contentCount = 0;
+
+    private final BufferType bufferType;    // on or off heap
+
+    private static final long EMPTY_SIZE_ON_HEAP; // measured size of an empty on-heap trie, for space calculations
+    private static final long EMPTY_SIZE_OFF_HEAP; // measured size of an empty off-heap trie, for space calculations
+
+    static
+    {
+        MemtableTrie<Object> empty = new MemtableTrie<>(BufferType.ON_HEAP);
+        EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty)     // minus the content array's slots and the buffer
+                             - empty.contentArray.length() * MemoryLayoutSpecification.SPEC.getReferenceSize()
+                             - empty.buffer.capacity();
+        empty = new MemtableTrie<>(BufferType.OFF_HEAP);
+        EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty)    // same calculation for an off-heap instance
+                              - empty.contentArray.length() * MemoryLayoutSpecification.SPEC.getReferenceSize()
+                              - empty.buffer.capacity();
+    }
+
+    public MemtableTrie(BufferType bufferType)
+    {
+        super(new UnsafeBuffer(bufferType.allocate(INITIAL_BUFFER_CAPACITY)), new AtomicReferenceArray<>(16), NONE);
+        this.bufferType = bufferType;   // kept so that allocateBlock can grow the buffer using the same type
+        assert INITIAL_BUFFER_CAPACITY % BLOCK_SIZE == 0;   // allocateBlock hands out space in BLOCK_SIZE steps
+    }
+
+    // Buffer, content list and block management
+
+    public static class SpaceExhaustedException extends Exception  // thrown by allocateBlock at maximum buffer size
+    {
+        public SpaceExhaustedException()
+        {
+            super("The hard 2GB limit on trie size has been exceeded");
+        }
+    }
+
+    private int allocateBlock() throws SpaceExhaustedException
+    {
+        // Returns the position of the next free block, growing the buffer first if it is already fully used.
+        // Note: If this method is modified, please run MemtableTrieTest.testOver1GSize to verify it acts correctly
+        // close to the 2G limit.
+        int v = allocatedPos;
+        if (buffer.capacity() == v)
+        {
+            // The buffer is full and must grow. Below the threshold we double it. Once the threshold has been
+            // reached we don't expect to write much more, so to avoid allocating too much space which will be left
+            // unused we grow by 10% of the limit, rounding up to BLOCK_SIZE.
+            boolean overThreshold = v >= ALLOCATED_SIZE_THRESHOLD;
+            int newSize = overThreshold
+                          ? (v + ALLOCATED_SIZE_THRESHOLD / 10 + BLOCK_SIZE - 1) & -BLOCK_SIZE
+                          : v * 2;
+            // Either calculation can grow over 2G: the 10% growth if done repeatedly, and the doubling if the
+            // threshold is configured above 1G. The result then overflows to a negative integer; in that case, cap
+            // it to a size that can be allocated.
+            if (newSize < 0)
+            {
+                newSize = 0x7FFFFF00;   // 2G - 256 bytes
+                if (newSize == allocatedPos)    // already at limit
+                    throw new SpaceExhaustedException();
+                LoggerFactory.getLogger(getClass()).debug("Growing memtable trie to maximum size {}",
+                                                          FBUtilities.prettyPrintMemory(newSize));
+            }
+            else if (overThreshold)
+                LoggerFactory.getLogger(getClass()).debug("Growing memtable trie by 10% over the {} limit to {}",
+                                                          FBUtilities.prettyPrintMemory(ALLOCATED_SIZE_THRESHOLD),
+                                                          FBUtilities.prettyPrintMemory(newSize));
+
+            ByteBuffer newBuffer = bufferType.allocate(newSize);
+            buffer.getBytes(0, newBuffer, v);
+            buffer.wrap(newBuffer);
+            // The above does not contain any happens-before enforcing writes, thus at this point the new buffer may be
+            // invisible to any concurrent readers. Touching the volatile root pointer (which any new read must go
+            // through) enforces a happens-before that makes it visible to all new reads (note: when the write completes
+            // it must do some volatile write, but that will be in the new buffer and without the line below could
+            // remain unreachable by other cores).
+            root = root;
+        }
+
+        allocatedPos += BLOCK_SIZE;
+        return v;
+    }
+
+    private int addContent(T value)
+    {
+        int index = contentCount++;     // the next unused content slot
+        if (index == contentArray.length())
+        {
+            AtomicReferenceArray<T> newContent = new AtomicReferenceArray<>(index * 2);    // array is full: double it
+            for (int i = 0; i < contentArray.length(); ++i)
+                newContent.lazySet(i, contentArray.get(i));
+            contentArray = newContent;  // This is a volatile set, hence all previous stores must become visible
+        }
+        contentArray.lazySet(index, value); // no need for a volatile set here; at this point the item is not referenced
+                                            // by any node in the trie, and a volatile set will be made to reference it.
+        return index;
+    }
+
+    private void setContent(int index, T value)
+    {
+        contentArray.set(index, value);     // AtomicReferenceArray.set has volatile write semantics
+    }
+
+    // Write methods
+
+    // Write visibility model: writes are not volatile, with the exception of the final write before a call returns
+    // the same value that was present before (e.g. content was updated in-place / existing node got a new child or had
+    // a child pointer updated); if the whole path including the root node changed, the root itself gets a volatile
+    // write.
+    // This final write is the point where any new cells created during the write become visible for readers for the
+    // first time, and such readers must pass through reading that pointer, which forces a happens-before relationship
+    // that extends to all values written by this thread before it.
+
+    /**
+     * Attach a child to the given non-content node, either replacing the target of an existing branch or adding a
+     * new branch. Must only be called when an update _is_ required, i.e. when the newChild pointer is not the same
+     * as the existing value. Returns the pointer to use for the node afterwards (same as node if done in place).
+     */
+    private int attachChild(int node, int trans, int newChild) throws SpaceExhaustedException
+    {
+        if (isLeaf(node))
+            throw new AssertionError("attachChild cannot be used on content nodes.");
+
+        final int type = offset(node);
+        if (type == PREFIX_OFFSET)
+            throw new AssertionError("attachChild cannot be used on content nodes.");
+        if (type == SPARSE_OFFSET)
+            return attachChildToSparse(node, trans, newChild);
+        if (type == SPLIT_OFFSET)
+        {
+            attachChildToSplit(node, trans, newChild);
+            return node;
+        }
+
+        // If this is the last character in a Chain block, we can modify the child in-place
+        if (type == LAST_POINTER_OFFSET - 1 && trans == getByte(node))
+        {
+            buffer.putIntVolatile(node + 1, newChild);
+            return node;
+        }
+
+        // any other chain position, or a different transition byte
+        return attachChildToChain(node, trans, newChild);
+    }
+
+    /**
+     * Attach a child to the given split node, either updating an existing branch or adding a new child to the node.
+     */
+    private void attachChildToSplit(int node, int trans, int newChild) throws SpaceExhaustedException
+    {
+        final int midSlot = node + SPLIT_POINTER_OFFSET + splitNodeMidIndex(trans) * 4;
+        int midBlock = getInt(midSlot);
+        if (isNull(midBlock))
+        {
+            // Link the new block with an ordered write so that no uncleaned state is visible to readers: if blocks
+            // get reused they may need zeroing, and unordered clearing writes could execute after the link is made,
+            // letting readers see old content. Not currently necessary (we don't reuse), but avoids a surprise later.
+            midBlock = allocateBlock();
+            buffer.putIntOrdered(midSlot, midBlock);
+        }
+
+        final int tailSlot = midBlock + splitNodeTailIndex(trans) * 4;
+        int tailBlock = getInt(tailSlot);
+        if (isNull(tailBlock))
+        {
+            tailBlock = allocateBlock();
+            buffer.putIntOrdered(tailSlot, tailBlock);  // ordered for the same reason as the mid block
+        }
+
+        // this is the write that attaches the child; it is done as a volatile write
+        buffer.putIntVolatile(tailBlock + splitNodeChildIndex(trans) * 4, newChild);
+    }
+
+    /**
+     * Attach or update a child of the given sparse node; returns the node to use afterwards (a new split node if full).
+     */
+    private int attachChildToSparse(int node, int trans, int newChild) throws SpaceExhaustedException
+    {
+        int i;
+        // first check if this is an update and modify in-place if so
+        for (i = 0; i < SPARSE_CHILD_COUNT; ++i)
+        {
+            if (isNull(getInt(node + SPARSE_CHILDREN_OFFSET + i * 4)))
+                break;  // null pointer: positions from i onwards are unused
+            if ((getByte(node + SPARSE_BYTES_OFFSET + i)) == trans)
+            {
+                buffer.putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
+                return node;
+            }
+        }
+
+        if (i == SPARSE_CHILD_COUNT)
+        {
+            // Node is full. Switch to split
+            int split = createEmptySplitNode();
+            for (i = 0; i < SPARSE_CHILD_COUNT; ++i)
+            {
+                int t = getByte(node + SPARSE_BYTES_OFFSET + i);
+                int p = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);
+                attachChildToSplitNonVolatile(split, t, p);     // non-volatile: the split node is not reachable yet
+            }
+            attachChildToSplitNonVolatile(split, trans, newChild);
+            return split;
+        }
+
+        // Add a new transition. They are not kept in order, so append it at the first free position.
+        buffer.putByte(node + SPARSE_BYTES_OFFSET + i,  (byte) trans);
+
+        // Update order word.
+        int order = buffer.getShort(node + SPARSE_ORDER_OFFSET) & 0xFFFF;   // mask to read the short as unsigned
+        int newOrder = insertInOrderWord(order, i, trans, node + SPARSE_BYTES_OFFSET);
+
+        // Sparse nodes have two access modes: via the order word, when listing transitions, or directly to characters
+        // and addresses.
+        // To support the former, we volatile write to the order word last, and everything is correctly set up.
+        // The latter does not touch the order word. To support that too, we volatile write the address, as the reader
+        // can't determine if the position is in use based on the character byte alone (00 is also a valid transition).
+        // Note that this means that reader must check the transition byte AFTER the address, to ensure they get the
+        // correct value (see getSparseChild).
+
+        // setting child enables reads to start seeing the new branch
+        buffer.putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
+
+        // some readers will decide whether to check the pointer based on the order word
+        // write that volatile to make sure they see the new change too
+        buffer.putShortVolatile(node + SPARSE_ORDER_OFFSET,  (short) newOrder);
+        return node;
+    }
+
+    /**
+     * Insert newIndex into the base-6 encoded order word, at the position where its transition byte sorts.
+     *
+     * E.g. if the existing bytes were 20, 50, 30 with order word 120 (decimal 48), then
+     *   - insertInOrderWord(120, 3, 5, ptr)  must return 1203 (decimal 48*6 + 3)
+     *   - insertInOrderWord(120, 3, 25, ptr) must return 1230 (decimal 8*36 + 3*6 + 0)
+     *   - insertInOrderWord(120, 3, 35, ptr) must return 1320 (decimal 1*216 + 3*36 + 12)
+     *   - insertInOrderWord(120, 3, 55, ptr) must return 3120 (decimal 3*216 + 48)
+     */
+    private int insertInOrderWord(int order, int newIndex, int transitionByte, int bytesPosition)
+    {
+        int remaining = order;  // digits of the order word we have not passed yet, lowest digit first
+        int scale = 1;          // 6 to the power of the number of digits passed
+        // pass over the indexes whose transition byte sorts before the new one
+        for (; remaining != 0; remaining /= 6, scale *= 6)
+        {
+            final int existing = getByte(bytesPosition + remaining % SPARSE_CHILD_COUNT);
+            if (existing > transitionByte)
+                break;
+
+            assert existing < transitionByte;
+        }
+        // keep the digits we have passed (order % scale) in place, put newIndex right above them and shift the
+        // remaining digits up by one position
+        return order % scale + (remaining * 6 + newIndex) * scale;
+    }
+
+    /**
+     * Version of attachChildToSplit which uses plain writes only. For use while the split node is not reachable yet
+     * (during the conversion from sparse).
+     */
+    private void attachChildToSplitNonVolatile(int node, int trans, int newChild) throws SpaceExhaustedException
+    {
+        final int midSlot = node + SPLIT_POINTER_OFFSET + splitNodeMidIndex(trans) * 4;
+        int midBlock = getInt(midSlot);
+        if (isNull(midBlock))
+        {
+            midBlock = allocateBlock();     // first child under this mid index
+            buffer.putInt(midSlot, midBlock);
+        }
+
+        final int tailSlot = midBlock + splitNodeTailIndex(trans) * 4;
+        int tailBlock = getInt(tailSlot);
+        if (isNull(tailBlock))
+        {
+            tailBlock = allocateBlock();    // first child under this tail index
+            buffer.putInt(tailSlot, tailBlock);
+        }
+
+        // finally store the child pointer in its slot of the tail block
+        buffer.putInt(tailBlock + splitNodeChildIndex(trans) * 4, newChild);
+    }
+
+    /**
+     * Attach a child to the given chain node. This is either an update for the existing branch with a different
+     * target address, or a second child for the node.
+     * This method always copies the node. Modifications to chain nodes cannot be done in place, either because a new
+     * transition byte is introduced and we have to convert from the single-transition chain type to sparse, or
+     * because the child has to be remapped from the implicit node + 1 to something else. The one exception is an
+     * update to the child of the last node in a chain block with matching transition byte, which this method is not
+     * used for (see attachChild).
+     */
+    private int attachChildToChain(int node, int transitionByte, int newChild) throws SpaceExhaustedException
+    {
+        final int currentByte = getByte(node);
+        if (transitionByte != currentByte)
+        {
+            // The new transition is different, so we no longer have only one transition. Change type.
+            // The existing child is implicitly the next cell, unless that cell is where the block's child pointer
+            // resides, in which case the child is what the pointer refers to.
+            final int next = node + 1;
+            final int currentChild = offset(next) == LAST_POINTER_OFFSET ? getInt(next) : next;
+            return createSparseNode(currentByte, currentChild, transitionByte, newChild);
+        }
+
+        // Same transition, different child. We only get here if the new child is different from the old, and the
+        // update is not on the final child where it can be changed in place (see attachChild), so we must always
+        // create something new.
+        // If the child is a chain, we can expand it (since it's a different value, its branch must be new and
+        // nothing can already reside in the rest of the block).
+        return expandOrCreateChainNode(transitionByte, newChild);
+    }
+
+    private boolean isExpandableChain(int newChild)
+    {
+        final int cell = offset(newChild);  // position of the child within its block
+        return (newChild > 0 && newChild - 1 > NONE) && (CHAIN_MIN_OFFSET < cell && cell <= CHAIN_MAX_OFFSET);
+    }
+
+    /**
+     * Create a new sparse node holding the two given transitions and their children.
+     */
+    private int createSparseNode(int byte1, int child1, int byte2, int child2) throws SpaceExhaustedException
+    {
+        assert byte1 != byte2;
+        // Store the smaller transition in position 0, i.e. there's always something bigger than child 0 so 0 never
+        // is at the end of the order.
+        final boolean swap = byte1 > byte2;
+        final int lowByte = swap ? byte2 : byte1;
+        final int highByte = swap ? byte1 : byte2;
+        final int lowChild = swap ? child2 : child1;
+        final int highChild = swap ? child1 : child2;
+
+        int node = allocateBlock() + SPARSE_OFFSET;
+        buffer.putByte(node + SPARSE_BYTES_OFFSET + 0,  (byte) lowByte);
+        buffer.putByte(node + SPARSE_BYTES_OFFSET + 1,  (byte) highByte);
+        buffer.putInt(node + SPARSE_CHILDREN_OFFSET + 0 * 4, lowChild);
+        buffer.putInt(node + SPARSE_CHILDREN_OFFSET + 1 * 4, highChild);
+        buffer.putShort(node + SPARSE_ORDER_OFFSET,  (short) (1 * 6 + 0));  // order word "10": index 0, then 1
+        // Note: none of this needs a volatile write. The node is new and we return a new pointer, which needs to be
+        // put in an existing node or the root, and that action ends in a happens-before enforcing write.
+        return node;
+    }
+
+    /**
+     * Creates a chain node in a new block, with the single provided transition pointing to the provided child.
+     * Note that to avoid creating inefficient tries with under-utilized chain nodes, this should only be called from
+     * {@link #expandOrCreateChainNode} and other call-sites should call {@link #expandOrCreateChainNode}.
+     */
+    private int createNewChainNode(int transitionByte, int newChild) throws SpaceExhaustedException
+    {
+        // the transition byte goes in the cell just before the block's child pointer
+        final int block = allocateBlock();
+        final int newNode = block + LAST_POINTER_OFFSET - 1;
+        buffer.putByte(newNode, (byte) transitionByte);
+        buffer.putInt(newNode + 1, newChild);   // plain write: the new node only becomes reachable once its pointer
+        return newNode;                         // is put in an existing node or the root by a write that enforces order
+    }
+
+    /** Like {@link #createNewChainNode}, except that a new child which is already a chain node with room to spare
+     * is expanded with the transition instead of getting a brand new node in front of it. */
+    private int expandOrCreateChainNode(int transitionByte, int newChild) throws SpaceExhaustedException
+    {
+        if (!isExpandableChain(newChild))
+            return createNewChainNode(transitionByte, newChild);
+
+        // Attach as a new character in the child node: the cell just before the child takes the transition
+        // byte, and a pointer to that cell is the chain node which leads to the child. The write is a plain one,
+        // as is the case for newly created chain nodes.
+        final int expanded = newChild - 1;
+        buffer.putByte(expanded, (byte) transitionByte);
+        return expanded;
+    }
+
+    private int createEmptySplitNode() throws SpaceExhaustedException
+    {
+        // nothing is written here: the block is used as allocated, its pointer only tagged with the split offset
+        return allocateBlock() + SPLIT_OFFSET;
+    }
+
+    private int createContentNode(int contentIndex, int child, boolean isSafeChain) throws SpaceExhaustedException
+    {
+        assert !isLeaf(child);
+        if (isNull(child))
+            return ~contentIndex;   // no child: the content can be a leaf
+
+        final int childOffset = offset(child);
+        final boolean embeddableChain = isSafeChain && childOffset > (PREFIX_FLAGS_OFFSET + PREFIX_OFFSET) && childOffset <= CHAIN_MAX_OFFSET;
+        final int node;
+        if (childOffset != SPLIT_OFFSET && !embeddableChain)
+        {
+            // Full prefix node, in a block of its own with an explicit pointer to the child
+            node = allocateBlock() + PREFIX_OFFSET;
+            buffer.putByte(node + PREFIX_FLAGS_OFFSET, (byte) 0xFF);
+            buffer.putInt(node + PREFIX_POINTER_OFFSET, child);
+        }
+        else
+        {
+            // We can do an embedded prefix node, placed in the child's own block. Note: for chain nodes there is a
+            // risk that the node continues beyond the current point, in which case embedding may overwrite data
+            // still needed by concurrent readers or the mutation process itself; hence the isSafeChain condition.
+            node = (child & -BLOCK_SIZE) | PREFIX_OFFSET;
+            buffer.putByte(node + PREFIX_FLAGS_OFFSET, (byte) childOffset);
+        }
+
+        buffer.putInt(node + PREFIX_CONTENT_OFFSET, contentIndex);
+        return node;
+    }
+
+    private int updatePrefixNodeChild(int node, int child) throws SpaceExhaustedException
+    {
+        assert offset(node) == PREFIX_OFFSET;
+
+        if (isNull(child))
+            return ~getInt(node + PREFIX_CONTENT_OFFSET);   // no child to point to: return a leaf for the content
+
+        // An embedded prefix node resides in the block of the node it augments and cannot be pointed elsewhere;
+        // make a new content node for the new child, carrying over the content index.
+        if (isEmbeddedPrefixNode(node))
+        {
+            final int contentIndex = getInt(node + PREFIX_CONTENT_OFFSET);
+            return createContentNode(contentIndex, child, true);
+        }
+
+        // A full prefix node can be updated in place. This attaches the child branch and makes it reachable --
+        // the write must be volatile.
+        buffer.putIntVolatile(node + PREFIX_POINTER_OFFSET, child);
+        return node;
+    }
+
+    private boolean isEmbeddedPrefixNode(int node)
+    {
+        return getByte(node + PREFIX_FLAGS_OFFSET) < BLOCK_SIZE;    // createContentNode: full = 0xFF, embedded = offset
+    }
+
+    /**
+     * Carry the content of an existing node, if it has any, over to a newly-prepared update for its child.
+     *
+     * @param existingPreContentNode pointer to the existing node before skipping over content nodes, i.e. this is
+     *                               either the same as existingPostContentNode or a pointer to a prefix or leaf node
+     *                               whose child is existingPostContentNode
+     * @param existingPostContentNode pointer to the existing node being updated, after any content nodes have been
+     *                                skipped and before any modification have been applied; always a non-content node
+     * @param updatedPostContentNode is the updated node, i.e. the node to which all relevant modifications have been
+     *                               applied; if the modifications were applied in-place, this will be the same as
+     *                               existingPostContentNode, otherwise a completely different pointer; always a non-
+     *                               content node
+     * @return a node which has the children of updatedPostContentNode combined with the content of
+     *         existingPreContentNode
+     */
+    private int preserveContent(int existingPreContentNode,
+                               int existingPostContentNode,
+                               int updatedPostContentNode) throws SpaceExhaustedException
+    {
+        if (existingPreContentNode == existingPostContentNode)
+            return updatedPostContentNode;     // there was no content node, thus nothing to preserve
+
+        if (existingPostContentNode == updatedPostContentNode)
+            return existingPreContentNode;     // the child is unchanged, the existing content node still applies
+
+        // else there is existing content which must now reference a new child
+        if (!isLeaf(existingPreContentNode))
+        {
+            assert offset(existingPreContentNode) == PREFIX_OFFSET;
+            return updatePrefixNodeChild(existingPreContentNode, updatedPostContentNode);
+        }
+
+        assert isNull(existingPostContentNode);
+        return createContentNode(~existingPreContentNode, updatedPostContentNode, true);
+    }
+
+    /**
+     * State of the walk of the given mutation trie. Passed to mutation nodes in their parentState link.
+     */
+    class ApplyState<U>
+    {
+        /**
+         * The node from the mutation trie.
+         */
+        final Node<U, ApplyState<U>> mutationNode;
+
+        /**
+         * Pointer to the existing node before skipping over content nodes, i.e. this is either the same as
+         * existingPostContentNode or a pointer to a prefix or leaf node whose child is existingPostContentNode.
+         */
+        final int existingPreContentNode;
+
+        /**
+         * Pointer to the existing node being updated, after any content nodes have been skipped and before any
+         * modification have been applied. Always a non-content node.
+         */
+        final int existingPostContentNode;
+
+        /**
+         * The updated node, i.e. the node to which the relevant modifications are being applied. This will change as
+         * children are processed and attached to the node. After all children have been processed, this will contain
+         * the fully updated node (i.e. the union of existingPostContentNode and mutationNode) without any content,
+         * which will be processed separately and, if necessary, attached ahead of this. If the modifications were
+         * applied in-place, this will be the same as existingPostContentNode, otherwise a completely different
+         * pointer. Always a non-content node.
+         */
+        int updatedPostContentNode;
+
+        ApplyState(Node<U, ApplyState<U>> mutationNode, int transition)
+        {
+            ApplyState<U> parentState = mutationNode.parentLink;
+            if (parentState == null)
+                existingPreContentNode = root;
+            else
+            {
+                existingPreContentNode = isNull(parentState.existingPostContentNode)
+                                         ? NONE
+                                         : getChild(parentState.existingPostContentNode, transition);
+            }
+
+            existingPostContentNode = followContentTransition(existingPreContentNode);
+            updatedPostContentNode = existingPostContentNode;
+
+            this.mutationNode = mutationNode;
+        }
+
+        private void attachChild(int ourChild) throws SpaceExhaustedException
+        {
+            int transition = mutationNode.currentTransition;
+            if (isNull(updatedPostContentNode))
+                updatedPostContentNode = expandOrCreateChainNode(transition, ourChild);
+            else
+                updatedPostContentNode = MemtableTrie.this.attachChild(updatedPostContentNode,
+                                                                       transition,
+                                                                       ourChild);
+        }
+
+        /**
+         * Resolves the content for the node tracked by this state and returns the pointer that should take the place
+         * of {@code existingPreContentNode}.
+         * <p>
+         * When the mutation carries no content, the result is delegated to {@code preserveContent}. Otherwise the
+         * transformer is applied to the pre-existing content (or to {@code null} if there is none) and the mutation
+         * content. A {@code null} result from the transformer means no content is attached, and the bare
+         * {@code updatedPostContentNode} is returned.
+         *
+         * @param mutationContent the content of the mutation node, or {@code null} if it has none
+         * @param transformer function combining existing and new content
+         * @return pointer to the updated node, including any content leaf or prefix in front of it
+         */
+        private int applyContent(U mutationContent, UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
+        {
+            // common case, no new content
+            if (mutationContent == null)
+                return preserveContent(existingPreContentNode, existingPostContentNode, updatedPostContentNode);
+
+            // -1 marks "no content index"; valid indexes are non-negative.
+            int contentIndex = -1;
+            int existingContentIndex = -1;
+
+            if (existingPreContentNode != existingPostContentNode)
+            {
+                // There is pre-existing content which must be merged with the new.
+                if (isLeaf(existingPreContentNode))
+                    existingContentIndex = ~existingPreContentNode;
+                else
+                {
+                    assert offset(existingPreContentNode) == PREFIX_OFFSET;
+                    existingContentIndex = getInt(existingPreContentNode + PREFIX_CONTENT_OFFSET);
+                }
+
+                final T existingContent = contentArray.get(existingContentIndex);
+                T combinedContent = transformer.apply(existingContent, mutationContent);
+                // The slot is overwritten even when the combined value is null.
+                setContent(existingContentIndex, combinedContent);
+                if (combinedContent != null)
+                    contentIndex = existingContentIndex;
+            }
+            else
+            {
+                // No pre-existing content.
+                T combinedContent = transformer.apply(null, mutationContent);
+                if (combinedContent != null)
+                    contentIndex = addContent(combinedContent);
+            }
+
+            // The supplied transformer may return null, e.g. to delete data. In this case we don't have a content index.
+            if (contentIndex == -1)
+                return updatedPostContentNode;
+
+            // No node to attach the content to: the result is a leaf, encoded as the complement of the content index.
+            if (isNull(updatedPostContentNode))
+                return ~contentIndex;
+
+            // We can't update in-place if there was no preexisting prefix, or if the prefix was embedded and the target
+            // node must change.
+            if (existingPreContentNode == existingPostContentNode ||
+                isEmbeddedPrefixNode(existingPreContentNode) && updatedPostContentNode != existingPostContentNode)
+                return createContentNode(contentIndex, updatedPostContentNode, isNull(existingPostContentNode));
+
+            // Otherwise modify in place
+            if (updatedPostContentNode != existingPostContentNode) // to use volatile write but also ensure we don't corrupt embedded nodes
+                buffer.putIntVolatile(existingPreContentNode + PREFIX_POINTER_OFFSET, updatedPostContentNode);
+            assert contentIndex == existingContentIndex;
+            return existingPreContentNode;
+        }
+
+        private ApplyState<U> attachAndMoveToParentState(UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
+        {
+            ApplyState<U> parentState = mutationNode.parentLink;
+
+            int updatedPreContentNode = applyContent(mutationNode.content(),
+                                                     transformer);
+
+            if (parentState == null)
+            {
+                assert root == existingPreContentNode;
+                if (updatedPreContentNode != existingPreContentNode)
+                {
+                    // Only write to root if they are different (value doesn't change, but
+                    // we don't want to invalidate the value in other cores' caches unnecessarily).
+                    root = updatedPreContentNode;
+                }
+
+                return null;
+            }
+
+            if (updatedPreContentNode != existingPreContentNode)
+                parentState.attachChild(updatedPreContentNode);
+
+            return parentState;
+        }
+    }
+
+    /**
+     * Somewhat similar to {@link MergeResolver}, this encapsulates logic to be applied whenever new content is being
+     * upserted into a {@link MemtableTrie}. Unlike {@link MergeResolver}, {@link UpsertTransformer} will be applied no
+     * matter if there's pre-existing content for that trie key/path or not.
+     *
+     * @param <T> The content type for this {@link MemtableTrie}.
+     * @param <U> The type of the new content being applied to this {@link MemtableTrie}.
+     */
+    public interface UpsertTransformer<T, U>
+    {
+        /**
+         * Called when there's content in the updating trie.
+         *
+         * @param existing Existing content for this key, or null if there isn't any.
+         * @param update   The update, always non-null.
+         * @return The combined value to use. May be null, e.g. to delete data; when applied through
+         *         {@link #apply(Trie, UpsertTransformer)} a null result means no content is attached for the key.
+         */
+        T apply(T existing, U update);
+    }
+
+    /**
+     * Modify this trie to apply the mutation given in the form of a trie. Any content in the mutation will be resolved
+     * with the given function before being placed in this trie (even if there's no pre-existing content in this trie).
+     * @param mutation the mutation to be applied, given in the form of a trie. Note that its content can be of type
+     * different than the element type for this memtable trie.
+     * @param transformer a function applied to the potentially pre-existing value for the given key, and the new
+     * value. Applied even if there's no pre-existing value in the memtable trie.
+     */
+    public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
+    {
+        Node<U, ApplyState<U>> current = mutation.root();
+        if (current == null)
+            return;
+
+        ApplyState<U> state = new ApplyState<U>(current, current.parentLink != null ? current.parentLink.mutationNode.currentTransition : -1);
+
+        Trie.Remaining has = current.startIteration();
+        while (true)
+        {
+            if (has != null)
+            {
+                // We have a transition, get child to descend into
+                Node<U, ApplyState<U>> child = current.getCurrentChild(state);
+
+                if (child == null)
+                {
+                    // no child, get next
+                    has = current.advanceIteration();
+                }
+                else
+                {
+                    state = new ApplyState<U>(child, current.currentTransition);
+                    current = child;
+                    has = current.startIteration();
+                }
+            }
+            else
+            {
+                // There are no more children. Ascend to the parent state to continue walk.
+                state = state.attachAndMoveToParentState(transformer);
+                if (state == null)
+                    break;
+                current = state.mutationNode;
+                has = current.advanceIteration();
+            }
+        }
+    }
+
+    /**
+     * Map-like put method, using the apply machinery above which cannot run into stack overflow. When the correct
+     * position in the trie has been reached, the value will be resolved with the given function before being placed in
+     * the trie (even if there's no pre-existing content in this trie).
+     * <p>
+     * Implemented by wrapping the key and value in a singleton trie and passing it to
+     * {@link #apply(Trie, UpsertTransformer)}.
+     *
+     * @param key the trie path/key for the given value.
+     * @param value the value being put in the memtable trie. Note that it can be of type different than the element
+     * type for this memtable trie. It's up to the {@code transformer} to return the final value that will stay in
+     * the memtable trie.
+     * @param transformer a function applied to the potentially pre-existing value for the given key, and the new
+     * value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied
+     * even if there's no pre-existing value in the memtable trie.
+     * @throws SpaceExhaustedException propagated from {@link #apply(Trie, UpsertTransformer)}.
+     */
+    public <R> void putSingleton(ByteComparable key,
+                                 R value,
+                                 UpsertTransformer<T, ? super R> transformer) throws SpaceExhaustedException
+    {
+        // Keep the singleton inline: the mutation's content type is inferred from the transformer's wildcard here.
+        apply(Trie.singleton(key, value), transformer);
+    }
+
+    /**
+     * Variant of putSingleton that lets the caller choose the implementation: the recursive put when
+     * {@code useRecursive} is true, the apply-based put otherwise.
+     */
+    public <R> void putSingleton(ByteComparable key,
+                                 R value,
+                                 UpsertTransformer<T, ? super R> transformer,
+                                 boolean useRecursive) throws SpaceExhaustedException
+    {
+        if (!useRecursive)
+        {
+            putSingleton(key, value, transformer);
+            return;
+        }
+
+        putRecursive(key, value, transformer);
+    }
+
+    /**
+     * Map-like put implemented as a fast recursion over the bytes of the key. Because it recurses once per key byte,
+     * it may run into stack overflow if the trie becomes too deep. Once the position for the key is reached, the value
+     * is resolved with the given function before being placed in the trie (even if there's no pre-existing content in
+     * this trie).
+     *
+     * @param key the trie path/key for the given value.
+     * @param value the value being put in the memtable trie. It may be of a different type than the element type of
+     * this memtable trie; the {@code transformer} decides the final value that will stay in the trie.
+     * @param transformer a function applied to the potentially pre-existing value for the given key, and the new
+     * value (of a potentially different type), returning the final value that will stay in the memtable trie. Applied
+     * even if there's no pre-existing value in the memtable trie.
+     */
+    public <R> void putRecursive(ByteComparable key, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
+    {
+        final int currentRoot = root;
+        final ByteSource keyBytes = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        final int updatedRoot = putRecursive(currentRoot, keyBytes, value, transformer);
+
+        // Skip the write when the root pointer did not change.
+        if (root != updatedRoot)
+            root = updatedRoot;
+    }
+
+    private <R> int putRecursive(int node, ByteSource key, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
+    {
+        int transition = key.next();
+        if (transition == ByteSource.END_OF_STREAM)
+            return applyContent(node, value, transformer);
+
+        int child = NONE;
+        if (!isNull(node))
+            child = getChild(node, transition);
+
+        int newChild = putRecursive(child, key, value, transformer);
+        if (newChild == child)
+            return node;
+
+        int skippedContent = followContentTransition(node);
+        int attachedChild = !isNull(skippedContent)
+                            ? attachChild(skippedContent, transition, newChild)  // Single path, no copying required
+                            : expandOrCreateChainNode(transition, newChild);
+
+        return preserveContent(node, skippedContent, attachedChild);
+    }
+
+    /**
+     * Applies {@code value} to the content at {@code node} for the recursive put.
+     * <p>
+     * If the node already carries content (it is a leaf or a prefix node), the content slot is overwritten with the
+     * transformer's result and the node pointer is returned unchanged. If there is no pre-existing content, the
+     * transformer is called with {@code null} as the existing value; a non-null result is stored as new content,
+     * while a {@code null} result (e.g. the transformer declining to add data) leaves the node untouched, in line
+     * with how the apply-based path treats a null result for new content.
+     *
+     * @param node pointer to the node at the key's position, possibly {@code NONE}
+     * @param value the value being put
+     * @param transformer function combining existing content with {@code value}
+     * @return the pointer to use for this position after the update
+     */
+    private <R> int applyContent(int node, R value, UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
+    {
+        if (isNull(node))
+        {
+            T created = transformer.apply(null, value);
+            // Don't allocate a content slot and leaf for a null result.
+            return created != null ? ~addContent(created) : node;
+        }
+
+        if (isLeaf(node))
+        {
+            int contentIndex = ~node;
+            setContent(contentIndex, transformer.apply(getContent(contentIndex), value));
+            return node;
+        }
+
+        if (offset(node) == PREFIX_OFFSET)
+        {
+            int contentIndex = getInt(node + PREFIX_CONTENT_OFFSET);
+            setContent(contentIndex, transformer.apply(getContent(contentIndex), value));
+            return node;
+        }
+
+        // Non-content node without a prefix: only create a prefix if there is something to store.
+        T created = transformer.apply(null, value);
+        return created != null ? createContentNode(addContent(created), node, false) : node;
+    }
+
+    /**
+     * Tells whether the allocation threshold has been reached. Intended to be called by the writing thread (ideally
+     * right after a write completes). Once this returns true, the user should switch to a new trie as soon as
+     * feasible.
+     *
+     * The trie expects up to 10% growth above this threshold. Any growth beyond that may be done inefficiently, and
+     * the trie will fail altogether when the size grows beyond 2G - 256 bytes.
+     */
+    public boolean reachedAllocatedSizeThreshold()
+    {
+        final boolean belowThreshold = allocatedPos < ALLOCATED_SIZE_THRESHOLD;
+        return !belowThreshold;
+    }
+
+    /**
+     * For tests only! Keeps allocating blocks until the allocation pointer is at least {@code wantedPos}, to test
+     * behaviour close to full.
+     *
+     * @return the allocation pointer after advancing
+     */
+    @VisibleForTesting
+    int advanceAllocatedPos(int wantedPos) throws SpaceExhaustedException
+    {
+        while (true)
+        {
+            if (allocatedPos >= wantedPos)
+                return allocatedPos;
+
+            allocateBlock();
+        }
+    }
+
+    /**
+     * Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content.
+     * This is zero when the buffer type is on-heap, and the allocated position otherwise.
+     */
+    public long sizeOffHeap()
+    {
+        if (bufferType == BufferType.ON_HEAP)
+            return 0;
+
+        return allocatedPos;
+    }
+
+    /**
+     * Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content.
+     * <p>
+     * This is the space for the content references plus, for an on-heap buffer, the allocated buffer space and the
+     * empty-trie overhead; for an off-heap buffer only the (off-heap flavour of the) empty-trie overhead is added.
+     */
+    public long sizeOnHeap()
+    {
+        // Widen to long before multiplying/adding so the intermediate results cannot overflow int arithmetic.
+        final long contentReferences = (long) contentCount * MemoryLayoutSpecification.SPEC.getReferenceSize();
+        final long structure = bufferType == BufferType.ON_HEAP
+                               ? (long) allocatedPos + EMPTY_SIZE_ON_HEAP
+                               : EMPTY_SIZE_OFF_HEAP;
+        return contentReferences + structure;
+    }
+
+    /**
+     * Returns the values of this trie in content-array order (i.e. not in key order). Each call to
+     * {@code iterator()} starts a fresh pass from index 0 up to the content count at the time of each check.
+     */
+    @Override
+    public Iterable<T> valuesUnordered()
+    {
+        return () -> new Iterator<T>()
+        {
+            private int nextIndex = 0;
+
+            @Override
+            public boolean hasNext()
+            {
+                return nextIndex < contentCount;
+            }
+
+            @Override
+            public T next()
+            {
+                if (!hasNext())
+                    throw new NoSuchElementException();
+
+                // Advance first, then fetch, so the position moves on even if the fetch fails.
+                final int current = nextIndex++;
+                return getContent(current);
+            }
+        };
+    }
+
+    /**
+     * Returns the current content count, i.e. the number of entries {@link #valuesUnordered()} iterates over.
+     */
+    public int valuesCount()
+    {
+        return contentCount;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md
new file mode 100644
index 000000000000..1f0cfed6cd41
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md
@@ -0,0 +1,754 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# MemtableTrie Design
+
+The `MemtableTrie` is one of the main components of the trie infrastructure, a mutable in-memory trie built for fast
+modification and reads executing concurrently with writes from a single mutator thread.
+
+The main features of its implementation are:
+- support for writes from a single mutator thread concurrent with multiple readers
+- full support of the Trie interface
+- uses nodes of several different types for efficiency
+- supports content on any node, including intermediate (prefix)
+- upper limit for the trie size
+
+
+## Memory layout
+
+One of the main design drivers of the memtable trie is the desire to avoid on-heap storage and Java object management.
+The trie thus implements its own memory management for the structure of the trie (content is, at this time, still given
+as Java objects in a content array). The structure resides in one `UnsafeBuffer` (which can be on or off heap as
+desired) and is broken up in 32-byte "cells" (also called "blocks" in the code), which are the unit of allocation,
+update and reuse.
+
+Like all tries, `MemtableTrie` is built from nodes and has a root pointer. The nodes reside in cells, but there is no
+1:1 correspondence between nodes and cells - some node types pack multiple in one cell, while other types require
+multiple cells.
+
+### Pointers and node types
+
+A "pointer" is an integer that points to a node in the trie buffer. A pointer specifies the location of the node
+(its starting cell), but also defines the type of node in its 5 lowest-order bits (i.e. the offset within the cell).
+If a pointer has a negative value, it refers to a value in the content array, and implies a leaf node with the specified
+content. Additionally, the special pointer value `NONE` (0) is used to specify "no child". We use 32-bit integers as
+pointers, therefore the size of the trie structure is limited to a little less than 2GB.
+
+For example, the pointer `0x0109E` specifies a node residing in the cell at bytes `0x01080`-`0x0109F` in the buffer
+(specified by the pointers' 27 leading bits), where the node type is `Sparse` (specified by `0x1E` in the last 5 bits).
+
+The pointer `0xFFFFFFF0` specifies a leaf node (being negative), where the content's index is `0xF` (obtained by
+negating all bits of the pointer).
+
+To save space and reduce pointer chasing, we use several different types of nodes that address different common patterns
+in a trie. It is common for a trie to have one or a couple of top levels which have many children, and where it is
+important to make decisions with as few if-then-else branches as possible (served by the `Split` type), another one or
+two levels of nodes with a small number of children, where it is most important to save space as the number of these
+nodes is high (served by the `Sparse` type), and a lot of sequences of single-child nodes containing the trailing bytes
+of the key or of some common key prefix (served by the `Chain` type). Most of the payload/content of the trie resides
+at the leaves, where it makes sense to avoid taking any space for a node (the `Leaf` type), but we must also allow the
+possibility for values to be present in intermediate nodes &mdash; because this is rare, we support it with a special
+`Prefix` type instead of reserving a space for payload in all other node types.
+
+The Split-Sparse-Chain-Leaf/Prefix pattern may repeat several times. For example, we could have these four layers for
+the partition key with some metadata associated with the partition, then for the first component of the clustering key,
+then for the second component etc.
+
+The sections below specify the layout of each supported node type.
+
+#### Leaf nodes
+
+Leaf nodes do not have a corresponding cell in the buffer. Instead, they reference a value (i.e. a POJO in the
+`MemtableTrie`'s content type) in the content array. The index of the value is specified by `~pointer` (unlike `-x`,
+`~x` allows one to also encode 0 in a negative number).
+
+Leaf nodes have no children, and return the specified value for `content()`.
+
+Example: -1 is a leaf node with content `contentArray[0]`.
+
+#### `Chain` nodes - single path, multiple transitions in one cell
+
+Chain nodes are one-child nodes. Multiple chain nodes, forming a chain of transitions to one target, can reside in a
+single cell. Chain nodes are identified by the lowest 5 bits of a pointer being between `0x00` and `0x1B`. In addition
+to the type of node, in this case the bits also define the length of the chain &mdash; the difference between
+`0x1C` and the pointer offset specifies the number of characters in the chain.
+
+The simplest chain node has one transition leading to one child and is laid out like this:
+
+offset|content|example
+---|---|---
+00 - 1A|unused|
+1B     |character|41 A
+1C - 1F|child pointer|FFFFFFFF
+
+where the pointer points to the `1B` line in the cell.
+
+Example: The cell `xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxx41 FFFFFFFF` at bytes `0x120`-`0x13F` and
+pointer `0x13B` point to a node with one child with transition `0x41` `A` to a leaf node with content `contentArray[0]`.
+
+Another chain node, which points to this one, can be added in the same cell by placing a character at offset `1A`. This
+new node is effectively laid out as
+
+offset|content|example
+---|---|---
+00 - 19|unused|
+1A     |character|48 H
+1B - 1F|unused|
+
+where the pointer points to line `1A`. This node has one transition, and the child pointer is implicit as the node's
+pointer plus one.
+
+This can continue until all the bytes in the "unused" area are filled.
+
+Example: The cell `xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx434841 FFFFFFFF` at bytes `0x120`-`0x13F` and
+pointer `0x139` point to a node with one child with transition `0x43` `C` to a node with one child with transition
+`0x48` `H` to a node with one child with transition `0x41` `A` to a leaf node with content `contentArray[0]`.
+
+offset|content|example
+---|---|---
+00 - 18|unused|
+19     |character|43 C
+1A     |character|48 H
+1B     |character|41 A
+1C - 1F|child pointer|FFFFFFFF
+
+
+In this example `0x13A` and `0x13B` are also valid pointers to the respective chains and could be referenced from other
+nodes (an example will be given below). In any case, the byte pointed directly by the node pointer contains the
+transition byte. The child pointer is either `pointer + 1` (if the lowest 5 pointer bits are less than `0x1B`), or the
+integer stored at `pointer + 1` (if the pointer's last 5 bits are `0x1B`).
+
+![graph](MemtableTrie.md.g1.svg)
+
+Note: offset `0x00` also specifies a chain node, but the pointer 0 is a special case and care must be taken to ensure no
+28-byte chain node is placed in the cell at bytes `0x00`-`0x1F`.
+
+#### `Sparse` nodes - between 2 and 6 children in one cell
+
+Sparse nodes are used when a node has at least two children, and all pointers and transition characters can fit in one
+cell, which limits the maximum number of children to 6. Their layout is:
+
+offset|content|
+---|---|
+00 - 03|child pointer 0|
+04 - 07|child pointer 1|
+08 - 0B|child pointer 2|
+0C - 0F|child pointer 3|
+10 - 13|child pointer 4|
+14 - 17|child pointer 5|
+18     |character 0|
+19     |character 1|
+1A     |character 2|
+1B     |character 3|
+1C     |character 4|
+1D     |character 5|
+1E - 1F|order word|
+
+where the pointer points to the line `1E` (i.e. the type identifier for a sparse node is `0x1E`).
+
+It is important to note that the pointers and characters are not in order. This is done so that an update to a sparse
+node where a new child is inserted can be done while the previous state of the node is still valid and readable for
+any concurrent readers. Instead, new children are appended, and the order is maintained in the "order word". This word
+is a number whose digits specify the order of the children's transition characters (where higher-order digits specify
+bigger characters) encoded, to be able to fit into a 16-bit word, in base 6. Its number of digits also specifies the
+number of children of the node.
+
+To explain this better, we will give an example of the evolution of a sparse node. Suppose we had the `0x139` node from
+the previous section, and some update needs to attach a second child to that, e.g. with the character `A` and child
+`0x238`.
+
+![graph](MemtableTrie.md.g2.svg)
+
+To do this, the mutating thread will have to convert the chain node into a sparse by allocating a new cell
+(e.g. `0x240`-`0x25F`) and filling in the sparse node `00000238 0000013A 00000000 00000000 00000000 00000000 41430000
+00000006` with pointer `0x25E`:
+
+offset|content|example
+---|---|---
+00 - 03|child pointer 0| 00000238
+04 - 07|child pointer 1| 0000013A
+08 - 17|unused|
+18     |character 0| 41 A
+19     |character 1| 43 C
+1A - 1D|unused|
+1E - 1F|order word, always 10| 0006 = 10 (base 6)
+
+This is the smallest kind of sparse node, with just two children. Two-children sparse nodes always
+put their two children in order (we can do this as this does not happen in response to an addition of a new child to
+an existing sparse node, but this is constructed directly) and thus their order word is always 10 (if they were
+not in order, the order word would have to be 01, which would be misinterpreted as the single-digit 1).
+
+This node has two (the number of digits in the order word) children. The first child is at the position specified by the
+least significant digit of the order word, 0. The second child is specified by the second least significant digit, 1.
+
+Suppose we then need to add a new child, using character `0x35` `5` and child `0x33B`. The node will change to `00000238
+0000013A 0000033B 00000000 00000000 00000000 41433500 00000026` and the pointer to it stays the same.
+
+offset|content|example
+---|---|---
+00 - 03|child pointer 0| 00000238
+04 - 07|child pointer 1| 0000013A
+08 - 0B|child pointer 2| 0000033B
+0C - 17|unused|
+18     |character 0| 41 A
+19     |character 1| 43 C
+1A     |character 2| 35 5
+1B - 1D|unused|
+1E - 1F|order word| 0026 = 102 (base 6)
+
+This node has three (the number of digits in the order word) children. The first child is at the position specified by
+the least significant digit of the order word, 2. The second child is specified by the second least significant digit,
+0, and the last child is specified by the leading digit, 1.
+
+Note that because of the ordering of the two children in the smallest sparse node, the digit 0 is always preceded by a
+more-significant 1 in the order word in base 6. Therefore the leading digit of the order word can never be 0 and thus we
+cannot miscount the number of children.
+
+The addition of children can continue until we have 6, for example `00000238 0000013A 0000033B 0000035C 0000037A
+0000041B 41433542 50338129` (pointer `0x25E`) for
+
+offset|content|example
+---|---|---
+00 - 03|child pointer 0| 00000238
+04 - 07|child pointer 1| 0000013A
+08 - 0B|child pointer 2| 0000033B
+0C - 0F|child pointer 3| 0000035C
+10 - 13|child pointer 4| 0000037A
+14 - 17|child pointer 5| 0000041B
+18     |character 0| 41 A
+19     |character 1| 43 C
+1A     |character 2| 35 5
+1B     |character 3| 42 B
+1C     |character 4| 50 P
+1D     |character 5| 33 3
+1E - 1F|order word| 8129 = 413025 (base 6)
+
+Beyond 6 children, a node needs to be converted to split.
+
+#### `Split` nodes - up to 256 children in multiple cells
+
+Split nodes are used to handle the nodes with a large number of children. We can only allocate cells of 32 bytes, thus
+we have to distribute the child transitions among cells in some way that is efficient for reading and updating. The
+method we chose is to construct a "mini-trie" with 2-3-3 bit transitions.
+
+A split node is identified by the `0x1C` offset. The starting cell of a split node has this layout:
+
+offset|content|
+---|---|
+00 - 0F|unused|
+10 - 13|mid-cell for leading 00|
+14 - 17|mid-cell for leading 01|
+18 - 1B|mid-cell for leading 10|
+1C - 1F|mid-cell for leading 11|
+
+(pointers to this node point to the `1C` line) and where each mid-cell contains:
+
+offset|content|
+---|---|
+00 - 03|end-cell for middle 000|
+04 - 07|end-cell for middle 001|
+08 - 0B|end-cell for middle 010|
+0C - 0F|end-cell for middle 011|
+10 - 13|end-cell for middle 100|
+14 - 17|end-cell for middle 101|
+18 - 1B|end-cell for middle 110|
+1C - 1F|end-cell for middle 111|
+
+and end-cell:
+
+offset|content|
+---|---|
+00 - 03|pointer to child for ending 000|
+04 - 07|pointer to child for ending 001|
+08 - 0B|pointer to child for ending 010|
+0C - 0F|pointer to child for ending 011|
+10 - 13|pointer to child for ending 100|
+14 - 17|pointer to child for ending 101|
+18 - 1B|pointer to child for ending 110|
+1C - 1F|pointer to child for ending 111|
+
+In any of the cell or pointer positions we can have `NONE`, meaning that such a child (or block of children) does not
+exist. At minimum, a split node occupies 3 cells (one leading, one mid and one end), and at maximum &mdash;
+`1 + 4 + 4*8 = 37` cells i.e. `1184` bytes. If we could allocate contiguous arrays, a full split node would use `1024`
+bytes, thus this splitting can add ~15% overhead. However, real data often has additional structure that this can make
+use of to avoid creating some of the blocks, e.g. if the trie encodes US-ASCII or UTF-encoded strings where some
+character ranges are not allowed at all, and others are prevalent. Another benefit is that to change a transition while
+preserving the previous state of the node for concurrent readers we have to only copy three blocks and not the entire
+range of children (applications of this will be given later).
+
+As an example, suppose we need to add a `0x51` `Q` transition to `0x455` to the 6-children sparse node from the previous
+section. This will generate the following structure:
+
+Leading cell (e.g. `0x500`-`0x51F` with pointer `0x51C`)
+
+offset|content|example
+---|---|---
+00 - 0F|unused|
+10 - 13|mid-cell for leading 00|00000520
+14 - 17|mid-cell for leading 01|00000560
+18 - 1B|mid-cell for leading 10|00000000 NONE
+1C - 1F|mid-cell for leading 11|00000000 NONE
+
+Mid cell `00` at `0x520`-`0x53F`:
+
+offset|content|example
+---|---|---
+00 - 03|end-cell for middle 000|00000000 NONE
+04 - 07|end-cell for middle 001|00000000 NONE
+08 - 0B|end-cell for middle 010|00000000 NONE
+0C - 0F|end-cell for middle 011|00000000 NONE
+10 - 13|end-cell for middle 100|00000000 NONE
+14 - 17|end-cell for middle 101|00000000 NONE
+18 - 1B|end-cell for middle 110|00000540
+1C - 1F|end-cell for middle 111|00000000 NONE
+
+End cell `00 110` at `0x540`-`0x55F`:
+
+offset|content|example
+---|---|---
+00 - 03|pointer to child for ending 000|00000000 NONE
+04 - 07|pointer to child for ending 001|00000000 NONE
+08 - 0B|pointer to child for ending 010|00000000 NONE
+0C - 0F|pointer to child for ending 011|0000041B
+10 - 13|pointer to child for ending 100|00000000 NONE
+14 - 17|pointer to child for ending 101|0000033B
+18 - 1B|pointer to child for ending 110|00000000 NONE
+1C - 1F|pointer to child for ending 111|00000000 NONE
+
+Mid cell `01` at `0x560`-`0x57F`:
+
+offset|content|example
+---|---|---
+00 - 03|end-cell for middle 000|00000580
+04 - 07|end-cell for middle 001|00000000 NONE
+08 - 0B|end-cell for middle 010|000005A0
+0C - 0F|end-cell for middle 011|00000000 NONE
+10 - 13|end-cell for middle 100|00000000 NONE
+14 - 17|end-cell for middle 101|00000000 NONE
+18 - 1B|end-cell for middle 110|00000000 NONE
+1C - 1F|end-cell for middle 111|00000000 NONE
+
+End cell `01 000` at `0x580`-`0x59F`:
+
+offset|content|example
+---|---|---
+00 - 03|pointer to child for ending 000|00000000 NONE
+04 - 07|pointer to child for ending 001|00000238
+08 - 0B|pointer to child for ending 010|0000035C
+0C - 0F|pointer to child for ending 011|0000013A
+10 - 13|pointer to child for ending 100|00000000 NONE
+14 - 17|pointer to child for ending 101|00000000 NONE
+18 - 1B|pointer to child for ending 110|00000000 NONE
+1C - 1F|pointer to child for ending 111|00000000 NONE
+
+End cell `01 010` at `0x5A0`-`0x5BF`:
+
+offset|content|example
+---|---|---
+00 - 03|pointer to child for ending 000|0000037A
+04 - 07|pointer to child for ending 001|00000455
+08 - 0B|pointer to child for ending 010|00000000 NONE
+0C - 0F|pointer to child for ending 011|00000000 NONE
+10 - 13|pointer to child for ending 100|00000000 NONE
+14 - 17|pointer to child for ending 101|00000000 NONE
+18 - 1B|pointer to child for ending 110|00000000 NONE
+1C - 1F|pointer to child for ending 111|00000000 NONE
+
+To find a child in this structure, we follow the transitions along the bits of the mini-trie. For example, for `0x42`
+`B` = `0b01000010` we start at `0x51C`, take the `01` pointer to `0x560`, then the `000` pointer to `0x580` and finally
+the `010` index to retrieve the node pointer `0x35C`. Note that the intermediate cells (dashed in the diagram) are not
+reachable with pointers, they only make sense as substructure of the split node.
+
+![graph](MemtableTrie.md.g3.svg)
+
+#### Content `Prefix`
+
+Prefix nodes are not nodes in themselves, but they add information to the node they lead to. Specifically, they
+encode an index in the content array, and a pointer to the node to which this content is attached. In anything other
+than the content, they are equivalent to the linked node &mdash; i.e. a prefix node pointer has the same children as
+the node it links to (another way to see this is that a content-carrying node is one that has an _ε_ transition to the
+linked node and no other features except added content). We do not allow more than one prefix to a node (i.e. prefix
+can't point to another prefix), and the child of a prefix node cannot be a leaf.
+
+There are two types of prefixes:
+- standalone, which has a full 32-bit pointer to the linked node
+- embedded, which occupies unused space in `Chain` or `Split` nodes and specifies the 5-bit offset within the same cell
+of the linked node
+
+Standalone prefixes have this layout:
+
+offset|content|example
+---|---|---
+00 - 03|content index|00000001
+04|standalone flag, 0xFF|FF
+05 - 1B|unused|
+1C - 1F|linked node pointer|0000025E
+
+and pointer offset `0x1F`. The sample values above will be the ones used to link a prefix node to our `Sparse`
+example, where a prefix cannot be embedded as all the bytes of the cell are in use.
+
+If we want to attach the same prefix to the `Split` example, we will place this
+
+offset|content|example
+---|---|---
+00 - 03|content index|00000001
+04|embedded offset within cell|1C
+05 - 1F|unused|
+
+_inside_ the leading split cell, with pointer `0x1F`. Since this is an embedded node, the augmented one resides within
+the same cell, and thus we need only 5 bits to encode the pointer (the other 27 are the same as the prefix's).
+The combined content of the cell at `0x500-0x51F` will then be `00000001 1C000000 00000000 00000000 00000520 00000560
+00000000 00000000`:
+
+offset|content|example
+---|---|---
+00 - 03|content index|00000001
+04|embedded offset within cell|1C
+05 - 0F|unused|
+10 - 13|mid-cell for leading 00|00000520
+14 - 17|mid-cell for leading 01|00000560
+18 - 1B|mid-cell for leading 10|00000000 NONE
+1C - 1F|mid-cell for leading 11|00000000 NONE
+
+Both `0x51C` and `0x51F` are valid pointers in this cell. The former refers to the plain split node, the latter to its
+content-augmented version. The only difference between the two is the result of a call to `Node.content()`.
+
+![graph](MemtableTrie.md.g4.svg)
+
+
+## Reading a trie
+
+`MemtableTrie` is mainly meant to be used as an implementation of `Trie`. As such, the main method of retrieval of
+information is via some selection (i.e. intersection) of a subtrie followed by a walk over the content in this
+subtrie. Straightforward methods for direct retrieval of data by key are also provided, but they are mainly for testing.
+
+The `Trie` interface relies on nodes keeping track of the state of iteration, so that it can be continued over result
+consumption or pauses to retrieve information asynchronously. `MemtableTrie` supports this by providing `Trie.Node`
+implementations (residing in `MemtableReadTrie.xxxNode`) with several special features to aid quick walks over the
+trie's content:
+
+- Like all `Trie.Node` descendants, the nodes are stateful and keep track of the parent chain, as well as the current
+ iteration position.
+- `Chain` nodes, which always have a single descendant, implement `getUniqueDescendant` so that walks can jump
+ straight to the chain's child instead of walking it one character at a time; this also applies on backtracking
+  &mdash; the walk will skip over the intermediate nodes and go directly to the chain's parent.
+- `Chain` and `Sparse` nodes return `Remaining.ONE` to iteration requests when the returned item is the last. This
+ helps with backtracking as it lets the walk know this node does not need to be visited on the backtracking path,
+ which can jump straight to the parent.
+
+As an example, suppose we want to walk the following trie:
+
+![graph](MemtableTrie.md.w1.svg)
+
+The classic walk descends (blue) on every character and backtracks (pink) to the parent, resulting in the following
+ walk:
+
+![graph](MemtableTrie.md.w2.svg)
+
+Making use of `getUniqueDescendant` skips the intermediate transitions in `Chain` nodes, and also avoids the
+backtracking to the start of the chain (as there are no further transitions to examine there), resulting in:
+
+![graph](MemtableTrie.md.w3.svg)
+
+Finally, taking advantage of the `Remaining.ONE` returned by the `Sparse` node after the last child has been listed
+lets the backtracking avoid returning to that node, simplifying the walk to:
+
+![graph](MemtableTrie.md.w4.svg)
+
+In addition to making the walk simpler, shortening the backtracking paths means a smaller walk state representation,
+which is quite helpful in keeping the garbage collection cost down.
+Technically, this is achieved by getting a child not with the current node in `parentLink`, but directly using
+the node's own `parentLink` (see `TrieValuesIterator.getChild` and `TrieIteratorWithKey.getChild`).
+
+## Mutation
+
+Mutation of `MemtableTrie` must be done by one thread only (for performance reasons we don't enforce it, the user must
+make sure that's the case), but writes may be concurrent with multiple reads over the data that is being mutated. The
+trie is built to support this by making sure that any modification of a node is safe for any reader that is operating
+concurrently.
+
+The main method for mutating a `MemtableTrie` is `apply`, which merges the structure of another `Trie` in.
+`MemtableTrie` also provides a simpler recursive method of modification, `putRecursive`, which creates a single
+`key -> value` mapping in the trie. We will describe the mutation process starting with a `putRecursive` example.
+
+### Adding a new key -> value mapping using `putRecursive`
+
+Suppose we want to insert the value `traverse` into the trie described in the previous paragraph. The recursive
+insertion process walks the trie to find corresponding existing nodes for the ones in the path to be inserted.
+When it has to leave the existing trie, because it has no entries for the path, the process continues using `NONE` as
+the trie node.
+
+![graph](MemtableTrie.md.m1.svg)
+
+When it reaches the end of the path, it needs to attach the value. We don't support content in intermediate nodes, so
+we expect the matching trie node to either be `NONE` or a leaf node. Here it's `NONE`, so we create an item in the
+content array, `contentArray[3]`, put the value in it, and thus form the leaf node `~3` (`0xFFFFFFFC`). The recursive
+process returns this to the previous step.
+
+The previous step must attach a child with the transition `e` to the node `NONE`. Since this is a new node, we do this
+by creating a new `Chain` node at address `0x0BB` mapping `e` to `~3` and return that. For the node above, we again
+need to attach a child to `NONE`, but this time the child is a `Chain` node, so we can do this by expanding it, i.e.
+writing the new character at the address just before the child pointer, and returning that address (note that the
+child chain node is newly created, so we can't be overwriting any existing data there). We can do this several more
+times.
+
+![graph](MemtableTrie.md.m2.svg)
+
+(<span style="color:lightblue">Light blue</span> specifies the descent path, <span style="color:pink">pink</span>
+the values returned, <span style="color:blue">blue</span> stands for newly-created nodes and links, and
+<span style="color:lightgray">light gray</span> for obsoleted nodes and links.)
+
+In the next step we must attach the child `0x0B8` with transition `v` to the existing `Chain` node `0x018`. This is a
+different transition from the one that node already has, so the change cannot be accommodated by a node of type `Chain`,
+thus we need to copy this into a new `Sparse` node `0x0DE` with two children, the existing `c -> 0x019` and the new
+`v -> 0x0B8` and return `0x0DE` to the parent step.
+
+The parent step must then change its existing pointer for the character `a` from `0x018` to `0x0DE` which it can do in
+place by writing the new value in its pointer cell for `a`. This is the attachment point for the newly created
+substructure, i.e. before this, the new nodes were not reachable, and now become reachable; before this, the node
+`0x018` was reachable, and now becomes unreachable. The attachment is done by a volatile write, to enforce a
+happens-before relationship that makes sure that all the new substructure (all written by this thread) is fully
+readable by all readers who pass through the new pointer (which is the only way they can reach it). The same
+happens-before also ensures that any new readers cannot reach the obsoleted nodes (there may be existing reader threads
+that are already in them).
+
+It can then return its address `0x07E` unchanged up, and no changes need to be done in any of the remaining steps. The
+process finishes in a new value for `root`, which in this case remains unchanged.
+
+![graph](MemtableTrie.md.m3.svg)
+
+The process created a few new nodes (in blue), and made one obsolete (in grey). What concurrent readers can see depends
+on where they are at the time the attachment point write is done. Forward traversals, if they are in the path below
+`0x07E`, will continue working with the obsoleted data and will not see any of the new changes. If they are above
+`0x07E`, they will see the updated content. If they are _at_ the `0x07E` node, they may see either, depending on the
+time they read the pointer for `a`. Reverse traversals that happen to be in the region to the right of the new nodes
+_will_ see the updated content, as they will read the pointer after it has been updated.
+
+In any case, the obsolete paths remain correct and usable for any thread that has already reached them, and the new
+paths are correct and usable from the moment they become reachable.
+
+Note that if we perform multiple mutations in sequence, and a reader happens to be stalled between them (in iteration
+order), such a reader may see only the mutation that is ahead of it _in iteration order_, which is not necessarily the
+mutation that happened first. For the example above, if we also inserted `trespass`, a reader thread that was paused
+at `0x018` in a forward traversal and wakes up after both insertions have completed will see `trespass`, but _will not_
+see `traverse` even though it was inserted earlier. This inconsistency is often undesirable.
+
+### In-place modifications
+
+When the backtracking process returns with a new mapping, there are several cases when we can apply a change in place
+(creating an attachment point for the new path). We will explain these in detail, as it is important to understand what
+exactly happens from concurrent readers' point of view in all of them.
+
+Note that if a modification cannot be done in place, we copy the content to a new node. The copied node is always
+unreachable and there will always be an attachment point that makes it reachable somewhere in the parent chain.
+
+#### Changing the child pointer of the last `Chain` node in a chain
+
+This happens when the existing transition matches the transition of the new character, but the pointer is different,
+and only applies to `Chain` nodes whose offset is `0x1B`. In this case the child pointer is written at offset `0x1C`,
+and we can put in the new value by performing a volatile write.
+
+For example, updating `N -> 0x39C` is accomplished by making the volatile write:
+
+offset|content|before|after
+---|---|---|---
+00-1A|irrelevant||
+1B|character|N|N
+1C-1F|pointer|0000031E|_**0000039C**_
+
+(Here and below normal writes are in bold and volatile writes in bold italic.)
+
+Readers have to read the pointer to reach the child (old or new), so this achieves the happens-before guarantees we
+seek. Readers either see the old value (where none of the branch's data has been modified in any way), or the new value
+(where the happens-before guarantees all writes creating the attached substructure are fully visible).
+
+Note that if the node is not the last in the chain, the pointer is implicit and we cannot change it. Thus we have
+to copy, i.e. create a new node, which in this case will also be a `Chain` node, because there is nothing else in the
+original node that needs to be preserved (the only existing transition is replaced by the update).
+
+#### Changing the child pointer of a `Sparse` or `Split` node
+
+Similarly to above, in this case the transition matches an existing one, and thus we already have a 4-byte location
+where the pointer to the old child is written, and we can update it by doing a volatile write.
+
+For example, updating `C -> 0x51E` in a sparse node is done by:
+
+offset|content|before|after
+---|---|---|---
+00 - 03|child pointer 0| 00000238|00000238
+04 - 07|child pointer 1| 0000013A|_**0000051E**_
+08 - 0B|child pointer 2| 0000033B|0000033B
+0C - 17|unused|
+18     |character 0| 41 A|41 A
+19     |character 1| 43 C|43 C
+1A     |character 2| 35 5|35 5
+1B - 1D|unused|
+1E - 1F|order word| 0026 = 102 (base 6)|0026 = 102 (base 6)
+
+
+#### Adding a new child to `Split`
+
+If we already have the substructure that leads to the pointer for the new transition (i.e. a mid- and end-cell for the
+transition's first 2-3 bits already exists), the situation is as above, where the existing pointer is `NONE`, and we can
+simply perform a volatile write.
+
+If an end-cell mapping does not exist, we allocate a new cleared cell (so that all pointers are `NONE`), write the new
+pointer at its position using a non-volatile write, and then create a mapping to this end-cell in the mid cell by
+volatile writing its pointer over the `NONE` in the correct offset. Similarly, if there's no mid-cell either, we create
+empty end-cell and mid-cell, write pointer in end-cell and mapping in mid-cell non-volatile, and write the mapping in
+the leading cell volatile.
+
+In any of these cases, readers have to pass through the volatile update to reach any of the new content.
+
+For example, adding `x -> 0x71A` (`x` is `0x78` or `0b01111000`) to the split node example needs a new end cell for
+`01 111` (for example at `0x720-0x73F`) (these writes can be non-volatile):
+
+offset|content|before|after
+---|---|---|---
+00 - 03|pointer to child for ending 000|n/a|**0000071A**
+04 - 07|pointer to child for ending 001|n/a|**00000000** NONE
+08 - 0B|pointer to child for ending 010|n/a|**00000000** NONE
+0C - 0F|pointer to child for ending 011|n/a|**00000000** NONE
+10 - 13|pointer to child for ending 100|n/a|**00000000** NONE
+14 - 17|pointer to child for ending 101|n/a|**00000000** NONE
+18 - 1B|pointer to child for ending 110|n/a|**00000000** NONE
+1C - 1F|pointer to child for ending 111|n/a|**00000000** NONE
+
+and this volatile write to the mid cell `0x560`:
+
+offset|content|before|after
+---|---|---|---
+00 - 03|end-cell for middle 000|00000580|00000580
+04 - 07|end-cell for middle 001|00000000 NONE|00000000 NONE
+08 - 0B|end-cell for middle 010|000005A0|000005A0
+0C - 0F|end-cell for middle 011|00000000 NONE|00000000 NONE
+10 - 13|end-cell for middle 100|00000000 NONE|00000000 NONE
+14 - 17|end-cell for middle 101|00000000 NONE|00000000 NONE
+18 - 1B|end-cell for middle 110|00000000 NONE|00000000 NONE
+1C - 1F|end-cell for middle 111|00000000 NONE|_**00000720**_
+
+The start cell, and the other mid and end cells remain unchanged.
+
+#### Adding a new child to `Sparse` with 5 or fewer existing children
+
+The need to maintain a correct view for concurrent readers without blocking is the reason why we cannot keep the
+children in a `Sparse` cell ordered (if we insert ordered, we open ourselves to readers possibly seeing the same pointer
+or child twice, or even going back in the iteration order). Instead we always add new characters and pointers at the
+next free position and then update the order word to include it. More precisely:
+- we find the smallest index `i < 6` for which the pointer is `NONE`
+- we write the transition character at position `i`
+- we write the pointer at position `i` over `NONE` volatile
+- we compile a new order word by inserting `i` after all indexes with greater transition and before all indexes with
+  smaller in the base-6 representation (e.g. to insert `j` in sparse node that has `a@4 f@0 g@2 k@1 q@3` we change the
+  order word `31204` to `315204`) and write it volatile
+
+This ensures that any reader that iterates over children (i.e. one that needs the order word) will have to pass through
+the volatile order word update and will see the correct character and pointer values. Readers who have read the order
+word at some earlier time will not include the new pointer or character in the iteration.
+
+Readers that directly select the child for a given transition must read the pointer for each index _before_ reading the
+character to ensure they can see the properly updated value (otherwise they could match e.g. a `00` transition to the
+new branch because the real character was not written when they read the byte, but the pointer was when they got to it)
+and stop searching when they find a `NONE` pointer.
+
+For example, adding `x -> 0x71A` to the sparse example above is done by:
+
+offset|content|before|after
+---|---|---|---
+00 - 03|child pointer 0| 00000238|00000238
+04 - 07|child pointer 1| 0000051E|0000051E
+08 - 0B|child pointer 2| 0000033B|0000033B
+0C - 0F|child pointer 3|00000000 NONE|_**0000071A**_
+10 - 17|unused|NONE|NONE
+18     |character 0| 41 A|41 A
+19     |character 1| 43 C|43 C
+1A     |character 2| 35 5|35 5
+1B     |character 3| any |**78** x
+1C - 1D|unused|00 00|00 00
+1E - 1F|order word|0026 = 102 (base 6)|_**02AE**_ = 3102 (base 6)
+
+where we first write the character, then volatile write the pointer, and finally the order word.
+
+#### Changing the root pointer
+
+If an update propagates with copying modifications all the way to the root, we must update the root pointer. The latter
+is a volatile variable, so this also enforces the happens-before relationship we need.
+
+### Merging a branch using `apply`
+
+This is a generalization of the mutation procedure above, which applies to more complex branches, where each node may
+potentially need multiple updates to attach more than one child. The process proceeds as above; instead of keeping the
+backtrack information in the call stack, we use the `Node.parentLink` pointers to point to `ApplyState` objects for
+each node, which point to
+- `mutationNode`, the node in the mutation trie, which contains a pointer up in its `parentLink`
+- `existingNode`, the corresponding pointer in the memtable trie
+- `updatedNode`, the current corresponding pointer in the memtable trie, which may be different from the above if the
+  mutation node is branching and one or more of its children have been already added
+
+When we descend, we follow the transitions in the memtable trie corresponding to the ones from an iteration over the
+structure of the mutation trie to obtain the `existingNode` pointers, and initialize `updatedNode` to the same. When the
+iteration processes a child, we apply the update to the node, which may happen in place, or may require copying
+&mdash; in the latter case `updatedNode` will change to the new value. Note that if `updatedNode` was different from
+the original `existingNode`, it was pointing to an unreachable copied node which will remain unreachable as we will only
+attach the newer version.
+
+After all modifications coming as the result of application of child branches have been applied, we have an
+`updatedNode` that reflects all. As we ascend we apply that new value to the parent's `updatedNode`.
+
+For example (adding a trie containing "traverse, truck" to the "tractor, tree, trie" one):
+
+![graph](MemtableTrie.md.a1.svg)
+
+In this diagram `existingNode`s are the ones reached through the light blue arrows during the descent phase (e.g.
+`0x018` for the `ApplyState` at `tra`, or `NONE` for `tru`), and `updatedNode`s are the ones the ascent (pink arrows)
+returns with (e.g. `0x0DE` and `0x0FA` for the respective states).
+
+During this process, readers can see any modifications made in place (each in-place modification is an attachment point
+which makes part of the new nodes reachable). The update mechanism above makes sure both that the state before the
+update is preserved, and that the state after the update is fully visible for readers that can reach it, but it does not
+guarantee that the mutation is seen atomically by the readers if it contains multiple separate branches.
+It is possible for a reader to see only a part of the update, for example:
+- a reading thread racing with the mutator can iterate over `traverse` but finish iterating before the mutator
+manages to attach `truck`;
+- a reading thread that iterated to `tree` (while `traverse` was not yet attached) and paused, will see `truck` if the
+mutating thread applies the update during the pause.
+
+### Handling prefix nodes
+
+The descriptions above were given without prefix nodes. Handling prefixes is just a little complication over the update
+process where we must augment `updatedNode` with any applicable content before applying the change to the parent.
+To do this we expand the state tracked in `ApplyState` a little to:
+- `existingPreContentNode` which points to the existing node including any prefix
+- `existingPostContentNode` which is obtained by skipping over the prefix (for simplicity we also treat leaf
+  nodes like a prefix with no child) and is the base for all child updates (i.e. it takes the role of
+  `existingNode` in the descriptions above)
+- `updatedPostContentNode` which is the node as changed/copied after children modifications are applied
+
+and we then apply the content (from existing prefix or newly introduced) to compile an `updatedPreContentNode` which
+the parent is made to link to (which is equal to `updatedPostContentNode` if no content applies).
+
+("Pre-" and "post-" refer to descent/iteration order, not to construction order; e.g. `updatedPreContentNode` is
+constructed after `updatedPostContentNode` but links above it in the trie.)
+
+As an example, consider the process of adding `trees` to our sample trie:
+
+![graph](MemtableTrie.md.p1.svg)
+
+When descending at `tree` we set `existingPreContentNode = ~1` and `existingPostContentNode = NONE`. Ascending back
+to add the child `~3`, we add a child to `NONE` and get `updatedPostContentNode = 0x0BB`. To then apply the existing
+content, we create the embedded prefix node `updatedPreContentNode = 0x0BF` with `contentIndex = 1` and pass that on to
+the recursion.
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.a1.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.a1.svg
new file mode 100644
index 000000000000..4237fb599a79
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.a1.svg
@@ -0,0 +1,599 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"; color = "lightgrey"; fontcolor = lightgray]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tra [label = " a"; color = "lightgrey"; fontcolor = lightgray]
+    tra -> trac [label = " c"; color = "lightgrey"; fontcolor = lightgray]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+        node [color = "blue"; fontcolor="blue"]
+
+        start -> root
+
+        root -> t [label = " t"]
+        t -> tr [label = " r"]
+        tr -> tra [label = " a"]
+        tra -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+
+        tra2 [label = "Sparse\n0x0DE"]
+        trav [label = "Chain\n0x0B8"]
+        trave [label = "0x0B9"]
+        traver [label = "0x0BA"]
+        travers [label = "0x0BB"]
+        traverse [label = "contentArray[3]"]
+
+        tr -> tru [label = " u"]
+        tru -> truc [label = " c"]
+        truc -> truck [label = " k"]
+
+        tru [label = "Chain\n0x0FA"]
+        truc [label = "0x0FB"]
+        truck [label = "contentArray[4]"]
+    }
+
+    {rank=same tra -> tra2 -> tre -> tri -> tru [style=invis]}
+    {rank=same trac -> trav -> tree -> trie -> truc [style=invis]}
+
+    {
+        edge [color = "blue"]
+        tr -> tra2 [label = " a"]
+        tra2 -> trac [label = " c"]
+        tra2 -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+        tr -> tru [label = " u"]
+        tru -> truc [label = " c"]
+        truc -> truck [label = " k"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="red"; arrowhead="vee"; constrain="false"]
+
+        traverse -> travers [label = " ~3"]
+        travers -> traver [label = "0x0BB"]
+        traver -> trave [label = "0x0BA"]
+        trave -> trav [label = "0x0B9"]
+        trav -> tra2 [label = "0x0B8"]
+        tra2 -> tr [label = "0x0DE"]
+        tr -> t [label = "0x07E"]
+        t -> root [label = "0x09B"]
+        root -> start [label = "0x09A"]
+
+        truck -> truc [label = "~4"]
+        truc -> tru [label = "0x0FB"]
+        tru -> tr [label = "0x0FA"]
+    }
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="757pt" height="846pt"
+     viewBox="0.00 0.00 757.47 845.73" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 841.7251)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-841.7251 753.4738,-841.7251 753.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="357.2369" cy="-808.3095" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="357.2369" y="-812.5095" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="357.2369" y="-795.7095" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="490.2369" cy="-808.3095" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="490.2369" y="-804.1095" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- root&#45;&gt;start -->
+        <g id="edge51" class="edge">
+            <title>root&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M390.7984,-808.3095C404.3037,-808.3095 420.2339,-808.3095 435.3398,-808.3095"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="445.6261,-808.3095 435.6262,-812.8096 440.6261,-808.3095 435.6261,-808.3096 435.6261,-808.3096 435.6261,-808.3096 440.6261,-808.3095 435.6261,-803.8096 445.6261,-808.3095 445.6261,-808.3095"/>
+            <text text-anchor="middle" x="418.2509" y="-815.5095" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09A</text>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="357.2369" cy="-708.0939" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="357.2369" y="-703.8939" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge2" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M337.9564,-784.2612C330.7393,-772.3485 325.2768,-757.6462 329.8463,-744.0939 331.1636,-740.1869 333.0926,-736.3684 335.3269,-732.7663"/>
+            <polygon fill="#000000" stroke="#000000" points="338.3844,-734.5068 341.3357,-724.3313 332.683,-730.4454 338.3844,-734.5068"/>
+            <text text-anchor="middle" x="333.9322" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge14" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M357.2369,-778.7835C357.2369,-765.2984 357.2369,-749.4062 357.2369,-736.1092"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="357.2369,-726.0972 361.737,-736.0971 357.2369,-731.0972 357.237,-736.0972 357.237,-736.0972 357.237,-736.0972 357.2369,-731.0972 352.737,-736.0972 357.2369,-726.0972 357.2369,-726.0972"/>
+            <text text-anchor="middle" x="360.9322" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge13" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M446.4042,-805.6333C434.2938,-805.1818 421.1479,-804.9584 409.0184,-805.3095 406.3264,-805.3874 403.5568,-805.4881 400.7643,-805.6051"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="390.6504,-806.089 400.4239,-801.1161 395.6447,-805.85 400.639,-805.611 400.639,-805.611 400.639,-805.611 395.6447,-805.85 400.8541,-810.1058 390.6504,-806.089 390.6504,-806.089"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="74.2369" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="74.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge6" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M74.0297,-177.2006C73.8934,-165.0949 73.7122,-149.0076 73.5575,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="77.0529,-134.8319 73.4404,-124.872 70.0533,-134.9108 77.0529,-134.8319"/>
+            <text text-anchor="middle" x="78.3172" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="78.2369" cy="-284.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="78.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge5" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M77.4081,-266.0006C76.8628,-253.8949 76.1381,-237.8076 75.5192,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="78.9975,-223.5044 75.0509,-213.672 72.0046,-223.8194 78.9975,-223.5044"/>
+            <text text-anchor="middle" x="82.4869" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="94.2369" cy="-384.6156" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="94.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge4" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M91.3022,-366.2345C88.9115,-351.2603 85.4955,-329.8643 82.756,-312.7055"/>
+            <polygon fill="#000000" stroke="#000000" points="86.1858,-311.9877 81.1529,-302.6646 79.2734,-313.0913 86.1858,-311.9877"/>
+            <text text-anchor="middle" x="89.9322" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- trav -->
+        <g id="node14" class="node">
+            <title>trav</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="202.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#0000ff">Chain</text>
+            <text text-anchor="middle" x="202.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B8</text>
+        </g>
+        <!-- trac&#45;&gt;trav -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#d3d3d3" cx="133.2369" cy="-496.2469" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="133.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#d3d3d3">Chain</text>
+            <text text-anchor="middle" x="133.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#d3d3d3">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge8" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#d3d3d3" d="M114.5188,-470.6739C110.3016,-463.8622 106.2666,-456.3098 103.5223,-448.8313 99.3758,-437.5318 97.0731,-424.4776 95.7973,-413.1021"/>
+            <polygon fill="#d3d3d3" stroke="#d3d3d3" points="99.2538,-412.4638 94.8622,-402.822 92.2826,-413.098 99.2538,-412.4638"/>
+            <text text-anchor="middle" x="109.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#d3d3d3"> c</text>
+        </g>
+        <!-- tra&#45;&gt;trav -->
+        <g id="edge17" class="edge">
+            <title>tra&#45;&gt;trav</title>
+            <path fill="none" stroke="#add8e6" d="M130.7527,-466.6197C131.0399,-455.1889 132.9734,-442.3975 138.7369,-432.0313 144.2651,-422.0881 152.8469,-413.6214 161.933,-406.7148"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="170.346,-400.8129 164.7439,-410.2398 166.2527,-403.6844 162.1595,-406.5559 162.1595,-406.5559 162.1595,-406.5559 166.2527,-403.6844 159.5751,-402.872 170.346,-400.8129 170.346,-400.8129"/>
+            <text text-anchor="middle" x="144.4869" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> v</text>
+        </g>
+        <!-- tra2 -->
+        <g id="node19" class="node">
+            <title>tra2</title>
+            <ellipse fill="none" stroke="#0000ff" cx="244.2369" cy="-496.2469" rx="39.2145" ry="29.3315"/>
+            <text text-anchor="middle" x="244.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#0000ff">Sparse</text>
+            <text text-anchor="middle" x="244.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0DE</text>
+        </g>
+        <!-- tra&#45;&gt;tra2 -->
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="348.2369" cy="-384.6156" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="348.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="530.2369" cy="-384.6156" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="530.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="357.2369" cy="-496.2469" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="357.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="357.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge10" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#000000" d="M354.8487,-466.6249C353.5073,-449.9873 351.8422,-429.334 350.5186,-412.9163"/>
+            <polygon fill="#000000" stroke="#000000" points="353.9973,-412.5103 349.7049,-402.8239 347.0199,-413.0729 353.9973,-412.5103"/>
+            <text text-anchor="middle" x="359.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="468.2369" cy="-496.2469" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="468.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="468.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tre&#45;&gt;tri -->
+        <!-- truc -->
+        <g id="node21" class="node">
+            <title>truc</title>
+            <ellipse fill="none" stroke="#0000ff" cx="676.2369" cy="-384.6156" rx="37.1616" ry="18"/>
+            <text text-anchor="middle" x="676.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0FB</text>
+        </g>
+        <!-- trie&#45;&gt;truc -->
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge12" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M483.2454,-469.2241C492.9216,-451.8019 505.4604,-429.2258 515.1517,-411.7765"/>
+            <polygon fill="#000000" stroke="#000000" points="518.4109,-413.1168 520.2066,-402.6752 512.2914,-409.718 518.4109,-413.1168"/>
+            <text text-anchor="middle" x="507.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tru -->
+        <g id="node20" class="node">
+            <title>tru</title>
+            <ellipse fill="none" stroke="#0000ff" cx="580.2369" cy="-496.2469" rx="38.626" ry="29.3315"/>
+            <text text-anchor="middle" x="580.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#0000ff">Chain</text>
+            <text text-anchor="middle" x="580.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0FA</text>
+        </g>
+        <!-- tri&#45;&gt;tru -->
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="357.2369" cy="-607.8782" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="357.2369" y="-612.0782" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="357.2369" y="-595.2782" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge7" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#d3d3d3" d="M321.8872,-597.2153C295.7144,-588.7379 259.543,-575.7815 229.5223,-560.4626 208.2971,-549.6318 186.0738,-535.0548 168.3323,-522.5268"/>
+            <polygon fill="#d3d3d3" stroke="#d3d3d3" points="170.1754,-519.5418 160.0068,-516.5667 166.1007,-525.2336 170.1754,-519.5418"/>
+            <text text-anchor="middle" x="235.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#d3d3d3"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge16" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M319.8208,-603.7035C283.8354,-598.2919 229.0725,-586.2816 188.5223,-560.4626 176.3594,-552.7182 165.4058,-541.4906 156.5184,-530.5822"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="150.1392,-522.3658 159.8263,-527.5049 153.2055,-526.3152 156.2719,-530.2646 156.2719,-530.2646 156.2719,-530.2646 153.2055,-526.3152 152.7174,-533.0243 150.1392,-522.3658 150.1392,-522.3658"/>
+            <text text-anchor="middle" x="194.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge9" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M357.2369,-578.2562C357.2369,-565.3881 357.2369,-550.1179 357.2369,-536.2631"/>
+            <polygon fill="#000000" stroke="#000000" points="360.737,-536.0074 357.2369,-526.0074 353.737,-536.0074 360.737,-536.0074"/>
+            <text text-anchor="middle" x="362.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge11" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M380.5061,-584.4766C397.0825,-567.806 419.5291,-545.2318 437.5674,-527.0908"/>
+            <polygon fill="#000000" stroke="#000000" points="440.4292,-529.1767 444.9983,-519.6177 435.4654,-524.241 440.4292,-529.1767"/>
+            <text text-anchor="middle" x="423.9322" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- tr&#45;&gt;t -->
+        <g id="edge49" class="edge">
+            <title>tr&#45;&gt;t</title>
+            <path fill="none" stroke="#ffc0cb" d="M355.1399,-637.287C354.605,-648.2159 354.2816,-660.7205 354.6843,-672.0939 354.7703,-674.5242 354.889,-677.0451 355.0283,-679.5733"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="355.6875,-689.7931 350.5531,-680.1035 355.3656,-684.8034 355.0438,-679.8138 355.0438,-679.8138 355.0438,-679.8138 355.3656,-684.8034 359.5344,-679.5241 355.6875,-689.7931 355.6875,-689.7931"/>
+            <text text-anchor="middle" x="373.5132" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra2 -->
+        <g id="edge33" class="edge">
+            <title>tr&#45;&gt;tra2</title>
+            <path fill="none" stroke="#0000ff" d="M333.9323,-584.2913C321.8427,-572.1071 306.8029,-557.03 293.2369,-543.6626 287.6045,-538.1127 281.5754,-532.2309 275.74,-526.5682"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="278.1153,-523.9962 268.4966,-519.5541 273.2458,-529.0249 278.1153,-523.9962"/>
+            <text text-anchor="middle" x="314.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tru -->
+        <g id="edge22" class="edge">
+            <title>tr&#45;&gt;tru</title>
+            <path fill="none" stroke="#add8e6" d="M389.1874,-591.8842C428.4573,-572.2261 495.1969,-538.817 538.4895,-517.1452"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="547.6205,-512.5743 540.6927,-521.0747 543.1494,-514.8125 538.6783,-517.0507 538.6783,-517.0507 538.6783,-517.0507 543.1494,-514.8125 536.664,-513.0267 547.6205,-512.5743 547.6205,-512.5743"/>
+            <text text-anchor="middle" x="486.4869" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> u</text>
+        </g>
+        <!-- tr&#45;&gt;tru -->
+        <g id="edge40" class="edge">
+            <title>tr&#45;&gt;tru</title>
+            <path fill="none" stroke="#0000ff" d="M393.3967,-599.0169C422.2538,-591.1383 463.1788,-578.126 496.2369,-560.4626 514.2689,-550.8278 532.5102,-537.4219 547.3655,-525.3499"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="549.7845,-527.8908 555.2473,-518.8128 545.3158,-522.5028 549.7845,-527.8908"/>
+            <text text-anchor="middle" x="525.4869" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> u</text>
+        </g>
+        <!-- t&#45;&gt;root -->
+        <g id="edge50" class="edge">
+            <title>t&#45;&gt;root</title>
+            <path fill="none" stroke="#ffc0cb" d="M362.0886,-726.2786C363.3802,-731.9426 364.5942,-738.2388 365.2369,-744.0939 366.1247,-752.1817 365.87,-760.8327 365.048,-769.081"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="363.7469,-779.2407 360.5537,-768.7501 364.3821,-774.2812 365.0173,-769.3217 365.0173,-769.3217 365.0173,-769.3217 364.3821,-774.2812 369.4808,-769.8934 363.7469,-779.2407 363.7469,-779.2407"/>
+            <text text-anchor="middle" x="384.9052" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09B</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge3" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M378.7729,-693.2308C385.3244,-687.4682 391.7179,-680.2751 395.2369,-672.0939 398.1872,-665.2348 397.7115,-662.3385 395.2369,-655.2939 393.2659,-649.6828 390.3202,-644.2436 386.9247,-639.1727"/>
+            <polygon fill="#000000" stroke="#000000" points="389.6469,-636.9677 380.8782,-631.0211 384.0247,-641.138 389.6469,-636.9677"/>
+            <text text-anchor="middle" x="402.3172" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge15" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M340.324,-692.0138C335.3995,-686.2543 330.6543,-679.3722 328.0763,-672.0939 324.4827,-661.9481 326.7954,-651.1983 331.382,-641.4556"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="336.3576,-632.4758 335.4471,-643.4038 333.9342,-636.8493 331.5109,-641.2228 331.5109,-641.2228 331.5109,-641.2228 333.9342,-636.8493 327.5747,-639.0418 336.3576,-632.4758 336.3576,-632.4758"/>
+            <text text-anchor="middle" x="332.3172" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trav&#45;&gt;tree -->
+        <!-- trave -->
+        <g id="node15" class="node">
+            <title>trave</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-284.4" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B9</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge18" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#add8e6" d="M181.4692,-359.4476C176.9833,-352.6281 172.8815,-344.9707 170.5223,-337.2 168.3531,-330.0554 167.8843,-327.3851 170.5223,-320.4 172.1555,-316.0753 174.5712,-311.9338 177.3533,-308.0992"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="183.8676,-300.2431 180.9485,-310.8134 180.6761,-304.0921 177.4845,-307.941 177.4845,-307.941 177.4845,-307.941 180.6761,-304.0921 174.0205,-305.0686 183.8676,-300.2431 183.8676,-300.2431"/>
+            <text text-anchor="middle" x="176.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge36" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#0000ff" d="M202.2369,-355.0897C202.2369,-341.6046 202.2369,-325.7123 202.2369,-312.4153"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="205.737,-312.4033 202.2369,-302.4033 198.737,-312.4034 205.737,-312.4033"/>
+            <text text-anchor="middle" x="207.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- trav&#45;&gt;tra2 -->
+        <g id="edge47" class="edge">
+            <title>trav&#45;&gt;tra2</title>
+            <path fill="none" stroke="#ffc0cb" d="M213.0163,-412.9388C215.4002,-419.2228 217.909,-425.8514 220.2369,-432.0313 223.4375,-440.5282 226.8619,-449.6636 230.0924,-458.3025"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="233.6352,-467.7845 225.9197,-459.992 231.8851,-463.1007 230.1351,-458.417 230.1351,-458.417 230.1351,-458.417 231.8851,-463.1007 234.3505,-456.8419 233.6352,-467.7845 233.6352,-467.7845"/>
+            <text text-anchor="middle" x="244.9052" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B8</text>
+        </g>
+        <!-- trave&#45;&gt;trav -->
+        <g id="edge46" class="edge">
+            <title>trave&#45;&gt;trav</title>
+            <path fill="none" stroke="#ffc0cb" d="M208.9024,-302.4722C210.6782,-308.1317 212.3487,-314.4535 213.2369,-320.4 214.4724,-328.6713 214.0766,-337.5213 212.8888,-345.926"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="211.1036,-355.888 208.4382,-345.2509 211.9856,-350.9664 212.8676,-346.0448 212.8676,-346.0448 212.8676,-346.0448 211.9856,-350.9664 217.297,-346.8386 211.1036,-355.888 211.1036,-355.888"/>
+            <text text-anchor="middle" x="232.9052" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B9</text>
+        </g>
+        <!-- traver -->
+        <g id="node16" class="node">
+            <title>traver</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-195.6" rx="38.8671" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BA</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge19" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#add8e6" d="M185.324,-268.32C180.3995,-262.5605 175.6543,-255.6784 173.0763,-248.4 170.5834,-241.3618 170.5834,-238.6382 173.0763,-231.6 174.4962,-227.5912 176.5735,-223.7026 178.9762,-220.0556"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="185.0164,-212.0415 182.5911,-222.7358 182.007,-216.0344 178.9975,-220.0273 178.9975,-220.0273 178.9975,-220.0273 182.007,-216.0344 175.4039,-217.3188 185.0164,-212.0415 185.0164,-212.0415"/>
+            <text text-anchor="middle" x="177.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge37" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#0000ff" d="M202.2369,-266.0006C202.2369,-253.8949 202.2369,-237.8076 202.2369,-224.0674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="205.737,-223.672 202.2369,-213.672 198.737,-223.6721 205.737,-223.672"/>
+            <text text-anchor="middle" x="206.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- traver&#45;&gt;trave -->
+        <g id="edge45" class="edge">
+            <title>traver&#45;&gt;trave</title>
+            <path fill="none" stroke="#ffc0cb" d="M207.6937,-213.7509C209.1467,-219.4136 210.5127,-225.7174 211.2369,-231.6 212.1492,-239.0107 212.1492,-240.9893 211.2369,-248.4 210.9201,-250.9736 210.4804,-253.6279 209.9643,-256.2738"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="207.6937,-266.2491 205.5255,-255.4997 208.8035,-261.3738 209.9132,-256.4985 209.9132,-256.4985 209.9132,-256.4985 208.8035,-261.3738 214.301,-257.4973 207.6937,-266.2491 207.6937,-266.2491"/>
+            <text text-anchor="middle" x="232.4585" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BA</text>
+        </g>
+        <!-- travers -->
+        <g id="node17" class="node">
+            <title>travers</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-106.8" rx="38.305" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BB</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge20" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#add8e6" d="M183.9854,-179.6658C174.6965,-169.5397 166.3137,-155.9116 171.2923,-142.8 172.9385,-138.4645 175.3723,-134.3169 178.1749,-130.4793"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="184.7367,-122.6213 181.7812,-133.1814 181.5319,-126.4592 178.3271,-130.2971 178.3271,-130.2971 178.3271,-130.2971 181.5319,-126.4592 174.873,-127.4128 184.7367,-122.6213 184.7367,-122.6213"/>
+            <text text-anchor="middle" x="176.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#0000ff"> s</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge38" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#0000ff" d="M202.4441,-177.2006C202.5804,-165.0949 202.7616,-149.0076 202.9163,-135.2674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="206.4205,-134.9108 203.0334,-124.872 199.4209,-134.8319 206.4205,-134.9108"/>
+            <text text-anchor="middle" x="207.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> s</text>
+        </g>
+        <!-- travers&#45;&gt;traver -->
+        <g id="edge44" class="edge">
+            <title>travers&#45;&gt;traver</title>
+            <path fill="none" stroke="#ffc0cb" d="M209.0666,-124.8376C211.6638,-135.0186 213.8193,-147.9908 212.2369,-159.6 211.8717,-162.2793 211.3602,-165.0407 210.7599,-167.7863"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="208.2983,-177.4866 206.3963,-166.6869 209.5282,-172.6402 210.758,-167.7938 210.758,-167.7938 210.758,-167.7938 209.5282,-172.6402 215.1198,-168.9007 208.2983,-177.4866 208.2983,-177.4866"/>
+            <text text-anchor="middle" x="233.0735" y="-147" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BB</text>
+        </g>
+        <!-- traverse -->
+        <g id="node18" class="node">
+            <title>traverse</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#0000ff">contentArray[3]</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge21" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#add8e6" d="M184.8676,-90.9569C179.5135,-85.2132 174.3474,-78.2805 171.5223,-70.8 168.8843,-63.8149 168.8843,-60.9851 171.5223,-54 172.8668,-50.4399 174.7415,-47.004 176.9144,-43.7612"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="183.203,-35.676 180.6155,-46.3323 180.1332,-39.6227 177.0634,-43.5695 177.0634,-43.5695 177.0634,-43.5695 180.1332,-39.6227 173.5114,-40.8067 183.203,-35.676 183.203,-35.676"/>
+            <text text-anchor="middle" x="177.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge39" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#0000ff" d="M203.2369,-88.4006C203.2369,-76.2949 203.2369,-60.2076 203.2369,-46.4674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="206.737,-46.072 203.2369,-36.072 199.737,-46.0721 206.737,-46.072"/>
+            <text text-anchor="middle" x="208.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- traverse&#45;&gt;travers -->
+        <g id="edge43" class="edge">
+            <title>traverse&#45;&gt;travers</title>
+            <path fill="none" stroke="#ffc0cb" d="M209.9024,-36.0722C211.6782,-41.7317 213.3487,-48.0535 214.2369,-54 215.3399,-61.3847 215.3399,-63.4153 214.2369,-70.8 213.8344,-73.4945 213.2713,-76.2661 212.6106,-79.0181"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="209.9024,-88.7278 208.2546,-77.8864 211.2458,-83.9116 212.5891,-79.0954 212.5891,-79.0954 212.5891,-79.0954 211.2458,-83.9116 216.9237,-80.3044 209.9024,-88.7278 209.9024,-88.7278"/>
+            <text text-anchor="middle" x="224.2732" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#ff0000"> ~3</text>
+        </g>
+        <!-- tra2&#45;&gt;trac -->
+        <g id="edge34" class="edge">
+            <title>tra2&#45;&gt;trac</title>
+            <path fill="none" stroke="#0000ff" d="M216.023,-475.2499C189.2207,-455.3034 149.1456,-425.4792 122.3699,-405.5525"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="124.2894,-402.6181 114.1775,-399.4556 120.1102,-408.2337 124.2894,-402.6181"/>
+            <text text-anchor="middle" x="183.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- tra2&#45;&gt;tre -->
+        <!-- tra2&#45;&gt;tr -->
+        <g id="edge48" class="edge">
+            <title>tra2&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M240.4944,-525.6305C240.4789,-537.419 242.498,-550.5344 249.5777,-560.4626 263.7679,-580.3622 288.2176,-592.0186 310.2338,-598.7899"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="320.1326,-601.5694 309.2883,-603.1984 315.3187,-600.2177 310.5049,-598.866 310.5049,-598.866 310.5049,-598.866 315.3187,-600.2177 311.7214,-594.5335 320.1326,-601.5694 320.1326,-601.5694"/>
+            <text text-anchor="middle" x="270.0665" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0DE</text>
+        </g>
+        <!-- tra2&#45;&gt;trav -->
+        <g id="edge35" class="edge">
+            <title>tra2&#45;&gt;trav</title>
+            <path fill="none" stroke="#0000ff" d="M222.659,-471.5823C217.635,-464.6387 212.8503,-456.7959 209.7369,-448.8313 206.7154,-441.1019 204.8136,-432.4806 203.6368,-424.1275"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="207.1115,-423.7045 202.5344,-414.1494 200.1538,-424.4733 207.1115,-423.7045"/>
+            <text text-anchor="middle" x="215.4869" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> v</text>
+        </g>
+        <!-- tru&#45;&gt;tr -->
+        <g id="edge54" class="edge">
+            <title>tru&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M566.5842,-524.088C558.9936,-536.8985 548.4041,-551.2608 535.2369,-560.4626 496.6598,-587.4217 443.613,-599.0546 405.481,-604.073"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="395.1457,-605.3203 404.5344,-599.6545 400.1097,-604.7212 405.0736,-604.1221 405.0736,-604.1221 405.0736,-604.1221 400.1097,-604.7212 405.6128,-608.5897 395.1457,-605.3203 395.1457,-605.3203"/>
+            <text text-anchor="middle" x="571.6815" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0FA</text>
+        </g>
+        <!-- tru&#45;&gt;truc -->
+        <g id="edge23" class="edge">
+            <title>tru&#45;&gt;truc</title>
+            <path fill="none" stroke="#add8e6" d="M584.906,-467.0117C587.8872,-455.247 592.7225,-442.1094 600.5223,-432.0313 610.5674,-419.0519 625.2451,-408.7011 639.0092,-401.0235"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="647.8644,-396.3635 641.1106,-405.0028 643.4397,-398.692 639.0149,-401.0205 639.0149,-401.0205 639.0149,-401.0205 643.4397,-398.692 636.9193,-397.0383 647.8644,-396.3635 647.8644,-396.3635"/>
+            <text text-anchor="middle" x="605.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> c</text>
+        </g>
+        <!-- tru&#45;&gt;truc -->
+        <g id="edge41" class="edge">
+            <title>tru&#45;&gt;truc</title>
+            <path fill="none" stroke="#0000ff" d="M601.5424,-471.4723C617.5177,-452.8958 639.2774,-427.5931 655.1898,-409.0897"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="657.9656,-411.2298 661.8323,-401.3657 652.6583,-406.6656 657.9656,-411.2298"/>
+            <text text-anchor="middle" x="638.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- truc&#45;&gt;tru -->
+        <g id="edge53" class="edge">
+            <title>truc&#45;&gt;tru</title>
+            <path fill="none" stroke="#ffc0cb" d="M670.9731,-402.6462C666.3263,-416.3625 658.5213,-435.1464 647.2369,-448.8313 639.6641,-458.015 629.8999,-466.2538 620.1806,-473.1865"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="611.7071,-478.9336 617.4572,-469.5962 615.8452,-476.127 619.9832,-473.3204 619.9832,-473.3204 619.9832,-473.3204 615.8452,-476.127 622.5091,-477.0446 611.7071,-478.9336 611.7071,-478.9336"/>
+            <text text-anchor="middle" x="676.2965" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0FB</text>
+        </g>
+        <!-- truck -->
+        <g id="node22" class="node">
+            <title>truck</title>
+            <ellipse fill="none" stroke="#0000ff" cx="676.2369" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="676.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#0000ff">contentArray[4]</text>
+        </g>
+        <!-- truc&#45;&gt;truck -->
+        <g id="edge24" class="edge">
+            <title>truc&#45;&gt;truck</title>
+            <path fill="none" stroke="#add8e6" d="M661.1247,-367.9416C654.3993,-359.4018 647.222,-348.4489 643.7369,-337.2 641.5272,-330.0678 641.0565,-327.369 643.7369,-320.4 645.1585,-316.7039 647.1547,-313.1507 649.4663,-309.8129"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="655.7148,-302.0002 652.9831,-312.6204 652.5918,-305.9049 649.4689,-309.8097 649.4689,-309.8097 649.4689,-309.8097 652.5918,-305.9049 645.9546,-306.999 655.7148,-302.0002 655.7148,-302.0002"/>
+            <text text-anchor="middle" x="648.4869" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> k</text>
+        </g>
+        <!-- truc&#45;&gt;truck -->
+        <g id="edge42" class="edge">
+            <title>truc&#45;&gt;truck</title>
+            <path fill="none" stroke="#0000ff" d="M676.2369,-366.2345C676.2369,-351.2603 676.2369,-329.8643 676.2369,-312.7055"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="679.737,-312.6645 676.2369,-302.6646 672.737,-312.6646 679.737,-312.6645"/>
+            <text text-anchor="middle" x="681.4869" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> k</text>
+        </g>
+        <!-- truck&#45;&gt;truc -->
+        <g id="edge52" class="edge">
+            <title>truck&#45;&gt;truc</title>
+            <path fill="none" stroke="#ffc0cb" d="M683.5061,-302.4275C685.4432,-308.0852 687.266,-314.4172 688.2369,-320.4 690.1842,-332.3997 688.3971,-345.6086 685.6614,-356.9103"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="682.9623,-366.6639 681.2924,-355.8259 684.2958,-361.845 685.6294,-357.0261 685.6294,-357.0261 685.6294,-357.0261 684.2958,-361.845 689.9664,-358.2263 682.9623,-366.6639 682.9623,-366.6639"/>
+            <text text-anchor="middle" x="695.5232" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#ff0000">~4</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g1.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g1.svg
new file mode 100644
index 000000000000..e43b324e2ba8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g1.svg
@@ -0,0 +1,76 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="154pt" height="310pt"
+     viewBox="0.00 0.00 154.47 310.40" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 306.4)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-306.4 150.4738,-306.4 150.4738,4 -4,4"/>
+        <!-- 0x13B -->
+        <g id="node1" class="node">
+            <title>0x13B</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x13B</text>
+        </g>
+        <!-- contentArray[0] -->
+        <g id="node2" class="node">
+            <title>contentArray[0]</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- 0x13B&#45;&gt;contentArray[0] -->
+        <g id="edge1" class="edge">
+            <title>0x13B&#45;&gt;contentArray[0]</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-88.4006C73.2369,-76.2949 73.2369,-60.2076 73.2369,-46.4674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-46.072 73.2369,-36.072 69.737,-46.0721 76.737,-46.072"/>
+            <text text-anchor="middle" x="81.7902" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;A</text>
+        </g>
+        <!-- 0x13A -->
+        <g id="node3" class="node">
+            <title>0x13A</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x13A</text>
+        </g>
+        <!-- 0x13A&#45;&gt;0x13B -->
+        <g id="edge2" class="edge">
+            <title>0x13A&#45;&gt;0x13B</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-177.2006C73.2369,-165.0949 73.2369,-149.0076 73.2369,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-134.872 73.2369,-124.872 69.737,-134.8721 76.737,-134.872"/>
+            <text text-anchor="middle" x="81.4052" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;H</text>
+        </g>
+        <!-- 0x139 -->
+        <g id="node4" class="node">
+            <title>0x139</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x139</text>
+        </g>
+        <!-- 0x139&#45;&gt;0x13A -->
+        <g id="edge3" class="edge">
+            <title>0x139&#45;&gt;0x13A</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-266.0006C73.2369,-253.8949 73.2369,-237.8076 73.2369,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-223.672 73.2369,-213.672 69.737,-223.6721 76.737,-223.672"/>
+            <text text-anchor="middle" x="81.4052" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;C</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g2.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g2.svg
new file mode 100644
index 000000000000..a5c7eed6097b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g2.svg
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+strict digraph G {
+  "0x139" [color=grey,fontcolor=grey];
+
+  "0x13B" -> "contentArray[0]" [label="  A"];
+  "0x13A" -> "0x13B" [label="  H"];
+  "0x139" -> "0x13A" [label="  C",color="grey",fontcolor="grey"];
+
+  "0x25E" -> "0x238" [label="  A"];
+  "0x25E" -> "0x13A" [label="  C"];
+//   "0x25E" -> "0x33B" [label="  5"];
+//   "0x25E" -> "0x35C" [label="  B"];
+//   "0x25E" -> "0x37A" [label="  P"];
+//   "0x25E" -> "0x41B" [label="  3"];
+}
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="207pt" height="310pt"
+     viewBox="0.00 0.00 207.49 310.40" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 306.4)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-306.4 203.4867,-306.4 203.4867,4 -4,4"/>
+        <!-- 0x139 -->
+        <g id="node1" class="node">
+            <title>0x139</title>
+            <ellipse fill="none" stroke="#c0c0c0" cx="73.2369" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#c0c0c0">0x139</text>
+        </g>
+        <!-- 0x13A -->
+        <g id="node4" class="node">
+            <title>0x13A</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x13A</text>
+        </g>
+        <!-- 0x139&#45;&gt;0x13A -->
+        <g id="edge3" class="edge">
+            <title>0x139&#45;&gt;0x13A</title>
+            <path fill="none" stroke="#c0c0c0" d="M73.2369,-266.0006C73.2369,-253.8949 73.2369,-237.8076 73.2369,-224.0674"/>
+            <polygon fill="#c0c0c0" stroke="#c0c0c0" points="76.737,-223.672 73.2369,-213.672 69.737,-223.6721 76.737,-223.672"/>
+            <text text-anchor="middle" x="81.4052" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#c0c0c0"> &#160;C</text>
+        </g>
+        <!-- 0x13B -->
+        <g id="node2" class="node">
+            <title>0x13B</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x13B</text>
+        </g>
+        <!-- contentArray[0] -->
+        <g id="node3" class="node">
+            <title>contentArray[0]</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- 0x13B&#45;&gt;contentArray[0] -->
+        <g id="edge1" class="edge">
+            <title>0x13B&#45;&gt;contentArray[0]</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-88.4006C73.2369,-76.2949 73.2369,-60.2076 73.2369,-46.4674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-46.072 73.2369,-36.072 69.737,-46.0721 76.737,-46.072"/>
+            <text text-anchor="middle" x="81.7902" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;A</text>
+        </g>
+        <!-- 0x13A&#45;&gt;0x13B -->
+        <g id="edge2" class="edge">
+            <title>0x13A&#45;&gt;0x13B</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-177.2006C73.2369,-165.0949 73.2369,-149.0076 73.2369,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-134.872 73.2369,-124.872 69.737,-134.8721 76.737,-134.872"/>
+            <text text-anchor="middle" x="81.4052" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;H</text>
+        </g>
+        <!-- 0x25E -->
+        <g id="node5" class="node">
+            <title>0x25E</title>
+            <ellipse fill="none" stroke="#000000" cx="163.2369" cy="-284.4" rx="36.5014" ry="18"/>
+            <text text-anchor="middle" x="163.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x25E</text>
+        </g>
+        <!-- 0x25E&#45;&gt;0x13A -->
+        <g id="edge5" class="edge">
+            <title>0x25E&#45;&gt;0x13A</title>
+            <path fill="none" stroke="#000000" d="M146.7295,-268.1127C132.7742,-254.3435 112.6083,-234.4465 96.9293,-218.9765"/>
+            <polygon fill="#000000" stroke="#000000" points="99.2162,-216.3161 89.6396,-211.784 94.2998,-221.299 99.2162,-216.3161"/>
+            <text text-anchor="middle" x="135.4052" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;C</text>
+        </g>
+        <!-- 0x238 -->
+        <g id="node6" class="node">
+            <title>0x238</title>
+            <ellipse fill="none" stroke="#000000" cx="164.2369" cy="-195.6" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="164.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x238</text>
+        </g>
+        <!-- 0x25E&#45;&gt;0x238 -->
+        <g id="edge4" class="edge">
+            <title>0x25E&#45;&gt;0x238</title>
+            <path fill="none" stroke="#000000" d="M163.4441,-266.0006C163.5804,-253.8949 163.7616,-237.8076 163.9163,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="167.4205,-223.7108 164.0334,-213.672 160.4209,-223.6319 167.4205,-223.7108"/>
+            <text text-anchor="middle" x="171.7902" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;A</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g3.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g3.svg
new file mode 100644
index 000000000000..da6619c18547
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g3.svg
@@ -0,0 +1,253 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# http://www.graphviz.org/content/cluster
+
+strict digraph G {
+    "0x51C";
+
+    subgraph cluster_51C {
+        label= "Split node 0x51C";
+        color=grey;
+        node [style=dashed] "0x520", "0x540", "0x560", "0x580", "0x5A0";
+
+        "0x51C" -> "0x520" [label="  00"];
+        "0x51C" -> "0x560" [label="  01"];
+        "0x520" -> "0x540" [label="  110"];
+        "0x560" -> "0x580" [label="  000"];
+        "0x560" -> "0x5A0" [label="  010"];
+    }
+
+    "0x540" -> "0x41B" [label="  011"];
+    "0x540" -> "0x33B" [label="  101"];
+    "0x580" -> "0x238" [label="  001"];
+    "0x580" -> "0x35C" [label="  010"];
+    "0x580" -> "0x13A" [label="  011"];
+    "0x5A0" -> "0x37A" [label="  000"];
+    "0x5A0" -> "0x455" [label="  001"];
+
+    "0x13B" -> "contentArray[0]" [label="  A"];
+    "0x13A" -> "0x13B" [label="  H"];
+}
+
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="628pt" height="605pt"
+     viewBox="0.00 0.00 627.97 604.80" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 600.8)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-600.8 623.9695,-600.8 623.9695,4 -4,4"/>
+        <g id="clust1" class="cluster">
+            <title>cluster_51C</title>
+            <polygon fill="none" stroke="#c0c0c0" points="177.7906,-258.4 177.7906,-588.8 444.7906,-588.8 444.7906,-258.4 177.7906,-258.4"/>
+            <text text-anchor="middle" x="311.2906" y="-572.2" font-family="Times,serif" font-size="14.00" fill="#000000">Split node 0x51C</text>
+        </g>
+        <!-- 0x51C -->
+        <g id="node1" class="node">
+            <title>0x51C</title>
+            <ellipse fill="none" stroke="#000000" cx="264.7906" cy="-538" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="264.7906" y="-533.8" font-family="Times,serif" font-size="14.00" fill="#000000">0x51C</text>
+        </g>
+        <!-- 0x520 -->
+        <g id="node2" class="node">
+            <title>0x520</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="220.7906" cy="-411.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="220.7906" y="-407" font-family="Times,serif" font-size="14.00" fill="#000000">0x520</text>
+        </g>
+        <!-- 0x51C&#45;&gt;0x520 -->
+        <g id="edge1" class="edge">
+            <title>0x51C&#45;&gt;0x520</title>
+            <path fill="none" stroke="#000000" d="M258.5798,-520.1016C251.2095,-498.8616 238.8421,-463.2211 230.2726,-438.5254"/>
+            <polygon fill="#000000" stroke="#000000" points="233.5547,-437.3073 226.9698,-429.0074 226.9416,-439.6022 233.5547,-437.3073"/>
+            <text text-anchor="middle" x="256.2906" y="-470.4" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;00</text>
+        </g>
+        <!-- 0x560 -->
+        <g id="node4" class="node">
+            <title>0x560</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="308.7906" cy="-411.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="308.7906" y="-407" font-family="Times,serif" font-size="14.00" fill="#000000">0x560</text>
+        </g>
+        <!-- 0x51C&#45;&gt;0x560 -->
+        <g id="edge2" class="edge">
+            <title>0x51C&#45;&gt;0x560</title>
+            <path fill="none" stroke="#000000" d="M271.0014,-520.1016C278.3718,-498.8616 290.7391,-463.2211 299.3086,-438.5254"/>
+            <polygon fill="#000000" stroke="#000000" points="302.6397,-439.6022 302.6114,-429.0074 296.0265,-437.3073 302.6397,-439.6022"/>
+            <text text-anchor="middle" x="300.2906" y="-470.4" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;01</text>
+        </g>
+        <!-- 0x540 -->
+        <g id="node3" class="node">
+            <title>0x540</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="220.7906" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="220.7906" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x540</text>
+        </g>
+        <!-- 0x520&#45;&gt;0x540 -->
+        <g id="edge3" class="edge">
+            <title>0x520&#45;&gt;0x540</title>
+            <path fill="none" stroke="#000000" d="M220.7906,-393.0327C220.7906,-372.0352 220.7906,-337.2261 220.7906,-312.679"/>
+            <polygon fill="#000000" stroke="#000000" points="224.2907,-312.5336 220.7906,-302.5336 217.2907,-312.5337 224.2907,-312.5336"/>
+            <text text-anchor="middle" x="234.7906" y="-343.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;110</text>
+        </g>
+        <!-- 0x41B -->
+        <g id="node7" class="node">
+            <title>0x41B</title>
+            <ellipse fill="none" stroke="#000000" cx="36.7906" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="36.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x41B</text>
+        </g>
+        <!-- 0x540&#45;&gt;0x41B -->
+        <g id="edge6" class="edge">
+            <title>0x540&#45;&gt;0x41B</title>
+            <path fill="none" stroke="#000000" d="M192.599,-273.4087C175.6606,-266.6053 153.7802,-257.4635 134.7906,-248.4 112.7572,-237.8837 88.6053,-224.889 69.791,-214.4226"/>
+            <polygon fill="#000000" stroke="#000000" points="71.4518,-211.3412 61.0164,-209.5097 68.0319,-217.449 71.4518,-211.3412"/>
+            <text text-anchor="middle" x="148.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;011</text>
+        </g>
+        <!-- 0x33B -->
+        <g id="node8" class="node">
+            <title>0x33B</title>
+            <ellipse fill="none" stroke="#000000" cx="128.7906" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="128.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x33B</text>
+        </g>
+        <!-- 0x540&#45;&gt;0x33B -->
+        <g id="edge7" class="edge">
+            <title>0x540&#45;&gt;0x33B</title>
+            <path fill="none" stroke="#000000" d="M203.9164,-268.1127C189.651,-254.3435 169.037,-234.4465 153.0095,-218.9765"/>
+            <polygon fill="#000000" stroke="#000000" points="155.1837,-216.2106 145.5579,-211.784 150.3223,-221.2472 155.1837,-216.2106"/>
+            <text text-anchor="middle" x="197.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;101</text>
+        </g>
+        <!-- 0x580 -->
+        <g id="node5" class="node">
+            <title>0x580</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="308.7906" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="308.7906" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x580</text>
+        </g>
+        <!-- 0x560&#45;&gt;0x580 -->
+        <g id="edge4" class="edge">
+            <title>0x560&#45;&gt;0x580</title>
+            <path fill="none" stroke="#000000" d="M308.7906,-393.0327C308.7906,-372.0352 308.7906,-337.2261 308.7906,-312.679"/>
+            <polygon fill="#000000" stroke="#000000" points="312.2907,-312.5336 308.7906,-302.5336 305.2907,-312.5337 312.2907,-312.5336"/>
+            <text text-anchor="middle" x="322.7906" y="-343.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;000</text>
+        </g>
+        <!-- 0x5A0 -->
+        <g id="node6" class="node">
+            <title>0x5A0</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="399.7906" cy="-284.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="399.7906" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x5A0</text>
+        </g>
+        <!-- 0x560&#45;&gt;0x5A0 -->
+        <g id="edge5" class="edge">
+            <title>0x560&#45;&gt;0x5A0</title>
+            <path fill="none" stroke="#000000" d="M321.0633,-394.0992C336.5995,-372.451 363.5002,-334.9674 381.471,-309.9267"/>
+            <polygon fill="#000000" stroke="#000000" points="384.4347,-311.7999 387.4218,-301.6348 378.7476,-307.7185 384.4347,-311.7999"/>
+            <text text-anchor="middle" x="373.7906" y="-343.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;010</text>
+        </g>
+        <!-- 0x238 -->
+        <g id="node9" class="node">
+            <title>0x238</title>
+            <ellipse fill="none" stroke="#000000" cx="218.7906" cy="-195.6" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="218.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x238</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x238 -->
+        <g id="edge8" class="edge">
+            <title>0x580&#45;&gt;0x238</title>
+            <path fill="none" stroke="#000000" d="M292.2832,-268.1127C278.3279,-254.3435 258.162,-234.4465 242.483,-218.9765"/>
+            <polygon fill="#000000" stroke="#000000" points="244.77,-216.3161 235.1934,-211.784 239.8535,-221.299 244.77,-216.3161"/>
+            <text text-anchor="middle" x="286.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;001</text>
+        </g>
+        <!-- 0x35C -->
+        <g id="node10" class="node">
+            <title>0x35C</title>
+            <ellipse fill="none" stroke="#000000" cx="308.7906" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="308.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x35C</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x35C -->
+        <g id="edge9" class="edge">
+            <title>0x580&#45;&gt;0x35C</title>
+            <path fill="none" stroke="#000000" d="M308.7906,-266.0006C308.7906,-253.8949 308.7906,-237.8076 308.7906,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="312.2907,-223.672 308.7906,-213.672 305.2907,-223.6721 312.2907,-223.672"/>
+            <text text-anchor="middle" x="322.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;010</text>
+        </g>
+        <!-- 0x13A -->
+        <g id="node11" class="node">
+            <title>0x13A</title>
+            <ellipse fill="none" stroke="#000000" cx="400.7906" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="400.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x13A</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x13A -->
+        <g id="edge10" class="edge">
+            <title>0x580&#45;&gt;0x13A</title>
+            <path fill="none" stroke="#000000" d="M325.6648,-268.1127C339.9303,-254.3435 360.5443,-234.4465 376.5717,-218.9765"/>
+            <polygon fill="#000000" stroke="#000000" points="379.259,-221.2472 384.0234,-211.784 374.3976,-216.2106 379.259,-221.2472"/>
+            <text text-anchor="middle" x="377.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;011</text>
+        </g>
+        <!-- 0x37A -->
+        <g id="node12" class="node">
+            <title>0x37A</title>
+            <ellipse fill="none" stroke="#000000" cx="493.7906" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="493.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x37A</text>
+        </g>
+        <!-- 0x5A0&#45;&gt;0x37A -->
+        <g id="edge11" class="edge">
+            <title>0x5A0&#45;&gt;0x37A</title>
+            <path fill="none" stroke="#000000" d="M417.0317,-268.1127C431.6072,-254.3435 452.6693,-234.4465 469.0452,-218.9765"/>
+            <polygon fill="#000000" stroke="#000000" points="471.7931,-221.1955 476.6589,-211.784 466.9861,-216.107 471.7931,-221.1955"/>
+            <text text-anchor="middle" x="469.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;000</text>
+        </g>
+        <!-- 0x455 -->
+        <g id="node13" class="node">
+            <title>0x455</title>
+            <ellipse fill="none" stroke="#000000" cx="584.7906" cy="-195.6" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="584.7906" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x455</text>
+        </g>
+        <!-- 0x5A0&#45;&gt;0x455 -->
+        <g id="edge12" class="edge">
+            <title>0x5A0&#45;&gt;0x455</title>
+            <path fill="none" stroke="#000000" d="M429.4765,-273.1983C446.7051,-266.4645 468.7051,-257.4645 487.7906,-248.4 509.7186,-237.9855 533.6828,-224.9577 552.3055,-214.4487"/>
+            <polygon fill="#000000" stroke="#000000" points="554.288,-217.3478 561.2515,-209.3627 550.8284,-211.2625 554.288,-217.3478"/>
+            <text text-anchor="middle" x="533.7906" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;001</text>
+        </g>
+        <!-- 0x13B -->
+        <g id="node14" class="node">
+            <title>0x13B</title>
+            <ellipse fill="none" stroke="#000000" cx="400.7906" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="400.7906" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x13B</text>
+        </g>
+        <!-- 0x13A&#45;&gt;0x13B -->
+        <g id="edge14" class="edge">
+            <title>0x13A&#45;&gt;0x13B</title>
+            <path fill="none" stroke="#000000" d="M400.7906,-177.2006C400.7906,-165.0949 400.7906,-149.0076 400.7906,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="404.2907,-134.872 400.7906,-124.872 397.2907,-134.8721 404.2907,-134.872"/>
+            <text text-anchor="middle" x="409.3439" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;H</text>
+        </g>
+        <!-- contentArray[0] -->
+        <g id="node15" class="node">
+            <title>contentArray[0]</title>
+            <ellipse fill="none" stroke="#000000" cx="400.7906" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="400.7906" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- 0x13B&#45;&gt;contentArray[0] -->
+        <g id="edge13" class="edge">
+            <title>0x13B&#45;&gt;contentArray[0]</title>
+            <path fill="none" stroke="#000000" d="M400.7906,-88.4006C400.7906,-76.2949 400.7906,-60.2076 400.7906,-46.4674"/>
+            <polygon fill="#000000" stroke="#000000" points="404.2907,-46.072 400.7906,-36.072 397.2907,-46.0721 404.2907,-46.072"/>
+            <text text-anchor="middle" x="409.3439" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;A</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g4.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g4.svg
new file mode 100644
index 000000000000..d021029a7058
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.g4.svg
@@ -0,0 +1,290 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# http://www.graphviz.org/content/cluster
+
+strict digraph G {
+//  "0x139" [color=grey,fontcolor=grey];
+
+  "0x13B" -> "contentArray[0]" [label="  A"];
+  "0x13A" -> "0x13B" [label="  H"];
+//   "0x139" -> "0x13A" [label="  C"];
+//  "0x139" -> "0x13A" [label="  C",color="grey",fontcolor="grey"];
+
+//   "0x25E" -> "0x238" [label="  A"];
+//   "0x25E" -> "0x13A" [label="  C"];
+//   "0x25E" -> "0x33B" [label="  5"];
+//   "0x25E" -> "0x35C" [label="  B"];
+//   "0x25E" -> "0x37A" [label="  P"];
+//   "0x25E" -> "0x41B" [label="  3"];
+
+  subgraph cluster_51F {
+    label = "Node 0x51F"
+    "0x51F" [label="Prefix 0x51F\ncontentArray[1]"]
+    "0x51F" -> "0x51C" [label="ε"];
+
+    subgraph cluster_51C {
+      label= "Split node 0x51C";
+      ranksep=1
+      color=grey;
+      node [style=dashed] "0x520", "0x540", "0x560", "0x580", "0x5A0";
+
+      "0x51C" -> "0x520" [label="  00"];
+      "0x51C" -> "0x560" [label="  01"];
+      "0x520" -> "0x540" [label="  110"];
+      "0x560" -> "0x580" [label="  000"];
+      "0x560" -> "0x5A0" [label="  010"];
+    }
+  }
+  "0x540" -> "0x41B" [label="  011",minlen=2];
+  "0x540" -> "0x33B" [label="  101",minlen=2];
+  "0x580" -> "0x238" [label="  001",minlen=2];
+  "0x580" -> "0x35C" [label="  010",minlen=2];
+  "0x580" -> "0x13A" [label="  011",minlen=2];
+  "0x5A0" -> "0x37A" [label="  000",minlen=2];
+  "0x5A0" -> "0x455" [label="  001",minlen=2];
+
+  { rank=same "0x238" -> "0x35C" -> "0x13A" [style=invis,constrain=false]}
+}
+-->
+<!-- Title: G Pages: 1 -->
+<svg width="663pt" height="739pt"
+     viewBox="0.00 0.00 662.97 739.23" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 735.2313)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-735.2313 658.9695,-735.2313 658.9695,4 -4,4"/>
+        <g id="clust1" class="cluster">
+            <title>cluster_51F</title>
+            <polygon fill="none" stroke="#000000" points="186.7906,-231.4 186.7906,-723.2313 469.7906,-723.2313 469.7906,-231.4 186.7906,-231.4"/>
+            <text text-anchor="middle" x="328.2906" y="-706.6313" font-family="Times,serif" font-size="14.00" fill="#000000">Node 0x51F</text>
+        </g>
+        <g id="clust2" class="cluster">
+            <title>cluster_51C</title>
+            <polygon fill="none" stroke="#c0c0c0" points="194.7906,-239.4 194.7906,-577.8 461.7906,-577.8 461.7906,-239.4 194.7906,-239.4"/>
+            <text text-anchor="middle" x="328.2906" y="-561.2" font-family="Times,serif" font-size="14.00" fill="#000000">Split node 0x51C</text>
+        </g>
+        <!-- 0x13B -->
+        <g id="node1" class="node">
+            <title>0x13B</title>
+            <ellipse fill="none" stroke="#000000" cx="435.7906" cy="-88.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="435.7906" y="-84.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x13B</text>
+        </g>
+        <!-- contentArray[0] -->
+        <g id="node2" class="node">
+            <title>contentArray[0]</title>
+            <ellipse fill="none" stroke="#000000" cx="435.7906" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="435.7906" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- 0x13B&#45;&gt;contentArray[0] -->
+        <g id="edge1" class="edge">
+            <title>0x13B&#45;&gt;contentArray[0]</title>
+            <path fill="none" stroke="#000000" d="M435.7906,-70.5672C435.7906,-63.2743 435.7906,-54.6987 435.7906,-46.6137"/>
+            <polygon fill="#000000" stroke="#000000" points="439.2907,-46.417 435.7906,-36.417 432.2907,-46.4171 439.2907,-46.417"/>
+            <text text-anchor="middle" x="444.3439" y="-49.2" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;A</text>
+        </g>
+        <!-- 0x13A -->
+        <g id="node3" class="node">
+            <title>0x13A</title>
+            <ellipse fill="none" stroke="#000000" cx="435.7906" cy="-159.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="435.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x13A</text>
+        </g>
+        <!-- 0x13A&#45;&gt;0x13B -->
+        <g id="edge2" class="edge">
+            <title>0x13A&#45;&gt;0x13B</title>
+            <path fill="none" stroke="#000000" d="M435.7906,-141.3672C435.7906,-134.0743 435.7906,-125.4987 435.7906,-117.4137"/>
+            <polygon fill="#000000" stroke="#000000" points="439.2907,-117.217 435.7906,-107.217 432.2907,-117.2171 439.2907,-117.217"/>
+            <text text-anchor="middle" x="444.3439" y="-120" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;H</text>
+        </g>
+        <!-- 0x51F -->
+        <g id="node4" class="node">
+            <title>0x51F</title>
+            <ellipse fill="none" stroke="#000000" cx="318.7906" cy="-661.0156" rx="117.2629" ry="29.3315"/>
+            <text text-anchor="middle" x="318.7906" y="-665.2156" font-family="Times,serif" font-size="14.00" fill="#000000">Prefix 0x51F</text>
+            <text text-anchor="middle" x="318.7906" y="-648.4156" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- 0x51C -->
+        <g id="node5" class="node">
+            <title>0x51C</title>
+            <ellipse fill="none" stroke="#000000" cx="318.7906" cy="-527" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="318.7906" y="-522.8" font-family="Times,serif" font-size="14.00" fill="#000000">0x51C</text>
+        </g>
+        <!-- 0x51F&#45;&gt;0x51C -->
+        <g id="edge3" class="edge">
+            <title>0x51F&#45;&gt;0x51C</title>
+            <path fill="none" stroke="#000000" d="M318.7906,-631.2732C318.7906,-608.6091 318.7906,-577.6052 318.7906,-555.3044"/>
+            <polygon fill="#000000" stroke="#000000" points="322.2907,-555.1305 318.7906,-545.1306 315.2907,-555.1306 322.2907,-555.1305"/>
+            <text text-anchor="middle" x="322.8709" y="-600" font-family="Times,serif" font-size="14.00" fill="#000000">ε</text>
+        </g>
+        <!-- 0x520 -->
+        <g id="node6" class="node">
+            <title>0x520</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="237.7906" cy="-396.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="237.7906" y="-392" font-family="Times,serif" font-size="14.00" fill="#000000">0x520</text>
+        </g>
+        <!-- 0x51C&#45;&gt;0x520 -->
+        <g id="edge4" class="edge">
+            <title>0x51C&#45;&gt;0x520</title>
+            <path fill="none" stroke="#000000" d="M284.1131,-520.6804C267.7647,-515.4937 251.7906,-506.1624 251.7906,-489.5 251.7906,-489.5 251.7906,-489.5 251.7906,-433.7 251.7906,-430.2948 251.2777,-426.8193 250.4482,-423.4252"/>
+            <polygon fill="#000000" stroke="#000000" points="253.6734,-422.0307 247.2065,-413.6385 247.0285,-424.2318 253.6734,-422.0307"/>
+            <text text-anchor="middle" x="262.2906" y="-457.4" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;00</text>
+        </g>
+        <!-- 0x560 -->
+        <g id="node8" class="node">
+            <title>0x560</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="325.7906" cy="-396.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="325.7906" y="-392" font-family="Times,serif" font-size="14.00" fill="#000000">0x560</text>
+        </g>
+        <!-- 0x51C&#45;&gt;0x560 -->
+        <g id="edge5" class="edge">
+            <title>0x51C&#45;&gt;0x560</title>
+            <path fill="none" stroke="#000000" d="M320.8414,-508.7129C321.3687,-502.6109 321.7906,-495.77 321.7906,-489.5 321.7906,-489.5 321.7906,-489.5 321.7906,-433.7 321.7906,-430.7537 321.9142,-427.6845 322.118,-424.6266"/>
+            <polygon fill="#000000" stroke="#000000" points="325.6215,-424.7486 323.0562,-414.469 318.6512,-424.1046 325.6215,-424.7486"/>
+            <text text-anchor="middle" x="332.2906" y="-457.4" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;01</text>
+        </g>
+        <!-- 0x540 -->
+        <g id="node7" class="node">
+            <title>0x540</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="237.7906" cy="-265.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="237.7906" y="-261.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x540</text>
+        </g>
+        <!-- 0x520&#45;&gt;0x540 -->
+        <g id="edge6" class="edge">
+            <title>0x520&#45;&gt;0x540</title>
+            <path fill="none" stroke="#000000" d="M237.7906,-377.8895C237.7906,-371.7859 237.7906,-364.95 237.7906,-358.7 237.7906,-358.7 237.7906,-358.7 237.7906,-302.9 237.7906,-299.9703 237.7906,-296.9119 237.7906,-293.8605"/>
+            <polygon fill="#000000" stroke="#000000" points="241.2907,-293.7105 237.7906,-283.7105 234.2907,-293.7106 241.2907,-293.7105"/>
+            <text text-anchor="middle" x="251.7906" y="-326.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;110</text>
+        </g>
+        <!-- 0x41B -->
+        <g id="node11" class="node">
+            <title>0x41B</title>
+            <ellipse fill="none" stroke="#000000" cx="36.7906" cy="-159.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="36.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x41B</text>
+        </g>
+        <!-- 0x540&#45;&gt;0x41B -->
+        <g id="edge9" class="edge">
+            <title>0x540&#45;&gt;0x41B</title>
+            <path fill="none" stroke="#000000" d="M212.5449,-252.7617C191.8449,-242.3434 161.8208,-227.105 135.7906,-213.4 113.9245,-201.8874 89.5312,-188.6361 70.4296,-178.1656"/>
+            <polygon fill="#000000" stroke="#000000" points="71.9619,-175.014 61.5118,-173.2686 68.5926,-181.1498 71.9619,-175.014"/>
+            <text text-anchor="middle" x="149.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;011</text>
+        </g>
+        <!-- 0x33B -->
+        <g id="node12" class="node">
+            <title>0x33B</title>
+            <ellipse fill="none" stroke="#000000" cx="128.7906" cy="-159.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="128.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x33B</text>
+        </g>
+        <!-- 0x540&#45;&gt;0x33B -->
+        <g id="edge10" class="edge">
+            <title>0x540&#45;&gt;0x33B</title>
+            <path fill="none" stroke="#000000" d="M221.2359,-249.3313C202.9895,-231.6206 173.4782,-202.9757 152.7171,-182.824"/>
+            <polygon fill="#000000" stroke="#000000" points="155.1145,-180.2735 145.5012,-175.82 150.2391,-185.2964 155.1145,-180.2735"/>
+            <text text-anchor="middle" x="197.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;101</text>
+        </g>
+        <!-- 0x580 -->
+        <g id="node9" class="node">
+            <title>0x580</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="325.7906" cy="-265.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="325.7906" y="-261.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x580</text>
+        </g>
+        <!-- 0x560&#45;&gt;0x580 -->
+        <g id="edge7" class="edge">
+            <title>0x560&#45;&gt;0x580</title>
+            <path fill="none" stroke="#000000" d="M325.7906,-377.8895C325.7906,-371.7859 325.7906,-364.95 325.7906,-358.7 325.7906,-358.7 325.7906,-358.7 325.7906,-302.9 325.7906,-299.9703 325.7906,-296.9119 325.7906,-293.8605"/>
+            <polygon fill="#000000" stroke="#000000" points="329.2907,-293.7105 325.7906,-283.7105 322.2907,-293.7106 329.2907,-293.7105"/>
+            <text text-anchor="middle" x="339.7906" y="-326.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;000</text>
+        </g>
+        <!-- 0x5A0 -->
+        <g id="node10" class="node">
+            <title>0x5A0</title>
+            <ellipse fill="none" stroke="#000000" stroke-dasharray="5,2" cx="416.7906" cy="-265.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="416.7906" y="-261.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x5A0</text>
+        </g>
+        <!-- 0x560&#45;&gt;0x5A0 -->
+        <g id="edge8" class="edge">
+            <title>0x560&#45;&gt;0x5A0</title>
+            <path fill="none" stroke="#000000" d="M356.6658,-387.2435C378.0878,-380.079 402.7906,-369.4801 402.7906,-358.7 402.7906,-358.7 402.7906,-358.7 402.7906,-302.9 402.7906,-299.4948 403.3035,-296.0193 404.133,-292.6252"/>
+            <polygon fill="#000000" stroke="#000000" points="407.5528,-293.4318 407.3747,-282.8385 400.9078,-291.2307 407.5528,-293.4318"/>
+            <text text-anchor="middle" x="416.7906" y="-326.6" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;010</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x13A -->
+        <g id="edge13" class="edge">
+            <title>0x580&#45;&gt;0x13A</title>
+            <path fill="none" stroke="#000000" d="M342.4972,-249.3313C360.911,-231.6206 390.693,-202.9757 411.6447,-182.824"/>
+            <polygon fill="#000000" stroke="#000000" points="414.1457,-185.2747 418.9268,-175.82 409.2931,-180.2296 414.1457,-185.2747"/>
+            <text text-anchor="middle" x="410.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;011</text>
+        </g>
+        <!-- 0x238 -->
+        <g id="node13" class="node">
+            <title>0x238</title>
+            <ellipse fill="none" stroke="#000000" cx="218.7906" cy="-159.6" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="218.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x238</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x238 -->
+        <g id="edge11" class="edge">
+            <title>0x580&#45;&gt;0x238</title>
+            <path fill="none" stroke="#000000" d="M309.5396,-249.3313C291.7067,-231.6983 262.9122,-203.2267 242.5467,-183.0897"/>
+            <polygon fill="#000000" stroke="#000000" points="244.7663,-180.3623 235.1945,-175.82 239.8445,-185.3399 244.7663,-180.3623"/>
+            <text text-anchor="middle" x="286.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;001</text>
+        </g>
+        <!-- 0x35C -->
+        <g id="node14" class="node">
+            <title>0x35C</title>
+            <ellipse fill="none" stroke="#000000" cx="325.7906" cy="-159.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="325.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x35C</text>
+        </g>
+        <!-- 0x580&#45;&gt;0x35C -->
+        <g id="edge12" class="edge">
+            <title>0x580&#45;&gt;0x35C</title>
+            <path fill="none" stroke="#000000" d="M325.7906,-246.971C325.7906,-230.6622 325.7906,-206.6111 325.7906,-187.8698"/>
+            <polygon fill="#000000" stroke="#000000" points="329.2907,-187.8177 325.7906,-177.8178 322.2907,-187.8178 329.2907,-187.8177"/>
+            <text text-anchor="middle" x="339.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;010</text>
+        </g>
+        <!-- 0x37A -->
+        <g id="node15" class="node">
+            <title>0x37A</title>
+            <ellipse fill="none" stroke="#000000" cx="528.7906" cy="-159.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="528.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x37A</text>
+        </g>
+        <!-- 0x5A0&#45;&gt;0x37A -->
+        <g id="edge14" class="edge">
+            <title>0x5A0&#45;&gt;0x37A</title>
+            <path fill="none" stroke="#000000" d="M433.801,-249.3313C452.5495,-231.6206 482.8731,-202.9757 504.2057,-182.824"/>
+            <polygon fill="#000000" stroke="#000000" points="506.7542,-185.2313 511.6201,-175.82 501.9472,-180.1427 506.7542,-185.2313"/>
+            <text text-anchor="middle" x="502.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;000</text>
+        </g>
+        <!-- 0x455 -->
+        <g id="node16" class="node">
+            <title>0x455</title>
+            <ellipse fill="none" stroke="#000000" cx="619.7906" cy="-159.6" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="619.7906" y="-155.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x455</text>
+        </g>
+        <!-- 0x5A0&#45;&gt;0x455 -->
+        <g id="edge15" class="edge">
+            <title>0x5A0&#45;&gt;0x455</title>
+            <path fill="none" stroke="#000000" d="M443.4197,-252.397C464.5065,-242.0254 494.6583,-227.0286 520.7906,-213.4 542.9996,-201.8175 567.7596,-188.3619 586.9624,-177.8047"/>
+            <polygon fill="#000000" stroke="#000000" points="588.838,-180.7674 595.9072,-172.8758 585.4597,-174.6365 588.838,-180.7674"/>
+            <text text-anchor="middle" x="564.7906" y="-200.8" font-family="Times,serif" font-size="14.00" fill="#000000"> &#160;001</text>
+        </g>
+        <!-- 0x238&#45;&gt;0x35C -->
+        <!-- 0x35C&#45;&gt;0x13A -->
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m1.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m1.svg
new file mode 100644
index 000000000000..ff928a44dbec
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m1.svg
@@ -0,0 +1,349 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+        node [color = "lightblue"; fontcolor="blue"]
+
+        start -> root
+
+        root -> t [label = " t"]
+        t -> tr [label = " r"]
+        tr -> tra [label = " a"]
+        tra -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+
+        trav [label = "NONE"]
+        trave [label = "NONE"]
+        traver [label = "NONE"]
+        travers [label = "NONE"]
+        traverse [label = "NONE"]
+    }
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="575pt" height="823pt"
+     viewBox="0.00 0.00 575.47 822.89" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 818.8939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-818.8939 571.4738,-818.8939 571.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="279.2369" cy="-785.4782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="279.2369" y="-789.6782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="279.2369" y="-772.8782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="392.2369" cy="-785.4782" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="392.2369" y="-781.2782" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="279.2369" cy="-685.2626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="279.2369" y="-681.0626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge2" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M269.3368,-757.0824C266.6104,-745.9368 264.8794,-733.0211 266.8463,-721.2626 267.3268,-718.39 268.009,-715.4398 268.8091,-712.5238"/>
+            <polygon fill="#000000" stroke="#000000" points="272.1732,-713.494 271.8627,-702.9037 265.5012,-711.3762 272.1732,-713.494"/>
+            <text text-anchor="middle" x="270.9322" y="-725.4626" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge16" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M279.2369,-755.9522C279.2369,-742.4671 279.2369,-726.5749 279.2369,-713.2779"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="279.2369,-703.2659 283.737,-713.2659 279.2369,-708.2659 279.237,-713.2659 279.237,-713.2659 279.237,-713.2659 279.2369,-708.2659 274.737,-713.2659 279.2369,-703.2659 279.2369,-703.2659"/>
+            <text text-anchor="middle" x="282.9322" y="-725.4626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge15" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M367.6112,-800.5185C350.7348,-807.9537 333.8585,-809.4839 316.9821,-805.109"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="307.0455,-801.8508 317.9498,-800.6907 311.7966,-803.4087 316.5477,-804.9667 316.5477,-804.9667 316.5477,-804.9667 311.7966,-803.4087 315.1456,-809.2427 307.0455,-801.8508 307.0455,-801.8508"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="91.2369" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="91.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge8" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M87.5943,-177.63C85.1221,-165.4338 81.802,-149.0543 78.9772,-135.1187"/>
+            <polygon fill="#000000" stroke="#000000" points="82.3528,-134.1534 76.9358,-125.0481 75.4923,-135.5441 82.3528,-134.1534"/>
+            <text text-anchor="middle" x="88.3172" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="91.2369" cy="-284.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="91.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge7" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M91.2369,-266.0006C91.2369,-253.8949 91.2369,-237.8076 91.2369,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="94.737,-223.672 91.2369,-213.672 87.737,-223.6721 94.737,-223.672"/>
+            <text text-anchor="middle" x="96.4869" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="92.2369" cy="-373.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="92.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge6" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M92.0297,-354.8006C91.8934,-342.6949 91.7122,-326.6076 91.5575,-312.8674"/>
+            <polygon fill="#000000" stroke="#000000" points="95.0529,-312.4319 91.4404,-302.472 88.0533,-312.5108 95.0529,-312.4319"/>
+            <text text-anchor="middle" x="95.9322" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="312.2369" cy="-373.2" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="312.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="203.2369" cy="-473.4156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="203.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="203.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge5" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#000000" d="M178.8775,-451.4229C160.5618,-434.8867 135.5561,-412.3105 117.1156,-395.6616"/>
+            <polygon fill="#000000" stroke="#000000" points="119.421,-393.0276 109.6531,-388.9241 114.7301,-398.2233 119.421,-393.0276"/>
+            <text text-anchor="middle" x="153.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- trav -->
+        <g id="node14" class="node">
+            <title>trav</title>
+            <ellipse fill="none" stroke="#add8e6" cx="183.2369" cy="-373.2" rx="37.7006" ry="18"/>
+            <text text-anchor="middle" x="183.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#0000ff">NONE</text>
+        </g>
+        <!-- tra&#45;&gt;trav -->
+        <g id="edge19" class="edge">
+            <title>tra&#45;&gt;trav</title>
+            <path fill="none" stroke="#add8e6" d="M197.3983,-444.1597C194.7054,-430.666 191.5224,-414.7168 188.8568,-401.3604"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="186.8493,-391.3011 193.2195,-400.227 187.8279,-396.2044 188.8065,-401.1078 188.8065,-401.1078 188.8065,-401.1078 187.8279,-396.2044 184.3935,-401.9885 186.8493,-391.3011 186.8493,-391.3011"/>
+            <text text-anchor="middle" x="198.4869" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#0000ff"> v</text>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="494.2369" cy="-373.2" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="494.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="304.2369" cy="-473.4156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="304.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="304.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge10" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#000000" d="M306.5939,-443.8897C307.6704,-430.4046 308.939,-414.5123 310.0005,-401.2153"/>
+            <polygon fill="#000000" stroke="#000000" points="313.4928,-401.4502 310.7997,-391.2033 306.515,-400.8931 313.4928,-401.4502"/>
+            <text text-anchor="middle" x="314.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="446.2369" cy="-473.4156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="446.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="446.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge12" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M459.4802,-445.766C466.2441,-431.6441 474.4452,-414.5215 481.1822,-400.456"/>
+            <polygon fill="#000000" stroke="#000000" points="484.4134,-401.812 485.5766,-391.2812 478.1002,-398.7881 484.4134,-401.812"/>
+            <text text-anchor="middle" x="480.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="279.2369" cy="-585.0469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="279.2369" y="-589.2469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="279.2369" y="-572.4469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge4" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#000000" d="M252.4549,-564.0846C243.9686,-556.4495 235.0956,-547.3003 228.5223,-537.6313 223.1796,-529.7724 218.7204,-520.6194 215.11,-511.7284"/>
+            <polygon fill="#000000" stroke="#000000" points="218.3382,-510.37 211.5239,-502.2573 211.7918,-512.8487 218.3382,-510.37"/>
+            <text text-anchor="middle" x="234.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge18" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M262.877,-558.5399C255.5232,-546.8584 246.6268,-533.0409 238.2369,-520.8313 235.1223,-516.2987 231.7727,-511.5792 228.424,-506.9511"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="222.4485,-498.7793 231.9836,-504.1952 225.3998,-502.8154 228.3512,-506.8514 228.3512,-506.8514 228.3512,-506.8514 225.3998,-502.8154 224.7187,-509.5077 222.4485,-498.7793 222.4485,-498.7793"/>
+            <text text-anchor="middle" x="254.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge9" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M285.7401,-556.0085C288.6971,-542.8046 292.2409,-526.9807 295.4318,-512.7327"/>
+            <polygon fill="#000000" stroke="#000000" points="298.9093,-513.2196 297.6794,-502.6964 292.0785,-511.6898 298.9093,-513.2196"/>
+            <text text-anchor="middle" x="298.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge11" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M307.9452,-565.8568C335.7854,-547.247 378.1751,-518.9116 408.6826,-498.5188"/>
+            <polygon fill="#000000" stroke="#000000" points="410.9757,-501.196 417.3443,-492.7289 407.0856,-495.3764 410.9757,-501.196"/>
+            <text text-anchor="middle" x="376.9322" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge3" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M271.4066,-667.6591C269.2309,-661.9119 267.1669,-655.422 266.0763,-649.2626 264.5952,-640.8975 265.0969,-631.9507 266.5509,-623.4728"/>
+            <polygon fill="#000000" stroke="#000000" points="270.0289,-623.9493 268.7277,-613.4347 263.1879,-622.4658 270.0289,-623.9493"/>
+            <text text-anchor="middle" x="270.3172" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge17" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M279.2369,-666.8815C279.2369,-655.1502 279.2369,-639.4774 279.2369,-624.9885"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="279.2369,-614.7209 283.737,-624.7208 279.2369,-619.7209 279.237,-624.7209 279.237,-624.7209 279.237,-624.7209 279.2369,-619.7209 274.737,-624.7209 279.2369,-614.7209 279.2369,-614.7209"/>
+            <text text-anchor="middle" x="283.3172" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trave -->
+        <g id="node15" class="node">
+            <title>trave</title>
+            <ellipse fill="none" stroke="#add8e6" cx="188.2369" cy="-284.4" rx="37.7006" ry="18"/>
+            <text text-anchor="middle" x="188.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#0000ff">NONE</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge20" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#add8e6" d="M184.2729,-354.8006C184.9613,-342.575 185.8783,-326.2887 186.6569,-312.4599"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="187.2193,-302.472 191.1499,-312.7092 186.9382,-307.4641 186.6571,-312.4562 186.6571,-312.4562 186.6571,-312.4562 186.9382,-307.4641 182.1642,-312.2032 187.2193,-302.472 187.2193,-302.472"/>
+            <text text-anchor="middle" x="191.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- traver -->
+        <g id="node16" class="node">
+            <title>traver</title>
+            <ellipse fill="none" stroke="#add8e6" cx="193.2369" cy="-195.6" rx="37.7006" ry="18"/>
+            <text text-anchor="middle" x="193.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff">NONE</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge21" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#add8e6" d="M189.2729,-266.0006C189.9613,-253.775 190.8783,-237.4887 191.6569,-223.6599"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="192.2193,-213.672 196.1499,-223.9092 191.9382,-218.6641 191.6571,-223.6562 191.6571,-223.6562 191.6571,-223.6562 191.9382,-218.6641 187.1642,-223.4032 192.2193,-213.672 192.2193,-213.672"/>
+            <text text-anchor="middle" x="195.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- travers -->
+        <g id="node17" class="node">
+            <title>travers</title>
+            <ellipse fill="none" stroke="#add8e6" cx="202.2369" cy="-106.8" rx="37.7006" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#0000ff">NONE</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge22" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#add8e6" d="M195.1017,-177.2006C196.3408,-164.975 197.9914,-148.6887 199.393,-134.8599"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="200.4053,-124.872 203.8739,-135.2748 199.901,-129.8465 199.3968,-134.821 199.3968,-134.821 199.3968,-134.821 199.901,-129.8465 194.9198,-134.3673 200.4053,-124.872 200.4053,-124.872"/>
+            <text text-anchor="middle" x="203.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#0000ff"> s</text>
+        </g>
+        <!-- traverse -->
+        <g id="node18" class="node">
+            <title>traverse</title>
+            <ellipse fill="none" stroke="#add8e6" cx="202.2369" cy="-18" rx="37.7006" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#0000ff">NONE</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge23" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#add8e6" d="M202.2369,-88.4006C202.2369,-76.2949 202.2369,-60.2076 202.2369,-46.4674"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="202.2369,-36.072 206.737,-46.072 202.2369,-41.072 202.237,-46.072 202.237,-46.072 202.237,-46.072 202.2369,-41.072 197.737,-46.0721 202.2369,-36.072 202.2369,-36.072"/>
+            <text text-anchor="middle" x="207.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m2.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m2.svg
new file mode 100644
index 000000000000..ba33dd1f2266
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m2.svg
@@ -0,0 +1,430 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+        node [color = "blue"; fontcolor="blue"]
+
+        start -> root
+
+        root -> t [label = " t"]
+        t -> tr [label = " r"]
+        tr -> tra [label = " a"]
+        tra -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+
+        trav [label = "0x0B8"]
+        trave [label = "0x0B9"]
+        traver [label = "0x0BA"]
+        travers [label = "0x0BB"]
+        traverse [label = "contentArray[3]"]
+    }
+
+    {
+        edge [color = "blue"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="red"; arrowhead="vee"; constraint="false"]
+
+        traverse -> travers [label = " ~3"]
+        travers -> traver [label = "0x0BB"]
+        traver -> trave [label = "0x0BA"]
+        trave -> trav [label = "0x0B9"]
+        trav -> tra [label = "0x0B8"]
+    }
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="574pt" height="823pt"
+     viewBox="0.00 0.00 574.47 822.89" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 818.8939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-818.8939 570.4738,-818.8939 570.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="289.2369" cy="-785.4782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="289.2369" y="-789.6782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="289.2369" y="-772.8782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="402.2369" cy="-785.4782" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="402.2369" y="-781.2782" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="289.2369" cy="-685.2626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="289.2369" y="-681.0626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge2" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M279.3368,-757.0824C276.6104,-745.9368 274.8794,-733.0211 276.8463,-721.2626 277.3268,-718.39 278.009,-715.4398 278.8091,-712.5238"/>
+            <polygon fill="#000000" stroke="#000000" points="282.1732,-713.494 281.8627,-702.9037 275.5012,-711.3762 282.1732,-713.494"/>
+            <text text-anchor="middle" x="280.9322" y="-725.4626" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge16" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M289.2369,-755.9522C289.2369,-742.4671 289.2369,-726.5749 289.2369,-713.2779"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="289.2369,-703.2659 293.737,-713.2659 289.2369,-708.2659 289.237,-713.2659 289.237,-713.2659 289.237,-713.2659 289.2369,-708.2659 284.737,-713.2659 289.2369,-703.2659 289.2369,-703.2659"/>
+            <text text-anchor="middle" x="292.9322" y="-725.4626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge15" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M377.6112,-800.5185C360.7348,-807.9537 343.8585,-809.4839 326.9821,-805.109"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="317.0455,-801.8508 327.9498,-800.6907 321.7966,-803.4087 326.5477,-804.9667 326.5477,-804.9667 326.5477,-804.9667 321.7966,-803.4087 325.1456,-809.2427 317.0455,-801.8508 317.0455,-801.8508"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="91.2369" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="91.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge8" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M87.5943,-177.63C85.1221,-165.4338 81.802,-149.0543 78.9772,-135.1187"/>
+            <polygon fill="#000000" stroke="#000000" points="82.3528,-134.1534 76.9358,-125.0481 75.4923,-135.5441 82.3528,-134.1534"/>
+            <text text-anchor="middle" x="88.3172" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="92.2369" cy="-284.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="92.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge7" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M92.0297,-266.0006C91.8934,-253.8949 91.7122,-237.8076 91.5575,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="95.0529,-223.6319 91.4404,-213.672 88.0533,-223.7108 95.0529,-223.6319"/>
+            <text text-anchor="middle" x="97.4869" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="93.2369" cy="-373.2" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="93.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge6" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M93.0297,-354.8006C92.8934,-342.6949 92.7122,-326.6076 92.5575,-312.8674"/>
+            <polygon fill="#000000" stroke="#000000" points="96.0529,-312.4319 92.4404,-302.472 89.0533,-312.5108 96.0529,-312.4319"/>
+            <text text-anchor="middle" x="96.9322" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="311.2369" cy="-373.2" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="311.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="219.2369" cy="-473.4156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="219.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="219.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge5" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#000000" d="M191.1461,-454.472C179.0394,-446.0482 164.8584,-435.8286 152.5223,-426 140.7539,-416.6237 128.241,-405.6422 117.7589,-396.1298"/>
+            <polygon fill="#000000" stroke="#000000" points="119.9234,-393.366 110.185,-389.1928 115.1954,-398.5281 119.9234,-393.366"/>
+            <text text-anchor="middle" x="158.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- trav -->
+        <g id="node14" class="node">
+            <title>trav</title>
+            <ellipse fill="none" stroke="#0000ff" cx="183.2369" cy="-373.2" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="183.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B8</text>
+        </g>
+        <!-- tra&#45;&gt;trav -->
+        <g id="edge19" class="edge">
+            <title>tra&#45;&gt;trav</title>
+            <path fill="none" stroke="#add8e6" d="M217.5652,-443.8618C216.1848,-432.6862 213.6979,-420.0532 209.2369,-409.2 207.6981,-405.4564 205.697,-401.7342 203.4879,-398.182"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="197.713,-389.7873 207.0881,-395.4757 200.5468,-393.9067 203.3807,-398.0261 203.3807,-398.0261 203.3807,-398.0261 200.5468,-393.9067 199.6732,-400.5766 197.713,-389.7873 197.713,-389.7873"/>
+            <text text-anchor="middle" x="219.4869" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#0000ff"> v</text>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="493.2369" cy="-373.2" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="493.2369" y="-369" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="311.2369" cy="-473.4156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="311.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="311.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge10" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#000000" d="M311.2369,-443.8897C311.2369,-430.4046 311.2369,-414.5123 311.2369,-401.2153"/>
+            <polygon fill="#000000" stroke="#000000" points="314.737,-401.2033 311.2369,-391.2033 307.737,-401.2034 314.737,-401.2033"/>
+            <text text-anchor="middle" x="316.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="448.2369" cy="-473.4156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="448.2369" y="-477.6156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="448.2369" y="-460.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge12" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M460.7719,-445.5C467.1109,-431.383 474.7723,-414.3209 481.0603,-400.3175"/>
+            <polygon fill="#000000" stroke="#000000" points="484.2575,-401.7415 485.1609,-391.1852 477.8717,-398.874 484.2575,-401.7415"/>
+            <text text-anchor="middle" x="480.0942" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="289.2369" cy="-585.0469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="289.2369" y="-589.2469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="289.2369" y="-572.4469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge4" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#000000" d="M262.3516,-564.0201C254.086,-556.4444 245.5803,-547.3475 239.5223,-537.6313 234.7133,-529.9184 230.9544,-520.9679 228.0512,-512.2428"/>
+            <polygon fill="#000000" stroke="#000000" points="231.37,-511.1256 225.1267,-502.5657 224.6692,-513.1506 231.37,-511.1256"/>
+            <text text-anchor="middle" x="245.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge18" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M272.7366,-558.4643C265.5384,-546.8917 256.9887,-533.179 249.2369,-520.8313 246.7152,-516.8145 244.0675,-512.6105 241.4331,-508.4359"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="236.0886,-499.9773 245.2344,-506.0275 238.7594,-504.2043 241.4302,-508.4312 241.4302,-508.4312 241.4302,-508.4312 238.7594,-504.2043 237.6259,-510.8349 236.0886,-499.9773 236.0886,-499.9773"/>
+            <text text-anchor="middle" x="264.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge9" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M295.0171,-555.7171C297.6192,-542.5138 300.7286,-526.7364 303.5256,-512.5441"/>
+            <polygon fill="#000000" stroke="#000000" points="306.9955,-513.0376 305.4953,-502.5496 300.1276,-511.6841 306.9955,-513.0376"/>
+            <text text-anchor="middle" x="307.0942" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge11" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M317.2969,-565.3465C343.5549,-546.9111 382.9334,-519.2642 411.6478,-499.1043"/>
+            <polygon fill="#000000" stroke="#000000" points="414.0173,-501.7172 420.1904,-493.1066 409.995,-495.9882 414.0173,-501.7172"/>
+            <text text-anchor="middle" x="381.9322" y="-525.0313" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge3" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M281.4066,-667.6591C279.2309,-661.9119 277.1669,-655.422 276.0763,-649.2626 274.5952,-640.8975 275.0969,-631.9507 276.5509,-623.4728"/>
+            <polygon fill="#000000" stroke="#000000" points="280.0289,-623.9493 278.7277,-613.4347 273.1879,-622.4658 280.0289,-623.9493"/>
+            <text text-anchor="middle" x="280.3172" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge17" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M289.2369,-666.8815C289.2369,-655.1502 289.2369,-639.4774 289.2369,-624.9885"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="289.2369,-614.7209 293.737,-624.7208 289.2369,-619.7209 289.237,-624.7209 289.237,-624.7209 289.237,-624.7209 289.2369,-619.7209 284.737,-624.7209 289.2369,-614.7209 289.2369,-614.7209"/>
+            <text text-anchor="middle" x="293.3172" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trav&#45;&gt;tra -->
+        <g id="edge32" class="edge">
+            <title>trav&#45;&gt;tra</title>
+            <path fill="none" stroke="#ffc0cb" d="M172.1325,-390.7196C166.9576,-401.1938 162.925,-414.6319 167.9003,-426 171.4315,-434.0686 177.0881,-441.3739 183.3808,-447.7"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="190.9707,-454.6637 180.5599,-451.219 187.2864,-451.2834 183.6021,-447.9031 183.6021,-447.9031 183.6021,-447.9031 187.2864,-451.2834 186.6444,-444.5873 190.9707,-454.6637 190.9707,-454.6637"/>
+            <text text-anchor="middle" x="186.9052" y="-413.4" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B8</text>
+        </g>
+        <!-- trave -->
+        <g id="node15" class="node">
+            <title>trave</title>
+            <ellipse fill="none" stroke="#0000ff" cx="189.2369" cy="-284.4" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="189.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B9</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge20" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#add8e6" d="M206.004,-359.0012C213.2194,-353.2102 220.3301,-345.8294 224.2369,-337.2 228.9378,-326.8164 224.0959,-316.1864 216.6675,-307.2114"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="209.6485,-299.798 219.7915,-303.9657 213.0861,-303.4288 216.5238,-307.0596 216.5238,-307.0596 216.5238,-307.0596 213.0861,-303.4288 213.2561,-310.1535 209.6485,-299.798 209.6485,-299.798"/>
+            <text text-anchor="middle" x="231.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge24" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#0000ff" d="M165.9239,-357.0271C157.1626,-346.8113 149.3672,-333.1725 154.5223,-320.4 156.3746,-315.8107 159.123,-311.4803 162.2767,-307.5211"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="165.0223,-309.7042 169.162,-299.9515 159.844,-304.994 165.0223,-309.7042"/>
+            <text text-anchor="middle" x="160.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- trave&#45;&gt;trav -->
+        <g id="edge31" class="edge">
+            <title>trave&#45;&gt;trav</title>
+            <path fill="none" stroke="#ffc0cb" d="M185.3926,-302.6325C184.3695,-308.2985 183.4083,-314.5837 182.9003,-320.4 182.2094,-328.3107 182.0246,-336.9134 182.0825,-344.8339"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="182.2872,-355.0829 177.5883,-345.1748 182.1873,-350.0839 182.0874,-345.0849 182.0874,-345.0849 182.0874,-345.0849 182.1873,-350.0839 186.5865,-344.995 182.2872,-355.0829 182.2872,-355.0829"/>
+            <text text-anchor="middle" x="201.9052" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B9</text>
+        </g>
+        <!-- traver -->
+        <g id="node16" class="node">
+            <title>traver</title>
+            <ellipse fill="none" stroke="#0000ff" cx="194.2369" cy="-195.6" rx="38.8671" ry="18"/>
+            <text text-anchor="middle" x="194.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BA</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge21" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#add8e6" d="M213.0898,-270.4154C220.6549,-264.6446 228.1173,-257.2217 232.2369,-248.4 237.2769,-237.6073 231.7142,-226.7456 223.4297,-217.7036"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="216.0035,-210.5778 226.3347,-214.2545 219.6113,-214.0396 223.219,-217.5015 223.219,-217.5015 223.219,-217.5015 219.6113,-214.0396 220.1034,-220.7485 216.0035,-210.5778 216.0035,-210.5778"/>
+            <text text-anchor="middle" x="238.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge25" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#0000ff" d="M172.8782,-268.1851C164.5976,-257.9534 157.2248,-244.3128 162.0763,-231.6 163.7001,-227.3449 166.0838,-223.2741 168.8315,-219.5011"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="171.5668,-221.6859 175.2747,-211.761 166.1869,-217.2074 171.5668,-221.6859"/>
+            <text text-anchor="middle" x="166.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- traver&#45;&gt;trave -->
+        <g id="edge30" class="edge">
+            <title>traver&#45;&gt;trave</title>
+            <path fill="none" stroke="#ffc0cb" d="M190.328,-213.8298C189.2877,-219.4957 188.3103,-225.7815 187.7937,-231.6 187.0912,-239.5125 187.0294,-248.1157 187.2551,-256.0363"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="187.7079,-266.2849 182.7708,-256.4934 187.4871,-261.2898 187.2664,-256.2947 187.2664,-256.2947 187.2664,-256.2947 187.4871,-261.2898 191.762,-256.096 187.7079,-266.2849 187.7079,-266.2849"/>
+            <text text-anchor="middle" x="208.4585" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BA</text>
+        </g>
+        <!-- travers -->
+        <g id="node17" class="node">
+            <title>travers</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-106.8" rx="38.305" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BB</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge22" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#add8e6" d="M219.6485,-181.4864C227.3996,-175.7839 234.9896,-168.4371 239.2369,-159.6 244.2682,-149.1317 239.1644,-138.3954 231.3732,-129.361"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="224.3601,-122.2156 234.5764,-126.2003 227.8624,-125.784 231.3648,-129.3524 231.3648,-129.3524 231.3648,-129.3524 227.8624,-125.784 228.1532,-132.5046 224.3601,-122.2156 224.3601,-122.2156"/>
+            <text text-anchor="middle" x="245.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#0000ff"> s</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge26" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#0000ff" d="M177.7403,-179.2722C169.4246,-168.9981 162.0986,-155.3524 167.2923,-142.8 169.2677,-138.0258 172.2173,-133.5536 175.5926,-129.4948"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="178.1364,-131.8988 182.4642,-122.2282 173.0503,-127.0892 178.1364,-131.8988"/>
+            <text text-anchor="middle" x="172.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> s</text>
+        </g>
+        <!-- travers&#45;&gt;traver -->
+        <g id="edge29" class="edge">
+            <title>travers&#45;&gt;traver</title>
+            <path fill="none" stroke="#ffc0cb" d="M198.6734,-125.0051C197.4382,-130.6699 196.2533,-136.9614 195.5637,-142.8 194.632,-150.6883 194.1789,-159.2838 193.9873,-167.2046"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="193.8675,-177.4577 189.4847,-167.4058 193.9259,-172.4581 193.9844,-167.4584 193.9844,-167.4584 193.9844,-167.4584 193.9259,-172.4581 198.4841,-167.511 193.8675,-177.4577 193.8675,-177.4577"/>
+            <text text-anchor="middle" x="216.0735" y="-147" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BB</text>
+        </g>
+        <!-- traverse -->
+        <g id="node18" class="node">
+            <title>traverse</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#0000ff">contentArray[3]</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge23" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#add8e6" d="M184.8676,-90.9569C179.5135,-85.2132 174.3474,-78.2805 171.5223,-70.8 168.8843,-63.8149 168.8843,-60.9851 171.5223,-54 172.8668,-50.4399 174.7415,-47.004 176.9144,-43.7612"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="183.203,-35.676 180.6155,-46.3323 180.1332,-39.6227 177.0634,-43.5695 177.0634,-43.5695 177.0634,-43.5695 180.1332,-39.6227 173.5114,-40.8067 183.203,-35.676 183.203,-35.676"/>
+            <text text-anchor="middle" x="177.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge27" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#0000ff" d="M203.2369,-88.4006C203.2369,-76.2949 203.2369,-60.2076 203.2369,-46.4674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="206.737,-46.072 203.2369,-36.072 199.737,-46.0721 206.737,-46.072"/>
+            <text text-anchor="middle" x="208.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- traverse&#45;&gt;travers -->
+        <g id="edge28" class="edge">
+            <title>traverse&#45;&gt;travers</title>
+            <path fill="none" stroke="#ffc0cb" d="M209.9024,-36.0722C211.6782,-41.7317 213.3487,-48.0535 214.2369,-54 215.3399,-61.3847 215.3399,-63.4153 214.2369,-70.8 213.8344,-73.4945 213.2713,-76.2661 212.6106,-79.0181"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="209.9024,-88.7278 208.2546,-77.8864 211.2458,-83.9116 212.5891,-79.0954 212.5891,-79.0954 212.5891,-79.0954 211.2458,-83.9116 216.9237,-80.3044 209.9024,-88.7278 209.9024,-88.7278"/>
+            <text text-anchor="middle" x="224.2732" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#ff0000"> ~3</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m3.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m3.svg
new file mode 100644
index 000000000000..e71114f89276
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.m3.svg
@@ -0,0 +1,500 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"; color = "lightgrey"; fontcolor = lightgray]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tra [label = " a"; color = "lightgrey"; fontcolor = lightgray]
+    tra -> trac [label = " c"; color = "lightgrey"; fontcolor = lightgray]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+        node [color = "blue"; fontcolor="blue"]
+
+        start -> root
+
+        root -> t [label = " t"]
+        t -> tr [label = " r"]
+        tr -> tra [label = " a"]
+        tra -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+
+        tra2 [label = "Sparse\n0x0DE"]
+        trav [label = "Chain\n0x0B8"]
+        trave [label = "0x0B9"]
+        traver [label = "0x0BA"]
+        travers [label = "0x0BB"]
+        traverse [label = "contentArray[3]"]
+    }
+
+    {rank=same tra -> tra2 -> tre -> tri [style=invis]}
+    {rank=same trac -> trav -> tree -> trie [style=invis]}
+
+    {
+        edge [color = "blue"]
+        tr -> tra2 [label = " a"]
+        tra2 -> trac [label = " c"]
+        tra2 -> trav [label = " v"]
+        trav -> trave [label = " e"]
+        trave -> traver [label = " r"]
+        traver -> travers [label = " s"]
+        travers -> traverse [label = " e"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="red"; arrowhead="vee"; constrain="false"]
+
+        traverse -> travers [label = " ~3"]
+        travers -> traver [label = "0x0BB"]
+        traver -> trave [label = "0x0BA"]
+        trave -> trav [label = "0x0B9"]
+        trav -> tra2 [label = "0x0B8"]
+        tra2 -> tr [label = "0x0DE"]
+        tr -> t [label = "0x07E"]
+        t -> root [label = "0x09B"]
+        root -> start [label = "0x09A"]
+    }
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="611pt" height="846pt"
+     viewBox="0.00 0.00 611.47 845.73" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 841.7251)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-841.7251 607.4738,-841.7251 607.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="230.2369" cy="-808.3095" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="230.2369" y="-812.5095" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="230.2369" y="-795.7095" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="363.2369" cy="-808.3095" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="363.2369" y="-804.1095" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- root&#45;&gt;start -->
+        <g id="edge43" class="edge">
+            <title>root&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M263.7984,-808.3095C277.3037,-808.3095 293.2339,-808.3095 308.3398,-808.3095"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="318.6261,-808.3095 308.6262,-812.8096 313.6261,-808.3095 308.6261,-808.3096 308.6261,-808.3096 308.6261,-808.3096 313.6261,-808.3095 308.6261,-803.8096 318.6261,-808.3095 318.6261,-808.3095"/>
+            <text text-anchor="middle" x="291.2509" y="-815.5095" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09A</text>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="230.2369" cy="-708.0939" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="230.2369" y="-703.8939" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge2" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M210.9564,-784.2612C203.7393,-772.3485 198.2768,-757.6462 202.8463,-744.0939 204.1636,-740.1869 206.0926,-736.3684 208.3269,-732.7663"/>
+            <polygon fill="#000000" stroke="#000000" points="211.3844,-734.5068 214.3357,-724.3313 205.683,-730.4454 211.3844,-734.5068"/>
+            <text text-anchor="middle" x="206.9322" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge14" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M230.2369,-778.7835C230.2369,-765.2984 230.2369,-749.4062 230.2369,-736.1092"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="230.2369,-726.0972 234.737,-736.0971 230.2369,-731.0972 230.237,-736.0972 230.237,-736.0972 230.237,-736.0972 230.2369,-731.0972 225.737,-736.0972 230.2369,-726.0972 230.2369,-726.0972"/>
+            <text text-anchor="middle" x="233.9322" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge13" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M319.4042,-805.6333C307.2938,-805.1818 294.1479,-804.9584 282.0184,-805.3095 279.3264,-805.3874 276.5568,-805.4881 273.7643,-805.6051"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="263.6504,-806.089 273.4239,-801.1161 268.6447,-805.85 273.639,-805.611 273.639,-805.611 273.639,-805.611 268.6447,-805.85 273.8541,-810.1058 263.6504,-806.089 263.6504,-806.089"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="74.2369" cy="-195.6" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="74.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge6" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M74.0297,-177.2006C73.8934,-165.0949 73.7122,-149.0076 73.5575,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="77.0529,-134.8319 73.4404,-124.872 70.0533,-134.9108 77.0529,-134.8319"/>
+            <text text-anchor="middle" x="78.3172" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="78.2369" cy="-284.4" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="78.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge5" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M77.4081,-266.0006C76.8628,-253.8949 76.1381,-237.8076 75.5192,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="78.9975,-223.5044 75.0509,-213.672 72.0046,-223.8194 78.9975,-223.5044"/>
+            <text text-anchor="middle" x="82.4869" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="94.2369" cy="-384.6156" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="94.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge4" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M91.3022,-366.2345C88.9115,-351.2603 85.4955,-329.8643 82.756,-312.7055"/>
+            <polygon fill="#000000" stroke="#000000" points="86.1858,-311.9877 81.1529,-302.6646 79.2734,-313.0913 86.1858,-311.9877"/>
+            <text text-anchor="middle" x="89.9322" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- trav -->
+        <g id="node14" class="node">
+            <title>trav</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="202.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#0000ff">Chain</text>
+            <text text-anchor="middle" x="202.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B8</text>
+        </g>
+        <!-- trac&#45;&gt;trav -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#d3d3d3" cx="124.2369" cy="-496.2469" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="124.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#d3d3d3">Chain</text>
+            <text text-anchor="middle" x="124.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#d3d3d3">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge8" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#d3d3d3" d="M108.792,-469.6104C105.5657,-463.008 102.5364,-455.818 100.5223,-448.8313 97.2068,-437.3303 95.5641,-424.2429 94.7752,-412.8969"/>
+            <polygon fill="#d3d3d3" stroke="#d3d3d3" points="98.2581,-412.4701 94.2539,-402.6611 91.2672,-412.8262 98.2581,-412.4701"/>
+            <text text-anchor="middle" x="106.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#d3d3d3"> c</text>
+        </g>
+        <!-- tra&#45;&gt;trav -->
+        <g id="edge17" class="edge">
+            <title>tra&#45;&gt;trav</title>
+            <path fill="none" stroke="#add8e6" d="M124.8024,-466.7151C126.1832,-455.1695 129.2555,-442.2712 135.7369,-432.0313 142.2539,-421.7351 151.8894,-413.0083 161.8152,-405.9576"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="170.1591,-400.4315 164.3065,-409.7051 165.9904,-403.1924 161.8218,-405.9533 161.8218,-405.9533 161.8218,-405.9533 165.9904,-403.1924 159.337,-402.2015 170.1591,-400.4315 170.1591,-400.4315"/>
+            <text text-anchor="middle" x="141.4869" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> v</text>
+        </g>
+        <!-- tra2 -->
+        <g id="node19" class="node">
+            <title>tra2</title>
+            <ellipse fill="none" stroke="#0000ff" cx="235.2369" cy="-496.2469" rx="39.2145" ry="29.3315"/>
+            <text text-anchor="middle" x="235.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#0000ff">Sparse</text>
+            <text text-anchor="middle" x="235.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0DE</text>
+        </g>
+        <!-- tra&#45;&gt;tra2 -->
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="348.2369" cy="-384.6156" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="348.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="530.2369" cy="-384.6156" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="530.2369" y="-380.4156" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="348.2369" cy="-496.2469" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="348.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="348.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge10" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#000000" d="M348.2369,-466.6249C348.2369,-449.9873 348.2369,-429.334 348.2369,-412.9163"/>
+            <polygon fill="#000000" stroke="#000000" points="351.737,-412.8239 348.2369,-402.8239 344.737,-412.8239 351.737,-412.8239"/>
+            <text text-anchor="middle" x="353.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="459.2369" cy="-496.2469" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="459.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="459.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tre&#45;&gt;tri -->
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge12" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M476.0629,-469.7919C487.3072,-452.1128 502.0457,-428.9399 513.3106,-411.2283"/>
+            <polygon fill="#000000" stroke="#000000" points="516.4345,-412.8384 518.848,-402.5221 510.5279,-409.0817 516.4345,-412.8384"/>
+            <text text-anchor="middle" x="503.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="230.2369" cy="-607.8782" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="230.2369" y="-612.0782" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="230.2369" y="-595.2782" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge7" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#d3d3d3" d="M195.7438,-595.1807C178.8708,-587.4308 159.461,-575.9889 146.5223,-560.4626 140.4195,-553.1393 135.9491,-544.0436 132.6887,-535.0233"/>
+            <polygon fill="#d3d3d3" stroke="#d3d3d3" points="135.9777,-533.8168 129.6057,-525.3523 129.3084,-535.943 135.9777,-533.8168"/>
+            <text text-anchor="middle" x="152.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#d3d3d3"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge16" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M207.757,-584.2041C191.9107,-567.5159 170.5426,-545.0127 153.387,-526.9457"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="146.3209,-519.5041 156.4699,-523.6571 149.7638,-523.13 153.2067,-526.7558 153.2067,-526.7558 153.2067,-526.7558 149.7638,-523.13 149.9435,-529.8544 146.3209,-519.5041 146.3209,-519.5041"/>
+            <text text-anchor="middle" x="189.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge9" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M254.4007,-585.0186C272.3773,-568.0122 297.076,-544.6465 316.6218,-526.1556"/>
+            <polygon fill="#000000" stroke="#000000" points="319.2463,-528.4909 324.1054,-519.076 314.4357,-523.4058 319.2463,-528.4909"/>
+            <text text-anchor="middle" x="302.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge11" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M262.5613,-592.1209C303.3074,-572.2583 373.3978,-538.0911 417.9723,-516.3623"/>
+            <polygon fill="#000000" stroke="#000000" points="419.6217,-519.452 427.0769,-511.924 416.5544,-513.1598 419.6217,-519.452"/>
+            <text text-anchor="middle" x="361.9322" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- tr&#45;&gt;t -->
+        <g id="edge41" class="edge">
+            <title>tr&#45;&gt;t</title>
+            <path fill="none" stroke="#ffc0cb" d="M228.1399,-637.287C227.605,-648.2159 227.2816,-660.7205 227.6843,-672.0939 227.7703,-674.5242 227.889,-677.0451 228.0283,-679.5733"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="228.6875,-689.7931 223.5531,-680.1035 228.3656,-684.8034 228.0438,-679.8138 228.0438,-679.8138 228.0438,-679.8138 228.3656,-684.8034 232.5344,-679.5241 228.6875,-689.7931 228.6875,-689.7931"/>
+            <text text-anchor="middle" x="246.5132" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra2 -->
+        <g id="edge28" class="edge">
+            <title>tr&#45;&gt;tra2</title>
+            <path fill="none" stroke="#0000ff" d="M218.5572,-579.8196C215.264,-568.6049 213.1491,-555.5518 215.5223,-543.6626 216.1971,-540.2816 217.118,-536.8402 218.1951,-533.4336"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="221.4953,-534.5993 221.57,-524.0047 214.9047,-532.2403 221.4953,-534.5993"/>
+            <text text-anchor="middle" x="221.0942" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- t&#45;&gt;root -->
+        <g id="edge42" class="edge">
+            <title>t&#45;&gt;root</title>
+            <path fill="none" stroke="#ffc0cb" d="M235.0886,-726.2786C236.3802,-731.9426 237.5942,-738.2388 238.2369,-744.0939 239.1247,-752.1817 238.87,-760.8327 238.048,-769.081"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="236.7469,-779.2407 233.5537,-768.7501 237.3821,-774.2812 238.0173,-769.3217 238.0173,-769.3217 238.0173,-769.3217 237.3821,-774.2812 242.4808,-769.8934 236.7469,-779.2407 236.7469,-779.2407"/>
+            <text text-anchor="middle" x="257.9052" y="-748.2939" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09B</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge3" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M251.7729,-693.2308C258.3244,-687.4682 264.7179,-680.2751 268.2369,-672.0939 271.1872,-665.2348 270.7115,-662.3385 268.2369,-655.2939 266.2659,-649.6828 263.3202,-644.2436 259.9247,-639.1727"/>
+            <polygon fill="#000000" stroke="#000000" points="262.6469,-636.9677 253.8782,-631.0211 257.0247,-641.138 262.6469,-636.9677"/>
+            <text text-anchor="middle" x="275.3172" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge15" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M213.324,-692.0138C208.3995,-686.2543 203.6543,-679.3722 201.0763,-672.0939 197.4827,-661.9481 199.7954,-651.1983 204.382,-641.4556"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="209.3576,-632.4758 208.4471,-643.4038 206.9342,-636.8493 204.5109,-641.2228 204.5109,-641.2228 204.5109,-641.2228 206.9342,-636.8493 200.5747,-639.0418 209.3576,-632.4758 209.3576,-632.4758"/>
+            <text text-anchor="middle" x="205.3172" y="-659.4939" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trav&#45;&gt;tree -->
+        <!-- trave -->
+        <g id="node15" class="node">
+            <title>trave</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-284.4" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0B9</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge18" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#add8e6" d="M181.4692,-359.4476C176.9833,-352.6281 172.8815,-344.9707 170.5223,-337.2 168.3531,-330.0554 167.8843,-327.3851 170.5223,-320.4 172.1555,-316.0753 174.5712,-311.9338 177.3533,-308.0992"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="183.8676,-300.2431 180.9485,-310.8134 180.6761,-304.0921 177.4845,-307.941 177.4845,-307.941 177.4845,-307.941 180.6761,-304.0921 174.0205,-305.0686 183.8676,-300.2431 183.8676,-300.2431"/>
+            <text text-anchor="middle" x="176.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- trav&#45;&gt;trave -->
+        <g id="edge31" class="edge">
+            <title>trav&#45;&gt;trave</title>
+            <path fill="none" stroke="#0000ff" d="M202.2369,-355.0897C202.2369,-341.6046 202.2369,-325.7123 202.2369,-312.4153"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="205.737,-312.4033 202.2369,-302.4033 198.737,-312.4034 205.737,-312.4033"/>
+            <text text-anchor="middle" x="207.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- trav&#45;&gt;tra2 -->
+        <g id="edge39" class="edge">
+            <title>trav&#45;&gt;tra2</title>
+            <path fill="none" stroke="#ffc0cb" d="M210.8211,-413.654C214.805,-427.1305 219.5957,-443.3363 223.8744,-457.8102"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="226.7386,-467.4991 219.5882,-459.1851 225.3211,-462.7042 223.9036,-457.9093 223.9036,-457.9093 223.9036,-457.9093 225.3211,-462.7042 228.219,-456.6336 226.7386,-467.4991 226.7386,-467.4991"/>
+            <text text-anchor="middle" x="239.9052" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B8</text>
+        </g>
+        <!-- trave&#45;&gt;trav -->
+        <g id="edge38" class="edge">
+            <title>trave&#45;&gt;trav</title>
+            <path fill="none" stroke="#ffc0cb" d="M208.9024,-302.4722C210.6782,-308.1317 212.3487,-314.4535 213.2369,-320.4 214.4724,-328.6713 214.0766,-337.5213 212.8888,-345.926"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="211.1036,-355.888 208.4382,-345.2509 211.9856,-350.9664 212.8676,-346.0448 212.8676,-346.0448 212.8676,-346.0448 211.9856,-350.9664 217.297,-346.8386 211.1036,-355.888 211.1036,-355.888"/>
+            <text text-anchor="middle" x="232.9052" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0B9</text>
+        </g>
+        <!-- traver -->
+        <g id="node16" class="node">
+            <title>traver</title>
+            <ellipse fill="none" stroke="#0000ff" cx="202.2369" cy="-195.6" rx="38.8671" ry="18"/>
+            <text text-anchor="middle" x="202.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BA</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge19" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#add8e6" d="M185.324,-268.32C180.3995,-262.5605 175.6543,-255.6784 173.0763,-248.4 170.5834,-241.3618 170.5834,-238.6382 173.0763,-231.6 174.4962,-227.5912 176.5735,-223.7026 178.9762,-220.0556"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="185.0164,-212.0415 182.5911,-222.7358 182.007,-216.0344 178.9975,-220.0273 178.9975,-220.0273 178.9975,-220.0273 182.007,-216.0344 175.4039,-217.3188 185.0164,-212.0415 185.0164,-212.0415"/>
+            <text text-anchor="middle" x="177.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- trave&#45;&gt;traver -->
+        <g id="edge32" class="edge">
+            <title>trave&#45;&gt;traver</title>
+            <path fill="none" stroke="#0000ff" d="M202.2369,-266.0006C202.2369,-253.8949 202.2369,-237.8076 202.2369,-224.0674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="205.737,-223.672 202.2369,-213.672 198.737,-223.6721 205.737,-223.672"/>
+            <text text-anchor="middle" x="206.3172" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- traver&#45;&gt;trave -->
+        <g id="edge37" class="edge">
+            <title>traver&#45;&gt;trave</title>
+            <path fill="none" stroke="#ffc0cb" d="M207.6937,-213.7509C209.1467,-219.4136 210.5127,-225.7174 211.2369,-231.6 212.1492,-239.0107 212.1492,-240.9893 211.2369,-248.4 210.9201,-250.9736 210.4804,-253.6279 209.9643,-256.2738"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="207.6937,-266.2491 205.5255,-255.4997 208.8035,-261.3738 209.9132,-256.4985 209.9132,-256.4985 209.9132,-256.4985 208.8035,-261.3738 214.301,-257.4973 207.6937,-266.2491 207.6937,-266.2491"/>
+            <text text-anchor="middle" x="232.4585" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BA</text>
+        </g>
+        <!-- travers -->
+        <g id="node17" class="node">
+            <title>travers</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-106.8" rx="38.305" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#0000ff">0x0BB</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge20" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#add8e6" d="M183.9854,-179.6658C174.6965,-169.5397 166.3137,-155.9116 171.2923,-142.8 172.9385,-138.4645 175.3723,-134.3169 178.1749,-130.4793"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="184.7367,-122.6213 181.7812,-133.1814 181.5319,-126.4592 178.3271,-130.2971 178.3271,-130.2971 178.3271,-130.2971 181.5319,-126.4592 174.873,-127.4128 184.7367,-122.6213 184.7367,-122.6213"/>
+            <text text-anchor="middle" x="176.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#0000ff"> s</text>
+        </g>
+        <!-- traver&#45;&gt;travers -->
+        <g id="edge33" class="edge">
+            <title>traver&#45;&gt;travers</title>
+            <path fill="none" stroke="#0000ff" d="M202.4441,-177.2006C202.5804,-165.0949 202.7616,-149.0076 202.9163,-135.2674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="206.4205,-134.9108 203.0334,-124.872 199.4209,-134.8319 206.4205,-134.9108"/>
+            <text text-anchor="middle" x="207.7092" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> s</text>
+        </g>
+        <!-- travers&#45;&gt;traver -->
+        <g id="edge36" class="edge">
+            <title>travers&#45;&gt;traver</title>
+            <path fill="none" stroke="#ffc0cb" d="M209.0666,-124.8376C211.6638,-135.0186 213.8193,-147.9908 212.2369,-159.6 211.8717,-162.2793 211.3602,-165.0407 210.7599,-167.7863"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="208.2983,-177.4866 206.3963,-166.6869 209.5282,-172.6402 210.758,-167.7938 210.758,-167.7938 210.758,-167.7938 209.5282,-172.6402 215.1198,-168.9007 208.2983,-177.4866 208.2983,-177.4866"/>
+            <text text-anchor="middle" x="233.0735" y="-147" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BB</text>
+        </g>
+        <!-- traverse -->
+        <g id="node18" class="node">
+            <title>traverse</title>
+            <ellipse fill="none" stroke="#0000ff" cx="203.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="203.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#0000ff">contentArray[3]</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge21" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#add8e6" d="M184.8676,-90.9569C179.5135,-85.2132 174.3474,-78.2805 171.5223,-70.8 168.8843,-63.8149 168.8843,-60.9851 171.5223,-54 172.8668,-50.4399 174.7415,-47.004 176.9144,-43.7612"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="183.203,-35.676 180.6155,-46.3323 180.1332,-39.6227 177.0634,-43.5695 177.0634,-43.5695 177.0634,-43.5695 180.1332,-39.6227 173.5114,-40.8067 183.203,-35.676 183.203,-35.676"/>
+            <text text-anchor="middle" x="177.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- travers&#45;&gt;traverse -->
+        <g id="edge34" class="edge">
+            <title>travers&#45;&gt;traverse</title>
+            <path fill="none" stroke="#0000ff" d="M203.2369,-88.4006C203.2369,-76.2949 203.2369,-60.2076 203.2369,-46.4674"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="206.737,-46.072 203.2369,-36.072 199.737,-46.0721 206.737,-46.072"/>
+            <text text-anchor="middle" x="208.0942" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- traverse&#45;&gt;travers -->
+        <g id="edge35" class="edge">
+            <title>traverse&#45;&gt;travers</title>
+            <path fill="none" stroke="#ffc0cb" d="M209.9024,-36.0722C211.6782,-41.7317 213.3487,-48.0535 214.2369,-54 215.3399,-61.3847 215.3399,-63.4153 214.2369,-70.8 213.8344,-73.4945 213.2713,-76.2661 212.6106,-79.0181"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="209.9024,-88.7278 208.2546,-77.8864 211.2458,-83.9116 212.5891,-79.0954 212.5891,-79.0954 212.5891,-79.0954 211.2458,-83.9116 216.9237,-80.3044 209.9024,-88.7278 209.9024,-88.7278"/>
+            <text text-anchor="middle" x="224.2732" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#ff0000"> ~3</text>
+        </g>
+        <!-- tra2&#45;&gt;trac -->
+        <g id="edge29" class="edge">
+            <title>tra2&#45;&gt;trac</title>
+            <path fill="none" stroke="#0000ff" d="M208.0507,-474.7233C183.1741,-455.0282 146.5501,-426.0326 121.6137,-406.2901"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="123.6518,-403.4396 113.6389,-399.9764 119.3067,-408.9278 123.6518,-403.4396"/>
+            <text text-anchor="middle" x="178.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- tra2&#45;&gt;tre -->
+        <!-- tra2&#45;&gt;tr -->
+        <g id="edge40" class="edge">
+            <title>tra2&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M233.9039,-526.0074C233.3268,-538.8926 232.6427,-554.1663 232.0226,-568.0109"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="231.5637,-578.2562 227.5157,-568.0648 231.7874,-573.2612 232.0112,-568.2662 232.0112,-568.2662 232.0112,-568.2662 231.7874,-573.2612 236.5067,-568.4676 231.5637,-578.2562 231.5637,-578.2562"/>
+            <text text-anchor="middle" x="253.0665" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0DE</text>
+        </g>
+        <!-- tra2&#45;&gt;trav -->
+        <g id="edge30" class="edge">
+            <title>tra2&#45;&gt;trav</title>
+            <path fill="none" stroke="#0000ff" d="M214.7335,-471.0228C210.2756,-464.2044 206.1695,-456.5615 203.7369,-448.8313 201.3127,-441.1275 200.14,-432.6433 199.7048,-424.4366"/>
+            <polygon fill="#0000ff" stroke="#0000ff" points="203.201,-424.2005 199.5092,-414.2697 196.2023,-424.3352 203.201,-424.2005"/>
+            <text text-anchor="middle" x="209.4869" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> v</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.p1.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.p1.svg
new file mode 100644
index 000000000000..c5268e71d64f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.p1.svg
@@ -0,0 +1,405 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+digraph G {
+    { rank=same root -> start [style=invis] }
+    newrank = true
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"; color = "lightgrey"; fontcolor = lightgray]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"; color = "lightgrey"; fontcolor = lightgray]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    {
+        ranksep = 0.1
+        tree2 [label = "Chain\n0x0BB"]
+        tree2p [label = "Prefix\n0x0BF\ncontentArray[1]"]
+        tree2p -> tree2 [label = " &epsilon;"]
+    }
+
+    tre -> tree2p [label = " e"]
+    tree2 -> trees [label = " s"]
+
+    {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> tree2p -> trie [style=invis]}
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+        node [color = "lightblue"; fontcolor="blue"]
+
+        start -> root
+
+        root -> t [label = " t"]
+        t -> tr [label = " r"]
+        tr -> tre [label = " e"]
+        tre -> tree [label = " e"]
+        tree -> trees [label = " s"]
+
+        trees [label = "contentArray[3]"; constraint = false]
+    }
+
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="red"; arrowhead="vee"; constraint="false"]
+
+        trees -> tree2 [label = " ~3"]
+        tree2 -> tree2p [label = "0x0BB"]
+        tree2p -> tre [label = "0x0BF"]
+        tre -> tr [label = "0x0DE"]
+        tr -> t [label = "0x07E"]
+        t -> root [label = "0x09B"]
+        root -> start [label = "0x09A"]
+    }
+}
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="666pt" height="678pt"
+     viewBox="0.00 0.00 666.47 677.52" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 673.5152)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-673.5152 662.4738,-673.5152 662.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="264.2369" cy="-640.0996" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="264.2369" y="-644.2996" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="264.2369" y="-627.4996" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="397.2369" cy="-640.0996" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="397.2369" y="-635.8996" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- root&#45;&gt;start -->
+        <g id="edge33" class="edge">
+            <title>root&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M297.7984,-640.0996C311.3037,-640.0996 327.2339,-640.0996 342.3398,-640.0996"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="352.6261,-640.0996 342.6262,-644.5997 347.6261,-640.0996 342.6261,-640.0997 342.6261,-640.0997 342.6261,-640.0997 347.6261,-640.0996 342.6261,-635.5997 352.6261,-640.0996 352.6261,-640.0996"/>
+            <text text-anchor="middle" x="325.2509" y="-647.2996" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09A</text>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="264.2369" cy="-557.8839" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="264.2369" y="-553.6839" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge2" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M253.1918,-612.3067C251.4263,-605.2505 250.368,-597.5734 252.6036,-585.3125"/>
+            <polygon fill="#000000" stroke="#000000" points="256.0259,-586.0473 254.8506,-575.5179 249.2031,-584.4821 256.0259,-586.0473"/>
+            <text text-anchor="middle" x="255.9322" y="-589.0839" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge22" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M264.2369,-610.4178C264.2369,-602.5756 264.2369,-594.1307 264.2369,-586.3542"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="264.2369,-576.2835 268.737,-586.2835 264.2369,-581.2835 264.237,-586.2835 264.237,-586.2835 264.237,-586.2835 264.2369,-581.2835 259.737,-586.2836 264.2369,-576.2835 264.2369,-576.2835"/>
+            <text text-anchor="middle" x="267.9322" y="-589.0839" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge21" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M353.4042,-637.4233C341.2938,-636.9719 328.1479,-636.7485 316.0184,-637.0996 313.3264,-637.1775 310.5568,-637.2781 307.7643,-637.3951"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="297.6504,-637.8791 307.4239,-632.9062 302.6447,-637.6401 307.639,-637.401 307.639,-637.401 307.639,-637.401 302.6447,-637.6401 307.8541,-641.8959 297.6504,-637.8791 297.6504,-637.8791"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-88.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-84.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge8" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-70.5672C73.2369,-63.2743 73.2369,-54.6987 73.2369,-46.6137"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-46.417 73.2369,-36.417 69.737,-46.4171 76.737,-46.417"/>
+            <text text-anchor="middle" x="77.3172" y="-49.2" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-171.0156" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-166.8156" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge7" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-152.7693C73.2369,-142.338 73.2369,-129.027 73.2369,-117.2514"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-116.9953 73.2369,-106.9953 69.737,-116.9953 76.737,-116.9953"/>
+            <text text-anchor="middle" x="78.4869" y="-120" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-276.5263" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-272.3263" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge6" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-258.1478C73.2369,-241.8835 73.2369,-217.8982 73.2369,-199.2081"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-199.1836 73.2369,-189.1836 69.737,-199.1836 76.737,-199.1836"/>
+            <text text-anchor="middle" x="76.9322" y="-213.6313" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#d3d3d3" cx="217.2369" cy="-276.5263" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="217.2369" y="-272.3263" font-family="Times,serif" font-size="14.00" fill="#d3d3d3">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="141.2369" cy="-382.037" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="141.2369" y="-386.237" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="141.2369" y="-369.437" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge5" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#000000" d="M124.4279,-355.9557C113.8753,-339.582 100.3244,-318.5561 89.8158,-302.2506"/>
+            <polygon fill="#000000" stroke="#000000" points="92.6883,-300.2467 84.329,-293.7372 86.8044,-304.0388 92.6883,-300.2467"/>
+            <text text-anchor="middle" x="121.0942" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="250.2369" cy="-382.037" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="250.2369" y="-386.237" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="250.2369" y="-369.437" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tra&#45;&gt;tre -->
+        <!-- tree2p -->
+        <g id="node15" class="node">
+            <title>tree2p</title>
+            <ellipse fill="none" stroke="#000000" cx="401.2369" cy="-276.5263" rx="75.1528" ry="41.0911"/>
+            <text text-anchor="middle" x="401.2369" y="-289.1263" font-family="Times,serif" font-size="14.00" fill="#000000">Prefix</text>
+            <text text-anchor="middle" x="401.2369" y="-272.3263" font-family="Times,serif" font-size="14.00" fill="#000000">0x0BF</text>
+            <text text-anchor="middle" x="401.2369" y="-255.5263" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- tree&#45;&gt;tree2p -->
+        <!-- trees -->
+        <g id="node16" class="node">
+            <title>trees</title>
+            <ellipse fill="none" stroke="#000000" cx="295.2369" cy="-88.8" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="295.2369" y="-84.6" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[3]</text>
+        </g>
+        <!-- tree&#45;&gt;trees -->
+        <g id="edge26" class="edge">
+            <title>tree&#45;&gt;trees</title>
+            <path fill="none" stroke="#add8e6" d="M224.7159,-258.5263C238.4018,-225.5876 267.5561,-155.4207 283.7655,-116.4088"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="287.7806,-106.7455 288.0991,-117.7068 285.8621,-111.3628 283.9435,-115.9801 283.9435,-115.9801 283.9435,-115.9801 285.8621,-111.3628 279.788,-114.2534 287.7806,-106.7455 287.7806,-106.7455"/>
+            <text text-anchor="middle" x="277.7092" y="-166.8156" font-family="Times,serif" font-size="14.00" fill="#0000ff"> s</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge10" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#d3d3d3" d="M227.0349,-358.8106C223.5725,-354.1203 220.5044,-348.9742 218.5223,-343.6214 214.0163,-331.453 213.1878,-317.1731 213.6897,-304.9221"/>
+            <polygon fill="#d3d3d3" stroke="#d3d3d3" points="217.1925,-305.0065 214.4144,-294.7824 210.2104,-304.5074 217.1925,-305.0065"/>
+            <text text-anchor="middle" x="224.0942" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#d3d3d3"> e</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge25" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#add8e6" d="M241.2195,-353.2057C236.4749,-338.0358 230.683,-319.5176 225.9798,-304.4799"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="222.9239,-294.7093 230.2038,-302.9101 224.4164,-299.4813 225.909,-304.2534 225.909,-304.2534 225.909,-304.2534 224.4164,-299.4813 221.6142,-305.5967 222.9239,-294.7093 222.9239,-294.7093"/>
+            <text text-anchor="middle" x="243.0942" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="361.2369" cy="-382.037" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="361.2369" y="-386.237" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="361.2369" y="-369.437" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tre&#45;&gt;tri -->
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="264.2369" cy="-475.6683" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="264.2369" y="-479.8683" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="264.2369" y="-463.0683" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tre&#45;&gt;tr -->
+        <g id="edge30" class="edge">
+            <title>tre&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M220.1451,-399.8185C206.9606,-410.0222 196.0135,-423.5087 203.5777,-437.2526 207.8544,-445.0234 214.4401,-451.4422 221.7208,-456.6604"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="230.4902,-462.2435 219.638,-460.6689 226.2725,-459.5582 222.0547,-456.8729 222.0547,-456.8729 222.0547,-456.8729 226.2725,-459.5582 224.4715,-453.0769 230.4902,-462.2435 230.4902,-462.2435"/>
+            <text text-anchor="middle" x="224.0665" y="-424.6526" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0DE</text>
+        </g>
+        <!-- tre&#45;&gt;tree2p -->
+        <g id="edge14" class="edge">
+            <title>tre&#45;&gt;tree2p</title>
+            <path fill="none" stroke="#000000" d="M281.9441,-365.9135C292.0842,-361.18 303.4736,-356.2938 314.2369,-352.6214 331.0728,-346.877 338.4386,-353.4933 353.2369,-343.6214 360.9367,-338.4848 367.8078,-331.6659 373.7781,-324.3413"/>
+            <polygon fill="#000000" stroke="#000000" points="376.657,-326.3371 379.8994,-316.2506 371.0746,-322.1136 376.657,-326.3371"/>
+            <text text-anchor="middle" x="377.0942" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="585.2369" cy="-276.5263" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="585.2369" y="-272.3263" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge12" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M393.8084,-366.6948C432.9219,-348.2712 498.6628,-317.3053 541.999,-296.8927"/>
+            <polygon fill="#000000" stroke="#000000" points="543.6019,-300.0065 551.1571,-292.5789 540.619,-293.6739 543.6019,-300.0065"/>
+            <text text-anchor="middle" x="478.0942" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge4" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#000000" d="M230.3862,-462.3473C216.3498,-455.9318 200.378,-447.4106 187.5223,-437.2526 179.7381,-431.102 172.3346,-423.4626 165.8229,-415.8473"/>
+            <polygon fill="#000000" stroke="#000000" points="168.1714,-413.1902 159.1314,-407.6649 162.7526,-417.6217 168.1714,-413.1902"/>
+            <text text-anchor="middle" x="193.0942" y="-424.6526" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge9" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M262.3234,-446.0575C261.7354,-439.4979 260.9854,-432.4951 259.2059,-421.283"/>
+            <polygon fill="#000000" stroke="#000000" points="262.641,-420.6036 257.5414,-411.3168 255.7366,-421.7567 262.641,-420.6036"/>
+            <text text-anchor="middle" x="267.0942" y="-424.6526" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge24" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#add8e6" d="M251.4368,-447.9439C250.2405,-444.4076 249.2174,-440.7879 248.5223,-437.2526 247.53,-432.2063 247.0287,-426.8513 246.8562,-421.5383"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="246.8871,-411.4654 251.3563,-421.4792 246.8717,-416.4654 246.8563,-421.4654 246.8563,-421.4654 246.8563,-421.4654 246.8717,-416.4654 242.3563,-421.4516 246.8871,-411.4654 246.8871,-411.4654"/>
+            <text text-anchor="middle" x="254.0942" y="-424.6526" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge11" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M288.2144,-452.5235C300.8059,-440.3693 316.39,-425.3264 329.9234,-412.263"/>
+            <polygon fill="#000000" stroke="#000000" points="332.4349,-414.7033 337.199,-405.2401 327.5733,-409.6669 332.4349,-414.7033"/>
+            <text text-anchor="middle" x="325.9322" y="-424.6526" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- tr&#45;&gt;t -->
+        <g id="edge31" class="edge">
+            <title>tr&#45;&gt;t</title>
+            <path fill="none" stroke="#ffc0cb" d="M264.2369,-505.1649C264.2369,-513.0799 264.2369,-521.6204 264.2369,-529.4751"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="264.2369,-539.6376 259.737,-529.6376 264.2369,-534.6376 264.237,-529.6376 264.237,-529.6376 264.237,-529.6376 264.2369,-534.6376 268.737,-529.6377 264.2369,-539.6376 264.2369,-539.6376"/>
+            <text text-anchor="middle" x="282.5132" y="-518.2839" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x07E</text>
+        </g>
+        <!-- t&#45;&gt;root -->
+        <g id="edge32" class="edge">
+            <title>t&#45;&gt;root</title>
+            <path fill="none" stroke="#ffc0cb" d="M272.5745,-575.5942C273.6774,-578.6326 274.6336,-581.8043 275.2369,-584.8839 276.3056,-590.3394 276.3564,-596.0962 275.789,-601.7512"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="274.1862,-611.733 271.3286,-601.146 274.9789,-606.7962 275.7717,-601.8595 275.7717,-601.8595 275.7717,-601.8595 274.9789,-606.7962 280.2148,-602.573 274.1862,-611.733 274.1862,-611.733"/>
+            <text text-anchor="middle" x="294.9052" y="-589.0839" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x09B</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge3" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M254.2705,-540.2954C252.9499,-537.2423 251.8032,-534.0343 251.0763,-530.8839 249.8023,-525.3626 249.7646,-519.536 250.472,-513.8197"/>
+            <polygon fill="#000000" stroke="#000000" points="253.9589,-514.2255 252.4441,-503.7395 247.0891,-512.8814 253.9589,-514.2255"/>
+            <text text-anchor="middle" x="255.3172" y="-518.2839" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge23" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M296.3684,-548.8052C304.9495,-544.7547 313.2223,-538.9976 318.2369,-530.8839 325.8698,-518.5336 317.3847,-506.7525 305.051,-497.2687"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="296.521,-491.3854 307.3079,-493.3588 300.6369,-494.2243 304.7529,-497.0631 304.7529,-497.0631 304.7529,-497.0631 300.6369,-494.2243 302.1979,-500.7675 296.521,-491.3854 296.521,-491.3854"/>
+            <text text-anchor="middle" x="325.3172" y="-518.2839" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- tree2 -->
+        <g id="node14" class="node">
+            <title>tree2</title>
+            <ellipse fill="none" stroke="#000000" cx="354.2369" cy="-171.0156" rx="39.2342" ry="29.3315"/>
+            <text text-anchor="middle" x="354.2369" y="-175.2156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="354.2369" y="-158.4156" font-family="Times,serif" font-size="14.00" fill="#000000">0x0BB</text>
+        </g>
+        <!-- tree2&#45;&gt;tree2p -->
+        <g id="edge28" class="edge">
+            <title>tree2&#45;&gt;tree2p</title>
+            <path fill="none" stroke="#ffc0cb" d="M332.5998,-195.7225C326.8044,-205.3439 323.4112,-216.3994 328.5637,-226.2313 331.3469,-231.5422 334.9738,-236.4037 339.122,-240.8292"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="346.5383,-247.8677 336.1871,-244.2478 342.9116,-244.4257 339.2849,-240.9838 339.2849,-240.9838 339.2849,-240.9838 342.9116,-244.4257 342.3827,-237.7197 346.5383,-247.8677 346.5383,-247.8677"/>
+            <text text-anchor="middle" x="349.0735" y="-213.6313" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BB</text>
+        </g>
+        <!-- tree2&#45;&gt;trees -->
+        <g id="edge15" class="edge">
+            <title>tree2&#45;&gt;trees</title>
+            <path fill="none" stroke="#000000" d="M335.5689,-145.0021C328.6935,-135.4213 320.9064,-124.5701 314.0208,-114.9751"/>
+            <polygon fill="#000000" stroke="#000000" points="316.6766,-112.6728 308.0027,-106.5889 310.9895,-116.7541 316.6766,-112.6728"/>
+            <text text-anchor="middle" x="331.7092" y="-120" font-family="Times,serif" font-size="14.00" fill="#000000"> s</text>
+        </g>
+        <!-- tree2p&#45;&gt;tre -->
+        <g id="edge29" class="edge">
+            <title>tree2p&#45;&gt;tre</title>
+            <path fill="none" stroke="#ffc0cb" d="M346.159,-304.6442C334.2868,-311.4197 322.0324,-318.9824 311.1177,-326.8214 300.7955,-334.2347 290.2869,-343.1729 280.9708,-351.6776"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="273.4561,-358.6766 277.7068,-348.5681 277.1149,-355.2689 280.7738,-351.8611 280.7738,-351.8611 280.7738,-351.8611 277.1149,-355.2689 283.8408,-355.1541 273.4561,-358.6766 273.4561,-358.6766"/>
+            <text text-anchor="middle" x="330.2965" y="-331.0214" font-family="Times,serif" font-size="14.00" fill="#ff0000">0x0BF</text>
+        </g>
+        <!-- tree2p&#45;&gt;trie -->
+        <!-- tree2p&#45;&gt;tree2 -->
+        <g id="edge13" class="edge">
+            <title>tree2p&#45;&gt;tree2</title>
+            <path fill="none" stroke="#000000" d="M383.3341,-236.3363C379.2138,-227.0866 374.8498,-217.2898 370.7898,-208.1754"/>
+            <polygon fill="#000000" stroke="#000000" points="373.9581,-206.6863 366.6918,-198.9757 367.5638,-209.5346 373.9581,-206.6863"/>
+            <text text-anchor="middle" x="383.0672" y="-213.6313" font-family="Times,serif" font-size="14.00" fill="#000000"> ε</text>
+        </g>
+        <!-- trees&#45;&gt;tree2 -->
+        <g id="edge27" class="edge">
+            <title>trees&#45;&gt;tree2</title>
+            <path fill="none" stroke="#ffc0cb" d="M290.8236,-106.8267C289.7499,-115.1419 289.9269,-124.8606 294.1643,-132.6 298.1354,-139.8531 304.124,-145.9366 310.807,-150.9671"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="319.5438,-156.7845 308.7261,-154.9878 315.382,-154.0133 311.2202,-151.2421 311.2202,-151.2421 311.2202,-151.2421 315.382,-154.0133 313.7143,-147.4965 319.5438,-156.7845 319.5438,-156.7845"/>
+            <text text-anchor="middle" x="303.2732" y="-120" font-family="Times,serif" font-size="14.00" fill="#ff0000"> ~3</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w1.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w1.svg
new file mode 100644
index 000000000000..1be94ae23739
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w1.svg
@@ -0,0 +1,226 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+``` plantuml
+digraph G {
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    // edge [color="none", fontcolor="none"]
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="480pt" height="734pt"
+     viewBox="0.00 0.00 480.47 734.09" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 730.0939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-730.0939 476.4738,-730.0939 476.4738,4 -4,4"/>
+        <!-- tractor -->
+        <g id="node1" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node2" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge7" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-88.4006C73.2369,-76.2949 73.2369,-60.2076 73.2369,-46.4674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-46.072 73.2369,-36.072 69.737,-46.0721 76.737,-46.072"/>
+            <text text-anchor="middle" x="77.3172" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node3" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge6" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-177.2006C73.2369,-165.0949 73.2369,-149.0076 73.2369,-135.2674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-134.872 73.2369,-124.872 69.737,-134.8721 76.737,-134.872"/>
+            <text text-anchor="middle" x="78.4869" y="-147" font-family="Times,serif" font-size="14.00" fill="#000000"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node4" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge5" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#000000" d="M73.2369,-266.0006C73.2369,-253.8949 73.2369,-237.8076 73.2369,-224.0674"/>
+            <polygon fill="#000000" stroke="#000000" points="76.737,-223.672 73.2369,-213.672 69.737,-223.6721 76.737,-223.672"/>
+            <text text-anchor="middle" x="76.9322" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+        <!-- tree -->
+        <g id="node6" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="217.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node5" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="112.2369" cy="-384.6156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="112.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="112.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge4" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#000000" d="M101.2694,-356.4333C95.8306,-342.4575 89.2887,-325.6472 83.8929,-311.7821"/>
+            <polygon fill="#000000" stroke="#000000" points="87.0872,-310.3393 80.1988,-302.2895 80.5638,-312.878 87.0872,-310.3393"/>
+            <text text-anchor="middle" x="98.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> c</text>
+        </g>
+        <!-- trie -->
+        <g id="node8" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="399.2369" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="399.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre -->
+        <g id="node7" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="217.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge9" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#000000" d="M217.2369,-355.0897C217.2369,-341.6046 217.2369,-325.7123 217.2369,-312.4153"/>
+            <polygon fill="#000000" stroke="#000000" points="220.737,-312.4033 217.2369,-302.4033 213.737,-312.4034 220.737,-312.4033"/>
+            <text text-anchor="middle" x="222.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tri -->
+        <g id="node9" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="354.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="354.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="354.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge11" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#000000" d="M366.7719,-356.7C373.1109,-342.583 380.7723,-325.5209 387.0603,-311.5175"/>
+            <polygon fill="#000000" stroke="#000000" points="390.2575,-312.9415 391.1609,-302.3852 383.8717,-310.074 390.2575,-312.9415"/>
+            <text text-anchor="middle" x="386.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr -->
+        <g id="node10" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-496.2469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="217.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge3" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#000000" d="M194.7118,-472.2992C179.1047,-455.7065 158.1828,-433.4633 141.317,-415.5323"/>
+            <polygon fill="#000000" stroke="#000000" points="143.7653,-413.0267 134.3644,-408.1406 138.6664,-417.8227 143.7653,-413.0267"/>
+            <text text-anchor="middle" x="176.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge8" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#000000" d="M217.2369,-466.6249C217.2369,-453.7568 217.2369,-438.4867 217.2369,-424.6319"/>
+            <polygon fill="#000000" stroke="#000000" points="220.737,-424.3761 217.2369,-414.3761 213.737,-424.3762 220.737,-424.3761"/>
+            <text text-anchor="middle" x="222.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge10" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#000000" d="M243.328,-474.9872C264.9956,-457.3319 295.9963,-432.0716 319.7169,-412.7435"/>
+            <polygon fill="#000000" stroke="#000000" points="322.2348,-415.2066 327.7763,-406.1765 317.813,-409.78 322.2348,-415.2066"/>
+            <text text-anchor="middle" x="296.9322" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#000000"> i</text>
+        </g>
+        <!-- t -->
+        <g id="node11" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-596.4626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="217.2369" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge2" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#000000" d="M217.2369,-578.0815C217.2369,-566.3502 217.2369,-550.6774 217.2369,-536.1885"/>
+            <polygon fill="#000000" stroke="#000000" points="220.737,-535.9208 217.2369,-525.9209 213.737,-535.9209 220.737,-535.9208"/>
+            <text text-anchor="middle" x="221.3172" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#000000"> r</text>
+        </g>
+        <!-- root -->
+        <g id="node12" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-696.6782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-700.8782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="217.2369" y="-684.0782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge1" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#000000" d="M217.2369,-667.1522C217.2369,-653.6671 217.2369,-637.7749 217.2369,-624.4779"/>
+            <polygon fill="#000000" stroke="#000000" points="220.737,-624.4659 217.2369,-614.4659 213.737,-624.4659 220.737,-624.4659"/>
+            <text text-anchor="middle" x="220.9322" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#000000"> t</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w2.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w2.svg
new file mode 100644
index 000000000000..9d8ab22e69a3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w2.svg
@@ -0,0 +1,326 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; arrowhead="vee"]
+
+        start -> root
+
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="blue"; constraint="false"; arrowhead="vee"]
+
+        tractor -> tracto -> tract -> trac -> tra -> tr
+        tree -> tre -> tr
+        trie -> tri -> tr -> t -> root -> start
+    }
+}
+
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="480pt" height="734pt"
+     viewBox="0.00 0.00 480.47 734.09" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 730.0939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-730.0939 476.4738,-730.0939 476.4738,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-696.6782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-700.8782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="217.2369" y="-684.0782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="330.2369" cy="-696.6782" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="330.2369" y="-692.4782" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- root&#45;&gt;start -->
+        <g id="edge27" class="edge">
+            <title>root&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M245.0455,-713.0508C262.0796,-719.8061 279.1137,-720.5455 296.1478,-715.2689"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="305.6112,-711.7185 297.8292,-719.4444 300.9299,-713.4748 296.2485,-715.2312 296.2485,-715.2312 296.2485,-715.2312 300.9299,-713.4748 294.6678,-711.0179 305.6112,-711.7185 305.6112,-711.7185"/>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-596.4626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="217.2369" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <g id="edge5" class="edge">
+            <title>root&#45;&gt;t</title>
+            <path fill="none" stroke="#add8e6" d="M217.2369,-667.1522C217.2369,-653.6671 217.2369,-637.7749 217.2369,-624.4779"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="217.2369,-614.4659 221.737,-624.4659 217.2369,-619.4659 217.237,-624.4659 217.237,-624.4659 217.237,-624.4659 217.2369,-619.4659 212.737,-624.4659 217.2369,-614.4659 217.2369,-614.4659"/>
+            <text text-anchor="middle" x="220.9322" y="-636.6626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- start&#45;&gt;root -->
+        <g id="edge4" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M285.8549,-696.6782C277.5437,-696.6782 269.2326,-696.6782 260.9215,-696.6782"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="250.7838,-696.6782 260.7838,-692.1783 255.7838,-696.6783 260.7838,-696.6783 260.7838,-696.6783 260.7838,-696.6783 255.7838,-696.6783 260.7837,-701.1783 250.7838,-696.6782 250.7838,-696.6782"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tractor&#45;&gt;tracto -->
+        <g id="edge16" class="edge">
+            <title>tractor&#45;&gt;tracto</title>
+            <path fill="none" stroke="#ffc0cb" d="M78.6937,-36.1509C80.1467,-41.8136 81.5127,-48.1174 82.2369,-54 83.1492,-61.4107 83.1492,-63.3893 82.2369,-70.8 81.9201,-73.3736 81.4804,-76.0279 80.9643,-78.6738"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="78.6937,-88.6491 76.5255,-77.8997 79.8035,-83.7738 80.9132,-78.8985 80.9132,-78.8985 80.9132,-78.8985 79.8035,-83.7738 85.301,-79.8973 78.6937,-88.6491 78.6937,-88.6491"/>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <g id="edge11" class="edge">
+            <title>tracto&#45;&gt;tractor</title>
+            <path fill="none" stroke="#add8e6" d="M73.2369,-88.4006C73.2369,-76.2949 73.2369,-60.2076 73.2369,-46.4674"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="73.2369,-36.072 77.737,-46.072 73.2369,-41.072 73.237,-46.072 73.237,-46.072 73.237,-46.072 73.2369,-41.072 68.737,-46.0721 73.2369,-36.072 73.2369,-36.072"/>
+            <text text-anchor="middle" x="77.3172" y="-58.2" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tracto&#45;&gt;tract -->
+        <g id="edge17" class="edge">
+            <title>tracto&#45;&gt;tract</title>
+            <path fill="none" stroke="#ffc0cb" d="M80.3796,-124.4595C82.3635,-130.2097 84.2449,-136.6872 85.2369,-142.8 86.4329,-150.1703 86.4329,-152.2297 85.2369,-159.6 84.797,-162.3109 84.1821,-165.0936 83.461,-167.8527"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="80.5061,-177.5725 79.1093,-166.6959 81.9604,-172.7886 83.4148,-168.0048 83.4148,-168.0048 83.4148,-168.0048 81.9604,-172.7886 87.7202,-169.3137 80.5061,-177.5725 80.5061,-177.5725"/>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <g id="edge10" class="edge">
+            <title>tract&#45;&gt;tracto</title>
+            <path fill="none" stroke="#add8e6" d="M73.2369,-177.2006C73.2369,-165.0949 73.2369,-149.0076 73.2369,-135.2674"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="73.2369,-124.872 77.737,-134.872 73.2369,-129.872 73.237,-134.872 73.237,-134.872 73.237,-134.872 73.2369,-129.872 68.737,-134.8721 73.2369,-124.872 73.2369,-124.872"/>
+            <text text-anchor="middle" x="78.4869" y="-147" font-family="Times,serif" font-size="14.00" fill="#0000ff"> o</text>
+        </g>
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="73.2369" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="73.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- tract&#45;&gt;trac -->
+        <g id="edge18" class="edge">
+            <title>tract&#45;&gt;trac</title>
+            <path fill="none" stroke="#ffc0cb" d="M78.0886,-213.7847C79.3802,-219.4487 80.5942,-225.7449 81.2369,-231.6 82.0516,-239.0221 82.0516,-240.9779 81.2369,-248.4 80.9557,-250.9616 80.5652,-253.6076 80.1066,-256.2482"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="78.0886,-266.2153 75.6626,-255.5211 79.0808,-261.3147 80.0731,-256.4142 80.0731,-256.4142 80.0731,-256.4142 79.0808,-261.3147 84.4836,-257.3072 78.0886,-266.2153 78.0886,-266.2153"/>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <g id="edge9" class="edge">
+            <title>trac&#45;&gt;tract</title>
+            <path fill="none" stroke="#add8e6" d="M73.2369,-266.0006C73.2369,-253.8949 73.2369,-237.8076 73.2369,-224.0674"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="73.2369,-213.672 77.737,-223.672 73.2369,-218.672 73.237,-223.672 73.237,-223.672 73.237,-223.672 73.2369,-218.672 68.737,-223.6721 73.2369,-213.672 73.2369,-213.672"/>
+            <text text-anchor="middle" x="76.9322" y="-235.8" font-family="Times,serif" font-size="14.00" fill="#0000ff"> t</text>
+        </g>
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="112.2369" cy="-384.6156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="112.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="112.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- trac&#45;&gt;tra -->
+        <g id="edge19" class="edge">
+            <title>trac&#45;&gt;tra</title>
+            <path fill="none" stroke="#ffc0cb" d="M92.2023,-299.8107C98.0398,-305.6172 103.8226,-312.698 107.2369,-320.4 110.6198,-328.0312 112.455,-336.6721 113.3574,-345.0849"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="114.0369,-355.1505 108.8735,-345.4763 113.7001,-350.1618 113.3633,-345.1732 113.3633,-345.1732 113.3633,-345.1732 113.7001,-350.1618 117.853,-344.87 114.0369,-355.1505 114.0369,-355.1505"/>
+        </g>
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="217.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra&#45;&gt;trac -->
+        <g id="edge8" class="edge">
+            <title>tra&#45;&gt;trac</title>
+            <path fill="none" stroke="#add8e6" d="M101.2694,-356.4333C95.8306,-342.4575 89.2887,-325.6472 83.8929,-311.7821"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="80.1988,-302.2895 88.0191,-309.9766 82.0121,-306.9491 83.8255,-311.6087 83.8255,-311.6087 83.8255,-311.6087 82.0121,-306.9491 79.6319,-313.2407 80.1988,-302.2895 80.1988,-302.2895"/>
+            <text text-anchor="middle" x="98.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> c</text>
+        </g>
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-496.2469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="217.2369" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- tra&#45;&gt;tr -->
+        <g id="edge20" class="edge">
+            <title>tra&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M134.8244,-407.6467C142.3198,-415.356 150.6782,-424.0253 158.2369,-432.0313 168.0928,-442.4704 178.7426,-453.987 188.2457,-464.3504"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="195.2572,-472.0136 185.1867,-467.6734 191.882,-468.3247 188.5068,-464.6357 188.5068,-464.6357 188.5068,-464.6357 191.882,-468.3247 191.8268,-461.598 195.2572,-472.0136 195.2572,-472.0136"/>
+        </g>
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="217.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="217.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="217.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tree&#45;&gt;tre -->
+        <g id="edge21" class="edge">
+            <title>tree&#45;&gt;tre</title>
+            <path fill="none" stroke="#ffc0cb" d="M225.7116,-302.3279C227.9713,-307.9814 230.0993,-314.3363 231.2369,-320.4 232.8247,-328.8637 232.2596,-337.9179 230.673,-346.479"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="228.409,-356.2376 226.2855,-345.4793 229.5391,-351.3669 230.6691,-346.4963 230.6691,-346.4963 230.6691,-346.4963 229.5391,-351.3669 235.0527,-347.5133 228.409,-356.2376 228.409,-356.2376"/>
+        </g>
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="399.2369" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="399.2369" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge13" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#add8e6" d="M217.2369,-355.0897C217.2369,-341.6046 217.2369,-325.7123 217.2369,-312.4153"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="217.2369,-302.4033 221.737,-312.4033 217.2369,-307.4033 217.237,-312.4033 217.237,-312.4033 217.237,-312.4033 217.2369,-307.4033 212.737,-312.4034 217.2369,-302.4033 217.2369,-302.4033"/>
+            <text text-anchor="middle" x="222.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tre&#45;&gt;tr -->
+        <g id="edge22" class="edge">
+            <title>tre&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M227.8923,-412.9079C230.9037,-424.04 232.9509,-436.9736 231.2369,-448.8313 230.8027,-451.835 230.2214,-454.9136 229.5418,-457.989"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="227.0968,-467.7057 225.1732,-456.9099 228.317,-462.8569 229.5371,-458.008 229.5371,-458.008 229.5371,-458.008 228.317,-462.8569 233.9011,-459.1062 227.0968,-467.7057 227.0968,-467.7057"/>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="326.2369" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="326.2369" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="326.2369" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- trie&#45;&gt;tri -->
+        <g id="edge23" class="edge">
+            <title>trie&#45;&gt;tri</title>
+            <path fill="none" stroke="#ffc0cb" d="M393.8563,-302.4822C390.1724,-313.1558 384.6144,-326.606 377.2369,-337.2 372.5248,-343.9665 366.7036,-350.493 360.6987,-356.4272"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="353.0638,-363.6005 357.2704,-353.4736 356.7078,-360.1768 360.3518,-356.7531 360.3518,-356.7531 360.3518,-356.7531 356.7078,-360.1768 363.4331,-360.0327 353.0638,-363.6005 353.0638,-363.6005"/>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge15" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#add8e6" d="M342.2386,-357.7635C349.5562,-346.0182 358.5767,-332.2461 367.5223,-320.4 370.1199,-316.9602 372.9633,-313.4399 375.8469,-310.0124"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="382.5518,-302.2644 379.4108,-312.7708 379.2799,-306.0453 376.0081,-309.8261 376.0081,-309.8261 376.0081,-309.8261 379.2799,-306.0453 372.6053,-306.8815 382.5518,-302.2644 382.5518,-302.2644"/>
+            <text text-anchor="middle" x="372.0942" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tri&#45;&gt;tr -->
+        <g id="edge24" class="edge">
+            <title>tri&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M314.1697,-412.7968C308.1382,-424.7883 300.0242,-438.3853 290.2369,-448.8313 280.9326,-458.7618 269.1419,-467.5728 257.7651,-474.8231"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="249.1939,-480.056 255.3841,-471.0044 253.4614,-477.4506 257.729,-474.8451 257.729,-474.8451 257.729,-474.8451 253.4614,-477.4506 260.0739,-478.6859 249.1939,-480.056 249.1939,-480.056"/>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge7" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M186.1766,-479.0539C173.479,-471.0083 159.2794,-460.5991 148.5223,-448.8313 141.1239,-440.7377 134.615,-430.8736 129.2283,-421.329"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="124.4153,-412.3441 133.1041,-419.0341 126.7763,-416.7516 129.1373,-421.159 129.1373,-421.159 129.1373,-421.159 126.7763,-416.7516 125.1706,-423.284 124.4153,-412.3441 124.4153,-412.3441"/>
+            <text text-anchor="middle" x="154.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge12" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#add8e6" d="M217.2369,-466.6249C217.2369,-453.7568 217.2369,-438.4867 217.2369,-424.6319"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="217.2369,-414.3761 221.737,-424.3761 217.2369,-419.3761 217.237,-424.3761 217.237,-424.3761 217.237,-424.3761 217.2369,-419.3761 212.737,-424.3762 217.2369,-414.3761 217.2369,-414.3761"/>
+            <text text-anchor="middle" x="222.0942" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge14" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#add8e6" d="M240.0869,-472.8454C256.2553,-456.2866 278.1111,-433.9032 295.7628,-415.8254"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="303.0416,-408.3709 299.275,-418.6696 299.5485,-411.9483 296.0553,-415.5258 296.0553,-415.5258 296.0553,-415.5258 299.5485,-411.9483 292.8356,-412.382 303.0416,-408.3709 303.0416,-408.3709"/>
+            <text text-anchor="middle" x="282.9322" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> i</text>
+        </g>
+        <!-- tr&#45;&gt;t -->
+        <g id="edge25" class="edge">
+            <title>tr&#45;&gt;t</title>
+            <path fill="none" stroke="#ffc0cb" d="M224.5566,-525.3096C226.4734,-536.3112 227.6536,-548.9542 226.2369,-560.4626 225.9201,-563.0362 225.4804,-565.6905 224.9643,-568.3364"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="222.6937,-578.3117 220.5255,-567.5623 223.8035,-573.4364 224.9132,-568.5611 224.9132,-568.5611 224.9132,-568.5611 223.8035,-573.4364 229.301,-569.5599 222.6937,-578.3117 222.6937,-578.3117"/>
+        </g>
+        <!-- t&#45;&gt;root -->
+        <g id="edge26" class="edge">
+            <title>t&#45;&gt;root</title>
+            <path fill="none" stroke="#ffc0cb" d="M222.0886,-614.6473C223.3802,-620.3113 224.5942,-626.6075 225.2369,-632.4626 226.1247,-640.5504 225.87,-649.2014 225.048,-657.4497"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="223.7469,-667.6094 220.5537,-657.1188 224.3821,-662.6499 225.0173,-657.6904 225.0173,-657.6904 225.0173,-657.6904 224.3821,-662.6499 229.4808,-658.2621 223.7469,-667.6094 223.7469,-667.6094"/>
+        </g>
+        <!-- t&#45;&gt;tr -->
+        <g id="edge6" class="edge">
+            <title>t&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M217.2369,-578.0815C217.2369,-566.3502 217.2369,-550.6774 217.2369,-536.1885"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="217.2369,-525.9209 221.737,-535.9208 217.2369,-530.9209 217.237,-535.9209 217.237,-535.9209 217.237,-535.9209 217.2369,-530.9209 212.737,-535.9209 217.2369,-525.9209 217.2369,-525.9209"/>
+            <text text-anchor="middle" x="221.3172" y="-547.8626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> r</text>
+        </g>
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w3.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w3.svg
new file mode 100644
index 000000000000..626107a251a7
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w3.svg
@@ -0,0 +1,276 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    { edge [style=invis]
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+    }
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; constraint="false"; arrowhead="vee"]
+
+        start -> root
+
+        root -> tr [label=" tr"]
+        tr -> tra [label=" a"]
+        tra -> tractor [label=" ctor"]
+        // tractor -> tr [xlabel="back 5"]
+        tr -> tre [label=" e"]
+        tre -> tree [label=" e"]
+        // tree -> tr [xlabel="back 2"]
+        tr -> tri [label=" i"]
+        tri -> trie [label=" e"]
+        // trie -> start [xlabel="back 4"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="blue"; constraint="false"; arrowhead="vee"]
+        tractor -> tr
+        tree -> tr
+        trie -> tr
+        tr -> start
+    }
+}
+
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="514pt" height="734pt"
+     viewBox="0.00 0.00 514.43 734.09" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 730.0939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-730.0939 510.4319,-730.0939 510.4319,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="251.195" cy="-696.6782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="251.195" y="-700.8782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="251.195" y="-684.0782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="364.195" cy="-696.6782" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="364.195" y="-692.4782" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="251.195" cy="-496.2469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="251.195" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="251.195" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- root&#45;&gt;tr -->
+        <g id="edge16" class="edge">
+            <title>root&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M229.3316,-674.0074C216.0344,-658.5794 200.2812,-636.9454 193.1438,-614.4626 183.3037,-583.4666 202.3168,-550.4221 221.1595,-527.173"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="227.7698,-519.3775 224.7345,-529.915 224.536,-523.1911 221.3023,-527.0046 221.3023,-527.0046 221.3023,-527.0046 224.536,-523.1911 217.8701,-524.0943 227.7698,-519.3775 227.7698,-519.3775"/>
+            <text text-anchor="middle" x="199.2206" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> tr</text>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="251.195" cy="-596.4626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="251.195" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <!-- start&#45;&gt;root -->
+        <g id="edge15" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M339.5693,-711.7185C322.6929,-719.1537 305.8165,-720.6839 288.9401,-716.309"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="279.0036,-713.0508 289.9079,-711.8907 283.7547,-714.6087 288.5058,-716.1667 288.5058,-716.1667 288.5058,-716.1667 283.7547,-714.6087 287.1036,-720.4427 279.0036,-713.0508 279.0036,-713.0508"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="107.195" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="107.195" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tractor&#45;&gt;tr -->
+        <g id="edge23" class="edge">
+            <title>tractor&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M84.63,-35.1602C69.4788,-48.0928 50.624,-67.2465 41.195,-88.8 -16.74,-221.232 -10.7218,-279.9131 43.195,-414.0313 50.7046,-432.7114 55.2178,-438.0092 72.195,-448.8313 112.0895,-474.262 165.2695,-486.1237 203.2767,-491.613"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="213.5707,-492.9982 203.0598,-496.1243 208.6154,-492.3313 203.66,-491.6645 203.66,-491.6645 203.66,-491.6645 208.6154,-492.3313 204.2602,-487.2047 213.5707,-492.9982 213.5707,-492.9982"/>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="107.195" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="107.195" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="107.195" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="107.195" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="107.195" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="107.195" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="251.195" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="251.195" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="158.195" cy="-384.6156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="158.195" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="158.195" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;tractor -->
+        <g id="edge18" class="edge">
+            <title>tra&#45;&gt;tractor</title>
+            <path fill="none" stroke="#add8e6" d="M129.4035,-366.2846C108.261,-351.4371 80.3464,-328.7126 63.195,-302.4 40.6505,-267.8137 42.0561,-254.428 35.9292,-213.6 27.5307,-157.6347 37.7739,-140.318 61.195,-88.8 68.2927,-73.1876 78.511,-57.0573 87.515,-44.1437"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="93.5012,-35.7633 91.3504,-46.5162 90.5949,-39.8319 87.6886,-43.9005 87.6886,-43.9005 87.6886,-43.9005 90.5949,-39.8319 84.0269,-41.2849 93.5012,-35.7633 93.5012,-35.7633"/>
+            <text text-anchor="middle" x="48.8279" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff"> ctor</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="433.195" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="433.195" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tree&#45;&gt;tr -->
+        <g id="edge24" class="edge">
+            <title>tree&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M191.7753,-295.0456C138.4603,-304.7362 68.2532,-317.9026 66.195,-320.4 29.2104,-365.2753 48.1813,-409.6972 91.195,-448.8313 107.8743,-464.0063 163.4286,-478.2581 204.4135,-487.0987"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="214.2244,-489.1725 203.5099,-491.507 209.3325,-488.1384 204.4406,-487.1043 204.4406,-487.1043 204.4406,-487.1043 209.3325,-488.1384 205.3713,-482.7016 214.2244,-489.1725 214.2244,-489.1725"/>
+        </g>
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="251.195" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="251.195" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="251.195" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge20" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#add8e6" d="M262.3671,-356.2376C265.4458,-345.0943 267.4041,-332.1755 265.195,-320.4 264.6617,-317.5576 263.9109,-314.6513 263.0308,-311.7832"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="259.6697,-302.3279 267.2592,-310.243 261.3444,-307.0391 263.0192,-311.7503 263.0192,-311.7503 263.0192,-311.7503 261.3444,-307.0391 258.7791,-313.2575 259.6697,-302.3279 259.6697,-302.3279"/>
+            <text text-anchor="middle" x="270.0523" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- trie&#45;&gt;tr -->
+        <g id="edge25" class="edge">
+            <title>trie&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M375.9621,-295.6423C362.0088,-298.1228 347.0972,-300.557 333.195,-302.4 286.7634,-308.5553 159.3708,-290.6496 123.195,-320.4 90.8711,-346.9827 93.9946,-376.8451 113.195,-414.0313 131.5732,-449.625 172.7739,-471.1596 205.3999,-483.2062"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="215.2151,-486.6452 204.2896,-487.5853 210.4964,-484.9918 205.7776,-483.3385 205.7776,-483.3385 205.7776,-483.3385 210.4964,-484.9918 207.2657,-479.0916 215.2151,-486.6452 215.2151,-486.6452"/>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="358.195" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="358.195" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="358.195" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge22" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#add8e6" d="M378.6156,-359.7523C384.3949,-352.5509 390.6255,-344.6271 396.195,-337.2 402.5745,-328.6926 409.3145,-319.2281 415.2689,-310.6885"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="421.1416,-302.203 419.151,-312.9867 418.2962,-306.3144 415.4507,-310.4258 415.4507,-310.4258 415.4507,-310.4258 418.2962,-306.3144 411.7505,-307.8649 421.1416,-302.203 421.1416,-302.203"/>
+            <text text-anchor="middle" x="412.0523" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;start -->
+        <g id="edge26" class="edge">
+            <title>tr&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M266.5237,-523.436C288.1956,-561.8759 327.547,-631.6747 349.0941,-669.8934"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="354.167,-678.8913 345.3359,-672.3903 351.7114,-674.5358 349.2558,-670.1803 349.2558,-670.1803 349.2558,-670.1803 351.7114,-674.5358 353.1758,-667.9703 354.167,-678.8913 354.167,-678.8913"/>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge17" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M230.5553,-471.4723C217.1904,-455.43 199.6466,-434.3715 185.1808,-417.0077"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="178.777,-409.321 188.6352,-414.1237 181.9774,-413.1625 185.1778,-417.0041 185.1778,-417.0041 185.1778,-417.0041 181.9774,-413.1625 181.7204,-419.8845 178.777,-409.321 178.777,-409.321"/>
+            <text text-anchor="middle" x="216.0523" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge19" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#add8e6" d="M261.0549,-467.7057C262.775,-461.5528 264.2986,-455.0326 265.195,-448.8313 266.4269,-440.3086 265.7158,-431.23 264.1002,-422.6641"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="261.8503,-412.9079 268.4824,-421.641 262.9739,-417.7801 264.0975,-422.6522 264.0975,-422.6522 264.0975,-422.6522 262.9739,-417.7801 259.7126,-423.6634 261.8503,-412.9079 261.8503,-412.9079"/>
+            <text text-anchor="middle" x="270.0523" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge21" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#add8e6" d="M282.4484,-479.3274C295.4663,-471.2619 310.1001,-460.763 321.195,-448.8313 328.6315,-440.8339 335.204,-431.0808 340.668,-421.6153"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="345.5591,-412.695 344.6971,-423.6269 343.1552,-417.0792 340.7513,-421.4634 340.7513,-421.4634 340.7513,-421.4634 343.1552,-417.0792 336.8055,-419.2999 345.5591,-412.695 345.5591,-412.695"/>
+            <text text-anchor="middle" x="336.8903" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> i</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w4.svg b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w4.svg
new file mode 100644
index 000000000000..32bae7bd8175
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.md.w4.svg
@@ -0,0 +1,268 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+        "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!-- Generated by graphviz version 2.40.1 (20161225.0304)
+# embedded DOT (plantuml) works in IDEA preview
+# but not on GitHub
+```plantuml
+digraph G {
+    { rank=same root -> start [style=invis] }
+    start [label="start/end"]
+
+    tractor [label = "contentArray[0]"]
+    tracto [label = "0x01B"]
+    tract [label = "0x01A"]
+    trac [label = "0x019"]
+    tra [label = "Chain\n0x018"]
+
+    tree [label = "contentArray[1]"]
+    tre [label = "Chain\n0x03B"]
+    trie [label = "contentArray[2]"]
+    tri [label = "Chain\n0x05B"]
+
+    tr [label = "Sparse\n0x07E"]
+
+    t [label = "0x09B"]
+    root [label = "Chain\n0x9A"]
+
+    { edge [style=invis]
+    root -> t [label = " t"]
+    t -> tr [label = " r"]
+    tr -> tra [label = " a"]
+    tra -> trac [label = " c"]
+    trac -> tract [label = " t"]
+    tract -> tracto [label = " o"]
+    tracto -> tractor [label = " r"]
+
+    tr -> tre [label = " e"]
+    tre -> tree [label = " e"]
+
+    tr -> tri [label = " i"]
+    tri -> trie [label = " e"]
+
+    // {rank=same tra -> tre -> tri [style=invis]}
+    {rank=same trac -> tree -> trie [style=invis]}
+    }
+
+    subgraph path {
+        edge [color = "lightblue"; fontcolor="blue"; constraint="false"; arrowhead="vee"]
+
+        start -> root
+
+        root -> tr [label=" tr"]
+        tr -> tra [label=" a"]
+        tra -> tractor [label=" ctor"]
+        // tractor -> tr [xlabel="back 5"]
+        tr -> tre [label=" e"]
+        tre -> tree [label=" e"]
+        // tree -> tr [xlabel="back 2"]
+        tr -> tri [label=" i"]
+        tri -> trie [label=" e"]
+        // trie -> start [xlabel="back 4"]
+    }
+
+    subgraph back {
+        edge [color = "pink"; fontcolor="blue"; constraint="false"; arrowhead="vee"]
+        tractor -> tr
+        tree -> tr
+        trie -> start
+    }
+}
+```
+ -->
+<!-- Title: G Pages: 1 -->
+<svg width="498pt" height="734pt"
+     viewBox="0.00 0.00 498.17 734.09" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 730.0939)">
+        <title>G</title>
+        <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-730.0939 494.1727,-730.0939 494.1727,4 -4,4"/>
+        <!-- root -->
+        <g id="node1" class="node">
+            <title>root</title>
+            <ellipse fill="none" stroke="#000000" cx="234.9358" cy="-696.6782" rx="33.1337" ry="29.3315"/>
+            <text text-anchor="middle" x="234.9358" y="-700.8782" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="234.9358" y="-684.0782" font-family="Times,serif" font-size="14.00" fill="#000000">0x9A</text>
+        </g>
+        <!-- start -->
+        <g id="node2" class="node">
+            <title>start</title>
+            <ellipse fill="none" stroke="#000000" cx="347.9358" cy="-696.6782" rx="44.0775" ry="18"/>
+            <text text-anchor="middle" x="347.9358" y="-692.4782" font-family="Times,serif" font-size="14.00" fill="#000000">start/end</text>
+        </g>
+        <!-- root&#45;&gt;start -->
+        <!-- tr -->
+        <g id="node12" class="node">
+            <title>tr</title>
+            <ellipse fill="none" stroke="#000000" cx="234.9358" cy="-496.2469" rx="37.9027" ry="29.3315"/>
+            <text text-anchor="middle" x="234.9358" y="-500.4469" font-family="Times,serif" font-size="14.00" fill="#000000">Sparse</text>
+            <text text-anchor="middle" x="234.9358" y="-483.6469" font-family="Times,serif" font-size="14.00" fill="#000000">0x07E</text>
+        </g>
+        <!-- root&#45;&gt;tr -->
+        <g id="edge16" class="edge">
+            <title>root&#45;&gt;tr</title>
+            <path fill="none" stroke="#add8e6" d="M213.0724,-674.0074C199.7752,-658.5794 184.022,-636.9454 176.8846,-614.4626 167.0445,-583.4666 186.0576,-550.4221 204.9004,-527.173"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="211.5106,-519.3775 208.4753,-529.915 208.2769,-523.1911 205.0431,-527.0046 205.0431,-527.0046 205.0431,-527.0046 208.2769,-523.1911 201.6109,-524.0943 211.5106,-519.3775 211.5106,-519.3775"/>
+            <text text-anchor="middle" x="182.9614" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#0000ff"> tr</text>
+        </g>
+        <!-- t -->
+        <g id="node13" class="node">
+            <title>t</title>
+            <ellipse fill="none" stroke="#000000" cx="234.9358" cy="-596.4626" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="234.9358" y="-592.2626" font-family="Times,serif" font-size="14.00" fill="#000000">0x09B</text>
+        </g>
+        <!-- root&#45;&gt;t -->
+        <!-- start&#45;&gt;root -->
+        <g id="edge15" class="edge">
+            <title>start&#45;&gt;root</title>
+            <path fill="none" stroke="#add8e6" d="M323.3101,-711.7185C306.4337,-719.1537 289.5573,-720.6839 272.6809,-716.309"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="262.7444,-713.0508 273.6487,-711.8907 267.4955,-714.6087 272.2466,-716.1667 272.2466,-716.1667 272.2466,-716.1667 267.4955,-714.6087 270.8444,-720.4427 262.7444,-713.0508 262.7444,-713.0508"/>
+        </g>
+        <!-- tractor -->
+        <g id="node3" class="node">
+            <title>tractor</title>
+            <ellipse fill="none" stroke="#000000" cx="90.9358" cy="-18" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="90.9358" y="-13.8" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[0]</text>
+        </g>
+        <!-- tractor&#45;&gt;tr -->
+        <g id="edge23" class="edge">
+            <title>tractor&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M68.3708,-35.1602C53.2196,-48.0928 34.3648,-67.2465 24.9358,-88.8 -13.1135,-175.7758 -2.9101,-211.9541 25.9358,-302.4 51.6621,-383.0647 83.6843,-432.7139 102.9358,-448.8313 127.1072,-469.0677 160.589,-480.9627 187.7828,-487.7778"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="197.8647,-490.1478 187.1003,-492.2399 192.9974,-489.0036 188.1301,-487.8593 188.1301,-487.8593 188.1301,-487.8593 192.9974,-489.0036 189.1599,-483.4788 197.8647,-490.1478 197.8647,-490.1478"/>
+        </g>
+        <!-- tracto -->
+        <g id="node4" class="node">
+            <title>tracto</title>
+            <ellipse fill="none" stroke="#000000" cx="90.9358" cy="-106.8" rx="36.5824" ry="18"/>
+            <text text-anchor="middle" x="90.9358" y="-102.6" font-family="Times,serif" font-size="14.00" fill="#000000">0x01B</text>
+        </g>
+        <!-- tracto&#45;&gt;tractor -->
+        <!-- tract -->
+        <g id="node5" class="node">
+            <title>tract</title>
+            <ellipse fill="none" stroke="#000000" cx="90.9358" cy="-195.6" rx="37.1443" ry="18"/>
+            <text text-anchor="middle" x="90.9358" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#000000">0x01A</text>
+        </g>
+        <!-- tract&#45;&gt;tracto -->
+        <!-- trac -->
+        <g id="node6" class="node">
+            <title>trac</title>
+            <ellipse fill="none" stroke="#000000" cx="90.9358" cy="-284.4" rx="35.3587" ry="18"/>
+            <text text-anchor="middle" x="90.9358" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">0x019</text>
+        </g>
+        <!-- trac&#45;&gt;tract -->
+        <!-- tree -->
+        <g id="node8" class="node">
+            <title>tree</title>
+            <ellipse fill="none" stroke="#000000" cx="234.9358" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="234.9358" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[1]</text>
+        </g>
+        <!-- trac&#45;&gt;tree -->
+        <!-- tra -->
+        <g id="node7" class="node">
+            <title>tra</title>
+            <ellipse fill="none" stroke="#000000" cx="137.9358" cy="-384.6156" rx="36.125" ry="29.3315"/>
+            <text text-anchor="middle" x="137.9358" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="137.9358" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x018</text>
+        </g>
+        <!-- tra&#45;&gt;tractor -->
+        <g id="edge18" class="edge">
+            <title>tra&#45;&gt;tractor</title>
+            <path fill="none" stroke="#add8e6" d="M109.9558,-365.7127C89.7668,-350.7265 63.2887,-328.0827 46.9358,-302.4 24.7617,-267.575 25.7969,-254.428 19.67,-213.6 11.2715,-157.6347 21.5147,-140.318 44.9358,-88.8 52.0335,-73.1876 62.2518,-57.0573 71.2558,-44.1437"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="77.242,-35.7633 75.0912,-46.5162 74.3357,-39.8319 71.4294,-43.9005 71.4294,-43.9005 71.4294,-43.9005 74.3357,-39.8319 67.7677,-41.2849 77.242,-35.7633 77.242,-35.7633"/>
+            <text text-anchor="middle" x="32.5687" y="-191.4" font-family="Times,serif" font-size="14.00" fill="#0000ff"> ctor</text>
+        </g>
+        <!-- tra&#45;&gt;trac -->
+        <!-- trie -->
+        <g id="node10" class="node">
+            <title>trie</title>
+            <ellipse fill="none" stroke="#000000" cx="416.9358" cy="-284.4" rx="73.4745" ry="18"/>
+            <text text-anchor="middle" x="416.9358" y="-280.2" font-family="Times,serif" font-size="14.00" fill="#000000">contentArray[2]</text>
+        </g>
+        <!-- tree&#45;&gt;trie -->
+        <!-- tree&#45;&gt;tr -->
+        <g id="edge24" class="edge">
+            <title>tree&#45;&gt;tr</title>
+            <path fill="none" stroke="#ffc0cb" d="M176.0292,-295.1578C146.8453,-301.5575 115.6009,-310.3755 104.9358,-320.4 93.0148,-331.605 95.6396,-339.0646 92.9358,-355.2 88.6145,-380.9877 80.6184,-390.967 92.9358,-414.0313 112.4778,-450.6238 155.7364,-472.1892 189.4058,-483.9828"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="199.0642,-487.1947 188.1551,-488.3091 194.3197,-485.6169 189.5751,-484.0391 189.5751,-484.0391 189.5751,-484.0391 194.3197,-485.6169 190.9952,-479.769 199.0642,-487.1947 199.0642,-487.1947"/>
+        </g>
+        <!-- tre -->
+        <g id="node9" class="node">
+            <title>tre</title>
+            <ellipse fill="none" stroke="#000000" cx="234.9358" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="234.9358" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="234.9358" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x03B</text>
+        </g>
+        <!-- tre&#45;&gt;tree -->
+        <!-- tre&#45;&gt;tree -->
+        <g id="edge20" class="edge">
+            <title>tre&#45;&gt;tree</title>
+            <path fill="none" stroke="#add8e6" d="M246.1079,-356.2376C249.1866,-345.0943 251.1449,-332.1755 248.9358,-320.4 248.4025,-317.5576 247.6517,-314.6513 246.7716,-311.7832"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="243.4105,-302.3279 251.0001,-310.243 245.0852,-307.0391 246.76,-311.7503 246.76,-311.7503 246.76,-311.7503 245.0852,-307.0391 242.5199,-313.2575 243.4105,-302.3279 243.4105,-302.3279"/>
+            <text text-anchor="middle" x="253.7931" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- trie&#45;&gt;start -->
+        <g id="edge25" class="edge">
+            <title>trie&#45;&gt;start</title>
+            <path fill="none" stroke="#ffc0cb" d="M413.9107,-302.4748C403.0344,-367.4612 365.8611,-589.5735 352.6455,-668.5372"/>
+            <polygon fill="#ffc0cb" stroke="#ffc0cb" points="350.945,-678.698 348.1575,-668.0924 351.7704,-673.7666 352.5957,-668.8352 352.5957,-668.8352 352.5957,-668.8352 351.7704,-673.7666 357.034,-669.578 350.945,-678.698 350.945,-678.698"/>
+        </g>
+        <!-- tri -->
+        <g id="node11" class="node">
+            <title>tri</title>
+            <ellipse fill="none" stroke="#000000" cx="339.9358" cy="-384.6156" rx="37.9306" ry="29.3315"/>
+            <text text-anchor="middle" x="339.9358" y="-388.8156" font-family="Times,serif" font-size="14.00" fill="#000000">Chain</text>
+            <text text-anchor="middle" x="339.9358" y="-372.0156" font-family="Times,serif" font-size="14.00" fill="#000000">0x05B</text>
+        </g>
+        <!-- tri&#45;&gt;trie -->
+        <!-- tri&#45;&gt;trie -->
+        <g id="edge22" class="edge">
+            <title>tri&#45;&gt;trie</title>
+            <path fill="none" stroke="#add8e6" d="M355.6083,-357.5072C362.865,-345.7144 371.9288,-331.9761 381.2212,-320.4 384.2316,-316.6497 387.5835,-312.8608 390.9939,-309.2205"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="397.9861,-302.0169 394.2501,-312.3268 394.5035,-305.6047 391.021,-309.1925 391.021,-309.1925 391.021,-309.1925 394.5035,-305.6047 387.792,-306.0582 397.9861,-302.0169 397.9861,-302.0169"/>
+            <text text-anchor="middle" x="385.7931" y="-324.6" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tra -->
+        <!-- tr&#45;&gt;tra -->
+        <g id="edge17" class="edge">
+            <title>tr&#45;&gt;tra</title>
+            <path fill="none" stroke="#add8e6" d="M213.6487,-471.749C199.5271,-455.4972 180.8544,-434.0081 165.5865,-416.4372"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="158.8397,-408.6726 168.7956,-413.2695 162.1192,-412.4469 165.3988,-416.2211 165.3988,-416.2211 165.3988,-416.2211 162.1192,-412.4469 162.002,-419.1727 158.8397,-408.6726 158.8397,-408.6726"/>
+            <text text-anchor="middle" x="197.7931" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> a</text>
+        </g>
+        <!-- tr&#45;&gt;tre -->
+        <!-- tr&#45;&gt;tre -->
+        <g id="edge19" class="edge">
+            <title>tr&#45;&gt;tre</title>
+            <path fill="none" stroke="#add8e6" d="M244.7957,-467.7057C246.5159,-461.5528 248.0394,-455.0326 248.9358,-448.8313 250.1677,-440.3086 249.4566,-431.23 247.841,-422.6641"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="245.5912,-412.9079 252.2232,-421.641 246.7147,-417.7801 247.8383,-422.6522 247.8383,-422.6522 247.8383,-422.6522 246.7147,-417.7801 243.4534,-423.6634 245.5912,-412.9079 245.5912,-412.9079"/>
+            <text text-anchor="middle" x="253.7931" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> e</text>
+        </g>
+        <!-- tr&#45;&gt;tri -->
+        <!-- tr&#45;&gt;tri -->
+        <g id="edge21" class="edge">
+            <title>tr&#45;&gt;tri</title>
+            <path fill="none" stroke="#add8e6" d="M265.8,-479.3124C278.6421,-471.2442 293.0589,-460.7473 303.9358,-448.8313 311.249,-440.8193 317.6701,-431.0614 322.9868,-421.5958"/>
+            <polygon fill="#add8e6" stroke="#add8e6" points="327.7388,-412.6767 327.008,-423.6182 325.3877,-417.0895 323.0366,-421.5022 323.0366,-421.5022 323.0366,-421.5022 325.3877,-417.0895 319.0651,-419.3862 327.7388,-412.6767 327.7388,-412.6767"/>
+            <text text-anchor="middle" x="319.6311" y="-436.2313" font-family="Times,serif" font-size="14.00" fill="#0000ff"> i</text>
+        </g>
+        <!-- t&#45;&gt;tr -->
+    </g>
+</svg>
diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
new file mode 100644
index 000000000000..4aba992848dd
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import com.google.common.collect.Iterables;
+
+/**
+ * A merged view of two tries.
+ */
+class MergeTrie<T> extends Trie<T>
+{
+    /**
+     * Transition value used to indicate a transition is not present. Must be greater than all valid transition values
+     * (0-0xFF).
+     */
+    public static final int NOT_PRESENT = 0x100;
+
+    private final MergeResolver<T> resolver;
+    protected final Trie<T> t1;
+    protected final Trie<T> t2;
+
+    MergeTrie(MergeResolver<T> resolver, Trie<T> t1, Trie<T> t2)
+    {
+        this.resolver = resolver;
+        this.t1 = t1;
+        this.t2 = t2;
+    }
+
+    @Override
+    public <L> Node<T, L> root()
+    {
+        return makeNode(resolver, t1.root(), t2.root());
+    }
+
+    private static <T, L> Node<T, L> makeNode(MergeResolver<T> resolver, Node<T, L> child1, Node<T, L> child2)
+    {
+        if (child1 != null && child2 != null)
+            return new MergeNode<>(resolver, child1, child2);
+
+        if (child1 != null)
+            return child1;
+
+        if (child2 != null)
+            return child2;
+
+        return null;
+    }
+
+    static class MergeNode<T, L> extends Node<T, L>
+    {
+        private final MergeResolver<T> resolver;
+        final Node<T, L> n1;
+        final Node<T, L> n2;
+        int b1;
+        int b2;
+
+        MergeNode(MergeResolver<T> resolver, Node<T, L> n1, Node<T, L> n2)
+        {
+            // Both children have the same parent link (passed during getCurrentChild). Use either as ours.
+            super(n1.parentLink);
+            assert n2.parentLink == n1.parentLink;
+            this.resolver = resolver;
+            this.n1 = n1;
+            this.n2 = n2;
+        }
+
+        private Remaining makeState(Remaining has1, Remaining has2)
+        {
+            Remaining result;
+            if (has1 != null)
+            {
+                b1 = n1.currentTransition;
+                result = Remaining.MULTIPLE;
+            }
+            else
+            {
+                b1 = NOT_PRESENT;
+                result = has2;
+            }
+            currentTransition = b1;
+            if (has2 != null)
+            {
+                b2 = n2.currentTransition;
+                if (b2 < b1)
+                    currentTransition = b2;
+                else if (b1 == b2 && has1 == Remaining.ONE && has2 == Remaining.ONE)
+                    result = Remaining.ONE;
+            }
+            else
+            {
+                b2 = NOT_PRESENT;
+                result = has1;
+            }
+            return result;
+        }
+
+        public Remaining startIteration()
+        {
+            return makeState(n1.startIteration(), n2.startIteration());
+        }
+
+        public Remaining advanceIteration()
+        {
+            int prevb1 = b1;
+            int prevb2 = b2;
+            // We must advance the state of the source with the smaller transition byte.
+            // If their transition bytes are equal, we advance both.
+            if (prevb1 <= prevb2)
+            {
+                boolean has = n1.advanceIteration() != null;
+                b1 = has ? n1.currentTransition : NOT_PRESENT;
+            }
+            if (prevb1 >= prevb2)
+            {
+                boolean has = n2.advanceIteration() != null;
+                b2 = has ? n2.currentTransition : NOT_PRESENT;
+            }
+            currentTransition = Math.min(b1, b2);
+            return b1 < NOT_PRESENT || b2 < NOT_PRESENT ? Remaining.MULTIPLE : null;
+        }
+
+        public Node<T, L> getCurrentChild(L parent)
+        {
+            Node<T, L> child1 = null;
+            Node<T, L> child2 = null;
+
+            if (b1 <= b2)
+                child1 = n1.getCurrentChild(parent);
+            if (b1 >= b2)
+                child2 = n2.getCurrentChild(parent);
+
+            return makeNode(resolver, child1, child2);
+        }
+
+        public T content()
+        {
+            T mc = n2.content();
+            T nc = n1.content();
+            if (mc == null)
+                return nc;
+            else if (nc == null)
+                return mc;
+            else
+                return resolver.resolve(nc, mc);
+        }
+    }
+
+    /**
+     * Special instance for sources that are guaranteed (by the caller) distinct. The main difference is that we can
+     * form unordered value list by concatenating sources.
+     */
+    static class Distinct<T> extends MergeTrie<T>
+    {
+        Distinct(Trie<T> input1, Trie<T> input2)
+        {
+            super(throwingResolver(), input1, input2);
+        }
+
+        @Override
+        public Iterable<T> valuesUnordered()
+        {
+            return Iterables.concat(t1.valuesUnordered(), t2.valuesUnordered());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
new file mode 100644
index 000000000000..2dfe7ccd9ca0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+
+/**
+ * TrieSet representing the range between two keys.
+ *
+ * The keys must be correctly ordered, including with respect to the `includeLeft` and `includeRight` constraints.
+ * (i.e. RangeTrieSet(x, false, x, false) is an invalid call but RangeTrieSet(x, true, x, false) is inefficient
+ * but fine for an empty set).
+ */
+public class RangeTrieSet extends TrieSet
+{
+    /** Left-side boundary. The characters of this are requested as we descend along the left-side boundary. */
+    private final ByteComparable left;
+
+    /** Right-side boundary. The characters of this are requested as we descend along the right-side boundary. */
+    private final ByteComparable right;
+
+    private final boolean includeLeft;
+    private final boolean includeRight;
+
+    RangeTrieSet(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight)
+    {
+        this.left = left;
+        this.includeLeft = includeLeft;
+        this.right = right;
+        this.includeRight = includeRight;
+    }
+
+    public SetNode root()
+    {
+        return makeNode(left == null ? null : left.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
+                        left != null,
+                        right == null ? null : right.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
+                        right != null);
+    }
+
+    private SetNode makeNode(ByteSource lLimit, boolean atLLimit, ByteSource rLimit, boolean atRLimit)
+    {
+        // We only have a constraint on the branch if we are at one or both boundaries.
+        // If the node falls completely between them, the whole branch (at any depth) is in the set.
+        if (!atLLimit && !atRLimit)
+            return FULL;
+
+        return new RangeNode(lLimit, atLLimit, rLimit, atRLimit);
+    }
+
+    class RangeNode implements SetNode
+    {
+        /** Byte at the left boundary, inclusive. */
+        final int llimit;
+        final ByteSource remainingLLimit;
+        /** Byte at the right boundary, inclusive. */
+        final int rlimit;
+        final ByteSource remainingRLimit;
+        /** Whether or not we are descending along the left boundary. */
+        final boolean atLLimit;
+        /** Whether or not we are descending along the right boundary. */
+        final boolean atRLimit;
+
+        /** Whether the current path is in the covered set. */
+        final boolean inSet;
+
+        int currentTransition;
+
+
+        RangeNode(ByteSource remainingLLimit, boolean atLLimit, ByteSource remainingRLimit, boolean atRLimit)
+        {
+            int llimit = 0;
+            boolean inSet = true;
+            if (atLLimit)
+            {
+                llimit = remainingLLimit.next();
+                if (llimit == ByteSource.END_OF_STREAM)
+                {
+                    atLLimit = false;
+                    llimit = 0;
+                    inSet &= includeLeft; // The current path matches left boundary
+                }
+                else
+                    inSet = false;  // The current path is a prefix of the left boundary, ie. smaller.
+            }
+            int rlimit = 255;
+            if (atRLimit)
+            {
+                rlimit = remainingRLimit.next();
+                if (rlimit == ByteSource.END_OF_STREAM)
+                {
+                    atRLimit = false;
+                    rlimit = -1;    // no op, added for clarity. Node should have no children.
+                    inSet &= includeRight; // The current path matches right boundary
+                }
+            }
+            assert llimit <= rlimit || rlimit == -1 : "Bound " + left + " not <= " + right + " in range " + llimit + " vs " + rlimit;
+
+            this.llimit = llimit;
+            this.remainingLLimit = remainingLLimit;
+            this.rlimit = rlimit;
+            this.remainingRLimit = remainingRLimit;
+            this.atLLimit = atLLimit;
+            this.atRLimit = atRLimit;
+            this.inSet = inSet;
+        }
+
+        public SetNode getCurrentChild()
+        {
+            return makeNode(remainingLLimit, atLLimit && (currentTransition == llimit),
+                            remainingRLimit, atRLimit && (currentTransition == rlimit));
+        }
+
+        public int currentTransition()
+        {
+            return currentTransition;
+        }
+
+        public boolean startIteration()
+        {
+            currentTransition = llimit;
+            return currentTransition <= rlimit;
+        }
+
+        public boolean advanceIteration()
+        {
+            return ++currentTransition <= rlimit;
+        }
+
+        public boolean inSet()
+        {
+            return inSet;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
new file mode 100644
index 000000000000..52ee84feb0ea
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+/**
+ * The intersection of a trie with the given set.
+ */
+class SetIntersectionTrie<T> extends Trie<T>
+{
+    /** Source of the content. */
+    private final Trie<T> trie;
+    /** Set deciding which branches and which content of the source remain visible. */
+    private final TrieSet intersectingSet;
+
+    SetIntersectionTrie(Trie<T> trie, TrieSet intersectingSet)
+    {
+        this.intersectingSet = intersectingSet;
+        this.trie = trie;
+    }
+
+    @Override
+    public <L> Node<T, L> root()
+    {
+        TrieSet.SetNode setRoot = intersectingSet.root();
+        if (setRoot == null)
+            return null;
+
+        Node<T, L> trieRoot = trie.root();
+        // A missing trie root stays missing, and a FULL set passes the trie's root through without wrapping.
+        if (trieRoot == null || setRoot == TrieSet.FULL)
+            return trieRoot;
+        return new IntersectionNode<>(trieRoot, setRoot);
+    }
+
+    /** Trie node restricted by a set node: it stops only on transitions that both of them offer. */
+    static class IntersectionNode<T, L> extends Node<T, L>
+    {
+        final Node<T, L> tNode;
+        final TrieSet.SetNode sNode;
+
+        public IntersectionNode(Node<T, L> tNode, TrieSet.SetNode sNode)
+        {
+            super(tNode.parentLink);
+            this.sNode = sNode;
+            this.tNode = tNode;
+        }
+
+        public Remaining startIteration()
+        {
+            // The set goes first; the trie node is not touched if the set has no transitions at all.
+            if (!sNode.startIteration())
+                return null;
+            return advanceToIntersection(tNode.startIteration());
+        }
+
+        public Remaining advanceIteration()
+        {
+            if (!sNode.advanceIteration())
+                return null;
+            return advanceToIntersection(tNode.advanceIteration());
+        }
+
+        /**
+         * Moves whichever side is behind until the trie node and the set node sit on the same transition byte,
+         * which then becomes this node's current transition.
+         *
+         * @param tHas result of the trie node's latest iteration step; null means it has no transition left
+         * @return the trie node's latest iteration result, or null once either side runs out of transitions
+         */
+        public Remaining advanceToIntersection(Remaining tHas)
+        {
+            if (tHas == null)
+                return null;
+
+            int setByte = sNode.currentTransition();
+            int trieByte = tNode.currentTransition;
+            while (true)
+            {
+                if (trieByte == setByte)
+                {
+                    currentTransition = setByte;
+                    return tHas;    // ONE or MULTIPLE
+                }
+                if (trieByte < setByte)
+                {
+                    tHas = tNode.advanceIteration();
+                    if (tHas == null)
+                        return null;
+                    trieByte = tNode.currentTransition;
+                }
+                else
+                {
+                    if (!sNode.advanceIteration())
+                        return null;
+                    setByte = sNode.currentTransition();
+                }
+            }
+        }
+
+        public Node<T, L> getCurrentChild(L parent)
+        {
+            TrieSet.SetNode setChild = sNode.getCurrentChild();
+            if (setChild == null)
+                return null;    // branch is completely outside the set
+
+            Node<T, L> trieChild = tNode.getCurrentChild(parent);
+            // Nothing to wrap without a trie child; a fully covered branch needs no further filtering either.
+            if (trieChild == null || setChild == TrieSet.FULL)
+                return trieChild;
+            return new IntersectionNode<>(trieChild, setChild);
+        }
+
+        public T content()
+        {
+            return sNode.inSet() ? tNode.content() : null;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
new file mode 100644
index 000000000000..8c79d1e17aba
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Singleton trie, mapping the given key to value.
+ * Formed as a chain of single-child SNodes leading to one ENode with no children and the given value as content.
+ */
+class SingletonTrie<T> extends Trie<T>
+{
+    private final ByteComparable key;
+    private final T value;
+
+    SingletonTrie(ByteComparable key, T value)
+    {
+        this.key = key;
+        this.value = value;
+    }
+
+    private class ENode<L> extends NoChildrenNode<T, L>
+    {
+        ENode(L parent)
+        {
+            super(parent);
+        }
+
+        @Override
+        public T content()
+        {
+            return value;
+        }
+    }
+
+    private class SNode<L> extends Node<T, L>
+    {
+        private final ByteSource source;
+        boolean requested = false;
+
+        SNode(int trans, L parent, ByteSource source)
+        {
+            super(parent);
+            this.currentTransition = trans;
+            this.source = source;
+        }
+
+        @Override
+        public Node<T, L> getCurrentChild(L parent)
+        {
+            // Requesting more than once will screw up the iteration of source.
+            assert !requested : "getCurrentChild can only be called once for a given transition.";
+            requested = true;
+            return makeNode(parent, source);
+        }
+
+        @Override
+        public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
+        {
+            // Skips straight to the content-bearing end node without materializing the chain in between.
+            // The remaining key bytes are only read from source when the caller supplied a receiver to record
+            // the transitions taken.
+            if (receiver != null)
+            {
+                receiver.add(currentTransition);
+                for (int b = source.next(); b != ByteSource.END_OF_STREAM; b = source.next())
+                    receiver.add(b);
+            }
+
+            return new ENode<>(parentLink);
+        }
+
+        @Override
+        public Remaining startIteration()
+        {
+            return Remaining.ONE;
+        }
+
+        @Override
+        public Remaining advanceIteration()
+        {
+            return null;
+        }
+
+        @Override
+        public T content()
+        {
+            return null;
+        }
+    }
+
+    private <L> Node<T, L> makeNode(L parent, ByteSource source)
+    {
+        // Consumes exactly one byte of the key; an exhausted source means this position holds the value.
+        int next = source.next();
+        return next == ByteSource.END_OF_STREAM
+               ? new ENode<>(parent)
+               : new SNode<>(next, parent, source);
+    }
+
+    public <L> Node<T, L> root()
+    {
+        return makeNode(null, key.asComparableBytes(BYTE_COMPARABLE_VERSION));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
new file mode 100644
index 000000000000..2873e0a0704c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.function.Function;
+
+import com.google.common.collect.ImmutableList;
+
+import org.agrona.concurrent.UnsafeBuffer;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Base class for tries.
+ *
+ * Normal users of tries will only use the public transformation methods, which provide various transformations of
+ * the trie and conversion of its content to other formats (e.g. iterable of values).
+ *
+ * For any unimplemented data extraction operations one can rely on the TrieWalker (to aggregate values) and
+ * TrieIterator (to iterate) base classes, which provide the necessary mechanisms to handle walking the trie.
+ *
+ * The internal representation of tries using this interface is defined in the Node interface.
+ *
+ * Its design is largely defined by the requirement for iteratively retrieving content of the trie, for which it needs
+ * to be able to represent and save the state of any traversal efficiently, so that it can be preserved while a consumer
+ * is operating on an item. This also enables a possible future extension to support asynchronous retrieval of nodes.
+ *
+ * To enable that efficient state representation, the nodes that are used to retrieve the internal state of the trie are
+ * light stateful objects and always contain a link to some parent state. The role of parent state can often be played
+ * by the parent node, because its current state defines the transition that was taken to obtain the child, and it also
+ * has a reference to its own parent, effectively building a stack of nodes left to process each holding its own state.
+ * It is also possible to skip some levels of the descent in the state description, if e.g. there are no other paths to
+ * examine at those levels to continue the traversal (see getUniqueDescendant). Some traversal types may require more
+ * information (e.g. position in a character array or list of nodes being merged). The type of parent state link is
+ * defined by the consumer through the generic parameter L and it is supplied by the consumer as an argument to the
+ * getCurrentChild call -- that parent state is presented by the child in its parentLink field.
+ *
+ *
+ * To begin traversal over a trie, one must retrieve the root node of the trie by calling root(). Because the nodes are
+ * stateful, the traversal must always proceed from one thread. Should concurrent reads be required, separate calls to
+ * root() must be made.
+ *
+ * Once a node is available, one can retrieve any associated content and list the children of the node along with their
+ * associated transition byte by:
+ * - calling startIteration to set the node's state to its first child
+ * - retrieving the associated transition byte using the node's currentTransition field
+ * - optionally retrieving the child using getCurrentChild giving it something you can use to restore your state
+ *   to continue processing the rest of the children of this node
+ * - when processing the child is complete/skipped or child is null, request the next child using advanceIteration and
+ *   repeat
+ * - if start/advanceIteration return null, there are no further children of the node
+ * - if they return Remaining.ONE, this is the last child of the node (the inverse is not always true, nodes will try
+ *   but do not guarantee they will report ONE on their last child)
+ * - when the children are exhausted, use the node's parent link to restore your state to what it was when the relevant
+ *   parent was being processed
+ * For an example of simple traversal, see TrieWalker. For a more complex traversal example, refer to TrieValuesIterator
+ * and TrieEntriesIterator.
+ *
+ * Note: This model only supports depth-first traversals. We do not currently have a need for breadth-first walks.
+ *
+ * @param <T> The content type of the trie. Content is only allowed on leaf nodes.
+ */
+public abstract class Trie<T>
+{
+    /**
+     * Enum used to indicate the presence of more children during the iteration of a node.
+     * Generally iteration will return null or MULTIPLE, but, to optimize walks, it can return ONE if it is known that
+     * there are no further children.
+     */
+    protected enum Remaining
+    {
+        ONE, MULTIPLE
+    }
+
+    /**
+     * Used by {@link Node#getUniqueDescendant} to feed the transitions taken.
+     */
+    protected interface TransitionsReceiver
+    {
+        /** Add a single byte to the path. */
+        void add(int t);
+        /** Add {@code count} bytes from the given buffer, starting at position {@code pos}. */
+        void add(UnsafeBuffer b, int pos, int count);
+    }
+
+    /**
+     * A trie node. Provides methods for listing the transition bytes and children of the node, as well as its content.
+     * Once a node is made available, all its methods, except the ones retrieving children, must proceed without
+     * blocking or throwing exceptions.
+     *
+     * To enable efficient traversals the node effectively stores a call stack, a back link to the state that
+     * was used to obtain the node. This data is used to resume walks along the items in a trie.
+     *
+     * A node is a stateful non-thread-safe object. It is okay to access it from different threads, provided such
+     * accesses are not concurrent, i.e. there is a happens-before relationship between calling each of a node's
+     * methods.
+     */
+    protected abstract static class Node<T, L>
+    {
+        /**
+         * Parent state, as set when {@link #getCurrentChild} or {@link #getUniqueDescendant} is called, or
+         * {@code null} if this is a root node.
+         * Often a node (which also holds its iteration state), but it does not need to be. Users/subscribers of the
+         * trie interface can choose what this link needs to contain, e.g. a merge node with a list of source nodes
+         * or a pair of a parent node with a byte array containing the key that leads to it.
+         */
+        public final L parentLink;
+
+        /** Current transition byte, set after each call to {@link #startIteration} and {@link #advanceIteration}. */
+        protected int currentTransition = -1;
+
+        protected Node(L parentLink)
+        {
+            this.parentLink = parentLink;
+        }
+
+        /**
+         * Sets up the node for forward iteration, positions it on the first child and sets {@link #currentTransition}.
+         * Note: It is expected that the node will be traversed only once, more precisely that no consumer will ask
+         * twice for the same child. Some implementations (e.g. singleton, subtrie) may fail if this is violated.
+         *
+         * @return null if the node has no children, otherwise {@link Remaining#MULTIPLE} or {@link Remaining#ONE} (if
+         * it knows this is the only transition).
+         */
+        public abstract Remaining startIteration();
+
+        /**
+         * Advances the node state to the next transition of the node and sets {@link #currentTransition}.
+         * <p>
+         * This can only be called after an iteration has been started by {@link #startIteration}.
+         *
+         * @return null if the node has no more children, otherwise {@link Remaining#MULTIPLE} or {@link Remaining#ONE}
+         * (if it knows this is the last transition).
+         *
+         * @throws IllegalStateException if no iteration has been started (with {@link #startIteration}), or if the
+         * preceding call to {@link #startIteration} or this method returned {@code null}. (Note: Implementations
+         * should permit this to be called after {@link Remaining#ONE}, which is redundant but easier to work with.)
+         */
+        public abstract Remaining advanceIteration();
+
+        /**
+         * Gets the child of this node corresponding to the current transition and with the given parent link.
+         * The current transition must have been set using {@link #startIteration} or {@link #advanceIteration},
+         * and it's an error to call this after either has returned {@code null}. This should only be
+         * called once for a given transition/child.
+         *
+         * The method may return null if the child turns out to not be present (e.g. in a dense node where it could be
+         * better to leave the check for the request call, or if a concurrent write has prepared the transition but not
+         * yet made it active by writing the child).
+         *
+         * @param parentLink the parent state to use to set {@link Node#parentLink} in the node provided as result to
+         * this request.
+         * @return the child corresponding to the current transition or null if the child does not exist
+         * (even though {@link #startIteration}/{@link #advanceIteration} thought it did).
+         */
+        public abstract Node<T, L> getCurrentChild(L parentLink);
+
+        /**
+         * If the node has exactly one child and no content, go to that child and continue descending while this is
+         * the case.
+         * This is done so that iteration over the content of the trie does not need to remember the parts of the path
+         * that are not branching points and thus don't need to be revisited while backtracking up the trie.
+         * Overridden by chain nodes (MemtableTrie.ChainNode); see TrieValuesIterator for usage.
+         * The receiver argument can be null if the caller does not need a record of the transitions taken.
+         */
+        public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
+        {
+            return this;
+        }
+
+        /**
+         * The content of this node, if any.
+         *
+         * @return the content of this node, or {@code null} if it has no attached content.
+         */
+        public abstract T content();
+    }
+
+    /**
+     * Returns an instantiation of the root node with null parent link.
+     * This is the only method that needs to be implemented in children.
+     *
+     * @param <L> The type of parent link that will be used in the traversal.
+     */
+    protected abstract <L> Node<T, L> root();
+
+    // Version of the byte comparable conversion to use for all operations
+    static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS41;
+
+    /**
+     * Base helper class for writing nodes that have no children.
+     */
+    protected abstract static class NoChildrenNode<T, L> extends Node<T, L>
+    {
+        NoChildrenNode(L parent)
+        {
+            super(parent);
+        }
+
+        public IllegalStateException error()
+        {
+            return new IllegalStateException("Node has no children.");
+        }
+
+        public Remaining startIteration()
+        {
+            return null;
+        }
+
+        public Remaining advanceIteration()
+        {
+            throw error();
+        }
+
+        public Node<T, L> getCurrentChild(L parent)
+        {
+            throw error();
+        }
+    }
+
+    public <V> V walk(TrieWalker<T, V> walker)
+    {
+        return TrieWalker.process(walker, this);
+    }
+
+    public String dump()
+    {
+        return dump(Object::toString);
+    }
+
+    public String dump(Function<T, String> contentToString)
+    {
+        return walk(new TrieDumper<>(contentToString));
+    }
+
+    /**
+     * Returns a singleton trie mapping the given byte path to content.
+     */
+    public static <T> Trie<T> singleton(ByteComparable b, T v)
+    {
+        return new SingletonTrie<>(b, v);
+    }
+
+    /**
+     * Returns a view of the subtrie containing everything in this trie whose keys fall between the given boundaries.
+     *
+     * This method will throw an assertion error if the bounds provided are not correctly ordered, including with
+     * respect to the `includeLeft` and `includeRight` constraints (i.e. subtrie(x, false, x, false) is an invalid call
+     * but subtrie(x, true, x, false) is inefficient but fine for an empty subtrie).
+     *
+     * @param left the left bound for the returned subtrie. If {@code null}, the resulting subtrie is not left-bounded.
+     * @param includeLeft whether {@code left} is an inclusive bound or not.
+     * @param right the right bound for the returned subtrie. If {@code null}, the resulting subtrie is not right-bounded.
+     * @param includeRight whether {@code right} is an inclusive bound or not.
+     * @return a view of the subtrie containing all the keys of this trie falling between {@code left} (inclusively if
+     * {@code includeLeft}) and {@code right} (inclusively if {@code includeRight}).
+     */
+    public Trie<T> subtrie(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight)
+    {
+        if (left == null && right == null)
+            return this;
+
+        return new SetIntersectionTrie<>(this, TrieSet.range(left, includeLeft, right, includeRight));
+    }
+
+    /**
+     * Returns the ordered entry set of this trie's content as an iterable.
+     */
+    public Iterable<Map.Entry<ByteComparable, T>> entrySet()
+    {
+        return this::entryIterator;
+    }
+
+    /**
+     * Returns the ordered entry set of this trie's content in an iterator.
+     */
+    public Iterator<Map.Entry<ByteComparable, T>> entryIterator()
+    {
+        return new TrieEntriesIterator.AsEntries<>(this);
+    }
+
+    /**
+     * Returns the ordered set of values of this trie as an iterable.
+     */
+    public Iterable<T> values()
+    {
+
+        return this::valueIterator;
+    }
+
+    /**
+     * Returns the ordered set of values of this trie in an iterator.
+     */
+    public Iterator<T> valueIterator()
+    {
+        return new TrieValuesIterator<>(this);
+    }
+
+    /**
+     * Returns the values in any order. For some tries this is much faster than the ordered iterable.
+     */
+    public Iterable<T> valuesUnordered()
+    {
+        return values();
+    }
+
+    /**
+     * Resolver of content of merged nodes, used for two-source merges (i.e. mergeWith).
+     */
+    public interface MergeResolver<T>
+    {
+        // Note: No guarantees about argument order.
+        // E.g. during t1.mergeWith(t2, resolver), resolver may be called with t1 or t2's items as first argument.
+        T resolve(T b1, T b2);
+    }
+
+    /**
+     * Constructs a view of the merge of this trie with the given one. The view is live, i.e. any write to any of the
+     * sources will be reflected in the merged view.
+     *
+     * If there is content for a given key in both sources, the resolver will be called to obtain the combination.
+     * (The resolver will not be called if there's content from only one source.)
+     */
+    public Trie<T> mergeWith(Trie<T> other, MergeResolver<T> resolver)
+    {
+        return new MergeTrie<>(resolver, this, other);
+    }
+
+    /**
+     * Resolver of content of merged nodes.
+     *
+     * The resolver's methods are only called if more than one of the merged nodes contain content, and the
+     * order in which the arguments are given is not defined. Only present non-null values will be included in the
+     * collection passed to the resolving methods.
+     *
+     * Can also be used as a two-source resolver.
+     */
+    public interface CollectionMergeResolver<T> extends MergeResolver<T>
+    {
+        T resolve(Collection<T> contents);
+
+        default T resolve(T c1, T c2)
+        {
+            return resolve(ImmutableList.of(c1, c2));
+        }
+    }
+
+    private static final CollectionMergeResolver<Object> THROWING_RESOLVER = new CollectionMergeResolver<Object>()
+    {
+        public Object resolve(Collection<Object> contents)
+        {
+            throw error();  // also reached for two-source merges via the interface's default resolve(c1, c2)
+        }
+        // Creates the error for the caller to throw (as NoChildrenNode.error() does) rather than throwing it here.
+        private AssertionError error()
+        {
+            return new AssertionError("Entries must be distinct.");
+        }
+    };
+
+    /**
+     * Returns a resolver that throws whenever more than one of the merged nodes contains content.
+     * Can be used to merge tries that are known to have distinct content paths.
+     */
+    public static <T> CollectionMergeResolver<T> throwingResolver()
+    {
+        return (CollectionMergeResolver<T>) THROWING_RESOLVER;
+    }
+
+    /**
+     * Constructs a view of the merge of multiple tries. The view is live, i.e. any write to any of the
+     * sources will be reflected (eventually consistently) in the merged view.
+     *
+     * If there is content for a given key in more than one source, the resolver will be called to obtain the combination.
+     * (The resolver will not be called if there's content from only one source.)
+     */
+    public static <T> Trie<T> merge(Collection<? extends Trie<T>> sources, CollectionMergeResolver<T> resolver)
+    {
+        switch (sources.size())
+        {
+        case 0:
+            return empty();
+        case 1:
+            return sources.iterator().next();
+        case 2:
+        {
+            Iterator<? extends Trie<T>> it = sources.iterator();
+            Trie<T> t1 = it.next();
+            Trie<T> t2 = it.next();
+            return t1.mergeWith(t2, resolver);
+        }
+        default:
+            return new CollectionMergeTrie<>(sources, resolver);
+        }
+    }
+
+    /**
+     * Constructs a view of the merge of multiple tries, where each source must have distinct keys. The view is live,
+     * i.e. any write to any of the sources will be reflected in the merged view.
+     *
+     * If there is content for a given key in more than one source, the merge will throw an assertion error.
+     */
+    public static <T> Trie<T> mergeDistinct(Collection<? extends Trie<T>> sources)
+    {
+        switch (sources.size())
+        {
+        case 0:
+            return empty();
+        case 1:
+            return sources.iterator().next();
+        case 2:
+        {
+            Iterator<? extends Trie<T>> it = sources.iterator();
+            Trie<T> t1 = it.next();
+            Trie<T> t2 = it.next();
+            return new MergeTrie.Distinct<>(t1, t2);
+        }
+        default:
+            return new CollectionMergeTrie.Distinct<>(sources);
+        }
+    }
+
+    private static final Trie<Object> EMPTY = new Trie<Object>()
+    {
+        public <L> Node<Object, L> root()
+        {
+            return null;
+        }
+    };
+
+    @SuppressWarnings("unchecked")
+    public static <T> Trie<T> empty()
+    {
+        return (Trie<T>) EMPTY;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
new file mode 100644
index 000000000000..60f11963000a
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.function.Function;
+
+/**
+ * Simple utility class for dumping the structure of a trie to string.
+ */
+class TrieDumper<T> implements TrieWalker<T, String>
+{
+    private final Function<T, String> contentToString;
+    private final StringBuilder b = new StringBuilder();
+    private int depth = -1;
+    private boolean indented = true;
+
+    TrieDumper(Function<T, String> contentToString)
+    {
+        this.contentToString = contentToString;
+    }
+
+    public void onNodeEntry(int incomingTransition, T content)
+    {
+        // A line was just finished, so start the new one with two spaces per level of the current depth.
+        if (!indented)
+        {
+            for (int level = depth; level > 0; --level)
+                b.append("  ");
+            indented = true;
+        }
+
+        ++depth;
+        // -1 is skipped (presumably the root, which has no incoming transition); other values are printed in hex.
+        if (incomingTransition != -1)
+            b.append(String.format("%02x", incomingTransition));
+
+        if (content == null)
+            return;
+
+        // Only go to a new line once a payload is reached
+        indented = false;
+        b.append(" -> ").append(contentToString.apply(content)).append('\n');
+    }
+
+    public void onNodeExit()
+    {
+        if (indented)
+        {
+            // We are backtracking without having printed content or meta. Although unexpected, this can legally happen
+            // (e.g. if an intersection has resulted in an empty node).
+            indented = false;
+            b.append('\n');
+        }
+        --depth;
+    }
+
+    public String completion()
+    {
+        return b.toString();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
new file mode 100644
index 000000000000..e0a1d5281fe2
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.Map;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Converter of trie entries to an iterator, where each entry is passed through {@link #mapContent} (to be implemented
+ * by descendants).
+ */
+public abstract class TrieEntriesIterator<T, V> extends TrieIteratorWithKey<T, V>
+{
+    protected TrieEntriesIterator(Trie<T> trie)
+    {
+        super(trie);
+    }
+
+    V contentOf(Trie.Node<T, NodeWithPosition<T>> node)
+    {
+        // Nodes without content produce no entry; otherwise the content is mapped together with the
+        // first ppos bytes of path.
+        T payload = node.content();
+        return payload != null ? mapContent(payload, path, ppos) : null;
+    }
+
+    protected abstract V mapContent(T content, byte[] bytes, int byteLength);
+
+    /**
+     * Iterator representing the content of the trie as a sequence of (path, content) pairs.
+     */
+    static class AsEntries<T>
+    extends TrieEntriesIterator<T, Map.Entry<ByteComparable, T>>
+    {
+        public AsEntries(Trie<T> trie)
+        {
+            super(trie);
+        }
+
+        protected Map.Entry<ByteComparable, T> mapContent(T content, byte[] bytes, int byteLength)
+        {
+            return toEntry(content, bytes, byteLength);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieIterator.java b/src/java/org/apache/cassandra/db/tries/TrieIterator.java
new file mode 100644
index 000000000000..b5591fc67fa1
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieIterator.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import org.apache.cassandra.utils.AbstractIterator;
+
+/**
+ * Utility class for performing some walks over the trie that result in an iterator of items.
+ * See TrieValuesIterator and TrieEntriesIterator for usage.
+ */
+abstract class TrieIterator<T, L, V> extends AbstractIterator<V>
+{
+    private Trie.Node<T, L> current;
+
+    protected TrieIterator(Trie<T> trie)
+    {
+        current = trie.root();
+        if (current == null)
+            endOfData();
+    }
+
+    protected V computeNext()
+    {
+        Trie.Remaining has = startIteration();
+
+        while (true)
+        {
+            if (has != null)
+            {
+                // We have a transition, get child to descend into
+                Trie.Node<T, L> child = getChild(current, has);
+
+                if (child == null)
+                {
+                    // no child, get next
+                    has = advanceIteration();
+                }
+                else
+                {
+                    // Enter node
+                    current = child;
+                    // Check payload
+                    V v = contentOf(child);
+                    if (v != null)
+                        return v; // payload was produced, wait for next()
+
+                    has = startIteration();
+                }
+            }
+            else
+            {
+                // There are no more children. Ascend to the parent state to continue walk.
+                current = exitNodeAndReturnParent(current);
+                if (current == null)
+                {
+                    // We've ascended past the root, our walk is finished
+                    return endOfData();
+                }
+                has = advanceIteration();
+            }
+        }
+    }
+
+    /**
+     * Start the iteration on a node. Can be overridden by children (e.g. to skip processing branch).
+     */
+    Trie.Remaining startIteration()
+    {
+        return current.startIteration();
+    }
+
+    /**
+     * Advance the iteration on a node. Can be overridden by children (e.g. to skip processing selected transitions).
+     */
+    Trie.Remaining advanceIteration()
+    {
+        return current.advanceIteration();
+    }
+
+    // The methods below are to be overridden by subclasses.
+
+    /**
+     * Called by computeNext to descend into a child node.
+     */
+    abstract Trie.Node<T, L> getChild(Trie.Node<T, L> node, Trie.Remaining has);
+
+    /**
+     * Called when a node is exited.
+     * Returns the parent with which to continue the traversal.
+     */
+    abstract Trie.Node<T, L> exitNodeAndReturnParent(Trie.Node<T, L> n);
+
+    /**
+     * Called to retrieve the content to be issued for a given node (e.g. content(), or a (path, content()) pair).
+     */
+    abstract V contentOf(Trie.Node<T, L> n);
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java b/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java
new file mode 100644
index 000000000000..97fb8ecbf482
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import java.util.AbstractMap;
+import java.util.Arrays;
+
+import org.agrona.concurrent.UnsafeBuffer;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Converter of trie content to flow, with information about the paths used to reach the content node.
+ * Descendants need to implement {@link TrieIterator#contentOf(Trie.Node)}; when the method is called the first
+ * {@link #ppos} bytes of {@link #path} will be filled with the path used to reach the node.
+ */
+public abstract class TrieIteratorWithKey<T, V>
+extends TrieIterator<T, TrieIteratorWithKey.NodeWithPosition<T>, V>
+        implements Trie.TransitionsReceiver
+{
+    byte[] path = new byte[256]; // bytes of the path walked so far; grown on demand by the add() methods
+    int ppos = 0; // number of valid bytes in path
+    NodeWithPosition<T> currentParentLink = (NodeWithPosition<T>) NO_LINK; // link last created or last returned to
+
+    static final NodeWithPosition<Object> NO_LINK = new NodeWithPosition<>(-1, null); // sentinel: no node, position -1
+
+    static class NodeWithPosition<T> // parent link: a node to return to plus the path length to restore on return
+    {
+        final int ppos;
+        final Trie.Node<T, NodeWithPosition<T>> node;
+
+        NodeWithPosition(int ppos, Trie.Node<T, NodeWithPosition<T>> node)
+        {
+            this.ppos = ppos;
+            this.node = node;
+        }
+    }
+
+    protected TrieIteratorWithKey(Trie<T> trie)
+    {
+        super(trie);
+    }
+
+    public void add(int t) // appends a single transition byte to the path
+    {
+        if (ppos == path.length)
+            path = Arrays.copyOf(path, path.length * 2); // buffer full: double it
+        path[ppos++] = (byte) t;
+    }
+
+    public void add(UnsafeBuffer b, int pos, int count) // appends count bytes of b, starting at pos, to the path
+    {
+        if (ppos + count > path.length)
+            path = Arrays.copyOf(path, Math.max(ppos + count + 16, path.length * 2)); // at least double, or needed + 16
+        b.getBytes(pos, path, ppos, count);
+        ppos += count;
+    }
+
+    Trie.Node<T, NodeWithPosition<T>> getChild(Trie.Node<T, NodeWithPosition<T>> node, Trie.Remaining has)
+    {
+        int currentPos = ppos; // path length at node, i.e. before the transition byte is appended
+        add(node.currentTransition);
+
+        NodeWithPosition<T> parentLink;
+        if (has == Trie.Remaining.ONE)
+        {
+            // As in TrieValuesIterator, when we are processing the last child of a node we can skip it when backtracking.
+            parentLink = node.parentLink;
+        }
+        else
+        {
+            assert has != null;
+            // Otherwise, we need to be returning to this node. Create a parentLink object if one doesn't yet exist,
+            // saving the byte position in the path.
+            if (currentParentLink.node != node)
+            {
+                assert currentParentLink.ppos < currentPos;
+                currentParentLink = new NodeWithPosition<>(currentPos, node);
+            }
+            parentLink = currentParentLink;
+        }
+
+        Trie.Node<T, NodeWithPosition<T>> child = node.getCurrentChild(parentLink);
+
+        if (child != null)
+            child = child.getUniqueDescendant(parentLink, this); // NOTE(review): presumably reports skipped bytes via add() - confirm
+
+        if (child == null)
+            ppos = parentLink != null ? parentLink.ppos : 0; // restore state as we won't get an exitNodeAndReturnParent call.
+
+        return child;
+    }
+
+    Trie.Node<T, NodeWithPosition<T>> exitNodeAndReturnParent(Trie.Node<T, NodeWithPosition<T>> n)
+    {
+        NodeWithPosition<T> parentLink = n.parentLink;
+        if (parentLink == null)
+            return null; // no link to return to
+        else
+        {
+            ppos = parentLink.ppos; // truncate the path back to the length saved in the link
+            currentParentLink = parentLink; // lets getChild reuse this link for the next child of the same node
+            return parentLink.node;
+        }
+    }
+
+    static <T> java.util.Map.Entry<ByteComparable, T> toEntry(T content, byte[] bytes, int byteLength)
+    {
+        ByteComparable b = ByteComparable.fixedLength(Arrays.copyOf(bytes, byteLength)); // copies the first byteLength bytes
+        return new AbstractMap.SimpleImmutableEntry<>(b, content);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java
new file mode 100644
index 000000000000..c1c0d46a879b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * A simplified version of Trie used for sets (whose ultimate function is to intersect a Trie).
+ *
+ * Sets cannot be asynchronous and support a special value (see {@link #FULL}) to denote a branch is fully included in
+ * the set, which is used to speed up intersections.
+ *
+ * Like Trie nodes, set nodes are stateful and not thread-safe. If the consumer can use multiple threads when accessing
+ * a node (e.g. with asynchronous trie walks), it must enforce a happens-before relationship between calls to the
+ * methods of a node.
+ */
+public abstract class TrieSet
+{
+    public abstract SetNode root(); // may be null (empty set) or FULL (full set), see the constants below
+
+    interface SetNode
+    {
+        boolean startIteration(); // NOTE(review): presumably true when a first transition exists - confirm
+        boolean advanceIteration(); // NOTE(review): presumably true when a further transition exists - confirm
+        int currentTransition();
+        SetNode getCurrentChild();
+
+        /**
+         * Returns true if this specific position is in the set (i.e. if content in the intersected node should be
+         * returned).
+         *
+         * Note: Having a node produced by the trie set does not necessarily mean the relevant key is in the set.
+         * Imagine a singleton set, e.g. {010203}. It will be represented as the following trie:
+         *     root -01-> node1 -02-> node2 -03-> node3
+         * where only node3 will have inSet() == true. Root (corresponding to empty key), node1 (key 01) and node2 (key
+         * 0102) are not in the set and thus their inSet() will be false.
+         */
+        boolean inSet();
+    }
+
+    protected static final SetNode FULL = new SetNode() // sentinel node: every method throws, callers must special-case it
+    {
+        public AssertionError error() // never returns normally; the return type only lets callers write "throw error()"
+        {
+            throw new AssertionError("SetNode FULL must be handled explicitly.");
+        }
+
+        public boolean startIteration()
+        {
+            throw error();
+        }
+
+        public boolean advanceIteration()
+        {
+            throw error();
+        }
+
+        public int currentTransition()
+        {
+            throw error();
+        }
+
+        public SetNode getCurrentChild()
+        {
+            throw error();
+        }
+
+        public boolean inSet()
+        {
+            throw error();
+        }
+    };
+
+    private static final TrieSet FULL_SET = new TrieSet() // shared instance returned by full()
+    {
+        public SetNode root()
+        {
+            return FULL;
+        }
+    };
+
+    private static final TrieSet EMPTY_SET = new TrieSet() // shared instance returned by empty()
+    {
+        public SetNode root()
+        {
+            return null; // the empty set has no root node
+        }
+    };
+
+    /**
+     * Range of keys between the given boundaries.
+     * A null argument for any of the limits means that the set should be unbounded on that side.
+     * The keys must be correctly ordered, including with respect to the `includeLeft` and `includeRight` constraints.
+     * (i.e. range(x, false, x, false) is an invalid call but range(x, true, x, false) is inefficient
+     * but fine for an empty set).
+     */
+    public static TrieSet range(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight)
+    {
+        return new RangeTrieSet(left, includeLeft, right, includeRight); // a new set object per call, arguments passed through
+    }
+
+    public static TrieSet full()
+    {
+        return FULL_SET; // root() of this set is the FULL sentinel
+    }
+
+    public static TrieSet empty()
+    {
+        return EMPTY_SET; // root() of this set is null
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
new file mode 100644
index 000000000000..7a473f80bdb0
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+/**
+ * Converter of trie contents to flow.
+ *
+ * Note: the type argument L must be equal to {@code Trie.Node<T, L>}, but we can't define such a recursive type in
+ * Java. Using {@code <>} when instantiating works, but any subclasses will also need to declare this useless type
+ * argument.
+ */
+class TrieValuesIterator<T, L extends Trie.Node<T, L>> extends TrieIterator<T, L, T>
+{
+    public TrieValuesIterator(Trie<T> trie)
+    {
+        super(trie);
+    }
+
+    Trie.Node<T, L> getChild(Trie.Node<T, L> node, Trie.Remaining has)
+    {
+        // If we know this is the last child for this node, we can just as well skip this node when backtracking,
+        final L parentLink = has == Trie.Remaining.ONE ? node.parentLink : (L) node; // unchecked cast, see class note on L
+
+        Trie.Node<T, L> child = node.getCurrentChild(parentLink);
+
+        // and as long as any child has single descendant, we don't need to backtrack to that either.
+        if (child != null)
+            child = child.getUniqueDescendant(parentLink, null); // null receiver: no path bytes are collected here
+
+        return child; // may be null
+    }
+
+    Trie.Node<T, L> exitNodeAndReturnParent(Trie.Node<T, L> n)
+    {
+        return n.parentLink; // the link chosen in getChild, which may bypass nodes that had only one child left
+    }
+
+    T contentOf(Trie.Node<T, L> node)
+    {
+        return node.content(); // the node's own content, without any path information
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieWalker.java b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
new file mode 100644
index 000000000000..fb76c9d5ba0e
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.tries;
+
+/**
+ * Utility class for performing some types of walks over the trie, where the result can be used as a
+ * CompletableFuture.
+ * See TrieDumper for sample usage.
+ */
+interface TrieWalker<T, V>
+{
+    /**
+     * Called when entering a node of the trie.
+     *
+     * @param incomingTransition the transition that led here, -1 if this is the root.
+     */
+    void onNodeEntry(int incomingTransition, T content);
+
+    /**
+     * Called when leaving a node of the trie, that is after having exited its last child.
+     */
+    void onNodeExit();
+
+    /**
+     * The final value of the trie walk.
+     * <p>
+     * This is called on completion of the walk (after calling {@link #onNodeExit} on the root node) to obtain the
+     * final outcome of the walk.
+     * <p>
+     * Note on {@code process}: its type parameter L must be equal to {@code Trie.Node<T, L>}. There is no way to specify
+     * such recursive types in Java, but it does get inferred correctly in calls to that method.
+     *
+     * @return the final outcome of the walk.
+     */
+    V completion();
+
+    public static <T, V, L extends Trie.Node<T, L>> V process(TrieWalker<T, V> walker, Trie<T> trie)
+    {
+        Trie.Node<T, L> current = trie.root();
+        if (current == null)
+            return walker.completion(); // no root: nothing to walk, no entry/exit callbacks are made
+
+        walker.onNodeEntry(-1, current.content()); // -1 marks the root, see onNodeEntry
+
+        Trie.Remaining has = current.startIteration();
+
+        while (true)
+        {
+            if (has != null)
+            {
+                // We have a transition, get child to descend into
+                Trie.Node<T, L> child = current.getCurrentChild((L) current); // current is always the parent link here
+                if (child == null)
+                {
+                    // no child, get next
+                    has = current.advanceIteration();
+                }
+                else
+                {
+                    walker.onNodeEntry(current.currentTransition, child.content());
+
+                    // We have a new child. Move to it
+                    current = child;
+                    has = child.startIteration();
+                }
+            }
+            else
+            {
+                // There are no more children. Ascend to the parent state to continue walk.
+                walker.onNodeExit();
+                current = current.parentLink;
+                if (current == null)
+                {
+                    // We've ascended past the root, our walk is finished
+                    return walker.completion();
+                }
+                has = current.advanceIteration();
+            }
+        }
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieReadBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieReadBench.java
new file mode 100644
index 000000000000..5db89ec8aeff
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieReadBench.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.test.microbench.tries;
+
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(value = 1,jvmArgsAppend = { "-Xmx4G", "-Xms4G", "-Djmh.executor=CUSTOM", "-Djmh.executor.class=org.apache.cassandra.test.microbench.FastThreadExecutor"})
+@Threads(1) // no concurrent writes
+@State(Scope.Benchmark)
+public class MemtableTrieReadBench
+{
+    @Param({"ON_HEAP", "OFF_HEAP"})
+    BufferType bufferType = BufferType.OFF_HEAP;
+
+    @Param({"1000", "100000", "10000000"})
+    int count = 1000; // number of put calls made in setup()
+
+    final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> y; // always returns its second argument
+
+    MemtableTrie<Byte> trie; // populated once per trial by setup()
+
+    @Setup(Level.Trial)
+    public void setup() throws Throwable
+    {
+        trie = new MemtableTrie<>(bufferType);
+        Random rand = new Random(1); // fixed seed, so getRandom() can regenerate the same keys
+
+        System.out.format("Putting %,d\n", count);
+        for (long current = 0; current < count; ++current)
+        {
+            long l = rand.nextLong();
+            trie.putRecursive(ByteComparable.of(l), Byte.valueOf((byte) (l >> 56)), resolver); // value = top byte of the key
+        }
+        System.out.format("Trie size on heap %,d off heap %,d\n",
+                          trie.sizeOnHeap(), trie.sizeOffHeap());
+        System.out.format("per entry on heap %.2f off heap %.2f\n",
+                          trie.sizeOnHeap() * 1.0 / count, trie.sizeOffHeap() * 1.0 / count);
+    }
+
+    @Benchmark
+    public void getRandom()
+    {
+        Random rand = new Random(1); // same seed as setup(): replays the sequence of inserted keys
+
+        for (long current = 0; current < count; ++current)
+        {
+            long l = rand.nextLong();
+            Byte res = trie.get(ByteComparable.of(l));
+            if (res.byteValue() != l >> 56) // the stored value must be the key's top byte, as written in setup()
+                throw new AssertionError();
+        }
+    }
+
+    @Benchmark
+    public int iterateValues()
+    {
+        int sum = 0;
+        for (byte b : trie.values())
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateValuesUnordered()
+    {
+        int sum = 0;
+        for (byte b : trie.valuesUnordered())
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateEntries()
+    {
+        int sum = 0;
+        for (Map.Entry<ByteComparable, Byte> en : trie.entrySet())
+            sum += en.getValue();
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateValuesLimited()
+    {
+        Iterable<Byte> values = trie.subtrie(ByteComparable.of(0L),
+                                             true,
+                                             ByteComparable.of(Long.MAX_VALUE / 2),         // 1/4 of all
+                                             false)
+                                    .values();
+        int sum = 0;
+        for (byte b : values)
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieUnionBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieUnionBench.java
new file mode 100644
index 000000000000..3d40b0bfa64c
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieUnionBench.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.test.microbench.tries;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.db.tries.Trie;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(value = 1,jvmArgsAppend = { "-Xmx4G", "-Xms4G", "-Djmh.executor=CUSTOM", "-Djmh.executor.class=org.apache.cassandra.test.microbench.FastThreadExecutor"})
+@Threads(1) // no concurrent writes
+@State(Scope.Benchmark)
+public class MemtableTrieUnionBench
+{
+    @Param({"ON_HEAP", "OFF_HEAP"})
+    BufferType bufferType = BufferType.OFF_HEAP;
+
+    @Param({"1000", "100000", "10000000"})
+    int count = 1000; // total number of put calls, spread over all source tries
+
+    @Param({"2", "3", "8"})
+    int sources = 2; // number of source tries that are merged
+
+    @Param({"false", "true"})
+    boolean sequential = true; // true: each source gets its own key range; false: keys are spread at random
+
+    final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> y; // always returns its second argument
+
+    Trie<Byte> trie; // the merged view over all sources, built once per trial by setup()
+
+    @Setup(Level.Trial)
+    public void setup() throws Throwable
+    {
+        List<MemtableTrie<Byte>> tries = new ArrayList<>(sources);
+        System.out.format("Putting %,d among %d tries\n", count, sources);
+        Random rand = new Random(1);
+        if (sequential)
+        {
+            long sz = 65536 / sources; // width of each source's slice of the 16-bit key prefix space
+            for (int i = 0; i < sources; ++i)
+                tries.add(new MemtableTrie<>(bufferType));
+
+            for (long current = 0; current < count; ++current)
+            {
+                long l = rand.nextLong();
+                MemtableTrie<Byte> tt = tries.get(Math.min((int) (((l >> 48) + 32768) / sz), sources - 1)); // pick by top 16 bits
+                tt.putRecursive(ByteComparable.of(l), (byte) (l >> 56), resolver);
+            }
+
+        }
+        else
+        {
+            long current = 0;
+            for (int i = 0; i < sources; ++i)
+            {
+                MemtableTrie<Byte> source = new MemtableTrie<>(bufferType); // named so it does not shadow the trie field
+                int currMax = this.count * (i + 1) / sources; // cumulative share of count for sources 0..i
+
+                for (; current < currMax; ++current)
+                {
+                    long l = rand.nextLong();
+                    source.putRecursive(ByteComparable.of(l), (byte) (l >> 56), resolver);
+                }
+                tries.add(source);
+            }
+        }
+
+        for (MemtableTrie<Byte> source : tries)
+        {
+            System.out.format("Trie size on heap %,d off heap %,d\n",
+                              source.sizeOnHeap(), source.sizeOffHeap());
+        }
+        trie = Trie.mergeDistinct(tries);
+
+        System.out.format("Actual count %,d\n", Iterables.size(trie.values()));
+    }
+
+    @Benchmark
+    public int iterateValues()
+    {
+        int sum = 0;
+        for (byte b : trie.values())
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateValuesUnordered()
+    {
+        int sum = 0;
+        for (byte b : trie.valuesUnordered())
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateEntries()
+    {
+        int sum = 0;
+        for (Map.Entry<ByteComparable, Byte> en : trie.entrySet())
+            sum += en.getValue();
+        return sum; // returned so the loop has an observable result
+    }
+
+    @Benchmark
+    public int iterateValuesLimited()
+    {
+        Iterable<Byte> values = trie.subtrie(ByteComparable.of(0L),
+                                             true,
+                                             ByteComparable.of(Long.MAX_VALUE / 2),         // 1/4 of all
+                                             false)
+                                    .values();
+        int sum = 0;
+        for (byte b : values)
+            sum += b;
+        return sum; // returned so the loop has an observable result
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
new file mode 100644
index 000000000000..8b99ac95eb07
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.test.microbench.tries;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(value = 1,jvmArgsAppend = { "-Xmx4G", "-Xms4G", "-Djmh.executor=CUSTOM", "-Djmh.executor.class=org.apache.cassandra.test.microbench.FastThreadExecutor"})
+@Threads(1) // no concurrent writes
+@State(Scope.Benchmark)
+public class MemtableTrieWriteBench
+{
+    @Param({"ON_HEAP", "OFF_HEAP"})
+    BufferType bufferType = BufferType.OFF_HEAP;
+
+    @Param({"1000", "100000", "10000000"})
+    int count = 1000; // number of put calls per benchmark invocation
+
+    @Param({"8"})
+    int keyLength = 8; // key size in bytes; the sequential benchmarks write a long into the last 8 bytes
+
+    final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> x; // always returns its first argument
+
+    @Benchmark
+    public void putSequential() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<Byte> trie = new MemtableTrie<>(bufferType); // a fresh trie per invocation, so construction is timed too
+        ByteBuffer buf = ByteBuffer.allocate(keyLength);
+
+        for (long current = 0; current < count; ++current)
+        {
+            long l = current;
+            buf.putLong(keyLength - 8, l); // absolute put: the buffer's position is left unchanged
+            trie.putRecursive(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
+        }
+    }
+
+    @Benchmark
+    public void putRandom() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<Byte> trie = new MemtableTrie<>(bufferType); // a fresh trie per invocation
+        Random rand = new Random(1); // fixed seed: every invocation inserts the same key sequence
+        byte[] buf = new byte[keyLength];
+
+        for (long current = 0; current < count; ++current)
+        {
+            rand.nextBytes(buf);
+            trie.putRecursive(ByteComparable.fixedLength(buf), buf[0], resolver);
+        }
+    }
+
+    @Benchmark
+    public void applySequential() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<Byte> trie = new MemtableTrie<>(bufferType); // a fresh trie per invocation
+        ByteBuffer buf = ByteBuffer.allocate(keyLength);
+
+        for (long current = 0; current < count; ++current)
+        {
+            long l = current;
+            buf.putLong(keyLength - 8, l); // absolute put: the buffer's position is left unchanged
+            trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
+        }
+    }
+
+    @Benchmark
+    public void applyRandom() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<Byte> trie = new MemtableTrie<>(bufferType); // a fresh trie per invocation
+        Random rand = new Random(1); // fixed seed: every invocation inserts the same key sequence
+        byte[] buf = new byte[keyLength];
+
+        for (long current = 0; current < count; ++current)
+        {
+            rand.nextBytes(buf);
+            trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf(buf[0]), resolver);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java
new file mode 100644
index 000000000000..0183680c7692
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/CollectionMergeTrieTest.java
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Test;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.*;
+import static org.apache.cassandra.db.tries.MergeTrieTest.removeDuplicates;
+
+public class CollectionMergeTrieTest
+{
+    private static final int COUNT = 15000;
+    Random rand = new Random();
+
+    @Test
+    public void testDirect()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        content1.putAll(content2);
+        // construct directly, trie.merge() will defer to mergeWith on two sources
+        Trie<ByteBuffer> union = new CollectionMergeTrie<>(ImmutableList.of(trie1, trie2), x -> x.iterator().next());
+
+        assertSameContent(union, content1);
+    }
+
+    @Test
+    public void testWithDuplicates()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        addToMemtableTrie(generateKeys(new Random(5), COUNT), content1, trie1, true);
+        addToMemtableTrie(generateKeys(new Random(5), COUNT), content2, trie2, true);
+
+        content1.putAll(content2);
+        Trie<ByteBuffer> union = new CollectionMergeTrie<>(ImmutableList.of(trie1, trie2), x -> x.iterator().next());
+
+        assertSameContent(union, content1);
+    }
+
+    @Test
+    public void testDistinct()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        src2 = removeDuplicates(src2, content1);
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        content1.putAll(content2);
+        Trie<ByteBuffer> union = new CollectionMergeTrie.Distinct<>(ImmutableList.of(trie1, trie2));
+
+        assertSameContent(union, content1);
+    }
+
+    @Test
+    public void testMultiple()
+    {
+        for (int i = 0; i < 10; ++i)
+        {
+            testMultiple(rand.nextInt(10) + 5, COUNT / 10);
+        }
+    }
+
+    @Test
+    public void testMerge1()
+    {
+        testMultiple(1, COUNT / 10);
+    }
+
+    @Test
+    public void testMerge2()
+    {
+        testMultiple(2, COUNT / 10);
+    }
+
+    @Test
+    public void testMerge3()
+    {
+        testMultiple(3, COUNT / 10);
+    }
+
+    @Test
+    public void testMerge5()
+    {
+        testMultiple(5, COUNT / 10);
+    }
+
+    @Test
+    public void testMerge0()
+    {
+        testMultiple(0, COUNT / 10);
+    }
+
+    public void testMultiple(int mergeCount, int count)
+    {
+        testMultipleDistinct(mergeCount, count);
+        testMultipleWithDuplicates(mergeCount, count);
+    }
+
+    public void testMultipleDistinct(int mergeCount, int count)
+    {
+        List<Trie<ByteBuffer>> tries = new ArrayList<>(mergeCount);
+        SortedMap<ByteComparable, ByteBuffer> content = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        for (int i = 0; i < mergeCount; ++i)
+        {
+            ByteComparable[] src = removeDuplicates(generateKeys(rand, count), content);
+            Trie<ByteBuffer> trie = makeMemtableTrie(src, content, true);
+            tries.add(trie);
+        }
+
+        Trie<ByteBuffer> union = Trie.mergeDistinct(tries);
+
+        assertSameContent(union, content);
+    }
+
+    public void testMultipleWithDuplicates(int mergeCount, int count)
+    {
+        List<Trie<ByteBuffer>> tries = new ArrayList<>(mergeCount);
+        SortedMap<ByteComparable, ByteBuffer> content = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        for (int i = 0; i < mergeCount; ++i)
+        {
+            ByteComparable[] src = generateKeys(rand, count);
+            Trie<ByteBuffer> trie = makeMemtableTrie(src, content, true);
+            tries.add(trie);
+        }
+
+        Trie<ByteBuffer> union = Trie.merge(tries, x -> x.iterator().next());
+
+        assertSameContent(union, content);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieApplyTest.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieApplyTest.java
new file mode 100644
index 000000000000..760717293d6e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieApplyTest.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+public class MemtableTrieApplyTest extends MemtableTrieTestBase // concrete variant of the shared tests with usePut() == false
+{
+    @Override
+    boolean usePut() // MemtableTrieTestBase.testDirect forwards this flag to makeMemtableTrie
+    {
+        return false;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTriePutTest.java b/test/unit/org/apache/cassandra/db/tries/MemtableTriePutTest.java
new file mode 100644
index 000000000000..6ff8871478d2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTriePutTest.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.fail;
+
+public class MemtableTriePutTest extends MemtableTrieTestBase
+{
+    @Override
+    boolean usePut()
+    {
+        return true;
+    }
+
+    @Test
+    public void testLongKey_StackOverflow() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.ON_HEAP);
+        Random rand = new Random(1);
+        byte[] key = new byte[40960];
+        rand.nextBytes(key);
+        ByteBuffer buf = ByteBuffer.wrap(key);
+
+        try
+        {
+            trie.putRecursive(ByteComparable.fixedLength(buf), "value", (x, y) -> y);
+            Assert.fail("StackOverflowError expected with a recursive put for very long keys!");
+        }
+        catch (StackOverflowError soe)
+        {
+            // Expected.
+        }
+        // Using non-recursive put should work.
+        putSimpleResolve(trie, ByteComparable.fixedLength(buf), "value", (x, y) -> y, false);
+    }
+
+    // This tests that trie space allocation works correctly close to the 2G limit. It is normally disabled because
+    // the test machines don't provide enough heap memory (test requires ~8G heap to finish). Run it manually when
+    // MemtableTrie.allocateBlock is modified.
+    @Ignore("Requires ~8G heap; run manually when MemtableTrie.allocateBlock is modified")
+    @Test
+    public void testOver1GSize() throws MemtableTrie.SpaceExhaustedException
+    {
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.ON_HEAP);
+        trie.advanceAllocatedPos(0x20000000);  // 512M
+        String t1 = "test1";
+        String t2 = "testing2";
+        String t3 = "onemoretest3";
+        trie.putRecursive(ByteComparable.of(t1), t1, (x, y) -> y);
+        Assert.assertEquals(t1, trie.get(ByteComparable.of(t1)));
+        Assert.assertNull(trie.get(ByteComparable.of(t2)));
+        Assert.assertFalse(trie.reachedAllocatedSizeThreshold());
+
+        trie.advanceAllocatedPos(0x40001000);  // over 1G
+        trie.putRecursive(ByteComparable.of(t2), t2, (x, y) -> y);
+        Assert.assertEquals(t1, trie.get(ByteComparable.of(t1)));
+        Assert.assertEquals(t2, trie.get(ByteComparable.of(t2)));
+        Assert.assertNull(trie.get(ByteComparable.of(t3)));
+        Assert.assertTrue(trie.reachedAllocatedSizeThreshold());
+
+        trie.advanceAllocatedPos(0x7FFFFEE0);  // close to 2G
+        Assert.assertEquals(t1, trie.get(ByteComparable.of(t1)));
+        Assert.assertEquals(t2, trie.get(ByteComparable.of(t2)));
+        Assert.assertNull(trie.get(ByteComparable.of(t3)));
+        Assert.assertTrue(trie.reachedAllocatedSizeThreshold());
+
+        try
+        {
+            trie.putRecursive(ByteComparable.of(t3), t3, (x, y) -> y);  // should put it over the edge
+            fail("MemtableTrie.SpaceExhaustedException was expected");
+        }
+        catch (MemtableTrie.SpaceExhaustedException e)
+        {
+            // expected
+        }
+
+        Assert.assertEquals(t1, trie.get(ByteComparable.of(t1)));  // the failed put must leave existing content readable
+        Assert.assertEquals(t2, trie.get(ByteComparable.of(t2)));
+        Assert.assertNull(trie.get(ByteComparable.of(t3)));
+        Assert.assertTrue(trie.reachedAllocatedSizeThreshold());
+
+        try
+        {
+            trie.advanceAllocatedPos(Integer.MAX_VALUE);
+            fail("MemtableTrie.SpaceExhaustedException was expected");
+        }
+        catch (MemtableTrie.SpaceExhaustedException e)
+        {
+            // expected
+        }
+
+        Assert.assertEquals(t1, trie.get(ByteComparable.of(t1)));
+        Assert.assertEquals(t2, trie.get(ByteComparable.of(t2)));
+        Assert.assertNull(trie.get(ByteComparable.of(t3)));
+        Assert.assertTrue(trie.reachedAllocatedSizeThreshold());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
new file mode 100644
index 000000000000..2dec4746d3a9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
@@ -0,0 +1,581 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.Stream;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Throwables;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.ObjectSizes;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
+
+public abstract class MemtableTrieTestBase
+{
+    private static final int COUNT = 100000;
+    private static final int KEY_CHOICE = 25;
+    private static final int MIN_LENGTH = 10;
+    private static final int MAX_LENGTH = 50;
+    Random rand = new Random();
+
+    static final ByteComparable.Version VERSION = MemtableTrie.BYTE_COMPARABLE_VERSION;
+
+    abstract boolean usePut();
+
+    @Test // a single inserted key is retrievable, and a longer key sharing its prefix is not
+    public void testSingle()
+    {
+        ByteComparable e = ByteComparable.of("test");
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.OFF_HEAP);
+        putSimpleResolve(trie, e, "test", (x, y) -> y);
+        System.out.println("Trie " + trie.dump());
+        assertEquals("test", trie.get(e));
+        Assert.assertNull(trie.get(ByteComparable.of("teste"))); // assertNull states the intent directly
+    }
+
+    @Test
+    public void testSplitMulti()
+    {
+        testEntries(new String[] { "testing", "tests", "trials", "trial", "aaaa", "aaaab", "abdddd", "abeeee" });
+    }
+
+    @Test
+    public void testSplitMultiBug()
+    {
+        testEntriesHex(new String[] { "0c4143aeff", "0c4143ae69ff" });
+    }
+
+
+    @Test // inserts three hex-encoded keys, then checks point lookups and the order of values()
+    public void testSparse00bug()
+    {
+        String[] tests = new String[] {
+        "40bd256e6fd2adafc44033303000",
+        "40bdd47ec043641f2b403131323400",
+        "40bd00bf5ae8cf9d1d403133323800",
+        };
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.OFF_HEAP);
+        for (String test : tests)
+        {
+            ByteComparable e = ByteComparable.fixedLength(ByteBufferUtil.hexToBytes(test));
+            System.out.println("Adding " + asString(e) + ": " + test);
+            putSimpleResolve(trie, e, test, (x, y) -> y);
+        }
+
+        System.out.println(trie.dump());
+
+        for (String test : tests)
+            assertEquals(test, trie.get(ByteComparable.fixedLength(ByteBufferUtil.hexToBytes(test))));
+
+        Arrays.sort(tests); // values() is expected to return the payloads in sorted key order
+
+        int idx = 0;
+        for (String s : trie.values())
+        {
+            if (!tests[idx].equals(s)) // compare by value, not by reference identity
+                throw new AssertionError("" + s + "!=" + tests[idx]);
+            ++idx;
+        }
+        assertEquals(tests.length, idx);
+    }
+
+    @Test // repeated keys are resolved by concatenation; the stored value must equal all values added under the key
+    public void testUpdateContent()
+    {
+        String[] tests = new String[] {"testing", "tests", "trials", "trial", "testing", "trial", "trial"};
+        String[] values = new String[] {"testing", "tests", "trials", "trial", "t2", "x2", "y2"};
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.OFF_HEAP);
+        for (int i = 0; i < tests.length; ++i)
+        {
+            String test = tests[i];
+            String v = values[i];
+            ByteComparable e = ByteComparable.of(test);
+            System.out.println("Adding " + asString(e) + ": " + v);
+            putSimpleResolve(trie, e, v, (x, y) -> "" + x + y);
+            System.out.println("Trie " + trie.dump());
+        }
+
+        for (int i = 0; i < tests.length; ++i)
+        {
+            String test = tests[i];
+            assertEquals(Stream.iterate(0, x -> x + 1) // expected: values of every index with this key, in insertion order
+                               .limit(tests.length)
+                               .filter(x -> tests[x].equals(test)) // value equality; '==' only worked through literal interning
+                               .map(x -> values[x])
+                               .reduce("", (x, y) -> "" + x + y),
+                         trie.get(ByteComparable.of(test)));
+        }
+    }
+
+    static class SpecifiedChildrenNode<L> extends Trie.Node<ByteBuffer, L>
+    {
+        final Object[] children;
+
+        SpecifiedChildrenNode(L parent, Object[] children)
+        {
+            super(parent);
+            this.children = children;
+        }
+
+        public Trie.Remaining startIteration()
+        {
+            currentTransition = 0x30;
+            return remaining();
+        }
+
+        private Trie.Remaining remaining()
+        {
+            final int left = children.length - (currentTransition - 0x30);
+            return left > 1
+                   ? Trie.Remaining.MULTIPLE
+                   : left == 1
+                     ? Trie.Remaining.ONE
+                     : null;
+        }
+
+        public Trie.Remaining advanceIteration()
+        {
+            ++currentTransition;
+            return remaining();
+        }
+
+        public Trie.Node<ByteBuffer, L> getCurrentChild(L parentLink)
+        {
+            return makeSpecifiedChildrenNode(parentLink, children[currentTransition - 0x30]);
+        }
+
+        public Trie.Node<ByteBuffer, L> getUniqueDescendant(L parentLink, Trie.TransitionsReceiver receiver)
+        {
+            if (children.length != 1)
+                return this;
+
+            if (receiver != null)
+                receiver.add(0x30);
+
+            Object child;
+            for (child = children[0];
+                 child instanceof Object[] && ((Object[]) child).length == 1;
+                 child = ((Object[]) child)[0])
+                if (receiver != null)
+                    receiver.add(0x30);
+
+            return makeSpecifiedChildrenNode(parentLink, child);
+        }
+
+        public ByteBuffer content()
+        {
+            return null;
+        }
+    }
+
+    static <L> Trie.Node<ByteBuffer, L> makeSpecifiedChildrenNode(L parent, Object nodeDef)
+    {
+        if (nodeDef == null)
+            return null;
+        else if (nodeDef instanceof Object[])
+            return new SpecifiedChildrenNode<>(parent, (Object[]) nodeDef);
+        else
+            return new Trie.NoChildrenNode<ByteBuffer, L>(parent)
+            {
+                public ByteBuffer content()
+                {
+                    return (ByteBuffer) nodeDef;
+                }
+            };
+    }
+
+    static Trie<ByteBuffer> specifiedTrie(Object nodeDef)
+    {
+        return new Trie<ByteBuffer>()
+        {
+            protected <L> Node<ByteBuffer, L> root()
+            {
+                return makeSpecifiedChildrenNode(null, nodeDef);
+            }
+        };
+    }
+
+    @Test // walks a hand-built trie containing null children; the expected content simply omits them
+    public void testEntriesNullChildBug()
+    {
+        Object[] trieDef = new Object[] // trailing comments give each node's key; child i is reached by byte 0x30 + i
+                                   {
+                                           new Object[] { // 0
+                                                   ByteBufferUtil.bytes(1), // 00
+                                                   ByteBufferUtil.bytes(2)  // 01
+                                           },
+                                           // If requestChild returns null, bad things can happen (DB-2982)
+                                           null, // 1
+                                           ByteBufferUtil.bytes(3), // 2
+                                           new Object[] {  // 3
+                                                   ByteBufferUtil.bytes(4), // 30
+                                                   // Also try null on the Remaining.ONE path
+                                                   null // 31
+                                           },
+                                           ByteBufferUtil.bytes(5), // 4
+                                           // Also test requestUniqueDescendant returning null
+                                           new Object[] { // 5
+                                                   new Object[] { // 50
+                                                           new Object[] { // 500
+                                                                   null // 5000
+                                                           }
+                                                   }
+                                           },
+                                           ByteBufferUtil.bytes(6) // 6
+                                   };
+
+        SortedMap<ByteComparable, ByteBuffer> expected = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        expected.put(comparable("00"), ByteBufferUtil.bytes(1));
+        expected.put(comparable("01"), ByteBufferUtil.bytes(2));
+        expected.put(comparable("2"), ByteBufferUtil.bytes(3));
+        expected.put(comparable("30"), ByteBufferUtil.bytes(4));
+        expected.put(comparable("4"), ByteBufferUtil.bytes(5));
+        expected.put(comparable("6"), ByteBufferUtil.bytes(6));
+
+        Trie<ByteBuffer> trie = specifiedTrie(trieDef);
+        System.out.println(trie.dump());
+        assertSameContent(trie, expected);
+    }
+
+    static ByteComparable comparable(String s)
+    {
+        ByteBuffer b = ByteBufferUtil.bytes(s);
+        return ByteComparable.fixedLength(b);
+    }
+
+    @Test
+    public void testDirect()
+    {
+        ByteComparable[] src = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        MemtableTrie<ByteBuffer> trie = makeMemtableTrie(src, content, usePut());
+        int keysize = Arrays.stream(src)
+                            .mapToInt(src1 -> ByteComparable.length(src1, VERSION))
+                            .sum();
+        long ts = ObjectSizes.measureDeep(content);
+        long onh = ObjectSizes.measureDeep(trie.contentArray);
+        System.out.format("Trie size on heap %,d off heap %,d measured %,d keys %,d treemap %,d\n",
+                          trie.sizeOnHeap(), trie.sizeOffHeap(), onh, keysize, ts);
+        System.out.format("per entry on heap %.2f off heap %.2f measured %.2f keys %.2f treemap %.2f\n",
+                          trie.sizeOnHeap() * 1.0 / COUNT, trie.sizeOffHeap() * 1.0 / COUNT, onh * 1.0 / COUNT, keysize * 1.0 / COUNT, ts * 1.0 / COUNT);
+        // System.out.println("Trie " + trie.dump(ByteBufferUtil::bytesToHex).get());
+
+        assertSameContent(trie, content);
+        checkGet(trie, content);
+    }
+
+    @Test
+    public void testPrefixEvolution()
+    {
+        testEntries(new String[] { "testing",
+                                   "test",
+                                   "tests",
+                                   "tester",
+                                   "testers",
+                                   // test changing type with prefix
+                                   "types",
+                                   "types1",
+                                   "types",
+                                   "types2",
+                                   "types3",
+                                   "types4",
+                                   "types",
+                                   "types5",
+                                   "types6",
+                                   "types7",
+                                   "types8",
+                                   "types",
+                                   // test adding prefix to chain
+                                   "chain123",
+                                   "chain",
+                                   // test adding prefix to sparse
+                                   "sparse1",
+                                   "sparse2",
+                                   "sparse3",
+                                   "sparse",
+                                   // test adding prefix to split
+                                   "split1",
+                                   "split2",
+                                   "split3",
+                                   "split4",
+                                   "split5",
+                                   "split6",
+                                   "split7",
+                                   "split8",
+                                   "split"
+        });
+    }
+
+    @Test
+    public void testPrefixUnsafeMulti()
+    {
+        // Make sure prefixes inside a multi aren't overwritten by an embedded metadata node.
+
+        testEntries(new String[] { "test89012345678901234567890",
+                                   "test8",
+                                   "test89",
+                                   "test890",
+                                   "test8901",
+                                   "test89012",
+                                   "test890123",
+                                   "test8901234",
+                                   });
+    }
+
+    private void testEntries(String[] tests) // runs the insert/lookup check once per String -> ByteComparable mapping
+    {
+        for (Function<String, ByteComparable> mapping :
+                ImmutableList.<Function<String, ByteComparable>>of(ByteComparable::of,
+                                                                   s -> ByteComparable.fixedLength(s.getBytes(java.nio.charset.StandardCharsets.UTF_8))))
+        {
+            testEntries(tests, mapping); // explicit UTF-8 above keeps the key bytes independent of the platform charset
+        }
+    }
+
+    private void testEntriesHex(String[] tests)
+    {
+        testEntries(tests, s -> ByteComparable.fixedLength(ByteBufferUtil.hexToBytes(s)));
+        // Run the other translations just in case.
+        testEntries(tests);
+    }
+
+    private void testEntries(String[] tests, Function<String, ByteComparable> mapping)
+
+    {
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.OFF_HEAP);
+        for (String test : tests)
+        {
+            ByteComparable e = mapping.apply(test);
+            System.out.println("Adding " + asString(e) + ": " + test);
+            putSimpleResolve(trie, e, test, (x, y) -> y);
+            System.out.println("Trie\n" + trie.dump());
+        }
+
+        for (String test : tests)
+            assertEquals(test, trie.get(mapping.apply(test)));
+    }
+
+    static MemtableTrie<ByteBuffer> makeMemtableTrie(ByteComparable[] src,
+                                                     Map<ByteComparable, ByteBuffer> content,
+                                                     boolean usePut)
+
+    {
+        MemtableTrie<ByteBuffer> trie = new MemtableTrie<>(BufferType.OFF_HEAP);
+        addToMemtableTrie(src, content, trie, usePut);
+        return trie;
+    }
+
+    static void addToMemtableTrie(ByteComparable[] src,
+                                  Map<ByteComparable, ByteBuffer> content,
+                                  MemtableTrie<ByteBuffer> trie,
+                                  boolean usePut)
+
+    {
+        for (ByteComparable b : src)
+        {
+            // Note: Because we don't ensure order when calling resolve, just use a hash of the key as payload
+            // (so that all sources have the same value).
+            int payload = asString(b).hashCode();
+            ByteBuffer v = ByteBufferUtil.bytes(payload);
+            content.put(b, v);
+//             System.out.println("Adding " + asString(b) + ": " + ByteBufferUtil.bytesToHex(v));
+            putSimpleResolve(trie, b, v, (x, y) -> y, usePut);
+//             System.out.println(trie.dump(ByteBufferUtil::bytesToHex));
+        }
+    }
+
+    static void checkGet(MemtableTrie<ByteBuffer> trie, Map<ByteComparable, ByteBuffer> items)
+    {
+        for (Map.Entry<ByteComparable, ByteBuffer> en : items.entrySet())
+        {
+            assertEquals(en.getValue(), trie.get(en.getKey()));
+        }
+    }
+
+    static void assertSameContent(Trie<ByteBuffer> trie, SortedMap<ByteComparable, ByteBuffer> map)
+    {
+        assertMapEquals(trie, map);
+        assertValuesEqual(trie, map);
+        assertUnorderedValuesEqual(trie, map);
+    }
+
+    private static void assertValuesEqual(Trie<ByteBuffer> trie, SortedMap<ByteComparable, ByteBuffer> map)
+    {
+        assertIterablesEqual(trie.values(), map.values());
+    }
+
+    private static void assertUnorderedValuesEqual(Trie<ByteBuffer> trie, SortedMap<ByteComparable, ByteBuffer> map)
+    {
+        Multiset<ByteBuffer> unordered = HashMultiset.create();
+        StringBuilder errors = new StringBuilder();
+        for (ByteBuffer b : trie.valuesUnordered())
+            unordered.add(b);
+
+        for (ByteBuffer b : map.values())
+            if (!unordered.remove(b))
+                errors.append("\nMissing value in valuesUnordered: " + ByteBufferUtil.bytesToHex(b));
+
+        for (ByteBuffer b : unordered)
+            errors.append("\nExtra value in valuesUnordered: " + ByteBufferUtil.bytesToHex(b));
+
+        assertEquals("", errors.toString());
+    }
+
+    static void assertMapEquals(Trie<ByteBuffer> trie, SortedMap<ByteComparable, ByteBuffer> map)
+    {
+        assertMapEquals(trie.entrySet(), map.entrySet());
+    }
+
+    static void assertMapEquals(Iterable<Map.Entry<ByteComparable, ByteBuffer>> container1,
+                                Iterable<Map.Entry<ByteComparable, ByteBuffer>> container2)
+    {
+        Iterator<Map.Entry<ByteComparable, ByteBuffer>> it1 = container1.iterator();
+        Iterator<Map.Entry<ByteComparable, ByteBuffer>> it2 = container2.iterator();
+        List<ByteComparable> failedAt = new ArrayList<>();
+        StringBuilder b = new StringBuilder();
+        while (it1.hasNext() && it2.hasNext())
+        {
+            Map.Entry<ByteComparable, ByteBuffer> en1 = it1.next();
+            Map.Entry<ByteComparable, ByteBuffer> en2 = it2.next();
+            b.append(String.format("TreeSet %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue())));
+            b.append(String.format("Trie    %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue())));
+            if (ByteComparable.compare(en1.getKey(), en2.getKey(), VERSION) != 0 || ByteBufferUtil.compareUnsigned(en1.getValue(), en2.getValue()) != 0)
+                failedAt.add(en1.getKey());
+        }
+        while (it1.hasNext())
+        {
+            Map.Entry<ByteComparable, ByteBuffer> en1 = it1.next();
+            b.append(String.format("Trie    %s:%s\n", asString(en1.getKey()), ByteBufferUtil.bytesToHex(en1.getValue())));
+            failedAt.add(en1.getKey());
+        }
+        while (it2.hasNext())
+        {
+            Map.Entry<ByteComparable, ByteBuffer> en2 = it2.next();
+            b.append(String.format("TreeSet %s:%s\n", asString(en2.getKey()), ByteBufferUtil.bytesToHex(en2.getValue())));
+            failedAt.add(en2.getKey());
+        }
+        if (!failedAt.isEmpty())
+        {
+            String message = "Failed at " + Lists.transform(failedAt, MemtableTrieTestBase::asString);
+            System.err.println(message);
+            System.err.println(b);
+            Assert.fail(message);
+        }
+    }
+
+    static <E extends Comparable<E>> void assertIterablesEqual(Iterable<E> expectedIterable, Iterable<E> actualIterable)
+    {
+        Iterator<E> expected = expectedIterable.iterator();
+        Iterator<E> actual = actualIterable.iterator();
+        // Compare element by element for as long as both sides still have values.
+        while (actual.hasNext() && expected.hasNext())
+            Assert.assertEquals(expected.next(), actual.next()); // JUnit argument order is (expected, actual)
+        // Whichever side still has values left makes the two iterables differ in length.
+        if (expected.hasNext())
+            Assert.fail("Remaining values in expected, starting with " + expected.next());
+        else if (actual.hasNext())
+            Assert.fail("Remaining values in actual, starting with " + actual.next());
+    }
+
+    static ByteComparable[] generateKeys(Random rand, int count)
+    {
+        ByteComparable[] sources = new ByteComparable[count];
+        TreeSet<ByteComparable> added = new TreeSet<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        for (int i = 0; i < count; ++i)
+        {
+            sources[i] = generateKey(rand);
+            if (!added.add(sources[i]))
+                --i;
+        }
+
+        // note: not sorted!
+        return sources;
+    }
+
+    static ByteComparable generateKey(Random rand)
+    {
+        return generateKey(rand, MIN_LENGTH, MAX_LENGTH);
+    }
+
+    static ByteComparable generateKey(Random rand, int minLength, int maxLength)
+    {
+        int len = rand.nextInt(maxLength - minLength + 1) + minLength;
+        byte[] bytes = new byte[len];
+        int p = 0;
+        int length = bytes.length;
+        while (p < length)
+        {
+            int seed = rand.nextInt(KEY_CHOICE);
+            Random r2 = new Random(seed);
+            int m = r2.nextInt(5) + 2 + p;
+            if (m > length)
+                m = length;
+            while (p < m)
+                bytes[p++] = (byte) r2.nextInt(256);
+        }
+        return ByteComparable.fixedLength(bytes);
+    }
+
+    static String asString(ByteComparable bc)
+    {
+        return bc != null ? bc.byteComparableAsString(VERSION) : "null";
+    }
+
+    <T, M> void putSimpleResolve(MemtableTrie<T> trie,
+                                 ByteComparable key,
+                                 T value,
+                                 Trie.MergeResolver<T> resolver)
+    {
+        putSimpleResolve(trie, key, value, resolver, usePut());
+    }
+
+    static <T, M> void putSimpleResolve(MemtableTrie<T> trie,
+                                        ByteComparable key,
+                                        T value,
+                                        Trie.MergeResolver<T> resolver,
+                                        boolean usePut)
+    {
+        try
+        {
+            trie.putSingleton(key,
+                              value,
+                              (existing, update) -> existing != null ? resolver.resolve(existing, update) : update,
+                              usePut);
+        }
+        catch (MemtableTrie.SpaceExhaustedException e)
+        {
+            throw Throwables.propagate(e);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieThreadedTest.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieThreadedTest.java
new file mode 100644
index 000000000000..68b6051792f3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieThreadedTest.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.VERSION;
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.generateKeys;
+
+public class MemtableTrieThreadedTest
+{
+    private static final int COUNT = 300000;
+    private static final int OTHERS = COUNT / 10;
+    private static final int PROGRESS_UPDATE = COUNT / 15;
+    private static final int READERS = 8;
+    private static final int WALKERS = 2;
+    Random rand = new Random();
+
+    static String value(ByteComparable b)
+    {
+        return b.byteComparableAsString(VERSION);
+    }
+
+    @Test
+    public void testThreaded() throws InterruptedException
+    {
+        ByteComparable[] src = generateKeys(rand, COUNT + OTHERS);
+        MemtableTrie<String> trie = new MemtableTrie<>(BufferType.ON_HEAP);
+        ConcurrentLinkedQueue<Throwable> errors = new ConcurrentLinkedQueue<>();
+        List<Thread> threads = new ArrayList<Thread>();
+        AtomicBoolean writeCompleted = new AtomicBoolean(false);
+        AtomicInteger writeProgress = new AtomicInteger(0);
+
+        for (int i = 0; i < WALKERS; ++i)
+            threads.add(new Thread()
+            {
+                public void run()
+                {
+                    try
+                    {
+                        Random r = ThreadLocalRandom.current();
+                        while (!writeCompleted.get())
+                        {
+                            int min = writeProgress.get();
+                            int count = 0;
+                            for (Map.Entry<ByteComparable, String> en : trie.entrySet())
+                            {
+                                String v = value(en.getKey());
+                                Assert.assertEquals(en.getKey()
+                                                      .byteComparableAsString(
+                                                      VERSION), v, en.getValue());
+                                ++count;
+                            }
+                            Assert.assertTrue("Got only " + count + " while progress is at " + min, count >= min);
+                        }
+                    }
+                    catch (Throwable t)
+                    {
+                        t.printStackTrace();
+                        errors.add(t);
+                    }
+                }
+            });
+
+        for (int i = 0; i < READERS; ++i)
+        {
+            threads.add(new Thread()
+            {
+                public void run()
+                {
+                    try
+                    {
+                        Random r = ThreadLocalRandom.current();
+                        while (!writeCompleted.get())
+                        {
+                            int min = writeProgress.get();
+
+                            for (int i = 0; i < PROGRESS_UPDATE; ++i)
+                            {
+                                int index = r.nextInt(COUNT + OTHERS);
+                                ByteComparable b = src[index];
+                                String v = value(b);
+                                String result = trie.get(b);
+                                if (result != null)
+                                {
+                                    Assert.assertTrue("Got not added " + index + " when COUNT is " + COUNT,
+                                                      index < COUNT);
+                                    Assert.assertEquals("Failed " + index, v, result);
+                                }
+                                else if (index < min)
+                                    Assert.fail("Failed index " + index + " while progress is at " + min);
+                            }
+                        }
+                    }
+                    catch (Throwable t)
+                    {
+                        t.printStackTrace();
+                        errors.add(t);
+                    }
+                }
+            });
+        }
+
+        threads.add(new Thread()
+        {
+            public void run()
+            {
+                try
+                {
+                    for (int i = 0; i < COUNT; i++)
+                    {
+                        ByteComparable b = src[i];
+
+                        // Note: Because we don't ensure order when calling resolve, just use the string form of the key
+                        // as payload (so that all sources have the same value).
+                        String v = value(b);
+                        if (i % 2 == 0)
+                            trie.apply(Trie.singleton(b, v), (x, y) -> y);
+                        else
+                            trie.putRecursive(b, v, (x, y) -> y);
+
+                        if (i % PROGRESS_UPDATE == 0)
+                            writeProgress.set(i);
+                    }
+                }
+                catch (Throwable t)
+                {
+                    t.printStackTrace();
+                    errors.add(t);
+                }
+                finally
+                {
+                    writeCompleted.set(true);
+                }
+            }
+        });
+
+        for (Thread t : threads)
+            t.start();
+
+        for (Thread t : threads)
+            t.join();
+
+        if (!errors.isEmpty())
+            Assert.fail("Got errors:\n" + errors);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java
new file mode 100644
index 000000000000..13ecca7b8051
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/MergeTrieTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.junit.Test;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.*;
+
+public class MergeTrieTest
+{
+    private static final int COUNT = 15000;
+    Random rand = new Random();
+
+    @Test
+    public void testDirect()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        content1.putAll(content2);
+        Trie<ByteBuffer> union = trie1.mergeWith(trie2, (x, y) -> x);
+
+        assertSameContent(union, content1);
+    }
+
+    @Test
+    public void testWithDuplicates()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        addToMemtableTrie(generateKeys(new Random(5), COUNT), content1, trie1, true); // same seed on both sides:
+        addToMemtableTrie(generateKeys(new Random(5), COUNT), content2, trie2, true); // guarantees overlapping keys
+
+        content1.putAll(content2); // entries from content2 win, matching the (x, y) -> y resolver below
+        Trie<ByteBuffer> union = trie1.mergeWith(trie2, (x, y) -> y);
+
+        assertSameContent(union, content1);
+    }
+
+    @Test
+    public void testDistinct()
+    {
+        ByteComparable[] src1 = generateKeys(rand, COUNT);
+        SortedMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+
+        ByteComparable[] src2 = generateKeys(rand, COUNT);
+        src2 = removeDuplicates(src2, content1);
+        SortedMap<ByteComparable, ByteBuffer> content2 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, VERSION));
+        MemtableTrie<ByteBuffer> trie2 = makeMemtableTrie(src2, content2, true);
+
+        content1.putAll(content2);
+        Trie<ByteBuffer> union = new MergeTrie.Distinct<>(trie1, trie2);
+
+        assertSameContent(union, content1);
+    }
+
+    static ByteComparable[] removeDuplicates(ByteComparable[] keys, SortedMap<ByteComparable, ByteBuffer> content1)
+    {
+        return Arrays.stream(keys)
+                     .filter(key -> !content1.containsKey(key))
+                     .toArray(ByteComparable[]::new);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
new file mode 100644
index 000000000000..9661361fc76d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.NavigableMap;
+import java.util.Random;
+import java.util.SortedMap;
+import java.util.TreeMap;
+
+import org.junit.Test;
+
+import com.googlecode.concurrenttrees.common.Iterables;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.asString;
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.assertMapEquals;
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.assertSameContent;
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.generateKeys;
+import static org.apache.cassandra.db.tries.MemtableTrieTestBase.makeMemtableTrie;
+import static java.util.Arrays.asList;
+import static org.junit.Assert.assertEquals;
+
+public class SetIntersectionTrieTest
+{
+    private static final int COUNT = 15000;
+    Random rand = new Random();
+
+    @Test
+    public void testIntersectRangeDirect()
+    {
+        testIntersectRange(COUNT);
+    }
+
+    public void testIntersectRange(int count)
+    {
+        ByteComparable[] src1 = generateKeys(rand, count);
+        NavigableMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>((bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, Trie.BYTE_COMPARABLE_VERSION));
+
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
+
+        checkEqualRange(content1, trie1, null, true, null, true);
+        checkEqualRange(content1, trie1, MemtableTrieTestBase.generateKey(rand), true, null, true);
+        checkEqualRange(content1, trie1, null, true, MemtableTrieTestBase.generateKey(rand), true);
+        for (int i = 0; i < 4; ++i)
+        {
+            ByteComparable l = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
+            ByteComparable r = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
+            int cmp = ByteComparable.compare(l, r, Trie.BYTE_COMPARABLE_VERSION);
+            if (cmp > 0)
+            {
+                ByteComparable t = l;l = r;r = t; // swap
+            }
+
+            boolean includeLeft = (i & 1) != 0;
+            boolean includeRight = (i & 2) != 0;
+            if (!includeLeft && !includeRight && cmp == 0)
+                includeRight = true;
+            checkEqualRange(content1, trie1, l, includeLeft, r, includeRight);
+        }
+    }
+
+    public void checkEqualRange(NavigableMap<ByteComparable, ByteBuffer> content1,
+                                Trie<ByteBuffer> t1,
+                                ByteComparable l,
+                                boolean includeLeft,
+                                ByteComparable r,
+                                boolean includeRight)
+    {
+        System.out.format("Intersection with %s%s:%s%s\n", includeLeft ? "[" : "(", asString(l), asString(r), includeRight ? "]" : ")");
+        SortedMap<ByteComparable, ByteBuffer> imap = l == null
+                                                     ? r == null
+                                                       ? content1
+                                                       : content1.headMap(r, includeRight)
+                                                     : r == null
+                                                       ? content1.tailMap(l, includeLeft)
+                                                       : content1.subMap(l, includeLeft, r, includeRight);
+        Trie<ByteBuffer> intersection = t1.subtrie(l, includeLeft, r, includeRight);
+
+        assertSameContent(intersection, imap);
+    }
+
+    /**
+     * Extract the values of the provided trie into a list.
+     */
+    private static <T> List<T> toList(Trie<T> trie)
+    {
+        return Iterables.toList(trie.values());
+    }
+
+    /**
+     * Creates a simple trie with a root having the provided number of children, where each child is a leaf whose content
+     * is simply the value of the transition leading to it.
+     *
+     * In other words, {@code singleLevelIntTrie(4)} creates the following trie:
+     *       Root
+     * t= 0  1  2  3
+     *    |  |  |  |
+     *    0  1  2  3
+     */
+    private static Trie<Integer> singleLevelIntTrie(int childs)
+    {
+        return new Trie<Integer>()
+        {
+            protected <L> Node<Integer, L> root()
+            {
+                return new RootNode<>();
+            }
+
+            /** Root node of the trie: has {@code childs} transition, each leading to a {@link LeafNode} whose content
+             * is the value of the transition. */
+            class RootNode<L> extends Node<Integer, L>
+            {
+                RootNode()
+                {
+                    super(null);
+                    currentTransition = 0;
+                }
+
+                public Remaining startIteration()
+                {
+                    currentTransition = 0;
+                    return childs == 0 ? null : (childs == 1 ? Remaining.ONE : Remaining.MULTIPLE);
+                }
+
+                public Remaining advanceIteration()
+                {
+                    return ++currentTransition >= childs ? null : Remaining.MULTIPLE;
+                }
+
+                public Node<Integer, L> getCurrentChild(L parent)
+                {
+                    return new LeafNode<>(parent, currentTransition);
+                }
+
+                public Integer content()
+                {
+                    return null;
+                }
+
+                @Override
+                public String toString()
+                {
+                    return String.format("ROOT(t=%s, parent=%s)", currentTransition, parentLink);
+                }
+            }
+
+            /** Leaf nodes: no children but a content corresponding to the transition leading to them */
+            class LeafNode<L> extends NoChildrenNode<Integer, L>
+            {
+                private final int value;
+
+                LeafNode(L parent, int value)
+                {
+                    super(parent);
+                    this.value = value;
+                }
+
+                public Integer content()
+                {
+                    return value;
+                }
+
+                @Override
+                public String toString()
+                {
+                    return String.format("LEAF(%d, parent=%s)", value, parentLink);
+                }
+            }
+        };
+
+    }
+
+    /** Creates a single byte {@link ByteComparable} with the provided value */
+    private static ByteComparable of(int value)
+    {
+        assert value >= 0 && value <= Byte.MAX_VALUE;
+        return ByteComparable.fixedLength(new byte[]{ (byte)value });
+    }
+
+    @Test
+    public void testSimpleIntersection()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), true, of(7), true);
+        // Currently returns [4, 5, 6, 7, 8, 9], which "looks" wrong.
+        assertEquals(asList(3, 4, 5, 6, 7), toList(intersection));
+    }
+}

From 2f8fbf1b95b6d79c732a73d1351479778ad47346 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Thu, 26 Nov 2020 10:11:29 +0100
Subject: [PATCH 034/151] STAR-1: Read path optimizations

Add MetadataCollectorBench

Extract AbstractType serialization code into separate serializer class
and refactor SerializationHeader to use that

Add a new SSTable version which supports partition deletion presence marker
and improved min/max

Replaced `min/maxClusteringValues` with `coveredClustering` of `Slice` type
Added `clusteringTypes` and `hasPartitionLevelDeletions` fields

Other refactorings are related to the required code changes due to those
new fields in `StatsMetadata`. No semantic changes were made in this commit
except that slices intersection will never return `true` if any of the
slices is reversed (start > end).

Apply read path optimizations when static columns are queried

Extend the usage of lower bound optimizations

(cherry picked from commit 1544bb6f4fb2d14e254c607265e5f76e9647d0bf)
(cherry picked from commit 5541d1acae04fe86d8bf5836f5355aeba6fe36e5)
---
 .../org/apache/cassandra/db/Clustering.java   |  12 ++
 .../apache/cassandra/db/ClusteringBound.java  |  42 +++-
 .../cassandra/db/ClusteringBoundary.java      |  12 ++
 .../cassandra/db/ClusteringComparator.java    |   9 +-
 .../apache/cassandra/db/ClusteringPrefix.java |  22 ++-
 .../cassandra/db/SerializationHeader.java     |  57 ++----
 .../db/SinglePartitionReadCommand.java        | 147 +++++++-------
 src/java/org/apache/cassandra/db/Slice.java   |  35 ++--
 src/java/org/apache/cassandra/db/Slices.java  |  39 ++--
 .../db/filter/ClusteringIndexFilter.java      |  10 +-
 .../db/filter/ClusteringIndexNamesFilter.java |  11 +-
 .../db/filter/ClusteringIndexSliceFilter.java |  13 +-
 .../db/marshal/ByteArrayObjectFactory.java    |  26 ++-
 .../db/marshal/ByteBufferObjectFactory.java   |  26 ++-
 .../PartitionStatisticsCollector.java         |   1 +
 .../db/rows/ArtificialBoundMarker.java        |  59 ++++++
 .../cassandra/db/rows/EncodingStats.java      |   6 +
 .../UnfilteredRowIteratorWithLowerBound.java  | 125 +++++++-----
 .../cassandra/io/sstable/format/Version.java  |  10 +
 .../io/sstable/format/big/BigFormat.java      |  16 ++
 .../io/sstable/format/big/BigTableWriter.java |   4 +-
 .../sstable/metadata/MetadataCollector.java   | 117 ++++++++---
 .../io/sstable/metadata/StatsMetadata.java    | 168 +++++++++++-----
 .../serializers/AbstractTypeSerializer.java   |  75 +++++++
 .../tools/SSTableMetadataViewer.java          |  18 +-
 .../utils/bytecomparable/ByteSource.java      |   4 +
 .../microbench/MetadataCollectorBench.java    | 155 +++++++++++++++
 .../miscellaneous/SSTablesIteratedTest.java   |  34 +++-
 .../db/SinglePartitionSliceCommandTest.java   | 184 +++++++++++++++++-
 .../db/compaction/CompactionsTest.java        |  10 +-
 .../apache/cassandra/db/filter/SliceTest.java | 130 +++++++------
 .../apache/cassandra/db/rows/RowsTest.java    |   6 +
 .../io/sstable/SSTableMetadataTest.java       |  16 +-
 .../ByteSourceComparisonTest.java             |   5 +-
 .../ByteSourceConversionTest.java             |   3 +-
 35 files changed, 1163 insertions(+), 444 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/db/rows/ArtificialBoundMarker.java
 create mode 100644 src/java/org/apache/cassandra/serializers/AbstractTypeSerializer.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/MetadataCollectorBench.java

diff --git a/src/java/org/apache/cassandra/db/Clustering.java b/src/java/org/apache/cassandra/db/Clustering.java
index c6856382482b..09cc48d4d320 100644
--- a/src/java/org/apache/cassandra/db/Clustering.java
+++ b/src/java/org/apache/cassandra/db/Clustering.java
@@ -55,6 +55,18 @@ public default Clustering<?> copy(AbstractAllocator allocator)
         return new BufferClustering(newValues);
     }
 
+    @Override
+    default ClusteringBound<V> asStartBound()
+    {
+        return ClusteringBound.inclusiveStartOf(this);
+    }
+
+    @Override
+    default ClusteringBound<V> asEndBound()
+    {
+        return ClusteringBound.inclusiveEndOf(this);
+    }
+
     public default String toString(TableMetadata metadata)
     {
         StringBuilder sb = new StringBuilder();
diff --git a/src/java/org/apache/cassandra/db/ClusteringBound.java b/src/java/org/apache/cassandra/db/ClusteringBound.java
index 364856f21229..0eb6a920cdf3 100644
--- a/src/java/org/apache/cassandra/db/ClusteringBound.java
+++ b/src/java/org/apache/cassandra/db/ClusteringBound.java
@@ -26,15 +26,22 @@
 import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.utils.memory.AbstractAllocator;
 
+import static org.apache.cassandra.db.AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY;
+
 /**
  * The start or end of a range of clusterings, either inclusive or exclusive.
  */
 public interface ClusteringBound<V> extends ClusteringBoundOrBoundary<V>
 {
     /** The smallest start bound, i.e. the one that starts before any row. */
-    public static final ClusteringBound<?> BOTTOM = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, BufferClusteringBound.EMPTY_VALUES_ARRAY);
+    public static final ClusteringBound<?> BOTTOM = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, EMPTY_VALUES_ARRAY);
     /** The biggest end bound, i.e. the one that ends after any row. */
-    public static final ClusteringBound<?> TOP = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, BufferClusteringBound.EMPTY_VALUES_ARRAY);
+    public static final ClusteringBound<?> TOP = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, EMPTY_VALUES_ARRAY);
+
+    /** The biggest start bound, i.e. the one that starts after any row. */
+    public static final ClusteringBound<?> MAX_START = new BufferClusteringBound(Kind.EXCL_START_BOUND, EMPTY_VALUES_ARRAY);
+    /** The smallest end bound, i.e. the one that ends before any row. */
+    public static final ClusteringBound<?> MIN_END = new BufferClusteringBound(Kind.EXCL_END_BOUND, EMPTY_VALUES_ARRAY);
 
     public static ClusteringPrefix.Kind boundKind(boolean isStart, boolean isInclusive)
     {
@@ -69,6 +76,21 @@ default boolean isExclusive()
         return kind() == Kind.EXCL_START_BOUND || kind() == Kind.EXCL_END_BOUND;
     }
 
+    default boolean isArtificial()
+    {
+        return kind() == Kind.SSTABLE_LOWER_BOUND || kind() == Kind.SSTABLE_UPPER_BOUND;
+    }
+
+    default ClusteringBound<V> artificialLowerBound()
+    {
+        return create(Kind.SSTABLE_LOWER_BOUND, this);
+    }
+
+    default ClusteringBound<V> artificialUpperBound()
+    {
+        return create(Kind.SSTABLE_UPPER_BOUND, this);
+    }
+
     // For use by intersects, it's called with the sstable bound opposite to the slice bound
     // (so if the slice bound is a start, it's call with the max sstable bound)
     default int compareTo(ClusteringComparator comparator, List<ByteBuffer> sstableBound)
@@ -102,12 +124,12 @@ static <V> ClusteringBound<V> create(ClusteringPrefix.Kind kind, ClusteringPrefi
         return from.accessor().factory().bound(kind, from.getRawValues());
     }
 
-    public static ClusteringBound<?> inclusiveStartOf(ClusteringPrefix<?> from)
+    public static <V> ClusteringBound<V> inclusiveStartOf(ClusteringPrefix<V> from)
     {
         return create(ClusteringPrefix.Kind.INCL_START_BOUND, from);
     }
 
-    public static ClusteringBound<?> inclusiveEndOf(ClusteringPrefix<?> from)
+    public static <V> ClusteringBound<V> inclusiveEndOf(ClusteringPrefix<V> from)
     {
         return create(ClusteringPrefix.Kind.INCL_END_BOUND, from);
     }
@@ -134,4 +156,16 @@ public static ClusteringBound<?> create(ClusteringComparator comparator, boolean
         }
         return builder.buildBound(isStart, isInclusive);
     }
+
+    @Override
+    default ClusteringBound<V> asStartBound()
+    {
+        return this;
+    }
+
+    @Override
+    default ClusteringBound<V> asEndBound()
+    {
+        return this;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/ClusteringBoundary.java b/src/java/org/apache/cassandra/db/ClusteringBoundary.java
index 3e50f5264ccf..df35a5d1bb14 100644
--- a/src/java/org/apache/cassandra/db/ClusteringBoundary.java
+++ b/src/java/org/apache/cassandra/db/ClusteringBoundary.java
@@ -37,4 +37,16 @@ public static <V> ClusteringBoundary<V> create(ClusteringBound.Kind kind, Cluste
     {
         return from.accessor().factory().boundary(kind, from.getRawValues());
     }
+
+    @Override
+    default ClusteringBound<V> asStartBound()
+    {
+        return openBound(false);
+    }
+
+    @Override
+    default ClusteringBound<V> asEndBound()
+    {
+        return closeBound(false);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java
index 6d67e6bc9590..81cf57ebfe9b 100644
--- a/src/java/org/apache/cassandra/db/ClusteringComparator.java
+++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java
@@ -248,9 +248,9 @@ public <T> void validate(ClusteringPrefix<T> clustering)
      * and
      *   asByteComparable(x) is not a prefix of asByteComparable(y)
      */
-    public ByteComparable asByteComparable(ClusteringPrefix clustering)
+    public <V> ByteComparable asByteComparable(ClusteringPrefix<V> clustering)
     {
-        return new ByteComparableClustering(clustering);
+        return new ByteComparableClustering<>(clustering);
     }
 
     /**
@@ -429,6 +429,11 @@ public <V> ClusteringBound<V> boundFromByteComparable(ValueAccessor<V> accessor,
                 return accessor.factory().bound(isEnd ? ClusteringPrefix.Kind.INCL_END_BOUND
                                                       : ClusteringPrefix.Kind.EXCL_START_BOUND,
                                                 Arrays.copyOf(components, cc));
+
+            case ByteSource.LTLT_NEXT_COMPONENT:
+            case ByteSource.GTGT_NEXT_COMPONENT:
+                throw new AssertionError("Unexpected sstable lower/upper bound - byte comparable representation of artificial sstable bounds is not supported");
+
             default:
                 throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in ClusteringBound encoding");
             }
diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
index c7a2782ecef3..0a22306b6bff 100644
--- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
@@ -65,18 +65,20 @@ public enum Kind
     {
         // WARNING: the ordering of that enum matters because we use ordinal() in the serialization
 
-        EXCL_END_BOUND              (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
-        INCL_START_BOUND            (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
-        EXCL_END_INCL_START_BOUNDARY(0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
-        STATIC_CLUSTERING           (1, -1, v -> v == Version.LEGACY
+        EXCL_END_BOUND              ( 0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        INCL_START_BOUND            ( 0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        EXCL_END_INCL_START_BOUNDARY( 0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        STATIC_CLUSTERING           ( 1, -1, v -> v == Version.LEGACY
                                                  ? ByteSource.LT_NEXT_COMPONENT + 1
                                                  : ByteSource.EXCLUDED),
-        CLUSTERING                  (2,  0, v -> v == Version.LEGACY
+        CLUSTERING                  ( 2,  0, v -> v == Version.LEGACY
                                                  ? ByteSource.NEXT_COMPONENT
                                                  : ByteSource.TERMINATOR),
-        INCL_END_EXCL_START_BOUNDARY(3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
-        INCL_END_BOUND              (3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
-        EXCL_START_BOUND            (3,  1, v -> ByteSource.GT_NEXT_COMPONENT);
+        INCL_END_EXCL_START_BOUNDARY( 3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        INCL_END_BOUND              ( 3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        EXCL_START_BOUND            ( 3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        SSTABLE_LOWER_BOUND         (-1, -1, v -> ByteSource.LTLT_NEXT_COMPONENT),
+        SSTABLE_UPPER_BOUND         ( 4,  1, v -> ByteSource.GTGT_NEXT_COMPONENT);
 
 
         private final int comparison;
@@ -313,6 +315,10 @@ default int dataSize()
      */
     public String toString(TableMetadata metadata);
 
+    public ClusteringBound<V> asStartBound();
+
+    public ClusteringBound<V> asEndBound();
+
     /*
      * TODO: we should stop using Clustering for partition keys. Maybe we can add
      * a few methods to DecoratedKey so we don't have to (note that while using a Clustering
diff --git a/src/java/org/apache/cassandra/db/SerializationHeader.java b/src/java/org/apache/cassandra/db/SerializationHeader.java
index dca52c0a9d74..5403c7ee1023 100644
--- a/src/java/org/apache/cassandra/db/SerializationHeader.java
+++ b/src/java/org/apache/cassandra/db/SerializationHeader.java
@@ -25,7 +25,6 @@
 
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.TypeParser;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.UnknownColumnException;
@@ -38,6 +37,7 @@
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.serializers.AbstractTypeSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 public class SerializationHeader
@@ -398,6 +398,8 @@ public EncodingStats getEncodingStats()
 
     public static class Serializer implements IMetadataComponentSerializer<Component>
     {
+        private final AbstractTypeSerializer typeSerializer = new AbstractTypeSerializer();
+
         public void serializeForMessaging(SerializationHeader header, ColumnFilter selection, DataOutputPlus out, boolean hasStatic) throws IOException
         {
             EncodingStats.serializer.serialize(header.stats, out);
@@ -462,10 +464,8 @@ public void serialize(Version version, Component header, DataOutputPlus out) thr
         {
             EncodingStats.serializer.serialize(header.stats, out);
 
-            writeType(header.keyType, out);
-            out.writeUnsignedVInt(header.clusteringTypes.size());
-            for (AbstractType<?> type : header.clusteringTypes)
-                writeType(type, out);
+            typeSerializer.serialize(header.keyType, out);
+            typeSerializer.serializeList(header.clusteringTypes, out);
 
             writeColumnsWithTypes(header.staticColumns, out);
             writeColumnsWithTypes(header.regularColumns, out);
@@ -476,17 +476,11 @@ public Component deserialize(Version version, DataInputPlus in) throws IOExcepti
         {
             EncodingStats stats = EncodingStats.serializer.deserialize(in);
 
-            AbstractType<?> keyType = readType(in);
-            int size = (int)in.readUnsignedVInt();
-            List<AbstractType<?>> clusteringTypes = new ArrayList<>(size);
-            for (int i = 0; i < size; i++)
-                clusteringTypes.add(readType(in));
-
-            Map<ByteBuffer, AbstractType<?>> staticColumns = new LinkedHashMap<>();
-            Map<ByteBuffer, AbstractType<?>> regularColumns = new LinkedHashMap<>();
+            AbstractType<?> keyType = typeSerializer.deserialize(in);
+            List<AbstractType<?>> clusteringTypes = typeSerializer.deserializeList(in);
 
-            readColumnsWithType(in, staticColumns);
-            readColumnsWithType(in, regularColumns);
+            Map<ByteBuffer, AbstractType<?>> staticColumns = readColumnsWithType(in);
+            Map<ByteBuffer, AbstractType<?>> regularColumns = readColumnsWithType(in);
 
             return new Component(keyType, clusteringTypes, staticColumns, regularColumns, stats);
         }
@@ -496,10 +490,10 @@ public int serializedSize(Version version, Component header)
         {
             int size = EncodingStats.serializer.serializedSize(header.stats);
 
-            size += sizeofType(header.keyType);
+            size += typeSerializer.serializedSize(header.keyType);
             size += TypeSizes.sizeofUnsignedVInt(header.clusteringTypes.size());
             for (AbstractType<?> type : header.clusteringTypes)
-                size += sizeofType(type);
+                size += typeSerializer.serializedSize(type);
 
             size += sizeofColumnsWithTypes(header.staticColumns);
             size += sizeofColumnsWithTypes(header.regularColumns);
@@ -512,7 +506,7 @@ private void writeColumnsWithTypes(Map<ByteBuffer, AbstractType<?>> columns, Dat
             for (Map.Entry<ByteBuffer, AbstractType<?>> entry : columns.entrySet())
             {
                 ByteBufferUtil.writeWithVIntLength(entry.getKey(), out);
-                writeType(entry.getValue(), out);
+                typeSerializer.serialize(entry.getValue(), out);
             }
         }
 
@@ -522,36 +516,21 @@ private long sizeofColumnsWithTypes(Map<ByteBuffer, AbstractType<?>> columns)
             for (Map.Entry<ByteBuffer, AbstractType<?>> entry : columns.entrySet())
             {
                 size += ByteBufferUtil.serializedSizeWithVIntLength(entry.getKey());
-                size += sizeofType(entry.getValue());
+                size += typeSerializer.serializedSize(entry.getValue());
             }
             return size;
         }
 
-        private void readColumnsWithType(DataInputPlus in, Map<ByteBuffer, AbstractType<?>> typeMap) throws IOException
+        private Map<ByteBuffer, AbstractType<?>> readColumnsWithType(DataInputPlus in) throws IOException
         {
-            int length = (int)in.readUnsignedVInt();
+            int length = (int) in.readUnsignedVInt();
+            Map<ByteBuffer, AbstractType<?>> typeMap = new LinkedHashMap<>(length);
             for (int i = 0; i < length; i++)
             {
                 ByteBuffer name = ByteBufferUtil.readWithVIntLength(in);
-                typeMap.put(name, readType(in));
+                typeMap.put(name, typeSerializer.deserialize(in));
             }
-        }
-
-        private void writeType(AbstractType<?> type, DataOutputPlus out) throws IOException
-        {
-            // TODO: we should have a terser serializaion format. Not a big deal though
-            ByteBufferUtil.writeWithVIntLength(UTF8Type.instance.decompose(type.toString()), out);
-        }
-
-        private AbstractType<?> readType(DataInputPlus in) throws IOException
-        {
-            ByteBuffer raw = ByteBufferUtil.readWithVIntLength(in);
-            return TypeParser.parse(UTF8Type.instance.compose(raw));
-        }
-
-        private int sizeofType(AbstractType<?> type)
-        {
-            return ByteBufferUtil.serializedSizeWithVIntLength(UTF8Type.instance.decompose(type.toString()));
+            return typeMap;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index df52130e8a25..5d52052c5f6b 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -650,44 +650,43 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs
                     break;
                 }
 
-                if (shouldInclude(sstable))
-                {
-                    if (!sstable.isRepaired())
-                        oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
+                boolean intersects = intersects(sstable);
+                boolean hasRequiredStatics = hasRequiredStatics(sstable);
+                boolean hasPartitionLevelDeletions = hasPartitionLevelDeletions(sstable);
 
-                    // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
-                    @SuppressWarnings("resource")
-                    UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, metricsCollector);
-                    inputCollector.addSSTableIterator(sstable, iter);
-                    mostRecentPartitionTombstone = Math.max(mostRecentPartitionTombstone,
-                                                            iter.partitionLevelDeletion().markedForDeleteAt());
+                if (!intersects && !hasRequiredStatics && !hasPartitionLevelDeletions)
+                {
+                    continue;
                 }
-                else
+
+                @SuppressWarnings("resource")
+                UnfilteredRowIterator iter = intersects
+                                             ? makeIterator(cfs, sstable, metricsCollector)
+                                             : makeIteratorWithSkippedNonStaticContent(cfs, sstable, metricsCollector);
+                if (!intersects)
                 {
                     nonIntersectingSSTables++;
-                    // sstable contains no tombstone if maxLocalDeletionTime == Integer.MAX_VALUE, so we can safely skip those entirely
-                    if (sstable.mayHaveTombstones())
-                    {
-                        // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
-                        @SuppressWarnings("resource")
-                        UnfilteredRowIteratorWithLowerBound iter = makeIterator(cfs, sstable, metricsCollector);
-                        // if the sstable contains a partition delete, then we must include it regardless of whether it
-                        // shadows any other data seen locally as we can't guarantee that other replicas have seen it
+
+                    if (!hasRequiredStatics) { // => has partition level deletions
                         if (!iter.partitionLevelDeletion().isLive())
                         {
-                            if (!sstable.isRepaired())
-                                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
-                            inputCollector.addSSTableIterator(sstable, iter);
                             includedDueToTombstones++;
-                            mostRecentPartitionTombstone = Math.max(mostRecentPartitionTombstone,
-                                                                    iter.partitionLevelDeletion().markedForDeleteAt());
                         }
                         else
                         {
                             iter.close();
+                            continue;
                         }
                     }
                 }
+
+                if (!sstable.isRepaired())
+                    oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, sstable.getMinLocalDeletionTime());
+
+                inputCollector.addSSTableIterator(sstable, iter);
+                if (hasPartitionLevelDeletions)
+                    mostRecentPartitionTombstone = Math.max(mostRecentPartitionTombstone,
+                                                            iter.partitionLevelDeletion().markedForDeleteAt());
             }
 
             if (Tracing.isTracing())
@@ -715,15 +714,20 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs
         }
     }
 
-    private boolean shouldInclude(SSTableReader sstable)
+    private boolean intersects(SSTableReader sstable)
     {
+        return clusteringIndexFilter().intersects(sstable.metadata().comparator, sstable.getSSTableMetadata().coveredClustering);
+    }
+
+    private boolean hasRequiredStatics(SSTableReader sstable) {
         // If some static columns are queried, we should always include the sstable: the clustering values stats of the sstable
         // don't tell us if the sstable contains static values in particular.
-        // TODO: we could record if a sstable contains any static value at all.
-        if (!columnFilter().fetchedColumns().statics.isEmpty())
-            return true;
+        return !columnFilter().fetchedColumns().statics.isEmpty() && sstable.header.hasStatic();
+    }
 
-        return clusteringIndexFilter().shouldInclude(sstable);
+    private boolean hasPartitionLevelDeletions(SSTableReader sstable)
+    {
+        return sstable.getSSTableMetadata().hasPartitionLevelDeletions;
     }
 
     private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs,
@@ -739,6 +743,19 @@ private UnfilteredRowIteratorWithLowerBound makeIterator(ColumnFamilyStore cfs,
 
     }
 
+    private UnfilteredRowIterator makeIteratorWithSkippedNonStaticContent(ColumnFamilyStore cfs,
+                                                                          SSTableReader sstable,
+                                                                          SSTableReadsListener listener)
+    {
+        return StorageHook.instance.makeRowIterator(cfs,
+                                                    sstable,
+                                                    partitionKey(),
+                                                    Slices.NONE,
+                                                    columnFilter(),
+                                                    clusteringIndexFilter().isReversed(),
+                                                    listener);
+    }
+
     /**
      * Return a wrapped iterator that when closed will update the sstables iterated and READ sample metrics.
      * Note that we cannot use the Transformations framework because they greedily get the static row, which
@@ -834,68 +851,48 @@ private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFam
             if (filter == null)
                 break;
 
-            if (!shouldInclude(sstable))
+            boolean intersects = intersects(sstable);
+            boolean hasRequiredStatics = hasRequiredStatics(sstable);
+            boolean hasPartitionLevelDeletions = hasPartitionLevelDeletions(sstable);
+
+            if (!intersects && !hasRequiredStatics && !hasPartitionLevelDeletions)
             {
                 // This mean that nothing queried by the filter can be in the sstable. One exception is the top-level partition deletion
                 // however: if it is set, it impacts everything and must be included. Getting that top-level partition deletion costs us
                 // some seek in general however (unless the partition is indexed and is in the key cache), so we first check if the sstable
                 // has any tombstone at all as a shortcut.
-                if (!sstable.mayHaveTombstones())
-                    continue; // no tombstone at all, we can skip that sstable
-
-                // We need to get the partition deletion and include it if it's live. In any case though, we're done with that sstable.
-                try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs,
-                                                                                       sstable,
-                                                                                       partitionKey(),
-                                                                                       filter.getSlices(metadata()),
-                                                                                       columnFilter(),
-                                                                                       filter.isReversed(),
-                                                                                       metricsCollector))
-                {
-                    if (!iter.partitionLevelDeletion().isLive())
-                    {
-                        result = add(
-                            UnfilteredRowIterators.noRowsIterator(iter.metadata(),
-                                                                  iter.partitionKey(),
-                                                                  Rows.EMPTY_STATIC_ROW,
-                                                                  iter.partitionLevelDeletion(),
-                                                                  filter.isReversed()),
-                            result,
-                            filter,
-                            sstable.isRepaired()
-                        );
-                    }
-                    else
-                    {
-                        result = add(
-                            RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false),
-                            result,
-                            filter,
-                            sstable.isRepaired()
-                        );
-                    }
-                }
-
                 continue;
             }
 
             try (UnfilteredRowIterator iter = StorageHook.instance.makeRowIterator(cfs,
                                                                                    sstable,
                                                                                    partitionKey(),
-                                                                                   filter.getSlices(metadata()),
+                                                                                   intersects ? filter.getSlices(metadata()) : Slices.NONE,
                                                                                    columnFilter(),
                                                                                    filter.isReversed(),
                                                                                    metricsCollector))
             {
-                if (iter.isEmpty())
-                    continue;
+                if (!hasRequiredStatics && !intersects && !iter.partitionLevelDeletion().isLive()) // => hasPartitionLevelDeletions == true
+                {
+                    result = add(UnfilteredRowIterators.noRowsIterator(iter.metadata(),
+                                                                       iter.partitionKey(),
+                                                                       Rows.EMPTY_STATIC_ROW,
+                                                                       iter.partitionLevelDeletion(),
+                                                                       filter.isReversed()),
+                                 result,
+                                 filter,
+                                 sstable.isRepaired());
+                }
+                else
+                {
+                    if (!hasRequiredStatics && iter.isEmpty())
+                        continue;
 
-                result = add(
-                    RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false),
-                    result,
-                    filter,
-                    sstable.isRepaired()
-                );
+                    result = add(RTBoundValidator.validate(iter, RTBoundValidator.Stage.SSTABLE, false),
+                                 result,
+                                 filter,
+                                 sstable.isRepaired());
+                }
             }
         }
 
diff --git a/src/java/org/apache/cassandra/db/Slice.java b/src/java/org/apache/cassandra/db/Slice.java
index 8956bd178e32..be848c961bf8 100644
--- a/src/java/org/apache/cassandra/db/Slice.java
+++ b/src/java/org/apache/cassandra/db/Slice.java
@@ -22,7 +22,6 @@
 import java.util.*;
 
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ByteArrayAccessor;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
@@ -47,9 +46,9 @@ public boolean includes(ClusteringComparator comparator, ClusteringPrefix<?> clu
         }
 
         @Override
-        public boolean intersects(ClusteringComparator comparator, List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        public boolean intersects(ClusteringComparator comparator, Slice other)
         {
-            return true;
+            return !other.isEmpty(comparator);
         }
 
         @Override
@@ -107,6 +106,13 @@ public static Slice make(Clustering<?> start, Clustering<?> end)
         return new Slice(ClusteringBound.inclusiveStartOf(start), ClusteringBound.inclusiveEndOf(end));
     }
 
+    public static Slice make(ClusteringPrefix<?> start, ClusteringPrefix<?> end)
+    {
+        // This doesn't give us what we want with the clustering prefix
+        assert start != Clustering.STATIC_CLUSTERING && end != Clustering.STATIC_CLUSTERING;
+        return make(start.asStartBound(), end.asEndBound());
+    }
+
     public ClusteringBound<?> start()
     {
         return start;
@@ -230,20 +236,25 @@ public Slice forPaging(ClusteringComparator comparator, Clustering<?> lastReturn
     }
 
     /**
-     * Given the per-clustering column minimum and maximum value a sstable contains, whether or not this slice potentially
-     * intersects that sstable or not.
+     * Whether this slice and the provided slice intersect.
      *
      * @param comparator the comparator for the table this is a slice of.
-     * @param minClusteringValues the smallest values for each clustering column that a sstable contains.
-     * @param maxClusteringValues the biggest values for each clustering column that a sstable contains.
+     * @param other the other slice to check intersection with.
      *
-     * @return whether the slice might intersects with the sstable having {@code minClusteringValues} and
-     * {@code maxClusteringValues}.
+     * @return whether this slice intersects {@code other}.
      */
-    public boolean intersects(ClusteringComparator comparator, List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+    public boolean intersects(ClusteringComparator comparator, Slice other)
     {
-        // If this slice starts after max clustering or ends before min clustering, it can't intersect
-        return start.compareTo(comparator, maxClusteringValues) <= 0 && end.compareTo(comparator, minClusteringValues) >= 0;
+        // Empty slices never intersect anything (and we have to special case it as there are many ways to build an
+        // empty slice; for instance, without this, (0, 0) would intersect Slice.ALL or [-1, 1]).
+        if (isEmpty(comparator) || other.isEmpty(comparator))
+            return false;
+
+        // Otherwise, the slices intersect if they overlap on more than just their boundaries. That is, the comparison
+        // below needs to be strict, because for instance, a=[0, 3] and b=(3, 5] do not intersect, yet the end of a is
+        // equal to the start of b as far as `ClusteringPrefix.Kind#compare` goes (see the javadoc on that method for
+        // why that is).
+        return comparator.compare(start, other.end) < 0 && comparator.compare(end, other.start) > 0;
     }
 
     public String toString(ClusteringComparator comparator)
diff --git a/src/java/org/apache/cassandra/db/Slices.java b/src/java/org/apache/cassandra/db/Slices.java
index 441a5d3d88da..04e4eb13fd46 100644
--- a/src/java/org/apache/cassandra/db/Slices.java
+++ b/src/java/org/apache/cassandra/db/Slices.java
@@ -94,6 +94,16 @@ public static Slices with(ClusteringComparator comparator, Slice slice)
      */
     public abstract Slice get(int i);
 
+    public ClusteringBound<?> start()
+    {
+        return get(0).start();
+    }
+
+    public ClusteringBound<?> end()
+    {
+        return get(size() - 1).end();
+    }
+
     /**
      * Returns slices for continuing the paging of those slices given the last returned clustering prefix.
      *
@@ -128,18 +138,13 @@ public static Slices with(ClusteringComparator comparator, Slice slice)
      */
     public abstract boolean selects(Clustering<?> clustering);
 
-
     /**
-     * Given the per-clustering column minimum and maximum value a sstable contains, whether or not this slices potentially
-     * intersects that sstable or not.
+     * Checks whether any of the slices intersects with the given one.
      *
-     * @param minClusteringValues the smallest values for each clustering column that a sstable contains.
-     * @param maxClusteringValues the biggest values for each clustering column that a sstable contains.
-     *
-     * @return whether the slices might intersects with the sstable having {@code minClusteringValues} and
-     * {@code maxClusteringValues}.
+     * @return {@code true} if at least one of the slices intersects (see
+     * {@link Slice#intersects(ClusteringComparator, Slice)}) the provided slice.
      */
-    public abstract boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues);
+    public abstract boolean intersects(Slice slice);
 
     public abstract String toCQLString(TableMetadata metadata);
 
@@ -439,11 +444,12 @@ private Slices forReversePaging(ClusteringComparator comparator, Clustering<?> l
             return Slices.NONE;
         }
 
-        public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        @Override
+        public boolean intersects(Slice slice)
         {
-            for (Slice slice : this)
+            for (Slice s : this)
             {
-                if (slice.intersects(comparator, minClusteringValues, maxClusteringValues))
+                if (s.intersects(comparator, slice))
                     return true;
             }
             return false;
@@ -748,7 +754,8 @@ public InOrderTester inOrderTester(boolean reversed)
             return trivialTester;
         }
 
-        public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer> maxClusteringValues)
+        @Override
+        public boolean intersects(Slice slice)
         {
             return true;
         }
@@ -828,6 +835,12 @@ public boolean intersects(List<ByteBuffer> minClusteringValues, List<ByteBuffer>
             return false;
         }
 
+        @Override
+        public boolean intersects(Slice slice)
+        {
+            return false;
+        }
+
         public Iterator<Slice> iterator()
         {
             return Collections.emptyIterator();
diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java
index 6ea0435b54c3..8d3b350988fd 100644
--- a/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexFilter.java
@@ -23,7 +23,6 @@
 import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.rows.*;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.TableMetadata;
@@ -142,13 +141,14 @@ static interface InternalDeserializer
     public UnfilteredRowIterator getUnfilteredRowIterator(ColumnFilter columnFilter, Partition partition);
 
     /**
-     * Whether the provided sstable may contain data that is selected by this filter (based on the sstable metadata).
+     * Whether the data selected by this filter intersects with the provided slice.
      *
-     * @param sstable the sstable for which we want to test the need for inclusion.
+     * @param comparator the comparator of the table this is a filter on.
+     * @param slice the slice to check intersection with.
      *
-     * @return whether {@code sstable} should be included to answer this filter.
+     * @return whether the data selected by this filter intersects with {@code slice}.
      */
-    public boolean shouldInclude(SSTableReader sstable);
+    public boolean intersects(ClusteringComparator comparator, Slice slice);
 
     public Kind kind();
 
diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java
index ef9ceff9ef1e..f61a39aa87d4 100644
--- a/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexNamesFilter.java
@@ -18,14 +18,12 @@
 package org.apache.cassandra.db.filter;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.*;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.transform.Transformation;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -140,16 +138,11 @@ public UnfilteredRowIterator getUnfilteredRowIterator(final ColumnFilter columnF
         return partition.unfilteredIterator(columnFilter, clusteringsInQueryOrder, isReversed());
     }
 
-    public boolean shouldInclude(SSTableReader sstable)
+    public boolean intersects(ClusteringComparator comparator, Slice slice)
     {
-        ClusteringComparator comparator = sstable.metadata().comparator;
-        List<ByteBuffer> minClusteringValues = sstable.getSSTableMetadata().minClusteringValues;
-        List<ByteBuffer> maxClusteringValues = sstable.getSSTableMetadata().maxClusteringValues;
-
-        // If any of the requested clustering is within the bounds covered by the sstable, we need to include the sstable
         for (Clustering<?> clustering : clusterings)
         {
-            if (Slice.make(clustering).intersects(comparator, minClusteringValues, maxClusteringValues))
+            if (slice.includes(comparator, clustering))
                 return true;
         }
         return false;
diff --git a/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java b/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java
index 5df98c3e70c4..5b72a6f91bbe 100644
--- a/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/ClusteringIndexSliceFilter.java
@@ -18,8 +18,6 @@
 package org.apache.cassandra.db.filter;
 
 import java.io.IOException;
-import java.util.List;
-import java.nio.ByteBuffer;
 
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
@@ -27,7 +25,6 @@
 import org.apache.cassandra.db.partitions.CachedPartition;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.transform.Transformation;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 
@@ -126,15 +123,9 @@ public UnfilteredRowIterator getUnfilteredRowIterator(ColumnFilter columnFilter,
         return partition.unfilteredIterator(columnFilter, slices, reversed);
     }
 
-    public boolean shouldInclude(SSTableReader sstable)
+    public boolean intersects(ClusteringComparator comparator, Slice slice)
     {
-        List<ByteBuffer> minClusteringValues = sstable.getSSTableMetadata().minClusteringValues;
-        List<ByteBuffer> maxClusteringValues = sstable.getSSTableMetadata().maxClusteringValues;
-
-        if (minClusteringValues.isEmpty() || maxClusteringValues.isEmpty())
-            return true;
-
-        return slices.intersects(minClusteringValues, maxClusteringValues);
+        return slices.intersects(slice);
     }
 
     public String toString(TableMetadata metadata)
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
index 9b477aeeeace..6b9a8d8c6a25 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java
@@ -18,14 +18,7 @@
 
 package org.apache.cassandra.db.marshal;
 
-import org.apache.cassandra.db.AbstractArrayClusteringPrefix;
-import org.apache.cassandra.db.ArrayClustering;
-import org.apache.cassandra.db.ArrayClusteringBound;
-import org.apache.cassandra.db.ArrayClusteringBoundary;
-import org.apache.cassandra.db.Clustering;
-import org.apache.cassandra.db.ClusteringBound;
-import org.apache.cassandra.db.ClusteringBoundary;
-import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.ArrayCell;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.CellPath;
@@ -74,6 +67,13 @@ private ByteArrayObjectFactory() {}
     private static final ArrayClusteringBound TOP_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND,
                                                                                    AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY);
 
+    /** The biggest start bound, i.e. the one that starts after any row. */
+    private static final ArrayClusteringBound MAX_START_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.EXCL_START_BOUND,
+                                                                                      AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY);
+    /** The smallest end bound, i.e. the one that ends before any row. */
+    private static final ArrayClusteringBound MIN_END_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.EXCL_END_BOUND,
+                                                                                   AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY);
+
     public Cell<byte[]> cell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTime, byte[] value, CellPath path)
     {
         return new ArrayCell(column, timestamp, ttl, localDeletionTime, value, path);
@@ -101,7 +101,15 @@ public ClusteringBound<byte[]> bound(ClusteringPrefix.Kind kind, byte[]... value
 
     public ClusteringBound<byte[]> bound(ClusteringPrefix.Kind kind)
     {
-        return kind.isStart() ? BOTTOM_BOUND : TOP_BOUND;
+        switch (kind)
+        {
+            case EXCL_END_BOUND: return MIN_END_BOUND;
+            case INCL_START_BOUND: return BOTTOM_BOUND;
+            case INCL_END_BOUND: return TOP_BOUND;
+            case EXCL_START_BOUND: return MAX_START_BOUND;
+            default:
+                throw new AssertionError(String.format("Unexpected kind %s for empty bound or boundary", kind));
+        }
     }
 
     public ClusteringBoundary<byte[]> boundary(ClusteringPrefix.Kind kind, byte[]... values)
diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
index 2fae7003ca12..4634979a1ef5 100644
--- a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
+++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java
@@ -20,14 +20,7 @@
 
 import java.nio.ByteBuffer;
 
-import org.apache.cassandra.db.AbstractBufferClusteringPrefix;
-import org.apache.cassandra.db.BufferClustering;
-import org.apache.cassandra.db.BufferClusteringBound;
-import org.apache.cassandra.db.BufferClusteringBoundary;
-import org.apache.cassandra.db.Clustering;
-import org.apache.cassandra.db.ClusteringBound;
-import org.apache.cassandra.db.ClusteringBoundary;
-import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.BufferCell;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.CellPath;
@@ -42,6 +35,13 @@ class ByteBufferObjectFactory implements ValueAccessor.ObjectFactory<ByteBuffer>
     private static final BufferClusteringBound TOP_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND,
                                                                                      AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY);
 
+    /** The biggest start bound, i.e. the one that starts after any row. */
+    private static final BufferClusteringBound MAX_START_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.EXCL_START_BOUND,
+                                                                                         AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY);
+    /** The smallest end bound, i.e. the one that ends before any row. */
+    private static final BufferClusteringBound MIN_END_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.EXCL_END_BOUND,
+                                                                                       AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY);
+
     static final ValueAccessor.ObjectFactory<ByteBuffer> instance = new ByteBufferObjectFactory();
 
     private ByteBufferObjectFactory() {}
@@ -72,7 +72,15 @@ public ClusteringBound<ByteBuffer> bound(ClusteringPrefix.Kind kind, ByteBuffer.
 
     public ClusteringBound<ByteBuffer> bound(ClusteringPrefix.Kind kind)
     {
-        return kind.isStart() ? BOTTOM_BOUND : TOP_BOUND;
+        switch (kind)
+        {
+            case EXCL_END_BOUND: return MIN_END_BOUND;
+            case INCL_START_BOUND: return BOTTOM_BOUND;
+            case INCL_END_BOUND: return TOP_BOUND;
+            case EXCL_START_BOUND: return MAX_START_BOUND;
+            default:
+                throw new AssertionError(String.format("Unexpected kind %s for empty bound or boundary", kind));
+        }
     }
 
     public ClusteringBoundary<ByteBuffer> boundary(ClusteringPrefix.Kind kind, ByteBuffer... values)
diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java b/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java
index 6d376401464c..a9beb2a11d0f 100644
--- a/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionStatisticsCollector.java
@@ -23,6 +23,7 @@
 public interface PartitionStatisticsCollector
 {
     public void update(LivenessInfo info);
+    public void updatePartitionDeletion(DeletionTime dt);
     public void update(DeletionTime deletionTime);
     public void update(Cell<?> cell);
     public void updateColumnSetPerRow(long columnSetInRow);
diff --git a/src/java/org/apache/cassandra/db/rows/ArtificialBoundMarker.java b/src/java/org/apache/cassandra/db/rows/ArtificialBoundMarker.java
new file mode 100644
index 000000000000..40402fece5fd
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/rows/ArtificialBoundMarker.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.rows;
+
+import java.util.Objects;
+
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.schema.TableMetadata;
+
+public class ArtificialBoundMarker extends RangeTombstoneBoundMarker
+{
+    public ArtificialBoundMarker(ClusteringBound<?> bound)
+    {
+        super(bound, DeletionTime.LIVE);
+        assert bound.isArtificial();
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (this == other)
+            return true;
+
+        if (!(other instanceof ArtificialBoundMarker))
+            return false;
+
+        ArtificialBoundMarker that = (ArtificialBoundMarker) other;
+        return Objects.equals(bound, that.bound);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(bound);
+    }
+
+    @Override
+    public String toString(TableMetadata metadata)
+    {
+        return String.format("LowerBoundMarker %s", bound.toString(metadata));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/rows/EncodingStats.java b/src/java/org/apache/cassandra/db/rows/EncodingStats.java
index 37dd34e92185..dfaf672117ac 100644
--- a/src/java/org/apache/cassandra/db/rows/EncodingStats.java
+++ b/src/java/org/apache/cassandra/db/rows/EncodingStats.java
@@ -215,6 +215,12 @@ public void update(DeletionTime deletionTime)
             updateLocalDeletionTime(deletionTime.localDeletionTime());
         }
 
+        @Override
+        public void updatePartitionDeletion(DeletionTime dt)
+        {
+            update(dt);
+        }
+
         public void updateTimestamp(long timestamp)
         {
             isTimestampSet = true;
diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
index d0ba98f7075f..c90f47043fbd 100644
--- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIteratorWithLowerBound.java
@@ -21,21 +21,19 @@
 package org.apache.cassandra.db.rows;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
 import java.util.Comparator;
-import java.util.List;
+import javax.annotation.Nonnull;
 
-import org.apache.cassandra.db.marshal.ByteBufferAccessor;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ClusteringIndexFilter;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.transform.RTBoundValidator;
-import org.apache.cassandra.io.sstable.format.big.IndexInfo;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.big.IndexInfo;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.IteratorWithLowerBound;
 
 /**
@@ -49,10 +47,11 @@
 public class UnfilteredRowIteratorWithLowerBound extends LazilyInitializedUnfilteredRowIterator implements IteratorWithLowerBound<Unfiltered>
 {
     private final SSTableReader sstable;
-    private final ClusteringIndexFilter filter;
+    private final Slices slices;
+    private final boolean isReverseOrder;
     private final ColumnFilter selectedColumns;
     private final SSTableReadsListener listener;
-    private ClusteringBound<?> lowerBound;
+    private Unfiltered lowerBoundMarker;
     private boolean firstItemRetrieved;
 
     public UnfilteredRowIteratorWithLowerBound(DecoratedKey partitionKey,
@@ -60,26 +59,41 @@ public UnfilteredRowIteratorWithLowerBound(DecoratedKey partitionKey,
                                                ClusteringIndexFilter filter,
                                                ColumnFilter selectedColumns,
                                                SSTableReadsListener listener)
+    {
+        this(partitionKey, sstable, filter.getSlices(sstable.metadata()), filter.isReversed(), selectedColumns, listener);
+    }
+
+    public UnfilteredRowIteratorWithLowerBound(DecoratedKey partitionKey,
+                                               SSTableReader sstable,
+                                               Slices slices,
+                                               boolean isReverseOrder,
+                                               ColumnFilter selectedColumns,
+                                               SSTableReadsListener listener)
     {
         super(partitionKey);
         this.sstable = sstable;
-        this.filter = filter;
+        this.slices = slices;
+        this.isReverseOrder = isReverseOrder;
         this.selectedColumns = selectedColumns;
         this.listener = listener;
-        this.lowerBound = null;
         this.firstItemRetrieved = false;
     }
 
     public Unfiltered lowerBound()
     {
-        if (lowerBound != null)
-            return makeBound(lowerBound);
+        if (lowerBoundMarker != null)
+            return lowerBoundMarker;
 
         // The partition index lower bound is more accurate than the sstable metadata lower bound but it is only
-        // present if the iterator has already been initialized, which we only do when there are tombstones since in
-        // this case we cannot use the sstable metadata clustering values
-        ClusteringBound<?> ret = getPartitionIndexLowerBound();
-        return ret != null ? makeBound(ret) : makeBound(getMetadataLowerBound());
+        // present if the iterator has already been initialized
+        ClusteringBound<?> lowerBound = getKeyCacheLowerBound();
+
+        if (lowerBound == null && canUseMetadataLowerBound())
+            // If we couldn't get the lower bound from key cache, we try with metadata
+            lowerBound = getMetadataLowerBound();
+
+        lowerBoundMarker = makeBound(lowerBound);
+        return lowerBoundMarker;
     }
 
     private Unfiltered makeBound(ClusteringBound<?> bound)
@@ -87,10 +101,7 @@ private Unfiltered makeBound(ClusteringBound<?> bound)
         if (bound == null)
             return null;
 
-        if (lowerBound != bound)
-            lowerBound = bound;
-
-        return new RangeTombstoneBoundMarker(lowerBound, DeletionTime.LIVE);
+        return new ArtificialBoundMarker(bound);
     }
 
     @Override
@@ -98,7 +109,7 @@ protected UnfilteredRowIterator initializeIterator()
     {
         @SuppressWarnings("resource") // 'iter' is added to iterators which is closed on exception, or through the closing of the final merged iterator
         UnfilteredRowIterator iter = RTBoundValidator.validate(
-            sstable.iterator(partitionKey(), filter.getSlices(metadata()), selectedColumns, filter.isReversed(), listener),
+            sstable.iterator(partitionKey(), slices, selectedColumns, isReverseOrder, listener),
             RTBoundValidator.Stage.SSTABLE,
             false
         );
@@ -114,10 +125,10 @@ protected Unfiltered computeNext()
 
         // Check that the lower bound is not bigger than the first item retrieved
         firstItemRetrieved = true;
-        if (lowerBound != null && ret != null)
-            assert comparator().compare(lowerBound, ret.clustering()) <= 0
+        if (lowerBoundMarker != null && ret != null)
+            assert comparator().compare(lowerBoundMarker.clustering(), ret.clustering()) <= 0
                 : String.format("Lower bound [%s ]is bigger than first returned value [%s] for sstable %s",
-                                lowerBound.toString(metadata()),
+                                lowerBoundMarker.clustering().toString(metadata()),
                                 ret.toString(metadata()),
                                 sstable.getFilename());
 
@@ -126,7 +137,7 @@ assert comparator().compare(lowerBound, ret.clustering()) <= 0
 
     private Comparator<Clusterable> comparator()
     {
-        return filter.isReversed() ? metadata().comparator.reversed() : metadata().comparator;
+        return isReverseOrder ? metadata().comparator.reversed() : metadata().comparator;
     }
 
     @Override
@@ -138,7 +149,7 @@ public TableMetadata metadata()
     @Override
     public boolean isReverseOrder()
     {
-        return filter.isReversed();
+        return isReverseOrder;
     }
 
     @Override
@@ -156,7 +167,7 @@ public EncodingStats stats()
     @Override
     public DeletionTime partitionLevelDeletion()
     {
-        if (!sstable.mayHaveTombstones())
+        if (!sstable.getSSTableMetadata().hasPartitionLevelDeletions)
             return DeletionTime.LIVE;
 
         return super.partitionLevelDeletion();
@@ -171,39 +182,34 @@ public Row staticRow()
         return super.staticRow();
     }
 
-    private static <V> ClusteringBound<V> createInclusiveOpen(boolean isReversed, ClusteringPrefix<V> from)
+    private static <V> ClusteringBound<V> createArtificialLowerBound(boolean isReversed, ClusteringPrefix<V> from)
     {
-        return from.accessor().factory().inclusiveOpen(isReversed, from.getRawValues());
+        return !isReversed
+               ? from.accessor().factory().inclusiveOpen(false, from.getRawValues()).artificialLowerBound()
+               : from.accessor().factory().inclusiveOpen(true, from.getRawValues()).artificialUpperBound();
     }
 
     /**
      * @return the lower bound stored on the index entry for this partition, if available.
      */
-    private ClusteringBound<?> getPartitionIndexLowerBound()
+    private ClusteringBound<?> getKeyCacheLowerBound()
     {
-        // NOTE: CASSANDRA-11206 removed the lookup against the key-cache as the IndexInfo objects are no longer
-        // in memory for not heap backed IndexInfo objects (so, these are on disk).
-        // CASSANDRA-11369 is there to fix this afterwards.
-
-        // Creating the iterator ensures that rowIndexEntry is loaded if available (partitions bigger than
-        // DatabaseDescriptor.column_index_size_in_kb)
-        if (!canUseMetadataLowerBound())
-            maybeInit();
-
         BigTableRowIndexEntry rowIndexEntry = sstable.getCachedPosition(partitionKey(), false);
         if (rowIndexEntry == null || !rowIndexEntry.indexOnHeap())
             return null;
 
         try (BigTableRowIndexEntry.IndexInfoRetriever onHeapRetriever = rowIndexEntry.openWithIndex(null))
         {
-            IndexInfo column = onHeapRetriever.columnsIndex(filter.isReversed() ? rowIndexEntry.columnsIndexCount() - 1 : 0);
-            ClusteringPrefix<?> lowerBoundPrefix = filter.isReversed() ? column.lastName : column.firstName;
+            IndexInfo column = onHeapRetriever.columnsIndex(isReverseOrder ? rowIndexEntry.columnsIndexCount() - 1 : 0);
+            ClusteringPrefix<?> lowerBoundPrefix = isReverseOrder ? column.lastName : column.firstName;
+
             assert lowerBoundPrefix.getRawValues().length <= metadata().comparator.size() :
             String.format("Unexpected number of clustering values %d, expected %d or fewer for %s",
                           lowerBoundPrefix.getRawValues().length,
                           metadata().comparator.size(),
                           sstable.getFilename());
-            return createInclusiveOpen(filter.isReversed(), lowerBoundPrefix);
+
+            return createArtificialLowerBound(isReverseOrder, lowerBoundPrefix);
         }
         catch (IOException e)
         {
@@ -236,28 +242,39 @@ private ClusteringBound<?> getPartitionIndexLowerBound()
      */
     private boolean canUseMetadataLowerBound()
     {
-        // Side-note: pre-2.1 sstable stat file had clustering value arrays whose size may not match the comparator size
-        // and that would break getMetadataLowerBound. We don't support upgrade from 2.0 to 3.0 directly however so it's
-        // not a true concern. Besides, !sstable.mayHaveTombstones already ensure this is a 3.0 sstable anyway.
-        return !sstable.mayHaveTombstones() && !sstable.metadata().isCompactTable();
+        if (sstable.metadata().isCompactTable())
+            return false;
+
+        Slices requestedSlices = slices;
+
+        if (requestedSlices.isEmpty())
+            return true;
+
+        if (!isReverseOrder())
+        {
+            return !requestedSlices.hasLowerBound() ||
+                   metadata().comparator.compare(requestedSlices.start(), sstable.getSSTableMetadata().coveredClustering.start()) < 0;
+        }
+        else
+        {
+            return !requestedSlices.hasUpperBound() ||
+                   metadata().comparator.compare(requestedSlices.end(), sstable.getSSTableMetadata().coveredClustering.end()) > 0;
+        }
     }
 
     /**
      * @return a global lower bound made from the clustering values stored in the sstable metadata, note that
      * this currently does not correctly compare tombstone bounds, especially ranges.
      */
-    private ClusteringBound<?> getMetadataLowerBound()
+    private @Nonnull ClusteringBound<?> getMetadataLowerBound()
     {
-        if (!canUseMetadataLowerBound())
-            return null;
-
         final StatsMetadata m = sstable.getSSTableMetadata();
-        List<ByteBuffer> vals = filter.isReversed() ? m.maxClusteringValues : m.minClusteringValues;
-        assert vals.size() <= metadata().comparator.size() :
+        ClusteringBound<?> bound = m.coveredClustering.open(isReverseOrder);
+        assert bound.size() <= metadata().comparator.size() :
         String.format("Unexpected number of clustering values %d, expected %d or fewer for %s",
-                      vals.size(),
+                      bound.size(),
                       metadata().comparator.size(),
                       sstable.getFilename());
-        return ByteBufferAccessor.instance.factory().inclusiveOpen(filter.isReversed(), vals.toArray(new ByteBuffer[vals.size()]));
+        return !isReverseOrder ? bound.artificialLowerBound() : bound.artificialUpperBound();
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/Version.java b/src/java/org/apache/cassandra/io/sstable/format/Version.java
index aa41b149c2a9..7ca5bc09b09c 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/Version.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/Version.java
@@ -68,6 +68,16 @@ protected Version(SSTableFormat format, String version)
 
     public abstract boolean hasAccurateMinMax();
 
+    /**
+     * If the sstable has improved min/max encoding.
+     */
+    public abstract boolean hasImprovedMinMax();
+
+    /**
+     * If the sstable stats record whether the sstable has any partition-level deletions.
+     */
+    public abstract boolean hasPartitionLevelDeletionsPresenceMarker();
+
     public String getVersion()
     {
         return version;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 9c8b161386a9..e444e81612c9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -152,6 +152,8 @@ static class BigVersion extends Version
         private final boolean hasCommitLogIntervals;
         private final boolean hasAccurateMinMax;
         private final boolean hasOriginatingHostId;
+        private final boolean hasImprovedMinMax;
+        private final boolean hasPartitionLevelDeletionPresenceMarker;
         public final boolean hasMaxCompressedLength;
         private final boolean hasPendingRepair;
         private final boolean hasMetadataChecksum;
@@ -174,6 +176,8 @@ static class BigVersion extends Version
             hasCommitLogIntervals = version.compareTo("mc") >= 0;
             hasAccurateMinMax = version.compareTo("md") >= 0;
             hasOriginatingHostId = version.matches("(m[e-z])|(n[b-z])");
+            hasImprovedMinMax = version.compareTo("nb") >= 0;
+            hasPartitionLevelDeletionPresenceMarker = version.compareTo("nb") >= 0;
             hasMaxCompressedLength = version.compareTo("na") >= 0;
             hasPendingRepair = version.compareTo("na") >= 0;
             hasIsTransient = version.compareTo("na") >= 0;
@@ -228,6 +232,18 @@ public boolean hasAccurateMinMax()
             return hasAccurateMinMax;
         }
 
+        @Override
+        public boolean hasImprovedMinMax()
+        {
+            return hasImprovedMinMax;
+        }
+
+        @Override
+        public boolean hasPartitionLevelDeletionsPresenceMarker()
+        {
+            return hasPartitionLevelDeletionPresenceMarker;
+        }
+
         public boolean isCompatible()
         {
             return version.compareTo(earliest_supported_version) >= 0 && version.charAt(0) <= current_version.charAt(0);
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index 286103598cd7..3285cce2ed56 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
@@ -283,7 +283,7 @@ public Row applyToRow(Row row)
         @Override
         public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
         {
-            collector.updateClusteringValues(marker.clustering());
+            collector.updateClusteringValuesByBoundOrBoundary(marker.clustering());
             if (marker.isBoundary())
             {
                 RangeTombstoneBoundaryMarker bm = (RangeTombstoneBoundaryMarker)marker;
@@ -306,7 +306,7 @@ public void onPartitionClose()
         @Override
         public DeletionTime applyToDeletion(DeletionTime deletionTime)
         {
-            collector.update(deletionTime);
+            collector.updatePartitionDeletion(deletionTime);
             return deletionTime;
         }
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index be824efa4517..733f594bc766 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -18,35 +18,32 @@
 package org.apache.cassandra.io.sstable.metadata;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
 import java.util.Collections;
 import java.util.EnumMap;
-import java.util.List;
 import java.util.Map;
 import java.util.UUID;
 
-import com.google.common.base.Preconditions;
-
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import com.clearspring.analytics.stream.cardinality.ICardinality;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.commitlog.IntervalSet;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.partitions.PartitionStatisticsCollector;
 import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.EstimatedHistogram;
 import org.apache.cassandra.utils.MurmurHash;
-import org.apache.cassandra.utils.streamhist.TombstoneHistogram;
 import org.apache.cassandra.utils.streamhist.StreamingTombstoneHistogramBuilder;
+import org.apache.cassandra.utils.streamhist.TombstoneHistogram;
 
 public class MetadataCollector implements PartitionStatisticsCollector
 {
     public static final double NO_COMPRESSION_RATIO = -1.0;
-    private static final ByteBuffer[] EMPTY_CLUSTERING = new ByteBuffer[0];
 
     static EstimatedHistogram defaultCellPerPartitionCountHistogram()
     {
@@ -79,8 +76,9 @@ public static StatsMetadata defaultStatsMetadata()
                                  NO_COMPRESSION_RATIO,
                                  defaultTombstoneDropTimeHistogram(),
                                  0,
-                                 Collections.<ByteBuffer>emptyList(),
-                                 Collections.<ByteBuffer>emptyList(),
+                                 Collections.emptyList(),
+                                 Slice.ALL,
+                                 true,
                                  true,
                                  ActiveRepairService.UNREPAIRED_SSTABLE,
                                  -1,
@@ -100,12 +98,29 @@ public static StatsMetadata defaultStatsMetadata()
     protected double compressionRatio = NO_COMPRESSION_RATIO;
     protected StreamingTombstoneHistogramBuilder estimatedTombstoneDropTime = new StreamingTombstoneHistogramBuilder(SSTable.TOMBSTONE_HISTOGRAM_BIN_SIZE, SSTable.TOMBSTONE_HISTOGRAM_SPOOL_SIZE, SSTable.TOMBSTONE_HISTOGRAM_TTL_ROUND_SECONDS);
     protected int sstableLevel;
-    private ClusteringPrefix<?> minClustering = null;
-    private ClusteringPrefix<?> maxClustering = null;
+
+    /**
+     * The smallest clustering prefix for any {@link Unfiltered} in the sstable.
+     *
+     * <p>This is always either a Clustering, or a start bound (since for any end range tombstone bound, there should
+     * be a corresponding start bound that is smaller).
+     */
+    private ClusteringPrefix<?> minClustering = ClusteringBound.MAX_START;
+    /**
+     * The largest clustering prefix for any {@link Unfiltered} in the sstable.
+     *
+     * <p>This is always either a Clustering, or an end bound (since for any start range tombstone bound, there should
+     * be a corresponding end bound that is bigger).
+     */
+    private ClusteringPrefix<?> maxClustering = ClusteringBound.MIN_END;
+    private boolean clusteringInitialized = false;
+
     protected boolean hasLegacyCounterShards = false;
+    private boolean hasPartitionLevelDeletions = false;
     protected long totalColumnsSet;
     protected long totalRows;
 
+    private final AbstractType<?>[] comparators;
     /**
      * Default cardinality estimation method is to use HyperLogLog++.
      * Parameter here(p=13, sp=25) should give reasonable estimation
@@ -125,6 +140,11 @@ public MetadataCollector(ClusteringComparator comparator)
     public MetadataCollector(ClusteringComparator comparator, UUID originatingHostId)
     {
         this.comparator = comparator;
+
+        int clusteringTypesNum = comparator.size();
+        this.comparators = new AbstractType[clusteringTypesNum];
+        for (int i = 0; i < clusteringTypesNum; i++)
+            comparators[i] = comparator.subtype(i);
         this.originatingHostId = originatingHostId;
     }
 
@@ -191,6 +211,13 @@ public void update(Cell<?> cell)
         updateLocalDeletionTime(cell.localDeletionTime());
     }
 
+    public void updatePartitionDeletion(DeletionTime dt)
+    {
+        if (!dt.isLive())
+            hasPartitionLevelDeletions = true;
+        update(dt);
+    }
+
     public void update(DeletionTime dt)
     {
         if (!dt.isLive())
@@ -235,11 +262,52 @@ public MetadataCollector sstableLevel(int sstableLevel)
         return this;
     }
 
-    public MetadataCollector updateClusteringValues(ClusteringPrefix<?> clustering)
+    public void updateClusteringValues(Clustering<?> clustering)
     {
-        minClustering = minClustering == null || comparator.compare(clustering, minClustering) < 0 ? clustering.minimize() : minClustering;
-        maxClustering = maxClustering == null || comparator.compare(clustering, maxClustering) > 0 ? clustering.minimize() : maxClustering;
-        return this;
+        if (clustering == Clustering.STATIC_CLUSTERING)
+            return;
+
+        if (!clusteringInitialized)
+        {
+            clusteringInitialized = true;
+            minClustering = clustering.minimize();
+            maxClustering = minClustering;
+        }
+        else if (comparator.compare((ClusteringPrefix<?>) clustering, (ClusteringPrefix<?>) maxClustering) > 0)
+        {
+            maxClustering = clustering.minimize();
+        }
+        else if (comparator.compare((ClusteringPrefix<?>) clustering, (ClusteringPrefix<?>) minClustering) < 0)
+        {
+            minClustering = clustering.minimize();
+        }
+    }
+
+    public void updateClusteringValuesByBoundOrBoundary(ClusteringBoundOrBoundary<?> clusteringBoundOrBoundary)
+    {
+        // In a SSTable, every opening marker will be closed, so the start of a range tombstone marker will never
+        // be the maxClustering (the corresponding close might though) and there is no point in doing the comparison
+        // (and vice-versa for the close). By the same reasoning, a boundary will never be either the min or max
+        // clustering and we can save comparisons.
+        if (clusteringBoundOrBoundary.isBoundary())
+            return;
+
+        if (!clusteringInitialized)
+        {
+            clusteringInitialized = true;
+            minClustering = clusteringBoundOrBoundary.minimize();
+            maxClustering = minClustering;
+        }
+        else if (clusteringBoundOrBoundary.kind().isStart())
+        {
+            if (comparator.compare(clusteringBoundOrBoundary, minClustering) < 0)
+                minClustering = clusteringBoundOrBoundary.minimize();
+        }
+        else
+        {
+            if (comparator.compare(clusteringBoundOrBoundary, maxClustering) > 0)
+                maxClustering = clusteringBoundOrBoundary.minimize();
+        }
     }
 
     public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
@@ -249,10 +317,6 @@ public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
 
     public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner, double bloomFilterFPChance, long repairedAt, UUID pendingRepair, boolean isTransient, SerializationHeader header)
     {
-        Preconditions.checkState((minClustering == null && maxClustering == null)
-                                 || comparator.compare(maxClustering, minClustering) >= 0);
-        ByteBuffer[] minValues = minClustering != null ? minClustering.getBufferArray() : EMPTY_CLUSTERING;
-        ByteBuffer[] maxValues = maxClustering != null ? maxClustering.getBufferArray() : EMPTY_CLUSTERING;
         Map<MetadataType, MetadataComponent> components = new EnumMap<>(MetadataType.class);
         components.put(MetadataType.VALIDATION, new ValidationMetadata(partitioner, bloomFilterFPChance));
         components.put(MetadataType.STATS, new StatsMetadata(estimatedPartitionSize,
@@ -267,9 +331,10 @@ public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner,
                                                              compressionRatio,
                                                              estimatedTombstoneDropTime.build(),
                                                              sstableLevel,
-                                                             makeList(minValues),
-                                                             makeList(maxValues),
+                                                             comparator.subtypes(),
+                                                             Slice.make(minClustering, maxClustering),
                                                              hasLegacyCounterShards,
+                                                             hasPartitionLevelDeletions,
                                                              repairedAt,
                                                              totalColumnsSet,
                                                              totalRows,
@@ -289,18 +354,6 @@ public void release()
         estimatedTombstoneDropTime.releaseBuffers();
     }
 
-    private static List<ByteBuffer> makeList(ByteBuffer[] values)
-    {
-        // In most case, l will be the same size than values, but it's possible for it to be smaller
-        List<ByteBuffer> l = new ArrayList<ByteBuffer>(values.length);
-        for (int i = 0; i < values.length; i++)
-            if (values[i] == null)
-                break;
-            else
-                l.add(values[i]);
-        return l;
-    }
-
     public static class MinMaxLongTracker
     {
         private final long defaultMin;
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
index 69ca455ec313..589b13d331ba 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
@@ -19,7 +19,6 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
 import java.util.List;
 import java.util.UUID;
 
@@ -28,20 +27,24 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.BufferClusteringBound;
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.Slice;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.commitlog.IntervalSet;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.io.ISerializer;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
-import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.serializers.AbstractTypeSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
-import org.apache.cassandra.utils.streamhist.TombstoneHistogram;
-import org.apache.cassandra.utils.UUIDSerializer;
 import org.apache.cassandra.utils.UUIDSerializer;
+import org.apache.cassandra.utils.streamhist.TombstoneHistogram;
 
 /**
  * SSTable metadata that always stay on heap.
@@ -63,9 +66,19 @@ public class StatsMetadata extends MetadataComponent
     public final double compressionRatio;
     public final TombstoneHistogram estimatedTombstoneDropTime;
     public final int sstableLevel;
-    public final List<ByteBuffer> minClusteringValues;
-    public final List<ByteBuffer> maxClusteringValues;
+    public final Slice coveredClustering;
     public final boolean hasLegacyCounterShards;
+    /**
+     * This boolean is used as an approximation of whether a given key can be guaranteed not to have partition
+     * deletions in this sstable. Obviously, this is pretty imprecise: a single partition deletion in the sstable
+     * means we have to assume _any_ key may have a partition deletion. This is still likely useful as workloads that
+     * do not use partition level deletions, or only very rarely, are probably not that rare.
+     *
+     * TODO we could replace this by a small bloom-filter instead; the only downside being that we'd have to care about
+     *  the size of this bloom filter not getting out of hand, and it's a tiny bit unclear if it's worth the added
+     *  complexity.
+     */
+    public final boolean hasPartitionLevelDeletions;
     public final long repairedAt;
     public final long totalColumnsSet;
     public final long totalRows;
@@ -75,6 +88,9 @@ public class StatsMetadata extends MetadataComponent
     // just holds the current encoding stats to avoid allocating - it is not serialized
     public final EncodingStats encodingStats;
 
+    // Used to serialize min/max clustering. Can be null if the metadata was deserialized from a legacy version
+    private final List<AbstractType<?>> clusteringTypes;
+
     public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
                          EstimatedHistogram estimatedCellPerPartitionCount,
                          IntervalSet<CommitLogPosition> commitLogIntervals,
@@ -87,9 +103,10 @@ public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
                          double compressionRatio,
                          TombstoneHistogram estimatedTombstoneDropTime,
                          int sstableLevel,
-                         List<ByteBuffer> minClusteringValues,
-                         List<ByteBuffer> maxClusteringValues,
+                         List<AbstractType<?>> clusteringTypes,
+                         Slice coveredClustering,
                          boolean hasLegacyCounterShards,
+                         boolean hasPartitionLevelDeletions,
                          long repairedAt,
                          long totalColumnsSet,
                          long totalRows,
@@ -109,9 +126,10 @@ public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
         this.compressionRatio = compressionRatio;
         this.estimatedTombstoneDropTime = estimatedTombstoneDropTime;
         this.sstableLevel = sstableLevel;
-        this.minClusteringValues = minClusteringValues;
-        this.maxClusteringValues = maxClusteringValues;
+        this.clusteringTypes = clusteringTypes;
+        this.coveredClustering = coveredClustering;
         this.hasLegacyCounterShards = hasLegacyCounterShards;
+        this.hasPartitionLevelDeletions = hasPartitionLevelDeletions;
         this.repairedAt = repairedAt;
         this.totalColumnsSet = totalColumnsSet;
         this.totalRows = totalRows;
@@ -164,9 +182,10 @@ public StatsMetadata mutateLevel(int newLevel)
                                  compressionRatio,
                                  estimatedTombstoneDropTime,
                                  newLevel,
-                                 minClusteringValues,
-                                 maxClusteringValues,
+                                 clusteringTypes,
+                                 coveredClustering,
                                  hasLegacyCounterShards,
+                                 hasPartitionLevelDeletions,
                                  repairedAt,
                                  totalColumnsSet,
                                  totalRows,
@@ -189,9 +208,10 @@ public StatsMetadata mutateRepairedMetadata(long newRepairedAt, UUID newPendingR
                                  compressionRatio,
                                  estimatedTombstoneDropTime,
                                  sstableLevel,
-                                 minClusteringValues,
-                                 maxClusteringValues,
+                                 clusteringTypes,
+                                 coveredClustering,
                                  hasLegacyCounterShards,
+                                 hasPartitionLevelDeletions,
                                  newRepairedAt,
                                  totalColumnsSet,
                                  totalRows,
@@ -221,9 +241,9 @@ public boolean equals(Object o)
                        .append(estimatedTombstoneDropTime, that.estimatedTombstoneDropTime)
                        .append(sstableLevel, that.sstableLevel)
                        .append(repairedAt, that.repairedAt)
-                       .append(maxClusteringValues, that.maxClusteringValues)
-                       .append(minClusteringValues, that.minClusteringValues)
+                       .append(coveredClustering, that.coveredClustering)
                        .append(hasLegacyCounterShards, that.hasLegacyCounterShards)
+                       .append(hasPartitionLevelDeletions, that.hasPartitionLevelDeletions)
                        .append(totalColumnsSet, that.totalColumnsSet)
                        .append(totalRows, that.totalRows)
                        .append(originatingHostId, that.originatingHostId)
@@ -248,9 +268,9 @@ public int hashCode()
                        .append(estimatedTombstoneDropTime)
                        .append(sstableLevel)
                        .append(repairedAt)
-                       .append(maxClusteringValues)
-                       .append(minClusteringValues)
+                       .append(coveredClustering)
                        .append(hasLegacyCounterShards)
+                       .append(hasPartitionLevelDeletions)
                        .append(totalColumnsSet)
                        .append(totalRows)
                        .append(originatingHostId)
@@ -262,6 +282,8 @@ public static class StatsMetadataSerializer implements IMetadataComponentSeriali
     {
         private static final Logger logger = LoggerFactory.getLogger(StatsMetadataSerializer.class);
 
+        private final AbstractTypeSerializer typeSerializer = new AbstractTypeSerializer();
+
         public int serializedSize(Version version, StatsMetadata component) throws IOException
         {
             int size = 0;
@@ -271,14 +293,26 @@ public int serializedSize(Version version, StatsMetadata component) throws IOExc
             size += 8 + 8 + 4 + 4 + 4 + 4 + 8 + 8; // mix/max timestamp(long), min/maxLocalDeletionTime(int), min/max TTL, compressionRatio(double), repairedAt (long)
             size += TombstoneHistogram.serializer.serializedSize(component.estimatedTombstoneDropTime);
             size += TypeSizes.sizeof(component.sstableLevel);
-            // min column names
-            size += 4;
-            for (ByteBuffer value : component.minClusteringValues)
-                size += 2 + value.remaining(); // with short length
-            // max column names
-            size += 4;
-            for (ByteBuffer value : component.maxClusteringValues)
-                size += 2 + value.remaining(); // with short length
+
+            if (version.hasImprovedMinMax())
+            {
+                size += typeSerializer.serializedListSize(component.clusteringTypes);
+                size += Slice.serializer.serializedSize(component.coveredClustering,
+                                                        version.correspondingMessagingVersion(),
+                                                        component.clusteringTypes);
+            }
+            else
+            {
+                // min column names
+                size += 4;
+                ClusteringBound<?> minClusteringValues = component.coveredClustering.start();
+                size += minClusteringValues.size() * 2 /* short length */ + minClusteringValues.dataSize();
+                // max column names
+                size += 4;
+                ClusteringBound<?> maxClusteringValues = component.coveredClustering.end();
+                size += maxClusteringValues.size() * 2 /* short length */ + maxClusteringValues.dataSize();
+            }
+
             size += TypeSizes.sizeof(component.hasLegacyCounterShards);
             size += 8 + 8; // totalColumnsSet, totalRows
             if (version.hasCommitLogLowerBound())
@@ -298,6 +332,9 @@ public int serializedSize(Version version, StatsMetadata component) throws IOExc
                 size += TypeSizes.sizeof(component.isTransient);
             }
 
+            if (version.hasPartitionLevelDeletionsPresenceMarker())
+                size += TypeSizes.sizeof(component.hasPartitionLevelDeletions);
+
             if (version.hasOriginatingHostId())
             {
                 size += 1; // boolean: is originatingHostId present
@@ -323,12 +360,28 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o
             TombstoneHistogram.serializer.serialize(component.estimatedTombstoneDropTime, out);
             out.writeInt(component.sstableLevel);
             out.writeLong(component.repairedAt);
-            out.writeInt(component.minClusteringValues.size());
-            for (ByteBuffer value : component.minClusteringValues)
-                ByteBufferUtil.writeWithShortLength(value, out);
-            out.writeInt(component.maxClusteringValues.size());
-            for (ByteBuffer value : component.maxClusteringValues)
-                ByteBufferUtil.writeWithShortLength(value, out);
+
+            if (version.hasImprovedMinMax())
+            {
+                assert component.clusteringTypes != null;
+                typeSerializer.serializeList(component.clusteringTypes, out);
+                Slice.serializer.serialize(component.coveredClustering,
+                                           out,
+                                           version.correspondingMessagingVersion(),
+                                           component.clusteringTypes);
+            }
+            else
+            {
+                ClusteringBound<?> minClusteringValues = component.coveredClustering.start();
+                out.writeInt(minClusteringValues.size());
+                for (ByteBuffer value : minClusteringValues.getBufferArray())
+                    ByteBufferUtil.writeWithShortLength(value, out);
+                ClusteringBound<?> maxClusteringValues = component.coveredClustering.end();
+                out.writeInt(maxClusteringValues.size());
+                for (ByteBuffer value : maxClusteringValues.getBufferArray())
+                    ByteBufferUtil.writeWithShortLength(value, out);
+            }
+
             out.writeBoolean(component.hasLegacyCounterShards);
 
             out.writeLong(component.totalColumnsSet);
@@ -357,6 +410,9 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o
                 out.writeBoolean(component.isTransient);
             }
 
+            if (version.hasPartitionLevelDeletionsPresenceMarker())
+                out.writeBoolean(component.hasPartitionLevelDeletions);
+
             if (version.hasOriginatingHostId())
             {
                 if (component.originatingHostId != null)
@@ -408,24 +464,31 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
             int sstableLevel = in.readInt();
             long repairedAt = in.readLong();
 
-            // for legacy sstables, we skip deserializing the min and max clustering value
-            // to prevent erroneously excluding sstables from reads (see CASSANDRA-14861)
-            int colCount = in.readInt();
-            List<ByteBuffer> minClusteringValues = new ArrayList<>(colCount);
-            for (int i = 0; i < colCount; i++)
+            List<AbstractType<?>> clusteringTypes = null;
+            Slice coveredClustering = Slice.ALL;
+            if (version.hasImprovedMinMax())
             {
-                ByteBuffer val = ByteBufferUtil.readWithShortLength(in);
-                if (version.hasAccurateMinMax())
-                    minClusteringValues.add(val);
+                clusteringTypes = typeSerializer.deserializeList(in);
+                coveredClustering = Slice.serializer.deserialize(in, version.correspondingMessagingVersion(), clusteringTypes);
             }
-
-            colCount = in.readInt();
-            List<ByteBuffer> maxClusteringValues = new ArrayList<>(colCount);
-            for (int i = 0; i < colCount; i++)
+            else
             {
-                ByteBuffer val = ByteBufferUtil.readWithShortLength(in);
+                // for legacy sstables, we skip deserializing the min and max clustering value
+                // to prevent erroneously excluding sstables from reads (see CASSANDRA-14861)
+                int colCount = in.readInt();
+                ByteBuffer[] minClusteringValues = new ByteBuffer[colCount];
+                for (int i = 0; i < colCount; i++)
+                    minClusteringValues[i] = ByteBufferUtil.readWithShortLength(in);
+
+                colCount = in.readInt();
+                ByteBuffer[] maxClusteringValues = new ByteBuffer[colCount];
+                for (int i = 0; i < colCount; i++)
+                    maxClusteringValues[i] = ByteBufferUtil.readWithShortLength(in);
+
                 if (version.hasAccurateMinMax())
-                    maxClusteringValues.add(val);
+                    coveredClustering = Slice.make(BufferClusteringBound.inclusiveStartOf(minClusteringValues),
+                                                   BufferClusteringBound.inclusiveEndOf(maxClusteringValues));
+
             }
 
             boolean hasLegacyCounterShards = in.readBoolean();
@@ -449,6 +512,12 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
 
             boolean isTransient = version.hasIsTransient() && in.readBoolean();
 
+            // If not recorded, the only time we can guarantee there is no partition level deletion is if there is no
+            // deletion at all. Otherwise, we have to assume there may be some.
+            boolean hasPartitionLevelDeletions = version.hasPartitionLevelDeletionsPresenceMarker()
+                                                 ? in.readBoolean()
+                                                 : minLocalDeletionTime != Cell.NO_DELETION_TIME;
+
             UUID originatingHostId = null;
             if (version.hasOriginatingHostId() && in.readByte() != 0)
                 originatingHostId = UUIDSerializer.serializer.deserialize(in, 0);
@@ -465,9 +534,10 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
                                      compressionRatio,
                                      tombstoneHistogram,
                                      sstableLevel,
-                                     minClusteringValues,
-                                     maxClusteringValues,
+                                     clusteringTypes,
+                                     coveredClustering,
                                      hasLegacyCounterShards,
+                                     hasPartitionLevelDeletions,
                                      repairedAt,
                                      totalColumnsSet,
                                      totalRows,
diff --git a/src/java/org/apache/cassandra/serializers/AbstractTypeSerializer.java b/src/java/org/apache/cassandra/serializers/AbstractTypeSerializer.java
new file mode 100644
index 000000000000..36b4dd7aa1e9
--- /dev/null
+++ b/src/java/org/apache/cassandra/serializers/AbstractTypeSerializer.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.serializers;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.TypeParser;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class AbstractTypeSerializer
+{
+    public void serialize(AbstractType<?> type, DataOutputPlus out) throws IOException
+    {
+        ByteBufferUtil.writeWithVIntLength(UTF8Type.instance.decompose(type.toString()), out);
+    }
+
+    public void serializeList(List<AbstractType<?>> types, DataOutputPlus out) throws IOException
+    {
+        out.writeUnsignedVInt(types.size());
+        for (AbstractType<?> type : types)
+            serialize(type, out);
+    }
+
+    public AbstractType<?> deserialize(DataInputPlus in) throws IOException
+    {
+        ByteBuffer raw = ByteBufferUtil.readWithVIntLength(in);
+        return TypeParser.parse(UTF8Type.instance.compose(raw));
+    }
+
+    public List<AbstractType<?>> deserializeList(DataInputPlus in) throws IOException
+    {
+        int size = (int) in.readUnsignedVInt();
+        List<AbstractType<?>> types = new ArrayList<>(size);
+        for (int i = 0; i < size; i++)
+            types.add(deserialize(in));
+        return types;
+    }
+
+    public long serializedSize(AbstractType<?> type)
+    {
+        return ByteBufferUtil.serializedSizeWithVIntLength(UTF8Type.instance.decompose(type.toString()));
+    }
+
+    public long serializedListSize(List<AbstractType<?>> types)
+    {
+        long size = TypeSizes.sizeofUnsignedVInt(types.size());
+        for (AbstractType<?> type : types)
+            size += serializedSize(type);
+        return size;
+    }
+}
diff --git a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
index a4da97cde5f9..b9f9ad0eb5dd 100755
--- a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
+++ b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
@@ -30,7 +30,6 @@
 import java.io.PrintWriter;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
-import java.util.Arrays;
 import java.util.Comparator;
 import java.util.EnumSet;
 import java.util.List;
@@ -39,6 +38,7 @@
 import java.util.stream.Collectors;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -355,20 +355,10 @@ private void printSStableMetadata(String fname, boolean scan) throws IOException
             if (validation != null && header != null)
                 printMinMaxToken(descriptor, FBUtilities.newPartitioner(descriptor), header.getKeyType());
 
-            if (header != null && header.getClusteringTypes().size() == stats.minClusteringValues.size())
+            if (header != null)
             {
-                List<AbstractType<?>> clusteringTypes = header.getClusteringTypes();
-                List<ByteBuffer> minClusteringValues = stats.minClusteringValues;
-                List<ByteBuffer> maxClusteringValues = stats.maxClusteringValues;
-                String[] minValues = new String[clusteringTypes.size()];
-                String[] maxValues = new String[clusteringTypes.size()];
-                for (int i = 0; i < clusteringTypes.size(); i++)
-                {
-                    minValues[i] = clusteringTypes.get(i).getString(minClusteringValues.get(i));
-                    maxValues[i] = clusteringTypes.get(i).getString(maxClusteringValues.get(i));
-                }
-                field("minClusteringValues", Arrays.toString(minValues));
-                field("maxClusteringValues", Arrays.toString(maxValues));
+                ClusteringComparator comparator = new ClusteringComparator(header.getClusteringTypes());
+                field("covered clusterings", stats.coveredClustering.toString(comparator));
             }
             field("Estimated droppable tombstones",
                   stats.getEstimatedDroppableTombstoneRatio((int) (System.currentTimeMillis() / 1000) - this.gc));
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
index 4a541776d952..dd7c231dfa88 100644
--- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
@@ -64,6 +64,10 @@ public interface ByteSource
     int LT_NEXT_COMPONENT = 0x20;
     int GT_NEXT_COMPONENT = 0x60;
 
+    // Unsupported, for artificial bounds
+    int LTLT_NEXT_COMPONENT = 0x1F;
+    int GTGT_NEXT_COMPONENT = 0x61;
+
     // Special value for components that should be excluded from the normal min/max span. (static rows)
     int EXCLUDED = 0x18;
 
diff --git a/test/microbench/org/apache/cassandra/test/microbench/MetadataCollectorBench.java b/test/microbench/org/apache/cassandra/test/microbench/MetadataCollectorBench.java
new file mode 100644
index 000000000000..53359be29faa
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/MetadataCollectorBench.java
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.ClusteringBoundary;
+import org.apache.cassandra.db.ClusteringPrefix.Kind;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.jctools.util.Pow2;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 10, time = 1)
+@Measurement(iterations = 10, time = 1)
+@Fork(1)
+@State(Scope.Benchmark)
+public class MetadataCollectorBench
+{
+    @Param({ "10" })
+    int clusteringKeyNum;
+
+    @Param({ "10000" })
+    int datasetSize;
+    int datumIndex;
+    Cell<?>[] cells;
+    Clustering<?>[] clusterings;
+    ClusteringBound<?>[] clusteringBounds;
+    ClusteringBoundary<?>[] clusteringBoundaries;
+    MetadataCollector collector;
+
+    @Setup
+    public void setup()
+    {
+        TableMetadata.Builder tableMetadataBuilder = TableMetadata.builder("k", "t")
+                                                                  .addPartitionKeyColumn("pk", LongType.instance)
+                                                                  .addRegularColumn("rc", LongType.instance);
+        for (int i = 0; i < clusteringKeyNum; i++)
+            tableMetadataBuilder.addClusteringColumn("ck" + i, LongType.instance);
+        TableMetadata tableMetadata = tableMetadataBuilder.build();
+        collector = new MetadataCollector(tableMetadata.comparator);
+
+        ColumnMetadata columnMetadata = tableMetadata.regularColumns().iterator().next();
+        ThreadLocalRandom current = ThreadLocalRandom.current();
+        datasetSize = Pow2.roundToPowerOfTwo(datasetSize);
+        cells = new Cell[datasetSize];
+        for (int i = 0; i < datasetSize; i++)
+        {
+            cells[i] = new BufferCell(columnMetadata, current.nextLong(0, Long.MAX_VALUE), current.nextInt(1, Integer.MAX_VALUE), Cell.NO_DELETION_TIME, null, null);
+        }
+        clusterings = new Clustering[datasetSize];
+        clusteringBounds = new ClusteringBound[datasetSize];
+        clusteringBoundaries = new ClusteringBoundary[datasetSize];
+        ByteBuffer[] cks = new ByteBuffer[clusteringKeyNum];
+        Kind[] clusteringBoundKinds = new Kind[]{ Kind.INCL_START_BOUND, Kind.INCL_END_BOUND, Kind.EXCL_START_BOUND, Kind.EXCL_END_BOUND };
+        Kind[] clusteringBoundaryKinds = new Kind[]{ Kind.INCL_END_EXCL_START_BOUNDARY, Kind.EXCL_END_INCL_START_BOUNDARY };
+        for (int i = 0; i < datasetSize; i++)
+        {
+            for (int j = 0; j < clusteringKeyNum; j++)
+                cks[j] = LongType.instance.decompose(current.nextLong());
+            clusterings[i] = Clustering.make(Arrays.copyOf(cks, cks.length));
+            clusteringBounds[i] = ClusteringBound.create(clusteringBoundKinds[i % clusteringBoundKinds.length], clusterings[i]);
+            clusteringBoundaries[i] = ClusteringBoundary.create(clusteringBoundaryKinds[i % clusteringBoundaryKinds.length], clusterings[i]);
+        }
+
+        System.gc();
+        // shuffle array contents to ensure a more 'natural' layout
+        for (int i = 0; i < datasetSize; i++)
+        {
+            int to = current.nextInt(0, datasetSize);
+            Cell<?> temp = cells[i];
+            cells[i] = cells[to];
+            cells[to] = temp;
+        }
+
+        for (int i = 0; i < datasetSize; i++)
+        {
+            int to = current.nextInt(0, datasetSize);
+            Clustering<?> temp = clusterings[i];
+            clusterings[i] = clusterings[to];
+            clusterings[to] = temp;
+        }
+    }
+
+    @Benchmark
+    public void updateCell()
+    {
+        collector.update(nextCell());
+    }
+
+    @Benchmark
+    public void updateClustering()
+    {
+        collector.updateClusteringValues(nextClustering());
+    }
+
+    @Benchmark
+    public void updateClusteringBound()
+    {
+        collector.updateClusteringValuesByBoundOrBoundary(nextClusteringBound());
+    }
+
+    @Benchmark
+    public void updateClusteringBoundary()
+    {
+        collector.updateClusteringValuesByBoundOrBoundary(nextClusteringBoundary());
+    }
+
+    public Cell<?> nextCell()
+    {
+        return cells[datumIndex++ & (cells.length - 1)];
+    }
+
+    public Clustering<?> nextClustering()
+    {
+        return clusterings[datumIndex++ & (clusterings.length - 1)];
+    }
+
+    public ClusteringBound<?> nextClusteringBound()
+    {
+        return clusteringBounds[datumIndex++ & (clusteringBounds.length - 1)];
+    }
+
+    public ClusteringBoundary<?> nextClusteringBoundary()
+    {
+        return clusteringBoundaries[datumIndex++ & (clusteringBoundaries.length - 1)];
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
index 3f58af19cac7..1bd08ceee5ab 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
@@ -431,13 +431,19 @@ private void testDeletionOnIndexedSSTableASC(boolean deleteWithRange) throws Thr
         }
         flush();
 
+        // The code has to read the 1st and 3rd sstables to see that everything before the 2nd sstable is deleted, and
+        // overall has to read the 3 sstables
         executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 1", 3, row(1, 1001, "1001"));
         executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 2", 3, row(1, 1001, "1001"), row(1, 1002, "1002"));
 
         executeAndCheck("SELECT * FROM %s WHERE id=1", 3, allRows);
-        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 1000 LIMIT 1", 2, row(1, 1001, "1001"));
+
+        // The 1st and 3rd sstables have data only up to 1000, so they will be skipped
+        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 1000 LIMIT 1", 1, row(1, 1001, "1001"));
+        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 1000", 1, allRows);
+
+        // The condition makes no difference to the code, and all 3 sstables have to be read
         executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 2000 LIMIT 1", 3, row(1, 1001, "1001"));
-        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 1000", 2, allRows);
         executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 2000", 3, allRows);
     }
 
@@ -515,14 +521,30 @@ private void testDeletionOnOverlappingIndexedSSTable(boolean deleteWithRange) th
                 allRows[idx] = row(1, i, Integer.toString(i), Integer.toString(i));
         }
 
-        executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 1", 2, row(1, 1, "1", "1"));
-        executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 2", 2, row(1, 1, "1", "1"), row(1, 2, "2", null));
+        // The first 500 rows are in the first sstable (and there is no partition deletion/static row), so the 'lower
+        // bound' optimization will kick in and we'll only read the 1st sstable.
+        executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 1", 1, row(1, 1, "1", "1"));
+        executeAndCheck("SELECT * FROM %s WHERE id=1 LIMIT 2", 1, row(1, 1, "1", "1"), row(1, 2, "2", null));
 
+        // Getting everything obviously requires reading both sstables
         executeAndCheck("SELECT * FROM %s WHERE id=1", 2, allRows);
+
+        // The 'lower bound' optimization doesn't help us because while the row to fetch is in the 1st sstable, the lower
+        // bound for the 2nd sstable is 501, which is lower than 1000.
         executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 1000 LIMIT 1", 2, row(1, 1001, "1001", "1001"));
-        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 2000 LIMIT 1", 2, row(1, 1, "1", "1"));
+
+        // Somewhat similar to the previous one: the row is in the 2nd sstable in this case, but as the lower bound for
+        // the first sstable is 1, this doesn't help.
         executeAndCheck("SELECT * FROM %s WHERE id=1 AND col > 500 LIMIT 1", 2, row(1, 751, "751", "751"));
-        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 500 LIMIT 1", 2, row(1, 1, "1", "1"));
+
+        // The 'col <= ?' condition in both queries doesn't impact the read path, which can still make use of the lower
+        // bound optimization and read only the first sstable.
+        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 2000 LIMIT 1", 1, row(1, 1, "1", "1"));
+        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 500 LIMIT 1", 1, row(1, 1, "1", "1"));
+
+        // Making sure the 'lower bound' optimization also works in reverse queries (in which it's more of an 'upper
+        // bound' optimization).
+        executeAndCheck("SELECT * FROM %s WHERE id=1 AND col <= 2000 ORDER BY col DESC LIMIT 1", 1, row(1, 2000, "2000", null));
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
index 9572f28f0066..ffa44228e3bc 100644
--- a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
@@ -21,6 +21,7 @@
 package org.apache.cassandra.db;
 
 import java.io.IOException;
+import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
 import java.util.Iterator;
@@ -38,6 +39,11 @@
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.Util;
@@ -57,11 +63,6 @@
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
-import org.apache.cassandra.db.rows.Cell;
-import org.apache.cassandra.db.rows.RangeTombstoneMarker;
-import org.apache.cassandra.db.rows.Row;
-import org.apache.cassandra.db.rows.Unfiltered;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -73,9 +74,11 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.btree.BTreeSet;
+import org.mockito.Mockito;
 
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 public class SinglePartitionSliceCommandTest
@@ -231,7 +234,7 @@ private void checkForS(UnfilteredPartitionIterator pi)
         Cell<?> cell = cellIterator.next();
         Assert.assertEquals(s, cell.column());
         Assert.assertEquals(ByteBufferUtil.bytesToHex(cell.buffer()), ByteBufferUtil.bytes("s"), cell.buffer());
-        Assert.assertFalse(cellIterator.hasNext());
+        assertFalse(cellIterator.hasNext());
     }
 
     @Test
@@ -240,7 +243,7 @@ public void staticColumnsAreReturned() throws IOException
         DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
 
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, s) VALUES ('k1', 's')");
-        Assert.assertFalse(QueryProcessor.executeInternal("SELECT s FROM ks.tbl WHERE k='k1'").isEmpty());
+        assertFalse(QueryProcessor.executeInternal("SELECT s FROM ks.tbl WHERE k='k1'").isEmpty());
 
         ColumnFilter columnFilter = ColumnFilter.selection(RegularAndStaticColumns.of(s));
         ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(Slices.NONE, false);
@@ -481,7 +484,7 @@ public void toCQLStringIsSafeToCall() throws IOException
                                                             sliceFilter);
         String ret = cmd.toCQLString();
         Assert.assertNotNull(ret);
-        Assert.assertFalse(ret.isEmpty());
+        assertFalse(ret.isEmpty());
     }
 
     public static UnfilteredRowIterator getIteratorFromSinglePartition(String q)
@@ -566,6 +569,171 @@ public void sstableFiltering()
 
     }
 
+    @Test
+    public void testLowerBoundApplicableSingleColumnAsc()
+    {
+        String query = "INSERT INTO %s.%s (k, i) VALUES ('k1', %s)";
+        SSTableReader sstable = createSSTable(metadata, KEYSPACE, TABLE, query);
+        assertEquals(Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(0)),
+                                Util.clustering(metadata.comparator, BigInteger.valueOf(9))),
+                     sstable.getSSTableMetadata().coveredClustering);
+        DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        Slice slice1 = Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(3)), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice1, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice1, sstable, true));
+
+        Slice slice2 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, BigInteger.valueOf(3)));
+        assertTrue(lowerBoundApplicable(metadata, key, slice2, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice2, sstable, true));
+
+        // corner cases
+        Slice slice3 = Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(0)), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice3, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice3, sstable, true));
+
+        Slice slice4 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, BigInteger.valueOf(9)));
+        assertTrue(lowerBoundApplicable(metadata, key, slice4, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice4, sstable, true));
+    }
+
+    @Test
+    public void testLowerBoundApplicableSingleColumnDesc()
+    {
+        String TABLE_REVERSED = "tbl_reversed";
+        String createTable = String.format(
+        "CREATE TABLE %s.%s (k text, i varint, v int, primary key (k, i)) WITH CLUSTERING ORDER BY (i DESC)",
+        KEYSPACE, TABLE_REVERSED);
+        QueryProcessor.executeOnceInternal(createTable);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_REVERSED);
+        TableMetadata metadata = cfs.metadata();
+        String query = "INSERT INTO %s.%s (k, i) VALUES ('k1', %s)";
+        SSTableReader sstable = createSSTable(metadata, KEYSPACE, TABLE_REVERSED, query);
+        assertEquals(Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(9)),
+                                Util.clustering(metadata.comparator, BigInteger.valueOf(0))),
+                     sstable.getSSTableMetadata().coveredClustering);
+        DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        Slice slice1 = Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(8)), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice1, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice1, sstable, true));
+
+        Slice slice2 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, BigInteger.valueOf(8)));
+        assertTrue(lowerBoundApplicable(metadata, key, slice2, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice2, sstable, true));
+
+        // corner cases
+        Slice slice3 = Slice.make(Util.clustering(metadata.comparator, BigInteger.valueOf(9)), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice3, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice3, sstable, true));
+
+        Slice slice4 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, BigInteger.valueOf(0)));
+        assertTrue(lowerBoundApplicable(metadata, key, slice4, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice4, sstable, true));
+    }
+
+    @Test
+    public void testLowerBoundApplicableMultipleColumnsAsc()
+    {
+        String query = "INSERT INTO %s.%s (k, c1, c2) VALUES ('k1', 0, %s)";
+        SSTableReader sstable = createSSTable(CFM_SLICES, KEYSPACE, TABLE_SCLICES, query);
+        assertEquals(Slice.make(Util.clustering(CFM_SLICES.comparator, 0, 0),
+                                Util.clustering(CFM_SLICES.comparator, 0, 9)),
+                     sstable.getSSTableMetadata().coveredClustering);
+        DecoratedKey key = CFM_SLICES.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        Slice slice1 = Slice.make(Util.clustering(CFM_SLICES.comparator, 0, 3), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(CFM_SLICES, key, slice1, sstable, false));
+        assertTrue(lowerBoundApplicable(CFM_SLICES, key, slice1, sstable, true));
+
+        Slice slice2 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(CFM_SLICES.comparator, 0, 3));
+        assertTrue(lowerBoundApplicable(CFM_SLICES, key, slice2, sstable, false));
+        assertFalse(lowerBoundApplicable(CFM_SLICES, key, slice2, sstable, true));
+
+        // corner cases
+        Slice slice3 = Slice.make(Util.clustering(CFM_SLICES.comparator, 0, 0), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(CFM_SLICES, key, slice3, sstable, false));
+        assertTrue(lowerBoundApplicable(CFM_SLICES, key, slice3, sstable, true));
+
+        Slice slice4 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(CFM_SLICES.comparator, 0, 9));
+        assertTrue(lowerBoundApplicable(CFM_SLICES, key, slice4, sstable, false));
+        assertFalse(lowerBoundApplicable(CFM_SLICES, key, slice4, sstable, true));
+    }
+
+    @Test
+    public void testLowerBoundApplicableMultipleColumnsDesc()
+    {
+        String TABLE_REVERSED = "tbl_slices_reversed";
+        String createTable = String.format(
+        "CREATE TABLE %s.%s (k text, c1 int, c2 int, v int, primary key (k, c1, c2)) WITH CLUSTERING ORDER BY (c1 ASC, c2 DESC)",
+        KEYSPACE, TABLE_REVERSED);
+        QueryProcessor.executeOnceInternal(createTable);
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_REVERSED);
+        TableMetadata metadata = cfs.metadata();
+
+        String query = "INSERT INTO %s.%s (k, c1, c2) VALUES ('k1', 0, %s)";
+        SSTableReader sstable = createSSTable(metadata, KEYSPACE, TABLE_REVERSED, query);
+        assertEquals(Slice.make(Util.clustering(metadata.comparator, 0, 9),
+                                Util.clustering(metadata.comparator, 0, 0)),
+                     sstable.getSSTableMetadata().coveredClustering);
+        DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
+
+        Slice slice1 = Slice.make(Util.clustering(metadata.comparator, 0, 8), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice1, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice1, sstable, true));
+
+        Slice slice2 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, 0, 8));
+        assertTrue(lowerBoundApplicable(metadata, key, slice2, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice2, sstable, true));
+
+        // corner cases
+        Slice slice3 = Slice.make(Util.clustering(metadata.comparator, 0, 9), ClusteringBound.TOP);
+        assertFalse(lowerBoundApplicable(metadata, key, slice3, sstable, false));
+        assertTrue(lowerBoundApplicable(metadata, key, slice3, sstable, true));
+
+        Slice slice4 = Slice.make(ClusteringBound.BOTTOM, Util.clustering(metadata.comparator, 0, 0));
+        assertTrue(lowerBoundApplicable(metadata, key, slice4, sstable, false));
+        assertFalse(lowerBoundApplicable(metadata, key, slice4, sstable, true));
+    }
+
+    private SSTableReader createSSTable(TableMetadata metadata, String keyspace, String table, String query)
+    {
+        ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+        for (int i = 0; i < 10; i++)
+            QueryProcessor.executeInternal(String.format(query, keyspace, table, i));
+        cfs.forceBlockingFlush();
+        DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
+        ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, key));
+        assertEquals(1, view.sstables.size());
+        return view.sstables.get(0);
+    }
+
+    private boolean lowerBoundApplicable(TableMetadata metadata, DecoratedKey key, Slice slice, SSTableReader sstable, boolean isReversed)
+    {
+        Slices.Builder slicesBuilder = new Slices.Builder(metadata.comparator);
+        slicesBuilder.add(slice);
+        Slices slices = slicesBuilder.build();
+        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(slices, isReversed);
+
+        SinglePartitionReadCommand cmd = SinglePartitionReadCommand.create(metadata,
+                                                                           FBUtilities.nowInSeconds(),
+                                                                           ColumnFilter.all(metadata),
+                                                                           RowFilter.NONE,
+                                                                           DataLimits.NONE,
+                                                                           key,
+                                                                           filter);
+
+        try (UnfilteredRowIteratorWithLowerBound iter = new UnfilteredRowIteratorWithLowerBound(key,
+                                                                                                sstable,
+                                                                                                slices,
+                                                                                                isReversed,
+                                                                                                ColumnFilter.all(metadata),
+                                                                                                Mockito.mock(SSTableReadsListener.class)))
+        {
+            return iter.lowerBound() != null;
+        }
+    }
+
     private String toString(List<Unfiltered> unfiltereds, TableMetadata metadata)
     {
         return unfiltereds.stream().map(u -> u.toString(metadata, true)).collect(Collectors.toList()).toString();
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index 3bdb1584592a..f5a0f10668df 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
@@ -19,7 +19,6 @@
 package org.apache.cassandra.db.compaction;
 
 import java.io.File;
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -42,7 +41,6 @@
 import org.apache.cassandra.db.DeletionTime;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.RangeTombstone;
 import org.apache.cassandra.db.ReadExecutionController;
 import org.apache.cassandra.db.RowUpdateBuilder;
@@ -54,7 +52,6 @@
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.marshal.ValueAccessors;
-import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.db.partitions.FilteredPartition;
 import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.db.partitions.PartitionIterator;
@@ -63,8 +60,6 @@
 import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
@@ -82,7 +77,6 @@
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
@@ -369,8 +363,8 @@ public void testRangeTombstones()
         for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             StatsMetadata stats = sstable.getSSTableMetadata();
-            assertEquals(ByteBufferUtil.bytes("0"), stats.minClusteringValues.get(0));
-            assertEquals(ByteBufferUtil.bytes("b"), stats.maxClusteringValues.get(0));
+            assertEquals(ByteBufferUtil.bytes("0"), stats.coveredClustering.start().bufferAt(0));
+            assertEquals(ByteBufferUtil.bytes("b"), stats.coveredClustering.end().bufferAt(0));
         }
 
         assertEquals(keys, k);
diff --git a/test/unit/org/apache/cassandra/db/filter/SliceTest.java b/test/unit/org/apache/cassandra/db/filter/SliceTest.java
index 77c0ec2cbffb..baddd5af880c 100644
--- a/test/unit/org/apache/cassandra/db/filter/SliceTest.java
+++ b/test/unit/org/apache/cassandra/db/filter/SliceTest.java
@@ -33,6 +33,8 @@
 import static org.apache.cassandra.db.ClusteringPrefix.Kind.*;
 import static org.junit.Assert.*;
 
+// TODO refactor this test - clustering comparator should be tested independently from slices intersections
+// TODO each intersection test should be done in both directions as "intersects" relation is symmetric
 public class SliceTest
 {
     @Test
@@ -49,221 +51,221 @@ public void testIntersectsSingleSlice()
 
         // filter falls entirely before sstable
         Slice slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with empty start
         slice = Slice.make(makeBound(sk), makeBound(ek, 1, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for start
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for start and end
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
 
         // end of slice matches start of sstable for the first component, but not the second component
         slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for start
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for start and end
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0));
-        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 3, 0, 0))));
 
         // first two components match, but not the last
         slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 0));
-        assertFalse(slice.intersects(cc, columnNames(1, 1, 1), columnNames(3, 1, 1)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 3, 1, 1))));
 
         // all three components in slice end match the start of the sstable
         slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 1, 1), columnNames(3, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 3, 1, 1))));
 
 
         // filter falls entirely after sstable
         slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek, 4, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with empty end
         slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for end
         slice = Slice.make(makeBound(sk, 4, 0, 0), makeBound(ek, 1));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         // same case, but with missing components for start and end
         slice = Slice.make(makeBound(sk, 4, 0), makeBound(ek, 1));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
 
         // start of slice matches end of sstable for the first component, but not the second component
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 0, 0))));
 
         // start of slice matches end of sstable for the first two components, but not the last component
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 1, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 0))));
 
         // all three components in the slice start match the end of the sstable
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 2, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(0, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 1))));
 
 
         // slice covers entire sstable (with no matching edges)
         slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 2, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // same case, but with empty ends
         slice = Slice.make(makeBound(sk), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // same case, but with missing components
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 2, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // slice covers entire sstable (with matching start)
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // slice covers entire sstable (with matching end)
         slice = Slice.make(makeBound(sk, 0, 0, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // slice covers entire sstable (with matching start and end)
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
 
         // slice falls entirely within sstable (with matching start)
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // same case, but with a missing end component
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // slice falls entirely within sstable (with matching end)
         slice = Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
         // same case, but with a missing start component
         slice = Slice.make(makeBound(sk, 1, 1), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(1, 1, 1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 1, 1, 1))));
 
 
         // slice falls entirely within sstable
         slice = Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
 
         // same case, but with a missing start component
         slice = Slice.make(makeBound(sk, 1, 1), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
 
         // same case, but with a missing start and end components
         slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1, 2));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
 
         // same case, but with an equal first component and missing start and end components
         slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
 
         // slice falls entirely within sstable (slice start and end are the same)
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
 
 
         // slice starts within sstable, empty end
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing end components
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 3));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // slice starts within sstable (matching sstable start), empty end
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing end components
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 3));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // slice starts within sstable (matching sstable end), empty end
         slice = Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing end components
         slice = Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
 
         // slice ends within sstable, empty end
         slice = Slice.make(makeBound(sk), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing start components
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 1, 1));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // slice ends within sstable (matching sstable start), empty start
         slice = Slice.make(makeBound(sk), makeBound(ek, 1, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing start components
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // slice ends within sstable (matching sstable end), empty start
         slice = Slice.make(makeBound(sk), makeBound(ek, 2, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // same case, but with missing start components
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 2, 0, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 0, 0)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 0, 0))));
 
         // empty min/max column names
         slice = Slice.make(makeBound(sk), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek))));
 
         slice = Slice.make(makeBound(sk, 1), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek))));
 
         slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames()));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 2))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek, 2));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 2), makeBound(ek, 3));
-        assertFalse(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         // basic check on reversed slices
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 0, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(2, 0, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 2, 0, 0), makeBound(ek, 3, 0, 0))));
 
         slice = Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 0, 0, 0));
-        assertFalse(slice.intersects(cc, columnNames(1, 1, 0), columnNames(3, 0, 0)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 1, 0), makeBound(ek, 3, 0, 0))));
 
         slice = Slice.make(makeBound(sk, 1, 1, 1), makeBound(ek, 1, 1, 0));
-        assertTrue(slice.intersects(cc, columnNames(1, 0, 0), columnNames(2, 2, 2)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1, 0, 0), makeBound(ek, 2, 2, 2))));
     }
 
     @Test
@@ -280,32 +282,32 @@ public void testDifferentMinMaxLengths()
 
         // slice does intersect
         Slice slice = Slice.make(makeBound(sk), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(1), columnNames(1, 2)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk, 1), makeBound(ek, 1, 2))));
 
         slice = Slice.make(makeBound(sk), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 1), makeBound(ek));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 1), makeBound(ek, 1));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 1, 2, 3));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 1, 2, 3), makeBound(ek, 2));
-        assertTrue(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertTrue(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         // slice does not intersect
         slice = Slice.make(makeBound(sk, 2), makeBound(ek, 3, 4, 5));
-        assertFalse(slice.intersects(cc, columnNames(), columnNames(1)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk), makeBound(ek, 1))));
 
         slice = Slice.make(makeBound(sk, 0), makeBound(ek, 0, 1, 2));
-        assertFalse(slice.intersects(cc, columnNames(1), columnNames(1, 2)));
+        assertFalse(slice.intersects(cc, Slice.make(makeBound(sk, 1), makeBound(ek, 1, 2))));
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/db/rows/RowsTest.java b/test/unit/org/apache/cassandra/db/rows/RowsTest.java
index 13045da38254..0ab4c51a189c 100644
--- a/test/unit/org/apache/cassandra/db/rows/RowsTest.java
+++ b/test/unit/org/apache/cassandra/db/rows/RowsTest.java
@@ -201,6 +201,12 @@ public void updateHasLegacyCounterShards(boolean hasLegacyCounterShards)
         {
             this.hasLegacyCounterShards |= hasLegacyCounterShards;
         }
+
+        @Override
+        public void updatePartitionDeletion(DeletionTime dt)
+        {
+            update(dt);
+        }
     }
 
     private static long secondToTs(int now)
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
index aecddf9ca3da..33a5d7ca055c 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
@@ -216,11 +216,11 @@ public void trackMaxMinColNames() throws CharacterCodingException
         assertEquals(1, store.getLiveSSTables().size());
         for (SSTableReader sstable : store.getLiveSSTables())
         {
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minClusteringValues.get(0)), "0col100");
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxClusteringValues.get(0)), "7col149");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0)), "0col100");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0)), "7col149");
             // make sure the clustering values are minimised
-            assertTrue(sstable.getSSTableMetadata().minClusteringValues.get(0).capacity() < 50);
-            assertTrue(sstable.getSSTableMetadata().maxClusteringValues.get(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0).capacity() < 50);
         }
         String key = "row2";
 
@@ -238,11 +238,11 @@ public void trackMaxMinColNames() throws CharacterCodingException
         assertEquals(1, store.getLiveSSTables().size());
         for (SSTableReader sstable : store.getLiveSSTables())
         {
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().minClusteringValues.get(0)), "0col100");
-            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().maxClusteringValues.get(0)), "9col298");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0)), "0col100");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0)), "9col298");
             // and make sure the clustering values are still minimised after compaction
-            assertTrue(sstable.getSSTableMetadata().minClusteringValues.get(0).capacity() < 50);
-            assertTrue(sstable.getSSTableMetadata().maxClusteringValues.get(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0).capacity() < 50);
+            assertTrue(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0).capacity() < 50);
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
index 7d39c724c29e..259c6b502ef0 100644
--- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java
@@ -322,9 +322,10 @@ public void testCombinations()
 
     void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4)
     {
+        EnumSet<ClusteringPrefix.Kind> skippedKinds = EnumSet.of(ClusteringPrefix.Kind.SSTABLE_LOWER_BOUND, ClusteringPrefix.Kind.SSTABLE_UPPER_BOUND);
         for (Version v : Version.values())
-            for (ClusteringPrefix.Kind k1 : ClusteringPrefix.Kind.values())
-                for (ClusteringPrefix.Kind k2 : ClusteringPrefix.Kind.values())
+            for (ClusteringPrefix.Kind k1 : EnumSet.complementOf(skippedKinds))
+                for (ClusteringPrefix.Kind k2 : EnumSet.complementOf(skippedKinds))
                 {
                     ClusteringComparator comp = new ClusteringComparator(t1, t2);
                     ByteBuffer[] b = new ByteBuffer[2];
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
index 8e1371c59bcc..ee99960a7fca 100644
--- a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java
@@ -291,7 +291,8 @@ void assertClusteringPairConvertsSame(AbstractType t1, AbstractType t2, Object o
     <V> void assertClusteringPairConvertsSame(ValueAccessor<V> accessor, AbstractType t1, AbstractType t2, Object o1, Object o2)
     {
         boolean checkEquals = t1 != DecimalType.instance && t2 != DecimalType.instance;
-        for (ClusteringPrefix.Kind k1 : ClusteringPrefix.Kind.values())
+        EnumSet<ClusteringPrefix.Kind> skippedKinds = EnumSet.of(ClusteringPrefix.Kind.SSTABLE_LOWER_BOUND, ClusteringPrefix.Kind.SSTABLE_UPPER_BOUND);
+        for (ClusteringPrefix.Kind k1 : EnumSet.complementOf(skippedKinds))
             {
                 ClusteringComparator comp = new ClusteringComparator(t1, t2);
                 V[] b = accessor.createArray(2);

From d5b41b45747bf622700442476045e14eb7a62e06 Mon Sep 17 00:00:00 2001
From: Jakub Zytka <jakub.zytka@datastax.com>
Date: Mon, 1 Mar 2021 18:10:08 +0100
Subject: [PATCH 035/151] STAR-172: Update artifacts to be called dse-db rather
 than cassandra

(cherry picked from commit d8708d5b62db0e8eceb4744d0948557932e6a8b4)
(cherry picked from commit a98a1053bbbc988bd462c62b129659de0d230a87)
---
 README.asc                                    |  4 +-
 bin/cassandra.in.sh                           | 12 ++--
 build.xml                                     | 58 +++++++++----------
 doc/source/development/dependencies.rst       |  4 +-
 tools/bin/cassandra.in.sh                     |  8 +--
 ... to be called dse-db rather than cassandra | 44 ++++++++++++++
 6 files changed, 87 insertions(+), 43 deletions(-)
 create mode 100644 update-history/STAR-801/68-a98a1053bb STAR-172: Update artifacts to be called dse-db rather than cassandra

diff --git a/README.asc b/README.asc
index f1270a83df50..913c6ce55fef 100644
--- a/README.asc
+++ b/README.asc
@@ -22,8 +22,8 @@ and running, and demonstrate some simple reads and writes. For a more-complete g
 
 First, we'll unpack our archive:
 
-  $ tar -zxvf apache-cassandra-$VERSION.tar.gz
-  $ cd apache-cassandra-$VERSION
+  $ tar -zxvf dse-db-$VERSION.tar.gz
+  $ cd dse-db-$VERSION
 
 After that we start the server. Running the startup script with the -f argument will cause
 Cassandra to remain in the foreground and log to standard out; it can be stopped with ctrl-C.
diff --git a/bin/cassandra.in.sh b/bin/cassandra.in.sh
index 58b4dd2896b2..c3ded52b6921 100644
--- a/bin/cassandra.in.sh
+++ b/bin/cassandra.in.sh
@@ -30,10 +30,10 @@ CLASSPATH="$CASSANDRA_CONF"
 # compiled classes. NOTE: This isn't needed by the startup script,
 # it's just used here in constructing the classpath.
 if [ -d $CASSANDRA_HOME/build ] ; then
-    #cassandra_bin="$CASSANDRA_HOME/build/classes/main"
-    cassandra_bin=`ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
+    #dse_db_bin="$CASSANDRA_HOME/build/classes/main"
+    dse_db_bin=`ls -1 $CASSANDRA_HOME/build/dse-db*.jar`
 
-    CLASSPATH="$CLASSPATH:$cassandra_bin"
+    CLASSPATH="$CLASSPATH:$dse_db_bin"
 fi
 
 # the default location for commitlogs, sstables, and saved caches
@@ -112,16 +112,16 @@ JAVA_VERSION=11
 if [ "$JVM_VERSION" = "1.8.0" ]  ; then
     JVM_PATCH_VERSION=${jvmver#*_}
     if [ "$JVM_VERSION" \< "1.8" ] || [ "$JVM_VERSION" \> "1.8.2" ] ; then
-        echo "Cassandra 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer). Java $JVM_VERSION is not supported."
+        echo "DSE DB 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer). Java $JVM_VERSION is not supported."
         exit 1;
     fi
     if [ "$JVM_PATCH_VERSION" -lt 151 ] ; then
-        echo "Cassandra 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer). Java 8 update $JVM_PATCH_VERSION is not supported."
+        echo "DSE DB 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer). Java 8 update $JVM_PATCH_VERSION is not supported."
         exit 1;
     fi
     JAVA_VERSION=8
 elif [ "$JVM_VERSION" \< "11" ] ; then
-    echo "Cassandra 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer)."
+    echo "DSE DB 4.0 requires either Java 8 (update 151 or newer) or Java 11 (or newer)."
     exit 1;
 fi
 
diff --git a/build.xml b/build.xml
index be54496ed643..223b438bc7eb 100644
--- a/build.xml
+++ b/build.xml
@@ -14,7 +14,7 @@
   See the License for the specific language governing permissions and
   limitations under the License.
 -->
-<project basedir="." default="jar" name="apache-cassandra"
+<project basedir="." default="jar" name="dse-db"
          xmlns:artifact="antlib:org.apache.maven.artifact.ant"
          xmlns:if="ant:if"
          xmlns:unless="ant:unless">
@@ -25,9 +25,9 @@
 
     <!-- default version and SCM information -->
     <property name="base.version" value="4.0-rc2"/>
-    <property name="scm.connection" value="scm:https://gitbox.apache.org/repos/asf/cassandra.git"/>
-    <property name="scm.developerConnection" value="scm:https://gitbox.apache.org/repos/asf/cassandra.git"/>
-    <property name="scm.url" value="https://gitbox.apache.org/repos/asf?p=cassandra.git;a=tree"/>
+    <property name="scm.connection" value="scm:git:ssh://git@github.com:datastax/cassandra.git"/>
+    <property name="scm.developerConnection" value="scm:git:ssh://git@github.com:datastax/cassandra.git"/>
+    <property name="scm.url" value="scm:git:ssh://git@github.com:datastax/cassandra.git"/>
 
     <!-- directory details -->
     <property name="basedir" value="."/>
@@ -84,14 +84,14 @@
               value="https://repo.maven.apache.org/maven2/org/apache/maven/maven-ant-tasks" />
     <!-- details of how and which Maven repository we publish to -->
     <property name="maven.version" value="3.0.3" />
-    <condition property="maven-repository-url" value="https://repository.apache.org/service/local/staging/deploy/maven2">
+    <condition property="maven-repository-url" value="https://repo.sjc.dsinternal.org/artifactory/datastax-releases-local">
       <isset property="release"/>
     </condition>
-    <condition property="maven-repository-id" value="apache.releases.https">
+    <condition property="maven-repository-id" value="datastax.releases.https">
       <isset property="release"/>
     </condition>
-    <property name="maven-repository-url" value="https://repository.apache.org/content/repositories/snapshots"/>
-    <property name="maven-repository-id" value="apache.snapshots.https"/>
+    <property name="maven-repository-url" value="https://repo.sjc.dsinternal.org/artifactory/datastax-snapshots-local"/>
+    <property name="maven-repository-id" value="datastax.snapshots.https"/>
 
     <property name="test.timeout" value="240000" />
     <property name="test.memory.timeout" value="480000" />
@@ -284,7 +284,7 @@
     <sequential>
       <javadoc destdir="@{destdir}" author="true" version="true" use="true"
         windowtitle="${ant.project.name} API" classpathref="cassandra.classpath"
-        bottom="Copyright &amp;copy; 2009-2021 The Apache Software Foundation"
+        bottom="Copyright &amp;copy; 2009-2021 The Apache Software Foundation; All changes to the original code are Copyright DataStax, Inc."
         useexternalfile="yes" encoding="UTF-8" failonerror="false"
         maxmemory="256m" additionalparam="${jdk11-javadoc-exports}">
         <filesets/>
@@ -472,14 +472,14 @@
             description="Define dependencies and dependency versions">
       <!-- The parent pom defines the versions of all dependencies -->
       <artifact:pom id="parent-pom"
-                    groupId="org.apache.cassandra"
-                    artifactId="cassandra-parent"
+                    groupId="com.datastax.dse"
+                    artifactId="dse-db-parent"
                     packaging="pom"
                     version="${version}"
-                    url="https://cassandra.apache.org"
-                    name="Apache Cassandra"
+                    url="https://datastax.com"
+                    name="Datastax DB"
                     inceptionYear="2009"
-                    description="The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.">
+                    description="The Apache Cassandra Project develops a highly scalable second-generation distributed database. DataStax, Inc. provides additional improvements on top of Apache Cassandra">
 
         <!-- Inherit from the ASF template pom file, ref http://maven.apache.org/pom/asf/ -->
         <parent groupId="org.apache" artifactId="apache" version="22"/>
@@ -563,7 +563,7 @@
 
           <dependency groupId="org.apache.ant" artifactId="ant-junit" version="1.9.7" scope="test"/>
 
-          <dependency groupId="org.apache.cassandra" artifactId="cassandra-all" version="${version}" />
+          <dependency groupId="com.datastax.dse" artifactId="dse-db-all" version="${version}" />
           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-core" version="3.1.5" />
           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-jvm" version="3.1.5" />
           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-logback" version="3.1.5" />
@@ -708,8 +708,8 @@
       <!-- each dependency set then defines the subset of the dependencies for that dependency set -->
       <artifact:pom id="build-deps-pom"
                     artifactId="cassandra-build-deps">
-        <parent groupId="org.apache.cassandra"
-                artifactId="cassandra-parent"
+        <parent groupId="com.datastax.dse"
+                artifactId="dse-db-parent"
                 version="${version}"
                 relativePath="${final.name}-parent.pom"/>
         <dependency groupId="junit" artifactId="junit"/>
@@ -734,11 +734,11 @@
 
       <!-- now the pom's for artifacts being deployed to Maven Central -->
       <artifact:pom id="all-pom"
-                    artifactId="cassandra-all"
-                    url="https://cassandra.apache.org"
-                    name="Apache Cassandra">
-        <parent groupId="org.apache.cassandra"
-                artifactId="cassandra-parent"
+                    artifactId="dse-db-all"
+                    url="https://datastax.com"
+                    name="DataStax DB">
+        <parent groupId="com.datastax.dse"
+                artifactId="dse-db-parent"
                 version="${version}"
                 relativePath="${final.name}-parent.pom"/>
         <scm connection="${scm.connection}" developerConnection="${scm.developerConnection}" url="${scm.url}"/>
@@ -1006,7 +1006,7 @@
     -->
     <target name="_main-jar"
             depends="build"
-            description="Assemble Cassandra JAR files">
+            description="Assemble DSE DB JAR files">
       <mkdir dir="${build.classes.main}/META-INF" />
       <copy file="LICENSE.txt"
             tofile="${build.classes.main}/META-INF/LICENSE.txt"/>
@@ -1020,16 +1020,16 @@
         <manifest>
         <!-- <section name="org/apache/cassandra/infrastructure"> -->
           <attribute name="Multi-Release" value="true"/>
-          <attribute name="Implementation-Title" value="Cassandra"/>
+          <attribute name="Implementation-Title" value="DSE DB"/>
           <attribute name="Implementation-Version" value="${version}"/>
-          <attribute name="Implementation-Vendor" value="Apache"/>
+          <attribute name="Implementation-Vendor" value="DataStax"/>
         <!-- </section> -->
         </manifest>
       </jar>
     </target>
     <target name="jar"
             depends="_main-jar,build-test,stress-build,fqltool-build,write-poms"
-            description="Assemble Cassandra JAR files">
+            description="Assemble DSE DB JAR files">
       <!-- Stress jar -->
       <manifest file="${stress.manifest}">
         <attribute name="Built-By" value="Pavel Yaskevich"/>
@@ -1055,7 +1055,7 @@
     <!--
         The javadoc-jar target makes cassandra-javadoc.jar output required for publishing to Maven central repository.
     -->
-    <target name="javadoc-jar" depends="javadoc" unless="no-javadoc" description="Assemble Cassandra JavaDoc JAR file">
+    <target name="javadoc-jar" depends="javadoc" unless="no-javadoc" description="Assemble DSE DB JavaDoc JAR file">
       <jar jarfile="${build.dir}/${final.name}-javadoc.jar" basedir="${javadoc.dir}"/>
       <!-- javadoc task always rebuilds so might as well remove the generated docs to prevent
            being pulled into the distribution by accident -->
@@ -1065,7 +1065,7 @@
     <!--
         The sources-jar target makes cassandra-sources.jar output required for publishing to Maven central repository.
     -->
-    <target name="sources-jar" depends="init" description="Assemble Cassandra Sources JAR file">
+    <target name="sources-jar" depends="init" description="Assemble DSE DB Sources JAR file">
       <jar jarfile="${build.dir}/${final.name}-sources.jar">
         <fileset dir="${build.src.java}" defaultexcludes="yes">
           <include name="org/apache/**/*.java"/>
@@ -1133,7 +1133,7 @@
 
     <!-- creates release tarballs -->
     <target name="artifacts" depends="_artifacts-init,gen-doc,sources-jar,javadoc-jar"
-            description="Create Cassandra release artifacts">
+            description="Create DSE DB release artifacts">
       <tar compression="gzip" longfile="gnu"
         destfile="${build.dir}/${final.name}-bin.tar.gz">
 
diff --git a/doc/source/development/dependencies.rst b/doc/source/development/dependencies.rst
index 6dd1cc46bc8f..ec7b56e53a35 100644
--- a/doc/source/development/dependencies.rst
+++ b/doc/source/development/dependencies.rst
@@ -47,7 +47,7 @@ Troubleshooting and conflict resolution
 Here are some useful commands that may help you out resolving conflicts.
 
 * ``ant realclean`` - gets rid of the build directory, including build artifacts.
-* ``mvn dependency:tree -f build/apache-cassandra-*-SNAPSHOT.pom -Dverbose -Dincludes=org.slf4j`` - shows transitive dependency tree for artifacts, e.g. org.slf4j. In case the command above fails due to a missing parent pom file, try running ``ant mvn-install``.
-* ``rm ~/.m2/repository/org/apache/cassandra/apache-cassandra/`` - removes cached local Cassandra maven artifacts
+* ``mvn dependency:tree -f build/dse-db-*-SNAPSHOT.pom -Dverbose -Dincludes=org.slf4j`` - shows transitive dependency tree for artifacts, e.g. org.slf4j. In case the command above fails due to a missing parent pom file, try running ``ant mvn-install``.
+* ``rm -r ~/.m2/repository/com/datastax/dse/dse-db/`` - removes cached local Cassandra maven artifacts
 
 
diff --git a/tools/bin/cassandra.in.sh b/tools/bin/cassandra.in.sh
index bf1ecc414c05..5f74ede17f1b 100644
--- a/tools/bin/cassandra.in.sh
+++ b/tools/bin/cassandra.in.sh
@@ -30,10 +30,10 @@ CLASSPATH="$CASSANDRA_CONF"
 # compiled classes. NOTE: This isn't needed by the startup script,
 # it's just used here in constructing the classpath.
 if [ -d $CASSANDRA_HOME/build ] ; then
-    #cassandra_bin="$CASSANDRA_HOME/build/classes/main"
-    cassandra_bin=`ls -1 $CASSANDRA_HOME/build/apache-cassandra*.jar`
-    cassandra_bin="$cassandra_bin:$CASSANDRA_HOME/build/classes/stress:$CASSANDRA_HOME/build/classes/fqltool"
-    CLASSPATH="$CLASSPATH:$cassandra_bin"
+    #dse_db_bin="$CASSANDRA_HOME/build/classes/main"
+    dse_db_bin=`ls -1 $CASSANDRA_HOME/build/dse-db*.jar`
+    dse_db_bin="$dse_db_bin:$CASSANDRA_HOME/build/classes/stress:$CASSANDRA_HOME/build/classes/fqltool"
+    CLASSPATH="$CLASSPATH:$dse_db_bin"
 fi
 
 # the default location for commitlogs, sstables, and saved caches
diff --git a/update-history/STAR-801/68-a98a1053bb STAR-172: Update artifacts to be called dse-db rather than cassandra b/update-history/STAR-801/68-a98a1053bb STAR-172: Update artifacts to be called dse-db rather than cassandra
new file mode 100644
index 000000000000..854e521f4c8b
--- /dev/null
+++ b/update-history/STAR-801/68-a98a1053bb STAR-172: Update artifacts to be called dse-db rather than cassandra	
@@ -0,0 +1,44 @@
+--- a/build.xml
++++ b/build.xml
+@@ -561,13 +561,9 @@
+           <dependency groupId="org.openjdk.jmh" artifactId="jmh-core" version="1.21" scope="test"/>
+           <dependency groupId="org.openjdk.jmh" artifactId="jmh-generator-annprocess" version="1.21" scope="test"/>
+ 
+-<<<<<<<
+-          <dependency groupId="com.datastax.dse" artifactId="dse-db-all" version="${version}" />
+-=======
+           <dependency groupId="org.apache.ant" artifactId="ant-junit" version="1.9.7" scope="test"/>
+ 
+-          <dependency groupId="org.apache.cassandra" artifactId="cassandra-all" version="${version}" />
+->>>>>>>
++          <dependency groupId="com.datastax.dse" artifactId="dse-db-all" version="${version}" />
+           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-core" version="3.1.5" />
+           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-jvm" version="3.1.5" />
+           <dependency groupId="io.dropwizard.metrics" artifactId="metrics-logback" version="3.1.5" />
+@@ -728,26 +724,12 @@
+         <dependency groupId="org.apache.ant" artifactId="ant-junit"/>
+         <!-- adding this dependency is necessary for assertj. When updating assertj, need to also update the version of
+              this that the new assertj's `assertj-parent-pom` depends on. -->
+-<<<<<<<
+-          <dependency groupId="org.junit" artifactId="junit-bom" type="pom" scope="test"/>
+-          <dependency groupId="org.assertj" artifactId="assertj-core" scope="test"/>
+-          <dependency groupId="org.awaitility" artifactId="awaitility" scope="test"/>
+-          <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
+-          <!-- coverage debs -->
+-          <dependency groupId="org.jacoco" artifactId="org.jacoco.agent" scope="test"/>
+-          <dependency groupId="org.jacoco" artifactId="org.jacoco.ant" scope="test"/>
+-          <dependency groupId="org.jboss.byteman" artifactId="byteman-install" scope="test"/>
+-          <dependency groupId="org.jboss.byteman" artifactId="byteman" scope="test"/>
+-          <dependency groupId="org.jboss.byteman" artifactId="byteman-submit" scope="test"/>
+-          <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit" scope="test"/>
+-=======
+         <dependency groupId="org.junit" artifactId="junit-bom" type="pom"/>
+         <dependency groupId="org.awaitility" artifactId="awaitility"/>
+         <dependency groupId="org.hamcrest" artifactId="hamcrest"/>
+         <!-- coverage debs -->
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.agent"/>
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
+->>>>>>>
+       </artifact:pom>
+ 
+       <!-- now the pom's for artifacts being deployed to Maven Central -->

From 02613e3831e4ee30ef7572bf822461d7478fcc3c Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Wed, 3 Mar 2021 13:15:04 +0100
Subject: [PATCH 036/151] STAR-75: On-disk Trie support

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
Co-authored-by: Jacek Lewandowski <jacek.lewandowski@datastax.com>
Co-authored-by: Mike Adamson <madamson@datastax.com>
(cherry picked from commit 0a442fefc891c87e2f0947433361345981055bc9)
(cherry picked from commit d5d5fe3c179ab3dc82d8d157d3cc0f532ffa6b1f)
---
 .../apache/cassandra/db/tries/TrieWalker.java |   2 +-
 .../compress/CompressedSequentialWriter.java  |  47 +-
 .../IncrementalDeepTrieWriterPageAware.java   | 323 ++++++
 .../io/tries/IncrementalTrieWriter.java       |  79 ++
 .../io/tries/IncrementalTrieWriterBase.java   | 257 +++++
 .../tries/IncrementalTrieWriterPageAware.java | 530 ++++++++++
 .../io/tries/IncrementalTrieWriterSimple.java | 133 +++
 .../cassandra/io/tries/SerializationNode.java |  55 +
 .../apache/cassandra/io/tries/TrieNode.java   | 968 ++++++++++++++++++
 .../cassandra/io/tries/TrieSerializer.java    |  28 +
 .../cassandra/io/tries/ValueIterator.java     | 220 ++++
 .../org/apache/cassandra/io/tries/Walker.java | 377 +++++++
 .../cassandra/io/util/DataOutputPlus.java     |  37 +
 .../cassandra/io/util/SequentialWriter.java   |  41 +-
 .../io/util/TailOverridingRebufferer.java     |  65 ++
 .../cassandra/io/util/WrappingRebufferer.java | 148 +++
 .../org/apache/cassandra/utils/PageAware.java |  99 ++
 .../org/apache/cassandra/utils/SizedInts.java | 108 ++
 .../utils/concurrent/LightweightRecycler.java |  98 ++
 .../utils/concurrent/ThreadLocals.java        |  62 ++
 .../io/tries/AbstractTrieTestBase.java        | 189 ++++
 .../cassandra/io/tries/TrieBuilderTest.java   | 105 ++
 .../cassandra/io/tries/TrieNodeTest.java      | 349 +++++++
 .../apache/cassandra/io/tries/WalkerTest.java | 268 +++++
 .../io/util/TailOverridingRebuffererTest.java | 128 +++
 .../io/util/WrappingRebuffererTest.java       | 151 +++
 .../apache/cassandra/utils/PageAwareTest.java | 150 +++
 .../apache/cassandra/utils/SizedIntsTest.java | 125 +++
 28 files changed, 5133 insertions(+), 9 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/io/tries/IncrementalDeepTrieWriterPageAware.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/IncrementalTrieWriter.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterBase.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterPageAware.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterSimple.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/SerializationNode.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/TrieNode.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/TrieSerializer.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/ValueIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/Walker.java
 create mode 100644 src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
 create mode 100644 src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
 create mode 100644 src/java/org/apache/cassandra/utils/PageAware.java
 create mode 100644 src/java/org/apache/cassandra/utils/SizedInts.java
 create mode 100644 src/java/org/apache/cassandra/utils/concurrent/LightweightRecycler.java
 create mode 100644 src/java/org/apache/cassandra/utils/concurrent/ThreadLocals.java
 create mode 100644 test/unit/org/apache/cassandra/io/tries/AbstractTrieTestBase.java
 create mode 100644 test/unit/org/apache/cassandra/io/tries/TrieBuilderTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/tries/TrieNodeTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/tries/WalkerTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/PageAwareTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/SizedIntsTest.java

diff --git a/src/java/org/apache/cassandra/db/tries/TrieWalker.java b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
index fb76c9d5ba0e..a17e9907bd24 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieWalker.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
@@ -22,7 +22,7 @@
  * CompletableFuture.
  * See TrieDumper for sample usage.
  */
-interface TrieWalker<T, V>
+public interface TrieWalker<T, V>
 {
     /**
      * Called when entering a node of the trie.
diff --git a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
index 219082482a19..6496824b8f37 100644
--- a/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/compress/CompressedSequentialWriter.java
@@ -81,12 +81,12 @@ public CompressedSequentialWriter(File file,
                                       MetadataCollector sstableMetadataCollector)
     {
         super(file, SequentialWriterOption.newBuilder()
-                            .bufferSize(option.bufferSize())
-                            .bufferType(option.bufferType())
-                            .bufferSize(parameters.chunkLength())
-                            .bufferType(parameters.getSstableCompressor().preferredBufferType())
-                            .finishOnClose(option.finishOnClose())
-                            .build());
+                                          .bufferSize(option.bufferSize())
+                                          .bufferType(option.bufferType())
+                                          .bufferSize(parameters.chunkLength())
+                                          .bufferType(parameters.getSstableCompressor().preferredBufferType())
+                                          .finishOnClose(option.finishOnClose())
+                                          .build());
         this.compressor = parameters.getSstableCompressor();
         this.digestFile = Optional.ofNullable(digestFile);
 
@@ -333,6 +333,41 @@ private void seekToChunkStart()
         }
     }
 
+    // Page management using chunk boundaries
+
+    @Override
+    public int maxBytesInPage()
+    {
+        return buffer.capacity();
+    }
+
+    @Override
+    public void padToPageBoundary() throws IOException
+    {
+        if (buffer.position() == 0)
+            return;
+
+        int padLength = buffer.remaining();
+
+        // Flush only what has been buffered so far
+        doFlush(0);
+        // But pretend we had a whole chunk: advance both offsets by the unfilled remainder of the buffer
+        bufferOffset += padLength;
+        lastFlushOffset += padLength;
+    }
+
+    @Override
+    public int bytesLeftInPage()
+    {
+        return buffer.remaining();
+    }
+
+    @Override
+    public long paddedPosition()
+    {
+        return position() + (buffer.position() == 0 ? 0 : buffer.remaining());
+    }
+
     protected class TransactionalProxy extends SequentialWriter.TransactionalProxy
     {
         @Override
diff --git a/src/java/org/apache/cassandra/io/tries/IncrementalDeepTrieWriterPageAware.java b/src/java/org/apache/cassandra/io/tries/IncrementalDeepTrieWriterPageAware.java
new file mode 100644
index 000000000000..4e0a8a715cad
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/IncrementalDeepTrieWriterPageAware.java
@@ -0,0 +1,323 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * This class is a variant of {@link IncrementalTrieWriterPageAware} which is able to build even very deep
+ * tries. While the parent class uses recursion for clarity, it may end up with stack overflow for tries with
+ * very long keys. This implementation can switch processing from stack to heap at a certain depth (provided
+ * as a constructor param).
+ */
+public class IncrementalDeepTrieWriterPageAware<VALUE> extends IncrementalTrieWriterPageAware<VALUE>
+{
+    private final int maxRecursionDepth;
+
+    public IncrementalDeepTrieWriterPageAware(TrieSerializer<VALUE, ? super DataOutputPlus> trieSerializer, DataOutputPlus dest, int maxRecursionDepth)
+    {
+        super(trieSerializer, dest);
+        this.maxRecursionDepth = maxRecursionDepth;
+    }
+
+    public IncrementalDeepTrieWriterPageAware(TrieSerializer<VALUE, ? super DataOutputPlus> trieSerializer, DataOutputPlus dest)
+    {
+        this(trieSerializer, dest, 64);
+    }
+
+    @Override
+    protected int recalcTotalSize(Node<VALUE> node, long nodePosition) throws IOException
+    {
+        return recalcTotalSizeRecursiveOnStack(node, nodePosition, 0);
+    }
+
+    private int recalcTotalSizeRecursiveOnStack(Node<VALUE> node, long nodePosition, int depth) throws IOException
+    {
+        if (node.hasOutOfPageInBranch)
+        {
+            int sz = 0;
+            for (Node<VALUE> child : node.children)
+            {
+                if (depth < maxRecursionDepth)
+                    sz += recalcTotalSizeRecursiveOnStack(child, nodePosition + sz, depth + 1);
+                else
+                    sz += recalcTotalSizeRecursiveOnHeap(child, nodePosition + sz);
+            }
+            node.branchSize = sz;
+        }
+
+        // The sizing below will use the branch size calculated above. Since that can change on out-of-page in branch,
+        // we need to recalculate the size if either flag is set.
+        if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+            node.nodeSize = serializer.sizeofNode(node, nodePosition + node.branchSize);
+
+        return node.branchSize + node.nodeSize;
+    }
+
+    @Override
+    protected long write(Node<VALUE> node) throws IOException
+    {
+        return writeRecursiveOnStack(node, 0);
+    }
+
+    private long writeRecursiveOnStack(Node<VALUE> node, int depth) throws IOException
+    {
+        long nodePosition = dest.position();
+        for (Node<VALUE> child : node.children)
+            if (child.filePos == -1)
+            {
+                if (depth < maxRecursionDepth)
+                    child.filePos = writeRecursiveOnStack(child, depth + 1);
+                else
+                    child.filePos = writeRecursiveOnHeap(child);
+            }
+
+        nodePosition += node.branchSize;
+        assert dest.position() == nodePosition
+                : "Expected node position to be " + nodePosition + " but got " + dest.position() + " after writing children.\n" + dumpNode(node, dest.position());
+
+        serializer.write(dest, node, nodePosition);
+
+        assert dest.position() == nodePosition + node.nodeSize
+               || dest.paddedPosition() == dest.position() // For PartitionIndexTest.testPointerGrowth where position may jump on page boundaries.
+                : "Expected node position to be " + (nodePosition + node.nodeSize) + " but got " + dest.position() + " after writing node, nodeSize " + node.nodeSize + ".\n" + dumpNode(node, nodePosition);
+        return nodePosition;
+    }
+
+    @Override
+    protected long writePartial(Node<VALUE> node, DataOutputPlus dest, long baseOffset) throws IOException
+    {
+        return writePartialRecursiveOnStack(node, dest, baseOffset, 0);
+    }
+
+    private long writePartialRecursiveOnStack(Node<VALUE> node, DataOutputPlus dest, long baseOffset, int depth) throws IOException
+    {
+        long startPosition = dest.position() + baseOffset;
+
+        List<Node<VALUE>> childrenToClear = new ArrayList<>();
+        for (Node<VALUE> child : node.children)
+        {
+            if (child.filePos == -1)
+            {
+                childrenToClear.add(child);
+                if (depth < maxRecursionDepth)
+                    child.filePos = writePartialRecursiveOnStack(child, dest, baseOffset, depth + 1);
+                else
+                    child.filePos = writePartialRecursiveOnHeap(child, dest, baseOffset);
+            }
+        }
+
+        long nodePosition = dest.position() + baseOffset;
+
+        if (node.hasOutOfPageInBranch)
+        {
+            // Update the branch size with the size of what we have just written. This may be used by the node's
+            // maxPositionDelta and it's a better approximation for later fitting calculations.
+            node.branchSize = (int) (nodePosition - startPosition);
+        }
+
+        serializer.write(dest, node, nodePosition);
+
+        if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+        {
+            // Update the node size with what we have just seen. It's a better approximation for later fitting
+            // calculations.
+            long endPosition = dest.position() + baseOffset;
+            node.nodeSize = (int) (endPosition - nodePosition);
+        }
+
+        for (Node<VALUE> child : childrenToClear)
+            child.filePos = -1;
+        return nodePosition;
+    }
+
+    private int recalcTotalSizeRecursiveOnHeap(Node<VALUE> node, long nodePosition) throws IOException
+    {
+        if (node.hasOutOfPageInBranch)
+            new RecalcTotalSizeRecursion(node, null, nodePosition).process();
+
+        if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+            node.nodeSize = serializer.sizeofNode(node, nodePosition + node.branchSize);
+
+        return node.branchSize + node.nodeSize;
+    }
+
+    private long writeRecursiveOnHeap(Node<VALUE> node) throws IOException
+    {
+        return new WriteRecursion(node, null).process().node.filePos;
+    }
+
+    private long writePartialRecursiveOnHeap(Node<VALUE> node, DataOutputPlus dest, long baseOffset) throws IOException
+    {
+        new WritePartialRecursion(node, dest, baseOffset).process();
+        long pos = node.filePos;
+        node.filePos = -1;
+        return pos;
+    }
+
+    class RecalcTotalSizeRecursion extends Recursion<Node<VALUE>>
+    {
+        final long nodePosition;
+        int sz;
+
+        RecalcTotalSizeRecursion(Node<VALUE> node, Recursion<Node<VALUE>> parent, long nodePosition)
+        {
+            super(node, node.children.iterator(), parent);
+            sz = 0;
+            this.nodePosition = nodePosition;
+        }
+
+        @Override
+        Recursion<Node<VALUE>> makeChild(Node<VALUE> child)
+        {
+            if (child.hasOutOfPageInBranch)
+                return new RecalcTotalSizeRecursion(child, this, nodePosition + sz);
+            else
+                return null;
+        }
+
+        @Override
+        void complete()
+        {
+            node.branchSize = sz;
+        }
+
+        @Override
+        void completeChild(Node<VALUE> child)
+        {
+            // This will be called for nodes that were recursively processed as well as the ones that weren't.
+
+            // The sizing below will use the branch size calculated above. Since that can change on out-of-page in branch,
+            // we need to recalculate the size if either flag is set.
+            if (child.hasOutOfPageChildren || child.hasOutOfPageInBranch)
+            {
+                long childPosition = this.nodePosition + sz;
+                child.nodeSize = serializer.sizeofNode(child, childPosition + child.branchSize);
+            }
+
+            sz += child.branchSize + child.nodeSize;
+        }
+    }
+
+    class WriteRecursion extends Recursion<Node<VALUE>>
+    {
+        long nodePosition;
+
+        WriteRecursion(Node<VALUE> node, Recursion<Node<VALUE>> parent)
+        {
+            super(node, node.children.iterator(), parent);
+            nodePosition = dest.position();
+        }
+
+        @Override
+        Recursion<Node<VALUE>> makeChild(Node<VALUE> child)
+        {
+            if (child.filePos == -1)
+                return new WriteRecursion(child, this);
+            else
+                return null;
+        }
+
+        @Override
+        void complete() throws IOException
+        {
+            nodePosition = nodePosition + node.branchSize;
+            assert dest.position() == nodePosition
+                    : "Expected node position to be " + nodePosition + " but got " + dest.position() + " after writing children.\n" + dumpNode(node, dest.position());
+
+            serializer.write(dest, node, nodePosition);
+
+            assert dest.position() == nodePosition + node.nodeSize
+                   || dest.paddedPosition() == dest.position() // For PartitionIndexTest.testPointerGrowth where position may jump on page boundaries.
+                    : "Expected node position to be " + (nodePosition + node.nodeSize) + " but got " + dest.position() + " after writing node, nodeSize " + node.nodeSize + ".\n" + dumpNode(node, nodePosition);
+
+            node.filePos = nodePosition;
+        }
+    }
+
+    class WritePartialRecursion extends Recursion<Node<VALUE>>
+    {
+        final DataOutputPlus dest;
+        final long baseOffset;
+        final long startPosition;
+        final List<Node<VALUE>> childrenToClear;
+
+        WritePartialRecursion(Node<VALUE> node, WritePartialRecursion parent)
+        {
+            super(node, node.children.iterator(), parent);
+            this.dest = parent.dest;
+            this.baseOffset = parent.baseOffset;
+            this.startPosition = dest.position() + baseOffset;
+            childrenToClear = new ArrayList<>();
+        }
+
+        WritePartialRecursion(Node<VALUE> node, DataOutputPlus dest, long baseOffset)
+        {
+            super(node, node.children.iterator(), null);
+            this.dest = dest;
+            this.baseOffset = baseOffset;
+            this.startPosition = dest.position() + baseOffset;
+            childrenToClear = new ArrayList<>();
+        }
+
+        @Override
+        Recursion<Node<VALUE>> makeChild(Node<VALUE> child)
+        {
+            if (child.filePos == -1)
+            {
+                childrenToClear.add(child);
+                return new WritePartialRecursion(child, this);
+            }
+            else
+                return null;
+        }
+
+        @Override
+        void complete() throws IOException
+        {
+            long nodePosition = dest.position() + baseOffset;
+
+            if (node.hasOutOfPageInBranch)
+            {
+                // Update the branch size with the size of what we have just written. This may be used by the node's
+                // maxPositionDelta and it's a better approximation for later fitting calculations.
+                node.branchSize = (int) (nodePosition - startPosition);
+            }
+
+            serializer.write(dest, node, nodePosition);
+
+            if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+            {
+                // Update the node size with what we have just seen. It's a better approximation for later fitting
+                // calculations.
+                long endPosition = dest.position() + baseOffset;
+                node.nodeSize = (int) (endPosition - nodePosition);
+            }
+
+            for (Node<VALUE> child : childrenToClear)
+                child.filePos = -1;
+
+            node.filePos = nodePosition;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriter.java b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriter.java
new file mode 100644
index 000000000000..15b69d802ff6
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Common interface for incremental trie writers. Incremental writers take sorted input to construct a trie file while
+ * buffering only a limited amount of data.
+ * The writing itself is done by some node serializer passed on construction time.
+ */
+public interface IncrementalTrieWriter<VALUE> extends AutoCloseable
+{
+    /**
+     * Add an entry to the trie with the associated value. Keys are expected to be unique and in sorted order.
+     */
+    void add(ByteComparable next, VALUE value) throws IOException;
+
+    /**
+     * Return the number of added entries.
+     */
+    long count();
+
+    /**
+     * Complete the process and return the position in the file of the root node.
+     */
+    long complete() throws IOException;
+
+    /** Clear the accumulated state so that the writer can start building a new trie. */
+    void reset();
+
+    void close();
+
+    /**
+     * Make a temporary in-memory representation of the unwritten nodes that covers everything added to the trie until
+     * this point. The object returned represents a "tail" for the file that needs to be attached at the "cutoff" point
+     * to the file (using e.g. TailOverridingRebufferer).
+     */
+    PartialTail makePartialRoot() throws IOException;
+
+    interface PartialTail
+    {
+        /** Position of the root of the partial representation. Resides in the tail buffer. */
+        long root();
+        /** Number of keys written */
+        long count();
+        /** Cutoff point. Positions lower than this are to be read from the file; higher ones from the tail buffer. */
+        long cutoff();
+        /** Buffer containing in-memory representation of the tail. */
+        ByteBuffer tail();
+    }
+
+    /**
+     * Construct a suitable trie writer.
+     */
+    static <VALUE> IncrementalTrieWriter<VALUE> open(TrieSerializer<VALUE, ? super DataOutputPlus> trieSerializer, DataOutputPlus dest)
+    {
+        return new IncrementalDeepTrieWriterPageAware<>(trieSerializer, dest);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterBase.java b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterBase.java
new file mode 100644
index 000000000000..e22ff6850c10
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterBase.java
@@ -0,0 +1,257 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Deque;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.concurrent.LightweightRecycler;
+import org.apache.cassandra.utils.concurrent.ThreadLocals;
+
+/**
+ * Helper base class for incremental trie builders.
+ */
+public abstract class IncrementalTrieWriterBase<VALUE, DEST, NODE extends IncrementalTrieWriterBase.BaseNode<VALUE, NODE>>
+implements IncrementalTrieWriter<VALUE>
+{
+    protected final Deque<NODE> stack = new ArrayDeque<>();
+    protected final TrieSerializer<VALUE, ? super DEST> serializer;
+    protected final DEST dest;
+    protected ByteComparable prev = null;
+    long count = 0;
+
+    protected IncrementalTrieWriterBase(TrieSerializer<VALUE, ? super DEST> serializer, DEST dest, NODE root)
+    {
+        this.serializer = serializer;
+        this.dest = dest;
+        this.stack.addLast(root);
+    }
+
+    protected void reset(NODE root)
+    {
+        this.prev = null;
+        this.count = 0;
+        this.stack.clear();
+        this.stack.addLast(root);
+    }
+
+
+    @Override
+    public void close()
+    {
+        this.prev = null;
+        this.count = 0;
+        this.stack.clear();
+    }
+
+    @Override
+    public void add(ByteComparable next, VALUE value) throws IOException
+    {
+        ++count;
+        ByteSource nextBytes = next.asComparableBytes(Walker.BYTE_COMPARABLE_VERSION);
+        int nextByte = nextBytes.next();
+
+        // Count the leading bytes shared with the previous key; that part of the stack stays as it is.
+        int sharedDepth = 0;
+        if (prev != null)
+        {
+            ByteSource prevBytes = prev.asComparableBytes(Walker.BYTE_COMPARABLE_VERSION);
+            int prevByte = prevBytes.next();
+            for (; nextByte == prevByte; ++sharedDepth)
+            {
+                assert nextByte != ByteSource.END_OF_STREAM : String.format("Incremental trie requires unique sorted keys, got equal %s after %s.", next, prev);
+                nextByte = nextBytes.next();
+                prevByte = prevBytes.next();
+            }
+            assert prevByte < nextByte : String.format("Incremental trie requires sorted keys, got %s after %s.", next, prev);
+        }
+        prev = next;
+
+        // Nodes deeper than the shared prefix cannot receive further children, so complete them.
+        while (stack.size() > sharedDepth + 1)
+            completeLast();
+
+        // Append one new node per remaining byte of the key; the last one carries the value.
+        NODE tail = stack.getLast();
+        for (; nextByte != ByteSource.END_OF_STREAM; nextByte = nextBytes.next())
+        {
+            tail = tail.addChild((byte) nextByte);
+            stack.addLast(tail);
+        }
+
+        VALUE replaced = tail.setPayload(value);
+        assert replaced == null;
+    }
+
+    /** Returns the file position of the root, completing any pending nodes first if the root has none yet. */
+    @Override
+    public long complete() throws IOException
+    {
+        NODE root = stack.getFirst();
+        // A root that already has a position needs no further work; just report it.
+        return root.filePos != -1 ? root.filePos : performCompletion().filePos;
+    }
+
+    NODE performCompletion() throws IOException
+    {
+        NODE root = null;
+        while (!stack.isEmpty())
+            root = completeLast();  // nodes are completed deepest-first; the last one removed is the root
+        stack.addLast(root);        // put the root back so that complete() can find it on later calls
+        return root;
+    }
+
+    public long count()
+    {
+        return count;
+    }
+
+    protected NODE completeLast() throws IOException
+    {
+        NODE node = stack.removeLast();
+        complete(node);
+        return node;
+    }
+
+    abstract void complete(NODE value) throws IOException;
+    abstract public PartialTail makePartialRoot() throws IOException;
+
+    static class PTail implements PartialTail
+    {
+        long root;
+        long cutoff;
+        long count;
+        ByteBuffer tail;
+
+        @Override
+        public long root()
+        {
+            return root;
+        }
+
+        @Override
+        public long cutoff()
+        {
+            return cutoff;
+        }
+
+        @Override
+        public ByteBuffer tail()
+        {
+            return tail;
+        }
+
+        @Override
+        public long count()
+        {
+            return count;
+        }
+    }
+
+    static abstract class BaseNode<VALUE, NODE extends BaseNode<VALUE, NODE>> implements SerializationNode<VALUE>
+    {
+        private static final int CHILDREN_LIST_RECYCLER_LIMIT = 1024;
+        @SuppressWarnings("rawtypes")
+        private static final LightweightRecycler<ArrayList> CHILDREN_LIST_RECYCLER = ThreadLocals.createLightweightRecycler(CHILDREN_LIST_RECYCLER_LIMIT);
+        @SuppressWarnings("rawtypes")
+        private static final ArrayList EMPTY_LIST = new ArrayList<>(0);
+
+        @SuppressWarnings({ "unchecked", "rawtypes" })
+        private static <NODE> ArrayList<NODE> allocateChildrenList()
+        {
+            return CHILDREN_LIST_RECYCLER.reuseOrAllocate(() -> new ArrayList(4));
+        }
+
+        private static <NODE> void recycleChildrenList(ArrayList<NODE> children)
+        {
+            CHILDREN_LIST_RECYCLER.tryRecycle(children);
+        }
+
+        VALUE payload;
+        ArrayList<NODE> children;
+        final int transition;
+        long filePos = -1;
+
+        @SuppressWarnings("unchecked")
+        BaseNode(int transition)
+        {
+            children = EMPTY_LIST;
+            this.transition = transition;
+        }
+
+        public VALUE payload()
+        {
+            return payload;
+        }
+
+        public VALUE setPayload(VALUE newPayload)
+        {
+            VALUE p = payload;
+            payload = newPayload;
+            return p;
+        }
+
+        public NODE addChild(byte b)
+        {
+            assert children.isEmpty() || (children.get(children.size() - 1).transition & 0xFF) < (b & 0xFF);
+            NODE node = newNode(b);
+            if (children == EMPTY_LIST)
+                children = allocateChildrenList();
+
+            children.add(node);
+            return node;
+        }
+
+        public int childCount()
+        {
+            return children.size();
+        }
+
+        void finalizeWithPosition(long position)
+        {
+            this.filePos = position;
+
+            // Make sure we are not holding on to pointers to data we no longer need
+            // (otherwise we keep the whole trie in memory).
+            if (children != EMPTY_LIST)
+                // the recycler will also clear the collection before adding it to the pool
+                recycleChildrenList(children);
+
+            children = null;
+            payload = null;
+        }
+
+        public int transition(int i)
+        {
+            return children.get(i).transition;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%02x", transition);
+        }
+
+        abstract NODE newNode(byte transition);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterPageAware.java b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterPageAware.java
new file mode 100644
index 000000000000..813165c74e67
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterPageAware.java
@@ -0,0 +1,530 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NavigableSet;
+import java.util.TreeSet;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Incremental builder of on-disk tries which packs trie stages into disk cache pages.
+ *
+ * The incremental core is as in {@link IncrementalTrieWriterSimple}, which this augments by:
+ *   - calculating branch sizes reflecting the amount of data that needs to be written to store the trie
+ *     branch rooted at each node
+ *   - delaying writing any part of a completed node until its branch size is above the page size
+ *   - laying out (some of) its children branches (each smaller than a page) to be contained within a page
+ *   - adjusting the branch size to reflect the fact that the children are now written (i.e. removing their size)
+ *
+ * The process is bottom-up, i.e. pages are packed at the bottom and the root page is usually smaller.
+ * This may appear less efficient than a top-down process which puts more information in the top pages that
+ * tend to stay in cache, but in both cases performing a search will usually require an additional disk read
+ * for the leaf page. When we maximize the amount of relevant data that read brings by using the bottom-up
+ * process, we have practically the same efficiency with smaller intermediate page footprint, i.e. less data
+ * to keep in cache.
+ *
+ * As an example, taking a sample page size fitting 4 nodes, a simple trie would be split like this:
+ * Node 0 |
+ *   -a-> | Node 1
+ *        |   -s-> Node 2
+ *        |          -k-> Node 3 (payload 1)
+ *        |          -s-> Node 4 (payload 2)
+ *        -----------------------------------
+ *   -b-> Node 5 |
+ *          -a-> |Node 6
+ *               |  -n-> Node 7
+ *               |         -k-> Node 8 (payload 3)
+ *               |                -s-> Node 9 (payload 4)
+ * where lines denote page boundaries.
+ *
+ * The process itself will start by adding "ask" which adds three nodes after the root to the stack. Adding "ass"
+ * completes Node 3, setting its branch size to 1, and replaces it on the stack with Node 4.
+ * The step of adding "bank" starts by completing Node 4 (size 1), Node 2 (size 3), Node 1 (size 4), then adds 4 more
+ * nodes to the stack. Adding "banks" descends one more node.
+ * The trie completion step completes nodes 9 (size 1), 8 (size 2), 7 (size 3), 6 (size 4), 5 (size 5). Since the size
+ * of node 5 is above the page size, the algorithm lays out its children. Nodes 6, 7, 8, 9 are written in order. The
+ * size of node 5 is now just the size of it individually, 1. The process continues with completing Node 0 (size 6).
+ * This is bigger than the page size, so some of its children need to be written. The algorithm takes the largest,
+ * Node 1, and lays it out with its children in the file. Node 0 now has an adjusted size of 2 which is below the
+ * page size and we can continue the process.
+ * Since this was the root of the trie, the current page is padded and the remaining nodes 0, 5 are written.
+ */
+public class IncrementalTrieWriterPageAware<VALUE>
+extends IncrementalTrieWriterBase<VALUE, DataOutputPlus, IncrementalTrieWriterPageAware.Node<VALUE>>
+implements IncrementalTrieWriter<VALUE>
+{
+    final int maxBytesPerPage;
+
+    private static final Comparator<Node<?>> BRANCH_SIZE_COMPARATOR = (left, right) ->
+    {
+        // Primary order: total size of the branch including its root node, smallest first.
+        int bySize = Integer.compare(left.branchSize + left.nodeSize, right.branchSize + right.nodeSize);
+        if (bySize != 0)
+            return bySize;
+
+        // Ties are broken by the transition character. This
+        // - keeps distinct nodes of equal size from being treated as duplicates,
+        // - lets a comparison key with a higher transition sort after all equal-sized nodes,
+        // - places the most recently processed node (potentially having closer children) last among equals,
+        //   making it the first one picked for layout.
+        int byTransition = Integer.compare(left.transition, right.transition);
+
+        assert byTransition != 0 || left == right;
+        return byTransition;
+    };
+
+    IncrementalTrieWriterPageAware(TrieSerializer<VALUE, ? super DataOutputPlus> trieSerializer, DataOutputPlus dest)
+    {
+        super(trieSerializer, dest, new Node<>((byte) 0));
+        this.maxBytesPerPage = dest.maxBytesInPage();
+    }
+
+    @Override
+    public void reset()
+    {
+        reset(new Node<>((byte) 0));
+    }
+
+    /** Completes all pending nodes, then places (padding / laying out children as needed) and writes the root. */
+    @Override
+    Node<VALUE> performCompletion() throws IOException
+    {
+        Node<VALUE> root = super.performCompletion();
+
+        int actualSize = recalcTotalSize(root, dest.position());
+        int bytesLeft = dest.bytesLeftInPage();
+        if (actualSize > bytesLeft)
+        {
+            if (actualSize <= maxBytesPerPage)
+            {
+                dest.padToPageBoundary();
+                bytesLeft = maxBytesPerPage;
+                // position changed, recalculate again
+                actualSize = recalcTotalSize(root, dest.position());
+            }
+
+            if (actualSize > bytesLeft)
+            {
+                // Still greater. Lay out children separately.
+                layoutChildren(root);
+
+                // Pad if needed and place.
+                if (root.nodeSize > dest.bytesLeftInPage())
+                {
+                    dest.padToPageBoundary();
+                    // Recalculate again as pointer size may have changed, which would trip the assertions in write().
+                    recalcTotalSize(root, dest.position());
+                }
+            }
+        }
+
+        root.finalizeWithPosition(write(root));
+        return root;
+    }
+
+    @Override
+    void complete(Node<VALUE> node) throws IOException
+    {
+        assert node.filePos == -1;
+        // Children that were already written contribute nothing here: finalizeWithPosition() zeroed their sizes.
+        int branchSize = 0;
+        for (Node<VALUE> child : node.children)
+            branchSize += child.branchSize + child.nodeSize;
+
+        node.branchSize = branchSize;
+
+        int nodeSize = serializer.sizeofNode(node, dest.position());
+        if (nodeSize + branchSize < maxBytesPerPage)
+        {
+            // Good. This node and all children will (most probably) fit in a page.
+            node.nodeSize = nodeSize;
+            node.hasOutOfPageChildren = false;
+            node.hasOutOfPageInBranch = false;
+            // A written child is out of page; an unwritten child passes on any out-of-page flag it carries.
+            for (Node<VALUE> child : node.children)
+                if (child.filePos != -1)
+                    node.hasOutOfPageChildren = true;
+                else if (child.hasOutOfPageChildren || child.hasOutOfPageInBranch)
+                    node.hasOutOfPageInBranch = true;
+
+            return;
+        }
+
+        // Cannot fit. Lay out children; the current node will be marked as one with out-of-page children.
+        layoutChildren(node);
+    }
+
+    private void layoutChildren(Node<VALUE> node) throws IOException
+    {
+        assert node.filePos == -1;
+
+        NavigableSet<Node<VALUE>> children = node.getChildrenWithUnsetPosition();
+
+        int bytesLeft = dest.bytesLeftInPage();
+        Node<VALUE> cmp = new Node<>(256); // goes after all equal-sized unplaced nodes (whose transition character is 0-255)
+        cmp.nodeSize = 0;
+        while (!children.isEmpty())
+        {
+            cmp.branchSize = bytesLeft;
+            Node<VALUE> child = children.headSet(cmp, true).pollLast();    // grab biggest that could fit
+            if (child == null)
+            {
+                dest.padToPageBoundary();
+                bytesLeft = maxBytesPerPage;
+                child = children.pollLast();       // just biggest
+            }
+
+            assert child != null;
+            if (child.hasOutOfPageChildren || child.hasOutOfPageInBranch)
+            {
+                // We didn't know what size this branch will actually need to be, node's children may be far.
+                // We now know where we would place it, so let's reevaluate size.
+                int actualSize = recalcTotalSize(child, dest.position());
+                if (actualSize > bytesLeft)
+                {
+                    if (bytesLeft == maxBytesPerPage)
+                    {
+                        // Branch doesn't even fit in a page.
+
+                        // Note: In this situation we aren't actually making the best choice as the layout should have
+                        // taken place at the child (which could have made the current parent small enough to fit).
+                        // This is not trivial to fix but should be very rare.
+
+                        layoutChildren(child);
+                        bytesLeft = dest.bytesLeftInPage();
+
+                        assert (child.filePos == -1);
+                    }
+
+                    // Doesn't fit, but that's probably because we don't have a full page. Put it back with the new
+                    // size and retry when we do have enough space.
+                    children.add(child);
+                    continue;
+                }
+            }
+
+            child.finalizeWithPosition(write(child));
+            bytesLeft = dest.bytesLeftInPage();
+        }
+
+        // The sizing below will use the branch size, so make sure it's set.
+        node.branchSize = 0;
+        node.hasOutOfPageChildren = true;
+        node.hasOutOfPageInBranch = false;
+        node.nodeSize = serializer.sizeofNode(node, dest.position());
+    }
+
+    /**
+     * Simple framework for executing recursion using on-heap linked trace to avoid stack overruns.
+     */
+    static abstract class Recursion<NODE>
+    {
+        final Recursion<NODE> parent;
+        final NODE node;
+        final Iterator<NODE> childIterator;
+
+        Recursion(NODE node, Iterator<NODE> childIterator, Recursion<NODE> parent)
+        {
+            this.parent = parent;
+            this.node = node;
+            this.childIterator = childIterator;
+        }
+
+        /**
+         * Make a child Recursion object for the given node and initialize it as necessary to continue processing
+         * with it.
+         *
+         * May return null if the recursion does not need to continue inside the child branch.
+         */
+        abstract Recursion<NODE> makeChild(NODE child);
+
+        /**
+         * Complete the processing of this Recursion object.
+         *
+         * Note: this method is not called for the nodes for which makeChild() returns null.
+         */
+        abstract void complete() throws IOException;
+
+        /**
+         * Complete processing of the given child (possibly retrieve data to apply to any accumulation performed
+         * in this Recursion object).
+         *
+         * This is called when processing a child completes, including when recursion inside the child branch
+         * is skipped by makeChild() returning null.
+         */
+        void completeChild(NODE child)
+        {}
+
+        /**
+         * Recursively process, in depth-first order, the branch rooted at this recursion node.
+         *
+         * Returns this.
+         */
+        Recursion<NODE> process() throws IOException
+        {
+            Recursion<NODE> curr = this;
+
+            while (true)
+            {
+                if (curr.childIterator.hasNext())
+                {
+                    NODE child = curr.childIterator.next();
+                    Recursion<NODE> childRec = curr.makeChild(child);
+                    if (childRec != null)
+                        curr = childRec;
+                    else
+                        curr.completeChild(child);
+                }
+                else
+                {
+                    curr.complete();
+                    Recursion<NODE> currParent = curr.parent;
+                    if (currParent == null)
+                        return curr;
+                    currParent.completeChild(curr.node);
+                    curr = currParent;
+                }
+            }
+        }
+    }
+
+    protected int recalcTotalSize(Node<VALUE> node, long nodePosition) throws IOException
+    {
+        if (node.hasOutOfPageInBranch)
+        {
+            int sz = 0;
+            for (Node<VALUE> child : node.children)
+                sz += recalcTotalSize(child, nodePosition + sz);
+            node.branchSize = sz;
+        }
+
+        // The sizing below will use the branch size calculated above. Since that can change on out-of-page in branch,
+        // we need to recalculate the size if either flag is set.
+        if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+            node.nodeSize = serializer.sizeofNode(node, nodePosition + node.branchSize);
+
+        return node.branchSize + node.nodeSize;
+    }
+
+    protected long write(Node<VALUE> node) throws IOException
+    {
+        long nodePosition = dest.position();
+        for (Node<VALUE> child : node.children)
+            if (child.filePos == -1)
+                child.filePos = write(child);
+
+        nodePosition += node.branchSize;
+        assert dest.position() == nodePosition
+                : "Expected node position to be " + nodePosition + " but got " + dest.position() + " after writing children.\n" + dumpNode(node, dest.position());
+
+        serializer.write(dest, node, nodePosition);
+
+        assert dest.position() == nodePosition + node.nodeSize
+                || dest.paddedPosition() == dest.position() // For PartitionIndexTest.testPointerGrowth where position may jump on page boundaries.
+                : "Expected node position to be " + (nodePosition + node.nodeSize) + " but got " + dest.position() + " after writing node, nodeSize " + node.nodeSize + ".\n" + dumpNode(node, nodePosition);
+        return nodePosition;
+    }
+
+    protected String dumpNode(Node<VALUE> node, long nodePosition)
+    {
+        StringBuilder res = new StringBuilder(String.format("At %,d(%x) type %s child count %s nodeSize %,d branchSize %,d %s%s%n",
+                                                            nodePosition, nodePosition,
+                                                            TrieNode.typeFor(node, nodePosition), node.childCount(), node.nodeSize, node.branchSize,
+                                                            node.hasOutOfPageChildren ? "C" : "",
+                                                            node.hasOutOfPageInBranch ? "B" : ""));
+        for (Node<VALUE> child : node.children)
+            res.append(String.format("Child %2x at %,d(%x) type %s child count %s size %s nodeSize %,d branchSize %,d %s%s%n",
+                                     child.transition & 0xFF,
+                                     child.filePos,
+                                     child.filePos,
+                                     child.children != null ? TrieNode.typeFor(child, child.filePos) : "n/a",
+                                     child.children != null ? child.childCount() : "n/a",
+                                     child.children != null ? serializer.sizeofNode(child, child.filePos) : "n/a",
+                                     child.nodeSize,
+                                     child.branchSize,
+                                     child.hasOutOfPageChildren ? "C" : "",
+                                     child.hasOutOfPageInBranch ? "B" : ""));
+
+        return res.toString();
+    }
+
+    @Override
+    public PartialTail makePartialRoot() throws IOException
+    {
+        // The expectation is that the partial tail will be in memory, so we don't bother with page-fitting.
+        // We could also send some completed children to disk, but that could make suboptimal layout choices so we'd
+        // rather not. Just write anything not written yet to a buffer, from bottom to top, and we're done.
+        try (DataOutputBuffer buf = new DataOutputBuffer())
+        {
+            PTail tail = new PTail();
+            // Readers ask rebufferers for page-aligned positions, so make sure tail starts at one.
+            // "Padding" of the cutoff point may leave some unaddressable space in the constructed file view.
+            // Nothing will point to it, though, so that's fine.
+            tail.cutoff = dest.paddedPosition();
+            tail.count = count;
+            tail.root = writePartial(stack.getFirst(), buf, tail.cutoff);
+            tail.tail = buf.asNewBuffer();
+            return tail;
+        }
+    }
+
+    protected long writePartial(Node<VALUE> node, DataOutputPlus dest, long baseOffset) throws IOException
+    {
+        long startPosition = dest.position() + baseOffset;
+
+        List<Node<VALUE>> childrenToClear = new ArrayList<>();
+        for (Node<VALUE> child : node.children)
+        {
+            if (child.filePos == -1)
+            {
+                childrenToClear.add(child);
+                    child.filePos = writePartial(child, dest, baseOffset);
+            }
+        }
+
+        long nodePosition = dest.position() + baseOffset;
+
+        if (node.hasOutOfPageInBranch)
+        {
+            // Update the branch size with the size of what we have just written. This may be used by the node's
+            // maxPositionDelta and it's a better approximation for later fitting calculations.
+            node.branchSize = (int) (nodePosition - startPosition);
+        }
+
+        serializer.write(dest, node, nodePosition);
+
+        if (node.hasOutOfPageChildren || node.hasOutOfPageInBranch)
+        {
+            // Update the node size with what we have just seen. It's a better approximation for later fitting
+            // calculations.
+            long endPosition = dest.position() + baseOffset;
+            node.nodeSize = (int) (endPosition - nodePosition);
+        }
+
+        for (Node<VALUE> child : childrenToClear)
+            child.filePos = -1;
+        return nodePosition;
+    }
+
+    static class Node<Value> extends IncrementalTrieWriterBase.BaseNode<Value, Node<Value>>
+    {
+        /**
+         * Currently calculated size of the branch below this node, not including the node itself.
+         * If hasOutOfPageInBranch is true, this may be underestimated as the size
+         * depends on the position the branch is written.
+         */
+        int branchSize = -1;
+        /**
+         * Currently calculated node size. If hasOutOfPageChildren is true, this may be underestimated as the size
+         * depends on the position the node is written.
+         */
+        int nodeSize = -1;
+
+        /**
+         * Whether there is an out-of-page, already written node in the branches below the immediate children of the
+         * node.
+         */
+        boolean hasOutOfPageInBranch = false;
+        /**
+         * Whether a child of the node is out of page, already written.
+         * Forced to true before being set to make sure maxPositionDelta performs its evaluation on non-completed
+         * nodes for makePartialRoot.
+         */
+        boolean hasOutOfPageChildren = true;
+
+        Node(int transition)
+        {
+            super(transition);
+        }
+
+        @Override
+        Node<Value> newNode(byte transition)
+        {
+            return new Node<>(transition & 0xFF);
+        }
+
+        public long serializedPositionDelta(int i, long nodePosition)
+        {
+            assert (children.get(i).filePos != -1);
+            return children.get(i).filePos - nodePosition;
+        }
+
+        /**
+         * The max delta is the delta with either:
+         * - the position where the first not-yet-placed child will be laid out.
+         * - the position of the furthest child that is already placed.
+         *
+         * This method assumes all children's branch and node sizes, as well as this node's branchSize, are already
+         * calculated.
+         */
+        public long maxPositionDelta(long nodePosition)
+        {
+            // The max delta is the position the first child would be laid out.
+            assert (childCount() > 0);
+
+            if (!hasOutOfPageChildren)
+                // We need to be able to address the first child. We don't need to cover its branch, though.
+                return -(branchSize - children.get(0).branchSize);
+
+            long minPlaced = 0;
+            long minUnplaced = 1;
+            for (Node<Value> child : children)
+            {
+                if (child.filePos != -1)
+                    minPlaced = Math.min(minPlaced, child.filePos - nodePosition);
+                else if (minUnplaced > 0)   // triggers once
+                    minUnplaced = -(branchSize - child.branchSize);
+            }
+
+            return Math.min(minPlaced, minUnplaced);
+        }
+
+        NavigableSet<Node<Value>> getChildrenWithUnsetPosition()
+        {
+            NavigableSet<Node<Value>> result = new TreeSet<>(BRANCH_SIZE_COMPARATOR);
+            for (Node<Value> child : children)
+                if (child.filePos == -1)
+                    result.add(child);
+
+            return result;
+        }
+
+        @Override
+        void finalizeWithPosition(long position)
+        {
+            this.branchSize = 0;                // takes no space in current page
+            this.nodeSize = 0;
+            this.hasOutOfPageInBranch = false;  // its size no longer needs to be recalculated
+            this.hasOutOfPageChildren = false;
+            super.finalizeWithPosition(position);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%02x branchSize=%04x nodeSize=%04x %s%s", transition, branchSize, nodeSize, hasOutOfPageInBranch ? "B" : "", hasOutOfPageChildren ? "C" : "");
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterSimple.java b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterSimple.java
new file mode 100644
index 000000000000..ce6693668689
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/IncrementalTrieWriterSimple.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+/**
+ * Incremental builder of on-disk tries. Takes sorted input.
+ * <p>
+ * Incremental building is done by maintaining a stack of nodes in progress which follows the path to reach the last
+ * added entry. When a new entry is needed, comparison with the previous can tell us how much of the parents stack
+ * remains the same. The rest of the stack is complete as no new entry can affect them due to the input sorting.
+ * The completed nodes can be written to disk and discarded, keeping only a pointer to their location in the file
+ * (this pointer will be discarded too when the parent node is completed). This ensures that a very limited amount of
+ * data is kept in memory at all times.
+ * <p>
+ * Note: This class is currently unused and stands only as form of documentation for {@link IncrementalTrieWriterPageAware}.
+ */
+public class IncrementalTrieWriterSimple<VALUE>
+        extends IncrementalTrieWriterBase<VALUE, DataOutput, IncrementalTrieWriterSimple.Node<VALUE>>
+        implements IncrementalTrieWriter<VALUE>
+{
+    private long position = 0; // sum of the sizes of the nodes completed so far, i.e. where the next node starts
+
+    public IncrementalTrieWriterSimple(TrieSerializer<VALUE, ? super DataOutput> trieSerializer, DataOutputPlus dest)
+    {
+        super(trieSerializer, dest, new Node<>((byte) 0)); // same initial node (transition 0) that reset() installs
+    }
+
+    @Override
+    protected void complete(Node<VALUE> node) throws IOException
+    {
+        long nodePos = position; // the node is written at the current end of the output
+        position += write(node, dest, position);
+        node.finalizeWithPosition(nodePos);
+    }
+
+    @Override
+    public void close()
+    {
+        super.close(); // nothing of its own to release; simply delegates to the base class
+    }
+
+    @Override
+    public void reset()
+    {
+        reset(new Node<>((byte) 0));
+        position = 0; // positions are counted from zero again
+    }
+
+    @Override
+    public PartialTail makePartialRoot() throws IOException
+    {
+        try (DataOutputBuffer buf = new DataOutputBuffer())
+        {
+            PTail tail = new PTail();
+            tail.cutoff = position;
+            tail.count = count;
+            long nodePos = position; // tail nodes are positioned as if they followed the completed nodes
+            for (Node<VALUE> node : (Iterable<Node<VALUE>>) stack::descendingIterator)
+            {
+                // Hacky but works: temporarily write node's position. Will be overwritten when we finalize node.
+                node.filePos = nodePos;
+                nodePos += write(node, buf, nodePos);
+            }
+
+            tail.tail = buf.asNewBuffer();
+            tail.root = stack.getFirst().filePos; // the first stack entry is the last one written by the loop above
+
+            for (Node<VALUE> node : (Iterable<Node<VALUE>>) stack::descendingIterator)
+                node.filePos = -1; // undo the temporary positions; the nodes on the stack are still in progress
+
+            return tail;
+        }
+    }
+
+    private long write(Node<VALUE> node, DataOutput dest, long nodePosition) throws IOException
+    {
+        long size = serializer.sizeofNode(node, nodePosition); // size as reported by the serializer, not measured
+        serializer.write(dest, node, nodePosition);
+        return size;
+    }
+
+    static class Node<Value> extends IncrementalTrieWriterBase.BaseNode<Value, Node<Value>>
+    {
+        Node(int transition)
+        {
+            super(transition);
+        }
+
+        @Override
+        Node<Value> newNode(byte transition)
+        {
+            return new Node<>(transition & 0xFF); // pass the byte on as an unsigned value (0..255)
+        }
+
+        public long serializedPositionDelta(int i, long nodePosition)
+        {
+            assert children.get(i).filePos != -1; // -1 marks a child that has no position
+            return children.get(i).filePos - nodePosition;
+        }
+
+        public long maxPositionDelta(long nodePosition)
+        {
+            long min = 0; // stays 0 when no child has a position
+            for (Node<Value> child : children)
+            {
+                if (child.filePos != -1)
+                    min = Math.min(min, child.filePos - nodePosition);
+            }
+            return min;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/tries/SerializationNode.java b/src/java/org/apache/cassandra/io/tries/SerializationNode.java
new file mode 100644
index 000000000000..aae7a4f10b1b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/SerializationNode.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+/**
+ * An abstraction of a node given to the trie serializer to write.
+ */
+public interface SerializationNode<VALUE>
+{
+    /**
+     * The number of children of the node.
+     */
+    int childCount();
+
+    /**
+     * The payload of the node if the node has any associated, otherwise null.
+     */
+    VALUE payload();
+
+    /**
+     * The transition character for the child at index i. Must be an integer between 0 and 255.
+     */
+    int transition(int i);
+
+    /**
+     * Returns the distance between this node's position and the child at index i (child position minus nodePosition).
+     * Given as a difference calculation to be able to handle two different types of calls:
+     * - writing nodes where all children's positions are already completely determined
+     * - sizing and writing branches within a page where we don't know where we'll actually place
+     *   the nodes, but we know how far backward the child nodes will end up
+     */
+    long serializedPositionDelta(int i, long nodePosition);
+
+    /**
+     * Returns the furthest distance that needs to be written to store this node. Deltas point backward, so this is
+     *   min(serializedPositionDelta(i, nodePosition) for 0 <= i < childCount())
+     * Given separately as the loop above can be inefficient (e.g. when children are not yet written).
+     */
+    long maxPositionDelta(long nodePosition);
+}
diff --git a/src/java/org/apache/cassandra/io/tries/TrieNode.java b/src/java/org/apache/cassandra/io/tries/TrieNode.java
new file mode 100644
index 000000000000..d8dc15e18ebe
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/TrieNode.java
@@ -0,0 +1,968 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.SizedInts;
+
+/**
+ * Trie node types and manipulation mechanisms. The main purpose of this is to allow for handling tries directly as
+ * they are on disk without any serialization, and to enable the creation of such files.
+ *
+ * The serialization methods take as argument a generic {@code SerializationNode} and provide a method {@code typeFor}
+ * for choosing a suitable type to represent it, which can then be used to calculate size and write the node.
+ *
+ * To read a file containing trie nodes, one would use {@code at} to identify the node type and then the various
+ * read methods to retrieve the data. They all take a buffer (usually memory-mapped) containing the data, and a position
+ * in it that identifies the node.
+ *
+ * These node types do not specify any treatment of payloads. They are only concerned with providing 4 bits of
+ * space for {@code payloadFlags}, and a way of calculating the position after the node. Users of this class by convention
+ * use non-zero payloadFlags to indicate a payload exists, write it (possibly in flag-dependent format) at serialization
+ * time after the node itself is written, and read it using the {@code payloadPosition} value.
+ *
+ * To improve efficiency, multiple node types depending on the number of transitions are provided:
+ *   -- payload only, which has no outgoing transitions
+ *   -- single outgoing transition
+ *   -- sparse, which provides a list of transition bytes with corresponding targets
+ *   -- dense, where the transitions span a range of values and having the list (and the search in it) can be avoided
+ *
+ * For each of the transition-carrying types we also have "in-page" versions where transition targets are the 4, 8 or 12
+ * lowest bits of the position within the same page. To save one further byte, the single in-page versions using 4 or 12
+ * bits cannot carry a payload.
+ *
+ * This class is effectively an enumeration; abstract class permits instances to extend each other and reuse code.
+ */
+public abstract class TrieNode
+{
+    // Consumption (read) methods
+
+    /**
+     * Returns the type of node stored at this position. It can then be used to call the methods below.
+     */
+    public static TrieNode at(ByteBuffer src, int position)
+    {
+        return values[(src.get(position) & 0xF0) >>> 4]; // the type ordinal is the high nibble of the first byte
+    }
+
+    /** Returns the 4 payload flag bits. Node types that cannot carry a payload return 0. */
+    public int payloadFlags(ByteBuffer src, int position)
+    {
+        return 0x0F & src.get(position); // the flags are the low nibble of the first byte
+    }
+    /**
+     * Return the position just after the node, where the payload is usually stored.
+     */
+    abstract public int payloadPosition(ByteBuffer src, int position);
+    /**
+     * Returns search index for the given byte in the node. If an exact match is present, this is >= 0; otherwise it
+     * is negative and encodes the insertion point as {@code -(insertion point) - 1}, as in binary search.
+     */
+    abstract public int search(ByteBuffer src, int position, int transitionByte);       // returns as binarySearch
+    /**
+     * Returns the upper childIndex limit. Calling transition with values 0 .. transitionRange - 1 is valid.
+     */
+    abstract public int transitionRange(ByteBuffer src, int position);
+    /**
+     * Returns the byte value for this child index, or Integer.MAX_VALUE if there are no transitions with this index or
+     * higher to permit listing the children without needing to call transitionRange.
+     *
+     * @param childIndex must be >= 0, though it is allowed to pass a value greater than {@code transitionRange - 1}
+     */
+    abstract public int transitionByte(ByteBuffer src, int position, int childIndex);
+    /**
+     * Returns the delta between the position of this node and the position of the target of the specified transition.
+     * This is always a negative number. Dense nodes use 0 to specify "no transition".
+     *
+     * @param childIndex must be >= 0 and < {@link #transitionRange(ByteBuffer, int)} - note that this is not validated
+     *                   and behaviour of this method is undefined for values outside of that range
+     */
+    abstract long transitionDelta(ByteBuffer src, int position, int childIndex);
+    /**
+     * Returns position of node to transition to for the given child index. The index must be non-negative. May return
+     * -1 if a transition with that index does not exist (DENSE nodes).
+     * Position is the offset of the node within the ByteBuffer. positionLong is its global placement, which is the
+     * base for any offset calculations.
+     *
+     * @param positionLong although it seems to be obvious, this argument must be "real", that is, each child must have
+     *                     the calculated absolute position >= 0, otherwise the behaviour of this method is undefined
+     * @param childIndex must be >= 0 and < {@link #transitionRange(ByteBuffer, int)} - note that this is not validated
+     *                   and behaviour of this method is undefined for values outside of that range
+     */
+    public long transition(ByteBuffer src, int position, long positionLong, int childIndex)
+    {
+        // note: incorrect for dense nodes, which override this to map a zero delta ("no transition") to -1
+        return positionLong + transitionDelta(src, position, childIndex);
+    }
+    /**
+     * Returns the highest transition for this node, or -1 if none exist (PAYLOAD_ONLY nodes).
+     */
+    public long lastTransition(ByteBuffer src, int position, long positionLong)
+    {
+        return transition(src, position, positionLong, transitionRange(src, position) - 1); // last valid child index
+    }
+    /**
+     * Returns a transition that is higher than the index returned by {@code search}. This may not exist (if the
+     * argument was higher than the last transition byte), in which case this returns the given {@code defaultValue}.
+     */
+    abstract public long greaterTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue);
+    /**
+     * Returns a transition that is lower than the index returned by {@code search}. Returns {@code defaultValue} when
+     * {@code searchIndex} is 0 or -1, as no lesser transition exists for those search results.
+     */
+    abstract public long lesserTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue);
+
+    // Construction (serialization) methods
+
+    /**
+     * Returns a node type that is suitable to store the node.
+     */
+    public static TrieNode typeFor(SerializationNode<?> node, long nodePosition)
+    {
+        final int children = node.childCount();
+        if (children == 0)
+            return PAYLOAD_ONLY;
+
+        // Pick the first pointer width whose type can hold the largest child distance.
+        final long furthest = node.maxPositionDelta(nodePosition);
+        assert furthest < 0;
+        int width = 0;
+        while (!singles[width].fits(-furthest))
+            width++;
+
+        if (children != 1)
+        {
+            // Several children: use whichever of the sparse and dense layouts is smaller (dense on ties).
+            TrieNode sparse = sparses[width];
+            TrieNode dense = denses[width];
+            return (sparse.sizeofNode(node) < dense.sizeofNode(node)) ? sparse : dense;
+        }
+        // Single child: a fractional-width type cannot carry a payload, the next index will permit it.
+        boolean needsNextType = node.payload() != null && singles[width].bytesPerPointer == FRACTIONAL_BYTES;
+        return singles[needsNextType ? width + 1 : width];
+    }
+
+    /**
+     * Returns the size needed to serialize this node.
+     */
+    abstract public int sizeofNode(SerializationNode<?> node);
+    /**
+     * Serializes the node. All transition target positions must already have been defined. {@code payloadBits} must
+     * be four bits.
+     */
+    abstract public void serialize(DataOutput out, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException;
+
+    // Implementations
+
+    final int bytesPerPointer; // whole bytes used per transition target, or FRACTIONAL_BYTES
+    static final int FRACTIONAL_BYTES = 0; // passed by the types whose targets take 4 or 12 bits, and by PAYLOAD_ONLY
+
+    TrieNode(int bytesPerPointer)
+    {
+        this.bytesPerPointer = bytesPerPointer;
+    }
+
+    int ordinal = -1; // written as the high nibble of a node's first byte; NOTE(review): assigned outside this section
+
+    static final PayloadOnly PAYLOAD_ONLY = new PayloadOnly();
+    static class PayloadOnly extends TrieNode
+    {
+        // byte flags (type ordinal in the high nibble, payload flags in the low nibble)
+        // var payload
+        PayloadOnly()
+        {
+            super(FRACTIONAL_BYTES); // no transitions, so no pointer bytes
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 1; // only the flags byte precedes the payload
+        }
+
+        @Override
+        public int search(ByteBuffer src, int position, int transitionByte)
+        {
+            return -1; // never found; insertion point 0 in binary search terms
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return 0; // there is no child to point to
+        }
+
+        @Override
+        public long transition(ByteBuffer src, int position, long positionLong, int childIndex)
+        {
+            return -1; // no transition exists for any index
+        }
+
+        @Override
+        public long lastTransition(ByteBuffer src, int position, long positionLong)
+        {
+            return -1; // no transitions at all
+        }
+
+        @Override
+        public long greaterTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            return defaultValue; // nothing to return, whatever the search index
+        }
+
+        @Override
+        public long lesserTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            return defaultValue; // nothing to return, whatever the search index
+        }
+
+        @Override
+        public int transitionByte(ByteBuffer src, int position, int childIndex)
+        {
+            return Integer.MAX_VALUE; // the "no transition with this index or higher" marker
+        }
+
+        @Override
+        public int transitionRange(ByteBuffer src, int position)
+        {
+            return 0; // no valid child index
+        }
+
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 1; // just the flags byte
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // type ordinal in the high nibble, payload flags in the low
+        }
+    };
+
+    static final Single SINGLE_8 = new Single(1);
+    static final Single SINGLE_16 = new Single(2);
+
+    static class Single extends TrieNode
+    {
+        // byte flags (type ordinal in the high nibble, payload flags in the low nibble)
+        // byte transition
+        // bytesPerPointer bytes transition target
+        // var payload
+
+        Single(int bytesPerPointer)
+        {
+            super(bytesPerPointer);
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 2 + bytesPerPointer; // flags byte + transition byte + target
+        }
+
+        @Override
+        public int search(ByteBuffer src, int position, int transitionByte)
+        {
+            int c = src.get(position + 1) & 0xFF; // the only transition, read as an unsigned byte
+            if (transitionByte == c)
+                return 0;
+            return transitionByte < c ? -1 : -2; // not found: insertion point 0 if below the transition, 1 if above
+        }
+
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -readBytes(src, position + 2); // serialize() stores the negated delta, so negate it back
+        }
+
+        @Override
+        public long lastTransition(ByteBuffer src, int position, long positionLong)
+        {
+            return transition(src, position, positionLong, 0); // the only child is also the last one
+        }
+
+        @Override
+        public long greaterTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            return (searchIndex == -1) ? transition(src, position, positionLong, 0) : defaultValue; // -1: key is below the transition
+        }
+
+        @Override
+        public long lesserTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            return searchIndex == 0 || searchIndex == -1 ? defaultValue : transition(src, position, positionLong, 0); // only a key above the transition has a lesser one
+        }
+
+        @Override
+        public int transitionByte(ByteBuffer src, int position, int childIndex)
+        {
+            return childIndex == 0 ? src.get(position + 1) & 0xFF : Integer.MAX_VALUE; // index 0 is the only child
+        }
+
+        @Override
+        public int transitionRange(ByteBuffer src, int position)
+        {
+            return 1; // exactly one child
+        }
+
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 2 + bytesPerPointer; // flags byte + transition byte + target
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            int childCount = node.childCount();
+            assert childCount == 1;
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // type ordinal in the high nibble, payload flags in the low
+
+            dest.writeByte(node.transition(0));
+            writeBytes(dest, -node.serializedPositionDelta(0, nodePosition)); // the delta is passed negated
+        }
+    };
+
+
+    static final Single SINGLE_NOPAYLOAD_4 = new SingleNoPayload4();
+    static class SingleNoPayload4 extends Single
+    {
+        // 4-bit type ordinal
+        // 4-bit target delta (takes the place of the payload flags in the first byte)
+        // byte transition
+        // no payload!
+        SingleNoPayload4()
+        {
+            super(FRACTIONAL_BYTES);
+        }
+
+        @Override
+        public int payloadFlags(ByteBuffer src, int position)
+        {
+            return 0; // the low nibble holds the target delta here, not payload flags
+        }
+
+        // Although we don't have a payload position, provide one for calculating the size of the node.
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 2; // first byte + transition byte
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -(src.get(position) & 0xF); // low nibble of the first byte, negated back into a delta
+        }
+
+        @Override
+        boolean fits(long delta)
+        {
+            return 0 <= delta && delta <= 0xF; // delta is the positive distance here (typeFor passes -delta)
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            assert payloadBits == 0; // this type has no room for payload flags
+            int childCount = node.childCount();
+            assert childCount == 1;
+            long pd = -node.serializedPositionDelta(0, nodePosition); // positive distance to the child
+            assert pd > 0 && pd < 0x10;
+            dest.writeByte((ordinal << 4) + (int) (pd & 0x0F)); // type ordinal in the high nibble, distance in the low
+            dest.writeByte(node.transition(0));
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 2; // first byte + transition byte
+        }
+    };
+
+    static final Single SINGLE_NOPAYLOAD_12 = new SingleNoPayload12();
+    static class SingleNoPayload12 extends Single
+    {
+        // 4-bit type ordinal
+        // 12-bit target delta (low nibble of the first byte, then the whole second byte)
+        // byte transition
+        // no payload!
+        SingleNoPayload12()
+        {
+            super(FRACTIONAL_BYTES);
+        }
+
+        @Override
+        public int payloadFlags(ByteBuffer src, int position)
+        {
+            return 0; // the low nibble holds part of the target delta here, not payload flags
+        }
+
+        // Although we don't have a payload position, provide one for calculating the size of the node.
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 3; // two header bytes + transition byte
+        }
+
+        @Override
+        public int search(ByteBuffer src, int position, int transitionByte)
+        {
+            int c = src.get(position + 2) & 0xFF; // the transition byte follows the two header bytes
+            if (transitionByte == c)
+                return 0;
+            return transitionByte < c ? -1 : -2; // not found: insertion point 0 if below the transition, 1 if above
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -(src.getShort(position) & 0xFFF); // NOTE(review): matches serialize() only for a big-endian buffer — confirm
+        }
+
+        @Override
+        public int transitionByte(ByteBuffer src, int position, int childIndex)
+        {
+            return childIndex == 0 ? src.get(position + 2) & 0xFF : Integer.MAX_VALUE; // index 0 is the only child
+        }
+
+        @Override
+        boolean fits(long delta)
+        {
+            return 0 <= delta && delta <= 0xFFF; // delta is the positive distance here (typeFor passes -delta)
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            assert payloadBits == 0; // this type has no room for payload flags
+            int childCount = node.childCount();
+            assert childCount == 1;
+            long pd = -node.serializedPositionDelta(0, nodePosition); // positive distance to the child
+            assert pd > 0 && pd < 0x1000;
+            dest.writeByte((ordinal << 4) + (int) ((pd >> 8) & 0x0F)); // type ordinal, then the top 4 bits of the distance
+            dest.writeByte((byte) pd); // low 8 bits of the distance
+            dest.writeByte(node.transition(0));
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 3; // two header bytes + transition byte
+        }
+    };
+
+    static final Sparse SPARSE_8 = new Sparse(1);
+    static final Sparse SPARSE_16 = new Sparse(2);
+    static final Sparse SPARSE_24 = new Sparse(3);
+    static final Sparse SPARSE_40 = new Sparse(5);
+    static class Sparse extends TrieNode
+    {
+        // byte flags (type ordinal in the high nibble, payload flags in the low nibble)
+        // byte count (<= 255)
+        // count bytes transitions
+        // count transition targets of bytesPerPointer bytes each
+        // var payload
+
+        Sparse(int bytesPerPointer)
+        {
+            super(bytesPerPointer);
+        }
+
+        @Override
+        public int transitionRange(ByteBuffer src, int position)
+        {
+            return src.get(position + 1) & 0xFF; // the child count, read as an unsigned byte
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            int count = transitionRange(src, position);
+            return position + 2 + (bytesPerPointer + 1) * count; // two header bytes, then a byte and a target per child
+        }
+
+        @Override
+        public int search(ByteBuffer src, int position, int key)
+        {
+            int l = -1; // known < key
+            int r = transitionRange(src, position);   // known > key
+            position += 2; // skip the flags and count bytes to reach the transition bytes
+
+            while (l + 1 < r)
+            {
+                int m = (l + r + 1) / 2; // always strictly between l and r
+                int childTransition = src.get(position + m) & 0xFF;
+                int cmp = Integer.compare(key, childTransition);
+                if (cmp < 0)
+                    r = m;
+                else if (cmp > 0)
+                    l = m;
+                else
+                    return m;
+            }
+
+            return -r - 1; // not found: r is the insertion point
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            assert childIndex >= 0;
+            int range = transitionRange(src, position);
+            assert childIndex < range;
+            return -readBytes(src, position + 2 + range + bytesPerPointer * childIndex); // targets follow the transition bytes
+        }
+
+        @Override
+        public long greaterTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            if (searchIndex < 0)
+                searchIndex = -1 - searchIndex; // not found: the insertion point is the first greater child
+            else
+                ++searchIndex; // found: the next child is the first greater one
+            if (searchIndex >= transitionRange(src, position))
+                return defaultValue;
+            return transition(src, position, positionLong, searchIndex);
+        }
+
+        public long lesserTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {
+            if (searchIndex == 0 || searchIndex == -1) // found at, or would be inserted at, index 0: nothing is lower
+                return defaultValue;
+            if (searchIndex < 0)
+                searchIndex = -2 - searchIndex; // not found: the child just before the insertion point
+            else
+                --searchIndex; // found: the previous child
+            return transition(src, position, positionLong, searchIndex);
+        }
+
+        @Override
+        public int transitionByte(ByteBuffer src, int position, int childIndex)
+        {
+            return childIndex < transitionRange(src, position) ? src.get(position + 2 + childIndex) & 0xFF : Integer.MAX_VALUE;
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 2 + node.childCount() * (1 + bytesPerPointer); // two header bytes, then a byte and a target per child
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            int childCount = node.childCount();
+            assert childCount > 0;
+            assert childCount < 256; // the count is stored in a single byte
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // type ordinal in the high nibble, payload flags in the low
+            dest.writeByte(childCount);
+
+            for (int i = 0; i < childCount; ++i) // all transition bytes first...
+                dest.writeByte(node.transition(i));
+            for (int i = 0; i < childCount; ++i) // ...then all targets, in the same order
+                writeBytes(dest, -node.serializedPositionDelta(i, nodePosition));
+        }
+    };
+
+    static final Sparse12 SPARSE_12 = new Sparse12();
+    static class Sparse12 extends Sparse
+    {
+        // byte flags (type ordinal in the high nibble, payload flags in the low nibble)
+        // byte count (<= 255)
+        // count bytes transitions
+        // count 12-bits transition targets (two targets per three bytes, an odd last one padded to two bytes)
+        // var payload
+        Sparse12()
+        {
+            super(FRACTIONAL_BYTES);
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            int count = transitionRange(src, position);
+            return position + 2 + (5 * count + 1) / 2; // count transition bytes + 1.5 * count target bytes, rounded up
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -read12Bits(src, position + 2 + transitionRange(src, position), childIndex); // targets follow the transition bytes
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            return 2 + (node.childCount() * 5 + 1) / 2; // same formula as payloadPosition
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            int childCount = node.childCount();
+            assert childCount < 256; // the count is stored in a single byte
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // type ordinal in the high nibble, payload flags in the low
+            dest.writeByte(childCount);
+
+            for (int i = 0; i < childCount; ++i)
+                dest.writeByte(node.transition(i));
+            int i;
+            for (i = 0; i + 2 <= childCount; i += 2) // pack two 12-bit distances into three bytes
+            {
+                int p0 = (int) -node.serializedPositionDelta(i, nodePosition);
+                int p1 = (int) -node.serializedPositionDelta(i + 1, nodePosition);
+                assert p0 > 0 && p0 < (1 << 12);
+                assert p1 > 0 && p1 < (1 << 12);
+                dest.writeByte(p0 >> 4); // top 8 bits of p0
+                dest.writeByte((p0 << 4) | (p1 >> 8)); // low 4 bits of p0, then top 4 bits of p1 (only the low byte is written)
+                dest.writeByte(p1); // low 8 bits of p1
+            }
+            if (i < childCount) // odd count: one distance is left over
+            {
+                long pd = -node.serializedPositionDelta(i, nodePosition);
+                assert pd > 0 && pd < (1 << 12);
+                dest.writeShort((short) (pd << 4)); // 12 bits of distance followed by 4 zero padding bits
+            }
+        }
+
+        @Override
+        boolean fits(long delta)
+        {
+            return 0 <= delta && delta <= 0xFFF; // delta is the positive distance here (typeFor passes -delta)
+        }
+    };
+
+    static final Dense DENSE_16 = new Dense(2);
+    static final Dense DENSE_24 = new Dense(3);
+    static final Dense DENSE_32 = new Dense(4);
+    static final Dense DENSE_40 = new Dense(5);
+    static class Dense extends TrieNode
+    {
+        // byte flags
+        // byte start
+        // byte length-1
+        // length ints transition targets (-1 for not present)
+        // var payload
+
+        static final int NULL_VALUE = 0;
+
+        Dense(int bytesPerPointer)
+        {
+            super(bytesPerPointer);
+        }
+
+        @Override
+        public int transitionRange(ByteBuffer src, int position)
+        {
+            return 1 + (src.get(position + 2) & 0xFF); // the byte at offset 2 holds (range - 1), read as unsigned
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 3 + transitionRange(src, position) * bytesPerPointer; // 3 header bytes, then one pointer slot per byte in the range
+        }
+
+        @Override
+        public int search(ByteBuffer src, int position, int transitionByte)
+        {   // Returns the slot index of transitionByte if it has a child, otherwise (-index - 1) of the slot where it would go.
+            int l = src.get(position + 1) & 0xFF; // first transition byte covered by this node
+            int i = transitionByte - l;
+            if (i < 0)
+                return -1; // below the covered range
+            int len = transitionRange(src, position);
+            if (i >= len)
+                return -len - 1; // above the covered range
+            // Test the stored delta itself: transition(..., 0L, i) yields -1 for a child exactly one byte back, which reads as "absent".
+            return transitionDelta(src, position, i) != NULL_VALUE ? i : -i - 1;
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -readBytes(src, position + 3 + childIndex * bytesPerPointer); // the stored value is negated to obtain the delta
+        }
+
+        @Override
+        public long transition(ByteBuffer src, int position, long positionLong, int childIndex)
+        {
+            long v = transitionDelta(src, position, childIndex);
+            return v != NULL_VALUE ? v + positionLong : -1; // the delta is added to positionLong; -1 signals an empty slot
+        }
+
+        @Override
+        public long greaterTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {   // Returns the first existing transition after the searched one, or defaultValue if there is none.
+            if (searchIndex < 0)
+                searchIndex = -1 - searchIndex; // decode a miss from search(): start at the slot it encodes
+            else
+                ++searchIndex; // exact match: start right after the matched slot
+            int len = transitionRange(src, position);
+            for (; searchIndex < len; ++searchIndex)
+            {
+                long t = transition(src, position, positionLong, searchIndex);
+                if (t != -1) // skip empty slots
+                    return t;
+            }
+            return defaultValue;
+        }
+
+        @Override
+        public long lesserTransition(ByteBuffer src, int position, long positionLong, int searchIndex, long defaultValue)
+        {   // Returns the last existing transition before the searched one, or defaultValue if there is none.
+            if (searchIndex == 0 || searchIndex == -1)
+                return defaultValue; // matched slot 0, or missed below it: nothing smaller in this node
+
+            if (searchIndex < 0)
+                searchIndex = -2 - searchIndex; // decode a miss from search() and step to the slot before it
+            else
+                --searchIndex; // exact match: start right before the matched slot
+            for (; searchIndex >= 0; --searchIndex)
+            {
+                long t = transition(src, position, positionLong, searchIndex);
+                if (t != -1) // skip empty slots
+                    return t;
+            }
+            assert false : "transition must always exist at 0 and we should not be called for less of that";
+            return defaultValue;
+        }
+
+        @Override
+        public int transitionByte(ByteBuffer src, int position, int childIndex)
+        {
+            if (childIndex >= transitionRange(src, position))
+                return Integer.MAX_VALUE; // past the last slot
+            int l = src.get(position + 1) & 0xFF; // first transition byte covered by this node
+            return l + childIndex;
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            int l = node.transition(0);
+            int r = node.transition(node.childCount() - 1);
+            return 3 + (r - l + 1) * bytesPerPointer; // 3 header bytes plus one pointer slot for every byte in [l, r]
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            int childCount = node.childCount();
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // high nibble: node type ordinal; low nibble: payload flags
+            int l = node.transition(0);
+            int r = node.transition(childCount - 1);
+            assert 0 <= l && l <= r && r <= 255;
+            dest.writeByte(l);
+            dest.writeByte(r - l);      // r is included, i.e. this is len - 1
+
+            for (int i = 0; i < childCount; ++i)
+            {
+                int next = node.transition(i);
+                while (l < next) // fill the slots of bytes that have no child
+                {
+                    writeBytes(dest, NULL_VALUE);
+                    ++l;
+                }
+                writeBytes(dest, -node.serializedPositionDelta(i, nodePosition)); // the delta is negated before it is stored
+                ++l;
+            }
+        }
+    };
+
+    static final Dense12 DENSE_12 = new Dense12();
+    static class Dense12 extends Dense
+    {
+        // byte flags
+        // byte start
+        // byte length-1
+        // length 12-bits transition targets (NULL_VALUE, i.e. 0, for not present)
+        // var payload
+
+        Dense12()
+        {
+            super(FRACTIONAL_BYTES);
+        }
+
+        @Override
+        public int payloadPosition(ByteBuffer src, int position)
+        {
+            return position + 3 + (transitionRange(src, position) * 3 + 1) / 2; // 1.5 bytes per slot, rounded up to whole bytes
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -read12Bits(src, position + 3, childIndex); // the slots start right after the 3 header bytes
+        }
+
+        @Override
+        public int sizeofNode(SerializationNode<?> node)
+        {
+            int l = node.transition(0);
+            int r = node.transition(node.childCount() - 1);
+            return 3 + ((r - l + 1) * 3 + 1) / 2; // 3 header bytes plus 1.5 bytes per slot, rounded up
+        }
+
+        @Override
+        public void serialize(DataOutput dest, SerializationNode<?> node, int payloadBits, long nodePosition) throws IOException
+        {
+            int childCount = node.childCount();
+            dest.writeByte((ordinal << 4) + (payloadBits & 0x0F)); // high nibble: node type ordinal; low nibble: payload flags
+            int l = node.transition(0);
+            int r = node.transition(childCount - 1);
+            assert 0 <= l && l <= r && r <= 255;
+            dest.writeByte(l);
+            dest.writeByte(r - l);      // r is included, i.e. this is len - 1
+
+            int carry = 0; // bits of an even-indexed slot that have not been written yet
+            int start = l;
+            for (int i = 0; i < childCount; ++i)
+            {
+                int next = node.transition(i);
+                while (l < next) // fill the slots of bytes that have no child
+                {
+                    carry = write12Bits(dest, NULL_VALUE, l - start, carry);
+                    ++l;
+                }
+                long pd = node.serializedPositionDelta(i, nodePosition);
+                carry = write12Bits(dest, (int) -pd, l - start, carry);
+                ++l;
+            }
+            if (((l - start) & 1) == 1) // odd slot count: flush the pending low nibble, padded with zero bits
+                dest.writeByte(carry);
+        }
+
+        @Override
+        boolean fits(long delta)
+        {
+            return 0 <= delta && delta <= 0xFFF; // 0xFFF is the largest value that fits in 12 bits
+        }
+    };
+
+    static final LongDense LONG_DENSE = new LongDense();
+    static class LongDense extends Dense
+    {
+        // byte flags
+        // byte start
+        // byte length-1
+        // length long transition targets (NULL_VALUE, i.e. 0, for not present)
+        // var payload
+        LongDense()
+        {
+            super(8);
+        }
+
+        @Override
+        public long transitionDelta(ByteBuffer src, int position, int childIndex)
+        {
+            return -src.getLong(position + 3 + childIndex * 8); // full 8-byte slot, negated to obtain the delta
+        }
+
+        @Override
+        public void writeBytes(DataOutput dest, long ofs) throws IOException
+        {
+            dest.writeLong(ofs); // no fits() assertion here, unlike TrieNode.writeBytes
+        }
+
+        @Override
+        boolean fits(long delta)
+        {
+            return true; // this type accepts every delta
+        }
+    };
+
+
+    static int read12Bits(ByteBuffer src, int base, int searchIndex)
+    {   // Reads the searchIndex-th 12-bit value of a packed sequence that starts at base.
+        int word = src.getShort(base + (3 * searchIndex) / 2); // two bytes containing the whole value; assumes big-endian buffer order — TODO confirm
+        if ((searchIndex & 1) == 0)
+            word = (word >> 4); // even index: the value sits in the top 12 bits of the word
+        return word & 0xFFF; // odd index: the value sits in the low 12 bits; the mask also drops sign-extension bits
+    }
+
+    static int write12Bits(DataOutput dest, int value, int index, int carry) throws IOException
+    {   // Appends one 12-bit value to a packed sequence and returns the carry to pass to the next call.
+        assert 0 <= value && value <= 0xFFF;
+        if ((index & 1) == 0)
+        {
+            dest.writeByte(value >> 4); // top 8 bits are written now
+            return value << 4; // the low 4 bits are carried; a later writeByte keeps only the low 8 bits of this
+        }
+        else
+        {
+            dest.writeByte(carry | (value >> 8)); // pending nibble of the previous value, then the top 4 bits of this one
+            dest.writeByte(value); // low 8 bits
+            return 0;
+        }
+    }
+
+    long readBytes(ByteBuffer src, int position)
+    {
+        return SizedInts.readUnsigned(src, position, bytesPerPointer); // reads a value bytesPerPointer bytes wide (unsigned, going by the helper's name)
+    }
+
+    void writeBytes(DataOutput dest, long ofs) throws IOException
+    {
+        assert fits(ofs); // the value must be accepted by this node type's fits()
+        SizedInts.write(dest, ofs, bytesPerPointer);
+    }
+
+    boolean fits(long delta)
+    {
+        return 0 <= delta && delta < (1L << (bytesPerPointer * 8)); // NOTE(review): the shift wraps when bytesPerPointer is 8; LongDense overrides fits()
+    }
+
+    @Override
+    public String toString()
+    {
+        String res = getClass().getSimpleName();
+        if (bytesPerPointer >= 1)
+            res += (bytesPerPointer * 8); // append the pointer width in bits
+        return res;
+    }
+
+    public static Object nodeTypeString(int ordinal)
+    {
+        return values[ordinal].toString(); // ordinal indexes the values table; the object returned is a String
+    }
+
+    static final TrieNode[] values = new TrieNode[] { PAYLOAD_ONLY,
+                                                      SINGLE_NOPAYLOAD_4, SINGLE_8, SINGLE_NOPAYLOAD_12, SINGLE_16,
+                                                      SPARSE_8, SPARSE_12, SPARSE_16, SPARSE_24, SPARSE_40,
+                                                      DENSE_12, DENSE_16, DENSE_24, DENSE_32, DENSE_40,
+                                                      LONG_DENSE}; // Catch-all
+    // We can't fit all types * all sizes in 4 bits, so we use a selection. When we don't have a matching instance
+    // we just use something more general that can do its job.
+    // The arrays below must have corresponding types for all sizes specified by the singles row.
+    // Note: 12 bit sizes are important, because that size will fit any pointer within a page-packed branch.
+    static final TrieNode[] singles = new TrieNode[]{ SINGLE_NOPAYLOAD_4, SINGLE_8, SINGLE_NOPAYLOAD_12, SINGLE_16, DENSE_24, DENSE_32, DENSE_40, LONG_DENSE };
+    static final TrieNode[] sparses = new TrieNode[]{ SPARSE_8, SPARSE_8, SPARSE_12, SPARSE_16, SPARSE_24, SPARSE_40, SPARSE_40, LONG_DENSE };
+    static final TrieNode[] denses = new TrieNode[]{ DENSE_12, DENSE_12, DENSE_12, DENSE_16, DENSE_24, DENSE_32, DENSE_40, LONG_DENSE };
+    static
+    {
+        //noinspection ConstantConditions
+        assert sparses.length == singles.length && denses.length == singles.length && values.length <= 16; // ordinals are written into a 4-bit nibble
+        for (int i = 0; i < values.length; ++i)
+            values[i].ordinal = i; // an instance's ordinal is its index in values
+    }
+
+    public static final ByteBuffer EMPTY = ByteBuffer.wrap(new byte[] { (byte) (PAYLOAD_ONLY.ordinal << 4) } );
+}
diff --git a/src/java/org/apache/cassandra/io/tries/TrieSerializer.java b/src/java/org/apache/cassandra/io/tries/TrieSerializer.java
new file mode 100644
index 000000000000..e0010cd0d6f0
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/TrieSerializer.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+
+public interface TrieSerializer<VALUE, DEST>
+{
+    int sizeofNode(SerializationNode<VALUE> node, long nodePosition); // presumably the serialized size of node when placed at nodePosition — confirm against implementations
+
+    // Writes node to dest. Only called after all children's serializedPositions have been set.
+    void write(DEST dest, SerializationNode<VALUE> node, long nodePosition) throws IOException;
+}
diff --git a/src/java/org/apache/cassandra/io/tries/ValueIterator.java b/src/java/org/apache/cassandra/io/tries/ValueIterator.java
new file mode 100644
index 000000000000..2c25cf38f8b6
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/ValueIterator.java
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Thread-unsafe value iterator for on-disk tries. Uses the assumptions of Walker.
+ */
+public class ValueIterator<CONCRETE extends ValueIterator<CONCRETE>> extends Walker<CONCRETE>
+{
+    private final ByteSource limit;
+    private IterationPosition stack;
+    private long next;
+
+    static class IterationPosition
+    {
+        long node; // position of the node this stack entry refers to
+        int childIndex; // index of the child most recently taken from this node; -1 before the first one
+        int limit; // largest transition byte that may be followed from this node; 256 when unbounded
+        IterationPosition prev; // entry of the parent node; null at the bottom of the stack
+
+        IterationPosition(long node, int childIndex, int limit, IterationPosition prev)
+        {
+            super();
+            this.node = node;
+            this.childIndex = childIndex;
+            this.limit = limit;
+            this.prev = prev;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[Node %d, child %d, limit %d]", node, childIndex, limit);
+        }
+    }
+
+    protected ValueIterator(Rebufferer source, long root)
+    {
+        super(source, root);
+        limit = null; // no right bound
+        initializeNoLeftBound(root, 256); // 256 is used throughout this class as the "unbounded" limit byte
+    }
+
+    protected ValueIterator(Rebufferer source, long root, ByteComparable start, ByteComparable end, boolean admitPrefix)
+    {
+        super(source, root);
+        limit = end != null ? end.asComparableBytes(BYTE_COMPARABLE_VERSION) : null; // a null end means no right bound
+
+        if (start != null)
+            initializeWithLeftBound(root, start.asComparableBytes(BYTE_COMPARABLE_VERSION), admitPrefix, limit != null);
+        else
+            initializeNoLeftBound(root, limit != null ? limit.next() : 256); // the first byte of the limit bounds the root
+    }
+
+    private void initializeWithLeftBound(long root, ByteSource startStream, boolean admitPrefix, boolean atLimit)
+    {
+        IterationPosition prev = null;
+        int childIndex;
+        int limitByte;
+        long payloadedNode = -1; // candidate prefix match for start; -1 when there is none
+
+        try
+        {
+            // Follow start position while we still have a prefix, stacking path and saving prefixes.
+            go(root);
+            while (true)
+            {
+                int s = startStream.next();
+                childIndex = search(s);
+
+                // For a separator trie the latest payload met along the prefix is a potential match for start
+                if (admitPrefix)
+                {
+                    if (childIndex == 0 || childIndex == -1)
+                    {
+                        if (payloadFlags() != 0)
+                            payloadedNode = position;
+                    }
+                    else
+                    {
+                        payloadedNode = -1;
+                    }
+                }
+
+                limitByte = 256;
+                if (atLimit) // the limit is still being consumed in step with start
+                {
+                    limitByte = limit.next();
+                    if (s < limitByte)
+                        atLimit = false; // start dropped below the limit; deeper levels get the unbounded 256
+                }
+                if (childIndex < 0) // no transition for this byte: the start path ends at the current node
+                    break;
+
+                prev = new IterationPosition(position, childIndex, limitByte, prev);
+                go(transition(childIndex));
+            }
+
+            childIndex = -1 - childIndex - 1; // decode the miss and step back one, so advanceNode()'s "+ 1" resumes at the decoded slot
+            stack = new IterationPosition(position, childIndex, limitByte, prev);
+
+            // Advancing now gives us first match if we didn't find one already.
+            if (payloadedNode != -1)
+                next = payloadedNode;
+            else
+                next = advanceNode();
+        }
+        catch (Throwable t)
+        {
+            super.close(); // release the buffer and close the reader before propagating the failure
+            throw t;
+        }
+    }
+
+    private void initializeNoLeftBound(long root, int limitByte)
+    {
+        stack = new IterationPosition(root, -1, limitByte, null); // -1: no child of the root has been taken yet
+
+        try
+        {
+            go(root);
+            if (payloadFlags() != 0)
+                next = root; // the root itself carries a payload, so it is the first result
+            else
+                next = advanceNode();
+        }
+        catch (Throwable t)
+        {
+            super.close(); // release the buffer and close the reader before propagating the failure
+            throw t;
+        }
+    }
+
+    /**
+     * Returns the position of the next payload node, or -1 once the iteration is exhausted, and pre-computes the one after it.
+     *
+     * This method must be async-read-safe, see {@link #advanceNode()}.
+     */
+    protected long nextPayloadedNode()
+    {
+        long toReturn = next;
+        if (next != -1)
+            next = advanceNode();
+        return toReturn;
+    }
+
+    /**
+     * Advances to the next node that carries a payload and returns its position, or -1 once the trie is exhausted.
+     * This method must be async-read-safe. Every read from new buffering position (the go() calls) can trigger
+     * NotInCacheException, and iteration must be able to redo the work that was interrupted during the next call. Hence the mutable
+     * state must be fully ready before all go() calls (i.e. they must either be the last step in the loop or the state must be unchanged until that call has succeeded).
+     */
+    private long advanceNode()
+    {
+        long child;
+        int transitionByte;
+
+        go(stack.node); // can throw NotInCacheException, OK no state modified yet
+        while (true)
+        {
+            // advance position in node but don't change the stack just yet due to NotInCacheExceptions
+            int childIndex = stack.childIndex + 1;
+            transitionByte = transitionByte(childIndex);
+
+            if (transitionByte > stack.limit) // beyond this node's limit (Dense reports Integer.MAX_VALUE past its last slot)
+            {
+                // ascend
+                stack = stack.prev;
+                if (stack == null)        // exhausted whole trie
+                    return -1;
+                go(stack.node); // can throw NotInCacheException, OK - stack ready to re-enter loop with parent
+                continue;
+            }
+
+            child = transition(childIndex);
+
+            if (child != -1)
+            {
+                assert child >= 0 : String.format("Expected value >= 0 but got %d - %s", child, this);
+
+                // descend
+                go(child); // can throw NotInCacheException, OK - stack not yet changed, limit not yet incremented
+
+                int l = 256;
+                if (transitionByte == stack.limit) // still on the limit's path: the child is bounded by the limit's next byte
+                    l = limit.next();
+
+                stack.childIndex = childIndex;
+                stack = new IterationPosition(child, -1, l, stack);
+
+                if (payloadFlags() != 0)
+                    return child;
+            }
+            else
+            {
+                stack.childIndex = childIndex; // empty slot: just move past it
+            }
+        }
+
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/tries/Walker.java b/src/java/org/apache/cassandra/io/tries/Walker.java
new file mode 100644
index 000000000000..a7e4a40d3148
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/Walker.java
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.PrintStream;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.io.util.Rebufferer.BufferHolder;
+import org.apache.cassandra.utils.PageAware;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Thread-unsafe trie walking helper. This is analogous to RandomAccessReader for tries -- takes an on-disk trie
+ * accessible via a supplied Rebufferer and lets user seek to nodes and work with them.
+ * <p>
+ * Assumes data was written using page-aware builder and thus no node crosses a page and thus a buffer boundary.
+ */
+public class Walker<VALUE extends Walker<VALUE>> implements AutoCloseable
+{
+    private final Rebufferer source;
+    protected final long root;
+
+    // State relating to current node.
+    private BufferHolder bh;    // from Rebufferer
+    private int offset;         // offset of current node within buf
+    protected TrieNode nodeType;  // type of current node
+    protected ByteBuffer buf;   // buffer containing the data
+    protected long position;    // file position of current node
+
+    // State relating to searches.
+    protected long greaterBranch;
+    protected long lesserBranch;
+
+    // Version of the byte comparable conversion to use -- trie-based indices use the 6.0 conversion (Version.OSS41)
+    public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS41;
+
+    /**
+     * Creates a walker. Rebufferer must be aligned and with a buffer size that is at least 4k.
+     */
+    public Walker(Rebufferer source, long root)
+    {
+        this.source = source;
+        this.root = root;
+        try
+        {
+            bh = source.rebuffer(root); // start with the buffer requested for the root position
+            buf = bh.buffer();
+        }
+        catch (RuntimeException ex)
+        {
+            if (bh != null) bh.release(); // rebuffer() succeeded but buffer() failed
+            source.closeReader(); // construction failed, so the caller gets no walker to close()
+            throw ex;
+        }
+    }
+
+    public void close() // implements AutoCloseable.close()
+    {
+        bh.release(); // release the buffer currently held
+        source.closeReader();
+    }
+
+    protected final void go(long position) // makes the node at the given file position the current node
+    {
+        long curOffset = position - bh.offset();
+        if (curOffset < 0 || curOffset >= buf.limit()) // the target lies outside the buffer currently held
+        {
+            BufferHolder currentBh = bh;
+            bh = source.rebuffer(position); // the old buffer is released only after this call has returned
+            currentBh.release();
+            buf = bh.buffer();
+            curOffset = position - bh.offset();
+            assert curOffset >= 0 && curOffset < buf.limit() : String.format("Invalid offset: %d, buf: %s, bh: %s", curOffset, buf, bh);
+        }
+        this.offset = (int) curOffset;
+        this.position = position;
+        nodeType = TrieNode.at(buf, (int) curOffset);
+    }
+
+    protected final int payloadFlags()
+    {
+        return nodeType.payloadFlags(buf, offset); // flags of the current node; callers in this class treat 0 as "no payload"
+    }
+
+    protected final int payloadPosition()
+    {
+        return nodeType.payloadPosition(buf, offset); // offset within buf at which the current node's payload starts
+    }
+
+    protected final int search(int transitionByte)
+    {
+        return nodeType.search(buf, offset, transitionByte); // child index for transitionByte in the current node; negative when there is none
+    }
+
+    protected final long transition(int childIndex)
+    {
+        return nodeType.transition(buf, offset, position, childIndex); // file position of the given child; callers treat -1 as "no child"
+    }
+
+    protected final long lastTransition()
+    {
+        return nodeType.lastTransition(buf, offset, position); // delegates to the node type; goMax() treats -1 as "no children"
+    }
+
+    protected final long greaterTransition(int searchIndex, long defaultValue)
+    {
+        return nodeType.greaterTransition(buf, offset, position, searchIndex, defaultValue); // next transition after searchIndex in the current node, else defaultValue
+    }
+
+    protected final long lesserTransition(int searchIndex, long defaultValue)
+    {
+        return nodeType.lesserTransition(buf, offset, position, searchIndex, defaultValue); // previous transition before searchIndex in the current node, else defaultValue
+    }
+
+    protected final int transitionByte(int childIndex)
+    {
+        return nodeType.transitionByte(buf, offset, childIndex); // byte value that leads to the given child of the current node
+    }
+
+    protected final int transitionRange()
+    {
+        return nodeType.transitionRange(buf, offset); // number of child slots of the current node
+    }
+
+    protected final boolean hasChildren()
+    {
+        return transitionRange() > 0; // true when the current node has at least one child slot
+    }
+
+    protected final void goMax(long pos) // descends from pos along the last transition of each node until a node has none
+    {
+        go(pos);
+        while (true)
+        {
+            long lastChild = lastTransition();
+            if (lastChild == -1) // no further transition: stay on this node
+                return;
+            go(lastChild);
+        }
+    }
+
+    protected final void goMin(long pos) // descends from pos along child 0 of each node until a node with a payload is reached
+    {
+        go(pos);
+        while (true)
+        {
+            int payloadBits = payloadFlags();
+            if (payloadBits > 0)
+                return;
+
+            go(transition(0)); // NOTE(review): assumes every node without a payload has a child at index 0 — confirm
+        }
+    }
+
+    public interface Extractor<RESULT, VALUE> // callback that turns the payload of the walker's current node into a RESULT
+    {
+        RESULT extract(VALUE walker, int payloadPosition, int payloadFlags); // invoked with the walker itself, payloadPosition() and payloadFlags()
+    }
+
+    /**
+     * Follows the given key while there are transitions in the trie for it.
+     *
+     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
+     */
+    public int follow(ByteComparable key)
+    {
+        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        go(root);
+        while (true)
+        {
+            int b = stream.next();
+            int childIndex = search(b);
+
+            if (childIndex < 0) // no transition for b: the walker stays on the last matched node
+                return b;
+
+            go(transition(childIndex));
+        }
+    }
+
+    /**
+     * Follows the trie for a given key, remembering the closest greater branch.
+     * On return the walker is positioned at the longest prefix that matches the input (with or without payload), and
+     * min(greaterBranch) is the immediate greater neighbour.
+     *
+     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
+     */
+    public int followWithGreater(ByteComparable key)
+    {
+        greaterBranch = -1; // -1: no greater branch seen yet
+
+        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        go(root);
+        while (true)
+        {
+            int b = stream.next();
+            int searchIndex = search(b);
+
+            greaterBranch = greaterTransition(searchIndex, greaterBranch); // the previous value is passed as the default, so it is kept if this node has nothing greater
+            if (searchIndex < 0)
+                return b;
+
+            go(transition(searchIndex));
+        }
+    }
+
+    /**
+     * Follows the trie for a given key, remembering the closest lesser branch.
+     * On return the walker is positioned at the longest prefix that matches the input (with or without payload), and
+     * max(lesserBranch) is the immediate lesser neighbour.
+     *
+     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
+     */
+    public int followWithLesser(ByteComparable key)
+    {
+        lesserBranch = -1; // -1: no lesser branch seen yet
+
+        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        go(root);
+        while (true)
+        {
+            int b = stream.next();
+            int searchIndex = search(b);
+
+            lesserBranch = lesserTransition(searchIndex, lesserBranch); // the previous value is passed as the default, so it is kept if this node has nothing lesser
+
+            if (searchIndex < 0)
+                return b;
+
+            go(transition(searchIndex));
+        }
+    }
+
+
+    /**
+     * Takes a prefix of the given key. The prefix is in the sense of a separator key match, i.e. it is only
+     * understood as valid if there are no greater entries in the trie (e.g. data at 'a' is ignored if 'ab' or 'abba'
+     * is in the trie when looking for 'abc' or 'ac', but accepted when looking for 'aa').
+     * In order to not have to go back to data that may have exited cache, payloads are extracted when the node is
+     * visited (instead of saving the node's position), which requires an extractor to be passed as parameter.
+     */
+    @SuppressWarnings("unchecked")
+    public <RESULT> RESULT prefix(ByteComparable key, Extractor<RESULT, VALUE> extractor)
+    {
+        RESULT payload = null;
+
+        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        go(root);
+        while (true)
+        {
+            int b = stream.next();
+            int childIndex = search(b);
+
+            if (childIndex > 0)
+                payload = null; // matched a child other than the first: any payload seen so far is dropped
+            else
+            {   // NOTE(review): misses past the first child (childIndex < -1) also take this branch and keep or extract a payload; the 'ac' example above suggests they should not — confirm
+                int payloadBits = payloadFlags();
+                if (payloadBits > 0)
+                    payload = extractor.extract((VALUE) this, payloadPosition(), payloadBits);
+                if (childIndex < 0) // no transition for b: return the latest payload kept, possibly null
+                    return payload;
+            }
+
+            go(transition(childIndex));
+        }
+    }
+
+    /**
+     * Follows the trie for a given key, taking a prefix (in the sense above) and searching for neighboring values.
+     * On return min(greaterBranch) and max(lesserBranch) are the immediate non-prefix neighbours for the sought value.
+     * <p>
+     * Note: in a separator trie the closest smaller neighbour can be another prefix of the given key. This method
+     * does not take that into account. E.g. if trie contains "abba", "as" and "ask", looking for "asking" will find
+     * "ask" as the match, but max(lesserBranch) will point to "abba" instead of the correct "as". This problem can
+     * only occur if there is a valid prefix match.
+     */
+    @SuppressWarnings("unchecked")
+    public <RESULT> RESULT prefixAndNeighbours(ByteComparable key, Extractor<RESULT, VALUE> extractor)
+    {
+        RESULT payload = null;
+        greaterBranch = -1; // -1: no greater branch seen yet
+        lesserBranch = -1; // -1: no lesser branch seen yet
+
+        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        go(root);
+        while (true)
+        {
+            int b = stream.next();
+            int searchIndex = search(b);
+            payload = null; // only a payload found at the current node can be returned
+
+            greaterBranch = greaterTransition(searchIndex, greaterBranch);
+            lesserBranch = lesserTransition(searchIndex, lesserBranch);
+
+            if (searchIndex == -1 || searchIndex == 0) // b matches the first child or sorts before every child
+            {
+                int payloadBits = payloadFlags();
+                if (payloadBits > 0)
+                    payload = extractor.extract((VALUE) this, payloadPosition(), payloadBits);
+            }
+
+            if (searchIndex < 0)
+                return payload;
+
+            go(transition(searchIndex));
+        }
+    }
+
+    /**
+     * To be used only in analysis. Returns the ordinal of the current node's type.
+     */
+    protected int nodeTypeOrdinal()
+    {
+        return nodeType.ordinal;
+    }
+
+    /**
+     * To be used only in analysis. Returns the distance from the start of the current node to the start of its payload.
+     */
+    protected int nodeSize()
+    {
+        return payloadPosition() - offset;
+    }
+
+    public interface PayloadToString // renders a node's payload as text; used by dumpTrie
+    {
+        String payloadAsString(ByteBuffer buf, int payloadPos, int payloadFlags); // dumpTrie passes the current buffer, payloadPosition() and payloadFlags()
+    }
+
+    public void dumpTrie(PrintStream out, PayloadToString payloadReader) // prints the whole trie to out, starting from the root
+    {
+        out.print("ROOT");
+        dumpTrie(out, payloadReader, root, "");
+    }
+
+    private void dumpTrie(PrintStream out, PayloadToString payloadReader, long node, String indent)
+    {   // Prints the node at the given position, then recurses into each of its existing children.
+        go(node);
+        int bits = payloadFlags();
+        out.format(" %s@%x %s%n", nodeType.toString(), node, bits == 0 ? "" : payloadReader.payloadAsString(buf, payloadPosition(), bits));
+        int range = transitionRange();
+        for (int i = 0; i < range; ++i)
+        {
+            long child = transition(i);
+            if (child == -1) // empty slot
+                continue;
+            out.format("%s%02x %s>", indent, transitionByte(i), PageAware.pageStart(position) == PageAware.pageStart(child) ? "--" : "=="); // "--" when the child has the same page start as this node, "==" otherwise
+            dumpTrie(out, payloadReader, child, indent + "  ");
+            go(node); // the recursive call moved the walker; return to this node before reading the next slot
+        }
+    }
+
+    @Override
+    public String toString() // describes the walker's current node type, source, buffer and positions
+    {
+        return String.format("[Trie Walker - NodeType: %s, source: %s, buffer: %s, buffer file offset: %d, Node buffer offset: %d, Node file position: %d]",
+                             nodeType, source, buf, bh.offset(), offset, position);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
index b94d097b6031..f24092b0cd4f 100644
--- a/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
+++ b/src/java/org/apache/cassandra/io/util/DataOutputPlus.java
@@ -77,4 +77,41 @@ default boolean hasPosition()
     {
         return false;
     }
+
+    // The methods below support page-aware layout for writing. These would only be implemented if position() is
+    // also supported.
+
+    /**
+     * Returns the maximum number of bytes a page can hold. Unsupported unless overridden.
+     */
+    default int maxBytesInPage()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Pad this page with 0s to move on to the next. Unsupported unless overridden.
+     * @throws IOException if an implementation fails to write the padding
+     */
+    default void padToPageBoundary() throws IOException
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Returns how many bytes are left in the current page. Unsupported unless overridden.
+     */
+    default int bytesLeftInPage()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Returns the next padded position. This is either the current position (if already padded), or the start of the
+     * next page. Unsupported unless overridden.
+     */
+    default long paddedPosition()
+    {
+        throw new UnsupportedOperationException();
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/util/SequentialWriter.java b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
index a17135621786..daab46e6588e 100644
--- a/src/java/org/apache/cassandra/io/util/SequentialWriter.java
+++ b/src/java/org/apache/cassandra/io/util/SequentialWriter.java
@@ -25,6 +25,7 @@
 
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.utils.PageAware;
 import org.apache.cassandra.utils.SyncUtil;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
@@ -84,16 +85,19 @@ protected Throwable doPreCleanup(Throwable accumulate)
             return accumulate;
         }
 
+        @Override
         protected void doPrepare()
         {
             syncInternal();
         }
 
+        @Override
         protected Throwable doCommit(Throwable accumulate)
         {
             return accumulate;
         }
 
+        @Override
         protected Throwable doAbort(Throwable accumulate)
         {
             return accumulate;
@@ -137,7 +141,7 @@ private static FileChannel openChannel(File file)
      */
     public SequentialWriter(File file)
     {
-       this(file, SequentialWriterOption.DEFAULT);
+        this(file, SequentialWriterOption.DEFAULT);
     }
 
     /**
@@ -168,7 +172,7 @@ public SequentialWriter(File file, SequentialWriterOption option, boolean strict
         this.option = option;
     }
 
-    public void skipBytes(int numBytes) throws IOException
+    public void skipBytes(long numBytes) throws IOException
     {
         flush();
         fchannel.position(fchannel.position() + numBytes);
@@ -251,16 +255,46 @@ protected void flushData()
             runPostFlush.run();
     }
 
+    @Override
     public boolean hasPosition()
     {
         return true;
     }
 
+    @Override
     public long position()
     {
         return current();
     }
 
+    // Page management using on-disk pages
+
+    @Override
+    public int maxBytesInPage()
+    {
+        return PageAware.PAGE_SIZE;
+    }
+
+    @Override
+    public void padToPageBoundary() throws IOException
+    {
+        PageAware.pad(this);
+    }
+
+    @Override
+    public int bytesLeftInPage()
+    {
+        long current = position();
+        long pageEnd = PageAware.pageLimit(current);
+        return (int) (pageEnd - current);
+    }
+
+    @Override
+    public long paddedPosition()
+    {
+        return PageAware.padded(position());
+    }
+
     /**
      * Returns the current file pointer of the underlying on-disk file.
      * Note that since write works by buffering data, the value of this will increase by buffer
@@ -374,11 +408,13 @@ public boolean isOpen()
         return channel.isOpen();
     }
 
+    @Override
     public final void prepareToCommit()
     {
         txnProxy.prepareToCommit();
     }
 
+    @Override
     public final Throwable commit(Throwable accumulate)
     {
         return txnProxy.commit(accumulate);
@@ -391,6 +427,7 @@ public final Throwable commit(Throwable accumulate)
      * This is thread-unsafe, releasing and cleaning the buffer while it is being written can have disastrous
      * consequences (e.g. SIGSEGV).
      */
+    @Override
     public final Throwable abort(Throwable accumulate)
     {
         return txnProxy.abort(accumulate);
diff --git a/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java b/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
new file mode 100644
index 000000000000..3578b2d9e859
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+
+/**
+ * Special rebufferer that replaces the tail of the file (from the specified cutoff point) with the given buffer.
+ */
+public class TailOverridingRebufferer extends WrappingRebufferer
+{
+    private final long cutoff;
+    private final ByteBuffer tail;
+
+    public TailOverridingRebufferer(Rebufferer source, long cutoff, ByteBuffer tail)
+    {
+        super(source);
+        this.cutoff = cutoff;
+        this.tail = tail;
+    }
+
+    @Override
+    public Rebufferer.BufferHolder rebuffer(long position)
+    {
+        // At or past the cutoff the data comes from a copy of the override buffer, not from the source.
+        if (position >= cutoff)
+            return newBufferHolder().initialize(null, tail.duplicate(), cutoff);
+
+        // Before the cutoff read from the source, but never expose bytes that lie beyond the cutoff.
+        WrappingBufferHolder holder = (WrappingBufferHolder) super.rebuffer(position);
+        long visibleBytes = cutoff - holder.offset();
+        if (holder.limit() > visibleBytes)
+            holder.limit((int) visibleBytes);
+
+        return holder;
+    }
+
+    @Override
+    public long fileLength()
+    {
+        return cutoff + tail.limit();
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s[+%d@%d]:%s", getClass().getSimpleName(), tail.limit(), cutoff, source.toString());
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java b/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
new file mode 100644
index 000000000000..965f1157892e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+import java.util.Deque;
+import java.util.concurrent.ConcurrentLinkedDeque;
+import javax.annotation.Nullable;
+
+public class WrappingRebufferer implements Rebufferer
+{
+    protected final Rebufferer source;
+    private final Deque<WrappingBufferHolder> buffers;
+
+    public WrappingRebufferer(Rebufferer source)
+    {
+        this.source = source;
+        this.buffers = new ConcurrentLinkedDeque<>();
+    }
+
+    @Override
+    public BufferHolder rebuffer(long position)
+    {
+        BufferHolder bufferHolder = source.rebuffer(position);
+        return newBufferHolder().initialize(bufferHolder, bufferHolder.buffer(), bufferHolder.offset());
+    }
+
+    protected WrappingBufferHolder newBufferHolder()
+    {
+        // Prefer a holder that was handed back to the pool; allocate only when the pool is empty.
+        WrappingBufferHolder pooled = buffers.pollFirst();
+        if (pooled != null)
+            return pooled;
+        return new WrappingBufferHolder();
+    }
+
+    @Override
+    public ChannelProxy channel()
+    {
+        return source.channel();
+    }
+
+    @Override
+    public long fileLength()
+    {
+        return source.fileLength();
+    }
+
+    @Override
+    public double getCrcCheckChance()
+    {
+        return source.getCrcCheckChance();
+    }
+
+    @Override
+    public void close()
+    {
+        source.close();
+    }
+
+    @Override
+    public void closeReader()
+    {
+        source.closeReader();
+    }
+
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s[]:%s", getClass().getSimpleName(), source.toString());
+    }
+
+    protected final class WrappingBufferHolder implements BufferHolder
+    {
+        @Nullable
+        private BufferHolder bufferHolder;
+
+        private ByteBuffer buffer;
+        private long offset;
+
+        protected WrappingBufferHolder initialize(@Nullable BufferHolder bufferHolder, ByteBuffer buffer, long offset)
+        {
+            assert this.bufferHolder == null && this.buffer == null && this.offset == 0L : "initialized before release";
+
+            this.bufferHolder = bufferHolder;
+            this.buffer = buffer;
+            this.offset = offset;
+
+            return this;
+        }
+
+        @Override
+        public ByteBuffer buffer()
+        {
+            return buffer;
+        }
+
+        @Override
+        public long offset()
+        {
+            return offset;
+        }
+
+
+        public int limit()
+        {
+            return buffer.limit();
+        }
+
+        public void limit(int limit)
+        {
+            this.buffer.limit(limit);
+        }
+
+        @Override
+        public void release()
+        {
+            assert buffer != null : "released twice";
+            // Release the wrapped holder, if any (it is null when initialize() was given no holder).
+            if (bufferHolder != null)
+            {
+                bufferHolder.release();
+                bufferHolder = null;
+            }
+            // Clear the state so the "initialized before release" assertion in initialize() holds on reuse.
+            buffer = null;
+            offset = 0L;
+            // Put this holder back into the pool that newBufferHolder() polls from.
+            buffers.offerFirst(this);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/PageAware.java b/src/java/org/apache/cassandra/utils/PageAware.java
new file mode 100644
index 000000000000..72527e9df315
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/PageAware.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.io.IOException;
+
+import org.apache.cassandra.io.util.DataOutputPlus;
+
+public final class PageAware
+{
+    public static final int PAGE_SIZE = 4096; // must be a power of two
+    public static final int PAGE_SIZE_SHIFT = Integer.numberOfTrailingZeros(PAGE_SIZE);
+
+    /**
+     * Calculate the end of the page identified by the given position.
+     * Equivalent to floor(dstPos / PAGE_SIZE + 1) * PAGE_SIZE.
+     * <p>
+     * When the argument is equal to the page boundary, returns the next page boundary. E.g. pageLimit(0) == PAGE_SIZE.
+     */
+    public static long pageLimit(long dstPos)
+    {
+        return (dstPos | (PAGE_SIZE - 1)) + 1;
+    }
+
+    /**
+     * Calculate the start of the page that contains the given position.
+     * Equivalent to floor(dstPos / PAGE_SIZE) * PAGE_SIZE.
+     */
+    public static long pageStart(long dstPos)
+    {
+        return dstPos & -PAGE_SIZE;
+    }
+
+    /**
+     * Calculate the earliest page boundary for the given position.
+     * Equivalent to ceil(dstPos / PAGE_SIZE) * PAGE_SIZE.
+     * <p>
+     * When the argument is equal to a page boundary, returns the argument.
+     */
+    public static long padded(long dstPos)
+    {
+        return pageStart(dstPos + PAGE_SIZE - 1);
+    }
+
+    /**
+     * Calculate the number of pages that fit in the given size, rounded up to a page if the size is not an exact multiple.
+     * The rounding is done in {@code long} arithmetic so sizes close to {@link Integer#MAX_VALUE} do not overflow.
+     * @param size the size that needs to cover a number of pages
+     * @return the number of pages, rounded up
+     */
+    public static int numPages(int size)
+    {
+        return (int) (((long) size + PAGE_SIZE - 1) >> PAGE_SIZE_SHIFT);
+    }
+
+    /**
+     * Given a position relative to the start of a number of pages, determine the exact page number this
+     * position falls into. For example, positions from 0 to {@link #PAGE_SIZE} -1 will fall into page zero
+     * and so forth.
+     *
+     * @param dstPos the position
+     * @return the page number, indexed at zero
+     */
+    public static int pageNum(long dstPos)
+    {
+        return Math.toIntExact(dstPos >> PAGE_SIZE_SHIFT);
+    }
+
+    /**
+     * Zero-fill the given output from its current position up to the next page boundary.
+     * If the destination is already positioned on a page boundary, zero bytes are written.
+     */
+    public static void pad(DataOutputPlus dest) throws IOException
+    {
+        long current = dest.position();
+        int padding = (int) (padded(current) - current);
+        dest.write(EmptyPage.EMPTY_PAGE, 0, padding);
+    }
+
+    static class EmptyPage
+    {
+        static final byte[] EMPTY_PAGE = new byte[PAGE_SIZE];
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/SizedInts.java b/src/java/org/apache/cassandra/utils/SizedInts.java
new file mode 100644
index 000000000000..e65390ddf266
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/SizedInts.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Utility class for sizing, writing and reading ints with length stored separately.
+ * Used for trie payloads.
+ */
+public class SizedInts
+{
+    /**
+     * Returns the number of bytes needed to store the given value as a signed integer.
+     * Zero is treated as needing 1 byte, so the result is always between 1 and 8.
+     *
+     * If your use case permits 0 to be encoded in 0 length, use {@link #sizeAllowingZero} below.
+     */
+    public static int nonZeroSize(long value)
+    {
+        // A negative value needs as many bits as its bitwise complement, which is non-negative.
+        long magnitude = value < 0 ? ~value : value;
+        int significantBits = 64 - Long.numberOfLeadingZeros(magnitude);   // 0 <= significantBits <= 63
+        return (significantBits + 1 + 7) / 8;   // +1 for sign, rounded up. At least 1, at most 8.
+    }
+
+    /**
+     * Returns the number of bytes we need to store the given value. Returns 0 for 0 argument.
+     */
+    public static int sizeAllowingZero(long value)
+    {
+        if (value == 0)
+            return 0;
+        return nonZeroSize(value);
+    }
+
+    public static long read(ByteBuffer src, int startPos, int bytes)
+    {
+        switch (bytes)
+        {
+        case 0:
+            return 0;
+        case 1:
+            return src.get(startPos);
+        case 2:
+            return src.getShort(startPos);
+        case 3:
+        {
+            long high = src.get(startPos);
+            return (high << 16L) | (src.getShort(startPos + 1) & 0xFFFFL);
+        }
+        case 4:
+            return src.getInt(startPos);
+        case 5:
+        {
+            long high = src.get(startPos);
+            return (high << 32L) | (src.getInt(startPos + 1) & 0xFFFFFFFFL);
+        }
+        case 6:
+        {
+            long high = src.getShort(startPos);
+            return (high << 32L) | (src.getInt(startPos + 2) & 0xFFFFFFFFL);
+        }
+        case 7:
+        {
+            long high = src.get(startPos);
+            high = (high << 16L) | (src.getShort(startPos + 1) & 0xFFFFL);
+            return (high << 32L) | (src.getInt(startPos + 3) & 0xFFFFFFFFL);
+        }
+        case 8:
+            return src.getLong(startPos);
+        default:
+            throw new AssertionError();
+        }
+    }
+
+    public static long readUnsigned(ByteBuffer src, int startPos, int bytes)
+    {
+        long signed = read(src, startPos, bytes);
+        if (bytes == 8)
+            return signed;
+        // Drop the sign-extension bits above the stored width.
+        return signed & ((1L << (bytes * 8)) - 1);
+    }
+
+    public static void write(DataOutput dest, long value, int size) throws IOException
+    {
+        for (int shift = (size - 1) * 8; shift >= 0; shift -= 8)
+            dest.writeByte((int) (value >> shift));
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/concurrent/LightweightRecycler.java b/src/java/org/apache/cassandra/utils/concurrent/LightweightRecycler.java
new file mode 100644
index 000000000000..844a6f0c1d5d
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/LightweightRecycler.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils.concurrent;
+
+import java.util.ArrayDeque;
+import java.util.Collection;
+import java.util.Objects;
+import java.util.function.Supplier;
+
+
+interface LightweightRecyclerPoolHolder<T>
+{
+    ArrayDeque<T> get();
+}
+
+/**
+ * A simple thread local object reuse facility with limited capacity and no attempt at rebalancing pooling between
+ * threads. This is meant to be put in place where churn is high, but single object allocation and footprint are not
+ * so high to justify a more sophisticated approach.
+ * <p>
+ * <b>Internal use only</b>
+ *
+ * @param <T> the type of the pooled objects
+ * @see ThreadLocals#createLightweightRecycler(int)
+ */
+public interface LightweightRecycler<T> extends LightweightRecyclerPoolHolder<T>
+{
+    /**
+     * @return a reusable instance, or null if none is available
+     */
+    default T reuse()
+    {
+        return get().pollFirst();
+    }
+
+    /**
+     * @param supplier used to allocate a new instance when none is available for reuse
+     * @return a reusable instance, or allocate one via the provided supplier
+     */
+    default T reuseOrAllocate(Supplier<T> supplier)
+    {
+        final T reuse = reuse();
+        return reuse != null ? reuse : supplier.get();
+    }
+
+    /**
+     * @param t to be recycled (must not be null); if t is a collection it will be cleared before recycling, but not
+     *          cleared if not recycled
+     * @return true if t was recycled, false otherwise (the pool already holds {@link #capacity()} instances)
+     */
+    default boolean tryRecycle(T t)
+    {
+        Objects.requireNonNull(t);
+
+        final ArrayDeque<T> pool = get();
+        if (pool.size() < capacity())
+        {
+            if (t instanceof Collection)
+                ((Collection<?>) t).clear();
+            pool.offerFirst(t);
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    /**
+     * @return current count of available instances for reuse
+     */
+    default int available()
+    {
+        return get().size();
+    }
+
+
+    /**
+     * @return maximum capacity of the recycler
+     */
+    int capacity();
+}
diff --git a/src/java/org/apache/cassandra/utils/concurrent/ThreadLocals.java b/src/java/org/apache/cassandra/utils/concurrent/ThreadLocals.java
new file mode 100644
index 000000000000..7b345a843fcc
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/concurrent/ThreadLocals.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils.concurrent;
+
+import java.util.ArrayDeque;
+
+import io.netty.util.concurrent.FastThreadLocal;
+
+public final class ThreadLocals
+{
+    private ThreadLocals()
+    {
+    }
+
+    public static <T> LightweightRecycler<T> createLightweightRecycler(int limit)
+    {
+        return new FastThreadLocalLightweightRecycler<>(limit);
+    }
+
+    /**
+     * A {@link LightweightRecycler} which is backed by a {@link FastThreadLocal}.
+     */
+    private static final class FastThreadLocalLightweightRecycler<T> extends FastThreadLocal<ArrayDeque<T>> implements LightweightRecycler<T>
+    {
+        private final int capacity;
+
+        public FastThreadLocalLightweightRecycler(int capacity)
+        {
+            super();
+            this.capacity = capacity;
+        }
+
+        protected ArrayDeque<T> initialValue()
+        {
+            return new ArrayDeque<>(capacity);
+        }
+
+        /**
+         * @return maximum capacity of the recycler
+         */
+        public int capacity()
+        {
+            return capacity;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/tries/AbstractTrieTestBase.java b/test/unit/org/apache/cassandra/io/tries/AbstractTrieTestBase.java
new file mode 100644
index 000000000000..f19c107ef675
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/tries/AbstractTrieTestBase.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.tries;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+
+import org.junit.Before;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.utils.PageAware;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+abstract public class AbstractTrieTestBase
+{
+    protected final static Logger logger = LoggerFactory.getLogger(TrieBuilderTest.class);
+    protected final static int BASE = 80;
+
+    protected boolean dump = false;
+    protected int payloadSize = 0;
+
+    @Before
+    public void beforeTest()
+    {
+        dump = false;
+        payloadSize = 0;
+    }
+
+    protected final TrieSerializer<Integer, DataOutput> serializer = new TrieSerializer<Integer, DataOutput>()
+    {
+        public int sizeofNode(SerializationNode<Integer> node, long nodePosition)
+        {
+            return TrieNode.typeFor(node, nodePosition).sizeofNode(node) + payloadSize;
+        }
+
+        public void write(DataOutput dataOutput, SerializationNode<Integer> node, long nodePosition) throws IOException
+        {
+            if (dump)
+                logger.info("Writing at {} type {} size {}: {}", Long.toHexString(nodePosition), TrieNode.typeFor(node, nodePosition), TrieNode.typeFor(node, nodePosition).sizeofNode(node), node);
+            TrieNode.typeFor(node, nodePosition).serialize(dataOutput, node, node.payload() != null ? node.payload() : 0, nodePosition);
+            dataOutput.write(new byte[payloadSize]);
+        }
+    };
+
+
+    protected int valueFor(long found)
+    {
+        return Long.bitCount(found + 1) & 0xF;
+    }
+
+    protected ByteComparable source(String s)
+    {
+        int length = s.length();
+        ByteBuffer bytes = ByteBuffer.allocate(length);
+        for (int idx = 0; idx < length; ++idx)
+            bytes.put(idx, (byte) s.charAt(idx));
+        return ByteComparable.fixedLength(bytes);
+    }
+
+    protected String toBase(long v)
+    {
+        return BigInteger.valueOf(v).toString(BASE);
+    }
+
+    // In-memory buffer with added paging parameters, to make sure the code below does the proper layout
+    protected static class DataOutputBufferPaged extends DataOutputBuffer
+    {
+        public int maxBytesInPage()
+        {
+            return PageAware.PAGE_SIZE;
+        }
+
+        public void padToPageBoundary() throws IOException
+        {
+            PageAware.pad(this);
+        }
+
+        public int bytesLeftInPage()
+        {
+            long position = position();
+            long bytesLeft = PageAware.pageLimit(position) - position;
+            return (int) bytesLeft;
+        }
+
+        public long paddedPosition()
+        {
+            return PageAware.padded(position());
+        }
+    }
+
+    protected static class InternalIterator extends ValueIterator<InternalIterator>
+    {
+        public InternalIterator(Rebufferer source, long root)
+        {
+            super(source, root);
+        }
+
+        public InternalIterator(Rebufferer source, long root, ByteComparable start, ByteComparable end, boolean admitPrefix)
+        {
+            super(source, root, start, end, admitPrefix);
+        }
+    }
+
+    protected static class ByteBufRebufferer implements Rebufferer, Rebufferer.BufferHolder
+    {
+        final ByteBuffer buffer;
+
+        ByteBufRebufferer(ByteBuffer buffer)
+        {
+            this.buffer = buffer;
+        }
+
+        @Override
+        public ChannelProxy channel()
+        {
+            return null;
+        }
+
+        @Override
+        public ByteBuffer buffer()
+        {
+            return buffer;
+        }
+
+        @Override
+        public long fileLength()
+        {
+            return buffer.remaining();
+        }
+
+        @Override
+        public double getCrcCheckChance()
+        {
+            return 0;
+        }
+
+        @Override
+        public BufferHolder rebuffer(long position)
+        {
+            return this;
+        }
+
+        @Override
+        public long offset()
+        {
+            return 0;
+        }
+
+        @Override
+        public void release()
+        {
+            // nothing
+        }
+
+        @Override
+        public void close()
+        {
+            // nothing
+        }
+
+        @Override
+        public void closeReader()
+        {
+            // nothing
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/tries/TrieBuilderTest.java b/test/unit/org/apache/cassandra/io/tries/TrieBuilderTest.java
new file mode 100644
index 000000000000..409b62e16d0f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/tries/TrieBuilderTest.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.io.util.TailOverridingRebufferer;
+
+import static org.junit.Assert.assertEquals;
+
+public class TrieBuilderTest extends AbstractTrieTestBase
+{
+    @Test
+    public void testPartialBuild_Apollo1148() throws IOException
+    {
+        DataOutputBuffer buf = new DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> builder = IncrementalTrieWriter.open(serializer, buf);
+        long count = 0; // running total of entries added across all batches
+
+        count += addUntilBytesWritten(buf, builder, "a", 1);            // Make a node whose children are written
+        long reset = count; // number of entries before the "c" batch; expected values restart from here
+        count += addUntilBytesWritten(buf, builder, "c", 64 * 1024);    // Finalize it and write long enough to grow its pointer size
+
+        dump = true;
+        IncrementalTrieWriter.PartialTail tail = builder.makePartialRoot();
+        // The line above hit an assertion as that node's parent had a pre-calculated branch size which was no longer
+        // correct and we didn't bother to reset it.
+        dump = false;
+
+        // Check that partial representation has the right content.
+        Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
+        source = new TailOverridingRebufferer(source, tail.cutoff(), tail.tail()); // overlay the in-memory tail from its cutoff onwards
+        verifyContent(count, source, tail.root(), reset);
+
+        long reset2 = count; // number of entries before the "e" batch
+
+        // Also check the completed trie.
+        count += addUntilBytesWritten(buf, builder, "e", 16 * 1024);
+        dump = true;
+        long root = builder.complete();
+        // The line above hit another size assertion as the size of a node's branch growing caused it to need to switch
+        // format, but we didn't bother to recalculate its size.
+        dump = false;
+
+        source = new ByteBufRebufferer(buf.asNewBuffer()); // fresh view: the buffer now holds the fully written trie
+        verifyContent(count, source, root, reset, reset2);
+    }
+
+    public void verifyContent(long count, Rebufferer source, long root, long... resets)
+    {
+        // Visits every payloaded node in order and checks its flags against valueFor(index within the current batch).
+        // Each entry of resets is a running entry total at which that index starts again from zero.
+        InternalIterator walker = new InternalIterator(source, root);
+        long seen = 0;
+        long batchStart = 0;
+        int nextReset = 0;
+
+        for (long nodePos = walker.nextPayloadedNode(); nodePos != -1; nodePos = walker.nextPayloadedNode())
+        {
+            walker.go(nodePos);
+            assertEquals(valueFor(seen - batchStart), walker.payloadFlags());
+            ++seen;
+            // Reached the next boundary: later entries are numbered relative to it.
+            if (nextReset < resets.length && seen >= resets[nextReset])
+                batchStart = resets[nextReset++];
+        }
+        assertEquals(count, seen);
+    }
+
+    private long addUntilBytesWritten(DataOutputBuffer buf,                  // output whose position() measures progress
+                                      IncrementalTrieWriter<Integer> builder, // writer receiving the generated entries
+                                      String prefix,                          // leading part of every generated key
+                                      long howMany) throws IOException        // bytes buf.position() must advance before stopping
+    {
+        final long startPos = buf.position();
+        long idx = 0; // number of entries added so far; also the return value
+        for (; startPos + howMany > buf.position(); ++idx)
+        {
+            String key = String.format("%s%8s", prefix, toBase(idx)); // built once, reused for the add and the log line
+            builder.add(source(key), valueFor(idx));
+            logger.info("Adding {} : {}", key, valueFor(idx));
+        }
+        logger.info(String.format("%s%8s", prefix, toBase(idx - 1))); // the last key added
+        return idx;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/tries/TrieNodeTest.java b/test/unit/org/apache/cassandra/io/tries/TrieNodeTest.java
new file mode 100644
index 000000000000..efccdbcfff05
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/tries/TrieNodeTest.java
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.tries;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Test;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.mockito.Mockito;
+import org.quicktheories.api.Pair;
+import org.quicktheories.generators.Generate;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+import static org.mockito.Mockito.reset;
+import static org.mockito.Mockito.when;
+import static org.quicktheories.QuickTheory.qt;
+import static org.quicktheories.generators.SourceDSL.longs;
+
+@SuppressWarnings("unchecked")
+public class TrieNodeTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(TrieNodeTest.class);
+    private final SerializationNode<Integer> sn = Mockito.mock(SerializationNode.class);
+    private final DataOutputBuffer out = new DataOutputBuffer();
+
+    @After
+    public void after()
+    {
+        reset(sn); // drop all stubbing configured on the mock by the test that just ran
+        out.clear(); // discard whatever the test serialized
+    }
+
+    @Test
+    public void testTypeFor0Children()
+    {
+        when(sn.childCount()).thenReturn(0);
+        assertSame(TrieNode.PAYLOAD_ONLY, TrieNode.typeFor(sn, 0)); // a node reporting no children must be typed PAYLOAD_ONLY
+    }
+
+    @Test
+    public void testTypeFor1ChildNoPayload()
+    {
+        when(sn.childCount()).thenReturn(1);
+        when(sn.payload()).thenReturn(null); // single child, no payload
+
+        qt().forAll(Generate.pick(Arrays.asList( // (delta magnitude range, expected node type) pairs
+                Pair.of(Ranges.BITS_4, TrieNode.SINGLE_NOPAYLOAD_4),
+                Pair.of(Ranges.BITS_8, TrieNode.SINGLE_8),
+                Pair.of(Ranges.BITS_12, TrieNode.SINGLE_NOPAYLOAD_12),
+                Pair.of(Ranges.BITS_16, TrieNode.SINGLE_16),
+                Pair.of(Ranges.BITS_24, TrieNode.DENSE_24),
+                Pair.of(Ranges.BITS_32, TrieNode.DENSE_32),
+                Pair.of(Ranges.BITS_40, TrieNode.DENSE_40),
+                Pair.of(Ranges.BITS_GT40, TrieNode.LONG_DENSE)
+        )).flatMap(p -> longs().between(p._1.min, p._1.max).map(v -> Pair.of(v, p._2)))).check(p -> { // draw a value from the range, keep the expected type
+            when(sn.maxPositionDelta(0)).thenReturn(-p._1); // the drawn magnitude is reported as a negative delta
+            return p._2 == TrieNode.typeFor(sn, 0);
+        });
+    }
+
+    @Test
+    public void testTypeFor1ChildAndPayload()
+    {
+        when(sn.childCount()).thenReturn(1);
+        when(sn.payload()).thenReturn(1); // single child plus a non-null payload
+
+        qt().forAll(Generate.pick(Arrays.asList( // the 4- and 12-bit ranges are expected to use SINGLE_8 / SINGLE_16 here
+                Pair.of(Ranges.BITS_4, TrieNode.SINGLE_8),
+                Pair.of(Ranges.BITS_8, TrieNode.SINGLE_8),
+                Pair.of(Ranges.BITS_12, TrieNode.SINGLE_16),
+                Pair.of(Ranges.BITS_16, TrieNode.SINGLE_16),
+                Pair.of(Ranges.BITS_24, TrieNode.DENSE_24),
+                Pair.of(Ranges.BITS_32, TrieNode.DENSE_32),
+                Pair.of(Ranges.BITS_40, TrieNode.DENSE_40),
+                Pair.of(Ranges.BITS_GT40, TrieNode.LONG_DENSE)
+        )).flatMap(p -> longs().between(p._1.min, p._1.max).map(v -> Pair.of(v, p._2)))).check(p -> { // draw a value from the range, keep the expected type
+            when(sn.maxPositionDelta(0)).thenReturn(-p._1); // the drawn magnitude is reported as a negative delta
+            return p._2 == TrieNode.typeFor(sn, 0);
+        });
+    }
+
+    @Test
+    public void testTypeForMoreChildrenAndNoPayload()
+    {
+        when(sn.childCount()).thenReturn(2);
+        when(sn.payload()).thenReturn(null); // two children, no payload
+
+        qt().forAll(Generate.pick(Arrays.asList( // with two children every range is expected to map to a DENSE_* type
+                Pair.of(Ranges.BITS_4, TrieNode.DENSE_12),
+                Pair.of(Ranges.BITS_8, TrieNode.DENSE_12),
+                Pair.of(Ranges.BITS_12, TrieNode.DENSE_12),
+                Pair.of(Ranges.BITS_16, TrieNode.DENSE_16),
+                Pair.of(Ranges.BITS_24, TrieNode.DENSE_24),
+                Pair.of(Ranges.BITS_32, TrieNode.DENSE_32),
+                Pair.of(Ranges.BITS_40, TrieNode.DENSE_40),
+                Pair.of(Ranges.BITS_GT40, TrieNode.LONG_DENSE)
+        )).flatMap(p -> longs().between(p._1.min, p._1.max).map(v -> Pair.of(v, p._2)))).check(p -> { // draw a value from the range, keep the expected type
+            when(sn.maxPositionDelta(0)).thenReturn(-p._1); // the drawn magnitude is reported as a negative delta
+            return p._2 == TrieNode.typeFor(sn, 0);
+        });
+    }
+
+    @Test
+    public void testPayloadOnlyNode() throws IOException
+    {
+        TrieNode.PAYLOAD_ONLY.serialize(out, null, 1 | 4, 0); // no SerializationNode is passed; only the payload flags are supplied
+        out.flush();
+
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 0); // read the node type back from the serialized bytes
+        assertEquals(TrieNode.PAYLOAD_ONLY, node);
+        assertEquals(1 | 4, node.payloadFlags(out.asNewBuffer(), 0));
+        assertEquals(1, node.payloadPosition(out.asNewBuffer(), 0));
+        assertEquals(1, node.sizeofNode(null));
+        assertEquals(-1, node.search(null, 0, 0)); // the calls below pass a null buffer and still expect fixed answers
+        assertEquals(Integer.MAX_VALUE, node.transitionByte(null, 0, 0));
+        assertEquals(123, node.lesserTransition(null, 0, 0, 0, 123)); // expects the supplied default (last argument) back
+        assertEquals(123, node.greaterTransition(null, 0, 0, 0, 123)); // expects the supplied default (last argument) back
+        assertEquals(-1, node.lastTransition(null, 0, 0));
+        assertEquals(-1, node.transition(null, 0, 0, 0));
+        assertEquals(0, node.transitionRange(null, 0));
+        assertEquals(0, node.transitionDelta(null, 0, 0));
+    }
+
+    private void prepareSingleNode(long delta)
+    {
+        when(sn.childCount()).thenReturn(1);
+        when(sn.transition(0)).thenReturn(123); // the only child is reached via transition byte 123
+        when(sn.serializedPositionDelta(0, 0)).thenReturn(delta); // delta reported for child 0 when the node is at position 0
+    }
+
+    private void singleNodeAssertions(TrieNode node, int payloadFlags, int size, long pos) // pos: the delta given to prepareSingleNode
+    {
+        assertEquals(payloadFlags, node.payloadFlags(out.asNewBuffer(), 0));
+        assertEquals(size, node.sizeofNode(null));
+        assertEquals(0, node.search(out.asNewBuffer(), 0, 123)); // 123 is the stubbed transition byte, expected at index 0
+        assertEquals(-1, node.search(out.asNewBuffer(), 0, 122)); // misses are negative; looks like binarySearch-style encoding - confirm in TrieNode.search
+        assertEquals(-2, node.search(out.asNewBuffer(), 0, 124));
+        assertEquals(123, node.transitionByte(out.asNewBuffer(), 0, 0));
+        assertEquals(Integer.MAX_VALUE, node.transitionByte(out.asNewBuffer(), 0, 1)); // index past the only child
+        assertEquals(100, node.lesserTransition(out.asNewBuffer(), 0, 100 - pos, -2, 123)); // (100 - pos) + pos == 100: presumably result = given position + delta
+        assertEquals(234, node.greaterTransition(null, 0, 0, 0, 234)); // expects the supplied default back, even with a null buffer
+        assertEquals(234, node.greaterTransition(out.asNewBuffer(), 0, 100 - pos, -2, 234));
+        assertEquals(100, node.greaterTransition(out.asNewBuffer(), 0, 100 - pos, -1, 234));
+        assertEquals(234, node.greaterTransition(out.asNewBuffer(), 0, 100 - pos, 0, 234));
+        assertEquals(100, node.lastTransition(out.asNewBuffer(), 0, 100 - pos));
+        assertEquals(100, node.transition(out.asNewBuffer(), 0, 100 - pos, 0));
+        assertEquals(1, node.transitionRange(out.asNewBuffer(), 0));
+        assertEquals(pos, node.transitionDelta(out.asNewBuffer(), 0, 0)); // the serialized delta must round-trip unchanged
+    }
+
+    @Test
+    public void testSingle16Node() throws IOException
+    {
+        prepareSingleNode(-43210L);
+        TrieNode.SINGLE_16.serialize(out, sn, 1 | 4, 0); // payload flags 1 | 4, node at position 0
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 0);
+        assertEquals(TrieNode.SINGLE_16, node); // the type read back must match the type that was serialized
+        singleNodeAssertions(node, 1 | 4, 4, -43210); // expected size 4, delta as prepared
+    }
+
+    @Test
+    public void testSingleNoPayload4Node() throws IOException
+    {
+        prepareSingleNode(-7L);
+        TrieNode.SINGLE_NOPAYLOAD_4.serialize(out, sn, 0, 0); // payload flags 0, node at position 0
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 0);
+        assertEquals(TrieNode.SINGLE_NOPAYLOAD_4, node); // the type read back must match the type that was serialized
+        singleNodeAssertions(node, 0, 2, -7); // expected size 2, delta as prepared
+    }
+
+    @Test
+    public void testSingleNoPayload12Node() throws IOException
+    {
+        prepareSingleNode(-1234L);
+        TrieNode.SINGLE_NOPAYLOAD_12.serialize(out, sn, 0, 0); // payload flags 0, node at position 0
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 0);
+        assertEquals(TrieNode.SINGLE_NOPAYLOAD_12, node); // the type read back must match the type that was serialized
+        singleNodeAssertions(node, 0, 3, -1234L); // expected size 3, delta as prepared
+    }
+
+    private void prepareSparseNode(long delta) throws IOException
+    {
+        out.write(new byte[6]);
+        when(sn.childCount()).thenReturn(3);
+        // Children 0..2: transition bytes 10, 20, 30 with position deltas delta, delta + 2, delta + 4 (node at position 6).
+        for (int child = 0; child < 3; child++)
+        {
+            when(sn.transition(child)).thenReturn(10 * (child + 1));
+            when(sn.serializedPositionDelta(child, 6)).thenReturn(delta + 2 * child);
+        }
+    }
+
+    private void sparseOrDenseNodeAssertions(TrieNode node, int payloadFlags, int size, long pos) // checks shared by the sparse and dense fixtures
+    {
+        assertEquals(size, node.sizeofNode(sn));
+        assertEquals(payloadFlags, node.payloadFlags(out.asNewBuffer(), 6)); // 6: the node follows the six filler bytes written by the prepare* helpers
+        assertEquals(3, node.transitionRange(out.asNewBuffer(), 6));
+        assertEquals(6 + size, node.payloadPosition(out.asNewBuffer(), 6)); // payload expected immediately after the node
+
+        assertEquals(Integer.MAX_VALUE, node.transitionByte(out.asNewBuffer(), 6, 3)); // index past the last of the three children
+
+        assertEquals(10, node.lesserTransition(out.asNewBuffer(), 6, 10 - pos, 1, 123)); // (10 - pos) + pos == 10: presumably result = given position + delta
+        assertEquals(14, node.lesserTransition(out.asNewBuffer(), 6, 10 - pos, -4, 123));
+
+        assertEquals(14, node.greaterTransition(out.asNewBuffer(), 6, 10 - pos, 1, 234));
+        assertEquals(234, node.greaterTransition(out.asNewBuffer(), 6, 10 - pos, 2, 234)); // nothing above the last child: the default comes back
+        assertEquals(10, node.greaterTransition(out.asNewBuffer(), 6, 10 - pos, -1, 234));
+
+        assertEquals(14, node.lastTransition(out.asNewBuffer(), 6, 10 - pos));
+
+        assertEquals(pos, node.transitionDelta(out.asNewBuffer(), 6, 0)); // deltas were stubbed as delta, delta + 2, delta + 4
+        assertEquals(pos + 2, node.transitionDelta(out.asNewBuffer(), 6, 1));
+        assertEquals(pos + 4, node.transitionDelta(out.asNewBuffer(), 6, 2));
+
+        assertEquals(10, node.transition(out.asNewBuffer(), 6, 10 - pos, 0));
+        assertEquals(12, node.transition(out.asNewBuffer(), 6, 10 - pos, 1));
+        assertEquals(14, node.transition(out.asNewBuffer(), 6, 10 - pos, 2));
+    }
+
+    private void sparseNodeAssertions(TrieNode node, int payloadFlags, int size, long pos)
+    {
+        sparseOrDenseNodeAssertions(node, payloadFlags, size, pos); // shared checks first, then those tied to transition bytes 10/20/30
+        assertEquals(-1, node.search(out.asNewBuffer(), 6, 5)); // misses are negative; looks like binarySearch-style encoding - confirm in TrieNode.search
+        assertEquals(0, node.search(out.asNewBuffer(), 6, 10));
+        assertEquals(-2, node.search(out.asNewBuffer(), 6, 15));
+        assertEquals(-4, node.search(out.asNewBuffer(), 6, 35));
+
+        assertEquals(10, node.transitionByte(out.asNewBuffer(), 6, 0));
+        assertEquals(20, node.transitionByte(out.asNewBuffer(), 6, 1));
+        assertEquals(30, node.transitionByte(out.asNewBuffer(), 6, 2));
+    }
+
+    @Test
+    public void testSparse16Node() throws IOException
+    {
+        prepareSparseNode(-43210L);
+        TrieNode.SPARSE_16.serialize(out, sn, 1 | 4, 6); // node written at position 6, after the filler bytes
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 6);
+        assertEquals(TrieNode.SPARSE_16, node); // the type read back must match the type that was serialized
+        sparseNodeAssertions(node, 1 | 4, 11, -43210L); // expected size 11
+    }
+
+    @Test
+    public void testSparse12Node() throws IOException
+    {
+        prepareSparseNode(-1234L);
+        TrieNode.SPARSE_12.serialize(out, sn, 1 | 4, 6); // node written at position 6, after the filler bytes
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 6);
+        assertEquals(TrieNode.SPARSE_12, node); // the type read back must match the type that was serialized
+        sparseNodeAssertions(node, 1 | 4, 10, -1234L); // expected size 10
+    }
+
+    private void prepareDenseNode(long delta) throws IOException
+    {
+        out.write(new byte[6]);
+        when(sn.childCount()).thenReturn(3);
+        // Children 0..2: consecutive transition bytes 11, 12, 13 with position deltas delta, delta + 2, delta + 4.
+        for (int child = 0; child < 3; child++)
+        {
+            when(sn.transition(child)).thenReturn(11 + child);
+            when(sn.serializedPositionDelta(child, 6)).thenReturn(delta + 2 * child);
+        }
+    }
+
+    private void denseNodeAssertions(TrieNode node, int payload, int size, long pos)
+    {
+        sparseOrDenseNodeAssertions(node, payload, size, pos); // shared checks first, then those tied to transition bytes 11/12/13
+        assertEquals(-1, node.search(out.asNewBuffer(), 6, 10)); // below the first transition byte
+        assertEquals(0, node.search(out.asNewBuffer(), 6, 11));
+        assertEquals(-4, node.search(out.asNewBuffer(), 6, 14)); // above the last transition byte
+
+        assertEquals(11, node.transitionByte(out.asNewBuffer(), 6, 0));
+        assertEquals(12, node.transitionByte(out.asNewBuffer(), 6, 1));
+        assertEquals(13, node.transitionByte(out.asNewBuffer(), 6, 2));
+    }
+
+    @Test
+    public void testDense16Node() throws IOException
+    {
+        prepareDenseNode(-43210L);
+        TrieNode.DENSE_16.serialize(out, sn, 1 | 4, 6); // node written at position 6, after the filler bytes
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 6);
+        assertEquals(TrieNode.DENSE_16, node); // the type read back must match the type that was serialized
+        denseNodeAssertions(node, 1 | 4, 9, -43210L); // expected size 9
+    }
+
+    @Test
+    public void testDense12Node() throws IOException
+    {
+        prepareDenseNode(-1234L);
+        TrieNode.DENSE_12.serialize(out, sn, 1 | 4, 6); // node written at position 6, after the filler bytes
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 6);
+        assertEquals(TrieNode.DENSE_12, node); // the type read back must match the type that was serialized
+        denseNodeAssertions(node, 1 | 4, 8, -1234L); // expected size 8
+    }
+
+    @Test
+    public void testLongDenseNode() throws IOException
+    {
+        prepareDenseNode(-0x7ffffffffffffffL); // a delta far larger than the other dense tests use
+        TrieNode.LONG_DENSE.serialize(out, sn, 1 | 4, 6); // node written at position 6, after the filler bytes
+        TrieNode node = TrieNode.at(out.asNewBuffer(), 6);
+        assertEquals(TrieNode.LONG_DENSE, node); // the type read back must match the type that was serialized
+        denseNodeAssertions(node, 1 | 4, 27, -0x7ffffffffffffffL); // expected size 27
+    }
+
+    private enum Ranges // inclusive [min, max] magnitudes of position deltas, grouped by the pointer width they are expected to need
+    {
+        BITS_4(0x1L, 0xfL),
+        BITS_8(0x10L, 0xffL),
+        BITS_12(0x100L, 0xfffL),
+        BITS_16(0x1000L, 0xffffL),
+        BITS_24(0x10000L, 0xffffffL), // each min is the previous max + 1, so no magnitude is left unsampled
+        BITS_32(0x1000000L, 0xffffffffL),
+        BITS_40(0x100000000L, 0xffffffffffL),
+        BITS_GT40(0x10000000000L, 0x7fffffffffffffffL);
+
+        private final long min, max; // both bounds are fed to longs().between(min, max) by the typeFor tests
+
+        Ranges(long min, long max)
+        {
+            this.min = min;
+            this.max = max;
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/io/tries/WalkerTest.java b/test/unit/org/apache/cassandra/io/tries/WalkerTest.java
new file mode 100644
index 000000000000..cfd3f88c326f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/tries/WalkerTest.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.tries;
+
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.lang3.StringUtils;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.io.util.TailOverridingRebufferer;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNull;
+
+@SuppressWarnings("unchecked")
+@RunWith(Parameterized.class)
+public class WalkerTest extends AbstractTrieTestBase
+{
+    @Parameterized.Parameter(0)
+    public Class<? extends IncrementalTrieWriter> writerClass;
+
+    @Parameterized.Parameters(name = "{index}: trie writer class={0}")
+    public static Collection<Object[]> data() // one parameter set per writer class; the value is injected into writerClass
+    {
+        return Arrays.asList(new Object[]{ IncrementalTrieWriterSimple.class },
+                             new Object[]{ IncrementalTrieWriterPageAware.class },
+                             new Object[]{ IncrementalDeepTrieWriterPageAware.class });
+    }
+
+    @Test
+    public void testWithoutBounds() throws IOException
+    {
+        DataOutputBuffer buf = new AbstractTrieTestBase.DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> builder = makeTrie(buf); // seven keys with values 1..7, see makeTrie
+        long rootPos = builder.complete();
+
+        Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
+
+        InternalIterator it = new InternalIterator(source, rootPos);
+
+        DataOutputBuffer dumpBuf = new DataOutputBuffer();
+        it.dumpTrie(new PrintStream(dumpBuf), (buf1, payloadPos, payloadFlags) -> String.format("%d/%d", payloadPos, payloadFlags));
+        // Decode only the bytes actually written: getData() exposes the whole backing array, unused tail included.
+        logger.info("Trie dump: \n{}", new String(dumpBuf.getData(), 0, dumpBuf.getLength()));
+        logger.info("Trie toString: {}", it.toString());
+
+        it.goMax(rootPos); // expected to land on the last key added (value 7)
+        assertEquals(7, it.payloadFlags());
+        assertEquals(TrieNode.PAYLOAD_ONLY.ordinal, it.nodeTypeOrdinal());
+        assertEquals(1, it.nodeSize());
+        assertFalse(it.hasChildren());
+
+        it.goMin(rootPos); // expected to land on the first key added (value 1)
+        assertEquals(1, it.payloadFlags());
+        assertEquals(TrieNode.PAYLOAD_ONLY.ordinal, it.nodeTypeOrdinal());
+        assertEquals(1, it.nodeSize());
+        assertFalse(it.hasChildren());
+
+        assertEquals(-1, it.follow(source("151"))); // "151" is present (value 2): follow reports -1 and stops on its node
+        assertEquals(2, it.payloadFlags());
+
+        assertEquals('3', it.follow(source("135"))); // "135" is absent: the byte '3' is reported back
+
+        assertEquals('3', it.followWithGreater(source("135")));
+        it.goMin(it.greaterBranch); // smallest entry on the greater side: "151" (value 2)
+        assertEquals(2, it.payloadFlags());
+
+        assertEquals('3', it.followWithLesser(source("135")));
+        it.goMax(it.lesserBranch); // largest entry on the lesser side: "115" (value 1)
+        assertEquals(1, it.payloadFlags());
+
+        assertEquals(3, (Object) it.prefix(source("155"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertNull(it.prefix(source("516"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertEquals(5, (Object) it.prefix(source("5151"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertEquals(1, (Object) it.prefix(source("1151"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+
+        assertEquals(3, (Object) it.prefixAndNeighbours(source("155"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertNull(it.prefixAndNeighbours(source("516"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertEquals(5, (Object) it.prefixAndNeighbours(source("5151"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        assertEquals(1, (Object) it.prefixAndNeighbours(source("1151"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+
+        assertEquals(3, (Object) it.prefixAndNeighbours(source("1555"), (walker, payloadPosition, payloadFlags) -> payloadFlags));
+        it.goMax(it.lesserBranch); // neighbours of "1555": value 2 below ...
+        assertEquals(2, it.payloadFlags());
+        it.goMin(it.greaterBranch); // ... and value 4 above
+        assertEquals(4, it.payloadFlags());
+    }
+
+    @Test
+    public void testWithBounds() throws IOException
+    {
+        DataOutputBuffer buf = new AbstractTrieTestBase.DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> builder = makeTrie(buf);
+        long rootPos = builder.complete();
+
+        Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
+
+        InternalIterator it = new InternalIterator(source, rootPos, source("151"), source("515"), false); // bounds "151".."515", admitPrefix off
+        long pos;
+        assertNotEquals(-1, pos = it.nextPayloadedNode()); // expected sequence of values: 3, 4, 5 - the value 2 stored under "151" is not returned
+        assertEquals(3, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+        assertNotEquals(-1, pos = it.nextPayloadedNode());
+        assertEquals(4, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+        assertNotEquals(-1, pos = it.nextPayloadedNode());
+        assertEquals(5, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+
+        assertEquals(-1, it.nextPayloadedNode()); // -1 signals the iteration is exhausted
+    }
+
+    @Test
+    public void testWithBoundsAndAdmitPrefix() throws IOException
+    {
+        DataOutputBuffer buf = new AbstractTrieTestBase.DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> builder = makeTrie(buf);
+        long rootPos = builder.complete();
+
+        Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
+
+        InternalIterator it = new InternalIterator(source, rootPos, source("151"), source("515"), true); // same bounds as testWithBounds, admitPrefix on
+        long pos;
+        assertNotEquals(-1, pos = it.nextPayloadedNode()); // expected sequence of values: 2, 3, 4, 5 - the value under "151" is now included
+        assertEquals(2, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+        assertNotEquals(-1, pos = it.nextPayloadedNode());
+        assertEquals(3, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+        assertNotEquals(-1, pos = it.nextPayloadedNode());
+        assertEquals(4, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+        assertNotEquals(-1, pos = it.nextPayloadedNode());
+        assertEquals(5, TrieNode.at(buf.asNewBuffer(), (int) pos).payloadFlags(buf.asNewBuffer(), (int) pos));
+
+        assertEquals(-1, it.nextPayloadedNode()); // -1 signals the iteration is exhausted
+    }
+
+    @Test
+    public void testPartialTail() throws IOException
+    {
+        DataOutputBuffer buf = new AbstractTrieTestBase.DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> builder = makeTrie(buf);
+        IncrementalTrieWriter.PartialTail ptail = builder.makePartialRoot(); // snapshot taken before the trie is completed
+        long rootPos = builder.complete();
+        Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
+        Rebufferer partialSource = new TailOverridingRebufferer(new ByteBufRebufferer(buf.asNewBuffer()), ptail.cutoff(), ptail.tail());
+
+        InternalIterator it = new InternalIterator(source, rootPos, source("151"), source("515"), true); // walks the completed trie
+        InternalIterator tailIt = new InternalIterator(partialSource, ptail.root(), source("151"), source("515"), true); // walks the partial-tail view
+
+        while (true) // step both iterators in lockstep and compare payload flags pairwise
+        {
+            long i1 = it.nextPayloadedNode();
+            long i2 = tailIt.nextPayloadedNode();
+            if (i1 == -1 || i2 == -1) // NOTE(review): stops as soon as EITHER side ends, so one side yielding fewer entries still passes
+                break;
+
+            Rebufferer.BufferHolder bh1 = source.rebuffer(i1);
+            Rebufferer.BufferHolder bh2 = partialSource.rebuffer(i2);
+
+            int f1 = TrieNode.at(bh1.buffer(), (int) (i1 - bh1.offset())).payloadFlags(bh1.buffer(), (int) (i1 - bh1.offset())); // position made relative to the holder's offset
+            int f2 = TrieNode.at(bh2.buffer(), (int) (i2 - bh2.offset())).payloadFlags(bh2.buffer(), (int) (i2 - bh2.offset()));
+            assertEquals(f1, f2);
+        }
+    }
+
+    @Test
+    public void testBigTrie() throws IOException
+    {
+        DataOutputBuffer output = new AbstractTrieTestBase.DataOutputBufferPaged();
+        IncrementalTrieWriter<Integer> writer = newTrieWriter(serializer, output);
+        // Build the key set once with payloadSize 0, reset the writer,
+        // then build the same key set again with payloadSize 200.
+        payloadSize = 0;
+        makeBigTrie(writer);
+        writer.reset();
+        payloadSize = 200;
+        makeBigTrie(writer);
+
+        long rootPos = writer.complete();
+        Rebufferer source = new ByteBufRebufferer(output.asNewBuffer());
+        InternalIterator walker = new InternalIterator(source, rootPos);
+
+        // Every node the iterator reports as payloaded must carry non-zero payload flags.
+        for (long nodePos = walker.nextPayloadedNode(); nodePos != -1; nodePos = walker.nextPayloadedNode())
+        {
+            int offset = (int) nodePos;
+            TrieNode node = TrieNode.at(output.asNewBuffer(), offset);
+            assertNotEquals(0, node.payloadFlags(output.asNewBuffer(), offset));
+        }
+    }
+
+
+    private IncrementalTrieWriter<Integer> makeTrie(DataOutputBuffer out) throws IOException // returns the writer NOT yet completed; callers call complete()
+    {
+        IncrementalTrieWriter<Integer> builder = newTrieWriter(serializer, out);
+        dump = true;
+        builder.add(source("115"), 1); // keys are added in ascending order with values 1..7
+        builder.add(source("151"), 2);
+        builder.add(source("155"), 3);
+        builder.add(source("511"), 4);
+        builder.add(source("515"), 5);
+        builder.add(source("551"), 6);
+        builder.add(source("555555555555555555555555555555555555555555555555555555555555555555"), 7); // one much longer key
+        return builder;
+    }
+
+    private void makeBigTrie(IncrementalTrieWriter<Integer> builder) throws IOException
+    {
+        dump = false;
+        for (int zeros = 0; zeros < 64; zeros += 8) // 0, 8, ..., 56: the shift argument handed to longSource
+            for (long value = 1; value < 80; value++)
+                builder.add(longSource(value, zeros, 100), 1 + (int) (value % 7)); // payload values cycle through 1..7
+    }
+
+    private ByteComparable longSource(long l, int shift, int size)
+    {
+        final String digits = StringUtils.leftPad(toBase(l), 8, '0');         // number, left-padded with '0' to 8 chars
+        final String shifted = StringUtils.rightPad(digits, 8 + shift, '0');  // then 'shift' trailing '0' chars
+        final String padded = StringUtils.leftPad(shifted, size, '0');        // then left-padded with '0' to 'size' chars
+        return source(padded);
+    }
+
+    private IncrementalTrieWriter<Integer> newTrieWriter(TrieSerializer<Integer, DataOutput> serializer, DataOutputPlus out)
+    {
+        // Instantiates the writer implementation selected by the writerClass test parameter.
+        // Classes are compared by identity, one guard per supported implementation.
+        if (writerClass == IncrementalTrieWriterSimple.class)
+            return new IncrementalTrieWriterSimple<>(serializer, out);
+
+        // The page-aware writer takes the same two constructor arguments.
+        if (writerClass == IncrementalTrieWriterPageAware.class)
+            return new IncrementalTrieWriterPageAware<>(serializer, out);
+
+        // The deep writer takes one extra constructor argument, fixed at 4 here.
+        if (writerClass == IncrementalDeepTrieWriterPageAware.class)
+            return new IncrementalDeepTrieWriterPageAware<>(serializer, out, 4);
+
+        // No guard matched: the parameter names a class this factory does not handle.
+        throw new AssertionError("Unknown writer class " + writerClass.getName());
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java b/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
new file mode 100644
index 000000000000..937588d6e22f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.mockito.Mockito;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+import static org.mockito.ArgumentMatchers.anyLong;
+import static org.mockito.Mockito.reset;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
+
+/**
+ * Exercises {@link TailOverridingRebufferer} on top of a mocked source {@link Rebufferer}: an 8-byte head
+ * buffer served by the mock, overridden from file position 8 onwards by a 2-byte tail buffer.
+ */
+public class TailOverridingRebuffererTest
+{
+    ByteBuffer head = ByteBuffer.wrap(new byte[]{ 1, 2, 3, 4, 5, 6, 7, 8 });
+    ByteBuffer tail = ByteBuffer.wrap(new byte[]{ 9, 10 });
+
+    Rebufferer r = Mockito.mock(Rebufferer.class);
+    Rebufferer.BufferHolder bh = Mockito.mock(Rebufferer.BufferHolder.class);
+
+    /** Clears stubbing and recorded interactions on both mocks before every test. */
+    @Before
+    public void before()
+    {
+        reset(r, bh);
+    }
+
+    /** Positions 0-7 lie before the tail, so the whole head buffer is expected back unchanged. */
+    @Test
+    public void testAccessLeftToTailFully()
+    {
+        when(r.rebuffer(anyLong())).thenReturn(bh);
+        when(bh.buffer()).thenReturn(head.duplicate());
+        when(bh.offset()).thenReturn(0L);
+        Rebufferer tor = new TailOverridingRebufferer(r, 8, tail.duplicate());
+
+        for (int i = 0; i < 8; i++)
+        {
+            Rebufferer.BufferHolder holder = tor.rebuffer(i);
+            assertEquals(head, holder.buffer());
+        }
+
+        assertEquals(10, tor.fileLength());
+    }
+
+    /** With the source buffer starting at offset 2, only its first 6 bytes (positions 2-7) precede the tail. */
+    @Test
+    public void testAccessLeftToTailPartial()
+    {
+        when(r.rebuffer(anyLong())).thenReturn(bh);
+        when(bh.buffer()).thenReturn(head.duplicate());
+        when(bh.offset()).thenReturn(2L);
+        Rebufferer tor = new TailOverridingRebufferer(r, 8, tail.duplicate());
+
+        // Build the expectation on a duplicate so the shared head fixture keeps its original limit.
+        ByteBuffer expected = head.duplicate();
+        expected.limit(6);
+
+        for (int i = 2; i < 8; i++)
+        {
+            Rebufferer.BufferHolder holder = tor.rebuffer(i);
+            assertEquals(expected, holder.buffer());
+        }
+
+        assertEquals(10, tor.fileLength());
+    }
+
+    /** Positions 8 and 9 fall inside the overridden tail, so the tail buffer is expected back. */
+    @Test
+    public void testAccessRightToTail()
+    {
+        when(r.rebuffer(anyLong())).thenReturn(bh);
+        when(bh.buffer()).thenReturn(head.duplicate());
+        when(bh.offset()).thenReturn(0L);
+        Rebufferer tor = new TailOverridingRebufferer(r, 8, tail.duplicate());
+
+        for (int i = 8; i < 10; i++)
+        {
+            Rebufferer.BufferHolder holder = tor.rebuffer(i);
+            assertEquals(tail, holder.buffer());
+        }
+
+        assertEquals(10, tor.fileLength());
+    }
+
+    /** channel(), closeReader(), close() and getCrcCheckChance() must each be forwarded to the wrapped rebufferer. */
+    @Test
+    public void testOtherMethods() throws IOException
+    {
+        Rebufferer tor = new TailOverridingRebufferer(r, 8, tail.duplicate());
+
+        File tmp = File.createTempFile("fakeChannelProxy", "");
+        tmp.deleteOnExit(); // do not leave the scratch file behind in the temp directory
+        try (ChannelProxy channelProxy = new ChannelProxy(tmp))
+        {
+            when(r.channel()).thenReturn(channelProxy);
+            assertSame(channelProxy, tor.channel());
+            verify(r).channel();
+            reset(r);
+        }
+
+        tor.closeReader();
+        verify(r).closeReader();
+        reset(r);
+
+        tor.close();
+        verify(r).close();
+        reset(r);
+
+        when(r.getCrcCheckChance()).thenReturn(0.123d);
+        assertEquals(0.123d, tor.getCrcCheckChance(), 0);
+        verify(r).getCrcCheckChance();
+        reset(r);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java b/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
new file mode 100644
index 000000000000..bc34f15dd626
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+public class WrappingRebuffererTest
+{
+    @Test
+    public void testRecycleSameHolder()
+    {
+        TestRebufferer mock = new TestRebufferer();
+        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock))
+        {
+            Rebufferer.BufferHolder ret = rebufferer.rebuffer(0);
+            assertNotNull(ret);
+            assertEquals(mock.buffer(), ret.buffer()); // the holder exposes the wrapped source's buffer ...
+            assertEquals(mock.offset(), ret.offset()); // ... and its offset
+
+            ret.release();
+            assertTrue(mock.released); // releasing the returned holder must reach the wrapped source
+
+            assertSame(ret, rebufferer.rebuffer(0)); // same buffer holder was recycled
+        }
+    }
+
+    @Test
+    public void testRecycleTwoHolders()
+    {
+        TestRebufferer mock = new TestRebufferer();
+        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock))
+        {
+
+            Rebufferer.BufferHolder ret1 = rebufferer.rebuffer(0);
+            assertNotNull(ret1);
+            assertEquals(mock.buffer(), ret1.buffer());
+            assertEquals(mock.offset(), ret1.offset());
+
+            Rebufferer.BufferHolder ret2 = rebufferer.rebuffer(1); // requested while ret1 is still unreleased
+            assertNotNull(ret2);
+            assertEquals(mock.buffer(), ret2.buffer());
+            assertEquals(mock.offset(), ret2.offset());
+
+            ret1.release();
+            assertTrue(mock.released);
+
+            mock.released = false; // re-arm the flag so the second release is observed on its own
+            ret2.release();
+            assertTrue(mock.released);
+
+            assertSame(ret2, rebufferer.rebuffer(0)); // ret2 was released last and is expected to be handed out first
+            assertSame(ret1, rebufferer.rebuffer(1)); // then ret1, the holder released before it
+        }
+    }
+
+    // Hand-written stub used as the wrapped source: it is both the Rebufferer and the BufferHolder it returns.
+    private static class TestRebufferer implements Rebufferer, Rebufferer.BufferHolder
+    {
+        final ByteBuffer buffer;
+        boolean released; // set by release(); the tests read and reset it directly
+        long offset; // last position passed to rebuffer()
+
+        TestRebufferer()
+        {
+            this.buffer = ByteBuffer.allocate(0);
+            this.released = false;
+            this.offset = 0;
+        }
+
+        @Override
+        public ChannelProxy channel()
+        {
+            return null;
+        }
+
+        @Override
+        public ByteBuffer buffer()
+        {
+            return buffer;
+        }
+
+        public long fileLength()
+        {
+            return buffer.remaining(); // 0 for the empty buffer allocated in the constructor
+        }
+
+        public double getCrcCheckChance()
+        {
+            return 0;
+        }
+
+        public BufferHolder rebuffer(long position)
+        {
+            offset = position; // remember the request so offset() reports it back
+            return this;
+        }
+
+        public long offset()
+        {
+            return offset;
+        }
+
+        public void release()
+        {
+            released = true;
+        }
+
+        public long adjustExternal(long position)
+        {
+            return position; // identity mapping
+        }
+
+        public long adjustInternal(long position)
+        {
+            return position; // identity mapping
+        }
+
+        public void close()
+        {
+            // nothing
+        }
+
+        public void closeReader()
+        {
+            // nothing
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/PageAwareTest.java b/test/unit/org/apache/cassandra/utils/PageAwareTest.java
new file mode 100644
index 000000000000..fabbcb5cb392
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/PageAwareTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+
+import static org.apache.cassandra.utils.PageAware.PAGE_SIZE;
+import static org.junit.Assert.assertEquals;
+
+public class PageAwareTest
+{
+    @Test
+    public void pageLimit()
+    {
+        assertEquals(PAGE_SIZE, PageAware.pageLimit(0)); // positions exactly on a page boundary map to the end of that page
+        assertEquals(2 * PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE));
+        assertEquals(3 * PAGE_SIZE, PageAware.pageLimit(2 * PAGE_SIZE));
+
+        assertEquals(PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE / 3)); // positions inside the first page
+        assertEquals(PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE - 1));
+        assertEquals(PAGE_SIZE, PageAware.pageLimit(1));
+
+        assertEquals(2 * PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE + PAGE_SIZE / 3)); // positions inside the second page
+        assertEquals(2 * PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE + PAGE_SIZE - 1));
+        assertEquals(2 * PAGE_SIZE, PageAware.pageLimit(PAGE_SIZE + 1));
+    }
+
+    @Test
+    public void pageStart()
+    {
+        assertEquals(0, PageAware.pageStart(0)); // a page boundary is its own page start
+        assertEquals(PAGE_SIZE, PageAware.pageStart(PAGE_SIZE));
+        assertEquals(2 * PAGE_SIZE, PageAware.pageStart(2 * PAGE_SIZE));
+
+        assertEquals(0, PageAware.pageStart(PAGE_SIZE / 3)); // positions inside the first page
+        assertEquals(0, PageAware.pageStart(PAGE_SIZE - 1));
+        assertEquals(0, PageAware.pageStart(1));
+
+        assertEquals(PAGE_SIZE, PageAware.pageStart(PAGE_SIZE + PAGE_SIZE / 3)); // positions inside the second page
+        assertEquals(PAGE_SIZE, PageAware.pageStart(PAGE_SIZE + PAGE_SIZE - 1));
+        assertEquals(PAGE_SIZE, PageAware.pageStart(PAGE_SIZE + 1));
+    }
+
+    @Test
+    public void padded()
+    {
+        assertEquals(0, PageAware.padded(0)); // already aligned positions are returned as they are
+        assertEquals(PAGE_SIZE, PageAware.padded(PAGE_SIZE));
+        assertEquals(2 * PAGE_SIZE, PageAware.padded(2 * PAGE_SIZE));
+
+        assertEquals(PAGE_SIZE, PageAware.padded(PAGE_SIZE / 3)); // anything inside the first page rounds up to its end
+        assertEquals(PAGE_SIZE, PageAware.padded(PAGE_SIZE - 1));
+        assertEquals(PAGE_SIZE, PageAware.padded(1));
+
+        assertEquals(2 * PAGE_SIZE, PageAware.padded(PAGE_SIZE + PAGE_SIZE / 3)); // same for the second page
+        assertEquals(2 * PAGE_SIZE, PageAware.padded(PAGE_SIZE + PAGE_SIZE - 1));
+        assertEquals(2 * PAGE_SIZE, PageAware.padded(PAGE_SIZE + 1));
+    }
+
+    @Test
+    public void numPages()
+    {
+        assertEquals(0, PageAware.numPages(0)); // whole multiples of the page size
+        assertEquals(1, PageAware.numPages(PAGE_SIZE));
+        assertEquals(2, PageAware.numPages(2 * PAGE_SIZE));
+
+        assertEquals(1, PageAware.numPages(PAGE_SIZE / 3)); // a partially filled page still counts as one page
+        assertEquals(1, PageAware.numPages(PAGE_SIZE - 1));
+        assertEquals(1, PageAware.numPages(1));
+
+        assertEquals(2, PageAware.numPages(PAGE_SIZE + PAGE_SIZE / 3)); // one full page plus a partial one
+        assertEquals(2, PageAware.numPages(PAGE_SIZE + PAGE_SIZE - 1));
+        assertEquals(2, PageAware.numPages(PAGE_SIZE + 1));
+    }
+
+    @Test
+    public void pageNum()
+    {
+        assertEquals(0, PageAware.pageNum(0)); // page numbers are zero-based
+        assertEquals(1, PageAware.pageNum(PAGE_SIZE));
+        assertEquals(2, PageAware.pageNum(2 * PAGE_SIZE));
+
+        assertEquals(0, PageAware.pageNum(PAGE_SIZE / 3)); // positions inside the first page
+        assertEquals(0, PageAware.pageNum(PAGE_SIZE - 1));
+        assertEquals(0, PageAware.pageNum(1));
+
+        assertEquals(1, PageAware.pageNum(PAGE_SIZE + PAGE_SIZE / 3)); // positions inside the second page
+        assertEquals(1, PageAware.pageNum(PAGE_SIZE + PAGE_SIZE - 1));
+        assertEquals(1, PageAware.pageNum(PAGE_SIZE + 1));
+    }
+
+    @Test
+    public void pad() throws IOException
+    {
+        testPad(0, 0); // already aligned sizes: no padding expected
+        testPad(PAGE_SIZE, PAGE_SIZE);
+        testPad(2 * PAGE_SIZE, 2 * PAGE_SIZE);
+
+        testPad(PAGE_SIZE, PAGE_SIZE / 3); // partially filled first page is padded to one page
+        testPad(PAGE_SIZE, PAGE_SIZE - 1);
+        testPad(PAGE_SIZE, 1);
+
+        testPad(2 * PAGE_SIZE, PAGE_SIZE + PAGE_SIZE / 3); // partially filled second page is padded to two pages
+        testPad(2 * PAGE_SIZE, PAGE_SIZE + PAGE_SIZE - 1);
+        testPad(2 * PAGE_SIZE, PAGE_SIZE + 1);
+    }
+
+    private void testPad(int expectedSize, int currentSize) throws IOException // writes currentSize bytes, pads, compares with expectedSize bytes
+    {
+        ByteBuffer expectedBuf = ByteBuffer.allocate(expectedSize);
+        try (DataOutputBuffer out = new DataOutputBuffer())
+        {
+            for (int i = 0; i < currentSize; i++) // the same 1-valued payload goes to the expectation and to the output
+            {
+                expectedBuf.put((byte) 1);
+                out.write(1);
+            }
+            for (int i = currentSize; i < expectedSize; i++) // the remainder of the expectation is zero bytes
+            {
+                expectedBuf.put((byte) 0);
+            }
+
+            PageAware.pad(out);
+            out.flush();
+
+            assertEquals(expectedBuf.rewind(), out.asNewBuffer()); // rewind so the comparison starts at position 0
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/SizedIntsTest.java b/test/unit/org/apache/cassandra/utils/SizedIntsTest.java
new file mode 100644
index 000000000000..0d1e348c53e9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/SizedIntsTest.java
@@ -0,0 +1,125 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import java.io.IOException;
+import java.math.BigInteger;
+
+import org.apache.commons.io.EndianUtils;
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.DataOutputBuffer;
+
+import static org.junit.Assert.assertEquals;
+import static org.quicktheories.QuickTheory.qt;
+import static org.quicktheories.generators.SourceDSL.longs;
+
+public class SizedIntsTest
+{
+    // The sizes asserted below are byte counts; one byte covers the signed range [-128, 127].
+    @Test
+    public void nonZeroSize()
+    {
+        assertEquals(1, SizedInts.nonZeroSize(0)); // zero still takes one byte with this variant
+        assertEquals(1, SizedInts.nonZeroSize(1));
+        assertEquals(1, SizedInts.nonZeroSize(127));
+        assertEquals(1, SizedInts.nonZeroSize(-127));
+        assertEquals(2, SizedInts.nonZeroSize(128)); // first positive value that no longer fits one signed byte
+        assertEquals(1, SizedInts.nonZeroSize(-128));
+        assertEquals(2, SizedInts.nonZeroSize(-129)); // first negative value that no longer fits one signed byte
+        assertEquals(8, SizedInts.nonZeroSize(0x7fffffffffffffffL)); // from here on each literal drops one hex digit
+        assertEquals(8, SizedInts.nonZeroSize(0x7ffffffffffffffL));
+        assertEquals(7, SizedInts.nonZeroSize(0x7fffffffffffffL));
+        assertEquals(7, SizedInts.nonZeroSize(0x7ffffffffffffL));
+        assertEquals(6, SizedInts.nonZeroSize(0x7fffffffffffL));
+        assertEquals(6, SizedInts.nonZeroSize(0x7ffffffffffL));
+        assertEquals(5, SizedInts.nonZeroSize(0x7fffffffffL));
+        assertEquals(5, SizedInts.nonZeroSize(0x7ffffffffL));
+        assertEquals(4, SizedInts.nonZeroSize(0x7fffffffL));
+        assertEquals(4, SizedInts.nonZeroSize(0x7ffffffL));
+        assertEquals(3, SizedInts.nonZeroSize(0x7fffffL));
+        assertEquals(3, SizedInts.nonZeroSize(0x7ffffL));
+        assertEquals(2, SizedInts.nonZeroSize(0x7fffL));
+        assertEquals(2, SizedInts.nonZeroSize(0x7ffL));
+        assertEquals(1, SizedInts.nonZeroSize(0x7fL));
+        assertEquals(1, SizedInts.nonZeroSize(0x7L));
+    }
+
+    @Test
+    public void sizeAllowingZero()
+    {
+        assertEquals(0, SizedInts.sizeAllowingZero(0)); // the only expectation that differs from nonZeroSize above
+        assertEquals(1, SizedInts.sizeAllowingZero(1));
+        assertEquals(1, SizedInts.sizeAllowingZero(127));
+        assertEquals(1, SizedInts.sizeAllowingZero(-127));
+        assertEquals(2, SizedInts.sizeAllowingZero(128));
+        assertEquals(1, SizedInts.sizeAllowingZero(-128));
+        assertEquals(2, SizedInts.sizeAllowingZero(-129));
+    }
+
+
+    @Test
+    public void readWrite() // property test: a value written with its computed size must read back unchanged
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer(8))
+        {
+            qt().forAll(Generators.bytes(1, 8).map(bb -> new BigInteger(ByteBufferUtil.getArray(bb)).longValue()).mix(longs().all())).check(v -> {
+                out.clear(); // the buffer is shared by all generated values, so start each check empty
+                try
+                {
+                    SizedInts.write(out, v, SizedInts.sizeAllowingZero(v));
+                    out.flush();
+                    long r = SizedInts.read(out.asNewBuffer(), 0, SizedInts.sizeAllowingZero(v));
+                    return v == r;
+                }
+                catch (IOException e)
+                {
+                    throw Throwables.cleaned(e);
+                }
+            });
+        }
+    }
+
+    @Test
+    public void readWriteUnsigned() // property test for readUnsigned: only the first size bytes of the two encodings are compared
+    {
+        try (DataOutputBuffer out = new DataOutputBuffer(8))
+        {
+            byte[] buf1 = new byte[8]; // byte image of the original value
+            byte[] buf2 = new byte[8]; // byte image of the value read back
+            qt().forAll(Generators.bytes(1, 8).map(bb -> new BigInteger(ByteBufferUtil.getArray(bb)).longValue()).mix(longs().all())).check(v -> {
+                out.clear(); // the buffer is shared by all generated values, so start each check empty
+                try
+                {
+                    int size = SizedInts.sizeAllowingZero(v);
+                    EndianUtils.writeSwappedLong(buf1, 0, v); // NOTE(review): presumably low-order byte first, so index 0..size-1 are the low bytes - confirm
+                    SizedInts.write(out, v, size);
+                    out.flush();
+                    long r = SizedInts.readUnsigned(out.asNewBuffer(), 0, size);
+                    EndianUtils.writeSwappedLong(buf2, 0, r);
+                    return ByteArrayUtil.compareUnsigned(buf1, 0, buf2, 0, size) == 0;
+                }
+                catch (IOException e)
+                {
+                    throw Throwables.cleaned(e);
+                }
+            });
+        }
+    }
+}
\ No newline at end of file

From 20b4628d0de65195f00d6752fb3a1244cb2411a8 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 13:11:46 +0000
Subject: [PATCH 037/151] STAR-121: Index group API changes

(cherry picked from commit eec063120d7c617a5bc7215234cbaab60d8b78bc)
(cherry picked from commit 8363f8e4696741f0a31e072921e20344e7e5686b)
---
 .../schema/CreateIndexStatement.java          |  12 +-
 .../db/CassandraTableWriteHandler.java        |   4 +-
 .../cassandra/db/ColumnFamilyStore.java       |  59 +-
 .../org/apache/cassandra/db/Keyspace.java     |   6 +-
 .../org/apache/cassandra/db/Memtable.java     |  19 +
 .../db/PartitionRangeReadCommand.java         |  30 +-
 .../org/apache/cassandra/db/ReadCommand.java  |  79 +--
 .../cassandra/db/ReadExecutionController.java |  12 +-
 .../db/SinglePartitionReadCommand.java        |  25 +-
 .../cassandra/db/TableWriteHandler.java       |   2 +-
 .../AbstractCompactionStrategy.java           |   4 +-
 .../db/compaction/AbstractStrategyHolder.java |   2 +-
 .../db/compaction/ActiveCompactions.java      |   8 +
 .../db/compaction/CompactionInfo.java         |  36 ++
 .../db/compaction/CompactionIterator.java     |   3 +-
 .../db/compaction/CompactionManager.java      |  69 ++-
 .../compaction/CompactionStrategyHolder.java  |   4 +-
 .../compaction/CompactionStrategyManager.java |   4 +-
 .../db/compaction/OperationType.java          |   7 +
 .../db/compaction/PendingRepairHolder.java    |   4 +-
 .../cassandra/db/compaction/Upgrader.java     |   2 +-
 .../writers/DefaultCompactionWriter.java      |   2 +-
 .../writers/MajorLeveledCompactionWriter.java |   2 +-
 .../writers/MaxSSTableSizeWriter.java         |   2 +-
 .../SplittingSizeTieredCompactionWriter.java  |   2 +-
 .../cassandra/db/lifecycle/Tracker.java       |   6 +
 .../exceptions/RequestFailureReason.java      |  10 +-
 .../cassandra/gms/ApplicationState.java       |   1 +
 .../apache/cassandra/gms/VersionedValue.java  |   5 +
 .../org/apache/cassandra/index/Index.java     | 340 +++++++++-
 .../apache/cassandra/index/IndexRegistry.java |  88 ++-
 .../index/SecondaryIndexManager.java          | 465 +++++++++++---
 .../cassandra/index/SingletonIndexGroup.java  | 112 ++++
 .../index/SingletonIndexQueryPlan.java        |  87 +++
 .../index/internal/CassandraIndex.java        |  15 +-
 .../cassandra/index/sasi/SASIIndex.java       |  19 +-
 .../sasi/disk/PerSSTableIndexWriter.java      |  13 +
 ...{QueryPlan.java => SASIIndexSearcher.java} |   9 +-
 .../io/sstable/SSTableTxnWriter.java          |   4 +-
 .../io/sstable/SimpleSSTableMultiWriter.java  |   4 +-
 .../sstable/format/SSTableFlushObserver.java  |  67 +-
 .../io/sstable/format/SSTableReader.java      | 108 ++++
 .../io/sstable/format/SSTableWriter.java      |  75 ++-
 .../io/sstable/format/big/BigFormat.java      |   7 +-
 .../io/sstable/format/big/BigTableReader.java |  24 +-
 .../io/sstable/format/big/BigTableWriter.java |  15 +-
 .../io/sstable/format/big/ColumnIndex.java    |  11 +-
 .../cassandra/locator/ReplicaPlans.java       |  18 +-
 .../metrics/ClientRangeRequestMetrics.java    |  47 ++
 .../cassandra/schema/IndexMetadata.java       |  26 +-
 .../cassandra/service/StorageProxy.java       |   2 +
 .../cassandra/service/StorageService.java     |   9 +
 .../service/reads/AbstractReadExecutor.java   |   2 +-
 .../cassandra/service/reads/DataResolver.java |   3 +-
 .../reads/ShortReadRowsProtection.java        |   2 +-
 .../service/reads/range/RangeCommands.java    |   2 +-
 .../reads/range/ReplicaPlanIterator.java      |  11 +-
 .../format/ForwardingSSTableReader.java       |   6 +
 .../org/apache/cassandra/db/ScrubTest.java    |   2 +-
 .../cassandra/db/SecondaryIndexTest.java      |   2 +-
 .../db/compaction/ActiveCompactionsTest.java  |   4 +-
 .../db/lifecycle/RealTransactionsTest.java    |   2 +-
 .../cassandra/index/CustomIndexTest.java      | 584 +++++++++++++++++-
 .../index/SecondaryIndexManagerTest.java      |   4 +-
 .../org/apache/cassandra/index/StubIndex.java |   3 +-
 .../cassandra/index/StubIndexGroup.java       |  99 +++
 .../index/internal/CustomCassandraIndex.java  |   7 +-
 .../cassandra/index/sasi/SASIIndexTest.java   |   6 +-
 .../sasi/disk/PerSSTableIndexWriterTest.java  |   5 +-
 .../io/sstable/SSTableWriterTestBase.java     |   2 +-
 .../format/SSTableFlushObserverTest.java      | 518 +++++++++++++---
 .../AssureSufficientLiveNodesTest.java        |  10 +-
 .../reads/range/RangeCommandIteratorTest.java |   4 +-
 .../reads/range/RangeCommandsTest.java        |   6 +-
 .../reads/range/ReplicaPlanIteratorTest.java  |   2 +-
 .../reads/range/ReplicaPlanMergerTest.java    |   2 +-
 76 files changed, 2803 insertions(+), 471 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/index/SingletonIndexGroup.java
 create mode 100644 src/java/org/apache/cassandra/index/SingletonIndexQueryPlan.java
 rename src/java/org/apache/cassandra/index/sasi/plan/{QueryPlan.java => SASIIndexSearcher.java} (95%)
 create mode 100644 src/java/org/apache/cassandra/metrics/ClientRangeRequestMetrics.java
 create mode 100644 test/unit/org/apache/cassandra/index/StubIndexGroup.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
index d5014236a791..a0ac6e9433c4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
@@ -115,11 +115,11 @@ public Keyspaces apply(Keyspaces schema)
                     throw ire("Duplicate column '%s' in index target list", target.column);
         }
 
-        indexTargets.forEach(t -> validateIndexTarget(table, t));
+        IndexMetadata.Kind kind = attrs.isCustom ? IndexMetadata.Kind.CUSTOM : IndexMetadata.Kind.COMPOSITES;
 
-        String name = null == indexName ? generateIndexName(keyspace, indexTargets) : indexName;
+        indexTargets.forEach(t -> validateIndexTarget(table, kind, t));
 
-        IndexMetadata.Kind kind = attrs.isCustom ? IndexMetadata.Kind.CUSTOM : IndexMetadata.Kind.COMPOSITES;
+        String name = null == indexName ? generateIndexName(keyspace, indexTargets) : indexName;
 
         Map<String, String> options = attrs.isCustom ? attrs.getOptions() : Collections.emptyMap();
 
@@ -150,13 +150,17 @@ Set<String> clientWarnings(KeyspacesDiff diff)
         return ImmutableSet.of();
     }
 
-    private void validateIndexTarget(TableMetadata table, IndexTarget target)
+    private void validateIndexTarget(TableMetadata table, IndexMetadata.Kind kind, IndexTarget target)
     {
         ColumnMetadata column = table.getColumn(target.column);
 
         if (null == column)
             throw ire("Column '%s' doesn't exist", target.column);
 
+        if ((kind == IndexMetadata.Kind.CUSTOM) && !SchemaConstants.isValidName(target.column.toString()))
+            throw ire("Column '%s' is longer than the permissible name length of %d characters or" +
+                      " contains non-alphanumeric-underscore characters", target.column, SchemaConstants.NAME_LENGTH);
+
         if (column.type.referencesDuration())
         {
             if (column.type.isCollection())
diff --git a/src/java/org/apache/cassandra/db/CassandraTableWriteHandler.java b/src/java/org/apache/cassandra/db/CassandraTableWriteHandler.java
index 146539c26ef4..d9dbfa1e0ead 100644
--- a/src/java/org/apache/cassandra/db/CassandraTableWriteHandler.java
+++ b/src/java/org/apache/cassandra/db/CassandraTableWriteHandler.java
@@ -33,10 +33,10 @@ public CassandraTableWriteHandler(ColumnFamilyStore cfs)
 
     @Override
     @SuppressWarnings("resource")
-    public void write(PartitionUpdate update, WriteContext context, UpdateTransaction updateTransaction)
+    public void write(PartitionUpdate update, WriteContext context, boolean updateIndexes)
     {
         CassandraWriteContext ctx = CassandraWriteContext.fromContext(context);
         Tracing.trace("Adding to {} memtable", update.metadata().name);
-        cfs.apply(update, updateTransaction, ctx.getGroup(), ctx.getPosition());
+        cfs.apply(update, ctx, updateIndexes);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index f80a04366238..99aca2b3b7e4 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -519,7 +519,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long k
 
     public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, UUID pendingRepair, boolean isTransient, MetadataCollector metadataCollector, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
-        return getCompactionStrategyManager().createSSTableMultiWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadataCollector, header, indexManager.listIndexes(), lifecycleNewTracker);
+        return getCompactionStrategyManager().createSSTableMultiWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadataCollector, header, indexManager.listIndexGroups(), lifecycleNewTracker);
     }
 
     public boolean supportsEarlyOpen()
@@ -1339,17 +1339,20 @@ private static String ratio(float onHeap, float offHeap)
     /**
      * Insert/Update the column family for this key.
      * Caller is responsible for acquiring Keyspace.switchLock
-     * param @ lock - lock that needs to be used.
-     * param @ key - key for update/insert
-     * param @ columnFamily - columnFamily changes
+     * @param update to be applied
+     * @param context write context for current update
+     * @param updateIndexes whether secondary indexes should be updated
      */
-    public void apply(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup, CommitLogPosition commitLogPosition)
-
+    @SuppressWarnings("resource") // opGroup
+    public void apply(PartitionUpdate update, CassandraWriteContext context, boolean updateIndexes)
     {
         long start = System.nanoTime();
+        OpOrder.Group opGroup = context.getGroup();
+        CommitLogPosition commitLogPosition = context.getPosition();
         try
         {
             Memtable mt = data.getMemtableFor(opGroup, commitLogPosition);
+            UpdateTransaction indexer = newUpdateTransaction(update, context, updateIndexes, mt);
             long timeDelta = mt.put(update, indexer, opGroup);
             DecoratedKey key = update.partitionKey();
             invalidateCachedPartition(key);
@@ -1374,6 +1377,13 @@ public void apply(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Gro
         }
     }
 
+    private UpdateTransaction newUpdateTransaction(PartitionUpdate update, CassandraWriteContext context, boolean updateIndexes, Memtable memtable)
+    {
+        if (!updateIndexes) // index maintenance was not requested for this write: use the no-op transaction
+            return UpdateTransaction.NO_OP;
+        return indexManager.newUpdateTransaction(update, context, FBUtilities.nowInSeconds(), memtable);
+    }
+
     /**
      * @param sstables
      * @return sstables whose key range overlaps with that of the given sstables, not including itself.
@@ -2244,6 +2254,14 @@ public void clearUnsafe()
      * Truncate deletes the entire column family's data with no expensive tombstone creation
      */
     public void truncateBlocking()
+    {
+        truncateBlocking(DatabaseDescriptor.isAutoSnapshot());
+    }
+
+    /**
+     * Truncates this table's data without creating tombstones; {@code snapshot} controls whether a snapshot is taken before the sstables are discarded.
+     */
+    public void truncateBlocking(boolean snapshot)
     {
         // We have two goals here:
         // - truncate should delete everything written before truncate was invoked
@@ -2264,7 +2282,7 @@ public void truncateBlocking()
         final long truncatedAt;
         final CommitLogPosition replayAfter;
 
-        if (keyspace.getMetadata().params.durableWrites || DatabaseDescriptor.isAutoSnapshot())
+        if (keyspace.getMetadata().params.durableWrites || snapshot)
         {
             replayAfter = forceBlockingFlush();
             viewManager.forceBlockingFlush();
@@ -2302,13 +2320,13 @@ public void run()
                                                    "Stopping parent sessions {} due to truncation of tableId="+metadata.id);
                 data.notifyTruncated(truncatedAt);
 
-            if (DatabaseDescriptor.isAutoSnapshot())
-                snapshot(Keyspace.getTimestampedSnapshotNameWithPrefix(name, SNAPSHOT_TRUNCATE_PREFIX));
+                if (snapshot)
+                    snapshot(Keyspace.getTimestampedSnapshotNameWithPrefix(name, SNAPSHOT_TRUNCATE_PREFIX));
 
-            discardSSTables(truncatedAt);
+                discardSSTables(truncatedAt);
 
-            indexManager.truncateAllIndexesBlocking(truncatedAt);
-            viewManager.truncateBlocking(replayAfter, truncatedAt);
+                indexManager.truncateAllIndexesBlocking(truncatedAt);
+                viewManager.truncateBlocking(replayAfter, truncatedAt);
 
                 SystemKeyspace.saveTruncationRecord(ColumnFamilyStore.this, truncatedAt, replayAfter);
                 logger.trace("cleaning out row cache");
@@ -2317,10 +2335,9 @@ public void run()
             }
         };
 
-        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true, true);
+        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true, true, CompactionInfo.StopTrigger.TRUNCATE);
 
         viewManager.build();
-
         logger.info("Truncate of {}.{} is complete", keyspace.getName(), name);
     }
 
@@ -2343,6 +2360,16 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptV
         return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true);
     }
 
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation, boolean interruptViews, CompactionInfo.StopTrigger trigger)
+    {
+        return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true, trigger);
+    }
+
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes)
+    {
+        return runWithCompactionsDisabled(callable, sstablesPredicate, interruptValidation, interruptViews, interruptIndexes, CompactionInfo.StopTrigger.NONE);
+    }
+
     /**
      * Runs callable with compactions paused and compactions including sstables matching sstablePredicate stopped
      *
@@ -2353,7 +2380,7 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptV
      * @param interruptIndexes if we should interrupt compactions on indexes. NOTE: if you set this to true your sstablePredicate
      *                         must be able to handle LocalPartitioner sstables!
      */
-    public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes)
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes, CompactionInfo.StopTrigger trigger)
     {
         // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly,
         // and so we only run one major compaction at a time
@@ -2372,7 +2399,7 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableR
                  CompactionManager.CompactionPauser pausedStrategies = pauseCompactionStrategies(toInterruptFor))
             {
                 // interrupt in-progress compactions
-                CompactionManager.instance.interruptCompactionForCFs(toInterruptFor, sstablesPredicate, interruptValidation);
+                CompactionManager.instance.interruptCompactionForCFs(toInterruptFor, sstablesPredicate, interruptValidation, trigger);
                 CompactionManager.instance.waitForCessation(toInterruptFor, sstablesPredicate);
 
                 // doublecheck that we finished, instead of timing out
diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index ead01fbe241e..9c32f4a67797 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java
@@ -626,7 +626,6 @@ else if (isDeferrable)
                     columnFamilyStores.get(tableId).metric.viewLockAcquireTime.update(acquireTime, MILLISECONDS);
             }
         }
-        int nowInSec = FBUtilities.nowInSeconds();
         try (WriteContext ctx = getWriteHandler().beginWrite(mutation, makeDurable))
         {
             for (PartitionUpdate upd : mutation.getPartitionUpdates())
@@ -655,10 +654,7 @@ else if (isDeferrable)
                     }
                 }
 
-                UpdateTransaction indexTransaction = updateIndexes
-                                                     ? cfs.indexManager.newUpdateTransaction(upd, ctx, nowInSec)
-                                                     : UpdateTransaction.NO_OP;
-                cfs.getWriteHandler().write(upd, ctx, indexTransaction);
+                cfs.getWriteHandler().write(upd, ctx, updateIndexes);
 
                 if (requiresViewUpdate)
                     baseComplete.set(System.currentTimeMillis());
diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
index 3186ffb92aeb..4da1c26503e7 100644
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ b/src/java/org/apache/cassandra/db/Memtable.java
@@ -33,10 +33,14 @@
 import java.util.concurrent.atomic.AtomicReference;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 import com.google.common.base.Throwables;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
@@ -110,6 +114,8 @@ private static MemtablePool createMemtableAllocatorPool()
     private final AtomicLong liveDataSize = new AtomicLong(0);
     private final AtomicLong currentOperations = new AtomicLong(0);
 
+    // Allows us to find a Memtable by its tracker; null until flushRunnables() records the flush transaction
+    private volatile LifecycleNewTracker tracker;
     // the write barrier for directing writes to this memtable or the next during a switch
     private volatile OpOrder.Barrier writeBarrier;
     // the precise upper bound of CommitLogPosition owned by this memtable
@@ -178,6 +184,11 @@ public MemtableAllocator getAllocator()
         return allocator;
     }
 
+    public void allocateExtraOnHeap(long additionalSpace, OpOrder.Group opGroup)
+    {
+        getAllocator().onHeap().allocate(additionalSpace, opGroup);
+    }
+
     public long getLiveDataSize()
     {
         return liveDataSize.get();
@@ -308,8 +319,16 @@ public int partitionCount()
         return partitions.size();
     }
 
+    public LifecycleNewTracker tracker()
+    {
+        return tracker;
+    }
+
     public List<FlushRunnable> flushRunnables(LifecycleTransaction txn)
     {
+        Preconditions.checkState(this.tracker == null, "Attempted to flush Memtable more than once on %s.%s", cfs.keyspace.getName(), cfs.name);
+        this.tracker = txn;
+
         return createFlushRunnables(txn);
     }
 
diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
index 82b6e8a6e8de..a1f88f9b761d 100644
--- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
@@ -22,7 +22,6 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
-import org.apache.cassandra.net.MessageFlag;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -41,8 +40,6 @@
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.metrics.TableMetrics;
-import org.apache.cassandra.net.Message;
-import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.tracing.Tracing;
@@ -65,9 +62,9 @@ private PartitionRangeReadCommand(boolean isDigest,
                                      RowFilter rowFilter,
                                      DataLimits limits,
                                      DataRange dataRange,
-                                     IndexMetadata index)
+                                     Index.QueryPlan indexQueryPlan)
     {
-        super(Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+        super(Kind.PARTITION_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan);
         this.dataRange = dataRange;
     }
 
@@ -87,7 +84,7 @@ public static PartitionRangeReadCommand create(TableMetadata metadata,
                                              rowFilter,
                                              limits,
                                              dataRange,
-                                             findIndex(metadata, rowFilter));
+                                             findIndexQueryPlan(metadata, rowFilter));
     }
 
     /**
@@ -158,7 +155,7 @@ public PartitionRangeReadCommand forSubRange(AbstractBounds<PartitionPosition> r
                                              rowFilter(),
                                              isRangeContinuation ? limits() : limits().withoutState(),
                                              dataRange().forSubRange(range),
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     public PartitionRangeReadCommand copy()
@@ -172,7 +169,7 @@ public PartitionRangeReadCommand copy()
                                              rowFilter(),
                                              limits(),
                                              dataRange(),
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     @Override
@@ -187,7 +184,7 @@ protected PartitionRangeReadCommand copyAsDigestQuery()
                                              rowFilter(),
                                              limits(),
                                              dataRange(),
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     @Override
@@ -202,7 +199,7 @@ protected PartitionRangeReadCommand copyAsTransientQuery()
                                              rowFilter(),
                                              limits(),
                                              dataRange(),
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     @Override
@@ -217,7 +214,7 @@ public PartitionRangeReadCommand withUpdatedLimit(DataLimits newLimits)
                                              rowFilter(),
                                              newLimits,
                                              dataRange(),
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     @Override
@@ -232,7 +229,7 @@ public PartitionRangeReadCommand withUpdatedLimitsAndDataRange(DataLimits newLim
                                              rowFilter(),
                                              newLimits,
                                              newDataRange,
-                                             indexMetadata());
+                                             indexQueryPlan());
     }
 
     public long getTimeout(TimeUnit unit)
@@ -383,9 +380,8 @@ protected void appendCQLWhereClause(StringBuilder sb)
      */
     public PartitionIterator postReconciliationProcessing(PartitionIterator result)
     {
-        ColumnFamilyStore cfs = Keyspace.open(metadata().keyspace).getColumnFamilyStore(metadata().name);
-        Index index = getIndex(cfs);
-        return index == null ? result : index.postProcessorFor(this).apply(result, this);
+        Index.QueryPlan queryPlan = indexQueryPlan();
+        return queryPlan == null ? result : queryPlan.postProcessor().apply(result);
     }
 
     @Override
@@ -439,11 +435,11 @@ public ReadCommand deserialize(DataInputPlus in,
                                        ColumnFilter columnFilter,
                                        RowFilter rowFilter,
                                        DataLimits limits,
-                                       IndexMetadata index)
+                                       Index.QueryPlan indexQueryPlan)
         throws IOException
         {
             DataRange range = DataRange.serializer.deserialize(in, version, metadata);
-            return new PartitionRangeReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, range, index);
+            return new PartitionRangeReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, range, indexQueryPlan);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index 7b889d188a18..b63a5ec9dd72 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -47,8 +47,6 @@
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.UnknownIndexException;
 import org.apache.cassandra.index.Index;
-import org.apache.cassandra.index.IndexNotAvailableException;
-import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -99,7 +97,7 @@ public abstract class ReadCommand extends AbstractReadQuery
     int oldestUnrepairedTombstone = Integer.MAX_VALUE;
 
     @Nullable
-    private final IndexMetadata index;
+    private final Index.QueryPlan indexQueryPlan;
 
     protected static abstract class SelectionDeserializer
     {
@@ -113,7 +111,7 @@ public abstract ReadCommand deserialize(DataInputPlus in,
                                                 ColumnFilter columnFilter,
                                                 RowFilter rowFilter,
                                                 DataLimits limits,
-                                                IndexMetadata index) throws IOException;
+                                                Index.QueryPlan indexQueryPlan) throws IOException;
     }
 
     protected enum Kind
@@ -138,7 +136,7 @@ protected ReadCommand(Kind kind,
                           ColumnFilter columnFilter,
                           RowFilter rowFilter,
                           DataLimits limits,
-                          IndexMetadata index)
+                          Index.QueryPlan indexQueryPlan)
     {
         super(metadata, nowInSec, columnFilter, rowFilter, limits);
         if (acceptsTransient && isDigestQuery)
@@ -148,7 +146,7 @@ protected ReadCommand(Kind kind,
         this.isDigestQuery = isDigestQuery;
         this.digestVersion = digestVersion;
         this.acceptsTransient = acceptsTransient;
-        this.index = index;
+        this.indexQueryPlan = indexQueryPlan;
     }
 
     protected abstract void serializeSelection(DataOutputPlus out, int version) throws IOException;
@@ -281,14 +279,20 @@ public boolean isRepairedDataDigestConclusive()
     }
 
     /**
-     * Index (metadata) chosen for this query. Can be null.
+     * Index query plan chosen for this query. Can be null.
      *
-     * @return index (metadata) chosen for this query
+     * @return index query plan chosen for this query
      */
     @Nullable
-    public IndexMetadata indexMetadata()
+    public Index.QueryPlan indexQueryPlan()
     {
-        return index;
+        return indexQueryPlan;
+    }
+
+    @VisibleForTesting
+    public Index.Searcher indexSearcher()
+    {
+        return indexQueryPlan == null ? null : indexQueryPlan.searcherFor(this);
     }
 
     /**
@@ -384,30 +388,26 @@ public ReadResponse createResponse(UnfilteredPartitionIterator iterator)
 
     long indexSerializedSize(int version)
     {
-        return null != index
-             ? IndexMetadata.serializer.serializedSize(index, version)
+        return null != indexQueryPlan
+             ? IndexMetadata.serializer.serializedSize(indexQueryPlan.getFirst().getIndexMetadata(), version)
              : 0;
     }
 
     public Index getIndex(ColumnFamilyStore cfs)
     {
-        return null != index
-             ? cfs.indexManager.getIndex(index)
+        return null != indexQueryPlan
+             ? indexQueryPlan.getFirst()
              : null;
     }
 
-    static IndexMetadata findIndex(TableMetadata table, RowFilter rowFilter)
+    static Index.QueryPlan findIndexQueryPlan(TableMetadata table, RowFilter rowFilter)
     {
         if (table.indexes.isEmpty() || rowFilter.isEmpty())
             return null;
 
         ColumnFamilyStore cfs = Keyspace.openAndGetStore(table);
 
-        Index index = cfs.indexManager.getBestIndexFor(rowFilter);
-
-        return null != index
-             ? index.getIndexMetadata()
-             : null;
+        return cfs.indexManager.getBestIndexQueryPlanFor(rowFilter);
     }
 
     /**
@@ -418,8 +418,8 @@ static IndexMetadata findIndex(TableMetadata table, RowFilter rowFilter)
      */
     public void maybeValidateIndex()
     {
-        if (null != index)
-            IndexRegistry.obtain(metadata()).getIndex(index).validate(this);
+        if (null != indexQueryPlan)
+            indexQueryPlan.validate(this);
     }
 
     /**
@@ -436,15 +436,15 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
         long startTimeNanos = System.nanoTime();
 
         ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata());
-        Index index = getIndex(cfs);
+        Index.QueryPlan indexQueryPlan = indexQueryPlan();
 
         Index.Searcher searcher = null;
-        if (index != null)
+        if (indexQueryPlan != null)
         {
-            if (!cfs.indexManager.isIndexQueryable(index))
-                throw new IndexNotAvailableException(index);
+            cfs.indexManager.checkQueryability(indexQueryPlan);
 
-            searcher = index.searcherFor(this);
+            Index index = indexQueryPlan.getFirst();
+            searcher = indexQueryPlan.searcherFor(this);
             Tracing.trace("Executing read on {}.{} using index {}", cfs.metadata.keyspace, cfs.metadata.name, index.getIndexMetadata().name);
         }
 
@@ -468,7 +468,7 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
 
             // If we've used a 2ndary index, we know the result already satisfy the primary expression used, so
             // no point in checking it again.
-            RowFilter filter = (null == searcher) ? rowFilter() : index.getPostIndexQueryFilter(rowFilter());
+            RowFilter filter = (null == searcher) ? rowFilter() : indexQueryPlan.postIndexQueryFilter();
 
             /*
              * TODO: We'll currently do filtering by the rowFilter here because it's convenient. However,
@@ -967,7 +967,7 @@ public void serialize(ReadCommand command, DataOutputPlus out, int version) thro
             out.writeByte(command.kind.ordinal());
             out.writeByte(
                     digestFlag(command.isDigestQuery())
-                    | indexFlag(null != command.indexMetadata())
+                    | indexFlag(null != command.indexQueryPlan())
                     | acceptsTransientFlag(command.acceptsTransient())
             );
             if (command.isDigestQuery())
@@ -977,8 +977,8 @@ public void serialize(ReadCommand command, DataOutputPlus out, int version) thro
             ColumnFilter.serializer.serialize(command.columnFilter(), out, version);
             RowFilter.serializer.serialize(command.rowFilter(), out, version);
             DataLimits.serializer.serialize(command.limits(), out, version, command.metadata().comparator);
-            if (null != command.index)
-                IndexMetadata.serializer.serialize(command.index, out, version);
+            if (null != command.indexQueryPlan)
+                IndexMetadata.serializer.serialize(command.indexQueryPlan.getFirst().getIndexMetadata(), out, version);
 
             command.serializeSelection(out, version);
         }
@@ -993,9 +993,9 @@ public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
             // better complain loudly than doing the wrong thing.
             if (isForThrift(flags))
                 throw new IllegalStateException("Received a command with the thrift flag set. "
-                                              + "This means thrift is in use in a mixed 3.0/3.X and 4.0+ cluster, "
-                                              + "which is unsupported. Make sure to stop using thrift before "
-                                              + "upgrading to 4.0");
+                                               + "This means thrift is in use in a mixed 3.0/3.X and 4.0+ cluster, "
+                                               + "which is unsupported. Make sure to stop using thrift before "
+                                               + "upgrading to 4.0");
 
             boolean hasIndex = hasIndex(flags);
             int digestVersion = isDigest ? (int)in.readUnsignedVInt() : 0;
@@ -1004,9 +1004,16 @@ public ReadCommand deserialize(DataInputPlus in, int version) throws IOException
             ColumnFilter columnFilter = ColumnFilter.serializer.deserialize(in, version, metadata);
             RowFilter rowFilter = RowFilter.serializer.deserialize(in, version, metadata);
             DataLimits limits = DataLimits.serializer.deserialize(in, version,  metadata.comparator);
-            IndexMetadata index = hasIndex ? deserializeIndexMetadata(in, version, metadata) : null;
+            Index.QueryPlan indexQueryPlan = null;
+            if (hasIndex)
+            {
+                IndexMetadata index = deserializeIndexMetadata(in, version, metadata);
+                Index.Group indexGroup =  Keyspace.openAndGetStore(metadata).indexManager.getIndexGroup(index);
+                if (indexGroup != null)
+                    indexQueryPlan = indexGroup.queryPlanFor(rowFilter);
+            }
 
-            return kind.selectionDeserializer.deserialize(in, version, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+            return kind.selectionDeserializer.deserialize(in, version, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan);
         }
 
         private IndexMetadata deserializeIndexMetadata(DataInputPlus in, int version, TableMetadata metadata) throws IOException
diff --git a/src/java/org/apache/cassandra/db/ReadExecutionController.java b/src/java/org/apache/cassandra/db/ReadExecutionController.java
index 73ddad8022d8..6e917e8e3511 100644
--- a/src/java/org/apache/cassandra/db/ReadExecutionController.java
+++ b/src/java/org/apache/cassandra/db/ReadExecutionController.java
@@ -93,7 +93,7 @@ public static ReadExecutionController empty()
     static ReadExecutionController forCommand(ReadCommand command)
     {
         ColumnFamilyStore baseCfs = Keyspace.openAndGetStore(command.metadata());
-        ColumnFamilyStore indexCfs = maybeGetIndexCfs(baseCfs, command);
+        ColumnFamilyStore indexCfs = maybeGetIndexCfs(command);
 
         long createdAtNanos = baseCfs.metric.topLocalReadQueryTime.isEnabled() ? clock.now() : NO_SAMPLING;
 
@@ -134,10 +134,14 @@ static ReadExecutionController forCommand(ReadCommand command)
         }
     }
 
-    private static ColumnFamilyStore maybeGetIndexCfs(ColumnFamilyStore baseCfs, ReadCommand command)
+    private static ColumnFamilyStore maybeGetIndexCfs(ReadCommand command)
     {
-        Index index = command.getIndex(baseCfs);
-        return index == null ? null : index.getBackingTable().orElse(null);
+        // a command without an index query plan has no index table to read from
+        Index.QueryPlan plan = command.indexQueryPlan();
+
+        // only the index groups with a single member are allowed to have a backing table,
+        // so the backing table (if any) is looked up on the first index of the plan
+        return plan == null ? null : plan.getFirst().getBackingTable().orElse(null);
     }
 
     public TableMetadata metadata()
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 5d52052c5f6b..7b82929e1032 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.db.transform.RTBoundValidator;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
@@ -72,9 +73,9 @@ protected SinglePartitionReadCommand(boolean isDigest,
                                          DataLimits limits,
                                          DecoratedKey partitionKey,
                                          ClusteringIndexFilter clusteringIndexFilter,
-                                         IndexMetadata index)
+                                         Index.QueryPlan indexQueryPlan)
     {
-        super(Kind.SINGLE_PARTITION, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, index);
+        super(Kind.SINGLE_PARTITION, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan);
         assert partitionKey.getPartitioner() == metadata.partitioner;
         this.partitionKey = partitionKey;
         this.clusteringIndexFilter = clusteringIndexFilter;
@@ -90,7 +91,7 @@ protected SinglePartitionReadCommand(boolean isDigest,
      * @param limits the limits to use for the query.
      * @param partitionKey the partition key for the partition to query.
      * @param clusteringIndexFilter the clustering index filter to use for the query.
-     * @param indexMetadata explicitly specified index to use for the query
+     * @param indexQueryPlan explicitly specified index query plan to use for the query
      *
      * @return a newly created read command.
      */
@@ -101,7 +102,7 @@ public static SinglePartitionReadCommand create(TableMetadata metadata,
                                                     DataLimits limits,
                                                     DecoratedKey partitionKey,
                                                     ClusteringIndexFilter clusteringIndexFilter,
-                                                    IndexMetadata indexMetadata)
+                                                    Index.QueryPlan indexQueryPlan)
     {
         return new SinglePartitionReadCommand(false,
                                               0,
@@ -113,7 +114,7 @@ public static SinglePartitionReadCommand create(TableMetadata metadata,
                                               limits,
                                               partitionKey,
                                               clusteringIndexFilter,
-                                              indexMetadata);
+                                              indexQueryPlan);
     }
 
     /**
@@ -144,7 +145,7 @@ public static SinglePartitionReadCommand create(TableMetadata metadata,
                       limits,
                       partitionKey,
                       clusteringIndexFilter,
-                      findIndex(metadata, rowFilter));
+                      findIndexQueryPlan(metadata, rowFilter));
     }
 
     /**
@@ -289,7 +290,7 @@ public SinglePartitionReadCommand copy()
                                               limits(),
                                               partitionKey(),
                                               clusteringIndexFilter(),
-                                              indexMetadata());
+                                              indexQueryPlan());
     }
 
     @Override
@@ -305,7 +306,7 @@ protected SinglePartitionReadCommand copyAsDigestQuery()
                                               limits(),
                                               partitionKey(),
                                               clusteringIndexFilter(),
-                                              indexMetadata());
+                                              indexQueryPlan());
     }
 
     @Override
@@ -321,7 +322,7 @@ protected SinglePartitionReadCommand copyAsTransientQuery()
                                               limits(),
                                               partitionKey(),
                                               clusteringIndexFilter(),
-                                              indexMetadata());
+                                              indexQueryPlan());
     }
 
     @Override
@@ -337,7 +338,7 @@ public SinglePartitionReadCommand withUpdatedLimit(DataLimits newLimits)
                                               newLimits,
                                               partitionKey(),
                                               clusteringIndexFilter(),
-                                              indexMetadata());
+                                              indexQueryPlan());
     }
 
     @Override
@@ -1129,12 +1130,12 @@ public ReadCommand deserialize(DataInputPlus in,
                                        ColumnFilter columnFilter,
                                        RowFilter rowFilter,
                                        DataLimits limits,
-                                       IndexMetadata index)
+                                       Index.QueryPlan indexQueryPlan)
         throws IOException
         {
             DecoratedKey key = metadata.partitioner.decorateKey(metadata.partitionKeyType.readBuffer(in, DatabaseDescriptor.getMaxValueSize()));
             ClusteringIndexFilter filter = ClusteringIndexFilter.serializer.deserialize(in, version, metadata);
-            return new SinglePartitionReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, index);
+            return new SinglePartitionReadCommand(isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, key, filter, indexQueryPlan);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/TableWriteHandler.java b/src/java/org/apache/cassandra/db/TableWriteHandler.java
index 4e4722192528..e4caebfaebb0 100644
--- a/src/java/org/apache/cassandra/db/TableWriteHandler.java
+++ b/src/java/org/apache/cassandra/db/TableWriteHandler.java
@@ -23,5 +23,5 @@
 
 public interface TableWriteHandler
 {
-    void write(PartitionUpdate update, WriteContext context, UpdateTransaction updateTransaction);
+    void write(PartitionUpdate update, WriteContext context, boolean updateIndexes);
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index 0b37c225c4ca..6b4df9471328 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
@@ -546,10 +546,10 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        boolean isTransient,
                                                        MetadataCollector meta,
                                                        SerializationHeader header,
-                                                       Collection<Index> indexes,
+                                                       Collection<Index.Group> indexGroups,
                                                        LifecycleNewTracker lifecycleNewTracker)
     {
-        return SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, cfs.metadata, meta, header, indexes, lifecycleNewTracker);
+        return SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, cfs.metadata, meta, header, indexGroups, lifecycleNewTracker);
     }
 
     public boolean supportsEarlyOpen()
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java b/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
index 95fc7b85b08f..ab8c0046dae2 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
@@ -197,7 +197,7 @@ public abstract SSTableMultiWriter createSSTableMultiWriter(Descriptor descripto
                                                                 boolean isTransient,
                                                                 MetadataCollector collector,
                                                                 SerializationHeader header,
-                                                                Collection<Index> indexes,
+                                                                Collection<Index.Group> indexGroups,
                                                                 LifecycleNewTracker lifecycleNewTracker);
 
     /**
diff --git a/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java b/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
index 7b6b5bf1fe87..abaad6349036 100644
--- a/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
+++ b/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
@@ -72,4 +72,12 @@ public Collection<CompactionInfo> getCompactionsForSSTable(SSTableReader sstable
         }
         return toReturn;
     }
+
+    /**
+     * @return true if given compaction is still active, i.e. the holder is still contained in {@code compactions}
+     */
+    public boolean isActive(CompactionInfo.Holder ci)
+    {
+        return compactions.contains(ci);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
index bdddaab61198..148690b88e43 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
@@ -167,9 +167,30 @@ boolean shouldStop(Predicate<SSTableReader> sstablePredicate)
         return sstables.stream().anyMatch(sstablePredicate);
     }
 
+    public enum StopTrigger
+    {
+        NONE(false),    // not final: may be replaced by a later trigger
+        TRUNCATE(true); // final: once recorded it is kept
+
+        private final boolean isFinal;
+
+        StopTrigger(boolean isFinal)
+        {
+            this.isFinal = isFinal;
+        }
+
+        // A stop trigger marked as final should not be overwritten. So a table operation that is
+        // marked with a final stop trigger cannot have its stop trigger changed to another value.
+        public boolean isFinal()
+        {
+            return isFinal;
+        }
+    }
+
     public static abstract class Holder
     {
         private volatile boolean stopRequested = false;
+        private volatile StopTrigger trigger = StopTrigger.NONE;
         public abstract CompactionInfo getCompactionInfo();
 
         public void stop()
@@ -177,6 +198,13 @@ public void stop()
             stopRequested = true;
         }
 
+        public void stop(StopTrigger trigger)
+        {
+            if (!this.trigger.isFinal()) // a final trigger that was already recorded must not be overwritten
+                this.trigger = trigger;
+            this.stopRequested = true; // volatile write done last: a reader that sees this stop request also sees its trigger
+        }
+
         /**
          * if this compaction involves several/all tables we can safely check globalCompactionsPaused
          * in isStopRequested() below
@@ -187,6 +215,14 @@ public boolean isStopRequested()
         {
             return stopRequested || (isGlobal() && CompactionManager.instance.isGlobalCompactionPaused());
         }
+
+        /**
+         * @return cause of compaction interruption; {@link StopTrigger#NONE} is the initial value.
+         */
+        public StopTrigger trigger()
+        {
+            return trigger;
+        }
     }
 
     public enum Unit
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
index ec6a4d464c21..873fca0547e9 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
@@ -23,6 +23,7 @@
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Ordering;
 
+import org.apache.cassandra.index.transactions.IndexTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.TableMetadata;
 
@@ -169,7 +170,7 @@ public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey par
 
                 CompactionIterator.this.updateCounterFor(merged);
 
-                if (type != OperationType.COMPACTION || !controller.cfs.indexManager.hasIndexes())
+                if (type != OperationType.COMPACTION || !controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION))
                     return null;
 
                 Columns statics = Columns.NONE;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index 8a0926d48c47..7d503892c136 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -1429,7 +1429,7 @@ public static SSTableWriter createWriter(ColumnFamilyStore cfs,
                                     isTransient,
                                     sstable.getSSTableLevel(),
                                     sstable.header,
-                                    cfs.indexManager.listIndexes(),
+                                    cfs.indexManager.listIndexGroups(),
                                     txn);
     }
 
@@ -1466,7 +1466,7 @@ public static SSTableWriter createWriterForAntiCompaction(ColumnFamilyStore cfs,
                                     cfs.metadata,
                                     new MetadataCollector(sstables, cfs.metadata().comparator, minLevel),
                                     SerializationHeader.make(cfs.metadata(), sstables),
-                                    cfs.indexManager.listIndexes(),
+                                    cfs.indexManager.listIndexGroups(),
                                     txn);
     }
 
@@ -2166,6 +2166,60 @@ public void setMaxConcurrentAutoUpgradeTasks(int value)
         }
     }
 
+    /**
+     * Try to stop all of the compactions for given tables that match both the operation type and sstable predicates.
+     *
+     * Unless {@code waitForInterruption} is set, this method does not wait for the compactions to finish; you'll
+     * need to loop against isCompacting if you want that behavior.
+     *
+     * @param tables The tables to try to stop compaction upon.
+     * @param opPredicate Predicate to define which compaction operation to stop, based on its type.
+     * @param readerPredicate Predicate to define which compaction to stop based on candidate sstables.
+     * @param waitForInterruption whether to wait (at most 2 minutes overall) until interrupted compactions have fully stopped
+     * @return True if any compaction has been interrupted, false otherwise.
+     * @throws RuntimeException if waiting was requested and an interrupted compaction is still active after 2 minutes
+     */
+    public boolean interruptCompactionFor(Iterable<TableMetadata> tables, Predicate<OperationType> opPredicate, Predicate<SSTableReader> readerPredicate,
+                                          boolean waitForInterruption)
+    {
+        assert tables != null;
+
+        // interrupt in-progress compactions
+        Set<Holder> interrupted = new HashSet<>();
+        for (Holder compactionHolder : active.getCompactions())
+        {
+            CompactionInfo info = compactionHolder.getCompactionInfo();
+            // readerPredicate is applied too, so only compactions whose sstables match are interrupted
+            if (Iterables.contains(tables, info.getTableMetadata()) && opPredicate.test(info.getTaskType()) && info.shouldStop(readerPredicate))
+            {
+                compactionHolder.stop();
+                interrupted.add(compactionHolder);
+            }
+        }
+
+        if (waitForInterruption)
+        {
+            // wait at most 2 minutes in total: the deadline is shared by all interrupted operations
+            long start = System.nanoTime();
+            long wait = TimeUnit.MINUTES.toNanos(2);
+
+            for (Holder operation : interrupted)
+            {
+                while (active.isActive(operation) && System.nanoTime() - start < wait)
+                    Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
+
+                if (active.isActive(operation))
+                    throw new RuntimeException(String.format("Compaction task (%s) didn't finish within 2 minutes", operation.getCompactionInfo()));
+            }
+        }
+
+        return !interrupted.isEmpty();
+    }
+
+    public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
+    {
+        interruptCompactionFor(columnFamilies, sstablePredicate, interruptValidation, CompactionInfo.StopTrigger.NONE); // overload without an explicit trigger: delegates with StopTrigger.NONE
+    }
     /**
      * Try to stop all of the compactions for given ColumnFamilies.
      *
@@ -2176,7 +2230,7 @@ public void setMaxConcurrentAutoUpgradeTasks(int value)
      * @param sstablePredicate the sstable predicate to match on
      * @param interruptValidation true if validation operations for repair should also be interrupted
      */
-    public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
+    public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, CompactionInfo.StopTrigger trigger)
     {
         assert columnFamilies != null;
 
@@ -2190,18 +2244,23 @@ public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predi
             if (info.getTableMetadata() == null || Iterables.contains(columnFamilies, info.getTableMetadata()))
             {
                 if (info.shouldStop(sstablePredicate))
-                    compactionHolder.stop();
+                    compactionHolder.stop(trigger);
             }
         }
     }
 
     public void interruptCompactionForCFs(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
+    {
+        interruptCompactionForCFs(cfss, sstablePredicate, interruptValidation, CompactionInfo.StopTrigger.NONE);
+    }
+
+    public void interruptCompactionForCFs(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, CompactionInfo.StopTrigger trigger)
     {
         List<TableMetadata> metadata = new ArrayList<>();
         for (ColumnFamilyStore cfs : cfss)
             metadata.add(cfs.metadata());
 
-        interruptCompactionFor(metadata, sstablePredicate, interruptValidation);
+        interruptCompactionFor(metadata, sstablePredicate, interruptValidation, trigger);
     }
 
     public void waitForCessation(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate)
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
index 129ee797ee0e..bd2ac772657c 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
@@ -223,7 +223,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        boolean isTransient,
                                                        MetadataCollector collector,
                                                        SerializationHeader header,
-                                                       Collection<Index> indexes,
+                                                       Collection<Index.Group> indexGroups,
                                                        LifecycleNewTracker lifecycleNewTracker)
     {
         if (isRepaired)
@@ -247,7 +247,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                  isTransient,
                                                  collector,
                                                  header,
-                                                 indexes,
+                                                 indexGroups,
                                                  lifecycleNewTracker);
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
index deece30d45d9..963dafee22c8 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
@@ -1065,7 +1065,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        boolean isTransient,
                                                        MetadataCollector collector,
                                                        SerializationHeader header,
-                                                       Collection<Index> indexes,
+                                                       Collection<Index.Group> indexGroups,
                                                        LifecycleNewTracker lifecycleNewTracker)
     {
         SSTable.validateRepairedMetadata(repairedAt, pendingRepair, isTransient);
@@ -1080,7 +1080,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                                                               isTransient,
                                                                                               collector,
                                                                                               header,
-                                                                                              indexes,
+                                                                                              indexGroups,
                                                                                               lifecycleNewTracker);
         }
         finally
diff --git a/src/java/org/apache/cassandra/db/compaction/OperationType.java b/src/java/org/apache/cassandra/db/compaction/OperationType.java
index e957e42c9df5..7c602aa8a18f 100644
--- a/src/java/org/apache/cassandra/db/compaction/OperationType.java
+++ b/src/java/org/apache/cassandra/db/compaction/OperationType.java
@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.db.compaction;
 
+import com.google.common.base.Predicate;
+
 public enum OperationType
 {
     /** Each modification here should be also applied to {@link org.apache.cassandra.tools.nodetool.Stop#compactionType} */
@@ -64,4 +66,9 @@ public String toString()
     {
         return type;
     }
+
+    public static final Predicate<OperationType> REWRITES_SSTABLES = op -> op == COMPACTION || op == TOMBSTONE_COMPACTION ||
+                                                                           op == CLEANUP || op == SCRUB || op == ANTICOMPACTION ||
+                                                                           op == UPGRADE_SSTABLES || op == RELOCATE ||
+                                                                           op == GARBAGE_COLLECT; // matches exactly the eight operation types listed above
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java b/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
index 03d411174503..000f5c48be56 100644
--- a/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
@@ -241,7 +241,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        boolean isTransient,
                                                        MetadataCollector collector,
                                                        SerializationHeader header,
-                                                       Collection<Index> indexes,
+                                                       Collection<Index.Group> indexGroups,
                                                        LifecycleNewTracker lifecycleNewTracker)
     {
         Preconditions.checkArgument(repairedAt == ActiveRepairService.UNREPAIRED_SSTABLE,
@@ -257,7 +257,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                  isTransient,
                                                  collector,
                                                  header,
-                                                 indexes,
+                                                 indexGroups,
                                                  lifecycleNewTracker);
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
index e1406aa7ed97..3972579efc88 100644
--- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java
+++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
@@ -80,7 +80,7 @@ private SSTableWriter createCompactionWriter(StatsMetadata metadata)
                                     cfs.metadata,
                                     sstableMetadataCollector,
                                     SerializationHeader.make(cfs.metadata(), Sets.newHashSet(sstable)),
-                                    cfs.indexManager.listIndexes(),
+                                    cfs.indexManager.listIndexGroups(),
                                     transaction);
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
index 6180f96100b5..f74c7532383c 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
@@ -76,7 +76,7 @@ public void switchCompactionLocation(Directories.DataDirectory directory)
                                                     cfs.metadata,
                                                     new MetadataCollector(txn.originals(), cfs.metadata().comparator, sstableLevel),
                                                     SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexes(),
+                                                    cfs.indexManager.listIndexGroups(),
                                                     txn);
         sstableWriter.switchWriter(writer);
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
index 93043913f39b..c5ec0eef7be6 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
@@ -100,7 +100,7 @@ public void switchCompactionLocation(Directories.DataDirectory location)
                 cfs.metadata,
                 new MetadataCollector(txn.originals(), cfs.metadata().comparator, currentLevel),
                 SerializationHeader.make(cfs.metadata(), txn.originals()),
-                cfs.indexManager.listIndexes(),
+                cfs.indexManager.listIndexGroups(),
                 txn));
         partitionsWritten = 0;
         sstablesWritten = 0;
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
index af21e51ed4f3..8eaa8c18612c 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
@@ -102,7 +102,7 @@ public void switchCompactionLocation(Directories.DataDirectory location)
                                                     cfs.metadata,
                                                     new MetadataCollector(allSSTables, cfs.metadata().comparator, level),
                                                     SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexes(),
+                                                    cfs.indexManager.listIndexGroups(),
                                                     txn);
 
         sstableWriter.switchWriter(writer);
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
index f2d6fe91674a..c43d224fd92e 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
@@ -111,7 +111,7 @@ public void switchCompactionLocation(Directories.DataDirectory location)
                                                     cfs.metadata,
                                                     new MetadataCollector(allSSTables, cfs.metadata().comparator, 0),
                                                     SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexes(),
+                                                    cfs.indexManager.listIndexGroups(),
                                                     txn);
         logger.trace("Switching writer, currentPartitionsToWrite = {}", currentPartitionsToWrite);
         sstableWriter.switchWriter(writer);
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index 3d72a113b804..a0cbd8b8b66b 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
@@ -182,6 +182,12 @@ Throwable updateSizeTracking(Iterable<SSTableReader> oldSSTables, Iterable<SSTab
         return accumulate;
     }
 
+    public void updateSizeTracking(long adjustment) // applies the same delta to both the live and total disk space metrics
+    {
+        cfstore.metric.liveDiskSpaceUsed.inc(adjustment);
+        cfstore.metric.totalDiskSpaceUsed.inc(adjustment);
+    }
+
     // SETUP / CLEANUP
 
     public void addInitialSSTables(Iterable<SSTableReader> sstables)
diff --git a/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java b/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java
index 1cdbdb544d28..c32dfc700dc1 100644
--- a/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java
+++ b/src/java/org/apache/cassandra/exceptions/RequestFailureReason.java
@@ -35,7 +35,8 @@ public enum RequestFailureReason
     UNKNOWN                  (0),
     READ_TOO_MANY_TOMBSTONES (1),
     TIMEOUT                  (2),
-    INCOMPATIBLE_SCHEMA      (3);
+    INCOMPATIBLE_SCHEMA      (3),
+    INDEX_NOT_AVAILABLE      (4);
 
     public static final Serializer serializer = new Serializer();
 
@@ -46,6 +47,13 @@ public enum RequestFailureReason
         this.code = code;
     }
 
+    public int codeForNativeProtocol()
+    {
+        // We explicitly indicated in the protocol spec that drivers should not error out on an unknown code, and we
+        // currently support a superset of the OSS codes, so we don't yet worry about the version: the code is returned as-is.
+        return code;
+    }
+
     private static final RequestFailureReason[] codeToReasonMap;
 
     static
diff --git a/src/java/org/apache/cassandra/gms/ApplicationState.java b/src/java/org/apache/cassandra/gms/ApplicationState.java
index 4e20d6204884..f5a0670e8dab 100644
--- a/src/java/org/apache/cassandra/gms/ApplicationState.java
+++ b/src/java/org/apache/cassandra/gms/ApplicationState.java
@@ -56,6 +56,7 @@ public enum ApplicationState
      * a comma-separated list.
      **/
     SSTABLE_VERSIONS,
+    INDEX_STATUS,
     // DO NOT EDIT OR REMOVE PADDING STATES BELOW - only add new states above.  See CASSANDRA-16484
     X1,
     X2,
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 880cb98e067f..938ba5348c3f 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -245,6 +245,11 @@ public VersionedValue shutdown(boolean value)
             return new VersionedValue(VersionedValue.SHUTDOWN + VersionedValue.DELIMITER + value);
         }
 
+        public VersionedValue indexStatus(String status)
+        {
+            return new VersionedValue(status); // the status string is wrapped as-is, without any prefix or delimiter
+        }
+
         public VersionedValue datacenter(String dcId)
         {
             return new VersionedValue(dcId);
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
index e9d3d3c3d332..fb2d248c2900 100644
--- a/src/java/org/apache/cassandra/index/Index.java
+++ b/src/java/org/apache/cassandra/index/Index.java
@@ -20,17 +20,22 @@
  */
 package org.apache.cassandra.index;
 
+import java.nio.ByteBuffer;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Optional;
 import java.util.Set;
 import java.util.concurrent.Callable;
-import java.util.function.BiFunction;
+import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
 
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
@@ -39,11 +44,16 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.index.internal.CollatedViewIndexBuilder;
 import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.ReducingKeyIterator;
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+
 
 /**
  * Consisting of a top level Index interface and two sub-interfaces which handle read and write operations,
@@ -163,7 +173,7 @@ public boolean supportsReads()
      */
     interface IndexBuildingSupport
     {
-        SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables);
+        SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables, boolean isFullRebuild);
     }
 
     /**
@@ -173,7 +183,7 @@ interface IndexBuildingSupport
     public static class CollatedViewIndexBuildingSupport implements IndexBuildingSupport
     {
         @SuppressWarnings("resource")
-        public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables)
+        public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables, boolean isFullRebuild)
         {
             return new CollatedViewIndexBuilder(cfs, indexes, new ReducingKeyIterator(sstables), sstables);
         }
@@ -315,11 +325,11 @@ default public Callable<?> getPreJoinTask(boolean hadBootstrap)
      * Get flush observer to observe partition/cell events generated by flushing SSTable (memtable flush or compaction).
      *
      * @param descriptor The descriptor of the sstable observer is requested for.
-     * @param opType The type of the operation which requests observer e.g. memtable flush or compaction.
+     * @param tracker The {@link LifecycleNewTracker} associated with the SSTable being written
      *
      * @return SSTable flush observer.
      */
-    default SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationType opType)
+    default SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker)
     {
         return null;
     }
@@ -364,6 +374,23 @@ default SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationTy
      */
     public AbstractType<?> customExpressionValueType();
 
+    /**
+     * If the index supports custom search expressions using the
+     * {@code SELECT * FROM table WHERE expr(index_name, expression)} syntax, this method should return a new
+     * {@link RowFilter.CustomExpression} for the specified expression value. Index implementations may provide their
+     * own implementations using method {@link RowFilter.CustomExpression#isSatisfiedBy(TableMetadata, DecoratedKey, Row)}
+     * to filter reconciled rows in the coordinator. Otherwise, the default implementation will accept all rows.
+     * See DB-2185 and DSP-16537 for further details.
+     *
+     * @param metadata the indexed table metadata
+     * @param value the custom expression value
+     * @return a custom index expression for the specified value; by default a plain {@link RowFilter.CustomExpression} built with {@link #getIndexMetadata()}
+     */
+    default RowFilter.CustomExpression customExpressionFor(TableMetadata metadata, ByteBuffer value)
+    {
+        return new RowFilter.CustomExpression(metadata, getIndexMetadata(), value);
+    }
+
     /**
      * Transform an initial RowFilter into the filter that will still need to applied
      * to a set of Rows after the index has performed it's initial scan.
@@ -386,6 +413,17 @@ default SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationTy
      */
     public long getEstimatedResultRows();
 
+    /**
+     * Check if current index is queryable based on the index status.
+     *
+     * @param status current status of the index
+     * @return true if index should be queryable, false if index should be non-queryable
+     */
+    default boolean isQueryable(Status status)
+    {
+        return true;
+    }
+
     /*
      * Input validation
      */
@@ -401,6 +439,16 @@ default SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationTy
      */
     public void validate(PartitionUpdate update) throws InvalidRequestException;
 
+    /**
+     * Returns the SSTable-attached {@link Component}s created by this index.
+     *
+     * @return the SSTable components created by this index
+     */
+    default Set<Component> getComponents()
+    {
+        return Collections.emptySet();
+    }
+
     /*
      * Update processing
      */
@@ -416,6 +464,7 @@ default SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationTy
      * @param ctx WriteContext spanning the update operation
      * @param transactionType indicates what kind of update is being performed on the base data
      *                        i.e. a write time insert/update/delete or the result of compaction
+     * @param memtable The current memtable that is the source of the updates
      * @return the newly created indexer or {@code null} if the index is not interested by the update
      * (this could be because the index doesn't care about that particular partition, doesn't care about
      * that type of transaction, ...).
@@ -424,7 +473,8 @@ public Indexer indexerFor(DecoratedKey key,
                               RegularAndStaticColumns columns,
                               int nowInSec,
                               WriteContext ctx,
-                              IndexTransaction.Type transactionType);
+                              IndexTransaction.Type transactionType,
+                              Memtable memtable);
 
     /**
      * Listener for processing events emitted during a single partition update.
@@ -557,27 +607,12 @@ default void validate(ReadCommand command) throws InvalidRequestException
      * @param rowFilter rowFilter of query to decide if it supports replica filtering protection or not
      * @return true if this index supports replica filtering protection, false otherwise
      */
+    //TODO Need to confirm whether SAI needs to implement this as false
     default boolean supportsReplicaFilteringProtection(RowFilter rowFilter)
     {
         return true;
     }
 
-    /**
-     * Return a function which performs post processing on the results of a partition range read command.
-     * In future, this may be used as a generalized mechanism for transforming results on the coordinator prior
-     * to returning them to the caller.
-     *
-     * This is used on the coordinator during execution of a range command to perform post
-     * processing of merged results obtained from the necessary replicas. This is the only way in which results are
-     * transformed in this way but this may change over time as usage is generalized.
-     * See CASSANDRA-8717 for further discussion.
-     *
-     * The function takes a PartitionIterator of the results from the replicas which has already been collated
-     * and reconciled, along with the command being executed. It returns another PartitionIterator containing the results
-     * of the transformation (which may be the same as the input if the transformation is a no-op).
-     */
-    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command);
-
     /**
      * Factory method for query time search helper.
      *
@@ -599,4 +634,261 @@ public interface Searcher
          */
         public UnfilteredPartitionIterator search(ReadExecutionController executionController);
     }
+
+    /**
+     * Class providing grouped operations for indexes that communicate with each other.
+     *
+     * Index implementations should provide a {@code Group} implementation by calling
+     * {@link SecondaryIndexManager#registerIndex(Index, Object, Supplier)} when registering the index
+     * in their {@link #register(IndexRegistry)} method.
+     */
+    interface Group
+    {
+        /**
+         * Returns the indexes that are members of this group.
+         *
+         * @return the indexes that are members of this group
+         */
+        Set<Index> getIndexes();
+
+        /**
+         * Adds the specified {@link Index} as a member of this group.
+         *
+         * @param index the index to be added
+         */
+        void addIndex(Index index);
+
+        /**
+         * Removes the specified {@link Index} from the members of this group.
+         *
+         * @param index the index to be removed
+         */
+        void removeIndex(Index index);
+
+        /**
+         * Returns if this group contains the specified {@link Index}.
+         *
+         * @param index the index to be checked
+         * @return {@code true} if this group contains {@code index}, {@code false} otherwise
+         */
+        boolean containsIndex(Index index);
+
+        /**
+         * Creates a new {@code Indexer} object for updates to a given partition.
+         *
+         * @param indexSelector a predicate selecting the targeted members
+         * @param key key of the partition being modified
+         * @param columns the regular and static columns the created indexer will have to deal with.
+         * This can be empty as an update might only contain partition, range and row deletions, but
+         * the indexer is guaranteed to not get any cells for a column that is not part of {@code columns}.
+         * @param nowInSec current time of the update operation
+         * @param ctx WriteContext spanning the update operation
+         * @param transactionType indicates what kind of update is being performed on the base data
+         *                        i.e. a write time insert/update/delete or the result of compaction
+         * @param memtable the {@link Memtable} to which the updates are being applied or {@code null}
+         *                 if the source of the updates is an existing {@link SSTable}
+         *
+         * @return the newly created indexer or {@code null} if the index is not interested by the update
+         * (this could be because the index doesn't care about that particular partition, doesn't care about
+         * that type of transaction, ...).
+         */
+        Indexer indexerFor(Predicate<Index> indexSelector,
+                           DecoratedKey key,
+                           RegularAndStaticColumns columns,
+                           int nowInSec,
+                           WriteContext ctx,
+                           IndexTransaction.Type transactionType,
+                           Memtable memtable);
+
+        /**
+         * Returns a new {@link QueryPlan} for the specified {@link RowFilter}, or {@code null} if none of the indexes in
+         * this group supports the expression in the row filter.
+         *
+         * @param rowFilter a row filter
+         * @return a new query plan for the specified {@link RowFilter} if it's supported, {@code null} otherwise
+         */
+        @Nullable
+        QueryPlan queryPlanFor(RowFilter rowFilter);
+
+        /**
+         * Get flush observer to observe partition/cell events generated by flushing SSTable (memtable flush or compaction).
+         *
+         * @param descriptor The descriptor of the sstable observer is requested for.
+         * @param tracker The {@link LifecycleNewTracker} associated with the SSTable being written
+         * @param tableMetadata The immutable metadata of the table at the moment the SSTable is flushed
+         *
+         * @return SSTable flush observer.
+         */
+        SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata);
+
+        /**
+         * @param type index transaction type
+         * @return true if index will be able to handle given index transaction type
+         */
+        default boolean handles(IndexTransaction.Type type)
+        {
+            return true;
+        }
+
+        /**
+         * Called when the table associated with this group has been invalidated. Implementations
+         * should dispose of any resources tied to the lifecycle of the {@link Group}.
+         */
+        default void invalidate() { }
+
+        /**
+         * Returns the SSTable-attached {@link Component}s created by this index group.
+         *
+         * @return the SSTable components created by this group
+         */
+        Set<Component> getComponents();
+
+        /**
+         * @return true if this index group is capable of supporting multiple contains restrictions, false otherwise
+         */
+        default boolean supportsMultipleContains()
+        {
+            return false;
+        }
+    }
+
+    /**
+     * Specifies a set of compatible indexes to be used with a query according to its {@link RowFilter}, ignoring data
+     * ranges, limits, etc. All the indexes in it should belong to the same {@link Group}.
+     * <p>
+     * It's created by {@link Group#queryPlanFor} from the {@link RowFilter} that is common to all the subcommands of a
+     * user query's {@link ReadCommand}, so it can be reused by those subcommands along the cluster nodes. The
+     * {@link #searcherFor(ReadCommand)} method provides the {@link Searcher} object to read the index for each
+     * particular (sub)command.
+     */
+    interface QueryPlan extends Comparable<QueryPlan>
+    {
+        /**
+         * Returns the indexes selected by this query plan, all of them belonging to the same {@link Group}.
+         *
+         * It should never be empty.
+         *
+         * @return the indexes selected by this query plan, which is never empty
+         */
+        Set<Index> getIndexes();
+
+        /**
+         * Returns the first index in this plan.
+         *
+         * @return the first index
+         */
+        @Nonnull
+        default Index getFirst()
+        {
+            return getIndexes().iterator().next();
+        }
+
+        /**
+         * Return an estimate of the number of results this plan is expected to return for any given {@link ReadCommand}
+         * that it can be used to answer. Used by  {@link SecondaryIndexManager#getBestIndexQueryPlanFor(RowFilter)}
+         * to determine the {@link Group} with the most selective plan for a given {@link RowFilter}.
+         * Additionally, this is also used by StorageProxy.estimateResultsPerRange to calculate the initial concurrency
+         * factor for range requests
+         *
+         * @return the estimated average number of results a Searcher may return for any given command
+         */
+        default long getEstimatedResultRows()
+        {
+            // CQL only supports AND expressions, so the estimated number of results for multiple indexes will be the
+            // lowest of the estimates for each index
+            return getIndexes().stream()
+                               .mapToLong(Index::getEstimatedResultRows)
+                               .min()
+                               .orElseThrow(AssertionError::new); // registered groups are never empty
+        }
+
+        /**
+         * Used to determine whether to estimate initial concurrency during remote range reads. Default is true, each
+         * implementation must override this method if they choose a different strategy (e.g. StorageAttachedIndexQueryPlan).
+         *
+         * @return true if the {@link QueryPlan} should estimate initial concurrency, false otherwise
+         */
+        default boolean shouldEstimateInitialConcurrency()
+        {
+            return true;
+        }
+
+        @Override
+        default int compareTo(QueryPlan other)
+        {
+            // initially, we prefer the plan with fewer estimated results
+            int results = Long.compare(getEstimatedResultRows(), other.getEstimatedResultRows());
+            if (results != 0)
+                return results;
+
+            // In case of having the same number of estimated results, we favour the plan that involves more indexes.
+            // This way, we honour the possible absence of ALLOW FILTERING in the CQL query. Also, this criterion should
+            // not break the transitivity of this method because the estimated number of results for a plan is the
+            // minimum of the estimates of its members.
+            return Integer.compare(getIndexes().size(), other.getIndexes().size());
+        }
+
+        /**
+         * Used to validate the various parameters of a supplied {@link ReadCommand} against the indexes in this plan.
+         *
+         * @param command a ReadCommand whose parameters are to be verified
+         * @throws InvalidRequestException if the details of the command fail to meet the validation rules of the
+         * indexes in the query plan
+         */
+        default void validate(ReadCommand command) throws InvalidRequestException
+        {
+            getIndexes().forEach(i -> i.validate(command));
+        }
+
+        /**
+         * Factory method for query time search helper.
+         *
+         * @param command the read command being executed
+         * @return a Searcher with which to perform the supplied command
+         */
+        Searcher searcherFor(ReadCommand command);
+
+        /**
+         * Return a function which performs post processing on the results of a partition range read command.
+         * In future, this may be used as a generalized mechanism for transforming results on the coordinator prior
+         * to returning them to the caller.
+         *
+         * This is used on the coordinator during execution of a range command to perform post
+         * processing of merged results obtained from the necessary replicas. This is the only way in which results are
+         * transformed in this way but this may change over time as usage is generalized.
+         * See CASSANDRA-8717 for further discussion.
+         *
+         * The function takes a PartitionIterator of the results from the replicas which has already been collated
+         * and reconciled, along with the command being executed. It returns another PartitionIterator containing the results
+         * of the transformation (which may be the same as the input if the transformation is a no-op).
+         */
+        default Function<PartitionIterator, PartitionIterator> postProcessor()
+        {
+            return partitions -> partitions;
+        }
+
+        /**
+         * Transform an initial {@link RowFilter} into the filter that will still need to be applied to a set of Rows
+         * after the index has performed its initial scan.
+         *
+         * Used in {@link ReadCommand#executeLocally(ReadExecutionController)} to reduce the amount of filtering performed on the
+         * results of the index query.
+         *
+         * @return the (hopefully) reduced filter that would still need to be applied after
+         *         the index was used to narrow the initial result set
+         */
+        RowFilter postIndexQueryFilter();
+    }
+
+    /*
+     * Status of index used to determine queryability
+     */
+    enum Status
+    {
+        UNKNOWN,
+        FULL_REBUILD_STARTED,
+        BUILD_FAILED,
+        BUILD_SUCCEEDED,
+        DROPPED
+    }
 }
diff --git a/src/java/org/apache/cassandra/index/IndexRegistry.java b/src/java/org/apache/cassandra/index/IndexRegistry.java
index 0cf1cbb6c2a5..7ff4799693c0 100644
--- a/src/java/org/apache/cassandra/index/IndexRegistry.java
+++ b/src/java/org/apache/cassandra/index/IndexRegistry.java
@@ -23,19 +23,29 @@
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.function.BiFunction;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+
+import javax.annotation.Nullable;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.TableMetadata;
@@ -56,19 +66,20 @@ public interface IndexRegistry
     public static final IndexRegistry EMPTY = new IndexRegistry()
     {
         @Override
-        public void unregisterIndex(Index index)
+        public void registerIndex(Index index, Object groupKey, Supplier<Index.Group> groupSupplier)
         {
         }
 
         @Override
-        public void registerIndex(Index index)
+        public Collection<Index> listIndexes()
         {
+            return Collections.emptyList();
         }
 
         @Override
-        public Collection<Index> listIndexes()
+        public Collection<Index.Group> listIndexGroups()
         {
-            return Collections.emptyList();
+            return Collections.emptySet();
         }
 
         @Override
@@ -173,23 +184,70 @@ public void validate(PartitionUpdate update) throws InvalidRequestException
             {
             }
 
-            public Indexer indexerFor(DecoratedKey key, RegularAndStaticColumns columns, int nowInSec, WriteContext ctx, IndexTransaction.Type transactionType)
+            public Indexer indexerFor(DecoratedKey key, RegularAndStaticColumns columns, int nowInSec, WriteContext ctx, IndexTransaction.Type transactionType, Memtable memtable)
             {
                 return null;
             }
 
-            public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command)
+            public Searcher searcherFor(ReadCommand command)
             {
                 return null;
             }
+        };
 
-            public Searcher searcherFor(ReadCommand command)
+        Index.Group group = new Index.Group()
+        {
+            @Override
+            public Set<Index> getIndexes()
+            {
+                return Collections.singleton(index);
+            }
+
+            @Override
+            public void addIndex(Index index)
+            {
+            }
+
+            @Override
+            public void removeIndex(Index index)
+            {
+            }
+
+            @Override
+            public boolean containsIndex(Index i)
+            {
+                return index == i;
+            }
+
+            @Nullable
+            @Override
+            public Index.Indexer indexerFor(Predicate<Index> indexSelector, DecoratedKey key, RegularAndStaticColumns columns, int nowInSec, WriteContext ctx, IndexTransaction.Type transactionType, Memtable memtable)
+            {
+                return null;
+            }
+
+            @Nullable
+            @Override
+            public Index.QueryPlan queryPlanFor(RowFilter rowFilter)
+            {
+                return null;
+            }
+
+            @Nullable
+            @Override
+            public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata)
+            {
+                return null;
+            }
+
+            @Override
+            public Set<Component> getComponents()
             {
                 return null;
             }
         };
 
-        public void registerIndex(Index index)
+        public void registerIndex(Index index, Object groupKey, Supplier<Index.Group> groupSupplier)
         {
         }
 
@@ -207,6 +265,12 @@ public Collection<Index> listIndexes()
             return Collections.singletonList(index);
         }
 
+        @Override
+        public Collection<Index.Group> listIndexGroups()
+        {
+            return Collections.singletonList(group);
+        }
+
         public Optional<Index> getBestIndexFor(RowFilter.Expression expression)
         {
             return Optional.empty();
@@ -218,8 +282,12 @@ public void validate(PartitionUpdate update)
         }
     };
 
-    void registerIndex(Index index);
-    void unregisterIndex(Index index);
+    default void registerIndex(Index index)
+    {
+        registerIndex(index, index, () -> new SingletonIndexGroup(index));
+    }
+    public void registerIndex(Index index, Object groupKey, Supplier<Index.Group> groupSupplier);
+    Collection<Index.Group> listIndexGroups();
 
     Index getIndex(IndexMetadata indexMetadata);
     Collection<Index> listIndexes();
diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
index e9b22ef6d561..95d7f67143ff 100644
--- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
@@ -22,9 +22,13 @@
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Function;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 
+import javax.annotation.Nullable;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Joiner;
 import com.google.common.base.Strings;
@@ -33,7 +37,6 @@
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
-import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.FutureCallback;
 import com.google.common.util.concurrent.Futures;
 import com.google.common.util.concurrent.ListenableFuture;
@@ -56,26 +59,34 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
-import org.apache.cassandra.db.marshal.ValueAccessor;
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.index.Index.IndexBuildingSupport;
+import org.apache.cassandra.exceptions.ReadFailureException;
+import org.apache.cassandra.exceptions.RequestFailureReason;
+import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.gms.VersionedValue;
 import org.apache.cassandra.index.internal.CassandraIndex;
 import org.apache.cassandra.index.transactions.*;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.locator.Endpoints;
+import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.notifications.INotification;
 import org.apache.cassandra.notifications.INotificationConsumer;
 import org.apache.cassandra.notifications.SSTableAddedNotification;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.Indexes;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.pager.SinglePartitionPager;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.concurrent.Refs;
+import org.json.simple.JSONValue;
 
 import static org.apache.cassandra.utils.ExecutorUtils.awaitTermination;
 import static org.apache.cassandra.utils.ExecutorUtils.shutdown;
@@ -132,6 +143,11 @@ public class SecondaryIndexManager implements IndexRegistry, INotificationConsum
     // default page size (in rows) when rebuilding the index for a whole partition
     public static final int DEFAULT_PAGE_SIZE = 10000;
 
+    // store per-endpoint index status: the key of inner map is identifier "keyspace.index"
+    public static final Map<InetAddressAndPort, Map<String, Index.Status>> peerIndexStatus = new ConcurrentHashMap<>();
+    // executes index status propagation task asynchronously to avoid potential deadlock on SIM
+    private static final ExecutorService statusPropagationExecutor = Executors.newSingleThreadExecutor();
+
     /**
      * All registered indexes.
      */
@@ -152,6 +168,11 @@ public class SecondaryIndexManager implements IndexRegistry, INotificationConsum
      */
     private final Map<String, Index> writableIndexes = Maps.newConcurrentMap();
 
+    /**
+     * The groups of all the registered indexes
+     */
+    private final Map<Object, Index.Group> indexGroups = Maps.newConcurrentMap();
+
     /**
      * The count of pending index builds for each index.
      */
@@ -219,9 +240,15 @@ private synchronized Future<?> createIndex(IndexMetadata indexDef, boolean isNew
 
         markIndexesBuilding(ImmutableSet.of(index), true, isNewCF);
 
+        return buildIndex(index);
+    }
+
+    @VisibleForTesting
+    public Future<?> buildIndex(final Index index)
+    {
         Callable<?> initialBuildTask = null;
         // if the index didn't register itself, we can probably assume that no initialization needs to happen
-        if (indexes.containsKey(indexDef.name))
+        if (indexes.containsKey(index.getIndexMetadata().name))
         {
             try
             {
@@ -267,7 +294,7 @@ public void onSuccess(Object o)
      * Adds and builds a index
      *
      * @param indexDef the IndexMetadata describing the index
-     * @param isNewCF true if the index is added as part of a new table/columnfamily (i.e. loading a CF at startup), 
+     * @param isNewCF true if the index is added as part of a new table/columnfamily (i.e. loading a CF at startup),
      * false for all other cases (i.e. newly added index)
      */
     public synchronized Future<?> addIndex(IndexMetadata indexDef, boolean isNewCF)
@@ -278,6 +305,22 @@ public synchronized Future<?> addIndex(IndexMetadata indexDef, boolean isNewCF)
             return createIndex(indexDef, isNewCF);
     }
 
+    /**
+     * Throws an {@link IndexNotAvailableException} if any of the indexes in the specified {@link Index.QueryPlan} is
+     * not queryable, as it's defined by {@link #isIndexQueryable(Index)}.
+     *
+     * @param queryPlan a query plan
+     * @throws IndexNotAvailableException if the query plan has any index that is not queryable
+     */
+    public void checkQueryability(Index.QueryPlan queryPlan)
+    {
+        for (Index index : queryPlan.getIndexes())
+        {
+            if (!isIndexQueryable(index))
+                throw new IndexNotAvailableException(index);
+        }
+    }
+
     /**
      * Checks if the specified index is queryable.
      *
@@ -497,7 +540,7 @@ private void buildIndexesBlocking(Collection<SSTableReader> sstables, Set<Index>
         {
             logger.info("Submitting index {} of {} for data in {}",
                         isFullRebuild ? "recovery" : "build",
-                        indexes.stream().map(i -> i.getIndexMetadata().name).collect(Collectors.joining(",")),
+                        commaSeparated(indexes),
                         sstables.stream().map(SSTableReader::toString).collect(Collectors.joining(",")));
 
             // Group all building tasks
@@ -515,7 +558,7 @@ private void buildIndexesBlocking(Collection<SSTableReader> sstables, Set<Index>
             List<Future<?>> futures = new ArrayList<>(byType.size());
             byType.forEach((buildingSupport, groupedIndexes) ->
                            {
-                               SecondaryIndexBuilder builder = buildingSupport.getIndexBuildTask(baseCfs, groupedIndexes, sstables);
+                               SecondaryIndexBuilder builder = buildingSupport.getIndexBuildTask(baseCfs, groupedIndexes, sstables, isFullRebuild);
                                final SettableFuture build = SettableFuture.create();
                                Futures.addCallback(CompactionManager.instance.submitIndexBuild(builder), new FutureCallback()
                                {
@@ -624,7 +667,7 @@ private String getIndexNames(Set<Index> indexes)
      *
      * @param indexes the index to be marked as building
      * @param isFullRebuild {@code true} if this method is invoked as a full index rebuild, {@code false} otherwise
-     * @param isNewCF {@code true} if this method is invoked when initializing a new table/columnfamily (i.e. loading a CF at startup), 
+     * @param isNewCF {@code true} if this method is invoked when initializing a new table/columnfamily (i.e. loading a CF at startup),
      * {@code false} for all other cases (i.e. newly added index)
      */
     private synchronized void markIndexesBuilding(Set<Index> indexes, boolean isFullRebuild, boolean isNewCF)
@@ -650,7 +693,10 @@ private synchronized void markIndexesBuilding(Set<Index> indexes, boolean isFull
                             AtomicInteger counter = inProgressBuilds.computeIfAbsent(indexName, ignored -> new AtomicInteger(0));
 
                             if (isFullRebuild)
+                            {
                                 needsFullRebuild.remove(indexName);
+                                makeIndexNonQueryable(index, Index.Status.FULL_REBUILD_STARTED);
+                            }
 
                             if (counter.getAndIncrement() == 0 && DatabaseDescriptor.isDaemonInitialized() && !isNewCF)
                                 SystemKeyspace.setIndexRemoved(keyspaceName, indexName);
@@ -668,14 +714,8 @@ private synchronized void markIndexBuilt(Index index, boolean isFullRebuild)
     {
         String indexName = index.getIndexMetadata().name;
         if (isFullRebuild)
-        {
-            if (queryableIndexes.add(indexName))
-                logger.info("Index [{}] became queryable after successful build.", indexName);
+            makeIndexQueryable(index, Index.Status.BUILD_SUCCEEDED);
 
-            if (writableIndexes.put(indexName, index) == null)
-                logger.info("Index [{}] became writable after successful build.", indexName);
-        }
-        
         AtomicInteger counter = inProgressBuilds.get(indexName);
         if (counter != null)
         {
@@ -742,6 +782,8 @@ private synchronized void markIndexRemoved(String indexName)
         writableIndexes.remove(indexName);
         needsFullRebuild.remove(indexName);
         inProgressBuilds.remove(indexName);
+        // remove existing indexing status
+        propagateLocalIndexStatus(keyspace.getName(), indexName, Index.Status.DROPPED);
     }
 
     public Index getIndexByName(String indexName)
@@ -755,8 +797,10 @@ private Index createInstance(IndexMetadata indexDef)
         if (indexDef.isCustom())
         {
             assert indexDef.options != null;
-            String className = indexDef.options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME);
+            // Find any aliases to the fully qualified index class name:
+            String className = IndexMetadata.expandAliases(indexDef.options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME));
             assert !Strings.isNullOrEmpty(className);
+
             try
             {
                 Class<? extends Index> indexClass = FBUtilities.classForName(className, "Index");
@@ -790,6 +834,8 @@ public void dropAllIndexes()
     {
         markAllIndexesRemoved();
         invalidateAllIndexesBlocking();
+
+        indexGroups.forEach((key, group) -> group.invalidate());
     }
 
     @VisibleForTesting
@@ -917,14 +963,16 @@ public void indexPartition(DecoratedKey key, Set<Index> indexes, int pageSize)
 
                     try (UnfilteredRowIterator partition = page.next())
                     {
-                        Set<Index.Indexer> indexers = indexes.stream()
-                                                             .map(index -> index.indexerFor(key,
-                                                                                            partition.columns(),
-                                                                                            nowInSec,
-                                                                                            ctx,
-                                                                                            IndexTransaction.Type.UPDATE))
-                                                             .filter(Objects::nonNull)
-                                                             .collect(Collectors.toSet());
+                        Set<Index.Indexer> indexers = indexGroups.values().stream()
+                                                                 .map(g -> g.indexerFor(indexes::contains,
+                                                                                        key,
+                                                                                        partition.columns(),
+                                                                                        nowInSec,
+                                                                                        ctx,
+                                                                                        IndexTransaction.Type.UPDATE,
+                                                                                        null))
+                                                                 .filter(Objects::nonNull)
+                                                                 .collect(Collectors.toSet());
 
                         // Short-circuit empty partitions if static row is processed or isn't read
                         if (!readStatic && partition.isEmpty() && partition.staticRow().isEmpty())
@@ -1023,6 +1071,9 @@ public int calculateIndexingPageSize()
      */
     public void deletePartition(UnfilteredRowIterator partition, int nowInSec)
     {
+        if (!handles(IndexTransaction.Type.CLEANUP))
+            return;
+
         // we need to acquire memtable lock because secondary index deletion may
         // cause a race (see CASSANDRA-3712). This is done internally by the
         // index transaction when it commits
@@ -1070,15 +1121,14 @@ public void deletePartition(UnfilteredRowIterator partition, int nowInSec)
      * ReadOrderGroup, or an estimate of the result size from an average index query.
      *
      * @param rowFilter RowFilter of the command to be executed
-     * @return an Index instance, ready to use during execution of the command, or null if none
-     * of the registered indexes can support the command.
+     * @return the best available index query plan for the row filter, or {@code null} if none of the registered indexes
+     * can support the command.
      */
-    public Index getBestIndexFor(RowFilter rowFilter)
+    public Index.QueryPlan getBestIndexQueryPlanFor(RowFilter rowFilter)
     {
         if (indexes.isEmpty() || rowFilter.isEmpty())
             return null;
 
-        Set<Index> searchableIndexes = new HashSet<>();
         for (RowFilter.Expression expression : rowFilter)
         {
             if (expression.isCustom())
@@ -1088,46 +1138,63 @@ public Index getBestIndexFor(RowFilter rowFilter)
                 RowFilter.CustomExpression customExpression = (RowFilter.CustomExpression) expression;
                 logger.trace("Command contains a custom index expression, using target index {}", customExpression.getTargetIndex().name);
                 Tracing.trace("Command contains a custom index expression, using target index {}", customExpression.getTargetIndex().name);
-                return indexes.get(customExpression.getTargetIndex().name);
-            }
-            else if (!expression.isUserDefined())
-            {
-                indexes.values().stream()
-                       .filter(index -> index.supportsExpression(expression.column(), expression.operator()))
-                       .forEach(searchableIndexes::add);
+                Index.Group group = getIndexGroup(customExpression.getTargetIndex());
+                return group == null ? null : group.queryPlanFor(rowFilter);
             }
         }
 
-        if (searchableIndexes.isEmpty())
+        Set<Index.QueryPlan> queryPlans = indexGroups.values()
+                                                     .stream()
+                                                     .map(g -> g.queryPlanFor(rowFilter))
+                                                     .filter(Objects::nonNull)
+                                                     .collect(Collectors.toSet());
+
+        if (queryPlans.isEmpty())
         {
             logger.trace("No applicable indexes found");
             Tracing.trace("No applicable indexes found");
             return null;
         }
 
-        Index selected = searchableIndexes.size() == 1
-                         ? Iterables.getOnlyElement(searchableIndexes)
-                         : searchableIndexes.stream()
-                                            .min((a, b) -> Longs.compare(a.getEstimatedResultRows(),
-                                                                         b.getEstimatedResultRows()))
-                                            .orElseThrow(() -> new AssertionError("Could not select most selective index"));
+        // find the best plan
+        Index.QueryPlan selected = queryPlans.size() == 1
+                                   ? Iterables.getOnlyElement(queryPlans)
+                                   : queryPlans.stream()
+                                               .min(Comparator.naturalOrder())
+                                               .orElseThrow(() -> new AssertionError("Could not select most selective index"));
 
         // pay for an additional threadlocal get() rather than build the strings unnecessarily
         if (Tracing.isTracing())
         {
             Tracing.trace("Index mean cardinalities are {}. Scanning with {}.",
-                          searchableIndexes.stream().map(i -> i.getIndexMetadata().name + ':' + i.getEstimatedResultRows())
-                                           .collect(Collectors.joining(",")),
-                          selected.getIndexMetadata().name);
+                          queryPlans.stream()
+                                    .map(p -> commaSeparated(p.getIndexes()) + ':' + p.getEstimatedResultRows())
+                                    .collect(Collectors.joining(",")),
+                          commaSeparated(selected.getIndexes()));
         }
         return selected;
     }
 
+    private static String commaSeparated(Collection<Index> indexes)
+    {
+        return indexes.stream().map(Index::getIndexMetadata).map(metadata -> metadata.name).collect(Collectors.joining(","));
+    }
+
+
     public Optional<Index> getBestIndexFor(RowFilter.Expression expression)
     {
         return indexes.values().stream().filter((i) -> i.supportsExpression(expression.column(), expression.operator())).findFirst();
     }
 
+    public <T extends Index> Optional<T> getBestIndexFor(RowFilter.Expression expression, Class<T> indexType)
+    {
+        // Narrow to the requested index type first, then keep the first one supporting the expression.
+        return indexes.values().stream()
+                      .filter(indexType::isInstance).map(indexType::cast)
+                      .filter(typed -> typed.supportsExpression(expression.column(), expression.operator()))
+                      .findFirst();
+    }
+
     /**
      * Called at write time to ensure that values present in the update
      * are valid according to the rules of all registered indexes which
@@ -1147,23 +1214,50 @@ public void validate(PartitionUpdate update) throws InvalidRequestException
     /*
      * IndexRegistry methods
      */
-
-    public void registerIndex(Index index)
+    public void registerIndex(Index index, Object groupKey, Supplier<Index.Group> groupSupplier)
     {
         String name = index.getIndexMetadata().name;
         indexes.put(name, index);
         logger.trace("Registered index {}", name);
-    }
 
-    public void unregisterIndex(Index index)
-    {
-        unregisterIndex(index.getIndexMetadata().name);
+        // instantiate and add the index group if it hasn't been already added
+        Index.Group group = indexGroups.computeIfAbsent(groupKey, k -> groupSupplier.get());
+
+        // add the created index to its group if it is not a singleton group
+        if (!(group instanceof SingletonIndexGroup))
+        {
+            if (index.getBackingTable().isPresent())
+                throw new InvalidRequestException("Indexes belonging to a group of indexes shouldn't have a backing table");
+
+            group.addIndex(index);
+        }
     }
 
     private Index unregisterIndex(String name)
     {
         Index removed = indexes.remove(name);
         logger.trace(removed == null ? "Index {} was not registered" : "Removed index {} from registry", name);
+
+        if (removed != null)
+        {
+            // Remove the index from any non-singleton groups...
+            for (Index.Group group : listIndexGroups())
+            {
+                if (!(group instanceof SingletonIndexGroup) && group.containsIndex(removed))
+                {
+                    group.removeIndex(removed);
+                    // indexGroups is keyed by the registration key, so an emptied group has to be removed by value
+                    if (group.getIndexes().isEmpty())
+                    {
+                        indexGroups.values().remove(group);
+                    }
+                }
+            }
+
+            // ...and remove singleton groups entirely (relies on them being keyed by the index itself - TODO confirm).
+            indexGroups.remove(removed);
+        }
+
         return removed;
     }
 
@@ -1177,6 +1271,42 @@ public Collection<Index> listIndexes()
         return ImmutableSet.copyOf(indexes.values());
     }
 
+    public Set<Index.Group> listIndexGroups()
+    {
+        return ImmutableSet.copyOf(indexGroups.values());
+    }
+
+    public Index.Group getIndexGroup(Object key)
+    {
+        return indexGroups.get(key);
+    }
+
+    /**
+     * Returns the {@link Index.Group} the specified index belongs to, as specified during registering with
+     * {@link #registerIndex(Index, Object, Supplier)}.
+     *
+     * @param metadata the index metadata
+     * @return the group the index belongs to, or {@code null} if the index is not registered or if it hasn't been
+     * associated to any group
+     */
+    @Nullable
+    public Index.Group getIndexGroup(IndexMetadata metadata)
+    {
+        Index index = getIndex(metadata);
+        return index == null ? null : getIndexGroup(index);
+    }
+
+    @VisibleForTesting
+    public boolean needsFullRebuild(String index)
+    {
+        return needsFullRebuild.contains(index);
+    }
+
+    public Index.Group getIndexGroup(Index index)
+    {
+        return indexGroups.values().stream().filter(g -> g.containsIndex(index)).findAny().orElse(null);
+    }
+
     /*
      * Handling of index updates.
      * Implementations of the various IndexTransaction interfaces, for keeping indexes in sync with base data
@@ -1186,23 +1316,28 @@ public Collection<Index> listIndexes()
     /**
      * Transaction for updates on the write path.
      */
-    public UpdateTransaction newUpdateTransaction(PartitionUpdate update, WriteContext ctx, int nowInSec)
+    public UpdateTransaction newUpdateTransaction(PartitionUpdate update, WriteContext ctx, int nowInSec, Memtable memtable)
     {
         if (!hasIndexes())
             return UpdateTransaction.NO_OP;
-        
-        ArrayList<Index.Indexer> idxrs = new ArrayList<>();
-        for (Index i : writableIndexes.values())
-        {
-            Index.Indexer idxr = i.indexerFor(update.partitionKey(), update.columns(), nowInSec, ctx, IndexTransaction.Type.UPDATE);
-            if (idxr != null)
-                idxrs.add(idxr);
-        }
-        
-        if (idxrs.size() == 0)
-            return UpdateTransaction.NO_OP;
-        else
-            return new WriteTimeTransaction(idxrs.toArray(new Index.Indexer[idxrs.size()]));
+
+        Index.Indexer[] indexers = listIndexGroups().stream()
+                                                    .map(g -> g.indexerFor(writableIndexSelector(),
+                                                                           update.partitionKey(),
+                                                                           update.columns(),
+                                                                           nowInSec,
+                                                                           ctx,
+                                                                           IndexTransaction.Type.UPDATE,
+                                                                           memtable))
+                                                    .filter(Objects::nonNull)
+                                                    .toArray(Index.Indexer[]::new);
+
+        return indexers.length == 0 ? UpdateTransaction.NO_OP : new WriteTimeTransaction(indexers);
+    }
+
+    private Predicate<Index> writableIndexSelector()
+    {
+        return index -> writableIndexes.containsKey(index.getIndexMetadata().name);
     }
 
     /**
@@ -1214,7 +1349,7 @@ public CompactionTransaction newCompactionTransaction(DecoratedKey key,
                                                           int nowInSec)
     {
         // the check for whether there are any registered indexes is already done in CompactionIterator
-        return new IndexGCTransaction(key, regularAndStaticColumns, keyspace, versions, nowInSec, writableIndexes.values());
+        return new IndexGCTransaction(key, regularAndStaticColumns, keyspace, versions, nowInSec, listIndexGroups(), writableIndexSelector());
     }
 
     /**
@@ -1227,7 +1362,21 @@ public CleanupTransaction newCleanupTransaction(DecoratedKey key,
         if (!hasIndexes())
             return CleanupTransaction.NO_OP;
 
-        return new CleanupGCTransaction(key, regularAndStaticColumns, keyspace, nowInSec, writableIndexes.values());
+        return new CleanupGCTransaction(key, regularAndStaticColumns, keyspace, nowInSec, listIndexGroups(), writableIndexSelector());
+    }
+
+    /**
+     * Checks whether any registered index group accepts the given kind of index transaction.
+     *
+     * @param type index transaction type
+     * @return true if at least one of the indexes will be able to handle given index transaction type
+     */
+    public boolean handles(IndexTransaction.Type type)
+    {
+        // stops at the first group that handles the type, exactly like an early-returning loop
+        return indexGroups.values()
+                          .stream()
+                          .anyMatch(group -> group.handles(type));
     }
 
     /**
@@ -1281,19 +1430,19 @@ public void onUpdated(Row existing, Row updated)
             // diff listener collates the columns to be added & removed from the indexes
             RowDiffListener diffListener = new RowDiffListener()
             {
-                public void onPrimaryKeyLivenessInfo(int i, Clustering<?> clustering, LivenessInfo merged, LivenessInfo original)
+                public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
                 {
                 }
 
-                public void onDeletion(int i, Clustering<?> clustering, Row.Deletion merged, Row.Deletion original)
+                public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
                 {
                 }
 
-                public void onComplexDeletion(int i, Clustering<?> clustering, ColumnMetadata column, DeletionTime merged, DeletionTime original)
+                public void onComplexDeletion(int i, Clustering clustering, ColumnMetadata column, DeletionTime merged, DeletionTime original)
                 {
                 }
 
-                public void onCell(int i, Clustering<?> clustering, Cell<?> merged, Cell<?> original)
+                public void onCell(int i, Clustering clustering, Cell merged, Cell original)
                 {
                     if (merged != null && !merged.equals(original))
                         toInsert.addCell(merged);
@@ -1315,7 +1464,7 @@ public void commit()
                 indexer.finish();
         }
 
-        private <V1, V2> boolean shouldCleanupOldValue(Cell<V1> oldCell, Cell<V2> newCell)
+        private boolean shouldCleanupOldValue(Cell oldCell, Cell newCell)
         {
             // If either the value or timestamp is different, then we
             // should delete from the index. If not, then we can infer that
@@ -1326,7 +1475,7 @@ private <V1, V2> boolean shouldCleanupOldValue(Cell<V1> oldCell, Cell<V2> newCel
             // Completely identical cells (including expiring columns with
             // identical ttl & localExpirationTime) will not get this far due
             // to the oldCell.equals(newCell) in StandardUpdater.update
-            return !Cells.valueEqual(oldCell, newCell) || oldCell.timestamp() != newCell.timestamp();
+            return !oldCell.value().equals(newCell.value()) || oldCell.timestamp() != newCell.timestamp();
         }
     }
 
@@ -1343,7 +1492,8 @@ private static final class IndexGCTransaction implements CompactionTransaction
         private final Keyspace keyspace;
         private final int versions;
         private final int nowInSec;
-        private final Collection<Index> indexes;
+        private final Collection<Index.Group> indexGroups;
+        private final Predicate<Index> writableIndexSelector;
 
         private Row[] rows;
 
@@ -1352,14 +1502,16 @@ private IndexGCTransaction(DecoratedKey key,
                                    Keyspace keyspace,
                                    int versions,
                                    int nowInSec,
-                                   Collection<Index> indexes)
+                                   Collection<Index.Group> indexGroups,
+                                   Predicate<Index> writableIndexSelector)
         {
             this.key = key;
             this.columns = columns;
             this.keyspace = keyspace;
             this.versions = versions;
-            this.indexes = indexes;
+            this.indexGroups = indexGroups;
             this.nowInSec = nowInSec;
+            this.writableIndexSelector = writableIndexSelector;
         }
 
         public void start()
@@ -1375,27 +1527,27 @@ public void onRowMerge(Row merged, Row... versions)
             final Row.Builder[] builders = new Row.Builder[versions.length];
             RowDiffListener diffListener = new RowDiffListener()
             {
-                public void onPrimaryKeyLivenessInfo(int i, Clustering<?> clustering, LivenessInfo merged, LivenessInfo original)
+                public void onPrimaryKeyLivenessInfo(int i, Clustering clustering, LivenessInfo merged, LivenessInfo original)
                 {
                     if (original != null && (merged == null || !merged.isLive(nowInSec)))
                         getBuilder(i, clustering).addPrimaryKeyLivenessInfo(original);
                 }
 
-                public void onDeletion(int i, Clustering<?> clustering, Row.Deletion merged, Row.Deletion original)
+                public void onDeletion(int i, Clustering clustering, Row.Deletion merged, Row.Deletion original)
                 {
                 }
 
-                public void onComplexDeletion(int i, Clustering<?> clustering, ColumnMetadata column, DeletionTime merged, DeletionTime original)
+                public void onComplexDeletion(int i, Clustering clustering, ColumnMetadata column, DeletionTime merged, DeletionTime original)
                 {
                 }
 
-                public void onCell(int i, Clustering<?> clustering, Cell<?> merged, Cell<?> original)
+                public void onCell(int i, Clustering clustering, Cell merged, Cell original)
                 {
                     if (original != null && (merged == null || !merged.isLive(nowInSec)))
                         getBuilder(i, clustering).addCell(original);
                 }
 
-                private Row.Builder getBuilder(int index, Clustering<?> clustering)
+                private Row.Builder getBuilder(int index, Clustering clustering)
                 {
                     if (builders[index] == null)
                     {
@@ -1420,9 +1572,9 @@ public void commit()
 
             try (WriteContext ctx = keyspace.getWriteHandler().createContextForIndexing())
             {
-                for (Index index : indexes)
+                for (Index.Group group : indexGroups)
                 {
-                    Index.Indexer indexer = index.indexerFor(key, columns, nowInSec, ctx, Type.COMPACTION);
+                    Index.Indexer indexer = group.indexerFor(writableIndexSelector, key, columns, nowInSec, ctx, Type.COMPACTION, null);
                     if (indexer == null)
                         continue;
 
@@ -1448,7 +1600,8 @@ private static final class CleanupGCTransaction implements CleanupTransaction
         private final RegularAndStaticColumns columns;
         private final Keyspace keyspace;
         private final int nowInSec;
-        private final Collection<Index> indexes;
+        private final Collection<Index.Group> indexGroups;
+        private final Predicate<Index> writableIndexSelector;
 
         private Row row;
         private DeletionTime partitionDelete;
@@ -1457,13 +1610,15 @@ private CleanupGCTransaction(DecoratedKey key,
                                      RegularAndStaticColumns columns,
                                      Keyspace keyspace,
                                      int nowInSec,
-                                     Collection<Index> indexes)
+                                     Collection<Index.Group> indexGroups,
+                                     Predicate<Index> writableIndexSelector)
         {
             this.key = key;
             this.columns = columns;
             this.keyspace = keyspace;
-            this.indexes = indexes;
+            this.indexGroups = indexGroups;
             this.nowInSec = nowInSec;
+            this.writableIndexSelector = writableIndexSelector;
         }
 
         public void start()
@@ -1487,9 +1642,9 @@ public void commit()
 
             try (WriteContext ctx = keyspace.getWriteHandler().createContextForIndexing())
             {
-                for (Index index : indexes)
+                for (Index.Group group : indexGroups)
                 {
-                    Index.Indexer indexer = index.indexerFor(key, columns, nowInSec, ctx, Type.CLEANUP);
+                    Index.Indexer indexer = group.indexerFor(writableIndexSelector, key, columns, nowInSec, ctx, Type.CLEANUP, null);
                     if (indexer == null)
                         continue;
 
@@ -1562,4 +1717,140 @@ public static void shutdownAndWait(long timeout, TimeUnit units) throws Interrup
         shutdown(asyncExecutor, blockingExecutor);
         awaitTermination(timeout, units, asyncExecutor, blockingExecutor);
     }
+
+    /**
+     * Remove endpoints whose indexes are not queryable for the specified {@link Index.QueryPlan}.
+     *
+     * @param liveEndpoints current live endpoints where non-queryable endpoints will be removed
+     * @param keyspace to be queried
+     * @param indexQueryPlan index query plan used in the read command
+     * @param level consistency level of read command
+     * @return the live endpoints on which every index of the plan is queryable, given the status recorded for them
+     * @throws ReadFailureException if the consistency level could be met before the filtering but not after it
+     */
+    public static <E extends Endpoints<E>> E filterForQuery(E liveEndpoints, Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel level)
+    {
+        E queryableEndpoints = liveEndpoints.filter(replica -> {
+            for (Index index : indexQueryPlan.getIndexes())
+            {
+                Index.Status status = getIndexStatus(replica.endpoint(), keyspace.getName(), index.getIndexMetadata().name);
+                if (!index.isQueryable(status))
+                    return false;
+            }
+            return true;
+        });
+
+        int initial = liveEndpoints.size();
+        int filtered = queryableEndpoints.size();
+
+        // Throw ReadFailureException if read request cannot satisfy Consistency Level due to non-queryable indexes.
+        // It is to provide a better UX, compared to throwing UnavailableException when the nodes are actually alive.
+        if (initial != filtered)
+        {
+            int required = level.blockFor(keyspace.getReplicationStrategy());
+            if (required <= initial && required > filtered)
+            {
+                Map<InetAddressAndPort, RequestFailureReason> failureReasons = new HashMap<>();
+                liveEndpoints.without(queryableEndpoints.endpoints())
+                             .forEach(replica -> failureReasons.put(replica.endpoint(), RequestFailureReason.INDEX_NOT_AVAILABLE));
+
+                throw new ReadFailureException(level, filtered, required, false, failureReasons);
+            }
+        }
+
+        return queryableEndpoints;
+    }
+
+    public void makeIndexNonQueryable(Index index, Index.Status status)
+    {
+        String indexName = index.getIndexMetadata().name;
+        if (indexes.get(indexName) != index)
+            return; // not the instance currently registered under this name: nothing to update
+
+        propagateLocalIndexStatus(keyspace.getName(), indexName, status);
+        if (!index.isQueryable(status))
+            queryableIndexes.remove(indexName);
+    }
+
+    public void makeIndexQueryable(Index index, Index.Status status)
+    {
+        String indexName = index.getIndexMetadata().name;
+        if (indexes.get(indexName) != index)
+            return; // not the instance currently registered under this name: nothing to update
+
+        propagateLocalIndexStatus(keyspace.getName(), indexName, status);
+
+        // queryability depends on the index accepting the new status...
+        if (index.isQueryable(status) && queryableIndexes.add(indexName))
+            logger.info("Index [{}] became queryable after successful build.", indexName);
+
+        // ...whereas the index is marked writable regardless of it
+        if (writableIndexes.put(indexName, index) == null)
+            logger.info("Index [{}] became writable after successful build.", indexName);
+    }
+
+    public static Index.Status getIndexStatus(InetAddressAndPort peer, String keyspace, String index)
+    {
+        Map<String, Index.Status> peerStatuses = peerIndexStatus.getOrDefault(peer, Collections.emptyMap());
+        return peerStatuses.getOrDefault(identifier(keyspace, index), Index.Status.UNKNOWN);
+    }
+
+    public synchronized static void receivePeerIndexStatus(InetAddressAndPort endpoint, VersionedValue versionedValue)
+    {
+        try
+        {
+            if (endpoint.equals(FBUtilities.getBroadcastAddressAndPort()))
+                return;
+            // the value is parsed as a JSON object whose entries map an index identifier to an Index.Status name
+            Map<String, String> peerStatus = (Map<String, String>) JSONValue.parseWithException(versionedValue.value);
+            Map<String, Index.Status> indexStatus = new ConcurrentHashMap<>();
+
+            for (Map.Entry<String, String> e : peerStatus.entrySet())
+            {
+                String keyspaceIndex = e.getKey();
+                Index.Status status = Index.Status.valueOf(e.getValue());
+                indexStatus.put(keyspaceIndex, status);
+            }
+            // the freshly parsed map replaces, rather than merges into, whatever was recorded for this endpoint
+            peerIndexStatus.put(endpoint, indexStatus);
+        }
+        catch (Throwable e)
+        {
+            logger.warn("Unable to parse index status: {}", e.getMessage());
+        }
+    }
+
+    @VisibleForTesting
+    public synchronized static void propagateLocalIndexStatus(String keyspace, String index, Index.Status status)
+    {
+        try
+        {
+            Map<String, Index.Status> states = peerIndexStatus.computeIfAbsent(FBUtilities.getBroadcastAddressAndPort(),
+                                                                               k -> new ConcurrentHashMap<>());
+            String keyspaceIndex = identifier(keyspace, index);
+            // a dropped index is forgotten instead of being recorded with the DROPPED status
+            if (status == Index.Status.DROPPED)
+                states.remove(keyspaceIndex);
+            else
+                states.put(keyspaceIndex, status);
+            // the whole local status map, not just the changed entry, is serialized to JSON for the gossiper
+            String newStatus = JSONValue.toJSONString(states.entrySet().stream()
+                                                            .collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue().toString())));
+            statusPropagationExecutor.submit(() -> {
+                // schedule gossiper update asynchronously to avoid potential deadlock when another thread is holding
+                // gossiper taskLock.
+                VersionedValue value = StorageService.instance.valueFactory.indexStatus(newStatus);
+                Gossiper.instance.addLocalApplicationState(ApplicationState.INDEX_STATUS, value);
+            });
+        }
+        catch (Throwable e)
+        {
+            logger.warn("Unable to propagate index status: {}", e.getMessage());
+        }
+    }
+
+    private static String identifier(String keyspace, String index)
+    {
+        return keyspace + '.' + index; // "<keyspace>.<index>"
+    }
 }
diff --git a/src/java/org/apache/cassandra/index/SingletonIndexGroup.java b/src/java/org/apache/cassandra/index/SingletonIndexGroup.java
new file mode 100644
index 000000000000..94fb482e6bb8
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/SingletonIndexGroup.java
@@ -0,0 +1,139 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.index;
+
+import java.util.Collections;
+import java.util.Set;
+import java.util.function.Predicate;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.WriteContext;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * An {@link Index.Group} containing a single {@link Index}, to which it just delegates the calls.
+ * <p>
+ * The wrapped index is set at construction and never changes: {@link #addIndex(Index)} and
+ * {@link #removeIndex(Index)} always throw {@link UnsupportedOperationException}.
+ */
+public class SingletonIndexGroup implements Index.Group
+{
+    private final Index delegate;
+    // singleton set holding only the delegate, built once in the constructor and returned by getIndexes()
+    private final Set<Index> indexes;
+
+    /**
+     * @param delegate the single index this group wraps
+     */
+    protected SingletonIndexGroup(Index delegate)
+    {
+        this.delegate = delegate;
+        this.indexes = Collections.singleton(delegate);
+    }
+
+    @Override
+    public Set<Index> getIndexes()
+    {
+        return indexes;
+    }
+
+    /**
+     * @return the single index wrapped by this group
+     */
+    public Index getIndex()
+    {
+        return delegate;
+    }
+
+    /**
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public void addIndex(Index index)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public void removeIndex(Index index)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * @return {@code true} if the given index is equal to the wrapped one
+     */
+    @Override
+    public boolean containsIndex(Index index)
+    {
+        return index.equals(delegate);
+    }
+
+    /**
+     * @return the result of the wrapped index's {@code indexerFor} if {@code indexSelector} accepts that index,
+     * {@code null} otherwise
+     */
+    @Override
+    public Index.Indexer indexerFor(Predicate<Index> indexSelector,
+                                    DecoratedKey key,
+                                    RegularAndStaticColumns columns,
+                                    int nowInSec,
+                                    WriteContext ctx,
+                                    IndexTransaction.Type transactionType,
+                                    Memtable memtable)
+    {
+        return indexSelector.test(delegate) ? delegate.indexerFor(key, columns, nowInSec, ctx, transactionType, memtable) : null;
+    }
+
+    /**
+     * @return the plan created by {@link SingletonIndexQueryPlan#create(Index, RowFilter)} for the wrapped index,
+     * possibly {@code null}
+     */
+    @Override
+    public Index.QueryPlan queryPlanFor(RowFilter rowFilter)
+    {
+        return SingletonIndexQueryPlan.create(delegate, rowFilter);
+    }
+
+    @Override
+    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata)
+    {
+        // tableMetadata is not forwarded: only the descriptor and the tracker are passed to the wrapped index
+        return delegate.getFlushObserver(descriptor, tracker);
+    }
+
+    @Override
+    public Set<Component> getComponents()
+    {
+        return delegate.getComponents();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/SingletonIndexQueryPlan.java b/src/java/org/apache/cassandra/index/SingletonIndexQueryPlan.java
new file mode 100644
index 000000000000..b475cee145a3
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/SingletonIndexQueryPlan.java
@@ -0,0 +1,87 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.index;
+
+import java.util.Collections;
+import java.util.Set;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.filter.RowFilter;
+
+public class SingletonIndexQueryPlan implements Index.QueryPlan
+{
+    private final Index index;
+    private final Set<Index> indexes;
+    private final RowFilter postIndexFilter;
+
+    protected SingletonIndexQueryPlan(Index index, RowFilter postIndexFilter)
+    {
+        this.index = index;
+        this.indexes = Collections.singleton(index);
+        this.postIndexFilter = postIndexFilter;
+    }
+
+    @Nullable
+    protected static SingletonIndexQueryPlan create(Index index, RowFilter rowFilter)
+    {
+        for (RowFilter.Expression e : rowFilter.getExpressions())
+        {
+            if (index.supportsExpression(e.column(), e.operator()))
+                return new SingletonIndexQueryPlan(index, index.getPostIndexQueryFilter(rowFilter));
+        }
+
+        return null;
+    }
+
+    @Override
+    public Set<Index> getIndexes()
+    {
+        return indexes;
+    }
+
+    @Override
+    @Nonnull
+    public Index getFirst()
+    {
+        return index;
+    }
+
+    @Override
+    public long getEstimatedResultRows()
+    {
+        return index.getEstimatedResultRows();
+    }
+
+    @Override
+    public Index.Searcher searcherFor(ReadCommand command)
+    {
+        return index.searcherFor(command);
+    }
+
+    @Override
+    public RowFilter postIndexQueryFilter()
+    {
+        return postIndexFilter;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
index 06a6cd9cfd66..68db86da069c 100644
--- a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
@@ -271,14 +271,6 @@ public long getEstimatedResultRows()
         return indexCfs.getMeanRowCount();
     }
 
-    /**
-     * No post processing of query results, just return them unchanged
-     */
-    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command)
-    {
-        return (partitionIterator, readCommand) -> partitionIterator;
-    }
-
     public RowFilter getPostIndexQueryFilter(RowFilter filter)
     {
         return getTargetExpression(filter.getExpressions()).map(filter::without)
@@ -340,7 +332,8 @@ public Indexer indexerFor(final DecoratedKey key,
                               final RegularAndStaticColumns columns,
                               final int nowInSec,
                               final WriteContext ctx,
-                              final IndexTransaction.Type transactionType)
+                              final IndexTransaction.Type transactionType,
+                              final Memtable memtable)
     {
         /**
          * Indexes on regular and static columns (the non primary-key ones) only care about updates with live
@@ -527,7 +520,7 @@ private void insert(ByteBuffer rowKey,
                                                                cell));
         Row row = BTreeRow.noCellLiveRow(buildIndexClustering(rowKey, clustering, cell), info);
         PartitionUpdate upd = partitionUpdate(valueKey, row);
-        indexCfs.getWriteHandler().write(upd, ctx, UpdateTransaction.NO_OP);
+        indexCfs.getWriteHandler().write(upd, ctx, false);
         logger.trace("Inserted entry into index for value {}", valueKey);
     }
 
@@ -573,7 +566,7 @@ private void doDelete(DecoratedKey indexKey,
     {
         Row row = BTreeRow.emptyDeletedRow(indexClustering, Row.Deletion.regular(deletion));
         PartitionUpdate upd = partitionUpdate(indexKey, row);
-        indexCfs.getWriteHandler().write(upd, ctx, UpdateTransaction.NO_OP);
+        indexCfs.getWriteHandler().write(upd, ctx, false);
         logger.trace("Removed index entry for value {}", indexKey);
     }
 
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
index b1998bc40b5b..fdcf9e427b09 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.Tracker;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.partitions.PartitionIterator;
@@ -46,7 +47,7 @@
 import org.apache.cassandra.index.sasi.conf.IndexMode;
 import org.apache.cassandra.index.sasi.disk.OnDiskIndexBuilder.Mode;
 import org.apache.cassandra.index.sasi.disk.PerSSTableIndexWriter;
-import org.apache.cassandra.index.sasi.plan.QueryPlan;
+import org.apache.cassandra.index.sasi.plan.SASIIndexSearcher;
 import org.apache.cassandra.index.transactions.IndexTransaction;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
@@ -70,7 +71,8 @@ private static class SASIIndexBuildingSupport implements IndexBuildingSupport
     {
         public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs,
                                                        Set<Index> indexes,
-                                                       Collection<SSTableReader> sstablesToRebuild)
+                                                       Collection<SSTableReader> sstablesToRebuild,
+                                                       boolean isFullRebuild)
         {
             NavigableMap<SSTableReader, Map<ColumnMetadata, ColumnIndex>> sstables = new TreeMap<>((a, b) -> {
                 return Integer.compare(a.descriptor.generation, b.descriptor.generation);
@@ -255,7 +257,7 @@ public boolean supportsReplicaFilteringProtection(RowFilter rowFilter)
         return false;
     }
 
-    public Indexer indexerFor(DecoratedKey key, RegularAndStaticColumns columns, int nowInSec, WriteContext context, IndexTransaction.Type transactionType)
+    public Indexer indexerFor(DecoratedKey key, RegularAndStaticColumns columns, int nowInSec, WriteContext context, IndexTransaction.Type transactionType, Memtable memtable)
     {
         return new Indexer()
         {
@@ -303,17 +305,12 @@ public Searcher searcherFor(ReadCommand command) throws InvalidRequestException
     {
         TableMetadata config = command.metadata();
         ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(config.id);
-        return controller -> new QueryPlan(cfs, command, DatabaseDescriptor.getRangeRpcTimeout(MILLISECONDS)).execute(controller);
+        return new SASIIndexSearcher(cfs, command, DatabaseDescriptor.getRangeRpcTimeout(MILLISECONDS));
     }
 
-    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, OperationType opType)
+    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker)
     {
-        return newWriter(baseCfs.metadata().partitionKeyType, descriptor, Collections.singletonMap(index.getDefinition(), index), opType);
-    }
-
-    public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand command)
-    {
-        return (partitionIterator, readCommand) -> partitionIterator;
+        return newWriter(baseCfs.metadata().partitionKeyType, descriptor, Collections.singletonMap(index.getDefinition(), index), tracker.opType());
     }
 
     public IndexBuildingSupport getBuildTaskSupport()
diff --git a/src/java/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriter.java b/src/java/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriter.java
index 0af4ba2aef8e..a9e58ba7fa53 100644
--- a/src/java/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriter.java
+++ b/src/java/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriter.java
@@ -27,6 +27,7 @@
 
 import org.apache.cassandra.concurrent.JMXEnabledThreadPoolExecutor;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.db.DeletionTime;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -111,6 +112,18 @@ public void startPartition(DecoratedKey key, long curPosition)
         currentKeyPosition = curPosition;
     }
 
+    @Override
+    public void partitionLevelDeletion(DeletionTime deletionTime, long position)
+    {
+        // do nothing
+    }
+
+    @Override
+    public void staticRow(Row staticRow, long position)
+    {
+        nextUnfilteredCluster(staticRow);
+    }
+
     public void nextUnfilteredCluster(Unfiltered unfiltered)
     {
         if (!unfiltered.isRow())
diff --git a/src/java/org/apache/cassandra/index/sasi/plan/QueryPlan.java b/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
similarity index 95%
rename from src/java/org/apache/cassandra/index/sasi/plan/QueryPlan.java
rename to src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
index a54dfc8dd588..4e78d3e6c5e5 100644
--- a/src/java/org/apache/cassandra/index/sasi/plan/QueryPlan.java
+++ b/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
@@ -23,18 +23,18 @@
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.sasi.disk.Token;
 import org.apache.cassandra.index.sasi.plan.Operation.OperationType;
-import org.apache.cassandra.exceptions.RequestTimeoutException;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.AbstractIterator;
 
-public class QueryPlan
+public class SASIIndexSearcher implements Index.Searcher
 {
     private final QueryController controller;
 
-    public QueryPlan(ColumnFamilyStore cfs, ReadCommand command, long executionQuotaMs)
+    public SASIIndexSearcher(ColumnFamilyStore cfs, ReadCommand command, long executionQuotaMs)
     {
         this.controller = new QueryController(cfs, (PartitionRangeReadCommand) command, executionQuotaMs);
     }
@@ -63,7 +63,8 @@ private Operation analyze()
         }
     }
 
-    public UnfilteredPartitionIterator execute(ReadExecutionController executionController) throws RequestTimeoutException
+    @Override
+    public UnfilteredPartitionIterator search(ReadExecutionController executionController)
     {
         return new ResultIterator(analyze(), controller, executionController);
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
index cfb1365649c7..a10b9fc4dbcc 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
@@ -144,12 +144,12 @@ public static SSTableTxnWriter create(TableMetadataRef metadata,
                                           boolean isTransient,
                                           int sstableLevel,
                                           SerializationHeader header,
-                                          Collection<Index> indexes)
+                                          Collection<Index.Group> indexGroups)
     {
         // if the column family store does not exist, we create a new default SSTableMultiWriter to use:
         LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.WRITE);
         MetadataCollector collector = new MetadataCollector(metadata.get().comparator).sstableLevel(sstableLevel);
-        SSTableMultiWriter writer = SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, indexes, txn);
+        SSTableMultiWriter writer = SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, indexGroups, txn);
         return new SSTableTxnWriter(txn, writer);
     }
 
diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
index d38d03292b9a..b0c773243420 100644
--- a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
@@ -115,10 +115,10 @@ public static SSTableMultiWriter create(Descriptor descriptor,
                                             TableMetadataRef metadata,
                                             MetadataCollector metadataCollector,
                                             SerializationHeader header,
-                                            Collection<Index> indexes,
+                                            Collection<Index.Group> indexGroups,
                                             LifecycleNewTracker lifecycleNewTracker)
     {
-        SSTableWriter writer = SSTableWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, indexes, lifecycleNewTracker);
+        SSTableWriter writer = SSTableWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, indexGroups, lifecycleNewTracker);
         return new SimpleSSTableMultiWriter(writer, lifecycleNewTracker);
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
index f0b6bac86144..1f9177bffcdb 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
@@ -18,6 +18,9 @@
 package org.apache.cassandra.io.sstable.format;
 
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.rows.Unfiltered;
 
 /**
@@ -40,16 +43,70 @@ public interface SSTableFlushObserver
     void startPartition(DecoratedKey key, long indexPosition);
 
     /**
-     * Called after the unfiltered cluster is written to the sstable.
-     * Will be preceded by a call to {@code startPartition(DecoratedKey, long)},
-     * and the cluster should be assumed to belong to that partition.
+     * Called when the deletion time of a partition is written to the sstable.
      *
-     * @param unfilteredCluster The unfiltered cluster being added to SSTable.
+     * Will be preceded by a call to {@link #startPartition(DecoratedKey, long)},
+     * and the deletion time should be assumed to belong to that partition.
+     *
+     * @param deletionTime the partition-level deletion time being written to the SSTable
+     * @param position the position of the written deletion time in the data file,
+     * as required by {@link SSTableReader#partitionLevelDeletionAt(long)}
+     */
+    void partitionLevelDeletion(DeletionTime deletionTime, long position);
+
+    /**
+     * Called when the static row of a partition is written to the sstable.
+     *
+     * Will be preceded by a call to {@link #startPartition(DecoratedKey, long)},
+     * and the static row should be assumed to belong to that partition.
+     *
+     * @param staticRow the static row being written to the SSTable
+     * @param position the position of the written static row in the data file,
+     * as required by {@link SSTableReader#staticRowAt(long, ColumnFilter)}
      */
-    void nextUnfilteredCluster(Unfiltered unfilteredCluster);
+    void staticRow(Row staticRow, long position);
+
+    /**
+     * Called after an unfiltered is written to the sstable.
+     *
+     * Will be preceded by a call to {@link #startPartition(DecoratedKey, long)},
+     * and the unfiltered should be assumed to belong to that partition.
+     *
+     * Implementations overriding {@link #nextUnfilteredCluster(Unfiltered, long)} shouldn't implement this method
+     * since only one of the two methods is required.
+     *
+     * @param unfiltered the unfiltered being written to the SSTable
+     */
+    default void nextUnfilteredCluster(Unfiltered unfiltered)
+    {
+    }
+
+    /**
+     * Called after an unfiltered is written to the sstable.
+     *
+     * Will be preceded by a call to {@link #startPartition(DecoratedKey, long)},
+     * and the unfiltered should be assumed to belong to that partition.
+     *
+     * Implementations overriding {@link #nextUnfilteredCluster(Unfiltered)} shouldn't implement this method
+     * since only one of the two methods is required.
+     *
+     * @param unfiltered the unfiltered being written to the SSTable
+     * @param position the position of the written unfiltered in the data file,
+     * as required by {@link SSTableReader#clusteringAt(long)}
+     * and {@link SSTableReader#unfilteredAt(long, ColumnFilter)}
+     */
+    default void nextUnfilteredCluster(Unfiltered unfiltered, long position)
+    {
+        nextUnfilteredCluster(unfiltered);
+    }
 
     /**
      * Called when all data is written to the file and it's ready to be finished up.
      */
     void complete();
+
+    /**
+     * Clean up resources on error. There should be no side effects if called multiple times.
+     */
+    default void abort(Throwable accumulator) {}
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index 7aa38482169b..ae58fb1acb75 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -46,9 +46,15 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.BTreeRow;
 import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.DeserializationHelper;
 import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredSerializer;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
@@ -1592,8 +1598,110 @@ public boolean isRepaired()
         return sstableMetadata.repairedAt != ActiveRepairService.UNREPAIRED_SSTABLE;
     }
 
+    public DecoratedKey keyAt(RandomAccessReader indexFileReader, long indexPosition) throws IOException
+    {
+        indexFileReader.seek(indexPosition);
+        return keyAt(indexFileReader);
+    }
+
     public abstract DecoratedKey keyAt(long indexPosition) throws IOException;
 
+    public abstract DecoratedKey keyAt(FileDataInput reader) throws IOException;
+
+    /**
+     * Retrieves the partition-level deletion time at the given position of the data file, as specified by
+     * {@link SSTableFlushObserver#partitionLevelDeletion(DeletionTime, long)}.
+     *
+     * @param position the start position of the partition-level deletion time in the data file
+     * @return the partition-level deletion time at the specified position, or {@code null} if the reader is at EOF
+     */
+    public DeletionTime partitionLevelDeletionAt(long position) throws IOException
+    {
+        try (FileDataInput in = dfile.createReader(position))
+        {
+            if (in.isEOF())
+                return null;
+
+            return DeletionTime.serializer.deserialize(in);
+        }
+    }
+
+    /**
+     * Retrieves the static row at the given position of the data file, as specified by
+     * {@link SSTableFlushObserver#staticRow(Row, long)}.
+     *
+     * @param position the start position of the static row in the data file
+     * @param columnFilter the columns to fetch, {@code null} to select all the columns
+     * @return the static row at that position; {@link Rows#EMPTY_STATIC_ROW} if {@code !header.hasStatic()}; {@code null} at EOF
+     */
+    public Row staticRowAt(long position, ColumnFilter columnFilter) throws IOException
+    {
+        if (!header.hasStatic())
+            return Rows.EMPTY_STATIC_ROW;
+
+        try (FileDataInput in = dfile.createReader(position))
+        {
+            if (in.isEOF())
+                return null;
+
+            int version = descriptor.version.correspondingMessagingVersion();
+            DeserializationHelper helper = new DeserializationHelper(metadata.get(),
+                                                                     version,
+                                                                     DeserializationHelper.Flag.LOCAL,
+                                                                     columnFilter);
+
+            return UnfilteredSerializer.serializer.deserializeStaticRow(in, header, helper);
+        }
+    }
+
+    /**
+     * Retrieves the clustering prefix of the unfiltered at the given position of the data file, as specified by
+     * {@link SSTableFlushObserver#nextUnfilteredCluster(Unfiltered, long)}.
+     *
+     * @param position the start position of the unfiltered in the data file
+     * @return the clustering prefix of the unfiltered at the specified position, or {@code null} if the reader is at EOF
+     */
+    public ClusteringPrefix clusteringAt(long position) throws IOException
+    {
+        try (FileDataInput in = dfile.createReader(position))
+        {
+            if (in.isEOF())
+                return null;
+
+            int version = descriptor.version.correspondingMessagingVersion();
+            int flags = in.readUnsignedByte();
+            boolean isRow = UnfilteredSerializer.kind(flags) == Unfiltered.Kind.ROW;
+
+            return isRow
+                   ? Clustering.serializer.deserialize(in, version, header.clusteringTypes())
+                   : ClusteringBoundOrBoundary.serializer.deserialize(in, version, header.clusteringTypes());
+        }
+    }
+
+    /**
+     * Retrieves the unfiltered at the given position of the data file, as specified by
+     * {@link SSTableFlushObserver#nextUnfilteredCluster(Unfiltered, long)}.
+     *
+     * @param position the start position of the unfiltered in the data file
+     * @param columnFilter the columns to fetch, {@code null} to select all the columns
+     * @return the unfiltered at the specified position, or {@code null} if the reader is at EOF
+     */
+    public Unfiltered unfilteredAt(long position, ColumnFilter columnFilter) throws IOException
+    {
+        try (FileDataInput in = dfile.createReader(position))
+        {
+            if (in.isEOF())
+                return null;
+
+            int version = descriptor.version.correspondingMessagingVersion();
+            DeserializationHelper helper = new DeserializationHelper(metadata.get(),
+                                                                     version,
+                                                                     DeserializationHelper.Flag.LOCAL,
+                                                                     columnFilter);
+            return UnfilteredSerializer.serializer.deserialize(in, header, helper, BTreeRow.sortedBuilder());
+        }
+    }
+
     public boolean isPendingRepair()
     {
         return sstableMetadata.pendingRepair != ActiveRepairService.NO_PENDING_REPAIR;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 52667bfe78ed..263b150135c7 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -81,9 +81,10 @@ protected SSTableWriter(Descriptor descriptor,
                             TableMetadataRef metadata,
                             MetadataCollector metadataCollector,
                             SerializationHeader header,
-                            Collection<SSTableFlushObserver> observers)
+                            Collection<SSTableFlushObserver> observers,
+                            Set<Component> indexComponents)
     {
-        super(descriptor, components(metadata.getLocal()), metadata, DatabaseDescriptor.getDiskOptimizationStrategy());
+        super(descriptor, components(metadata.getLocal(), indexComponents), metadata, DatabaseDescriptor.getDiskOptimizationStrategy());
         this.keyCount = keyCount;
         this.repairedAt = repairedAt;
         this.pendingRepair = pendingRepair;
@@ -93,6 +94,20 @@ protected SSTableWriter(Descriptor descriptor,
         this.observers = observers == null ? Collections.emptySet() : observers;
     }
 
+    private static Set<Component> indexComponents(Collection<Index.Group> indexGroups)
+    {
+        if (indexGroups == null)
+            return Collections.emptySet();
+
+        Set<Component> components = new HashSet<>();
+        for (Index.Group group : indexGroups)
+        {
+            components.addAll(group.getComponents());
+        }
+
+        return components;
+    }
+
     public static SSTableWriter create(Descriptor descriptor,
                                        Long keyCount,
                                        Long repairedAt,
@@ -101,11 +116,12 @@ public static SSTableWriter create(Descriptor descriptor,
                                        TableMetadataRef metadata,
                                        MetadataCollector metadataCollector,
                                        SerializationHeader header,
-                                       Collection<Index> indexes,
+                                       Collection<Index.Group> indexGroups,
                                        LifecycleNewTracker lifecycleNewTracker)
     {
         Factory writerFactory = descriptor.getFormat().getWriterFactory();
-        return writerFactory.open(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers(descriptor, indexes, lifecycleNewTracker.opType()), lifecycleNewTracker);
+        return writerFactory.open(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers(descriptor, indexGroups, lifecycleNewTracker, metadata.get()), lifecycleNewTracker,
+                                  indexComponents(indexGroups));
     }
 
     public static SSTableWriter create(Descriptor descriptor,
@@ -115,11 +131,11 @@ public static SSTableWriter create(Descriptor descriptor,
                                        boolean isTransient,
                                        int sstableLevel,
                                        SerializationHeader header,
-                                       Collection<Index> indexes,
+                                       Collection<Index.Group> indexGroups,
                                        LifecycleNewTracker lifecycleNewTracker)
     {
         TableMetadataRef metadata = Schema.instance.getTableMetadataRef(descriptor);
-        return create(metadata, descriptor, keyCount, repairedAt, pendingRepair, isTransient, sstableLevel, header, indexes, lifecycleNewTracker);
+        return create(metadata, descriptor, keyCount, repairedAt, pendingRepair, isTransient, sstableLevel, header, indexGroups, lifecycleNewTracker);
     }
 
     public static SSTableWriter create(TableMetadataRef metadata,
@@ -130,11 +146,11 @@ public static SSTableWriter create(TableMetadataRef metadata,
                                        boolean isTransient,
                                        int sstableLevel,
                                        SerializationHeader header,
-                                       Collection<Index> indexes,
+                                       Collection<Index.Group> indexGroups,
                                        LifecycleNewTracker lifecycleNewTracker)
     {
         MetadataCollector collector = new MetadataCollector(metadata.get().comparator).sstableLevel(sstableLevel);
-        return create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, indexes, lifecycleNewTracker);
+        return create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, indexGroups, lifecycleNewTracker);
     }
 
     @VisibleForTesting
@@ -144,13 +160,13 @@ public static SSTableWriter create(Descriptor descriptor,
                                        UUID pendingRepair,
                                        boolean isTransient,
                                        SerializationHeader header,
-                                       Collection<Index> indexes,
+                                       Collection<Index.Group> indexGroups,
                                        LifecycleNewTracker lifecycleNewTracker)
     {
-        return create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, 0, header, indexes, lifecycleNewTracker);
+        return create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, 0, header, indexGroups, lifecycleNewTracker);
     }
 
-    private static Set<Component> components(TableMetadata metadata)
+    private static Set<Component> components(TableMetadata metadata, Collection<Component> indexComponents)
     {
         Set<Component> components = new HashSet<Component>(Arrays.asList(Component.DATA,
                 Component.PRIMARY_INDEX,
@@ -172,20 +188,24 @@ private static Set<Component> components(TableMetadata metadata)
             // but the components are unmodifiable after construction
             components.add(Component.CRC);
         }
+
+        components.addAll(indexComponents);
+
         return components;
     }
 
     private static Collection<SSTableFlushObserver> observers(Descriptor descriptor,
-                                                              Collection<Index> indexes,
-                                                              OperationType operationType)
+                                                              Collection<Index.Group> indexGroups,
+                                                              LifecycleNewTracker tracker,
+                                                              TableMetadata metadata)
     {
-        if (indexes == null)
+        if (indexGroups == null)
             return Collections.emptyList();
 
-        List<SSTableFlushObserver> observers = new ArrayList<>(indexes.size());
-        for (Index index : indexes)
+        List<SSTableFlushObserver> observers = new ArrayList<>(indexGroups.size());
+        for (Index.Group group : indexGroups)
         {
-            SSTableFlushObserver observer = index.getFlushObserver(descriptor, operationType);
+            SSTableFlushObserver observer = group.getFlushObserver(descriptor, tracker, metadata);
             if (observer != null)
             {
                 observer.begin();
@@ -295,7 +315,14 @@ public final Throwable commit(Throwable accumulate)
 
     public final Throwable abort(Throwable accumulate)
     {
-        return txnProxy.abort(accumulate);
+        try
+        {
+            return txnProxy.abort(accumulate);
+        }
+        finally
+        {
+            observers.forEach(observer -> observer.abort(accumulate));
+        }
     }
 
     public final void close()
@@ -305,7 +332,14 @@ public final void close()
 
     public final void abort()
     {
-        txnProxy.abort();
+        try
+        {
+            txnProxy.abort();
+        }
+        finally
+        {
+            observers.forEach(observer -> observer.abort(null));
+        }
     }
 
     protected Map<MetadataType, MetadataComponent> finalizeMetadata()
@@ -381,6 +415,7 @@ public abstract SSTableWriter open(Descriptor descriptor,
                                            MetadataCollector metadataCollector,
                                            SerializationHeader header,
                                            Collection<SSTableFlushObserver> observers,
-                                           LifecycleNewTracker lifecycleNewTracker);
+                                           LifecycleNewTracker lifecycleNewTracker,
+                                           Set<Component> indexComponents);
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index e444e81612c9..4dea115bc205 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -19,8 +19,10 @@
 
 import java.io.IOException;
 import java.util.Collection;
+import java.util.Set;
 import java.util.UUID;
 
+import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.util.FileHandle;
@@ -90,10 +92,11 @@ public SSTableWriter open(Descriptor descriptor,
                                   MetadataCollector metadataCollector,
                                   SerializationHeader header,
                                   Collection<SSTableFlushObserver> observers,
-                                  LifecycleNewTracker lifecycleNewTracker)
+                                  LifecycleNewTracker lifecycleNewTracker,
+                                  Set<Component> indexComponents)
         {
             SSTable.validateRepairedMetadata(repairedAt, pendingRepair, isTransient);
-            return new BigTableWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers, lifecycleNewTracker);
+            return new BigTableWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers, lifecycleNewTracker, indexComponents);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
index 708488873098..d91267830c8c 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
@@ -303,20 +303,24 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
     @Override
     public DecoratedKey keyAt(long indexPosition) throws IOException
     {
-        DecoratedKey key;
         try (FileDataInput in = ifile.createReader(indexPosition))
         {
-            if (in.isEOF())
-                return null;
+            return keyAt(in);
+        }
+    }
 
-            key = decorateKey(ByteBufferUtil.readWithShortLength(in));
+    @Override
+    public DecoratedKey keyAt(FileDataInput reader) throws IOException
+    {
+        if (reader.isEOF()) return null;
 
-            // hint read path about key location if caching is enabled
-            // this saves index summary lookup and index file iteration which whould be pretty costly
-            // especially in presence of promoted column indexes
-            if (isKeyCacheEnabled())
-                cacheKey(key, rowIndexEntrySerializer.deserialize(in));
-        }
+        DecoratedKey key = decorateKey(ByteBufferUtil.readWithShortLength(reader));
+
+        // hint read path about key location if caching is enabled
+        // this saves index summary lookup and index file iteration which would be pretty costly
+        // especially in presence of promoted column indexes
+        if (isKeyCacheEnabled())
+            cacheKey(key, rowIndexEntrySerializer.deserialize(reader));
 
         return key;
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index 3285cce2ed56..637dab9290b3 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
@@ -75,12 +75,13 @@ public BigTableWriter(Descriptor descriptor,
                           UUID pendingRepair,
                           boolean isTransient,
                           TableMetadataRef metadata,
-                          MetadataCollector metadataCollector, 
+                          MetadataCollector metadataCollector,
                           SerializationHeader header,
                           Collection<SSTableFlushObserver> observers,
-                          LifecycleNewTracker lifecycleNewTracker)
+                          LifecycleNewTracker lifecycleNewTracker,
+                          Set<Component> indexComponents)
     {
-        super(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers);
+        super(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers, indexComponents);
         lifecycleNewTracker.trackNew(this); // must track before any files are created
 
         if (compression)
@@ -335,10 +336,10 @@ public SSTableReader openEarly()
                                                            components, metadata,
                                                            ifile, dfile,
                                                            indexSummary,
-                                                           iwriter.bf.sharedCopy(), 
-                                                           maxDataAge, 
-                                                           stats, 
-                                                           SSTableReader.OpenReason.EARLY, 
+                                                           iwriter.bf.sharedCopy(),
+                                                           maxDataAge,
+                                                           stats,
+                                                           SSTableReader.OpenReason.EARLY,
                                                            header);
 
         // now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
index 680644d941ec..399c273cfbcc 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
@@ -128,14 +128,20 @@ public void buildRowIndex(UnfilteredRowIterator iterator) throws IOException
     private void writePartitionHeader(UnfilteredRowIterator iterator) throws IOException
     {
         ByteBufferUtil.writeWithShortLength(iterator.partitionKey().getKey(), writer);
+
+        long partitionDeletionPosition = writer.position();
         DeletionTime.serializer.serialize(iterator.partitionLevelDeletion(), writer);
+        if (!observers.isEmpty())
+            observers.forEach((o) -> o.partitionLevelDeletion(iterator.partitionLevelDeletion(), partitionDeletionPosition));
+
         if (header.hasStatic())
         {
             Row staticRow = iterator.staticRow();
+            long staticRowPosition = writer.position();
 
             UnfilteredSerializer.serializer.serializeStaticRow(staticRow, helper, writer, version);
             if (!observers.isEmpty())
-                observers.forEach((o) -> o.nextUnfilteredCluster(staticRow));
+                observers.forEach((o) -> o.staticRow(staticRow, staticRowPosition));
         }
     }
 
@@ -255,11 +261,12 @@ private void add(Unfiltered unfiltered) throws IOException
             startPosition = pos;
         }
 
+        long unfilteredPosition = writer.position();
         UnfilteredSerializer.serializer.serialize(unfiltered, helper, writer, pos - previousRowStart, version);
 
         // notify observers about each new row
         if (!observers.isEmpty())
-            observers.forEach((o) -> o.nextUnfilteredCluster(unfiltered));
+            observers.forEach((o) -> o.nextUnfilteredCluster(unfiltered, unfilteredPosition));
 
         lastClustering = unfiltered.clustering();
         previousRowStart = pos;
diff --git a/src/java/org/apache/cassandra/locator/ReplicaPlans.java b/src/java/org/apache/cassandra/locator/ReplicaPlans.java
index 67b89e52ecee..076c0a95b903 100644
--- a/src/java/org/apache/cassandra/locator/ReplicaPlans.java
+++ b/src/java/org/apache/cassandra/locator/ReplicaPlans.java
@@ -38,6 +38,8 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.UnavailableException;
 import org.apache.cassandra.gms.FailureDetector;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.service.reads.AlwaysSpeculativeRetryPolicy;
@@ -524,11 +526,11 @@ public static ReplicaPlan.ForPaxosWrite forPaxos(Keyspace keyspace, DecoratedKey
     }
 
 
-    private static <E extends Endpoints<E>> E candidatesForRead(ConsistencyLevel consistencyLevel, E liveNaturalReplicas)
+    private static <E extends Endpoints<E>> E candidatesForRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, E liveNaturalReplicas)
     {
-        return consistencyLevel.isDatacenterLocal()
-                ? liveNaturalReplicas.filter(InOurDcTester.replicas())
-                : liveNaturalReplicas;
+        E replicas = consistencyLevel.isDatacenterLocal() ? liveNaturalReplicas.filter(InOurDcTester.replicas()) : liveNaturalReplicas;
+
+        return indexQueryPlan != null ? SecondaryIndexManager.filterForQuery(replicas, keyspace, indexQueryPlan, consistencyLevel) : replicas;
     }
 
     private static <E extends Endpoints<E>> E contactForEachQuorumRead(NetworkTopologyStrategy replicationStrategy, E candidates)
@@ -588,10 +590,10 @@ public static ReplicaPlan.ForRangeRead forSingleReplicaRead(Keyspace keyspace, A
      * The candidate collection can be used for speculation, although at present
      * it would break EACH_QUORUM to do so without further filtering
      */
-    public static ReplicaPlan.ForTokenRead forRead(Keyspace keyspace, Token token, ConsistencyLevel consistencyLevel, SpeculativeRetryPolicy retry)
+    public static ReplicaPlan.ForTokenRead forRead(Keyspace keyspace, Token token, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, SpeculativeRetryPolicy retry)
     {
         AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy();
-        EndpointsForToken candidates = candidatesForRead(consistencyLevel, ReplicaLayout.forTokenReadLiveSorted(replicationStrategy, token).natural());
+        EndpointsForToken candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, ReplicaLayout.forTokenReadLiveSorted(replicationStrategy, token).natural());
         EndpointsForToken contacts = contactForRead(replicationStrategy, consistencyLevel, retry.equals(AlwaysSpeculativeRetryPolicy.INSTANCE), candidates);
 
         assureSufficientLiveReplicasForRead(replicationStrategy, consistencyLevel, contacts);
@@ -605,10 +607,10 @@ public static ReplicaPlan.ForTokenRead forRead(Keyspace keyspace, Token token, C
      *
      * There is no speculation for range read queries at present, so we never 'always speculate' here, and a failed response fails the query.
      */
-    public static ReplicaPlan.ForRangeRead forRangeRead(Keyspace keyspace, ConsistencyLevel consistencyLevel, AbstractBounds<PartitionPosition> range, int vnodeCount)
+    public static ReplicaPlan.ForRangeRead forRangeRead(Keyspace keyspace, Index.QueryPlan indexQueryPlan, ConsistencyLevel consistencyLevel, AbstractBounds<PartitionPosition> range, int vnodeCount)
     {
         AbstractReplicationStrategy replicationStrategy = keyspace.getReplicationStrategy();
-        EndpointsForRange candidates = candidatesForRead(consistencyLevel, ReplicaLayout.forRangeReadLiveSorted(replicationStrategy, range).natural());
+        EndpointsForRange candidates = candidatesForRead(keyspace, indexQueryPlan, consistencyLevel, ReplicaLayout.forRangeReadLiveSorted(replicationStrategy, range).natural());
         EndpointsForRange contacts = contactForRead(replicationStrategy, consistencyLevel, false, candidates);
 
         assureSufficientLiveReplicasForRead(replicationStrategy, consistencyLevel, contacts);
diff --git a/src/java/org/apache/cassandra/metrics/ClientRangeRequestMetrics.java b/src/java/org/apache/cassandra/metrics/ClientRangeRequestMetrics.java
new file mode 100644
index 000000000000..9bcf83ebed6d
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/ClientRangeRequestMetrics.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.cassandra.metrics;
+
+import com.codahale.metrics.Histogram;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+/**
+ * Metrics for tracking information about range requests.
+ */
+public class ClientRangeRequestMetrics extends ClientRequestMetrics
+{
+    /**
+     * Metric for tracking the number of round trips made for a range request.
+     */
+    public final Histogram roundTrips;
+
+    public ClientRangeRequestMetrics(String scope)
+    {
+        super(scope);
+        roundTrips = Metrics.histogram(factory.createMetricName("RoundTripsPerReadHistogram"), false);
+    }
+
+    @Override
+    public void release()
+    {
+        super.release();
+        Metrics.remove(factory.createMetricName("RoundTripsPerReadHistogram"));
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/schema/IndexMetadata.java b/src/java/org/apache/cassandra/schema/IndexMetadata.java
index 4adbce170bc4..d4188aa2c9f6 100644
--- a/src/java/org/apache/cassandra/schema/IndexMetadata.java
+++ b/src/java/org/apache/cassandra/schema/IndexMetadata.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import java.lang.reflect.InvocationTargetException;
 import java.util.*;
+import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
@@ -35,8 +36,10 @@
 import org.apache.cassandra.cql3.CqlBuilder;
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnknownIndexException;
 import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.internal.CassandraIndex;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.FBUtilities;
@@ -55,6 +58,11 @@ public final class IndexMetadata
 
     public static final Serializer serializer = new Serializer();
 
+    /**
+     * A mapping of user-friendly index names to their fully qualified index class names.
+     */
+    private static final Map<String, String> indexNameAliases = new ConcurrentHashMap<>();
+
     public enum Kind
     {
         KEYS, CUSTOM, COMPOSITES
@@ -122,7 +130,9 @@ public void validate(TableMetadata table)
             if (options == null || !options.containsKey(IndexTarget.CUSTOM_INDEX_OPTION_NAME))
                 throw new ConfigurationException(String.format("Required option missing for index %s : %s",
                                                                name, IndexTarget.CUSTOM_INDEX_OPTION_NAME));
-            String className = options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME);
+            // Find any aliases to the fully qualified index class name:
+            String className = expandAliases(options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME));
+
             Class<Index> indexerClass = FBUtilities.classForName(className, "custom indexer");
             if (!Index.class.isAssignableFrom(indexerClass))
                 throw new ConfigurationException(String.format("Specified Indexer class (%s) does not implement the Indexer interface", className));
@@ -130,6 +140,18 @@ public void validate(TableMetadata table)
         }
     }
 
+    public String getIndexClassName()
+    {
+        if (isCustom())
+            return expandAliases(options.get(IndexTarget.CUSTOM_INDEX_OPTION_NAME));
+        return CassandraIndex.class.getName();
+    }
+
+    public static String expandAliases(String className)
+    {
+        return indexNameAliases.getOrDefault(className, className);
+    }
+
     private void validateCustomIndexOptions(TableMetadata table, Class<? extends Index> indexerClass, Map<String, String> options)
     {
         try
@@ -159,6 +181,8 @@ private void validateCustomIndexOptions(TableMetadata table, Class<? extends Ind
         }
         catch (InvocationTargetException e)
         {
+            if (e.getTargetException() instanceof RequestValidationException)
+                throw (RequestValidationException) e.getTargetException();
             if (e.getTargetException() instanceof ConfigurationException)
                 throw (ConfigurationException) e.getTargetException();
             throw new ConfigurationException("Failed to validate custom indexer options: " + options);
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 72801a9d129b..ac0f83c1d706 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -102,6 +102,7 @@
 import org.apache.cassandra.locator.Replicas;
 import org.apache.cassandra.metrics.CASClientRequestMetrics;
 import org.apache.cassandra.metrics.CASClientWriteRequestMetrics;
+import org.apache.cassandra.metrics.ClientRangeRequestMetrics;
 import org.apache.cassandra.metrics.ClientRequestMetrics;
 import org.apache.cassandra.metrics.ClientWriteRequestMetrics;
 import org.apache.cassandra.metrics.ReadRepairMetrics;
@@ -167,6 +168,7 @@ public AtomicInteger load(InetAddressAndPort inetAddress)
         }
     };
     private static final ClientRequestMetrics readMetrics = new ClientRequestMetrics("Read");
+    public static final ClientRangeRequestMetrics rangeMetrics = new ClientRangeRequestMetrics("RangeSlice");
     private static final ClientWriteRequestMetrics writeMetrics = new ClientWriteRequestMetrics("Write");
     private static final CASClientWriteRequestMetrics casWriteMetrics = new CASClientWriteRequestMetrics("CASWrite");
     private static final CASClientRequestMetrics casReadMetrics = new CASClientRequestMetrics("CASRead");
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 92b57d9b186b..9e1afa743b45 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -52,6 +52,7 @@
 import org.apache.cassandra.fql.FullQueryLogger;
 import org.apache.cassandra.fql.FullQueryLoggerOptions;
 import org.apache.cassandra.fql.FullQueryLoggerOptionsCompositeData;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.locator.ReplicaCollection.Builder.Conflict;
 import org.apache.commons.lang3.StringUtils;
 
@@ -2376,6 +2377,9 @@ public void onChange(InetAddressAndPort endpoint, ApplicationState state, Versio
                     case NET_VERSION:
                         updateNetVersion(endpoint, value);
                         break;
+                    case INDEX_STATUS:
+                        updateIndexStatus(endpoint, value);
+                        break;
                 }
             }
         }
@@ -2386,6 +2390,11 @@ private static String[] splitValue(VersionedValue value)
         return value.value.split(VersionedValue.DELIMITER_STR, -1);
     }
 
+    private void updateIndexStatus(InetAddressAndPort endpoint, VersionedValue versionedValue)
+    {
+        SecondaryIndexManager.receivePeerIndexStatus(endpoint, versionedValue);
+    }
+
     private void updateNetVersion(InetAddressAndPort endpoint, VersionedValue value)
     {
         try
diff --git a/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java b/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java
index 8a83d3e7a815..5079b1f64556 100644
--- a/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java
+++ b/src/java/org/apache/cassandra/service/reads/AbstractReadExecutor.java
@@ -186,7 +186,7 @@ public static AbstractReadExecutor getReadExecutor(SinglePartitionReadCommand co
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.metadata().id);
         SpeculativeRetryPolicy retry = cfs.metadata().params.speculativeRetry;
 
-        ReplicaPlan.ForTokenRead replicaPlan = ReplicaPlans.forRead(keyspace, command.partitionKey().getToken(), consistencyLevel, retry);
+        ReplicaPlan.ForTokenRead replicaPlan = ReplicaPlans.forRead(keyspace, command.partitionKey().getToken(), command.indexQueryPlan(), consistencyLevel, retry);
 
         // Speculative retry is disabled *OR*
         // 11980: Disable speculative retry if using EACH_QUORUM in order to prevent miscounting DC responses
diff --git a/src/java/org/apache/cassandra/service/reads/DataResolver.java b/src/java/org/apache/cassandra/service/reads/DataResolver.java
index 7c76336043ce..f9741957801d 100644
--- a/src/java/org/apache/cassandra/service/reads/DataResolver.java
+++ b/src/java/org/apache/cassandra/service/reads/DataResolver.java
@@ -121,7 +121,8 @@ private boolean needsReplicaFilteringProtection()
         if (command.rowFilter().isEmpty())
             return false;
 
-        IndexMetadata indexMetadata = command.indexMetadata();
+        Index.QueryPlan queryPlan = command.indexQueryPlan();
+        IndexMetadata indexMetadata = queryPlan == null ? null : queryPlan.getFirst().getIndexMetadata();
 
         if (indexMetadata == null || !indexMetadata.isCustom())
         {
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
index 8061f0a008b5..4ed9e329563b 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
@@ -184,6 +184,6 @@ private SinglePartitionReadCommand makeFetchAdditionalRowsReadCommand(int toQuer
                                                  command.limits().forShortReadRetry(toQuery),
                                                  partitionKey,
                                                  filter,
-                                                 command.indexMetadata());
+                                                 command.indexQueryPlan());
     }
 }
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
index 3452a352593c..ae355bcd5eaf 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
@@ -70,7 +70,7 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma
         Tracing.trace("Computing ranges to query");
 
         Keyspace keyspace = Keyspace.open(command.metadata().keyspace);
-        ReplicaPlanIterator replicaPlans = new ReplicaPlanIterator(command.dataRange().keyRange(), keyspace, consistencyLevel);
+        ReplicaPlanIterator replicaPlans = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, consistencyLevel);
 
         // our estimate of how many result rows there will be per-range
         float resultsPerRange = estimateResultsPerRange(command, keyspace);
diff --git a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java
index ef88c9dffda4..0135d651514c 100644
--- a/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java
+++ b/src/java/org/apache/cassandra/service/reads/range/ReplicaPlanIterator.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.locator.LocalStrategy;
 import org.apache.cassandra.locator.ReplicaPlan;
 import org.apache.cassandra.locator.ReplicaPlans;
@@ -43,15 +44,21 @@ class ReplicaPlanIterator extends AbstractIterator<ReplicaPlan.ForRangeRead>
 {
     private final Keyspace keyspace;
     private final ConsistencyLevel consistency;
+    private final Index.QueryPlan indexQueryPlan;
     @VisibleForTesting
     final Iterator<? extends AbstractBounds<PartitionPosition>> ranges;
     private final int rangeCount;
 
-    ReplicaPlanIterator(AbstractBounds<PartitionPosition> keyRange, Keyspace keyspace, ConsistencyLevel consistency)
+    ReplicaPlanIterator(AbstractBounds<PartitionPosition> keyRange,
+                        Index.QueryPlan indexQueryPlan,
+                        Keyspace keyspace,
+                        ConsistencyLevel consistency)
     {
+        this.indexQueryPlan = indexQueryPlan;
         this.keyspace = keyspace;
         this.consistency = consistency;
 
+
         List<? extends AbstractBounds<PartitionPosition>> l = keyspace.getReplicationStrategy() instanceof LocalStrategy
                                                               ? keyRange.unwrap()
                                                               : getRestrictedRanges(keyRange);
@@ -73,7 +80,7 @@ protected ReplicaPlan.ForRangeRead computeNext()
         if (!ranges.hasNext())
             return endOfData();
 
-        return ReplicaPlans.forRangeRead(keyspace, consistency, ranges.next(), 1);
+        return ReplicaPlans.forRangeRead(keyspace, indexQueryPlan, consistency, ranges.next(), 1);
     }
 
     /**
diff --git a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
index 30e78970d574..d1f0b3c9250f 100644
--- a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
+++ b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
@@ -421,6 +421,12 @@ public DecoratedKey keyAt(long indexPosition) throws IOException
         return delegate.keyAt(indexPosition);
     }
 
+    @Override
+    public DecoratedKey keyAt(FileDataInput reader) throws IOException
+    {
+        return delegate.keyAt(reader);
+    }
+
     @Override
     public boolean isPendingRepair()
     {
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index e17d202a9835..502f6182cce0 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -781,7 +781,7 @@ private static class TestWriter extends BigTableWriter
         TestWriter(Descriptor descriptor, long keyCount, long repairedAt, UUID pendingRepair, boolean isTransient, TableMetadataRef metadata,
                    MetadataCollector collector, SerializationHeader header, LifecycleTransaction txn)
         {
-            super(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, Collections.emptySet(), txn);
+            super(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, collector, header, Collections.emptySet(), txn, Collections.emptySet());
         }
 
         @Override
diff --git a/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
index e7bfb120bd8c..3ec0c6838fe8 100644
--- a/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
@@ -546,7 +546,7 @@ public void testSelectivityWithMultipleIndexes()
                              .filterOn("notbirthdate", Operator.EQ, 0L)
                              .build();
 
-        assertEquals("notbirthdate_key_index", rc.indexMetadata().name);
+        assertEquals("notbirthdate_key_index", rc.indexQueryPlan().getFirst().getIndexMetadata().name);
     }
 
     private void assertIndexedNone(ColumnFamilyStore cfs, ByteBuffer col, Object val)
diff --git a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
index 08c76bfa27f9..3baad0ae5ad0 100644
--- a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
@@ -79,7 +79,7 @@ public void testActiveCompactionTrackingRaceWithIndexBuilder() throws Throwable
         for (int ii = 0; ii < loopCount; ii++)
         {
             CountDownLatch trigger = new CountDownLatch(1);
-            SecondaryIndexBuilder builder = idx.getBuildTaskSupport().getIndexBuildTask(getCurrentColumnFamilyStore(), Collections.singleton(idx), sstables);
+            SecondaryIndexBuilder builder = idx.getBuildTaskSupport().getIndexBuildTask(getCurrentColumnFamilyStore(), Collections.singleton(idx), sstables, true);
             Future<?> f1 = es.submit(() -> {
                 Uninterruptibles.awaitUninterruptibly(trigger);
                 try
@@ -116,7 +116,7 @@ public void testSecondaryIndexTracking() throws Throwable
 
         Index idx = getCurrentColumnFamilyStore().indexManager.getIndexByName(idxName);
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
-        SecondaryIndexBuilder builder = idx.getBuildTaskSupport().getIndexBuildTask(getCurrentColumnFamilyStore(), Collections.singleton(idx), sstables);
+        SecondaryIndexBuilder builder = idx.getBuildTaskSupport().getIndexBuildTask(getCurrentColumnFamilyStore(), Collections.singleton(idx), sstables, false);
 
         MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
         CompactionManager.instance.submitIndexBuild(builder, mockActiveCompactions).get();
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
index b7b7d4ac6f06..aef525740024 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
@@ -167,7 +167,7 @@ private SSTableReader replaceSSTable(ColumnFamilyStore cfs, LifecycleTransaction
                                                            false,
                                                            0,
                                                            SerializationHeader.make(cfs.metadata(), txn.originals()),
-                                                           cfs.indexManager.listIndexes(),
+                                                           cfs.indexManager.listIndexGroups(),
                                                            txn));
                 while (ci.hasNext())
                 {
diff --git a/test/unit/org/apache/cassandra/index/CustomIndexTest.java b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
index 84a36dfc8eb0..27a946ce7688 100644
--- a/test/unit/org/apache/cassandra/index/CustomIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
@@ -24,14 +24,24 @@
 import java.util.concurrent.Callable;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Maps;
 import com.google.common.collect.Sets;
 import org.junit.Test;
 
 import com.datastax.driver.core.exceptions.QueryValidationException;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.ColumnIdentifier;
@@ -186,39 +196,39 @@ public void requireFullQualifierForFrozenCollectionTargets() throws Throwable
                     " PRIMARY KEY(k,c))");
 
         assertInvalidMessage("Cannot create keys() index on frozen column fmap. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fmap)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, keys(fmap)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create entries() index on frozen column fmap. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fmap)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, entries(fmap)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create values() index on frozen column fmap. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fmap)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, fmap) USING'%s'", StubIndex.class.getName()));
 
         assertInvalidMessage("Cannot create keys() index on frozen column flist. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(flist)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, keys(flist)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create entries() index on frozen column flist. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(flist)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, entries(flist)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create values() index on frozen column flist. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(flist)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, flist) USING'%s'", StubIndex.class.getName()));
 
         assertInvalidMessage("Cannot create keys() index on frozen column fset. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fset)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, keys(fset)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create entries() index on frozen column fset. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fset)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, entries(fset)) USING'%s'",
                                            StubIndex.class.getName()));
         assertInvalidMessage("Cannot create values() index on frozen column fset. " +
-                             "Frozen collections are immutable and must be fully indexed",
+                             "Frozen collections are immutable and must be fully indexed by using the 'full(fset)' modifier",
                              String.format("CREATE CUSTOM INDEX ON %%s(c, fset) USING'%s'", StubIndex.class.getName()));
 
         createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c, full(fmap)) USING'%s'", StubIndex.class.getName()));
@@ -386,11 +396,7 @@ public void testCustomIndexExpressionSyntax() throws Throwable
                                   String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND expr(other_custom_index, 'bar')",
                                                 indexName));
 
-        assertInvalidThrowMessage(Optional.of(ProtocolVersion.CURRENT),
-                                  StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
-                                  QueryValidationException.class,
-                                  String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND d=0", indexName));
-        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND d=0 ALLOW FILTERING", indexName)), row);
+        assertRows(execute(String.format("SELECT * FROM %%s WHERE expr(%s, 'foo') AND d=0", indexName)), row);
     }
 
     @Test
@@ -1081,11 +1087,13 @@ public SimulateConcurrentFlushingIndex(ColumnFamilyStore baseCfs, IndexMetadata
         // When we're done indexing the partition, the test checks the states of the
         // various OpOrder.Groups, which it can obtain from this index.
 
+        @Override
         public Indexer indexerFor(final DecoratedKey key,
                                   RegularAndStaticColumns columns,
                                   int nowInSec,
                                   WriteContext ctx,
-                                  IndexTransaction.Type transactionType)
+                                  IndexTransaction.Type transactionType,
+                                  Memtable memtable)
         {
             CassandraWriteContext cassandraWriteContext = (CassandraWriteContext) ctx;
             if (readOrderingAtStart == null)
@@ -1134,4 +1142,550 @@ public void removeRow(Row row) { }
             };
         }
     }
+
+
+    @Test
+    public void testFlushObserver() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, s int static, v int, PRIMARY KEY (k, c))");
+        String indexName = "test_index_with_flush_observer";
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
+                                  indexName, IndexWithFlushObserver.class.getName()));
+
+        execute("INSERT INTO %s (k, c, s, v) VALUES (?, ?, ?, ?)", 0, 0, 0, 0); // two rows in each of the partitions k=0 and k=1
+        execute("INSERT INTO %s (k, c, s, v) VALUES (?, ?, ?, ?)", 0, 1, 1, 1);
+        execute("INSERT INTO %s (k, c, s, v) VALUES (?, ?, ?, ?)", 1, 0, 2, 2);
+        execute("INSERT INTO %s (k, c, s, v) VALUES (?, ?, ?, ?)", 1, 1, 3, 3);
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        SecondaryIndexManager indexManager = cfs.indexManager;
+        IndexWithFlushObserver index = (IndexWithFlushObserver) indexManager.getIndexByName(indexName);
+
+        assertEquals(0, index.beginFlushCalls.get()); // no flush has been requested yet, so every counter is expected to be zero
+        assertEquals(0, index.flushedPartitions.get());
+        assertEquals(0, index.flushedPartitionDeletions.get());
+        assertEquals(0, index.flushedStaticRows.get());
+        assertEquals(0, index.flushedUnfiltereds.get());
+        assertEquals(0, index.completeFlushCalls.get());
+
+        cfs.forceBlockingFlush(); // first flush: the four inserted rows
+
+        assertEquals(1, index.beginFlushCalls.get());
+        assertEquals(2, index.flushedPartitions.get()); // k=0 and k=1
+        assertEquals(2, index.flushedPartitionDeletions.get()); // expected once per partition although nothing was deleted
+        assertEquals(2, index.flushedStaticRows.get());
+        assertEquals(4, index.flushedUnfiltereds.get()); // the four inserted rows
+        assertEquals(1, index.completeFlushCalls.get());
+
+        execute("DELETE FROM %s WHERE k=?", 0); // partition deletion
+        execute("DELETE FROM %s WHERE k=? AND c>=?", 1, 1); // range deletion
+        index.reset(); // zero the counters so that only the second flush is measured
+        cfs.forceBlockingFlush();
+
+        assertEquals(1, index.beginFlushCalls.get());
+        assertEquals(2, index.flushedPartitions.get());
+        assertEquals(2, index.flushedPartitionDeletions.get());
+        assertEquals(0, index.flushedStaticRows.get()); // the data flushed this time has no static values
+        assertEquals(2, index.flushedUnfiltereds.get()); // presumably the two bound markers of the range deletion on k=1 (TODO confirm)
+        assertEquals(1, index.completeFlushCalls.get());
+    }
+
+    /**
+     * A {@link StubIndex} whose {@link SSTableFlushObserver} does nothing but count the callbacks it receives; {@link #reset()} zeroes the counts.
+     */
+    public static final class IndexWithFlushObserver extends StubIndex
+    {
+
+        AtomicInteger beginFlushCalls = new AtomicInteger(); // one counter per SSTableFlushObserver callback, read directly by the tests
+        AtomicInteger flushedPartitions = new AtomicInteger();
+        AtomicInteger flushedPartitionDeletions = new AtomicInteger();
+        AtomicInteger flushedStaticRows = new AtomicInteger();
+        AtomicInteger flushedUnfiltereds = new AtomicInteger();
+        AtomicInteger completeFlushCalls = new AtomicInteger();
+
+        public IndexWithFlushObserver(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        @Override
+        public void reset() // delegates to super.reset() and then zeroes all six flush counters
+        {
+            super.reset();
+            beginFlushCalls.set(0);
+            flushedPartitions.set(0);
+            flushedPartitionDeletions.set(0);
+            flushedStaticRows.set(0);
+            flushedUnfiltereds.set(0);
+            completeFlushCalls.set(0);
+        }
+
+        @Override
+        public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker) // both arguments are ignored
+        {
+            return new SSTableFlushObserver() { // a new observer per call; all of them share this index's counters
+
+                @Override
+                public void begin()
+                {
+                    beginFlushCalls.incrementAndGet();
+                }
+
+                @Override
+                public void startPartition(DecoratedKey key, long position)
+                {
+                    flushedPartitions.incrementAndGet();
+                }
+
+                @Override
+                public void partitionLevelDeletion(DeletionTime deletionTime, long position)
+                {
+                    flushedPartitionDeletions.incrementAndGet();
+                }
+
+                @Override
+                public void staticRow(Row staticRow, long position)
+                {
+                    flushedStaticRows.incrementAndGet();
+                }
+
+                @Override
+                public void nextUnfilteredCluster(Unfiltered unfiltered, long position)
+                {
+                    flushedUnfiltereds.incrementAndGet();
+                }
+
+                @Override
+                public void complete()
+                {
+                    completeFlushCalls.incrementAndGet();
+                }
+            };
+        }
+    }
+
+    /**
+     * Verify that writes, flushes and rebuilds for indexes in the same {@link Index.Group} go through that group.
+     */
+    @Test
+    public void testGroupedWrites() throws Throwable
+    {
+        // create the schema with two indexes in the same group
+        String indexClassName = IndexWithSharedGroup.class.getName();
+        createTable("CREATE TABLE %s (k int, c int, s int static, v int, PRIMARY KEY (k,c))");
+        createIndex(String.format("CREATE CUSTOM INDEX grouped_index_c ON %%s(c) USING '%s'", indexClassName));
+        createIndex(String.format("CREATE CUSTOM INDEX grouped_index_v ON %%s(v) USING '%s'", indexClassName));
+
+        // retrieve the indexes and their shared group
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        SecondaryIndexManager indexManager = cfs.indexManager;
+        StubIndex index1 = (IndexWithSharedGroup) indexManager.getIndexByName("grouped_index_c");
+        StubIndex index2 = (IndexWithSharedGroup) indexManager.getIndexByName("grouped_index_v");
+        IndexWithSharedGroup.Group group = indexManager.listIndexGroups()
+                                                       .stream()
+                                                       .filter(g -> g instanceof IndexWithSharedGroup.Group)
+                                                       .map(g -> (IndexWithSharedGroup.Group) g)
+                                                       .findAny()
+                                                       .orElseThrow(AssertionError::new);
+
+        // verify that row insertions reach the index group and are propagated to its members
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 0, 3);
+        assertEquals(3, group.rowsInserted.get());
+        assertEquals(3, index1.rowsInserted.size());
+        assertEquals(3, index2.rowsInserted.size());
+
+        // verify that row updates reach the index group and are propagated to its members
+        execute("UPDATE %s SET v=? WHERE k=? AND c=?", 10, 0, 0); // row (k=0, c=0) was inserted above, so this counts as an update
+        execute("UPDATE %s SET v=? WHERE k=? AND c=?", 10, 1, 0); // likewise for row (k=1, c=0)
+        assertEquals(2, group.rowsUpdated.get());
+        assertEquals(2, index1.rowsUpdated.size());
+        assertEquals(2, index2.rowsUpdated.size());
+
+        // verify that partition deletions get to the index group and its members
+        ReadCommand cmd = Util.cmd(cfs, 0).build(); // presumably reads partition k=0, which holds two rows (TODO confirm)
+        try (ReadExecutionController executionController = cmd.executionController();
+             UnfilteredPartitionIterator iterator = cmd.executeLocally(executionController))
+        {
+            assertTrue(iterator.hasNext());
+            cfs.indexManager.deletePartition(iterator.next(), FBUtilities.nowInSeconds());
+        }
+        assertEquals(1, group.partitionDeletions.get());
+        assertEquals(1, index1.partitionDeletions.size());
+        assertEquals(1, index2.partitionDeletions.size());
+
+        // verify that the row deletions produced by the previous partition deletion get to the group and its members
+        assertEquals(2, group.rowsDeleted.get());
+        assertEquals(2, index1.rowsDeleted.size());
+        assertEquals(2, index2.rowsDeleted.size());
+
+        // verify that range tombstones get to the index group and its members
+        execute("DELETE FROM %s WHERE k=? AND c>?", 0, 0);
+        execute("DELETE FROM %s WHERE k=? AND c>?", 1, 1);
+        assertEquals(2, group.rangeTombstones.get());
+        assertEquals(2, index1.rangeTombstones.size());
+        assertEquals(2, index2.rangeTombstones.size());
+
+        // verify the total number of begin calls
+        assertEquals(10, group.beginCalls.get());
+        assertEquals(10, index1.beginCalls);
+        assertEquals(10, index2.beginCalls);
+
+        // verify the total number of finish calls
+        assertEquals(10, group.finishCalls.get());
+        assertEquals(10, index1.finishCalls);
+        assertEquals(10, index2.finishCalls);
+
+        // flush the previous data to get rid of it, reset the group counters and flush a new memtable
+        cfs.forceBlockingFlush();
+        group.reset(); // Group.reset() also resets every member index
+        execute("INSERT INTO %s (k, s) VALUES (?, ?)", 1, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 0, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 1, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 2, 0, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 2, 1, 0);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 2, 2, 0);
+        execute("DELETE FROM %s WHERE k=? AND c=?", 2, 3);
+        execute("DELETE FROM %s WHERE k=?", 3);
+        cfs.forceBlockingFlush();
+
+        // verify that each flush observer call reaches the group only once, even though the group has two members
+        assertEquals(1, group.beginFlushCalls.get());
+        assertEquals(3, group.flushedPartitions.get()); // k=1, k=2 and k=3
+        assertEquals(3, group.flushedPartitionDeletions.get());
+        assertEquals(3, group.flushedStaticRows.get());
+        assertEquals(6, group.flushedUnfiltereds.get()); // presumably the five inserted rows plus the row deletion at (k=2, c=3)
+        assertEquals(1, group.completeFlushCalls.get());
+
+        // verify that an index rebuild can be directed to the first index only
+        group.reset();
+        indexManager.rebuildIndexesBlocking(Collections.singleton(index1.getIndexMetadata().name));
+        assertEquals(8, group.rowsInserted.get());
+        assertEquals(8, index1.rowsInserted.size());
+        assertEquals(0, index2.rowsInserted.size());
+
+        // verify that an index rebuild can be directed to the second index only
+        group.reset();
+        indexManager.rebuildIndexesBlocking(Collections.singleton(index2.getIndexMetadata().name));
+        assertEquals(8, group.rowsInserted.get());
+        assertEquals(0, index1.rowsInserted.size());
+        assertEquals(8, index2.rowsInserted.size());
+    }
+
+    @Test
+    public void testIndexGroupsInstancesManagement() throws Throwable
+    {
+        String indexClassName = IndexWithSharedGroup.class.getName();
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int, v3 int, v4 int, v5 int)");
+        SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
+
+        // create two indexes belonging to the same group and retrieve that shared group from the manager
+        String idx1 = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", indexClassName));
+        String idx2 = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", indexClassName));
+        IndexWithSharedGroup.Group group = indexManager.listIndexGroups()
+                                                       .stream()
+                                                       .filter(g -> g instanceof IndexWithSharedGroup.Group)
+                                                       .map(g -> (IndexWithSharedGroup.Group) g)
+                                                       .findAny()
+                                                       .orElseThrow(AssertionError::new);
+
+        // verify that only one group has been added to the manager
+        assertEquals(2, indexManager.listIndexes().size());
+        assertEquals(1, indexManager.listIndexGroups().size());
+        assertEquals(2, group.indexes.size());
+
+        // create two indexes belonging to their own singleton group and verify that two groups are added to the manager
+        String idx3 = createIndex("CREATE INDEX ON %s(v3)");
+        String idx4 = createIndex("CREATE INDEX ON %s(v4)");
+        assertEquals(4, indexManager.listIndexes().size());
+        assertEquals(3, indexManager.listIndexGroups().size()); // the shared group plus one singleton group per regular index
+
+        // add another index to the shared group and verify that it is added to the existing group instance
+        String idx5 = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v5) USING '%s'", indexClassName));
+        assertEquals(5, indexManager.listIndexes().size());
+        assertEquals(3, indexManager.listIndexGroups().size());
+        assertEquals(3, group.indexes.size());
+
+        // drop one of the shared group members and verify that the manager still has the same group count
+        dropIndex("DROP INDEX %s." + idx1);
+        assertEquals(4, indexManager.listIndexes().size());
+        assertEquals(3, indexManager.listIndexGroups().size());
+        assertEquals(2, group.indexes.size());
+
+        // drop the standalone indexes and verify that their singleton groups are removed from the manager
+        dropIndex("DROP INDEX %s." + idx3);
+        dropIndex("DROP INDEX %s." + idx4);
+        assertEquals(2, indexManager.listIndexes().size());
+        assertEquals(1, indexManager.listIndexGroups().size());
+
+        // drop the remaining members of the shared group and verify that the group is kept, empty, in the manager
+        dropIndex("DROP INDEX %s." + idx2);
+        dropIndex("DROP INDEX %s." + idx5);
+        assertEquals(0, indexManager.listIndexes().size());
+        assertEquals(1, indexManager.listIndexGroups().size());
+        assertEquals(0, group.indexes.size());
+
+        // create shared group members again, reusing old index names, and verify that they join the existing group instance
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v1) USING '%s'", idx1, indexClassName));
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v2) USING '%s'", idx2, indexClassName));
+        createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v3) USING '%s'", idx3, indexClassName)); // idx3 previously named a regular index
+        assertEquals(3, indexManager.listIndexes().size());
+        assertEquals(1, indexManager.listIndexGroups().size());
+        assertEquals(3, group.indexes.size());
+    }
+
+    /**
+     * {@link StubIndex} implementation whose instances all register with the same {@link Index.Group} class.
+     * That group counts the indexer and flush observer calls it receives and forwards them to its member indexes.
+     */
+    public static final class IndexWithSharedGroup extends StubIndex
+    {
+        public IndexWithSharedGroup(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+        }
+
+        @Override
+        public boolean shouldBuildBlocking()
+        {
+            return true;
+        }
+
+        @Override
+        public void register(IndexRegistry registry)
+        {
+            registry.registerIndex(this, Group.class, Group::new); // same Group.class for every instance; presumably the key used to share one Group
+        }
+
+        private static class Group implements Index.Group
+        {
+            Map<String, IndexWithSharedGroup> indexes = Maps.newConcurrentMap(); // member indexes, keyed by index name
+
+            AtomicInteger beginCalls = new AtomicInteger(); // counters for the Index.Indexer callbacks
+            AtomicInteger finishCalls = new AtomicInteger();
+            AtomicInteger partitionDeletions = new AtomicInteger();
+            AtomicInteger rangeTombstones = new AtomicInteger();
+            AtomicInteger rowsInserted = new AtomicInteger();
+            AtomicInteger rowsDeleted = new AtomicInteger();
+            AtomicInteger rowsUpdated = new AtomicInteger();
+
+            AtomicInteger beginFlushCalls = new AtomicInteger(); // counters for the SSTableFlushObserver callbacks
+            AtomicInteger flushedPartitions = new AtomicInteger();
+            AtomicInteger flushedPartitionDeletions = new AtomicInteger();
+            AtomicInteger flushedStaticRows = new AtomicInteger();
+            AtomicInteger flushedUnfiltereds = new AtomicInteger();
+            AtomicInteger completeFlushCalls = new AtomicInteger();
+
+            public void reset() // zeroes the group's counters and resets every member index
+            {
+                beginCalls.set(0);
+                finishCalls.set(0);
+                partitionDeletions.set(0);
+                rangeTombstones.set(0);
+                rowsInserted.set(0);
+                rowsDeleted.set(0);
+                rowsUpdated.set(0);
+                beginFlushCalls.set(0);
+                flushedPartitions.set(0);
+                flushedPartitionDeletions.set(0);
+                flushedStaticRows.set(0);
+                flushedUnfiltereds.set(0);
+                completeFlushCalls.set(0);
+                indexes.values().forEach(IndexWithSharedGroup::reset);
+            }
+
+            @Override
+            public Set<Index> getIndexes()
+            {
+                return ImmutableSet.copyOf(indexes.values()); // immutable snapshot of the current members
+            }
+
+            @Override
+            public void addIndex(Index index)
+            {
+                indexes.put(index.getIndexMetadata().name, (IndexWithSharedGroup) index); // the cast fails for any other Index type
+            }
+
+            @Override
+            public void removeIndex(Index index)
+            {
+                indexes.remove(index.getIndexMetadata().name);
+            }
+
+            @Override
+            public boolean containsIndex(Index index)
+            {
+                return indexes.containsKey(index.getIndexMetadata().name); // membership is decided by index name only
+            }
+
+            @Override
+            public Index.Indexer indexerFor(Predicate<Index> indexSelector,
+                                            DecoratedKey key,
+                                            RegularAndStaticColumns columns,
+                                            int nowInSec,
+                                            WriteContext context,
+                                            IndexTransaction.Type transactionType,
+                                            Memtable memtable)
+            {
+                Set<Index.Indexer> indexers = indexes.values() // indexers of the members accepted by indexSelector, nulls dropped
+                                                     .stream()
+                                                     .filter(indexSelector)
+                                                     .map(i -> i.indexerFor(key, columns, nowInSec, context, transactionType, memtable))
+                                                     .filter(Objects::nonNull)
+                                                     .collect(Collectors.toSet());
+
+                return indexers.isEmpty() ? null : new Index.Indexer() { // null when no selected member supplied an indexer
+
+                    @Override
+                    public void begin()
+                    {
+                        beginCalls.incrementAndGet(); // every callback counts once at group level, then fans out to the member indexers
+                        indexers.forEach(Indexer::begin);
+                    }
+
+                    @Override
+                    public void partitionDelete(DeletionTime deletionTime)
+                    {
+                        partitionDeletions.incrementAndGet();
+                        indexers.forEach(indexer -> indexer.partitionDelete(deletionTime));
+                    }
+
+                    @Override
+                    public void rangeTombstone(RangeTombstone tombstone)
+                    {
+                        rangeTombstones.incrementAndGet();
+                        indexers.forEach(indexer -> indexer.rangeTombstone(tombstone));
+                    }
+
+                    @Override
+                    public void insertRow(Row row)
+                    {
+                        rowsInserted.incrementAndGet();
+                        indexers.forEach(indexer -> indexer.insertRow(row));
+                    }
+
+                    @Override
+                    public void removeRow(Row row)
+                    {
+                        rowsDeleted.incrementAndGet();
+                        indexers.forEach(indexer -> indexer.removeRow(row));
+                    }
+
+                    @Override
+                    public void updateRow(Row oldRow, Row newRow)
+                    {
+                        rowsUpdated.incrementAndGet();
+                        indexers.forEach(indexer -> indexer.updateRow(oldRow, newRow));
+                    }
+
+                    @Override
+                    public void finish()
+                    {
+                        finishCalls.incrementAndGet();
+                        indexers.forEach(Indexer::finish);
+                    }
+                };
+            }
+
+            @Override
+            public QueryPlan queryPlanFor(RowFilter rowFilter)
+            {
+                throw new UnsupportedOperationException(); // this test group does not support queries
+            }
+
+            @Override
+            public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata)
+            {
+                Set<SSTableFlushObserver> observers = indexes.values() // observers of all members (no selector here), nulls dropped
+                                                             .stream()
+                                                             .map(i -> i.getFlushObserver(descriptor, tracker))
+                                                             .filter(Objects::nonNull)
+                                                             .collect(Collectors.toSet());
+
+                return new SSTableFlushObserver() { // unlike indexerFor, an observer is returned even if no member supplied one
+
+                    @Override
+                    public void begin()
+                    {
+                        beginFlushCalls.incrementAndGet();
+                        observers.forEach(SSTableFlushObserver::begin);
+                    }
+
+                    @Override
+                    public void startPartition(DecoratedKey key, long position)
+                    {
+                        flushedPartitions.incrementAndGet();
+                        observers.forEach(o -> o.startPartition(key, position));
+                    }
+
+                    @Override
+                    public void partitionLevelDeletion(DeletionTime deletionTime, long position)
+                    {
+                        flushedPartitionDeletions.incrementAndGet();
+                        observers.forEach(o -> o.partitionLevelDeletion(deletionTime, position));
+                    }
+
+                    @Override
+                    public void staticRow(Row staticRow, long position)
+                    {
+                        flushedStaticRows.incrementAndGet();
+                        observers.forEach(o -> o.staticRow(staticRow, position));
+                    }
+
+                    @Override
+                    public void nextUnfilteredCluster(Unfiltered unfiltered, long position)
+                    {
+                        flushedUnfiltereds.incrementAndGet();
+                        observers.forEach(o -> o.nextUnfilteredCluster(unfiltered, position));
+                    }
+
+                    @Override
+                    public void complete()
+                    {
+                        completeFlushCalls.incrementAndGet();
+                        observers.forEach(SSTableFlushObserver::complete);
+                    }
+                };
+            }
+
+            @Override
+            public Set<Component> getComponents()
+            {
+                return Collections.emptySet(); // this group declares no components
+            }
+        }
+    }
+
+    @Test
+    public void testMulticolumnIndexWithBaseTable() throws Throwable
+    {
+        createTable("CREATE TABLE %s(k int PRIMARY KEY, v int)");
+        assertInvalidMessage("Indexes belonging to a group of indexes shouldn't have a backing table", // creating such an index must be rejected
+                             String.format("CREATE CUSTOM INDEX ON %%s(v) USING '%s'",
+                                           MulticolumnIndexWithBaseTable.class.getName()));
+    }
+
+    public static final class MulticolumnIndexWithBaseTable extends StubIndex // registers with a StubIndexGroup yet reports a backing table
+    {
+        private final ColumnFamilyStore baseCfs;
+
+        public MulticolumnIndexWithBaseTable(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+        {
+            super(baseCfs, metadata);
+            this.baseCfs = baseCfs; // kept only so that getBackingTable() has something to return
+        }
+
+        @Override
+        public void register(IndexRegistry registry)
+        {
+            registry.registerIndex(this, MulticolumnIndexWithBaseTable.class, StubIndexGroup::new);
+        }
+
+        @Override
+        public Optional<ColumnFamilyStore> getBackingTable()
+        {
+            return Optional.of(baseCfs); // always present: hands back the base table passed to the constructor
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java b/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
index d8fb99f40fe8..c9dde2ae694e 100644
--- a/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
+++ b/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
@@ -766,7 +766,7 @@ public IndexBuildingSupport getBuildTaskSupport()
         {
             return new CollatedViewIndexBuildingSupport()
             {
-                public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables)
+                public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index> indexes, Collection<SSTableReader> sstables, boolean isFullRebuild)
                 {
                     try
                     {
@@ -775,7 +775,7 @@ public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs, Set<Index>
                             buildWaitLatch.countDown();
                             buildLatch.await();
                         }
-                        final SecondaryIndexBuilder builder = super.getIndexBuildTask(cfs, indexes, sstables);
+                        final SecondaryIndexBuilder builder = super.getIndexBuildTask(cfs, indexes, sstables, isFullRebuild);
                         return new SecondaryIndexBuilder()
                         {
 
diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java
index 02ccbff13499..d63fe46d33da 100644
--- a/test/unit/org/apache/cassandra/index/StubIndex.java
+++ b/test/unit/org/apache/cassandra/index/StubIndex.java
@@ -100,7 +100,8 @@ public Indexer indexerFor(final DecoratedKey key,
                               RegularAndStaticColumns columns,
                               int nowInSec,
                               WriteContext ctx,
-                              IndexTransaction.Type transactionType)
+                              IndexTransaction.Type transactionType,
+                              Memtable memtable)
     {
         return new Indexer()
         {
diff --git a/test/unit/org/apache/cassandra/index/StubIndexGroup.java b/test/unit/org/apache/cassandra/index/StubIndexGroup.java
new file mode 100644
index 000000000000..8d6177853ce3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/StubIndexGroup.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.function.Predicate;
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.WriteContext;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * Basic custom index group implementation for testing.
+ */
+public class StubIndexGroup implements Index.Group
+{
+    private final Set<Index> indexes = new HashSet<>();
+
+    @Override
+    public Set<Index> getIndexes()
+    {
+        return indexes;
+    }
+
+    @Override
+    public void addIndex(Index index)
+    {
+        indexes.add(index);
+    }
+
+    @Override
+    public void removeIndex(Index index)
+    {
+        indexes.remove(index);
+    }
+
+    @Override
+    public boolean containsIndex(Index index)
+    {
+        return indexes.contains(index);
+    }
+
+    @Override
+    public Index.Indexer indexerFor(Predicate<Index> indexSelector,
+                                    DecoratedKey key,
+                                    RegularAndStaticColumns columns,
+                                    int nowInSec,
+                                    WriteContext context,
+                                    IndexTransaction.Type transactionType,
+                                    Memtable memtable)
+    {
+        return null;
+    }
+
+    @Nullable
+    @Override
+    public Index.QueryPlan queryPlanFor(RowFilter rowFilter)
+    {
+        return null;
+    }
+
+    @Override
+    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata)
+    {
+        return null;
+    }
+
+    public Set<Component> getComponents()
+    {
+        return Collections.emptySet();
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
index 51bb6bbf6a7d..115a92ad43ca 100644
--- a/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
+++ b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
@@ -289,7 +289,8 @@ public Indexer indexerFor(final DecoratedKey key,
                               final RegularAndStaticColumns columns,
                               final int nowInSec,
                               final WriteContext ctx,
-                              final IndexTransaction.Type transactionType)
+                              final IndexTransaction.Type transactionType,
+                              final Memtable memtable)
     {
         if (!isPrimaryKeyIndex() && !columns.contains(indexedColumn))
             return null;
@@ -461,7 +462,7 @@ private void insert(ByteBuffer rowKey,
                                                                cell));
         Row row = BTreeRow.noCellLiveRow(buildIndexClustering(rowKey, clustering, cell), info);
         PartitionUpdate upd = partitionUpdate(valueKey, row);
-        indexCfs.getWriteHandler().write(upd, ctx, UpdateTransaction.NO_OP);
+        indexCfs.getWriteHandler().write(upd, ctx, false);
         logger.debug("Inserted entry into index for value {}", valueKey);
     }
 
@@ -507,7 +508,7 @@ private void doDelete(DecoratedKey indexKey,
     {
         Row row = BTreeRow.emptyDeletedRow(indexClustering, Row.Deletion.regular(deletion));
         PartitionUpdate upd = partitionUpdate(indexKey, row);
-        indexCfs.getWriteHandler().write(upd, ctx, UpdateTransaction.NO_OP);
+        indexCfs.getWriteHandler().write(upd, ctx, false);
         logger.debug("Removed index entry for value {}", indexKey);
     }
 
diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
index 3396e3d86732..78e7b29905b7 100644
--- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
@@ -40,6 +40,7 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sasi.plan.SASIIndexSearcher;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadata;
@@ -71,7 +72,6 @@
 import org.apache.cassandra.index.sasi.exceptions.TimeQuotaExceededException;
 import org.apache.cassandra.index.sasi.memory.IndexMemtable;
 import org.apache.cassandra.index.sasi.plan.QueryController;
-import org.apache.cassandra.index.sasi.plan.QueryPlan;
 import org.apache.cassandra.index.sasi.utils.RangeIterator;
 import org.apache.cassandra.io.sstable.IndexSummaryManager;
 import org.apache.cassandra.io.sstable.SSTable;
@@ -1381,7 +1381,7 @@ public void testSearchTimeouts()
                                              DataRange.allData(store.metadata().partitioner));
         try
         {
-            new QueryPlan(store, command, 0).execute(ReadExecutionController.empty());
+            new SASIIndexSearcher(store, command, 0).search(ReadExecutionController.empty());
             Assert.fail();
         }
         catch (TimeQuotaExceededException e)
@@ -1398,7 +1398,7 @@ public void testSearchTimeouts()
 
         try (ReadExecutionController controller = command.executionController())
         {
-            Set<String> rows = getKeys(new QueryPlan(store, command, DatabaseDescriptor.getRangeRpcTimeout(MILLISECONDS)).execute(controller));
+            Set<String> rows = getKeys(new SASIIndexSearcher(store, command, DatabaseDescriptor.getRangeRpcTimeout(MILLISECONDS)).search(controller));
             assertRows(rows, "key1", "key2", "key3", "key4");
         }
     }
diff --git a/test/unit/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriterTest.java b/test/unit/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriterTest.java
index 97b3433a8fba..32286d0d7a29 100644
--- a/test/unit/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriterTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/disk/PerSSTableIndexWriterTest.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.rows.BTreeRow;
 import org.apache.cassandra.db.rows.BufferCell;
@@ -84,7 +85,7 @@ public void testPartialIndexWrites() throws Exception
 
         File directory = cfs.getDirectories().getDirectoryForNewSSTables();
         Descriptor descriptor = cfs.newSSTableDescriptor(directory);
-        PerSSTableIndexWriter indexWriter = (PerSSTableIndexWriter) sasi.getFlushObserver(descriptor, OperationType.FLUSH);
+        PerSSTableIndexWriter indexWriter = (PerSSTableIndexWriter) sasi.getFlushObserver(descriptor, LifecycleTransaction.offline(OperationType.FLUSH));
 
         SortedMap<DecoratedKey, Row> expectedKeys = new TreeMap<>(DecoratedKey.comparator);
 
@@ -176,7 +177,7 @@ public void testSparse() throws Exception
 
         File directory = cfs.getDirectories().getDirectoryForNewSSTables();
         Descriptor descriptor = cfs.newSSTableDescriptor(directory);
-        PerSSTableIndexWriter indexWriter = (PerSSTableIndexWriter) sasi.getFlushObserver(descriptor, OperationType.FLUSH);
+        PerSSTableIndexWriter indexWriter = (PerSSTableIndexWriter) sasi.getFlushObserver(descriptor, LifecycleTransaction.offline(OperationType.FLUSH));
 
         final long now = System.currentTimeMillis();
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java
index 962e1a15a809..9781bcacc527 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTestBase.java
@@ -166,7 +166,7 @@ public static void validateCFS(ColumnFamilyStore cfs)
     public static SSTableWriter getWriter(ColumnFamilyStore cfs, File directory, LifecycleTransaction txn, long repairedAt, UUID pendingRepair, boolean isTransient)
     {
         Descriptor desc = cfs.newSSTableDescriptor(directory);
-        return SSTableWriter.create(desc, 0, repairedAt, pendingRepair, isTransient, new SerializationHeader(true, cfs.metadata(), cfs.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS), cfs.indexManager.listIndexes(), txn);
+        return SSTableWriter.create(desc, 0, repairedAt, pendingRepair, isTransient, new SerializationHeader(true, cfs.metadata(), cfs.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS), cfs.indexManager.listIndexGroups(), txn);
     }
 
     public static SSTableWriter getWriter(ColumnFamilyStore cfs, File directory, LifecycleTransaction txn)
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java b/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
index 5f1920639b14..8983433345fb 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
@@ -20,14 +20,20 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
 
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Clustering;
 import org.apache.cassandra.db.DecoratedKey;
@@ -42,96 +48,270 @@
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.schema.TableMetadataRef;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.Pair;
-
-import com.google.common.collect.ArrayListMultimap;
-import com.google.common.collect.Multimap;
-
-import org.junit.Assert;
+import org.apache.cassandra.utils.FBUtilities;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import static org.apache.cassandra.db.rows.RangeTombstoneBoundMarker.exclusiveClose;
+import static org.apache.cassandra.db.rows.RangeTombstoneBoundMarker.exclusiveOpen;
+import static org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker.exclusiveCloseInclusiveOpen;
+import static org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker.inclusiveCloseExclusiveOpen;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
 public class SSTableFlushObserverTest
 {
     @BeforeClass
     public static void initDD()
     {
         DatabaseDescriptor.daemonInitialization();
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
         CommitLog.instance.start();
     }
 
     private static final String KS_NAME = "test";
-    private static final String CF_NAME = "flush_observer";
 
+    private final long now = System.currentTimeMillis();
+    private final int nowInSec = FBUtilities.nowInSeconds();
+
+    /**
+     * Test {@link SSTableFlushObserver} when the schema doesn't have a clustering key.
+     */
+    @Test
+    public void testWithEmptyClustering()
+    {
+        TableMetadata metadata = TableMetadata.builder(KS_NAME, "flush_observer")
+                                              .addPartitionKeyColumn("id", UTF8Type.instance)
+                                              .addRegularColumn("age", Int32Type.instance)
+                                              .addRegularColumn("height", LongType.instance)
+                                              .addRegularColumn("name", UTF8Type.instance)
+                                              .build();
+
+        Map<PartitionHeader, List<Unfiltered>> partitions = new LinkedHashMap<>(); // insertion-ordered: entries are appended to the writer in this order
+
+        partitions.put(header(decompose("key1"), new DeletionTime(1, 10), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(Clustering.EMPTY,
+                                       liveCell(metadata, "age", decompose(27)),
+                                       liveCell(metadata, "height", decompose(183L)),
+                                       liveCell(metadata, "name", decompose("jack")))));
+
+        partitions.put(header(decompose("key3"), new DeletionTime(2, 20), Rows.EMPTY_STATIC_ROW), // NOTE(review): key3 before key2 - presumably token order under the Murmur3Partitioner set in initDD(); confirm
+                       unfiltereds(row(Clustering.EMPTY,
+                                       liveCell(metadata, "age", decompose(30)),
+                                       liveCell(metadata, "height", decompose(178L)),
+                                       liveCell(metadata, "name", decompose("ken")))));
+
+        partitions.put(header(decompose("key2"), new DeletionTime(3, 30), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(Clustering.EMPTY,
+                                       liveCell(metadata, "age", decompose(30)),
+                                       liveCell(metadata, "height", decompose(180L)),
+                                       liveCell(metadata, "name", decompose("jim")))));
+
+        testFlushObserver(metadata, partitions); // runs the scenario once per sstable format
+    }
+
+    /**
+     * Test {@link SSTableFlushObserver} when the schema has a clustering key.
+     */
+    @Test
+    public void testWithNotEmptyClustering()
+    {
+        TableMetadata metadata = TableMetadata.builder(KS_NAME, "flush_observer_clustering")
+                                              .addPartitionKeyColumn("id", UTF8Type.instance)
+                                              .addClusteringColumn("name", UTF8Type.instance)
+                                              .addRegularColumn("age", Int32Type.instance)
+                                              .addRegularColumn("height", LongType.instance)
+                                              .build();
+
+        Map<PartitionHeader, List<Unfiltered>> partitions = new LinkedHashMap<>(); // insertion-ordered: entries are appended to the writer in this order
+
+        partitions.put(header(decompose("key1"), new DeletionTime(1, 10), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(clustering(decompose("kim")),
+                                       liveCell(metadata, "age", decompose(27)),
+                                       liveCell(metadata, "height", decompose(183L))),
+                                   row(clustering(decompose("jim")), // NOTE(review): "jim" after "kim" is not UTF8 clustering order - confirm this is intended
+                                       liveCell(metadata, "age", decompose(27)),
+                                       liveCell(metadata, "height", decompose(183L))),
+                                   row(clustering(decompose("tim")),
+                                       liveCell(metadata, "age", decompose(54)),
+                                       liveCell(metadata, "height", decompose(181L)))));
+
+        partitions.put(header(decompose("key2"), new DeletionTime(2, 20), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(clustering(decompose("kim")),
+                                       liveCell(metadata, "age", decompose(36)),
+                                       liveCell(metadata, "height", decompose(172L))),
+                                   row(clustering(decompose("jim")), // NOTE(review): same ordering remark as for key1
+                                       liveCell(metadata, "age", decompose(30)),
+                                       liveCell(metadata, "height", decompose(178L))),
+                                   row(clustering(decompose("tom")),
+                                       liveCell(metadata, "age", decompose(22)),
+                                       liveCell(metadata, "height", decompose(164L)))));
+
+        testFlushObserver(metadata, partitions); // runs the scenario once per sstable format
+    }
+
+    /**
+     * Test {@link SSTableFlushObserver} when the schema has static rows.
+     */
+    @Test
+    public void testWithStaticRow()
+    {
+        TableMetadata metadata = TableMetadata.builder(KS_NAME, "flush_observer_static")
+                                              .addPartitionKeyColumn("id", UTF8Type.instance)
+                                              .addClusteringColumn("name", UTF8Type.instance)
+                                              .addStaticColumn("static_1", UTF8Type.instance)
+                                              .addStaticColumn("static_2", UTF8Type.instance)
+                                              .addRegularColumn("age", Int32Type.instance)
+                                              .addRegularColumn("height", LongType.instance)
+                                              .build();
+
+        Map<PartitionHeader, List<Unfiltered>> partitions = new LinkedHashMap<>(); // insertion-ordered: entries are appended to the writer in this order
+
+        partitions.put(header(decompose("key0"), new DeletionTime(1, 10),
+                              staticRow(liveCell(metadata, "static_1", decompose("static_1_0")))), // only static_1 set
+                       unfiltereds()); // no regular rows in this partition
+
+        partitions.put(header(decompose("key1"), new DeletionTime(2, 20),
+                              staticRow(liveCell(metadata, "static_2", decompose("static_2_1")))), // only static_2 set
+                       unfiltereds()); // no regular rows in this partition
+
+        partitions.put(header(decompose("key4"), new DeletionTime(3, 30),
+                              staticRow(liveCell(metadata, "static_1", decompose("static_1_4")),
+                                        liveCell(metadata, "static_2", decompose("static_2_4")))), // both static columns set
+                       unfiltereds()); // no regular rows in this partition
+
+        partitions.put(header(decompose("key3"), new DeletionTime(4, 40), staticRow()), // static row built from no cells, regular rows only
+                       unfiltereds(row(clustering(decompose("bob")),
+                                       liveCell(metadata, "age", decompose(36)),
+                                       liveCell(metadata, "height", decompose(172L))),
+                                   row(clustering(decompose("ron")),
+                                       liveCell(metadata, "age", decompose(41)),
+                                       liveCell(metadata, "height", decompose(183L)))));
+
+        partitions.put(header(decompose("key2"), new DeletionTime(5, 50), // both static cells and regular rows
+                              staticRow(liveCell(metadata, "static_1", decompose("static_1_2")),
+                                        liveCell(metadata, "static_2", decompose("static_2_2")))),
+                       unfiltereds(row(clustering(decompose("kim")),
+                                       liveCell(metadata, "age", decompose(27)),
+                                       liveCell(metadata, "height", decompose(183L))),
+                                   row(clustering(decompose("tim")),
+                                       liveCell(metadata, "age", decompose(24)),
+                                       liveCell(metadata, "height", decompose(165L)))));
+
+        testFlushObserver(metadata, partitions); // runs the scenario once per sstable format
+    }
+
+    /**
+     * Test {@link SSTableFlushObserver} with tombstones.
+     */
     @Test
-    public void testFlushObserver()
+    public void testWithTombstones()
     {
-        TableMetadata cfm =
-            TableMetadata.builder(KS_NAME, CF_NAME)
-                         .addPartitionKeyColumn("id", UTF8Type.instance)
-                         .addRegularColumn("first_name", UTF8Type.instance)
-                         .addRegularColumn("age", Int32Type.instance)
-                         .addRegularColumn("height", LongType.instance)
-                         .build();
+        TableMetadata metadata = TableMetadata.builder(KS_NAME, "flush_observer_tombstones")
+                                              .addPartitionKeyColumn("id", UTF8Type.instance)
+                                              .addRegularColumn("age", Int32Type.instance)
+                                              .addRegularColumn("height", LongType.instance)
+                                              .addRegularColumn("name", UTF8Type.instance)
+                                              .build();
+
+        Map<PartitionHeader, List<Unfiltered>> partitions = new LinkedHashMap<>(); // insertion-ordered: entries are appended to the writer in this order
+
+        partitions.put(header(decompose("key1"), new DeletionTime(1, 10), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(Clustering.EMPTY,
+                                       tombstone(metadata, "age"), // first column deleted, the other two live
+                                       liveCell(metadata, "height", decompose(183L)),
+                                       liveCell(metadata, "name", decompose("jack")))));
+
+        partitions.put(header(decompose("key3"), new DeletionTime(2, 20), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(Clustering.EMPTY,
+                                       liveCell(metadata, "age", decompose(30)),
+                                       liveCell(metadata, "height", decompose(178L)),
+                                       tombstone(metadata, "name")))); // last column deleted, the other two live
+
+        partitions.put(header(decompose("key2"), new DeletionTime(3, 30), Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(row(Clustering.EMPTY,
+                                       tombstone(metadata, "age"), // all three regular columns go through tombstone(...)
+                                       tombstone(metadata, "height"),
+                                       tombstone(metadata, "name"))));
+
+        testFlushObserver(metadata, partitions); // runs the scenario once per sstable format
+    }
 
+    /**
+     * Test {@link SSTableFlushObserver} with {@link RangeTombstoneMarker}.
+     */
+    @Test
+    public void testWithRangeTombstoneMarkers()
+    {
+        TableMetadata metadata = TableMetadata.builder(KS_NAME, "flush_observer_range_tombstone_markers")
+                                              .addPartitionKeyColumn("id", UTF8Type.instance)
+                                              .addClusteringColumn("name", UTF8Type.instance)
+                                              .addRegularColumn("age", Int32Type.instance)
+                                              .build();
+
+        DeletionTime dt = new DeletionTime(now, nowInSec); // one deletion time shared by both partition headers and every marker
+        Map<PartitionHeader, List<Unfiltered>> partitions = new LinkedHashMap<>(); // insertion-ordered: entries are appended to the writer in this order
+
+        partitions.put(header(decompose("key1"), dt, Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(exclusiveOpen(false, Clustering.make(decompose("alice")), dt), // NOTE(review): the boolean is presumably the "reversed" flag - confirm
+                                   exclusiveClose(false, Clustering.make(decompose("bob")), dt),
+                                   exclusiveCloseInclusiveOpen(false, Clustering.make(decompose("carol")), dt, dt),
+                                   inclusiveCloseExclusiveOpen(false, Clustering.make(decompose("dan")), dt, dt)));
+
+        partitions.put(header(decompose("key2"), dt, Rows.EMPTY_STATIC_ROW),
+                       unfiltereds(exclusiveOpen(true, Clustering.make(decompose("alice")), dt), // same four markers as key1 with the boolean flipped
+                                   exclusiveClose(true, Clustering.make(decompose("bob")), dt),
+                                   exclusiveCloseInclusiveOpen(true, Clustering.make(decompose("carol")), dt, dt),
+                                   inclusiveCloseExclusiveOpen(true, Clustering.make(decompose("dan")), dt, dt)));
+
+        testFlushObserver(metadata, partitions); // runs the scenario once per sstable format
+    }
+
+    private static void testFlushObserver(TableMetadata metadata, Map<PartitionHeader, List<Unfiltered>> partitions) // fan-out entry point used by every test above
+    {
+        for (SSTableFormat.Type type : SSTableFormat.Type.values()) // every declared format gets the same table and partitions
+            testFlushObserver(type, metadata, partitions);
+    }
+
+    private static void testFlushObserver(SSTableFormat.Type type,
+                                          TableMetadata metadata,
+                                          Map<PartitionHeader, List<Unfiltered>> partitions)
+    {
         LifecycleTransaction transaction = LifecycleTransaction.offline(OperationType.COMPACTION);
         FlushObserver observer = new FlushObserver();
 
         String sstableDirectory = DatabaseDescriptor.getAllDataFileLocations()[0];
-        File directory = new File(sstableDirectory + File.pathSeparator + KS_NAME + File.pathSeparator + CF_NAME);
+        File directory = new File(sstableDirectory, metadata.keyspace + File.separator + metadata.name);
         directory.deleteOnExit();
 
         if (!directory.exists() && !directory.mkdirs())
-            throw new FSWriteError(new IOException("failed to create tmp directory"), directory.getAbsolutePath());
-
-        SSTableFormat.Type sstableFormat = SSTableFormat.Type.current();
-
-        BigTableWriter writer = new BigTableWriter(new Descriptor(sstableFormat.info.getLatestVersion(),
-                                                                  directory,
-                                                                  KS_NAME, CF_NAME,
-                                                                  0,
-                                                                  sstableFormat),
-                                                   10L, 0L, null, false, TableMetadataRef.forOfflineTools(cfm),
-                                                   new MetadataCollector(cfm.comparator).sstableLevel(0),
-                                                   new SerializationHeader(true, cfm, cfm.regularAndStaticColumns(), EncodingStats.NO_STATS),
-                                                   Collections.singletonList(observer),
-                                                   transaction);
-
-        SSTableReader reader = null;
-        Multimap<ByteBuffer, Cell<?>> expected = ArrayListMultimap.create();
-
+            throw new FSWriteError(new IOException("failed to create tmp directory"), directory);
+
+        SSTableFormat format = type.info;
+        Descriptor descriptor = new Descriptor(format.getLatestVersion(),
+                                               directory,
+                                               metadata.keyspace,
+                                               metadata.name,
+                                               0,
+                                               type);
+
+        SSTableWriter writer = format.getWriterFactory()
+                                     .open(descriptor,
+                                           10L, 0L, null, false, TableMetadataRef.forOfflineTools(metadata),
+                                           new MetadataCollector(metadata.comparator).sstableLevel(0),
+                                           new SerializationHeader(true, metadata, metadata.regularAndStaticColumns(), EncodingStats.NO_STATS),
+                                           Collections.singletonList(observer),
+                                           transaction,
+                                           Collections.emptySet());
+
+        SSTableReader reader;
         try
         {
-            final long now = System.currentTimeMillis();
-
-            ByteBuffer key = UTF8Type.instance.fromString("key1");
-            expected.putAll(key, Arrays.asList(BufferCell.live(getColumn(cfm, "age"), now, Int32Type.instance.decompose(27)),
-                                               BufferCell.live(getColumn(cfm, "first_name"), now,UTF8Type.instance.fromString("jack")),
-                                               BufferCell.live(getColumn(cfm, "height"), now, LongType.instance.decompose(183L))));
-
-            writer.append(new RowIterator(cfm, key.duplicate(), Collections.singletonList(buildRow(expected.get(key)))));
-
-            key = UTF8Type.instance.fromString("key2");
-            expected.putAll(key, Arrays.asList(BufferCell.live(getColumn(cfm, "age"), now, Int32Type.instance.decompose(30)),
-                                               BufferCell.live(getColumn(cfm, "first_name"), now,UTF8Type.instance.fromString("jim")),
-                                               BufferCell.live(getColumn(cfm, "height"), now, LongType.instance.decompose(180L))));
-
-            writer.append(new RowIterator(cfm, key, Collections.singletonList(buildRow(expected.get(key)))));
-
-            key = UTF8Type.instance.fromString("key3");
-            expected.putAll(key, Arrays.asList(BufferCell.live(getColumn(cfm, "age"), now, Int32Type.instance.decompose(30)),
-                                               BufferCell.live(getColumn(cfm, "first_name"), now,UTF8Type.instance.fromString("ken")),
-                                               BufferCell.live(getColumn(cfm, "height"), now, LongType.instance.decompose(178L))));
-
-            writer.append(new RowIterator(cfm, key, Collections.singletonList(buildRow(expected.get(key)))));
-
+            partitions.forEach((key, rows) -> writer.append(new RowIterator(metadata, key, rows)));
             reader = writer.finish(true);
         }
         finally
@@ -139,92 +319,236 @@ public void testFlushObserver()
             FileUtils.closeQuietly(writer);
         }
 
-        Assert.assertTrue(observer.isComplete);
-        Assert.assertEquals(expected.size(), observer.rows.size());
+        assertTrue(observer.isComplete);
+        assertEquals(partitions.size(), observer.headers.size());
+        assertEquals(partitions.values().stream().mapToInt(List::size).sum(), observer.unfiltereds.size());
 
-        for (Pair<ByteBuffer, Long> e : observer.rows.keySet())
-        {
-            ByteBuffer key = e.left;
-            Long indexPosition = e.right;
+        ColumnFilter columnFilter = ColumnFilter.all(metadata);
 
-            try (FileDataInput index = reader.ifile.createReader(indexPosition))
+        try
+        {
+            for (FlushObserver.HeaderEntry e : observer.headers)
             {
-                ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(index);
-                Assert.assertEquals(0, UTF8Type.instance.compare(key, indexKey));
+                assertEquals(e.key, reader.keyAt(e.keyPosition));
+                assertEquals(e.deletionTime, reader.partitionLevelDeletionAt(e.deletionTimePosition));
+                assertEquals(e.staticRow, reader.staticRowAt(e.staticRowPosition, columnFilter));
             }
-            catch (IOException ex)
+
+            for (FlushObserver.UnfilteredEntry e : observer.unfiltereds)
             {
-                throw new FSReadError(ex, reader.getIndexFilename());
+                assertEquals(e.key, reader.keyAt(e.keyPosition));
+                assertEquals(e.unfiltered.clustering(), reader.clusteringAt(e.unfilteredPosition));
+                assertEquals(e.unfiltered, reader.unfilteredAt(e.unfilteredPosition, columnFilter));
             }
-
-            Assert.assertEquals(expected.get(key), observer.rows.get(e));
+        }
+        catch (IOException ex)
+        {
+            throw new FSReadError(ex, reader.getFilename());
         }
     }
 
     private static class RowIterator extends AbstractUnfilteredRowIterator
     {
-        private final Iterator<Unfiltered> rows;
+        private final Iterator<Unfiltered> unfiltereds;
 
-        public RowIterator(TableMetadata cfm, ByteBuffer key, Collection<Unfiltered> content)
+        private RowIterator(TableMetadata metadata, PartitionHeader header, Collection<Unfiltered> unfiltereds)
         {
-            super(cfm,
-                  DatabaseDescriptor.getPartitioner().decorateKey(key),
-                  DeletionTime.LIVE,
-                  cfm.regularAndStaticColumns(),
-                  BTreeRow.emptyRow(Clustering.STATIC_CLUSTERING),
+            super(metadata,
+                  header.key,
+                  header.deletionTime,
+                  metadata.regularAndStaticColumns(),
+                  header.staticRow,
                   false,
                   EncodingStats.NO_STATS);
-
-            rows = content.iterator();
+            this.unfiltereds = unfiltereds.iterator();
         }
 
         @Override
         protected Unfiltered computeNext()
         {
-            return rows.hasNext() ? rows.next() : endOfData();
+            return unfiltereds.hasNext() ? unfiltereds.next() : endOfData();
         }
     }
 
     private static class FlushObserver implements SSTableFlushObserver
     {
-        private final Multimap<Pair<ByteBuffer, Long>, Cell<?>> rows = ArrayListMultimap.create();
-        private Pair<ByteBuffer, Long> currentKey;
+        private final List<HeaderEntry> headers = new ArrayList<>();
+        private final List<UnfilteredEntry> unfiltereds = new ArrayList<>();
+
+        private boolean pendingHeader;
+        private DecoratedKey currentKey;
+        private long currentKeyPosition;
+        private DeletionTime currentDeletionTime;
+        private long currentDeletionTimePosition;
+        private Row currentStaticRow = Rows.EMPTY_STATIC_ROW;
+        private long currentStaticRowPosition;
         private boolean isComplete;
 
         @Override
         public void begin()
-        {}
+        {
+            pendingHeader = false;
+        }
+
+        @Override // records the new partition's key/position; flushes the previous partition's header first
+        public void startPartition(DecoratedKey key, long position)
+        {
+            // HeaderEntry() reads currentKey/currentKeyPosition, so snapshot the pending partition before overwriting them.
+            if (pendingHeader)
+                headers.add(new HeaderEntry());
+            currentKey = key;
+            currentKeyPosition = position;
+            pendingHeader = true; // emitted by the next startPartition() or by complete()
+        }
+
+        @Override
+        public void partitionLevelDeletion(DeletionTime deletionTime, long position)
+        {
+            currentDeletionTime = deletionTime; // held until a HeaderEntry is created and copies it
+            currentDeletionTimePosition = position; // the test later passes it to reader.partitionLevelDeletionAt(...)
+        }
 
         @Override
-        public void startPartition(DecoratedKey key, long indexPosition)
+        public void staticRow(Row staticRow, long position)
         {
-            currentKey = Pair.create(key.getKey(), indexPosition);
+            currentStaticRow = staticRow;
+            currentStaticRowPosition = position;
         }
 
         @Override
-        public void nextUnfilteredCluster(Unfiltered row)
+        public void nextUnfilteredCluster(Unfiltered unfiltered, long position)
         {
-            if (row.isRow())
-                ((Row) row).forEach((c) -> rows.put(currentKey, (Cell<?>) c));
+            unfiltereds.add(new UnfilteredEntry(unfiltered, position));
         }
 
         @Override
         public void complete()
         {
             isComplete = true;
+            if (pendingHeader)
+                headers.add(new HeaderEntry());
+        }
+
+        class HeaderEntry // per-partition record: key, partition deletion and static row, each with its reported position
+        {
+            final DecoratedKey key;
+            final long keyPosition;
+            final DeletionTime deletionTime;
+            final long deletionTimePosition;
+            final Row staticRow;
+            final long staticRowPosition;
+            // Inner (non-static) class: copies the enclosing FlushObserver's current* fields as they are at construction.
+            HeaderEntry()
+            {
+                this.key = currentKey;
+                this.keyPosition = currentKeyPosition;
+                this.deletionTime = currentDeletionTime;
+                this.deletionTimePosition = currentDeletionTimePosition;
+                this.staticRow = currentStaticRow;
+                this.staticRowPosition = currentStaticRowPosition;
+            }
+        }
+
+        class UnfilteredEntry // one observed unfiltered plus the key state of the partition being written
+        {
+            final DecoratedKey key;
+            final long keyPosition;
+            final long unfilteredPosition;
+            final Unfiltered unfiltered;
+            // Pairs the unfiltered with the enclosing observer's currentKey/currentKeyPosition at construction time.
+            UnfilteredEntry(Unfiltered unfiltered, long unfilteredPosition)
+            {
+                assertTrue(!unfiltered.isRow() || !((Row) unfiltered).isStatic()); // fails the test if a static row arrives here
+                this.key = currentKey;
+                this.keyPosition = currentKeyPosition;
+                this.unfilteredPosition = unfilteredPosition;
+                this.unfiltered = unfiltered;
+            }
+        }
+    }
+
+    private static class PartitionHeader // key + partition deletion + static row; consumed by RowIterator's constructor
+    {
+        final DecoratedKey key;
+        final DeletionTime deletionTime;
+        final Row staticRow;
+
+        PartitionHeader(DecoratedKey key, DeletionTime deletionTime, Row staticRow)
+        {
+            this.key = key;
+            this.deletionTime = deletionTime;
+            this.staticRow = staticRow;
+        }
+
+        @Override // value equality over all three fields, null-safe via Objects.equals
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            PartitionHeader that = (PartitionHeader) o;
+            return Objects.equals(key, that.key) &&
+                   Objects.equals(deletionTime, that.deletionTime) &&
+                   Objects.equals(staticRow, that.staticRow);
+        }
+
+        @Override // hashes the same three fields that equals() compares
+        public int hashCode()
+        {
+            return Objects.hash(key, deletionTime, staticRow);
         }
     }
 
-    private static Row buildRow(Collection<Cell<?>> cells)
+    private PartitionHeader header(ByteBuffer key, DeletionTime deletionTime, Row staticRow)
+    {
+        return new PartitionHeader(DatabaseDescriptor.getPartitioner().decorateKey(key), deletionTime, staticRow); // raw key is decorated with the configured partitioner
+    }
+
+    private BufferCell liveCell(TableMetadata metadata, String name, ByteBuffer value)
+    {
+        return BufferCell.live(metadata.getColumn(decompose(name)), now, value); // column resolved by its UTF-8 encoded name; 'now' is declared outside this view
+    }
+
+    private BufferCell tombstone(TableMetadata metadata, String name)
+    {
+        return BufferCell.tombstone(metadata.getColumn(decompose(name)), now, nowInSec); // column resolved by its UTF-8 encoded name; 'now'/'nowInSec' are declared outside this view
+    }
+
+    private static Clustering clustering(ByteBuffer... values)
+    {
+        return Clustering.make(values); // shorthand so test data can be written as clustering(a, b, ...)
+    }
+
+    private static Row row(Clustering clustering, Cell... cells)
     {
         Row.Builder rowBuilder = BTreeRow.sortedBuilder();
-        rowBuilder.newRow(Clustering.EMPTY);
-        cells.forEach(rowBuilder::addCell);
+        rowBuilder.newRow(clustering);
+        for (Cell cell : cells)
+            rowBuilder.addCell(cell);
         return rowBuilder.build();
     }
 
-    private static ColumnMetadata getColumn(TableMetadata cfm, String name)
+    private static Row staticRow(Cell... cells)
+    {
+        return row(Clustering.STATIC_CLUSTERING, cells); // same as row(...) but with the static clustering
+    }
+
+    private static List<Unfiltered> unfiltereds(Unfiltered... unfiltereds)
+    {
+        return Arrays.asList(unfiltereds); // fixed-size list view backed by the varargs array
+    }
+
+    private static ByteBuffer decompose(String s)
+    {
+        return UTF8Type.instance.decompose(s); // String overload: serialized through UTF8Type
+    }
+
+    private static ByteBuffer decompose(int s)
+    {
+        return Int32Type.instance.decompose(s); // int overload: serialized through Int32Type
+    }
+
+    private static ByteBuffer decompose(long s)
     {
-        return cfm.getColumn(UTF8Type.instance.fromString(name));
+        return LongType.instance.decompose(s);
     }
 }
diff --git a/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java b/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java
index d5f62d7530bf..903b2f3a4e16 100644
--- a/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java
+++ b/test/unit/org/apache/cassandra/locator/AssureSufficientLiveNodesTest.java
@@ -150,7 +150,7 @@ public void addDatacenterShouldNotCausesUnavailableWithEachQuorumTest() throws T
             // alter to
             KeyspaceParams.nts(DC1, 3, DC2, 3),
             // test
-            keyspace -> ReplicaPlans.forRead(keyspace, tk, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
+            keyspace -> ReplicaPlans.forRead(keyspace, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
         );
     }
 
@@ -183,7 +183,7 @@ public void addDatacenterShouldNotCausesUnavailableWithQuorumTest() throws Throw
             // alter to
             KeyspaceParams.nts(DC1, 3, DC2, 3),
             // test
-            keyspace -> ReplicaPlans.forRead(keyspace, tk, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
+            keyspace -> ReplicaPlans.forRead(keyspace, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
         );
         raceOfReplicationStrategyTest(
             // init. The # of live endpoints is 3 = 2 + 1
@@ -191,7 +191,7 @@ public void addDatacenterShouldNotCausesUnavailableWithQuorumTest() throws Throw
             // alter to. (3 + 3) / 2 + 1 > 3
             KeyspaceParams.nts(DC1, 2, DC2, 1, DC3, 3),
             // test
-            keyspace -> ReplicaPlans.forRead(keyspace, tk, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
+            keyspace -> ReplicaPlans.forRead(keyspace, tk, null, QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
         );
     }
 
@@ -215,7 +215,7 @@ public void raceOnRemoveDatacenterNotCausesUnavailable() throws Throwable
             // alter to
             KeyspaceParams.nts(DC1, 3),
             // test
-            keyspace -> ReplicaPlans.forRead(keyspace, tk, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
+            keyspace -> ReplicaPlans.forRead(keyspace, tk, null, EACH_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
         );
     }
 
@@ -239,7 +239,7 @@ public void increaseReplicationFactorShouldNotCausesUnavailableTest() throws Thr
             // alter to
             KeyspaceParams.nts(DC1, 3),
             // test
-            keyspace -> ReplicaPlans.forRead(keyspace, tk, LOCAL_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
+            keyspace -> ReplicaPlans.forRead(keyspace, tk, null, LOCAL_QUORUM, NeverSpeculativeRetryPolicy.INSTANCE)
         );
     }
 
diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
index d82a503c4f8e..348f9adca975 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
@@ -69,7 +69,7 @@ public void testRangeCountWithRangeMerge()
         for (int i = 0; i + 1 < tokens.size(); i++)
         {
             Range<PartitionPosition> range = Range.makeRowRange(tokens.get(i), tokens.get(i + 1));
-            ranges.add(ReplicaPlans.forRangeRead(keyspace, ConsistencyLevel.ONE, range, 1));
+            ranges.add(ReplicaPlans.forRangeRead(keyspace, null, ConsistencyLevel.ONE, range, 1));
             vnodeCount++;
         }
 
@@ -163,7 +163,7 @@ private static CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlanIterator(A
                                                                                    Keyspace keyspace,
                                                                                    boolean withRangeMerger)
     {
-        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = new ReplicaPlanIterator(keyRange, keyspace, ConsistencyLevel.ONE);
+        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = new ReplicaPlanIterator(keyRange, null, keyspace, ConsistencyLevel.ONE);
         if (withRangeMerger)
             replicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, ConsistencyLevel.ONE);
 
diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
index 294be2a717c0..b3ac4a2507c9 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
@@ -74,7 +74,7 @@ public void tesConcurrencyFactor()
         // verify that a low concurrency factor is not capped by the max concurrency factor
         PartitionRangeReadCommand command = command(cfs, 50, 50);
         try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, System.nanoTime());
-             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), keyspace, ONE))
+             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE))
         {
             assertEquals(2, partitions.concurrencyFactor());
             assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor());
@@ -84,7 +84,7 @@ public void tesConcurrencyFactor()
         // verify that a high concurrency factor is capped by the max concurrency factor
         command = command(cfs, 1000, 50);
         try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, System.nanoTime());
-             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), keyspace, ONE))
+             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE))
         {
             assertEquals(MAX_CONCURRENCY_FACTOR, partitions.concurrencyFactor());
             assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor());
@@ -94,7 +94,7 @@ public void tesConcurrencyFactor()
         // with 0 estimated results per range the concurrency factor should be 1
         command = command(cfs, 1000, 0);
         try (RangeCommandIterator partitions = RangeCommands.rangeCommandIterator(command, ONE, System.nanoTime());
-             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), keyspace, ONE))
+             ReplicaPlanIterator ranges = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, ONE))
         {
             assertEquals(1, partitions.concurrencyFactor());
             assertEquals(MAX_CONCURRENCY_FACTOR, partitions.maxConcurrencyFactor());
diff --git a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java
index 745ad4e6c8f7..84f3a5e2e750 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanIteratorTest.java
@@ -163,7 +163,7 @@ private final void testRanges(AbstractBounds<PartitionPosition> queryRange, Abst
     @SafeVarargs
     private final void testRanges(Keyspace keyspace, AbstractBounds<PartitionPosition> queryRange, AbstractBounds<PartitionPosition>... expected)
     {
-        try (ReplicaPlanIterator iterator = new ReplicaPlanIterator(queryRange, keyspace, ConsistencyLevel.ANY))
+        try (ReplicaPlanIterator iterator = new ReplicaPlanIterator(queryRange, null, keyspace, ConsistencyLevel.ANY))
         {
             List<AbstractBounds<PartitionPosition>> restrictedRanges = new ArrayList<>(expected.length);
             while (iterator.hasNext())
diff --git a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java
index 721551d71dc4..b577678c3ba0 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/ReplicaPlanMergerTest.java
@@ -392,7 +392,7 @@ private final void testRanges(ConsistencyLevel consistencyLevel,
                                   AbstractBounds<PartitionPosition> queryRange,
                                   AbstractBounds<PartitionPosition>... expected)
     {
-        try (ReplicaPlanIterator originals = new ReplicaPlanIterator(queryRange, keyspace, ANY); // ANY avoids endpoint erros
+        try (ReplicaPlanIterator originals = new ReplicaPlanIterator(queryRange, null, keyspace, ANY); // ANY avoids endpoint erros
              ReplicaPlanMerger merger = new ReplicaPlanMerger(originals, keyspace, consistencyLevel))
         {
             // collect the merged ranges

From bbc9ca4222bf7a2e57c7ab4ec777ce6ad0e49542 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 15:04:37 +0000
Subject: [PATCH 038/151] STAR-121: Add index support to select statement
 restrictions

(cherry picked from commit abe04b3f031d01f543eb8f4a1f0c3665e710e96f)
(cherry picked from commit 0249d0559e3015cdbc232d05e4c88e22ee349c61)
---
 .../org/apache/cassandra/cql3/Operator.java   |   9 +
 .../cassandra/cql3/SingleColumnRelation.java  |  10 -
 .../ClusteringColumnRestrictions.java         | 173 +++--
 .../restrictions/CustomIndexExpression.java   |  12 +
 .../cql3/restrictions/IndexRestrictions.java  | 182 +++++-
 .../restrictions/MultiColumnRestriction.java  |  70 +-
 .../PartitionKeySingleRestrictionSet.java     | 109 +++-
 .../cql3/restrictions/Restriction.java        |  16 +-
 .../cql3/restrictions/RestrictionSet.java     | 614 ++++++++++--------
 .../restrictions/RestrictionSetWrapper.java   |  39 +-
 .../cql3/restrictions/Restrictions.java       |  21 +-
 .../restrictions/SingleColumnRestriction.java |  34 +-
 .../restrictions/StatementRestrictions.java   | 315 ++++++---
 .../cql3/restrictions/TokenFilter.java        | 149 +++--
 .../cql3/restrictions/TokenRestriction.java   |  15 +-
 .../cql3/statements/SelectStatement.java      |  14 +-
 .../apache/cassandra/db/MultiCBuilder.java    |  60 ++
 .../apache/cassandra/db/filter/RowFilter.java |  34 +-
 .../ClusteringColumnRestrictionsTest.java     | 518 +++++++++------
 .../entities/FrozenCollectionsTest.java       |   2 +-
 .../entities/SecondaryIndexTest.java          |  10 +-
 .../operations/CompactStorageTest.java        |  25 +-
 .../SelectMultiColumnRelationTest.java        |  16 +-
 .../SelectSingleColumnRelationTest.java       |  12 +-
 .../validation/operations/SelectTest.java     |  68 +-
 ...x support to select statement restrictions |  23 +
 26 files changed, 1687 insertions(+), 863 deletions(-)
 create mode 100644 update-history/STAR-801/65-0249d0559e STAR-121: Add index support to select statement restrictions

diff --git a/src/java/org/apache/cassandra/cql3/Operator.java b/src/java/org/apache/cassandra/cql3/Operator.java
index 1acedeeeab90..7a900b8e6c26 100644
--- a/src/java/org/apache/cassandra/cql3/Operator.java
+++ b/src/java/org/apache/cassandra/cql3/Operator.java
@@ -316,6 +316,15 @@ public int serializedSize()
         return 4;
     }
 
+    /**
+     * Checks if this operator is one of {@code LIKE_PREFIX}, {@code LIKE_CONTAINS}, {@code LIKE_SUFFIX} or {@code LIKE_MATCHES}.
+     * @return {@code true} if this operator is one of those four LIKE operators, {@code false} otherwise.
+     */
+    public boolean isLike()
+    {
+        return this == LIKE_PREFIX || this == LIKE_CONTAINS || this == LIKE_SUFFIX || this == LIKE_MATCHES;
+    }
+
     /**
      * Checks if this operator is a slice operator.
      * @return {@code true} if this operator is a slice operator, {@code false} otherwise.
diff --git a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
index 9ff3f075db53..cf1cb69066e6 100644
--- a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
+++ b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
@@ -274,16 +274,6 @@ private List<? extends ColumnSpecification> toReceivers(ColumnMetadata columnDef
     {
         ColumnSpecification receiver = columnDef;
 
-        if (isIN())
-        {
-            // We only allow IN on the row key and the clustering key so far, never on non-PK columns, and this even if
-            // there's an index
-            // Note: for backward compatibility reason, we conside a IN of 1 value the same as a EQ, so we let that
-            // slide.
-            checkFalse(!columnDef.isPrimaryKeyColumn() && !canHaveOnlyOneValue(),
-                       "IN predicates on non-primary-key columns (%s) is not yet supported", columnDef.name);
-        }
-
         checkFalse(isContainsKey() && !(receiver.type instanceof MapType), "Cannot use CONTAINS KEY on non-map column %s", receiver.name);
         checkFalse(isContains() && !(receiver.type.isCollection()), "Cannot use CONTAINS on non-collection column %s", receiver.name);
 
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
index 0a252ff557f0..d5d153e5e797 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
@@ -27,6 +27,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.btree.BTreeSet;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
@@ -42,71 +43,22 @@ final class ClusteringColumnRestrictions extends RestrictionSetWrapper
      */
     protected final ClusteringComparator comparator;
 
-    /**
-     * <code>true</code> if filtering is allowed for this restriction, <code>false</code> otherwise
-     */
-    private final boolean allowFiltering;
-
-    public ClusteringColumnRestrictions(TableMetadata table)
-    {
-        this(table, false);
-    }
-
-    public ClusteringColumnRestrictions(TableMetadata table, boolean allowFiltering)
-    {
-        this(table.comparator, new RestrictionSet(), allowFiltering);
-    }
-
     private ClusteringColumnRestrictions(ClusteringComparator comparator,
-                                         RestrictionSet restrictionSet,
-                                         boolean allowFiltering)
+                                         RestrictionSet restrictionSet)
     {
         super(restrictionSet);
         this.comparator = comparator;
-        this.allowFiltering = allowFiltering;
-    }
-
-    public ClusteringColumnRestrictions mergeWith(Restriction restriction) throws InvalidRequestException
-    {
-        SingleRestriction newRestriction = (SingleRestriction) restriction;
-        RestrictionSet newRestrictionSet = restrictions.addRestriction(newRestriction);
-
-        if (!isEmpty() && !allowFiltering)
-        {
-            SingleRestriction lastRestriction = restrictions.lastRestriction();
-            ColumnMetadata lastRestrictionStart = lastRestriction.getFirstColumn();
-            ColumnMetadata newRestrictionStart = restriction.getFirstColumn();
-
-            checkFalse(lastRestriction.isSlice() && newRestrictionStart.position() > lastRestrictionStart.position(),
-                       "Clustering column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
-                       newRestrictionStart.name,
-                       lastRestrictionStart.name);
-
-            if (newRestrictionStart.position() < lastRestrictionStart.position() && newRestriction.isSlice())
-                throw invalidRequest("PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
-                                     restrictions.nextColumn(newRestrictionStart).name,
-                                     newRestrictionStart.name);
-        }
-
-        return new ClusteringColumnRestrictions(this.comparator, newRestrictionSet, allowFiltering);
-    }
-
-    private boolean hasMultiColumnSlice()
-    {
-        for (SingleRestriction restriction : restrictions)
-        {
-            if (restriction.isMultiColumn() && restriction.isSlice())
-                return true;
-        }
-        return false;
     }
 
     public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options) throws InvalidRequestException
     {
         MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN());
-        for (SingleRestriction r : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
+            SingleRestriction r = restrictions.get(i);
             r.appendTo(builder, options);
+
             if (builder.hasMissingElements())
                 break;
         }
@@ -115,11 +67,14 @@ public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options) thro
 
     public NavigableSet<ClusteringBound<?>> boundsAsClustering(Bound bound, QueryOptions options) throws InvalidRequestException
     {
-        MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN() || hasMultiColumnSlice());
+        List<SingleRestriction> restrictionsList = restrictions();
+
+        MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN() || restrictions.hasMultiColumnSlice());
         int keyPosition = 0;
 
-        for (SingleRestriction r : restrictions)
+        for (int i = 0; i < restrictionsList.size(); i++)
         {
+            SingleRestriction r = restrictionsList.get(i);
             if (handleInFilter(r, keyPosition))
                 break;
 
@@ -144,50 +99,20 @@ public NavigableSet<ClusteringBound<?>> boundsAsClustering(Bound bound, QueryOpt
         return builder.buildBound(bound.isStart(), true);
     }
 
-    /**
-     * Checks if any of the underlying restriction is a CONTAINS or CONTAINS KEY.
-     *
-     * @return <code>true</code> if any of the underlying restriction is a CONTAINS or CONTAINS KEY,
-     * <code>false</code> otherwise
-     */
-    public final boolean hasContains()
-    {
-        for (SingleRestriction restriction : restrictions)
-        {
-            if (restriction.isContains())
-                return true;
-        }
-        return false;
-    }
-
-    /**
-     * Checks if any of the underlying restriction is a slice restrictions.
-     *
-     * @return <code>true</code> if any of the underlying restriction is a slice restrictions,
-     * <code>false</code> otherwise
-     */
-    public final boolean hasSlice()
-    {
-        for (SingleRestriction restriction : restrictions)
-        {
-            if (restriction.isSlice())
-                return true;
-        }
-        return false;
-    }
-
     /**
      * Checks if underlying restrictions would require filtering
      *
      * @return <code>true</code> if any underlying restrictions require filtering, <code>false</code>
      * otherwise
      */
-    public final boolean needFiltering()
+    public boolean needFiltering()
     {
         int position = 0;
 
-        for (SingleRestriction restriction : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
+            SingleRestriction restriction = restrictions.get(i);
             if (handleInFilter(restriction, position))
                 return true;
 
@@ -198,18 +123,20 @@ public final boolean needFiltering()
     }
 
     @Override
-    public void addRowFilterTo(RowFilter filter,
+    public void addToRowFilter(RowFilter filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options) throws InvalidRequestException
     {
         int position = 0;
 
-        for (SingleRestriction restriction : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
+            SingleRestriction restriction = restrictions.get(i);
             // We ignore all the clustering columns that can be handled by slices.
             if (handleInFilter(restriction, position) || restriction.hasSupportingIndex(indexRegistry))
             {
-                restriction.addRowFilterTo(filter, indexRegistry, options);
+                restriction.addToRowFilter(filter, indexRegistry, options);
                 continue;
             }
 
@@ -223,4 +150,64 @@ private boolean handleInFilter(SingleRestriction restriction, int index)
         return restriction.isContains() || restriction.isLIKE() || index != restriction.getFirstColumn().position();
     }
 
+    public static ClusteringColumnRestrictions.Builder builder(TableMetadata table, boolean allowFiltering)
+    {
+        return new Builder(table, allowFiltering, null); // null registry: Builder.addRestriction never calls hasSupportingIndex
+    }
+
+    public static ClusteringColumnRestrictions.Builder builder(TableMetadata table, boolean allowFiltering, IndexRegistry indexRegistry)
+    {
+        return new Builder(table, allowFiltering, indexRegistry); // registry is consulted by Builder.addRestriction via hasSupportingIndex
+    }
+
+    public static class Builder // accumulates clustering column restrictions; build() wraps them with table.comparator
+    {
+        private final TableMetadata table;
+        private final boolean allowFiltering;
+        private final IndexRegistry indexRegistry; // null when created through the two-argument builder(table, allowFiltering)
+
+        private final RestrictionSet.Builder restrictions = RestrictionSet.builder();
+
+        private Builder(TableMetadata table, boolean allowFiltering, IndexRegistry indexRegistry)
+        {
+            this.table = table;
+            this.allowFiltering = allowFiltering;
+            this.indexRegistry = indexRegistry;
+        }
+
+        public ClusteringColumnRestrictions.Builder addRestriction(Restriction restriction)
+        {
+            SingleRestriction newRestriction = (SingleRestriction) restriction;
+            boolean isEmpty = restrictions.isEmpty();
+            // Ordering is validated only if something was added before, filtering is disallowed and no index supports the new restriction.
+            if (!isEmpty && !allowFiltering && (indexRegistry == null || !newRestriction.hasSupportingIndex(indexRegistry)))
+            {
+                SingleRestriction lastRestriction = restrictions.lastRestriction();
+                ColumnMetadata lastRestrictionStart = lastRestriction.getFirstColumn();
+                ColumnMetadata newRestrictionStart = newRestriction.getFirstColumn();
+                restrictions.addRestriction(newRestriction); // added before the checks; nextColumn(...) below queries the updated builder
+                // A restriction starting on a later column is not accepted when the previous restriction is a slice (checkFalse presumably throws).
+                checkFalse(lastRestriction.isSlice() && newRestrictionStart.position() > lastRestrictionStart.position(),
+                           "Clustering column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
+                           newRestrictionStart.name,
+                           lastRestrictionStart.name);
+                // A slice starting on an earlier column than the previous restriction's first column is rejected.
+                if (newRestrictionStart.position() < lastRestrictionStart.position() && newRestriction.isSlice())
+                    throw invalidRequest("PRIMARY KEY column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
+                                         restrictions.nextColumn(newRestrictionStart).name,
+                                         newRestrictionStart.name);
+            }
+            else
+            {
+                restrictions.addRestriction(newRestriction);
+            }
+
+            return this;
+        }
+
+        public ClusteringColumnRestrictions build()
+        {
+            return new ClusteringColumnRestrictions(table.comparator, restrictions.build());
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
index 7a5fff62b4c4..ff6a2a6e25af 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
@@ -22,6 +22,7 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.TableMetadata;
 
 public class CustomIndexExpression
@@ -60,6 +61,17 @@ public String toCQLString()
         return String.format("expr(%s,%s)", targetIndex.toCQLString(), valueRaw.getText());
     }
 
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        String indexName = targetIndex.getName();
+
+        for (Index index : indexGroup.getIndexes())
+            if (index.getIndexMetadata().name.equals(indexName))
+                return false;
+
+        return true;
+    }
+
     @Override
     public String toString()
     {
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
index fd89d1b47000..10608b85c1c2 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
@@ -19,48 +19,136 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.cassandra.cql3.QualifiedName;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
 
-public class IndexRestrictions
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+
+public final class IndexRestrictions
 {
+    /**
+     * The empty {@code IndexRestrictions}.
+     */
+    private static final IndexRestrictions EMPTY_RESTRICTIONS = new IndexRestrictions(Collections.EMPTY_LIST, Collections.EMPTY_LIST);
+
     public static final String INDEX_NOT_FOUND = "Invalid index expression, index %s not found for %s";
     public static final String INVALID_INDEX = "Target index %s cannot be used to query %s";
     public static final String CUSTOM_EXPRESSION_NOT_SUPPORTED = "Index %s does not support custom expressions";
     public static final String NON_CUSTOM_INDEX_IN_EXPRESSION = "Only CUSTOM indexes may be used in custom index expressions, %s is not valid";
     public static final String MULTIPLE_EXPRESSIONS = "Multiple custom index expressions in a single query are not supported";
 
-    private final List<Restrictions> regularRestrictions = new ArrayList<>();
-    private final List<CustomIndexExpression> customExpressions = new ArrayList<>();
+    private final List<Restrictions> regularRestrictions;
+    private final List<CustomIndexExpression> externalRestrictions;
 
-    public void add(Restrictions restrictions)
+    private IndexRestrictions(List<Restrictions> regularRestrictions, List<CustomIndexExpression> externalExpressions)
     {
-        regularRestrictions.add(restrictions);
+        this.regularRestrictions = regularRestrictions;
+        this.externalRestrictions = externalExpressions;
     }
 
-    public void add(CustomIndexExpression expression)
+    /**
+     * Returns an empty {@code IndexRestrictions}.
+     * @return an empty {@code IndexRestrictions}
+     */
+    public static IndexRestrictions of()
     {
-        customExpressions.add(expression);
+        return EMPTY_RESTRICTIONS;
+    }
+
+    /**
+     * Creates a new {@code IndexRestrictions.Builder} instance.
+     * @return a new {@code IndexRestrictions.Builder} instance.
+     */
+    public static Builder builder()
+    {
+        return new IndexRestrictions.Builder();
     }
 
     public boolean isEmpty()
     {
-        return regularRestrictions.isEmpty() && customExpressions.isEmpty();
+        return regularRestrictions.isEmpty() && externalRestrictions.isEmpty();
     }
 
+    /**
+     * Returns the regular restrictions.
+     * @return the regular restrictions
+     */
     public List<Restrictions> getRestrictions()
     {
         return regularRestrictions;
     }
 
-    public List<CustomIndexExpression> getCustomIndexExpressions()
+    /**
+     * Returns the external restrictions.
+     * @return the external restrictions
+     */
+    public List<CustomIndexExpression> getExternalExpressions()
+    {
+        return externalRestrictions;
+    }
+
+    /**
+     * Returns the total number of restrictions, counting the external expressions and the regular restrictions.
+     * @return the number of external expressions plus the sum of the sizes of the regular restrictions
+     */
+    private int numOfSupportedRestrictions()
     {
-        return customExpressions;
+        int numberOfRestrictions = getExternalExpressions().size();
+        for (Restrictions restrictions : getRestrictions())
+            numberOfRestrictions += restrictions.size();
+
+        return numberOfRestrictions;
+    }
+
+    /**
+     * Returns whether these restrictions would need filtering if the specified index registry were used.
+     *
+     * @param indexRegistry an index registry
+     * @param hasClusteringColumnRestrictions {@code true} if there are restricted clustering columns
+     * @param hasMultipleContains {@code true} if there are multiple "contains" restrictions
+     * @return {@code true} if this would need filtering if {@code indexRegistry} were used, {@code false} otherwise
+     */
+    public boolean needFiltering(IndexRegistry indexRegistry, boolean hasClusteringColumnRestrictions, boolean hasMultipleContains)
+    {
+        // With no external expressions or regular restrictions at all, filtering is needed only if
+        // clustering columns are restricted; otherwise at least one index group must need no filtering.
+        if (numOfSupportedRestrictions() == 0)
+            return hasClusteringColumnRestrictions;
+
+        for (Index.Group group : indexRegistry.listIndexGroups())
+            if (!needFiltering(group, hasMultipleContains))
+                return false;
+
+        return true;
+    }
+
+    /**
+     * Returns whether these restrictions would need filtering if the specified index group were used.
+     *
+     * @param indexGroup an index group
+     * @param hasMultipleContains {@code true} if there are multiple "contains" restrictions
+     * @return {@code true} if this would need filtering if {@code indexGroup} were used, {@code false} otherwise
+     */
+    private boolean needFiltering(Index.Group indexGroup, boolean hasMultipleContains)
+    {
+        if (hasMultipleContains && !indexGroup.supportsMultipleContains())
+            return true;
+
+        for (Restrictions restrictions : regularRestrictions)
+            if (restrictions.needsFiltering(indexGroup))
+                return true;
+
+        for (CustomIndexExpression restriction : externalRestrictions)
+            if (restriction.needsFiltering(indexGroup))
+                return true;
+
+        return false;
     }
 
     static InvalidRequestException invalidIndex(QualifiedName indexName, TableMetadata table)
@@ -75,17 +163,75 @@ static InvalidRequestException indexNotFound(QualifiedName indexName, TableMetad
 
     static InvalidRequestException nonCustomIndexInExpression(QualifiedName indexName)
     {
-        return new InvalidRequestException(String.format(NON_CUSTOM_INDEX_IN_EXPRESSION, indexName.getName()));
+        return invalidRequest(NON_CUSTOM_INDEX_IN_EXPRESSION, indexName.getName());
     }
 
     static InvalidRequestException customExpressionNotSupported(QualifiedName indexName)
     {
-        return new InvalidRequestException(String.format(CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName.getName()));
+        return invalidRequest(CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName.getName());
     }
-    
-    @Override
-    public String toString()
+
+    /**
+     * Builder for IndexRestrictions.
+     */
+    public static final class Builder
     {
-        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
+        /**
+         * The regular restrictions added so far.
+         */
+        private List<Restrictions> regularRestrictions = new ArrayList<>();
+
+        /**
+         * The custom index expressions added so far.
+         */
+        private List<CustomIndexExpression> externalRestrictions = new ArrayList<>();
+
+        private Builder() {}
+
+        /**
+         * Adds the specified restrictions.
+         *
+         * @param restrictions the restrictions to add
+         * @return this {@code Builder}
+         */
+        public Builder add(Restrictions restrictions)
+        {
+            regularRestrictions.add(restrictions);
+            return this;
+        }
+
+        /**
+         * Adds the restrictions and custom expressions from the specified {@code IndexRestrictions}.
+         *
+         * @param restrictions the restrictions and custom expressions to add
+         * @return this {@code Builder}
+         */
+        public Builder add(IndexRestrictions restrictions)
+        {
+            regularRestrictions.addAll(restrictions.regularRestrictions);
+            externalRestrictions.addAll(restrictions.externalRestrictions);
+            return this;
+        }
+
+        /**
+         * Adds the specified index expression.
+         *
+         * @param restriction the index expression to add
+         * @return this {@code Builder}
+         */
+        public Builder add(CustomIndexExpression restriction)
+        {
+            externalRestrictions.add(restriction);
+            return this;
+        }
+
+        /**
+         * Builds a new {@code IndexRestrictions} instance backed by unmodifiable views of this builder's lists.
+         * @return a new {@code IndexRestrictions} instance
+         */
+        public IndexRestrictions build()
+        {
+            return new IndexRestrictions(Collections.unmodifiableList(regularRestrictions), Collections.unmodifiableList(externalRestrictions));
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
index 4c6ce2f80edd..3a771333ac6f 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
@@ -114,8 +114,27 @@ protected final String getColumnsInCommons(Restriction otherRestriction)
     public final boolean hasSupportingIndex(IndexRegistry indexRegistry)
     {
         for (Index index : indexRegistry.listIndexes())
-           if (isSupportedBy(index))
-               return true;
+            if (isSupportingIndex(index))
+                return true;
+        return false;
+    }
+
+    @Override
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        for (ColumnMetadata column : columnDefs)
+            if (!isSupportedBy(indexGroup, column))
+                return true;
+
+        return false;
+    }
+
+    private boolean isSupportedBy(Index.Group indexGroup, ColumnMetadata column)
+    {
+        for (Index index : indexGroup.getIndexes())
+            if (isSupportedBy(index, column))
+                return true;
+
         return false;
     }
 
@@ -126,7 +145,16 @@ public final boolean hasSupportingIndex(IndexRegistry indexRegistry)
      * @return <code>true</code> this type of restriction is supported by the specified index,
      * <code>false</code> otherwise.
      */
-    protected abstract boolean isSupportedBy(Index index);
+    private boolean isSupportingIndex(Index index)
+    {
+        for (ColumnMetadata column : columnDefs)
+            if (isSupportedBy(index, column))
+                return true;
+
+        return false;
+    }
+
+    protected abstract boolean isSupportedBy(Index index, ColumnMetadata def);
 
     public static class EQRestriction extends MultiColumnRestriction
     {
@@ -164,12 +192,9 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        protected boolean isSupportedBy(Index index)
+        protected boolean isSupportedBy(Index index, ColumnMetadata column)
         {
-            for(ColumnMetadata column : columnDefs)
-                if (index.supportsExpression(column, Operator.EQ))
-                    return true;
-            return false;
+            return index.supportsExpression(column, Operator.EQ);
         }
 
         @Override
@@ -186,7 +211,7 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public final void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public final void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             Tuples.Value t = ((Tuples.Value) value.bind(options));
             List<ByteBuffer> values = t.getElements();
@@ -234,16 +259,13 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        protected boolean isSupportedBy(Index index)
+        protected boolean isSupportedBy(Index index, ColumnMetadata column)
         {
-            for (ColumnMetadata column: columnDefs)
-                if (index.supportsExpression(column, Operator.IN))
-                    return true;
-            return false;
+            return index.supportsExpression(column, Operator.IN);
         }
 
         @Override
-        public final void addRowFilterTo(RowFilter filter,
+        public final void addToRowFilter(RowFilter filter,
                                          IndexRegistry indexRegistry,
                                          QueryOptions options)
         {
@@ -416,12 +438,9 @@ public MultiCBuilder appendBoundTo(MultiCBuilder builder, Bound bound, QueryOpti
         }
 
         @Override
-        protected boolean isSupportedBy(Index index)
+        protected boolean isSupportedBy(Index index, ColumnMetadata column)
         {
-            for(ColumnMetadata def : columnDefs)
-                if (slice.isSupportedBy(def, index))
-                    return true;
-            return false;
+            return slice.isSupportedBy(column, index);
         }
 
         @Override
@@ -472,7 +491,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public final void addRowFilterTo(RowFilter filter,
+        public final void addToRowFilter(RowFilter filter,
                                          IndexRegistry indexRegistry,
                                          QueryOptions options)
         {
@@ -546,12 +565,9 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        protected boolean isSupportedBy(Index index)
+        protected boolean isSupportedBy(Index index, ColumnMetadata column)
         {
-            for(ColumnMetadata column : columnDefs)
-                if (index.supportsExpression(column, Operator.IS_NOT))
-                    return true;
-            return false;
+            return index.supportsExpression(column, Operator.IS_NOT);
         }
 
         @Override
@@ -561,7 +577,7 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public final void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public final void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             throw new UnsupportedOperationException("Secondary indexes do not support IS NOT NULL restrictions");
         }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
index fbe5673c05ba..2ced74127024 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
@@ -24,9 +24,9 @@
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
 import org.apache.cassandra.db.ClusteringComparator;
-import org.apache.cassandra.db.ClusteringPrefix;
 import org.apache.cassandra.db.MultiCBuilder;
 import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.index.IndexRegistry;
 
 /**
@@ -42,27 +42,12 @@ final class PartitionKeySingleRestrictionSet extends RestrictionSetWrapper imple
      */
     protected final ClusteringComparator comparator;
 
-    public PartitionKeySingleRestrictionSet(ClusteringComparator comparator)
+    private PartitionKeySingleRestrictionSet(RestrictionSet restrictionSet, ClusteringComparator comparator)
     {
-        super(new RestrictionSet());
+        super(restrictionSet);
         this.comparator = comparator;
     }
 
-    private PartitionKeySingleRestrictionSet(PartitionKeySingleRestrictionSet restrictionSet,
-                                       SingleRestriction restriction)
-    {
-        super(restrictionSet.restrictions.addRestriction(restriction));
-        this.comparator = restrictionSet.comparator;
-    }
-
-    private List<ByteBuffer> toByteBuffers(SortedSet<? extends ClusteringPrefix> clusterings)
-    {
-        List<ByteBuffer> l = new ArrayList<>(clusterings.size());
-        for (ClusteringPrefix clustering : clusterings)
-            l.add(clustering.serializeAsPartitionKey());
-        return l;
-    }
-
     @Override
     public PartitionKeyRestrictions mergeWith(Restriction restriction)
     {
@@ -71,36 +56,49 @@ public PartitionKeyRestrictions mergeWith(Restriction restriction)
             if (isEmpty())
                 return (PartitionKeyRestrictions) restriction;
 
-            return new TokenFilter(this, (TokenRestriction) restriction);
+            return TokenFilter.create(this, (TokenRestriction) restriction);
         }
 
-        return new PartitionKeySingleRestrictionSet(this, (SingleRestriction) restriction);
+        Builder builder = PartitionKeySingleRestrictionSet.builder(comparator);
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
+        {
+            SingleRestriction r = restrictions.get(i);
+            builder.addRestriction(r);
+        }
+        return builder.addRestriction(restriction)
+                      .build();
     }
 
     @Override
     public List<ByteBuffer> values(QueryOptions options)
     {
         MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN());
-        for (SingleRestriction r : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
+            SingleRestriction r = restrictions.get(i);
             r.appendTo(builder, options);
+
             if (builder.hasMissingElements())
                 break;
         }
-        return toByteBuffers(builder.build());
+        return builder.buildSerializedPartitionKeys();
     }
 
     @Override
     public List<ByteBuffer> bounds(Bound bound, QueryOptions options)
     {
         MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN());
-        for (SingleRestriction r : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
+            SingleRestriction r = restrictions.get(i);
             r.appendBoundTo(builder, bound, options);
             if (builder.hasMissingElements())
-                return Collections.emptyList();
+                return Collections.EMPTY_LIST;
         }
-        return toByteBuffers(builder.buildBound(bound.isStart(), true));
+        return builder.buildSerializedPartitionKeys();
     }
 
     @Override
@@ -120,13 +118,15 @@ public boolean isInclusive(Bound b)
     }
 
     @Override
-    public void addRowFilterTo(RowFilter filter,
+    public void addToRowFilter(RowFilter filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options)
     {
-        for (SingleRestriction restriction : restrictions)
+        List<SingleRestriction> restrictions = restrictions();
+        for (int i = 0; i < restrictions.size(); i++)
         {
-             restriction.addRowFilterTo(filter, indexRegistry, options);
+            SingleRestriction r = restrictions.get(i);
+            r.addToRowFilter(filter, indexRegistry, options);
         }
     }
 
@@ -146,9 +146,56 @@ public boolean hasUnrestrictedPartitionKeyComponents(TableMetadata table)
         return size() < table.partitionKeyColumns().size();
     }
 
-    @Override
-    public boolean hasSlice()
+    public static Builder builder(ClusteringComparator clusteringComparator)
+    {
+        return new Builder(clusteringComparator);
+    }
+
+    public static final class Builder
     {
-        return restrictions.hasSlice();
+        private final ClusteringComparator clusteringComparator;
+
+        private final List<Restriction> restrictions = new ArrayList<>();
+
+        private Builder(ClusteringComparator clusteringComparator) {
+            this.clusteringComparator = clusteringComparator;
+        }
+
+        public Builder addRestriction(Restriction restriction) {
+            restrictions.add(restriction);
+            return this;
+        }
+
+        public PartitionKeyRestrictions build() {
+            RestrictionSet.Builder restrictionSet = RestrictionSet.builder();
+
+            for (int i = 0; i < restrictions.size(); i++) {
+                Restriction restriction = restrictions.get(i);
+
+                // restrictions on tokens are handled in a special way
+                if (restriction.isOnToken())
+                    return buildWithTokens(restrictionSet, i);
+
+                restrictionSet.addRestriction((SingleRestriction) restriction);
+            }
+
+            return buildPartitionKeyRestrictions(restrictionSet);
+        }
+
+        private PartitionKeyRestrictions buildWithTokens(RestrictionSet.Builder restrictionSet, int i) {
+            PartitionKeyRestrictions merged = buildPartitionKeyRestrictions(restrictionSet);
+
+            for (; i < restrictions.size(); i++) {
+                Restriction restriction = restrictions.get(i);
+
+                merged = merged.mergeWith(restriction);
+            }
+
+            return merged;
+        }
+
+        private PartitionKeySingleRestrictionSet buildPartitionKeyRestrictions(RestrictionSet.Builder restrictionSet) {
+            return new PartitionKeySingleRestrictionSet(restrictionSet.build(), clusteringComparator);
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
index 91dedad9536b..f523d45096e9 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
@@ -18,7 +18,9 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.util.List;
+import java.util.function.Consumer;
 
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
@@ -30,6 +32,10 @@
  */
 public interface Restriction
 {
+    /**
+     * Checks if this restriction is on the token of the partition key.
+     * @return <code>true</code> if this restriction is on a token, <code>false</code> otherwise
+     */
     public default boolean isOnToken()
     {
         return false;
@@ -68,6 +74,14 @@ public default boolean isOnToken()
      */
     public boolean hasSupportingIndex(IndexRegistry indexRegistry);
 
+    /**
+     * Returns whether this restriction would need filtering if the specified index group were used.
+     *
+     * @param indexGroup an index group
+     * @return {@code true} if this would need filtering if {@code indexGroup} were used, {@code false} otherwise
+     */
+    public boolean needsFiltering(Index.Group indexGroup);
+
     /**
      * Adds to the specified row filter the expressions corresponding to this <code>Restriction</code>.
      *
@@ -75,7 +89,7 @@ public default boolean isOnToken()
      * @param indexRegistry the index registry
      * @param options the query options
      */
-    public void addRowFilterTo(RowFilter filter,
+    public void addToRowFilter(RowFilter filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options);
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
index 7a5d5b964b0a..8548ea2b8b9d 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
@@ -18,358 +18,446 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.util.*;
+import java.util.function.Consumer;
 
-import com.google.common.collect.AbstractIterator;
-
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.restrictions.SingleColumnRestriction.ContainsRestriction;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.IndexRegistry;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
+import org.apache.cassandra.schema.ColumnMetadata;
 
 /**
  * Sets of column restrictions.
  *
  * <p>This class is immutable.</p>
  */
-final class RestrictionSet implements Restrictions, Iterable<SingleRestriction>
+public abstract class RestrictionSet implements Restrictions
 {
     /**
      * The comparator used to sort the <code>Restriction</code>s.
      */
-    private static final Comparator<ColumnMetadata> COLUMN_DEFINITION_COMPARATOR = new Comparator<ColumnMetadata>()
+    private static final Comparator<ColumnMetadata> COLUMN_DEFINITION_COMPARATOR = Comparator.comparingInt(ColumnMetadata::position).thenComparing(column -> column.name.bytes);
+
+    private static final class EmptyRestrictionSet extends RestrictionSet
     {
+        private static final EmptyRestrictionSet INSTANCE = new EmptyRestrictionSet();
+
+        private EmptyRestrictionSet()
+        {
+        }
+
         @Override
-        public int compare(ColumnMetadata column, ColumnMetadata otherColumn)
+        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options) throws InvalidRequestException
         {
-            int value = Integer.compare(column.position(), otherColumn.position());
-            return value != 0 ? value : column.name.bytes.compareTo(otherColumn.name.bytes);
         }
-    };
 
-    private static final TreeMap<ColumnMetadata, SingleRestriction> EMPTY = new TreeMap<>(COLUMN_DEFINITION_COMPARATOR);
+        @Override
+        public List<ColumnMetadata> getColumnDefs()
+        {
+            return Collections.EMPTY_LIST;
+        }
 
-    /**
-     * The restrictions per column.
-     */
-    protected final TreeMap<ColumnMetadata, SingleRestriction> restrictions;
+        @Override
+        public void addFunctionsTo(List<Function> functions)
+        {
+        }
 
-    /**
-     * {@code true} if it contains multi-column restrictions, {@code false} otherwise.
-     */
-    private final boolean hasMultiColumnRestrictions;
+        @Override
+        public boolean isEmpty()
+        {
+            return true;
+        }
 
-    private final boolean hasIn;
-    private final boolean hasContains;
-    private final boolean hasSlice;
-    private final boolean hasOnlyEqualityRestrictions;
+        @Override
+        public int size()
+        {
+            return 0;
+        }
 
-    public RestrictionSet()
-    {
-        this(EMPTY, false,
-             false,
-             false,
-             false,
-             true);
-    }
+        @Override
+        public boolean hasRestrictionFor(ColumnMetadata.Kind kind)
+        {
+            return false;
+        }
 
-    private RestrictionSet(TreeMap<ColumnMetadata, SingleRestriction> restrictions,
-                           boolean hasMultiColumnRestrictions,
-                           boolean hasIn,
-                           boolean hasContains,
-                           boolean hasSlice,
-                           boolean hasOnlyEqualityRestrictions)
-    {
-        this.restrictions = restrictions;
-        this.hasMultiColumnRestrictions = hasMultiColumnRestrictions;
-        this.hasIn = hasIn;
-        this.hasContains = hasContains;
-        this.hasSlice = hasSlice;
-        this.hasOnlyEqualityRestrictions = hasOnlyEqualityRestrictions;
-    }
+        @Override
+        public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
+        {
+            return Collections.emptySet();
+        }
 
-    @Override
-    public void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options) throws InvalidRequestException
-    {
-        for (Restriction restriction : restrictions.values())
-            restriction.addRowFilterTo(filter, indexRegistry, options);
-    }
+        @Override
+        public boolean hasSupportingIndex(IndexRegistry indexRegistry)
+        {
+            return false;
+        }
 
-    @Override
-    public List<ColumnMetadata> getColumnDefs()
-    {
-        return new ArrayList<>(restrictions.keySet());
-    }
+        @Override
+        public boolean needsFiltering(Index.Group indexGroup)
+        {
+            return false;
+        }
 
-    @Override
-    public void addFunctionsTo(List<Function> functions)
-    {
-        for (Restriction restriction : this)
-            restriction.addFunctionsTo(functions);
-    }
+        @Override
+        public ColumnMetadata getFirstColumn()
+        {
+            return null;
+        }
 
-    @Override
-    public boolean isEmpty()
-    {
-        return restrictions.isEmpty();
-    }
+        @Override
+        public ColumnMetadata getLastColumn()
+        {
+            return null;
+        }
 
-    @Override
-    public int size()
-    {
-        return restrictions.size();
-    }
+        @Override
+        public SingleRestriction lastRestriction()
+        {
+            return null;
+        }
 
-    /**
-     * Checks if one of the restrictions applies to a column of the specific kind.
-     * @param kind the column kind
-     * @return {@code true} if one of the restrictions applies to a column of the specific kind, {@code false} otherwise.
-     */
-    public boolean hasRestrictionFor(ColumnMetadata.Kind kind)
-    {
-        for (ColumnMetadata column : restrictions.keySet())
+        @Override
+        public boolean hasMultipleContains()
         {
-            if (column.kind == kind)
-                return true;
+            return false;
         }
-        return false;
-    }
 
-    /**
-     * Adds the specified restriction to this set of restrictions.
-     *
-     * @param restriction the restriction to add
-     * @return the new set of restrictions
-     */
-    public RestrictionSet addRestriction(SingleRestriction restriction)
-    {
-        // RestrictionSet is immutable so we need to clone the restrictions map.
-        TreeMap<ColumnMetadata, SingleRestriction> newRestricitons = new TreeMap<>(this.restrictions);
-
-        boolean newHasIn = hasIn || restriction.isIN();
-        boolean newHasContains = hasContains || restriction.isContains();
-        boolean newHasSlice = hasSlice || restriction.isSlice();
-        boolean newHasOnlyEqualityRestrictions = hasOnlyEqualityRestrictions && (restriction.isEQ() || restriction.isIN());
-
-        return new RestrictionSet(mergeRestrictions(newRestricitons, restriction),
-                                  hasMultiColumnRestrictions || restriction.isMultiColumn(),
-                                  newHasIn,
-                                  newHasContains,
-                                  newHasSlice,
-                                  newHasOnlyEqualityRestrictions);
+        @Override
+        public List<SingleRestriction> restrictions()
+        {
+            return Collections.EMPTY_LIST;
+        }
+
+        @Override
+        public boolean hasMultiColumnSlice()
+        {
+            return false;
+        }
     }
 
-    private TreeMap<ColumnMetadata, SingleRestriction> mergeRestrictions(TreeMap<ColumnMetadata, SingleRestriction> restrictions,
-                                                                         SingleRestriction restriction)
+    private static final class DefaultRestrictionSet extends RestrictionSet
     {
-        Collection<ColumnMetadata> columnDefs = restriction.getColumnDefs();
-        Set<SingleRestriction> existingRestrictions = getRestrictions(columnDefs);
 
-        if (existingRestrictions.isEmpty())
-        {
-            for (ColumnMetadata columnDef : columnDefs)
-                restrictions.put(columnDef, restriction);
-        }
-        else
+        /**
+         * The keys of the 'restrictions' constructor parameter, sorted with COLUMN_DEFINITION_COMPARATOR.
+         */
+        private final List<ColumnMetadata> restrictionsKeys;
+        /**
+         * The values as returned from {@link #restrictions()}.
+         */
+        private final List<SingleRestriction> restrictionsValues;
+        private final Map<ColumnMetadata, SingleRestriction> restrictionsHashMap;
+        private final int hasBitmap;
+        private final int restrictionForKindBitmap;
+        private static final int maskHasContains = 1;
+        private static final int maskHasSlice = 2;
+        private static final int maskHasIN = 4;
+        private static final int maskHasOnlyEqualityRestrictions = 8;
+        private static final int maskHasMultiColumnSlice = 16;
+        private static final int maskHasMultipleContains = 32;
+
+        private DefaultRestrictionSet(Map<ColumnMetadata, SingleRestriction> restrictions,
+                                      boolean hasMultiColumnRestrictions)
         {
-            for (SingleRestriction existing : existingRestrictions)
+            this.restrictionsKeys = new ArrayList<>(restrictions.keySet());
+            restrictionsKeys.sort(COLUMN_DEFINITION_COMPARATOR);
+
+            List<SingleRestriction> sortedRestrictions = new ArrayList<>();
+
+            int numberOfContains = 0;
+            int restrictionForBitmap = 0;
+            int bitmap = maskHasOnlyEqualityRestrictions;
+
+            SingleRestriction previous = null;
+            for (int i = 0; i < restrictionsKeys.size(); i++)
             {
-                SingleRestriction newRestriction = mergeRestrictions(existing, restriction);
+                ColumnMetadata col = restrictionsKeys.get(i);
+                SingleRestriction singleRestriction = restrictions.get(col);
+
+                if (singleRestriction.isContains())
+                {
+                    bitmap |= maskHasContains;
+                    ContainsRestriction contains = (ContainsRestriction) singleRestriction;
+                    numberOfContains += (contains.numberOfValues() + contains.numberOfKeys() + contains.numberOfEntries());
+                }
+
+                if (hasMultiColumnRestrictions)
+                {
+                    if (singleRestriction.equals(previous))
+                        continue;
+                    previous = singleRestriction;
+                }
+
+                restrictionForBitmap |= 1 << col.kind.ordinal();
 
-                for (ColumnMetadata columnDef : columnDefs)
-                    restrictions.put(columnDef, newRestriction);
+                sortedRestrictions.add(singleRestriction);
+
+                if (singleRestriction.isSlice())
+                {
+                    bitmap |= maskHasSlice;
+                    if (singleRestriction.isMultiColumn())
+                        bitmap |= maskHasMultiColumnSlice;
+                }
+
+                if (singleRestriction.isIN())
+                    bitmap |= maskHasIN;
+                else if (!singleRestriction.isEQ())
+                    bitmap &= ~maskHasOnlyEqualityRestrictions;
             }
+            this.hasBitmap = bitmap | (numberOfContains > 1 ? maskHasMultipleContains : 0);
+            this.restrictionForKindBitmap = restrictionForBitmap;
+
+            this.restrictionsValues = Collections.unmodifiableList(sortedRestrictions);
+            this.restrictionsHashMap = restrictions;
         }
 
-        return restrictions;
-    }
+        @Override
+        public void addToRowFilter(RowFilter filter,
+                                   IndexRegistry indexRegistry,
+                                   QueryOptions options) throws InvalidRequestException
+        {
+            for (SingleRestriction restriction : restrictionsHashMap.values())
+                restriction.addToRowFilter(filter, indexRegistry, options);
+        }
 
-    @Override
-    public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
-    {
-        Restriction existing = restrictions.get(columnDef);
-        return existing == null ? Collections.emptySet() : Collections.singleton(existing);
-    }
+        @Override
+        public List<ColumnMetadata> getColumnDefs()
+        {
+            return restrictionsKeys;
+        }
 
-    /**
-     * Returns all the restrictions applied to the specified columns.
-     *
-     * @param columnDefs the column definitions
-     * @return all the restrictions applied to the specified columns
-     */
-    private Set<SingleRestriction> getRestrictions(Collection<ColumnMetadata> columnDefs)
-    {
-        Set<SingleRestriction> set = new HashSet<>();
-        for (ColumnMetadata columnDef : columnDefs)
+        @Override
+        public void addFunctionsTo(List<Function> functions)
         {
-            SingleRestriction existing = restrictions.get(columnDef);
-            if (existing != null)
-                set.add(existing);
+            for (int i = 0; i < restrictionsValues.size(); i++)
+                restrictionsValues.get(i).addFunctionsTo(functions);
         }
-        return set;
-    }
 
-    @Override
-    public final boolean hasSupportingIndex(IndexRegistry indexRegistry)
-    {
-        for (Restriction restriction : restrictions.values())
+        @Override
+        public boolean isEmpty()
         {
-            if (restriction.hasSupportingIndex(indexRegistry))
-                return true;
+            return false;
         }
-        return false;
-    }
 
-    /**
-     * Returns the column after the specified one.
-     *
-     * @param columnDef the column for which the next one need to be found
-     * @return the column after the specified one.
-     */
-    ColumnMetadata nextColumn(ColumnMetadata columnDef)
-    {
-        return restrictions.tailMap(columnDef, false).firstKey();
-    }
+        @Override
+        public int size()
+        {
+            return restrictionsKeys.size();
+        }
 
-    @Override
-    public ColumnMetadata getFirstColumn()
-    {
-        return isEmpty() ? null : this.restrictions.firstKey();
-    }
+        @Override
+        public boolean hasRestrictionFor(ColumnMetadata.Kind kind)
+        {
+            return 0 != (restrictionForKindBitmap & 1 << kind.ordinal());
+        }
 
-    @Override
-    public ColumnMetadata getLastColumn()
-    {
-        return isEmpty() ? null : this.restrictions.lastKey();
+        @Override
+        public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
+        {
+            Restriction existing = restrictionsHashMap.get(columnDef);
+            return existing == null ? Collections.emptySet() : Collections.singleton(existing);
+        }
+
+        @Override
+        public boolean hasSupportingIndex(IndexRegistry indexRegistry)
+        {
+            for (SingleRestriction restriction : restrictionsHashMap.values())
+                if (restriction.hasSupportingIndex(indexRegistry))
+                    return true;
+            return false;
+        }
+
+        @Override
+        public boolean needsFiltering(Index.Group indexGroup)
+        {
+            for (SingleRestriction restriction : restrictionsHashMap.values())
+                if (restriction.needsFiltering(indexGroup))
+                    return true;
+
+            return false;
+        }
+
+        @Override
+        public ColumnMetadata getFirstColumn()
+        {
+            return this.restrictionsKeys.get(0);
+        }
+
+        @Override
+        public ColumnMetadata getLastColumn()
+        {
+            return this.restrictionsKeys.get(this.restrictionsKeys.size() - 1);
+        }
+
+        @Override
+        public SingleRestriction lastRestriction()
+        {
+            return this.restrictionsValues.get(this.restrictionsValues.size() - 1);
+        }
+
+        @Override
+        public boolean hasMultipleContains()
+        {
+            return 0 != (hasBitmap & maskHasMultipleContains);
+        }
+
+        @Override
+        public List<SingleRestriction> restrictions()
+        {
+            return restrictionsValues;
+        }
+
+        @Override
+        public boolean hasIN()
+        {
+            return 0 != (hasBitmap & maskHasIN);
+        }
+
+        @Override
+        public boolean hasContains()
+        {
+            return 0 != (hasBitmap & maskHasContains);
+        }
+
+        @Override
+        public boolean hasSlice()
+        {
+            return 0 != (hasBitmap & maskHasSlice);
+        }
+
+        @Override
+        public boolean hasMultiColumnSlice()
+        {
+            return 0 != (hasBitmap & maskHasMultiColumnSlice);
+        }
+
+        @Override
+        public boolean hasOnlyEqualityRestrictions()
+        {
+            return 0 != (hasBitmap & maskHasOnlyEqualityRestrictions);
+        }
     }
 
     /**
-     * Returns the last restriction.
-     *
-     * @return the last restriction.
+     * Checks if one of the restrictions applies to a column of the specific kind.
+     * @param kind the column kind
+     * @return {@code true} if one of the restrictions applies to a column of the specific kind, {@code false} otherwise.
      */
-    SingleRestriction lastRestriction()
-    {
-        return isEmpty() ? null : this.restrictions.lastEntry().getValue();
-    }
+    public abstract boolean hasRestrictionFor(ColumnMetadata.Kind kind);
 
     /**
-     * Merges the two specified restrictions.
-     *
-     * @param restriction the first restriction
-     * @param otherRestriction the second restriction
-     * @return the merged restriction
-     * @throws InvalidRequestException if the two restrictions cannot be merged
+     * Returns the last restriction.
      */
-    private static SingleRestriction mergeRestrictions(SingleRestriction restriction,
-                                                       SingleRestriction otherRestriction)
-    {
-        return restriction == null ? otherRestriction
-                                   : restriction.mergeWith(otherRestriction);
-    }
+    public abstract SingleRestriction lastRestriction();
 
     /**
      * Checks if the restrictions contains multiple contains, contains key, or map[key] = value.
      *
-     * @return <code>true</code> if the restrictions contains multiple contains, contains key, or ,
+     * @return <code>true</code> if the restrictions contain multiple contains, contains key, or
      * map[key] = value; <code>false</code> otherwise
      */
-    public final boolean hasMultipleContains()
-    {
-        int numberOfContains = 0;
-        for (SingleRestriction restriction : restrictions.values())
-        {
-            if (restriction.isContains())
-            {
-                ContainsRestriction contains = (ContainsRestriction) restriction;
-                numberOfContains += (contains.numberOfValues() + contains.numberOfKeys() + contains.numberOfEntries());
-            }
-        }
-        return numberOfContains > 1;
-    }
+    public abstract boolean hasMultipleContains();
 
-    @Override
-    public Iterator<SingleRestriction> iterator()
-    {
-        Iterator<SingleRestriction> iterator = restrictions.values().iterator();
-        return hasMultiColumnRestrictions ? new DistinctIterator<>(iterator) : iterator;
-    }
+    public abstract List<SingleRestriction> restrictions();
 
     /**
-     * Checks if any of the underlying restriction is an IN.
-     * @return <code>true</code> if any of the underlying restriction is an IN, <code>false</code> otherwise
+     * Checks if any of the underlying restrictions is a multi-column slice.
+     *
+     * @return <code>true</code> if any of the underlying restrictions is a multi-column slice,
+     * <code>false</code> otherwise
      */
-    public final boolean hasIN()
-    {
-        return hasIn;
-    }
-
-    public boolean hasContains()
-    {
-        return hasContains;
-    }
+    public abstract boolean hasMultiColumnSlice();
 
-    public final boolean hasSlice()
+    public static Builder builder()
     {
-        return hasSlice;
+        return new Builder();
     }
 
-    /**
-     * Checks if all of the underlying restrictions are EQ or IN restrictions.
-     *
-     * @return <code>true</code> if all of the underlying restrictions are EQ or IN restrictions,
-     * <code>false</code> otherwise
-     */
-    public final boolean hasOnlyEqualityRestrictions()
+    public static final class Builder
     {
-        return hasOnlyEqualityRestrictions;
-    }
+        private final Map<ColumnMetadata, SingleRestriction> newRestrictions = new HashMap<>();
+        private boolean multiColumn = false;
 
-    /**
-     * {@code Iterator} decorator that removes duplicates in an ordered one.
-     *
-     * @param iterator the decorated iterator
-     * @param <E> the iterator element type.
-     */
-    private static final class DistinctIterator<E> extends AbstractIterator<E>
-    {
-        /**
-         * The decorated iterator.
-         */
-        private final Iterator<E> iterator;
+        private ColumnMetadata lastRestrictionColumn;
+        private SingleRestriction lastRestriction;
 
-        /**
-         * The previous element.
-         */
-        private E previous;
+        private Builder()
+        {
+        }
 
-        public DistinctIterator(Iterator<E> iterator)
+        public void addRestriction(SingleRestriction restriction)
         {
-            this.iterator = iterator;
+            List<ColumnMetadata> columnDefs = restriction.getColumnDefs();
+            Set<SingleRestriction> existingRestrictions = getRestrictions(newRestrictions, columnDefs);
+
+            if (existingRestrictions.isEmpty())
+            {
+                addRestrictionForColumns(columnDefs, restriction);
+            }
+            else
+            {
+                for (SingleRestriction existing : existingRestrictions)
+                {
+                    SingleRestriction newRestriction = existing.mergeWith(restriction);
+
+                    addRestrictionForColumns(columnDefs, newRestriction);
+                }
+            }
         }
 
-        protected E computeNext()
+        private void addRestrictionForColumns(List<ColumnMetadata> columnDefs, SingleRestriction restriction)
         {
-            while(iterator.hasNext())
+            for (int i = 0; i < columnDefs.size(); i++)
             {
-                E next = iterator.next();
-                if (!next.equals(previous))
+                ColumnMetadata column = columnDefs.get(i);
+                if (lastRestrictionColumn == null || COLUMN_DEFINITION_COMPARATOR.compare(lastRestrictionColumn, column) < 0)
                 {
-                    previous = next;
-                    return next;
+                    lastRestrictionColumn = column;
+                    lastRestriction = restriction;
                 }
+                newRestrictions.put(column, restriction);
             }
-            return endOfData();
+
+            multiColumn |= restriction.isMultiColumn();
+        }
+
+        private static Set<SingleRestriction> getRestrictions(Map<ColumnMetadata, SingleRestriction> restrictions,
+                                                              List<ColumnMetadata> columnDefs)
+        {
+            Set<SingleRestriction> set = new HashSet<>();
+            for (int i = 0; i < columnDefs.size(); i++)
+            {
+                SingleRestriction existing = restrictions.get(columnDefs.get(i));
+                if (existing != null)
+                    set.add(existing);
+            }
+            return set;
+        }
+
+        public RestrictionSet build()
+        {
+            return isEmpty() ? EmptyRestrictionSet.INSTANCE : new DefaultRestrictionSet(newRestrictions, multiColumn);
+        }
+
+        public boolean isEmpty()
+        {
+            return newRestrictions.isEmpty();
+        }
+
+        public SingleRestriction lastRestriction()
+        {
+            return lastRestriction;
+        }
+
+        public ColumnMetadata nextColumn(ColumnMetadata columnDef)
+        {
+            // This method is only invoked in the statement-preparation-phase to construct an error message.
+            NavigableSet<ColumnMetadata> columns = new TreeSet<>(COLUMN_DEFINITION_COMPARATOR);
+            columns.addAll(newRestrictions.keySet());
+            return columns.tailSet(columnDef, false).first();
         }
-    }
-    
-    @Override
-    public String toString()
-    {
-        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
index 9803adc4592f..967e1bad8f3c 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
@@ -19,10 +19,9 @@
 
 import java.util.List;
 import java.util.Set;
+import java.util.function.Consumer;
 
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
@@ -40,23 +39,26 @@ class RestrictionSetWrapper implements Restrictions
      */
     protected final RestrictionSet restrictions;
 
-    public RestrictionSetWrapper(RestrictionSet restrictions)
+    RestrictionSetWrapper(RestrictionSet restrictions)
     {
         this.restrictions = restrictions;
     }
 
-    public void addRowFilterTo(RowFilter filter,
+    @Override
+    public void addToRowFilter(RowFilter filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options)
     {
-        restrictions.addRowFilterTo(filter, indexRegistry, options);
+        restrictions.addToRowFilter(filter, indexRegistry, options);
     }
 
+    @Override
     public List<ColumnMetadata> getColumnDefs()
     {
         return restrictions.getColumnDefs();
     }
 
+    @Override
     public void addFunctionsTo(List<Function> functions)
     {
         restrictions.addFunctionsTo(functions);
@@ -67,54 +69,67 @@ public boolean isEmpty()
         return restrictions.isEmpty();
     }
 
+    public List<SingleRestriction> restrictions()
+    {
+        return restrictions.restrictions();
+    }
+
     public int size()
     {
         return restrictions.size();
     }
 
+    @Override
     public boolean hasSupportingIndex(IndexRegistry indexRegistry)
     {
         return restrictions.hasSupportingIndex(indexRegistry);
     }
 
+    @Override
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        return restrictions.needsFiltering(indexGroup);
+    }
+
+    @Override
     public ColumnMetadata getFirstColumn()
     {
         return restrictions.getFirstColumn();
     }
 
+    @Override
     public ColumnMetadata getLastColumn()
     {
         return restrictions.getLastColumn();
     }
 
+    @Override
     public boolean hasIN()
     {
         return restrictions.hasIN();
     }
 
+    @Override
     public boolean hasContains()
     {
         return restrictions.hasContains();
     }
 
+    @Override
     public boolean hasSlice()
     {
         return restrictions.hasSlice();
     }
 
+    @Override
     public boolean hasOnlyEqualityRestrictions()
     {
         return restrictions.hasOnlyEqualityRestrictions();
     }
 
+    @Override
     public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
     {
         return restrictions.getRestrictions(columnDef);
     }
-    
-    @Override
-    public String toString()
-    {
-        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
-    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java
index 77e0dd92fe64..8a016835fffb 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/Restrictions.java
@@ -52,18 +52,28 @@ public interface Restrictions extends Restriction
      * Checks if any of the underlying restriction is an IN.
      * @return <code>true</code> if any of the underlying restriction is an IN, <code>false</code> otherwise
      */
-    public boolean hasIN();
+    default public boolean hasIN()
+    {
+        return false;
+    }
 
     /**
      * Checks if any of the underlying restrictions is a CONTAINS / CONTAINS KEY restriction.
      * @return <code>true</code> if any of the underlying restrictions is CONTAINS, <code>false</code> otherwise
      */
-    public boolean hasContains();
+    default public boolean hasContains()
+    {
+        return false;
+    }
+
     /**
      * Checks if any of the underlying restrictions is a slice.
      * @return <code>true</code> if any of the underlying restrictions is a slice, <code>false</code> otherwise
      */
-    public boolean hasSlice();
+    default public boolean hasSlice()
+    {
+        return false;
+    }
 
     /**
      * Checks if all of the underlying restrictions are EQ or IN restrictions.
@@ -71,5 +81,8 @@ public interface Restrictions extends Restriction
      * @return <code>true</code> if all of the underlying restrictions are EQ or IN restrictions,
      * <code>false</code> otherwise
      */
-    public boolean hasOnlyEqualityRestrictions();
+    default public boolean hasOnlyEqualityRestrictions()
+    {
+        return true;
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
index 1b3482b72b18..0af4d2e730e1 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.function.Consumer;
 
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.*;
@@ -31,6 +32,8 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.serializers.ListSerializer;
+import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 
@@ -80,6 +83,16 @@ public boolean hasSupportingIndex(IndexRegistry indexRegistry)
         return false;
     }
 
+    @Override
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        for (Index index : indexGroup.getIndexes())
+            if (isSupportedBy(index))
+                return false;
+
+        return true;
+    }
+
     @Override
     public final SingleRestriction mergeWith(SingleRestriction otherRestriction)
     {
@@ -150,7 +163,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter,
+        public void addToRowFilter(RowFilter filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
@@ -214,11 +227,18 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter,
+        public void addToRowFilter(RowFilter filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
-            throw invalidRequest("IN restrictions are not supported on indexed columns");
+            List<ByteBuffer> values = getValues(options);
+            for (ByteBuffer v : values)
+            {
+                checkNotNull(v, "Invalid null value for column %s", columnDef.name);
+                checkBindValueSet(v, "Invalid unset value for column %s", columnDef.name);
+            }
+            ByteBuffer buffer = ListSerializer.pack(values, values.size(), ProtocolVersion.V3);
+            filter.add(columnDef, Operator.IN, buffer);
         }
 
         @Override
@@ -385,7 +405,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             for (Bound b : Bound.values())
                 if (hasBound(b))
@@ -475,7 +495,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             for (ByteBuffer value : bindAndGet(values, options))
                 filter.add(columnDef, Operator.CONTAINS, value);
@@ -614,7 +634,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter,
+        public void addToRowFilter(RowFilter filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
@@ -690,7 +710,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addRowFilterTo(RowFilter filter,
+        public void addToRowFilter(RowFilter filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index d1837d459ddb..70ec0ddadcfe 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -19,8 +19,12 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.function.Consumer;
 
 import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.Function;
@@ -35,25 +39,31 @@
 import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.btree.BTreeSet;
 
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
-import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
-import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
-import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+import static org.apache.cassandra.cql3.statements.RequestValidations.*;
 
 /**
  * The restrictions corresponding to the relations specified on the where-clause of CQL query.
  */
-public final class StatementRestrictions
+public class StatementRestrictions
 {
     public static final String REQUIRES_ALLOW_FILTERING_MESSAGE =
             "Cannot execute this query as it might involve data filtering and " +
             "thus may have unpredictable performance. If you want to execute " +
             "this query despite the performance unpredictability, use ALLOW FILTERING";
 
+    public static final String HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE =
+    "Column '%s' has an index but does not support the operators specified in the query. " +
+    "If you want to execute this query despite the performance unpredictability, use ALLOW FILTERING";
+
+    public static final String HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI =
+    "Columns %s have indexes but do not support the operators specified in the query. " +
+    "If you want to execute this query despite the performance unpredictability, use ALLOW FILTERING";
+
+    public static final String INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE = "Index on column %s does not support LIKE restrictions.";
+
     /**
      * The type of statement
      */
@@ -67,34 +77,34 @@ public final class StatementRestrictions
     /**
      * Restrictions on partitioning columns
      */
-    private PartitionKeyRestrictions partitionKeyRestrictions;
+    protected final PartitionKeyRestrictions partitionKeyRestrictions;
 
     /**
      * Restrictions on clustering columns
      */
-    private ClusteringColumnRestrictions clusteringColumnsRestrictions;
+    private final ClusteringColumnRestrictions clusteringColumnsRestrictions;
 
     /**
      * Restriction on non-primary key columns (i.e. secondary index restrictions)
      */
-    private RestrictionSet nonPrimaryKeyRestrictions;
+    private final RestrictionSet nonPrimaryKeyRestrictions;
 
-    private Set<ColumnMetadata> notNullColumns;
+    private final ImmutableSet<ColumnMetadata> notNullColumns;
 
     /**
      * The restrictions used to build the row filter
      */
-    private final IndexRestrictions filterRestrictions = new IndexRestrictions();
+    private final IndexRestrictions filterRestrictions;
 
     /**
      * <code>true</code> if the secondary index need to be queried, <code>false</code> otherwise
      */
-    private boolean usesSecondaryIndexing;
+    protected boolean usesSecondaryIndexing;
 
     /**
      * Specify if the query will return a range of partition keys.
      */
-    private boolean isKeyRange;
+    protected boolean isKeyRange;
 
     /**
      * <code>true</code> if nonPrimaryKeyRestrictions contains restriction on a regular column,
@@ -118,10 +128,81 @@ private StatementRestrictions(StatementType type, TableMetadata table, boolean a
     {
         this.type = type;
         this.table = table;
-        this.partitionKeyRestrictions = new PartitionKeySingleRestrictionSet(table.partitionKeyAsClusteringComparator());
-        this.clusteringColumnsRestrictions = new ClusteringColumnRestrictions(table, allowFiltering);
-        this.nonPrimaryKeyRestrictions = new RestrictionSet();
-        this.notNullColumns = new HashSet<>();
+        this.partitionKeyRestrictions = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator()).build();
+        this.clusteringColumnsRestrictions = ClusteringColumnRestrictions.builder(table, allowFiltering).build();
+        this.nonPrimaryKeyRestrictions = RestrictionSet.builder().build();
+        this.notNullColumns = ImmutableSet.of();
+        this.filterRestrictions = IndexRestrictions.of();
+    }
+
+    private StatementRestrictions(StatementType type, TableMetadata table,
+                                  PartitionKeyRestrictions partitionKeyRestrictions,
+                                  ClusteringColumnRestrictions clusteringColumnsRestrictions,
+                                  RestrictionSet nonPrimaryKeyRestrictions,
+                                  ImmutableSet<ColumnMetadata> notNullColumns,
+                                  boolean usesSecondaryIndexing, boolean isKeyRange,
+                                  IndexRestrictions filterRestrictions)
+    {
+        this.type = type;
+        this.table = table;
+        this.partitionKeyRestrictions = partitionKeyRestrictions;
+        this.clusteringColumnsRestrictions = clusteringColumnsRestrictions;
+        this.nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictions;
+        this.notNullColumns = notNullColumns;
+        this.usesSecondaryIndexing = usesSecondaryIndexing;
+        this.isKeyRange = isKeyRange;
+        this.filterRestrictions = filterRestrictions;
+        // Derived the same way as in the public constructor; without this, copies would silently reset the flag to false.
+        this.hasRegularColumnsRestrictions = nonPrimaryKeyRestrictions.hasRestrictionFor(ColumnMetadata.Kind.REGULAR);
+    }
+
+    /**
+     * Creates a copy of these restrictions whose index restrictions are the current ones plus the given ones.
+     *
+     * @param restrictions the restrictions to add to the index restrictions
+     * @return a new {@code StatementRestrictions} with the new index restrictions
+     */
+    public StatementRestrictions addIndexRestrictions(Restrictions restrictions)
+    {
+        IndexRestrictions newIndexRestrictions = IndexRestrictions.builder()
+                                                                  .add(filterRestrictions)
+                                                                  .add(restrictions)
+                                                                  .build();
+
+        return new StatementRestrictions(type,
+                                         table,
+                                         partitionKeyRestrictions,
+                                         clusteringColumnsRestrictions,
+                                         nonPrimaryKeyRestrictions,
+                                         notNullColumns,
+                                         usesSecondaryIndexing,
+                                         isKeyRange,
+                                         newIndexRestrictions);
+    }
+
+    /**
+     * Creates a copy of these restrictions whose index restrictions also contain the given external expressions (mostly custom and user index expressions).
+     *
+     * @param restrictions the restrictions to add to the index restrictions
+     * @return a new {@code StatementRestrictions} with the new index restrictions
+     */
+    public StatementRestrictions addExternalRestrictions(Iterable<CustomIndexExpression> restrictions)
+    {
+        IndexRestrictions.Builder newIndexRestrictions = IndexRestrictions.builder()
+                                                                          .add(filterRestrictions);
+
+        for (CustomIndexExpression restriction : restrictions)
+            newIndexRestrictions.add(restriction);
+
+        return new StatementRestrictions(type,
+                                         table,
+                                         partitionKeyRestrictions,
+                                         clusteringColumnsRestrictions,
+                                         nonPrimaryKeyRestrictions,
+                                         notNullColumns,
+                                         usesSecondaryIndexing,
+                                         isKeyRange,
+                                         newIndexRestrictions.build());
     }
 
     public StatementRestrictions(StatementType type,
@@ -148,12 +229,22 @@ public StatementRestrictions(StatementType type,
                                  boolean allowFiltering,
                                  boolean forView)
     {
-        this(type, table, allowFiltering);
+        this.type = type;
+        this.table = table;
 
         IndexRegistry indexRegistry = null;
-        if (type.allowUseOfSecondaryIndices())
+
+        // We want to avoid opening the keyspace during view construction
+        // since we're parsing these for restore and the base table or keyspace might not exist in the current schema.
+        if (allowUseOfSecondaryIndices && type.allowUseOfSecondaryIndices())
             indexRegistry = IndexRegistry.obtain(table);
 
+        PartitionKeySingleRestrictionSet.Builder partitionKeyRestrictionSet = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator());
+        ClusteringColumnRestrictions.Builder clusteringColumnsRestrictionSet = ClusteringColumnRestrictions.builder(table, allowFiltering, indexRegistry);
+        RestrictionSet.Builder nonPrimaryKeyRestrictionSet = RestrictionSet.builder();
+
+        ImmutableSet.Builder<ColumnMetadata> notNullColumnsBuilder = ImmutableSet.builder();
+
         /*
          * WHERE clause. For a given entity, rules are:
          *   - EQ relation conflicts with anything else (including a 2nd EQ)
@@ -168,39 +259,65 @@ public StatementRestrictions(StatementType type,
             if (relation.operator() == Operator.IS_NOT)
             {
                 if (!forView)
-                    throw new InvalidRequestException("Unsupported restriction: " + relation);
+                    throw invalidRequest("Unsupported restriction: %s", relation);
 
-                this.notNullColumns.addAll(relation.toRestriction(table, boundNames).getColumnDefs());
+                notNullColumnsBuilder.addAll(relation.toRestriction(table, boundNames).getColumnDefs());
             }
-            else if (relation.isLIKE())
+            else
             {
                 Restriction restriction = relation.toRestriction(table, boundNames);
 
-                if (!type.allowUseOfSecondaryIndices() || !restriction.hasSupportingIndex(indexRegistry))
-                    throw new InvalidRequestException(String.format("LIKE restriction is only supported on properly " +
-                                                                    "indexed columns. %s is not valid.",
-                                                                    relation.toString()));
+                if (relation.isLIKE() && (!type.allowUseOfSecondaryIndices() || !restriction.hasSupportingIndex(indexRegistry)))
+                {
+                    if (getColumnsWithUnsupportedIndexRestrictions(table, ImmutableList.of(restriction)).isEmpty())
+                    {
+                        throw invalidRequest("LIKE restriction is only supported on properly indexed columns. %s is not valid.", relation.toString());
+                    }
+                    else
+                    {
+                        throw invalidRequest(INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, restriction.getFirstColumn());
+                    }
+                }
 
-                addRestriction(restriction);
-            }
-            else
-            {
-                addRestriction(relation.toRestriction(table, boundNames));
+                ColumnMetadata def = restriction.getFirstColumn();
+                if (def.isPartitionKey())
+                {
+                    partitionKeyRestrictionSet.addRestriction(restriction);
+                }
+                else if (def.isClusteringColumn())
+                {
+                    clusteringColumnsRestrictionSet.addRestriction(restriction);
+                }
+                else
+                {
+                    nonPrimaryKeyRestrictionSet.addRestriction((SingleRestriction) restriction);
+                }
             }
         }
 
-        hasRegularColumnsRestrictions = nonPrimaryKeyRestrictions.hasRestrictionFor(ColumnMetadata.Kind.REGULAR);
+        this.partitionKeyRestrictions = partitionKeyRestrictionSet.build();
+        this.clusteringColumnsRestrictions = clusteringColumnsRestrictionSet.build();
+        this.nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictionSet.build();
+        this.notNullColumns = notNullColumnsBuilder.build();
+        this.hasRegularColumnsRestrictions = nonPrimaryKeyRestrictions.hasRestrictionFor(ColumnMetadata.Kind.REGULAR);
 
         boolean hasQueriableClusteringColumnIndex = false;
         boolean hasQueriableIndex = false;
 
+        IndexRestrictions.Builder filterRestrictionsBuilder = IndexRestrictions.builder();
+
         if (allowUseOfSecondaryIndices)
         {
             if (whereClause.containsCustomExpressions())
-                processCustomIndexExpressions(whereClause.expressions, boundNames, indexRegistry);
+            {
+                CustomIndexExpression customExpression = prepareCustomIndexExpression(whereClause.expressions,
+                                                                                      boundNames,
+                                                                                      indexRegistry);
+                filterRestrictionsBuilder.add(customExpression);
+            }
 
             hasQueriableClusteringColumnIndex = clusteringColumnsRestrictions.hasSupportingIndex(indexRegistry);
-            hasQueriableIndex = !filterRestrictions.getCustomIndexExpressions().isEmpty()
+            hasQueriableIndex = whereClause.containsCustomExpressions()
                     || hasQueriableClusteringColumnIndex
                     || partitionKeyRestrictions.hasSupportingIndex(indexRegistry)
                     || nonPrimaryKeyRestrictions.hasSupportingIndex(indexRegistry);
@@ -212,7 +329,7 @@ else if (relation.isLIKE())
         // Some but not all of the partition key columns have been specified;
         // hence we need turn these restrictions into a row filter.
         if (usesSecondaryIndexing || partitionKeyRestrictions.needFiltering(table))
-            filterRestrictions.add(partitionKeyRestrictions);
+            filterRestrictionsBuilder.add(partitionKeyRestrictions);
 
         if (selectsOnlyStaticColumns && hasClusteringColumnsRestrictions())
         {
@@ -229,8 +346,6 @@ else if (relation.isLIKE())
             if (type.isDelete() || type.isUpdate())
                 throw invalidRequest("Invalid restrictions on clustering columns since the %s statement modifies only static columns",
                                      type);
-            if (type.isSelect())
-                throw invalidRequest("Cannot restrict clustering columns when selecting only static columns");
         }
 
         processClusteringColumnsRestrictions(hasQueriableIndex,
@@ -243,7 +358,7 @@ else if (relation.isLIKE())
             usesSecondaryIndexing = true;
 
         if (usesSecondaryIndexing || clusteringColumnsRestrictions.needFiltering())
-            filterRestrictions.add(clusteringColumnsRestrictions);
+            filterRestrictionsBuilder.add(clusteringColumnsRestrictions);
 
         // Even if usesSecondaryIndexing is false at this point, we'll still have to use one if
         // there is restrictions not covered by the PK.
@@ -260,24 +375,32 @@ else if (relation.isLIKE())
             if (hasQueriableIndex)
                 usesSecondaryIndexing = true;
             else if (!allowFiltering)
-                throw invalidRequest(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+                throwRequiresAllowFilteringError(table);
 
-            filterRestrictions.add(nonPrimaryKeyRestrictions);
+            filterRestrictionsBuilder.add(nonPrimaryKeyRestrictions);
         }
 
+        filterRestrictions = filterRestrictionsBuilder.build();
+
         if (usesSecondaryIndexing)
             validateSecondaryIndexSelections();
     }
 
-    private void addRestriction(Restriction restriction)
+    public void throwRequiresAllowFilteringError(TableMetadata table)
     {
-        ColumnMetadata def = restriction.getFirstColumn();
-        if (def.isPartitionKey())
-            partitionKeyRestrictions = partitionKeyRestrictions.mergeWith(restriction);
-        else if (def.isClusteringColumn())
-            clusteringColumnsRestrictions = clusteringColumnsRestrictions.mergeWith(restriction);
+        Set<ColumnMetadata> unsupported = getColumnsWithUnsupportedIndexRestrictions(table);
+        if (unsupported.isEmpty())
+        {
+            throw invalidRequest(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+        }
         else
-            nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictions.addRestriction((SingleRestriction) restriction);
+        {
+            // An index exists on these columns but cannot serve the restriction: be specific. invalidRequest formats the template itself, so it is not pre-formatted here (a '%' in a column name must not be parsed as a specifier).
+            if (unsupported.size() == 1)
+                throw invalidRequest(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, unsupported.iterator().next());
+            else
+                throw invalidRequest(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, unsupported);
+        }
     }
 
     public void addFunctionsTo(List<Function> functions)
@@ -323,7 +446,7 @@ public Set<ColumnMetadata> nonPKRestrictedColumns(boolean includeNotNullRestrict
     /**
      * @return the set of columns that have an IS NOT NULL restriction on them
      */
-    public Set<ColumnMetadata> notNullColumns()
+    public ImmutableSet<ColumnMetadata> notNullColumns()
     {
         return notNullColumns;
     }
@@ -381,7 +504,7 @@ public boolean isColumnRestrictedByEq(ColumnMetadata columnDef)
      * @param kind the column type
      * @return the <code>Restrictions</code> for the specified type of columns
      */
-    private Restrictions getRestrictions(ColumnMetadata.Kind kind)
+    protected Restrictions getRestrictions(ColumnMetadata.Kind kind)
     {
         switch (kind)
         {
@@ -401,7 +524,7 @@ public boolean usesSecondaryIndexing()
         return this.usesSecondaryIndexing;
     }
 
-    private void processPartitionKeyRestrictions(boolean hasQueriableIndex, boolean allowFiltering, boolean forView)
+    protected void processPartitionKeyRestrictions(boolean hasQueriableIndex, boolean allowFiltering, boolean forView)
     {
         if (!type.allowPartitionKeyRanges())
         {
@@ -440,9 +563,6 @@ private void processPartitionKeyRestrictions(boolean hasQueriableIndex, boolean
                 if (!allowFiltering && !forView && !hasQueriableIndex)
                     throw new InvalidRequestException(REQUIRES_ALLOW_FILTERING_MESSAGE);
 
-                if (partitionKeyRestrictions.hasIN())
-                    throw new InvalidRequestException("IN restrictions are not supported when the query involves filtering");
-
                 isKeyRange = true;
                 usesSecondaryIndexing = hasQueriableIndex;
             }
@@ -532,7 +652,7 @@ private void processClusteringColumnsRestrictions(boolean hasQueriableIndex,
                 else if (!allowFiltering)
                 {
                     List<ColumnMetadata> clusteringColumns = table.clusteringColumns();
-                    List<ColumnMetadata> restrictedColumns = new LinkedList<>(clusteringColumnsRestrictions.getColumnDefs());
+                    List<ColumnMetadata> restrictedColumns = clusteringColumnsRestrictions.getColumnDefs();
 
                     for (int i = 0, m = restrictedColumns.size(); i < m; i++)
                     {
@@ -560,7 +680,7 @@ else if (!allowFiltering)
     private Collection<ColumnIdentifier> getUnrestrictedClusteringColumns()
     {
         List<ColumnMetadata> missingClusteringColumns = new ArrayList<>(table.clusteringColumns());
-        missingClusteringColumns.removeAll(new LinkedList<>(clusteringColumnsRestrictions.getColumnDefs()));
+        missingClusteringColumns.removeAll(clusteringColumnsRestrictions.getColumnDefs());
         return ColumnMetadata.toIdentifiers(missingClusteringColumns);
     }
 
@@ -573,9 +693,9 @@ private boolean hasUnrestrictedClusteringColumns()
         return table.clusteringColumns().size() != clusteringColumnsRestrictions.size();
     }
 
-    private void processCustomIndexExpressions(List<CustomIndexExpression> expressions,
-                                               VariableSpecifications boundNames,
-                                               IndexRegistry indexRegistry)
+    private CustomIndexExpression prepareCustomIndexExpression(List<CustomIndexExpression> expressions,
+                                                               VariableSpecifications boundNames,
+                                                               IndexRegistry indexRegistry)
     {
         if (expressions.size() > 1)
             throw new InvalidRequestException(IndexRestrictions.MULTIPLE_EXPRESSIONS);
@@ -599,20 +719,19 @@ private void processCustomIndexExpressions(List<CustomIndexExpression> expressio
             throw IndexRestrictions.customExpressionNotSupported(expression.targetIndex);
 
         expression.prepareValue(table, expressionType, boundNames);
-
-        filterRestrictions.add(expression);
+        return expression;
     }
 
-    public RowFilter getRowFilter(IndexRegistry indexRegistry, QueryOptions options)
+    public RowFilter getRowFilter(IndexRegistry indexManager, QueryOptions options)
     {
         if (filterRestrictions.isEmpty())
             return RowFilter.NONE;
 
         RowFilter filter = RowFilter.create();
         for (Restrictions restrictions : filterRestrictions.getRestrictions())
-            restrictions.addRowFilterTo(filter, indexRegistry, options);
+            restrictions.addToRowFilter(filter, indexManager, options);
 
-        for (CustomIndexExpression expression : filterRestrictions.getCustomIndexExpressions())
+        for (CustomIndexExpression expression : filterRestrictions.getExternalExpressions())
             expression.addToRowFilter(filter, table, options);
 
         return filter;
@@ -776,37 +895,61 @@ public NavigableSet<ClusteringBound<?>> getClusteringColumnsBounds(Bound b, Quer
      */
     public boolean isColumnRange()
     {
-        int numberOfClusteringColumns = table.clusteringColumns().size();
-        if (table.isStaticCompactTable())
-        {
-            // For static compact tables we want to ignore the fake clustering column (note that if we weren't special casing,
-            // this would mean a 'SELECT *' on a static compact table would query whole partitions, even though we'll only return
-            // the static part as far as CQL is concerned. This is thus mostly an optimization to use the query-by-name path).
-            numberOfClusteringColumns = 0;
-        }
-
+        // For static compact tables we want to ignore the fake clustering column (note that if we weren't special casing,
+        // this would mean a 'SELECT *' on a static compact table would query whole partitions, even though we'll only return
+        // the static part as far as CQL is concerned. This is thus mostly an optimization to use the query-by-name path).
+        int numberOfClusteringColumns = table.isStaticCompactTable() ? 0 : table.clusteringColumns().size();
         // it is a range query if it has at least one the column alias for which no relation is defined or is not EQ or IN.
         return clusteringColumnsRestrictions.size() < numberOfClusteringColumns
             || !clusteringColumnsRestrictions.hasOnlyEqualityRestrictions();
     }
 
     /**
-     * Checks if the query need to use filtering.
+     * Checks if the query needs to use filtering.
+     *
      * @return <code>true</code> if the query need to use filtering, <code>false</code> otherwise.
      */
-    public boolean needFiltering()
+    public boolean needFiltering(TableMetadata table)
     {
-        int numberOfRestrictions = filterRestrictions.getCustomIndexExpressions().size();
-        for (Restrictions restrictions : filterRestrictions.getRestrictions())
-            numberOfRestrictions += restrictions.size();
+        IndexRegistry indexRegistry = IndexRegistry.obtain(table);
+        boolean hasClusteringColumnRestrictions = !clusteringColumnsRestrictions.isEmpty();
+        boolean hasMultipleContains = nonPrimaryKeyRestrictions.hasMultipleContains();
+        return filterRestrictions.needFiltering(indexRegistry, hasClusteringColumnRestrictions, hasMultipleContains);
+    }
 
-        return numberOfRestrictions > 1
-                || (numberOfRestrictions == 0 && !clusteringColumnsRestrictions.isEmpty())
-                || (numberOfRestrictions != 0
-                        && nonPrimaryKeyRestrictions.hasMultipleContains());
+    public Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table)
+    {
+        return getColumnsWithUnsupportedIndexRestrictions(table, Iterables.concat(clusteringColumnsRestrictions.restrictions(), nonPrimaryKeyRestrictions.restrictions()));
     }
 
-    private void validateSecondaryIndexSelections()
+    public Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table, Iterable<Restriction> restrictions)
+    {
+        IndexRegistry indexRegistry = IndexRegistry.obtain(table);
+        if (indexRegistry.listIndexes().isEmpty())
+            return Collections.emptySet();
+
+        ImmutableSet.Builder<ColumnMetadata> builder = ImmutableSet.builder();
+
+        for (Restriction restriction : restrictions)
+        {
+            if (!restriction.hasSupportingIndex(indexRegistry))
+            {
+                for (Index index : indexRegistry.listIndexes())
+                {
+                    // hasSupportingIndex rejected the restriction, yet this index depends on one of its columns: the index exists but cannot serve the restriction, so report that column
+                    for (ColumnMetadata column : restriction.getColumnDefs())
+                    {
+                        if (index.dependsOn(column))
+                            builder.add(column);
+                    }
+                }
+            }
+        }
+
+        return builder.build();
+    }
+
+    protected void validateSecondaryIndexSelections()
     {
         checkFalse(keyIsInRelation(),
                    "Select on indexed columns and with IN clause for the PRIMARY KEY are not supported");
@@ -862,10 +1005,4 @@ public boolean returnStaticContentOnPartitionWithNoRows()
         // a full partition query, then we include that content.
         return queriesFullPartitions();
     }
-    
-    @Override
-    public String toString()
-    {
-        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
-    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
index 437b17c617ab..84ec5e6b1c3b 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
@@ -18,15 +18,17 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.function.Consumer;
 
 import com.google.common.collect.BoundType;
 import com.google.common.collect.ImmutableRangeSet;
 import com.google.common.collect.Range;
 import com.google.common.collect.RangeSet;
 
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
@@ -34,7 +36,11 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.QueryState;
 
 import static org.apache.cassandra.cql3.statements.Bound.END;
 import static org.apache.cassandra.cql3.statements.Bound.START;
@@ -45,36 +51,108 @@
  * <p>If all partition key columns have non-token restrictions and do not need filtering, they take precedence
  * when calculating bounds, incusiveness etc (see CASSANDRA-12149).</p>
  */
-final class TokenFilter implements PartitionKeyRestrictions
+abstract class TokenFilter implements PartitionKeyRestrictions
 {
     /**
      * The decorated restriction
      */
-    private final PartitionKeyRestrictions restrictions;
+    final PartitionKeyRestrictions restrictions;
 
     /**
      * The restriction on the token
      */
-    private final TokenRestriction tokenRestriction;
+    final TokenRestriction tokenRestriction;
 
     /**
      * Partitioner to manage tokens, extracted from tokenRestriction metadata.
      */
     private final IPartitioner partitioner;
 
-    public boolean hasIN()
+    static TokenFilter create(PartitionKeyRestrictions restrictions, TokenRestriction tokenRestriction)
     {
-        return isOnToken() ? false : restrictions.hasIN();
+        boolean onToken = restrictions.needFiltering(tokenRestriction.metadata) || restrictions.size() < tokenRestriction.size();
+        return onToken ? new TokenFilter.OnToken(restrictions, tokenRestriction)
+                       : new TokenFilter.NotOnToken(restrictions, tokenRestriction);
     }
 
-    public boolean hasContains()
+    private TokenFilter(PartitionKeyRestrictions restrictions, TokenRestriction tokenRestriction)
     {
-        return isOnToken() ? false : restrictions.hasContains();
+        this.restrictions = restrictions;
+        this.tokenRestriction = tokenRestriction;
+        this.partitioner = tokenRestriction.metadata.partitioner;
     }
 
-    public boolean hasOnlyEqualityRestrictions()
+    private static final class OnToken extends TokenFilter
     {
-        return isOnToken() ? false : restrictions.hasOnlyEqualityRestrictions();
+        private OnToken(PartitionKeyRestrictions restrictions, TokenRestriction tokenRestriction)
+        {
+            super(restrictions, tokenRestriction);
+        }
+
+        @Override
+        public boolean isOnToken()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean isInclusive(Bound bound)
+        {
+            return tokenRestriction.isInclusive(bound);
+        }
+
+        @Override
+        public boolean hasBound(Bound bound)
+        {
+            return tokenRestriction.hasBound(bound);
+        }
+
+        @Override
+        public List<ByteBuffer> bounds(Bound bound, QueryOptions options) throws InvalidRequestException
+        {
+            return tokenRestriction.bounds(bound, options);
+        }
+    }
+
+    private static final class NotOnToken extends TokenFilter
+    {
+        private NotOnToken(PartitionKeyRestrictions restrictions, TokenRestriction tokenRestriction)
+        {
+            super(restrictions, tokenRestriction);
+        }
+
+        @Override
+        public boolean isInclusive(Bound bound)
+        {
+            return restrictions.isInclusive(bound);
+        }
+
+        @Override
+        public boolean hasBound(Bound bound)
+        {
+            return restrictions.hasBound(bound);
+        }
+
+        @Override
+        public List<ByteBuffer> bounds(Bound bound, QueryOptions options) throws InvalidRequestException
+        {
+            return restrictions.bounds(bound, options);
+        }
+
+        public boolean hasIN()
+        {
+            return restrictions.hasIN();
+        }
+
+        public boolean hasContains()
+        {
+            return restrictions.hasContains();
+        }
+
+        public boolean hasOnlyEqualityRestrictions()
+        {
+            return restrictions.hasOnlyEqualityRestrictions();
+        }
     }
 
     @Override
@@ -86,21 +164,6 @@ public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
         return set;
     }
 
-    @Override
-    public boolean isOnToken()
-    {
-        // if all partition key columns have non-token restrictions and do not need filtering,
-        // we can simply use the token range to filter those restrictions and then ignore the token range
-        return needFiltering(tokenRestriction.metadata) || restrictions.size() < tokenRestriction.size();
-    }
-
-    public TokenFilter(PartitionKeyRestrictions restrictions, TokenRestriction tokenRestriction)
-    {
-        this.restrictions = restrictions;
-        this.tokenRestriction = tokenRestriction;
-        this.partitioner = tokenRestriction.metadata.partitioner;
-    }
-
     @Override
     public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
     {
@@ -111,27 +174,9 @@ public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestExcept
     public PartitionKeyRestrictions mergeWith(Restriction restriction) throws InvalidRequestException
     {
         if (restriction.isOnToken())
-            return new TokenFilter(restrictions, (TokenRestriction) tokenRestriction.mergeWith(restriction));
-
-        return new TokenFilter(restrictions.mergeWith(restriction), tokenRestriction);
-    }
+            return TokenFilter.create(restrictions, (TokenRestriction) tokenRestriction.mergeWith(restriction));
 
-    @Override
-    public boolean isInclusive(Bound bound)
-    {
-        return isOnToken() ? tokenRestriction.isInclusive(bound) : restrictions.isInclusive(bound);
-    }
-
-    @Override
-    public boolean hasBound(Bound bound)
-    {
-        return isOnToken() ? tokenRestriction.hasBound(bound) : restrictions.hasBound(bound);
-    }
-
-    @Override
-    public List<ByteBuffer> bounds(Bound bound, QueryOptions options) throws InvalidRequestException
-    {
-        return isOnToken() ? tokenRestriction.bounds(bound, options) : restrictions.bounds(bound, options);
+        return TokenFilter.create(restrictions.mergeWith(restriction), tokenRestriction);
     }
 
     /**
@@ -278,9 +323,15 @@ public boolean hasSupportingIndex(IndexRegistry indexRegistry)
     }
 
     @Override
-    public void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        return restrictions.needsFiltering(indexGroup);
+    }
+
+    @Override
+    public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
     {
-        restrictions.addRowFilterTo(filter, indexRegistry, options);
+        restrictions.addToRowFilter(filter, indexRegistry, options);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
index e71b17782d15..62c93236b567 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
@@ -22,6 +22,7 @@
 
 import com.google.common.base.Joiner;
 
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
@@ -122,7 +123,13 @@ public boolean hasSupportingIndex(IndexRegistry indexRegistry)
     }
 
     @Override
-    public void addRowFilterTo(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+    public boolean needsFiltering(Index.Group indexGroup)
+    {
+        return false; // same answer for every index group: the argument is ignored
+    }
+
+    @Override
+    public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
     {
         throw new UnsupportedOperationException("Index expression cannot be created for token restriction");
     }
@@ -153,7 +160,7 @@ protected final String getColumnNamesAsString()
     public final PartitionKeyRestrictions mergeWith(Restriction otherRestriction) throws InvalidRequestException
     {
         if (!otherRestriction.isOnToken())
-            return new TokenFilter(toPartitionKeyRestrictions(otherRestriction), this);
+            return TokenFilter.create(toPartitionKeyRestrictions(otherRestriction), this);
 
         return doMergeWith((TokenRestriction) otherRestriction);
     }
@@ -176,7 +183,9 @@ private PartitionKeyRestrictions toPartitionKeyRestrictions(Restriction restrict
         if (restriction instanceof PartitionKeyRestrictions)
             return (PartitionKeyRestrictions) restriction;
 
-        return new PartitionKeySingleRestrictionSet(metadata.partitionKeyAsClusteringComparator()).mergeWith(restriction);
+        return PartitionKeySingleRestrictionSet.builder(metadata.partitionKeyAsClusteringComparator())
+                                               .addRestriction(restriction)
+                                               .build();
     }
 
     public static final class EQRestriction extends TokenRestriction
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index 774bd689799d..6cb022b98c98 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -975,7 +975,7 @@ public SelectStatement prepare(boolean forView) throws InvalidRequestException
                     orderingComparator = Collections.reverseOrder(orderingComparator);
             }
 
-            checkNeedsFiltering(restrictions);
+            checkNeedsFiltering(table, restrictions);
 
             return new SelectStatement(table,
                                        bindVariables,
@@ -1234,15 +1234,17 @@ private boolean isReversed(TableMetadata table, Map<ColumnMetadata, Boolean> ord
         }
 
         /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
-        private void checkNeedsFiltering(StatementRestrictions restrictions) throws InvalidRequestException
+        private void checkNeedsFiltering(TableMetadata table, StatementRestrictions restrictions) throws InvalidRequestException
         {
             // non-key-range non-indexed queries cannot involve filtering underneath
             if (!parameters.allowFiltering && (restrictions.isKeyRange() || restrictions.usesSecondaryIndexing()))
             {
-                // We will potentially filter data if either:
-                //  - Have more than one IndexExpression
-                //  - Have no index expression and the row filter is not the identity
-                checkFalse(restrictions.needFiltering(), StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+                // We will potentially filter data if the row filter is not the identity and there isn't any index group
+                // supporting all the expressions in the filter.
+                if (restrictions.needFiltering(table))
+                {
+                    restrictions.throwRequiresAllowFilteringError(table);
+                }
             }
         }
 
diff --git a/src/java/org/apache/cassandra/db/MultiCBuilder.java b/src/java/org/apache/cassandra/db/MultiCBuilder.java
index 0b5625b26f2a..787755192678 100644
--- a/src/java/org/apache/cassandra/db/MultiCBuilder.java
+++ b/src/java/org/apache/cassandra/db/MultiCBuilder.java
@@ -20,9 +20,15 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;
 import java.util.NavigableSet;
+import java.util.TreeSet;
 
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.btree.BTreeSet;
@@ -165,6 +171,13 @@ public boolean hasMissingElements()
      */
     public abstract NavigableSet<Clustering<?>> build();
 
+    /**
+     * Builds the serialized partition keys from the elements added to this builder.
+     *
+     * @return the serialized partition keys, or an empty list if some elements are missing
+     */
+    public abstract List<ByteBuffer> buildSerializedPartitionKeys();
+
     /**
      * Builds the <code>ClusteringBound</code>s for slice restrictions.
      *
@@ -262,6 +275,23 @@ public NavigableSet<Clustering<?>> build()
             return BTreeSet.of(comparator, size == 0 ? Clustering.EMPTY : Clustering.make(elements));
         }
 
+        @Override
+        public List<ByteBuffer> buildSerializedPartitionKeys()
+        {
+            built = true;
+
+            if (hasMissingElements) // missing element(s): no partition key is returned
+                return Collections.emptyList();
+
+            if (size == 0) // nothing was added: the only key is the empty buffer
+                return ImmutableList.of(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+
+            if (size == 1) // a single element is returned as-is, without composite encoding
+                return ImmutableList.of(elements[0]);
+
+            return ImmutableList.of(CompositeType.build(ByteBufferAccessor.instance, elements));
+        }
+
         @Override
         public NavigableSet<ClusteringBound<?>> buildBoundForSlice(boolean isStart,
                                                                    boolean isInclusive,
@@ -418,6 +448,36 @@ public NavigableSet<Clustering<?>> build()
             return set.build();
         }
 
+        @Override
+        public List<ByteBuffer> buildSerializedPartitionKeys()
+        {
+            built = true;
+
+            if (hasMissingElements) // missing element(s): no partition key is returned
+                return Collections.emptyList();
+
+            // Use a TreeSet here to remove duplicates and return the values in comparator sorted order
+            TreeSet<ByteBuffer> set = comparator.size() == 1 ? new TreeSet<>(comparator.subtype(0))
+                                                             : new TreeSet<>(CompositeType.getInstance(comparator.subtypes()));
+
+            for (int i = 0, m = elementsList.size(); i < m; i++)
+            {
+                List<ByteBuffer> elements = elementsList.get(i);
+                set.add(comparator.size() == 1 ? elements.get(0) : toComposite(elements)); // one component is used as-is, several are packed into a composite
+            }
+            return new ArrayList<>(set);
+        }
+
+        private ByteBuffer toComposite(List<ByteBuffer> elements)
+        {
+            // CompositeType.build is given the components as an array here, so copy the
+            // list into one first; toArray keeps the components in list order.
+            ByteBuffer[] components = elements.toArray(new ByteBuffer[0]);
+
+            ByteBuffer composite = CompositeType.build(ByteBufferAccessor.instance, components);
+            return composite;
+        }
+
         public NavigableSet<ClusteringBound<?>> buildBoundForSlice(boolean isStart,
                                                                    boolean isInclusive,
                                                                    boolean isOtherBoundInclusive,
diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
index 68a1d57fe1ac..1892a51f4fa3 100644
--- a/src/java/org/apache/cassandra/db/filter/RowFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/RowFilter.java
@@ -23,8 +23,10 @@
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
 
 import com.google.common.base.Objects;
+import com.google.common.base.Predicate;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -95,7 +97,7 @@ public void addMapEquality(ColumnMetadata def, ByteBuffer key, Operator op, Byte
 
     public void addCustomIndexExpression(TableMetadata metadata, IndexMetadata targetIndex, ByteBuffer value)
     {
-        add(new CustomExpression(metadata, targetIndex, value));
+        add(CustomExpression.build(metadata, targetIndex, value));
     }
 
     private void add(Expression expression)
@@ -243,6 +245,16 @@ public RowFilter withoutExpressions()
         return withNewExpressions(Collections.emptyList());
     }
 
+    public RowFilter restrict(Predicate<Expression> filter)
+    {
+        return fromExpressions(expressions.stream().filter(filter).collect(Collectors.toList())); // keep only the expressions accepted by the predicate
+    }
+
+    private RowFilter fromExpressions(List<Expression> expressions)
+    {
+        return expressions.isEmpty() ? NONE : withNewExpressions(expressions); // an empty list maps to the shared NONE filter instead of a new instance
+    }
+
     protected abstract RowFilter withNewExpressions(List<Expression> expressions);
 
     public boolean isEmpty()
@@ -522,9 +534,9 @@ public Expression deserialize(DataInputPlus in, int version, TableMetadata metad
                 // custom expressions (3.0+ only) do not contain a column or operator, only a value
                 if (kind == Kind.CUSTOM)
                 {
-                    return new CustomExpression(metadata,
-                            IndexMetadata.serializer.deserialize(in, version, metadata),
-                            ByteBufferUtil.readWithShortLength(in));
+                    return CustomExpression.build(metadata,
+                                                  IndexMetadata.serializer.deserialize(in, version, metadata),
+                                                  ByteBufferUtil.readWithShortLength(in));
                 }
 
                 if (kind == Kind.USER)
@@ -603,6 +615,7 @@ public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey,
             switch (operator)
             {
                 case EQ:
+                case IN:
                 case LT:
                 case LTE:
                 case GTE:
@@ -695,11 +708,6 @@ public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey,
                         ByteBuffer foundValue = getValue(metadata, partitionKey, row);
                         return foundValue != null && mapType.getSerializer().getSerializedValue(foundValue, value, mapType.getKeysType()) != null;
                     }
-
-                case IN:
-                    // It wouldn't be terribly hard to support this (though doing so would imply supporting
-                    // IN for 2ndary index) but currently we don't.
-                    throw new AssertionError();
             }
             throw new AssertionError();
         }
@@ -833,7 +841,7 @@ protected Kind kind()
      * A custom index expression for use with 2i implementations which support custom syntax and which are not
      * necessarily linked to a single column in the base table.
      */
-    public static final class CustomExpression extends Expression
+    public static class CustomExpression extends Expression
     {
         private final IndexMetadata targetIndex;
         private final TableMetadata table;
@@ -846,6 +854,12 @@ public CustomExpression(TableMetadata table, IndexMetadata targetIndex, ByteBuff
             this.table = table;
         }
 
+        public static CustomExpression build(TableMetadata metadata, IndexMetadata targetIndex, ByteBuffer value)
+        {
+            // delegate the expression creation to the target custom index; NOTE(review): the getIndex(targetIndex) result is not null-checked - confirm the index is always registered
+            return Keyspace.openAndGetStore(metadata).indexManager.getIndex(targetIndex).customExpressionFor(metadata, value);
+        }
+
         private static ColumnMetadata makeDefinition(TableMetadata table, IndexMetadata index)
         {
             // Similarly to how we handle non-defined columns in thift, we create a fake column definition to
diff --git a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java
index 029729fe78de..d58e53f8f14c 100644
--- a/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictionsTest.java
@@ -54,7 +54,7 @@ public void testBoundsAsClusteringWithNoRestrictions()
     {
         TableMetadata tableMetadata = newTableMetadata(Sort.ASC);
 
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false).build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -76,8 +76,9 @@ public void testBoundsAsClusteringWithOneEqRestrictionsAndOneClusteringColumn()
         ByteBuffer clustering_0 = ByteBufferUtil.bytes(1);
         Restriction eq = newSingleEq(tableMetadata, 0, clustering_0);
 
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(eq)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -99,8 +100,9 @@ public void testBoundsAsClusteringWithOneEqRestrictionsAndTwoClusteringColumns()
         ByteBuffer clustering_0 = ByteBufferUtil.bytes(1);
         Restriction eq = newSingleEq(tableMetadata, 0, clustering_0);
 
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(eq)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -125,8 +127,9 @@ public void testBoundsAsClusteringWithOneInRestrictionsAndOneClusteringColumn()
 
         Restriction in = newSingleIN(tableMetadata, 0, value1, value2, value3);
 
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(in);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(in)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -153,8 +156,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newSingleSlice(tableMetadata, 0, Bound.START, false, value1);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -165,8 +169,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
         assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -177,8 +182,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
         assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(tableMetadata, 0, Bound.END, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -189,8 +195,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
         assertEndBound(get(bounds, 0), true, value1);
 
         slice = newSingleSlice(tableMetadata, 0, Bound.END, false, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -202,8 +209,10 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -215,8 +224,10 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneClusteringColumn()
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, true, value1);
         slice2 = newSingleSlice(tableMetadata, 0, Bound.END, true, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -239,8 +250,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newSingleSlice(tableMetadata, 0, Bound.START, false, value1);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -251,8 +263,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
         assertEndBound(get(bounds, 0), false, value1);
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -263,8 +276,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
         assertEndBound(get(bounds, 0), true, value1);
 
         slice = newSingleSlice(tableMetadata, 0, Bound.END, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -275,8 +289,9 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
         assertEmptyEnd(get(bounds, 0));
 
         slice = newSingleSlice(tableMetadata, 0, Bound.END, false, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -288,8 +303,10 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -301,8 +318,10 @@ public void testBoundsAsClusteringWithSliceRestrictionsAndOneDescendingClusterin
 
         slice = newSingleSlice(tableMetadata, 0, Bound.START, true, value1);
         slice2 = newSingleSlice(tableMetadata, 0, Bound.END, true, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -326,8 +345,10 @@ public void testBoundsAsClusteringWithEqAndInRestrictions()
         ByteBuffer value3 = ByteBufferUtil.bytes(3);
         Restriction eq = newSingleEq(tableMetadata, 0, value1);
         Restriction in = newSingleIN(tableMetadata, 1, value1, value2, value3);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(in);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(eq)
+                                                                                .addRestriction(in)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -357,8 +378,10 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
         Restriction eq = newSingleEq(tableMetadata, 0, value3);
 
         Restriction slice = newSingleSlice(tableMetadata, 1, Bound.START, false, value1);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(eq)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -369,8 +392,10 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
         assertEndBound(get(bounds, 0), true, value3);
 
         slice = newSingleSlice(tableMetadata, 1, Bound.START, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(eq)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -381,8 +406,10 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
         assertEndBound(get(bounds, 0), true, value3);
 
         slice = newSingleSlice(tableMetadata, 1, Bound.END, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(eq)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -393,8 +420,10 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
         assertEndBound(get(bounds, 0), true, value3, value1);
 
         slice = newSingleSlice(tableMetadata, 1, Bound.END, false, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(eq)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -406,8 +435,11 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
 
         slice = newSingleSlice(tableMetadata, 1, Bound.START, false, value1);
         Restriction slice2 = newSingleSlice(tableMetadata, 1, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(eq)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -419,8 +451,11 @@ public void testBoundsAsClusteringWithEqAndSliceRestrictions()
 
         slice = newSingleSlice(tableMetadata, 1, Bound.START, true, value1);
         slice2 = newSingleSlice(tableMetadata, 1, Bound.END, true, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq).mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(eq)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -442,8 +477,9 @@ public void testBoundsAsClusteringWithMultiEqRestrictions()
         ByteBuffer value1 = ByteBufferUtil.bytes(1);
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
         Restriction eq = newMultiEq(tableMetadata, 0, value1, value2);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(eq);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(eq)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -466,8 +502,9 @@ public void testBoundsAsClusteringWithMultiInRestrictions()
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
         ByteBuffer value3 = ByteBufferUtil.bytes(3);
         Restriction in = newMultiIN(tableMetadata, 0, asList(value1, value2), asList(value2, value3));
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(in);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(in)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -493,8 +530,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -505,8 +543,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
         assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -517,8 +556,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
         assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -529,8 +569,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
         assertEndBound(get(bounds, 0), true, value1);
 
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -542,8 +583,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -555,8 +598,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneClusteringCol
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -580,8 +625,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
         ByteBuffer value2 = ByteBufferUtil.bytes(2);
 
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -592,8 +638,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
         assertEndBound(get(bounds, 0), false, value1);
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -604,8 +651,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
         assertEndBound(get(bounds, 0), true, value1);
 
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -616,8 +664,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
         assertEmptyEnd(get(bounds, 0));
 
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -629,8 +678,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -642,8 +693,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingClu
 
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -667,8 +720,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -680,8 +734,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -693,8 +748,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -706,8 +762,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -720,8 +777,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -734,8 +793,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoClusteringCol
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -759,8 +820,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -772,8 +834,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -785,8 +848,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -798,9 +862,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
-
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
         assertStartBound(get(bounds, 0), false, value1, value2);
@@ -813,8 +877,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -827,8 +893,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoDescendingClu
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -853,8 +921,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -868,8 +937,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -883,8 +953,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -898,8 +969,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -914,8 +986,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -930,8 +1004,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
         // (clustering_0) > (1) AND (clustering_0, clustering1) < (2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -946,8 +1022,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneDescendingAnd
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -976,8 +1054,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
 
         // (clustering_0, clustering1) > (1, 2)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -991,8 +1070,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1006,8 +1086,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
 
         // (clustering_0, clustering1) <= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1021,8 +1102,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
 
         // (clustering_0, clustering1) < (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1037,8 +1119,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
         // (clustering_0, clustering1) > (1, 2) AND (clustering_0) < (2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1053,8 +1137,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithOneAscendingAndO
         // (clustering_0, clustering1) >= (1, 2) AND (clustering_0, clustering1) <= (2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -1085,8 +1171,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2, value3, value4);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1101,9 +1188,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
         // clustering_0 = 1 AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction eq = newSingleEq(tableMetadata, 0, value1);
         slice = newMultiSlice(tableMetadata, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
-        restrictions = restrictions.mergeWith(eq);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(eq)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1118,9 +1206,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
         // clustering_0 IN (1, 2) AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction in = newSingleIN(tableMetadata, 0, value1, value2);
         slice = newMultiSlice(tableMetadata, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
-        restrictions = restrictions.mergeWith(in);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(in)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
@@ -1138,8 +1227,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1151,8 +1241,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1166,8 +1257,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
 
         // (clustering_0, clustering1, clustering_2, clustering_3) <= (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1181,8 +1273,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
 
         // (clustering_0, clustering1, clustering_2, clustering_3) < (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1197,8 +1290,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4) AND (clustering_0, clustering_1) < (2, 3)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2, value3, value4);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2, value3);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1213,8 +1308,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithTwoAscendingAndT
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4) AND (clustering_0, clustering1, clustering_2, clustering_3) <= (4, 3, 2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2, value3, value4);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value4, value3, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -1245,8 +1342,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
 
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4)
         Restriction slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2, value3, value4);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(slice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
@@ -1266,9 +1364,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
         // clustering_0 = 1 AND (clustering_1, clustering_2, clustering_3) > (2, 3, 4)
         Restriction eq = newSingleEq(tableMetadata, 0, value1);
         slice = newMultiSlice(tableMetadata, 1, Bound.START, false, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
-        restrictions = restrictions.mergeWith(eq);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(eq)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(3, bounds.size());
@@ -1284,8 +1383,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
 
         // (clustering_0, clustering1) >= (1, 2)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1299,8 +1399,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
 
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
@@ -1318,8 +1419,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
 
         // (clustering_0, clustering1, clustering_2, clustering_3) <= (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, true, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
@@ -1337,8 +1439,9 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
 
         // (clustering_0, clustering1, clustering_2, clustering_3) < (1, 2, 3, 4)
         slice = newMultiSlice(tableMetadata, 0, Bound.END, false, value1, value2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(4, bounds.size());
@@ -1357,8 +1460,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
         // (clustering_0, clustering1, clustering_2, clustering_3) > (1, 2, 3, 4) AND (clustering_0, clustering_1) < (2, 3)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, false, value1, value2, value3, value4);
         Restriction slice2 = newMultiSlice(tableMetadata, 0, Bound.END, false, value2, value3);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(5, bounds.size());
@@ -1379,8 +1484,10 @@ public void testBoundsAsClusteringWithMultiSliceRestrictionsWithAscendingDescend
         // (clustering_0, clustering1, clustering_2, clustering_3) >= (1, 2, 3, 4) AND (clustering_0, clustering1, clustering_2, clustering_3) <= (4, 3, 2, 1)
         slice = newMultiSlice(tableMetadata, 0, Bound.START, true, value1, value2, value3, value4);
         slice2 = newMultiSlice(tableMetadata, 0, Bound.END, true, value4, value3, value2, value1);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(slice).mergeWith(slice2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(slice)
+                                                   .addRestriction(slice2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(7, bounds.size());
@@ -1419,8 +1526,10 @@ public void testBoundsAsClusteringWithSingleEqAndMultiEqRestrictions()
         // clustering_0 = 1 AND (clustering_1, clustering_2) = (2, 3)
         Restriction singleEq = newSingleEq(tableMetadata, 0, value1);
         Restriction multiEq = newMultiEq(tableMetadata, 1, value2, value3);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(singleEq)
+                                                                                .addRestriction(multiEq)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1434,8 +1543,11 @@ public void testBoundsAsClusteringWithSingleEqAndMultiEqRestrictions()
         singleEq = newSingleEq(tableMetadata, 0, value1);
         Restriction singleEq2 = newSingleEq(tableMetadata, 1, value2);
         multiEq = newMultiEq(tableMetadata, 2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(singleEq2).mergeWith(multiEq);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(singleEq2)
+                                                   .addRestriction(multiEq)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1448,8 +1560,10 @@ public void testBoundsAsClusteringWithSingleEqAndMultiEqRestrictions()
         // (clustering_0, clustering_1) = (1, 2) AND clustering_2 = 3
         singleEq = newSingleEq(tableMetadata, 2, value3);
         multiEq = newMultiEq(tableMetadata, 0, value1, value2);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(multiEq)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1463,8 +1577,11 @@ public void testBoundsAsClusteringWithSingleEqAndMultiEqRestrictions()
         singleEq = newSingleEq(tableMetadata, 0, value1);
         singleEq2 = newSingleEq(tableMetadata, 3, value4);
         multiEq = newMultiEq(tableMetadata, 1, value2, value3);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiEq).mergeWith(singleEq2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(multiEq)
+                                                   .addRestriction(singleEq2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1492,8 +1609,10 @@ public void testBoundsAsClusteringWithSingleEqAndMultiINRestrictions()
         // clustering_0 = 1 AND (clustering_1, clustering_2) IN ((2, 3), (4, 5))
         Restriction singleEq = newSingleEq(tableMetadata, 0, value1);
         Restriction multiIN = newMultiIN(tableMetadata, 1, asList(value2, value3), asList(value4, value5));
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiIN);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(singleEq)
+                                                                                .addRestriction(multiIN)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1508,8 +1627,10 @@ public void testBoundsAsClusteringWithSingleEqAndMultiINRestrictions()
         // clustering_0 = 1 AND (clustering_1, clustering_2) IN ((2, 3))
         singleEq = newSingleEq(tableMetadata, 0, value1);
         multiIN = newMultiIN(tableMetadata, 1, asList(value2, value3));
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiIN).mergeWith(singleEq);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(multiIN)
+                                                   .addRestriction(singleEq)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1523,8 +1644,11 @@ public void testBoundsAsClusteringWithSingleEqAndMultiINRestrictions()
         singleEq = newSingleEq(tableMetadata, 0, value1);
         Restriction singleEq2 = newSingleEq(tableMetadata, 1, value5);
         multiIN = newMultiIN(tableMetadata, 2, asList(value2, value3), asList(value4, value5));
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiIN).mergeWith(singleEq2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(multiIN)
+                                                   .addRestriction(singleEq2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1555,8 +1679,10 @@ public void testBoundsAsClusteringWithSingleEqAndSliceRestrictions()
         // clustering_0 = 1 AND (clustering_1, clustering_2) > (2, 3)
         Restriction singleEq = newSingleEq(tableMetadata, 0, value1);
         Restriction multiSlice = newMultiSlice(tableMetadata, 1, Bound.START, false, value2, value3);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(singleEq).mergeWith(multiSlice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(singleEq)
+                                                                                .addRestriction(multiSlice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1570,8 +1696,11 @@ public void testBoundsAsClusteringWithSingleEqAndSliceRestrictions()
         singleEq = newSingleEq(tableMetadata, 0, value1);
         multiSlice = newMultiSlice(tableMetadata, 1, Bound.START, false, value2, value3);
         Restriction multiSlice2 = newMultiSlice(tableMetadata, 1, Bound.END, false, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiSlice2).mergeWith(singleEq).mergeWith(multiSlice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(multiSlice2)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(multiSlice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1585,8 +1714,11 @@ public void testBoundsAsClusteringWithSingleEqAndSliceRestrictions()
         singleEq = newSingleEq(tableMetadata, 0, value1);
         multiSlice = newMultiSlice(tableMetadata, 1, Bound.START, true, value2, value3);
         multiSlice2 = newMultiSlice(tableMetadata, 1, Bound.END, true, value4, value5);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiSlice2).mergeWith(singleEq).mergeWith(multiSlice);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(multiSlice2)
+                                                   .addRestriction(singleEq)
+                                                   .addRestriction(multiSlice)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1613,8 +1745,10 @@ public void testBoundsAsClusteringWithMultiEqAndSingleSliceRestrictions()
         // (clustering_0, clustering_1) = (1, 2) AND clustering_2 > 3
         Restriction multiEq = newMultiEq(tableMetadata, 0, value1, value2);
         Restriction singleSlice = newSingleSlice(tableMetadata, 2, Bound.START, false, value3);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiEq).mergeWith(singleSlice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(multiEq)
+                                                                                .addRestriction(singleSlice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1639,8 +1773,10 @@ public void testBoundsAsClusteringWithSeveralMultiColumnRestrictions()
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) > (3, 4)
         Restriction multiEq = newMultiEq(tableMetadata, 0, value1, value2);
         Restriction multiSlice = newMultiSlice(tableMetadata, 2, Bound.START, false, value3, value4);
-        ClusteringColumnRestrictions restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiEq).mergeWith(multiSlice);
+        ClusteringColumnRestrictions restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                                                .addRestriction(multiEq)
+                                                                                .addRestriction(multiSlice)
+                                                                                .build();
 
         SortedSet<ClusteringBound<?>> bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
@@ -1653,8 +1789,10 @@ public void testBoundsAsClusteringWithSeveralMultiColumnRestrictions()
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) IN ((3, 4), (4, 5))
         multiEq = newMultiEq(tableMetadata, 0, value1, value2);
         Restriction multiIN = newMultiIN(tableMetadata, 2, asList(value3, value4), asList(value4, value5));
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiEq).mergeWith(multiIN);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(multiEq)
+                                                   .addRestriction(multiIN)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(2, bounds.size());
@@ -1669,8 +1807,10 @@ public void testBoundsAsClusteringWithSeveralMultiColumnRestrictions()
         // (clustering_0, clustering_1) = (1, 2) AND (clustering_2, clustering_3) = (3, 4)
         multiEq = newMultiEq(tableMetadata, 0, value1, value2);
         Restriction multiEq2 = newMultiEq(tableMetadata, 2, value3, value4);
-        restrictions = new ClusteringColumnRestrictions(tableMetadata);
-        restrictions = restrictions.mergeWith(multiEq).mergeWith(multiEq2);
+        restrictions = ClusteringColumnRestrictions.builder(tableMetadata, false)
+                                                   .addRestriction(multiEq)
+                                                   .addRestriction(multiEq2)
+                                                   .build();
 
         bounds = restrictions.boundsAsClustering(Bound.START, QueryOptions.DEFAULT);
         assertEquals(1, bounds.size());
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java
index 5cbfb4cff56f..8f8a9b437fd9 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/FrozenCollectionsTest.java
@@ -866,7 +866,7 @@ public void testSecondaryIndex() throws Throwable
                    row(0, list(1, 2, 3), set(1, 2, 3), map(1, "a")),
                    row(1, list(1, 2, 3), set(4, 5, 6), map(2, "b")));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "d"),
                              "SELECT * FROM %s WHERE d CONTAINS KEY ?", 1);
 
         assertRows(execute("SELECT * FROM %s WHERE b CONTAINS ? AND d CONTAINS KEY ? ALLOW FILTERING", 1, 1),
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
index 3d97df931068..3b5a6105aab6 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
@@ -901,16 +901,16 @@ public void prepareStatementsWithLIKEClauses() throws Throwable
 
         // LIKE is not supported on indexes of non-literal values
         // this is rejected before binding, so the value isn't available in the error message
-        assertInvalidMessage("LIKE restriction is only supported on properly indexed columns. v3 LIKE ? is not valid",
+        assertInvalidMessage("Index on column v3 does not support LIKE restrictions.",
                              "SELECT * FROM %s WHERE v3 LIKE ?",
                              "%abc");
-        assertInvalidMessage("LIKE restriction is only supported on properly indexed columns. v3 LIKE ? is not valid",
+        assertInvalidMessage("Index on column v3 does not support LIKE restrictions.",
                              "SELECT * FROM %s WHERE v3 LIKE ?",
                              "%abc%");
-        assertInvalidMessage("LIKE restriction is only supported on properly indexed columns. v3 LIKE ? is not valid",
+        assertInvalidMessage("Index on column v3 does not support LIKE restrictions.",
                              "SELECT * FROM %s WHERE v3 LIKE ?",
                              "%abc%");
-        assertInvalidMessage("LIKE restriction is only supported on properly indexed columns. v3 LIKE ? is not valid",
+        assertInvalidMessage("Index on column v3 does not support LIKE restrictions.",
                              "SELECT * FROM %s WHERE v3 LIKE ?",
                              "abc");
     }
@@ -1594,7 +1594,7 @@ public void testIndexOnFrozenCollectionOfUDT() throws Throwable
         execute("INSERT INTO %s (k, v) VALUES (?, ?)", 2, set(udt2));
         assertTrue(waitForIndex(keyspace(), tableName, indexName));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "v"),
                              "SELECT * FROM %s WHERE v CONTAINS ?", udt1);
 
         assertRows(execute("SELECT * FROM %s WHERE v = ?", set(udt1, udt2)), row(1, set(udt1, udt2)));
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/CompactStorageTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/CompactStorageTest.java
index 87ffa05cbcfa..c18dc3c3a7aa 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/CompactStorageTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/CompactStorageTest.java
@@ -2446,11 +2446,12 @@ public void testFilteringOnCompactTablesWithoutIndices() throws Throwable
             assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = 4 ALLOW FILTERING"),
                        row(1, 4, 4));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7)");
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
-                                 "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
+            assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING"),
+                       row(1, 3, 6),
+                       row(2, 3, 7));
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE c > 4");
@@ -2517,11 +2518,11 @@ public void testFilteringOnCompactTablesWithoutIndices() throws Throwable
             assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 2 AND c = 4 ALLOW FILTERING"),
                        row(1, 2, 4));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7)");
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
-                                 "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
+            assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING"),
+                       row(2, 1, 6));
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE c > 4");
@@ -3484,11 +3485,11 @@ public void testAllowFilteringOnPartitionKeyOnCompactTablesWithoutIndices() thro
             assertRows(execute("SELECT * FROM %s WHERE a = 1 AND b = 4 AND c = 4 ALLOW FILTERING"),
                        row(1, 4, 4, 5));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (d) is not yet supported",
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE a IN (1, 2) AND b = 3 AND d IN (6, 7)");
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (d) is not yet supported",
-                                 "SELECT * FROM %s WHERE a IN (1, 2) AND b = 3 AND d IN (6, 7) ALLOW FILTERING");
+            assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2) AND b = 3 AND d IN (6, 7) ALLOW FILTERING"),
+                       row(1, 3, 6, 7));
 
             assertRows(execute("SELECT * FROM %s WHERE a < 2 AND c > 4 AND c <= 6 ALLOW FILTERING"),
                        row(1, 3, 6, 7));
@@ -3569,11 +3570,11 @@ public void testAllowFilteringOnPartitionKeyOnCompactTablesWithoutIndices() thro
             assertRows(execute("SELECT * FROM %s WHERE a = 1 AND c >= 4 ALLOW FILTERING"),
                        row(1, 2, 4));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (b) is not yet supported",
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE a = 1 AND b IN (1, 2) AND c IN (6, 7)");
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
-                                 "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
+            assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING"),
+                       row(2, 1, 6));
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE c > 4");
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java
index 5062448fef96..bf0c8575ea87 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectMultiColumnRelationTest.java
@@ -827,7 +827,7 @@ public void testMultipleClusteringWithIndex() throws Throwable
         assertRows(execute("SELECT * FROM %s WHERE (b) IN ((?)) AND e = ? ALLOW FILTERING", 1, 2),
                    row(0, 1, 1, 1, 2));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"),
                              "SELECT * FROM %s WHERE (b) IN ((?), (?)) AND e = ?", 0, 1, 2);
         assertRows(execute("SELECT * FROM %s WHERE (b) IN ((?), (?)) AND e = ? ALLOW FILTERING", 0, 1, 2),
                    row(0, 0, 1, 1, 2),
@@ -838,18 +838,18 @@ public void testMultipleClusteringWithIndex() throws Throwable
         assertRows(execute("SELECT * FROM %s WHERE (b, c) IN ((?, ?)) AND e = ? ALLOW FILTERING", 0, 1, 2),
                    row(0, 0, 1, 1, 2));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"),
                              "SELECT * FROM %s WHERE (b, c) IN ((?, ?), (?, ?)) AND e = ?", 0, 1, 1, 1, 2);
         assertRows(execute("SELECT * FROM %s WHERE (b, c) IN ((?, ?), (?, ?)) AND e = ? ALLOW FILTERING", 0, 1, 1, 1, 2),
                    row(0, 0, 1, 1, 2),
                    row(0, 1, 1, 1, 2));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"),
                              "SELECT * FROM %s WHERE (b) >= (?) AND e = ?", 1, 2);
         assertRows(execute("SELECT * FROM %s WHERE (b) >= (?) AND e = ? ALLOW FILTERING", 1, 2),
                    row(0, 1, 1, 1, 2));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"),
                              "SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ?", 1, 1, 2);
         assertRows(execute("SELECT * FROM %s WHERE (b, c) >= (?, ?) AND e = ? ALLOW FILTERING", 1, 1, 2),
                    row(0, 1, 1, 1, 2));
@@ -943,12 +943,12 @@ public void testMultiplePartitionKeyAndMultiClusteringWithIndex() throws Throwab
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ? AND (c) IN ((?)) AND f = ?", 0, 0, 1, 5),
                    row(0, 0, 1, 1, 1, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND (c) IN ((?), (?)) AND f = ?", 0, 1, 3, 5);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND (c) IN ((?), (?)) AND f = ? ALLOW FILTERING", 0, 1, 3, 5),
                    row(0, 0, 1, 1, 1, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND (c) IN ((?), (?)) AND f = ?", 0, 1, 2, 5);
 
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ? AND (c) IN ((?), (?)) AND f = ?", 0, 0, 1, 2, 5),
@@ -966,7 +966,7 @@ public void testMultiplePartitionKeyAndMultiClusteringWithIndex() throws Throwab
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND (c, d) IN ((?, ?)) AND f = ? ALLOW FILTERING", 0, 1, 0, 3),
                    row(0, 0, 1, 0, 0, 3));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND (c) >= (?) AND f = ?", 0, 1, 5);
 
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ? AND (c) >= (?) AND f = ?", 0, 0, 1, 5),
@@ -980,7 +980,7 @@ public void testMultiplePartitionKeyAndMultiClusteringWithIndex() throws Throwab
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ? AND (c, d) >= (?, ?) AND f = ?", 0, 0, 1, 1, 5),
                    row(0, 0, 1, 1, 1, 5),
                    row(0, 0, 2, 0, 0, 5));
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND (c, d) >= (?, ?) AND f = ?", 0, 1, 1, 5);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND (c, d) >= (?, ?) AND f = ? ALLOW FILTERING", 0, 1, 1, 5),
                    row(0, 0, 1, 1, 1, 5),
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java
index 3795ce5ae338..3a005f250aae 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectSingleColumnRelationTest.java
@@ -351,8 +351,8 @@ public void testIndexOnClusteringColumns() throws Throwable
 
         assertRows(execute("SELECT v1 FROM %s WHERE time = 1"), row("B"), row("E"));
 
-        assertInvalidMessage("IN restrictions are not supported on indexed columns",
-                             "SELECT v1 FROM %s WHERE id2 = 0 and time IN (1, 2) ALLOW FILTERING");
+        assertRows(execute("SELECT v1 FROM %s WHERE id2 = 0 and time IN (1, 2) ALLOW FILTERING"),
+                   row("B"));
 
         assertRows(execute("SELECT v1 FROM %s WHERE author > 'ted' AND time = 1 ALLOW FILTERING"), row("E"));
         assertRows(execute("SELECT v1 FROM %s WHERE author > 'amy' AND author < 'zoe' AND time = 0 ALLOW FILTERING"),
@@ -484,18 +484,18 @@ public void testMultiplePartitionKeyWithIndex() throws Throwable
                 row(0, 0, 1, 1, 1, 5),
                 row(0, 0, 2, 0, 0, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND f = ?", 0, 0, 1, 5);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND f = ? ALLOW FILTERING", 0, 1, 3, 5),
                    row(0, 0, 1, 1, 1, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND f = ?", 0, 1, 2, 5);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND f = ? ALLOW FILTERING", 0, 1, 2, 5),
                    row(0, 0, 1, 1, 1, 5),
                    row(0, 0, 2, 0, 0, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND d IN (?) AND f = ?", 0, 1, 3, 0, 3);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND c IN (?, ?) AND d IN (?) AND f = ? ALLOW FILTERING", 0, 1, 3, 0, 3),
                    row(0, 0, 1, 0, 0, 3));
@@ -506,7 +506,7 @@ public void testMultiplePartitionKeyWithIndex() throws Throwable
                 row(0, 0, 1, 1, 1, 5),
                 row(0, 0, 2, 0, 0, 5));
 
-        assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"),
                              "SELECT * FROM %s WHERE a = ? AND c >= ? AND f = ?", 0, 1, 5);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ? AND c >= ? AND f = ?", 0, 0, 1, 5),
                    row(0, 0, 1, 1, 1, 5),
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java
index d0493cf8fc75..2d7fe8b495f9 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/SelectTest.java
@@ -568,7 +568,7 @@ public void testContainsKeyAndContainsWithIndexOnMapKey() throws Throwable
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 6, map("lmn", "foo2"));
 
         beforeAndAfterFlush(() -> {
-            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+            assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "categories"),
                                  "SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "foo");
 
             assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "test", "lmn"),
@@ -593,7 +593,7 @@ public void testContainsKeyAndContainsWithIndexOnMapValue() throws Throwable
         execute("INSERT INTO %s (account, id , categories) VALUES (?, ?, ?)", "test", 6, map("lmn2", "foo"));
 
         beforeAndAfterFlush(() -> {
-            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+            assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "categories"),
                                  "SELECT * FROM %s WHERE account = ? AND categories CONTAINS KEY ?", "test", "lmn");
 
             assertRows(execute("SELECT * FROM %s WHERE account = ? AND categories CONTAINS ?", "test", "foo"),
@@ -1276,11 +1276,12 @@ public void testFilteringWithoutIndices() throws Throwable
             assertRows(execute("SELECT * FROM %s WHERE s = 1 AND d = 12 ALLOW FILTERING"),
                        row(1, 3, 1, 6, 12));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7)");
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (c) is not yet supported",
-                                 "SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING");
+            assertRows(execute("SELECT * FROM %s WHERE a IN (1, 2) AND c IN (6, 7) ALLOW FILTERING"),
+                       row(1, 3, 1, 6, 12),
+                       row(2, 3, 2, 7, 12));
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                                  "SELECT * FROM %s WHERE c > 4");
@@ -1669,8 +1670,8 @@ public void testAllowFilteringOnPartitionKey() throws Throwable
 
         beforeAndAfterFlush(() -> {
 
-            assertInvalidMessage("IN restrictions are not supported when the query involves filtering",
-                    "SELECT * FROM %s WHERE b in (11,12) ALLOW FILTERING");
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                    "SELECT * FROM %s WHERE b in (11,12)");
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                     "SELECT * FROM %s WHERE a = 11");
@@ -1745,8 +1746,8 @@ public void testAllowFilteringOnPartitionKey() throws Throwable
 
         beforeAndAfterFlush(() -> {
 
-             assertInvalidMessage("IN restrictions are not supported when the query involves filtering",
-                    "SELECT * FROM %s WHERE b in (11,12) ALLOW FILTERING");
+             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
+                    "SELECT * FROM %s WHERE b in (11,12)");
 
             assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE,
                     "SELECT * FROM %s WHERE a = 11");
@@ -2838,8 +2839,9 @@ public void testFilteringOnDurationColumn() throws Throwable
                    row(0, Duration.from("1s")),
                    row(2, Duration.from("1s")));
 
-        assertInvalidMessage("IN predicates on non-primary-key columns (d) is not yet supported",
-                             "SELECT * FROM %s WHERE d IN (1s, 2s) ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE d IN (1s, 3s) ALLOW FILTERING"),
+                   row(0, Duration.from("1s")),
+                   row(2, Duration.from("1s")));
 
         assertInvalidMessage("Slice restrictions are not supported on duration columns",
                              "SELECT * FROM %s WHERE d > 1s ALLOW FILTERING");
@@ -2867,11 +2869,19 @@ public void testFilteringOnListContainingDurations() throws Throwable
             execute("INSERT INTO %s (k, l) VALUES (2, [1s, 3s])");
 
             if (frozen)
+            {
                 assertRows(execute("SELECT * FROM %s WHERE l = [1s, 2s] ALLOW FILTERING"),
                            row(0, list(Duration.from("1s"), Duration.from("2s"))));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (l) is not yet supported",
-                                 "SELECT * FROM %s WHERE l IN ([1s, 2s], [2s, 3s]) ALLOW FILTERING");
+                assertRows(execute("SELECT * FROM %s WHERE l IN ([1s, 2s], [2s, 3s]) ALLOW FILTERING"),
+                           row(1, list(Duration.from("2s"), Duration.from("3s"))),
+                           row(0, list(Duration.from("1s"), Duration.from("2s"))));
+            }
+            else
+            {
+                assertInvalidMessage("Collection column 'l' (list<duration>) cannot be restricted by a 'IN' relation",
+                                     "SELECT * FROM %s WHERE l IN ([1s, 2s], [2s, 3s]) ALLOW FILTERING");
+            }
 
             assertInvalidMessage("Slice restrictions are not supported on collections containing durations",
                                  "SELECT * FROM %s WHERE l > [2s, 3s] ALLOW FILTERING");
@@ -2904,11 +2914,19 @@ public void testFilteringOnMapContainingDurations() throws Throwable
             execute("INSERT INTO %s (k, m) VALUES (2, {1:1s, 3:3s})");
 
             if (frozen)
+            {
                 assertRows(execute("SELECT * FROM %s WHERE m = {1:1s, 2:2s} ALLOW FILTERING"),
                            row(0, map(1, Duration.from("1s"), 2, Duration.from("2s"))));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (m) is not yet supported",
-                    "SELECT * FROM %s WHERE m IN ({1:1s, 2:2s}, {1:1s, 3:3s}) ALLOW FILTERING");
+                assertRows(execute("SELECT * FROM %s WHERE m IN ({1:1s, 2:2s}, {1:1s, 3:3s}) ALLOW FILTERING"),
+                           row(0, map(1, Duration.from("1s"), 2, Duration.from("2s"))),
+                           row(2, map(1, Duration.from("1s"), 3, Duration.from("3s"))));
+            }
+            else
+            {
+                assertInvalidMessage("Collection column 'm' (map<int, duration>) cannot be restricted by a 'IN' relation",
+                                     "SELECT * FROM %s WHERE m IN ({1:1s, 2:2s}, {1:1s, 3:3s}) ALLOW FILTERING");
+            }
 
             assertInvalidMessage("Slice restrictions are not supported on collections containing durations",
                     "SELECT * FROM %s WHERE m > {1:1s, 3:3s} ALLOW FILTERING");
@@ -2939,8 +2957,9 @@ public void testFilteringOnTupleContainingDurations() throws Throwable
         assertRows(execute("SELECT * FROM %s WHERE t = (1, 2s) ALLOW FILTERING"),
                    row(0, tuple(1, Duration.from("2s"))));
 
-        assertInvalidMessage("IN predicates on non-primary-key columns (t) is not yet supported",
-                "SELECT * FROM %s WHERE t IN ((1, 2s), (1, 3s)) ALLOW FILTERING");
+        assertRows(execute("SELECT * FROM %s WHERE t IN ((1, 2s), (1, 3s)) ALLOW FILTERING"),
+                   row(0, tuple(1, Duration.from("2s"))),
+                   row(2, tuple(1, Duration.from("3s"))));
 
         assertInvalidMessage("Slice restrictions are not supported on tuples containing durations",
                 "SELECT * FROM %s WHERE t > (1, 2s) ALLOW FILTERING");
@@ -2970,11 +2989,22 @@ public void testFilteringOnUdtContainingDurations() throws Throwable
             execute("INSERT INTO %s (k, u) VALUES (2, {i: 1, d:3s})");
 
             if (frozen)
+            {
                 assertRows(execute("SELECT * FROM %s WHERE u = {i: 1, d:2s} ALLOW FILTERING"),
                            row(0, userType("i", 1, "d", Duration.from("2s"))));
 
-            assertInvalidMessage("IN predicates on non-primary-key columns (u) is not yet supported",
-                    "SELECT * FROM %s WHERE u IN ({i: 2, d:3s}, {i: 1, d:3s}) ALLOW FILTERING");
+                assertRows(execute("SELECT * FROM %s WHERE u IN ({i: 2, d:3s}, {i: 1, d:3s}) ALLOW FILTERING"),
+                           row(1, userType("i", 2, "d", Duration.from("3s"))),
+                           row(2, userType("i", 1, "d", Duration.from("3s"))));
+            }
+            else
+            {
+                assertInvalidMessage("Non-frozen UDT column 'u' (" + udt + ") cannot be restricted by any relation",
+                                     "SELECT * FROM %s WHERE u = {i: 1, d:2s} ALLOW FILTERING");
+
+                assertInvalidMessage("Non-frozen UDT column 'u' (" + udt + ") cannot be restricted by any relation",
+                                     "SELECT * FROM %s WHERE u IN ({i: 2, d:3s}, {i: 1, d:3s}) ALLOW FILTERING");
+            }
 
             assertInvalidMessage("Slice restrictions are not supported on UDTs containing durations",
                     "SELECT * FROM %s WHERE u > {i: 1, d:3s} ALLOW FILTERING");
diff --git a/update-history/STAR-801/65-0249d0559e STAR-121: Add index support to select statement restrictions b/update-history/STAR-801/65-0249d0559e STAR-121: Add index support to select statement restrictions
new file mode 100644
index 000000000000..f1d8141d2aea
--- /dev/null
+++ b/update-history/STAR-801/65-0249d0559e STAR-121: Add index support to select statement restrictions	
@@ -0,0 +1,23 @@
+--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
++++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+@@ -978,8 +978,6 @@
+     {
+         return hasRegularColumnsRestrictions;
+     }
+-<<<<<<<
+-=======
+ 
+     /**
+      * Checks if the query is a full partitions selection.
+@@ -1007,11 +1005,4 @@
+         // a full partition query, then we include that content.
+         return queriesFullPartitions();
+     }
+-    
+-    @Override
+-    public String toString()
+-    {
+-        return ToStringBuilder.reflectionToString(this, ToStringStyle.SHORT_PREFIX_STYLE);
+-    }
+->>>>>>>
+ }

From ee4575d13405ea40443b3ce3fa68ed76d7b00024 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 16:00:49 +0000
Subject: [PATCH 039/151] STAR-121: Build dependencies for SAI

(cherry picked from commit 32580250852ce46f2f71e0631241eb64e51753a8)
(cherry picked from commit 77850e3a2d37db82a6f0b52aa420bf3970266dbb)
---
 build.xml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/build.xml b/build.xml
index 223b438bc7eb..b118d7bb7a87 100644
--- a/build.xml
+++ b/build.xml
@@ -656,6 +656,9 @@
           </dependency>
           <dependency groupId="org.hamcrest" artifactId="hamcrest" version="2.2" scope="test"/>
           <dependency groupId="org.agrona" artifactId="agrona" version="0.9.26" />
+          <dependency groupId="org.apache.lucene" artifactId="lucene-core" version="7.5.0" />
+          <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" />
+          <dependency groupId="org.hamcrest" artifactId="hamcrest-all" version="1.3" />
         </dependencyManagement>
         <developer id="adelapena" name="Andres de la Peña"/>
         <developer id="alakshman" name="Avinash Lakshman"/>

From 36d022e3d553f26769a7f7a7aa6fc067026e8799 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 16:10:39 +0000
Subject: [PATCH 040/151] STAR-121: Config support for SAI

(cherry picked from commit 5db0b08fc6eee05fe914261600de51c1524685d0)
(cherry picked from commit 24192212f5b456ee16aed5a4c065da43b393c46c)
---
 .../org/apache/cassandra/config/Config.java   |  2 +
 .../cassandra/config/DatabaseDescriptor.java  | 20 ++++++
 .../config/StorageAttachedIndexOptions.java   | 64 +++++++++++++++++++
 ...4192212f5 STAR-121: Config support for SAI | 27 ++++++++
 4 files changed, 113 insertions(+)
 create mode 100644 src/java/org/apache/cassandra/config/StorageAttachedIndexOptions.java
 create mode 100644 update-history/STAR-801/63-24192212f5 STAR-121: Config support for SAI

diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 4a4002e738be..764656abcd7d 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -513,6 +513,8 @@ public class Config
      */
     public volatile double range_tombstone_list_growth_factor = 1.5;
 
+    public StorageAttachedIndexOptions sai_options = new StorageAttachedIndexOptions();
+
     /**
      * @deprecated migrate to {@link DatabaseDescriptor#isClientInitialized()}
      */
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 79057b91eb7a..d49195a191c6 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -3375,4 +3375,24 @@ public static void setConsecutiveMessageErrorsThreshold(int value)
     {
         conf.consecutive_message_errors_threshold = value;
     }
+
+    public static int getSAISegmentWriteBufferSpace() // current sai_options.segment_write_buffer_space_mb; presumably megabytes, going by the _mb suffix
+    {
+        return conf.sai_options.segment_write_buffer_space_mb;
+    }
+
+    public static void setSAISegmentWriteBufferSpace(int bufferSpace) // overwrites sai_options.segment_write_buffer_space_mb
+    {
+        conf.sai_options.segment_write_buffer_space_mb = bufferSpace; // NOTE(review): stored as-is; StorageAttachedIndexOptions.validate() is not called here
+    }
+
+    public static double getSAIZeroCopyUsedThreshold() // current sai_options.zerocopy_used_threshold
+    {
+        return conf.sai_options.zerocopy_used_threshold;
+    }
+
+    public static void setSAIZeroCopyUsedThreshold(double threshold) // overwrites sai_options.zerocopy_used_threshold
+    {
+        conf.sai_options.zerocopy_used_threshold = threshold; // NOTE(review): stored as-is; StorageAttachedIndexOptions.validate() is not called here
+    }
 }
diff --git a/src/java/org/apache/cassandra/config/StorageAttachedIndexOptions.java b/src/java/org/apache/cassandra/config/StorageAttachedIndexOptions.java
new file mode 100644
index 000000000000..84e8d8aee130
--- /dev/null
+++ b/src/java/org/apache/cassandra/config/StorageAttachedIndexOptions.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.config;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+/** Tunables for Storage Attached Indexes (SAI); held by {@link Config} in its {@code sai_options} field. */
+public class StorageAttachedIndexOptions
+{
+    public static final int DEFAULT_SEGMENT_BUFFER_MB = 1024;
+    public static final double DEFAULT_ZEROCOPY_USED_THRESHOLD = 0.3;
+
+    private static final int MAXIMUM_SEGMENT_BUFFER_MB = 32768;
+
+    public int segment_write_buffer_space_mb = DEFAULT_SEGMENT_BUFFER_MB;
+    public double zerocopy_used_threshold = DEFAULT_ZEROCOPY_USED_THRESHOLD;
+
+    /** Checks that both options are in range. @throws ConfigurationException if either value is not */
+    public void validate()
+    {
+        // Both bounds are inclusive: 0 and MAXIMUM_SEGMENT_BUFFER_MB are accepted values.
+        if ((segment_write_buffer_space_mb < 0) || (segment_write_buffer_space_mb > MAXIMUM_SEGMENT_BUFFER_MB))
+            throw new ConfigurationException("Invalid value for segment_write_buffer_space_mb. " +
+                                             "Value must be an integer between 0 and " + MAXIMUM_SEGMENT_BUFFER_MB + " (inclusive)");
+
+        // Negated range test rather than (x < 0.0 || x > 1.0): comparisons with NaN are always false, so this also rejects NaN.
+        if (!((zerocopy_used_threshold >= 0.0) && (zerocopy_used_threshold <= 1.0)))
+            throw new ConfigurationException("Invalid value for zerocopy_used_threshold. " +
+                                             "Value must be between 0.0 and 1.0");
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        StorageAttachedIndexOptions that = (StorageAttachedIndexOptions) o;
+        return segment_write_buffer_space_mb == that.segment_write_buffer_space_mb &&
+               Double.compare(zerocopy_used_threshold, that.zerocopy_used_threshold) == 0;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(segment_write_buffer_space_mb, zerocopy_used_threshold);
+    }
+}
diff --git a/update-history/STAR-801/63-24192212f5 STAR-121: Config support for SAI b/update-history/STAR-801/63-24192212f5 STAR-121: Config support for SAI
new file mode 100644
index 000000000000..0e409894c764
--- /dev/null
+++ b/update-history/STAR-801/63-24192212f5 STAR-121: Config support for SAI	
@@ -0,0 +1,27 @@
+--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
++++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+@@ -3366,7 +3366,6 @@
+         conf.keyspace_count_warn_threshold = value;
+     }
+ 
+-<<<<<<<
+     public static int getConsecutiveMessageErrorsThreshold()
+     {
+         return conf.consecutive_message_errors_threshold;
+@@ -3375,7 +3374,8 @@
+     public static void setConsecutiveMessageErrorsThreshold(int value)
+     {
+         conf.consecutive_message_errors_threshold = value;
+-=======
++    }
++
+     public static int getSAISegmentWriteBufferSpace()
+     {
+         return conf.sai_options.segment_write_buffer_space_mb;
+@@ -3394,6 +3394,5 @@
+     public static void setSAIZeroCopyUsedThreshold(double threshold)
+     {
+         conf.sai_options.zerocopy_used_threshold = threshold;
+->>>>>>>
+     }
+ }

From c80316cada90998a0f558b582425c2ac40899d40 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 16:15:54 +0000
Subject: [PATCH 041/151] STAR-121: On-disk component support for SAI

(cherry picked from commit 89668cf917f0ea6a663f88e3081865644daae150)
(cherry picked from commit 906845012771c0f10d6e051138883915f592ab8f)
---
 .../cassandra/io/sstable/Descriptor.java      | 10 ++++
 .../apache/cassandra/io/sstable/SSTable.java  | 59 ++++++++++++++++++-
 .../apache/cassandra/io/util/FileHandle.java  |  5 ++
 3 files changed, 71 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
index b781ebf50cd5..31becad06a15 100644
--- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java
+++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java
@@ -116,6 +116,11 @@ public Descriptor withFormatType(SSTableFormat.Type newType)
         return new Descriptor(newType.info.getLatestVersion(), directory, ksname, cfname, generation, newType);
     }
 
+    public File tmpFileFor(Component component)
+    {
+        return new File(tmpFilenameFor(component));
+    }
+
     public String tmpFilenameFor(Component component)
     {
         return filenameFor(component) + TMP_EXT;
@@ -131,6 +136,11 @@ public String tmpFilenameForStreaming(Component component)
         return String.format("%s.%s%s", filenameFor(component), UUIDGen.getTimeUUID(), TMP_EXT);
     }
 
+    public File fileFor(Component component)
+    {
+        return new File(filenameFor(component));
+    }
+
     public String filenameFor(Component component)
     {
         return baseFilename() + separator + component.name();
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index ba4b323becaf..f138c1b63669 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -23,6 +23,7 @@
 import java.util.*;
 import java.util.concurrent.CopyOnWriteArraySet;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.base.Predicates;
 import com.google.common.collect.Collections2;
@@ -33,6 +34,7 @@
 
 import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.lifecycle.Tracker;
 import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.IPartitioner;
@@ -71,7 +73,7 @@ public abstract class SSTable
     public static final int TOMBSTONE_HISTOGRAM_TTL_ROUND_SECONDS = Integer.valueOf(System.getProperty("cassandra.streaminghistogram.roundseconds", "60"));
 
     public final Descriptor descriptor;
-    protected final Set<Component> components;
+    public final Set<Component> components;
     public final boolean compression;
 
     public DecoratedKey first;
@@ -309,7 +311,8 @@ public String toString()
      * Reads the list of components from the TOC component.
      * @return set of components found in the TOC
      */
-    protected static Set<Component> readTOC(Descriptor descriptor) throws IOException
+    @VisibleForTesting
+    public static Set<Component> readTOC(Descriptor descriptor) throws IOException
     {
         return readTOC(descriptor, true);
     }
@@ -335,6 +338,17 @@ protected static Set<Component> readTOC(Descriptor descriptor, boolean skipMissi
         return components;
     }
 
+    /**
+     * Rewrites the TOC component by deleting the existing TOC file and appending the given components.
+     */
+    private static void rewriteTOC(Descriptor descriptor, Collection<Component> components)
+    {
+        File tocFile = descriptor.fileFor(Component.TOC);
+        if (!tocFile.delete())
+            logger.error("Failed to delete TOC component for " + descriptor);
+        appendTOC(descriptor, components);
+    }
+
     /**
      * Appends new component names to the TOC component.
      */
@@ -360,9 +374,48 @@ protected static void appendTOC(Descriptor descriptor, Collection<Component> com
      */
     public synchronized void addComponents(Collection<Component> newComponents)
     {
-        Collection<Component> componentsToAdd = Collections2.filter(newComponents, Predicates.not(Predicates.in(components)));
+        registerComponents(newComponents, null);
+    }
+
+    /**
+     * Registers new custom components with the sstable and updates size tracking.
+     * @param newComponents collection of components to be added
+     * @param tracker used to update on-disk size metrics; may be {@code null} to skip size tracking
+     */
+    public synchronized void registerComponents(Collection<Component> newComponents, Tracker tracker)
+    {
+        Collection<Component> componentsToAdd = new HashSet<>(Collections2.filter(newComponents, x -> !components.contains(x)));
         appendTOC(descriptor, componentsToAdd);
         components.addAll(componentsToAdd);
+
+        if (tracker == null)
+            return;
+
+        for (Component component : componentsToAdd)
+        {
+            File file = descriptor.fileFor(component);
+            if (file.exists())
+                tracker.updateSizeTracking(file.length());
+        }
+    }
+
+    /**
+     * Unregisters custom components from the sstable, rewrites the TOC and updates size tracking.
+     * @param removeComponents collection of components to be removed
+     * @param tracker used to update on-disk size metrics; may be {@code null} to skip size tracking
+     */
+    public synchronized void unregisterComponents(Collection<Component> removeComponents, Tracker tracker)
+    {
+        Collection<Component> componentsToRemove = new HashSet<>(Collections2.filter(removeComponents, components::contains));
+        components.removeAll(componentsToRemove);
+        rewriteTOC(descriptor, components);
+
+        for (Component component : componentsToRemove)
+        {
+            File file = descriptor.fileFor(component);
+            if (tracker != null && file.exists())
+                tracker.updateSizeTracking(-file.length());
+        }
     }
 
     public AbstractBounds<Token> getBounds()
diff --git a/src/java/org/apache/cassandra/io/util/FileHandle.java b/src/java/org/apache/cassandra/io/util/FileHandle.java
index 6d3ae7c4697f..50da403c236e 100644
--- a/src/java/org/apache/cassandra/io/util/FileHandle.java
+++ b/src/java/org/apache/cassandra/io/util/FileHandle.java
@@ -168,6 +168,11 @@ public void dropPageCache(long before)
         NativeLibrary.trySkipCache(channel.getFileDescriptor(), 0, position, path());
     }
 
+    public Rebufferer instantiateRebufferer()
+    {
+        return instantiateRebufferer(null);
+    }
+
     private Rebufferer instantiateRebufferer(RateLimiter limiter)
     {
         Rebufferer rebufferer = rebuffererFactory.instantiateRebufferer();

From 868c0a730f93a1b1f59c7c0b08345183fc4add3d Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 16:44:10 +0000
Subject: [PATCH 042/151] STAR-121: Key iteration and utility support for SAI

(cherry picked from commit 36c018bca0a27c53c9c91a40bf0621e56fc16152)
(cherry picked from commit daf5f712e38ca71357112d496ecebb370b47d90e)
---
 .../UnfilteredPartitionIterators.java         |  4 +-
 .../org/apache/cassandra/db/rows/Row.java     |  4 +-
 .../org/apache/cassandra/db/rows/Rows.java    |  2 +-
 .../db/rows/UnfilteredRowIterators.java       |  2 +-
 .../cassandra/dht/Murmur3Partitioner.java     |  6 +++
 src/java/org/apache/cassandra/dht/Token.java  | 14 ++++++
 .../apache/cassandra/io/util/FileUtils.java   |  5 +++
 .../cassandra/utils/ByteBufferUtil.java       | 23 ++++++++++
 .../org/apache/cassandra/utils/HeapUtils.java |  2 +-
 .../apache/cassandra/utils/MergeIterator.java | 35 ++++++++++++++-
 .../apache/cassandra/utils/Throwables.java    | 44 +++++++++++++++++++
 .../utils/memory/NativeAllocator.java         |  2 +-
 .../utils/MergeIteratorComparisonTest.java    |  4 +-
 .../cassandra/fqltool/commands/Replay.java    |  2 +-
 14 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java
index a051ee1a9d26..e72463ffc608 100644
--- a/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java
+++ b/src/java/org/apache/cassandra/db/partitions/UnfilteredPartitionIterators.java
@@ -149,7 +149,7 @@ protected UnfilteredRowIterator getReduced()
                 return UnfilteredRowIterators.merge(toMerge, rowListener);
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 toMerge.clear();
                 for (int i = 0; i < iterators.size(); i++)
@@ -215,7 +215,7 @@ protected UnfilteredRowIterator initializeIterator()
                 };
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 toMerge.clear();
             }
diff --git a/src/java/org/apache/cassandra/db/rows/Row.java b/src/java/org/apache/cassandra/db/rows/Row.java
index 5c28cd1b42f3..4b4d88db2663 100644
--- a/src/java/org/apache/cassandra/db/rows/Row.java
+++ b/src/java/org/apache/cassandra/db/rows/Row.java
@@ -839,7 +839,7 @@ protected ColumnData getReduced()
                 }
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 column = null;
                 versions.clear();
@@ -868,7 +868,7 @@ protected Cell<?> getReduced()
                 return merged;
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 merged = null;
             }
diff --git a/src/java/org/apache/cassandra/db/rows/Rows.java b/src/java/org/apache/cassandra/db/rows/Rows.java
index 82abb03d7e99..9ce45e68150f 100644
--- a/src/java/org/apache/cassandra/db/rows/Rows.java
+++ b/src/java/org/apache/cassandra/db/rows/Rows.java
@@ -242,7 +242,7 @@ else if (cmp < 0)
                 return null;
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 mergedData = null;
                 Arrays.fill(inputDatas, null);
diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java
index 2eb5d8fde7bd..a38ef7d9f139 100644
--- a/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java
+++ b/src/java/org/apache/cassandra/db/rows/UnfilteredRowIterators.java
@@ -594,7 +594,7 @@ protected Unfiltered getReduced()
                 }
             }
 
-            protected void onKeyChange()
+            public void onKeyChange()
             {
                 if (nextKind == Unfiltered.Kind.ROW)
                     rowMerger.clear();
diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
index 7e41705853b7..dc5ee1308332 100644
--- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
+++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java
@@ -203,6 +203,12 @@ public Object getTokenValue()
             return token;
         }
 
+        @Override
+        public long getLongValue()
+        {
+            return token;
+        }
+
         @Override
         public double size(Token next)
         {
diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java
index ec28dabfc357..5af8d7609a41 100644
--- a/src/java/org/apache/cassandra/dht/Token.java
+++ b/src/java/org/apache/cassandra/dht/Token.java
@@ -131,6 +131,20 @@ public long serializedSize(Token object, int version)
     abstract public long getHeapSize();
     abstract public Object getTokenValue();
 
+    /**
+     * This method exists so that callers can access the primitive {@code long} value for this {@link Token}, if
+     * one exists. It is especially useful when the auto-boxing induced by a call to {@link #getTokenValue()} would
+     * be unacceptable for reasons of performance.
+     *
+     * @return the primitive {@code long} value of this token, if one exists
+     *
+     * @throws UnsupportedOperationException if this {@link Token} is not backed by a primitive {@code long} value
+     */
+    public long getLongValue()
+    {
+        throw new UnsupportedOperationException();
+    }
+
     /**
      * Produce a weakly prefix-free byte-comparable representation of the token, i.e. such a sequence of bytes that any
      * pair x, y of valid tokens of this type and any bytes b1, b2 between 0x10 and 0xEF,
diff --git a/src/java/org/apache/cassandra/io/util/FileUtils.java b/src/java/org/apache/cassandra/io/util/FileUtils.java
index 7798bd785d4d..0711b911a050 100644
--- a/src/java/org/apache/cassandra/io/util/FileUtils.java
+++ b/src/java/org/apache/cassandra/io/util/FileUtils.java
@@ -434,6 +434,11 @@ public static void close(Iterable<? extends Closeable> cs) throws IOException
         maybeFail(e, IOException.class);
     }
 
+    public static void closeQuietly(Closeable... cs)
+    {
+        closeQuietly(Arrays.asList(cs));
+    }
+
     public static void closeQuietly(Iterable<? extends AutoCloseable> cs)
     {
         for (AutoCloseable c : cs)
diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index 26d9437f9f5a..499c2297d7f1 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 /**
  * Utility methods to make ByteBuffers less painful
@@ -878,4 +879,26 @@ private static boolean startsWith(ByteBuffer src, ByteBuffer prefix, int offset)
 
         return true;
     }
+
+    public static int toBytes(ByteSource byteSource, byte[] bytes)
+    {
+        int n = 0;
+
+        while (true)
+        {
+            int b = byteSource.next();
+
+            if (b == ByteSource.END_OF_STREAM) break;
+
+            if (n >= bytes.length)
+            {
+                throw new RuntimeException(String.format("Number of bytes read, %d, exceeds the buffer size of %d.", n + 1, bytes.length));
+            }
+
+            bytes[n] = (byte)b;
+            n++;
+        }
+
+        return n;
+    }
 }
diff --git a/src/java/org/apache/cassandra/utils/HeapUtils.java b/src/java/org/apache/cassandra/utils/HeapUtils.java
index 4dd0d46b43d2..c82aad1b6c13 100644
--- a/src/java/org/apache/cassandra/utils/HeapUtils.java
+++ b/src/java/org/apache/cassandra/utils/HeapUtils.java
@@ -115,7 +115,7 @@ private static void logProcessOutput(Process p) throws IOException
      * Retrieves the process ID or <code>null</code> if the process ID cannot be retrieved.
      * @return the process ID or <code>null</code> if the process ID cannot be retrieved.
      */
-    private static Long getProcessId()
+    public static Long getProcessId()
     {
         long pid = NativeLibrary.getProcessID();
         if (pid >= 0)
diff --git a/src/java/org/apache/cassandra/utils/MergeIterator.java b/src/java/org/apache/cassandra/utils/MergeIterator.java
index 6713dd0a4313..38d9c024dd17 100644
--- a/src/java/org/apache/cassandra/utils/MergeIterator.java
+++ b/src/java/org/apache/cassandra/utils/MergeIterator.java
@@ -442,7 +442,7 @@ public boolean trivialReduceIsTrivial()
          * Called at the beginning of each new key, before any reduce is called.
          * To be overridden by implementing classes.
          */
-        protected void onKeyChange() {}
+        public void onKeyChange() {}
 
         /**
          * May be overridden by implementations that require cleaning up after use
@@ -488,4 +488,37 @@ protected Out computeNext()
             return (Out) source.next();
         }
     }
+
+    public static <In> Reducer<In, In> getIdentity()
+    {
+        return new IdentityReducer<>();
+    }
+
+    private static class IdentityReducer<In> extends Reducer<In, In>
+    {
+        private In reduced;
+
+        @Override
+        public void reduce(int idx, In current)
+        {
+            this.reduced = current;
+        }
+
+        @Override
+        public In getReduced()
+        {
+            return reduced;
+        }
+
+        @Override
+        public void onKeyChange() {
+            this.reduced = null;
+        }
+
+        @Override
+        public boolean trivialReduceIsTrivial()
+        {
+            return true;
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/utils/Throwables.java b/src/java/org/apache/cassandra/utils/Throwables.java
index 86c0156f5a65..a9ee2e46c55a 100644
--- a/src/java/org/apache/cassandra/utils/Throwables.java
+++ b/src/java/org/apache/cassandra/utils/Throwables.java
@@ -41,6 +41,25 @@ public interface DiscreteAction<E extends Exception>
         void perform() throws E;
     }
 
+    /**
+     * Check if the provided throwable is of the provided class, or if any of the throwables in its cause chain is
+     * of the provided class.
+     *
+     * @param t the {@link Throwable} to check.
+     * @param causeClass the class to check if the exception is an instance of, or is caused by.
+     * @return {@code true} if {@code t} is of class {@code causeClass} or any of its causes is.
+     */
+    public static <T extends Throwable> boolean isCausedBy(Throwable t, Class<T> causeClass)
+    {
+        while (t != null)
+        {
+            if (causeClass.isInstance(t))
+                return true;
+            t = t.getCause();
+        }
+        return false;
+    }
+
     public static boolean isCausedBy(Throwable t, Predicate<Throwable> cause)
     {
         return cause.test(t) || (t.getCause() != null && cause.test(t.getCause()));
@@ -178,8 +197,33 @@ public static Throwable perform(Throwable accumulate, String filePath, FileOpTyp
         }));
     }
 
+    public static Throwable close(Throwable accumulate, AutoCloseable ... closeables)
+    {
+        if (closeables == null)
+            return accumulate;
+
+        for (AutoCloseable closeable : closeables)
+        {
+            if (closeable != null)
+            {
+                try
+                {
+                    closeable.close();
+                }
+                catch (Throwable t)
+                {
+                    accumulate = merge(accumulate, t);
+                }
+            }
+        }
+        return accumulate;
+    }
+
     public static Throwable close(Throwable accumulate, Iterable<? extends AutoCloseable> closeables)
     {
+        if (closeables == null)
+            return accumulate;
+        
         for (AutoCloseable closeable : closeables)
         {
             try
diff --git a/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
index 9aecf85e4525..c77cc50d596b 100644
--- a/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/NativeAllocator.java
@@ -76,7 +76,7 @@ private CloningBTreeRowBuilder(OpOrder.Group writeOp, NativeAllocator allocator)
         @Override
         public void newRow(Clustering<?> clustering)
         {
-            if (clustering != Clustering.STATIC_CLUSTERING)
+            if (clustering != Clustering.EMPTY && clustering != Clustering.STATIC_CLUSTERING)
                 clustering = new NativeClustering(allocator, writeOp, clustering);
             super.newRow(clustering);
         }
diff --git a/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java b/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java
index 6d9d2f6cc815..4f0efa6cec24 100644
--- a/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java
+++ b/test/unit/org/apache/cassandra/utils/MergeIteratorComparisonTest.java
@@ -552,7 +552,7 @@ public void reduce(int idx, T next)
         }
 
         @Override
-        protected void onKeyChange()
+        public void onKeyChange()
         {
             assert read;
             current = null;
@@ -603,7 +603,7 @@ public void reduce(int idx, KeyedSet<K, V> next)
         }
 
         @Override
-        protected void onKeyChange()
+        public void onKeyChange()
         {
             assert read;
             current = null;
diff --git a/tools/fqltool/src/org/apache/cassandra/fqltool/commands/Replay.java b/tools/fqltool/src/org/apache/cassandra/fqltool/commands/Replay.java
index 492e6ac1bf2a..dffe5727c0c3 100644
--- a/tools/fqltool/src/org/apache/cassandra/fqltool/commands/Replay.java
+++ b/tools/fqltool/src/org/apache/cassandra/fqltool/commands/Replay.java
@@ -153,7 +153,7 @@ protected List<FQLQuery> getReduced()
         {
             return queries;
         }
-        protected void onKeyChange()
+        public void onKeyChange()
         {
             queries.clear();
         }

From c1cad34f418e4dec97bc02feb373980a1d6cd485 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 17:08:50 +0000
Subject: [PATCH 043/151] STAR-121: Test support for SAI

(cherry picked from commit 6501503cf75a9a1d2f0ace5eb5d56bb9f28bb361)
(cherry picked from commit f4d12eae47d3585897798df1b437bb4e4177391b)
---
 build.xml                                     |  13 +-
 .../org/apache/cassandra/cql3/CQLTester.java  | 174 +++++-
 .../cassandra/cql3/GcCompactionTest.java      |   4 +-
 .../cassandra/cql3/KeyCacheCqlTest.java       |   4 +-
 .../miscellaneous/SSTablesIteratedTest.java   |   4 +-
 .../org/apache/cassandra/db/KeyspaceTest.java |   4 +-
 .../cassandra/dht/KeyCollisionTest.java       |  41 +-
 .../cassandra/dht/LengthPartitioner.java      | 129 +++-
 .../index/ExpressionFilteringIndex.java       | 117 ++++
 .../cassandra/inject/ActionBuilder.java       | 294 +++++++++
 .../cassandra/inject/CyclicBarrier.java       |  71 +++
 .../apache/cassandra/inject/Expression.java   | 115 ++++
 .../apache/cassandra/inject/Injection.java    |  87 +++
 .../cassandra/inject/InjectionBuilder.java    |  42 ++
 .../apache/cassandra/inject/Injections.java   | 590 ++++++++++++++++++
 .../cassandra/inject/InvokePointBuilder.java  | 133 ++++
 .../org/apache/cassandra/inject/Rule.java     |  59 ++
 ...-f4d12eae47 STAR-121: Test support for SAI |  51 ++
 18 files changed, 1846 insertions(+), 86 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
 create mode 100644 test/unit/org/apache/cassandra/inject/ActionBuilder.java
 create mode 100644 test/unit/org/apache/cassandra/inject/CyclicBarrier.java
 create mode 100644 test/unit/org/apache/cassandra/inject/Expression.java
 create mode 100644 test/unit/org/apache/cassandra/inject/Injection.java
 create mode 100644 test/unit/org/apache/cassandra/inject/InjectionBuilder.java
 create mode 100644 test/unit/org/apache/cassandra/inject/Injections.java
 create mode 100644 test/unit/org/apache/cassandra/inject/InvokePointBuilder.java
 create mode 100644 test/unit/org/apache/cassandra/inject/Rule.java
 create mode 100644 update-history/STAR-801/60-f4d12eae47 STAR-121: Test support for SAI

diff --git a/build.xml b/build.xml
index b118d7bb7a87..479c3fb99fa1 100644
--- a/build.xml
+++ b/build.xml
@@ -657,8 +657,9 @@
           <dependency groupId="org.hamcrest" artifactId="hamcrest" version="2.2" scope="test"/>
           <dependency groupId="org.agrona" artifactId="agrona" version="0.9.26" />
           <dependency groupId="org.apache.lucene" artifactId="lucene-core" version="7.5.0" />
-          <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" />
-          <dependency groupId="org.hamcrest" artifactId="hamcrest-all" version="1.3" />
+          <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2">
+              <exclusion groupId="junit" artifactId="junit"/>
+          </dependency>
         </dependencyManagement>
         <developer id="adelapena" name="Andres de la Peña"/>
         <developer id="alakshman" name="Avinash Lakshman"/>
@@ -733,6 +734,9 @@
         <!-- coverage debs -->
         <dependency groupId="org.jacoco" artifactId="org.jacoco.agent"/>
         <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
+
+        <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
+        <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
       </artifact:pom>
 
       <!-- now the pom's for artifacts being deployed to Maven Central -->
@@ -776,7 +780,7 @@
         <dependency groupId="ch.qos.logback" artifactId="logback-core"/>
         <dependency groupId="ch.qos.logback" artifactId="logback-classic"/>
 
-        <!-- don't need hadoop classes to run, but if you use the hadoop stuff -->
+          <!-- don't need hadoop classes to run, but if you use the hadoop stuff -->
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" optional="true"/>
         <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" optional="true"/>
 
@@ -817,6 +821,9 @@
         <dependency groupId="com.google.j2objc" artifactId="j2objc-annotations"/>
         <dependency groupId="org.hdrhistogram" artifactId="HdrHistogram"/>
         <dependency groupId="org.agrona" artifactId="agrona"/>
+        <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
+        <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
+        <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
 
         <!-- sasi deps -->
         <dependency groupId="de.jflex" artifactId="jflex" />
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 1c9150e20981..00e5218a08a8 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -47,9 +47,13 @@
 import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableSet;
 import org.junit.*;
+import org.junit.rules.TestWatcher;
+import org.junit.runner.Description;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.carrotsearch.randomizedtesting.generators.RandomInts;
+import com.carrotsearch.randomizedtesting.generators.RandomStrings;
 import com.datastax.driver.core.*;
 import com.datastax.driver.core.DataType;
 import com.datastax.driver.core.ResultSet;
@@ -57,6 +61,7 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
 import org.apache.cassandra.db.virtual.VirtualSchemaKeyspace;
 import org.apache.cassandra.index.SecondaryIndexManager;
@@ -88,6 +93,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JMXServerUtils;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 
 import static com.datastax.driver.core.SocketOptions.DEFAULT_CONNECT_TIMEOUT_MILLIS;
 import static com.datastax.driver.core.SocketOptions.DEFAULT_READ_TIMEOUT_MILLIS;
@@ -117,6 +123,8 @@ public abstract class CQLTester
     protected static int jmxPort;
     protected static MBeanServerConnection jmxConnection;
 
+    private static Randomization random;
+
     protected static final int nativePort;
     protected static final InetAddress nativeAddr;
     protected static final Set<InetAddressAndPort> remoteAddrs = new HashSet<>();
@@ -254,6 +262,16 @@ public static JMXServiceURL getJMXServiceURL() throws MalformedURLException
         return new JMXServiceURL(String.format("service:jmx:rmi:///jndi/rmi://%s:%d/jmxrmi", jmxHost, jmxPort));
     }
 
+    public static Randomization getRandom()
+    {
+        if (random == null)
+            random = new Randomization();
+        return random;
+    }
+
+    @Rule
+    public FailureWatcher failureRule = new FailureWatcher();
+
     @BeforeClass
     public static void setUpClass()
     {
@@ -513,10 +531,12 @@ public ColumnFamilyStore getCurrentColumnFamilyStore()
 
     public ColumnFamilyStore getCurrentColumnFamilyStore(String keyspace)
     {
-        String currentTable = currentTable();
-        return currentTable == null
-             ? null
-             : Keyspace.open(keyspace).getColumnFamilyStore(currentTable);
+        return getColumnFamilyStore(keyspace, currentTable());
+    }
+
+    public ColumnFamilyStore getColumnFamilyStore(String keyspace, String table)
+    {
+        return Keyspace.open(keyspace).getColumnFamilyStore(table);
     }
 
     public void flush(boolean forceFlush)
@@ -532,23 +552,38 @@ public void flush()
 
     public void flush(String keyspace)
     {
-        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        flush(keyspace, currentTable());
+    }
+
+    public void flush(String keyspace, String table)
+    {
+        ColumnFamilyStore store = getColumnFamilyStore(keyspace, table);
         if (store != null)
             store.forceBlockingFlush();
     }
 
     public void disableCompaction(String keyspace)
     {
-        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        disableCompaction(keyspace, currentTable());
+    }
+
+    public void disableCompaction(String keyspace, String table)
+    {
+        ColumnFamilyStore store = getColumnFamilyStore(keyspace, table);
         if (store != null)
             store.disableAutoCompaction();
     }
 
     public void compact()
     {
-         ColumnFamilyStore store = getCurrentColumnFamilyStore();
-         if (store != null)
-             store.forceMajorCompaction();
+        compact(KEYSPACE, currentTable());
+    }
+
+    public void compact(String keyspace, String table)
+    {
+        ColumnFamilyStore store = getColumnFamilyStore(keyspace, table);
+        if (store != null)
+            store.forceMajorCompaction();
     }
 
     public void disableCompaction()
@@ -737,7 +772,7 @@ protected String createKeyspaceName()
         return currentKeyspace;
     }
 
-    protected String createTable(String query)
+    public String createTable(String query)
     {
         return createTable(KEYSPACE, query);
     }
@@ -909,6 +944,23 @@ protected void dropIndex(String query) throws Throwable
         schemaChange(fullQuery);
     }
 
+    /**
+     *  Because the tracing executor is single threaded, submitting an empty task and waiting for it should
+     *  ensure that all tracing event mutations have been applied. Failures are inspected and logged.
+     */
+    protected void waitForTracingEvents()
+    {
+        try
+        {
+            Stage.TRACING.executor().submit(() -> {}).get();
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.error("Failed to wait for tracing events", t);
+        }
+    }
+
     protected static void assertSchemaChange(String query,
                                              Event.SchemaChange.Change expectedChange,
                                              Event.SchemaChange.Target expectedTarget,
@@ -977,7 +1029,7 @@ protected com.datastax.driver.core.ResultSet executeNetWithPaging(String query,
         return sessionNet().execute(new SimpleStatement(formatQuery(query)).setFetchSize(pageSize));
     }
 
-    protected Session sessionNet()
+    public Session sessionNet()
     {
         return sessionNet(getDefaultVersion());
     }
@@ -1011,12 +1063,12 @@ protected ResultMessage.Prepared prepare(String query) throws Throwable
         return QueryProcessor.prepare(formatQuery(query), ClientState.forInternalCalls());
     }
 
-    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    public UntypedResultSet execute(String query, Object... values) throws Throwable
     {
         return executeFormattedQuery(formatQuery(query), values);
     }
 
-    protected UntypedResultSet executeFormattedQuery(String query, Object... values) throws Throwable
+    public UntypedResultSet executeFormattedQuery(String query, Object... values) throws Throwable
     {
         UntypedResultSet rs;
         if (usePrepared)
@@ -1953,4 +2005,100 @@ public String toString()
             return "UserTypeValue" + toCQLString();
         }
     }
+
+    /**
+     * Seeded source of random test values; pin the seed with -Dcassandra.test.random.seed to replay a run.
+     */
+    public static class Randomization
+    {
+        private long seed;
+        private Random random;
+
+        Randomization()
+        {
+            seed = Long.getLong("cassandra.test.random.seed", System.nanoTime());
+            random = new Random(seed);
+        }
+
+        public void printSeedOnFailure()
+        {
+            System.err.println("Randomized test failed. To rerun test use -Dcassandra.test.random.seed=" + seed);
+        }
+
+        public int nextInt()
+        {
+            return random.nextInt();
+        }
+
+        public int nextIntBetween(int minValue, int maxValue)
+        {
+            return RandomInts.randomIntBetween(random, minValue, maxValue);
+        }
+
+        public long nextLong()
+        {
+            return random.nextLong();
+        }
+
+        public short nextShort()
+        {
+            return (short)random.nextInt(Short.MAX_VALUE + 1); // never negative: range is [0, Short.MAX_VALUE]
+        }
+
+        public byte nextByte()
+        {
+            return (byte)random.nextInt(Byte.MAX_VALUE + 1); // never negative: range is [0, Byte.MAX_VALUE]
+        }
+
+        public BigInteger nextBigInteger(int minNumBits, int maxNumBits)
+        {
+            return new BigInteger(RandomInts.randomIntBetween(random, minNumBits, maxNumBits), random);
+        }
+
+        public BigDecimal nextBigDecimal(int minUnscaledValue, int maxUnscaledValue, int minScale, int maxScale)
+        {
+            return BigDecimal.valueOf(RandomInts.randomIntBetween(random, minUnscaledValue, maxUnscaledValue),
+                                      RandomInts.randomIntBetween(random, minScale, maxScale));
+        }
+
+        public float nextFloat()
+        {
+            return random.nextFloat();
+        }
+
+        public double nextDouble()
+        {
+            return random.nextDouble();
+        }
+
+        public String nextAsciiString(int minLength, int maxLength)
+        {
+            return RandomStrings.randomAsciiOfLengthBetween(random, minLength, maxLength);
+        }
+
+        public String nextTextString(int minLength, int maxLength)
+        {
+            return RandomStrings.randomRealisticUnicodeOfLengthBetween(random, minLength, maxLength);
+        }
+
+        public boolean nextBoolean()
+        {
+            return random.nextBoolean();
+        }
+
+        public void nextBytes(byte[] bytes)
+        {
+            random.nextBytes(bytes);
+        }
+    }
+
+    public static class FailureWatcher extends TestWatcher
+    {
+        @Override
+        protected void failed(Throwable e, Description description)
+        {
+            if (random != null) // nothing to report when no Randomization instance exists
+                random.printSeedOnFailure(); // prints the seed to stderr so the failing run can be replayed
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
index 21475b686b13..144c3f9375ba 100644
--- a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
@@ -47,13 +47,13 @@ public class GcCompactionTest extends CQLTester
     // Test needs synchronous table drop to avoid flushes causing flaky failures
 
     @Override
-    protected String createTable(String query)
+    public String createTable(String query)
     {
         return super.createTable(KEYSPACE_PER_TEST, query);
     }
 
     @Override
-    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    public UntypedResultSet execute(String query, Object... values) throws Throwable
     {
         return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
     }
diff --git a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
index b76cc784396c..122f97a339fa 100644
--- a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
+++ b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
@@ -107,13 +107,13 @@ public static void setUpClass()
      * that we can assert on the key cache size and metrics.
      */
     @Override
-    protected String createTable(String query)
+    public String createTable(String query)
     {
         return super.createTable(KEYSPACE_PER_TEST, query + " WITH caching = { 'keys' : 'ALL', 'rows_per_partition' : '0' }");
     }
 
     @Override
-    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    public UntypedResultSet execute(String query, Object... values) throws Throwable
     {
         return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
     }
diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
index 1bd08ceee5ab..0b441d91cdb0 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
@@ -51,7 +51,7 @@ private void executeAndCheck(String query, int numSSTables, Object[]... rows) th
     }
 
     @Override
-    protected String createTable(String query)
+    public String createTable(String query)
     {
         String ret = super.createTable(KEYSPACE_PER_TEST, query);
         disableCompaction(KEYSPACE_PER_TEST);
@@ -59,7 +59,7 @@ protected String createTable(String query)
     }
 
     @Override
-    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    public UntypedResultSet execute(String query, Object... values) throws Throwable
     {
         return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
     }
diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index 4aa8a32e1c05..c3980b66648f 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
@@ -49,13 +49,13 @@ public class KeyspaceTest extends CQLTester
     // Test needs synchronous table drop to avoid flushes causing flaky failures of testLimitSSTables
 
     @Override
-    protected String createTable(String query)
+    public String createTable(String query)
     {
         return super.createTable(KEYSPACE_PER_TEST, query);
     }
 
     @Override
-    protected UntypedResultSet execute(String query, Object... values) throws Throwable
+    public UntypedResultSet execute(String query, Object... values) throws Throwable
     {
         return executeFormattedQuery(formatQuery(KEYSPACE_PER_TEST, query), values);
     }
diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
index c24690b8bf7a..caa800c9a432 100644
--- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
+++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java
@@ -17,7 +17,6 @@
  */
 package org.apache.cassandra.dht;
 
-import java.math.BigInteger;
 import java.util.List;
 
 import org.junit.AfterClass;
@@ -27,18 +26,15 @@
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.marshal.IntegerType;
-import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.RowUpdateBuilder;
-import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.partitions.FilteredPartition;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.bytecomparable.ByteComparable;
-import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
@@ -101,37 +97,4 @@ private void insert(String key)
         RowUpdateBuilder builder = new RowUpdateBuilder(Schema.instance.getTableMetadata(KEYSPACE1, CF), FBUtilities.timestampMicros(), key);
         builder.clustering("c").add("val", "asdf").build().applyUnsafe();
     }
-
-    static class BigIntegerToken extends ComparableObjectToken<BigInteger>
-    {
-        private static final long serialVersionUID = 1L;
-
-        public BigIntegerToken(BigInteger token)
-        {
-            super(token);
-        }
-
-        // convenience method for testing
-        public BigIntegerToken(String token) {
-            this(new BigInteger(token));
-        }
-
-        @Override
-        public IPartitioner getPartitioner()
-        {
-            return LengthPartitioner.instance;
-        }
-
-        @Override
-        public long getHeapSize()
-        {
-            return 0;
-        }
-
-        @Override
-        public ByteSource asComparableBytes(ByteComparable.Version version)
-        {
-            return IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(token), version);
-        }
-    }
 }
diff --git a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
index 57cc23830c6e..a16a65c451ae 100644
--- a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
+++ b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java
@@ -22,25 +22,38 @@
 import java.util.*;
 import java.util.concurrent.ThreadLocalRandom;
 
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.db.marshal.PartitionerDefinedOrder;
-import org.apache.cassandra.dht.KeyCollisionTest.BigIntegerToken;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.*;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
 
 public class LengthPartitioner implements IPartitioner
 {
-    public static final BigInteger ZERO = new BigInteger("0");
-    public static final BigIntegerToken MINIMUM = new BigIntegerToken("-1");
+    public static final Long ZERO = 0L;
+    public static final BigIntegerToken MINIMUM = new BigIntegerToken(-1L);
+    public static final BigIntegerToken MAXIMUM = new BigIntegerToken(Long.MAX_VALUE);
+
+    private final Splitter splitter = new Splitter(this)
+    {
+        public Token tokenForValue(BigInteger value)
+        {
+            return new BigIntegerToken(value.longValue());
+        }
+
+        public BigInteger valueForToken(Token token)
+        {
+            return BigInteger.valueOf(((BigIntegerToken)token).getTokenValue());
+        }
+    };
 
     public static LengthPartitioner instance = new LengthPartitioner();
 
@@ -52,16 +65,31 @@ public DecoratedKey decorateKey(ByteBuffer key)
     public BigIntegerToken midpoint(Token ltoken, Token rtoken)
     {
         // the symbolic MINIMUM token should act as ZERO: the empty bit array
-        BigInteger left = ltoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)ltoken).token;
-        BigInteger right = rtoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)rtoken).token;
-        Pair<BigInteger,Boolean> midpair = FBUtilities.midpoint(left, right, 127);
+        Long left = ltoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)ltoken).token;
+        Long right = rtoken.equals(MINIMUM) ? ZERO : ((BigIntegerToken)rtoken).token;
+        Pair<BigInteger,Boolean> midpair = FBUtilities.midpoint(BigInteger.valueOf(left), BigInteger.valueOf(right), 127);
         // discard the remainder
-        return new BigIntegerToken(midpair.left);
+        return new BigIntegerToken(midpair.left.longValue());
     }
 
-    public Token split(Token left, Token right, double ratioToLeft)
+    public Token split(Token tleft, Token tright, double ratio)
     {
-        throw new UnsupportedOperationException();
+        assert ratio >= 0.0 && ratio <= 1.0;
+        BigIntegerToken ltoken = (BigIntegerToken) tleft;
+        BigIntegerToken rtoken = (BigIntegerToken) tright;
+
+        long left = ltoken.token;
+        long right = rtoken.token;
+
+        if (left < right)
+        {
+            return new BigIntegerToken((long)(((right - left) * ratio) + left));
+        }
+        else
+        {  // wrapping case
+            Long max = MAXIMUM.token;
+            return new BigIntegerToken((long)(((max + right) - left) * ratio) + left);
+        }
     }
 
     public BigIntegerToken getMinimumToken()
@@ -72,7 +100,7 @@ public BigIntegerToken getMinimumToken()
     @Override
     public Token getMaximumToken()
     {
-        return null;
+        return MAXIMUM;
     }
 
     public BigIntegerToken getRandomToken()
@@ -82,24 +110,26 @@ public BigIntegerToken getRandomToken()
 
     public BigIntegerToken getRandomToken(Random random)
     {
-        return new BigIntegerToken(BigInteger.valueOf(random.nextInt(15)));
+        return new BigIntegerToken((long)random.nextInt(15));
     }
 
-    private final Token.TokenFactory tokenFactory = new Token.TokenFactory() {
+    private final Token.TokenFactory tokenFactory = new Token.TokenFactory()
+    {
         public ByteBuffer toByteArray(Token token)
         {
             BigIntegerToken bigIntegerToken = (BigIntegerToken) token;
-            return ByteBuffer.wrap(bigIntegerToken.token.toByteArray());
+            return ByteBufferUtil.bytes(bigIntegerToken.token);
         }
 
         public Token fromByteArray(ByteBuffer bytes)
         {
-            return new BigIntegerToken(new BigInteger(ByteBufferUtil.getArray(bytes)));
+            return new BigIntegerToken(ByteBufferUtil.toLong(bytes));
         }
 
+        @Override
         public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version)
         {
-            return fromByteArray(IntegerType.instance.fromComparableBytes(comparableBytes, version));
+            return new BigIntegerToken(ByteSourceInverse.getSignedLong(comparableBytes));
         }
 
         public String toString(Token token)
@@ -110,7 +140,7 @@ public String toString(Token token)
 
         public Token fromString(String string)
         {
-            return new BigIntegerToken(new BigInteger(string));
+            return new BigIntegerToken(Long.valueOf(string));
         }
 
         public void validate(String token) {}
@@ -130,7 +160,7 @@ public BigIntegerToken getToken(ByteBuffer key)
     {
         if (key.remaining() == 0)
             return MINIMUM;
-        return new BigIntegerToken(BigInteger.valueOf(key.remaining()));
+        return new BigIntegerToken((long)key.remaining());
     }
 
     public Map<Token, Float> describeOwnership(List<Token> sortedTokens)
@@ -179,4 +209,57 @@ public AbstractType<?> partitionOrdering()
     {
         return new PartitionerDefinedOrder(this);
     }
-}
\ No newline at end of file
+
+    public Optional<Splitter> splitter()
+    {
+        return Optional.of(splitter);
+    }
+
+    static class BigIntegerToken extends ComparableObjectToken<Long>
+    {
+        private static final long serialVersionUID = 1L;
+
+        public BigIntegerToken(Long token)
+        {
+            super(token);
+        }
+
+        // convenience constructor for tests: parses the token value from its decimal string form
+        public BigIntegerToken(String token) {
+            this(Long.valueOf(token));
+        }
+
+        public ByteSource asComparableBytes(ByteComparable.Version version)
+        {
+            ByteBuffer tokenBuffer = LongType.instance.decompose(token);
+            return LongType.instance.asComparableBytes(tokenBuffer, version);
+        }
+
+        @Override
+        public IPartitioner getPartitioner()
+        {
+            return LengthPartitioner.instance;
+        }
+
+        @Override
+        public long getHeapSize()
+        {
+            return 0;
+        }
+
+        @Override
+        public long getLongValue()
+        {
+            return token;
+        }
+
+        @Override
+        public double size(Token next)
+        {
+            BigIntegerToken n = (BigIntegerToken) next;
+            long v = n.token - token;  // Overflow acceptable and desired.
+            double d = Math.scalb((double)v, -127); // Scales by 2^-127. NOTE(review): a long spans 2^64 - confirm.
+            return d > 0.0 ? d : (d + 1.0); // Adjust for signed long, also making sure t.size(t) == 1.
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java b/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
new file mode 100644
index 000000000000..e762126f0f6d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+
+/**
+ * An {@link Index} that selects rows whose indexed column value is equal to the requested custom expression.
+ * The implementation relies only on {@link #customExpressionFor(TableMetadata, ByteBuffer)}, while the searcher
+ * returns all the rows satisfying the key range.
+ */
+public final class ExpressionFilteringIndex extends StubIndex
+{
+    private final TableMetadata table;
+    private final ColumnMetadata column;
+    public final AtomicInteger searches = new AtomicInteger(0); // incremented each time a searcher is executed
+
+    public ExpressionFilteringIndex(ColumnFamilyStore baseCfs, IndexMetadata metadata)
+    {
+        super(baseCfs, metadata);
+        this.table = baseCfs.metadata();
+        String columnName = metadata.options.get(IndexTarget.TARGET_OPTION_NAME);
+        assert columnName != null;
+        column = table.getColumn(UTF8Type.instance.decompose(columnName));
+    }
+
+    @Override
+    public AbstractType<?> customExpressionValueType()
+    {
+        return Int32Type.instance;
+    }
+
+    @Override
+    public RowFilter.CustomExpression customExpressionFor(TableMetadata cfm, ByteBuffer value)
+    {
+        return new RowFilter.CustomExpression(cfm, getIndexMetadata(), value)
+        {
+            @Override
+            public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey, Row row)
+            {
+                Cell cell = row.getCell(ExpressionFilteringIndex.this.column);
+                return cell != null && ByteBufferUtil.compareUnsigned(cell.buffer(), value) == 0; // no cell, no match
+            }
+        };
+    }
+
+    @Override
+    public Searcher searcherFor(ReadCommand command)
+    {
+        return controller -> {
+            searches.incrementAndGet();
+
+            ReadCommand all; // replacement command built without the original row filter
+            if (command instanceof SinglePartitionReadCommand)
+            {
+                SinglePartitionReadCommand cmd = (SinglePartitionReadCommand) command;
+                all = SinglePartitionReadCommand.create(table,
+                                                        cmd.nowInSec(),
+                                                        cmd.partitionKey(),
+                                                        cmd.clusteringIndexFilter().getSlices(cmd.metadata()));
+            }
+            else if (command instanceof PartitionRangeReadCommand)
+            {
+                PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
+                all = PartitionRangeReadCommand.create(table,
+                                                       cmd.nowInSec(),
+                                                       ColumnFilter.all(table),
+                                                       RowFilter.NONE,
+                                                       DataLimits.NONE,
+                                                       cmd.dataRange());
+            }
+            else
+            {
+                throw new UnsupportedOperationException();
+            }
+            return all.executeLocally(ReadExecutionController.empty()); // the lambda's own controller is not used
+        };
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/ActionBuilder.java b/test/unit/org/apache/cassandra/inject/ActionBuilder.java
new file mode 100644
index 000000000000..bfca1d25b6fb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/ActionBuilder.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Preconditions;
+
+import org.jboss.byteman.rule.helper.Helper;
+
+import static org.apache.cassandra.inject.Expression.THIS;
+import static org.apache.cassandra.inject.Expression.expr;
+import static org.apache.cassandra.inject.Expression.quote;
+
+/**
+ * Refer to the <a href="https://github.com/bytemanproject/byteman/blob/master/docs/asciidoc/src/main/asciidoc/chapters/Byteman-Rule-Language.adoc">Byteman rule language guide</a>
+ * and injections.md files in the root directory. Builds the BIND / IF / DO sections of a rule.
+ */
+public class ActionBuilder
+{
+    private Class<?> helperClass = Helper.class;
+    private final ConditionsBuilder conditionsBuilder = new ConditionsBuilder();
+    private final BindingsBuilder bindingsBuilder = new BindingsBuilder();
+    private final ActionsBuilder actionsBuilder = new ActionsBuilder();
+
+    public static ActionBuilder newActionBuilder() {
+        return new ActionBuilder();
+    }
+
+    private ActionBuilder() {}
+
+    public ActionBuilder withHelperClass(Class<?> helperClass)
+    {
+        this.helperClass = helperClass;
+        return this;
+    }
+
+    public Class<?> getHelperClass()
+    {
+        return helperClass;
+    }
+
+    public ConditionsBuilder conditions()
+    {
+        return conditionsBuilder;
+    }
+
+    public BindingsBuilder bindings()
+    {
+        return bindingsBuilder;
+    }
+
+    public ActionsBuilder actions()
+    {
+        return actionsBuilder;
+    }
+
+    /**
+     * When an invoke point is set on a method in an interface or in some base class and is not overridden in the
+     * class where we want the injection to be executed, we need to add some additional condition. This method
+     * configures the action to be invoked only in an instance of the specified class, by binding the class name
+     * obtained from THIS and requiring it to equal the name of the target class.
+     */
+    public ActionBuilder onlyInClass(Class<?> targetClass)
+    {
+        return onlyInClass(targetClass.getName());
+    }
+
+    /** @see #onlyInClass(Class) */
+    public ActionBuilder onlyInClass(String targetClassName)
+    {
+        bindings().addBinding("thisClassName", "String", expr(THIS).method("getClass").args().method("getName").args().toString());
+        conditions().when(expr("thisClassName").method("equals").args(quote(targetClassName)));
+        return this;
+    }
+
+    /**
+     * Configures the action to be invoked only if it's called from the given callerMethodName.
+     */
+    public ActionBuilder callerEquals(String callerMethodName)
+    {
+        conditions().when(expr("callerEquals").args(quote(callerMethodName)));
+        return this;
+    }
+
+    private enum LogicOp
+    {
+        AND, OR, NOT
+    }
+
+    /** Accumulates the rule's IF clause as a flat sequence of expressions and logic operators. */
+    public class ConditionsBuilder extends Builder
+    {
+        private final LinkedList<Object> elements = new LinkedList<>();
+
+        public ConditionsBuilder clear()
+        {
+            elements.clear();
+            return this;
+        }
+
+        /** Appends an expression, joining it to a directly preceding expression with an implicit AND. */
+        public ConditionsBuilder when(Object expression)
+        {
+            if (endsWithOperand())
+                elements.add(LogicOp.AND);
+            elements.add(expression);
+            return this;
+        }
+
+        /** Appends NOT (AND-ed with a preceding expression); also valid as the first element of a condition. */
+        public ConditionsBuilder not()
+        {
+            if (endsWithOperand())
+                elements.add(LogicOp.AND);
+            elements.add(LogicOp.NOT);
+            return this;
+        }
+
+        /** @throws IllegalStateException if there is no directly preceding expression to combine with */
+        public ConditionsBuilder and()
+        {
+            Preconditions.checkState(endsWithOperand());
+            elements.add(LogicOp.AND);
+            return this;
+        }
+
+        /** @throws IllegalStateException if there is no directly preceding expression to combine with */
+        public ConditionsBuilder or()
+        {
+            Preconditions.checkState(endsWithOperand());
+            elements.add(LogicOp.OR);
+            return this;
+        }
+
+        public ConditionsBuilder trueLiteral()
+        {
+            return when("TRUE");
+        }
+
+        public ConditionsBuilder falseLiteral()
+        {
+            return when("FALSE");
+        }
+
+        private boolean endsWithOperand()
+        {
+            return !elements.isEmpty() && !(elements.getLast() instanceof LogicOp); // getLast() throws when empty
+        }
+
+        public String buildInternal()
+        {
+            String joined = elements.stream().map(Object::toString).collect(Collectors.joining(" "));
+            return elements.isEmpty() ? "IF TRUE" : String.format("IF %s", joined);
+        }
+    }
+
+    public class ActionsBuilder extends Builder
+    {
+        private final List<String> actions = new LinkedList<>();
+
+        public ActionBuilder doThrow(Object newThrowableExpression)
+        {
+            actions.add(String.format("throw %s", newThrowableExpression));
+            return ActionBuilder.this;
+        }
+
+        public ActionBuilder doThrow(Class<? extends Throwable> exceptionClass, Object... args)
+        {
+            return doThrow(Expression.newInstance(exceptionClass).args(args));
+        }
+
+        public ActionBuilder doReturn(Object returnExpression)
+        {
+            actions.add(String.format("return %s", returnExpression));
+            return ActionBuilder.this;
+        }
+
+        public ActionsBuilder doAction(Object expression)
+        {
+            actions.add(expression.toString());
+            return this;
+        }
+
+        public String buildInternal()
+        {
+            if (actions.isEmpty())
+            {
+                return "DO NOTHING";
+            }
+            else
+            {
+                return String.format("DO %s", String.join(";\n", actions));
+            }
+        }
+    }
+
+    public class BindingsBuilder extends Builder
+    {
+        private final List<Binding> bindings = new LinkedList<>();
+
+        public BindingsBuilder addBinding(String name, String type, String expression)
+        {
+            bindings.add(new Binding(name, type, expression));
+            return this;
+        }
+
+        public BindingsBuilder clear()
+        {
+            bindings.clear();
+            return this;
+        }
+
+        public class Binding
+        {
+            public final String name;
+            public final String type;
+            public final String expression;
+
+            public Binding(String name, String type, String expression)
+            {
+                this.name = name;
+                this.type = type;
+                this.expression = expression;
+            }
+
+            @Override
+            public String toString()
+            {
+                return String.format("%s:%s = %s", name, type, expression);
+            }
+        }
+
+        @Override
+        String buildInternal()
+        {
+            if (bindings.isEmpty())
+            {
+                return "";
+            }
+            else
+            {
+                return String.format("BIND %s", bindings.stream().map(Binding::toString).collect(Collectors.joining("\n")));
+            }
+        }
+    }
+
+    String buildInternal()
+    {
+        return String.format("%s\n%s\n%s", bindings().buildInternal(), conditions().buildInternal(), actions().buildInternal());
+    }
+
+    public abstract class Builder
+    {
+        abstract String buildInternal();
+
+        public ActionsBuilder actions()
+        {
+            return actionsBuilder;
+        }
+
+        public ConditionsBuilder conditions()
+        {
+            return conditionsBuilder;
+        }
+
+        public BindingsBuilder bindings()
+        {
+            return bindingsBuilder;
+        }
+
+        public ActionBuilder toActionBuilder()
+        {
+            return ActionBuilder.this;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/CyclicBarrier.java b/test/unit/org/apache/cassandra/inject/CyclicBarrier.java
new file mode 100644
index 000000000000..fbfbc8c6ec11
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/CyclicBarrier.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Barrier built on a replaceable {@link CountDownLatch}: parties count the latch down and/or wait for it
+ * to reach zero, and {@link #reset()} installs a fresh latch for the next round.
+ * <p>
+ * Waiting is bounded by {@code LATCH_TIMEOUT_SECONDS} seconds; a wait that times out returns normally.
+ */
+public class CyclicBarrier
+{
+    private static final int LATCH_TIMEOUT_SECONDS = 20;
+
+    public final String name;
+    public final int parties;
+    public final boolean cyclic;
+
+    // volatile: the latch is swapped by reset() while other threads may be counting down or waiting on it
+    private volatile CountDownLatch latch;
+
+    /**
+     * @param name    name of the barrier
+     * @param parties number of count-downs needed to release waiting threads
+     * @param cyclic  whether a thread that finished waiting re-arms the barrier for another round
+     */
+    public CyclicBarrier(String name, int parties, boolean cyclic)
+    {
+        this.name = name;
+        this.parties = parties;
+        this.cyclic = cyclic;
+        this.latch = new CountDownLatch(parties);
+    }
+
+    /**
+     * Counts down and then waits; same as {@code await(true, true)}.
+     */
+    public void await() throws InterruptedException
+    {
+        await(true, true);
+    }
+
+    /**
+     * Performs one step on the barrier.
+     *
+     * @param doCountDown whether to count the latch down by one
+     * @param doAwait     whether to wait (up to the timeout) for the latch to reach zero; if the barrier is
+     *                    cyclic, it is re-armed once the wait is over
+     * @throws InterruptedException if the thread is interrupted while waiting
+     */
+    public void await(boolean doCountDown, boolean doAwait) throws InterruptedException
+    {
+        // Use one snapshot so that the count-down and the wait always hit the same latch, even when another
+        // thread swaps the latch in between.
+        CountDownLatch current = latch;
+        if (doCountDown)
+        {
+            current.countDown();
+        }
+        if (doAwait)
+        {
+            current.await(LATCH_TIMEOUT_SECONDS, TimeUnit.SECONDS);
+            if (cyclic)
+            {
+                rearm(current);
+            }
+        }
+    }
+
+    /**
+     * Installs a fresh latch expecting {@link #parties} count-downs. Threads already waiting on the previous
+     * latch are not released by this call.
+     */
+    public synchronized void reset()
+    {
+        latch = new CountDownLatch(parties);
+    }
+
+    /**
+     * Re-arms the barrier only if {@code awaited} is still the active latch, so that of all the threads
+     * released from the same round only the first one replaces it and count-downs already made on the
+     * new latch are not thrown away by the later ones.
+     */
+    private synchronized void rearm(CountDownLatch awaited)
+    {
+        if (latch == awaited)
+        {
+            latch = new CountDownLatch(parties);
+        }
+    }
+
+    /**
+     * @return the remaining count of the current latch
+     */
+    public long getCount()
+    {
+        return latch.getCount();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/Expression.java b/test/unit/org/apache/cassandra/inject/Expression.java
new file mode 100644
index 000000000000..638be721859e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/Expression.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.lang.annotation.Annotation;
+import java.lang.reflect.Method;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Fluent builder for the textual expressions used in injection rules. Text is accumulated in an internal
+ * buffer and returned by {@link #toString()}.
+ */
+public class Expression
+{
+    private final StringBuilder expression = new StringBuilder();
+
+    /**
+     * @return a new, empty expression
+     */
+    public static Expression expr()
+    {
+        return new Expression();
+    }
+
+    /**
+     * @return a new expression starting with the given text
+     */
+    public static Expression expr(String str)
+    {
+        Expression e = new Expression();
+        e.expression.append(str);
+        return e;
+    }
+
+    /**
+     * @return a new expression of the form {@code new <name of the given class>}
+     */
+    public static Expression newInstance(Class<?> clazz)
+    {
+        return expr("new ").append(clazz.getName());
+    }
+
+    /**
+     * @return a new expression of the form {@code new <clazz>}
+     */
+    public static Expression newInstance(String clazz)
+    {
+        return expr("new ").append(clazz);
+    }
+
+    /**
+     * @return a new expression consisting of the name of the given class
+     */
+    public static Expression clazz(Class<?> clazz)
+    {
+        return expr(clazz.getName());
+    }
+
+    /**
+     * Appends a dollar sign followed by the given inner class name.
+     */
+    public Expression innerClass(String clazz)
+    {
+        expression.append("$").append(clazz);
+        return this;
+    }
+
+    /**
+     * Appends the given method name, preceded by a dot unless this expression is still empty.
+     */
+    public Expression method(String method)
+    {
+        if (expression.length() > 0)
+        {
+            expression.append(".");
+        }
+        expression.append(method);
+        return this;
+    }
+
+    /**
+     * Appends a parenthesised, comma-separated argument list; each argument is rendered with
+     * {@link String#valueOf(Object)}.
+     */
+    public Expression args(Object... args)
+    {
+        expression.append("(").append(Arrays.stream(args).map(String::valueOf).collect(Collectors.joining(","))).append(")");
+        return this;
+    }
+
+    /**
+     * Builds a reference to the only method declared by {@code clazz} that carries {@code annotation}.
+     *
+     * @return an expression of the form {@code <class name>.<method name>}
+     * @throws IllegalArgumentException if the number of annotated methods declared by the class is not exactly one
+     */
+    public static Expression method(Class<?> clazz, Class<? extends Annotation> annotation)
+    {
+        List<Method> methods = Arrays.stream(clazz.getDeclaredMethods())
+                                     .filter(m -> m.isAnnotationPresent(annotation))
+                                     .collect(Collectors.toList());
+
+        Preconditions.checkArgument(methods.size() == 1, "There are " + methods.size() + " methods annotated with " + annotation.getSimpleName());
+        // clazz(...) is a static factory: call it directly instead of through a throw-away expr() instance
+        return clazz(clazz).method(methods.get(0).getName());
+    }
+
+    /**
+     * Appends the given text verbatim.
+     */
+    public Expression append(String elem)
+    {
+        expression.append(elem);
+        return this;
+    }
+
+    @Override
+    public String toString()
+    {
+        return expression.toString();
+    }
+
+    /**
+     * Wraps the given text in double quotes; the text itself is not escaped.
+     */
+    public static String quote(String quoted)
+    {
+        return "\"" + quoted + "\"";
+    }
+
+    /**
+     * @return a dollar sign followed by {@code n}
+     */
+    public static String arg(int n) { return "$" + n; }
+
+    public final static String THIS = "$this";
+
+    public final static String CLASS = "$CLASS";
+
+    public final static String METHOD = "$METHOD";
+}
diff --git a/test/unit/org/apache/cassandra/inject/Injection.java b/test/unit/org/apache/cassandra/inject/Injection.java
new file mode 100644
index 000000000000..d7333065feb1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/Injection.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+import java.util.Arrays;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.stream.Collectors;
+
+/**
+ * A set of rules identified by an id, plus a per-id enabled flag that can be queried through
+ * {@link #checkEnabled(String)}. Refer to the
+ * <a href="https://github.com/bytemanproject/byteman/blob/master/docs/asciidoc/src/main/asciidoc/chapters/Byteman-Rule-Language.adoc">Byteman rule language</a> documentation
+ * and injections.md files in the root directory.
+ */
+public class Injection
+{
+    // enabled state per injection id; static so that the static checkEnabled(String) can look it up
+    private static final Map<String, AtomicBoolean> enableFlags = new ConcurrentHashMap<>();
+    private final String id;
+    private final Rule[] rules;
+
+    /**
+     * Creates an injection with a random UUID as its id.
+     */
+    public Injection(Rule[] rules)
+    {
+        this(UUID.randomUUID().toString(), rules);
+    }
+
+    /**
+     * Creates an injection with the given id and marks that id as enabled.
+     */
+    public Injection(String id, Rule[] rules)
+    {
+        this.id = id;
+        this.rules = rules;
+        enable();
+    }
+
+    /**
+     * @return the scripts of all rules joined with newlines
+     */
+    public String format()
+    {
+        return Arrays.stream(rules)
+                     .map(rule -> rule.script)
+                     .collect(Collectors.joining("\n"));
+    }
+
+    /**
+     * @return the non-null {@code classToPreload} values of the rules, in rule order
+     */
+    public String[] getClassesToPreload()
+    {
+        return Arrays.stream(rules)
+                     .map(rule -> rule.classToPreload)
+                     .filter(className -> className != null)
+                     .toArray(String[]::new);
+    }
+
+    /**
+     * Marks this injection as enabled.
+     */
+    public void enable()
+    {
+        flagOf(id, false).set(true);
+    }
+
+    /**
+     * Marks this injection as disabled.
+     */
+    public void disable()
+    {
+        flagOf(id, false).set(false);
+    }
+
+    /**
+     * @return whether this injection is currently enabled
+     */
+    public boolean isEnabled()
+    {
+        return checkEnabled(id);
+    }
+
+    @Retention(RetentionPolicy.RUNTIME)
+    @Target(ElementType.METHOD)
+    public @interface CheckEnabled {}
+
+    /**
+     * @return the enabled flag of the given id; an id that has no flag yet is registered as enabled
+     */
+    @CheckEnabled
+    public static boolean checkEnabled(String id)
+    {
+        return flagOf(id, true).get();
+    }
+
+    /**
+     * Looks up the flag of the given id, registering one with the given initial value if there is none yet.
+     */
+    private static AtomicBoolean flagOf(String injectionId, boolean initialValue)
+    {
+        return enableFlags.computeIfAbsent(injectionId, key -> new AtomicBoolean(initialValue));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/InjectionBuilder.java b/test/unit/org/apache/cassandra/inject/InjectionBuilder.java
new file mode 100644
index 000000000000..537042e922ff
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/InjectionBuilder.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import static org.apache.cassandra.inject.Expression.method;
+import static org.apache.cassandra.inject.Expression.quote;
+
+/**
+ * Base class of builders producing {@link Injection} instances of type {@code T}.
+ */
+public abstract class InjectionBuilder<T extends Injection>
+{
+    public final String id;
+
+    /**
+     * @param id id of the injection to build
+     */
+    public InjectionBuilder(String id)
+    {
+        this.id = id;
+    }
+
+    /**
+     * @return the text of a call to the {@link Injection.CheckEnabled} method of {@link Injection},
+     *         with the quoted id of this builder as its only argument
+     */
+    protected String getIsEnabledExpression()
+    {
+        Expression enabledCheck = method(Injection.class, Injection.CheckEnabled.class);
+        enabledCheck.args(quote(id));
+        return enabledCheck.toString();
+    }
+
+    /**
+     * Creates an unmodifiable injection instance.
+     */
+    public abstract T build();
+
+}
diff --git a/test/unit/org/apache/cassandra/inject/Injections.java b/test/unit/org/apache/cassandra/inject/Injections.java
new file mode 100644
index 000000000000..a4d090a6c9f4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/Injections.java
@@ -0,0 +1,590 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.io.IOException;
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+import java.net.ServerSocket;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+import org.apache.commons.io.IOUtils;
+
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.HeapUtils;
+import org.jboss.byteman.agent.install.Install;
+import org.jboss.byteman.agent.submit.Submit;
+import org.jboss.byteman.rule.helper.Helper;
+
+import static org.apache.cassandra.inject.ActionBuilder.newActionBuilder;
+import static org.apache.cassandra.inject.Expression.expr;
+import static org.apache.cassandra.inject.Expression.method;
+import static org.apache.cassandra.inject.Expression.quote;
+
+public class Injections
+{
+    private static Submit submitter;
+
+    public static void inject(Injection...injections) throws Throwable
+    {
+        String script = Arrays.stream(injections).map(Injection::format).collect(Collectors.joining("\n"));
+        getSubmitter().addRulesFromResources(Lists.newArrayList(IOUtils.toInputStream(script)));
+    }
+
+    public static void deleteAll()
+    {
+        try
+        {
+            getSubmitter().deleteAllRules();
+        }
+        catch (Throwable ignore)
+        {
+            // Ignore because it will throw if there aren't any injections
+        }
+    }
+
+    private static Submit getSubmitter() throws Throwable
+    {
+        if (submitter == null)
+        {
+            submitter = new Submit(FBUtilities.getBroadcastAddressAndPort().address.getHostAddress(), loadAgent());
+        }
+        return submitter;
+    }
+
+    private static int loadAgent() throws Throwable
+    {
+        int port = getPort();
+        long pid = HeapUtils.getProcessId();
+        List<String> properties = new ArrayList<>();
+        properties.add("org.jboss.byteman.transform.all=true");
+        Install.install(Long.toString(pid), true, true, FBUtilities.getBroadcastAddressAndPort().address.getHostAddress(), port, properties.toArray(new String[0]));
+        return port;
+    }
+
+    private static int getPort()
+    {
+        try (ServerSocket serverSocket = new ServerSocket(0))
+        {
+            return serverSocket.getLocalPort();
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Retention(RetentionPolicy.RUNTIME)
+    @Target(ElementType.METHOD)
+    @interface CallMe {}
+
+    public abstract static class MultiInvokePointInjectionBuilder<T extends Injection, B extends InjectionBuilder<T>>
+            extends InjectionBuilder<T>
+    {
+        final List<InvokePointBuilder> invokePointBuilders = new LinkedList<>();
+
+        MultiInvokePointInjectionBuilder(String id)
+        {
+            super(id);
+        }
+
+        /**
+         * Adds a new invoke point to the injection. Adding a new invoke point causes a new rule to be created because
+         * a single rule can only have a single invoke point.
+         */
+        public B add(InvokePointBuilder ipb)
+        {
+            invokePointBuilders.add(ipb);
+            return (B) this;
+        }
+    }
+
+    public abstract static class CrossProductInjectionBuilder<T extends Injection, B extends InjectionBuilder<T>>
+            extends MultiInvokePointInjectionBuilder<T, B>
+    {
+        final LinkedList<ActionBuilder> actionBuilders = new LinkedList<>();
+
+        CrossProductInjectionBuilder(String id)
+        {
+            super(id);
+        }
+
+        Rule[] getRules()
+        {
+            // one rule per (invoke point, action) pair, named <id>_1, <id>_2, ... in iteration order
+            List<Rule> rules = new ArrayList<>(actionBuilders.size() * invokePointBuilders.size());
+            for (InvokePointBuilder ipb : invokePointBuilders)
+            {
+                for (ActionBuilder ab : actionBuilders)
+                {
+                    rules.add(Rule.newRule(id + "_" + (rules.size() + 1), ab, ipb));
+                }
+            }
+            return rules.toArray(new Rule[0]);
+        }
+
+        /**
+         * Adds a new action to the injection. Adding a new action causes a new rule to be created because a single rule
+         * can only have a single action. Do not confuse an action with a statement - an action is a bundle of bindings,
+         * a condition under which it can be invoked and a sequence of statements.
+         *
+         * If you just need to add a statement to the existing action, see {@link #withLastActionBuilder(Consumer)} and
+         * {@link #lastActionBuilder()}.
+         */
+        public B add(ActionBuilder builder)
+        {
+            if (builder.getHelperClass() == null)
+            {
+                builder.withHelperClass(Helper.class);
+            }
+            builder.conditions().when(getIsEnabledExpression());
+            actionBuilders.add(builder);
+            return (B) this;
+        }
+
+        /**
+         * @see #add(ActionBuilder)
+         */
+        public B add(ActionBuilder.Builder builder)
+        {
+            return add(builder.toActionBuilder());
+        }
+
+        /**
+         * Allows to modify the last defined action. You can add new bindings, conditions and statements. If you need
+         * to create a new action, please see {@link #add(ActionBuilder)}.
+         */
+        public ActionBuilder lastActionBuilder()
+        {
+            // checked up front so that an empty list is reported as an IllegalStateException
+            Preconditions.checkState(!actionBuilders.isEmpty());
+            return actionBuilders.getLast();
+        }
+
+        /**
+         * @see #lastActionBuilder()
+         */
+        public B withLastActionBuilder(Consumer<ActionBuilder> builder)
+        {
+            Preconditions.checkState(!actionBuilders.isEmpty());
+            ActionBuilder ab = actionBuilders.getLast();
+            builder.accept(ab);
+            return (B) this;
+        }
+    }
+
+    /**
+     * Creates a new {@link Counter} injection.
+     *
+     * @param name name of the internal counter
+     */
+    public static Counter.CounterBuilder newCounter(String name)
+    {
+        return new Counter.CounterBuilder(name);
+    }
+
+    /**
+     * Creates an injection along with a distributed counter. It increments the counter whenever an invoke point
+     * is reached. You can add multiple invoke points and you can also bundle additional actions - for example,
+     * throw an exception and count how many times it happened.
+     */
+    public static class Counter extends Injection
+    {
+        private static Map<String, AtomicLong> counters = new ConcurrentHashMap<>();
+        private final String name;
+        private final AtomicLong internalCounter;
+
+        private Counter(String id, String name, Rule[] rules)
+        {
+            super(id, rules);
+            this.name = name;
+            this.internalCounter = counters.computeIfAbsent(name, n -> new AtomicLong());
+            reset();
+        }
+
+        /**
+         * Get a current value of the counter.
+         */
+        public long get()
+        {
+            return internalCounter.get();
+        }
+
+        /**
+         * Reset counter value to 0.
+         */
+        public void reset()
+        {
+            internalCounter.set(0);
+        }
+
+        @CallMe
+        public static void increment(String name)
+        {
+            counters.get(name).incrementAndGet();
+        }
+
+        public static class CounterBuilder extends CrossProductInjectionBuilder<Counter, CounterBuilder>
+        {
+            private final String name;
+
+            private CounterBuilder(String name)
+            {
+                super(String.format("counter/%s/%s", name, UUID.randomUUID().toString()));
+                this.name = name;
+                add(newActionBuilder().actions().doAction(method(Counter.class, CallMe.class).args(quote(name))));
+            }
+
+            @Override
+            public Counter build()
+            {
+                return new Counter(id, name, getRules());
+            }
+        }
+    }
+
+    /**
+     * Creates a new {@link Barrier} injection.
+     *
+     * @param name name of the barrier
+     * @param parties how many parties the barrier should wait for
+     * @param cyclic whether the barrier should reset after resume
+     */
+    public static Barrier.BarrierBuilder newBarrier(String name, int parties, boolean cyclic)
+    {
+        return new Barrier.BarrierBuilder(name, parties, cyclic, true, true);
+    }
+
+    /**
+     * Creates a {@link Barrier} injection point where the number of parties is just counted down but it does not await.
+     *
+     * @param name name of the barrier
+     * @param parties how many parties a new barrier should wait for
+     * @param cyclic whether the barrier should reset after resume
+     */
+    public static Barrier.BarrierBuilder newBarrierCountDown(String name, int parties, boolean cyclic)
+    {
+        return new Barrier.BarrierBuilder(name, parties, cyclic, true, false);
+    }
+
+    /**
+     * Creates a {@link Barrier} injection point where it just awaits but it does not decrement the number of parties
+     * it is awaiting for.
+     *
+     * @param name name of the barrier
+     * @param parties how many parties a new barrier should wait for
+     * @param cyclic whether the barrier should reset after resume
+     */
+    public static Barrier.BarrierBuilder newBarrierAwait(String name, int parties, boolean cyclic)
+    {
+        return new Barrier.BarrierBuilder(name, parties, cyclic, false, true);
+    }
+
+    /**
+     * Creates an injection with a distributed barrier awaiting for a defined number of parties to reach it
+     * (including test node). It can be used to synchronize multiple nodes
+     */
+    public static class Barrier extends Injection
+    {
+        private static Map<String, CyclicBarrier> barriers = new ConcurrentHashMap<>();
+        private final boolean doCountDown;
+        private final boolean doAwait;
+        private final CyclicBarrier internalBarrier;
+
+        private Barrier(String id, String name, int parties, boolean cyclic, boolean doCountDown, boolean doAwait, Rule[] rules)
+        {
+            super(id, rules);
+            this.internalBarrier = barriers.computeIfAbsent(name, n -> new CyclicBarrier(name, parties, cyclic));
+            this.doCountDown = doCountDown;
+            this.doAwait = doAwait;
+            reset();
+        }
+
+        /**
+         * Do a single step for this barrier, that is, decrement the number of parties and await, depending on which
+         * actions are enabled. It may just decrement the number of parties without awaiting, or it may just
+         * await without decrementing the number of parties.
+         */
+        public void arrive() throws InterruptedException
+        {
+            internalBarrier.await(doCountDown, doAwait);
+        }
+
+        /**
+         * Just wait for this barrier to be released. Do not decrement the number of parties regardless of how this
+         * barrier was setup - this is force await.
+         */
+        public void await() throws InterruptedException
+        {
+            internalBarrier.await(false, true);
+        }
+
+        /**
+         * Just decrement the number of parties this barrier is waiting for but do not wait regardless of how
+         * this barrier was setup - this is force countdown.
+         */
+        public void countDown()
+        {
+            try
+            {
+                internalBarrier.await(true, false);
+            }
+            catch (InterruptedException ignored)
+            {
+                // will not happen in case just a countdown
+            }
+        }
+
+        /**
+         * Get the number of parties the barrier is currently awaiting for.
+         */
+        public long getCount()
+        {
+            return internalBarrier.getCount();
+        }
+
+        /**
+         * Reset the barrier, that is, make it wait for the full number of parties again. This happens regardless
+         * of whether cyclic is enabled; threads already waiting keep waiting on the previous latch.
+         */
+        public void reset()
+        {
+            internalBarrier.reset();
+        }
+
+        @CallMe
+        public static void doAction(String name, boolean doCountDown, boolean doAwait) throws InterruptedException
+        {
+            CyclicBarrier barrier = barriers.get(name);
+            barrier.await(doCountDown, doAwait);
+        }
+
+        public static class BarrierBuilder extends CrossProductInjectionBuilder<Barrier, BarrierBuilder>
+        {
+            private final String name;
+            private final int parties;
+            private final boolean cyclic;
+            private final boolean doAwait;
+            private final boolean doCountDown;
+
+            private BarrierBuilder(String name, int parties, boolean cyclic, boolean doCountDown, boolean doAwait)
+            {
+                super(String.format("barrier/%s/%s", name, UUID.randomUUID().toString()));
+                this.name = name;
+                this.parties = parties;
+                this.cyclic = cyclic;
+                this.doAwait = doAwait;
+                this.doCountDown = doCountDown;
+                add(newActionBuilder().actions().doAction(method(Barrier.class, CallMe.class)
+                        .args(quote(name), doCountDown, doAwait)));
+            }
+
+            @Override
+            public Barrier build()
+            {
+                return new Barrier(id, name, parties, cyclic, doCountDown, doAwait, getRules());
+            }
+        }
+    }
+
+    /**
+     * Creates {@link Times} injection.
+     *
+     * @param name name of the internal counter
+     * @param defaultTimes the number of times the action should be executed
+     */
+    public static Times.TimesBuilder newTimes(String name, int defaultTimes)
+    {
+        return new Times.TimesBuilder(name, defaultTimes);
+    }
+
+    /**
+     * Creates an injection which allows to invoke a defined action for a defined number of times.
+     */
+    public static class Times extends Injection
+    {
+        private static Map<String, AtomicLong> counters = new ConcurrentHashMap<>();
+        private final AtomicLong internalCounter;
+        private final int defaultTimes;
+
+        private Times(String id, String name, int defaultTimes, Rule[] rules)
+        {
+            super(id, rules);
+            this.internalCounter = counters.computeIfAbsent(name, n -> new AtomicLong(defaultTimes));
+            this.defaultTimes = defaultTimes;
+            reset();
+        }
+
+        /**
+         * Get the remaining number of times the action will be attempted to be executed.
+         */
+        public long get()
+        {
+            return internalCounter.get();
+        }
+
+        /**
+         * Reset the internal counter to the original value.
+         */
+        public void reset()
+        {
+            reset(defaultTimes);
+        }
+
+        /**
+         * Reset the internal counter to the given value.
+         */
+        public void reset(int n)
+        {
+            internalCounter.set(n);
+        }
+
+        // decrements the counter registered under the given name; true if the result is still non-negative
+        @CallMe
+        public static boolean decrementAndCheck(String name)
+        {
+            long remaining = counters.get(name).decrementAndGet();
+            return remaining >= 0;
+        }
+
+        public static class TimesBuilder extends CrossProductInjectionBuilder<Times, TimesBuilder>
+        {
+            private final String name;
+            private final int defaultTimes;
+
+            private TimesBuilder(String name, int defaultTimes)
+            {
+                super(String.format("times/%s/%s", name, UUID.randomUUID().toString()));
+                this.name = name;
+                this.defaultTimes = defaultTimes;
+            }
+
+            @Override
+            public TimesBuilder add(ActionBuilder builder)
+            {
+                super.add(builder);
+                builder.conditions().when(method(Times.class, CallMe.class).args(quote(name)));
+                return this;
+            }
+
+            @Override
+            public Times build()
+            {
+                return new Times(id, name, defaultTimes, getRules());
+            }
+        }
+    }
+
+    public abstract static class SingleActionBuilder<T extends Injection, B extends InjectionBuilder<T>>
+            extends MultiInvokePointInjectionBuilder<T, B>
+    {
+        protected final ActionBuilder actionBuilder = newActionBuilder();
+
+        public SingleActionBuilder(String id)
+        {
+            super(id);
+            actionBuilder.conditions().when(getIsEnabledExpression());
+        }
+
+        public B action(Consumer<ActionBuilder> builder)
+        {
+            builder.accept(actionBuilder);
+            return (B) this;
+        }
+
+        protected Rule[] getRules()
+        {
+            // one rule per invoke point, all sharing the single action; names are <id>_1, <id>_2, ...
+            List<Rule> rules = new ArrayList<>(invokePointBuilders.size());
+            for (InvokePointBuilder ipb : invokePointBuilders)
+            {
+                rules.add(Rule.newRule(id + "_" + (rules.size() + 1), actionBuilder, ipb));
+            }
+            return rules.toArray(new Rule[0]);
+        }
+    }
+
+    /**
+     * Creates a custom injection which allows you to define your invoke points and actions from scratch.
+     *
+     * @param name name of the injection
+     */
+    public static CustomBuilder newCustom(String name)
+    {
+        return new CustomBuilder(name);
+    }
+
+    public static class CustomBuilder extends CrossProductInjectionBuilder<Injection, CustomBuilder>
+    {
+        public CustomBuilder(String name)
+        {
+            super(String.format("custom/%s/%s", name, UUID.randomUUID().toString()));
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        @Override
+        public Injection build()
+        {
+            return new Injection(id, getRules()) {};
+        }
+    }
+
+    /**
+     * Creates a pause to hold on the thread for a defined amount of time.
+     *
+     * @param name name of the pause
+     * @param timeout time out in milliseconds
+     */
+    public static PauseBuilder newPause(String name, long timeout)
+    {
+        return new PauseBuilder(name, timeout);
+    }
+
+    public static class PauseBuilder extends SingleActionBuilder<Injection, PauseBuilder>
+    {
+        public PauseBuilder(String name, long timeout)
+        {
+            super(String.format("pause/%s/%s", name, UUID.randomUUID().toString()));
+            actionBuilder.actions().doAction(expr("Thread.sleep").args(timeout));
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        @Override
+        public Injection build()
+        {
+            return new Injection(id, getRules());
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/InvokePointBuilder.java b/test/unit/org/apache/cassandra/inject/InvokePointBuilder.java
new file mode 100644
index 000000000000..5ae3d91e453a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/InvokePointBuilder.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+/**
+ * Refer to the <a href="https://github.com/bytemanproject/byteman/blob/master/docs/asciidoc/src/main/asciidoc/chapters/Byteman-Rule-Language.adoc">Byteman Rule Language</a> guide
+ * and injections.md files in the root directory.
+ */
+public class InvokePointBuilder
+{
+    private String invokePoint = "AT ENTRY";
+    private String targetClassOrInterface;
+    private boolean targetInterface;
+    private String targetMethod;
+
+    public static InvokePointBuilder newInvokePoint()
+    {
+        return new InvokePointBuilder();
+    }
+
+    public InvokePointBuilder onClass(Class<?> targetClass)
+    {
+        if (targetClass.isInterface())
+        {
+            return onInterface(targetClass.getName());
+        }
+        else
+        {
+            return onClass(targetClass.getName());
+        }
+    }
+
+    public InvokePointBuilder onClass(String targetClass)
+    {
+        this.targetClassOrInterface = targetClass;
+        this.targetInterface = false;
+        return this;
+    }
+
+    public InvokePointBuilder onInterface(String targetInterface)
+    {
+        this.targetClassOrInterface = targetInterface;
+        this.targetInterface = true;
+        return this;
+    }
+
+    public InvokePointBuilder onClass(Class<?> enclosingClass, String targetClass)
+    {
+        this.targetClassOrInterface = String.format("%s$%s", enclosingClass.getName(), targetClass);
+        this.targetInterface = false;
+        return this;
+    }
+
+    public InvokePointBuilder onMethod(String targetMethod, Object... methodArgs)
+    {
+        if (methodArgs.length > 0)
+        {
+            targetMethod = targetMethod + Arrays.stream(methodArgs)
+                                                .map(arg -> (arg instanceof Class<?>) ? ((Class) arg).getName() : String.valueOf(arg))
+                                                .collect(Collectors.joining(",", "(", ")"));
+        }
+        this.targetMethod = targetMethod;
+        return this;
+    }
+
+    public InvokePointBuilder atEntry()
+    {
+        invokePoint = "AT ENTRY";
+        return this;
+    }
+
+    public InvokePointBuilder atExit()
+    {
+        invokePoint = "AT EXIT";
+        return this;
+    }
+
+    public InvokePointBuilder at(String atExpression)
+    {
+        invokePoint = String.format("AT %s", atExpression);
+        return this;
+    }
+
+    public InvokePointBuilder after(String afterExpression)
+    {
+        invokePoint = String.format("AFTER %s", afterExpression);
+        return this;
+    }
+
+    public InvokePointBuilder atInvoke(String method)
+    {
+        invokePoint = String.format("AT INVOKE %s", method);
+        return this;
+    }
+
+    public InvokePointBuilder atExceptionExit()
+    {
+        invokePoint = "AT EXCEPTION EXIT";
+        return this;
+    }
+
+    public String getTargetClassOrInterface()
+    {
+        return targetClassOrInterface;
+    }
+
+    String buildInternal()
+    {
+        return String.format("%s %s\nMETHOD %s\n%s",
+                targetInterface ? "INTERFACE" : "CLASS",
+                             targetClassOrInterface,
+                             targetMethod,
+                             invokePoint);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/inject/Rule.java b/test/unit/org/apache/cassandra/inject/Rule.java
new file mode 100644
index 000000000000..5550d95441d1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/inject/Rule.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.inject;
+
+import java.util.UUID;
+
+/**
+ * Refer to the <a href="https://github.com/bytemanproject/byteman/blob/master/docs/asciidoc/src/main/asciidoc/chapters/Byteman-Rule-Language.adoc">Byteman Rule Language</a> guide
+ * and injections.md files in the root directory.
+ */
+public class Rule
+{
+    public final String id;
+    public final String script;
+    public final String classToPreload;
+
+    public static Rule newRule(String script)
+    {
+        return newRule(UUID.randomUUID().toString(), script);
+    }
+
+    public static Rule newRule(ActionBuilder actionBuilder, InvokePointBuilder invokePointBuilder)
+    {
+        return newRule(UUID.randomUUID().toString(), actionBuilder, invokePointBuilder);
+    }
+
+    public static Rule newRule(String id, String script)
+    {
+        return new Rule(id, script, null);
+    }
+
+    public static Rule newRule(String id, ActionBuilder actionBuilder, InvokePointBuilder invokePointBuilder)
+    {
+        String script = String.format("RULE %s\n%s\n%s\nENDRULE", id, invokePointBuilder.buildInternal(), actionBuilder.buildInternal());
+        return new Rule(id, script, invokePointBuilder.getTargetClassOrInterface());
+    }
+
+    private Rule(String id, String script, String classToPreload)
+    {
+        this.id = id;
+        this.script = script;
+        this.classToPreload = classToPreload;
+    }
+}
diff --git a/update-history/STAR-801/60-f4d12eae47 STAR-121: Test support for SAI b/update-history/STAR-801/60-f4d12eae47 STAR-121: Test support for SAI
new file mode 100644
index 000000000000..51fb05dcc60a
--- /dev/null
+++ b/update-history/STAR-801/60-f4d12eae47 STAR-121: Test support for SAI	
@@ -0,0 +1,51 @@
+--- a/build.xml
++++ b/build.xml
+@@ -728,29 +728,15 @@
+         <dependency groupId="org.apache.ant" artifactId="ant-junit"/>
+         <!-- adding this dependency is necessary for assertj. When updating assertj, need to also update the version of
+              this that the new assertj's `assertj-parent-pom` depends on. -->
+-<<<<<<<
+-        <dependency groupId="org.junit" artifactId="junit-bom" type="pom" scope="test"/>
+-        <dependency groupId="org.assertj" artifactId="assertj-core" scope="test"/>
+-        <dependency groupId="org.awaitility" artifactId="awaitility" scope="test"/>
+-        <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
+-        <!-- coverage debs -->
+-        <dependency groupId="org.jacoco" artifactId="org.jacoco.agent" scope="test"/>
+-        <dependency groupId="org.jacoco" artifactId="org.jacoco.ant" scope="test"/>
+-        <dependency groupId="org.jboss.byteman" artifactId="byteman-install" scope="test"/>
+-        <dependency groupId="org.jboss.byteman" artifactId="byteman" scope="test"/>
+-        <dependency groupId="org.jboss.byteman" artifactId="byteman-submit" scope="test"/>
+-        <dependency groupId="org.jboss.byteman" artifactId="byteman-bmunit" scope="test"/>
+-
+-        <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
+-        <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
+-=======
+         <dependency groupId="org.junit" artifactId="junit-bom" type="pom"/>
+         <dependency groupId="org.awaitility" artifactId="awaitility"/>
+         <dependency groupId="org.hamcrest" artifactId="hamcrest"/>
+         <!-- coverage debs -->
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.agent"/>
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
+->>>>>>>
++
++        <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
++        <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
+       </artifact:pom>
+ 
+       <!-- now the pom's for artifacts being deployed to Maven Central -->
+@@ -794,15 +780,9 @@
+         <dependency groupId="ch.qos.logback" artifactId="logback-core"/>
+         <dependency groupId="ch.qos.logback" artifactId="logback-classic"/>
+ 
+-<<<<<<<
+           <!-- don't need hadoop classes to run, but if you use the hadoop stuff -->
+-        <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" optional="true" scope="provided"/>
+-        <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" optional="true" scope="provided"/>
+-=======
+-        <!-- don't need hadoop classes to run, but if you use the hadoop stuff -->
+         <dependency groupId="org.apache.hadoop" artifactId="hadoop-core" optional="true"/>
+         <dependency groupId="org.apache.hadoop" artifactId="hadoop-minicluster" optional="true"/>
+->>>>>>>
+ 
+         <!-- don't need the Java Driver to run, but if you use the hadoop stuff or UDFs -->
+         <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" classifier="shaded" optional="true"/>

From 54af59d4f31bff6ee7cbf8f670e7d82e94cc00d2 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 18 Feb 2021 17:48:11 +0000
Subject: [PATCH 044/151] STAR-121: Storage Attached Index implementation and
 testing

(cherry picked from commit 430c24026fb2a4afea735475408f0ac34944478f)
(cherry picked from commit acbfeb80cf865ebe231146ad8ed54e03fef06cd3)
---
 .../db/virtual/SystemViewsKeyspace.java       |    7 +
 .../cassandra/index/sai/ColumnContext.java    |  591 +++++
 .../cassandra/index/sai/QueryContext.java     |  102 +
 .../org/apache/cassandra/index/sai/README.md  |  129 ++
 .../cassandra/index/sai/SSTableContext.java   |  290 +++
 .../index/sai/SSTableContextManager.java      |  143 ++
 .../cassandra/index/sai/SSTableIndex.java     |  364 +++
 .../index/sai/SSTableQueryContext.java        |   63 +
 .../index/sai/StorageAttachedIndex.java       |  728 ++++++
 .../sai/StorageAttachedIndexBuilder.java      |  369 +++
 .../index/sai/StorageAttachedIndexGroup.java  |  425 ++++
 .../org/apache/cassandra/index/sai/Token.java |  153 ++
 .../index/sai/analyzer/AbstractAnalyzer.java  |  116 +
 .../index/sai/analyzer/NoOpAnalyzer.java      |   68 +
 .../sai/analyzer/NonTokenizingAnalyzer.java   |  160 ++
 .../sai/analyzer/NonTokenizingOptions.java    |  162 ++
 .../analyzer/filter/BasicResultFilters.java   | 2007 +++++++++++++++++
 .../filter/FilterPipelineBuilder.java         |   52 +
 .../filter/FilterPipelineExecutor.java        |   50 +
 .../analyzer/filter/FilterPipelineTask.java   |   60 +
 .../cassandra/index/sai/disk/BKDQueries.java  |  201 ++
 .../index/sai/disk/ByteSliceReader.java       |  168 ++
 .../index/sai/disk/ColumnIndexWriter.java     |   46 +
 .../sai/disk/ImmutableOneDimPointValues.java  |   76 +
 .../index/sai/disk/IndexSearcher.java         |  224 ++
 .../index/sai/disk/IndexWriterConfig.java     |  141 ++
 .../index/sai/disk/InvertedIndexSearcher.java |  120 +
 .../index/sai/disk/KDTreeIndexSearcher.java   |  115 +
 .../index/sai/disk/MemtableIndexWriter.java   |  181 ++
 .../index/sai/disk/MemtableTermsIterator.java |  134 ++
 .../sai/disk/MergeOneDimPointValues.java      |  152 ++
 .../index/sai/disk/MergingIterator.java       |  172 ++
 .../sai/disk/MutableOneDimPointValues.java    |  112 +
 .../index/sai/disk/OnDiskKeyProducer.java     |  110 +
 .../cassandra/index/sai/disk/PostingList.java |  197 ++
 .../sai/disk/PostingListRangeIterator.java    |  199 ++
 .../index/sai/disk/QueryEventListeners.java   |   97 +
 .../index/sai/disk/RAMPostingSlices.java      |  133 ++
 .../index/sai/disk/RAMStringIndexer.java      |  163 ++
 .../sai/disk/SSTableComponentsWriter.java     |  145 ++
 .../index/sai/disk/SSTableIndexWriter.java    |  358 +++
 .../cassandra/index/sai/disk/Segment.java     |  185 ++
 .../index/sai/disk/SegmentBuilder.java        |  295 +++
 .../index/sai/disk/SegmentMerger.java         |  197 ++
 .../index/sai/disk/SegmentMetadata.java       |  406 ++++
 .../sai/disk/StorageAttachedIndexWriter.java  |  255 +++
 .../index/sai/disk/TermsIterator.java         |   45 +
 .../index/sai/disk/TermsIteratorMerger.java   |  141 ++
 .../index/sai/disk/format/Version.java        |   56 +
 .../index/sai/disk/io/BytesRefUtil.java       |   38 +
 .../index/sai/disk/io/CryptoUtils.java        |  134 ++
 .../index/sai/disk/io/EmptyDirectory.java     |  100 +
 .../index/sai/disk/io/FilterIndexInput.java   |   86 +
 .../index/sai/disk/io/IndexComponents.java    |  768 +++++++
 .../index/sai/disk/io/IndexInputReader.java   |  194 ++
 .../index/sai/disk/io/IndexOutputWriter.java  |  115 +
 .../index/sai/disk/io/RAMIndexOutput.java     |   77 +
 .../disk/v1/AbstractBlockPackedReader.java    |  233 ++
 .../disk/v1/AbstractBlockPackedWriter.java    |  117 +
 .../index/sai/disk/v1/BKDPostingsIndex.java   |   95 +
 .../index/sai/disk/v1/BKDReader.java          |  809 +++++++
 .../index/sai/disk/v1/BKDTreeRamBuffer.java   |  194 ++
 .../index/sai/disk/v1/BKDWriter.java          | 1042 +++++++++
 .../index/sai/disk/v1/BlockPackedReader.java  |  144 ++
 .../index/sai/disk/v1/BlockPackedWriter.java  |   79 +
 .../index/sai/disk/v1/DirectReaders.java      |  235 ++
 .../sai/disk/v1/FilteringPostingList.java     |  106 +
 .../sai/disk/v1/InvertedIndexWriter.java      |  109 +
 .../index/sai/disk/v1/LeafOrderMap.java       |   45 +
 .../index/sai/disk/v1/MergePostingList.java   |  126 ++
 .../index/sai/disk/v1/MetadataSource.java     |  104 +
 .../index/sai/disk/v1/MetadataWriter.java     |   86 +
 .../disk/v1/MonotonicBlockPackedReader.java   |  132 ++
 .../disk/v1/MonotonicBlockPackedWriter.java   |   86 +
 .../index/sai/disk/v1/NumericIndexWriter.java |  190 ++
 .../index/sai/disk/v1/NumericValuesMeta.java  |   51 +
 .../sai/disk/v1/NumericValuesWriter.java      |   94 +
 .../sai/disk/v1/OneDimBKDPostingsWriter.java  |  186 ++
 .../index/sai/disk/v1/OrdinalPostingList.java |   29 +
 .../sai/disk/v1/PackedLongsPostingList.java   |   64 +
 .../index/sai/disk/v1/PostingsReader.java     |  395 ++++
 .../index/sai/disk/v1/PostingsWriter.java     |  313 +++
 .../index/sai/disk/v1/TermsReader.java        |  306 +++
 .../sai/disk/v1/TraversingBKDReader.java      |  468 ++++
 .../disk/v1/TrieTermsDictionaryReader.java    |  284 +++
 .../disk/v1/TrieTermsDictionaryWriter.java    |   97 +
 .../index/sai/memory/InMemoryToken.java       |   54 +
 .../index/sai/memory/KeyRangeIterator.java    |   97 +
 .../index/sai/memory/MemoryIndex.java         |   80 +
 .../index/sai/memory/MemtableIndex.java       |  105 +
 .../index/sai/memory/RowMapping.java          |  205 ++
 .../index/sai/memory/TrieMemoryIndex.java     |  339 +++
 .../index/sai/metrics/AbstractMetrics.java    |   90 +
 .../index/sai/metrics/ColumnQueryMetrics.java |  146 ++
 .../index/sai/metrics/IndexGroupMetrics.java  |   39 +
 .../index/sai/metrics/IndexMetrics.java       |   68 +
 .../metrics/MulticastQueryEventListeners.java |  168 ++
 .../index/sai/metrics/QueryEventListener.java |  116 +
 .../index/sai/metrics/TableQueryMetrics.java  |  199 ++
 .../index/sai/metrics/TableStateMetrics.java  |   52 +
 .../cassandra/index/sai/plan/Expression.java  |  425 ++++
 .../cassandra/index/sai/plan/FilterTree.java  |  244 ++
 .../cassandra/index/sai/plan/Operation.java   |  478 ++++
 .../index/sai/plan/QueryController.java       |  393 ++++
 .../plan/StorageAttachedIndexQueryPlan.java   |  142 ++
 .../plan/StorageAttachedIndexSearcher.java    |  311 +++
 .../sai/utils/AbortedOperationException.java  |   36 +
 .../index/sai/utils/AbstractIterator.java     |  161 ++
 .../cassandra/index/sai/utils/LongArray.java  |  127 ++
 .../index/sai/utils/NamedMemoryLimiter.java   |   79 +
 .../index/sai/utils/OffsetFactory.java        |  157 ++
 .../cassandra/index/sai/utils/PrimaryKey.java |  172 ++
 .../index/sai/utils/PrimaryKeys.java          |  226 ++
 .../index/sai/utils/RangeConcatIterator.java  |  128 ++
 .../sai/utils/RangeIntersectionIterator.java  |  258 +++
 .../index/sai/utils/RangeIterator.java        |  370 +++
 .../index/sai/utils/RangeUnionIterator.java   |  213 ++
 .../index/sai/utils/SAICodecUtils.java        |  266 +++
 .../sai/utils/SeekingRandomAccessInput.java   |   70 +
 .../index/sai/utils/TermIterator.java         |  135 ++
 .../cassandra/index/sai/utils/TypeUtil.java   |  533 +++++
 .../index/sai/view/IndexViewManager.java      |  161 ++
 .../index/sai/view/RangeTermTree.java         |  124 +
 .../cassandra/index/sai/view/TermTree.java    |   61 +
 .../apache/cassandra/index/sai/view/View.java |  129 ++
 .../index/sai/virtual/IndexesSystemView.java  |  153 ++
 .../index/sai/virtual/SSTablesSystemView.java |  130 ++
 .../index/sai/virtual/SegmentsSystemView.java |  146 ++
 .../cassandra/schema/IndexMetadata.java       |    6 +
 .../service/reads/range/RangeCommands.java    |    2 +-
 .../config/DatabaseDescriptorRefTest.java     |    1 +
 .../index/sai/IndexingSchemaLoader.java       |  205 ++
 .../apache/cassandra/index/sai/SAITester.java |  750 ++++++
 .../analyzer/NonTokenizingAnalyzerTest.java   |  124 +
 .../filter/BasicResultFiltersTest.java        |   72 +
 .../index/sai/cql/AllowFilteringTest.java     |  437 ++++
 .../index/sai/cql/ClusteringKeyIndexTest.java |  115 +
 .../index/sai/cql/CollectionIndexingTest.java |  261 +++
 .../cql/CompositePartitionKeyIndexTest.java   |  143 ++
 .../cassandra/index/sai/cql/DataModel.java    |  612 +++++
 .../index/sai/cql/DecimalLargeValueTest.java  |  151 ++
 .../index/sai/cql/DuplicateRowIDTest.java     |   78 +
 .../sai/cql/IndexOperatorSupportTest.java     |   74 +
 .../index/sai/cql/IndexQuerySupport.java      |  631 ++++++
 .../cql/InetAddressTypeEquivalencyTest.java   |  206 ++
 .../cql/MixedIndexImplementationsTest.java    |  236 ++
 .../sai/cql/MultipleColumnIndexTest.java      |   75 +
 .../index/sai/cql/NativeIndexDDLTest.java     | 1348 +++++++++++
 .../sai/cql/PartitionRestrictedQueryTest.java |  211 ++
 .../index/sai/cql/QueryCellDeletionsTest.java |   29 +
 .../index/sai/cql/QueryRowDeletionsTest.java  |   29 +
 .../index/sai/cql/QueryTimeToLiveTest.java    |   29 +
 .../index/sai/cql/QueryTimeoutTest.java       |  133 ++
 .../sai/cql/QueryWriteLifecycleTest.java      |   30 +
 .../TinySegmentQueryCellDeletionsTest.java    |   41 +
 .../cql/TinySegmentQueryRowDeletionsTest.java |   41 +
 .../cql/TinySegmentQueryTimeToLiveTest.java   |   41 +
 .../TinySegmentQueryWriteLifecycleTest.java   |   41 +
 .../index/sai/cql/TokenCollisionTest.java     |   69 +
 .../index/sai/cql/types/AsciiTest.java        |   45 +
 .../index/sai/cql/types/BigintTest.java       |   45 +
 .../index/sai/cql/types/DataSet.java          |  679 ++++++
 .../index/sai/cql/types/DateTest.java         |   46 +
 .../index/sai/cql/types/DecimalTest.java      |   45 +
 .../index/sai/cql/types/DoubleTest.java       |   45 +
 .../index/sai/cql/types/FloatTest.java        |   45 +
 .../sai/cql/types/IndexingTypeSupport.java    |  151 ++
 .../index/sai/cql/types/InetTest.java         |   45 +
 .../index/sai/cql/types/IntTest.java          |   45 +
 .../sai/cql/types/NumericTypeSortingTest.java |   96 +
 .../index/sai/cql/types/QuerySet.java         |  500 ++++
 .../index/sai/cql/types/SmallintTest.java     |   45 +
 .../index/sai/cql/types/TextTest.java         |   45 +
 .../index/sai/cql/types/TimeTest.java         |   45 +
 .../index/sai/cql/types/TimestampTest.java    |   45 +
 .../index/sai/cql/types/TimeuuidTest.java     |   45 +
 .../index/sai/cql/types/TinyintTest.java      |   45 +
 .../index/sai/cql/types/UuidTest.java         |   45 +
 .../index/sai/cql/types/VarintTest.java       |   45 +
 .../types/collections/CollectionDataSet.java  |  284 +++
 .../lists/FrozenListAsciiTest.java            |   49 +
 .../lists/FrozenListDecimalTest.java          |   49 +
 .../collections/lists/FrozenListIntTest.java  |   49 +
 .../lists/FrozenListVarintTest.java           |   49 +
 .../collections/lists/ListAsciiTest.java      |   49 +
 .../collections/lists/ListBigintTest.java     |   49 +
 .../types/collections/lists/ListDateTest.java |   49 +
 .../collections/lists/ListDecimalTest.java    |   49 +
 .../collections/lists/ListDoubleTest.java     |   49 +
 .../collections/lists/ListFloatTest.java      |   49 +
 .../lists/ListFrozenCollectionTest.java       |   51 +
 .../types/collections/lists/ListInetTest.java |   49 +
 .../types/collections/lists/ListIntTest.java  |   49 +
 .../collections/lists/ListSmallintTest.java   |   49 +
 .../types/collections/lists/ListTextTest.java |   49 +
 .../types/collections/lists/ListTimeTest.java |   49 +
 .../collections/lists/ListTimestampTest.java  |   49 +
 .../collections/lists/ListTimeuuidTest.java   |   49 +
 .../collections/lists/ListTinyintTest.java    |   49 +
 .../types/collections/lists/ListUuidTest.java |   49 +
 .../collections/lists/ListVarintTest.java     |   49 +
 .../collections/maps/FrozenMapAsciiTest.java  |   49 +
 .../maps/FrozenMapDecimalTest.java            |   49 +
 .../collections/maps/FrozenMapIntTest.java    |   49 +
 .../collections/maps/FrozenMapVarintTest.java |   49 +
 .../types/collections/maps/MapAsciiTest.java  |   49 +
 .../types/collections/maps/MapBigintTest.java |   49 +
 .../types/collections/maps/MapDateTest.java   |   49 +
 .../collections/maps/MapDecimalTest.java      |   50 +
 .../types/collections/maps/MapDoubleTest.java |   49 +
 .../collections/maps/MapEntriesAsciiTest.java |   49 +
 .../maps/MapEntriesFrozenCollectionTest.java  |   51 +
 .../collections/maps/MapEntriesIntTest.java   |   49 +
 .../maps/MapEntriesVarintTest.java            |   49 +
 .../types/collections/maps/MapFloatTest.java  |   49 +
 .../maps/MapFrozenCollectionTest.java         |   51 +
 .../types/collections/maps/MapInetTest.java   |   49 +
 .../types/collections/maps/MapIntTest.java    |   49 +
 .../collections/maps/MapKeysAsciiTest.java    |   49 +
 .../maps/MapKeysFrozenCollectionTest.java     |   51 +
 .../collections/maps/MapKeysIntTest.java      |   49 +
 .../collections/maps/MapKeysVarintTest.java   |   49 +
 .../collections/maps/MapSmallintTest.java     |   49 +
 .../types/collections/maps/MapTextTest.java   |   49 +
 .../types/collections/maps/MapTimeTest.java   |   49 +
 .../collections/maps/MapTimestampTest.java    |   49 +
 .../collections/maps/MapTimeuuidTest.java     |   49 +
 .../collections/maps/MapTinyintTest.java      |   49 +
 .../types/collections/maps/MapUuidTest.java   |   49 +
 .../collections/maps/MapValuesAsciiTest.java  |   49 +
 .../maps/MapValuesFrozenCollectionTest.java   |   51 +
 .../collections/maps/MapValuesIntTest.java    |   49 +
 .../collections/maps/MapValuesVarintTest.java |   49 +
 .../types/collections/maps/MapVarintTest.java |   49 +
 .../collections/maps/MultiMapAsciiTest.java   |   49 +
 .../collections/maps/MultiMapIntTest.java     |   49 +
 .../collections/maps/MultiMapVarintTest.java  |   49 +
 .../collections/sets/FrozenSetAsciiTest.java  |   49 +
 .../sets/FrozenSetDecimalTest.java            |   49 +
 .../collections/sets/FrozenSetIntTest.java    |   49 +
 .../collections/sets/FrozenSetVarintTest.java |   49 +
 .../types/collections/sets/SetAsciiTest.java  |   49 +
 .../types/collections/sets/SetBigintTest.java |   49 +
 .../types/collections/sets/SetDateTest.java   |   49 +
 .../collections/sets/SetDecimalTest.java      |   49 +
 .../types/collections/sets/SetDoubleTest.java |   49 +
 .../types/collections/sets/SetFloatTest.java  |   49 +
 .../sets/SetFrozenCollectionTest.java         |   51 +
 .../types/collections/sets/SetInetTest.java   |   49 +
 .../types/collections/sets/SetIntTest.java    |   49 +
 .../collections/sets/SetSmallintTest.java     |   49 +
 .../types/collections/sets/SetTextTest.java   |   49 +
 .../types/collections/sets/SetTimeTest.java   |   49 +
 .../collections/sets/SetTimestampTest.java    |   49 +
 .../collections/sets/SetTimeuuidTest.java     |   49 +
 .../collections/sets/SetTinyintTest.java      |   49 +
 .../types/collections/sets/SetUuidTest.java   |   49 +
 .../types/collections/sets/SetVarintTest.java |   49 +
 .../index/sai/disk/BKDQueriesTest.java        |  208 ++
 .../disk/ImmutableOneDimPointValuesTest.java  |  130 ++
 .../index/sai/disk/InvertedIndexBuilder.java  |   61 +
 .../sai/disk/InvertedIndexSearcherTest.java   |  203 ++
 .../index/sai/disk/KDTreeIndexBuilder.java    |  332 +++
 .../sai/disk/KDTreeIndexSearcherTest.java     |  236 ++
 .../sai/disk/KDTreeSegmentMergerTest.java     |  277 +++
 .../index/sai/disk/NodeStartupTest.java       |  369 +++
 .../index/sai/disk/RAMPostingSlicesTest.java  |   77 +
 .../index/sai/disk/RAMStringIndexerTest.java  |  113 +
 .../index/sai/disk/SegmentFlushTest.java      |  231 ++
 .../index/sai/disk/SegmentMergerTest.java     |  194 ++
 .../cassandra/index/sai/disk/SegmentTest.java |  164 ++
 .../sai/disk/SelectiveIntersectionTest.java   |  157 ++
 .../sai/disk/SingleNodeQueryFailureTest.java  |  133 ++
 .../sai/disk/TermsIteratorMergerTest.java     |  170 ++
 .../index/sai/disk/TypeUtilTest.java          |  222 ++
 .../index/sai/disk/format/VersionTest.java    |   89 +
 .../sai/disk/io/BKDTempFilesDirectory.java    |  152 ++
 .../disk/io/BKDTempFilesDirectoryTest.java    |  110 +
 .../index/sai/disk/io/BytesRefUtilTest.java   |   49 +
 .../sai/disk/io/TrackingIndexComponents.java  |   81 +
 .../index/sai/disk/v1/BKDReaderTest.java      |  395 ++++
 .../sai/disk/v1/BKDTreeRamBufferTest.java     |   79 +
 .../sai/disk/v1/FilteringPostingListTest.java |  168 ++
 .../index/sai/disk/v1/LeafOrderMapTest.java   |   56 +
 .../sai/disk/v1/MergePostingListTest.java     |  337 +++
 .../index/sai/disk/v1/MetadataTest.java       |  170 ++
 .../sai/disk/v1/NumericIndexWriterTest.java   |  201 ++
 .../index/sai/disk/v1/NumericValuesTest.java  |  219 ++
 .../disk/v1/OneDimBKDPostingsWriterTest.java  |  169 ++
 .../index/sai/disk/v1/PostingsTest.java       |  349 +++
 .../index/sai/disk/v1/SorterTest.java         |   77 +
 .../index/sai/disk/v1/TermsReaderTest.java    |  164 ++
 .../sai/disk/v1/TrieTermsDictionaryTest.java  |  151 ++
 .../index/sai/functional/CompactionTest.java  |  362 +++
 .../index/sai/functional/DiskSpaceTest.java   |   56 +
 .../index/sai/functional/DropTableTest.java   |  108 +
 .../index/sai/functional/FailureTest.java     |  123 +
 .../index/sai/functional/FlushingTest.java    |   80 +
 .../index/sai/functional/NodeRestartTest.java |  181 ++
 .../index/sai/functional/SnapshotTest.java    |  163 ++
 .../memory/AbstractKeyRangeIteratorTest.java  |  171 ++
 .../sai/memory/KeyRangeIteratorTest.java      |   38 +
 .../memory/PriorityKeyRangeIteratorTest.java  |   37 +
 .../index/sai/memory/TrieMemoryIndexTest.java |  156 ++
 .../sai/metrics/AbstractMetricsTest.java      |   86 +
 .../FinalSegmentFlushingFailureTest.java      |   27 +
 .../sai/metrics/IndexGroupMetricsTest.java    |  107 +
 .../index/sai/metrics/IndexMetricsTest.java   |  142 ++
 .../sai/metrics/QueryEventListeners.java      |   95 +
 .../index/sai/metrics/QueryMetricsTest.java   |  357 +++
 .../metrics/SegmentFlushingFailureTest.java   |  283 +++
 .../index/sai/metrics/StateMetricsTest.java   |  103 +
 .../TinySegmentFlushingFailureTest.java       |   38 +
 .../index/sai/plan/ExpressionTest.java        |   54 +
 .../index/sai/plan/OperationTest.java         |  713 ++++++
 .../sai/utils/AbstractRangeIteratorTest.java  |  138 ++
 .../index/sai/utils/ArrayPostingList.java     |  101 +
 .../index/sai/utils/ArrayPostingListTest.java |   41 +
 .../sai/utils/DeferredRangeIteratorTest.java  |  103 +
 .../utils/IndexComponentsLeakDetector.java    |   63 +
 .../cassandra/index/sai/utils/LongArrays.java |   97 +
 .../index/sai/utils/LongIterator.java         |  167 ++
 .../index/sai/utils/LongIteratorTest.java     |   66 +
 .../sai/utils/NamedMemoryLimiterTest.java     |   82 +
 .../index/sai/utils/NdiRandomizedTest.java    |  201 ++
 .../index/sai/utils/OffsetFactoryTest.java    |   88 +
 .../index/sai/utils/PostingListTest.java      |   64 +
 .../sai/utils/RangeConcatIteratorTest.java    |  437 ++++
 .../utils/RangeIntersectionIteratorTest.java  |  434 ++++
 .../sai/utils/RangeUnionIteratorTest.java     |  451 ++++
 .../index/sai/view/IndexViewManagerTest.java  |  252 +++
 .../sai/virtual/IndexesSystemViewTest.java    |  169 ++
 .../sai/virtual/SSTablesSystemViewTest.java   |  192 ++
 .../sai/virtual/SegmentsSystemViewTest.java   |  231 ++
 334 files changed, 51816 insertions(+), 1 deletion(-)
 create mode 100644 src/java/org/apache/cassandra/index/sai/ColumnContext.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/QueryContext.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/README.md
 create mode 100644 src/java/org/apache/cassandra/index/sai/SSTableContext.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/SSTableContextManager.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/SSTableIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/SSTableQueryContext.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/Token.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/NoOpAnalyzer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFilters.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineBuilder.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineTask.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/BKDQueries.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/ByteSliceReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/ColumnIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValues.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/IndexSearcher.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/IndexWriterConfig.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/InvertedIndexSearcher.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcher.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/MemtableTermsIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/MergeOneDimPointValues.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/MergingIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/MutableOneDimPointValues.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/OnDiskKeyProducer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/PostingList.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/PostingListRangeIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/RAMPostingSlices.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/RAMStringIndexer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/SSTableComponentsWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/Segment.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/SegmentBuilder.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/SegmentMerger.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/SegmentMetadata.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/TermsIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/TermsIteratorMerger.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/format/Version.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/BytesRefUtil.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/CryptoUtils.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/EmptyDirectory.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/FilterIndexInput.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/IndexComponents.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/IndexInputReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/IndexOutputWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/io/RAMIndexOutput.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BKDPostingsIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BKDReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBuffer.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BKDWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/DirectReaders.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/FilteringPostingList.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/LeafOrderMap.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/MergePostingList.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/MetadataSource.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/MetadataWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesMeta.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/OrdinalPostingList.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/PackedLongsPostingList.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/TermsReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/TraversingBKDReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryReader.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryWriter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/InMemoryToken.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/KeyRangeIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/RowMapping.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/AbstractMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/IndexGroupMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/IndexMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/TableQueryMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/metrics/TableStateMetrics.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/Expression.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/Operation.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/QueryController.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/AbortedOperationException.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/AbstractIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/LongArray.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/NamedMemoryLimiter.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/OffsetFactory.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/RangeConcatIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/RangeIntersectionIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/RangeIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/RangeUnionIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/SAICodecUtils.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/SeekingRandomAccessInput.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/TermIterator.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/view/IndexViewManager.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/view/RangeTermTree.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/view/TermTree.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/view/View.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
 create mode 100644 src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/IndexingSchemaLoader.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/SAITester.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFiltersTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/ClusteringKeyIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/CollectionIndexingTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/DecimalLargeValueTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/DuplicateRowIDTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/InetAddressTypeEquivalencyTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/MixedIndexImplementationsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/MultipleColumnIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/PartitionRestrictedQueryTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/QueryTimeoutTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/TokenCollisionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/AsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/BigintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/DateTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/DecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/DoubleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/FloatTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/InetTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/IntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/NumericTypeSortingTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/SmallintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/TextTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/TimeTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/TimestampTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/TimeuuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/TinyintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/UuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/VarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/CollectionDataSet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListBigintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDateTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDoubleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFloatTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListInetTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListSmallintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTextTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimestampTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeuuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTinyintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListUuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapBigintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDateTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDoubleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFloatTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapInetTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapSmallintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTextTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimestampTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeuuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTinyintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapUuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetAsciiTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetBigintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDateTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDecimalTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDoubleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFloatTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFrozenCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetInetTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetIntTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetSmallintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTextTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimestampTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeuuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTinyintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetUuidTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetVarintTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/BKDQueriesTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValuesTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexBuilder.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexSearcherTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexBuilder.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcherTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/KDTreeSegmentMergerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/NodeStartupTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/RAMPostingSlicesTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/RAMStringIndexerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/SegmentFlushTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/SegmentMergerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/SegmentTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/SelectiveIntersectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/SingleNodeQueryFailureTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/TermsIteratorMergerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/format/VersionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectory.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectoryTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/io/BytesRefUtilTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/io/TrackingIndexComponents.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/BKDReaderTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBufferTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/FilteringPostingListTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/LeafOrderMapTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/MergePostingListTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/MetadataTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriterTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/NumericValuesTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriterTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/SorterTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/TermsReaderTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/DiskSpaceTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/DropTableTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/FlushingTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/NodeRestartTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/SnapshotTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/memory/AbstractKeyRangeIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/memory/KeyRangeIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/memory/PriorityKeyRangeIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/AbstractMetricsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/FinalSegmentFlushingFailureTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/IndexGroupMetricsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/IndexMetricsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/SegmentFlushingFailureTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/StateMetricsTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/metrics/TinySegmentFlushingFailureTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/plan/ExpressionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/AbstractRangeIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingList.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingListTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/DeferredRangeIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/IndexComponentsLeakDetector.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/LongArrays.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/LongIterator.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/LongIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/NamedMemoryLimiterTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/NdiRandomizedTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/OffsetFactoryTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/PostingListTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/RangeConcatIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/RangeIntersectionIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/utils/RangeUnionIteratorTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/virtual/SSTablesSystemViewTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/virtual/SegmentsSystemViewTest.java

diff --git a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java
index 92da4afda53e..fe7591c1b979 100644
--- a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java
+++ b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java
@@ -19,6 +19,10 @@
 
 import com.google.common.collect.ImmutableList;
 
+import org.apache.cassandra.index.sai.virtual.IndexesSystemView;
+import org.apache.cassandra.index.sai.virtual.SSTablesSystemView;
+import org.apache.cassandra.index.sai.virtual.SegmentsSystemView;
+
 import static org.apache.cassandra.schema.SchemaConstants.VIRTUAL_VIEWS;
 
 public final class SystemViewsKeyspace extends VirtualKeyspace
@@ -36,6 +40,9 @@ private SystemViewsKeyspace()
                     .add(new ThreadPoolsTable(VIRTUAL_VIEWS))
                     .add(new InternodeOutboundTable(VIRTUAL_VIEWS))
                     .add(new InternodeInboundTable(VIRTUAL_VIEWS))
+                    .add(new SSTablesSystemView(VIRTUAL_VIEWS))
+                    .add(new SegmentsSystemView(VIRTUAL_VIEWS))
+                    .add(new IndexesSystemView(VIRTUAL_VIEWS))
                     .addAll(TableMetricTables.getAll(VIRTUAL_VIEWS))
                     .build());
     }
diff --git a/src/java/org/apache/cassandra/index/sai/ColumnContext.java b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
new file mode 100644
index 000000000000..1f73ef426125
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
@@ -0,0 +1,591 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.TargetParser;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.disk.IndexSearcher;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.memory.MemtableIndex;
+import org.apache.cassandra.index.sai.metrics.ColumnQueryMetrics;
+import org.apache.cassandra.index.sai.metrics.IndexMetrics;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.RangeUnionIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.index.sai.view.IndexViewManager;
+import org.apache.cassandra.index.sai.view.View;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Manage metadata for each column index.
+ */
+public class ColumnContext
+{
+    private static final Logger logger = LoggerFactory.getLogger(ColumnContext.class);
+
+    private static final Set<AbstractType<?>> EQ_ONLY_TYPES =
+            ImmutableSet.of(UTF8Type.instance, AsciiType.instance, BooleanType.instance, UUIDType.instance);
+
+    private final AbstractType<?> partitionKeyType;
+    private final ClusteringComparator clusteringComparator;
+
+    private final String keyspace;
+    private final String table;
+    private final Pair<ColumnMetadata, IndexTarget.Type> target;
+    private final AbstractType<?> validator;
+
+    // Config can be null if the column context is "fake" (i.e. created for a filtering expression).
+    private final IndexMetadata config;
+
+    private final ConcurrentMap<Memtable, MemtableIndex> liveMemtables = new ConcurrentHashMap<>();
+
+    private final IndexViewManager viewManager;
+    private final IndexMetrics indexMetrics;
+    private final ColumnQueryMetrics columnQueryMetrics;
+    private final IndexWriterConfig indexWriterConfig;
+
+    public ColumnContext(TableMetadata tableMeta, IndexMetadata metadata)
+    {
+        this.keyspace = tableMeta.keyspace;
+        this.table = tableMeta.name;
+        this.partitionKeyType = tableMeta.partitionKeyType;
+        this.clusteringComparator = tableMeta.comparator;
+        this.target = TargetParser.parse(tableMeta, metadata);
+        this.config = metadata;
+        this.viewManager = new IndexViewManager(this);
+        this.indexMetrics = new IndexMetrics(this, tableMeta);
+        this.validator = TypeUtil.cellValueType(target);
+
+        String fullIndexName = String.format("%s.%s.%s", this.keyspace, this.table, this.config.name);
+        this.indexWriterConfig = IndexWriterConfig.fromOptions(fullIndexName, validator, config.options);
+        this.columnQueryMetrics = isLiteral() ? new ColumnQueryMetrics.TrieIndexMetrics(getIndexName(), tableMeta)
+                                              : new ColumnQueryMetrics.BKDIndexMetrics(getIndexName(), tableMeta);
+
+        logger.info(logMessage("Initialized column context with index writer config: {}"),
+                this.indexWriterConfig.toString());
+    }
+
+    @VisibleForTesting
+    public ColumnContext(String keyspace,
+                         String table,
+                         AbstractType<?> partitionKeyType,
+                         ClusteringComparator clusteringComparator,
+                         ColumnMetadata column,
+                         IndexMetadata config,
+                         IndexWriterConfig indexWriterConfig)
+    {
+        this.keyspace = keyspace;
+        this.table = table;
+        this.partitionKeyType = partitionKeyType;
+        this.clusteringComparator = clusteringComparator;
+        this.target = Pair.create(column, IndexTarget.Type.SIMPLE);
+        this.validator = column.type;
+        this.config = config;
+        this.viewManager = null;
+        this.indexMetrics = null;
+        this.columnQueryMetrics = null;
+        this.indexWriterConfig = indexWriterConfig;
+    }
+
+    public ColumnContext(TableMetadata table, ColumnMetadata column)
+    {
+        this.keyspace = table.keyspace;
+        this.table = table.name;
+        this.partitionKeyType = table.partitionKeyType;
+        this.clusteringComparator = table.comparator;
+        this.target = TargetParser.parse(table, column.name.toString());
+        this.validator = target == null ? null : TypeUtil.cellValueType(target);
+        this.config = null;
+        this.viewManager = null;
+        this.indexMetrics = null;
+        this.columnQueryMetrics = null;
+        this.indexWriterConfig = IndexWriterConfig.emptyConfig();
+    }
+
+    public AbstractType<?> keyValidator()
+    {
+        return partitionKeyType;
+    }
+
+    public ClusteringComparator clusteringComparator()
+    {
+        return clusteringComparator;
+    }
+
+    public IndexMetrics getIndexMetrics()
+    {
+        return indexMetrics;
+    }
+
+    public ColumnQueryMetrics getColumnQueryMetrics()
+    {
+        return columnQueryMetrics;
+    }
+
+    public String getTable()
+    {
+        return table;
+    }
+
+    public long index(DecoratedKey key, Row row, Memtable mt)
+    {
+        MemtableIndex current = liveMemtables.get(mt);
+
+        // We expect the relevant MemtableIndex to be present most of the time, so only make the
+        // call to computeIfAbsent() if it's not. (see https://bugs.openjdk.java.net/browse/JDK-8161372)
+        MemtableIndex target = (current != null)
+                               ? current
+                               : liveMemtables.computeIfAbsent(mt, memtable -> new MemtableIndex(this, mt));
+
+        long start = System.nanoTime();
+
+        long bytes = 0; // running total of what MemtableIndex#index reports for this row; returned to the caller
+
+        if (isNonFrozenCollection())
+        {
+            Iterator<ByteBuffer> bufferIterator = getValuesOf(row, FBUtilities.nowInSeconds());
+            if (bufferIterator != null)
+            {
+                while (bufferIterator.hasNext())
+                {
+                    ByteBuffer value = bufferIterator.next();
+                    bytes += target.index(key, row.clustering(), value);
+                }
+            }
+        }
+        else
+        {
+            ByteBuffer value = getValueOf(key, row, FBUtilities.nowInSeconds());
+            bytes += target.index(key, row.clustering(), value); // accumulate here too, otherwise non-collection columns always report 0
+        }
+        indexMetrics.memtableIndexWriteLatency.update(System.nanoTime() - start, TimeUnit.NANOSECONDS);
+        return bytes;
+    }
+
+    public void renewMemtable(Memtable renewed)
+    {
+        for (Memtable memtable : liveMemtables.keySet())
+        {
+            // remove every index but the one that corresponds to the post-truncate Memtable
+            if (renewed != memtable)
+            {
+                liveMemtables.remove(memtable);
+            }
+        }
+    }
+
+    public void discardMemtable(Memtable discarded)
+    {
+        liveMemtables.remove(discarded);
+    }
+
+    public MemtableIndex getPendingMemtableIndex(LifecycleNewTracker tracker)
+    {
+        return liveMemtables.keySet().stream()
+                            .filter(m -> tracker.equals(m.tracker()))
+                            .findFirst()
+                            .map(liveMemtables::get)
+                            .orElse(null);
+    }
+
+    public RangeIterator searchMemtable(Expression e, AbstractBounds<PartitionPosition> keyRange)
+    {
+        Collection<MemtableIndex> memtables = liveMemtables.values();
+
+        if (memtables.isEmpty())
+        {
+            return RangeIterator.empty();
+        }
+
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+
+        for (MemtableIndex index : memtables)
+        {
+            builder.add(index.search(e, keyRange));
+        }
+
+        return builder.build();
+    }
+
+    public long liveMemtableWriteCount()
+    {
+        return liveMemtables.values().stream().mapToLong(MemtableIndex::writeCount).sum();
+    }
+
+    public long estimatedMemIndexMemoryUsed()
+    {
+        return liveMemtables.values().stream().mapToLong(MemtableIndex::estimatedMemoryUsed).sum();
+    }
+
+    /**
+     * @return A set of SSTables which have attached to them invalid index components.
+     */
+    public Set<SSTableContext> onSSTableChanged(Collection<SSTableReader> oldSSTables, Collection<SSTableContext> newSSTables, boolean validate, boolean rename)
+    {
+        return viewManager.update(oldSSTables, newSSTables, validate, rename);
+    }
+
+    public ColumnMetadata getDefinition()
+    {
+        return target.left;
+    }
+
+    public AbstractType<?> getValidator()
+    {
+        return validator;
+    }
+
+    public boolean isNonFrozenCollection()
+    {
+        return TypeUtil.isNonFrozenCollection(target.left.type);
+    }
+
+    public boolean isFrozenCollection()
+    {
+        return TypeUtil.isFrozenCollection(target.left.type);
+    }
+
+    public String getColumnName()
+    {
+        return target.left.name.toString();
+    }
+
+    public String getIndexName()
+    {
+        return this.config == null ? null : config.name;
+    }
+
+    public AbstractAnalyzer getAnalyzer()
+    {
+        Map<String, String> options = config != null ? config.options : Collections.emptyMap();
+        return AbstractAnalyzer.fromOptions(getValidator(), options);
+    }
+
+    public IndexWriterConfig getIndexWriterConfig()
+    {
+        return indexWriterConfig;
+    }
+
+    public View getView()
+    {
+        return viewManager.getView();
+    }
+
+    /**
+     * @return total number of per-index open files
+     */
+    public int openPerIndexFiles()
+    {
+        return viewManager.getView().size() * IndexSearcher.openPerIndexFiles(getValidator());
+    }
+
+    public void drop(Collection<SSTableReader> sstablesToRebuild)
+    {
+        viewManager.drop(sstablesToRebuild);
+    }
+
+    public boolean isIndexed()
+    {
+        return config != null;
+    }
+
+    /**
+     * Called when the index is dropped. Marks all {@link SSTableIndex} instances as obsolete; per-column
+     * index files will be removed once in-flight queries have completed.
+     */
+    public void invalidate()
+    {
+        liveMemtables.clear();
+        viewManager.invalidate();
+        indexMetrics.release();
+        columnQueryMetrics.release();
+    }
+
+    @VisibleForTesting
+    public ConcurrentMap<Memtable, MemtableIndex> getLiveMemtables()
+    {
+        return liveMemtables;
+    }
+
+    public boolean supports(Operator op)
+    {
+        if (op.isLike() || op == Operator.LIKE) return false; // LIKE-style operators are rejected outright
+
+        Expression.Op operator = Expression.Op.valueOf(op); // may be null; guarded by the null check in the final return
+        IndexTarget.Type type = target.right;
+
+        if (isNonFrozenCollection())
+        {
+            if (type == IndexTarget.Type.KEYS) return operator == Expression.Op.CONTAINS_KEY;
+            if (type == IndexTarget.Type.VALUES) return operator == Expression.Op.CONTAINS_VALUE;
+            return type == IndexTarget.Type.KEYS_AND_VALUES && operator == Expression.Op.EQ;
+        }
+
+        if (type == IndexTarget.Type.FULL)
+            return operator == Expression.Op.EQ;
+
+        AbstractType<?> validator = getValidator();
+
+        if (operator != Expression.Op.EQ && EQ_ONLY_TYPES.contains(validator)) return false; // these types accept equality only
+
+        // RANGE only applicable to non-literal indexes
+        return (operator != null) && !(TypeUtil.isLiteral(validator) && operator == Expression.Op.RANGE);
+    }
+
+    public ByteBuffer getValueOf(DecoratedKey key, Row row, int nowInSecs)
+    {
+        if (row == null)
+            return null;
+
+        switch (target.left.kind)
+        {
+            case PARTITION_KEY:
+                return partitionKeyType instanceof CompositeType
+                       ? CompositeType.extractComponent(key.getKey(), target.left.position())
+                       : key.getKey();
+            case CLUSTERING:
+                // skip indexing of static clustering when regular column is indexed
+                return row.isStatic() ? null : row.clustering().bufferAt(target.left.position());
+
+            // A static column is read the same way as a regular one, but only from a static row;
+            // for any other row return null. STATIC intentionally falls through to REGULAR.
+            case STATIC:
+                if (!row.isStatic())
+                    return null;
+            case REGULAR:
+                Cell cell = row.getCell(target.left);
+                return cell == null || !cell.isLive(nowInSecs) ? null : cell.buffer();
+
+            default:
+                return null;
+        }
+    }
+
+    public Iterator<ByteBuffer> getValuesOf(Row row, int nowInSecs)
+    {
+        if (row == null)
+            return null;
+
+        switch (target.left.kind)
+        {
+            // A static column is read the same way as a regular one, but only from a static row;
+            // for any other row return null. STATIC intentionally falls through to REGULAR.
+            case STATIC:
+                if (!row.isStatic())
+                    return null;
+            case REGULAR:
+                return TypeUtil.collectionIterator(validator, (ComplexColumnData)row.getComplexColumnData(target.left), target, nowInSecs);
+
+            default:
+                return null;
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("columnName", getColumnName())
+                          .add("indexName", getIndexName())
+                          .toString();
+    }
+
+    public boolean isLiteral()
+    {
+        return TypeUtil.isLiteral(getValidator());
+    }
+
+    public boolean equals(Object obj)
+    {
+        if (obj == this)
+            return true;
+
+        if (!(obj instanceof ColumnContext))
+            return false;
+
+        ColumnContext other = (ColumnContext) obj;
+
+        return Objects.equals(target, other.target) &&
+                Objects.equals(config, other.config) &&
+                Objects.equals(partitionKeyType, other.partitionKeyType) &&
+                Objects.equals(clusteringComparator, other.clusteringComparator);
+    }
+
+    public int hashCode()
+    {
+        return Objects.hash(target, config, partitionKeyType, clusteringComparator);
+    }
+
+    /**
+     * A helper method for constructing consistent log messages for specific column indexes.
+     *
+     * Example: For the index "idx" in keyspace "ks" on table "tb", calling this method with the raw message
+     * "Flushing new index segment..." will produce...
+     *
+     * "[ks.tb.idx] Flushing new index segment..."
+     *
+     * @param message The raw content of a logging message, without information identifying it with an index.
+     *
+     * @return A log message with the proper keyspace, table and index name prepended to it.
+     */
+    public String logMessage(String message)
+    {
+        // Index names are unique only within a keyspace.
+        return String.format("[%s.%s.%s] %s", keyspace, table, config == null ? "?" : config.name, message);
+    }
+
+    /**
+     * @return the indexes that are built on the given SSTables on the left and corrupted indexes'
+     * corresponding contexts on the right
+     */
+    public Pair<Set<SSTableIndex>, Set<SSTableContext>> getBuiltIndexes(Collection<SSTableContext> sstableContexts, boolean validate, boolean rename)
+    {
+        Set<SSTableIndex> valid = new HashSet<>(sstableContexts.size());
+        Set<SSTableContext> invalid = new HashSet<>();
+
+        for (SSTableContext context : sstableContexts)
+        {
+            if (context.sstable.isMarkedCompacted())
+                continue;
+
+            if (!IndexComponents.isColumnIndexComplete(context.descriptor(), getIndexName()))
+            {
+                logger.debug(logMessage("An on-disk index build for SSTable {} has not completed."), context.descriptor());
+                continue;
+            }
+
+            if (IndexComponents.isColumnIndexEmpty(context.descriptor(), getIndexName()))
+            {
+                logger.debug(logMessage("No on-disk index was built for SSTable {} because the SSTable " +
+                                                "had no indexable rows for the index."), context.descriptor());
+                continue;
+            }
+
+            // TODO: does the column name need to be encoded since it's being included in a filename?
+            final IndexComponents components = IndexComponents.create(getIndexName(), context.sstable());
+
+            try
+            {
+                if (validate)
+                {
+                    components.validatePerColumnComponents(isLiteral());
+                }
+
+                SSTableIndex index = new SSTableIndex(context, this, components);
+                logger.debug(logMessage("Successfully created index for SSTable {}."), context.descriptor());
+
+                // Try to add new index to the set, if set already has such index, we'll simply release and move on.
+                // This covers situation when SSTable collection has the same SSTable multiple
+                // times because we don't know what kind of collection it actually is.
+                if (!valid.add(index))
+                {
+                    index.release();
+                }
+            }
+            catch (Throwable e)
+            {
+                invalid.add(context);
+                logger.warn(logMessage("Invalid per-column component for SSTable {}"), context.descriptor(), e);
+            }
+        }
+
+        return Pair.create(valid, invalid);
+    }
+
+    /**
+     * @return the number of indexed rows in this index (aka. pair of term and rowId)
+     */
+    public long getCellCount()
+    {
+        return getView().getIndexes()
+                        .stream()
+                        .mapToLong(SSTableIndex::getRowCount)
+                        .sum();
+    }
+
+    /**
+     * @return the total size (in bytes) of per-column index components
+     */
+    public long diskUsage()
+    {
+        return getView().getIndexes()
+                        .stream()
+                        .mapToLong(SSTableIndex::sizeOfPerColumnComponents)
+                        .sum();
+    }
+
+    /**
+     * @return the total memory usage (in bytes) of per-column index on-disk data structure
+     */
+    public long indexFileCacheSize()
+    {
+        return getView().getIndexes()
+                        .stream()
+                        .mapToLong(SSTableIndex::indexFileCacheSize)
+                        .sum();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/QueryContext.java b/src/java/org/apache/cassandra/index/sai/QueryContext.java
new file mode 100644
index 000000000000..a5abf2a78d82
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/QueryContext.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.utils.AbortedOperationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+/**
+ * Tracks state relevant to the execution of a single query, including metrics and timeout monitoring.
+ *
+ * Fields here are non-volatile, as they are accessed from a single thread.
+ */
+@NotThreadSafe
+public class QueryContext
+{
+    private final long queryStartTimeNanos;
+
+    public final long executionQuotaNano;
+
+    public long sstablesHit = 0;
+    public long segmentsHit = 0;
+    public long partitionsRead = 0;
+    public long rowsFiltered = 0;
+
+    public long trieSegmentsHit = 0;
+
+    public long bkdPostingListsHit = 0;
+    public long bkdSegmentsHit = 0;
+
+    public long bkdPostingsSkips = 0;
+    public long bkdPostingsDecodes = 0;
+
+    public long triePostingsSkips = 0;
+    public long triePostingsDecodes = 0;
+
+    public long tokenSkippingCacheHits = 0;
+    public long tokenSkippingLookups = 0;
+
+    public long queryTimeouts = 0;
+
+    private final Map<SSTableReader, SSTableQueryContext> sstableQueryContexts = new HashMap<>();
+
+    @VisibleForTesting
+    public QueryContext()
+    {
+        this(DatabaseDescriptor.getRangeRpcTimeout(TimeUnit.MILLISECONDS));
+    }
+
+    public QueryContext(long executionQuotaMs)
+    {
+        this.executionQuotaNano = TimeUnit.MILLISECONDS.toNanos(executionQuotaMs);
+        queryStartTimeNanos = System.nanoTime();
+    }
+
+    public long totalQueryTimeNs()
+    {
+        return System.nanoTime() - queryStartTimeNanos;
+    }
+
+    public void incSstablesHit()
+    {
+        sstablesHit++;
+    }
+
+    public SSTableQueryContext getSSTableQueryContext(SSTableReader reader)
+    {
+        return sstableQueryContexts.computeIfAbsent(reader, k -> new SSTableQueryContext(this));
+    }
+
+    public void checkpoint()
+    {
+        if (totalQueryTimeNs() >= executionQuotaNano)
+        {
+            queryTimeouts++;
+            throw new AbortedOperationException();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/README.md b/src/java/org/apache/cassandra/index/sai/README.md
new file mode 100644
index 000000000000..a475b0c2222e
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/README.md
@@ -0,0 +1,129 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+ 
+     http://www.apache.org/licenses/LICENSE-2.0
+ 
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+# Storage-Attached Indexing
+
+## Overview
+Storage-attached indexes are a new column-based secondary indexing apparatus for DSE.
+
+The project was inspired by OSS SASI (SSTable-Attached Secondary Indexes) and retains some of its high-level
+architectural character (and even some actual code), but makes significant improvements in a number of areas:
+
+- The on-disk/SSTable index formats for both string and numeric data have been completely replaced. Strings are indexed
+  on disk using our proprietary on-disk byte-ordered trie data structure, while numeric types are indexed using Lucene's
+  balanced kd-tree.
+- While indexes continue to be managed at the column level from the user's perspective, the storage design at the column
+  index level is row-based, with related offset and token information stored only once at the SSTable level. This
+  drastically reduces our on-disk footprint when several columns are indexed on the same table.
+- The query path is synchronous and index searches run on IO threads.
+- Tracing, metrics, virtual table-based metadata, RLAC, and snapshot-based backup/restore are supported out of the box.
+
+Many similarities with standard secondary indexes remain:
+
+- The full set of C* consistency levels is supported for both reads and writes.
+- Index updates are synchronous with mutations and do not require any kind of read-before-write.
+- Queries are implemented on the back of C* range reads.
+- Paging is supported.
+- Only token ordering of results is supported.
+- Index builds are visible to operators as compactions and are executed on compaction threads.
+- All DML and DDL statements are CQL-based.
+- Single-node management operations are available via nodetool (e.g. stop & rebuild_index).
+
+## Quick Start
+
+The following short tutorial will get you up-and-running with storage-attached indexing.
+
+### Build and Start DSE
+
+1.) Make sure you've created the following directories and given yourself permissions on them:
+
+`/var/log/cassandra`
+
+`/var/lib/cassandra`
+
+2.) From the bdp root directory, run the following commands:
+
+`./gradlew jar`
+
+`bin/dse cassandra`
+
+3.) When the node stabilizes, open up `cqlsh` from the bdp root directory.
+
+`bin/cqlsh`
+
+### Create a Simple Data Model
+
+1.) Run the following DDL statements to create a table and two indexes:
+
+`CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy' , 'Cassandra': '1'};`
+
+`USE test;`
+
+`CREATE TABLE person (id int, name text, age int, PRIMARY KEY (id));`
+
+`CREATE CUSTOM INDEX ON person (name) USING 'StorageAttachedIndex' WITH OPTIONS = {'case_sensitive': false};`
+
+`CREATE CUSTOM INDEX ON person (age) USING 'StorageAttachedIndex';`
+
+2.) Add some data.
+
+`INSERT INTO person (id, name, age) VALUES (1, 'John', 21);`
+
+`INSERT INTO person (id, name, age) VALUES (2, 'john', 50);`
+
+`INSERT INTO person (id, name, age) VALUES (3, 'Boris', 43);`
+
+`INSERT INTO person (id, name, age) VALUES (4, 'Caleb', 34);`
+
+### Make Some Queries
+
+1.) Query for everyone named "John", ignoring case.
+
+`SELECT * FROM person WHERE name = 'John';`
+
+```
+ id | age | name
+----+-----+------
+  1 |  21 | John
+  2 |  50 | john
+```
+
+2.) Query for everyone between the ages of 18 and 35.
+
+`SELECT * FROM person WHERE age >= 18 AND age <= 35;`
+
+```
+ id | age | name
+----+-----+-------
+  1 |  21 |  John
+  4 |  34 | Caleb
+```
+
+## Contributors
+
+- Marc Selwan
+- Caleb Rackliffe
+- Zhao Yang
+- Jason Rutherglen
+- Maciej Zasada
+- Andrew de la Peña
+- Mike Adamson
+- Zahir Patni
+- Tomek Lasica
+- Berenguer Blasi
+- Rocco Varela
diff --git a/src/java/org/apache/cassandra/index/sai/SSTableContext.java b/src/java/org/apache/cassandra/index/sai/SSTableContext.java
new file mode 100644
index 000000000000..d43793f93918
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/SSTableContext.java
@@ -0,0 +1,290 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai;
+
+import java.io.IOException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.BlockPackedReader;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.disk.v1.MonotonicBlockPackedReader;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.Ref;
+import org.apache.cassandra.utils.concurrent.RefCounted;
+import org.apache.cassandra.utils.concurrent.SharedCloseableImpl;
+
+import static org.apache.cassandra.index.sai.disk.OnDiskKeyProducer.NO_OFFSET;
+
+/**
+ * SSTableContext is created per individual sstable and shared across indexes to track the per-sstable index files.
+ *
+ * The SSTableContext itself is released when the sstable-removed notification is received, but its shared copies held
+ * by individual SSTableIndex instances are released when in-flight read requests complete.
+ */
+public class SSTableContext extends SharedCloseableImpl
+{
+    public final SSTableReader sstable;
+
+    private final IndexComponents groupComponents;
+    // factories for readers mapping an sstable row id to its token or to its offset
+    public final LongArray.Factory tokenReaderFactory, offsetReaderFactory;
+    public final KeyFetcher keyFetcher;
+
+    private SSTableContext(SSTableReader sstable, // primary constructor, called from create()
+                           LongArray.Factory tokenReaderFactory,
+                           LongArray.Factory offsetReaderFactory,
+                           KeyFetcher keyFetcher,
+                           Cleanup cleanup,
+                           IndexComponents groupComponents)
+    {
+        super(cleanup);
+        this.sstable = sstable;
+        this.tokenReaderFactory = tokenReaderFactory;
+        this.offsetReaderFactory = offsetReaderFactory;
+        this.keyFetcher = keyFetcher;
+        this.groupComponents = groupComponents;
+    }
+
+    private SSTableContext(SSTableContext copy) // copy constructor, called from sharedCopy()
+    {
+        super(copy);
+        this.sstable = copy.sstable;
+        this.tokenReaderFactory = copy.tokenReaderFactory;
+        this.offsetReaderFactory = copy.offsetReaderFactory;
+        this.groupComponents = copy.groupComponents;
+        this.keyFetcher = copy.keyFetcher;
+    }
+
+    @SuppressWarnings("resource")
+    public static SSTableContext create(SSTableReader sstable) // opens the token/offset files and takes an sstable ref
+    {
+        IndexComponents groupComponents = IndexComponents.perSSTable(sstable);
+
+        Ref<SSTableReader> sstableRef = null;
+        FileHandle token = null, offset = null; // remain null until opened below
+        LongArray.Factory tokenReaderFactory, offsetReaderFactory;
+        KeyFetcher keyFetcher;
+        try
+        {
+            MetadataSource source = MetadataSource.loadGroupMetadata(groupComponents);
+
+            sstableRef = sstable.tryRef(); // a null result is treated as failure to acquire the reference
+
+            if (sstableRef == null)
+            {
+                throw new IllegalStateException("Couldn't acquire reference to the sstable: " + sstable);
+            }
+
+            token = groupComponents.createFileHandle(IndexComponents.TOKEN_VALUES);
+            offset  = groupComponents.createFileHandle(IndexComponents.OFFSETS_VALUES);
+
+            tokenReaderFactory = new BlockPackedReader(token, IndexComponents.TOKEN_VALUES, groupComponents, source);
+            offsetReaderFactory = new MonotonicBlockPackedReader(offset, IndexComponents.OFFSETS_VALUES, groupComponents, source);
+            keyFetcher = new DecoratedKeyFetcher(sstable);
+
+            Cleanup cleanup = new Cleanup(token, offset, sstableRef); // its tidy() releases the ref and closes both handles
+
+            return new SSTableContext(sstable, tokenReaderFactory, offsetReaderFactory, keyFetcher, cleanup, groupComponents);
+        }
+        catch (Throwable t)
+        {
+            if (sstableRef != null) // only release a reference that was actually acquired
+            {
+                sstableRef.release();
+            }
+
+            throw Throwables.unchecked(Throwables.close(t, token, offset)); // hand the handles to Throwables.close, then rethrow
+        }
+    }
+
+    /**
+     * @return number of open files per {@link SSTableContext} instance
+     */
+    public static int openFilesPerSSTable()
+    {
+        // token and offset
+        return 2;
+    }
+
+    @Override
+    public SSTableContext sharedCopy()
+    {
+        return new SSTableContext(this);
+    }
+
+    private static class Cleanup implements RefCounted.Tidy // holds the resources acquired in create()
+    {
+        private final FileHandle token, offset;
+        private final Ref<SSTableReader> sstableRef;
+
+        private Cleanup(FileHandle token, FileHandle offset, Ref<SSTableReader> sstableRef)
+        {
+            this.token = token;
+            this.offset = offset;
+            this.sstableRef = sstableRef;
+        }
+
+        @Override
+        public void tidy()
+        {
+            Throwable t = sstableRef.ensureReleased(null); // failures are accumulated in t and surfaced once at the end
+            t = Throwables.close(t, token, offset);
+
+            Throwables.maybeFail(t);
+        }
+
+        @Override
+        public String name()
+        {
+            return null; // no name is supplied for this tidy
+        }
+    }
+
+    /**
+     * @return descriptor of attached sstable
+     */
+    public Descriptor descriptor()
+    {
+        return sstable.descriptor;
+    }
+
+    public SSTableReader sstable()
+    {
+        return sstable; // same value as the public sstable field
+    }
+
+    /**
+     * @return disk usage of per-sstable index files
+     */
+    public long diskUsage()
+    {
+        return groupComponents.sizeOfPerSSTableComponents();
+    }
+
+    @Override
+    public String toString()
+    {
+        return "SSTableContext{" +
+               "sstable=" + sstable.descriptor +
+               '}';
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        SSTableContext that = (SSTableContext) o;
+        return Objects.equal(sstable.descriptor, that.sstable.descriptor); // equality is based on the sstable descriptor only
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(sstable.descriptor.hashCode()); // derived from the descriptor only, matching equals
+    }
+
+    public interface KeyFetcher
+    {
+        DecoratedKey apply(RandomAccessReader reader, long keyOffset); // DecoratedKeyFetcher's implementation may return null
+
+        /**
+         * Create a RandomAccessReader (RAR) to be shared by all tokens in the same segment.
+         */
+        RandomAccessReader createReader();
+    }
+
+    @VisibleForTesting
+    public static class DecoratedKeyFetcher implements KeyFetcher
+    {
+        private final SSTableReader sstable;
+
+        DecoratedKeyFetcher(SSTableReader sstable)
+        {
+            this.sstable = sstable;
+        }
+
+        @Override
+        public RandomAccessReader createReader()
+        {
+            return sstable.openIndexReader();
+        }
+
+        @Override
+        public DecoratedKey apply(RandomAccessReader reader, long keyOffset)
+        {
+            assert reader != null : "RandomAccessReader null";
+
+            // If the returned offset is the sentinel value, we've seen this offset
+            // before or we've run out of valid keys due to ZCS:
+            if (keyOffset == NO_OFFSET)
+                return null;
+
+            try
+            {
+                // can return null
+                return sstable.keyAt(reader, keyOffset);
+            }
+            catch (IOException e)
+            {
+                throw Throwables.cleaned(e);
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this).add("sstable", sstable).toString();
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return sstable.descriptor.hashCode(); // descriptor-based, matching equals below
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            if (other == null)
+            {
+                return false;
+            }
+            if (other == this)
+            {
+                return true;
+            }
+            if (other.getClass() != getClass())
+            {
+                return false;
+            }
+            DecoratedKeyFetcher rhs = (DecoratedKeyFetcher) other;
+            return sstable.descriptor.equals(rhs.sstable.descriptor);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/SSTableContextManager.java b/src/java/org/apache/cassandra/index/sai/SSTableContextManager.java
new file mode 100644
index 000000000000..387403f90101
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/SSTableContextManager.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import javax.annotation.concurrent.ThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Manage per-sstable {@link SSTableContext} for {@link StorageAttachedIndexGroup}
+ */
+@ThreadSafe
+public class SSTableContextManager
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final ConcurrentHashMap<SSTableReader, SSTableContext> sstableContexts = new ConcurrentHashMap<>();
+
+    /**
+     * Release the contexts of removed SSTables, then create {@link SSTableContext}s for added SSTables lacking one.
+     *
+     * @param removed SSTables being removed
+     * @param added SSTables being added
+     * @param validate if true, per-SSTable components are validated for SSTables that have no context yet
+     *
+     * @return a set of contexts for SSTables with valid per-SSTable components, and a set of
+     * SSTables with invalid or missing components
+     */
+    @SuppressWarnings("resource")
+    public Pair<Set<SSTableContext>, Set<SSTableReader>> update(Collection<SSTableReader> removed, Iterable<SSTableReader> added, boolean validate)
+    {
+        release(removed); // drop and close the contexts of removed sstables first
+
+        Set<SSTableContext> contexts = new HashSet<>();
+        Set<SSTableReader> invalid = new HashSet<>();
+
+        for (SSTableReader sstable : added)
+        {
+            if (sstable.isMarkedCompacted()) // skipped: reported neither as a context nor as invalid
+            {
+                continue;
+            }
+
+            if (!IndexComponents.isGroupIndexComplete(sstable.descriptor))
+            {
+                // Don't even try to validate or add the context if the completion marker is missing.
+                continue;
+            }
+
+            try
+            {
+                // Only validate on restart or newly refreshed SSTable. Newly built files are unlikely to be corrupted.
+                if (validate && !sstableContexts.containsKey(sstable))
+                {
+                    IndexComponents.perSSTable(sstable).validatePerSSTableComponents();
+                }
+
+                // ConcurrentHashMap#computeIfAbsent guarantees atomicity, so {@link SSTableContext#create(SSTableReader)}
+                // is called at most once per key.
+                contexts.add(sstableContexts.computeIfAbsent(sstable, SSTableContext::create));
+            }
+            catch (Throwable t)
+            {
+                IndexComponents components = IndexComponents.perSSTable(sstable);
+                logger.warn(components.logMessage("Invalid per-SSTable component after sstable {} add.."), sstable.descriptor, t);
+                invalid.add(sstable);
+                SSTableContext failed = sstableContexts.remove(sstable); // no context stays registered for an invalid sstable
+                if (failed != null)
+                {
+                    failed.close();
+                }
+            }
+        }
+
+        return Pair.create(contexts, invalid);
+    }
+
+    public void release(Collection<SSTableReader> toRelease) // removes and closes the context of each given sstable, if any
+    {
+        toRelease.stream().map(sstableContexts::remove).filter(Objects::nonNull).forEach(SSTableContext::close);
+    }
+
+    /**
+     * @return total number of per-sstable open files for live sstables
+     */
+    int openFiles()
+    {
+        return size() * SSTableContext.openFilesPerSSTable();
+    }
+
+    /**
+     * @return total disk usage of all per-sstable index files
+     */
+    long diskUsage()
+    {
+        return sstableContexts.values().stream().mapToLong(SSTableContext::diskUsage).sum();
+    }
+
+    Set<SSTableReader> sstables()
+    {
+        return sstableContexts.keySet(); // a view backed by the map, not a snapshot
+    }
+
+    @VisibleForTesting
+    public int size()
+    {
+        return sstableContexts.size();
+    }
+
+    @VisibleForTesting
+    public void clear()
+    {
+        sstableContexts.values().forEach(SSTableContext::close); // close every registered context...
+        sstableContexts.clear(); // ...then forget them; the two steps are not atomic together
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/SSTableIndex.java b/src/java/org/apache/cassandra/index/sai/SSTableIndex.java
new file mode 100644
index 000000000000..d2ec0addd19d
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/SSTableIndex.java
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.io.Closeable;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.ImmutableList;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.sai.disk.Segment;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.format.Version;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents.IndexComponent;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.RangeConcatIterator;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Throwables;
+
+
+/**
+ * SSTableIndex is created for each column index on an individual sstable; it is reference counted via reference()/release().
+ */
+public class SSTableIndex
+{
+    // sort sstable indexes by first key, then last key, then sstable generation
+    public static final Comparator<SSTableIndex> COMPARATOR = Comparator.comparing((SSTableIndex s) -> s.getSSTable().first)
+                                                                        .thenComparing(s -> s.getSSTable().last)
+                                                                        .thenComparing(s -> s.getSSTable().descriptor.generation);
+
+    private final Version version;
+    private final SSTableContext sstableContext; // shared copy owned by this index, closed in release()
+    private final ColumnContext columnContext;
+    private final SSTableReader sstable;
+    private final IndexComponents components;
+
+    private final ImmutableList<Segment> segments;
+    private PerIndexFiles indexFiles; // non-final so the constructor's catch block can close it when still null or half-built
+
+    private final List<SegmentMetadata> metadatas;
+    private final DecoratedKey minKey, maxKey; // in token order
+    private final ByteBuffer minTerm, maxTerm;
+    private final long minSSTableRowId, maxSSTableRowId;
+    private final long numRows;
+
+    private final AtomicInteger references = new AtomicInteger(1); // starts at 1: the creator holds the first reference
+    private final AtomicBoolean obsolete = new AtomicBoolean(false);
+
+    public SSTableIndex(SSTableContext sstableContext, ColumnContext columnContext, IndexComponents components)
+    {
+        this.sstableContext = sstableContext.sharedCopy(); // this index owns its own copy; the argument stays with the caller
+        this.columnContext = columnContext;
+        this.sstable = sstableContext.sstable;
+        this.components = components;
+
+        final AbstractType<?> validator = columnContext.getValidator();
+        assert validator != null;
+
+        try
+        {
+            this.indexFiles = new PerIndexFiles(components, columnContext.isLiteral());
+
+            ImmutableList.Builder<Segment> segmentsBuilder = ImmutableList.builder();
+
+            final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+            version = source.getVersion();
+            metadatas = SegmentMetadata.load(source, null);
+
+            for (SegmentMetadata metadata : metadatas)
+            {
+                segmentsBuilder.add(new Segment(columnContext, sstableContext, indexFiles, metadata));
+            }
+
+            segments = segmentsBuilder.build();
+            assert !segments.isEmpty();
+
+            this.minKey = metadatas.get(0).minKey; // key bounds come from the first and last segment metadata
+            this.maxKey = metadatas.get(metadatas.size() - 1).maxKey;
+
+            this.minTerm = metadatas.stream().map(m -> m.minTerm).min(TypeUtil.comparator(validator)).orElse(null);
+            this.maxTerm = metadatas.stream().map(m -> m.maxTerm).max(TypeUtil.comparator(validator)).orElse(null);
+
+            this.numRows = metadatas.stream().mapToLong(m -> m.numRows).sum();
+
+            this.minSSTableRowId = metadatas.get(0).minSSTableRowId;
+            this.maxSSTableRowId = metadatas.get(metadatas.size() - 1).maxSSTableRowId;
+        }
+        catch (Throwable t)
+        {
+            FileUtils.closeQuietly(indexFiles);
+            FileUtils.closeQuietly(this.sstableContext); // close our shared copy (the field), not the caller's context (the parameter)
+            throw Throwables.unchecked(t);
+        }
+    }
+
+    public ColumnContext getColumnContext()
+    {
+        return columnContext;
+    }
+
+    public SSTableContext getSSTableContext()
+    {
+        return sstableContext;
+    }
+
+    public long indexFileCacheSize()
+    {
+        return segments.stream().mapToLong(Segment::indexFileCacheSize).sum();
+    }
+
+    /**
+     * @return number of indexed rows, note that rows may have been updated or removed in sstable.
+     */
+    public long getRowCount()
+    {
+        return numRows;
+    }
+
+    /**
+     * @return total size of per-column index components, in bytes
+     */
+    public long sizeOfPerColumnComponents()
+    {
+        return components.sizeOfPerColumnComponents();
+    }
+
+    /**
+     * @return the smallest possible sstable row id in this index.
+     */
+    public long minSSTableRowId()
+    {
+        return minSSTableRowId;
+    }
+
+    /**
+     * @return the largest possible sstable row id in this index.
+     */
+    public long maxSSTableRowId()
+    {
+        return maxSSTableRowId;
+    }
+
+    public ByteBuffer minTerm()
+    {
+        return minTerm;
+    }
+
+    public ByteBuffer maxTerm()
+    {
+        return maxTerm;
+    }
+
+    public DecoratedKey minKey()
+    {
+        return minKey;
+    }
+
+    public DecoratedKey maxKey()
+    {
+        return maxKey;
+    }
+
+    public RangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange, SSTableQueryContext context, boolean defer)
+    {
+        RangeConcatIterator.Builder builder = RangeConcatIterator.builder();
+
+        for (Segment segment : segments)
+        {
+            if (segment.intersects(keyRange)) // only segments intersecting the key range are searched
+            {
+                builder.add(segment.search(expression, context, defer));
+            }
+        }
+
+        return builder.build();
+    }
+
+    public int getSegmentSize()
+    {
+        return segments.size(); // number of segments, not a byte size
+    }
+
+    public List<SegmentMetadata> segments()
+    {
+        return metadatas;
+    }
+
+    public Version getVersion()
+    {
+        return version;
+    }
+
+    /**
+     * container to share per-index file handles(kdtree, terms data, posting lists) among segments.
+     */
+    public static class PerIndexFiles implements Closeable
+    {
+        private final Map<IndexComponent, FileHandle> files = new HashMap<>(2);
+        private final IndexComponents components;
+
+        public PerIndexFiles(IndexComponents components, boolean isStringIndex)
+        {
+            this(components, isStringIndex, false);
+        }
+
+        public PerIndexFiles(IndexComponents components, boolean isStringIndex, boolean temporary)
+        {
+            this.components = components; // NOTE(review): if the second createFileHandle below throws, the first handle looks unclosed
+            if (isStringIndex)
+            {
+                files.put(components.postingLists, components.createFileHandle(components.postingLists, temporary));
+                files.put(components.termsData, components.createFileHandle(components.termsData, temporary));
+            }
+            else
+            {
+                files.put(components.kdTree, components.createFileHandle(components.kdTree, temporary));
+                files.put(components.kdTreePostingLists, components.createFileHandle(components.kdTreePostingLists, temporary));
+            }
+        }
+
+        public FileHandle kdtree()
+        {
+            return getFile(components.kdTree);
+        }
+
+        public FileHandle postingLists()
+        {
+            return getFile(components.postingLists);
+        }
+
+        public FileHandle termsData()
+        {
+            return getFile(components.termsData);
+        }
+
+        public FileHandle kdtreePostingLists()
+        {
+            return getFile(components.kdTreePostingLists);
+        }
+
+        private FileHandle getFile(IndexComponent type)
+        {
+            FileHandle file = files.get(type); // only the two handles opened for this index kind are present
+            if (file == null)
+                throw new IllegalArgumentException(String.format("Component %s not found for SSTable %s", type.name, components.descriptor));
+
+            return file;
+        }
+
+        public IndexComponents components()
+        {
+            return this.components;
+        }
+
+        @Override
+        public void close()
+        {
+            FileUtils.closeQuietly(files.values());
+        }
+    }
+
+    public SSTableReader getSSTable()
+    {
+        return sstable;
+    }
+
+    public boolean reference()
+    {
+        while (true) // CAS loop: retry until the increment succeeds or the index is found released
+        {
+            int n = references.get();
+            if (n <= 0) // already released: a new reference must not be handed out
+                return false;
+            if (references.compareAndSet(n, n + 1))
+            {
+                return true;
+            }
+        }
+    }
+
+    public boolean isReleased()
+    {
+        return references.get() <= 0;
+    }
+
+    public void release()
+    {
+        int n = references.decrementAndGet();
+
+        if (n == 0) // only the call that drops the last reference performs the cleanup
+        {
+            FileUtils.closeQuietly(indexFiles);
+            FileUtils.closeQuietly(segments);
+            sstableContext.close();
+
+            /*
+             * When SSTable is removed, storage-attached index components will be automatically removed by LogTransaction.
+             * We only remove index components explicitly in case of index corruption or index rebuild.
+             */
+            if (obsolete.get())
+            {
+                components.deleteColumnIndex();
+            }
+        }
+    }
+
+    public void markObsolete()
+    {
+        obsolete.getAndSet(true);
+        release(); // drops one reference; the column index is deleted in release() once the count reaches zero
+    }
+
+    public boolean equals(Object o)
+    {
+        return o instanceof SSTableIndex && components.equals(((SSTableIndex) o).components);
+    }
+
+    public int hashCode()
+    {
+        return new HashCodeBuilder().append(components.hashCode()).build();
+    }
+
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("column", columnContext.getColumnName())
+                          .add("sstable", sstable.descriptor)
+                          .add("totalRows", sstable.getTotalRows())
+                          .toString();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/SSTableQueryContext.java b/src/java/org/apache/cassandra/index/sai/SSTableQueryContext.java
new file mode 100644
index 000000000000..42177b323b40
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/SSTableQueryContext.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai;
+
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+
+/**
+ * Tracks SSTable-specific state relevant to the execution of a single query.
+ *
+ * Fields here are non-volatile, as they are accessed from a single thread.
+ */
+@NotThreadSafe
+public class SSTableQueryContext
+{
+    public final QueryContext queryContext; // holds the counters updated by the mark* methods below
+
+    // During intersection queries, multiple column indexes touch the same exact tokens as we skip
+    // between range iterators. Caching the values of these global SSTable-specific lookups allows us to avoid
+    // large chunks of duplicated work.
+    public long prevTokenValue = Long.MIN_VALUE; // presumably "nothing cached yet" — TODO confirm against the readers
+    public long prevSSTableRowId = -1;
+
+    public long prevSkipToTokenValue = Long.MIN_VALUE; // same initial values as the pair above
+    public long prevSkipToSSTableRowId = -1;
+
+    public SSTableQueryContext(QueryContext queryContext)
+    {
+        this.queryContext = queryContext;
+    }
+
+    @VisibleForTesting
+    public static SSTableQueryContext forTest()
+    {
+        return new SSTableQueryContext(new QueryContext()); // backed by a fresh QueryContext
+    }
+
+    public void markTokenSkippingLookup()
+    {
+        queryContext.tokenSkippingLookups++; // plain increment on the QueryContext; no synchronization here
+    }
+
+    public void markTokenSkippingCacheHit()
+    {
+        queryContext.tokenSkippingCacheHits++; // plain increment on the QueryContext; no synchronization here
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
new file mode 100644
index 000000000000..036d46b75710
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
@@ -0,0 +1,728 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableMap;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Future;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Predicates;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.util.concurrent.Futures;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.codahale.metrics.Gauge;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.CassandraWriteContext;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.RangeTombstone;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.WriteContext;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.OrderPreservingPartitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.index.SecondaryIndexBuilder;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.TargetParser;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.analyzer.NonTokenizingOptions;
+import org.apache.cassandra.index.sai.disk.ColumnIndexWriter;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.MemtableIndexWriter;
+import org.apache.cassandra.index.sai.disk.SSTableIndexWriter;
+import org.apache.cassandra.index.sai.disk.SegmentBuilder;
+import org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.memory.RowMapping;
+import org.apache.cassandra.index.sai.metrics.AbstractMetrics;
+import org.apache.cassandra.index.sai.utils.NamedMemoryLimiter;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.index.sai.view.View;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.metrics.CassandraMetricsRegistry;
+import org.apache.cassandra.metrics.DefaultNameFactory;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory;
+
+public class StorageAttachedIndex implements Index
+{
+    private static final Logger logger = LoggerFactory.getLogger(StorageAttachedIndex.class);
+
+    /**
+     * Global limit, in bytes, on heap consumed by all index segment building that occurs outside the context of
+     * Memtable flush. Defaults to the configured SAI segment write buffer space (scaled by 1024 * 1024) and can be
+     * overridden through the "cassandra.test.sai.segment_build_memory_limit" system property.
+     *
+     * Note that to avoid flushing extremely small index segments, a segment is only flushed when
+     * both the global size of all building segments has breached the limit and the size of the
+     * segment in question reaches (segment_write_buffer_space_mb / # currently building column indexes).
+     *
+     * ex. A single column index building on its own can buffer up to segment_write_buffer_space_mb. With one column
+     *     index building per table across 8 compactors, each may flush at (segment_write_buffer_space_mb / 8) MBs.
+     */
+    public static final long SEGMENT_BUILD_MEMORY_LIMIT = Long.getLong("cassandra.test.sai.segment_build_memory_limit",
+                                                          1024L * 1024L * (long) DatabaseDescriptor.getSAISegmentWriteBufferSpace());
+
+    public static final NamedMemoryLimiter SEGMENT_BUILD_MEMORY_LIMITER =
+            new NamedMemoryLimiter(SEGMENT_BUILD_MEMORY_LIMIT, "SSTable-attached Index Segment Builder");
+
+    static
+    {
+        CassandraMetricsRegistry.MetricName bufferSpaceUsed = DefaultNameFactory.createMetricName(AbstractMetrics.TYPE, "SegmentBufferSpaceUsedBytes", null);
+        CassandraMetricsRegistry.Metrics.register(bufferSpaceUsed, (Gauge<Long>) SEGMENT_BUILD_MEMORY_LIMITER::currentBytesUsed);
+
+        CassandraMetricsRegistry.MetricName bufferSpaceLimit = DefaultNameFactory.createMetricName(AbstractMetrics.TYPE, "SegmentBufferSpaceLimitBytes", null);
+        CassandraMetricsRegistry.Metrics.register(bufferSpaceLimit, (Gauge<Long>) () -> SEGMENT_BUILD_MEMORY_LIMIT);
+
+        // Note: The active builder count starts at 1 to avoid dividing by zero, hence the "- 1" when it is reported.
+        CassandraMetricsRegistry.MetricName buildsInProgress = DefaultNameFactory.createMetricName(AbstractMetrics.TYPE, "ColumnIndexBuildsInProgress", null);
+        CassandraMetricsRegistry.Metrics.register(buildsInProgress, (Gauge<Long>) () -> SegmentBuilder.ACTIVE_BUILDER_COUNT.get() - 1);
+    }
+
+    // Creates the builder task that (re)builds storage-attached indexes for a given set of SSTables.
+    private static class StorageAttachedIndexBuildingSupport implements IndexBuildingSupport
+    {
+        public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs,
+                                                       Set<Index> indexes,
+                                                       Collection<SSTableReader> sstablesToRebuild,
+                                                       boolean isFullRebuild)
+        {
+            // Ordered by SSTable generation; each SSTable maps to the indexes that have to be built for it.
+            NavigableMap<SSTableReader, Set<StorageAttachedIndex>> sstables = new TreeMap<>(Comparator.comparingInt(a -> a.descriptor.generation));
+            StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+
+            for (Index index : indexes)
+            {
+                // Indexes of any other implementation are ignored here.
+                if (!(index instanceof StorageAttachedIndex))
+                    continue;
+
+                StorageAttachedIndex sai = (StorageAttachedIndex) index;
+                ColumnContext context = sai.getContext();
+
+                // If this is not a full manual index rebuild we can skip SSTables that already have an
+                // attached index. Otherwise, we override any pre-existent index.
+                Collection<SSTableReader> ss = sstablesToRebuild;
+                if (!isFullRebuild)
+                {
+                    ss = sstablesToRebuild.stream()
+                                          .filter(s -> !IndexComponents.isColumnIndexComplete(s.descriptor, context.getIndexName()))
+                                          .collect(Collectors.toList());
+                }
+
+                group.dropIndexSSTables(ss, sai);
+
+                for (SSTableReader sstable : ss)
+                    sstables.computeIfAbsent(sstable, ignored -> new HashSet<>()).add(sai);
+            }
+
+            // Reuse the group resolved above rather than looking it up a second time.
+            return new StorageAttachedIndexBuilder(group, sstables, isFullRebuild, false);
+        }
+    }
+
+    // Used to build indexes on newly added SSTables:
+    private static final StorageAttachedIndexBuildingSupport INDEX_BUILDER_SUPPORT = new StorageAttachedIndexBuildingSupport();
+
+    private static final Set<String> VALID_OPTIONS = ImmutableSet.of(NonTokenizingOptions.CASE_SENSITIVE,
+                                                                     NonTokenizingOptions.NORMALIZE,
+                                                                     NonTokenizingOptions.ASCII,
+                                                                     IndexTarget.TARGET_OPTION_NAME,
+                                                                     IndexTarget.CUSTOM_INDEX_OPTION_NAME,
+                                                                     IndexWriterConfig.POSTING_LIST_LVL_MIN_LEAVES,
+                                                                     IndexWriterConfig.POSTING_LIST_LVL_SKIP_OPTION);
+
+    public static final Set<CQL3Type> SUPPORTED_TYPES = ImmutableSet.of(CQL3Type.Native.ASCII, CQL3Type.Native.BIGINT, CQL3Type.Native.DATE,
+                                                                        CQL3Type.Native.DOUBLE, CQL3Type.Native.FLOAT, CQL3Type.Native.INT,
+                                                                        CQL3Type.Native.SMALLINT, CQL3Type.Native.TEXT, CQL3Type.Native.TIME,
+                                                                        CQL3Type.Native.TIMESTAMP, CQL3Type.Native.TIMEUUID, CQL3Type.Native.TINYINT,
+                                                                        CQL3Type.Native.UUID, CQL3Type.Native.VARCHAR, CQL3Type.Native.INET,
+                                                                        CQL3Type.Native.VARINT, CQL3Type.Native.DECIMAL);
+
+    private static final Set<Class<? extends IPartitioner>> ILLEGAL_PARTITIONERS =
+            ImmutableSet.of(OrderPreservingPartitioner.class, LocalPartitioner.class, ByteOrderedPartitioner.class, RandomPartitioner.class);
+
+    private final ColumnFamilyStore baseCfs;
+    private final IndexMetadata config;
+    private final ColumnContext context;
+
+    // Tracks whether or not we've started the index build on initialization.
+    private volatile boolean initBuildStarted = false;
+
+    // Tracks whether the index has been invalidated due to removal, a table drop, etc.
+    private volatile boolean valid = true;
+
+    public StorageAttachedIndex(ColumnFamilyStore baseCfs, IndexMetadata config)
+    {
+        this.baseCfs = baseCfs;
+        this.config = config;
+        this.context = new ColumnContext(baseCfs.metadata(), config);
+    }
+
+    /**
+     * Validates the options of a storage-attached index definition. Used via reflection in {@link IndexMetadata}.
+     *
+     * @param options  the options of the index being validated
+     * @param metadata the metadata of the indexed table
+     * @return the options that are not recognized, or an empty map if all of them are known and every check passed
+     * @throws InvalidRequestException if the partitioner, the target column or the indexed type is not supported
+     */
+    @SuppressWarnings({ "unused" })
+    public static Map<String, String> validateOptions(Map<String, String> options, TableMetadata metadata)
+    {
+        // Unknown options are handed back to the caller before any other check runs.
+        Map<String, String> unrecognized = new HashMap<>(2);
+        for (Map.Entry<String, String> entry : options.entrySet())
+        {
+            if (!VALID_OPTIONS.contains(entry.getKey()))
+                unrecognized.put(entry.getKey(), entry.getValue());
+        }
+
+        if (!unrecognized.isEmpty())
+            return unrecognized;
+
+        if (ILLEGAL_PARTITIONERS.contains(metadata.partitioner.getClass()))
+            throw new InvalidRequestException("Storage-attached index does not support the following IPartitioner implementations: " + ILLEGAL_PARTITIONERS);
+
+        String targetColumn = options.get(IndexTarget.TARGET_OPTION_NAME);
+        if (targetColumn == null)
+            throw new InvalidRequestException("Missing target column");
+
+        boolean multipleColumns = targetColumn.split(",").length > 1;
+        if (multipleColumns)
+            throw new InvalidRequestException("A storage-attached index cannot be created over multiple columns: " + targetColumn);
+
+        Pair<ColumnMetadata, IndexTarget.Type> target = TargetParser.parse(metadata, targetColumn);
+        if (target == null)
+            throw new InvalidRequestException("Failed to retrieve target column for: " + targetColumn);
+
+        // To support different index targets on a non-frozen map (i.e. KEYS, VALUES, ENTRIES), the index name rather
+        // than the column name is part of the index file name. Here we only need to check that the target differs
+        // between indexes, which still allows several indexes on one column with a different IndexTarget.Type.
+        //
+        // Note that "metadata.indexes" already includes the current index, hence the "> 1" below.
+        String saiClassName = StorageAttachedIndex.class.getName();
+        long indexesOnTarget = metadata.indexes.stream()
+                                               .filter(index -> index.getIndexClassName().equals(saiClassName))
+                                               .map(index -> TargetParser.parse(metadata, index.options.get(IndexTarget.TARGET_OPTION_NAME)))
+                                               .filter(Objects::nonNull)
+                                               .filter(t -> t.equals(target))
+                                               .count();
+
+        if (indexesOnTarget > 1)
+            throw new InvalidRequestException("Cannot create more than one storage-attached index on the same column: " + target.left);
+
+        AbstractType<?> type = TypeUtil.cellValueType(target);
+
+        // If we are indexing map entries we need to validate the sub-types
+        if (TypeUtil.isComposite(type))
+        {
+            for (AbstractType<?> subType : type.subTypes())
+            {
+                boolean supported = SUPPORTED_TYPES.contains(subType.asCQL3Type()) || TypeUtil.isFrozenCollection(subType);
+                if (!supported)
+                    throw new InvalidRequestException("Unsupported type: " + subType.asCQL3Type());
+            }
+        }
+        else
+        {
+            boolean supported = SUPPORTED_TYPES.contains(type.asCQL3Type()) || TypeUtil.isFrozenCollection(type);
+            if (!supported)
+                throw new InvalidRequestException("Unsupported type: " + type.asCQL3Type());
+        }
+
+        // The parsed analyzer and writer config are discarded; presumably these calls reject invalid option values.
+        AbstractAnalyzer.fromOptions(type, options);
+        IndexWriterConfig.fromOptions(null, type, options);
+
+        return Collections.emptyMap();
+    }
+
+    @Override
+    public void register(IndexRegistry registry)
+    {
+        // index will be available for writes
+        registry.registerIndex(this, StorageAttachedIndexGroup.class, () -> new StorageAttachedIndexGroup(baseCfs));
+    }
+
+    @Override
+    public IndexMetadata getIndexMetadata()
+    {
+        return config;
+    }
+
+    @Override
+    public Callable<?> getInitializationTask()
+    {
+        // New storage-attached indexes will be available for queries after on disk index data are built.
+        // Memtable data will be indexed via flushing triggered by schema change
+        // We only want to validate the index files if we are starting up
+        return () -> startInitialBuild(baseCfs, StorageService.instance.isStarting()).get();
+    }
+
+    // Builds the on-disk index for every live SSTable that does not have one yet. The returned future completes
+    // when all submitted builds have finished, or is already complete when there is nothing to build.
+    private Future<?> startInitialBuild(ColumnFamilyStore baseCfs, boolean validate)
+    {
+        if (baseCfs.indexManager.isIndexQueryable(this))
+        {
+            logger.debug(context.logMessage("Skipping validation and building in initialization task, as pre-join has already made the storage attached index queryable..."));
+            initBuildStarted = true;
+            return CompletableFuture.completedFuture(null);
+        }
+
+        // Stop in-progress compaction tasks so that no compacted SSTable is left without an index.
+        logger.debug(context.logMessage("Stopping active compactions to make sure all sstables are indexed after initial build."));
+        CompactionManager.instance.interruptCompactionFor(Collections.singleton(baseCfs.metadata()),
+                                                          OperationType.REWRITES_SSTABLES,
+                                                          Predicates.alwaysTrue(),
+                                                          true);
+
+        // Force another flush to make sure on disk index is generated for memtable data before marking it queryable.
+        // In case of offline scrub, there are no live memtables.
+        boolean hasLiveMemtables = !baseCfs.getTracker().getView().liveMemtables.isEmpty();
+        if (hasLiveMemtables)
+            baseCfs.forceBlockingFlush();
+
+        // It is now safe to flush indexes directly from flushing Memtables.
+        initBuildStarted = true;
+
+        StorageAttachedIndexGroup indexGroup = StorageAttachedIndexGroup.getIndexGroup(baseCfs);
+        List<SSTableReader> nonIndexed = findNonIndexedSSTables(baseCfs, indexGroup, validate, true);
+
+        if (nonIndexed.isEmpty())
+            return CompletableFuture.completedFuture(null);
+
+        // split sorted sstables into groups with similar size and build each group in separate compaction thread
+        List<ListenableFuture<?>> futures = new ArrayList<>();
+        for (List<SSTableReader> group : groupBySize(nonIndexed, DatabaseDescriptor.getConcurrentCompactors()))
+        {
+            // One builder per group; within a group the SSTables are keyed in generation order.
+            SortedMap<SSTableReader, Set<StorageAttachedIndex>> perSSTable = new TreeMap<>(Comparator.comparingLong(sstable -> sstable.descriptor.generation));
+            for (SSTableReader sstable : group)
+                perSSTable.put(sstable, Collections.singleton(this));
+
+            StorageAttachedIndexBuilder builder = new StorageAttachedIndexBuilder(indexGroup, perSSTable, false, true);
+            futures.add(CompactionManager.instance.submitIndexBuild(builder));
+        }
+
+        logger.info(context.logMessage("Submitting {} parallel initial index builds over {} total sstables..."), futures.size(), nonIndexed.size());
+        return Futures.allAsList(futures);
+    }
+
+    /**
+     * Splits SSTables into groups of similar overall size. The per-group size target is at least one byte, so each
+     * group always receives an SSTable, even if every SSTable reports an on-disk length of zero.
+     *
+     * @param toRebuild a list of SSTables to split (Note that this list will be sorted in place!)
+     * @param parallelism an upper bound on the number of groups; values below 1 are treated as 1
+     *
+     * @return a {@link List} of SSTable groups, each represented as a {@link List} of {@link SSTableReader}
+     */
+    @VisibleForTesting
+    public static List<List<SSTableReader>> groupBySize(List<SSTableReader> toRebuild, int parallelism)
+    {
+        List<List<SSTableReader>> groups = new ArrayList<>();
+        toRebuild.sort(Comparator.comparingLong(SSTableReader::onDiskLength).reversed());
+        Iterator<SSTableReader> sortedSSTables = toRebuild.iterator();
+        long totalLength = toRebuild.stream().mapToLong(SSTableReader::onDiskLength).sum();
+        // Clamp the divisor and the result: a zero, negative or NaN target would never let the inner loop advance.
+        double dataPerCompactor = Math.max(1.0, totalLength * 1.0 / Math.max(1, parallelism));
+
+        while (sortedSSTables.hasNext())
+        {
+            long sum = 0;
+            List<SSTableReader> current = new ArrayList<>();
+            while (sortedSSTables.hasNext() && sum < dataPerCompactor)
+            {
+                SSTableReader sstable = sortedSSTables.next();
+                sum += sstable.onDiskLength();
+                current.add(sstable);
+            }
+            assert !current.isEmpty();
+            groups.add(current);
+        }
+
+        return groups;
+    }
+
+    @Override
+    public Callable<?> getMetadataReloadTask(IndexMetadata indexMetadata)
+    {
+        return null;
+    }
+
+    @Override
+    public Callable<?> getBlockingFlushTask()
+    {
+        return null; // storage-attached indexes are flushed alongside memtable
+    }
+
+    @Override
+    public Callable<?> getInvalidateTask()
+    {
+        return () ->
+        {
+            // Mark the index as invalid first: in-progress SSTableIndexWriters will abort.
+            valid = false;
+
+            // When the table is being dropped, SSTable indexes should already have been removed by SSTableListChangedNotification.
+            Set<Component> toRemove = getComponents();
+            for (SSTableIndex sstableIndex : context.getView().getIndexes())
+                sstableIndex.getSSTable().unregisterComponents(toRemove, baseCfs.getTracker());
+
+            context.invalidate();
+            return null;
+        };
+    }
+
+    @Override
+    public Callable<?> getPreJoinTask(boolean hadBootstrap)
+    {
+        /*
+         * During bootstrap, streamed SSTable are already built for existing indexes via {@link StorageAttachedIndexBuildingSupport}
+         * from {@link org.apache.cassandra.streaming.StreamReceiveTask.OnCompletionRunnable}.
+         *
+         * For indexes created during bootstrapping, we don't have to block bootstrap for them.
+         */
+
+        return this::startPreJoinTask;
+    }
+
+    // Best-effort pre-join check: failures are logged rather than propagated, and the result is always null.
+    private Future<?> startPreJoinTask()
+    {
+        try
+        {
+            if (!baseCfs.indexManager.isIndexQueryable(this))
+            {
+                StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(baseCfs);
+                boolean fullyIndexed = findNonIndexedSSTables(baseCfs, group, true, true).isEmpty();
+
+                // If the index is complete, mark it queryable before the node starts accepting requests:
+                if (fullyIndexed)
+                    baseCfs.indexManager.makeIndexQueryable(this, Status.BUILD_SUCCEEDED);
+            }
+            else
+            {
+                logger.debug(context.logMessage("Skipping validation in pre-join task, as the initialization task has already made the index queryable..."));
+            }
+        }
+        catch (Throwable t)
+        {
+            logger.error(context.logMessage("Failed in pre-join task!"), t);
+        }
+
+        return null;
+    }
+
+    @Override
+    public Callable<?> getTruncateTask(long truncatedAt)
+    {
+        /*
+         * index files will be removed as part of base sstable lifecycle in
+         * {@link LogTransaction#delete(java.io.File)} asynchronously.
+         */
+        return null;
+    }
+
+    @Override
+    public boolean shouldBuildBlocking()
+    {
+        return true;
+    }
+
+    @Override
+    public Optional<ColumnFamilyStore> getBackingTable()
+    {
+        return Optional.empty();
+    }
+
+    @Override
+    public boolean dependsOn(ColumnMetadata column)
+    {
+        return context.getDefinition().compareTo(column) == 0;
+    }
+
+    @Override
+    public boolean supportsExpression(ColumnMetadata column, Operator operator)
+    {
+        return dependsOn(column) && context.supports(operator);
+    }
+
+    @Override
+    public AbstractType<?> customExpressionValueType()
+    {
+        return null;
+    }
+
+    @Override
+    public RowFilter getPostIndexQueryFilter(RowFilter filter)
+    {
+        // it should be executed from the SAI query plan, this is only used by the singleton index query plan
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long getEstimatedResultRows()
+    {
+        throw new UnsupportedOperationException("Use StorageAttachedIndexQueryPlan#getEstimatedResultRows() instead.");
+    }
+
+    @Override
+    public boolean isQueryable(Status status)
+    {
+        // consider unknown status as queryable, because gossip may not be up-to-date for newly joining nodes.
+        return status == Status.BUILD_SUCCEEDED || status == Status.UNKNOWN;
+    }
+
+    @Override
+    public void validate(PartitionUpdate update) throws InvalidRequestException
+    {}
+
+    /**
+     * This method is called by the startup tasks to find SSTables that don't have indexes. The method is
+     * synchronized so that the view is unchanged between validation and the selection of non-indexed SSTables.
+     *
+     * @param baseCfs  the table whose live SSTables are inspected
+     * @param group    the index group that is notified about the live SSTables; must not be null
+     * @param validate passed through to {@code group.onSSTableChanged}
+     * @param rename   passed through to {@code group.onSSTableChanged}
+     *
+     * @return a list of SSTables without attached indexes
+     */
+    private synchronized List<SSTableReader> findNonIndexedSSTables(ColumnFamilyStore baseCfs, StorageAttachedIndexGroup group, boolean validate, boolean rename)
+    {
+        Set<SSTableReader> sstables = baseCfs.getLiveSSTables();
+
+        // Initialize the SSTable indexes w/ valid existing components...
+        assert group != null : "Missing index group on " + baseCfs.name;
+        group.onSSTableChanged(Collections.emptyList(), sstables, Collections.singleton(this), validate, rename);
+
+        // ...then identify the SSTable indexes that are missing. An SSTable is considered not indexed if:
+        //   1. The current view does not contain the SSTable
+        //   2. The SSTable is not marked compacted
+        //   3. The column index does not have a completion marker
+        View view = context.getView();
+
+        // Collected into a plain ArrayList, because callers sort the returned list in place.
+        return sstables.stream()
+                       .filter(sstable -> !view.containsSSTable(sstable))
+                       .filter(sstable -> !sstable.isMarkedCompacted())
+                       .filter(sstable -> !IndexComponents.isColumnIndexComplete(sstable.descriptor, context.getIndexName()))
+                       .collect(Collectors.toCollection(ArrayList::new));
+    }
+
+
+    private class UpdateIndexer extends IndexerAdapter
+    {
+        private final DecoratedKey key;
+        private final Memtable mt;
+        private final WriteContext writeContext;
+
+        UpdateIndexer(DecoratedKey key, Memtable mt, WriteContext writeContext)
+        {
+            this.key = key;
+            this.mt = mt;
+            this.writeContext = writeContext;
+        }
+
+        @Override
+        public void insertRow(Row row)
+        {
+            adjustMemtableSize(context.index(key, row, mt), CassandraWriteContext.fromContext(writeContext).getGroup());
+        }
+
+        @Override
+        public void updateRow(Row oldRow, Row newRow)
+        {
+            insertRow(newRow);
+        }
+
+        void adjustMemtableSize(long additionalSpace, OpOrder.Group opGroup)
+        {
+            mt.allocateExtraOnHeap(additionalSpace, opGroup);
+        }
+    }
+
+    /**
+     * Convenience base class for {@link Indexer} implementations.
+     *
+     * The callbacks implemented here do nothing; {@link UpdateIndexer} builds on it and overrides only
+     * the row insert and update callbacks.
+     */
+    protected static abstract class IndexerAdapter implements Indexer
+    {
+        @Override
+        public void begin() { }
+
+        @Override
+        public void finish() { }
+
+        @Override
+        public void partitionDelete(DeletionTime dt) { }
+
+        @Override
+        public void rangeTombstone(RangeTombstone rt) { }
+
+        @Override
+        public void removeRow(Row row) { }
+    }
+
+    @Override
+    public Searcher searcherFor(ReadCommand command) throws InvalidRequestException
+    {
+        // searchers should be created from the query plan, this is only used by the singleton index query plan
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker)
+    {
+        throw new UnsupportedOperationException("Storage-attached index flush observers should never be created directly.");
+    }
+
+    public ColumnIndexWriter newIndexWriter(Descriptor descriptor,
+                                            LifecycleNewTracker tracker,
+                                            RowMapping rowMapping,
+                                            CompressionParams compressionParams)
+    {
+        // If we're not flushing or we haven't yet started the initialization build, flush from SSTable contents.
+        if (tracker.opType() != OperationType.FLUSH || !initBuildStarted)
+        {
+            NamedMemoryLimiter limiter = SEGMENT_BUILD_MEMORY_LIMITER;
+            logger.info(context.logMessage("Starting a compaction index build. Global segment memory usage: {}"), prettyPrintMemory(limiter.currentBytesUsed()));
+
+            return new SSTableIndexWriter(descriptor, context, limiter, () -> valid, compressionParams);
+        }
+
+        return new MemtableIndexWriter(context.getPendingMemtableIndex(tracker), descriptor, context, rowMapping, compressionParams);
+    }
+
+    @Override
+    public Set<Component> getComponents()
+    {
+        return new HashSet<>(IndexComponents.perColumnComponents(context.getIndexName(), context.isLiteral()));
+    }
+
+    @Override
+    public Indexer indexerFor(DecoratedKey key,
+                              RegularAndStaticColumns columns,
+                              int nowInSec,
+                              WriteContext writeContext,
+                              IndexTransaction.Type transactionType,
+                              Memtable memtable)
+    {
+        // we are only interested in the data from Memtable
+        // everything else is going to be handled by SSTableWriter observers,
+        // so no indexer is returned for any transaction type other than UPDATE
+        if (transactionType != IndexTransaction.Type.UPDATE)
+            return null;
+
+        // columns and nowInSec are not used by the indexer
+        return new UpdateIndexer(key, memtable, writeContext);
+    }
+
+    @Override
+    public IndexBuildingSupport getBuildTaskSupport()
+    {
+        return INDEX_BUILDER_SUPPORT;
+    }
+
+    public ColumnContext getContext()
+    {
+        return context;
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s.%s.%s", baseCfs.keyspace.getName(), baseCfs.name, config == null ? "?" : config.name);
+    }
+
+    /**
+     * Removes this index from the {@link SecondaryIndexManager}'s set of queryable indexes.
+     *
+     * This usually happens in response to an index writing failure from {@link StorageAttachedIndexWriter}.
+     */
+    public void makeIndexNonQueryable()
+    {
+        baseCfs.indexManager.makeIndexNonQueryable(this, Status.BUILD_FAILED);
+        logger.warn(context.logMessage("Storage-attached index is no longer queryable. Please restart this node to repair it."));
+    }
+
+    void deleteIndexFiles(SSTableReader sstable)
+    {
+        IndexComponents.create(context.getIndexName(), sstable).deleteColumnIndex();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
new file mode 100644
index 000000000000..87978c27b2bf
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
@@ -0,0 +1,369 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.UUID;
+import java.util.concurrent.CountDownLatch;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Maps;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.CompactionInterruptedException;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.Tracker;
+import org.apache.cassandra.db.rows.DeserializationHelper;
+import org.apache.cassandra.index.SecondaryIndexBuilder;
+import org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.KeyIterator;
+import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.SSTableSimpleIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.UUIDGen;
+import org.apache.cassandra.utils.concurrent.Ref;
+
+import static org.apache.cassandra.db.compaction.CompactionInfo.StopTrigger.TRUNCATE;
+
+/**
+ * Multiple storage-attached indexes can start building concurrently. We need to make sure that:
+ * 1. Per-SSTable index files (see {@link IndexComponents#PER_SSTABLE_COMPONENTS}) are built only once:
+ *      a. If the per-SSTable index files are already built, do nothing.
+ *      b. If the per-SSTable index files are currently being built, wait until that build finishes before considering the index built.
+ * 2. Per-column index files are built for each column index (see {@link IndexComponents#perColumnComponents(String, boolean)}).
+ */
+public class StorageAttachedIndexBuilder extends SecondaryIndexBuilder
+{
+    protected static final Logger logger = LoggerFactory.getLogger(StorageAttachedIndexBuilder.class);
+
+    // Static, so shared by all builders: makes sure only one builder can write to per sstable files when multiple storage-attached indexes are created simultaneously.
+    private static final Map<SSTableReader, CountDownLatch> inProgress = Maps.newConcurrentMap();
+
+    private final StorageAttachedIndexGroup group;
+    private final TableMetadata metadata; // taken from the group at construction time
+    private final Tracker tracker; // tracker of the group's base table
+    private final UUID compactionId = UUIDGen.getTimeUUID(); // identifies this build in getCompactionInfo()
+    private final boolean isFullRebuild;
+    private final boolean isInitialBuild;
+
+    private final SortedMap<SSTableReader, Set<StorageAttachedIndex>> sstables; // indexes to build, per SSTable
+
+    private long bytesProcessed = 0; // progress so far, reported through getCompactionInfo()
+    private final long totalSizeInBytes; // sum of the uncompressed lengths of all SSTables to index
+
+    StorageAttachedIndexBuilder(StorageAttachedIndexGroup group, SortedMap<SSTableReader, Set<StorageAttachedIndex>> sstables, boolean isFullRebuild, boolean isInitialBuild)
+    {
+        this.isFullRebuild = isFullRebuild;
+        this.isInitialBuild = isInitialBuild;
+        this.sstables = sstables;
+        this.totalSizeInBytes = sstables.keySet().stream().mapToLong(SSTableReader::uncompressedLength).sum(); // total reported by getCompactionInfo()
+        this.group = group;
+        this.metadata = group.metadata();
+        this.tracker = group.table().getTracker();
+    }
+
+    @Override
+    public void build()
+    {
+        // Index the SSTables one by one; indexSSTable() returns true when the build should be stopped.
+        for (Map.Entry<SSTableReader, Set<StorageAttachedIndex>> entry : sstables.entrySet())
+        {
+            SSTableReader reader = entry.getKey();
+            Set<StorageAttachedIndex> requested = entry.getValue();
+            Set<StorageAttachedIndex> remaining = validateIndexes(requested, reader.descriptor);
+
+            if (remaining.isEmpty())
+            {
+                // none of the indexes requested for this SSTable is still part of the group
+                logger.debug(logMessage("{} dropped during index build"), requested);
+            }
+            else if (indexSSTable(reader, remaining))
+            {
+                break;
+            }
+        }
+    }
+
+    private String logMessage(String message) {
+        return "[" + metadata.keyspace + "." + metadata.name + ".*] " + message; // prefix: [keyspace.table.*]
+    }
+
+    /**
+     * @return true if the whole index build should be stopped, false if it may continue with the next SSTable
+     */
+    private boolean indexSSTable(SSTableReader sstable, Set<StorageAttachedIndex> indexes)
+    {
+        CountDownLatch perSSTableFileLock = null;
+        StorageAttachedIndexWriter indexWriter = null;
+
+        Ref<SSTableReader> ref = sstable.tryRef();
+        if (ref == null)
+        {
+            logger.warn(logMessage("Couldn't acquire reference to the SSTable {}. It may have been removed."), sstable.descriptor);
+            return false;
+        }
+
+        try (RandomAccessReader dataFile = sstable.openDataReader();
+             LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.INDEX_BUILD))
+        {
+            perSSTableFileLock = shouldWriteTokenOffsetFiles(sstable);
+            boolean perColumnOnly = perSSTableFileLock == null; // no lock: the per-SSTable files are not written by this builder
+            // remove existing per column index files instead of overwriting
+            indexes.forEach(index -> index.deleteIndexFiles(sstable));
+
+            final CompressionParams compressionParams = CryptoUtils.getCompressionParams(sstable);
+
+            indexWriter = new StorageAttachedIndexWriter(sstable.descriptor, indexes, txn, perColumnOnly, compressionParams);
+
+            long previousKeyPosition = 0;
+            indexWriter.begin();
+
+            try (KeyIterator keys = KeyIterator.forSSTable(sstable))
+            {
+                while (keys.hasNext())
+                {
+                    if (isStopRequested())
+                    {
+                        throw new CompactionInterruptedException(getCompactionInfo());
+                    }
+
+                    final DecoratedKey key = keys.next();
+                    final long keyPosition = keys.getKeyPosition();
+
+                    indexWriter.startPartition(key, keyPosition);
+
+                    RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
+                    dataFile.seek(indexEntry.position);
+                    ByteBufferUtil.skipShortLength(dataFile); // key
+
+                    /*
+                     * Not directly using {@link SSTableIdentityIterator#create(SSTableReader, FileDataInput, DecoratedKey)},
+                     * because the file positions of the partition level deletion and of the static row must be captured here.
+                     */
+                    long partitionDeletionPosition = dataFile.getFilePointer();
+                    DeletionTime partitionLevelDeletion = DeletionTime.serializer.deserialize(dataFile);
+                    long staticRowPosition = dataFile.getFilePointer();
+
+                    indexWriter.partitionLevelDeletion(partitionLevelDeletion, partitionDeletionPosition);
+
+                    DeserializationHelper helper = new DeserializationHelper(sstable.metadata(), sstable.descriptor.version.correspondingMessagingVersion(), DeserializationHelper.Flag.LOCAL);
+
+                    try (SSTableSimpleIterator iterator = SSTableSimpleIterator.create(sstable.metadata(), dataFile, sstable.header, helper, partitionLevelDeletion);
+                         SSTableIdentityIterator partition = new SSTableIdentityIterator(sstable, key, partitionLevelDeletion, sstable.getFilename(), iterator))
+                    {
+                        // if the table has static columns, the static row has to be indexed separately, at its own position
+                        if (metadata.hasStaticColumns())
+                            indexWriter.nextUnfilteredCluster(partition.staticRow(), staticRowPosition);
+
+                        while (partition.hasNext())
+                        {
+                            long unfilteredPosition = dataFile.getFilePointer(); // captured before next() reads the unfiltered
+                            indexWriter.nextUnfilteredCluster(partition.next(), unfilteredPosition);
+                        }
+                    }
+
+                    bytesProcessed += keyPosition - previousKeyPosition;
+                    previousKeyPosition = keyPosition;
+                }
+
+                completeSSTable(indexWriter, sstable, indexes, perSSTableFileLock);
+            }
+
+            return false;
+        }
+        catch (Throwable t)
+        {
+            if (indexWriter != null)
+            {
+                indexWriter.abort(t, true);
+            }
+
+            if (t instanceof InterruptedException)
+            {
+                // TODO: Is there anything that makes more sense than just restoring the interrupt?
+                logger.warn(logMessage("Interrupted while building indexes {} on SSTable {}"), indexes, sstable.descriptor);
+                Thread.currentThread().interrupt();
+                return true;
+            }
+            else if (t instanceof CompactionInterruptedException)
+            {
+                if (isInitialBuild && trigger() != TRUNCATE)
+                {
+                    logger.error(logMessage("Stop requested while building initial indexes {} on SSTable {}."), indexes, sstable.descriptor);
+                    throw Throwables.unchecked(t);
+                }
+                else
+                {
+                    logger.info(logMessage("Stop requested while building indexes {} on SSTable {}."), indexes, sstable.descriptor);
+                    return true;
+                }
+            }
+            else
+            {
+                logger.error(logMessage("Unable to build indexes {} on SSTable {}. Cause: {}."), indexes, sstable, t.getMessage(), t); // trailing throwable keeps the stack trace in the log
+                throw Throwables.unchecked(t);
+            }
+        }
+        finally
+        {
+            ref.release();
+            // always unregister and release this builder's lock, error or not; counting down again after completeSSTable() is a no-op
+            if (perSSTableFileLock != null)
+            {
+                inProgress.remove(sstable);
+                perSSTableFileLock.countDown();
+            }
+        }
+    }
+
+    @Override
+    public CompactionInfo getCompactionInfo()
+    {
+        return new CompactionInfo(metadata,
+                                  OperationType.INDEX_BUILD,
+                                  bytesProcessed, // progress accumulated by indexSSTable()
+                                  totalSizeInBytes, // sum of uncompressed SSTable lengths, computed in the constructor
+                                  compactionId,
+                                  sstables.keySet());
+    }
+
+    /**
+     * Decides whether this builder has to write the per-SSTable index files of the given SSTable.
+     * @return a latch registered in {@code inProgress} if this builder must write them, or null if they need no rewrite or another builder has claimed them
+     */
+    private CountDownLatch shouldWriteTokenOffsetFiles(SSTableReader sstable)
+    {
+        // per-SSTable files that are complete and, on a full rebuild, pass checksum validation are not written again
+        boolean complete = IndexComponents.isGroupIndexComplete(sstable.descriptor);
+        if (complete && (!isFullRebuild || IndexComponents.perSSTable(sstable).validatePerSSTableComponentsChecksum()))
+            return null;
+
+        // try to acquire the lock, so only one builder will generate the per-SSTable index files
+        CountDownLatch claim = new CountDownLatch(1);
+        if (inProgress.putIfAbsent(sstable, claim) != null)
+            return null;
+
+        // lock owner should cleanup existing per-SSTable files
+        group.deletePerSSTableFiles(Collections.singleton(sstable));
+        return claim;
+    }
+
+    private void completeSSTable(SSTableFlushObserver indexWriter,
+                                 SSTableReader sstable,
+                                 Set<StorageAttachedIndex> indexes,
+                                 CountDownLatch latch) throws InterruptedException
+    {
+        indexWriter.complete();
+
+        if (latch == null)
+        {
+            /*
+             * This builder does not own the lock. If nothing is registered, the per sstable index files are already
+             * created and we just proceed; if another builder holds a lock, wait for it before marking the index built.
+             */
+            CountDownLatch owner = inProgress.get(sstable);
+            if (owner != null)
+                owner.await();
+        }
+        else
+        {
+            // current builder owns the lock: release the builders waiting on it
+            latch.countDown();
+        }
+
+        Set<StorageAttachedIndex> remaining = validateIndexes(indexes, sstable.descriptor);
+        if (remaining.isEmpty())
+        {
+            logger.debug(logMessage("{} dropped during index build"), indexes);
+            return;
+        }
+
+        // register custom index components into existing sstables, then let the group update the index views
+        sstable.registerComponents(group.getComponents(remaining), tracker);
+        Set<StorageAttachedIndex> failed = group.onSSTableChanged(Collections.emptyList(), Collections.singleton(sstable), remaining, false, false);
+
+        if (failed.isEmpty())
+            return;
+
+        // If this occurs during an initial index build, there is only one index in play, and
+        // throwing here to terminate makes sense. (This allows the initialization task to fail
+        // correctly and be marked as failed by the SIM.) In other cases, such as rebuilding a
+        // set of indexes for newly added/streamed SSTables, we terminate pessimistically. In
+        // other words, we abort the SSTable index write across all column indexes and mark
+        // them non-queryable until a restart or other incremental rebuild occurs.
+        throw new RuntimeException(logMessage("Failed to update views on column indexes " + failed + " on indexes " + indexes + "."));
+    }
+
+    /**
+     *  Returns the requested indexes that are still part of the group. In case of full rebuild, the index build is
+     *  stopped if any index is dropped; otherwise dropped indexes are skipped to avoid exception during repair/streaming.
+     */
+    private Set<StorageAttachedIndex> validateIndexes(Set<StorageAttachedIndex> indexes, Descriptor descriptor)
+    {
+        Set<StorageAttachedIndex> stillRegistered = new HashSet<>();
+        Set<StorageAttachedIndex> dropped = new HashSet<>();
+
+        // split the requested indexes by whether the group still contains them
+        for (StorageAttachedIndex candidate : indexes)
+            (group.containsIndex(candidate) ? stillRegistered : dropped).add(candidate);
+
+        if (dropped.isEmpty())
+            return stillRegistered;
+
+        // names of the dropped indexes, for the exception or log message below
+        String droppedIndexes = dropped.stream().map(sai -> sai.getContext().getIndexName()).collect(Collectors.toList()).toString();
+
+        if (isFullRebuild)
+        {
+            throw new RuntimeException(logMessage(String.format("%s are dropped, will stop index build.", droppedIndexes)));
+        }
+
+        // not a full rebuild: report the skipped indexes and carry on with the remaining ones
+        logger.debug(logMessage("Skip building dropped index {} on sstable {}"), droppedIndexes, descriptor.baseFilename());
+        return stillRegistered;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
new file mode 100644
index 000000000000..6355fe875708
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
@@ -0,0 +1,425 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai;
+
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.WriteContext;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.lifecycle.Tracker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.IndexGroupMetrics;
+import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.index.sai.metrics.TableStateMetrics;
+import org.apache.cassandra.index.sai.plan.StorageAttachedIndexQueryPlan;
+import org.apache.cassandra.index.transactions.IndexTransaction;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.notifications.INotification;
+import org.apache.cassandra.notifications.INotificationConsumer;
+import org.apache.cassandra.notifications.MemtableDiscardedNotification;
+import org.apache.cassandra.notifications.MemtableRenewedNotification;
+import org.apache.cassandra.notifications.SSTableAddedNotification;
+import org.apache.cassandra.notifications.SSTableListChangedNotification;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.Throwables;
+
+/**
+ * Orchestrates building of storage-attached indices, and manages lifecycle of resources shared between them.
+ */
+public class StorageAttachedIndexGroup implements Index.Group, INotificationConsumer, Iterable<StorageAttachedIndex>
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final TableQueryMetrics queryMetrics; // released in invalidate()
+    private final TableStateMetrics stateMetrics; // released in invalidate()
+    private final IndexGroupMetrics groupMetrics; // released in invalidate()
+    // indexes currently registered with this group; a plain HashSet with no synchronization of its own
+    private final Set<StorageAttachedIndex> indices = new HashSet<>();
+    private final ColumnFamilyStore baseCfs;
+
+    private final SSTableContextManager contextManager;
+
+    StorageAttachedIndexGroup(ColumnFamilyStore baseCfs)
+    {
+        this.baseCfs = baseCfs;
+        this.queryMetrics = new TableQueryMetrics(baseCfs.metadata());
+        this.stateMetrics = new TableStateMetrics(baseCfs.metadata(), this);
+        this.groupMetrics = new IndexGroupMetrics(baseCfs.metadata(), this);
+        this.contextManager = new SSTableContextManager();
+
+        // subscribe to the base table's tracker so that handleNotification() receives its notifications
+        baseCfs.getTracker().subscribe(this);
+    }
+
+    @Nullable
+    public static StorageAttachedIndexGroup getIndexGroup(ColumnFamilyStore cfs)
+    {
+        return (StorageAttachedIndexGroup)cfs.indexManager.getIndexGroup(StorageAttachedIndexGroup.class); // NOTE(review): presumably null when no such group is registered — confirm
+    }
+
+    @Override
+    public Set<Index> getIndexes()
+    {
+        return ImmutableSet.copyOf(indices); // immutable snapshot: later changes to the group are not reflected in it
+    }
+
+    @Override
+    public void addIndex(Index index)
+    {
+        assert index instanceof StorageAttachedIndex; // checked only with assertions enabled; the cast below fails otherwise
+        indices.add((StorageAttachedIndex) index);
+    }
+
+    @Override
+    public void removeIndex(Index index)
+    {
+        assert index instanceof StorageAttachedIndex;
+        boolean wasPresent = indices.remove(index);
+        assert wasPresent : "Cannot remove non-existing index " + index;
+        /*
+         * per index files are dropped via {@link StorageAttachedIndex#getInvalidateTask()}; the per-SSTable files
+         * and the tracker subscription are only dropped below, once the last index of the group is removed
+         */
+        if (!indices.isEmpty())
+            return;
+
+        Set<Component> toRemove = new HashSet<>(IndexComponents.PER_SSTABLE_COMPONENTS);
+        for (SSTableReader sstable : contextManager.sstables())
+            sstable.unregisterComponents(toRemove, baseCfs.getTracker());
+
+        deletePerSSTableFiles(baseCfs.getLiveSSTables());
+        baseCfs.getTracker().unsubscribe(this);
+    }
+
+    @Override
+    public void invalidate()
+    {
+        // When the table is dropped, the SSTable contexts should already have been removed by SSTableListChangedNotification, so only metrics and the tracker subscription are released here.
+        queryMetrics.release();
+        groupMetrics.release();
+        stateMetrics.release();
+        baseCfs.getTracker().unsubscribe(this);
+    }
+
+    @Override
+    public boolean supportsMultipleContains()
+    {
+        return true; // NOTE(review): presumably advertises support for several CONTAINS restrictions in one query — confirm against Index.Group
+    }
+
+    @Override
+    public boolean containsIndex(Index index)
+    {
+        return index instanceof StorageAttachedIndex && indices.contains(index); // false for null and for any other index implementation
+    }
+
+    @SuppressWarnings("NullableProblems")
+    @Override
+    public Iterator<StorageAttachedIndex> iterator()
+    {
+        return indices.iterator(); // iterates the backing set directly, not a copy
+    }
+
+    @Override
+    public Index.Indexer indexerFor(Predicate<Index> indexSelector,
+                                    DecoratedKey key,
+                                    RegularAndStaticColumns columns,
+                                    int nowInSec,
+                                    WriteContext ctx,
+                                    IndexTransaction.Type transactionType,
+                                    Memtable memtable)
+    {
+        // gather one indexer per selected index, leaving out the indexes that return no indexer for this transaction
+        final Set<Index.Indexer> delegates = new HashSet<>();
+        for (StorageAttachedIndex index : indices)
+        {
+            Index.Indexer delegate = indexSelector.test(index) ? index.indexerFor(key, columns, nowInSec, ctx, transactionType, memtable) : null;
+            if (delegate != null)
+                delegates.add(delegate);
+        }
+
+        if (delegates.isEmpty())
+            return null;
+        return new StorageAttachedIndex.IndexerAdapter()
+        {
+            @Override
+            public void insertRow(Row row)
+            {
+                delegates.forEach(delegate -> delegate.insertRow(row));
+            }
+
+            @Override
+            public void updateRow(Row oldRow, Row newRow)
+            {
+                delegates.forEach(delegate -> delegate.updateRow(oldRow, newRow));
+            }
+        };
+    }
+
+    @Override
+    public StorageAttachedIndexQueryPlan queryPlanFor(RowFilter rowFilter)
+    {
+        return StorageAttachedIndexQueryPlan.create(baseCfs, queryMetrics, indices, rowFilter); // planning is delegated entirely to the query plan class
+    }
+
+    @Override
+    public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker, TableMetadata tableMetadata)
+    {
+        try
+        {
+            return new StorageAttachedIndexWriter(descriptor, indices, tracker, tableMetadata.params.compression);
+        }
+        catch (Throwable t) // deliberately broad: a failure to create the writer is logged and handled below instead of being propagated
+        {
+            String message = "Unable to create storage-attached index writer on SSTable flush. All indexes from this table are going to be marked as non-queryable and will need to be rebuilt.";
+            logger.error(String.format("[%s.%s.*] %s", descriptor.ksname, descriptor.cfname, message), t);
+            indices.forEach(StorageAttachedIndex::makeIndexNonQueryable);
+            return null; // no observer is returned for this flush
+        }
+    }
+
+    @Override
+    public boolean handles(IndexTransaction.Type type)
+    {
+        // only UPDATE transactions are handled, which skips CleanupGCTransaction and IndexGCTransaction
+        return type == IndexTransaction.Type.UPDATE;
+    }
+
+    @Override
+    public Set<Component> getComponents()
+    {
+        return getComponents(indices); // per-SSTable components plus the components of every index currently in the group
+    }
+
+    static Set<Component> getComponents(Collection<StorageAttachedIndex> indices)
+    {
+        Set<Component> all = new HashSet<>(IndexComponents.PER_SSTABLE_COMPONENTS); // start from the per-SSTable components
+        indices.stream().map(StorageAttachedIndex::getComponents).forEach(all::addAll);
+        return all;
+    }
+
+    @Override
+    public void handleNotification(INotification notification, Object sender)
+    {
+        // unfortunately, we can only check the type of notification via instanceof :( (other notification types are ignored)
+        if (notification instanceof SSTableAddedNotification)
+        {
+            SSTableAddedNotification notice = (SSTableAddedNotification) notification;
+
+            // Avoid validation for index files just written following Memtable flush (memtable present in the notice).
+            // SSTables added any other way, e.g. ZCS streaming, should have their index checksum validated.
+            boolean validate = !notice.memtable().isPresent();
+            onSSTableChanged(Collections.emptySet(), notice.added, indices, validate, false);
+        }
+        else if (notification instanceof SSTableListChangedNotification)
+        {
+            SSTableListChangedNotification notice = (SSTableListChangedNotification) notification;
+
+            // Avoid validation for index files just written during compaction.
+            onSSTableChanged(notice.removed, notice.added, indices, false, false);
+        }
+        else if (notification instanceof MemtableRenewedNotification)
+        {
+            indices.forEach(index -> index.getContext().renewMemtable(((MemtableRenewedNotification) notification).renewed)); // forwarded to every index's context
+        }
+        else if (notification instanceof MemtableDiscardedNotification)
+        {
+            indices.forEach(index -> index.getContext().discardMemtable(((MemtableDiscardedNotification) notification).memtable)); // forwarded to every index's context
+        }
+    }
+
+    void deletePerSSTableFiles(Collection<SSTableReader> sstables)
+    {
+        contextManager.release(sstables); // NOTE(review): presumably drops the tracked contexts before their files are deleted — confirm
+        sstables.forEach(sstableReader -> IndexComponents.deletePerSSTableIndexComponents(sstableReader.descriptor));
+    }
+
+    void dropIndexSSTables(Collection<SSTableReader> ss, StorageAttachedIndex index)
+    {
+        try
+        {
+            index.getContext().drop(ss); // the drop itself is delegated to the index's ColumnContext
+        }
+        catch (Throwable t)
+        {
+            // Mark the index non-queryable, as its view may be compromised, then rethrow the failure unchecked.
+            index.makeIndexNonQueryable();
+
+            throw Throwables.unchecked(t);
+        }
+    }
+
+    /**
+     * This method is synchronized to avoid concurrent initialization tasks validating same per-SSTable files.
+     *
+     * @return a fresh set (never the group's internal one) of the column indexes that were marked as non-queryable
+     * as a result of their per-SSTable index files being corrupt or being unable to successfully update their views
+     */
+    synchronized Set<StorageAttachedIndex> onSSTableChanged(Collection<SSTableReader> removed, Iterable<SSTableReader> added,
+                                                            Set<StorageAttachedIndex> indexes, boolean validate, boolean rename)
+    {
+        Pair<Set<SSTableContext>, Set<SSTableReader>> results = contextManager.update(removed, added, validate);
+
+        if (!results.right.isEmpty())
+        {
+            results.right.forEach(sstable -> {
+                IndexComponents.deletePerSSTableIndexComponents(sstable.descriptor);
+                // Column indexes are invalid if their SSTable-level components are corrupted so delete
+                // their associated index files and mark them non-queryable (all indexes of the group, not only `indexes`).
+                indices.forEach(index -> {
+                    index.deleteIndexFiles(sstable);
+                    index.makeIndexNonQueryable();
+                });
+            });
+            return new HashSet<>(indices); // snapshot: never hand out the live internal set
+        }
+
+        Set<StorageAttachedIndex> incomplete = new HashSet<>();
+
+        for (StorageAttachedIndex index : indexes)
+        {
+            Set<SSTableContext> invalid = index.getContext().onSSTableChanged(removed, results.left, validate, rename);
+
+            if (!invalid.isEmpty())
+            {
+                // Delete the index files and mark the index non-queryable, as its view may be compromised,
+                // and incomplete, for our callers:
+                invalid.forEach(context -> index.deleteIndexFiles(context.sstable()));
+                index.makeIndexNonQueryable();
+                incomplete.add(index);
+            }
+        }
+        return incomplete;
+    }
+
+    /**
+     * Counts open index files from the tracked {@link SSTableContext}s and {@link SSTableIndex}es,
+     * so transient open files during validation and files that are still open for in-flight requests will not be tracked.
+     *
+     * @return total number of open files for all {@link StorageAttachedIndex}es.
+     */
+    public int openIndexFiles()
+    {
+        return contextManager.openFiles() + indices.stream().mapToInt(index -> index.getContext().openPerIndexFiles()).sum();
+    }
+
+    /**
+     * @return total disk usage of all per-sstable index files, as reported by the SSTable context manager
+     */
+    public long diskUsage()
+    {
+        return contextManager.diskUsage();
+    }
+
+    /**
+     * @return count of indexes in this group that the table's index manager reports as building (looked up by index name)
+     */
+    public int totalIndexBuildsInProgress()
+    {
+        return (int) indices.stream().filter(i -> baseCfs.indexManager.isIndexBuilding(i.getIndexMetadata().name)).count();
+    }
+
+    /**
+     * @return count of indexes in this group that the table's index manager reports as queryable
+     */
+    public int totalQueryableIndexCount()
+    {
+        return (int) indices.stream().filter(i -> baseCfs.indexManager.isIndexQueryable(i)).count();
+    }
+
+    /**
+     * @return count of indexes currently registered with this group, whatever their state
+     */
+    public int totalIndexCount()
+    {
+        return indices.size();
+    }
+
+    /**
+     * @return total disk usage of all per-sstable index files ({@link #diskUsage()}) and per-column index files of every index's current view
+     */
+    public long totalDiskUsage()
+    {
+        return diskUsage() + indices.stream().flatMap(i -> i.getContext().getView().getIndexes().stream())
+                                    .mapToLong(SSTableIndex::sizeOfPerColumnComponents).sum();
+    }
+
+    public TableMetadata metadata()
+    {
+        return baseCfs.metadata(); // read from the base table on every call, not cached here
+    }
+
+    public ColumnFamilyStore table()
+    {
+        return baseCfs; // the base table this group was created for
+    }
+
+    @VisibleForTesting
+    public SSTableContextManager sstableContextManager()
+    {
+        return contextManager; // the live manager instance, exposed for tests only
+    }
+
+    /**
+     * Simulates index loading on restart with index file validation: drops all live SSTables, then re-adds them with validation.
+     */
+    @VisibleForTesting
+    public void unsafeReload()
+    {
+        contextManager.clear();
+        onSSTableChanged(baseCfs.getLiveSSTables(), Collections.emptySet(), indices, false, false);
+        onSSTableChanged(Collections.emptySet(), baseCfs.getLiveSSTables(), indices, true, true);
+    }
+
+    /**
+     * Simulates the index going through a node restart: contexts cleared, indexes non-queryable, live SSTables dropped.
+     */
+    @VisibleForTesting
+    public void reset()
+    {
+        contextManager.clear();
+        indices.forEach(StorageAttachedIndex::makeIndexNonQueryable);
+        onSSTableChanged(baseCfs.getLiveSSTables(), Collections.emptySet(), indices, false, false);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/Token.java b/src/java/org/apache/cassandra/index/sai/Token.java
new file mode 100644
index 000000000000..b73e61991778
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/Token.java
@@ -0,0 +1,153 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.memory.InMemoryToken;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.utils.MergeIterator;
+
+/**
+ * A container that exposes an iterator of {@link DecoratedKey} from disk. It exists primarily
+ * because multiple partition keys may hash to the same token.
+ */
+@NotThreadSafe
+public abstract class Token implements Comparable<Token>
+{
+    protected long token;
+
+    public Token(long token)
+    {
+        this.token = token;
+    }
+
+    /**
+     * Returns a boxed {@code Long} rather than a primitive {@code long}, because {@link RangeIterator} is based on Long and uses null to represent non-existing min/max.
+     */
+    public Long get()
+    {
+        return token;
+    }
+
+    @VisibleForTesting
+    public long getLong()
+    {
+        return token;
+    }
+
+    /** @return an iterator over the {@link DecoratedKey}s associated with this token */
+    public abstract Iterator<DecoratedKey> keys();
+
+    @Override
+    public int compareTo(Token o)
+    {
+        return Long.compare(token, o.token); // ordering uses the raw token value only; keys are not consulted
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this).add("token", token).toString();
+    }
+
+    /**
+     * This interface makes it possible for us to inject custom tokens and mergers in our tests.
+     */
+    public interface TokenMerger
+    {
+        void add(Token other);
+
+        Token merge();
+
+        default void reset() {} // optional hook; the default implementation does nothing
+    }
+
+    @NotThreadSafe
+    public static class ReusableTokenMerger implements TokenMerger
+    {
+        private final MergeIterator.Reducer<DecoratedKey, DecoratedKey> reducer = MergeIterator.getIdentity();
+        private final List<Iterator<DecoratedKey>> keyIterators;
+        private final List<Token> tokens;
+
+        private Token firstToken; // first token added since the last reset(); null when nothing has been added
+
+        public ReusableTokenMerger(int capacity)
+        {
+            keyIterators = new ArrayList<>(capacity);
+            tokens = new ArrayList<>(capacity);
+        }
+
+        @Override
+        public void add(Token token)
+        {
+            if (token == null) return; // null inputs are silently ignored
+
+            if (tokens.isEmpty())
+            {
+                firstToken = token;
+            }
+            else
+            {
+                assert firstToken.token == token.token : "Adding keys with a different token!";
+            }
+
+            tokens.add(token);
+        }
+
+        @Override
+        public Token merge()
+        {
+            assert firstToken != null : "No tokens have been added to this merger!";
+
+            if (tokens.size() == 1) return firstToken; // nothing to merge: hand back the sole token as-is
+
+            // We don't materialize keys until we know a merge is necessary.
+            for (Token token : tokens)
+            {
+                keyIterators.add(token.keys());
+            }
+
+            return new InMemoryToken(firstToken.token, MergeIterator.get(keyIterators, DecoratedKey.comparator, reducer));
+        }
+
+        /** Clears the state of the merger, preparing it to consume a new group of {@link Token}s. */
+        @Override
+        public void reset()
+        {
+            keyIterators.clear();
+            tokens.clear();
+            firstToken = null;
+            reducer.onKeyChange();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
new file mode 100644
index 000000000000..e415bae48cc2
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/AbstractAnalyzer.java
@@ -0,0 +1,116 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NoSuchElementException;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+public abstract class AbstractAnalyzer implements Iterator<ByteBuffer>
+{
+    public static final Set<AbstractType<?>> ANALYZABLE_TYPES = ImmutableSet.of(UTF8Type.instance, AsciiType.instance);
+
+    protected ByteBuffer next = null; // raw value buffered by hasNext(); null when nothing is buffered
+    String nextLiteral = null; // optional string form of the buffered value; see nextLiteral(AbstractType)
+
+    /**
+     * @return true if index value is transformed, e.g. normalized or lower-cased or tokenized.
+     */
+    public abstract boolean transformValue();
+
+    /**
+     * Note: This method does not advance, as we rely on {@link #hasNext()} to buffer the next value.
+     * @return the raw value currently buffered by this iterator
+     * @throws NoSuchElementException if no value is currently buffered
+     */
+    public ByteBuffer next()
+    {
+        if (next == null)
+            throw new NoSuchElementException();
+        return next;
+    }
+
+    /**
+     * Note: This method does not advance, as we rely on {@link #hasNext()} to buffer the next value.
+     * @param validator type used to render the buffered raw value when no string form has been buffered
+     * @return the string value currently buffered by this iterator
+     */
+    public String nextLiteral(AbstractType<?> validator)
+    {
+        if (nextLiteral != null)
+        {
+            return nextLiteral;
+        }
+
+        assert next != null;
+        return TypeUtil.getString(next, validator);
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException(); // removal is not supported by analyzers
+    }
+
+    protected abstract void resetInternal(ByteBuffer input); // subclass hook invoked by reset(ByteBuffer) after the buffered state is cleared
+
+    public void reset(ByteBuffer input)
+    {
+        this.next = null; // drop whatever the previous input left buffered
+        this.nextLiteral = null;
+
+        resetInternal(input);
+    }
+
+    public static AbstractAnalyzer fromOptions(AbstractType<?> type, Map<String, String> options)
+    {
+        if (hasNonTokenizingOptions(options))
+        {
+            if (TypeUtil.isIn(type, ANALYZABLE_TYPES))
+            {
+                return new NonTokenizingAnalyzer(type, options);
+            }
+            else
+            {
+                throw new InvalidRequestException("CQL type " + type.asCQL3Type() + " cannot be analyzed.");
+            }
+        }
+        return new NoOpAnalyzer(); // no analyzer options present: fall back to the pass-through analyzer
+    }
+
+    private static boolean hasNonTokenizingOptions(Map<String, String> options)
+    {
+        return options.get(NonTokenizingOptions.ASCII) != null || options.containsKey(NonTokenizingOptions.CASE_SENSITIVE) || options.containsKey(NonTokenizingOptions.NORMALIZE); // NOTE(review): ASCII is tested with get() != null while the other two use containsKey() - confirm this is intended
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/NoOpAnalyzer.java b/src/java/org/apache/cassandra/index/sai/analyzer/NoOpAnalyzer.java
new file mode 100644
index 000000000000..d8ae78ae6297
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/NoOpAnalyzer.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+
+import com.google.common.base.MoreObjects;
+
+/**
+ * Default no-op analyzer. After each reset the iterator yields
+ * the unmodified input exactly once.
+ */
+public class NoOpAnalyzer extends AbstractAnalyzer
+{
+    private ByteBuffer input;
+    private boolean hasNext = false; // true only between a reset and the first hasNext() call that follows it
+
+    @SuppressWarnings("unused")
+    NoOpAnalyzer() {}
+
+    @Override
+    public boolean hasNext()
+    {
+        if (hasNext)
+        {
+            this.next = input; // buffer the input so next() can return it
+            this.hasNext = false; // the input is yielded only once per reset
+            return true;
+        }
+        this.next = null; // exhausted: the inherited next() now throws NoSuchElementException
+        return false;
+    }
+
+    @Override
+    protected void resetInternal(ByteBuffer input)
+    {
+        this.input = input;
+        this.hasNext = true; // arm the iterator for a single pass over the new input
+    }
+
+    @Override
+    public boolean transformValue()
+    {
+        return false; // the input is handed back unmodified
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this).toString();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
new file mode 100644
index 000000000000..b493ffab3f57
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzer.java
@@ -0,0 +1,160 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.analyzer.filter.BasicResultFilters;
+import org.apache.cassandra.index.sai.analyzer.filter.FilterPipelineBuilder;
+import org.apache.cassandra.index.sai.analyzer.filter.FilterPipelineExecutor;
+import org.apache.cassandra.index.sai.analyzer.filter.FilterPipelineTask;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Analyzer that does *not* tokenize the input. Optionally applies the filters selected
+ * by the analyzer options (lower-casing, normalization, ASCII folding) to the input.
+ */
+public class NonTokenizingAnalyzer extends AbstractAnalyzer
+{
+    private static final Logger logger = LoggerFactory.getLogger(NonTokenizingAnalyzer.class);
+
+    private final AbstractType<?> type;
+    private final NonTokenizingOptions options;
+    private final FilterPipelineTask filterPipeline;
+
+    private ByteBuffer input;
+    private boolean hasNext = false; // true only between a reset and the first hasNext() call that follows it
+
+    NonTokenizingAnalyzer(AbstractType<?> type, Map<String, String> options)
+    {
+        this(type, NonTokenizingOptions.fromMap(options));
+    }
+
+    NonTokenizingAnalyzer(AbstractType<?> type, NonTokenizingOptions tokenizerOptions)
+    {
+        this.type = type;
+        this.options = tokenizerOptions;
+        this.filterPipeline = getFilterPipeline(); // reads this.options, so it must be assigned last
+    }
+
+    @Override
+    public boolean hasNext()
+    {
+        // check that we know how to handle the input, otherwise bail
+        if (!TypeUtil.isIn(type, ANALYZABLE_TYPES)) return false;
+
+        if (hasNext)
+        {
+            try
+            {
+                String literal = type.getString(this.input);
+
+                if (literal == null)
+                {
+                    throw new MarshalException(String.format("'null' deserialized value for %s with %s",
+                                                             ByteBufferUtil.bytesToHex(this.input), type));
+                }
+
+                String result = FilterPipelineExecutor.execute(filterPipeline, literal);
+
+                if (result == null)
+                {
+                    nextLiteral = null;
+                    next = null;
+                    return false;
+                }
+
+                nextLiteral = result;
+                next = type.fromString(result);
+
+                return true;
+            }
+            catch (MarshalException e)
+            {
+                logger.error("Failed to deserialize value with {}", type, e); // logged, then reported as "no value"
+                return false;
+            }
+            finally
+            {
+                hasNext = false; // the input is consumed at most once per reset, whatever the outcome
+            }
+        }
+
+        return false;
+    }
+
+    @Override
+    public boolean transformValue()
+    {
+        return !options.isCaseSensitive() || options.isNormalized() || options.isAscii();
+    }
+
+    @Override
+    protected void resetInternal(ByteBuffer input)
+    {
+        this.input = input;
+        this.hasNext = true;
+    }
+
+    private FilterPipelineTask getFilterPipeline()
+    {
+        FilterPipelineBuilder builder = new FilterPipelineBuilder(new BasicResultFilters.NoOperation());
+        // Filters are added in this order: lower-case, normalize, ASCII-fold.
+        if (!options.isCaseSensitive())
+        {
+            builder = builder.add("to_lower", new BasicResultFilters.LowerCase());
+        }
+
+        if (options.isNormalized())
+        {
+            builder = builder.add("normalize", new BasicResultFilters.Normalize());
+        }
+
+        if (options.isAscii())
+        {
+            builder = builder.add("ascii", new BasicResultFilters.Ascii());
+        }
+
+        return builder.build();
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("caseSensitive", options.isCaseSensitive())
+                          .add("normalized", options.isNormalized())
+                          .add("ascii", options.isAscii())
+                          .toString();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
new file mode 100644
index 000000000000..b13ce12b687a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/NonTokenizingOptions.java
@@ -0,0 +1,162 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.base.Strings;
+
+import org.apache.cassandra.exceptions.InvalidRequestException;
+
+public class NonTokenizingOptions
+{
+    public static final String NORMALIZE = "normalize"; // option key for the boolean "normalized" flag
+    public static final String CASE_SENSITIVE = "case_sensitive"; // option key for the boolean "caseSensitive" flag
+    public static final String ASCII = "ascii"; // option key for the boolean "ascii" flag
+
+    private boolean caseSensitive;
+    private boolean normalized;
+    private boolean ascii;
+
+    boolean isCaseSensitive()
+    {
+        return caseSensitive;
+    }
+
+    void setCaseSensitive(boolean caseSensitive)
+    {
+        this.caseSensitive = caseSensitive;
+    }
+
+    boolean isNormalized()
+    {
+        return this.normalized;
+    }
+
+    void setAscii(boolean ascii)
+    {
+        this.ascii = ascii;
+    }
+
+    boolean isAscii()
+    {
+        return this.ascii;
+    }
+
+    void setNormalized(boolean normalized)
+    {
+        this.normalized = normalized;
+    }
+
+    public static class OptionsBuilder
+    {
+        private boolean caseSensitive = true; // default: case is preserved
+        private boolean normalized = false; // default: no normalization
+        private boolean ascii = false; // default: no ASCII handling
+
+        OptionsBuilder() {}
+
+        OptionsBuilder caseSensitive(boolean caseSensitive)
+        {
+            this.caseSensitive = caseSensitive;
+            return this;
+        }
+
+        OptionsBuilder ascii(boolean ascii)
+        {
+            this.ascii = ascii;
+            return this;
+        }
+
+        OptionsBuilder normalized(boolean normalized)
+        {
+            this.normalized = normalized;
+            return this;
+        }
+
+        public NonTokenizingOptions build()
+        {
+            NonTokenizingOptions options = new NonTokenizingOptions();
+            options.setCaseSensitive(caseSensitive);
+            options.setNormalized(normalized);
+            options.setAscii(ascii);
+            return options;
+        }
+    }
+
+    public static NonTokenizingOptions getDefaultOptions()
+    {
+        return fromMap(new HashMap<>(1)); // empty map: every option keeps its OptionsBuilder default
+    }
+
+    public static NonTokenizingOptions fromMap(Map<String, String> options)
+    {
+        OptionsBuilder builder = new OptionsBuilder();
+
+        for (Map.Entry<String, String> entry : options.entrySet())
+        {
+            switch (entry.getKey()) // keys other than the three option names are ignored
+            {
+                case CASE_SENSITIVE:
+                {
+                    boolean boolValue = validateBoolean(entry.getValue(), CASE_SENSITIVE);
+                    builder = builder.caseSensitive(boolValue);
+                    break;
+                }
+
+                case NORMALIZE:
+                {
+                    boolean boolValue = validateBoolean(entry.getValue(), NORMALIZE);
+                    builder = builder.normalized(boolValue);
+                    break;
+                }
+
+                case ASCII:
+                {
+                    boolean boolValue = validateBoolean(entry.getValue(), ASCII);
+                    builder = builder.ascii(boolValue);
+                    break;
+                }
+            }
+        }
+        return builder.build();
+    }
+
+    private static boolean validateBoolean(String value, String option)
+    {
+        if (Strings.isNullOrEmpty(value))
+        {
+            throw new InvalidRequestException("Empty value for boolean option '" + option + "'");
+        }
+
+        if (!value.equalsIgnoreCase(Boolean.TRUE.toString()) && !value.equalsIgnoreCase(Boolean.FALSE.toString()))
+        {
+            throw new InvalidRequestException("Illegal value for boolean option '" + option + "': " + value);
+        }
+
+        return Boolean.parseBoolean(value); // only "true"/"false" in any letter case reach this point
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFilters.java b/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFilters.java
new file mode 100644
index 000000000000..b5ad225f65fe
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFilters.java
@@ -0,0 +1,2007 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+import java.text.Normalizer;
+import java.util.Locale;
+
+/**
+ * Basic/General Token Filters
+ */
+public class BasicResultFilters
+{
+    private static final Locale DEFAULT_LOCALE = Locale.getDefault();
+
+    /** Lower-cases the input using the JVM default locale captured when {@link BasicResultFilters} was initialized. */
+    public static class LowerCase extends FilterPipelineTask
+    {
+        // NOTE(review): default-locale lower-casing is locale-sensitive (e.g. Turkish dotless i) - confirm this is intended
+        private final Locale locale = DEFAULT_LOCALE;
+
+        public LowerCase() { }
+
+        public String process(String input)
+        {
+            // Null passes through unchanged, matching the Normalize and Ascii filters.
+            return input == null ? null : input.toLowerCase(locale);
+        }
+    }
+
+    public static class Normalize extends FilterPipelineTask
+    {
+        public Normalize() { }
+
+        public String process(String input)
+        {
+            if (input == null) return null; // null passes through unchanged
+            return Normalizer.isNormalized(input, Normalizer.Form.NFC) ? input : Normalizer.normalize(input, Normalizer.Form.NFC); // input already in NFC is returned as-is; anything else is converted to NFC
+        }
+    }
+
+    public static class Ascii extends FilterPipelineTask
+    {
+        public Ascii() { }
+
+        public String process(String input)
+        {
+            if (input == null) return null; // null passes through unchanged
+            char[] inputChars = input.toCharArray();
+            // Sized at 4x the input because folding can expand one character into several ASCII characters (e.g. U+249C becomes "(a)")
+            char[] outputChars = new char[inputChars.length * 4];
+            int outputSize = foldToASCII(inputChars, 0, outputChars, 0, inputChars.length); // presumably the number of chars written to outputChars - confirm against foldToASCII
+            return new String(outputChars, 0, outputSize);
+        }
+    }
+
+    public static class NoOperation extends FilterPipelineTask
+    {
+        public String process(String input)
+        {
+            return input; // identity filter: hands the input back untouched
+        }
+    }
+
+    // copied from lucene org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter
+    public static final int foldToASCII(char input[], int inputPos, char output[], int outputPos, int length)
+    {
+        final int end = inputPos + length;
+        for (int pos = inputPos; pos < end ; ++pos) {
+            final char c = input[pos];
+
+            // Quick test: if it's not in range then just keep current character
+            if (c < '\u0080') {
+                output[outputPos++] = c;
+            } else {
+                switch (c) {
+                    case '\u00C0': // À  [LATIN CAPITAL LETTER A WITH GRAVE]
+                    case '\u00C1': // Á  [LATIN CAPITAL LETTER A WITH ACUTE]
+                    case '\u00C2': // Â  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX]
+                    case '\u00C3': // Ã  [LATIN CAPITAL LETTER A WITH TILDE]
+                    case '\u00C4': // Ä  [LATIN CAPITAL LETTER A WITH DIAERESIS]
+                    case '\u00C5': // Å  [LATIN CAPITAL LETTER A WITH RING ABOVE]
+                    case '\u0100': // Ā  [LATIN CAPITAL LETTER A WITH MACRON]
+                    case '\u0102': // Ă  [LATIN CAPITAL LETTER A WITH BREVE]
+                    case '\u0104': // Ą  [LATIN CAPITAL LETTER A WITH OGONEK]
+                    case '\u018F': // Ə  http://en.wikipedia.org/wiki/Schwa  [LATIN CAPITAL LETTER SCHWA]
+                    case '\u01CD': // Ǎ  [LATIN CAPITAL LETTER A WITH CARON]
+                    case '\u01DE': // Ǟ  [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON]
+                    case '\u01E0': // Ǡ  [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON]
+                    case '\u01FA': // Ǻ  [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE]
+                    case '\u0200': // Ȁ  [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE]
+                    case '\u0202': // Ȃ  [LATIN CAPITAL LETTER A WITH INVERTED BREVE]
+                    case '\u0226': // Ȧ  [LATIN CAPITAL LETTER A WITH DOT ABOVE]
+                    case '\u023A': // Ⱥ  [LATIN CAPITAL LETTER A WITH STROKE]
+                    case '\u1D00': // ᴀ  [LATIN LETTER SMALL CAPITAL A]
+                    case '\u1E00': // Ḁ  [LATIN CAPITAL LETTER A WITH RING BELOW]
+                    case '\u1EA0': // Ạ  [LATIN CAPITAL LETTER A WITH DOT BELOW]
+                    case '\u1EA2': // Ả  [LATIN CAPITAL LETTER A WITH HOOK ABOVE]
+                    case '\u1EA4': // Ấ  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1EA6': // Ầ  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1EA8': // Ẩ  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1EAA': // Ẫ  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE]
+                    case '\u1EAC': // Ậ  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u1EAE': // Ắ  [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE]
+                    case '\u1EB0': // Ằ  [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE]
+                    case '\u1EB2': // Ẳ  [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE]
+                    case '\u1EB4': // Ẵ  [LATIN CAPITAL LETTER A WITH BREVE AND TILDE]
+                    case '\u1EB6': // Ặ  [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW]
+                    case '\u24B6': // Ⓐ  [CIRCLED LATIN CAPITAL LETTER A]
+                    case '\uFF21': // Ａ  [FULLWIDTH LATIN CAPITAL LETTER A]
+                        output[outputPos++] = 'A';
+                        break;
+                    case '\u00E0': // à  [LATIN SMALL LETTER A WITH GRAVE]
+                    case '\u00E1': // á  [LATIN SMALL LETTER A WITH ACUTE]
+                    case '\u00E2': // â  [LATIN SMALL LETTER A WITH CIRCUMFLEX]
+                    case '\u00E3': // ã  [LATIN SMALL LETTER A WITH TILDE]
+                    case '\u00E4': // ä  [LATIN SMALL LETTER A WITH DIAERESIS]
+                    case '\u00E5': // å  [LATIN SMALL LETTER A WITH RING ABOVE]
+                    case '\u0101': // ā  [LATIN SMALL LETTER A WITH MACRON]
+                    case '\u0103': // ă  [LATIN SMALL LETTER A WITH BREVE]
+                    case '\u0105': // ą  [LATIN SMALL LETTER A WITH OGONEK]
+                    case '\u01CE': // ǎ  [LATIN SMALL LETTER A WITH CARON]
+                    case '\u01DF': // ǟ  [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON]
+                    case '\u01E1': // ǡ  [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON]
+                    case '\u01FB': // ǻ  [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE]
+                    case '\u0201': // ȁ  [LATIN SMALL LETTER A WITH DOUBLE GRAVE]
+                    case '\u0203': // ȃ  [LATIN SMALL LETTER A WITH INVERTED BREVE]
+                    case '\u0227': // ȧ  [LATIN SMALL LETTER A WITH DOT ABOVE]
+                    case '\u0250': // ɐ  [LATIN SMALL LETTER TURNED A]
+                    case '\u0259': // ə  [LATIN SMALL LETTER SCHWA]
+                    case '\u025A': // ɚ  [LATIN SMALL LETTER SCHWA WITH HOOK]
+                    case '\u1D8F': // ᶏ  [LATIN SMALL LETTER A WITH RETROFLEX HOOK]
+                    case '\u1D95': // ᶕ  [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK]
+                    case '\u1E01': // ạ  [LATIN SMALL LETTER A WITH RING BELOW]
+                    case '\u1E9A': // ả  [LATIN SMALL LETTER A WITH RIGHT HALF RING]
+                    case '\u1EA1': // ạ  [LATIN SMALL LETTER A WITH DOT BELOW]
+                    case '\u1EA3': // ả  [LATIN SMALL LETTER A WITH HOOK ABOVE]
+                    case '\u1EA5': // ấ  [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1EA7': // ầ  [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1EA9': // ẩ  [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1EAB': // ẫ  [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE]
+                    case '\u1EAD': // ậ  [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u1EAF': // ắ  [LATIN SMALL LETTER A WITH BREVE AND ACUTE]
+                    case '\u1EB1': // ằ  [LATIN SMALL LETTER A WITH BREVE AND GRAVE]
+                    case '\u1EB3': // ẳ  [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE]
+                    case '\u1EB5': // ẵ  [LATIN SMALL LETTER A WITH BREVE AND TILDE]
+                    case '\u1EB7': // ặ  [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW]
+                    case '\u2090': // ₐ  [LATIN SUBSCRIPT SMALL LETTER A]
+                    case '\u2094': // ₔ  [LATIN SUBSCRIPT SMALL LETTER SCHWA]
+                    case '\u24D0': // ⓐ  [CIRCLED LATIN SMALL LETTER A]
+                    case '\u2C65': // ⱥ  [LATIN SMALL LETTER A WITH STROKE]
+                    case '\u2C6F': // Ɐ  [LATIN CAPITAL LETTER TURNED A]
+                    case '\uFF41': // ａ  [FULLWIDTH LATIN SMALL LETTER A]
+                        output[outputPos++] = 'a';
+                        break;
+                    case '\uA732': // Ꜳ  [LATIN CAPITAL LETTER AA]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'A';
+                        break;
+                    case '\u00C6': // Æ  [LATIN CAPITAL LETTER AE]
+                    case '\u01E2': // Ǣ  [LATIN CAPITAL LETTER AE WITH MACRON]
+                    case '\u01FC': // Ǽ  [LATIN CAPITAL LETTER AE WITH ACUTE]
+                    case '\u1D01': // ᴁ  [LATIN LETTER SMALL CAPITAL AE]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'E';
+                        break;
+                    case '\uA734': // Ꜵ  [LATIN CAPITAL LETTER AO]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'O';
+                        break;
+                    case '\uA736': // Ꜷ  [LATIN CAPITAL LETTER AU]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'U';
+                        break;
+                    case '\uA738': // Ꜹ  [LATIN CAPITAL LETTER AV]
+                    case '\uA73A': // Ꜻ  [LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'V';
+                        break;
+                    case '\uA73C': // Ꜽ  [LATIN CAPITAL LETTER AY]
+                        output[outputPos++] = 'A';
+                        output[outputPos++] = 'Y';
+                        break;
+                    case '\u249C': // ⒜  [PARENTHESIZED LATIN SMALL LETTER A]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\uA733': // ꜳ  [LATIN SMALL LETTER AA]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'a';
+                        break;
+                    case '\u00E6': // æ  [LATIN SMALL LETTER AE]
+                    case '\u01E3': // ǣ  [LATIN SMALL LETTER AE WITH MACRON]
+                    case '\u01FD': // ǽ  [LATIN SMALL LETTER AE WITH ACUTE]
+                    case '\u1D02': // ᴂ  [LATIN SMALL LETTER TURNED AE]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'e';
+                        break;
+                    case '\uA735': // ꜵ  [LATIN SMALL LETTER AO]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'o';
+                        break;
+                    case '\uA737': // ꜷ  [LATIN SMALL LETTER AU]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'u';
+                        break;
+                    case '\uA739': // ꜹ  [LATIN SMALL LETTER AV]
+                    case '\uA73B': // ꜻ  [LATIN SMALL LETTER AV WITH HORIZONTAL BAR]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'v';
+                        break;
+                    case '\uA73D': // ꜽ  [LATIN SMALL LETTER AY]
+                        output[outputPos++] = 'a';
+                        output[outputPos++] = 'y';
+                        break;
+                    case '\u0181': // Ɓ  [LATIN CAPITAL LETTER B WITH HOOK]
+                    case '\u0182': // Ƃ  [LATIN CAPITAL LETTER B WITH TOPBAR]
+                    case '\u0243': // Ƀ  [LATIN CAPITAL LETTER B WITH STROKE]
+                    case '\u0299': // ʙ  [LATIN LETTER SMALL CAPITAL B]
+                    case '\u1D03': // ᴃ  [LATIN LETTER SMALL CAPITAL BARRED B]
+                    case '\u1E02': // Ḃ  [LATIN CAPITAL LETTER B WITH DOT ABOVE]
+                    case '\u1E04': // Ḅ  [LATIN CAPITAL LETTER B WITH DOT BELOW]
+                    case '\u1E06': // Ḇ  [LATIN CAPITAL LETTER B WITH LINE BELOW]
+                    case '\u24B7': // Ⓑ  [CIRCLED LATIN CAPITAL LETTER B]
+                    case '\uFF22': // Ｂ  [FULLWIDTH LATIN CAPITAL LETTER B]
+                        output[outputPos++] = 'B';
+                        break;
+                    case '\u0180': // ƀ  [LATIN SMALL LETTER B WITH STROKE]
+                    case '\u0183': // ƃ  [LATIN SMALL LETTER B WITH TOPBAR]
+                    case '\u0253': // ɓ  [LATIN SMALL LETTER B WITH HOOK]
+                    case '\u1D6C': // ᵬ  [LATIN SMALL LETTER B WITH MIDDLE TILDE]
+                    case '\u1D80': // ᶀ  [LATIN SMALL LETTER B WITH PALATAL HOOK]
+                    case '\u1E03': // ḃ  [LATIN SMALL LETTER B WITH DOT ABOVE]
+                    case '\u1E05': // ḅ  [LATIN SMALL LETTER B WITH DOT BELOW]
+                    case '\u1E07': // ḇ  [LATIN SMALL LETTER B WITH LINE BELOW]
+                    case '\u24D1': // ⓑ  [CIRCLED LATIN SMALL LETTER B]
+                    case '\uFF42': // ｂ  [FULLWIDTH LATIN SMALL LETTER B]
+                        output[outputPos++] = 'b';
+                        break;
+                    case '\u249D': // ⒝  [PARENTHESIZED LATIN SMALL LETTER B]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'b';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00C7': // Ç  [LATIN CAPITAL LETTER C WITH CEDILLA]
+                    case '\u0106': // Ć  [LATIN CAPITAL LETTER C WITH ACUTE]
+                    case '\u0108': // Ĉ  [LATIN CAPITAL LETTER C WITH CIRCUMFLEX]
+                    case '\u010A': // Ċ  [LATIN CAPITAL LETTER C WITH DOT ABOVE]
+                    case '\u010C': // Č  [LATIN CAPITAL LETTER C WITH CARON]
+                    case '\u0187': // Ƈ  [LATIN CAPITAL LETTER C WITH HOOK]
+                    case '\u023B': // Ȼ  [LATIN CAPITAL LETTER C WITH STROKE]
+                    case '\u0297': // ʗ  [LATIN LETTER STRETCHED C]
+                    case '\u1D04': // ᴄ  [LATIN LETTER SMALL CAPITAL C]
+                    case '\u1E08': // Ḉ  [LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE]
+                    case '\u24B8': // Ⓒ  [CIRCLED LATIN CAPITAL LETTER C]
+                    case '\uFF23': // Ｃ  [FULLWIDTH LATIN CAPITAL LETTER C]
+                        output[outputPos++] = 'C';
+                        break;
+                    case '\u00E7': // ç  [LATIN SMALL LETTER C WITH CEDILLA]
+                    case '\u0107': // ć  [LATIN SMALL LETTER C WITH ACUTE]
+                    case '\u0109': // ĉ  [LATIN SMALL LETTER C WITH CIRCUMFLEX]
+                    case '\u010B': // ċ  [LATIN SMALL LETTER C WITH DOT ABOVE]
+                    case '\u010D': // č  [LATIN SMALL LETTER C WITH CARON]
+                    case '\u0188': // ƈ  [LATIN SMALL LETTER C WITH HOOK]
+                    case '\u023C': // ȼ  [LATIN SMALL LETTER C WITH STROKE]
+                    case '\u0255': // ɕ  [LATIN SMALL LETTER C WITH CURL]
+                    case '\u1E09': // ḉ  [LATIN SMALL LETTER C WITH CEDILLA AND ACUTE]
+                    case '\u2184': // ↄ  [LATIN SMALL LETTER REVERSED C]
+                    case '\u24D2': // ⓒ  [CIRCLED LATIN SMALL LETTER C]
+                    case '\uA73E': // Ꜿ  [LATIN CAPITAL LETTER REVERSED C WITH DOT]
+                    case '\uA73F': // ꜿ  [LATIN SMALL LETTER REVERSED C WITH DOT]
+                    case '\uFF43': // ｃ  [FULLWIDTH LATIN SMALL LETTER C]
+                        output[outputPos++] = 'c';
+                        break;
+                    case '\u249E': // ⒞  [PARENTHESIZED LATIN SMALL LETTER C]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'c';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00D0': // Ð  [LATIN CAPITAL LETTER ETH]
+                    case '\u010E': // Ď  [LATIN CAPITAL LETTER D WITH CARON]
+                    case '\u0110': // Đ  [LATIN CAPITAL LETTER D WITH STROKE]
+                    case '\u0189': // Ɖ  [LATIN CAPITAL LETTER AFRICAN D]
+                    case '\u018A': // Ɗ  [LATIN CAPITAL LETTER D WITH HOOK]
+                    case '\u018B': // Ƌ  [LATIN CAPITAL LETTER D WITH TOPBAR]
+                    case '\u1D05': // ᴅ  [LATIN LETTER SMALL CAPITAL D]
+                    case '\u1D06': // ᴆ  [LATIN LETTER SMALL CAPITAL ETH]
+                    case '\u1E0A': // Ḋ  [LATIN CAPITAL LETTER D WITH DOT ABOVE]
+                    case '\u1E0C': // Ḍ  [LATIN CAPITAL LETTER D WITH DOT BELOW]
+                    case '\u1E0E': // Ḏ  [LATIN CAPITAL LETTER D WITH LINE BELOW]
+                    case '\u1E10': // Ḑ  [LATIN CAPITAL LETTER D WITH CEDILLA]
+                    case '\u1E12': // Ḓ  [LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW]
+                    case '\u24B9': // Ⓓ  [CIRCLED LATIN CAPITAL LETTER D]
+                    case '\uA779': // Ꝺ  [LATIN CAPITAL LETTER INSULAR D]
+                    case '\uFF24': // Ｄ  [FULLWIDTH LATIN CAPITAL LETTER D]
+                        output[outputPos++] = 'D';
+                        break;
+                    case '\u00F0': // ð  [LATIN SMALL LETTER ETH]
+                    case '\u010F': // ď  [LATIN SMALL LETTER D WITH CARON]
+                    case '\u0111': // đ  [LATIN SMALL LETTER D WITH STROKE]
+                    case '\u018C': // ƌ  [LATIN SMALL LETTER D WITH TOPBAR]
+                    case '\u0221': // ȡ  [LATIN SMALL LETTER D WITH CURL]
+                    case '\u0256': // ɖ  [LATIN SMALL LETTER D WITH TAIL]
+                    case '\u0257': // ɗ  [LATIN SMALL LETTER D WITH HOOK]
+                    case '\u1D6D': // ᵭ  [LATIN SMALL LETTER D WITH MIDDLE TILDE]
+                    case '\u1D81': // ᶁ  [LATIN SMALL LETTER D WITH PALATAL HOOK]
+                    case '\u1D91': // ᶑ  [LATIN SMALL LETTER D WITH HOOK AND TAIL]
+                    case '\u1E0B': // ḋ  [LATIN SMALL LETTER D WITH DOT ABOVE]
+                    case '\u1E0D': // ḍ  [LATIN SMALL LETTER D WITH DOT BELOW]
+                    case '\u1E0F': // ḏ  [LATIN SMALL LETTER D WITH LINE BELOW]
+                    case '\u1E11': // ḑ  [LATIN SMALL LETTER D WITH CEDILLA]
+                    case '\u1E13': // ḓ  [LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW]
+                    case '\u24D3': // ⓓ  [CIRCLED LATIN SMALL LETTER D]
+                    case '\uA77A': // ꝺ  [LATIN SMALL LETTER INSULAR D]
+                    case '\uFF44': // ｄ  [FULLWIDTH LATIN SMALL LETTER D]
+                        output[outputPos++] = 'd';
+                        break;
+                    case '\u01C4': // Ǆ  [LATIN CAPITAL LETTER DZ WITH CARON]
+                    case '\u01F1': // Ǳ  [LATIN CAPITAL LETTER DZ]
+                        output[outputPos++] = 'D';
+                        output[outputPos++] = 'Z';
+                        break;
+                    case '\u01C5': // ǅ  [LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON]
+                    case '\u01F2': // ǲ  [LATIN CAPITAL LETTER D WITH SMALL LETTER Z]
+                        output[outputPos++] = 'D';
+                        output[outputPos++] = 'z';
+                        break;
+                    case '\u249F': // ⒟  [PARENTHESIZED LATIN SMALL LETTER D]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'd';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0238': // ȸ  [LATIN SMALL LETTER DB DIGRAPH]
+                        output[outputPos++] = 'd';
+                        output[outputPos++] = 'b';
+                        break;
+                    case '\u01C6': // ǆ  [LATIN SMALL LETTER DZ WITH CARON]
+                    case '\u01F3': // ǳ  [LATIN SMALL LETTER DZ]
+                    case '\u02A3': // ʣ  [LATIN SMALL LETTER DZ DIGRAPH]
+                    case '\u02A5': // ʥ  [LATIN SMALL LETTER DZ DIGRAPH WITH CURL]
+                        output[outputPos++] = 'd';
+                        output[outputPos++] = 'z';
+                        break;
+                    case '\u00C8': // È  [LATIN CAPITAL LETTER E WITH GRAVE]
+                    case '\u00C9': // É  [LATIN CAPITAL LETTER E WITH ACUTE]
+                    case '\u00CA': // Ê  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX]
+                    case '\u00CB': // Ë  [LATIN CAPITAL LETTER E WITH DIAERESIS]
+                    case '\u0112': // Ē  [LATIN CAPITAL LETTER E WITH MACRON]
+                    case '\u0114': // Ĕ  [LATIN CAPITAL LETTER E WITH BREVE]
+                    case '\u0116': // Ė  [LATIN CAPITAL LETTER E WITH DOT ABOVE]
+                    case '\u0118': // Ę  [LATIN CAPITAL LETTER E WITH OGONEK]
+                    case '\u011A': // Ě  [LATIN CAPITAL LETTER E WITH CARON]
+                    case '\u018E': // Ǝ  [LATIN CAPITAL LETTER REVERSED E]
+                    case '\u0190': // Ɛ  [LATIN CAPITAL LETTER OPEN E]
+                    case '\u0204': // Ȅ  [LATIN CAPITAL LETTER E WITH DOUBLE GRAVE]
+                    case '\u0206': // Ȇ  [LATIN CAPITAL LETTER E WITH INVERTED BREVE]
+                    case '\u0228': // Ȩ  [LATIN CAPITAL LETTER E WITH CEDILLA]
+                    case '\u0246': // Ɇ  [LATIN CAPITAL LETTER E WITH STROKE]
+                    case '\u1D07': // ᴇ  [LATIN LETTER SMALL CAPITAL E]
+                    case '\u1E14': // Ḕ  [LATIN CAPITAL LETTER E WITH MACRON AND GRAVE]
+                    case '\u1E16': // Ḗ  [LATIN CAPITAL LETTER E WITH MACRON AND ACUTE]
+                    case '\u1E18': // Ḙ  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW]
+                    case '\u1E1A': // Ḛ  [LATIN CAPITAL LETTER E WITH TILDE BELOW]
+                    case '\u1E1C': // Ḝ  [LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE]
+                    case '\u1EB8': // Ẹ  [LATIN CAPITAL LETTER E WITH DOT BELOW]
+                    case '\u1EBA': // Ẻ  [LATIN CAPITAL LETTER E WITH HOOK ABOVE]
+                    case '\u1EBC': // Ẽ  [LATIN CAPITAL LETTER E WITH TILDE]
+                    case '\u1EBE': // Ế  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1EC0': // Ề  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1EC2': // Ể  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1EC4': // Ễ  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE]
+                    case '\u1EC6': // Ệ  [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u24BA': // Ⓔ  [CIRCLED LATIN CAPITAL LETTER E]
+                    case '\u2C7B': // ⱻ  [LATIN LETTER SMALL CAPITAL TURNED E]
+                    case '\uFF25': // Ｅ  [FULLWIDTH LATIN CAPITAL LETTER E]
+                        output[outputPos++] = 'E';
+                        break;
+                    case '\u00E8': // è  [LATIN SMALL LETTER E WITH GRAVE]
+                    case '\u00E9': // é  [LATIN SMALL LETTER E WITH ACUTE]
+                    case '\u00EA': // ê  [LATIN SMALL LETTER E WITH CIRCUMFLEX]
+                    case '\u00EB': // ë  [LATIN SMALL LETTER E WITH DIAERESIS]
+                    case '\u0113': // ē  [LATIN SMALL LETTER E WITH MACRON]
+                    case '\u0115': // ĕ  [LATIN SMALL LETTER E WITH BREVE]
+                    case '\u0117': // ė  [LATIN SMALL LETTER E WITH DOT ABOVE]
+                    case '\u0119': // ę  [LATIN SMALL LETTER E WITH OGONEK]
+                    case '\u011B': // ě  [LATIN SMALL LETTER E WITH CARON]
+                    case '\u01DD': // ǝ  [LATIN SMALL LETTER TURNED E]
+                    case '\u0205': // ȅ  [LATIN SMALL LETTER E WITH DOUBLE GRAVE]
+                    case '\u0207': // ȇ  [LATIN SMALL LETTER E WITH INVERTED BREVE]
+                    case '\u0229': // ȩ  [LATIN SMALL LETTER E WITH CEDILLA]
+                    case '\u0247': // ɇ  [LATIN SMALL LETTER E WITH STROKE]
+                    case '\u0258': // ɘ  [LATIN SMALL LETTER REVERSED E]
+                    case '\u025B': // ɛ  [LATIN SMALL LETTER OPEN E]
+                    case '\u025C': // ɜ  [LATIN SMALL LETTER REVERSED OPEN E]
+                    case '\u025D': // ɝ  [LATIN SMALL LETTER REVERSED OPEN E WITH HOOK]
+                    case '\u025E': // ɞ  [LATIN SMALL LETTER CLOSED REVERSED OPEN E]
+                    case '\u029A': // ʚ  [LATIN SMALL LETTER CLOSED OPEN E]
+                    case '\u1D08': // ᴈ  [LATIN SMALL LETTER TURNED OPEN E]
+                    case '\u1D92': // ᶒ  [LATIN SMALL LETTER E WITH RETROFLEX HOOK]
+                    case '\u1D93': // ᶓ  [LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK]
+                    case '\u1D94': // ᶔ  [LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK]
+                    case '\u1E15': // ḕ  [LATIN SMALL LETTER E WITH MACRON AND GRAVE]
+                    case '\u1E17': // ḗ  [LATIN SMALL LETTER E WITH MACRON AND ACUTE]
+                    case '\u1E19': // ḙ  [LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW]
+                    case '\u1E1B': // ḛ  [LATIN SMALL LETTER E WITH TILDE BELOW]
+                    case '\u1E1D': // ḝ  [LATIN SMALL LETTER E WITH CEDILLA AND BREVE]
+                    case '\u1EB9': // ẹ  [LATIN SMALL LETTER E WITH DOT BELOW]
+                    case '\u1EBB': // ẻ  [LATIN SMALL LETTER E WITH HOOK ABOVE]
+                    case '\u1EBD': // ẽ  [LATIN SMALL LETTER E WITH TILDE]
+                    case '\u1EBF': // ế  [LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1EC1': // ề  [LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1EC3': // ể  [LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1EC5': // ễ  [LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE]
+                    case '\u1EC7': // ệ  [LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u2091': // ₑ  [LATIN SUBSCRIPT SMALL LETTER E]
+                    case '\u24D4': // ⓔ  [CIRCLED LATIN SMALL LETTER E]
+                    case '\u2C78': // ⱸ  [LATIN SMALL LETTER E WITH NOTCH]
+                    case '\uFF45': // ｅ  [FULLWIDTH LATIN SMALL LETTER E]
+                        output[outputPos++] = 'e';
+                        break;
+                    case '\u24A0': // ⒠  [PARENTHESIZED LATIN SMALL LETTER E]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'e';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0191': // Ƒ  [LATIN CAPITAL LETTER F WITH HOOK]
+                    case '\u1E1E': // Ḟ  [LATIN CAPITAL LETTER F WITH DOT ABOVE]
+                    case '\u24BB': // Ⓕ  [CIRCLED LATIN CAPITAL LETTER F]
+                    case '\uA730': // ꜰ  [LATIN LETTER SMALL CAPITAL F]
+                    case '\uA77B': // Ꝼ  [LATIN CAPITAL LETTER INSULAR F]
+                    case '\uA7FB': // ꟻ  [LATIN EPIGRAPHIC LETTER REVERSED F]
+                    case '\uFF26': // Ｆ  [FULLWIDTH LATIN CAPITAL LETTER F]
+                        output[outputPos++] = 'F';
+                        break;
+                    case '\u0192': // ƒ  [LATIN SMALL LETTER F WITH HOOK]
+                    case '\u1D6E': // ᵮ  [LATIN SMALL LETTER F WITH MIDDLE TILDE]
+                    case '\u1D82': // ᶂ  [LATIN SMALL LETTER F WITH PALATAL HOOK]
+                    case '\u1E1F': // ḟ  [LATIN SMALL LETTER F WITH DOT ABOVE]
+                    case '\u1E9B': // ẛ  [LATIN SMALL LETTER LONG S WITH DOT ABOVE]
+                    case '\u24D5': // ⓕ  [CIRCLED LATIN SMALL LETTER F]
+                    case '\uA77C': // ꝼ  [LATIN SMALL LETTER INSULAR F]
+                    case '\uFF46': // ｆ  [FULLWIDTH LATIN SMALL LETTER F]
+                        output[outputPos++] = 'f';
+                        break;
+                    case '\u24A1': // ⒡  [PARENTHESIZED LATIN SMALL LETTER F]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\uFB00': // ﬀ  [LATIN SMALL LIGATURE FF]
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'f';
+                        break;
+                    case '\uFB03': // ﬃ  [LATIN SMALL LIGATURE FFI]
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'i';
+                        break;
+                    case '\uFB04': // ﬄ  [LATIN SMALL LIGATURE FFL]
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'l';
+                        break;
+                    case '\uFB01': // ﬁ  [LATIN SMALL LIGATURE FI]
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'i';
+                        break;
+                    case '\uFB02': // ﬂ  [LATIN SMALL LIGATURE FL]
+                        output[outputPos++] = 'f';
+                        output[outputPos++] = 'l';
+                        break;
+                    case '\u011C': // Ĝ  [LATIN CAPITAL LETTER G WITH CIRCUMFLEX]
+                    case '\u011E': // Ğ  [LATIN CAPITAL LETTER G WITH BREVE]
+                    case '\u0120': // Ġ  [LATIN CAPITAL LETTER G WITH DOT ABOVE]
+                    case '\u0122': // Ģ  [LATIN CAPITAL LETTER G WITH CEDILLA]
+                    case '\u0193': // Ɠ  [LATIN CAPITAL LETTER G WITH HOOK]
+                    case '\u01E4': // Ǥ  [LATIN CAPITAL LETTER G WITH STROKE]
+                    case '\u01E5': // ǥ  [LATIN SMALL LETTER G WITH STROKE]
+                    case '\u01E6': // Ǧ  [LATIN CAPITAL LETTER G WITH CARON]
+                    case '\u01E7': // ǧ  [LATIN SMALL LETTER G WITH CARON]
+                    case '\u01F4': // Ǵ  [LATIN CAPITAL LETTER G WITH ACUTE]
+                    case '\u0262': // ɢ  [LATIN LETTER SMALL CAPITAL G]
+                    case '\u029B': // ʛ  [LATIN LETTER SMALL CAPITAL G WITH HOOK]
+                    case '\u1E20': // Ḡ  [LATIN CAPITAL LETTER G WITH MACRON]
+                    case '\u24BC': // Ⓖ  [CIRCLED LATIN CAPITAL LETTER G]
+                    case '\uA77D': // Ᵹ  [LATIN CAPITAL LETTER INSULAR G]
+                    case '\uA77E': // Ꝿ  [LATIN CAPITAL LETTER TURNED INSULAR G]
+                    case '\uFF27': // Ｇ  [FULLWIDTH LATIN CAPITAL LETTER G]
+                        output[outputPos++] = 'G';
+                        break;
+                    case '\u011D': // ĝ  [LATIN SMALL LETTER G WITH CIRCUMFLEX]
+                    case '\u011F': // ğ  [LATIN SMALL LETTER G WITH BREVE]
+                    case '\u0121': // ġ  [LATIN SMALL LETTER G WITH DOT ABOVE]
+                    case '\u0123': // ģ  [LATIN SMALL LETTER G WITH CEDILLA]
+                    case '\u01F5': // ǵ  [LATIN SMALL LETTER G WITH ACUTE]
+                    case '\u0260': // ɠ  [LATIN SMALL LETTER G WITH HOOK]
+                    case '\u0261': // ɡ  [LATIN SMALL LETTER SCRIPT G]
+                    case '\u1D77': // ᵷ  [LATIN SMALL LETTER TURNED G]
+                    case '\u1D79': // ᵹ  [LATIN SMALL LETTER INSULAR G]
+                    case '\u1D83': // ᶃ  [LATIN SMALL LETTER G WITH PALATAL HOOK]
+                    case '\u1E21': // ḡ  [LATIN SMALL LETTER G WITH MACRON]
+                    case '\u24D6': // ⓖ  [CIRCLED LATIN SMALL LETTER G]
+                    case '\uA77F': // ꝿ  [LATIN SMALL LETTER TURNED INSULAR G]
+                    case '\uFF47': // ｇ  [FULLWIDTH LATIN SMALL LETTER G]
+                        output[outputPos++] = 'g';
+                        break;
+                    case '\u24A2': // ⒢  [PARENTHESIZED LATIN SMALL LETTER G]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'g';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0124': // Ĥ  [LATIN CAPITAL LETTER H WITH CIRCUMFLEX]
+                    case '\u0126': // Ħ  [LATIN CAPITAL LETTER H WITH STROKE]
+                    case '\u021E': // Ȟ  [LATIN CAPITAL LETTER H WITH CARON]
+                    case '\u029C': // ʜ  [LATIN LETTER SMALL CAPITAL H]
+                    case '\u1E22': // Ḣ  [LATIN CAPITAL LETTER H WITH DOT ABOVE]
+                    case '\u1E24': // Ḥ  [LATIN CAPITAL LETTER H WITH DOT BELOW]
+                    case '\u1E26': // Ḧ  [LATIN CAPITAL LETTER H WITH DIAERESIS]
+                    case '\u1E28': // Ḩ  [LATIN CAPITAL LETTER H WITH CEDILLA]
+                    case '\u1E2A': // Ḫ  [LATIN CAPITAL LETTER H WITH BREVE BELOW]
+                    case '\u24BD': // Ⓗ  [CIRCLED LATIN CAPITAL LETTER H]
+                    case '\u2C67': // Ⱨ  [LATIN CAPITAL LETTER H WITH DESCENDER]
+                    case '\u2C75': // Ⱶ  [LATIN CAPITAL LETTER HALF H]
+                    case '\uFF28': // Ｈ  [FULLWIDTH LATIN CAPITAL LETTER H]
+                        output[outputPos++] = 'H';
+                        break;
+                    case '\u0125': // ĥ  [LATIN SMALL LETTER H WITH CIRCUMFLEX]
+                    case '\u0127': // ħ  [LATIN SMALL LETTER H WITH STROKE]
+                    case '\u021F': // ȟ  [LATIN SMALL LETTER H WITH CARON]
+                    case '\u0265': // ɥ  [LATIN SMALL LETTER TURNED H]
+                    case '\u0266': // ɦ  [LATIN SMALL LETTER H WITH HOOK]
+                    case '\u02AE': // ʮ  [LATIN SMALL LETTER TURNED H WITH FISHHOOK]
+                    case '\u02AF': // ʯ  [LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL]
+                    case '\u1E23': // ḣ  [LATIN SMALL LETTER H WITH DOT ABOVE]
+                    case '\u1E25': // ḥ  [LATIN SMALL LETTER H WITH DOT BELOW]
+                    case '\u1E27': // ḧ  [LATIN SMALL LETTER H WITH DIAERESIS]
+                    case '\u1E29': // ḩ  [LATIN SMALL LETTER H WITH CEDILLA]
+                    case '\u1E2B': // ḫ  [LATIN SMALL LETTER H WITH BREVE BELOW]
+                    case '\u1E96': // ẖ  [LATIN SMALL LETTER H WITH LINE BELOW]
+                    case '\u24D7': // ⓗ  [CIRCLED LATIN SMALL LETTER H]
+                    case '\u2C68': // ⱨ  [LATIN SMALL LETTER H WITH DESCENDER]
+                    case '\u2C76': // ⱶ  [LATIN SMALL LETTER HALF H]
+                    case '\uFF48': // ｈ  [FULLWIDTH LATIN SMALL LETTER H]
+                        output[outputPos++] = 'h';
+                        break;
+                    case '\u01F6': // Ƕ  http://en.wikipedia.org/wiki/Hwair  [LATIN CAPITAL LETTER HWAIR]
+                        output[outputPos++] = 'H';
+                        output[outputPos++] = 'V';
+                        break;
+                    case '\u24A3': // ⒣  [PARENTHESIZED LATIN SMALL LETTER H]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'h';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0195': // ƕ  [LATIN SMALL LETTER HV]
+                        output[outputPos++] = 'h';
+                        output[outputPos++] = 'v';
+                        break;
+                    case '\u00CC': // Ì  [LATIN CAPITAL LETTER I WITH GRAVE]
+                    case '\u00CD': // Í  [LATIN CAPITAL LETTER I WITH ACUTE]
+                    case '\u00CE': // Î  [LATIN CAPITAL LETTER I WITH CIRCUMFLEX]
+                    case '\u00CF': // Ï  [LATIN CAPITAL LETTER I WITH DIAERESIS]
+                    case '\u0128': // Ĩ  [LATIN CAPITAL LETTER I WITH TILDE]
+                    case '\u012A': // Ī  [LATIN CAPITAL LETTER I WITH MACRON]
+                    case '\u012C': // Ĭ  [LATIN CAPITAL LETTER I WITH BREVE]
+                    case '\u012E': // Į  [LATIN CAPITAL LETTER I WITH OGONEK]
+                    case '\u0130': // İ  [LATIN CAPITAL LETTER I WITH DOT ABOVE]
+                    case '\u0196': // Ɩ  [LATIN CAPITAL LETTER IOTA]
+                    case '\u0197': // Ɨ  [LATIN CAPITAL LETTER I WITH STROKE]
+                    case '\u01CF': // Ǐ  [LATIN CAPITAL LETTER I WITH CARON]
+                    case '\u0208': // Ȉ  [LATIN CAPITAL LETTER I WITH DOUBLE GRAVE]
+                    case '\u020A': // Ȋ  [LATIN CAPITAL LETTER I WITH INVERTED BREVE]
+                    case '\u026A': // ɪ  [LATIN LETTER SMALL CAPITAL I]
+                    case '\u1D7B': // ᵻ  [LATIN SMALL CAPITAL LETTER I WITH STROKE]
+                    case '\u1E2C': // Ḭ  [LATIN CAPITAL LETTER I WITH TILDE BELOW]
+                    case '\u1E2E': // Ḯ  [LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE]
+                    case '\u1EC8': // Ỉ  [LATIN CAPITAL LETTER I WITH HOOK ABOVE]
+                    case '\u1ECA': // Ị  [LATIN CAPITAL LETTER I WITH DOT BELOW]
+                    case '\u24BE': // Ⓘ  [CIRCLED LATIN CAPITAL LETTER I]
+                    case '\uA7FE': // ꟾ  [LATIN EPIGRAPHIC LETTER I LONGA]
+                    case '\uFF29': // Ｉ  [FULLWIDTH LATIN CAPITAL LETTER I]
+                        output[outputPos++] = 'I';
+                        break;
+                    case '\u00EC': // ì  [LATIN SMALL LETTER I WITH GRAVE]
+                    case '\u00ED': // í  [LATIN SMALL LETTER I WITH ACUTE]
+                    case '\u00EE': // î  [LATIN SMALL LETTER I WITH CIRCUMFLEX]
+                    case '\u00EF': // ï  [LATIN SMALL LETTER I WITH DIAERESIS]
+                    case '\u0129': // ĩ  [LATIN SMALL LETTER I WITH TILDE]
+                    case '\u012B': // ī  [LATIN SMALL LETTER I WITH MACRON]
+                    case '\u012D': // ĭ  [LATIN SMALL LETTER I WITH BREVE]
+                    case '\u012F': // į  [LATIN SMALL LETTER I WITH OGONEK]
+                    case '\u0131': // ı  [LATIN SMALL LETTER DOTLESS I]
+                    case '\u01D0': // ǐ  [LATIN SMALL LETTER I WITH CARON]
+                    case '\u0209': // ȉ  [LATIN SMALL LETTER I WITH DOUBLE GRAVE]
+                    case '\u020B': // ȋ  [LATIN SMALL LETTER I WITH INVERTED BREVE]
+                    case '\u0268': // ɨ  [LATIN SMALL LETTER I WITH STROKE]
+                    case '\u1D09': // ᴉ  [LATIN SMALL LETTER TURNED I]
+                    case '\u1D62': // ᵢ  [LATIN SUBSCRIPT SMALL LETTER I]
+                    case '\u1D7C': // ᵼ  [LATIN SMALL LETTER IOTA WITH STROKE]
+                    case '\u1D96': // ᶖ  [LATIN SMALL LETTER I WITH RETROFLEX HOOK]
+                    case '\u1E2D': // ḭ  [LATIN SMALL LETTER I WITH TILDE BELOW]
+                    case '\u1E2F': // ḯ  [LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE]
+                    case '\u1EC9': // ỉ  [LATIN SMALL LETTER I WITH HOOK ABOVE]
+                    case '\u1ECB': // ị  [LATIN SMALL LETTER I WITH DOT BELOW]
+                    case '\u2071': // ⁱ  [SUPERSCRIPT LATIN SMALL LETTER I]
+                    case '\u24D8': // ⓘ  [CIRCLED LATIN SMALL LETTER I]
+                    case '\uFF49': // ｉ  [FULLWIDTH LATIN SMALL LETTER I]
+                        output[outputPos++] = 'i';
+                        break;
+                    case '\u0132': // Ĳ  [LATIN CAPITAL LIGATURE IJ]
+                        output[outputPos++] = 'I';
+                        output[outputPos++] = 'J';
+                        break;
+                    case '\u24A4': // ⒤  [PARENTHESIZED LATIN SMALL LETTER I]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'i';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0133': // ĳ  [LATIN SMALL LIGATURE IJ]
+                        output[outputPos++] = 'i';
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u0134': // Ĵ  [LATIN CAPITAL LETTER J WITH CIRCUMFLEX]
+                    case '\u0248': // Ɉ  [LATIN CAPITAL LETTER J WITH STROKE]
+                    case '\u1D0A': // ᴊ  [LATIN LETTER SMALL CAPITAL J]
+                    case '\u24BF': // Ⓙ  [CIRCLED LATIN CAPITAL LETTER J]
+                    case '\uFF2A': // Ｊ  [FULLWIDTH LATIN CAPITAL LETTER J]
+                        output[outputPos++] = 'J';
+                        break;
+                    case '\u0135': // ĵ  [LATIN SMALL LETTER J WITH CIRCUMFLEX]
+                    case '\u01F0': // ǰ  [LATIN SMALL LETTER J WITH CARON]
+                    case '\u0237': // ȷ  [LATIN SMALL LETTER DOTLESS J]
+                    case '\u0249': // ɉ  [LATIN SMALL LETTER J WITH STROKE]
+                    case '\u025F': // ɟ  [LATIN SMALL LETTER DOTLESS J WITH STROKE]
+                    case '\u0284': // ʄ  [LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK]
+                    case '\u029D': // ʝ  [LATIN SMALL LETTER J WITH CROSSED-TAIL]
+                    case '\u24D9': // ⓙ  [CIRCLED LATIN SMALL LETTER J]
+                    case '\u2C7C': // ⱼ  [LATIN SUBSCRIPT SMALL LETTER J]
+                    case '\uFF4A': // ｊ  [FULLWIDTH LATIN SMALL LETTER J]
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u24A5': // ⒥  [PARENTHESIZED LATIN SMALL LETTER J]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'j';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0136': // Ķ  [LATIN CAPITAL LETTER K WITH CEDILLA]
+                    case '\u0198': // Ƙ  [LATIN CAPITAL LETTER K WITH HOOK]
+                    case '\u01E8': // Ǩ  [LATIN CAPITAL LETTER K WITH CARON]
+                    case '\u1D0B': // ᴋ  [LATIN LETTER SMALL CAPITAL K]
+                    case '\u1E30': // Ḱ  [LATIN CAPITAL LETTER K WITH ACUTE]
+                    case '\u1E32': // Ḳ  [LATIN CAPITAL LETTER K WITH DOT BELOW]
+                    case '\u1E34': // Ḵ  [LATIN CAPITAL LETTER K WITH LINE BELOW]
+                    case '\u24C0': // Ⓚ  [CIRCLED LATIN CAPITAL LETTER K]
+                    case '\u2C69': // Ⱪ  [LATIN CAPITAL LETTER K WITH DESCENDER]
+                    case '\uA740': // Ꝁ  [LATIN CAPITAL LETTER K WITH STROKE]
+                    case '\uA742': // Ꝃ  [LATIN CAPITAL LETTER K WITH DIAGONAL STROKE]
+                    case '\uA744': // Ꝅ  [LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE]
+                    case '\uFF2B': // Ｋ  [FULLWIDTH LATIN CAPITAL LETTER K]
+                        output[outputPos++] = 'K';
+                        break;
+                    case '\u0137': // ķ  [LATIN SMALL LETTER K WITH CEDILLA]
+                    case '\u0199': // ƙ  [LATIN SMALL LETTER K WITH HOOK]
+                    case '\u01E9': // ǩ  [LATIN SMALL LETTER K WITH CARON]
+                    case '\u029E': // ʞ  [LATIN SMALL LETTER TURNED K]
+                    case '\u1D84': // ᶄ  [LATIN SMALL LETTER K WITH PALATAL HOOK]
+                    case '\u1E31': // ḱ  [LATIN SMALL LETTER K WITH ACUTE]
+                    case '\u1E33': // ḳ  [LATIN SMALL LETTER K WITH DOT BELOW]
+                    case '\u1E35': // ḵ  [LATIN SMALL LETTER K WITH LINE BELOW]
+                    case '\u24DA': // ⓚ  [CIRCLED LATIN SMALL LETTER K]
+                    case '\u2C6A': // ⱪ  [LATIN SMALL LETTER K WITH DESCENDER]
+                    case '\uA741': // ꝁ  [LATIN SMALL LETTER K WITH STROKE]
+                    case '\uA743': // ꝃ  [LATIN SMALL LETTER K WITH DIAGONAL STROKE]
+                    case '\uA745': // ꝅ  [LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE]
+                    case '\uFF4B': // ｋ  [FULLWIDTH LATIN SMALL LETTER K]
+                        output[outputPos++] = 'k';
+                        break;
+                    case '\u24A6': // ⒦  [PARENTHESIZED LATIN SMALL LETTER K]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'k';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0139': // Ĺ  [LATIN CAPITAL LETTER L WITH ACUTE]
+                    case '\u013B': // Ļ  [LATIN CAPITAL LETTER L WITH CEDILLA]
+                    case '\u013D': // Ľ  [LATIN CAPITAL LETTER L WITH CARON]
+                    case '\u013F': // Ŀ  [LATIN CAPITAL LETTER L WITH MIDDLE DOT]
+                    case '\u0141': // Ł  [LATIN CAPITAL LETTER L WITH STROKE]
+                    case '\u023D': // Ƚ  [LATIN CAPITAL LETTER L WITH BAR]
+                    case '\u029F': // ʟ  [LATIN LETTER SMALL CAPITAL L]
+                    case '\u1D0C': // ᴌ  [LATIN LETTER SMALL CAPITAL L WITH STROKE]
+                    case '\u1E36': // Ḷ  [LATIN CAPITAL LETTER L WITH DOT BELOW]
+                    case '\u1E38': // Ḹ  [LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON]
+                    case '\u1E3A': // Ḻ  [LATIN CAPITAL LETTER L WITH LINE BELOW]
+                    case '\u1E3C': // Ḽ  [LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW]
+                    case '\u24C1': // Ⓛ  [CIRCLED LATIN CAPITAL LETTER L]
+                    case '\u2C60': // Ⱡ  [LATIN CAPITAL LETTER L WITH DOUBLE BAR]
+                    case '\u2C62': // Ɫ  [LATIN CAPITAL LETTER L WITH MIDDLE TILDE]
+                    case '\uA746': // Ꝇ  [LATIN CAPITAL LETTER BROKEN L]
+                    case '\uA748': // Ꝉ  [LATIN CAPITAL LETTER L WITH HIGH STROKE]
+                    case '\uA780': // Ꞁ  [LATIN CAPITAL LETTER TURNED L]
+                    case '\uFF2C': // Ｌ  [FULLWIDTH LATIN CAPITAL LETTER L]
+                        output[outputPos++] = 'L';
+                        break;
+                    case '\u013A': // ĺ  [LATIN SMALL LETTER L WITH ACUTE]
+                    case '\u013C': // ļ  [LATIN SMALL LETTER L WITH CEDILLA]
+                    case '\u013E': // ľ  [LATIN SMALL LETTER L WITH CARON]
+                    case '\u0140': // ŀ  [LATIN SMALL LETTER L WITH MIDDLE DOT]
+                    case '\u0142': // ł  [LATIN SMALL LETTER L WITH STROKE]
+                    case '\u019A': // ƚ  [LATIN SMALL LETTER L WITH BAR]
+                    case '\u0234': // ȴ  [LATIN SMALL LETTER L WITH CURL]
+                    case '\u026B': // ɫ  [LATIN SMALL LETTER L WITH MIDDLE TILDE]
+                    case '\u026C': // ɬ  [LATIN SMALL LETTER L WITH BELT]
+                    case '\u026D': // ɭ  [LATIN SMALL LETTER L WITH RETROFLEX HOOK]
+                    case '\u1D85': // ᶅ  [LATIN SMALL LETTER L WITH PALATAL HOOK]
+                    case '\u1E37': // ḷ  [LATIN SMALL LETTER L WITH DOT BELOW]
+                    case '\u1E39': // ḹ  [LATIN SMALL LETTER L WITH DOT BELOW AND MACRON]
+                    case '\u1E3B': // ḻ  [LATIN SMALL LETTER L WITH LINE BELOW]
+                    case '\u1E3D': // ḽ  [LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW]
+                    case '\u24DB': // ⓛ  [CIRCLED LATIN SMALL LETTER L]
+                    case '\u2C61': // ⱡ  [LATIN SMALL LETTER L WITH DOUBLE BAR]
+                    case '\uA747': // ꝇ  [LATIN SMALL LETTER BROKEN L]
+                    case '\uA749': // ꝉ  [LATIN SMALL LETTER L WITH HIGH STROKE]
+                    case '\uA781': // ꞁ  [LATIN SMALL LETTER TURNED L]
+                    case '\uFF4C': // ｌ  [FULLWIDTH LATIN SMALL LETTER L]
+                        output[outputPos++] = 'l';
+                        break;
+                    case '\u01C7': // Ǉ  [LATIN CAPITAL LETTER LJ]
+                        output[outputPos++] = 'L';
+                        output[outputPos++] = 'J';
+                        break;
+                    case '\u1EFA': // Ỻ  [LATIN CAPITAL LETTER MIDDLE-WELSH LL]
+                        output[outputPos++] = 'L';
+                        output[outputPos++] = 'L';
+                        break;
+                    case '\u01C8': // ǈ  [LATIN CAPITAL LETTER L WITH SMALL LETTER J]
+                        output[outputPos++] = 'L';
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u24A7': // ⒧  [PARENTHESIZED LATIN SMALL LETTER L]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'l';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u01C9': // ǉ  [LATIN SMALL LETTER LJ]
+                        output[outputPos++] = 'l';
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u1EFB': // ỻ  [LATIN SMALL LETTER MIDDLE-WELSH LL]
+                        output[outputPos++] = 'l';
+                        output[outputPos++] = 'l';
+                        break;
+                    case '\u02AA': // ʪ  [LATIN SMALL LETTER LS DIGRAPH]
+                        output[outputPos++] = 'l';
+                        output[outputPos++] = 's';
+                        break;
+                    case '\u02AB': // ʫ  [LATIN SMALL LETTER LZ DIGRAPH]
+                        output[outputPos++] = 'l';
+                        output[outputPos++] = 'z';
+                        break;
+                    case '\u019C': // Ɯ  [LATIN CAPITAL LETTER TURNED M]
+                    case '\u1D0D': // ᴍ  [LATIN LETTER SMALL CAPITAL M]
+                    case '\u1E3E': // Ḿ  [LATIN CAPITAL LETTER M WITH ACUTE]
+                    case '\u1E40': // Ṁ  [LATIN CAPITAL LETTER M WITH DOT ABOVE]
+                    case '\u1E42': // Ṃ  [LATIN CAPITAL LETTER M WITH DOT BELOW]
+                    case '\u24C2': // Ⓜ  [CIRCLED LATIN CAPITAL LETTER M]
+                    case '\u2C6E': // Ɱ  [LATIN CAPITAL LETTER M WITH HOOK]
+                    case '\uA7FD': // ꟽ  [LATIN EPIGRAPHIC LETTER INVERTED M]
+                    case '\uA7FF': // ꟿ  [LATIN EPIGRAPHIC LETTER ARCHAIC M]
+                    case '\uFF2D': // Ｍ  [FULLWIDTH LATIN CAPITAL LETTER M]
+                        output[outputPos++] = 'M';
+                        break;
+                    case '\u026F': // ɯ  [LATIN SMALL LETTER TURNED M]
+                    case '\u0270': // ɰ  [LATIN SMALL LETTER TURNED M WITH LONG LEG]
+                    case '\u0271': // ɱ  [LATIN SMALL LETTER M WITH HOOK]
+                    case '\u1D6F': // ᵯ  [LATIN SMALL LETTER M WITH MIDDLE TILDE]
+                    case '\u1D86': // ᶆ  [LATIN SMALL LETTER M WITH PALATAL HOOK]
+                    case '\u1E3F': // ḿ  [LATIN SMALL LETTER M WITH ACUTE]
+                    case '\u1E41': // ṁ  [LATIN SMALL LETTER M WITH DOT ABOVE]
+                    case '\u1E43': // ṃ  [LATIN SMALL LETTER M WITH DOT BELOW]
+                    case '\u24DC': // ⓜ  [CIRCLED LATIN SMALL LETTER M]
+                    case '\uFF4D': // ｍ  [FULLWIDTH LATIN SMALL LETTER M]
+                        output[outputPos++] = 'm';
+                        break;
+                    case '\u24A8': // ⒨  [PARENTHESIZED LATIN SMALL LETTER M]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'm';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00D1': // Ñ  [LATIN CAPITAL LETTER N WITH TILDE]
+                    case '\u0143': // Ń  [LATIN CAPITAL LETTER N WITH ACUTE]
+                    case '\u0145': // Ņ  [LATIN CAPITAL LETTER N WITH CEDILLA]
+                    case '\u0147': // Ň  [LATIN CAPITAL LETTER N WITH CARON]
+                    case '\u014A': // Ŋ  http://en.wikipedia.org/wiki/Eng_(letter)  [LATIN CAPITAL LETTER ENG]
+                    case '\u019D': // Ɲ  [LATIN CAPITAL LETTER N WITH LEFT HOOK]
+                    case '\u01F8': // Ǹ  [LATIN CAPITAL LETTER N WITH GRAVE]
+                    case '\u0220': // Ƞ  [LATIN CAPITAL LETTER N WITH LONG RIGHT LEG]
+                    case '\u0274': // ɴ  [LATIN LETTER SMALL CAPITAL N]
+                    case '\u1D0E': // ᴎ  [LATIN LETTER SMALL CAPITAL REVERSED N]
+                    case '\u1E44': // Ṅ  [LATIN CAPITAL LETTER N WITH DOT ABOVE]
+                    case '\u1E46': // Ṇ  [LATIN CAPITAL LETTER N WITH DOT BELOW]
+                    case '\u1E48': // Ṉ  [LATIN CAPITAL LETTER N WITH LINE BELOW]
+                    case '\u1E4A': // Ṋ  [LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW]
+                    case '\u24C3': // Ⓝ  [CIRCLED LATIN CAPITAL LETTER N]
+                    case '\uFF2E': // Ｎ  [FULLWIDTH LATIN CAPITAL LETTER N]
+                        output[outputPos++] = 'N';
+                        break;
+                    case '\u00F1': // ñ  [LATIN SMALL LETTER N WITH TILDE]
+                    case '\u0144': // ń  [LATIN SMALL LETTER N WITH ACUTE]
+                    case '\u0146': // ņ  [LATIN SMALL LETTER N WITH CEDILLA]
+                    case '\u0148': // ň  [LATIN SMALL LETTER N WITH CARON]
+                    case '\u0149': // ŉ  [LATIN SMALL LETTER N PRECEDED BY APOSTROPHE]
+                    case '\u014B': // ŋ  http://en.wikipedia.org/wiki/Eng_(letter)  [LATIN SMALL LETTER ENG]
+                    case '\u019E': // ƞ  [LATIN SMALL LETTER N WITH LONG RIGHT LEG]
+                    case '\u01F9': // ǹ  [LATIN SMALL LETTER N WITH GRAVE]
+                    case '\u0235': // ȵ  [LATIN SMALL LETTER N WITH CURL]
+                    case '\u0272': // ɲ  [LATIN SMALL LETTER N WITH LEFT HOOK]
+                    case '\u0273': // ɳ  [LATIN SMALL LETTER N WITH RETROFLEX HOOK]
+                    case '\u1D70': // ᵰ  [LATIN SMALL LETTER N WITH MIDDLE TILDE]
+                    case '\u1D87': // ᶇ  [LATIN SMALL LETTER N WITH PALATAL HOOK]
+                    case '\u1E45': // ṅ  [LATIN SMALL LETTER N WITH DOT ABOVE]
+                    case '\u1E47': // ṇ  [LATIN SMALL LETTER N WITH DOT BELOW]
+                    case '\u1E49': // ṉ  [LATIN SMALL LETTER N WITH LINE BELOW]
+                    case '\u1E4B': // ṋ  [LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW]
+                    case '\u207F': // ⁿ  [SUPERSCRIPT LATIN SMALL LETTER N]
+                    case '\u24DD': // ⓝ  [CIRCLED LATIN SMALL LETTER N]
+                    case '\uFF4E': // ｎ  [FULLWIDTH LATIN SMALL LETTER N]
+                        output[outputPos++] = 'n';
+                        break;
+                    case '\u01CA': // Ǌ  [LATIN CAPITAL LETTER NJ]
+                        output[outputPos++] = 'N';
+                        output[outputPos++] = 'J';
+                        break;
+                    case '\u01CB': // ǋ  [LATIN CAPITAL LETTER N WITH SMALL LETTER J]
+                        output[outputPos++] = 'N';
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u24A9': // ⒩  [PARENTHESIZED LATIN SMALL LETTER N]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'n';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u01CC': // ǌ  [LATIN SMALL LETTER NJ]
+                        output[outputPos++] = 'n';
+                        output[outputPos++] = 'j';
+                        break;
+                    case '\u00D2': // Ò  [LATIN CAPITAL LETTER O WITH GRAVE]
+                    case '\u00D3': // Ó  [LATIN CAPITAL LETTER O WITH ACUTE]
+                    case '\u00D4': // Ô  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX]
+                    case '\u00D5': // Õ  [LATIN CAPITAL LETTER O WITH TILDE]
+                    case '\u00D6': // Ö  [LATIN CAPITAL LETTER O WITH DIAERESIS]
+                    case '\u00D8': // Ø  [LATIN CAPITAL LETTER O WITH STROKE]
+                    case '\u014C': // Ō  [LATIN CAPITAL LETTER O WITH MACRON]
+                    case '\u014E': // Ŏ  [LATIN CAPITAL LETTER O WITH BREVE]
+                    case '\u0150': // Ő  [LATIN CAPITAL LETTER O WITH DOUBLE ACUTE]
+                    case '\u0186': // Ɔ  [LATIN CAPITAL LETTER OPEN O]
+                    case '\u019F': // Ɵ  [LATIN CAPITAL LETTER O WITH MIDDLE TILDE]
+                    case '\u01A0': // Ơ  [LATIN CAPITAL LETTER O WITH HORN]
+                    case '\u01D1': // Ǒ  [LATIN CAPITAL LETTER O WITH CARON]
+                    case '\u01EA': // Ǫ  [LATIN CAPITAL LETTER O WITH OGONEK]
+                    case '\u01EC': // Ǭ  [LATIN CAPITAL LETTER O WITH OGONEK AND MACRON]
+                    case '\u01FE': // Ǿ  [LATIN CAPITAL LETTER O WITH STROKE AND ACUTE]
+                    case '\u020C': // Ȍ  [LATIN CAPITAL LETTER O WITH DOUBLE GRAVE]
+                    case '\u020E': // Ȏ  [LATIN CAPITAL LETTER O WITH INVERTED BREVE]
+                    case '\u022A': // Ȫ  [LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON]
+                    case '\u022C': // Ȭ  [LATIN CAPITAL LETTER O WITH TILDE AND MACRON]
+                    case '\u022E': // Ȯ  [LATIN CAPITAL LETTER O WITH DOT ABOVE]
+                    case '\u0230': // Ȱ  [LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON]
+                    case '\u1D0F': // ᴏ  [LATIN LETTER SMALL CAPITAL O]
+                    case '\u1D10': // ᴐ  [LATIN LETTER SMALL CAPITAL OPEN O]
+                    case '\u1E4C': // Ṍ  [LATIN CAPITAL LETTER O WITH TILDE AND ACUTE]
+                    case '\u1E4E': // Ṏ  [LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS]
+                    case '\u1E50': // Ṑ  [LATIN CAPITAL LETTER O WITH MACRON AND GRAVE]
+                    case '\u1E52': // Ṓ  [LATIN CAPITAL LETTER O WITH MACRON AND ACUTE]
+                    case '\u1ECC': // Ọ  [LATIN CAPITAL LETTER O WITH DOT BELOW]
+                    case '\u1ECE': // Ỏ  [LATIN CAPITAL LETTER O WITH HOOK ABOVE]
+                    case '\u1ED0': // Ố  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1ED2': // Ồ  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1ED4': // Ổ  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1ED6': // Ỗ  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE]
+                    case '\u1ED8': // Ộ  [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u1EDA': // Ớ  [LATIN CAPITAL LETTER O WITH HORN AND ACUTE]
+                    case '\u1EDC': // Ờ  [LATIN CAPITAL LETTER O WITH HORN AND GRAVE]
+                    case '\u1EDE': // Ở  [LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE]
+                    case '\u1EE0': // Ỡ  [LATIN CAPITAL LETTER O WITH HORN AND TILDE]
+                    case '\u1EE2': // Ợ  [LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW]
+                    case '\u24C4': // Ⓞ  [CIRCLED LATIN CAPITAL LETTER O]
+                    case '\uA74A': // Ꝋ  [LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY]
+                    case '\uA74C': // Ꝍ  [LATIN CAPITAL LETTER O WITH LOOP]
+                    case '\uFF2F': // Ｏ  [FULLWIDTH LATIN CAPITAL LETTER O]
+                        output[outputPos++] = 'O';
+                        break;
+                    case '\u00F2': // ò  [LATIN SMALL LETTER O WITH GRAVE]
+                    case '\u00F3': // ó  [LATIN SMALL LETTER O WITH ACUTE]
+                    case '\u00F4': // ô  [LATIN SMALL LETTER O WITH CIRCUMFLEX]
+                    case '\u00F5': // õ  [LATIN SMALL LETTER O WITH TILDE]
+                    case '\u00F6': // ö  [LATIN SMALL LETTER O WITH DIAERESIS]
+                    case '\u00F8': // ø  [LATIN SMALL LETTER O WITH STROKE]
+                    case '\u014D': // ō  [LATIN SMALL LETTER O WITH MACRON]
+                    case '\u014F': // ŏ  [LATIN SMALL LETTER O WITH BREVE]
+                    case '\u0151': // ő  [LATIN SMALL LETTER O WITH DOUBLE ACUTE]
+                    case '\u01A1': // ơ  [LATIN SMALL LETTER O WITH HORN]
+                    case '\u01D2': // ǒ  [LATIN SMALL LETTER O WITH CARON]
+                    case '\u01EB': // ǫ  [LATIN SMALL LETTER O WITH OGONEK]
+                    case '\u01ED': // ǭ  [LATIN SMALL LETTER O WITH OGONEK AND MACRON]
+                    case '\u01FF': // ǿ  [LATIN SMALL LETTER O WITH STROKE AND ACUTE]
+                    case '\u020D': // ȍ  [LATIN SMALL LETTER O WITH DOUBLE GRAVE]
+                    case '\u020F': // ȏ  [LATIN SMALL LETTER O WITH INVERTED BREVE]
+                    case '\u022B': // ȫ  [LATIN SMALL LETTER O WITH DIAERESIS AND MACRON]
+                    case '\u022D': // ȭ  [LATIN SMALL LETTER O WITH TILDE AND MACRON]
+                    case '\u022F': // ȯ  [LATIN SMALL LETTER O WITH DOT ABOVE]
+                    case '\u0231': // ȱ  [LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON]
+                    case '\u0254': // ɔ  [LATIN SMALL LETTER OPEN O]
+                    case '\u0275': // ɵ  [LATIN SMALL LETTER BARRED O]
+                    case '\u1D16': // ᴖ  [LATIN SMALL LETTER TOP HALF O]
+                    case '\u1D17': // ᴗ  [LATIN SMALL LETTER BOTTOM HALF O]
+                    case '\u1D97': // ᶗ  [LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK]
+                    case '\u1E4D': // ṍ  [LATIN SMALL LETTER O WITH TILDE AND ACUTE]
+                    case '\u1E4F': // ṏ  [LATIN SMALL LETTER O WITH TILDE AND DIAERESIS]
+                    case '\u1E51': // ṑ  [LATIN SMALL LETTER O WITH MACRON AND GRAVE]
+                    case '\u1E53': // ṓ  [LATIN SMALL LETTER O WITH MACRON AND ACUTE]
+                    case '\u1ECD': // ọ  [LATIN SMALL LETTER O WITH DOT BELOW]
+                    case '\u1ECF': // ỏ  [LATIN SMALL LETTER O WITH HOOK ABOVE]
+                    case '\u1ED1': // ố  [LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE]
+                    case '\u1ED3': // ồ  [LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE]
+                    case '\u1ED5': // ổ  [LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE]
+                    case '\u1ED7': // ỗ  [LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE]
+                    case '\u1ED9': // ộ  [LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW]
+                    case '\u1EDB': // ớ  [LATIN SMALL LETTER O WITH HORN AND ACUTE]
+                    case '\u1EDD': // ờ  [LATIN SMALL LETTER O WITH HORN AND GRAVE]
+                    case '\u1EDF': // ở  [LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE]
+                    case '\u1EE1': // ỡ  [LATIN SMALL LETTER O WITH HORN AND TILDE]
+                    case '\u1EE3': // ợ  [LATIN SMALL LETTER O WITH HORN AND DOT BELOW]
+                    case '\u2092': // ₒ  [LATIN SUBSCRIPT SMALL LETTER O]
+                    case '\u24DE': // ⓞ  [CIRCLED LATIN SMALL LETTER O]
+                    case '\u2C7A': // ⱺ  [LATIN SMALL LETTER O WITH LOW RING INSIDE]
+                    case '\uA74B': // ꝋ  [LATIN SMALL LETTER O WITH LONG STROKE OVERLAY]
+                    case '\uA74D': // ꝍ  [LATIN SMALL LETTER O WITH LOOP]
+                    case '\uFF4F': // ｏ  [FULLWIDTH LATIN SMALL LETTER O]
+                        output[outputPos++] = 'o';
+                        break;
+                    case '\u0152': // Œ  [LATIN CAPITAL LIGATURE OE]
+                    case '\u0276': // ɶ  [LATIN LETTER SMALL CAPITAL OE]
+                        output[outputPos++] = 'O';
+                        output[outputPos++] = 'E';
+                        break;
+                    case '\uA74E': // Ꝏ  [LATIN CAPITAL LETTER OO]
+                        output[outputPos++] = 'O';
+                        output[outputPos++] = 'O';
+                        break;
+                    case '\u0222': // Ȣ  http://en.wikipedia.org/wiki/OU  [LATIN CAPITAL LETTER OU]
+                    case '\u1D15': // ᴕ  [LATIN LETTER SMALL CAPITAL OU]
+                        output[outputPos++] = 'O';
+                        output[outputPos++] = 'U';
+                        break;
+                    case '\u24AA': // ⒪  [PARENTHESIZED LATIN SMALL LETTER O]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'o';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0153': // œ  [LATIN SMALL LIGATURE OE]
+                    case '\u1D14': // ᴔ  [LATIN SMALL LETTER TURNED OE]
+                        output[outputPos++] = 'o';
+                        output[outputPos++] = 'e';
+                        break;
+                    case '\uA74F': // ꝏ  [LATIN SMALL LETTER OO]
+                        output[outputPos++] = 'o';
+                        output[outputPos++] = 'o';
+                        break;
+                    case '\u0223': // ȣ  http://en.wikipedia.org/wiki/OU  [LATIN SMALL LETTER OU]
+                        output[outputPos++] = 'o';
+                        output[outputPos++] = 'u';
+                        break;
+                    case '\u01A4': // Ƥ  [LATIN CAPITAL LETTER P WITH HOOK]
+                    case '\u1D18': // ᴘ  [LATIN LETTER SMALL CAPITAL P]
+                    case '\u1E54': // Ṕ  [LATIN CAPITAL LETTER P WITH ACUTE]
+                    case '\u1E56': // Ṗ  [LATIN CAPITAL LETTER P WITH DOT ABOVE]
+                    case '\u24C5': // Ⓟ  [CIRCLED LATIN CAPITAL LETTER P]
+                    case '\u2C63': // Ᵽ  [LATIN CAPITAL LETTER P WITH STROKE]
+                    case '\uA750': // Ꝑ  [LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER]
+                    case '\uA752': // Ꝓ  [LATIN CAPITAL LETTER P WITH FLOURISH]
+                    case '\uA754': // Ꝕ  [LATIN CAPITAL LETTER P WITH SQUIRREL TAIL]
+                    case '\uFF30': // Ｐ  [FULLWIDTH LATIN CAPITAL LETTER P]
+                        output[outputPos++] = 'P';
+                        break;
+                    case '\u01A5': // ƥ  [LATIN SMALL LETTER P WITH HOOK]
+                    case '\u1D71': // ᵱ  [LATIN SMALL LETTER P WITH MIDDLE TILDE]
+                    case '\u1D7D': // ᵽ  [LATIN SMALL LETTER P WITH STROKE]
+                    case '\u1D88': // ᶈ  [LATIN SMALL LETTER P WITH PALATAL HOOK]
+                    case '\u1E55': // ṕ  [LATIN SMALL LETTER P WITH ACUTE]
+                    case '\u1E57': // ṗ  [LATIN SMALL LETTER P WITH DOT ABOVE]
+                    case '\u24DF': // ⓟ  [CIRCLED LATIN SMALL LETTER P]
+                    case '\uA751': // ꝑ  [LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER]
+                    case '\uA753': // ꝓ  [LATIN SMALL LETTER P WITH FLOURISH]
+                    case '\uA755': // ꝕ  [LATIN SMALL LETTER P WITH SQUIRREL TAIL]
+                    case '\uA7FC': // ꟼ  [LATIN EPIGRAPHIC LETTER REVERSED P]
+                    case '\uFF50': // ｐ  [FULLWIDTH LATIN SMALL LETTER P]
+                        output[outputPos++] = 'p';
+                        break;
+                    case '\u24AB': // ⒫  [PARENTHESIZED LATIN SMALL LETTER P]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'p';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u024A': // Ɋ  [LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL]
+                    case '\u24C6': // Ⓠ  [CIRCLED LATIN CAPITAL LETTER Q]
+                    case '\uA756': // Ꝗ  [LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER]
+                    case '\uA758': // Ꝙ  [LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE]
+                    case '\uFF31': // Ｑ  [FULLWIDTH LATIN CAPITAL LETTER Q]
+                        output[outputPos++] = 'Q';
+                        break;
+                    case '\u0138': // ĸ  http://en.wikipedia.org/wiki/Kra_(letter)  [LATIN SMALL LETTER KRA]
+                    case '\u024B': // ɋ  [LATIN SMALL LETTER Q WITH HOOK TAIL]
+                    case '\u02A0': // ʠ  [LATIN SMALL LETTER Q WITH HOOK]
+                    case '\u24E0': // ⓠ  [CIRCLED LATIN SMALL LETTER Q]
+                    case '\uA757': // ꝗ  [LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER]
+                    case '\uA759': // ꝙ  [LATIN SMALL LETTER Q WITH DIAGONAL STROKE]
+                    case '\uFF51': // ｑ  [FULLWIDTH LATIN SMALL LETTER Q]
+                        output[outputPos++] = 'q';
+                        break;
+                    case '\u24AC': // ⒬  [PARENTHESIZED LATIN SMALL LETTER Q]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'q';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0239': // ȹ  [LATIN SMALL LETTER QP DIGRAPH]
+                        output[outputPos++] = 'q';
+                        output[outputPos++] = 'p';
+                        break;
+                    case '\u0154': // Ŕ  [LATIN CAPITAL LETTER R WITH ACUTE]
+                    case '\u0156': // Ŗ  [LATIN CAPITAL LETTER R WITH CEDILLA]
+                    case '\u0158': // Ř  [LATIN CAPITAL LETTER R WITH CARON]
+                    case '\u0210': // Ȑ  [LATIN CAPITAL LETTER R WITH DOUBLE GRAVE]
+                    case '\u0212': // Ȓ  [LATIN CAPITAL LETTER R WITH INVERTED BREVE]
+                    case '\u024C': // Ɍ  [LATIN CAPITAL LETTER R WITH STROKE]
+                    case '\u0280': // ʀ  [LATIN LETTER SMALL CAPITAL R]
+                    case '\u0281': // ʁ  [LATIN LETTER SMALL CAPITAL INVERTED R]
+                    case '\u1D19': // ᴙ  [LATIN LETTER SMALL CAPITAL REVERSED R]
+                    case '\u1D1A': // ᴚ  [LATIN LETTER SMALL CAPITAL TURNED R]
+                    case '\u1E58': // Ṙ  [LATIN CAPITAL LETTER R WITH DOT ABOVE]
+                    case '\u1E5A': // Ṛ  [LATIN CAPITAL LETTER R WITH DOT BELOW]
+                    case '\u1E5C': // Ṝ  [LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON]
+                    case '\u1E5E': // Ṟ  [LATIN CAPITAL LETTER R WITH LINE BELOW]
+                    case '\u24C7': // Ⓡ  [CIRCLED LATIN CAPITAL LETTER R]
+                    case '\u2C64': // Ɽ  [LATIN CAPITAL LETTER R WITH TAIL]
+                    case '\uA75A': // Ꝛ  [LATIN CAPITAL LETTER R ROTUNDA]
+                    case '\uA782': // Ꞃ  [LATIN CAPITAL LETTER INSULAR R]
+                    case '\uFF32': // Ｒ  [FULLWIDTH LATIN CAPITAL LETTER R]
+                        output[outputPos++] = 'R';
+                        break;
+                    case '\u0155': // ŕ  [LATIN SMALL LETTER R WITH ACUTE]
+                    case '\u0157': // ŗ  [LATIN SMALL LETTER R WITH CEDILLA]
+                    case '\u0159': // ř  [LATIN SMALL LETTER R WITH CARON]
+                    case '\u0211': // ȑ  [LATIN SMALL LETTER R WITH DOUBLE GRAVE]
+                    case '\u0213': // ȓ  [LATIN SMALL LETTER R WITH INVERTED BREVE]
+                    case '\u024D': // ɍ  [LATIN SMALL LETTER R WITH STROKE]
+                    case '\u027C': // ɼ  [LATIN SMALL LETTER R WITH LONG LEG]
+                    case '\u027D': // ɽ  [LATIN SMALL LETTER R WITH TAIL]
+                    case '\u027E': // ɾ  [LATIN SMALL LETTER R WITH FISHHOOK]
+                    case '\u027F': // ɿ  [LATIN SMALL LETTER REVERSED R WITH FISHHOOK]
+                    case '\u1D63': // ᵣ  [LATIN SUBSCRIPT SMALL LETTER R]
+                    case '\u1D72': // ᵲ  [LATIN SMALL LETTER R WITH MIDDLE TILDE]
+                    case '\u1D73': // ᵳ  [LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE]
+                    case '\u1D89': // ᶉ  [LATIN SMALL LETTER R WITH PALATAL HOOK]
+                    case '\u1E59': // ṙ  [LATIN SMALL LETTER R WITH DOT ABOVE]
+                    case '\u1E5B': // ṛ  [LATIN SMALL LETTER R WITH DOT BELOW]
+                    case '\u1E5D': // ṝ  [LATIN SMALL LETTER R WITH DOT BELOW AND MACRON]
+                    case '\u1E5F': // ṟ  [LATIN SMALL LETTER R WITH LINE BELOW]
+                    case '\u24E1': // ⓡ  [CIRCLED LATIN SMALL LETTER R]
+                    case '\uA75B': // ꝛ  [LATIN SMALL LETTER R ROTUNDA]
+                    case '\uA783': // ꞃ  [LATIN SMALL LETTER INSULAR R]
+                    case '\uFF52': // ｒ  [FULLWIDTH LATIN SMALL LETTER R]
+                        output[outputPos++] = 'r';
+                        break;
+                    case '\u24AD': // ⒭  [PARENTHESIZED LATIN SMALL LETTER R]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'r';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u015A': // Ś  [LATIN CAPITAL LETTER S WITH ACUTE]
+                    case '\u015C': // Ŝ  [LATIN CAPITAL LETTER S WITH CIRCUMFLEX]
+                    case '\u015E': // Ş  [LATIN CAPITAL LETTER S WITH CEDILLA]
+                    case '\u0160': // Š  [LATIN CAPITAL LETTER S WITH CARON]
+                    case '\u0218': // Ș  [LATIN CAPITAL LETTER S WITH COMMA BELOW]
+                    case '\u1E60': // Ṡ  [LATIN CAPITAL LETTER S WITH DOT ABOVE]
+                    case '\u1E62': // Ṣ  [LATIN CAPITAL LETTER S WITH DOT BELOW]
+                    case '\u1E64': // Ṥ  [LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE]
+                    case '\u1E66': // Ṧ  [LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE]
+                    case '\u1E68': // Ṩ  [LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE]
+                    case '\u24C8': // Ⓢ  [CIRCLED LATIN CAPITAL LETTER S]
+                    case '\uA731': // ꜱ  [LATIN LETTER SMALL CAPITAL S]
+                    case '\uA785': // ꞅ  [LATIN SMALL LETTER INSULAR S]
+                    case '\uFF33': // Ｓ  [FULLWIDTH LATIN CAPITAL LETTER S]
+                        output[outputPos++] = 'S';
+                        break;
+                    case '\u015B': // ś  [LATIN SMALL LETTER S WITH ACUTE]
+                    case '\u015D': // ŝ  [LATIN SMALL LETTER S WITH CIRCUMFLEX]
+                    case '\u015F': // ş  [LATIN SMALL LETTER S WITH CEDILLA]
+                    case '\u0161': // š  [LATIN SMALL LETTER S WITH CARON]
+                    case '\u017F': // ſ  http://en.wikipedia.org/wiki/Long_S  [LATIN SMALL LETTER LONG S]
+                    case '\u0219': // ș  [LATIN SMALL LETTER S WITH COMMA BELOW]
+                    case '\u023F': // ȿ  [LATIN SMALL LETTER S WITH SWASH TAIL]
+                    case '\u0282': // ʂ  [LATIN SMALL LETTER S WITH HOOK]
+                    case '\u1D74': // ᵴ  [LATIN SMALL LETTER S WITH MIDDLE TILDE]
+                    case '\u1D8A': // ᶊ  [LATIN SMALL LETTER S WITH PALATAL HOOK]
+                    case '\u1E61': // ṡ  [LATIN SMALL LETTER S WITH DOT ABOVE]
+                    case '\u1E63': // ṣ  [LATIN SMALL LETTER S WITH DOT BELOW]
+                    case '\u1E65': // ṥ  [LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE]
+                    case '\u1E67': // ṧ  [LATIN SMALL LETTER S WITH CARON AND DOT ABOVE]
+                    case '\u1E69': // ṩ  [LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE]
+                    case '\u1E9C': // ẜ  [LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE]
+                    case '\u1E9D': // ẝ  [LATIN SMALL LETTER LONG S WITH HIGH STROKE]
+                    case '\u24E2': // ⓢ  [CIRCLED LATIN SMALL LETTER S]
+                    case '\uA784': // Ꞅ  [LATIN CAPITAL LETTER INSULAR S]
+                    case '\uFF53': // ｓ  [FULLWIDTH LATIN SMALL LETTER S]
+                        output[outputPos++] = 's';
+                        break;
+                    case '\u1E9E': // ẞ  [LATIN CAPITAL LETTER SHARP S]
+                        output[outputPos++] = 'S';
+                        output[outputPos++] = 'S';
+                        break;
+                    case '\u24AE': // ⒮  [PARENTHESIZED LATIN SMALL LETTER S]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 's';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00DF': // ß  [LATIN SMALL LETTER SHARP S]
+                        output[outputPos++] = 's';
+                        output[outputPos++] = 's';
+                        break;
+                    case '\uFB06': // ﬆ  [LATIN SMALL LIGATURE ST]
+                        output[outputPos++] = 's';
+                        output[outputPos++] = 't';
+                        break;
+                    case '\u0162': // Ţ  [LATIN CAPITAL LETTER T WITH CEDILLA]
+                    case '\u0164': // Ť  [LATIN CAPITAL LETTER T WITH CARON]
+                    case '\u0166': // Ŧ  [LATIN CAPITAL LETTER T WITH STROKE]
+                    case '\u01AC': // Ƭ  [LATIN CAPITAL LETTER T WITH HOOK]
+                    case '\u01AE': // Ʈ  [LATIN CAPITAL LETTER T WITH RETROFLEX HOOK]
+                    case '\u021A': // Ț  [LATIN CAPITAL LETTER T WITH COMMA BELOW]
+                    case '\u023E': // Ⱦ  [LATIN CAPITAL LETTER T WITH DIAGONAL STROKE]
+                    case '\u1D1B': // ᴛ  [LATIN LETTER SMALL CAPITAL T]
+                    case '\u1E6A': // Ṫ  [LATIN CAPITAL LETTER T WITH DOT ABOVE]
+                    case '\u1E6C': // Ṭ  [LATIN CAPITAL LETTER T WITH DOT BELOW]
+                    case '\u1E6E': // Ṯ  [LATIN CAPITAL LETTER T WITH LINE BELOW]
+                    case '\u1E70': // Ṱ  [LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW]
+                    case '\u24C9': // Ⓣ  [CIRCLED LATIN CAPITAL LETTER T]
+                    case '\uA786': // Ꞇ  [LATIN CAPITAL LETTER INSULAR T]
+                    case '\uFF34': // Ｔ  [FULLWIDTH LATIN CAPITAL LETTER T]
+                        output[outputPos++] = 'T';
+                        break;
+                    case '\u0163': // ţ  [LATIN SMALL LETTER T WITH CEDILLA]
+                    case '\u0165': // ť  [LATIN SMALL LETTER T WITH CARON]
+                    case '\u0167': // ŧ  [LATIN SMALL LETTER T WITH STROKE]
+                    case '\u01AB': // ƫ  [LATIN SMALL LETTER T WITH PALATAL HOOK]
+                    case '\u01AD': // ƭ  [LATIN SMALL LETTER T WITH HOOK]
+                    case '\u021B': // ț  [LATIN SMALL LETTER T WITH COMMA BELOW]
+                    case '\u0236': // ȶ  [LATIN SMALL LETTER T WITH CURL]
+                    case '\u0287': // ʇ  [LATIN SMALL LETTER TURNED T]
+                    case '\u0288': // ʈ  [LATIN SMALL LETTER T WITH RETROFLEX HOOK]
+                    case '\u1D75': // ᵵ  [LATIN SMALL LETTER T WITH MIDDLE TILDE]
+                    case '\u1E6B': // ṫ  [LATIN SMALL LETTER T WITH DOT ABOVE]
+                    case '\u1E6D': // ṭ  [LATIN SMALL LETTER T WITH DOT BELOW]
+                    case '\u1E6F': // ṯ  [LATIN SMALL LETTER T WITH LINE BELOW]
+                    case '\u1E71': // ṱ  [LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW]
+                    case '\u1E97': // ẗ  [LATIN SMALL LETTER T WITH DIAERESIS]
+                    case '\u24E3': // ⓣ  [CIRCLED LATIN SMALL LETTER T]
+                    case '\u2C66': // ⱦ  [LATIN SMALL LETTER T WITH DIAGONAL STROKE]
+                    case '\uFF54': // ｔ  [FULLWIDTH LATIN SMALL LETTER T]
+                        output[outputPos++] = 't';
+                        break;
+                    case '\u00DE': // Þ  [LATIN CAPITAL LETTER THORN]
+                    case '\uA766': // Ꝧ  [LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER]
+                        output[outputPos++] = 'T';
+                        output[outputPos++] = 'H';
+                        break;
+                    case '\uA728': // Ꜩ  [LATIN CAPITAL LETTER TZ]
+                        output[outputPos++] = 'T';
+                        output[outputPos++] = 'Z';
+                        break;
+                    case '\u24AF': // ⒯  [PARENTHESIZED LATIN SMALL LETTER T]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 't';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u02A8': // ʨ  [LATIN SMALL LETTER TC DIGRAPH WITH CURL]
+                        output[outputPos++] = 't';
+                        output[outputPos++] = 'c';
+                        break;
+                    case '\u00FE': // þ  [LATIN SMALL LETTER THORN]
+                    case '\u1D7A': // ᵺ  [LATIN SMALL LETTER TH WITH STRIKETHROUGH]
+                    case '\uA767': // ꝧ  [LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER]
+                        output[outputPos++] = 't';
+                        output[outputPos++] = 'h';
+                        break;
+                    case '\u02A6': // ʦ  [LATIN SMALL LETTER TS DIGRAPH]
+                        output[outputPos++] = 't';
+                        output[outputPos++] = 's';
+                        break;
+                    case '\uA729': // ꜩ  [LATIN SMALL LETTER TZ]
+                        output[outputPos++] = 't';
+                        output[outputPos++] = 'z';
+                        break;
+                    case '\u00D9': // Ù  [LATIN CAPITAL LETTER U WITH GRAVE]
+                    case '\u00DA': // Ú  [LATIN CAPITAL LETTER U WITH ACUTE]
+                    case '\u00DB': // Û  [LATIN CAPITAL LETTER U WITH CIRCUMFLEX]
+                    case '\u00DC': // Ü  [LATIN CAPITAL LETTER U WITH DIAERESIS]
+                    case '\u0168': // Ũ  [LATIN CAPITAL LETTER U WITH TILDE]
+                    case '\u016A': // Ū  [LATIN CAPITAL LETTER U WITH MACRON]
+                    case '\u016C': // Ŭ  [LATIN CAPITAL LETTER U WITH BREVE]
+                    case '\u016E': // Ů  [LATIN CAPITAL LETTER U WITH RING ABOVE]
+                    case '\u0170': // Ű  [LATIN CAPITAL LETTER U WITH DOUBLE ACUTE]
+                    case '\u0172': // Ų  [LATIN CAPITAL LETTER U WITH OGONEK]
+                    case '\u01AF': // Ư  [LATIN CAPITAL LETTER U WITH HORN]
+                    case '\u01D3': // Ǔ  [LATIN CAPITAL LETTER U WITH CARON]
+                    case '\u01D5': // Ǖ  [LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON]
+                    case '\u01D7': // Ǘ  [LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE]
+                    case '\u01D9': // Ǚ  [LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON]
+                    case '\u01DB': // Ǜ  [LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE]
+                    case '\u0214': // Ȕ  [LATIN CAPITAL LETTER U WITH DOUBLE GRAVE]
+                    case '\u0216': // Ȗ  [LATIN CAPITAL LETTER U WITH INVERTED BREVE]
+                    case '\u0244': // Ʉ  [LATIN CAPITAL LETTER U BAR]
+                    case '\u1D1C': // ᴜ  [LATIN LETTER SMALL CAPITAL U]
+                    case '\u1D7E': // ᵾ  [LATIN SMALL CAPITAL LETTER U WITH STROKE]
+                    case '\u1E72': // Ṳ  [LATIN CAPITAL LETTER U WITH DIAERESIS BELOW]
+                    case '\u1E74': // Ṵ  [LATIN CAPITAL LETTER U WITH TILDE BELOW]
+                    case '\u1E76': // Ṷ  [LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW]
+                    case '\u1E78': // Ṹ  [LATIN CAPITAL LETTER U WITH TILDE AND ACUTE]
+                    case '\u1E7A': // Ṻ  [LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS]
+                    case '\u1EE4': // Ụ  [LATIN CAPITAL LETTER U WITH DOT BELOW]
+                    case '\u1EE6': // Ủ  [LATIN CAPITAL LETTER U WITH HOOK ABOVE]
+                    case '\u1EE8': // Ứ  [LATIN CAPITAL LETTER U WITH HORN AND ACUTE]
+                    case '\u1EEA': // Ừ  [LATIN CAPITAL LETTER U WITH HORN AND GRAVE]
+                    case '\u1EEC': // Ử  [LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE]
+                    case '\u1EEE': // Ữ  [LATIN CAPITAL LETTER U WITH HORN AND TILDE]
+                    case '\u1EF0': // Ự  [LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW]
+                    case '\u24CA': // Ⓤ  [CIRCLED LATIN CAPITAL LETTER U]
+                    case '\uFF35': // Ｕ  [FULLWIDTH LATIN CAPITAL LETTER U]
+                        output[outputPos++] = 'U';
+                        break;
+                    case '\u00F9': // ù  [LATIN SMALL LETTER U WITH GRAVE]
+                    case '\u00FA': // ú  [LATIN SMALL LETTER U WITH ACUTE]
+                    case '\u00FB': // û  [LATIN SMALL LETTER U WITH CIRCUMFLEX]
+                    case '\u00FC': // ü  [LATIN SMALL LETTER U WITH DIAERESIS]
+                    case '\u0169': // ũ  [LATIN SMALL LETTER U WITH TILDE]
+                    case '\u016B': // ū  [LATIN SMALL LETTER U WITH MACRON]
+                    case '\u016D': // ŭ  [LATIN SMALL LETTER U WITH BREVE]
+                    case '\u016F': // ů  [LATIN SMALL LETTER U WITH RING ABOVE]
+                    case '\u0171': // ű  [LATIN SMALL LETTER U WITH DOUBLE ACUTE]
+                    case '\u0173': // ų  [LATIN SMALL LETTER U WITH OGONEK]
+                    case '\u01B0': // ư  [LATIN SMALL LETTER U WITH HORN]
+                    case '\u01D4': // ǔ  [LATIN SMALL LETTER U WITH CARON]
+                    case '\u01D6': // ǖ  [LATIN SMALL LETTER U WITH DIAERESIS AND MACRON]
+                    case '\u01D8': // ǘ  [LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE]
+                    case '\u01DA': // ǚ  [LATIN SMALL LETTER U WITH DIAERESIS AND CARON]
+                    case '\u01DC': // ǜ  [LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE]
+                    case '\u0215': // ȕ  [LATIN SMALL LETTER U WITH DOUBLE GRAVE]
+                    case '\u0217': // ȗ  [LATIN SMALL LETTER U WITH INVERTED BREVE]
+                    case '\u0289': // ʉ  [LATIN SMALL LETTER U BAR]
+                    case '\u1D64': // ᵤ  [LATIN SUBSCRIPT SMALL LETTER U]
+                    case '\u1D99': // ᶙ  [LATIN SMALL LETTER U WITH RETROFLEX HOOK]
+                    case '\u1E73': // ṳ  [LATIN SMALL LETTER U WITH DIAERESIS BELOW]
+                    case '\u1E75': // ṵ  [LATIN SMALL LETTER U WITH TILDE BELOW]
+                    case '\u1E77': // ṷ  [LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW]
+                    case '\u1E79': // ṹ  [LATIN SMALL LETTER U WITH TILDE AND ACUTE]
+                    case '\u1E7B': // ṻ  [LATIN SMALL LETTER U WITH MACRON AND DIAERESIS]
+                    case '\u1EE5': // ụ  [LATIN SMALL LETTER U WITH DOT BELOW]
+                    case '\u1EE7': // ủ  [LATIN SMALL LETTER U WITH HOOK ABOVE]
+                    case '\u1EE9': // ứ  [LATIN SMALL LETTER U WITH HORN AND ACUTE]
+                    case '\u1EEB': // ừ  [LATIN SMALL LETTER U WITH HORN AND GRAVE]
+                    case '\u1EED': // ử  [LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE]
+                    case '\u1EEF': // ữ  [LATIN SMALL LETTER U WITH HORN AND TILDE]
+                    case '\u1EF1': // ự  [LATIN SMALL LETTER U WITH HORN AND DOT BELOW]
+                    case '\u24E4': // ⓤ  [CIRCLED LATIN SMALL LETTER U]
+                    case '\uFF55': // ｕ  [FULLWIDTH LATIN SMALL LETTER U]
+                        output[outputPos++] = 'u';
+                        break;
+                    case '\u24B0': // ⒰  [PARENTHESIZED LATIN SMALL LETTER U]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'u';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u1D6B': // ᵫ  [LATIN SMALL LETTER UE]
+                        output[outputPos++] = 'u';
+                        output[outputPos++] = 'e';
+                        break;
+                    case '\u01B2': // Ʋ  [LATIN CAPITAL LETTER V WITH HOOK]
+                    case '\u0245': // Ʌ  [LATIN CAPITAL LETTER TURNED V]
+                    case '\u1D20': // ᴠ  [LATIN LETTER SMALL CAPITAL V]
+                    case '\u1E7C': // Ṽ  [LATIN CAPITAL LETTER V WITH TILDE]
+                    case '\u1E7E': // Ṿ  [LATIN CAPITAL LETTER V WITH DOT BELOW]
+                    case '\u1EFC': // Ỽ  [LATIN CAPITAL LETTER MIDDLE-WELSH V]
+                    case '\u24CB': // Ⓥ  [CIRCLED LATIN CAPITAL LETTER V]
+                    case '\uA75E': // Ꝟ  [LATIN CAPITAL LETTER V WITH DIAGONAL STROKE]
+                    case '\uA768': // Ꝩ  [LATIN CAPITAL LETTER VEND]
+                    case '\uFF36': // Ｖ  [FULLWIDTH LATIN CAPITAL LETTER V]
+                        output[outputPos++] = 'V';
+                        break;
+                    case '\u028B': // ʋ  [LATIN SMALL LETTER V WITH HOOK]
+                    case '\u028C': // ʌ  [LATIN SMALL LETTER TURNED V]
+                    case '\u1D65': // ᵥ  [LATIN SUBSCRIPT SMALL LETTER V]
+                    case '\u1D8C': // ᶌ  [LATIN SMALL LETTER V WITH PALATAL HOOK]
+                    case '\u1E7D': // ṽ  [LATIN SMALL LETTER V WITH TILDE]
+                    case '\u1E7F': // ṿ  [LATIN SMALL LETTER V WITH DOT BELOW]
+                    case '\u24E5': // ⓥ  [CIRCLED LATIN SMALL LETTER V]
+                    case '\u2C71': // ⱱ  [LATIN SMALL LETTER V WITH RIGHT HOOK]
+                    case '\u2C74': // ⱴ  [LATIN SMALL LETTER V WITH CURL]
+                    case '\uA75F': // ꝟ  [LATIN SMALL LETTER V WITH DIAGONAL STROKE]
+                    case '\uFF56': // ｖ  [FULLWIDTH LATIN SMALL LETTER V]
+                        output[outputPos++] = 'v';
+                        break;
+                    case '\uA760': // Ꝡ  [LATIN CAPITAL LETTER VY]
+                        output[outputPos++] = 'V';
+                        output[outputPos++] = 'Y';
+                        break;
+                    case '\u24B1': // ⒱  [PARENTHESIZED LATIN SMALL LETTER V]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'v';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\uA761': // ꝡ  [LATIN SMALL LETTER VY]
+                        output[outputPos++] = 'v';
+                        output[outputPos++] = 'y';
+                        break;
+                    case '\u0174': // Ŵ  [LATIN CAPITAL LETTER W WITH CIRCUMFLEX]
+                    case '\u01F7': // Ƿ  http://en.wikipedia.org/wiki/Wynn  [LATIN CAPITAL LETTER WYNN]
+                    case '\u1D21': // ᴡ  [LATIN LETTER SMALL CAPITAL W]
+                    case '\u1E80': // Ẁ  [LATIN CAPITAL LETTER W WITH GRAVE]
+                    case '\u1E82': // Ẃ  [LATIN CAPITAL LETTER W WITH ACUTE]
+                    case '\u1E84': // Ẅ  [LATIN CAPITAL LETTER W WITH DIAERESIS]
+                    case '\u1E86': // Ẇ  [LATIN CAPITAL LETTER W WITH DOT ABOVE]
+                    case '\u1E88': // Ẉ  [LATIN CAPITAL LETTER W WITH DOT BELOW]
+                    case '\u24CC': // Ⓦ  [CIRCLED LATIN CAPITAL LETTER W]
+                    case '\u2C72': // Ⱳ  [LATIN CAPITAL LETTER W WITH HOOK]
+                    case '\uFF37': // Ｗ  [FULLWIDTH LATIN CAPITAL LETTER W]
+                        output[outputPos++] = 'W';
+                        break;
+                    case '\u0175': // ŵ  [LATIN SMALL LETTER W WITH CIRCUMFLEX]
+                    case '\u01BF': // ƿ  http://en.wikipedia.org/wiki/Wynn  [LATIN LETTER WYNN]
+                    case '\u028D': // ʍ  [LATIN SMALL LETTER TURNED W]
+                    case '\u1E81': // ẁ  [LATIN SMALL LETTER W WITH GRAVE]
+                    case '\u1E83': // ẃ  [LATIN SMALL LETTER W WITH ACUTE]
+                    case '\u1E85': // ẅ  [LATIN SMALL LETTER W WITH DIAERESIS]
+                    case '\u1E87': // ẇ  [LATIN SMALL LETTER W WITH DOT ABOVE]
+                    case '\u1E89': // ẉ  [LATIN SMALL LETTER W WITH DOT BELOW]
+                    case '\u1E98': // ẘ  [LATIN SMALL LETTER W WITH RING ABOVE]
+                    case '\u24E6': // ⓦ  [CIRCLED LATIN SMALL LETTER W]
+                    case '\u2C73': // ⱳ  [LATIN SMALL LETTER W WITH HOOK]
+                    case '\uFF57': // ｗ  [FULLWIDTH LATIN SMALL LETTER W]
+                        output[outputPos++] = 'w';
+                        break;
+                    case '\u24B2': // ⒲  [PARENTHESIZED LATIN SMALL LETTER W]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'w';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u1E8A': // Ẋ  [LATIN CAPITAL LETTER X WITH DOT ABOVE]
+                    case '\u1E8C': // Ẍ  [LATIN CAPITAL LETTER X WITH DIAERESIS]
+                    case '\u24CD': // Ⓧ  [CIRCLED LATIN CAPITAL LETTER X]
+                    case '\uFF38': // Ｘ  [FULLWIDTH LATIN CAPITAL LETTER X]
+                        output[outputPos++] = 'X';
+                        break;
+                    case '\u1D8D': // ᶍ  [LATIN SMALL LETTER X WITH PALATAL HOOK]
+                    case '\u1E8B': // ẋ  [LATIN SMALL LETTER X WITH DOT ABOVE]
+                    case '\u1E8D': // ẍ  [LATIN SMALL LETTER X WITH DIAERESIS]
+                    case '\u2093': // ₓ  [LATIN SUBSCRIPT SMALL LETTER X]
+                    case '\u24E7': // ⓧ  [CIRCLED LATIN SMALL LETTER X]
+                    case '\uFF58': // ｘ  [FULLWIDTH LATIN SMALL LETTER X]
+                        output[outputPos++] = 'x';
+                        break;
+                    case '\u24B3': // ⒳  [PARENTHESIZED LATIN SMALL LETTER X]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'x';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00DD': // Ý  [LATIN CAPITAL LETTER Y WITH ACUTE]
+                    case '\u0176': // Ŷ  [LATIN CAPITAL LETTER Y WITH CIRCUMFLEX]
+                    case '\u0178': // Ÿ  [LATIN CAPITAL LETTER Y WITH DIAERESIS]
+                    case '\u01B3': // Ƴ  [LATIN CAPITAL LETTER Y WITH HOOK]
+                    case '\u0232': // Ȳ  [LATIN CAPITAL LETTER Y WITH MACRON]
+                    case '\u024E': // Ɏ  [LATIN CAPITAL LETTER Y WITH STROKE]
+                    case '\u028F': // ʏ  [LATIN LETTER SMALL CAPITAL Y]
+                    case '\u1E8E': // Ẏ  [LATIN CAPITAL LETTER Y WITH DOT ABOVE]
+                    case '\u1EF2': // Ỳ  [LATIN CAPITAL LETTER Y WITH GRAVE]
+                    case '\u1EF4': // Ỵ  [LATIN CAPITAL LETTER Y WITH DOT BELOW]
+                    case '\u1EF6': // Ỷ  [LATIN CAPITAL LETTER Y WITH HOOK ABOVE]
+                    case '\u1EF8': // Ỹ  [LATIN CAPITAL LETTER Y WITH TILDE]
+                    case '\u1EFE': // Ỿ  [LATIN CAPITAL LETTER Y WITH LOOP]
+                    case '\u24CE': // Ⓨ  [CIRCLED LATIN CAPITAL LETTER Y]
+                    case '\uFF39': // Ｙ  [FULLWIDTH LATIN CAPITAL LETTER Y]
+                        output[outputPos++] = 'Y';
+                        break;
+                    case '\u00FD': // ý  [LATIN SMALL LETTER Y WITH ACUTE]
+                    case '\u00FF': // ÿ  [LATIN SMALL LETTER Y WITH DIAERESIS]
+                    case '\u0177': // ŷ  [LATIN SMALL LETTER Y WITH CIRCUMFLEX]
+                    case '\u01B4': // ƴ  [LATIN SMALL LETTER Y WITH HOOK]
+                    case '\u0233': // ȳ  [LATIN SMALL LETTER Y WITH MACRON]
+                    case '\u024F': // ɏ  [LATIN SMALL LETTER Y WITH STROKE]
+                    case '\u028E': // ʎ  [LATIN SMALL LETTER TURNED Y]
+                    case '\u1E8F': // ẏ  [LATIN SMALL LETTER Y WITH DOT ABOVE]
+                    case '\u1E99': // ẙ  [LATIN SMALL LETTER Y WITH RING ABOVE]
+                    case '\u1EF3': // ỳ  [LATIN SMALL LETTER Y WITH GRAVE]
+                    case '\u1EF5': // ỵ  [LATIN SMALL LETTER Y WITH DOT BELOW]
+                    case '\u1EF7': // ỷ  [LATIN SMALL LETTER Y WITH HOOK ABOVE]
+                    case '\u1EF9': // ỹ  [LATIN SMALL LETTER Y WITH TILDE]
+                    case '\u1EFF': // ỿ  [LATIN SMALL LETTER Y WITH LOOP]
+                    case '\u24E8': // ⓨ  [CIRCLED LATIN SMALL LETTER Y]
+                    case '\uFF59': // ｙ  [FULLWIDTH LATIN SMALL LETTER Y]
+                        output[outputPos++] = 'y';
+                        break;
+                    case '\u24B4': // ⒴  [PARENTHESIZED LATIN SMALL LETTER Y]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'y';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u0179': // Ź  [LATIN CAPITAL LETTER Z WITH ACUTE]
+                    case '\u017B': // Ż  [LATIN CAPITAL LETTER Z WITH DOT ABOVE]
+                    case '\u017D': // Ž  [LATIN CAPITAL LETTER Z WITH CARON]
+                    case '\u01B5': // Ƶ  [LATIN CAPITAL LETTER Z WITH STROKE]
+                    case '\u021C': // Ȝ  http://en.wikipedia.org/wiki/Yogh  [LATIN CAPITAL LETTER YOGH]
+                    case '\u0224': // Ȥ  [LATIN CAPITAL LETTER Z WITH HOOK]
+                    case '\u1D22': // ᴢ  [LATIN LETTER SMALL CAPITAL Z]
+                    case '\u1E90': // Ẑ  [LATIN CAPITAL LETTER Z WITH CIRCUMFLEX]
+                    case '\u1E92': // Ẓ  [LATIN CAPITAL LETTER Z WITH DOT BELOW]
+                    case '\u1E94': // Ẕ  [LATIN CAPITAL LETTER Z WITH LINE BELOW]
+                    case '\u24CF': // Ⓩ  [CIRCLED LATIN CAPITAL LETTER Z]
+                    case '\u2C6B': // Ⱬ  [LATIN CAPITAL LETTER Z WITH DESCENDER]
+                    case '\uA762': // Ꝣ  [LATIN CAPITAL LETTER VISIGOTHIC Z]
+                    case '\uFF3A': // Ｚ  [FULLWIDTH LATIN CAPITAL LETTER Z]
+                        output[outputPos++] = 'Z';
+                        break;
+                    case '\u017A': // ź  [LATIN SMALL LETTER Z WITH ACUTE]
+                    case '\u017C': // ż  [LATIN SMALL LETTER Z WITH DOT ABOVE]
+                    case '\u017E': // ž  [LATIN SMALL LETTER Z WITH CARON]
+                    case '\u01B6': // ƶ  [LATIN SMALL LETTER Z WITH STROKE]
+                    case '\u021D': // ȝ  http://en.wikipedia.org/wiki/Yogh  [LATIN SMALL LETTER YOGH]
+                    case '\u0225': // ȥ  [LATIN SMALL LETTER Z WITH HOOK]
+                    case '\u0240': // ɀ  [LATIN SMALL LETTER Z WITH SWASH TAIL]
+                    case '\u0290': // ʐ  [LATIN SMALL LETTER Z WITH RETROFLEX HOOK]
+                    case '\u0291': // ʑ  [LATIN SMALL LETTER Z WITH CURL]
+                    case '\u1D76': // ᵶ  [LATIN SMALL LETTER Z WITH MIDDLE TILDE]
+                    case '\u1D8E': // ᶎ  [LATIN SMALL LETTER Z WITH PALATAL HOOK]
+                    case '\u1E91': // ẑ  [LATIN SMALL LETTER Z WITH CIRCUMFLEX]
+                    case '\u1E93': // ẓ  [LATIN SMALL LETTER Z WITH DOT BELOW]
+                    case '\u1E95': // ẕ  [LATIN SMALL LETTER Z WITH LINE BELOW]
+                    case '\u24E9': // ⓩ  [CIRCLED LATIN SMALL LETTER Z]
+                    case '\u2C6C': // ⱬ  [LATIN SMALL LETTER Z WITH DESCENDER]
+                    case '\uA763': // ꝣ  [LATIN SMALL LETTER VISIGOTHIC Z]
+                    case '\uFF5A': // ｚ  [FULLWIDTH LATIN SMALL LETTER Z]
+                        output[outputPos++] = 'z';
+                        break;
+                    case '\u24B5': // ⒵  [PARENTHESIZED LATIN SMALL LETTER Z]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = 'z';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2070': // ⁰  [SUPERSCRIPT ZERO]
+                    case '\u2080': // ₀  [SUBSCRIPT ZERO]
+                    case '\u24EA': // ⓪  [CIRCLED DIGIT ZERO]
+                    case '\u24FF': // ⓿  [NEGATIVE CIRCLED DIGIT ZERO]
+                    case '\uFF10': // ０  [FULLWIDTH DIGIT ZERO]
+                        output[outputPos++] = '0';
+                        break;
+                    case '\u00B9': // ¹  [SUPERSCRIPT ONE]
+                    case '\u2081': // ₁  [SUBSCRIPT ONE]
+                    case '\u2460': // ①  [CIRCLED DIGIT ONE]
+                    case '\u24F5': // ⓵  [DOUBLE CIRCLED DIGIT ONE]
+                    case '\u2776': // ❶  [DINGBAT NEGATIVE CIRCLED DIGIT ONE]
+                    case '\u2780': // ➀  [DINGBAT CIRCLED SANS-SERIF DIGIT ONE]
+                    case '\u278A': // ➊  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE]
+                    case '\uFF11': // １  [FULLWIDTH DIGIT ONE]
+                        output[outputPos++] = '1';
+                        break;
+                    case '\u2488': // ⒈  [DIGIT ONE FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2474': // ⑴  [PARENTHESIZED DIGIT ONE]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00B2': // ²  [SUPERSCRIPT TWO]
+                    case '\u2082': // ₂  [SUBSCRIPT TWO]
+                    case '\u2461': // ②  [CIRCLED DIGIT TWO]
+                    case '\u24F6': // ⓶  [DOUBLE CIRCLED DIGIT TWO]
+                    case '\u2777': // ❷  [DINGBAT NEGATIVE CIRCLED DIGIT TWO]
+                    case '\u2781': // ➁  [DINGBAT CIRCLED SANS-SERIF DIGIT TWO]
+                    case '\u278B': // ➋  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO]
+                    case '\uFF12': // ２  [FULLWIDTH DIGIT TWO]
+                        output[outputPos++] = '2';
+                        break;
+                    case '\u2489': // ⒉  [DIGIT TWO FULL STOP]
+                        output[outputPos++] = '2';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2475': // ⑵  [PARENTHESIZED DIGIT TWO]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '2';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00B3': // ³  [SUPERSCRIPT THREE]
+                    case '\u2083': // ₃  [SUBSCRIPT THREE]
+                    case '\u2462': // ③  [CIRCLED DIGIT THREE]
+                    case '\u24F7': // ⓷  [DOUBLE CIRCLED DIGIT THREE]
+                    case '\u2778': // ❸  [DINGBAT NEGATIVE CIRCLED DIGIT THREE]
+                    case '\u2782': // ➂  [DINGBAT CIRCLED SANS-SERIF DIGIT THREE]
+                    case '\u278C': // ➌  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE]
+                    case '\uFF13': // ３  [FULLWIDTH DIGIT THREE]
+                        output[outputPos++] = '3';
+                        break;
+                    case '\u248A': // ⒊  [DIGIT THREE FULL STOP]
+                        output[outputPos++] = '3';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2476': // ⑶  [PARENTHESIZED DIGIT THREE]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '3';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2074': // ⁴  [SUPERSCRIPT FOUR]
+                    case '\u2084': // ₄  [SUBSCRIPT FOUR]
+                    case '\u2463': // ④  [CIRCLED DIGIT FOUR]
+                    case '\u24F8': // ⓸  [DOUBLE CIRCLED DIGIT FOUR]
+                    case '\u2779': // ❹  [DINGBAT NEGATIVE CIRCLED DIGIT FOUR]
+                    case '\u2783': // ➃  [DINGBAT CIRCLED SANS-SERIF DIGIT FOUR]
+                    case '\u278D': // ➍  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR]
+                    case '\uFF14': // ４  [FULLWIDTH DIGIT FOUR]
+                        output[outputPos++] = '4';
+                        break;
+                    case '\u248B': // ⒋  [DIGIT FOUR FULL STOP]
+                        output[outputPos++] = '4';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2477': // ⑷  [PARENTHESIZED DIGIT FOUR]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '4';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2075': // ⁵  [SUPERSCRIPT FIVE]
+                    case '\u2085': // ₅  [SUBSCRIPT FIVE]
+                    case '\u2464': // ⑤  [CIRCLED DIGIT FIVE]
+                    case '\u24F9': // ⓹  [DOUBLE CIRCLED DIGIT FIVE]
+                    case '\u277A': // ❺  [DINGBAT NEGATIVE CIRCLED DIGIT FIVE]
+                    case '\u2784': // ➄  [DINGBAT CIRCLED SANS-SERIF DIGIT FIVE]
+                    case '\u278E': // ➎  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE]
+                    case '\uFF15': // ５  [FULLWIDTH DIGIT FIVE]
+                        output[outputPos++] = '5';
+                        break;
+                    case '\u248C': // ⒌  [DIGIT FIVE FULL STOP]
+                        output[outputPos++] = '5';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2478': // ⑸  [PARENTHESIZED DIGIT FIVE]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '5';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2076': // ⁶  [SUPERSCRIPT SIX]
+                    case '\u2086': // ₆  [SUBSCRIPT SIX]
+                    case '\u2465': // ⑥  [CIRCLED DIGIT SIX]
+                    case '\u24FA': // ⓺  [DOUBLE CIRCLED DIGIT SIX]
+                    case '\u277B': // ❻  [DINGBAT NEGATIVE CIRCLED DIGIT SIX]
+                    case '\u2785': // ➅  [DINGBAT CIRCLED SANS-SERIF DIGIT SIX]
+                    case '\u278F': // ➏  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX]
+                    case '\uFF16': // ６  [FULLWIDTH DIGIT SIX]
+                        output[outputPos++] = '6';
+                        break;
+                    case '\u248D': // ⒍  [DIGIT SIX FULL STOP]
+                        output[outputPos++] = '6';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2479': // ⑹  [PARENTHESIZED DIGIT SIX]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '6';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2077': // ⁷  [SUPERSCRIPT SEVEN]
+                    case '\u2087': // ₇  [SUBSCRIPT SEVEN]
+                    case '\u2466': // ⑦  [CIRCLED DIGIT SEVEN]
+                    case '\u24FB': // ⓻  [DOUBLE CIRCLED DIGIT SEVEN]
+                    case '\u277C': // ❼  [DINGBAT NEGATIVE CIRCLED DIGIT SEVEN]
+                    case '\u2786': // ➆  [DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN]
+                    case '\u2790': // ➐  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN]
+                    case '\uFF17': // ７  [FULLWIDTH DIGIT SEVEN]
+                        output[outputPos++] = '7';
+                        break;
+                    case '\u248E': // ⒎  [DIGIT SEVEN FULL STOP]
+                        output[outputPos++] = '7';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247A': // ⑺  [PARENTHESIZED DIGIT SEVEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '7';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2078': // ⁸  [SUPERSCRIPT EIGHT]
+                    case '\u2088': // ₈  [SUBSCRIPT EIGHT]
+                    case '\u2467': // ⑧  [CIRCLED DIGIT EIGHT]
+                    case '\u24FC': // ⓼  [DOUBLE CIRCLED DIGIT EIGHT]
+                    case '\u277D': // ❽  [DINGBAT NEGATIVE CIRCLED DIGIT EIGHT]
+                    case '\u2787': // ➇  [DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT]
+                    case '\u2791': // ➑  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT]
+                    case '\uFF18': // ８  [FULLWIDTH DIGIT EIGHT]
+                        output[outputPos++] = '8';
+                        break;
+                    case '\u248F': // ⒏  [DIGIT EIGHT FULL STOP]
+                        output[outputPos++] = '8';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247B': // ⑻  [PARENTHESIZED DIGIT EIGHT]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '8';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2079': // ⁹  [SUPERSCRIPT NINE]
+                    case '\u2089': // ₉  [SUBSCRIPT NINE]
+                    case '\u2468': // ⑨  [CIRCLED DIGIT NINE]
+                    case '\u24FD': // ⓽  [DOUBLE CIRCLED DIGIT NINE]
+                    case '\u277E': // ❾  [DINGBAT NEGATIVE CIRCLED DIGIT NINE]
+                    case '\u2788': // ➈  [DINGBAT CIRCLED SANS-SERIF DIGIT NINE]
+                    case '\u2792': // ➒  [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE]
+                    case '\uFF19': // ９  [FULLWIDTH DIGIT NINE]
+                        output[outputPos++] = '9';
+                        break;
+                    case '\u2490': // ⒐  [DIGIT NINE FULL STOP]
+                        output[outputPos++] = '9';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247C': // ⑼  [PARENTHESIZED DIGIT NINE]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '9';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2469': // ⑩  [CIRCLED NUMBER TEN]
+                    case '\u24FE': // ⓾  [DOUBLE CIRCLED NUMBER TEN]
+                    case '\u277F': // ❿  [DINGBAT NEGATIVE CIRCLED NUMBER TEN]
+                    case '\u2789': // ➉  [DINGBAT CIRCLED SANS-SERIF NUMBER TEN]
+                    case '\u2793': // ➓  [DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '0';
+                        break;
+                    case '\u2491': // ⒑  [NUMBER TEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '0';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247D': // ⑽  [PARENTHESIZED NUMBER TEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '0';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246A': // ⑪  [CIRCLED NUMBER ELEVEN]
+                    case '\u24EB': // ⓫  [NEGATIVE CIRCLED NUMBER ELEVEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '1';
+                        break;
+                    case '\u2492': // ⒒  [NUMBER ELEVEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247E': // ⑾  [PARENTHESIZED NUMBER ELEVEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246B': // ⑫  [CIRCLED NUMBER TWELVE]
+                    case '\u24EC': // ⓬  [NEGATIVE CIRCLED NUMBER TWELVE]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '2';
+                        break;
+                    case '\u2493': // ⒓  [NUMBER TWELVE FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '2';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u247F': // ⑿  [PARENTHESIZED NUMBER TWELVE]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '2';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246C': // ⑬  [CIRCLED NUMBER THIRTEEN]
+                    case '\u24ED': // ⓭  [NEGATIVE CIRCLED NUMBER THIRTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '3';
+                        break;
+                    case '\u2494': // ⒔  [NUMBER THIRTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '3';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2480': // ⒀  [PARENTHESIZED NUMBER THIRTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '3';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246D': // ⑭  [CIRCLED NUMBER FOURTEEN]
+                    case '\u24EE': // ⓮  [NEGATIVE CIRCLED NUMBER FOURTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '4';
+                        break;
+                    case '\u2495': // ⒕  [NUMBER FOURTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '4';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2481': // ⒁  [PARENTHESIZED NUMBER FOURTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '4';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246E': // ⑮  [CIRCLED NUMBER FIFTEEN]
+                    case '\u24EF': // ⓯  [NEGATIVE CIRCLED NUMBER FIFTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '5';
+                        break;
+                    case '\u2496': // ⒖  [NUMBER FIFTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '5';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2482': // ⒂  [PARENTHESIZED NUMBER FIFTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '5';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u246F': // ⑯  [CIRCLED NUMBER SIXTEEN]
+                    case '\u24F0': // ⓰  [NEGATIVE CIRCLED NUMBER SIXTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '6';
+                        break;
+                    case '\u2497': // ⒗  [NUMBER SIXTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '6';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2483': // ⒃  [PARENTHESIZED NUMBER SIXTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '6';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2470': // ⑰  [CIRCLED NUMBER SEVENTEEN]
+                    case '\u24F1': // ⓱  [NEGATIVE CIRCLED NUMBER SEVENTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '7';
+                        break;
+                    case '\u2498': // ⒘  [NUMBER SEVENTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '7';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2484': // ⒄  [PARENTHESIZED NUMBER SEVENTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '7';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2471': // ⑱  [CIRCLED NUMBER EIGHTEEN]
+                    case '\u24F2': // ⓲  [NEGATIVE CIRCLED NUMBER EIGHTEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '8';
+                        break;
+                    case '\u2499': // ⒙  [NUMBER EIGHTEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '8';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2485': // ⒅  [PARENTHESIZED NUMBER EIGHTEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '8';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2472': // ⑲  [CIRCLED NUMBER NINETEEN]
+                    case '\u24F3': // ⓳  [NEGATIVE CIRCLED NUMBER NINETEEN]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '9';
+                        break;
+                    case '\u249A': // ⒚  [NUMBER NINETEEN FULL STOP]
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '9';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2486': // ⒆  [PARENTHESIZED NUMBER NINETEEN]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '1';
+                        output[outputPos++] = '9';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2473': // ⑳  [CIRCLED NUMBER TWENTY]
+                    case '\u24F4': // ⓴  [NEGATIVE CIRCLED NUMBER TWENTY]
+                        output[outputPos++] = '2';
+                        output[outputPos++] = '0';
+                        break;
+                    case '\u249B': // ⒛  [NUMBER TWENTY FULL STOP]
+                        output[outputPos++] = '2';
+                        output[outputPos++] = '0';
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2487': // ⒇  [PARENTHESIZED NUMBER TWENTY]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '2';
+                        output[outputPos++] = '0';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u00AB': // «  [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK]
+                    case '\u00BB': // »  [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK]
+                    case '\u201C': // “  [LEFT DOUBLE QUOTATION MARK]
+                    case '\u201D': // ”  [RIGHT DOUBLE QUOTATION MARK]
+                    case '\u201E': // „  [DOUBLE LOW-9 QUOTATION MARK]
+                    case '\u2033': // ″  [DOUBLE PRIME]
+                    case '\u2036': // ‶  [REVERSED DOUBLE PRIME]
+                    case '\u275D': // ❝  [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT]
+                    case '\u275E': // ❞  [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT]
+                    case '\u276E': // ❮  [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT]
+                    case '\u276F': // ❯  [HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT]
+                    case '\uFF02': // ＂  [FULLWIDTH QUOTATION MARK]
+                        output[outputPos++] = '"';
+                        break;
+                    case '\u2018': // ‘  [LEFT SINGLE QUOTATION MARK]
+                    case '\u2019': // ’  [RIGHT SINGLE QUOTATION MARK]
+                    case '\u201A': // ‚  [SINGLE LOW-9 QUOTATION MARK]
+                    case '\u201B': // ‛  [SINGLE HIGH-REVERSED-9 QUOTATION MARK]
+                    case '\u2032': // ′  [PRIME]
+                    case '\u2035': // ‵  [REVERSED PRIME]
+                    case '\u2039': // ‹  [SINGLE LEFT-POINTING ANGLE QUOTATION MARK]
+                    case '\u203A': // ›  [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK]
+                    case '\u275B': // ❛  [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT]
+                    case '\u275C': // ❜  [HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT]
+                    case '\uFF07': // ＇  [FULLWIDTH APOSTROPHE]
+                        output[outputPos++] = '\'';
+                        break;
+                    case '\u2010': // ‐  [HYPHEN]
+                    case '\u2011': // ‑  [NON-BREAKING HYPHEN]
+                    case '\u2012': // ‒  [FIGURE DASH]
+                    case '\u2013': // –  [EN DASH]
+                    case '\u2014': // —  [EM DASH]
+                    case '\u207B': // ⁻  [SUPERSCRIPT MINUS]
+                    case '\u208B': // ₋  [SUBSCRIPT MINUS]
+                    case '\uFF0D': // －  [FULLWIDTH HYPHEN-MINUS]
+                        output[outputPos++] = '-';
+                        break;
+                    case '\u2045': // ⁅  [LEFT SQUARE BRACKET WITH QUILL]
+                    case '\u2772': // ❲  [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT]
+                    case '\uFF3B': // ［  [FULLWIDTH LEFT SQUARE BRACKET]
+                        output[outputPos++] = '[';
+                        break;
+                    case '\u2046': // ⁆  [RIGHT SQUARE BRACKET WITH QUILL]
+                    case '\u2773': // ❳  [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT]
+                    case '\uFF3D': // ］  [FULLWIDTH RIGHT SQUARE BRACKET]
+                        output[outputPos++] = ']';
+                        break;
+                    case '\u207D': // ⁽  [SUPERSCRIPT LEFT PARENTHESIS]
+                    case '\u208D': // ₍  [SUBSCRIPT LEFT PARENTHESIS]
+                    case '\u2768': // ❨  [MEDIUM LEFT PARENTHESIS ORNAMENT]
+                    case '\u276A': // ❪  [MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT]
+                    case '\uFF08': // （  [FULLWIDTH LEFT PARENTHESIS]
+                        output[outputPos++] = '(';
+                        break;
+                    case '\u2E28': // ⸨  [LEFT DOUBLE PARENTHESIS]
+                        output[outputPos++] = '(';
+                        output[outputPos++] = '(';
+                        break;
+                    case '\u207E': // ⁾  [SUPERSCRIPT RIGHT PARENTHESIS]
+                    case '\u208E': // ₎  [SUBSCRIPT RIGHT PARENTHESIS]
+                    case '\u2769': // ❩  [MEDIUM RIGHT PARENTHESIS ORNAMENT]
+                    case '\u276B': // ❫  [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT]
+                    case '\uFF09': // ）  [FULLWIDTH RIGHT PARENTHESIS]
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u2E29': // ⸩  [RIGHT DOUBLE PARENTHESIS]
+                        output[outputPos++] = ')';
+                        output[outputPos++] = ')';
+                        break;
+                    case '\u276C': // ❬  [MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT]
+                    case '\u2770': // ❰  [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT]
+                    case '\uFF1C': // ＜  [FULLWIDTH LESS-THAN SIGN]
+                        output[outputPos++] = '<';
+                        break;
+                    case '\u276D': // ❭  [MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT]
+                    case '\u2771': // ❱  [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT]
+                    case '\uFF1E': // ＞  [FULLWIDTH GREATER-THAN SIGN]
+                        output[outputPos++] = '>';
+                        break;
+                    case '\u2774': // ❴  [MEDIUM LEFT CURLY BRACKET ORNAMENT]
+                    case '\uFF5B': // ｛  [FULLWIDTH LEFT CURLY BRACKET]
+                        output[outputPos++] = '{';
+                        break;
+                    case '\u2775': // ❵  [MEDIUM RIGHT CURLY BRACKET ORNAMENT]
+                    case '\uFF5D': // ｝  [FULLWIDTH RIGHT CURLY BRACKET]
+                        output[outputPos++] = '}';
+                        break;
+                    case '\u207A': // ⁺  [SUPERSCRIPT PLUS SIGN]
+                    case '\u208A': // ₊  [SUBSCRIPT PLUS SIGN]
+                    case '\uFF0B': // ＋  [FULLWIDTH PLUS SIGN]
+                        output[outputPos++] = '+';
+                        break;
+                    case '\u207C': // ⁼  [SUPERSCRIPT EQUALS SIGN]
+                    case '\u208C': // ₌  [SUBSCRIPT EQUALS SIGN]
+                    case '\uFF1D': // ＝  [FULLWIDTH EQUALS SIGN]
+                        output[outputPos++] = '=';
+                        break;
+                    case '\uFF01': // ！  [FULLWIDTH EXCLAMATION MARK]
+                        output[outputPos++] = '!';
+                        break;
+                    case '\u203C': // ‼  [DOUBLE EXCLAMATION MARK]
+                        output[outputPos++] = '!';
+                        output[outputPos++] = '!';
+                        break;
+                    case '\u2049': // ⁉  [EXCLAMATION QUESTION MARK]
+                        output[outputPos++] = '!';
+                        output[outputPos++] = '?';
+                        break;
+                    case '\uFF03': // ＃  [FULLWIDTH NUMBER SIGN]
+                        output[outputPos++] = '#';
+                        break;
+                    case '\uFF04': // ＄  [FULLWIDTH DOLLAR SIGN]
+                        output[outputPos++] = '$';
+                        break;
+                    case '\u2052': // ⁒  [COMMERCIAL MINUS SIGN]
+                    case '\uFF05': // ％  [FULLWIDTH PERCENT SIGN]
+                        output[outputPos++] = '%';
+                        break;
+                    case '\uFF06': // ＆  [FULLWIDTH AMPERSAND]
+                        output[outputPos++] = '&';
+                        break;
+                    case '\u204E': // ⁎  [LOW ASTERISK]
+                    case '\uFF0A': // ＊  [FULLWIDTH ASTERISK]
+                        output[outputPos++] = '*';
+                        break;
+                    case '\uFF0C': // ，  [FULLWIDTH COMMA]
+                        output[outputPos++] = ',';
+                        break;
+                    case '\uFF0E': // ．  [FULLWIDTH FULL STOP]
+                        output[outputPos++] = '.';
+                        break;
+                    case '\u2044': // ⁄  [FRACTION SLASH]
+                    case '\uFF0F': // ／  [FULLWIDTH SOLIDUS]
+                        output[outputPos++] = '/';
+                        break;
+                    case '\uFF1A': // ：  [FULLWIDTH COLON]
+                        output[outputPos++] = ':';
+                        break;
+                    case '\u204F': // ⁏  [REVERSED SEMICOLON]
+                    case '\uFF1B': // ；  [FULLWIDTH SEMICOLON]
+                        output[outputPos++] = ';';
+                        break;
+                    case '\uFF1F': // ？  [FULLWIDTH QUESTION MARK]
+                        output[outputPos++] = '?';
+                        break;
+                    case '\u2047': // ⁇  [DOUBLE QUESTION MARK]
+                        output[outputPos++] = '?';
+                        output[outputPos++] = '?';
+                        break;
+                    case '\u2048': // ⁈  [QUESTION EXCLAMATION MARK]
+                        output[outputPos++] = '?';
+                        output[outputPos++] = '!';
+                        break;
+                    case '\uFF20': // ＠  [FULLWIDTH COMMERCIAL AT]
+                        output[outputPos++] = '@';
+                        break;
+                    case '\uFF3C': // ＼  [FULLWIDTH REVERSE SOLIDUS]
+                        output[outputPos++] = '\\';
+                        break;
+                    case '\u2038': // ‸  [CARET]
+                    case '\uFF3E': // ＾  [FULLWIDTH CIRCUMFLEX ACCENT]
+                        output[outputPos++] = '^';
+                        break;
+                    case '\uFF3F': // ＿  [FULLWIDTH LOW LINE]
+                        output[outputPos++] = '_';
+                        break;
+                    case '\u2053': // ⁓  [SWUNG DASH]
+                    case '\uFF5E': // ～  [FULLWIDTH TILDE]
+                        output[outputPos++] = '~';
+                        break;
+                    default:
+                        output[outputPos++] = c;
+                        break;
+                }
+            }
+        }
+        return outputPos;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineBuilder.java b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineBuilder.java
new file mode 100644
index 000000000000..3a6a72603df2
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineBuilder.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+/**
+ * Assembles a chain of {@link FilterPipelineTask}s; each added task is linked
+ * behind the ones added before it.
+ */
+public class FilterPipelineBuilder
+{
+    private final FilterPipelineTask head;
+    private FilterPipelineTask tail;
+
+    /** Starts a chain whose first task is {@code first}. */
+    public FilterPipelineBuilder(FilterPipelineTask first)
+    {
+        head = first;
+        tail = first;
+    }
+
+    /**
+     * Links {@code nextTask} behind the most recently added task, forwarding {@code name} to {@code setLast}.
+     */
+    public FilterPipelineBuilder add(String name, FilterPipelineTask nextTask)
+    {
+        tail.setLast(name, nextTask);
+        tail = nextTask;
+        return this;
+    }
+
+    /** Returns the task the builder was created with, i.e. the start of the chain. */
+    public FilterPipelineTask build()
+    {
+        return head;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
new file mode 100644
index 000000000000..ce5c5143dce1
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineExecutor.java
@@ -0,0 +1,50 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+/**
+ * Runs a chain of {@link FilterPipelineTask}s one after another, feeding
+ * each task's output to the task that follows it.
+ */
+public class FilterPipelineExecutor
+{
+    /**
+     * Applies {@code task} and then every task reachable through its
+     * {@code next} links, in order. The first task always runs; the chain
+     * ends at the first task whose {@code next} is {@code null}.
+     *
+     * @param task         first task of the chain
+     * @param initialInput value handed to the first task
+     * @return the output of the final task in the chain
+     * @throws NullPointerException if {@code task} is {@code null}
+     */
+    public static String execute(FilterPipelineTask task, String initialInput)
+    {
+        String value = task.process(initialInput);
+        for (FilterPipelineTask step = task.next; step != null; step = step.next)
+            value = step.process(value);
+        return value;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineTask.java b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineTask.java
new file mode 100644
index 000000000000..4e759547def7
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/analyzer/filter/FilterPipelineTask.java
@@ -0,0 +1,60 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+/**
+ * One unit of work in a filter pipeline: turns an input string into an
+ * output string and holds a reference to the task that runs after it.
+ */
+public abstract class FilterPipelineTask
+{
+    private String name;
+    public FilterPipelineTask next;
+
+    /** Appends {@code last} to the tail of this chain; the former tail records {@code name}. */
+    void setLast(String name, FilterPipelineTask last)
+    {
+        // evaluated at every node on the way to the tail, so a task already in the chain is rejected
+        if (this == last)
+            throw new IllegalArgumentException("provided last task [" + last.name + "] cannot be set to itself");
+
+        if (next != null)
+        {
+            next.setLast(name, last);
+            return;
+        }
+
+        next = last;
+        this.name = name;
+    }
+
+    public abstract String process(String input);
+
+    public String getName()
+    {
+        return name;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/BKDQueries.java b/src/java/org/apache/cassandra/index/sai/disk/BKDQueries.java
new file mode 100644
index 000000000000..1a982c0ca902
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/BKDQueries.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.util.FutureArrays;
+
+import static org.apache.lucene.index.PointValues.Relation.CELL_INSIDE_QUERY;
+
+//TODO: possible perf improvements when running on Java 9+ after replacing FutureArrays with Arrays; this needs a MR jar;
+class BKDQueries
+{
+    private static final BKDReader.IntersectVisitor MATCH_ALL = new BKDReader.IntersectVisitor() // returned when the expression has no bounds
+    {
+        @Override
+        public boolean visit(byte[] packedValue)
+        {
+            return true;
+        }
+
+        @Override
+        public Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+        {
+            return CELL_INSIDE_QUERY;
+        }
+    };
+
+    static BKDReader.IntersectVisitor bkdQueryFrom(Expression expression, int numDim, int bytesPerDim)
+    {
+        if (expression.lower == null && expression.upper == null)
+        {
+            return MATCH_ALL;
+        }
+
+        Bound lower = null ;
+        if (expression.lower != null)
+        {
+            final byte[] lowerBound = toComparableBytes(numDim, bytesPerDim, expression.lower.value.encoded, expression.validator);
+            lower = new Bound(lowerBound, !expression.lower.inclusive); // Bound stores exclusivity, hence the negation
+        }
+
+        Bound upper = null;
+        if (expression.upper != null)
+        {
+            final byte[] upperBound = toComparableBytes(numDim, bytesPerDim, expression.upper.value.encoded, expression.validator);
+            upper = new Bound(upperBound, !expression.upper.inclusive); // Bound stores exclusivity, hence the negation
+        }
+
+        return new RangeQueryVisitor(numDim, bytesPerDim, lower, upper); // at least one of lower/upper is non-null here
+    }
+
+    // TODO: probably move this to TypeUtil
+    private static byte[] toComparableBytes(int numDim, int bytesPerDim, ByteBuffer value, AbstractType<?> type)
+    {
+        byte[] buffer = new byte[TypeUtil.fixedSizeOf(type)];
+        assert buffer.length == bytesPerDim * numDim; // the encoded value must span all dimensions exactly
+        TypeUtil.toComparableBytes(value, type, buffer);
+        return buffer;
+    }
+
+    private static abstract class RangeQuery implements BKDReader.IntersectVisitor
+    {
+        final int numDims;
+        final int bytesPerDim;
+
+        RangeQuery(int numDims, int bytesPerDim)
+        {
+            this.numDims = numDims;
+            this.bytesPerDim = bytesPerDim;
+        }
+
+        int compareUnsigned(byte[] packedValue, int dim, Bound bound)
+        {
+            final int offset = dim * bytesPerDim; // the same byte range is compared in both the value and the bound
+            return FutureArrays.compareUnsigned(packedValue, offset, offset + bytesPerDim, bound.bound, offset, offset + bytesPerDim);
+        }
+    }
+
+    private static class Bound
+    {
+        private final byte[] bound;
+        private final boolean exclusive;
+
+        Bound(byte[] bound, boolean exclusive)
+        {
+            this.bound = bound;
+            this.exclusive = exclusive;
+        }
+
+        boolean smallerThan(int cmp) // cmp is the result of comparing a value against this bound
+        {
+            return cmp > 0 || (cmp == 0 && exclusive); // value is above the bound, or equal to an exclusive bound
+        }
+
+        boolean greaterThan(int cmp) // cmp is the result of comparing a value against this bound
+        {
+            return cmp < 0 || (cmp == 0 && exclusive); // value is below the bound, or equal to an exclusive bound
+        }
+    }
+
+    private static class RangeQueryVisitor extends RangeQuery
+    {
+        private final Bound lower;
+        private final Bound upper;
+
+        private RangeQueryVisitor(int numDims, int bytesPerDim, Bound lower, Bound upper)
+        {
+            super(numDims, bytesPerDim);
+            this.lower = lower;
+            this.upper = upper;
+        }
+
+        @Override
+        public boolean visit(byte[] packedValue)
+        {
+            for (int dim = 0; dim < numDims; dim++)
+            {
+                if (lower != null)
+                {
+                    int cmp = compareUnsigned(packedValue, dim, lower);
+                    if (lower.greaterThan(cmp))
+                    {
+                        // value is too low in this dimension
+                        return false;
+                    }
+                }
+
+                if (upper != null)
+                {
+                    int cmp = compareUnsigned(packedValue, dim, upper);
+                    if (upper.smallerThan(cmp))
+                    {
+                        // value is too high in this dimension
+                        return false;
+                    }
+                }
+            }
+
+            return true;
+        }
+
+        @Override
+        public Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+        {
+            boolean crosses = false;
+
+            for (int dim = 0; dim < numDims; dim++)
+            {
+                if (lower != null)
+                {
+                    int maxCmp = compareUnsigned(maxPackedValue, dim, lower);
+                    if (lower.greaterThan(maxCmp)) // even the cell's maximum is below the lower bound
+                        return Relation.CELL_OUTSIDE_QUERY;
+
+                    int minCmp = compareUnsigned(minPackedValue, dim, lower);
+                    crosses |= lower.greaterThan(minCmp); // the cell's minimum is below the lower bound
+                }
+
+                if (upper != null)
+                {
+                    int minCmp = compareUnsigned(minPackedValue, dim, upper);
+                    if (upper.smallerThan(minCmp)) // even the cell's minimum is above the upper bound
+                        return Relation.CELL_OUTSIDE_QUERY;
+
+                    int maxCmp = compareUnsigned(maxPackedValue, dim, upper);
+                    crosses |= upper.smallerThan(maxCmp); // the cell's maximum is above the upper bound
+                }
+            }
+
+            if (crosses)
+            {
+                return Relation.CELL_CROSSES_QUERY;
+            }
+            else
+            {
+                return Relation.CELL_INSIDE_QUERY;
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/ByteSliceReader.java b/src/java/org/apache/cassandra/index/sai/disk/ByteSliceReader.java
new file mode 100644
index 000000000000..29ac961ab964
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/ByteSliceReader.java
@@ -0,0 +1,168 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ByteBlockPool;
+
+/* DataInput that knows how to read the byte slices written
+ * by Posting and PostingVector.  We read the bytes in
+ * each slice until we hit the end of that slice, at which
+ * point we read the forwarding address of the next slice
+ * and then jump to it. */
+final class ByteSliceReader extends DataInput
+{
+    ByteBlockPool pool; // pool whose buffers hold the slices
+    int bufferUpto; // index into pool.buffers of the buffer currently being read
+    byte[] buffer; // pool.buffers[bufferUpto]
+    public int upto; // read position within buffer
+    int limit; // position within buffer where the readable bytes of the current slice end
+    int level; // index into ByteBlockPool.LEVEL_SIZE_ARRAY for the current slice
+    public int bufferOffset; // absolute position of buffer[0], i.e. bufferUpto * BYTE_BLOCK_SIZE
+
+    public int endIndex; // absolute position at which reading stops; see eof()
+
+    public void init(ByteBlockPool pool, int startIndex, int endIndex)
+    {
+
+        assert endIndex - startIndex >= 0 : "startIndex=" + startIndex + " endIndex=" + endIndex;
+        assert startIndex >= 0;
+        assert endIndex >= 0;
+
+        this.pool = pool;
+        this.endIndex = endIndex;
+
+        level = 0;
+        bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+        bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
+        buffer = pool.buffers[bufferUpto];
+        upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK; // startIndex expressed as a position inside buffer
+
+        final int firstSize = ByteBlockPool.LEVEL_SIZE_ARRAY[0];
+
+        if (startIndex + firstSize >= endIndex)
+        {
+            // There is only this one slice to read
+            limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK;
+        }
+        else
+            limit = upto + firstSize - 4; // stop before the 4-byte forwarding address that ends the slice
+    }
+
+    public boolean eof()
+    {
+        assert upto + bufferOffset <= endIndex;
+        return upto + bufferOffset == endIndex; // absolute read position has reached endIndex
+    }
+
+    @Override
+    public byte readByte()
+    {
+        assert !eof();
+        assert upto <= limit;
+        if (upto == limit) // current slice exhausted: follow its forwarding address first
+            nextSlice();
+        return buffer[upto++];
+    }
+
+    public long writeTo(DataOutput out) throws IOException
+    {
+        long size = 0; // running total of bytes handed to out; this is the return value
+        while (true)
+        {
+            if (limit + bufferOffset == endIndex) // the current slice is the final one
+            {
+                assert endIndex - bufferOffset >= upto;
+                out.writeBytes(buffer, upto, limit - upto);
+                size += limit - upto;
+                break;
+            }
+            else
+            {
+                out.writeBytes(buffer, upto, limit - upto);
+                size += limit - upto;
+                nextSlice(); // repositions buffer, upto and limit on the following slice
+            }
+        }
+
+        return size;
+    }
+
+    public void nextSlice()
+    {
+
+        // Skip to our next slice: the forwarding address is the 4 bytes at buffer[limit..limit+3], most significant byte first
+        final int nextIndex = ((buffer[limit] & 0xff) << 24) + ((buffer[1 + limit] & 0xff) << 16) + ((buffer[2 + limit] & 0xff) << 8) + (buffer[3 + limit] & 0xff);
+
+        level = ByteBlockPool.NEXT_LEVEL_ARRAY[level];
+        final int newSize = ByteBlockPool.LEVEL_SIZE_ARRAY[level];
+
+        bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+        bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
+
+        buffer = pool.buffers[bufferUpto];
+        upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK;
+
+        if (nextIndex + newSize >= endIndex)
+        {
+            // We are advancing to the final slice
+            assert endIndex - nextIndex > 0;
+            limit = endIndex - bufferOffset;
+        }
+        else
+        {
+            // This is not the final slice (subtract 4 for the
+            // forwarding address at the end of this new slice)
+            limit = upto + newSize - 4;
+        }
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len)
+    {
+        while (len > 0)
+        {
+            final int numLeft = limit - upto; // unread bytes remaining in the current slice
+            if (numLeft < len)
+            {
+                // Read the rest of this slice, then move on to the next one
+                System.arraycopy(buffer, upto, b, offset, numLeft);
+                offset += numLeft;
+                len -= numLeft;
+                nextSlice();
+            }
+            else
+            {
+                // The remainder of the request fits in the current slice
+                System.arraycopy(buffer, upto, b, offset, len);
+                upto += len;
+                break;
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/ColumnIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/ColumnIndexWriter.java
new file mode 100644
index 000000000000..6c6d39f73d16
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/ColumnIndexWriter.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.Row;
+
+/**
+ * Creates an on-disk index for a given column.
+ */
+public interface ColumnIndexWriter
+{
+    /**
+     * Adds a row to this index.
+     */
+    void addRow(DecoratedKey rowKey, long ssTableRowId, Row row) throws IOException;
+
+    /**
+     * Builds on-disk index data structures from accumulated data, moves them all to the filesystem, and fsyncs the created files.
+     */
+    void flush() throws IOException;
+
+    /**
+     * Aborts accumulating data. Allows resources to be cleaned up on error.
+     * <p>
+     * Note: Implementations should be idempotent, i.e. safe to call multiple times without producing undesirable side-effects.
+     */
+    void abort(Throwable cause);
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValues.java b/src/java/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValues.java
new file mode 100644
index 000000000000..9e8c1f16ab5a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValues.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+/**
+ * {@link MutablePointValues} that prevents buffered points from reordering, and always skips the sorting phase in Lucene.
+ * It's the responsibility of the underlying implementation to ensure that all points are correctly sorted.
+ * <p>
+ * It allows us to take advantage of an optimised 1-dim writer {@link BKDWriter}
+ * (that is enabled only for {@link MutablePointValues}), and reduces the number of times we sort point values.
+ */
+public class ImmutableOneDimPointValues extends MutableOneDimPointValues
+{
+    private final TermsIterator termEnum; // drained by intersect(); terms are visited in iteration order
+    private final AbstractType<?> termComparator; // wildcard instead of a raw type, matching the constructor parameter
+    private final byte[] scratch; // reused for every term; length comes from TypeUtil.fixedSizeOf(termComparator)
+
+    private ImmutableOneDimPointValues(TermsIterator termEnum, AbstractType<?> termComparator)
+    {
+        this.termEnum = termEnum;
+        this.termComparator = termComparator;
+        this.scratch = new byte[TypeUtil.fixedSizeOf(termComparator)];
+    }
+
+    public static ImmutableOneDimPointValues fromTermEnum(TermsIterator termEnum, AbstractType<?> termComparator)
+    {
+        return new ImmutableOneDimPointValues(termEnum, termComparator);
+    }
+
+    @Override
+    public void intersect(IntersectVisitor visitor) throws IOException
+    {
+        while (termEnum.hasNext()) // advances the shared iterator, so a second call continues where the first stopped
+        {
+            ByteBufferUtil.toBytes(termEnum.next().asComparableBytes(ByteComparable.Version.OSS41), scratch); // encode the term into scratch
+            try (final PostingList postings = termEnum.postings()) // closed after each term, even if the visitor throws
+            {
+                long segmentRowId;
+                while ((segmentRowId = postings.nextPosting()) != PostingList.END_OF_STREAM)
+                {
+                    visitor.visit(segmentRowId, scratch); // the same array instance is passed on every call
+                }
+            }
+        }
+    }
+
+    @Override
+    public int getBytesPerDimension()
+    {
+        return scratch.length; // one dimension, so the whole scratch buffer is a single value
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/IndexSearcher.java b/src/java/org/apache/cassandra/index/sai/disk/IndexSearcher.java
new file mode 100644
index 000000000000..123129484c5d
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/IndexSearcher.java
@@ -0,0 +1,224 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.ColumnQueryMetrics;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+/**
+ * Abstract reader for individual segments of an on-disk index.
+ *
+ * Accepts shared resources (token/offset file readers), and uses them to perform lookups against on-disk data
+ * structures.
+ */
+public abstract class IndexSearcher implements Closeable
+{
+    private final LongArray.Factory rowIdToTokenFactory;
+    private final LongArray.Factory rowIdToOffsetFactory;
+    private final SSTableContext.KeyFetcher keyFetcher;
+    final SSTableIndex.PerIndexFiles indexFiles;
+
+    final SegmentMetadata metadata;
+
+    final IndexComponents indexComponents;
+
+    IndexSearcher(Segment segment) // every field is copied from the given segment
+    {
+        this.indexComponents = segment.indexFiles.components();
+        this.rowIdToTokenFactory = segment.segmentRowIdToTokenFactory;
+        this.rowIdToOffsetFactory = segment.segmentRowIdToOffsetFactory;
+        this.keyFetcher = segment.keyFetcher;
+        this.indexFiles = segment.indexFiles;
+        this.metadata = segment.metadata;
+    }
+
+    public static IndexSearcher open(boolean isString, Segment segment, ColumnQueryMetrics listener) throws IOException
+    {
+        // the listener is cast unchecked: a listener type that does not match isString fails with ClassCastException
+        return isString ? open(segment, (QueryEventListener.TrieIndexEventListener) listener)
+                        : open(segment, (QueryEventListener.BKDIndexEventListener) listener);
+    }
+
+    public static InvertedIndexSearcher open(Segment segment, QueryEventListener.TrieIndexEventListener listener) throws IOException
+    {
+        return new InvertedIndexSearcher(segment, listener);
+    }
+
+    public static KDTreeIndexSearcher open(Segment segment, QueryEventListener.BKDIndexEventListener listener) throws IOException
+    {
+        return new KDTreeIndexSearcher(segment, listener);
+    }
+
+    /**
+     * @return number of per-index open files attached to a sstable
+     */
+    public static int openPerIndexFiles(AbstractType<?> columnType)
+    {
+        return TypeUtil.isLiteral(columnType) ? InvertedIndexSearcher.openPerIndexFiles() : KDTreeIndexSearcher.openPerIndexFiles();
+    }
+
+    /**
+     * @return memory usage of underlying on-disk data structure
+     */
+    public abstract long indexFileCacheSize();
+
+    /**
+     * Search on-disk index synchronously.
+     *
+     * @param expression to filter on disk index
+     * @param queryContext to track per sstable cache and per query metrics
+     * @param defer create the iterator in a deferred state
+     *
+     * @return {@link RangeIterator} that matches given expression
+     */
+    public abstract RangeIterator search(Expression expression, SSTableQueryContext queryContext, boolean defer);
+
+    RangeIterator toIterator(PostingList postingList, SSTableQueryContext queryContext, boolean defer)
+    {
+        if (postingList == null) // a null posting list is treated as "no matches"
+            return RangeIterator.empty();
+
+        SearcherContext searcherContext = defer ? new DeferredSearcherContext(queryContext, postingList.peekable())
+                                                : new DirectSearcherContext(queryContext, postingList.peekable());
+
+        if (searcherContext.noOverlap) // NOTE(review): noOverlap is never assigned in this class; confirm where it is set
+            return RangeIterator.empty();
+
+        RangeIterator iterator = new PostingListRangeIterator(searcherContext, keyFetcher, indexComponents);
+
+        return iterator;
+    }
+
+    public abstract class SearcherContext
+    {
+        long minToken;
+        long maxToken;
+        long maxPartitionOffset;
+        boolean noOverlap;
+        final LongArray segmentRowIdToToken;
+        final LongArray segmentRowIdToOffset;
+        final SSTableQueryContext context;
+        final PostingList.PeekablePostingList postingList;
+
+        SearcherContext(SSTableQueryContext context, PostingList.PeekablePostingList postingList)
+        {
+            this.context = context;
+            this.postingList = postingList;
+
+            // startingIndex of 0 means `findTokenRowId` should search all tokens in the segment.
+            this.segmentRowIdToToken = new LongArray.DeferredLongArray(() -> rowIdToTokenFactory.openTokenReader(0, context));
+            this.segmentRowIdToOffset = new LongArray.DeferredLongArray(() -> rowIdToOffsetFactory.open());
+
+            minToken = calculateMinimumToken(); // abstract call from the constructor: overrides may only rely on the fields set above
+
+            // use segment's metadata for the range iterator, may not be accurate, but should not matter to performance.
+            maxToken = metadata.maxKey.isMinimum()
+                       ? toLongToken(DatabaseDescriptor.getPartitioner().getMaximumToken())
+                       : toLongToken(metadata.maxKey);
+
+            maxPartitionOffset = Long.MAX_VALUE;
+        }
+
+        long minToken()
+        {
+            return minToken;
+        }
+
+        long maxToken()
+        {
+            return maxToken;
+        }
+
+        abstract long calculateMinimumToken();
+
+        abstract long count();
+
+    }
+
+    public class DirectSearcherContext extends SearcherContext
+    {
+        DirectSearcherContext(SSTableQueryContext context, PostingList.PeekablePostingList postingList)
+        {
+            super(context, postingList);
+        }
+
+        @Override
+        long calculateMinimumToken()
+        {
+            // Use the first row id's token as min; this peeks the posting list while the context is being constructed
+            return this.segmentRowIdToToken.get(postingList.peek());
+        }
+
+        @Override
+        long count()
+        {
+            return postingList.size();
+        }
+    }
+
+    public class DeferredSearcherContext extends SearcherContext
+    {
+        DeferredSearcherContext(SSTableQueryContext context, PostingList.PeekablePostingList postingList)
+        {
+            super(context, postingList);
+        }
+
+        @Override
+        long calculateMinimumToken()
+        {
+            // Use the segment's min key as min, so the posting list is not touched during construction
+            return toLongToken(metadata.minKey);
+        }
+
+        @Override
+        long count()
+        {
+            return metadata.numRows; // taken from segment metadata rather than from the posting list
+        }
+    }
+
+    private static long toLongToken(DecoratedKey key)
+    {
+        return toLongToken(key.getToken());
+    }
+
+    private static long toLongToken(ByteBuffer key)
+    {
+        return toLongToken(DatabaseDescriptor.getPartitioner().getToken(key));
+    }
+
+    private static long toLongToken(Token token)
+    {
+        return (long) token.getTokenValue(); // NOTE(review): presumably only valid when the token value is a Long; confirm
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/IndexWriterConfig.java b/src/java/org/apache/cassandra/index/sai/disk/IndexWriterConfig.java
new file mode 100644
index 000000000000..4a1dbec3d256
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/IndexWriterConfig.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.Map;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+/**
+ * Per-index config for storage-attached index writers.
+ */
+public class IndexWriterConfig
+{
+    public static final String POSTING_LIST_LVL_MIN_LEAVES = "bkd_postings_min_leaves";
+    public static final String POSTING_LIST_LVL_SKIP_OPTION = "bkd_postings_skip";
+
+    private static final int DEFAULT_POSTING_LIST_MIN_LEAVES = 64;
+    private static final int DEFAULT_POSTING_LIST_LVL_SKIP = 3;
+
+    private static final IndexWriterConfig EMPTY_CONFIG = new IndexWriterConfig(null, -1, -1); // null name, -1 for both settings
+
+    /**
+     * Fully qualified index name, in the format {@code <keyspace>.<table>.<index_name>}.
+     */
+    private final String indexName;
+
+    /**
+     * Skip, or the sampling interval, for selecting a bkd tree level that is eligible for an auxiliary posting list.
+     * Sampling starts from 0, but bkd tree root node is at level 1. For skip = 4, eligible levels are 4, 8, 12, etc (no
+     * level 0, because there is no node at level 0).
+     */
+    private final int bkdPostingsSkip;
+
+    /**
+     * Min. number of reachable leaves for a given node to be eligible for an auxiliary posting list.
+     */
+    private final int bkdPostingsMinLeaves;
+
+    public IndexWriterConfig(String indexName, int bkdPostingsSkip, int bkdPostingsMinLeaves) // performs no validation
+    {
+        this.indexName = indexName;
+        this.bkdPostingsSkip = bkdPostingsSkip;
+        this.bkdPostingsMinLeaves = bkdPostingsMinLeaves;
+    }
+
+    public String getIndexName()
+    {
+        return indexName;
+    }
+
+    public int getBkdPostingsMinLeaves()
+    {
+        return bkdPostingsMinLeaves;
+    }
+
+    public int getBkdPostingsSkip()
+    {
+        return bkdPostingsSkip;
+    }
+
+    public static IndexWriterConfig fromOptions(String indexName, AbstractType<?> type, Map<String, String> options)
+    {
+        int minLeaves = DEFAULT_POSTING_LIST_MIN_LEAVES;
+        int skip = DEFAULT_POSTING_LIST_LVL_SKIP;
+
+        if (options.get(POSTING_LIST_LVL_MIN_LEAVES) != null || options.get(POSTING_LIST_LVL_SKIP_OPTION) != null)
+        {
+            if (TypeUtil.isLiteral(type)) // the two bkd options are rejected for literal types
+            {
+                throw new InvalidRequestException(String.format("CQL type %s cannot have auxiliary posting lists on index %s.", type.asCQL3Type(), indexName));
+            }
+
+            for (Map.Entry<String, String> entry : options.entrySet()) // keys other than the two bkd options are ignored (no default case)
+            {
+                switch (entry.getKey())
+                {
+                    case POSTING_LIST_LVL_MIN_LEAVES:
+                    {
+                        minLeaves = Integer.parseInt(entry.getValue()); // NOTE(review): a non-numeric value surfaces as NumberFormatException
+
+                        if (minLeaves < 1)
+                        {
+                            throw new InvalidRequestException(String.format("Posting list min. leaves count can't be less than 1 on index %s.", indexName));
+                        }
+
+                        break;
+                    }
+
+                    case POSTING_LIST_LVL_SKIP_OPTION:
+                    {
+                        skip = Integer.parseInt(entry.getValue()); // NOTE(review): a non-numeric value surfaces as NumberFormatException
+
+                        if (skip < 1)
+                        {
+                            throw new InvalidRequestException(String.format("Posting list skip can't be less than 1 on index %s.", indexName));
+                        }
+
+                        break;
+                    }
+                }
+            }
+        }
+
+        return new IndexWriterConfig(indexName, skip, minLeaves); // constructor order is (name, skip, minLeaves)
+    }
+
+    public static IndexWriterConfig defaultConfig(String indexName)
+    {
+        return new IndexWriterConfig(indexName, DEFAULT_POSTING_LIST_LVL_SKIP, DEFAULT_POSTING_LIST_MIN_LEAVES);
+    }
+
+    public static IndexWriterConfig emptyConfig()
+    {
+        return EMPTY_CONFIG;
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("IndexWriterConfig{%s=%d, %s=%d}", // the index name is not part of the output
+                             POSTING_LIST_LVL_SKIP_OPTION, bkdPostingsSkip,
+                             POSTING_LIST_LVL_MIN_LEAVES, bkdPostingsMinLeaves);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/InvertedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/disk/InvertedIndexSearcher.java
new file mode 100644
index 000000000000..96763f86ec37
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/InvertedIndexSearcher.java
@@ -0,0 +1,120 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.TermsReader;
+import org.apache.cassandra.index.sai.metrics.MulticastQueryEventListeners;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Executes {@link Expression}s against the trie-based terms dictionary for an individual index segment.
+ */
+public class InvertedIndexSearcher extends IndexSearcher
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final TermsReader reader;
+    private final QueryEventListener.TrieIndexEventListener perColumnEventListener;
+
+    InvertedIndexSearcher(Segment segment, QueryEventListener.TrieIndexEventListener listener) throws IOException
+    {
+        super(segment);
+        perColumnEventListener = listener;
+
+        // Root position of the terms data as recorded in the segment metadata (asserted to be non-negative).
+        final long termsRoot = metadata.getIndexRoot(indexComponents.termsData);
+        assert termsRoot >= 0;
+
+        // The footer pointer is read from the TERMS_DATA attributes and falls back to -1 when it is not present.
+        final Map<String, String> attributes = metadata.componentMetadatas.get(IndexComponents.NDIType.TERMS_DATA).attributes;
+        final String rawFooterPointer = attributes.get(SAICodecUtils.FOOTER_POINTER);
+        final long footerPointer = rawFooterPointer != null ? Long.parseLong(rawFooterPointer) : -1;
+        reader = new TermsReader(indexComponents,
+                                 indexFiles.termsData().sharedCopy(),
+                                 indexFiles.postingLists().sharedCopy(),
+                                 termsRoot, footerPointer);
+    }
+
+    @Override
+    public long indexFileCacheSize()
+    {
+        // Nothing is pre-allocated for the trie, so zero is reported.
+        // TODO: Is this still the case now the trie isn't using the chunk cache?
+        return 0;
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    public RangeIterator search(Expression exp, SSTableQueryContext context, boolean defer)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace(indexComponents.logMessage("Searching on expression '{}'..."), exp);
+
+        if (!exp.getOp().isEquality())
+            throw new IllegalArgumentException(indexComponents.logMessage("Unsupported expression: " + exp));
+
+        final ByteComparable lookupTerm = ByteComparable.fixedLength(exp.lower.value.encoded);
+        final QueryEventListener.TrieIndexEventListener events = MulticastQueryEventListeners.of(context.queryContext, perColumnEventListener);
+        // When defer is set, the lookup is handed to DeferredPostingList as a supplier instead of being invoked here.
+        final PostingList matches = defer ? new PostingList.DeferredPostingList(() -> reader.exactMatch(lookupTerm, events, context.queryContext))
+                                          : reader.exactMatch(lookupTerm, events, context.queryContext);
+        return toIterator(matches, context, defer);
+    }
+
+    public static int openPerIndexFiles()
+    {
+        return TermsReader.openPerIndexFiles();
+    }
+
+    @Override
+    public String toString()
+    {
+        final String diskSize = RamUsageEstimator.humanReadableUnits(indexComponents.sizeOfPerColumnComponents());
+        return MoreObjects.toStringHelper(this)
+                          .add("indexComponents", indexComponents)
+                          .add("diskSize", diskSize).toString();
+    }
+
+    @Override
+    public void close()
+    {
+        reader.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcher.java
new file mode 100644
index 000000000000..ef7e1e141d6a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcher.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+
+import com.google.common.base.MoreObjects;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.metrics.MulticastQueryEventListeners;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.index.sai.disk.BKDQueries.bkdQueryFrom;
+
+/**
+ * Executes {@link Expression}s against the kd-tree for an individual index segment.
+ */
+public class KDTreeIndexSearcher extends IndexSearcher
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final BKDReader bkdReader;
+    private final QueryEventListener.BKDIndexEventListener perColumnEventListener;
+
+    KDTreeIndexSearcher(Segment segment, QueryEventListener.BKDIndexEventListener listener) throws IOException
+    {
+        super(segment);
+
+        // Root positions of the kd-tree and of its posting lists, taken from the segment metadata.
+        final long bkdPosition = metadata.getIndexRoot(indexComponents.kdTree);
+        assert bkdPosition >= 0;
+        final long postingsPosition = metadata.getIndexRoot(indexComponents.kdTreePostingLists);
+        assert postingsPosition >= 0;
+
+        bkdReader = new BKDReader(indexFiles.components(),
+                                  indexFiles.kdtree().sharedCopy(),
+                                  bkdPosition,
+                                  indexFiles.kdtreePostingLists().sharedCopy(),
+                                  postingsPosition);
+        perColumnEventListener = listener;
+    }
+
+    @Override
+    public long indexFileCacheSize()
+    {
+        return bkdReader.memoryUsage();
+    }
+
+    /**
+     * Runs {@code exp} against the kd-tree; only equality and range operators are accepted, anything else throws {@link IllegalArgumentException}.
+     */
+    @Override
+    @SuppressWarnings("resource")
+    public RangeIterator search(Expression exp, SSTableQueryContext context, boolean defer)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace(indexComponents.logMessage("Searching on expression '{}'..."), exp);
+
+        // Reject unsupported operators up front; the message is decorated by logMessage() exactly once.
+        if (!exp.getOp().isEqualityOrRange())
+            throw new IllegalArgumentException(indexComponents.logMessage("Unsupported expression during index query: " + exp));
+
+        final BKDReader.IntersectVisitor query = bkdQueryFrom(exp, bkdReader.getNumDimensions(), bkdReader.getBytesPerDimension());
+        QueryEventListener.BKDIndexEventListener listener = MulticastQueryEventListeners.of(context.queryContext, perColumnEventListener);
+
+        PostingList postingList = defer ? new PostingList.DeferredPostingList(() -> bkdReader.intersect(query, listener, context.queryContext))
+                                        : bkdReader.intersect(query, listener, context.queryContext);
+        return toIterator(postingList, context, defer);
+    }
+
+    public static int openPerIndexFiles()
+    {
+        return BKDReader.openPerIndexFiles();
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("indexComponents", indexComponents)
+                          .add("diskSize", FBUtilities.prettyPrintMemory(indexComponents.sizeOfPerColumnComponents()))
+                          .add("count", bkdReader.getPointCount())
+                          .add("numDimensions", bkdReader.getNumDimensions())
+                          .add("bytesPerDimension", bkdReader.getBytesPerDimension())
+                          .toString();
+    }
+
+    @Override
+    public void close()
+    {
+        bkdReader.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
new file mode 100644
index 000000000000..cb276d4842fc
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.InvertedIndexWriter;
+import org.apache.cassandra.index.sai.disk.v1.MetadataWriter;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+import org.apache.cassandra.index.sai.memory.MemtableIndex;
+import org.apache.cassandra.index.sai.memory.RowMapping;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Column index writer that flushes indexed data directly from the corresponding Memtable index, without buffering index
+ * data in memory.
+ */
+public class MemtableIndexWriter implements ColumnIndexWriter
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final MemtableIndex memtable;
+    private final RowMapping rowMapping;
+    private final ColumnContext context;
+    private final Descriptor descriptor;
+    private final IndexComponents indexComponents;
+
+    public MemtableIndexWriter(MemtableIndex memtable,
+                               Descriptor descriptor,
+                               ColumnContext context,
+                               RowMapping rowMapping,
+                               CompressionParams compressionParams)
+    {
+        assert rowMapping != null && rowMapping != RowMapping.DUMMY : "Row mapping must exist during FLUSH.";
+
+        this.descriptor = descriptor;
+        this.context = context;
+        this.rowMapping = rowMapping;
+        this.memtable = memtable;
+        // Handle used below for the completion marker, the metadata output and the cleanup on abort.
+        this.indexComponents = IndexComponents.create(context.getIndexName(), descriptor, compressionParams);
+    }
+
+    @Override
+    public void addRow(DecoratedKey rowKey, long ssTableRowId, Row row)
+    {
+        // Intentionally empty: the Memtable index is written out in one go by flush(), using
+        // the row mapping between primary keys and row IDs of the flushing SSTable, so there
+        // is nothing to do as individual rows are flushed.
+    }
+
+    @Override
+    public void abort(Throwable cause)
+    {
+        logger.warn(context.logMessage("Aborting index memtable flush for {}..."), descriptor, cause);
+        indexComponents.deleteColumnIndex();
+    }
+
+    @Override
+    public void flush() throws IOException
+    {
+        final long startNanos = System.nanoTime();
+
+        try
+        {
+            final boolean hasIndexedRows = rowMapping.hasRows() && memtable != null && !memtable.isEmpty();
+            if (!hasIndexedRows)
+            {
+                logger.debug(context.logMessage("No indexed rows to flush from SSTable {}."), descriptor);
+                // Nothing was written, but the completion marker is still created so that the
+                // index is not built again for this SSTable.
+                indexComponents.createColumnCompletionMarker();
+                return;
+            }
+
+            final DecoratedKey minKey = rowMapping.minKey;
+            final DecoratedKey maxKey = rowMapping.maxKey;
+            final Iterator<Pair<ByteComparable, IntArrayList>> mergedTerms = rowMapping.merge(memtable);
+
+            try (MemtableTermsIterator terms = new MemtableTermsIterator(memtable.getMinTerm(), memtable.getMaxTerm(), mergedTerms))
+            {
+                final long flushedCells = flush(minKey, maxKey, context.getValidator(), terms, rowMapping.maxSegmentRowId);
+
+                // The completion marker is only created once the index data has been written without error.
+                indexComponents.createColumnCompletionMarker();
+                context.getIndexMetrics().memtableIndexFlushCount.inc();
+
+                // Clamped to at least 1 ms so the rate below is never computed over a zero duration.
+                final long elapsedMillis = Math.max(1, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos));
+
+                if (logger.isTraceEnabled())
+                    logger.trace(context.logMessage("Flushed {} Memtable index cells for {} in {} ms."), flushedCells, descriptor, elapsedMillis);
+
+                // Cells per second, derived from the millisecond duration.
+                context.getIndexMetrics().memtableFlushCellsPerSecond.update((long) (flushedCells * 1000.0 / elapsedMillis));
+            }
+        }
+        catch (Throwable t)
+        {
+            logger.error(context.logMessage("Error while flushing index {}"), t.getMessage(), t);
+            context.getIndexMetrics().memtableIndexFlushErrors.inc();
+            // The failure has been logged and counted; the original throwable propagates unchanged.
+            throw t;
+        }
+    }
+
+    private long flush(DecoratedKey minKey, DecoratedKey maxKey, AbstractType<?> termComparator, MemtableTermsIterator terms, int maxSegmentRowId) throws IOException
+    {
+        long rowCount;
+        SegmentMetadata.ComponentMetadataMap componentMetas;
+
+        if (!TypeUtil.isLiteral(termComparator))
+        {
+            try (NumericIndexWriter numericWriter = new NumericIndexWriter(indexComponents,
+                                                                           TypeUtil.fixedSizeOf(termComparator),
+                                                                           maxSegmentRowId,
+                                                                           // Due to stale entries in IndexMemtable, we may have more indexed rows than num of rowIds.
+                                                                           Integer.MAX_VALUE,
+                                                                           context.getIndexWriterConfig(),
+                                                                           false))
+            {
+                componentMetas = numericWriter.writeAll(ImmutableOneDimPointValues.fromTermEnum(terms, termComparator));
+                rowCount = numericWriter.getPointCount();
+            }
+        }
+        else
+        {
+            try (InvertedIndexWriter invertedWriter = new InvertedIndexWriter(indexComponents, false))
+            {
+                componentMetas = invertedWriter.writeAll(terms);
+                rowCount = invertedWriter.getPostingsCount();
+            }
+        }
+
+        if (rowCount == 0)
+            return 0;
+
+        // The flushed data is ordered by term; it is described by a single segment metadata entry.
+        SegmentMetadata segment = new SegmentMetadata(0, rowCount, terms.getMinSSTableRowId(), terms.getMaxSSTableRowId(),
+                                                      minKey, maxKey, terms.getMinTerm(), terms.getMaxTerm(), componentMetas);
+
+        try (MetadataWriter metadataWriter = new MetadataWriter(indexComponents.createOutput(indexComponents.meta)))
+        {
+            SegmentMetadata.write(metadataWriter, Collections.singletonList(segment), null);
+        }
+
+        return rowCount;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MemtableTermsIterator.java b/src/java/org/apache/cassandra/index/sai/disk/MemtableTermsIterator.java
new file mode 100644
index 000000000000..3a99639064c3
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/MemtableTermsIterator.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+
+import com.carrotsearch.hppc.IntArrayList;
+import com.carrotsearch.hppc.cursors.IntCursor;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Iterator over a token range bounded segment of a Memtable index. Used to flush Memtable index segments to disk.
+ */
+public class MemtableTermsIterator implements TermsIterator
+{
+    private final ByteBuffer minTerm;
+    private final ByteBuffer maxTerm;
+    private final Iterator<Pair<ByteComparable, IntArrayList>> iterator;
+    // Entry most recently returned by next(); stays null until next() has been called.
+    private Pair<ByteComparable, IntArrayList> current;
+    // Bounds over the row IDs seen by postings(); they keep these initial values until postings() is called.
+    private long minSSTableRowId = Long.MAX_VALUE;
+    private long maxSSTableRowId = -1;
+
+    public MemtableTermsIterator(ByteBuffer minTerm,
+                                 ByteBuffer maxTerm,
+                                 Iterator<Pair<ByteComparable, IntArrayList>> iterator)
+    {
+        Preconditions.checkArgument(iterator != null);
+        this.iterator = iterator;
+        this.minTerm = minTerm;
+        this.maxTerm = maxTerm;
+    }
+
+    @Override
+    public ByteBuffer getMinTerm()
+    {
+        return minTerm;
+    }
+
+    @Override
+    public ByteBuffer getMaxTerm()
+    {
+        return maxTerm;
+    }
+
+    @Override
+    public void close() {}
+
+    /**
+     * Returns the postings of the term most recently returned by {@link #next()} and folds them into the min/max row ID bounds.
+     */
+    @Override
+    public PostingList postings()
+    {
+        //TODO Confirm that this can stay an IntArray post DSP-19608
+        final IntArrayList rowIds = current.right;
+
+        assert rowIds.size() > 0;
+
+        // Only the first and last entries are inspected to widen the tracked bounds,
+        // which presumes the list is in ascending order.
+        final int firstRowId = rowIds.get(0);
+        final int lastRowId = rowIds.get(rowIds.size() - 1);
+        minSSTableRowId = Math.min(minSSTableRowId, firstRowId);
+        maxSSTableRowId = Math.max(maxSSTableRowId, lastRowId);
+
+        final Iterator<IntCursor> cursors = rowIds.iterator();
+        return new PostingList()
+        {
+            @Override
+            public long nextPosting()
+            {
+                // Row IDs are handed out in list order until the cursor runs dry.
+                return cursors.hasNext() ? cursors.next().value : END_OF_STREAM;
+            }
+
+            @Override
+            public long size()
+            {
+                return rowIds.size();
+            }
+
+            @Override
+            public long advance(long targetRowID)
+            {
+                // Skipping is not supported by this posting list.
+                throw new UnsupportedOperationException();
+            }
+        };
+    }
+
+    @Override
+    public boolean hasNext()
+    {
+        return iterator.hasNext();
+    }
+
+    @Override
+    public ByteComparable next()
+    {
+        current = iterator.next();
+        return current.left;
+    }
+
+    long getMaxSSTableRowId()
+    {
+        return maxSSTableRowId;
+    }
+
+    long getMinSSTableRowId()
+    {
+        return minSSTableRowId;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MergeOneDimPointValues.java b/src/java/org/apache/cassandra/index/sai/disk/MergeOneDimPointValues.java
new file mode 100644
index 000000000000..f7bbc8a75cbf
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/MergeOneDimPointValues.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+/**
+ * {@link MutableOneDimPointValues} that prevents buffered points from being reordered, and always skips the sorting phase in Lucene.
+ * It's the responsibility of the underlying implementation to ensure that all points are correctly sorted.
+ * <p>
+ * It allows us to take advantage of an optimised 1-dim writer {@link BKDWriter}
+ * (that is enabled only for {@link MutableOneDimPointValues}), and to reduce the number of times we sort point values.
+ */
+public class MergeOneDimPointValues extends MutableOneDimPointValues
+{
+    // Only the length of this buffer is used (see getBytesPerDimension()); the values passed to
+    // the visitor come from each iterator's own scratch buffer.
+    private final byte[] scratch;
+    private final MergeQueue queue;
+
+    // Running statistics over the row IDs visited by intersect(); they keep their initial values until it runs.
+    public long minRowID = Long.MAX_VALUE;
+    public long maxRowID = Long.MIN_VALUE;
+    public long numRows = 0;
+
+    /**
+     * Merges the given iterators, taking the value width from the fixed size of {@code termComparator}.
+     *
+     * @param iterators      per-source iterators; each one is expected to yield its points already sorted
+     * @param termComparator type whose fixed size determines the number of bytes per dimension
+     */
+    public MergeOneDimPointValues(List<BKDReader.IteratorState> iterators, AbstractType<?> termComparator) throws IOException
+    {
+        this(iterators, TypeUtil.fixedSizeOf(termComparator));
+    }
+
+    /**
+     * Merges the given iterators using an explicit value width.
+     *
+     * @param iterators   per-source iterators; those with no elements are left out of the merge queue
+     * @param bytesPerDim number of bytes per dimension reported by {@link #getBytesPerDimension()}
+     */
+    @VisibleForTesting
+    public MergeOneDimPointValues(List<BKDReader.IteratorState> iterators, int bytesPerDim) throws IOException
+    {
+        queue = new MergeQueue(iterators.size());
+        this.scratch = new byte[bytesPerDim];
+        for (BKDReader.IteratorState iterator : iterators)
+        {
+            if (iterator.hasNext())
+                queue.add(iterator);
+        }
+    }
+
+    public long getMinRowID()
+    {
+        return minRowID;
+    }
+
+    public long getMaxRowID()
+    {
+        return maxRowID;
+    }
+
+    public long getNumRows()
+    {
+        return numRows;
+    }
+
+    /**
+     * Drains the merge queue, passing every (row ID, value) pair to {@code visitor} in queue order while
+     * updating {@link #minRowID}, {@link #maxRowID} and {@link #numRows}. The queue is empty afterwards.
+     */
+    @Override
+    @SuppressWarnings("resource")
+    public void intersect(IntersectVisitor visitor) throws IOException
+    {
+        while (queue.size() != 0)
+        {
+            final BKDReader.IteratorState reader = queue.top();
+            if (!reader.hasNext())
+            {
+                // iterator is exhausted
+                queue.pop();
+                continue;
+            }
+
+            final long rowID = reader.next();
+
+            minRowID = Math.min(minRowID, rowID);
+            maxRowID = Math.max(maxRowID, rowID);
+            numRows++;
+
+            visitor.visit(rowID, reader.scratch);
+
+            // Re-position the advanced iterator in the queue, or drop it once it has nothing left.
+            if (reader.hasNext())
+                queue.updateTop();
+            else
+                queue.pop();
+        }
+    }
+
+    @Override
+    public int getBytesPerDimension()
+    {
+        return scratch.length;
+    }
+
+    /**
+     * Priority queue over iterator states, ordered by their {@code compareTo}.
+     */
+    private static class MergeQueue extends PriorityQueue<BKDReader.IteratorState>
+    {
+        public MergeQueue(int maxSize)
+        {
+            super(maxSize);
+        }
+
+        @Override
+        public boolean lessThan(BKDReader.IteratorState a, BKDReader.IteratorState b)
+        {
+            assert a != b;
+
+            return a.compareTo(b) < 0;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MergingIterator.java b/src/java/org/apache/cassandra/index/sai/disk/MergingIterator.java
new file mode 100644
index 000000000000..7a82014bb76a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/MergingIterator.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.lucene.util.PriorityQueue;
+
+public final class MergingIterator implements Iterator<ByteComparable>
+{
+    private ByteComparable current;
+    private final TermMergeQueue queue;
+    // Sub-iterators whose current element was returned by the last next() call; the first numTop slots are in use.
+    final SubIterator[] top;
+    private final boolean removeDuplicates = true;
+    private int numTop;
+
+    /**
+     * Primes a merge queue with the first element of every non-empty source iterator.
+     *
+     * @param type      handed to the internal queue
+     * @param iterators sources to merge; empty ones are ignored
+     */
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public MergingIterator(AbstractType type, Iterator<ByteComparable>... iterators)
+    {
+        queue = new TermMergeQueue(iterators.length, type);
+        top = new SubIterator[iterators.length];
+        int nextIndex = 0;
+        for (Iterator<ByteComparable> source : iterators)
+        {
+            if (!source.hasNext())
+                continue;
+
+            SubIterator sub = new SubIterator();
+            sub.current = source.next();
+            sub.iterator = source;
+            sub.index = nextIndex++;
+            queue.add(sub);
+        }
+    }
+
+    @Override
+    public boolean hasNext()
+    {
+        if (queue.size() > 0)
+            return true;
+
+        // The queue may be empty while sub-iterators parked in top[] still have elements to offer.
+        for (int i = 0; i < numTop; i++)
+        {
+            if (top[i].iterator.hasNext())
+                return true;
+        }
+        return false;
+    }
+
+    public int getNumTop()
+    {
+        return numTop;
+    }
+
+    @Override
+    public ByteComparable next()
+    {
+        // Advance the sub-iterators consumed by the previous call and put them back into the queue.
+        pushTop();
+
+        if (queue.size() == 0)
+        {
+            current = null;
+            throw new NoSuchElementException();
+        }
+
+        // Collect every sub-iterator positioned on the element at the top of the queue.
+        pullTop();
+        if (current == null)
+            throw new NoSuchElementException();
+        return current;
+    }
+
+    @Override
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Moves the sub-iterator at the top of the queue, plus all sub-iterators tied with it, from the queue into {@code top}.
+     */
+    private void pullTop()
+    {
+        assert numTop == 0;
+        top[numTop++] = queue.pop();
+        final SubIterator head = top[0];
+        // Also pop every other sub-iterator whose current element compares equal to the head's.
+        while (removeDuplicates && queue.size() != 0
+               && ByteComparable.compare(queue.top().current, head.current, ByteComparable.Version.OSS41) == 0)
+        {
+            top[numTop++] = queue.pop();
+        }
+        current = head.current;
+    }
+
+    private void pushTop()
+    {
+        // Step each previously returned sub-iterator forward and re-queue the ones that still have an element.
+        for (int i = 0; i < numTop; i++)
+        {
+            final SubIterator sub = top[i];
+            if (sub.iterator.hasNext())
+            {
+                sub.current = sub.iterator.next();
+                queue.add(sub);
+            }
+            else
+            {
+                // no more elements
+                sub.current = null;
+            }
+        }
+        numTop = 0;
+    }
+
+    /**
+     * A source iterator together with its most recently fetched element and the index assigned to it at construction.
+     */
+    public static class SubIterator
+    {
+        Iterator<ByteComparable> iterator;
+        ByteComparable current;
+        int index;
+    }
+
+    private static class TermMergeQueue extends PriorityQueue<SubIterator>
+    {
+        final AbstractType type;
+
+        TermMergeQueue(int size, AbstractType type)
+        {
+            super(size);
+            this.type = type;
+        }
+
+        @Override
+        protected boolean lessThan(SubIterator a, SubIterator b)
+        {
+            final int cmp = ByteComparable.compare(a.current, b.current, ByteComparable.Version.OSS41);
+
+            // Equal elements are ordered by the index assigned to their sub-iterator at construction.
+            return cmp != 0 ? cmp < 0 : a.index < b.index;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MutableOneDimPointValues.java b/src/java/org/apache/cassandra/index/sai/disk/MutableOneDimPointValues.java
new file mode 100644
index 000000000000..3c42b2b74fd7
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/MutableOneDimPointValues.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.util.BytesRef;
+
+public abstract class MutableOneDimPointValues extends MutablePointValues
+{
+    private static final byte[] EMPTY = new byte[0]; // returned by both getMinPackedValue() and getMaxPackedValue()
+
+    abstract public void intersect(IntersectVisitor visitor) throws IOException; // takes the long-docID visitor declared at the bottom of this class
+
+    @Override
+    public int getDocCount()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long size()
+    {
+        // hack to skip sorting in Lucene; swap() below throws if a sort is attempted anyway
+        return 1;
+    }
+
+    @Override
+    public void getValue(int i, BytesRef packedValue)
+    {
+        // no-op: packedValue is left untouched
+    }
+
+    @Override
+    public byte getByteAt(int i, int k)
+    {
+        return 0; // constant, independent of the arguments
+    }
+
+    @Override
+    public int getDocID(int i)
+    {
+        return 0; // constant, independent of the argument
+    }
+
+    @Override
+    public void swap(int i, int j)
+    {
+        throw new IllegalStateException("unexpected sorting");
+    }
+
+    @Override
+    public void intersect(PointValues.IntersectVisitor visitor) throws IOException
+    {
+        throw new UnsupportedOperationException(); // Lucene's visitor type is rejected; see the abstract overload above
+    }
+
+    @Override
+    public long estimatePointCount(PointValues.IntersectVisitor visitor)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public byte[] getMinPackedValue()
+    {
+        return EMPTY;
+    }
+
+    @Override
+    public byte[] getMaxPackedValue()
+    {
+        return EMPTY;
+    }
+
+    @Override
+    public int getNumDimensions()
+    {
+        return 1;
+    }
+
+    @Override
+    public int getBytesPerDimension()
+    {
+        return 0;
+    }
+
+    public interface IntersectVisitor
+    {
+        /** Called for all documents in a leaf cell that crosses the query.  The consumer
+         *  should scrutinize the packedValue to decide whether to accept it.  In the 1D case,
+         *  values are visited in increasing order, and in the case of ties, in increasing
+         *  docID order.  Unlike Lucene's own visitor, the docID here is a {@code long}. */
+        void visit(long docID, byte[] packedValue) throws IOException;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/OnDiskKeyProducer.java b/src/java/org/apache/cassandra/index/sai/disk/OnDiskKeyProducer.java
new file mode 100644
index 000000000000..6361c6bfac6f
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/OnDiskKeyProducer.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.Collections;
+import java.util.Iterator;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.io.util.RandomAccessReader;
+
+/**
+ * Creates {@link Token}s whose {@link DecoratedKey} is read from disk only when {@link Token#keys()} is invoked.
+ * All tokens from one producer share its key reader and its record of the last key offset handed out.
+ */
+@NotThreadSafe
+public class OnDiskKeyProducer
+{
+    public static final long NO_OFFSET = -1;
+
+    private final SSTableContext.KeyFetcher keyFetcher;
+    private final RandomAccessReader reader;
+    private final LongArray segmentRowIdToOffset;
+
+    private final long maxPartitionOffset;
+
+    private long lastOffset = NO_OFFSET;
+
+    public OnDiskKeyProducer(SSTableContext.KeyFetcher keyFetcher, RandomAccessReader reader, LongArray segmentRowIdToOffset, long maxPartitionOffset)
+    {
+        this.keyFetcher = keyFetcher;
+        this.reader = reader;
+        this.segmentRowIdToOffset = segmentRowIdToOffset;
+        this.maxPartitionOffset = maxPartitionOffset;
+    }
+
+    public Token produceToken(long token, long segmentRowId)
+    {
+        return new OnDiskToken(token, segmentRowId);
+    }
+
+    /**
+     * Resolves the key offset for a segment row ID. Rows of a wide partition share a key offset, so an offset
+     * equal to the last one returned is reported as {@link #NO_OFFSET} to avoid producing the same key twice.
+     *
+     * @return the key offset, or {@link #NO_OFFSET} if it is a repeat or lies beyond {@code maxPartitionOffset}
+     */
+    private long getKeyOffset(long segmentRowId)
+    {
+        final long candidate = segmentRowIdToOffset.get(segmentRowId);
+
+        // Due to ZCS, index files may still contain partition offsets that are not part of partial SSTable.
+        if (candidate == lastOffset || candidate > maxPartitionOffset)
+            return NO_OFFSET;
+
+        // Remember the offset only when it is valid
+        lastOffset = candidate;
+        return candidate;
+    }
+
+    public class OnDiskToken extends Token
+    {
+        private final long segmentRowId;
+
+        public OnDiskToken(long token, long segmentRowId)
+        {
+            super(token);
+            this.segmentRowId = segmentRowId;
+        }
+
+        /**
+         * Reads the key lazily; yields an empty iterator whenever the fetcher returns {@code null} for the resolved offset.
+         */
+        @Override
+        public Iterator<DecoratedKey> keys()
+        {
+            final DecoratedKey fetched = keyFetcher.apply(reader, getKeyOffset(segmentRowId));
+            if (fetched == null)
+                return Collections.emptyIterator();
+            return Iterators.singletonIterator(fetched);
+        }
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this).add("token", token).add("lastOffset", lastOffset).toString();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/PostingList.java b/src/java/org/apache/cassandra/index/sai/disk/PostingList.java
new file mode 100644
index 000000000000..5dfefaf6b933
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/PostingList.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.function.Supplier;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.utils.Throwables;
+
+/**
+ * Interface for advancing on and consuming a posting list.
+ */
+//TODO Need to check int and long usage throughout this post DSP-19608
+@NotThreadSafe
+public interface PostingList extends Closeable
+{
+    long OFFSET_NOT_FOUND = -1;
+    long END_OF_STREAM = Long.MAX_VALUE;
+
+    @Override
+    default void close() throws IOException {}
+
+    /**
+     * Retrieves the next segment row ID, not including row IDs that have been returned by {@link #advance(long)}.
+     *
+     * @return next segment row ID, or {@link #END_OF_STREAM} once the list is exhausted
+     */
+    long nextPosting() throws IOException;
+
+    long size();
+
+    /**
+     * Advances to the first row ID beyond the current that is greater than or equal to the
+     * target, and returns that row ID. Exhausts the iterator and returns {@link #END_OF_STREAM} if
+     * the target is greater than the highest row ID.
+     * <p>
+     * Note: Callers must use the return value of this method before calling {@link #nextPosting()}, as calling
+     * that method will return the next posting, not the one to which we have just advanced.
+     *
+     * @param targetRowID target row ID to advance to
+     * @return first segment row ID which is {@code >=} the target row ID or {@link #END_OF_STREAM} if one does not exist
+     */
+    long advance(long targetRowID) throws IOException;
+
+    /**
+     * @return peekable wrapper of current posting list
+     */
+    default PeekablePostingList peekable()
+    {
+        return new PeekablePostingList(this);
+    }
+
+    /** Defers obtaining the underlying list until first use; a {@code null} supply acts as an empty list. */
+    class DeferredPostingList implements PostingList
+    {
+        private final Supplier<PostingList> supplier;
+        private PostingList postingList;
+        private boolean opened = false;
+
+        public DeferredPostingList(Supplier<PostingList> supplier)
+        {
+            this.supplier = supplier;
+        }
+
+        @Override
+        public long nextPosting() throws IOException
+        {
+            open();
+            return postingList == null ? END_OF_STREAM : postingList.nextPosting();
+        }
+
+        @Override
+        public long size()
+        {
+            open();
+            return postingList == null ? 0 : postingList.size();
+        }
+
+        @Override
+        public long advance(long targetRowID) throws IOException
+        {
+            open();
+            return postingList == null ? END_OF_STREAM : postingList.advance(targetRowID);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            if (opened && (postingList != null))
+                postingList.close();
+        }
+
+        private void open()
+        {
+            if (!opened)
+            {
+                postingList = supplier.get();
+                opened = true;
+            }
+        }
+    }
+
+    /** Adds a one-posting look-ahead buffer in front of another posting list. */
+    public static class PeekablePostingList implements PostingList
+    {
+        private final PostingList wrapped;
+
+        private boolean peeked = false;
+        private long next;
+
+        public PeekablePostingList(PostingList wrapped)
+        {
+            this.wrapped = wrapped;
+        }
+
+        public long peek()
+        {
+            if (peeked)
+                return next;
+
+            try
+            {
+                next = wrapped.nextPosting();
+                peeked = true; // set only after a successful read, so a failed read never leaves a stale value buffered
+                return next;
+            }
+            catch (IOException e)
+            {
+                throw Throwables.cleaned(e);
+            }
+        }
+
+        public long advanceWithoutConsuming(long targetRowID) throws IOException
+        {
+            final long head = peek();
+            if (head == END_OF_STREAM || head >= targetRowID)
+                return head; // exhausted, or the buffered posting already satisfies the target
+
+            peeked = true;
+            next = wrapped.advance(targetRowID);
+            return next;
+        }
+
+        @Override
+        public long nextPosting() throws IOException
+        {
+            if (peeked)
+            {
+                peeked = false;
+                return next;
+            }
+            return wrapped.nextPosting();
+        }
+
+        @Override
+        public long size()
+        {
+            return wrapped.size();
+        }
+
+        @Override
+        public long advance(long targetRowID) throws IOException
+        {
+            if (peeked && next >= targetRowID)
+            {
+                peeked = false;
+                return next;
+            }
+
+            peeked = false;
+            return wrapped.advance(targetRowID);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            wrapped.close();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/PostingListRangeIterator.java b/src/java/org/apache/cassandra/index/sai/disk/PostingListRangeIterator.java
new file mode 100644
index 000000000000..0077dacad4ed
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/PostingListRangeIterator.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.concurrent.TimeUnit;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.AbortedOperationException;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.Throwables;
+
+/**
+ * A range iterator based on {@link PostingList}.
+ *
+ * <ol>
+ *   <li> fetch next segment row id from posting list or skip to specific segment row id if {@link #skipTo(Long)} is called </li>
+ *   <li> produce a {@link OnDiskKeyProducer.OnDiskToken} from {@link OnDiskKeyProducer#produceToken(long, long)} which is used
+ *       to avoid fetching duplicated keys due to partition-level indexing on wide partition schema.
+ *       <br/>
+ *       Note: in order to reduce disk access in multi-index query, partition keys will only be fetched for intersected tokens
+ *       in {@link org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher}.
+ *  </li>
+ * </ol>
+ *
+ */
+@NotThreadSafe
+public class PostingListRangeIterator extends RangeIterator
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final Stopwatch timeToExhaust = Stopwatch.createStarted();
+    private final SSTableQueryContext queryContext;
+    private final IndexComponents components;
+
+    private final PostingList postingList;
+    private final SSTableContext.KeyFetcher keyFetcher;
+    private final IndexSearcher.SearcherContext context;
+    private final LongArray segmentRowIdToToken;
+    private final LongArray segmentRowIdToOffset;
+
+    private RandomAccessReader keyReader = null;
+    private OnDiskKeyProducer producer = null;
+
+    private boolean opened = false;
+    private boolean needsSkipping = false;
+    private long skipToToken = Long.MIN_VALUE;
+
+    /**
+     * Create a direct PostingListRangeIterator where the underlying PostingList is materialised
+     * immediately so the posting list size can be used.
+     */
+    public PostingListRangeIterator(IndexSearcher.SearcherContext context,
+                                    SSTableContext.KeyFetcher keyFetcher,
+                                    IndexComponents components)
+    {
+        super(context.minToken(), context.maxToken(), context.count());
+
+        this.keyFetcher = keyFetcher;
+        this.segmentRowIdToToken = context.segmentRowIdToToken;
+        this.segmentRowIdToOffset = context.segmentRowIdToOffset;
+        this.postingList = context.postingList;
+        this.context = context;
+        this.queryContext = context.context;
+        this.components = components;
+    }
+
+    /** Only records the furthest requested token; the posting list is advanced lazily by the next {@link #computeNext()}. */
+    @Override
+    protected void performSkipTo(Long nextToken)
+    {
+        if (skipToToken >= nextToken)
+            return;
+
+        skipToToken = nextToken;
+        needsSkipping = true;
+    }
+
+    @Override
+    protected Token computeNext()
+    {
+        try
+        {
+            queryContext.queryContext.checkpoint();
+
+            if (!opened)
+                open();
+
+            // just end the iterator if we don't have a postingList or current segment is skipped
+            if (exhausted())
+                return endOfData();
+
+            long segmentRowId = getNextSegmentRowId();
+            if (segmentRowId == PostingList.END_OF_STREAM)
+                return endOfData();
+
+            return getNextToken(segmentRowId);
+        }
+        catch (Throwable t)
+        {
+            //TODO We aren't tidying up resources here
+            if (!(t instanceof AbortedOperationException))
+                logger.error(components.logMessage("Unable to provide next token!"), t);
+
+            throw Throwables.cleaned(t);
+        }
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        if (logger.isTraceEnabled())
+        {
+            final long exhaustedInMills = timeToExhaust.stop().elapsed(TimeUnit.MILLISECONDS);
+            logger.trace(components.logMessage("PostinListRangeIterator exhausted after {} ms"), exhaustedInMills);
+        }
+
+        try
+        {
+            postingList.close();
+        }
+        finally
+        {
+            // release the remaining resources even when closing the posting list throws
+            FileUtils.closeQuietly(segmentRowIdToToken, segmentRowIdToOffset, keyReader);
+        }
+    }
+
+    private void open()
+    {
+        this.keyReader = keyFetcher.createReader();
+        this.producer = new OnDiskKeyProducer(keyFetcher, keyReader, segmentRowIdToOffset, context.maxPartitionOffset);
+        opened = true;
+    }
+
+    private boolean exhausted()
+    {
+        return needsSkipping && skipToToken > getMaximum();
+    }
+
+    /**
+     * reads the next row ID from the underlying posting list, potentially skipping to get there.
+     */
+    private long getNextSegmentRowId() throws IOException
+    {
+        if (!needsSkipping)
+            return postingList.nextPosting();
+
+        // kept as a long: PostingList#advance takes a long, so narrowing to int would only add an overflow failure
+        long targetRowID = segmentRowIdToToken.findTokenRowID(skipToToken);
+        // skipToToken is larger than max token in token file
+        if (targetRowID < 0)
+            return PostingList.END_OF_STREAM;
+
+        long segmentRowId = postingList.advance(targetRowID);
+        needsSkipping = false;
+        return segmentRowId;
+    }
+
+    /**
+     * takes a segment row ID and produces a {@link Token} for its partition key.
+     */
+    private Token getNextToken(long segmentRowId)
+    {
+        assert segmentRowId != PostingList.END_OF_STREAM;
+
+        long tokenValue = segmentRowIdToToken.get(segmentRowId);
+
+        // Used to remove duplicated key offset
+        return producer.produceToken(tokenValue, segmentRowId);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java b/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
new file mode 100644
index 000000000000..711b543eed8e
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+
+public class QueryEventListeners
+{
+    public static final QueryEventListener NO_OP = new BaseQueryEventListener(); // its accessors return the no-op enum singletons below
+
+    public static final QueryEventListener.BKDIndexEventListener NO_OP_BKD_LISTENER = NO_OP.bkdIndexEventListener();
+
+    public static final QueryEventListener.TrieIndexEventListener NO_OP_TRIE_LISTENER = NO_OP.trieIndexEventListener();
+
+    public static final QueryEventListener.PostingListEventListener NO_OP_POSTINGS_LISTENER = new NoOpPostingListEventListener();
+
+    private static class BaseQueryEventListener implements QueryEventListener
+    {
+        @Override
+        public BKDIndexEventListener bkdIndexEventListener()
+        {
+            return NoOpBKDIndexEventListener.INSTANCE;
+        }
+
+        @Override
+        public TrieIndexEventListener trieIndexEventListener()
+        {
+            return NoOpTrieIndexEventListener.INSTANCE;
+        }
+
+        private enum NoOpTrieIndexEventListener implements TrieIndexEventListener
+        {
+            INSTANCE; // enum used as a singleton; every callback below has an empty body
+
+            @Override
+            public void onSegmentHit() { }
+
+            @Override
+            public void onTraversalComplete(long traversalTotalTime, TimeUnit unit) { }
+
+            @Override
+            public PostingListEventListener postingListEventListener()
+            {
+                return NO_OP_POSTINGS_LISTENER; // the shared no-op postings listener of the outer class
+            }
+        }
+
+        private enum NoOpBKDIndexEventListener implements BKDIndexEventListener
+        {
+            INSTANCE; // enum used as a singleton; every callback below has an empty body
+
+            @Override
+            public void onIntersectionComplete(long intersectionTotalTime, TimeUnit unit) { }
+
+            @Override
+            public void onIntersectionEarlyExit() { }
+
+            @Override
+            public void postingListsHit(int count) { }
+
+            @Override
+            public void onSegmentHit() { }
+
+            @Override
+            public PostingListEventListener postingListEventListener()
+            {
+                return NO_OP_POSTINGS_LISTENER; // the shared no-op postings listener of the outer class
+            }
+        }
+    }
+
+    public static class NoOpPostingListEventListener implements QueryEventListener.PostingListEventListener
+    {
+        @Override
+        public void onAdvance() { } // event ignored
+
+        @Override
+        public void onPostingDecoded() { } // event ignored
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/RAMPostingSlices.java b/src/java/org/apache/cassandra/index/sai/disk/RAMPostingSlices.java
new file mode 100644
index 000000000000..e1381b0234ec
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/RAMPostingSlices.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.mutable.MutableValueInt;
+
+/**
+ * Encodes postings as variable integers into slices
+ */
+class RAMPostingSlices
+{
+    static final int DEFAULT_TERM_DICT_SIZE = 1024;
+
+    private final ByteBlockPool postingsPool;
+    private int[] postingStarts = new int[DEFAULT_TERM_DICT_SIZE];
+    private int[] postingUptos = new int[DEFAULT_TERM_DICT_SIZE];
+    private int[] sizes = new int[DEFAULT_TERM_DICT_SIZE];
+
+    RAMPostingSlices(Counter memoryUsage)
+    {
+        postingsPool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(memoryUsage));
+    }
+
+    /**
+     * Returns a forward-only view over the postings written for {@code termID}: each stored vint is added to a
+     * running total to yield the next row ID. {@code advance} is unsupported; {@code maxSegmentRowID} is not read here.
+     */
+    PostingList postingList(int termID, final ByteSliceReader reader, long maxSegmentRowID)
+    {
+        initReader(reader, termID);
+
+        final MutableValueInt runningRowId = new MutableValueInt();
+
+        return new PostingList()
+        {
+            @Override
+            public long nextPosting() throws IOException
+            {
+                if (reader.eof())
+                    return PostingList.END_OF_STREAM;
+
+                runningRowId.value += reader.readVInt();
+                return runningRowId.value;
+            }
+
+            @Override
+            public long size()
+            {
+                return sizes[termID];
+            }
+
+            @Override
+            public long advance(long targetRowID)
+            {
+                throw new UnsupportedOperationException();
+            }
+        };
+    }
+
+    void initReader(ByteSliceReader reader, int termID)
+    {
+        // read from the term's first slice up to its current write position
+        reader.init(postingsPool, postingStarts[termID], postingUptos[termID]);
+    }
+
+    /** Allocates the first slice for a term, growing the per-term arrays first when the ID is at or past their end. */
+    void createNewSlice(int termID)
+    {
+        if (termID >= postingStarts.length - 1)
+        {
+            postingStarts = ArrayUtil.grow(postingStarts, termID + 1);
+            postingUptos = ArrayUtil.grow(postingUptos, termID + 1);
+            sizes = ArrayUtil.grow(sizes, termID + 1);
+        }
+
+        // the slice will not fit in the current block, create a new block
+        final int remaining = ByteBlockPool.BYTE_BLOCK_SIZE - postingsPool.byteUpto;
+        if (remaining < ByteBlockPool.FIRST_LEVEL_SIZE)
+            postingsPool.nextBuffer();
+
+        final int sliceStart = postingsPool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE) + postingsPool.byteOffset;
+        postingStarts[termID] = sliceStart;
+        postingUptos[termID] = sliceStart;
+    }
+
+    void writeVInt(int termID, int i)
+    {
+        while ((i & ~0x7F) != 0)
+        {
+            writeByte(termID, (byte) ((i & 0x7f) | 0x80));
+            i >>>= 7;
+        }
+        writeByte(termID, (byte) i);
+        sizes[termID]++; // one more value recorded for this term; reported by the posting list's size()
+    }
+
+    private void writeByte(int termID, byte b)
+    {
+        int position = postingUptos[termID];
+        byte[] buffer = postingsPool.buffers[position >> ByteBlockPool.BYTE_BLOCK_SHIFT];
+        assert buffer != null;
+        int within = position & ByteBlockPool.BYTE_BLOCK_MASK;
+        if (buffer[within] != 0)
+        {
+            // End of slice; allocate a new one and carry on writing in the pool's current buffer
+            within = postingsPool.allocSlice(buffer, within);
+            buffer = postingsPool.buffer;
+            postingUptos[termID] = within + postingsPool.byteOffset;
+        }
+        buffer[within] = b;
+        postingUptos[termID]++;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/RAMStringIndexer.java b/src/java/org/apache/cassandra/index/sai/disk/RAMStringIndexer.java
new file mode 100644
index 000000000000..84cd7187bedb
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/RAMStringIndexer.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.nio.ByteBuffer;
+import java.util.NoSuchElementException;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.Counter;
+
+/**
+ * Indexes strings into an on-heap inverted index to be flushed in an SSTable attached index later.
+ * For flushing use the PostingTerms interface.
+ */
+public class RAMStringIndexer
+{
+    private final AbstractType<?> termComparator;
+    private final BytesRefHash termsHash;
+    private final RAMPostingSlices slices;
+    private final Counter bytesUsed;
+    
+    int rowCount = 0;
+
+    private int[] lastSegmentRowID = new int[RAMPostingSlices.DEFAULT_TERM_DICT_SIZE];
+
+    RAMStringIndexer(AbstractType<?> termComparator)
+    {
+        this.termComparator = termComparator;
+        bytesUsed = Counter.newCounter();
+
+        ByteBlockPool termsPool = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
+
+        termsHash = new BytesRefHash(termsPool);
+
+        slices = new RAMPostingSlices(bytesUsed);
+    }
+
+    long estimatedBytesUsed()
+    {
+        return bytesUsed.get();
+    }
+
+    /**
+     * EXPENSIVE OPERATION due to sorting the terms, only call once. Returns an iterator that walks the sorted term ids.
+     */
+    // TODO: assert or throw an exception if getTermsWithPostings is called more than once
+    TermsIterator getTermsWithPostings()
+    {
+        final int[] sortedTermIDs = termsHash.sort();
+        // Only the first valueCount entries of sortedTermIDs are read by the iterator below.
+        final int valueCount = termsHash.size();
+        final ByteSliceReader sliceReader = new ByteSliceReader();
+        // The single sliceReader above is handed to every postings() call made on the returned iterator.
+        return new TermsIterator()
+        {
+            private int ordUpto = 0;
+            private final BytesRef br = new BytesRef();
+            // NOTE(review): getMinTerm/getMaxTerm index sortedTermIDs unconditionally - presumably never used when no term was added; confirm.
+            @Override
+            public ByteBuffer getMinTerm()
+            {
+                BytesRef term = new BytesRef();
+                int minTermID = sortedTermIDs[0];
+                termsHash.get(minTermID, term);
+                return ByteBuffer.wrap(term.bytes, term.offset, term.length);
+            }
+
+            @Override
+            public ByteBuffer getMaxTerm()
+            {
+                BytesRef term = new BytesRef();
+                int maxTermID = sortedTermIDs[valueCount-1];
+                termsHash.get(maxTermID, term);
+                return ByteBuffer.wrap(term.bytes, term.offset, term.length);
+            }
+
+            public void close() {}
+            // Postings of the term most recently returned by next(); calling this before next() would read sortedTermIDs[-1].
+            @Override
+            public PostingList postings()
+            {
+                int termID = sortedTermIDs[ordUpto - 1];
+                final int maxSegmentRowId = lastSegmentRowID[termID];
+                return slices.postingList(termID, sliceReader, maxSegmentRowId);
+            }
+
+            @Override
+            public boolean hasNext() {
+                return ordUpto < valueCount;
+            }
+
+            @Override
+            public ByteComparable next()
+            {
+                if (!hasNext())
+                    throw new NoSuchElementException();
+
+                termsHash.get(sortedTermIDs[ordUpto], br);
+                ordUpto++;
+                return asByteComparable(br.bytes, br.offset, br.length);
+            }
+            // The returned ByteComparable builds a fixed-length source over bytes[offset, offset + length) each time it is invoked.
+            private ByteComparable asByteComparable(byte[] bytes, int offset, int length)
+            {
+                return v -> ByteSource.fixedLength(bytes, offset, length);
+            }
+        };
+    }
+
+    public long add(BytesRef term, int segmentRowId)
+    {
+        long startBytes = estimatedBytesUsed();
+        int termID = termsHash.add(term);
+        // A non-negative id means the term is new; a negative one encodes the id of an already known term.
+        if (termID >= 0)
+        {
+            // first time seeing this term, create the term's first slice
+            slices.createNewSlice(termID);
+        }
+        else
+        {
+            termID = (-termID) - 1;
+        }
+        // Make sure the per-term "last row id" table has a slot for this term id.
+        if (termID >= lastSegmentRowID.length - 1)
+        {
+            lastSegmentRowID = ArrayUtil.grow(lastSegmentRowID, termID + 1);
+        }
+        // Postings are written as the difference from the previous row id recorded for this term.
+        int delta = segmentRowId - lastSegmentRowID[termID];
+
+        lastSegmentRowID[termID] = segmentRowId;
+
+        slices.writeVInt(termID, delta);
+        // Report only the growth of the shared byte counter observed during this call.
+        long allocatedBytes = estimatedBytesUsed() - startBytes;
+
+        rowCount++;
+
+        return allocatedBytes;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SSTableComponentsWriter.java b/src/java/org/apache/cassandra/index/sai/disk/SSTableComponentsWriter.java
new file mode 100644
index 000000000000..d188c1270fc9
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/SSTableComponentsWriter.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.MetadataWriter;
+import org.apache.cassandra.index.sai.disk.v1.NumericValuesWriter;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Writes all SSTable-attached index token and offset structures.
+ */
+public class SSTableComponentsWriter
+{
+    protected static final Logger logger = LoggerFactory.getLogger(SSTableComponentsWriter.class);
+
+    private final NumericValuesWriter tokenWriter;
+    private final NumericValuesWriter offsetWriter;
+    private final MetadataWriter metadataWriter;
+
+    private final Descriptor descriptor;
+    private final IndexComponents indexComponents;
+
+    private DecoratedKey currentKey;
+
+    private long currentKeyPartitionOffset;
+
+    public SSTableComponentsWriter(Descriptor descriptor, CompressionParams compressionParams) throws IOException
+    {
+        this.descriptor = descriptor;
+
+        indexComponents = IndexComponents.perSSTable(descriptor, compressionParams);
+        this.metadataWriter = new MetadataWriter(indexComponents.createOutput(IndexComponents.GROUP_META));
+
+        this.tokenWriter = new NumericValuesWriter(IndexComponents.TOKEN_VALUES,
+                                                   indexComponents.createOutput(IndexComponents.TOKEN_VALUES),
+                                                   metadataWriter, false);
+        this.offsetWriter = new NumericValuesWriter(IndexComponents.OFFSETS_VALUES,
+                                                    indexComponents.createOutput(IndexComponents.OFFSETS_VALUES),
+                                                    metadataWriter, true);
+    }
+
+    private SSTableComponentsWriter()
+    {
+        this.descriptor = null;
+        this.indexComponents = null;
+        this.metadataWriter = null;
+        this.tokenWriter = null;
+        this.offsetWriter = null;
+    }
+
+    public void startPartition(DecoratedKey key, long position)
+    {
+        currentKey = key;
+        currentKeyPartitionOffset = position;
+    }
+
+    public void nextUnfilteredCluster(Unfiltered unfiltered, long position) throws IOException
+    {
+        recordCurrentTokenOffset();
+    }
+
+    public void staticRow(Row staticRow, long position) throws IOException
+    {
+        recordCurrentTokenOffset();
+    }
+
+    private void recordCurrentTokenOffset() throws IOException
+    {
+        recordCurrentTokenOffset((long) currentKey.getToken().getTokenValue(), currentKeyPartitionOffset);
+    }
+
+    @VisibleForTesting
+    public void recordCurrentTokenOffset(long tokenValue, long keyOffset) throws IOException
+    {
+        tokenWriter.add(tokenValue);
+        offsetWriter.add(keyOffset);
+    }
+
+    public void complete() throws IOException
+    {
+        IOUtils.close(tokenWriter, offsetWriter, metadataWriter);
+        indexComponents.createGroupCompletionMarker();
+    }
+
+    public void abort(Throwable accumulator)
+    {
+        logger.debug(indexComponents.logMessage("Aborting token/offset writer for {}..."), descriptor);
+        IndexComponents.deletePerSSTableIndexComponents(descriptor);
+    }
+
+    public static final SSTableComponentsWriter NONE = new SSTableComponentsWriter() {
+
+        @Override
+        public void nextUnfilteredCluster(Unfiltered unfiltered, long position)
+        {
+        }
+
+        @Override
+        public void startPartition(DecoratedKey key, long position)
+        {
+        }
+
+        @Override
+        public void staticRow(Row staticRow, long position)
+        {
+        }
+
+        @Override
+        public void complete()
+        {
+        }
+
+        @Override
+        public void abort(Throwable accumulate)
+        {
+        }
+    };
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
new file mode 100644
index 000000000000..a80ec39ccc81
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
@@ -0,0 +1,358 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.function.BooleanSupplier;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.MetadataWriter;
+import org.apache.cassandra.index.sai.utils.NamedMemoryLimiter;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NoSpamLogger;
+
+/**
+ * Column index writer that accumulates (on-heap) indexed data from a compacted SSTable as it's being flushed to disk.
+ */
+@NotThreadSafe
+public class SSTableIndexWriter implements ColumnIndexWriter
+{
+    private static final Logger logger = LoggerFactory.getLogger(SSTableIndexWriter.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
+
+    public static final int MAX_STRING_TERM_SIZE = Integer.getInteger("cassandra.sai.max_string_term_size_kb", 1) * 1024;
+    public static final int MAX_FROZEN_COLLECTION_TERM_SIZE =Integer.getInteger("cassandra.sai.max_frozen_term_size_kb", 5) * 1024;
+    public static final String TERM_OVERSIZE_MESSAGE =
+            "Can't add term of column {} to index for key: {}, term size {} " +
+                    "max allowed size {}, use analyzed = true (if not yet set) for that column.";
+
+    private final int nowInSec = FBUtilities.nowInSeconds();
+    private final ColumnContext columnContext;
+    private final Descriptor descriptor;
+    private final IndexComponents indexComponents;
+    private final AbstractAnalyzer analyzer;
+    private final NamedMemoryLimiter limiter;
+    private final int maxTermSize;
+    private final BooleanSupplier isIndexValid;
+
+    private boolean aborted = false;
+
+    // segment writer
+    private SegmentBuilder currentBuilder;
+    private final List<SegmentMetadata> segments = new ArrayList<>();
+    private long maxSSTableRowId;
+
+    public SSTableIndexWriter(Descriptor descriptor, ColumnContext columnContext, NamedMemoryLimiter limiter,
+                              BooleanSupplier isIndexValid, CompressionParams compressionParams)
+    {
+        this.columnContext = columnContext;
+        this.descriptor = descriptor;
+        this.indexComponents = IndexComponents.create(columnContext.getIndexName(), descriptor, compressionParams);
+        this.analyzer = columnContext.getAnalyzer();
+        this.limiter = limiter;
+        this.isIndexValid = isIndexValid;
+        this.maxTermSize = columnContext.isFrozenCollection() ? MAX_FROZEN_COLLECTION_TERM_SIZE : MAX_STRING_TERM_SIZE;
+
+    }
+
+    @Override
+    public void addRow(DecoratedKey rowKey, long sstableRowId, Row row) throws IOException
+    {
+        if (maybeAbort())
+            return;
+
+        if (columnContext.isNonFrozenCollection())
+        {
+            Iterator<ByteBuffer> valueIterator = columnContext.getValuesOf(row, nowInSec);
+            if (valueIterator != null)
+            {
+                while (valueIterator.hasNext())
+                {
+                    ByteBuffer value = valueIterator.next();
+                    addTerm(TypeUtil.encode(value.duplicate(), columnContext.getValidator()), rowKey, sstableRowId, columnContext.getValidator());
+                }
+            }
+        }
+        else
+        {
+            ByteBuffer value = columnContext.getValueOf(rowKey, row, nowInSec);
+            if (value != null)
+                addTerm(TypeUtil.encode(value.duplicate(), columnContext.getValidator()), rowKey, sstableRowId, columnContext.getValidator());
+        }
+        maxSSTableRowId = sstableRowId;
+    }
+
+    /**
+     * abort current write if index is dropped
+     *
+     * @return true if current write is aborted.
+     */
+    private boolean maybeAbort()
+    {
+        if (aborted)
+            return true;
+
+        if (isIndexValid.getAsBoolean())
+            return false;
+
+        abort(new RuntimeException(String.format("index %s is dropped", columnContext.getIndexName())));
+        return true;
+    }
+
+    private void addTerm(ByteBuffer term, DecoratedKey key, long sstableRowId, AbstractType<?> type) throws IOException
+    {
+        if (term.remaining() >= maxTermSize)
+        {
+            noSpamLogger.warn(columnContext.logMessage(TERM_OVERSIZE_MESSAGE),
+                              columnContext.getColumnName(),
+                              columnContext.keyValidator().getString(key.getKey()),
+                              FBUtilities.prettyPrintMemory(term.remaining()),
+                              FBUtilities.prettyPrintMemory(maxTermSize));
+            return;
+        }
+        // Terms of maxTermSize bytes or more were skipped above. Create the builder lazily, or roll it over when a flush is due.
+        if (currentBuilder == null)
+        {
+            currentBuilder = newSegmentBuilder();
+        }
+        else if (shouldFlush(sstableRowId))
+        {
+            flushSegment();
+            currentBuilder = newSegmentBuilder();
+        }
+        // Empty terms are not indexed; note that the builder has already been created or rolled over by this point.
+        if (term.remaining() == 0) return;
+        // Non-literal types add the term as-is; literal types go through the analyzer and every produced token is added.
+        if (!TypeUtil.isLiteral(type))
+        {
+            limiter.increment(currentBuilder.add(term, key, sstableRowId));
+        }
+        else
+        {
+            analyzer.reset(term);
+            while (analyzer.hasNext())
+            {
+                ByteBuffer token = analyzer.next();
+                limiter.increment(currentBuilder.add(token, key, sstableRowId));
+            }
+        }
+    }
+
+    private boolean shouldFlush(long sstableRowId)
+    {
+        // If we've hit the minimum flush size and we've breached the global limit, flush a new segment:
+        boolean reachMemoryLimit = limiter.usageExceedsLimit() && currentBuilder.hasReachedMinimumFlushSize();
+
+        if (reachMemoryLimit)
+        {
+            logger.debug(columnContext.logMessage("Global limit of {} and minimum flush size of {} exceeded. " +
+                                            "Current builder usage is {} for {} cells. Global Usage is {}. Flushing..."),
+                         FBUtilities.prettyPrintMemory(limiter.limitBytes()),
+                         FBUtilities.prettyPrintMemory(currentBuilder.getMinimumFlushBytes()),
+                         FBUtilities.prettyPrintMemory(currentBuilder.totalBytesAllocated()),
+                         currentBuilder.getRowCount(),
+                         FBUtilities.prettyPrintMemory(limiter.currentBytesUsed()));
+        }
+
+        return reachMemoryLimit || currentBuilder.exceedsSegmentLimit(sstableRowId);
+    }
+
+    private void flushSegment() throws IOException
+    {
+        long start = System.nanoTime();
+        // Flushes currentBuilder to disk, records the resulting segment metadata, then releases and clears the builder.
+        try
+        {
+            long bytesAllocated = currentBuilder.totalBytesAllocated();
+
+            SegmentMetadata segmentMetadata = currentBuilder.flush(indexComponents);
+
+            long flushMillis = Math.max(1, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
+
+            if (segmentMetadata != null)
+            {
+                segments.add(segmentMetadata);
+
+                //TODO Need to look at some of these metrics
+                double rowCount = segmentMetadata.numRows;
+                if (columnContext.getIndexMetrics() != null)
+                    columnContext.getIndexMetrics().compactionSegmentCellsPerSecond.update((long)(rowCount / flushMillis * 1000.0));
+
+                double segmentBytes = segmentMetadata.componentMetadatas.indexSize();
+                if (columnContext.getIndexMetrics() != null)
+                    columnContext.getIndexMetrics().compactionSegmentBytesPerSecond.update((long)(segmentBytes / flushMillis * 1000.0));
+
+                logger.debug(columnContext.logMessage("Flushed segment with {} cells for a total of {} to {} in {} ms."),
+                             (long) rowCount, FBUtilities.prettyPrintMemory((long) segmentBytes), indexComponents, flushMillis);
+            }
+
+            // Builder memory is released against the limiter at the conclusion of a successful
+            // flush. Note that any failure that occurs before this (even in term addition) will
+            // actuate this column writer's abort logic from the parent SSTable-level writer, and
+            // that abort logic will release the current builder's memory against the limiter.
+            long globalBytesUsed = currentBuilder.release(indexComponents);
+            currentBuilder = null;
+            logger.debug(columnContext.logMessage("Flushing index segment for SSTable {} released {}. Global segment memory usage now at {}."),
+                         descriptor, FBUtilities.prettyPrintMemory(bytesAllocated), FBUtilities.prettyPrintMemory(globalBytesUsed));
+
+        }
+        catch (Throwable t)
+        {
+            logger.error(columnContext.logMessage("Failed to build index for SSTable {}."), descriptor, t);
+            indexComponents.deleteColumnIndex();
+            // Metrics are null-checked everywhere else in this class; an NPE here would replace the original failure t.
+            if (columnContext.getIndexMetrics() != null)
+                columnContext.getIndexMetrics().segmentFlushErrors.inc();
+            throw t;
+        }
+    }
+
+    @Override
+    public void flush() throws IOException
+    {
+        if (maybeAbort())
+            return;
+        // Flushes any buffered data as a final segment, merges the segments, then writes metadata and the completion marker.
+        boolean emptySegment = currentBuilder == null || currentBuilder.isEmpty();
+        logger.debug(columnContext.logMessage("Completing index flush with {}buffered data..."), emptySegment ? "no " : "");
+
+        try
+        {
+            // parts are present but there is something still in memory, let's flush that inline
+            if (!emptySegment)
+            {
+                flushSegment();
+            }
+
+            // Even an empty segment may carry some fixed memory, so remove it:
+            if (currentBuilder != null)
+            {
+                long bytesAllocated = currentBuilder.totalBytesAllocated();
+                long globalBytesUsed = currentBuilder.release(indexComponents);
+                logger.debug(columnContext.logMessage("Flushing final segment for SSTable {} released {}. Global segment memory usage now at {}."),
+                             descriptor, FBUtilities.prettyPrintMemory(bytesAllocated), FBUtilities.prettyPrintMemory(globalBytesUsed));
+            }
+            currentBuilder = null; // already released (as flushSegment does); keeps a later abort() from releasing it a second time
+            compactSegments();
+
+            writeSegmentsMetadata();
+            indexComponents.createColumnCompletionMarker();
+        }
+        finally
+        {
+            if (columnContext.getIndexMetrics() != null)
+            {
+                columnContext.getIndexMetrics().segmentsPerCompaction.update(segments.size());
+                segments.clear();
+                columnContext.getIndexMetrics().compactionCount.inc();
+            }
+        }
+    }
+
+    @Override
+    public void abort(Throwable cause)
+    {
+        aborted = true;
+
+        logger.warn(columnContext.logMessage("Aborting SSTable index flush for {}..."), descriptor, cause);
+
+        // It's possible for the current builder to be unassigned after we flush a final segment.
+        if (currentBuilder != null)
+        {
+            // If an exception is thrown out of any writer operation prior to successful segment
+            // flush, we will end up here, and we need to free up builder memory tracked by the limiter:
+            long allocated = currentBuilder.totalBytesAllocated();
+            long globalBytesUsed = currentBuilder.release(indexComponents);
+            logger.debug(columnContext.logMessage("Aborting index writer for SSTable {} released {}. Global segment memory usage now at {}."),
+                         descriptor, FBUtilities.prettyPrintMemory(allocated), FBUtilities.prettyPrintMemory(globalBytesUsed));
+        }
+
+        indexComponents.deleteColumnIndex();
+    }
+
+    private void compactSegments() throws IOException
+    {
+        if (segments.isEmpty())
+            return;
+
+        DecoratedKey minKey = segments.get(0).minKey;
+        DecoratedKey maxKey = segments.get(segments.size() - 1).maxKey;
+
+        try (SegmentMerger segmentMerger = SegmentMerger.newSegmentMerger(columnContext.isLiteral());
+             SSTableIndex.PerIndexFiles perIndexFiles = new SSTableIndex.PerIndexFiles(indexComponents, columnContext.isLiteral(), true))
+        {
+            for (final SegmentMetadata segment : segments)
+            {
+                segmentMerger.addSegment(columnContext, segment, perIndexFiles);
+            }
+            segments.clear();
+            segments.add(segmentMerger.merge(columnContext, indexComponents, minKey, maxKey, maxSSTableRowId));
+        }
+        finally
+        {
+            indexComponents.deleteTemporaryComponents();
+        }
+    }
+
+    private void writeSegmentsMetadata() throws IOException
+    {
+        if (segments.isEmpty())
+            return;
+
+        try (final MetadataWriter writer = new MetadataWriter(indexComponents.createOutput(indexComponents.meta)))
+        {
+            SegmentMetadata.write(writer, segments, null);
+        }
+        catch (IOException e)
+        {
+            abort(e);
+            throw e;
+        }
+    }
+
+    private SegmentBuilder newSegmentBuilder()
+    {
+        SegmentBuilder builder = TypeUtil.isLiteral(columnContext.getValidator())
+                                 ? new SegmentBuilder.RAMStringSegmentBuilder(columnContext.getValidator(), limiter)
+                                 : new SegmentBuilder.KDTreeSegmentBuilder(columnContext.getValidator(), limiter, columnContext.getIndexWriterConfig());
+
+        long globalBytesUsed = limiter.increment(builder.totalBytesAllocated());
+        logger.debug(columnContext.logMessage("Created new segment builder while flushing SSTable {}. Global segment memory usage now at {}."),
+                     descriptor, FBUtilities.prettyPrintMemory(globalBytesUsed));
+
+        return builder;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/Segment.java b/src/java/org/apache/cassandra/index/sai/disk/Segment.java
new file mode 100644
index 000000000000..ea02bb445fd3
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/Segment.java
@@ -0,0 +1,185 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.io.util.FileUtils;
+
+/**
+ * Each segment represents an on-disk index structure (kdtree/terms/postings) flushed because of a memory limit, a token
+ * boundary, or the max segment rowId limit imposed by Lucene's 2B (Integer.MAX_VALUE) limitation. It also helps to reduce
+ * resource consumption for read requests, as only segments that intersect with the read request's data range need to be loaded.
+ */
+public class Segment implements Closeable
+{
+    private final Token minKey;
+    private final Token.KeyBound minKeyBound;
+    private final Token maxKey;
+    private final Token.KeyBound maxKeyBound;
+
+    // per sstable
+    final LongArray.Factory segmentRowIdToTokenFactory;
+    final LongArray.Factory segmentRowIdToOffsetFactory;
+    final SSTableContext.KeyFetcher keyFetcher;
+    // per-index
+    public final SSTableIndex.PerIndexFiles indexFiles;
+    // per-segment
+    public final SegmentMetadata metadata;
+
+    private final IndexSearcher index;
+    private final AbstractType<?> columnType;
+
+    public Segment(ColumnContext columnContext, SSTableContext sstableContext, SSTableIndex.PerIndexFiles indexFiles, SegmentMetadata metadata) throws IOException
+    {
+        this.minKey = metadata.minKey.getToken();
+        this.minKeyBound = minKey.minKeyBound();
+        this.maxKey = metadata.maxKey.getToken();
+        this.maxKeyBound = maxKey.maxKeyBound();
+
+        this.segmentRowIdToTokenFactory = sstableContext.tokenReaderFactory.withOffset(metadata.segmentRowIdOffset);
+        this.segmentRowIdToOffsetFactory = sstableContext.offsetReaderFactory.withOffset(metadata.segmentRowIdOffset);
+        this.keyFetcher = sstableContext.keyFetcher;
+        this.indexFiles = indexFiles;
+        this.metadata = metadata;
+        this.columnType = columnContext.getValidator();
+
+        this.index = IndexSearcher.open(columnContext.isLiteral(), this, columnContext.getColumnQueryMetrics());
+    }
+
+    @VisibleForTesting
+    public Segment(LongArray.Factory tokenFactory, LongArray.Factory offsetFactory, SSTableContext.KeyFetcher keyFetcher,
+                   SSTableIndex.PerIndexFiles indexFiles, SegmentMetadata metadata, AbstractType<?> columnType)
+    {
+        this.segmentRowIdToTokenFactory = tokenFactory;
+        this.segmentRowIdToOffsetFactory = offsetFactory;
+        this.keyFetcher = keyFetcher;
+        this.indexFiles = indexFiles;
+        this.metadata = metadata;
+        this.columnType = columnType;
+        this.minKey = null;
+        this.minKeyBound = null;
+        this.maxKey = null;
+        this.maxKeyBound = null;
+        this.index = null;
+    }
+
+    @VisibleForTesting
+    public Segment(Token minKey, Token maxKey)
+    {
+        this.segmentRowIdToTokenFactory = null;
+        this.segmentRowIdToOffsetFactory = null;
+        this.keyFetcher = null;
+        this.indexFiles = null;
+        this.metadata = null;
+        this.minKey = minKey;
+        this.minKeyBound = minKey.minKeyBound();
+        this.maxKey = maxKey;
+        this.maxKeyBound = maxKey.maxKeyBound();
+        this.columnType = null;
+        this.index = null;
+    }
+
+    /**
+     * @return true if current segment intersects with query key range
+     */
+    public boolean intersects(AbstractBounds<PartitionPosition> keyRange)
+    {
+        if (keyRange instanceof Range && ((Range<?>)keyRange).isWrapAround())
+            return keyRange.contains(minKeyBound) || keyRange.contains(maxKeyBound);
+
+        int cmp = keyRange.right.getToken().compareTo(minKey);
+        // if right is minimum, it means right is the max token and bigger than maxKey.
+        // if right bound is less than minKey, no intersection
+        if (!keyRange.right.isMinimum() && (!keyRange.inclusiveRight() && cmp == 0 || cmp < 0))
+            return false;
+
+        cmp = keyRange.left.getToken().compareTo(maxKey);
+        // if left bound is bigger than maxKey, no intersection
+        if (!keyRange.isStartInclusive() && cmp == 0 || cmp > 0)
+            return false;
+
+        return true;
+    }
+
+    public long indexFileCacheSize()
+    {
+        return index == null ? 0 : index.indexFileCacheSize();
+    }
+
+    /**
+     * Search on-disk index synchronously
+     *
+     * @param expression to filter on disk index
+     * @param context to track per sstable cache and per query metrics
+     * @param defer create the iterator in a deferred state
+     * @return range iterator that matches given expression
+     */
+    public RangeIterator search(Expression expression, SSTableQueryContext context, boolean defer)
+    {
+        return index.search(expression, context, defer);
+    }
+
+    public AbstractType<?> getColumnType()
+    {
+        return columnType;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        Segment segment = (Segment) o;
+        return Objects.equal(metadata, segment.metadata);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(metadata);
+    }
+
+    @Override
+    public void close()
+    {
+        FileUtils.closeQuietly(index);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Segment{metadata=%s}", metadata);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SegmentBuilder.java b/src/java/org/apache/cassandra/index/sai/disk/SegmentBuilder.java
new file mode 100644
index 000000000000..0cc030fa6e43
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/SegmentBuilder.java
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.concurrent.atomic.AtomicLong;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.disk.io.BytesRefUtil;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.BKDTreeRamBuffer;
+import org.apache.cassandra.index.sai.disk.v1.InvertedIndexWriter;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+import org.apache.cassandra.index.sai.utils.NamedMemoryLimiter;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+/**
+ * Creates an on-heap index data structure to be flushed to an SSTable index.
+ */
+@NotThreadSafe
+public abstract class SegmentBuilder
+{
+    private static final Logger logger = LoggerFactory.getLogger(SegmentBuilder.class);
+
+    // Serves as a safety net in case the memory limit is not triggered or when the merger merges small segments.
+    public static final long LAST_VALID_SEGMENT_ROW_ID = ((long)Integer.MAX_VALUE / 2) - 1L;
+    private static long testLastValidSegmentRowId = -1;
+
+    /** The number of column indexes being built globally. (Starts at one to avoid divide by zero.) */
+    public static final AtomicLong ACTIVE_BUILDER_COUNT = new AtomicLong(1);
+
+    /** Minimum flush size, dynamically updated as segment builds are started and completed/aborted. */
+    private static volatile long minimumFlushBytes;
+
+    final AbstractType<?> termComparator;
+
+    private final NamedMemoryLimiter limiter;
+    long totalBytesAllocated;
+
+    private final long lastValidSegmentRowID;
+
+    private boolean flushed = false;
+    private boolean active = true;
+
+    // segment metadata
+    private long minSSTableRowId = -1;
+    private long maxSSTableRowId = -1;
+    private long segmentRowIdOffset = 0;
+    int rowCount = 0;
+    int maxSegmentRowId = -1;
+    // in token order
+    private DecoratedKey minKey, maxKey;
+    // in termComparator order
+    private ByteBuffer minTerm, maxTerm;
+
+    public static class KDTreeSegmentBuilder extends SegmentBuilder
+    {
+        protected final byte[] buffer;
+        private final BKDTreeRamBuffer kdTreeRamBuffer;
+        private final IndexWriterConfig indexWriterConfig;
+
+        KDTreeSegmentBuilder(AbstractType<?> termComparator, NamedMemoryLimiter limiter, IndexWriterConfig indexWriterConfig)
+        {
+            super(termComparator, limiter);
+
+            int typeSize = TypeUtil.fixedSizeOf(termComparator);
+            this.kdTreeRamBuffer = new BKDTreeRamBuffer(1, typeSize);
+            this.buffer = new byte[typeSize];
+            this.totalBytesAllocated = this.kdTreeRamBuffer.ramBytesUsed();
+            this.indexWriterConfig = indexWriterConfig;
+        }
+
+        public boolean isEmpty()
+        {
+            return kdTreeRamBuffer.numRows() == 0;
+        }
+
+        protected long addInternal(ByteBuffer term, int segmentRowId)
+        {
+            TypeUtil.toComparableBytes(term, termComparator, buffer);
+            return kdTreeRamBuffer.addPackedValue(segmentRowId, new BytesRef(buffer));
+        }
+
+        @Override
+        protected SegmentMetadata.ComponentMetadataMap flushInternal(IndexComponents indexComponents) throws IOException
+        {
+            try (NumericIndexWriter writer = new NumericIndexWriter(indexComponents,
+                                                                    TypeUtil.fixedSizeOf(termComparator),
+                                                                    maxSegmentRowId,
+                                                                    rowCount,
+                                                                    indexWriterConfig,
+                                                                    true))
+            {
+                return writer.writeAll(kdTreeRamBuffer.asPointValues());
+            }
+        }
+    }
+
+    public static class RAMStringSegmentBuilder extends SegmentBuilder
+    {
+        final RAMStringIndexer ramIndexer;
+
+        final BytesRefBuilder stringBuffer = new BytesRefBuilder();
+
+        RAMStringSegmentBuilder(AbstractType<?> termComparator, NamedMemoryLimiter limiter)
+        {
+            super(termComparator, limiter);
+
+            ramIndexer = new RAMStringIndexer(termComparator);
+            totalBytesAllocated = ramIndexer.estimatedBytesUsed();
+        }
+
+        public boolean isEmpty()
+        {
+            return ramIndexer.rowCount == 0;
+        }
+
+        protected long addInternal(ByteBuffer term, int segmentRowId)
+        {
+            BytesRefUtil.copyBufferToBytesRef(term, stringBuffer);
+            return ramIndexer.add(stringBuffer.get(), segmentRowId);
+        }
+
+        @Override
+        protected SegmentMetadata.ComponentMetadataMap flushInternal(IndexComponents indexComponents) throws IOException
+        {
+            try (InvertedIndexWriter writer = new InvertedIndexWriter(indexComponents, true))
+            {
+                return writer.writeAll(ramIndexer.getTermsWithPostings());
+            }
+        }
+    }
+
+    private SegmentBuilder(AbstractType<?> termComparator, NamedMemoryLimiter limiter)
+    {
+        this.termComparator = termComparator;
+        this.limiter = limiter;
+        this.lastValidSegmentRowID = testLastValidSegmentRowId >= 0 ? testLastValidSegmentRowId : LAST_VALID_SEGMENT_ROW_ID;
+        // recompute the global minimum flush size: the limiter budget divided by the builder count read before this increment
+        minimumFlushBytes = limiter.limitBytes() / ACTIVE_BUILDER_COUNT.getAndIncrement();
+    }
+
+    public SegmentMetadata flush(final IndexComponents indexComponents) throws IOException
+    {
+        assert !flushed;
+        flushed = true;
+        // nothing was added to this builder: log a warning and return null instead of writing a segment
+        if (getRowCount() == 0)
+        {
+            logger.warn(indexComponents.logMessage("No rows to index during flush of SSTable {}."), indexComponents.descriptor);
+            return null;
+        }
+        // the concrete builder writes its index structure and returns the per-component metadata
+        SegmentMetadata.ComponentMetadataMap indexMetas = flushInternal(indexComponents);
+
+        return new SegmentMetadata(segmentRowIdOffset, rowCount, minSSTableRowId, maxSSTableRowId, minKey, maxKey, minTerm, maxTerm, indexMetas);
+    }
+
+    public long add(ByteBuffer term, DecoratedKey key, long sstableRowId)
+    {
+        assert !flushed : "Cannot add to flushed segment.";
+        assert sstableRowId >= maxSSTableRowId;
+        minSSTableRowId = minSSTableRowId < 0 ? sstableRowId : minSSTableRowId;
+        maxSSTableRowId = sstableRowId;
+        // keys are asserted to arrive in non-decreasing order: the first key is the minimum, the latest the maximum
+        assert maxKey == null || maxKey.compareTo(key) <= 0;
+        minKey = minKey == null ? key : minKey;
+        maxKey = key;
+        // term bounds are maintained with termComparator via TypeUtil.min/max
+        minTerm = TypeUtil.min(term, minTerm, termComparator);
+        maxTerm = TypeUtil.max(term, maxTerm, termComparator);
+
+        if (rowCount == 0)
+        {
+            // use first global rowId in the segment as segment rowId offset
+            segmentRowIdOffset = sstableRowId;
+        }
+
+        rowCount++;
+
+        // the segment row id is the sstable row id relative to segmentRowIdOffset, which must fit into an int
+        int segmentRowId = castToSegmentRowId(sstableRowId, segmentRowIdOffset);
+        maxSegmentRowId = Math.max(maxSegmentRowId, segmentRowId);
+        // the concrete builder returns a byte count for this entry; it is added to the running total and returned
+        long bytesAllocated = addInternal(term, segmentRowId);
+        totalBytesAllocated += bytesAllocated;
+
+        return bytesAllocated;
+    }
+
+    public static int castToSegmentRowId(long sstableRowId, long segmentRowIdOffset)
+    {
+        // toIntExact throws ArithmeticException when the offset-relative id does not fit into an int
+        final int relativeRowId = Math.toIntExact(sstableRowId - segmentRowIdOffset);
+        // the END_OF_STREAM value is rejected as a row id
+        if (relativeRowId == PostingList.END_OF_STREAM)
+            throw new IllegalArgumentException("Illegal segment row id: END_OF_STREAM found");
+        return relativeRowId;
+    }
+
+    long totalBytesAllocated()
+    {
+        return totalBytesAllocated;
+    }
+
+    boolean hasReachedMinimumFlushSize()
+    {
+        return totalBytesAllocated >= minimumFlushBytes;
+    }
+
+    long getMinimumFlushBytes()
+    {
+        return minimumFlushBytes;
+    }
+
+    /**
+     * This method does three things:
+     *
+     * 1.) It decrements active builder count and updates the global minimum flush size to reflect that.
+     * 2.) It releases the builder's memory against its limiter.
+     * 3.) It defensively marks the builder inactive to make sure nothing bad happens if we try to close it twice.
+     *
+     * @param indexComponents used only to format the warning logged when the builder was already released
+     *
+     * @return the number of bytes currently used by the memory limiter
+     */
+    long release(IndexComponents indexComponents)
+    {
+        if (active)
+        {
+            minimumFlushBytes = limiter.limitBytes() / ACTIVE_BUILDER_COUNT.decrementAndGet();
+            long used = limiter.decrement(totalBytesAllocated);
+            active = false;
+            return used;
+        }
+
+        logger.warn(indexComponents.logMessage("Attempted to release storage attached index segment builder memory after builder marked inactive."));
+        return limiter.currentBytesUsed();
+    }
+
+    public abstract boolean isEmpty();
+
+    protected abstract long addInternal(ByteBuffer term, int segmentRowId);
+
+    protected abstract SegmentMetadata.ComponentMetadataMap flushInternal(IndexComponents indexComponents) throws IOException;
+
+    int getRowCount()
+    {
+        return rowCount;
+    }
+
+    /**
+     * @return true if next SSTable row ID exceeds max segment row ID
+     */
+    boolean exceedsSegmentLimit(long ssTableRowId)
+    {
+        if (getRowCount() == 0)
+            return false;
+
+        // To handle the case where there are many non-indexable rows. eg. rowId-1 and rowId-3B are indexable,
+        // the rest are non-indexable. We should flush them as 2 separate segments, because rowId-3B is going
+        // to cause error in on-disk index structure with 2B limitation.
+        return ssTableRowId - segmentRowIdOffset > lastValidSegmentRowID;
+    }
+
+    @VisibleForTesting
+    public static void updateLastValidSegmentRowId(long lastValidSegmentRowID)
+    {
+        testLastValidSegmentRowId = lastValidSegmentRowID;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SegmentMerger.java b/src/java/org/apache/cassandra/index/sai/disk/SegmentMerger.java
new file mode 100644
index 000000000000..2d4c486a630c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/SegmentMerger.java
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.disk.v1.InvertedIndexWriter;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+import org.apache.cassandra.index.sai.disk.v1.TermsReader;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+/**
+ * Responsible for merging index segments into a single segment during initial index build.
+ */
+public interface SegmentMerger extends Closeable
+{
+    void addSegment(ColumnContext context, SegmentMetadata segment, SSTableIndex.PerIndexFiles indexFiles) throws IOException;
+
+    boolean isEmpty();
+
+    SegmentMetadata merge(ColumnContext context, IndexComponents components, DecoratedKey minKey, DecoratedKey maxKey, long maxSSTableRowId) throws IOException;
+
+    @SuppressWarnings("resource")
+    static SegmentMerger newSegmentMerger(boolean literal)
+    {
+        return literal ? new LiteralSegmentMerger() : new NumericSegmentMerger();
+    }
+
+    class LiteralSegmentMerger implements SegmentMerger
+    {
+        final List<TermsReader> readers = new ArrayList<>();
+        final List<TermsIterator> segmentTermsIterators = new ArrayList<>();
+
+        @Override
+        public void addSegment(ColumnContext context, SegmentMetadata segment, SSTableIndex.PerIndexFiles indexFiles) throws IOException
+        {
+            segmentTermsIterators.add(createTermsIterator(segment, indexFiles));
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return segmentTermsIterators.isEmpty();
+        }
+
+        @Override
+        public SegmentMetadata merge(ColumnContext context, IndexComponents components, DecoratedKey minKey, DecoratedKey maxKey, long maxSSTableRowId) throws IOException
+        {
+            try (final TermsIteratorMerger merger = new TermsIteratorMerger(segmentTermsIterators.toArray(new TermsIterator[0]), context.getValidator()))
+            {
+
+                SegmentMetadata.ComponentMetadataMap indexMetas;
+                long numRows;
+
+                try (InvertedIndexWriter indexWriter = new InvertedIndexWriter(components, false))
+                {
+                    indexMetas = indexWriter.writeAll(merger);
+                    numRows = indexWriter.getPostingsCount();
+                }
+                return new SegmentMetadata(0,
+                                           numRows,
+                                           merger.minSSTableRowId,
+                                           merger.maxSSTableRowId,
+                                           minKey,
+                                           maxKey,
+                                           merger.getMinTerm(),
+                                           merger.getMaxTerm(),
+                                           indexMetas);
+            }
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            readers.forEach(TermsReader::close);
+        }
+
+        @SuppressWarnings("resource")
+        private TermsIterator createTermsIterator(SegmentMetadata segment, SSTableIndex.PerIndexFiles indexFiles) throws IOException
+        {
+            final long root = segment.getIndexRoot(indexFiles.components().termsData);
+            assert root >= 0;
+            // the footer pointer is read from the TERMS_DATA attributes; -1 is passed on when the attribute is missing
+            final Map<String, String> map = segment.componentMetadatas.get(IndexComponents.NDIType.TERMS_DATA).attributes;
+            final String footerPointerString = map.get(SAICodecUtils.FOOTER_POINTER);
+            final long footerPointer = footerPointerString == null ? -1 : Long.parseLong(footerPointerString);
+            // the reader is built on shared copies of the file handles and tracked in readers so that close() releases it
+            final TermsReader termsReader = new TermsReader(indexFiles.components(),
+                                                            indexFiles.termsData().sharedCopy(),
+                                                            indexFiles.postingLists().sharedCopy(),
+                                                            root,
+                                                            footerPointer);
+            readers.add(termsReader);
+            return termsReader.allTerms(segment.segmentRowIdOffset, QueryEventListeners.NO_OP_TRIE_LISTENER);
+        }
+    }
+
+    class NumericSegmentMerger implements SegmentMerger
+    {
+        final List<BKDReader.IteratorState> segmentIterators = new ArrayList<>();
+        final List<BKDReader> readers = new ArrayList<>();
+
+        ByteBuffer minTerm = null, maxTerm = null;
+
+        @Override
+        public void addSegment(ColumnContext context, SegmentMetadata segment, SSTableIndex.PerIndexFiles indexFiles) throws IOException
+        {
+            minTerm = TypeUtil.min(segment.minTerm, minTerm, context.getValidator());
+            maxTerm = TypeUtil.max(segment.maxTerm, maxTerm, context.getValidator());
+
+            segmentIterators.add(createIteratorState(segment, indexFiles));
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return segmentIterators.isEmpty();
+        }
+
+        @Override
+        public SegmentMetadata merge(ColumnContext context, IndexComponents components, DecoratedKey minKey, DecoratedKey maxKey, long maxSSTableRowId) throws IOException
+        {
+            final MergeOneDimPointValues merger = new MergeOneDimPointValues(segmentIterators, context.getValidator());
+
+            final SegmentMetadata.ComponentMetadataMap componentMetadataMap;
+            try (NumericIndexWriter indexWriter = new NumericIndexWriter(components,
+                                                                         TypeUtil.fixedSizeOf(context.getValidator()),
+                                                                         maxSSTableRowId,
+                                                                         Integer.MAX_VALUE,
+                                                                         context.getIndexWriterConfig(),
+                                                                         false))
+            {
+                componentMetadataMap = indexWriter.writeAll(merger);
+            }
+            return new SegmentMetadata(0,
+                                       merger.getNumRows(),
+                                       merger.getMinRowID(),
+                                       merger.getMaxRowID(),
+                                       minKey,
+                                       maxKey,
+                                       minTerm,
+                                       maxTerm,
+                                       componentMetadataMap);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            segmentIterators.forEach(BKDReader.IteratorState::close);
+            readers.forEach(BKDReader::close);
+        }
+
+        @SuppressWarnings("resource")
+        private BKDReader.IteratorState createIteratorState(SegmentMetadata segment, SSTableIndex.PerIndexFiles indexFiles) throws IOException
+        {
+            final long bkdPosition = segment.getIndexRoot(indexFiles.components().kdTree);
+            assert bkdPosition >= 0;
+            final long postingsPosition = segment.getIndexRoot(indexFiles.components().kdTreePostingLists);
+            assert postingsPosition >= 0;
+            // the reader is tracked in readers so close() can release it; row ids are shifted by the segment's row id offset
+            final BKDReader bkdReader = new BKDReader(indexFiles.components(),
+                                                      indexFiles.kdtree().sharedCopy(),
+                                                      bkdPosition,
+                                                      indexFiles.kdtreePostingLists().sharedCopy(),
+                                                      postingsPosition);
+            readers.add(bkdReader);
+            return bkdReader.iteratorState(rowid -> rowid + segment.segmentRowIdOffset);
+        }
+    }
+}
+
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SegmentMetadata.java b/src/java/org/apache/cassandra/index/sai/disk/SegmentMetadata.java
new file mode 100644
index 000000000000..543932b90a03
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/SegmentMetadata.java
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.disk.v1.MetadataWriter;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * Multiple {@link SegmentMetadata} are stored in {@link IndexComponents.NDIType#meta} file, each corresponds to an on-disk
+ * index segment.
+ */
+public class SegmentMetadata implements Comparable<SegmentMetadata>
+{
+    private static final String NAME = "SegmentMetadata";
+
+    /**
+     * Used to compute the sstableRowId, which equals this offset plus the segmentRowId.
+     */
+    public final long segmentRowIdOffset;
+
+    /**
+     * Min and max sstable rowId in current segment.
+     *
+     * For index generated by compaction, minSSTableRowId is the same as segmentRowIdOffset.
+     * But for flush, segmentRowIdOffset is taken from previous segment's maxSSTableRowId.
+     */
+    public final long minSSTableRowId;
+    public final long maxSSTableRowId;
+
+    /**
+     * number of indexed rows (aka. pair of term and segmentRowId) in current segment
+     */
+    public final long numRows;
+
+    /**
+     * Ordered by their token position in current segment
+     */
+    public final DecoratedKey minKey;
+    public final DecoratedKey maxKey;
+
+    /**
+     * Minimum and maximum indexed column value ordered by its {@link org.apache.cassandra.db.marshal.AbstractType}.
+     */
+    public final ByteBuffer minTerm, maxTerm;
+
+    /**
+     * Root, offset, length for each index structure in the segment.
+     *
+     * Note: postings block offsets are stored in terms dictionary, no need to worry about its root.
+     */
+    public final ComponentMetadataMap componentMetadatas;
+
+    SegmentMetadata(long segmentRowIdOffset,
+                    long numRows,
+                    long minSSTableRowId,
+                    long maxSSTableRowId,
+                    DecoratedKey minKey,
+                    DecoratedKey maxKey,
+                    ByteBuffer minTerm,
+                    ByteBuffer maxTerm,
+                    ComponentMetadataMap componentMetadatas)
+    {
+        assert numRows < Integer.MAX_VALUE;
+        Objects.requireNonNull(minKey);
+        Objects.requireNonNull(maxKey);
+        Objects.requireNonNull(minTerm);
+        Objects.requireNonNull(maxTerm);
+
+        this.segmentRowIdOffset = segmentRowIdOffset;
+        this.minSSTableRowId = minSSTableRowId;
+        this.maxSSTableRowId = maxSSTableRowId;
+        this.numRows = numRows;
+        this.minKey = minKey;
+        this.maxKey = maxKey;
+        this.minTerm = minTerm;
+        this.maxTerm = maxTerm;
+        this.componentMetadatas = componentMetadatas;
+    }
+
+    private static final Logger logger = LoggerFactory.getLogger(SegmentMetadata.class);
+
+    @SuppressWarnings("resource")
+    private SegmentMetadata(IndexInput input, ICompressor compressor) throws IOException
+    {
+        this.segmentRowIdOffset = input.readLong();
+
+        numRows = input.readLong();
+        minSSTableRowId = input.readLong();
+        maxSSTableRowId = input.readLong();
+        minKey = DatabaseDescriptor.getPartitioner().decorateKey(readBytes(input));
+        maxKey = DatabaseDescriptor.getPartitioner().decorateKey(readBytes(input));
+
+        if (compressor != null)
+        {
+            IndexInput cryptoIn = CryptoUtils.uncompress(input, compressor);
+
+            assert cryptoIn.length() > 0;
+
+            minTerm = readBytes(cryptoIn);
+            maxTerm = readBytes(cryptoIn);
+        }
+        else
+        {
+            minTerm = readBytes(input);
+            maxTerm = readBytes(input);
+        }
+        componentMetadatas = new ComponentMetadataMap(input);
+    }
+
+    @SuppressWarnings("resource")
+    public static List<SegmentMetadata> load(MetadataSource source, ICompressor compressor) throws IOException
+    {
+        final IndexInput in = source.get(NAME);
+        // the stream starts with a vint segment count, followed by that many serialized segments
+        final int expected = in.readVInt();
+        final List<SegmentMetadata> loaded = new ArrayList<>(expected);
+
+        // segments are deserialized back-to-back from the same input
+        while (loaded.size() < expected)
+        {
+            loaded.add(new SegmentMetadata(in, compressor));
+        }
+
+        return loaded;
+    }
+
+    /**
+     * Writes disk metadata for the given segment list: a vint segment count followed by each segment's fields.
+     */
+    @SuppressWarnings("resource")
+    public static void write(MetadataWriter writer, List<SegmentMetadata> segments, ICompressor compressor) throws IOException
+    {
+        try (IndexOutput output = writer.builder(NAME))
+        {
+            output.writeVInt(segments.size());
+            // per segment: four longs, the min/max keys, the min/max terms, then the component metadata
+            for (SegmentMetadata metadata : segments)
+            {
+                output.writeLong(metadata.segmentRowIdOffset);
+                output.writeLong(metadata.numRows);
+                output.writeLong(metadata.minSSTableRowId);
+                output.writeLong(metadata.maxSSTableRowId);
+
+                if (compressor != null)
+                {
+                    Stream.of(metadata.minKey.getKey(), metadata.maxKey.getKey()).forEach(bb -> writeBytes(bb, output));
+                    // with a compressor, only the two terms are staged in memory and emitted through CryptoUtils.compress
+                    RAMIndexOutput out = new RAMIndexOutput("");
+                    writeBytes(metadata.minTerm, out);
+                    writeBytes(metadata.maxTerm, out);
+
+                    CryptoUtils.compress(new BytesRef(out.getBytes(), 0, (int)out.getFilePointer()), output, compressor);
+                }
+                else
+                {
+                    Stream.of(metadata.minKey.getKey(), metadata.maxKey.getKey(), metadata.minTerm, metadata.maxTerm).forEach(bb -> writeBytes(bb, output));
+                }
+
+                metadata.componentMetadatas.write(output);
+            }
+        }
+    }
+
+    @Override
+    public int compareTo(SegmentMetadata other)
+    {
+        return Long.compare(this.segmentRowIdOffset, other.segmentRowIdOffset);
+    }
+
+    @Override
+    public String toString()
+    {
+        return "SegmentMetadata{" +
+               "segmentRowIdOffset=" + segmentRowIdOffset +
+               ", minSSTableRowId=" + minSSTableRowId +
+               ", maxSSTableRowId=" + maxSSTableRowId +
+               ", numRows=" + numRows +
+               ", componentMetadatas=" + componentMetadatas +
+               '}';
+    }
+
+    private static ByteBuffer readBytes(IndexInput input) throws IOException
+    {
+        int len = input.readVInt();
+        byte[] bytes = new byte[len];
+        input.readBytes(bytes, 0, len);
+        return ByteBuffer.wrap(bytes);
+    }
+
+    private static void writeBytes(ByteBuffer buf, IndexOutput out)
+    {
+        try
+        {
+            byte[] bytes = ByteBufferUtil.getArray(buf);
+            out.writeVInt(bytes.length);
+            out.writeBytes(bytes, 0, bytes.length);
+        }
+        catch (IOException ioe)
+        {
+            throw new RuntimeException(ioe);
+        }
+    }
+
+    long getIndexRoot(IndexComponents.IndexComponent component)
+    {
+        return componentMetadatas.get(component.ndiType).root;
+    }
+
+    public long getIndexOffset(IndexComponents.IndexComponent component)
+    {
+        return componentMetadatas.get(component.ndiType).offset;
+    }
+
+    public long getIndexLength(IndexComponents.IndexComponent component)
+    {
+        return componentMetadatas.get(component.ndiType).length;
+    }
+
+    public static class ComponentMetadataMap
+    {
+        private final Map<IndexComponents.NDIType, ComponentMetadata> metas = new HashMap<>();
+
+        ComponentMetadataMap(IndexInput input) throws IOException
+        {
+            int size = input.readInt();
+
+            for (int i = 0; i < size; i++)
+            {
+                metas.put(IndexComponents.NDIType.valueOf(input.readString()), new ComponentMetadata(input));
+            }
+        }
+
+        public ComponentMetadataMap()
+        {
+        }
+
+        public void put(IndexComponents.NDIType ndiType, long root, long offset, long length)
+        {
+            metas.put(ndiType, new ComponentMetadata(root, offset, length));
+        }
+
+        public void put(IndexComponents.NDIType ndiType, long root, long offset, long length, Map<String, String> additionalMap)
+        {
+            metas.put(ndiType, new ComponentMetadata(root, offset, length, additionalMap));
+        }
+
+        private void write(IndexOutput output) throws IOException
+        {
+            output.writeInt(metas.size());
+
+            for (Map.Entry<IndexComponents.NDIType, ComponentMetadata> entry : metas.entrySet())
+            {
+                output.writeString(entry.getKey().name());
+                entry.getValue().write(output);
+            }
+        }
+
+        public ComponentMetadata get(IndexComponents.NDIType ndiType)
+        {
+            ComponentMetadata found = metas.get(ndiType); // stored values are never null
+            if (found == null)
+                throw new IllegalArgumentException(ndiType + " ComponentMetadata not found");
+            return found;
+        }
+
+        public Map<String, Map<String, String>> asMap()
+        {
+            Map<String, Map<String, String>> metaAttributes = new HashMap<>();
+
+            for (Map.Entry<IndexComponents.NDIType, ComponentMetadata> entry : metas.entrySet())
+            {
+                String name = entry.getKey().name;
+                ComponentMetadata metadata = entry.getValue();
+
+                Map<String, String> componentAttributes = metadata.asMap();
+
+                assert !metaAttributes.containsKey(name) : "Found duplicate index type: " + name;
+                metaAttributes.put(name, componentAttributes);
+            }
+
+            return metaAttributes;
+        }
+
+        @Override
+        public String toString()
+        {
+            return "ComponentMetadataMap{" +
+                   "metas=" + metas +
+                   '}';
+        }
+
+        public double indexSize()
+        {
+            return metas.values().stream().mapToLong(meta -> meta.length).sum();
+        }
+    }
+
+    public static class ComponentMetadata
+    {
+        public static final String ROOT = "Root";
+        public static final String OFFSET = "Offset";
+        public static final String LENGTH = "Length";
+
+        public final long root;
+        public final long offset;
+        public final long length;
+        public final Map<String,String> attributes;
+
+        ComponentMetadata(long root, long offset, long length)
+        {
+            this.root = root;
+            this.offset = offset;
+            this.length = length;
+            this.attributes = Collections.emptyMap();
+        }
+
+        ComponentMetadata(long root, long offset, long length, Map<String, String> attributes)
+        {
+            this.root = root;
+            this.offset = offset;
+            this.length = length;
+            this.attributes = attributes;
+        }
+
+        ComponentMetadata(IndexInput input) throws IOException
+        {
+            this.root = input.readLong();
+            this.offset = input.readLong();
+            this.length = input.readLong();
+            int size = input.readInt();
+
+            attributes = new HashMap<>(size);
+            for (int x=0; x < size; x++)
+            {
+                String key = input.readString();
+                String value = input.readString();
+
+                attributes.put(key, value);
+            }
+        }
+
+        public void write(IndexOutput output) throws IOException
+        {
+            output.writeLong(root);
+            output.writeLong(offset);
+            output.writeLong(length);
+
+            output.writeInt(attributes.size());
+            for (Map.Entry<String,String> entry : attributes.entrySet())
+            {
+                output.writeString(entry.getKey());
+                output.writeString(entry.getValue());
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("ComponentMetadata{root=%d, offset=%d, length=%d, attributes=%s}", root, offset, length, attributes.toString());
+        }
+
+        public Map<String, String> asMap()
+        {
+            return ImmutableMap.<String, String>builder().putAll(attributes).put(OFFSET, Long.toString(offset)).put(LENGTH, Long.toString(length)).put(ROOT, Long.toString(root)).build();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java
new file mode 100644
index 000000000000..66cae625f792
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/StorageAttachedIndexWriter.java
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.memory.RowMapping;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.schema.CompressionParams;
+
+/**
+ * Writes all on-disk index structures attached to a given SSTable.
+ */
+public class StorageAttachedIndexWriter implements SSTableFlushObserver
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final Descriptor descriptor;
+    private final Collection<StorageAttachedIndex> indices;
+    private final Collection<ColumnIndexWriter> columnIndexWriters;
+    private final SSTableComponentsWriter sstableComponentsWriter;
+    private final Stopwatch stopwatch = Stopwatch.createUnstarted();
+    private final RowMapping rowMapping;
+
+    private DecoratedKey currentKey;
+    private boolean tokenOffsetWriterCompleted = false;
+    private boolean aborted = false;
+
+    private long sstableRowId = 0;
+
+    public StorageAttachedIndexWriter(Descriptor descriptor,
+                                      Collection<StorageAttachedIndex> indices,
+                                      LifecycleNewTracker tracker, CompressionParams compressionParams) throws IOException
+    {
+        this(descriptor, indices, tracker, false, compressionParams);
+    }
+
+    public StorageAttachedIndexWriter(Descriptor descriptor,
+                                      Collection<StorageAttachedIndex> indices,
+                                      LifecycleNewTracker tracker,
+                                      boolean perColumnOnly, CompressionParams compressionParams) throws IOException
+    {
+        this.descriptor = descriptor;
+        this.indices = indices;
+        this.rowMapping = RowMapping.create(tracker.opType());
+        this.columnIndexWriters = indices.stream().map(i -> i.newIndexWriter(descriptor, tracker, rowMapping, compressionParams))
+                                         .filter(Objects::nonNull) // a null here means the column had no data to flush
+                                         .collect(Collectors.toList());
+
+        this.sstableComponentsWriter = perColumnOnly ? SSTableComponentsWriter.NONE : new SSTableComponentsWriter(descriptor, compressionParams);
+    }
+
+    @Override
+    public void begin()
+    {
+        logger.debug(logMessage("Starting partition iteration for storage attached index flush for SSTable {}..."), descriptor);
+        stopwatch.start();
+    }
+
+    @Override
+    public void startPartition(DecoratedKey key, long position)
+    {
+        if (aborted) return;
+        
+        currentKey = key;
+        sstableComponentsWriter.startPartition(key, position);
+    }
+
+    @Override
+    public void nextUnfilteredCluster(Unfiltered unfiltered, long position)
+    {
+        if (aborted) return;
+        
+        try
+        {
+            // Ignore range tombstones...
+            if (unfiltered.isRow())
+            {
+                sstableComponentsWriter.nextUnfilteredCluster(unfiltered, position);
+                rowMapping.add(currentKey, unfiltered, sstableRowId);
+
+                for (ColumnIndexWriter w : columnIndexWriters)
+                {
+                    w.addRow(currentKey, sstableRowId, (Row) unfiltered);
+                }
+
+                sstableRowId++;
+            }
+        }
+        catch (Throwable t)
+        {
+            logger.error(logMessage("Failed to record a row during an index build"), t);
+            abort(t, true);
+        }
+    }
+
+    @Override
+    public void partitionLevelDeletion(DeletionTime deletionTime, long position)
+    {
+        // Deletions (including partition deletions) are accounted for during reads.
+    }
+
+    @Override
+    public void staticRow(Row staticRow, long position)
+    {
+        if (aborted) return;
+        
+        if (staticRow.isEmpty())
+            return;
+
+        try
+        {
+            sstableComponentsWriter.staticRow(staticRow, position);
+            rowMapping.add(currentKey, staticRow, sstableRowId);
+
+            for (ColumnIndexWriter w : columnIndexWriters)
+            {
+                w.addRow(currentKey, sstableRowId, staticRow);
+            }
+
+            sstableRowId++;
+        }
+        catch (Throwable t)
+        {
+            logger.error(logMessage("Failed to record a static row during an index build"), t);
+            abort(t, true);
+        }
+    }
+
+    @Override
+    public void complete()
+    {
+        if (aborted) return;
+        
+        logger.debug(logMessage("Completed partition iteration for index flush for SSTable {}. Elapsed time: {} ms"),
+                     descriptor, stopwatch.elapsed(TimeUnit.MILLISECONDS));
+
+        try
+        {
+            sstableComponentsWriter.complete();
+            tokenOffsetWriterCompleted = true;
+
+            logger.debug(logMessage("Flushed tokens and offsets for SSTable {}. Elapsed time: {} ms."),
+                         descriptor, stopwatch.elapsed(TimeUnit.MILLISECONDS));
+
+            rowMapping.complete();
+
+            for (ColumnIndexWriter columnIndexWriter : columnIndexWriters)
+            {
+                columnIndexWriter.flush();
+            }
+        }
+        catch (Throwable t)
+        {
+            logger.error(logMessage("Failed to complete an index build"), t);
+            abort(t, true);
+        }
+    }
+
+    /**
+     * Aborts all column index writers and, only if they have not yet completed, SSTable-level component writers.
+     * 
+     * @param accumulator the initial exception thrown from the failed writer
+     */
+    @Override
+    public void abort(Throwable accumulator)
+    {
+        abort(accumulator, false);
+    }
+
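+    /**
+     * Marks this writer as aborted, then aborts every column index writer and, unless it already completed, the SSTable-level component writer.
+     * @param accumulator original cause of the abort; when non-null, failures thrown while aborting column index writers are added to it as suppressed
+     * @param fromIndex true if the cause of the abort was the index itself (all involved indexes are then made non-queryable), false otherwise
+     */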
+    public void abort(Throwable accumulator, boolean fromIndex)
+    {
+        // Mark the write aborted, so we can short-circuit any further operations on the component writers.
+        aborted = true;
+        
+        // Make any indexes involved in this transaction non-queryable, as they will likely not match the backing table.
+        if (fromIndex)
+            indices.forEach(StorageAttachedIndex::makeIndexNonQueryable);
+        
+        for (ColumnIndexWriter writer : columnIndexWriters)
+        {
+            try
+            {
+                writer.abort(accumulator);
+            }
+            catch (Throwable t)
+            {
+                if (accumulator != null)
+                {
+                    accumulator.addSuppressed(t);
+                }
+            }
+        }
+        
+        if (!tokenOffsetWriterCompleted)
+        {
+            // If the token/offset files have already been written successfully, they can be reused later. 
+            sstableComponentsWriter.abort(accumulator);
+        }
+    }
+
+    /**
+     * A helper method for constructing consistent log messages. This method is different to similar helper
+     * methods in that log messages generated in this class are not necessarily related to a single index
+     * so the log message is decorated as follows:
+     *
+     * [ks.tb.*] Log message
+     *
+     * @param message The raw content of a logging message.
+     *
+     * @return A log message with the proper keyspace and table name prepended to it.
+     */
+    public String logMessage(String message)
+    {
+        // Index names are unique only within a keyspace.
+        return String.format("[%s.%s.*] %s", descriptor.ksname, descriptor.cfname, message);
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/TermsIterator.java b/src/java/org/apache/cassandra/index/sai/disk/TermsIterator.java
new file mode 100644
index 000000000000..368f8e30a64c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/TermsIterator.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Iterator to step through terms to obtain {@link PostingList} for the current term.
+ *
+ * Term enumerations are always ordered by their {@link ByteSource}.
+ */
+@NotThreadSafe
+public interface TermsIterator extends Iterator<ByteComparable>, Closeable
+{
+    /**
+     * Get {@link PostingList} for the current term.
+     */
+    PostingList postings() throws IOException;
+
+    ByteBuffer getMinTerm();
+
+    ByteBuffer getMaxTerm();
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/TermsIteratorMerger.java b/src/java/org/apache/cassandra/index/sai/disk/TermsIteratorMerger.java
new file mode 100644
index 000000000000..bfdad083ba56
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/TermsIteratorMerger.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Comparator;
+import java.util.PriorityQueue;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.disk.v1.MergePostingList;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+
+public class TermsIteratorMerger implements TermsIterator
+{
+    private final TermsIterator[] iterators;
+    private final MergingIterator mergedIterator;
+    private final AbstractType<?> type;
+
+    public long maxSSTableRowId = -1;
+    public long minSSTableRowId = Long.MAX_VALUE;
+    public ByteComparable minTerm;
+    public ByteComparable maxTerm;
+
+    public TermsIteratorMerger(final TermsIterator[] iterators, AbstractType<?> type)
+    {
+        this.iterators = iterators;
+        this.mergedIterator = new MergingIterator(type, iterators);
+        this.type = type;
+    }
+
+    @Override
+    public ByteBuffer getMinTerm()
+    {
+        byte[] bytes = ByteSourceInverse.readBytes(minTerm.asPeekableBytes(ByteComparable.Version.OSS41));
+        return ByteBuffer.wrap(bytes);
+    }
+
+    @Override
+    public ByteBuffer getMaxTerm()
+    {
+        byte[] bytes = ByteSourceInverse.readBytes(maxTerm.asPeekableBytes(ByteComparable.Version.OSS41));
+        return ByteBuffer.wrap(bytes);
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        for (TermsIterator iterator : iterators)
+            iterator.close();
+    }
+
+    @Override
+    public boolean hasNext()
+    {
+        return mergedIterator.hasNext();
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    public PostingList postings() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> postingLists = new PriorityQueue<>(100, Comparator.comparingLong(PostingList.PeekablePostingList::peek));
+        for (int x = 0; x < mergedIterator.getNumTop(); x++)
+        {
+            final int index = mergedIterator.top[x].index;
+            final TermsIterator termsIterator = iterators[index];
+            final PostingList postings = termsIterator.postings();
+
+            postingLists.add(postings.peekable());
+        }
+        return new MonitoringPostingList(MergePostingList.merge(postingLists));
+    }
+
+    @Override
+    public ByteComparable next()
+    {
+        ByteComparable nextTerm = mergedIterator.next(); // advance the merge, then fold the term into the running bounds
+        minTerm = type.isReversed() ? TypeUtil.max(nextTerm, minTerm) : TypeUtil.min(nextTerm, minTerm);
+        maxTerm = type.isReversed() ? TypeUtil.min(nextTerm, maxTerm) : TypeUtil.max(nextTerm, maxTerm); // compare with maxTerm, not minTerm
+
+        return nextTerm;
+    }
+
+    private class MonitoringPostingList implements PostingList
+    {
+        private final PostingList monitored;
+
+        private MonitoringPostingList(PostingList monitored)
+        {
+            this.monitored = monitored;
+        }
+
+        @Override
+        public long nextPosting() throws IOException
+        {
+            long next = monitored.nextPosting();
+            if (next != PostingList.END_OF_STREAM)
+            {
+                minSSTableRowId = Math.min(minSSTableRowId, next);
+                maxSSTableRowId = Math.max(maxSSTableRowId, next);
+            }
+            return next;
+        }
+
+        @Override
+        public long size()
+        {
+            return monitored.size();
+        }
+
+        @Override
+        public long advance(long targetRowID) throws IOException
+        {
+            return monitored.advance(targetRowID);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            monitored.close();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/format/Version.java b/src/java/org/apache/cassandra/index/sai/disk/format/Version.java
new file mode 100644
index 000000000000..85b6d2ce0b7b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/format/Version.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.format;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * Format version of indexing component, denoted as [major][minor]. Same forward-compatibility rules apply as to
+ * {@link org.apache.cassandra.io.sstable.format.Version}.
+ */
+public class Version
+{
+    private static final Version AA = new Version('a', 'a');
+
+    public static final Version EARLIEST = AA;
+    public static final Version LATEST = AA;
+
+    private final String version;
+
+    public Version(char major, char minor)
+    {
+        this.version = major + "" + minor;
+    }
+
+    public static Version parse(String input)
+    {
+        checkArgument(input.length() == 2);
+        return new Version(input.charAt(0), input.charAt(1));
+    }
+
+    @Override
+    public String toString()
+    {
+        return version;
+    }
+
+    public boolean onOrAfter(Version other)
+    {
+        return version.compareTo(other.version) >= 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/BytesRefUtil.java b/src/java/org/apache/cassandra/index/sai/disk/io/BytesRefUtil.java
new file mode 100644
index 000000000000..ccad8f85f51a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/BytesRefUtil.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.lucene.util.BytesRefBuilder;
+
+public final class BytesRefUtil
+{
+    private BytesRefUtil() {}
+
+    public static void copyBufferToBytesRef(ByteBuffer buffer, BytesRefBuilder stringBuffer)
+    {
+        int length = buffer.remaining();
+        stringBuffer.clear();
+        stringBuffer.grow(length);
+        FastByteOperations.copy(buffer, buffer.position(), stringBuffer.bytes(), 0, buffer.remaining());
+        stringBuffer.setLength(length);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/CryptoUtils.java b/src/java/org/apache/cassandra/index/sai/disk/io/CryptoUtils.java
new file mode 100644
index 000000000000..06b3572edfd1
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/CryptoUtils.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.compress.CompressionMetadata;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.lucene.store.ByteArrayIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+
+public class CryptoUtils
+{
+
+    public static CompressionMetadata getCompressionMeta(SSTableReader ssTableReader)
+    {
+        return ssTableReader.compression ? ssTableReader.getCompressionMetadata() : null;
+    }
+
+    public static CompressionParams getCompressionParams(SSTableReader ssTableReader)
+    {
+        return getCompressionParams(getCompressionMeta(ssTableReader));
+    }
+
+    public static CompressionParams getCompressionParams(CompressionMetadata meta)
+    {
+        return meta != null ? meta.parameters : null;
+    }
+
+    //TODO Encryption tidyup
+//    public static ICompressor getEncryptionCompressor(CompressionParams compressionParams)
+//    {
+//        ICompressor compressor = compressionParams != null ? compressionParams.getSstableCompressor() : null;
+//        return compressor != null ? compressor.encryptionOnly() : null;
+//    }
+//
+//    public static boolean isCryptoEnabled(CompressionParams params)
+//    {
+//        ICompressor sstableCompressor = params != null ? params.getSstableCompressor() : null;
+//        return sstableCompressor != null && sstableCompressor.encryptionOnly() != null ? true : false;
+//    }
+
+    public static IndexInput uncompress(IndexInput input, ICompressor compressor) throws IOException
+    {
+        return uncompress(input, compressor,
+                          new BytesRef(new byte[16]), new BytesRef(new byte[16])
+        );
+    }
+
+    /**
+     * Takes an {@link IndexInput} with compressed/encrypted data and returns another {@link IndexInput} with
+     * that data uncompressed/decrypted.
+     */
+    public static IndexInput uncompress(IndexInput input, ICompressor compressor, BytesRef compBytes, BytesRef uncompBytes) throws IOException
+    {
+        final int uncompBytesLen = input.readVInt();
+        final int compBytesLength = input.readVInt();
+
+        assert compBytesLength > 0 : "uncompBytesLen="+uncompBytesLen+" compBytesLength="+compBytesLength;
+
+        compBytes.bytes = ArrayUtil.grow(compBytes.bytes, compBytesLength);
+
+        input.readBytes(compBytes.bytes, 0, compBytesLength);
+
+        if (uncompBytes.bytes == BytesRef.EMPTY_BYTES)
+        {
+            // if EMPTY_BYTES use an exact new byte array
+            uncompBytes.bytes = new byte[uncompBytesLen];
+            uncompBytes.length = uncompBytesLen;
+        }
+        else
+        {
+            uncompBytes.bytes = ArrayUtil.grow(uncompBytes.bytes, uncompBytesLen);
+            uncompBytes.length = uncompBytesLen;
+        }
+        compressor.uncompress(compBytes.bytes, 0, compBytesLength, uncompBytes.bytes, 0);
+
+        return new ByteArrayIndexInput("", uncompBytes.bytes, 0, uncompBytesLen);
+    }
+
+    public static void compress(BytesRef uncompBytes,
+                                IndexOutput out, ICompressor compressor) throws IOException
+    {
+       compress(uncompBytes, new BytesRef(new byte[16]), out, compressor);
+    }
+
+    public static void compress(BytesRef uncompBytes, BytesRef compBytes,
+                                IndexOutput out, ICompressor compressor) throws IOException
+    {
+        ByteBuffer input = ByteBuffer.wrap(uncompBytes.bytes, uncompBytes.offset, uncompBytes.length); // honour the BytesRef offset
+
+        final int initCompLen = compressor.initialCompressedBufferLength(uncompBytes.length);
+
+        compBytes.bytes = ArrayUtil.grow(compBytes.bytes, initCompLen); // reusable scratch buffer, always filled from index 0
+        compBytes.length = initCompLen;
+
+        ByteBuffer output = ByteBuffer.wrap(compBytes.bytes);
+
+        compressor.compress(input, output);
+
+        final int compLen = output.position(); // the wrapped buffer starts at position 0, so this is the compressed size
+
+        compBytes.length = compLen;
+
+        assert uncompBytes.length > 0;
+        assert compLen > 0;
+
+        out.writeVInt(uncompBytes.length); // header: uncompressed length, then compressed length, then the payload
+        out.writeVInt(compLen);
+
+        out.writeBytes(compBytes.bytes, compLen);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/EmptyDirectory.java b/src/java/org/apache/cassandra/index/sai/disk/io/EmptyDirectory.java
new file mode 100644
index 000000000000..8269304f4ed2
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/EmptyDirectory.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.util.Collection;
+
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Lock;
+
+/**
+ * Always empty directory. Any operations to create, delete or open index files are unsupported.
+ */
+public final class EmptyDirectory extends Directory
+{
+    public static final Directory INSTANCE = new EmptyDirectory();
+
+    @Override
+    public String[] listAll()
+    {
+        return new String[0];
+    }
+
+    @Override
+    public void close()
+    {
+        // no-op
+    }
+
+    @Override
+    public void deleteFile(String name)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long fileLength(String name)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public IndexOutput createOutput(String name, IOContext context)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public IndexOutput createTempOutput(String prefix, String suffix, IOContext context)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void sync(Collection<String> names)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void syncMetaData()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void rename(String source, String dest)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public IndexInput openInput(String name, IOContext context)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Lock obtainLock(String name)
+    {
+        throw new UnsupportedOperationException();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/FilterIndexInput.java b/src/java/org/apache/cassandra/index/sai/disk/io/FilterIndexInput.java
new file mode 100644
index 000000000000..5a8092d8222b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/FilterIndexInput.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+
+public abstract class FilterIndexInput extends IndexInput
+{
+    private final IndexInput delegate;
+
+    protected FilterIndexInput(IndexInput delegate)
+    {
+        super(delegate.toString());
+        this.delegate = delegate;
+    }
+
+    public IndexInput getDelegate()
+    {
+        return delegate;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        delegate.close();
+    }
+
+    @Override
+    public long getFilePointer()
+    {
+        return delegate.getFilePointer();
+    }
+
+    @Override
+    public void seek(long pos) throws IOException
+    {
+        delegate.seek(pos);
+    }
+
+    @Override
+    public long length()
+    {
+        return delegate.length();
+    }
+
+    @Override
+    public IndexInput slice(String sliceDescription, long offset, long length) throws IOException
+    {
+        return delegate.slice(sliceDescription, offset, length);
+    }
+
+    @Override
+    public byte readByte() throws IOException
+    {
+        return delegate.readByte();
+    }
+
+    @Override
+    public void readBytes(byte[] b, int offset, int len) throws IOException
+    {
+        delegate.readBytes(b, offset, len);
+    }
+
+    @Override
+    public String toString()
+    {
+        return delegate.toString();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/IndexComponents.java b/src/java/org/apache/cassandra/index/sai/disk/io/IndexComponents.java
new file mode 100644
index 000000000000..20e48d0a9c03
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/IndexComponents.java
@@ -0,0 +1,768 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Stream;
+import java.util.zip.CRC32;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Strings;
+import com.google.common.collect.ObjectArrays;
+import com.google.common.io.Files;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.disk.v1.NumericValuesMeta;
+import org.apache.cassandra.index.sai.disk.v1.PostingsWriter;
+import org.apache.cassandra.index.sai.disk.v1.TrieTermsDictionaryWriter;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+/**
+ * //TODO Need to consider how we handle encryption in OS
+ * The {@link Component}s that storage-attached indexing attaches to an SSTable.
+ *
+ * It allows us to unify index file creation, and ensures they will follow the same naming convention.
+ */
+public class IndexComponents
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    public static final String TYPE_PREFIX = "SAI";
+    private static final String PER_SSTABLE_FILE_NAME_FORMAT = TYPE_PREFIX + "_%s.db";
+//    public static final String LEGACY_PER_COLUMN_FILE_NAME_FORMAT = "%s_" + PER_SSTABLE_FILE_NAME_FORMAT;
+    public static final String PER_COLUMN_FILE_NAME_FORMAT = TYPE_PREFIX + "_%s_%s.db";
+
+    public static class IndexComponent extends Component
+    {
+        public final NDIType ndiType;
+
+        private IndexComponent(NDIType ndiType, String name)
+        {
+            super(Type.CUSTOM, name);
+            this.ndiType = ndiType;
+        }
+    }
+
+    public enum NDIType
+    {
+        // per-column components
+        /**
+         * Stores multiple {@link SegmentMetadata}s.
+         */
+        META("Meta", false),
+        /**
+         * KDTree written by {@link BKDWriter} indexes mappings of term to one or more segment row IDs
+         * (segment row ID = SSTable row ID - segment row ID offset).
+         */
+        KD_TREE("KDTree", false),
+        KD_TREE_POSTING_LISTS("KDTreePostingLists", false),
+        /**
+         * Term dictionary written by {@link TrieTermsDictionaryWriter} stores mappings of term and
+         * file pointer to posting block on posting file.
+         */
+        TERMS_DATA("TermsData", false, false, true),
+        /**
+         * Stores postings written by {@link PostingsWriter}
+         */
+        POSTING_LISTS("PostingLists", false),
+        /**
+         * If present indicates that the column index build completed successfully
+         */
+        COLUMN_COMPLETION_MARKER("ColumnComplete", false, true),
+
+        // per-sstable components
+        /**
+         * Partition key token value for rows including row tombstone and static row. (access key is rowId)
+         */
+        TOKEN_VALUES("TokenValues"),
+        /**
+         * Partition key offset in sstable data file for rows including row tombstone and static row. (access key is
+         * rowId)
+         */
+        OFFSETS_VALUES("OffsetsValues"),
+        /**
+         * Stores {@link NumericValuesMeta} for {@link NDIType#TOKEN_VALUES} and {@link NDIType#OFFSETS_VALUES}.
+         */
+        GROUP_META("GroupMeta"),
+        /**
+         * If present indicates that the per-sstable index build completed successfully
+         */
+        GROUP_COMPLETION_MARKER("GroupComplete", true, true);
+
+        public final String name;
+        private final boolean perSSTable;
+        private final boolean marker;
+        private final boolean encryptable;
+
+        NDIType(String name)
+        {
+            this(name, true, false);
+        }
+
+        NDIType(String name, boolean perSSTable)
+        {
+            this(name, perSSTable, false);
+        }
+
+        /**
+         * Creates a component type that is never encryptable.
+         */
+        NDIType(String name, boolean perSSTable, boolean marker)
+        {
+            this(name, perSSTable, marker, false);
+        }
+
+        NDIType(String name, boolean perSSTable, boolean marker, boolean encryptable)
+        {
+            this.name = name;
+            this.perSSTable = perSSTable;
+            this.marker = marker;
+            this.encryptable = encryptable;
+        }
+
+        public boolean encryptable()
+        {
+            return encryptable;
+        }
+
+        public boolean perSSTable()
+        {
+            return perSSTable;
+        }
+
+        public boolean completionMarker()
+        {
+            return marker;
+        }
+
+        private boolean perSegment()
+        {
+            return !perSSTable && this != META;
+        }
+
+        public IndexComponent newComponent()
+        {
+            assert perSSTable;
+            String componentName = String.format(PER_SSTABLE_FILE_NAME_FORMAT, name);
+
+            return new IndexComponent(this, componentName);
+        }
+
+        public IndexComponent newComponent(String indexName)
+        {
+            assert !perSSTable;
+            String componentName = String.format(PER_COLUMN_FILE_NAME_FORMAT, indexName, name);
+
+            return new IndexComponent(this, componentName);
+        }
+
+        @Override
+        public String toString()
+        {
+            return name;
+        }
+    }
+
+    public static final NDIType[] STRING_COMPONENTS = new NDIType[]{ NDIType.TERMS_DATA, NDIType.POSTING_LISTS };
+
+    private static final NDIType[] NUMERIC_COMPONENTS = new NDIType[]{ NDIType.KD_TREE, NDIType.KD_TREE_POSTING_LISTS };
+
+    private static final NDIType[] PER_COLUMN_COMPONENTS = new NDIType[]{ NDIType.COLUMN_COMPLETION_MARKER, NDIType.META };
+
+    private static final NDIType[] NUMERIC_PER_COLUMN_COMPONENTS = ObjectArrays.concat(PER_COLUMN_COMPONENTS, NUMERIC_COMPONENTS, NDIType.class);
+
+    private static final NDIType[] LITERAL_PER_COLUMN_COMPONENTS = ObjectArrays.concat(PER_COLUMN_COMPONENTS, STRING_COMPONENTS, NDIType.class);
+
+    private static final NDIType[] ALL_PER_COLUMN_COMPONENTS = ObjectArrays.concat(NUMERIC_PER_COLUMN_COMPONENTS, STRING_COMPONENTS, NDIType.class);
+
+    public static final IndexComponent TOKEN_VALUES = NDIType.TOKEN_VALUES.newComponent();
+
+    public static final IndexComponent OFFSETS_VALUES = NDIType.OFFSETS_VALUES.newComponent();
+
+    public static final IndexComponent GROUP_META = NDIType.GROUP_META.newComponent();
+
+    public static final IndexComponent GROUP_COMPLETION_MARKER = NDIType.GROUP_COMPLETION_MARKER.newComponent();
+
+
+    /**
+     * Files that are shared by all storage-attached indexes for each SSTable
+     */
+    public static final List<IndexComponent> PER_SSTABLE_COMPONENTS = Arrays.asList(GROUP_COMPLETION_MARKER, TOKEN_VALUES, OFFSETS_VALUES, GROUP_META);
+
+    public final IndexComponent termsData, postingLists, meta, groupCompletionMarker, kdTree, kdTreePostingLists, columnCompletionMarker;
+
+    private static final SequentialWriterOption defaultWriterOption = SequentialWriterOption.newBuilder()
+                                                                                            .trickleFsync(DatabaseDescriptor.getTrickleFsync())
+                                                                                            .trickleFsyncByteInterval(DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024)
+                                                                                            .bufferType(BufferType.OFF_HEAP)
+                                                                                            .finishOnClose(true)
+                                                                                            .build();
+
+    public final Descriptor descriptor;
+    public final String indexName;
+
+    private final SequentialWriterOption writerOption;
+    private final CompressionParams compressionParams;
+
+    IndexComponents(Descriptor descriptor, SequentialWriterOption sequentialWriterOption, CompressionParams compressionParams)
+    {
+        this(null, descriptor, sequentialWriterOption, compressionParams);
+    }
+
+    @VisibleForTesting
+    IndexComponents(String indexName, Descriptor descriptor, SequentialWriterOption sequentialWriterOption, CompressionParams compressionParams)
+    {
+        this.indexName = indexName;
+        this.descriptor = descriptor;
+        this.writerOption = sequentialWriterOption;
+
+        this.compressionParams = compressionParams;
+
+        termsData = NDIType.TERMS_DATA.newComponent(indexName);
+        postingLists = NDIType.POSTING_LISTS.newComponent(indexName);
+        meta = NDIType.META.newComponent(indexName);
+        groupCompletionMarker = NDIType.GROUP_COMPLETION_MARKER.newComponent();
+        kdTree = NDIType.KD_TREE.newComponent(indexName);
+        kdTreePostingLists = NDIType.KD_TREE_POSTING_LISTS.newComponent(indexName);
+        columnCompletionMarker = NDIType.COLUMN_COMPLETION_MARKER.newComponent(indexName);
+    }
+
+    /**
+     * Used to access per-sstable and per-index components
+     */
+    public static IndexComponents create(String indexName, SSTableReader ssTableReader)
+    {
+        return create(indexName, ssTableReader.descriptor, CryptoUtils.getCompressionParams(ssTableReader));
+    }
+
+    public static IndexComponents create(String indexName, Descriptor descriptor, CompressionParams params)
+    {
+        return new IndexComponents(indexName, descriptor, defaultWriterOption, params);
+    }
+
+    /**
+     * Returns the sstable {@link Component}s for the specified column index, excluding the shared ones.
+     */
+    public static Set<IndexComponent> perColumnComponents(String indexName, boolean isLiteral)
+    {
+        return components(indexName, isLiteral ? LITERAL_PER_COLUMN_COMPONENTS : NUMERIC_PER_COLUMN_COMPONENTS);
+    }
+
+    /**
+     * Used to access per-sstable shared components
+     */
+    public static IndexComponents perSSTable(Descriptor descriptor, CompressionParams params)
+    {
+        return new IndexComponents(descriptor, defaultWriterOption, params);
+    }
+
+    public static IndexComponents perSSTable(SSTableReader ssTableReader)
+    {
+        return perSSTable(ssTableReader.descriptor, CryptoUtils.getCompressionParams(ssTableReader));
+    }
+
+    /**
+     * @return <code>true</code> if an index build successfully completed building the per-SSTable
+     * components for the given SSTable
+     */
+    public static boolean isGroupIndexComplete(Descriptor descriptor)
+    {
+        return descriptor.fileFor(GROUP_COMPLETION_MARKER).exists();
+    }
+
+    /**
+     * @return <code>true</code> if an index build successfully completed for the given column index
+     */
+    public static boolean isColumnIndexComplete(Descriptor descriptor, String indexName)
+    {
+        return isGroupIndexComplete(descriptor) && descriptor.fileFor(NDIType.COLUMN_COMPLETION_MARKER.newComponent(indexName)).exists();
+    }
+
+    /**
+     * @return <code>true</code> if an index build successfully completed for the given column index but the SSTable
+     * had no indexable rows for it, i.e. exactly one per-column file exists; incomplete indexes skip the file scan
+     */
+    public static boolean isColumnIndexEmpty(Descriptor descriptor, String indexName)
+    {
+        return isColumnIndexComplete(descriptor, indexName)
+               && components(indexName, ALL_PER_COLUMN_COMPONENTS).stream().map(descriptor::fileFor).filter(File::exists).count() == 1;
+    }
+
+    /**
+     * Delete the per-SSTable index files from the filesystem
+     */
+    public static void deletePerSSTableIndexComponents(Descriptor descriptor)
+    {
+        PER_SSTABLE_COMPONENTS.stream()
+                              .map(descriptor::fileFor)
+                              .filter(File::exists)
+                              .forEach(IndexComponents::deleteComponent);
+    }
+
+    private static Set<IndexComponent> components(String indexName, NDIType... types)
+    {
+        Set<IndexComponent> components = new HashSet<>(types.length);
+        for (NDIType type : types)
+        {
+            components.add(type.newComponent(indexName));
+        }
+        return components;
+    }
+
+    private static void deleteComponent(File file)
+    {
+        logger.debug("Deleting storage attached index component file {}", file);
+        try
+        {
+            IOUtils.deleteFilesIfExist(file.toPath());
+        }
+        catch (IOException e)
+        {
+            logger.warn("Unable to delete storage attached index component file {} due to {}.", file, e.getMessage(), e);
+        }
+    }
+
+    /**
+     * @return total size (in bytes) of column index components
+     */
+    public long sizeOfPerColumnComponents()
+    {
+        return sizeOf(components(indexName, ALL_PER_COLUMN_COMPONENTS));
+    }
+
+    /**
+     * @return total size (in bytes) of per-SSTable index components
+     */
+    public long sizeOfPerSSTableComponents()
+    {
+        return sizeOf(PER_SSTABLE_COMPONENTS);
+    }
+
+    public long sizeOf(Collection<IndexComponent> components)
+    {
+        return components.stream().map(descriptor::fileFor).filter(File::exists).mapToLong(File::length).sum();
+    }
+
+    /**
+     * A helper method for constructing consistent log messages for specific column indexes.
+     *
+     * Example: For the index "idx" in keyspace "ks" on table "tb", calling this method with the raw message
+     * "Flushing new index segment..." will produce...
+     *
+     * "[ks.tb.idx] Flushing new index segment..."
+     *
+     * @param message The raw content of a logging message, without information identifying it with an index.
+     * @return A log message with the keyspace, table and index name prepended to it, in that order; "*" stands
+     * in for a null or empty index name.
+     */
+    public String logMessage(String message)
+    {
+        // Index names are unique only within a keyspace.
+        return String.format("[%s.%s.%s] %s", descriptor.ksname, descriptor.cfname, Strings.isNullOrEmpty(indexName) ? "*" : indexName, message);
+    }
+
+    /**
+     * Delete the underlying per-column index files from the filesystem.
+     */
+    public void deleteColumnIndex()
+    {
+        Stream.of(ALL_PER_COLUMN_COMPONENTS)
+              .map(type -> type.newComponent(indexName))
+              .map(descriptor::fileFor)
+              .filter(File::exists)
+              .forEach(IndexComponents::deleteComponent);
+    }
+
+    public FileHandle createFileHandle(IndexComponent component)
+    {
+        return createFileHandle(component, false);
+    }
+
+    public FileHandle createFileHandle(IndexComponent component, boolean temporary)
+    {
+        final File file = temporary ? descriptor.tmpFileFor(component) : descriptor.fileFor(component);
+
+        if (logger.isTraceEnabled())
+        {
+            logger.trace(logMessage("Opening {} file handle for {} ({})"), temporary ? "temporary" : "", file, FBUtilities.prettyPrintMemory(file.length()));
+        }
+
+        try (final FileHandle.Builder builder = new FileHandle.Builder(file.getAbsolutePath()).mmapped(true))
+        {
+            return builder.complete();
+        }
+    }
+
+    /** @return <code>true</code> if every per-SSTable component passes checksum validation; a failure is logged */
+    public boolean validatePerSSTableComponentsChecksum()
+    {
+        try
+        {
+            for (IndexComponent component : PER_SSTABLE_COMPONENTS)
+                validateComponent(component, true);
+            return true;
+        }
+        catch (Throwable e)
+        {
+            logger.warn(logMessage("Checksum validation failed on SSTable {}."), descriptor, e);
+            return false;
+        }
+    }
+
+    public boolean validatePerColumnComponentsChecksum(boolean isLiteral)
+    {
+        try
+        {
+            validatePerColumnComponents(isLiteral, true);
+            return true;
+        }
+        catch (Throwable e)
+        {
+            logger.warn(logMessage("Checksum validation failed on SSTable {}."), descriptor, e);
+            return false;
+        }
+    }
+
+    public void validatePerSSTableComponents() throws IOException
+    {
+        for (IndexComponent component : PER_SSTABLE_COMPONENTS)
+        {
+            validateComponent(component, false);
+        }
+    }
+
+    public void validatePerColumnComponents(boolean isLiteral) throws IOException
+    {
+        validatePerColumnComponents(isLiteral, false);
+    }
+
+    public IndexInput openInput(FileHandle handle)
+    {
+        return IndexInputReader.create(handle);
+    }
+
+    @SuppressWarnings("resource")
+    public IndexInput openBlockingInput(IndexComponent component)
+    {
+        final File file = descriptor.fileFor(component);
+        if (logger.isTraceEnabled())
+            logger.trace(logMessage("Opening blocking index input for file {} ({})"), file, FBUtilities.prettyPrintMemory(file.length()));
+        // Only the builder is closed on exit; the handle it produces is handed to the returned input.
+        try (final FileHandle.Builder builder = new FileHandle.Builder(file.getAbsolutePath()))
+        {
+            final FileHandle fileHandle = builder.complete();
+            final RandomAccessReader randomReader = fileHandle.createReader();
+            // Closing the returned input closes the reader and then the handle via the close hook.
+            return IndexInputReader.create(randomReader, fileHandle::close);
+        }
+    }
+
+    public IndexOutputWriter createOutput(IndexComponent component) throws IOException
+    {
+        return createOutput(component, false);
+    }
+
+    public IndexOutputWriter createOutput(IndexComponent component, boolean append) throws IOException
+    {
+        return createOutput(component, append, false);
+    }
+
+    public IndexOutputWriter createOutput(IndexComponent component, boolean append, boolean temporary) throws IOException
+    {
+        final File file = temporary ? descriptor.tmpFileFor(component) : descriptor.fileFor(component);
+
+        if (logger.isTraceEnabled())
+            logger.trace(logMessage("Creating {} sstable attached index output for component {} on file {}..."), temporary ? "temporary" : "", component, file);
+
+        IndexOutputWriter writer = createOutput(file, component.ndiType.encryptable());
+
+        if (append)
+        {
+            writer.skipBytes(file.length());
+        }
+
+        return writer;
+    }
+
+    public void deleteTemporaryComponents()
+    {
+        Stream.of(ALL_PER_COLUMN_COMPONENTS)
+              .map(type -> type.newComponent(indexName))
+              .map(descriptor::tmpFileFor)
+              .filter(File::exists)
+              .forEach(IndexComponents::deleteComponent);
+    }
+
+    public void deleteTemporaryComponent(IndexComponent component) throws IOException
+    {
+        final File file = descriptor.tmpFileFor(component);
+
+        // File.delete() signals failure only through its return value, so report it in the log.
+        if (file.exists() && !file.delete())
+            logger.warn("Failed to delete temporary file {}", file);
+    }
+
+    public CompressionParams getCompressionParams()
+    {
+        return compressionParams;
+    }
+
+    @SuppressWarnings("resource")
+    public IndexOutputWriter createOutput(File file, boolean encryptable)
+    {
+        assert writerOption.finishOnClose() : "IndexOutputWriter relies on close() to sync with disk.";
+        // NOTE(review): the encryptable flag is not used here; every file gets the plain checksumming writer.
+        return new IndexOutputWriter(new IncrementalChecksumSequentialWriter(file));
+    }
+
+    @VisibleForTesting
+    public IndexOutputWriter createOutput(File file)
+    {
+        return createOutput(file, false);
+    }
+
+    public void createGroupCompletionMarker() throws IOException
+    {
+        Files.touch(descriptor.fileFor(groupCompletionMarker));
+    }
+
+    public void createColumnCompletionMarker() throws IOException
+    {
+        Files.touch(descriptor.fileFor(columnCompletionMarker));
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this).add("descriptor", descriptor)
+                          .add("indexName", indexName)
+                          .toString();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        IndexComponents components = (IndexComponents) o;
+
+        if (descriptor != null ? !descriptor.equals(components.descriptor) : components.descriptor != null)
+            return false;
+        return indexName != null ? indexName.equals(components.indexName) : components.indexName == null;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        int result = descriptor != null ? descriptor.hashCode() : 0;
+        result = 31 * result + (indexName != null ? indexName.hashCode() : 0);
+        return result;
+    }
+
+    private void validatePerColumnComponents(boolean isLiteral, boolean checksum) throws IOException
+    {
+        MetadataSource source = MetadataSource.loadColumnMetadata(this);
+        List<SegmentMetadata> segments = SegmentMetadata.load(source, null);
+
+        for (IndexComponent component : perColumnComponents(indexName, isLiteral))
+        {
+            if (!component.ndiType.completionMarker())
+            {
+                if (component.ndiType.perSegment())
+                {
+                    for (int i = 0; i < segments.size(); i++)
+                    {
+                        SegmentMetadata metadata = segments.get(i);
+                        boolean isLastSegment = i == segments.size() - 1;
+
+                        validateSegment(component, metadata, isLastSegment, checksum, false);
+                    }
+                }
+                else
+                {
+                    validateComponent(component, checksum);
+                }
+            }
+        }
+    }
+
+    @SuppressWarnings("resource")
+    private void validateSegment(IndexComponent component, SegmentMetadata metadata, boolean isLastSegment, boolean checksum, boolean isEncrypted) throws IOException
+    {
+        long offset = metadata.getIndexOffset(component);
+        long length = metadata.getIndexLength(component);
+
+        try (IndexInput input = openBlockingInput(component))
+        {
+            // Even if the component is encryptable, we still need to check for corruption if it isn't actually encrypted:
+            if (!component.ndiType.encryptable() || !isEncrypted)
+            {
+                // Make sure there isn't any data appended incorrectly after the official end of the file:
+                if (isLastSegment && input.length() != offset + length)
+                {
+                    String message = logMessage(String.format("Corrupted last segment! offset (%d) + length (%d) != file "+
+                            "length (%s) isEncrypted: %s component.ndiType.encryptable: %s component.ndiType: %s",
+                            offset, length, input.length(), isEncrypted, component.ndiType.encryptable(), component.ndiType.toString()));
+                    throw new CorruptIndexException(message, descriptor.toString());
+                }
+            }
+
+            IndexInput slice = input.slice(String.format("%s with offset=%d and length=%d]", input.toString(), offset, length), offset, length);
+
+            if (checksum)
+                SAICodecUtils.validateChecksum(slice);
+            else
+                SAICodecUtils.validate(slice);
+        }
+        catch (IOException e)
+        {
+            if (logger.isDebugEnabled())
+            {
+                logger.debug(logMessage("Per-segment {} validation failed for index component {} on SSTable {}"), (checksum ? "checksum " : ""), component, descriptor);
+            }
+            throw e;
+        }
+    }
+
+    private void validateComponent(IndexComponent component, boolean checksum) throws IOException
+    {
+        if (!component.ndiType.completionMarker())
+        {
+            try (IndexInput input = openBlockingInput(component))
+            {
+                if (checksum)
+                    SAICodecUtils.validateChecksum(input);
+                else
+                    SAICodecUtils.validate(input);
+            }
+            catch (IOException e)
+            {
+                if (logger.isDebugEnabled())
+                {
+                    logger.debug(logMessage("{} failed for index component {} on SSTable {}"), (checksum ? "Checksum validation" : "Validation"), component, descriptor);
+                }
+                throw e;
+            }
+        }
+    }
+
+
+    interface ChecksumWriter
+    {
+        long getChecksum();
+    }
+
+//    class EncryptedIncrementalChecksumSequentialWriter extends EncryptedSequentialWriter implements ChecksumWriter
+//    {
+//        private final CRC32 checksum = new CRC32();
+//
+//        EncryptedIncrementalChecksumSequentialWriter(File file, ICompressor encryptor)
+//        {
+//            super(file, writerOption, encryptor);
+//        }
+//
+//        @Override
+//        public void writeByte(int b) throws IOException
+//        {
+//            super.writeByte(b);
+//            checksum.update(b);
+//        }
+//
+//        @Override
+//        public void write(byte[] b) throws IOException
+//        {
+//            super.write(b);
+//            checksum.update(b);
+//        }
+//
+//        @Override
+//        public void write(byte[] b, int off, int len) throws IOException
+//        {
+//            super.write(b, off, len);
+//            checksum.update(b, off, len);
+//        }
+//
+//        public long getChecksum()
+//        {
+//            return checksum.getValue();
+//        }
+//    }
+
+    class IncrementalChecksumSequentialWriter extends SequentialWriter implements ChecksumWriter
+    {
+        private final CRC32 checksum = new CRC32();
+
+        IncrementalChecksumSequentialWriter(File file)
+        {
+            super(file, writerOption);
+        }
+
+        @Override
+        public void writeByte(int b) throws IOException
+        {
+            super.writeByte(b);
+            checksum.update(b);
+        }
+
+        @Override
+        public void write(byte[] b) throws IOException
+        {
+            super.write(b);
+            checksum.update(b);
+        }
+
+        @Override
+        public void write(byte[] b, int off, int len) throws IOException
+        {
+            super.write(b, off, len);
+            checksum.update(b, off, len);
+        }
+
+        public long getChecksum()
+        {
+            return checksum.getValue();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/IndexInputReader.java b/src/java/org/apache/cassandra/index/sai/disk/io/IndexInputReader.java
new file mode 100644
index 000000000000..1ad929fa3bcb
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/IndexInputReader.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+
+import java.io.IOException;
+
+import org.apache.cassandra.io.compress.CorruptBlockException;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+
+public class IndexInputReader extends IndexInput
+{
+    private final RandomAccessReader input;
+    private final Runnable doOnClose;
+
+    private IndexInputReader(RandomAccessReader input, Runnable doOnClose)
+    {
+        super(input.getPath());
+        this.input = input;
+        this.doOnClose = doOnClose;
+    }
+
+    public static IndexInputReader create(RandomAccessReader input)
+    {
+        return new IndexInputReader(input, () -> {});
+    }
+
+    static IndexInputReader create(RandomAccessReader input, Runnable doOnClose)
+    {
+        return new IndexInputReader(input, doOnClose);
+    }
+
+    @SuppressWarnings("resource")
+    static IndexInputReader create(FileHandle handle)
+    {
+        RandomAccessReader reader = handle.createReader();
+        return new IndexInputReader(reader, () -> {});
+    }
+
+    public RandomAccessReader reader()
+    {
+        return input;
+    }
+
+    @Override
+    public byte readByte() throws IOException
+    {
+        return input.readByte();
+    }
+
+    @Override
+    public void readBytes(byte[] bytes, int off, int len) throws IOException
+    {
+        try
+        {
+            input.readFully(bytes, off, len);
+        }
+        catch (CorruptBlockException ex)
+        {
+            throw new CorruptIndexException("Corrupted block", input.getPath(), ex); // (message, resource, cause)
+        }
+    }
+
+    /**
+     * Using {@link RandomAccessReader#readShort()} directly is faster than {@link DataInput#readShort()} which calls
+     * {@link DataInput#readByte()} one by one
+     */
+    @Override
+    public short readShort() throws IOException
+    {
+        try
+        {
+            return input.readShort();
+        }
+        catch (CorruptBlockException ex)
+        {
+            throw new CorruptIndexException("Corrupted block", input.getPath(), ex); // (message, resource, cause)
+        }
+    }
+
+    /**
+     * Using {@link RandomAccessReader#readInt()} directly is faster than {@link DataInput#readInt()} which
+     * calls {@link DataInput#readByte()} one by one
+     */
+    @Override
+    public int readInt() throws IOException
+    {
+        try
+        {
+            return input.readInt();
+        }
+        catch (CorruptBlockException ex)
+        {
+            throw new CorruptIndexException("Corrupted block", input.getPath(), ex); // (message, resource, cause)
+        }
+    }
+
+    /**
+     * Using {@link RandomAccessReader#readLong()} directly is faster than {@link DataInput#readLong()} which
+     * calls {@link DataInput#readByte()} one by one
+     */
+    @Override
+    public long readLong() throws IOException
+    {
+        try
+        {
+            return input.readLong();
+        }
+        catch (CorruptBlockException ex)
+        {
+            throw new CorruptIndexException("Corrupted block", input.getPath(), ex); // (message, resource, cause)
+        }
+    }
+
+    @Override
+    public void close()
+    {
+        try
+        {
+            input.close();
+        }
+        finally
+        {
+            doOnClose.run();
+        }
+    }
+
+    @Override
+    public long getFilePointer()
+    {
+        return input.getFilePointer();
+    }
+
+    @Override
+    public void seek(long position)
+    {
+        input.seek(position);
+    }
+
+    @Override
+    public long length()
+    {
+        return input.length();
+    }
+
+    @Override
+    public IndexInput slice(String sliceDescription, long offset, long length) throws CorruptIndexException
+    {
+        // Compare length with the bytes remaining after offset; offset + length could overflow and slip past the check.
+        if (offset < 0 || length < 0 || length > input.length() - offset)
+            throw new CorruptIndexException("Invalid slice! Offset: " + offset + ", Length: " + length + ", Input Length: " + input.length(), this);
+
+        // The slice shares this reader and close hook; only positions are translated by offset.
+        return new IndexInputReader(input, doOnClose)
+        {
+            @Override
+            public void seek(long position)
+            {
+                input.seek(position + offset);
+            }
+
+            @Override
+            public long getFilePointer()
+            {
+                return input.getFilePointer() - offset;
+            }
+
+            @Override
+            public long length()
+            {
+                return length;
+            }
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/IndexOutputWriter.java b/src/java/org/apache/cassandra/index/sai/disk/io/IndexOutputWriter.java
new file mode 100644
index 000000000000..f40bb8e68205
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/IndexOutputWriter.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+
+import com.google.common.base.MoreObjects;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.lucene.store.IndexOutput;
+
+/** Lucene {@link IndexOutput} that forwards all writes to a wrapped {@link SequentialWriter}. */
+public class IndexOutputWriter extends IndexOutput
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final SequentialWriter out;
+    private boolean closed; // set by the first close() call; later calls are no-ops
+
+    IndexOutputWriter(SequentialWriter out)
+    {
+        super(out.getPath(), out.getPath());
+        this.out = out;
+    }
+
+    public void skipBytes(long length) throws IOException
+    {
+        this.out.skipBytes(length);
+    }
+
+    public String getPath()
+    {
+        return out.getPath();
+    }
+
+    @Override
+    public long getChecksum()
+    {
+        return ((IndexComponents.ChecksumWriter)out).getChecksum(); // ClassCastException unless the writer is a ChecksumWriter
+    }
+
+    @Override
+    public long getFilePointer()
+    {
+        return out.position();
+    }
+
+    @Override
+    public void writeBytes(byte[] bytes, int offset, int len) throws IOException
+    {
+        out.write(bytes, offset, len);
+    }
+
+    @Override
+    public void writeByte(byte b) throws IOException
+    {
+        out.writeByte(b);
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        // IndexOutput#close contract allows any output to be closed multiple times,
+        // and Lucene does it in few places. SequentialWriter can be closed once.
+        if (!closed)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Closing index output: {}", this);
+
+            // Mark as closed before delegating, so a failed close is never retried on the underlying writer.
+            closed = true;
+            // The writer should sync its contents to disk before closing...
+            out.close();
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("path", out.getPath())
+                          .add("bytesWritten", getFilePointer())
+                          .add("crc", getChecksum())
+                          .toString();
+    }
+
+    /**
+     * Returns {@link SequentialWriter} associated with this writer. Convenient when interacting with DSE-DB codebase to
+     * write files to disk. Note that all bytes written to the returned writer will still contribute to the checksum.
+     *
+     * @return {@link SequentialWriter} associated with this writer
+     */
+    public SequentialWriter asSequentialWriter()
+    {
+        return out;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/io/RAMIndexOutput.java b/src/java/org/apache/cassandra/index/sai/disk/io/RAMIndexOutput.java
new file mode 100644
index 000000000000..b8abd40e4cba
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/io/RAMIndexOutput.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+
+// In-memory IndexOutput over a growable byte array. Use in place of RAMOutputStream (which has monitor locks)
+public class RAMIndexOutput extends IndexOutput
+{
+    protected final GrowableByteArrayDataOutput out;
+
+    public RAMIndexOutput(String name)
+    {
+        super("", name);
+        out = new GrowableByteArrayDataOutput(128);
+    }
+
+    public byte[] getBytes()
+    {
+        return out.getBytes(); // backing array; writeTo() treats only the first getFilePointer() bytes as content
+    }
+
+    @Override
+    public long getChecksum() throws IOException
+    {
+        throw new UnsupportedOperationException(); // no checksum is tracked for the in-memory buffer
+    }
+
+    @Override
+    public long getFilePointer()
+    {
+        return out.getPosition();
+    }
+
+    @Override
+    public void writeByte(byte b)
+    {
+        out.writeByte(b);
+    }
+
+    @Override
+    public void writeBytes(byte[] bytes, int offset, int len)
+    {
+        out.writeBytes(bytes, offset, len);
+    }
+
+    public void writeTo(IndexOutput externalOut) throws IOException
+    {
+        externalOut.writeBytes(out.getBytes(), 0, out.getPosition()); // copies what was written; no reset happens here
+    }
+
+    public void reset()
+    {
+        out.reset(); // NOTE(review): presumably rewinds the buffer for reuse - confirm against GrowableByteArrayDataOutput
+    }
+
+    @Override
+    public void close() throws IOException {} // nothing to release
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedReader.java
new file mode 100644
index 000000000000..65f36d1cfc43
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedReader.java
@@ -0,0 +1,233 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.SeekingRandomAccessInput;
+import org.apache.lucene.store.IndexInput;
+
+public abstract class AbstractBlockPackedReader implements LongArray
+{
+    private final int blockShift, blockMask; // index >>> blockShift selects the block, index & blockMask the slot in it
+    private final int blockSize;
+    private final long valueCount;
+    final byte[] blockBitsPerValue; // package protected for test access
+    private final SeekingRandomAccessInput input;
+
+    private long prevTokenValue = Long.MIN_VALUE; // last target passed to findTokenRowID; targets must not decrease
+    private long lastIndex; // the last index visited by token -> row ID searches
+
+    AbstractBlockPackedReader(IndexInput indexInput, byte[] blockBitsPerValue, int blockShift, int blockMask, long sstableRowId, long valueCount)
+    {
+        this.blockShift = blockShift;
+        this.blockMask = blockMask;
+        this.blockSize = blockMask + 1;
+        this.valueCount = valueCount;
+        this.input = new SeekingRandomAccessInput(indexInput);
+        this.blockBitsPerValue = blockBitsPerValue;
+        // start searching tokens from current index segment
+        this.lastIndex = sstableRowId;
+    }
+
+    protected abstract long blockOffsetAt(int block); // offset handed to the block's sub-reader in get()
+
+    @Override
+    public long get(final long index)
+    {
+        if (index < 0 || index >= valueCount)
+        {
+            throw new IndexOutOfBoundsException(String.format("Index should be between [0, %d), but was %d.", valueCount, index));
+        }
+
+        final int block = (int) (index >>> blockShift);
+        final int idx = (int) (index & blockMask);
+        final DirectReaders.Reader subReader = DirectReaders.getReaderForBitsPerValue(blockBitsPerValue[block]);
+        return delta(block, idx) + subReader.get(input, blockOffsetAt(block), idx);
+    }
+
+    @Override
+    public long findTokenRowID(long targetValue)
+    {
+        // already out of range
+        if (lastIndex >= valueCount)
+            return -1;
+
+        // We keep track of the previously returned index in lastIndex, so searching backward would not return a
+        // correct result. It is also logically wrong to search backward during token iteration in PostingListRangeIterator.
+        if (targetValue < prevTokenValue)
+            throw new IllegalArgumentException(String.format("%d is smaller than prev token value %d", targetValue, prevTokenValue));
+        prevTokenValue = targetValue;
+
+        int blockIndex = binarySearchBlockMinValues(targetValue);
+
+        // We need to check next block's min value on an exact match.
+        boolean exactMatch = blockIndex >= 0;
+
+        if (blockIndex < 0)
+        {
+            // A non-exact match, which is the negative index of the first value greater than the target.
+            // For example, searching for 4 against min values [3,3,5,7] produces -2, which we convert to 2.
+            blockIndex = -blockIndex;
+        }
+
+        if (blockIndex > 0)
+        {
+            // Start at the previous block, because there could be duplicate values in the previous block.
+            // For example, with block 1: [1,2,3,3] & block 2: [3,3,5,7], binary search for 3 would find
+            // block 2, but we need to start from block 1 and search both.
+            // In case non-exact match, we need to pivot left as target is less than next block's min.
+            blockIndex--;
+        }
+
+        // Find the global (not block-specific) index of the target token, which is equivalent to its row ID:
+        lastIndex = findBlockRowID(targetValue, blockIndex, exactMatch);
+        return lastIndex >= valueCount ? -1 : lastIndex; // -1 when the search ran past the last value
+    }
+
+    /**
+     * Searches the blocks from the one holding lastIndex onwards, comparing the target with delta(block, 0).
+     * @return a non-negative block index for an exact match, or the negated index (possibly 0) for a non-exact match
+     */
+    private int binarySearchBlockMinValues(long targetValue)
+    {
+        int high = Math.toIntExact(blockBitsPerValue.length) - 1;
+
+        // Assume here that we'll never move backward through the blocks:
+        int low = Math.toIntExact(lastIndex >> blockShift);
+
+        // Short-circuit the search if the target is in current block:
+        if (low + 1 <= high)
+        {
+            long cmp = Long.compare(targetValue, delta(low + 1, 0));
+
+            if (cmp == 0)
+            {
+                // We have an exact match, so return the index of the next block, which means we'll start
+                // searching from the current one and also inspect the first value of the next block.
+                return low + 1;
+            }
+            else if (cmp < 0)
+            {
+                // We're in the same block. Indicate a non-exact match, and this value will be both
+                // negated and then decremented to wind up at the current value of "low" here.
+                return -low - 1;
+            }
+
+            // The target is greater than the next block's min value, so advance to that
+            // block before starting the usual search...
+            low++;
+        }
+
+        while (low <= high)
+        {
+            int mid = low + ((high - low) >> 1);
+
+            long midVal = delta(mid, 0);
+
+            if (midVal < targetValue)
+            {
+                low = mid + 1;
+            }
+            else if (midVal > targetValue)
+            {
+                high = mid - 1;
+            }
+            else
+            {
+                // target found, but we need to check for duplicates
+                if (mid > 0 && delta(mid - 1, 0) == targetValue)
+                {
+                    // there are duplicates, pivot left
+                    high = mid - 1;
+                }
+                else
+                {
+                    // no duplicates
+                    return mid;
+                }
+            }
+        }
+
+        return -low; // no exact match found (when low is still 0 this is 0, which the caller reads as an exact match)
+    }
+
+    private long findBlockRowID(long targetValue, long blockIdx, boolean exactMatch)
+    {
+        // Calculate the global offset for the selected block:
+        long offset = blockIdx << blockShift;
+
+        // Resume from previous index if it's larger than offset
+        long low = Math.max(lastIndex, offset);
+
+        // The high is either the last local index in the block, or something smaller if the block isn't full:
+        long high = Math.min(offset + blockSize - 1 + (exactMatch ? 1 : 0), valueCount - 1);
+
+        return binarySearchBlock(targetValue, low, high);
+    }
+
+    /**
+     * binary search target value between low and high.
+     *
+     * @return index if exact match is found, or *positive* insertion point if no exact match is found.
+     */
+    private long binarySearchBlock(long target, long low, long high)
+    {
+        while (low <= high)
+        {
+            long mid = low + ((high - low) >> 1);
+
+            long midVal = get(mid);
+
+            if (midVal < target)
+            {
+                low = mid + 1;
+                // future rowId cannot be smaller than mid as long as next token not smaller than current token.
+                lastIndex = mid;
+            }
+            else if (midVal > target)
+            {
+                high = mid - 1;
+            }
+            else
+            {
+                // target found, but we need to check for duplicates
+                if (mid > 0 && get(mid - 1) == target)
+                {
+                    // there are duplicates, pivot left
+                    high = mid - 1;
+                }
+                else
+                {
+                    // exact match and no duplicates
+                    return mid;
+                }
+            }
+        }
+
+        // target not found
+        return low;
+    }
+
+    @Override
+    public long length()
+    {
+        return valueCount;
+    }
+
+    abstract long delta(int block, int idx); // added to the packed value at (block, idx) in get()
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedWriter.java
new file mode 100644
index 000000000000..ca8a0df6c3ea
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/AbstractBlockPackedWriter.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.checkBlockSize;
+
+/**
+ * Modified copy of {@link org.apache.lucene.util.packed.AbstractBlockPackedWriter} to use {@link DirectWriter} for
+ * optimised reads that don't require seeking through the whole file to open a thread-exclusive reader.
+ */
+abstract class AbstractBlockPackedWriter
+{
+    static final int MIN_BLOCK_SIZE = 64;
+    static final int MAX_BLOCK_SIZE = 1 << (30 - 3);
+    static final int MIN_VALUE_EQUALS_0 = 1;
+    static final int BPV_SHIFT = 1;
+
+    protected final IndexOutput out;
+    protected final long[] values; // buffer for the current block; holds blockSize entries
+    protected int off; // number of values currently buffered in 'values'
+    protected boolean finished;
+    
+    final RAMIndexOutput blockMetaWriter; // in-memory block metadata, appended to 'out' by finish()
+
+    AbstractBlockPackedWriter(IndexOutput out, int blockSize)
+    {
+        checkBlockSize(blockSize, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
+        this.out = out;
+        this.blockMetaWriter = new RAMIndexOutput("NumericValuesMeta");
+        values = new long[blockSize];
+    }
+
+    private void checkNotFinished()
+    {
+        if (finished)
+        {
+            throw new IllegalStateException(String.format("[%s] Writer already finished!", out.getName()));
+        }
+    }
+
+    /**
+     * Append a new long.
+     */
+    public void add(long l) throws IOException
+    {
+        checkNotFinished();
+        if (off == values.length)
+        {
+            flush();
+        }
+        values[off++] = l;
+    }
+
+    /**
+     * Flush all buffered data to disk. This instance is not usable anymore
+     * after this method has been called.
+     *
+     * @return a file offset to the block metadata
+     */
+    public long finish() throws IOException
+    {
+        checkNotFinished();
+        if (off > 0)
+        {
+            flush();
+        }
+        final long fp = out.getFilePointer();
+        blockMetaWriter.writeTo(out);
+        finished = true;
+        return fp;
+    }
+
+    protected abstract void flush() throws IOException;
+
+    void writeValues(int numValues, int bitsPerValue) throws IOException
+    {
+        final DirectWriter writer = DirectWriter.getInstance(out, numValues, bitsPerValue);
+        for (int i = 0; i < numValues; ++i)
+        {
+            writer.add(values[i]);
+        }
+        writer.finish();
+    }
+
+    // Variable-length encoding: at most 8 continuation bytes of 7 bits each, then one final byte with the rest.
+    void writeVLong(IndexOutput out, long i) throws IOException
+    {
+        int k = 0;
+        while ((i & ~0x7FL) != 0L && k++ < 8)
+        {
+            out.writeByte((byte) ((i & 0x7FL) | 0x80L));
+            i >>>= 7;
+        }
+        out.writeByte((byte) i);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BKDPostingsIndex.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDPostingsIndex.java
new file mode 100644
index 000000000000..6d5e6d6f8406
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDPostingsIndex.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import com.carrotsearch.hppc.IntLongHashMap;
+import com.carrotsearch.hppc.IntLongMap;
+import org.apache.cassandra.index.sai.disk.io.IndexInputReader;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.github.jamm.MemoryLayoutSpecification;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.validate;
+
+/**
+ * Mapping between a node ID and the offset of its auxiliary posting list (containing every row ID from all leaves
+ * reachable from that node). See {@link OneDimBKDPostingsWriter}.
+ */
+class BKDPostingsIndex
+{
+    private final int size; // number of entries as recorded in the postings file
+    public final IntLongMap index = new IntLongHashMap();
+
+    @SuppressWarnings("resource")
+    BKDPostingsIndex(FileHandle postingsFileHandle, long filePosition) throws IOException
+    {
+        try (final RandomAccessReader reader = postingsFileHandle.createReader())
+        {
+            final IndexInputReader input = IndexInputReader.create(reader);
+            validate(input);
+            input.seek(filePosition);
+
+            size = input.readVInt();
+
+            for (int x = 0; x < size; x++)
+            {
+                final int node = input.readVInt();
+                final long filePointer = input.readVLong();
+
+                index.put(node, filePointer);
+            }
+        }
+    }
+
+    public long memoryUsage()
+    {
+        // IntLongHashMap uses two arrays: one for keys, one for values. Estimated from entry count, not capacity.
+        return MemoryLayoutSpecification.sizeOfArray(index.size(), 4L)
+               + MemoryLayoutSpecification.sizeOfArray(index.size(), 8L);
+    }
+
+    /**
+     * Returns {@code true} if given node ID has an auxiliary posting list.
+     */
+    boolean exists(int nodeID)
+    {
+        checkArgument(nodeID > 0);
+        return index.containsKey(nodeID);
+    }
+
+    /**
+     * Returns an offset within the bkd postings file to the beginning of the blocks summary of given node's auxiliary
+     * posting list.
+     *
+     * @throws IllegalArgumentException when given nodeID doesn't have an auxiliary posting list. Check first with
+     * {@link #exists(int)}
+     */
+    long getPostingsFilePointer(int nodeID)
+    {
+        checkArgument(exists(nodeID));
+        return index.get(nodeID);
+    }
+
+    int size()
+    {
+        return size;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BKDReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDReader.java
new file mode 100644
index 000000000000..9a5e7452fc15
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDReader.java
@@ -0,0 +1,809 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Stopwatch;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.agrona.collections.LongArrayList;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.utils.AbortedOperationException;
+import org.apache.cassandra.index.sai.utils.AbstractIterator;
+import org.apache.cassandra.index.sai.utils.SeekingRandomAccessInput;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.FutureArrays;
+import org.apache.lucene.util.packed.DirectWriter;
+
+/**
+ * Handles intersection of a multi-dimensional shape in byte[] space with a block KD-tree previously written with
+ * {@link BKDWriter}.
+ */
+public class BKDReader extends TraversingBKDReader implements Closeable
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private static final Comparator<PostingList.PeekablePostingList> COMPARATOR = Comparator.comparingLong(PostingList.PeekablePostingList::peek);
+
+    private final FileHandle postingsFile, kdtreeFile;
+    private final BKDPostingsIndex postingsIndex; // node ID -> offset of that node's posting list in postingsFile
+    private final ICompressor compressor; // readLeaf only decompresses leaf values when this is non-null
+    private final DirectReaders.Reader leafOrderMapReader;
+
+    /**
+     * Performs a blocking read: the postings index is loaded from {@code postingsFile} during construction.
+     */
+    public BKDReader(IndexComponents indexComponents,
+                     FileHandle kdtreeFile,
+                     long bkdIndexRoot,
+                     FileHandle postingsFile,
+                     long bkdPostingsRoot) throws IOException
+    {
+        super(indexComponents, kdtreeFile, bkdIndexRoot);
+        this.postingsFile = postingsFile;
+        this.kdtreeFile = kdtreeFile;
+        this.postingsIndex = new BKDPostingsIndex(postingsFile, bkdPostingsRoot);
+        this.compressor = null; // readers built this way never take the decompression branch in readLeaf
+        final byte bits = (byte) DirectWriter.unsignedBitsRequired(maxPointsInLeafNode - 1);
+        leafOrderMapReader = DirectReaders.getReaderForBitsPerValue(bits);
+    }
+
+    public interface DocMapper
+    {
+        long oldToNew(long rowID); // applied by IteratorState to every row ID read from a leaf before it is returned
+    }
+
+    private TreeMap<Long,Integer> getLeafOffsets()
+    {
+        // Leaf block file pointer -> node ID, sorted by file pointer.
+        final TreeMap<Long,Integer> map = new TreeMap<>();
+        getLeafOffsets(new PackedIndexTree(), map);
+        return map;
+    }
+
+    private void getLeafOffsets(final IndexTree index, TreeMap<Long, Integer> map)
+    {
+        // Inner node: visit the left subtree, then the right one, popping the cursor back after each.
+        if (!index.isLeafNode())
+        {
+            index.pushLeft();
+            getLeafOffsets(index, map);
+            index.pop();
+
+            index.pushRight();
+            getLeafOffsets(index, map);
+            index.pop();
+            return;
+        }
+
+        // Leaf: record its block file pointer against its node ID. Leaves that do not exist are skipped.
+        if (!index.nodeExists())
+            return;
+        map.put(index.getLeafBlockFP(), index.getNodeID());
+    }
+
+    @VisibleForTesting
+    public IteratorState iteratorState() throws IOException
+    {
+        return new IteratorState((rowID) -> rowID); // identity mapping: row IDs are returned unchanged
+    }
+
+    public IteratorState iteratorState(DocMapper docMapper) throws IOException
+    {
+        return new IteratorState(docMapper); // opens its own inputs, which IteratorState#close releases
+    }
+
+    public class IteratorState extends AbstractIterator<Long> implements Comparable<IteratorState>, Closeable
+    {
+        final IndexInput bkdInput;
+        final IndexInput bkdPostingsInput;
+        final byte[] packedValues = new byte[maxPointsInLeafNode * packedBytesLength];
+        private int leaf, leafPointCount, leafPointIndex = -1; // leaf ordinal, points in the loaded leaf, cursor in it
+        final LongArrayList tempPostings = new LongArrayList();
+        final long[] postings = new long[maxPointsInLeafNode];
+        final DocMapper docMapper;
+        public final byte[] scratch; // packed value of the point most recently returned by computeNext()
+        final Iterator<Map.Entry<Long,Integer>> iterator; // leaves in ascending block file pointer order
+
+        public IteratorState(DocMapper docMapper) throws IOException
+        {
+            this.docMapper = docMapper;
+            scratch = new byte[packedBytesLength];
+
+            final long firstLeafFilePointer = getMinLeafBlockFP();
+            bkdInput = indexComponents.openInput(kdtreeFile);
+            bkdPostingsInput = indexComponents.openInput(postingsFile);
+            try
+            {
+                bkdInput.seek(firstLeafFilePointer);
+
+                // init the first leaf
+                iterator = getLeafOffsets().entrySet().iterator();
+                final Map.Entry<Long,Integer> entry = iterator.next();
+                leafPointCount = readLeaf(entry.getKey(), entry.getValue(), bkdInput, packedValues, bkdPostingsInput, postings, tempPostings);
+            }
+            catch (Throwable t)
+            {
+                // Nobody can close() an instance whose construction failed, so release the inputs here.
+                FileUtils.closeQuietly(bkdInput, bkdPostingsInput);
+                throw t;
+            }
+        }
+
+        @Override
+        public void close()
+        {
+            FileUtils.closeQuietly(bkdInput, bkdPostingsInput);
+        }
+
+        @Override
+        public int compareTo(final IteratorState other)
+        {
+            final int cmp = FutureArrays.compareUnsigned(scratch, 0, packedBytesLength, other.scratch, 0, packedBytesLength);
+            if (cmp == 0) // equal packed values: fall back to comparing the iterators' 'next' values
+            {
+                final long rowid1 = next;
+                final long rowid2 = other.next;
+                return Long.compare(rowid1, rowid2);
+            }
+            return cmp;
+        }
+
+        @Override
+        protected Long computeNext()
+        {
+            if (leafPointIndex == leafPointCount - 1)
+            {
+                if (++leaf == numLeaves) // the loaded leaf is exhausted and it was the last one
+                    return endOfData();
+                final Map.Entry<Long, Integer> entry = iterator.next();
+                try
+                {
+                    leafPointCount = readLeaf(entry.getKey(), entry.getValue(), bkdInput, packedValues, bkdPostingsInput, postings, tempPostings);
+                }
+                catch (IOException e)
+                {
+                    logger.error("Failed to read leaf during BKDTree merger", e);
+                    throw new RuntimeException("Failed to read leaf during BKDTree merger", e);
+                }
+                leafPointIndex = -1;
+            }
+
+            leafPointIndex++;
+
+            System.arraycopy(packedValues, leafPointIndex * packedBytesLength, scratch, 0, packedBytesLength);
+            return docMapper.oldToNew(postings[leafPointIndex]);
+        }
+    }
+
+    /**
+     * Reads one leaf block: its packed values and the row IDs of its posting list.
+     * <p>
+     * Values are copied consecutively into {@code packedValues} in the order the leaf stores them.
+     * The leaf's posting list is read in full into {@code tempPostings} and then rearranged through
+     * the leaf's order map, so that {@code postings[x]} is the posting found at position
+     * {@code origIndex[x]} of that list.
+     *
+     * @param filePointer      position of the leaf block in {@code bkdInput}
+     * @param nodeID           node ID of the leaf, used to look up its posting list
+     * @param bkdInput         kd-tree input; it is repositioned by this call
+     * @param packedValues     destination for the leaf values, {@code packedBytesLength} bytes per value
+     * @param bkdPostingsInput input the leaf's posting list is read from
+     * @param postings         destination for the reordered row IDs
+     * @param tempPostings     scratch list; cleared and refilled by this call
+     * @return the number of points recorded in the leaf header
+     * @throws IOException if reading either input fails
+     * @throws IllegalStateException if the postings index has no entry for {@code nodeID}
+     */
+    @SuppressWarnings("resource")
+    public int readLeaf(long filePointer,
+                        int nodeID,
+                        final IndexInput bkdInput,
+                        final byte[] packedValues,
+                        final IndexInput bkdPostingsInput,
+                        long[] postings,
+                        LongArrayList tempPostings) throws IOException
+    {
+        bkdInput.seek(filePointer);
+        final int count = bkdInput.readVInt();
+        // loading doc ids occurred here prior
+        final int orderMapLength = bkdInput.readVInt();
+        final long orderMapPointer = bkdInput.getFilePointer();
+
+        // order of the values in the posting list
+        final short[] origIndex = new short[maxPointsInLeafNode];
+
+        final int[] commonPrefixLengths = new int[numDims];
+        final byte[] scratchPackedValue1 = new byte[packedBytesLength];
+
+        final SeekingRandomAccessInput randoInput = new SeekingRandomAccessInput(bkdInput);
+        for (int x = 0; x < count; x++)
+        {
+            origIndex[x] = (short) LeafOrderMap.getValue(randoInput, orderMapPointer, x, leafOrderMapReader);
+        }
+
+        // seek beyond the ordermap
+        bkdInput.seek(orderMapPointer + orderMapLength);
+
+        IndexInput leafInput = bkdInput;
+        if (compressor != null)
+        {
+            // byte arrays for the decompression of leaf values; only needed on this path
+            final BytesRef uncompBytes = new BytesRef(new byte[16]);
+            final BytesRef compBytes = new BytesRef(new byte[16]);
+
+            // This should not throw WouldBlockException, even though we're on a TPC thread, because the
+            // secret key used by the underlying encryptor should be loaded at reader construction time.
+            leafInput = CryptoUtils.uncompress(bkdInput, compressor, compBytes, uncompBytes);
+        }
+
+        // accepts every value and appends it to packedValues
+        final IntersectVisitor visitor = new IntersectVisitor() {
+            int i = 0;
+
+            @Override
+            public boolean visit(byte[] packedValue)
+            {
+                System.arraycopy(packedValue, 0, packedValues, i * packedBytesLength, packedBytesLength);
+                i++;
+                return true;
+            }
+
+            @Override
+            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
+                return Relation.CELL_CROSSES_QUERY;
+            }
+        };
+
+        visitDocValues(commonPrefixLengths, scratchPackedValue1, leafInput, count, visitor, null, origIndex);
+
+        if (!postingsIndex.exists(nodeID))
+        {
+            throw new IllegalStateException("Leaf node " + nodeID + " does not have kd-tree postings.");
+        }
+
+        final long pointer = postingsIndex.getPostingsFilePointer(nodeID);
+        final PostingsReader.BlocksSummary summary = new PostingsReader.BlocksSummary(bkdPostingsInput, pointer);
+        final PostingsReader postingsReader = new PostingsReader(bkdPostingsInput, summary, QueryEventListener.PostingListEventListener.NO_OP);
+
+        tempPostings.clear();
+
+        // gather the postings into tempPostings
+        while (true)
+        {
+            final long rowid = postingsReader.nextPosting();
+            if (rowid == PostingList.END_OF_STREAM) break;
+            tempPostings.add(rowid);
+        }
+
+        // put the postings into the array according to the origIndex
+        for (int x = 0; x < tempPostings.size(); x++)
+        {
+            postings[x] = tempPostings.get(origIndex[x]);
+        }
+
+        return count;
+    }
+
+    /**
+     * @return the per-index file count attributed to this reader: one for the kd-tree file
+     *         and one for the posting lists file
+     */
+    public static int openPerIndexFiles()
+    {
+        // kd-tree, posting lists file
+        return 2;
+    }
+
+    /**
+     * Closes this reader: first whatever {@code super.close()} releases, then {@code postingsFile}.
+     */
+    @Override
+    public void close()
+    {
+        try
+        {
+            super.close();
+        }
+        finally
+        {
+            // runs even when super.close() throws, so the postings file is closed either way
+            postingsFile.close();
+        }
+    }
+
+    /**
+     * Intersects the tree with the query described by {@code visitor}.
+     * <p>
+     * The whole tree is first compared against the query using the tree-wide min/max values. If it
+     * lies outside the query nothing is opened. If it lies fully inside, posting lists are collected
+     * without inspecting individual values; otherwise a {@link FilteringIntersection} filters the
+     * crossing leaves value by value.
+     *
+     * @param visitor  describes the query shape
+     * @param listener receives the intersection events
+     * @param context  query context, check-pointed while the tree is walked
+     * @return the matching postings, or {@code null} if the tree is outside the query or no
+     *         posting list was hit
+     */
+    @SuppressWarnings("resource")
+    public PostingList intersect(IntersectVisitor visitor, QueryEventListener.BKDIndexEventListener listener, QueryContext context)
+    {
+        Relation relation = visitor.compare(minPackedValue, maxPackedValue);
+
+        if (relation == Relation.CELL_OUTSIDE_QUERY)
+        {
+            listener.onIntersectionEarlyExit();
+            return null;
+        }
+
+        listener.onSegmentHit();
+
+        IndexInput bkdInput = null;
+        IndexInput postingsInput = null;
+        IndexInput postingsSummaryInput = null;
+        final Intersection completable;
+        try
+        {
+            bkdInput = indexComponents.openInput(indexFile);
+            postingsInput = indexComponents.openInput(postingsFile);
+            postingsSummaryInput = indexComponents.openInput(postingsFile);
+            PackedIndexTree index = new PackedIndexTree();
+
+            completable = relation == Relation.CELL_INSIDE_QUERY
+                          ? new Intersection(bkdInput, postingsInput, postingsSummaryInput, index, listener, context)
+                          : new FilteringIntersection(bkdInput, postingsInput, postingsSummaryInput, index, visitor, listener, context);
+        }
+        catch (RuntimeException | Error e)
+        {
+            // execute() was never reached, so nothing else will close the inputs opened so far
+            if (bkdInput != null) FileUtils.closeQuietly(bkdInput);
+            if (postingsInput != null) FileUtils.closeQuietly(postingsInput);
+            if (postingsSummaryInput != null) FileUtils.closeQuietly(postingsSummaryInput);
+            throw e;
+        }
+
+        // from here on the Intersection closes the inputs itself, including on failure
+        return completable.execute();
+    }
+
+    /**
+     * Synchronous intersection of a multi-dimensional shape in byte[] space with a block KD-tree
+     * previously written with {@link BKDWriter}.
+     * <p>
+     * This base implementation does not look at individual values: starting from the current node
+     * of {@code index} it collects the posting list of every node that has one, descending into
+     * the children of nodes that do not.
+     */
+    class Intersection
+    {
+        // started when the instance is created, stopped in mergePostings()
+        private final Stopwatch queryExecutionTimer = Stopwatch.createStarted();
+        final QueryContext context;
+
+        final IndexInput bkdInput;
+        final IndexInput postingsInput;
+        final IndexInput postingsSummaryInput;
+        final IndexTree index;
+        final QueryEventListener.BKDIndexEventListener listener;
+
+        Intersection(IndexInput bkdInput, IndexInput postingsInput, IndexInput postingsSummaryInput,
+                     IndexTree index, QueryEventListener.BKDIndexEventListener listener, QueryContext context)
+        {
+            this.bkdInput = bkdInput;
+            this.postingsInput = postingsInput;
+            this.postingsSummaryInput = postingsSummaryInput;
+            this.index = index;
+            this.listener = listener;
+            this.context = context;
+        }
+
+        /**
+         * Walks the tree, then merges the collected posting lists.
+         *
+         * @return the merged postings, or {@code null} when no posting list was collected
+         */
+        public PostingList execute()
+        {
+            try
+            {
+                PriorityQueue<PostingList.PeekablePostingList> postingLists = new PriorityQueue<>(100, COMPARATOR);
+                executeInternal(postingLists);
+
+                // the kd-tree input is not used past this point; the postings inputs are handed to mergePostings()
+                FileUtils.closeQuietly(bkdInput);
+
+                return mergePostings(postingLists);
+            }
+            catch (Throwable t)
+            {
+                // aborted operations are not logged as errors
+                if (!(t instanceof AbortedOperationException))
+                    logger.error(indexComponents.logMessage("kd-tree intersection failed on {}"), indexFile.path(), t);
+
+                closeOnException();
+                throw Throwables.cleaned(t);
+            }
+        }
+
+        /**
+         * Tree traversal step of {@link #execute()}; subclasses override it to change how posting lists are collected.
+         */
+        protected void executeInternal(final PriorityQueue<PostingList.PeekablePostingList> postingLists) throws IOException
+        {
+            collectPostingLists(postingLists);
+        }
+
+        /**
+         * Closes all three inputs, called when {@link #execute()} fails.
+         */
+        protected void closeOnException()
+        {
+            FileUtils.closeQuietly(bkdInput, postingsInput, postingsSummaryInput);
+        }
+
+        /**
+         * Stops the timer, reports the elapsed time and the number of posting lists to the listener
+         * and merges the lists.
+         *
+         * @return {@code null} if {@code postingLists} is empty (the postings inputs are closed right
+         *         away), otherwise the merged list, which is given a callback closing both postings inputs
+         */
+        protected PostingList mergePostings(PriorityQueue<PostingList.PeekablePostingList> postingLists)
+        {
+            final long elapsedMicros = queryExecutionTimer.stop().elapsed(TimeUnit.MICROSECONDS);
+
+            listener.onIntersectionComplete(elapsedMicros, TimeUnit.MICROSECONDS);
+            listener.postingListsHit(postingLists.size());
+
+            if (postingLists.isEmpty())
+            {
+                FileUtils.closeQuietly(postingsInput, postingsSummaryInput);
+                return null;
+            }
+            else
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace(indexComponents.logMessage("[{}] Intersection completed in {} microseconds. {} leaf and internal posting lists hit."),
+                                 indexFile.path(), elapsedMicros, postingLists.size());
+                // NOTE(review): the callback is presumably run when the merged list is closed - confirm in MergePostingList
+                return MergePostingList.merge(postingLists, () -> FileUtils.close(postingsInput, postingsSummaryInput));
+            }
+        }
+
+        /**
+         * Adds the posting list of the current node of {@code index} if it has one, otherwise
+         * recurses into the left and right children.
+         *
+         * @throws IllegalStateException if a leaf without a posting list is reached
+         */
+        public void collectPostingLists(PriorityQueue<PostingList.PeekablePostingList> postingLists) throws IOException
+        {
+            context.checkpoint();
+
+            final int nodeID = index.getNodeID();
+
+            // if there is pre-built posting for entire subtree
+            if (postingsIndex.exists(nodeID))
+            {
+                postingLists.add(initPostingReader(postingsIndex.getPostingsFilePointer(nodeID)).peekable());
+                return;
+            }
+
+            Preconditions.checkState(!index.isLeafNode(), "Leaf node %s does not have kd-tree postings.", index.getNodeID());
+
+            // Recurse on left sub-tree:
+            index.pushLeft();
+            collectPostingLists(postingLists);
+            index.pop();
+
+            // Recurse on right sub-tree:
+            index.pushRight();
+            collectPostingLists(postingLists);
+            index.pop();
+        }
+
+        /**
+         * Creates a reader for the posting list at {@code offset}: the block summary is read
+         * through {@code postingsSummaryInput}, the postings themselves through {@code postingsInput}.
+         */
+        private PostingList initPostingReader(long offset) throws IOException
+        {
+            final PostingsReader.BlocksSummary summary = new PostingsReader.BlocksSummary(postingsSummaryInput, offset);
+            return new PostingsReader(postingsInput, summary, listener.postingListEventListener());
+        }
+    }
+
+    /**
+     * Modified copy of BKDReader#visitDocValues()
+     * <p>
+     * Reads the common prefixes and the compressed-dimension marker of a leaf, then hands the
+     * remaining values to the matching decoder.
+     *
+     * @return the count returned by the decoder that was used
+     */
+    private int visitDocValues(int[] commonPrefixLengths,
+                               byte[] scratchPackedValue1,
+                               IndexInput in,
+                               int count,
+                               IntersectVisitor visitor,
+                               FixedBitSet[] holder,
+                               final short[] origIndex) throws IOException
+    {
+        readCommonPrefixes(commonPrefixLengths, scratchPackedValue1, in);
+
+        final int compressedDim = readCompressedDim(in);
+        // -1 selects the raw decoder, any other value the one taking a compressed dimension
+        final boolean hasCompressedDim = compressedDim != -1;
+
+        return hasCompressedDim
+               ? visitCompressedDocValues(commonPrefixLengths, scratchPackedValue1, in, count, visitor, compressedDim, holder, origIndex)
+               : visitRawDocValues(commonPrefixLengths, scratchPackedValue1, in, count, visitor, holder, origIndex);
+    }
+
+    /**
+     * Modified copy of {@link org.apache.lucene.util.bkd.BKDReader#readCompressedDim(IndexInput)}
+     * <p>
+     * Reads the one-byte compressed-dimension marker and validates it.
+     *
+     * @return a value in the range {@code [-1, numDims)}
+     * @throws CorruptIndexException if the byte read is outside that range
+     */
+    @SuppressWarnings("JavadocReference")
+    private int readCompressedDim(IndexInput in) throws IOException
+    {
+        final int dim = in.readByte();
+        final boolean inRange = dim >= -1 && dim < numDims;
+        if (inRange)
+            return dim;
+
+        throw new CorruptIndexException(String.format("Dimension should be in the range [-1, %d), but was %d.", numDims, dim), in);
+    }
+
+    /**
+     * Modified copy of BKDReader#visitCompressedDocValues()
+     * <p>
+     * Decodes a leaf in which one byte of dimension {@code compressedDim} is run-length encoded
+     * and offers every decoded value to {@code visitor}.
+     *
+     * @param holder if non-null, {@code holder[0]} receives a bit set with a bit set at
+     *               {@code origIndex[position]} for every accepted value
+     * @return the number of values accepted by the visitor
+     * @throws CorruptIndexException if the runs do not add up to exactly {@code count} values
+     */
+    private int visitCompressedDocValues(int[] commonPrefixLengths,
+                                         byte[] scratchPackedValue,
+                                         IndexInput in,
+                                         int count,
+                                         IntersectVisitor visitor,
+                                         int compressedDim,
+                                         FixedBitSet[] holder,
+                                         final short[] origIndex) throws IOException
+    {
+        // the byte at `runByteOffset` is compressed using run-length compression,
+        // other suffix bytes are stored verbatim
+        final int runByteOffset = compressedDim * bytesPerDim + commonPrefixLengths[compressedDim];
+        // the run byte is treated as part of the prefix while the suffixes are read
+        commonPrefixLengths[compressedDim]++;
+
+        final FixedBitSet matches = holder != null ? new FixedBitSet(maxPointsInLeafNode) : null;
+
+        int consumed = 0;
+        int accepted = 0;
+        while (consumed < count)
+        {
+            scratchPackedValue[runByteOffset] = in.readByte();
+            final int runLength = Byte.toUnsignedInt(in.readByte());
+
+            for (int k = 0; k < runLength; k++)
+            {
+                for (int dim = 0; dim < numDims; dim++)
+                {
+                    final int prefixLength = commonPrefixLengths[dim];
+                    in.readBytes(scratchPackedValue, dim * bytesPerDim + prefixLength, bytesPerDim - prefixLength);
+                }
+
+                final int ordinal = origIndex[consumed + k];
+                if (!visitor.visit(scratchPackedValue))
+                    continue;
+
+                if (matches != null)
+                    matches.set(ordinal);
+                accepted++;
+            }
+
+            consumed += runLength;
+        }
+
+        if (consumed != count)
+            throw new CorruptIndexException(String.format("Expected %d sub-blocks but read %d.", count, consumed), in);
+
+        if (holder != null)
+            holder[0] = matches;
+
+        return accepted;
+    }
+
+    /**
+     * Modified copy of BKDReader#visitRawDocValues()
+     * <p>
+     * Decodes a leaf whose value suffixes are stored verbatim and offers every decoded value to
+     * {@code visitor}.
+     *
+     * @param holder if non-null, {@code holder[0]} receives a bit set with a bit set at
+     *               {@code origIndex[position]} for every accepted value
+     * @return the number of values accepted by the visitor
+     */
+    private int visitRawDocValues(int[] commonPrefixLengths,
+                                  byte[] scratchPackedValue,
+                                  IndexInput in,
+                                  int count,
+                                  IntersectVisitor visitor,
+                                  FixedBitSet[] holder,
+                                  final short[] origIndex) throws IOException
+    {
+        final boolean trackMatches = holder != null;
+        final FixedBitSet matches = trackMatches ? new FixedBitSet(maxPointsInLeafNode) : null;
+
+        int accepted = 0;
+        for (int point = 0; point < count; point++)
+        {
+            // the prefix bytes are already in the scratch buffer; only the suffix of each dimension is read
+            for (int dim = 0; dim < numDims; dim++)
+            {
+                final int prefixLength = commonPrefixLengths[dim];
+                in.readBytes(scratchPackedValue, dim * bytesPerDim + prefixLength, bytesPerDim - prefixLength);
+            }
+
+            final int ordinal = origIndex[point];
+            if (!visitor.visit(scratchPackedValue))
+                continue;
+
+            if (trackMatches)
+                matches.set(ordinal);
+            accepted++;
+        }
+
+        if (trackMatches)
+            holder[0] = matches;
+
+        return accepted;
+    }
+
+    /**
+     * Copy of BKDReader#readCommonPrefixes()
+     * <p>
+     * For every dimension reads the prefix length into {@code commonPrefixLengths} and, when it
+     * is non-zero, the prefix bytes into that dimension's slot of {@code scratchPackedValue}.
+     */
+    private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException
+    {
+        int dim = 0;
+        while (dim < numDims)
+        {
+            final int prefixLength = in.readVInt();
+            commonPrefixLengths[dim] = prefixLength;
+
+            // nothing is read for a zero-length prefix
+            if (prefixLength > 0)
+                in.readBytes(scratchPackedValue, dim * bytesPerDim, prefixLength);
+
+            dim++;
+        }
+    }
+
+    /**
+     * Intersection that compares every visited cell against the query of an {@link IntersectVisitor}:
+     * cells outside the query are skipped, cells inside it are collected as a whole by the parent
+     * class, and leaves crossing it are filtered value by value.
+     */
+    private class FilteringIntersection extends Intersection
+    {
+        private final IntersectVisitor visitor;
+        // scratch state reused for every leaf that is filtered
+        private final byte[] scratchPackedValue1;
+        private final int[] commonPrefixLengths;
+        private final short[] origIndex;
+
+        // reused byte arrays for the decompression of leaf values
+        private final BytesRef uncompBytes = new BytesRef(new byte[16]);
+        private final BytesRef compBytes = new BytesRef(new byte[16]);
+
+        FilteringIntersection(IndexInput bkdInput, IndexInput postingsInput, IndexInput postingsSummaryInput,
+                              IndexTree index, IntersectVisitor visitor,
+                              QueryEventListener.BKDIndexEventListener listener, QueryContext context)
+        {
+            super(bkdInput, postingsInput, postingsSummaryInput, index, listener, context);
+            this.visitor = visitor;
+            this.commonPrefixLengths = new int[numDims];
+            this.scratchPackedValue1 = new byte[packedBytesLength];
+            this.origIndex = new short[maxPointsInLeafNode];
+        }
+
+        /**
+         * Starts the filtering traversal with the tree-wide min/max values as the cell bounds.
+         */
+        @Override
+        public void executeInternal(final PriorityQueue<PostingList.PeekablePostingList> postingLists) throws IOException
+        {
+            collectPostingLists(postingLists, minPackedValue, maxPackedValue);
+        }
+
+        /**
+         * Handles the current node of {@code index}, whose cell is bounded by the given values.
+         *
+         * @param postingLists  receives the posting lists that are collected
+         * @param cellMinPacked lower bound of the current cell
+         * @param cellMaxPacked upper bound of the current cell
+         */
+        public void collectPostingLists(PriorityQueue<PostingList.PeekablePostingList> postingLists, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException
+        {
+            context.checkpoint();
+
+            final Relation r = visitor.compare(cellMinPacked, cellMaxPacked);
+
+            if (r == Relation.CELL_OUTSIDE_QUERY)
+            {
+                // This cell is fully outside of the query shape: stop recursing
+                return;
+            }
+
+            if (r == Relation.CELL_INSIDE_QUERY)
+            {
+                // This cell is fully inside of the query shape: recursively add all points in this cell without filtering
+                super.collectPostingLists(postingLists);
+                return;
+            }
+
+            // the cell crosses the query: filter a leaf, or split an inner node further
+            if (index.isLeafNode())
+            {
+                if (index.nodeExists())
+                    filterLeaf(postingLists);
+                return;
+            }
+
+            visitNode(postingLists, cellMinPacked, cellMaxPacked);
+        }
+
+        /**
+         * Offers every value of the current leaf to the visitor and, if at least one was accepted
+         * and the leaf has a posting list, adds that posting list wrapped in a
+         * {@link FilteringPostingList} built from the accepted positions.
+         */
+        @SuppressWarnings("resource")
+        void filterLeaf(PriorityQueue<PostingList.PeekablePostingList> postingLists) throws IOException
+        {
+            bkdInput.seek(index.getLeafBlockFP());
+
+            final int count = bkdInput.readVInt();
+
+            // loading doc ids occurred here prior
+
+            // holder[0] is filled in by visitDocValues() with the bit set of accepted positions
+            final FixedBitSet[] holder = new FixedBitSet[1];
+
+            final int orderMapLength = bkdInput.readVInt();
+
+            final long orderMapPointer = bkdInput.getFilePointer();
+
+            final SeekingRandomAccessInput randoInput = new SeekingRandomAccessInput(bkdInput);
+            for (int x = 0; x < count; x++)
+            {
+                origIndex[x] = (short) LeafOrderMap.getValue(randoInput, orderMapPointer, x, leafOrderMapReader);
+            }
+
+            // seek beyond the ordermap
+            bkdInput.seek(orderMapPointer + orderMapLength);
+
+            IndexInput leafInput = bkdInput;
+
+            if (compressor != null)
+            {
+                // This should not throw WouldBlockException, even though we're on a TPC thread, because the
+                // secret key used by the underlying encryptor should be loaded at reader construction time.
+                leafInput = CryptoUtils.uncompress(bkdInput, compressor, compBytes, uncompBytes);
+            }
+
+            visitDocValues(commonPrefixLengths, scratchPackedValue1, leafInput, count, visitor, holder, origIndex);
+
+            final int nodeID = index.getNodeID();
+
+            // a leaf in which the visitor accepted nothing contributes no posting list
+            if (postingsIndex.exists(nodeID) && holder[0].cardinality() > 0)
+            {
+                final long pointer = postingsIndex.getPostingsFilePointer(nodeID);
+                postingLists.add(initFilteringPostingReader(pointer, holder[0]).peekable());
+            }
+        }
+
+        /**
+         * Recurses into both children of the current inner node. The left child is visited with the
+         * cell's upper bound replaced by the split value in the split dimension, the right child
+         * with the lower bound replaced in the same way.
+         */
+        void visitNode(PriorityQueue<PostingList.PeekablePostingList> postingLists, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException
+        {
+            int splitDim = index.getSplitDim();
+            assert splitDim >= 0 : "splitDim=" + splitDim;
+            assert splitDim < numDims;
+
+            byte[] splitPackedValue = index.getSplitPackedValue();
+            BytesRef splitDimValue = index.getSplitDimValue();
+            assert splitDimValue.length == bytesPerDim;
+
+            // make sure cellMin <= splitValue <= cellMax:
+            assert FutureArrays.compareUnsigned(cellMinPacked, splitDim * bytesPerDim, splitDim * bytesPerDim + bytesPerDim, splitDimValue.bytes, splitDimValue.offset, splitDimValue.offset + bytesPerDim) <= 0 : "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
+            assert FutureArrays.compareUnsigned(cellMaxPacked, splitDim * bytesPerDim, splitDim * bytesPerDim + bytesPerDim, splitDimValue.bytes, splitDimValue.offset, splitDimValue.offset + bytesPerDim) >= 0 : "bytesPerDim=" + bytesPerDim + " splitDim=" + splitDim + " numDims=" + numDims;
+
+            // Recurse on left sub-tree:
+            // splitPackedValue becomes the left cell's upper bound: cellMax with the split dimension overwritten
+            System.arraycopy(cellMaxPacked, 0, splitPackedValue, 0, packedBytesLength);
+            System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+
+            index.pushLeft();
+            collectPostingLists(postingLists, cellMinPacked, splitPackedValue);
+            index.pop();
+
+            // Restore the split dim value since it may have been overwritten while recursing:
+            System.arraycopy(splitPackedValue, splitDim * bytesPerDim, splitDimValue.bytes, splitDimValue.offset, bytesPerDim);
+            // Recurse on right sub-tree:
+            // splitPackedValue becomes the right cell's lower bound: cellMin with the split dimension overwritten
+            System.arraycopy(cellMinPacked, 0, splitPackedValue, 0, packedBytesLength);
+            System.arraycopy(splitDimValue.bytes, splitDimValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
+            index.pushRight();
+            collectPostingLists(postingLists, splitPackedValue, cellMaxPacked);
+            index.pop();
+        }
+
+        /**
+         * Reads the block summary of the posting list at {@code offset} and wraps the list in a filter.
+         */
+        private PostingList initFilteringPostingReader(long offset, FixedBitSet filter) throws IOException
+        {
+            final PostingsReader.BlocksSummary summary = new PostingsReader.BlocksSummary(postingsSummaryInput, offset);
+            return initFilteringPostingReader(filter, summary);
+        }
+
+        /**
+         * Creates a reader over {@code postingsInput} for the given summary and wraps it in a
+         * {@link FilteringPostingList} using {@code filter}.
+         */
+        @SuppressWarnings("resource")
+        private PostingList initFilteringPostingReader(FixedBitSet filter, PostingsReader.BlocksSummary header) throws IOException
+        {
+            PostingsReader postingsReader = new PostingsReader(postingsInput, header, listener.postingListEventListener());
+            return new FilteringPostingList(filter, postingsReader);
+        }
+    }
+
+    /**
+     * @return the number of dimensions ({@code numDims}) of the values stored in this tree
+     */
+    public int getNumDimensions()
+    {
+        return numDims;
+    }
+
+    /**
+     * @return the number of bytes each dimension occupies in a packed value ({@code bytesPerDim})
+     */
+    public int getBytesPerDimension()
+    {
+        return bytesPerDim;
+    }
+
+    /**
+     * @return the value of {@code pointCount}; presumably the total number of points in this
+     *         tree - TODO confirm where the field is populated
+     */
+    public long getPointCount()
+    {
+        return pointCount;
+    }
+
+    /**
+     * We recurse the BKD tree, using a provided instance of this to guide the recursion.
+     */
+    public interface IntersectVisitor
+    {
+        /**
+         * Called for all values in a leaf cell that crosses the query.  The consumer
+         * should scrutinize the packedValue to decide whether to accept it.  In the 1D case,
+         * values are visited in increasing order, and in the case of ties, in increasing order
+         * by segment row ID.
+         *
+         * @param packedValue the value to examine; the leaf decoders pass the same scratch array
+         *                    for every value, so copy it if it has to be kept
+         * @return {@code true} to accept the value, {@code false} to reject it
+         */
+        boolean visit(byte[] packedValue);
+
+        /**
+         * Called for non-leaf cells to test how the cell relates to the query, to
+         * determine how to further recurse down the tree.
+         *
+         * @param minPackedValue lower bound of the cell
+         * @param maxPackedValue upper bound of the cell
+         * @return how the cell relates to the query
+         */
+        Relation compare(byte[] minPackedValue, byte[] maxPackedValue);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBuffer.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBuffer.java
new file mode 100644
index 000000000000..6483ad17bc7b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBuffer.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.cassandra.index.sai.disk.MutableOneDimPointValues;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.ByteBlockPool;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+/**
+ * On-heap buffer for point values that provides a sortable view of itself as {@link MutablePointValues}.
+ * <p>
+ * Values are appended with {@link #addPackedValue(int, BytesRef)}. Once {@link #asPointValues()}
+ * has been called the buffer is closed and both methods reject further calls.
+ */
+public class BKDTreeRamBuffer implements Accountable
+{
+    private final Counter bytesUsed;
+    private final ByteBlockPool bytes;
+    private final int pointDimensionCount, pointNumBytes;
+    private final int packedBytesLength;
+    private final byte[] packedValue;
+    private final PackedLongValues.Builder docIDsBuilder;
+    private int numPoints;
+    private int numRows;
+    private int lastSegmentRowID = -1;
+    private boolean closed = false;
+
+    /**
+     * @param pointDimensionCount number of dimensions per point
+     * @param pointNumBytes       number of bytes per dimension
+     */
+    public BKDTreeRamBuffer(int pointDimensionCount, int pointNumBytes)
+    {
+        this.pointDimensionCount = pointDimensionCount;
+        this.pointNumBytes = pointNumBytes;
+        this.packedBytesLength = pointDimensionCount * pointNumBytes;
+        this.packedValue = new byte[packedBytesLength];
+
+        // the pool's allocator is handed the counter, the doc ID builder is accounted for explicitly
+        this.bytesUsed = Counter.newCounter();
+        this.bytes = new ByteBlockPool(new ByteBlockPool.DirectTrackingAllocator(bytesUsed));
+
+        this.docIDsBuilder = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+        this.bytesUsed.addAndGet(docIDsBuilder.ramBytesUsed());
+    }
+
+    /**
+     * @return the current value of the byte counter
+     */
+    @Override
+    public long ramBytesUsed()
+    {
+        return bytesUsed.get();
+    }
+
+    /**
+     * @return the number of times the segment row ID changed between consecutive additions
+     */
+    public int numRows()
+    {
+        return numRows;
+    }
+
+    /**
+     * Appends one point.
+     *
+     * @param segmentRowId row ID the value belongs to
+     * @param value        packed value; its length must equal dimensions times bytes per dimension
+     * @return by how much the byte counter changed during this call
+     * @throws IllegalArgumentException if {@code value} has the wrong length
+     * @throws IllegalStateException    if {@link #asPointValues()} has already been called
+     */
+    public long addPackedValue(int segmentRowId, BytesRef value)
+    {
+        ensureOpen();
+
+        if (value.length != packedBytesLength)
+            throw new IllegalArgumentException("The value has length=" + value.length + " but should be " + packedBytesLength);
+
+        final long totalBefore = bytesUsed.get();
+        final long docIDsBefore = docIDsBuilder.ramBytesUsed();
+
+        docIDsBuilder.add(segmentRowId);
+        bytes.append(value);
+
+        // a row is counted whenever the row ID differs from that of the previous addition
+        if (lastSegmentRowID != segmentRowId)
+        {
+            numRows++;
+            lastSegmentRowID = segmentRowId;
+        }
+        numPoints++;
+
+        // only the doc ID builder's growth is added to the counter here
+        final long docIDsGrowth = docIDsBuilder.ramBytesUsed() - docIDsBefore;
+        final long totalAfter = bytesUsed.addAndGet(docIDsGrowth);
+
+        return totalAfter - totalBefore;
+    }
+
+    /**
+     * Closes the buffer and exposes its content as point values. Sorting the view only permutes
+     * an array of ordinals; the stored bytes and row IDs stay in insertion order.
+     *
+     * @throws IllegalStateException if called more than once
+     */
+    public MutableOneDimPointValues asPointValues()
+    {
+        ensureOpen();
+        // building packed longs is destructive
+        closed = true;
+        final PackedLongValues docIDs = docIDsBuilder.build();
+        return new MutableOneDimPointValues()
+        {
+            // ords[i] is the insertion ordinal of the point currently at position i
+            final int[] ords = identityOrdinals(numPoints);
+
+            /** Offset in the byte pool of the first byte of the point at position {@code i}. */
+            private long startOffset(int i)
+            {
+                return (long) packedBytesLength * (long) ords[i];
+            }
+
+            @Override
+            public void getValue(int i, BytesRef packedValue)
+            {
+                final long start = startOffset(i);
+                packedValue.length = packedBytesLength;
+                bytes.setRawBytesRef(packedValue, start);
+            }
+
+            @Override
+            public byte getByteAt(int i, int k)
+            {
+                return bytes.readByte(startOffset(i) + (long) k);
+            }
+
+            @Override
+            public int getDocID(int i)
+            {
+                final long rowId = docIDs.get(ords[i]);
+                return Math.toIntExact(rowId);
+            }
+
+            @Override
+            public void swap(int i, int j)
+            {
+                final int atI = ords[i];
+                ords[i] = ords[j];
+                ords[j] = atI;
+            }
+
+            @Override
+            public void intersect(IntersectVisitor visitor) throws IOException
+            {
+                final BytesRef scratch = new BytesRef();
+                int position = 0;
+                while (position < numPoints)
+                {
+                    getValue(position, scratch);
+                    assert scratch.length == packedValue.length;
+                    // the visitor always receives the buffer's own packedValue array
+                    System.arraycopy(scratch.bytes, scratch.offset, packedValue, 0, packedBytesLength);
+                    visitor.visit(getDocID(position), packedValue);
+                    position++;
+                }
+            }
+
+            @Override
+            public int getNumDimensions()
+            {
+                return pointDimensionCount;
+            }
+
+            @Override
+            public int getBytesPerDimension()
+            {
+                return pointNumBytes;
+            }
+
+            @Override
+            public long size()
+            {
+                return numPoints;
+            }
+
+            @Override
+            public int getDocCount()
+            {
+                return numRows;
+            }
+        };
+    }
+
+    /** Builds the array {@code [0, 1, ..., count - 1]}. */
+    private static int[] identityOrdinals(int count)
+    {
+        final int[] ordinals = new int[count];
+        for (int i = 0; i < count; ++i)
+        {
+            ordinals[i] = i;
+        }
+        return ordinals;
+    }
+
+    /** Fails with {@link IllegalStateException} once the buffer has been closed. */
+    private void ensureOpen()
+    {
+        Preconditions.checkState(!closed, "Expected open buffer.");
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BKDWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDWriter.java
new file mode 100644
index 000000000000..633457747fd6
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BKDWriter.java
@@ -0,0 +1,1042 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.function.IntFunction;
+
+import com.google.common.base.MoreObjects;
+
+import org.apache.cassandra.index.sai.disk.MutableOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.compress.ICompressor;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.GrowableByteArrayDataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMOutputStream;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.FutureArrays;
+import org.apache.lucene.util.IntroSorter;
+import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.Sorter;
+import org.apache.lucene.util.bkd.MutablePointsReaderUtils;
+
+// TODO
+//   - allow variable length byte[] (across docs and dims), but this is quite a bit more hairy
+//   - we could also index "auto-prefix terms" here, and use better compression, and maybe only use for the "fully contained" case so we'd
+//     only index docIDs
+//   - the index could be efficiently encoded as an FST, so we don't have wasteful
+//     (monotonic) long[] leafBlockFPs; or we could use MonotonicLongValues ... but then
+//     the index is already plenty small: 60M OSM points --> 1.1 MB with 128 points
+//     per leaf, and you can reduce that by putting more points per leaf
+//   - we could use threads while building; the higher nodes are very parallelizable
+
+/**
+ * Recursively builds a block KD-tree to assign all incoming points in N-dim space to smaller
+ * and smaller N-dim rectangles (cells) until the number of points in a given
+ * rectangle is &lt;= <code>maxPointsInLeafNode</code>.  The tree is
+ * fully balanced, which means the leaf nodes will have between 50% and 100% of
+ * the requested <code>maxPointsInLeafNode</code>.  Values that fall exactly
+ * on a cell boundary may be in either cell.
+ *
+ * <p>The number of dimensions can be 1 to 8, but every byte[] value is fixed length.
+ *
+ * <p>
+ * See <a href="https://www.cs.duke.edu/~pankaj/publications/papers/bkd-sstd.pdf">this paper</a> for details.
+ *
+ * <p>This consumes heap during writing: it allocates a <code>LongBitSet(numPoints)</code>,
+ * and then uses up to the specified {@code maxMBSortInHeap} heap space for writing.
+ *
+ * <p>
+ * <b>NOTE</b>: This can write at most Integer.MAX_VALUE * <code>maxPointsInLeafNode</code> total points.
+ *
+ * @lucene.experimental
+ */
+
+public class BKDWriter implements Closeable
+{
+    /** How many bytes each doc takes in the fixed-width offline format */
+    private final int bytesPerDoc;
+
+    /** Default maximum number of points in each leaf block */
+    public static final int DEFAULT_MAX_POINTS_IN_LEAF_NODE = 1024;
+
+    /** Default maximum heap to use, before spilling to (slower) disk */
+    public static final float DEFAULT_MAX_MB_SORT_IN_HEAP = 16.0f;
+
+    /** Maximum number of dimensions */
+    public static final int MAX_DIMS = 8;
+
+    /** How many dimensions we are indexing */
+    protected final int numDims;
+
+    /** How many bytes each value in each dimension takes. */
+    protected final int bytesPerDim;
+
+    /** numDims * bytesPerDim */
+    protected final int packedBytesLength;
+
+    final BytesRef scratchBytesRef1 = new BytesRef();
+    final int[] commonPrefixLengths;
+
+    protected final LongBitSet docsSeen;
+
+    protected final int maxPointsInLeafNode;
+    private final int maxPointsSortInHeap;
+
+    /** Minimum per-dim values, packed */
+    protected final byte[] minPackedValue;
+
+    /** Maximum per-dim values, packed */
+    protected final byte[] maxPackedValue;
+
+    protected long pointCount;
+
+    /** true if we have so many values that we must write ords using long (8 bytes) instead of int (4 bytes) */
+    protected final boolean longOrds;
+
+    /** An upper bound on how many points the caller will add (includes deletions) */
+    private final long totalPointCount;
+
+    private final long maxDoc;
+
+    private final ICompressor compressor;
+
+    public BKDWriter(long maxDoc, int numDims, int bytesPerDim,
+            int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount, boolean singleValuePerDoc,
+            ICompressor compressor) throws IOException
+    {
+        this(maxDoc, numDims, bytesPerDim, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount, singleValuePerDoc,
+             totalPointCount > Integer.MAX_VALUE, compressor); // longOrds: true only if the point bound exceeds int range
+    }
+
+    protected BKDWriter(long maxDoc, int numDims, int bytesPerDim,
+                        int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount,
+                        boolean singleValuePerDoc, boolean longOrds, ICompressor compressor) throws IOException
+    {
+        verifyParams(numDims, maxPointsInLeafNode, maxMBSortInHeap, totalPointCount);
+        // verifyParams rejects out-of-range numDims, maxPointsInLeafNode, maxMBSortInHeap and totalPointCount;
+        // maxDoc, bytesPerDim and compressor are stored as given (a null compressor means leaves are written uncompressed).
+        this.maxPointsInLeafNode = maxPointsInLeafNode;
+        this.numDims = numDims;
+        this.bytesPerDim = bytesPerDim;
+        this.totalPointCount = totalPointCount;
+        this.maxDoc = maxDoc;
+        this.compressor = compressor;
+        docsSeen = new LongBitSet(maxDoc);
+        packedBytesLength = numDims * bytesPerDim;
+
+        commonPrefixLengths = new int[numDims];
+
+        minPackedValue = new byte[packedBytesLength];
+        maxPackedValue = new byte[packedBytesLength];
+
+        // If we may have more than 1+Integer.MAX_VALUE values, then we must encode ords with long (8 bytes), else we can use int (4 bytes).
+        this.longOrds = longOrds;
+
+        // dimensional values (numDims * bytesPerDim) + ord (int or long; none when singleValuePerDoc) + docID (int)
+        if (singleValuePerDoc)
+        {
+            // Lucene only supports up to ~2.1 billion docs, so we better not need longOrds in this case:
+            assert longOrds == false;
+            bytesPerDoc = packedBytesLength + Integer.BYTES;
+        }
+        else if (longOrds)
+        {
+            bytesPerDoc = packedBytesLength + Long.BYTES + Integer.BYTES;
+        }
+        else
+        {
+            bytesPerDoc = packedBytesLength + Integer.BYTES + Integer.BYTES;
+        }
+
+        // As we recurse, we compute temporary partitions of the data, halving the
+        // number of points at each recursion.  Once there are few enough points,
+        // we can switch to sorting in heap instead of offline (on disk).  At any
+        // time in the recursion, we hold the number of points at that level, plus
+        // all recursive halves (i.e. 16 + 8 + 4 + 2) so the memory usage is 2X
+        // what that level would consume, so we multiply by 0.5 to convert from
+        // bytes to points here.  Each dimension has its own sorted partition, so
+        // we must divide by numDims as well.
+
+        maxPointsSortInHeap = (int) (0.5 * (maxMBSortInHeap * 1024 * 1024) / (bytesPerDoc * numDims));
+
+        // Finally, we must be able to hold at least the leaf node in heap during build:
+        if (maxPointsSortInHeap < maxPointsInLeafNode)
+        {
+            throw new IllegalArgumentException("maxMBSortInHeap=" + maxMBSortInHeap + " only allows for maxPointsSortInHeap=" + maxPointsSortInHeap + ", but this is less than maxPointsInLeafNode=" + maxPointsInLeafNode + "; either increase maxMBSortInHeap or decrease maxPointsInLeafNode");
+        }
+    }
+
+    public static void verifyParams(int numDims, int maxPointsInLeafNode, double maxMBSortInHeap, long totalPointCount)
+    {
+        // Throws IllegalArgumentException on the first violated bound. We encode dim in a single byte in the splitPackedValues,
+        // but we only expose 4 bits for it now, in case we want to use the remaining 4 bits for another purpose later.
+        if (numDims < 1 || numDims > MAX_DIMS)
+        {
+            throw new IllegalArgumentException("numDims must be 1 .. " + MAX_DIMS + " (got: " + numDims + ")");
+        }
+        if (maxPointsInLeafNode <= 0)
+        {
+            throw new IllegalArgumentException("maxPointsInLeafNode must be > 0; got " + maxPointsInLeafNode);
+        }
+        if (maxPointsInLeafNode > ArrayUtil.MAX_ARRAY_LENGTH)
+        {
+            throw new IllegalArgumentException("maxPointsInLeafNode must be <= ArrayUtil.MAX_ARRAY_LENGTH (= " + ArrayUtil.MAX_ARRAY_LENGTH + "); got " + maxPointsInLeafNode);
+        }
+        if (maxMBSortInHeap < 0.0)
+        {
+            throw new IllegalArgumentException("maxMBSortInHeap must be >= 0.0 (got: " + maxMBSortInHeap + ")");
+        }
+        if (totalPointCount < 0)
+        {
+            throw new IllegalArgumentException("totalPointCount must be >=0 (got: " + totalPointCount + ")");
+        }
+    }
+
+    /** How many points have been written. NOTE(review): appears to be assigned only by OneDimensionBKDWriter#finish, so 0 until then; confirm. */
+    public long getPointCount()
+    {
+        return pointCount;
+    }
+
+    /**
+     * Writes a one-dimensional field to {@code out}: calls {@code SAICodecUtils.writeHeader}, writes the leaf
+     * blocks and index for the points in {@code reader} (reordered without transient disk), then the footer.
+     *
+     * @return the value of the 1D writer's {@code finish()}: the index file pointer, or -1 when there are no values
+     */
+    public long writeField(IndexOutput out, MutableOneDimPointValues reader,
+                           final OneDimensionBKDWriterCallback callback) throws IOException
+    {
+        // Guard first: nothing is written to 'out' unless this writer was configured with exactly one dimension.
+        if (numDims != 1)
+        {
+            throw new IllegalArgumentException("Only 1 dimension is supported.");
+        }
+
+        SAICodecUtils.writeHeader(out);
+        final long indexFilePointer = writeField1Dim(out, reader, callback);
+        SAICodecUtils.writeFooter(out);
+
+        return indexFilePointer;
+    }
+
+    /* 1D case: sort the points in ascending order first, then feed them one by one
+     * through the same leaf-writing logic as is used at merge time. */
+    private long writeField1Dim(IndexOutput out, MutableOneDimPointValues reader,
+                                OneDimensionBKDWriterCallback callback) throws IOException
+    {
+        // TODO: cast to int
+        if (reader.size() > 1)
+        {
+            MutablePointsReaderUtils.sort(Math.toIntExact(maxDoc), packedBytesLength, reader, 0, Math.toIntExact(reader.size()));
+        }
+        final OneDimensionBKDWriter leafWriter = new OneDimensionBKDWriter(out, callback);
+        reader.intersect((rowId, value) -> leafWriter.add(value, rowId));
+
+        return leafWriter.finish();
+    }
+
+    // reused when writing leaf blocks
+    private final GrowableByteArrayDataOutput scratchOut = new GrowableByteArrayDataOutput(32 * 1024);
+
+    private final GrowableByteArrayDataOutput scratchOut2 = new GrowableByteArrayDataOutput(2 * 1024);
+
+    interface OneDimensionBKDWriterCallback
+    {
+        void writeLeafDocs(int leafNum, RowIDAndIndex[] leafDocs, int offset, int count); // called once per leaf block, entries sorted by rowID
+    }
+
+    public static class RowIDAndIndex
+    {
+        public int valueOrderIndex; // position of the entry within its leaf, in the order the values were added
+        public long rowID; // row (doc) id that the value at valueOrderIndex belongs to
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this)
+                              .add("valueOrderIndex", valueOrderIndex)
+                              .add("rowID", rowID)
+                              .toString();
+        }
+    }
+
+    private class OneDimensionBKDWriter
+    {
+
+        final IndexOutput out;
+        final List<Long> leafBlockFPs = new ArrayList<>();
+        final List<byte[]> leafBlockStartValues = new ArrayList<>();
+        final byte[] leafValues = new byte[maxPointsInLeafNode * packedBytesLength];
+        final long[] leafDocs = new long[maxPointsInLeafNode];
+        private long valueCount;
+        private int leafCount;
+        final RowIDAndIndex[] rowIDAndIndexes = new RowIDAndIndex[maxPointsInLeafNode];
+        final int[] orderIndex = new int[maxPointsInLeafNode];
+        final OneDimensionBKDWriterCallback callback;
+
+        {
+            for (int x = 0; x < rowIDAndIndexes.length; x++)
+            {
+                rowIDAndIndexes[x] = new RowIDAndIndex();
+            }
+        }
+
+        OneDimensionBKDWriter(IndexOutput out, OneDimensionBKDWriterCallback callback)
+        {
+            if (numDims != 1)
+            {
+                throw new UnsupportedOperationException("numDims must be 1 but got " + numDims);
+            }
+            if (pointCount != 0) // presumably: an earlier write on the enclosing BKDWriter already finished (confirm)
+            {
+                throw new IllegalStateException("cannot mix add and merge");
+            }
+
+            this.out = out;
+            this.callback = callback; // may be null; writeLeafBlock() null-checks before invoking it
+
+            lastPackedValue = new byte[packedBytesLength]; // passed to the ordering assert in add()
+        }
+
+        // for asserts
+        final byte[] lastPackedValue;
+        private long lastDocID;
+
+        void add(byte[] packedValue, long docID) throws IOException
+        {
+            assert valueInOrder(valueCount + leafCount,
+                                0, lastPackedValue, packedValue, 0, docID, lastDocID);
+            // Buffer the point in the current leaf and record its doc as seen.
+            System.arraycopy(packedValue, 0, leafValues, leafCount * packedBytesLength, packedBytesLength);
+            leafDocs[leafCount] = docID;
+            docsSeen.set(docID);
+            leafCount++;
+            // valueCount only advances in writeLeafBlock(), so this check lags by up to one leaf; report flushed + buffered points.
+            if (valueCount > totalPointCount)
+            {
+                throw new IllegalStateException("totalPointCount=" + totalPointCount + " was passed when we were created, but we just hit " + (valueCount + leafCount) + " values");
+            }
+
+            if (leafCount == maxPointsInLeafNode)
+            {
+                // We write a block once we hit exactly the max count ... this is different from
+                // when we write N > 1 dimensional points where we write between max/2 and max per leaf block
+                writeLeafBlock();
+                leafCount = 0;
+            }
+
+            assert (lastDocID = docID) >= 0; // only assign when asserts are enabled
+        }
+
+        public long finish() throws IOException
+        {
+            // Flush whatever is still buffered as a final, possibly short, leaf block.
+            if (leafCount > 0)
+            {
+                writeLeafBlock();
+                leafCount = 0;
+            }
+
+            // Nothing was ever added: no index is written and -1 is returned.
+            if (valueCount == 0)
+            {
+                return -1;
+            }
+
+            pointCount = valueCount;
+
+            // The index starts at the current position of the output, after the leaf blocks written so far.
+            final long indexStart = out.getFilePointer();
+
+            // One start value per leaf except the first, rotated into a flat tree addressed by node id (root = 1).
+            final int innerNodeCount = leafBlockStartValues.size();
+            final byte[] splitValues = new byte[(1 + innerNodeCount) * (1 + bytesPerDim)];
+            rotateToTree(1, 0, innerNodeCount, splitValues, leafBlockStartValues);
+
+            // Unbox the leaf file pointers, preserving the order in which the leaves were written.
+            final long[] leafFilePointers = leafBlockFPs.stream().mapToLong(Long::longValue).toArray();
+
+            writeIndex(out, maxPointsInLeafNode, leafFilePointers, splitValues);
+            return indexStart;
+        }
+
+        private void writeLeafBlock() throws IOException // writes the buffered leaf: count, rowID order map, prefix-coded values
+        {
+            assert leafCount != 0;
+            if (valueCount == 0) // first leaf ever written: seed minPackedValue from its first value
+            {
+                System.arraycopy(leafValues, 0, minPackedValue, 0, packedBytesLength);
+            }
+            System.arraycopy(leafValues, (leafCount - 1) * packedBytesLength, maxPackedValue, 0, packedBytesLength);
+
+            valueCount += leafCount; // points flushed so far, including this leaf
+
+            if (leafBlockFPs.size() > 0)
+            {
+                // Save the first (minimum) value in each leaf block except the first, to build the split value index in the end:
+                leafBlockStartValues.add(ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength));
+            }
+            leafBlockFPs.add(out.getFilePointer());
+            checkMaxLeafNodeCount(leafBlockFPs.size());
+
+            // Find the common prefix of the first and last value in the leaf (values are expected in sorted order):
+            int prefix = bytesPerDim;
+            int offset = (leafCount - 1) * packedBytesLength;
+            for (int j = 0; j < bytesPerDim; j++)
+            {
+                if (leafValues[j] != leafValues[offset + j])
+                {
+                    prefix = j;
+                    break;
+                }
+            }
+
+            commonPrefixLengths[0] = prefix;
+
+            assert scratchOut.getPosition() == 0;
+
+            out.writeVInt(leafCount);
+
+            for (int x = 0; x < leafCount; x++)
+            {
+                rowIDAndIndexes[x].valueOrderIndex = x;
+                rowIDAndIndexes[x].rowID = leafDocs[x];
+            }
+
+            final Sorter sorter = new IntroSorter()
+            {
+                RowIDAndIndex pivot;
+
+                @Override
+                protected void swap(int i, int j)
+                {
+                    RowIDAndIndex o = rowIDAndIndexes[i];
+                    rowIDAndIndexes[i] = rowIDAndIndexes[j];
+                    rowIDAndIndexes[j] = o;
+                }
+
+                @Override
+                protected void setPivot(int i)
+                {
+                    pivot = rowIDAndIndexes[i];
+                }
+
+                @Override
+                protected int comparePivot(int j)
+                {
+                    return Long.compare(pivot.rowID, rowIDAndIndexes[j].rowID);
+                }
+            };
+
+            sorter.sort(0, leafCount); // reorders rowIDAndIndexes[0, leafCount) by ascending rowID
+
+            // write leaf rowID -> orig index
+            scratchOut2.reset();
+
+            // iterate in row ID order to get the row ID index for the given value order index
+            // place into an array to be written as packed ints
+            for (int x = 0; x < leafCount; x++)
+            {
+                final int valueOrderIndex = rowIDAndIndexes[x].valueOrderIndex;
+                orderIndex[valueOrderIndex] = x;
+            }
+
+            LeafOrderMap.write(orderIndex, leafCount, maxPointsInLeafNode - 1, scratchOut2);
+
+            out.writeVInt(scratchOut2.getPosition());
+            out.writeBytes(scratchOut2.getBytes(), 0, scratchOut2.getPosition());
+
+            if (callback != null) callback.writeLeafDocs(leafBlockFPs.size() - 1, rowIDAndIndexes, 0, leafCount);
+
+            writeCommonPrefixes(scratchOut, commonPrefixLengths, leafValues);
+
+            scratchBytesRef1.length = packedBytesLength;
+            scratchBytesRef1.bytes = leafValues;
+
+            final IntFunction<BytesRef> packedValues = (i) -> { // returns the shared scratch ref re-pointed at value i
+                    scratchBytesRef1.offset = packedBytesLength * i;
+                    return scratchBytesRef1;
+            };
+            assert valuesInOrderAndBounds(leafCount, 0, ArrayUtil.copyOfSubArray(leafValues, 0, packedBytesLength),
+                                          ArrayUtil.copyOfSubArray(leafValues, (leafCount - 1) * packedBytesLength, leafCount * packedBytesLength),
+                                          packedValues, leafDocs, 0);
+
+            writeLeafBlockPackedValues(scratchOut, commonPrefixLengths, leafCount, 0, packedValues);
+
+            if (compressor == null)
+            {
+                out.writeBytes(scratchOut.getBytes(), 0, scratchOut.getPosition());
+            }
+            else
+            {
+                CryptoUtils.compress(new BytesRef(scratchOut.getBytes(), 0, scratchOut.getPosition()), scratchBytesRef, out, compressor);
+            }
+            scratchOut.reset(); // restores the position-0 state asserted near the top of this method
+        }
+    }
+
+    private final BytesRef scratchBytesRef = new BytesRef(new byte[128]);
+
+    // TODO: there must be a simpler way?
+    private void rotateToTree(int nodeID, int offset, int count, byte[] index, List<byte[]> leafBlockStartValues)
+    {
+        // Copies the split value for nodeID to index[nodeID * (1 + bytesPerDim) + 1 ...] and recurses into children 2*nodeID and 2*nodeID + 1.
+        if (count == 1)
+        {
+            // Leaf index node
+            //System.out.println("  leaf index node");
+            //System.out.println("  index[" + nodeID + "] = blockStartValues[" + offset + "]");
+            System.arraycopy(leafBlockStartValues.get(offset), 0, index, nodeID * (1 + bytesPerDim) + 1, bytesPerDim);
+        }
+        else if (count > 1)
+        {
+            // Internal index node: binary partition of count
+            int countAtLevel = 1;
+            int totalCount = 0;
+            while (true)
+            {
+                int countLeft = count - totalCount;
+                //System.out.println("    cycle countLeft=" + countLeft + " coutAtLevel=" + countAtLevel);
+                if (countLeft <= countAtLevel)
+                {
+                    // This is the last level, possibly partially filled:
+                    int lastLeftCount = Math.min(countAtLevel / 2, countLeft);
+                    assert lastLeftCount >= 0;
+                    int leftHalf = (totalCount - 1) / 2 + lastLeftCount;
+
+                    int rootOffset = offset + leftHalf;
+          /*
+          System.out.println("  last left count " + lastLeftCount);
+          System.out.println("  leftHalf " + leftHalf + " rightHalf=" + (count-leftHalf-1));
+          System.out.println("  rootOffset=" + rootOffset);
+          */
+
+                    System.arraycopy(leafBlockStartValues.get(rootOffset), 0, index, nodeID * (1 + bytesPerDim) + 1, bytesPerDim);
+                    //System.out.println("  index[" + nodeID + "] = blockStartValues[" + rootOffset + "]");
+
+                    // TODO: we could optimize/specialize, when we know it's simply fully balanced binary tree
+                    // under here, to save this while loop on each recursion
+
+                    // Recurse left
+                    rotateToTree(2 * nodeID, offset, leftHalf, index, leafBlockStartValues);
+
+                    // Recurse right
+                    rotateToTree(2 * nodeID + 1, rootOffset + 1, count - leftHalf - 1, index, leafBlockStartValues);
+                    return;
+                }
+                totalCount += countAtLevel;
+                countAtLevel *= 2;
+            }
+        }
+        else
+        {
+            assert count == 0;
+        }
+    }
+
+    // useful for debugging:
+  /*
+  private void printPathSlice(String desc, PathSlice slice, int dim) throws IOException {
+    System.out.println("    " + desc + " dim=" + dim + " count=" + slice.count + ":");    
+    try(PointReader r = slice.writer.getReader(slice.start, slice.count)) {
+      int count = 0;
+      while (r.next()) {
+        byte[] v = r.packedValue();
+        System.out.println("      " + count + ": " + new BytesRef(v, dim*bytesPerDim, bytesPerDim));
+        count++;
+        if (count == slice.count) {
+          break;
+        }
+      }
+    }
+  }
+  */
+
+    private void checkMaxLeafNodeCount(int numLeaves)
+    {
+        if ((1 + bytesPerDim) * (long) numLeaves > ArrayUtil.MAX_ARRAY_LENGTH)
+        {
+            throw new IllegalStateException("too many nodes; increase maxPointsInLeafNode (currently " + maxPointsInLeafNode + ") and reindex");
+        }
+    }
+
+    /** Packs the two arrays, representing a balanced binary tree, into a compact byte[] structure, which is returned. */
+    @SuppressWarnings("resource")
+    private byte[] packIndex(long[] leafBlockFPs, byte[] splitPackedValues) throws IOException
+    {
+
+        int numLeaves = leafBlockFPs.length;
+
+        // Possibly rotate the leaf block FPs, if the index not fully balanced binary tree (only happens
+        // if it was created by OneDimensionBKDWriter).  In this case the leaf nodes may straddle the two bottom
+        // levels of the binary tree:
+        if (numDims == 1 && numLeaves > 1)
+        {
+            int levelCount = 2;
+            while (true)
+            {
+                if (numLeaves >= levelCount && numLeaves <= 2 * levelCount)
+                {
+                    int lastLevel = 2 * (numLeaves - levelCount);
+                    assert lastLevel >= 0;
+                    if (lastLevel != 0)
+                    {
+                        // Last level is partially filled, so we must rotate the leaf FPs to match.  We do this here, after loading
+                        // at read-time, so that we can still delta code them on disk at write:
+                        long[] newLeafBlockFPs = new long[numLeaves];
+                        System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
+                        System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
+                        leafBlockFPs = newLeafBlockFPs; // rebinds the local only; the caller's array is left untouched
+                    }
+                    break;
+                }
+
+                levelCount *= 2;
+            }
+        }
+
+        // Reused while packing the index
+        // TODO: replace with RAMIndexOutput because RAMOutputStream has synchronized/monitor locks
+        RAMOutputStream writeBuffer = new RAMOutputStream();
+
+        // This is the "file" we append the byte[] to:
+        List<byte[]> blocks = new ArrayList<>();
+        byte[] lastSplitValues = new byte[bytesPerDim * numDims];
+        //System.out.println("\npack index");
+        int totalSize = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, 0L, blocks, 1, lastSplitValues, new boolean[numDims], false);
+
+        // Compact the byte[] blocks into single byte index:
+        byte[] index = new byte[totalSize];
+        int upto = 0;
+        for (byte[] block : blocks)
+        {
+            System.arraycopy(block, 0, index, upto, block.length);
+            upto += block.length;
+        }
+        assert upto == totalSize;
+
+        return index;
+    }
+
+    /** Appends the current contents of writeBuffer as another block on the growing in-memory file */
+    private int appendBlock(RAMOutputStream writeBuffer, List<byte[]> blocks) throws IOException
+    {
+        int pos = Math.toIntExact(writeBuffer.getFilePointer());
+        byte[] bytes = new byte[pos];
+        writeBuffer.writeTo(bytes, 0);
+        writeBuffer.reset();
+        blocks.add(bytes);
+        return pos;
+    }
+
+    /**
+     * lastSplitValues is per-dimension split value previously seen; we use this to prefix-code the split byte[] on each
+     * inner node. Returns the number of bytes this subtree appended to {@code blocks}.
+     */
+    private int recursePackIndex(RAMOutputStream writeBuffer, long[] leafBlockFPs, byte[] splitPackedValues, long minBlockFP, List<byte[]> blocks,
+                                 int nodeID, byte[] lastSplitValues, boolean[] negativeDeltas, boolean isLeft) throws IOException
+    {
+        if (nodeID >= leafBlockFPs.length)
+        {
+            int leafID = nodeID - leafBlockFPs.length;
+            //System.out.println("recursePack leaf nodeID=" + nodeID);
+
+            // In the unbalanced case it's possible the left most node only has one child:
+            if (leafID < leafBlockFPs.length)
+            {
+                long delta = leafBlockFPs[leafID] - minBlockFP;
+                if (isLeft)
+                {
+                    assert delta == 0;
+                    return 0;
+                }
+                else
+                {
+                    assert nodeID == 1 || delta > 0 : "nodeID=" + nodeID;
+                    writeBuffer.writeVLong(delta);
+                    return appendBlock(writeBuffer, blocks);
+                }
+            }
+            else
+            {
+                return 0;
+            }
+        }
+        else
+        {
+            long leftBlockFP;
+            if (isLeft == false)
+            {
+                leftBlockFP = getLeftMostLeafBlockFP(leafBlockFPs, nodeID);
+                long delta = leftBlockFP - minBlockFP;
+                assert nodeID == 1 || delta > 0;
+                writeBuffer.writeVLong(delta);
+            }
+            else
+            {
+                // The left tree's left most leaf block FP is always the minimal FP:
+                leftBlockFP = minBlockFP;
+            }
+
+            int address = nodeID * (1 + bytesPerDim);
+            int splitDim = splitPackedValues[address++] & 0xff;
+
+            //System.out.println("recursePack inner nodeID=" + nodeID + " splitDim=" + splitDim + " splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
+
+            // find common prefix with last split value in this dim:
+            int prefix = 0;
+            for (; prefix < bytesPerDim; prefix++)
+            {
+                if (splitPackedValues[address + prefix] != lastSplitValues[splitDim * bytesPerDim + prefix])
+                {
+                    break;
+                }
+            }
+
+            //System.out.println("writeNodeData nodeID=" + nodeID + " splitDim=" + splitDim + " numDims=" + numDims + " bytesPerDim=" + bytesPerDim + " prefix=" + prefix);
+
+            int firstDiffByteDelta;
+            if (prefix < bytesPerDim)
+            {
+                //System.out.println("  delta byte cur=" + Integer.toHexString(splitPackedValues[address+prefix]&0xFF) + " prev=" + Integer.toHexString(lastSplitValues[splitDim * bytesPerDim + prefix]&0xFF) + " negated?=" + negativeDeltas[splitDim]);
+                firstDiffByteDelta = (splitPackedValues[address + prefix] & 0xFF) - (lastSplitValues[splitDim * bytesPerDim + prefix] & 0xFF);
+                if (negativeDeltas[splitDim])
+                {
+                    firstDiffByteDelta = -firstDiffByteDelta;
+                }
+                //System.out.println("  delta=" + firstDiffByteDelta);
+                assert firstDiffByteDelta > 0;
+            }
+            else
+            {
+                firstDiffByteDelta = 0;
+            }
+
+            // pack the prefix, splitDim and delta first diff byte into a single vInt:
+            int code = (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numDims + splitDim;
+
+            //System.out.println("  code=" + code);
+            //System.out.println("  splitValue=" + new BytesRef(splitPackedValues, address, bytesPerDim));
+
+            writeBuffer.writeVInt(code);
+
+            // write the split value, prefix coded vs. our parent's split value:
+            int suffix = bytesPerDim - prefix;
+            byte[] savSplitValue = new byte[suffix];
+            if (suffix > 1)
+            {
+                writeBuffer.writeBytes(splitPackedValues, address + prefix + 1, suffix - 1);
+            }
+
+            byte[] cmp = lastSplitValues.clone(); // only read by the restore assert at the end of this branch
+
+            System.arraycopy(lastSplitValues, splitDim * bytesPerDim + prefix, savSplitValue, 0, suffix);
+
+            // copy our split value into lastSplitValues for our children to prefix-code against
+            System.arraycopy(splitPackedValues, address + prefix, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
+
+            int numBytes = appendBlock(writeBuffer, blocks);
+
+            // placeholder for left-tree numBytes; we need this so that at search time if we only need to recurse into the right sub-tree we can
+            // quickly seek to its starting point
+            int idxSav = blocks.size();
+            blocks.add(null);
+
+            boolean savNegativeDelta = negativeDeltas[splitDim];
+            negativeDeltas[splitDim] = true;
+
+            int leftNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2 * nodeID, lastSplitValues, negativeDeltas, true);
+
+            if (nodeID * 2 < leafBlockFPs.length)
+            {
+                writeBuffer.writeVInt(leftNumBytes);
+            }
+            else
+            {
+                assert leftNumBytes == 0 : "leftNumBytes=" + leftNumBytes;
+            }
+            int numBytes2 = Math.toIntExact(writeBuffer.getFilePointer());
+            byte[] bytes2 = new byte[numBytes2];
+            writeBuffer.writeTo(bytes2, 0);
+            writeBuffer.reset();
+            // replace our placeholder:
+            blocks.set(idxSav, bytes2);
+
+            negativeDeltas[splitDim] = false;
+            int rightNumBytes = recursePackIndex(writeBuffer, leafBlockFPs, splitPackedValues, leftBlockFP, blocks, 2 * nodeID + 1, lastSplitValues, negativeDeltas, false);
+
+            negativeDeltas[splitDim] = savNegativeDelta;
+
+            // restore lastSplitValues to what caller originally passed us:
+            System.arraycopy(savSplitValue, 0, lastSplitValues, splitDim * bytesPerDim + prefix, suffix);
+
+            assert Arrays.equals(lastSplitValues, cmp);
+
+            return numBytes + numBytes2 + leftNumBytes + rightNumBytes;
+        }
+    }
+
+    private long getLeftMostLeafBlockFP(long[] leafBlockFPs, int nodeID)
+    {
+        // Node n's left child is 2n, so doubling the id walks down the left spine; the first id to
+        // reach leafBlockFPs.length is a leaf, and subtracting that length gives its array slot.
+        // TODO: a closed-form solution, or returning this FP from the index-packing recursion, may
+        // be cheaper; still, this is O(log(N)) per call and called O(N) times (N = leaf blocks).
+        final int leafCount = leafBlockFPs.length;
+        int id = nodeID;
+        while (id < leafCount)
+        {
+            id <<= 1;
+        }
+        final int leafID = id - leafCount;
+        final long fp = leafBlockFPs[leafID];
+        if (fp >= 0)
+        {
+            return fp;
+        }
+        throw new AssertionError(fp + " for leaf " + leafID);
+    }
+
+    private void writeIndex(IndexOutput out, int countPerLeaf, long[] leafBlockFPs, byte[] splitPackedValues) throws IOException
+    {
+        // Pack leafBlockFPs and splitPackedValues into one byte array, then hand it to the overload that writes it.
+        writeIndex(out, countPerLeaf, leafBlockFPs.length, packIndex(leafBlockFPs, splitPackedValues));
+    }
+
+    private void writeIndex(IndexOutput out, int countPerLeaf, int numLeaves, byte[] packedIndex) throws IOException
+    {
+        // Header fields: numDims, countPerLeaf, bytesPerDim, then numLeaves.
+        out.writeVInt(numDims);
+        out.writeVInt(countPerLeaf);
+        out.writeVInt(bytesPerDim);
+        assert numLeaves > 0;
+        out.writeVInt(numLeaves);
+
+        // Min/max packed values: written raw, or staged in memory and compressed as a single unit.
+        if (compressor == null)
+        {
+            out.writeBytes(minPackedValue, 0, packedBytesLength);
+            out.writeBytes(maxPackedValue, 0, packedBytesLength);
+        }
+        else
+        {
+            RAMIndexOutput bounds = new RAMIndexOutput("");
+            bounds.writeBytes(minPackedValue, 0, packedBytesLength);
+            bounds.writeBytes(maxPackedValue, 0, packedBytesLength);
+            CryptoUtils.compress(new BytesRef(bounds.getBytes(), 0, (int) bounds.getFilePointer()), out, compressor);
+        }
+
+        out.writeVLong(pointCount);
+        //TODO Changing disk format
+        out.writeVLong(docsSeen.cardinality());
+
+        if (compressor == null)
+        {
+            out.writeVInt(packedIndex.length);
+            out.writeBytes(packedIndex, 0, packedIndex.length);
+        }
+        else
+        {
+            CryptoUtils.compress(new BytesRef(packedIndex, 0, packedIndex.length), out, compressor);
+        }
+    }
+
+    private void writeLeafBlockPackedValues(DataOutput out, int[] commonPrefixLengths, int count, int sortedDim, IntFunction<BytesRef> packedValues) throws IOException
+    {
+        if (Arrays.stream(commonPrefixLengths).sum() == packedBytesLength)
+        {
+            // the common prefixes cover the whole value, so all values in this block are equal
+            out.writeByte((byte) -1);
+            return;
+        }
+
+        assert numDims == 1;
+        assert commonPrefixLengths[sortedDim] < bytesPerDim;
+
+        out.writeByte((byte) sortedDim);
+        final int compressedByteOffset = sortedDim * bytesPerDim + commonPrefixLengths[sortedDim];
+        // the run-length coded byte is written once per run, so the per-value suffixes must start after it
+        commonPrefixLengths[sortedDim]++;
+        int i = 0;
+        while (i < count)
+        {
+            // run-length compress the byte at compressedByteOffset; a run covers at most 0xff values
+            final int runEnd = Math.min(i + 0xff, count);
+            final int run = runLen(packedValues, i, runEnd, compressedByteOffset);
+            assert run <= 0xff;
+            final BytesRef head = packedValues.apply(i);
+            out.writeByte(head.bytes[head.offset + compressedByteOffset]);
+            out.writeByte((byte) run);
+            writeLeafBlockPackedValuesRange(out, commonPrefixLengths, i, i + run, packedValues);
+            i += run;
+            assert i <= count;
+        }
+    }
+
+    /**
+     * Return an array that contains the min and max values, compared as unsigned bytes, for the
+     * [offset, offset+length) byte range of the given {@link BytesRef}s.
+     */
+    private static BytesRef[] computeMinMax(int count, IntFunction<BytesRef> packedValues, int offset, int length)
+    {
+        assert length > 0;
+        final BytesRefBuilder lowest = new BytesRefBuilder();
+        final BytesRefBuilder highest = new BytesRefBuilder();
+        final BytesRef seed = packedValues.apply(0);
+        lowest.copyBytes(seed.bytes, seed.offset + offset, length);
+        highest.copyBytes(seed.bytes, seed.offset + offset, length);
+        for (int i = 1; i < count; ++i)
+        {
+            final BytesRef value = packedValues.apply(i);
+            final int from = value.offset + offset;
+            // the maximum is only consulted when the value did not replace the minimum
+            final boolean belowMin = FutureArrays.compareUnsigned(lowest.bytes(), 0, length, value.bytes, from, from + length) > 0;
+            final boolean aboveMax = !belowMin && FutureArrays.compareUnsigned(highest.bytes(), 0, length, value.bytes, from, from + length) < 0;
+            if (belowMin || aboveMax)
+            {
+                (belowMin ? lowest : highest).copyBytes(value.bytes, from, length);
+            }
+        }
+        return new BytesRef[]{ lowest.get(), highest.get() };
+    }
+
+    private void writeLeafBlockPackedValuesRange(DataOutput out, int[] commonPrefixLengths, int start, int end, IntFunction<BytesRef> packedValues) throws IOException
+    {
+        for (int ord = start; ord < end; ++ord)
+        {
+            final BytesRef value = packedValues.apply(ord);
+            assert value.length == packedBytesLength;
+            // for each dimension, skip its first commonPrefixLengths[dim] bytes and write the remainder
+            for (int dim = 0, dimStart = value.offset; dim < numDims; dim++, dimStart += bytesPerDim)
+            {
+                final int shared = commonPrefixLengths[dim];
+                out.writeBytes(value.bytes, dimStart + shared, bytesPerDim - shared);
+            }
+        }
+    }
+
+    private static int runLen(IntFunction<BytesRef> packedValues, int start, int end, int byteOffset)
+    {
+        final BytesRef head = packedValues.apply(start);
+        final byte runByte = head.bytes[head.offset + byteOffset];
+        for (int pos = start + 1; pos < end; ++pos)
+        {
+            final BytesRef next = packedValues.apply(pos);
+            final byte nextByte = next.bytes[next.offset + byteOffset];
+            assert Byte.toUnsignedInt(runByte) <= Byte.toUnsignedInt(nextByte);
+            if (nextByte != runByte)
+            {
+                return pos - start; // the run stops just before the first differing byte
+            }
+        }
+        return end - start;
+    }
+
+    private void writeCommonPrefixes(DataOutput out, int[] commonPrefixes, byte[] packedValue) throws IOException
+    {
+        for (int dim = 0; dim < numDims; dim++)
+        {
+            final int prefixLen = commonPrefixes[dim]; // number of leading bytes of this dimension to emit
+            out.writeVInt(prefixLen);
+            out.writeBytes(packedValue, dim * bytesPerDim, prefixLen);
+        }
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        // Intentionally empty: this method releases nothing.
+    }
+
+    /** Called only in assert */
+    private boolean valueInBounds(BytesRef packedValue, byte[] minPackedValue, byte[] maxPackedValue)
+    {
+        for (int dim = 0; dim < numDims; dim++)
+        {
+            // compare this dimension's slice of the value with the same slice of each bound, as unsigned bytes
+            final int boundFrom = bytesPerDim * dim;
+            final int boundTo = boundFrom + bytesPerDim;
+            final int valueFrom = packedValue.offset + boundFrom;
+            final int valueTo = valueFrom + bytesPerDim;
+            if (FutureArrays.compareUnsigned(packedValue.bytes, valueFrom, valueTo, minPackedValue, boundFrom, boundTo) < 0
+                || FutureArrays.compareUnsigned(packedValue.bytes, valueFrom, valueTo, maxPackedValue, boundFrom, boundTo) > 0)
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    // only called from assert
+    private boolean valuesInOrderAndBounds(int count, int sortedDim, byte[] minPackedValue, byte[] maxPackedValue,
+                                           IntFunction<BytesRef> values, long[] docs, int docsOffset) throws IOException
+    {
+        // valueInOrder copies each value it is given into this buffer, so it always holds the previous value
+        final byte[] previousValue = new byte[packedBytesLength];
+        long previousDoc = -1;
+        for (int ord = 0; ord < count; ord++)
+        {
+            final BytesRef current = values.apply(ord);
+            assert current.length == packedBytesLength;
+            final long currentDoc = docs[docsOffset + ord];
+            assert valueInOrder(ord, sortedDim, previousValue, current.bytes, current.offset, currentDoc, previousDoc);
+            previousDoc = currentDoc;
+            // Make sure this value does in fact fall within this leaf cell:
+            assert valueInBounds(current, minPackedValue, maxPackedValue);
+        }
+        return true;
+    }
+
+    // only called from assert
+    private boolean valueInOrder(long ord, int sortedDim, byte[] lastPackedValue, byte[] packedValue, int packedValueOffset,
+                                 long doc, long lastDoc)
+    {
+        if (ord > 0)
+        {
+            final int from = sortedDim * bytesPerDim;
+            final int cmp = FutureArrays.compareUnsigned(lastPackedValue, from, from + bytesPerDim, packedValue, packedValueOffset + from, packedValueOffset + from + bytesPerDim);
+            if (cmp > 0)
+            {
+                throw new AssertionError("values out of order: last value=" + new BytesRef(lastPackedValue) + " current value=" + new BytesRef(packedValue, packedValueOffset, packedBytesLength) + " ord=" + ord);
+            }
+            else if (cmp == 0 && lastDoc > doc)
+            {
+                throw new AssertionError("docs out of order: last doc=" + lastDoc + " current doc=" + doc + " ord=" + ord);
+            }
+        }
+        System.arraycopy(packedValue, packedValueOffset, lastPackedValue, 0, packedBytesLength); // lastPackedValue now holds this value
+        return true;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedReader.java
new file mode 100644
index 000000000000..17572e597b3c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedReader.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexInputReader;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.IndexInput;
+
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.checkBlockSize;
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.numBlocks;
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.readVLong;
+import static org.apache.lucene.util.BitUtil.zigZagDecode;
+
+/**
+ * Provides non-blocking, random access to a stream written with {@link BlockPackedWriter}.
+ */
+public class BlockPackedReader implements LongArray.Factory
+{
+    private final IndexComponents components;
+    private final FileHandle file;
+    private final int blockShift, blockMask;
+    private final long valueCount;
+    // Per-block metadata, indexed by block number and filled in by the constructor.
+    private final byte[] blockBitsPerValue;
+    private final long[] blockOffsets;
+    private final long[] minValues;
+
+    /** Builds a reader from the metadata that {@code source} holds under the component's name. */
+    public BlockPackedReader(FileHandle file, Component component, IndexComponents components, MetadataSource source) throws IOException
+    {
+        this(file, components, new NumericValuesMeta(source.get(component.name())));
+    }
+
+    @SuppressWarnings("resource")
+    public BlockPackedReader(FileHandle file, IndexComponents components, NumericValuesMeta meta) throws IOException
+    {
+        this.components = components;
+        this.file = file;
+        this.valueCount = meta.valueCount;
+
+        this.blockShift = checkBlockSize(meta.blockSize, AbstractBlockPackedWriter.MIN_BLOCK_SIZE, AbstractBlockPackedWriter.MAX_BLOCK_SIZE);
+        this.blockMask = meta.blockSize - 1;
+
+        final int blockCount = numBlocks(valueCount, meta.blockSize);
+        this.blockBitsPerValue = new byte[blockCount];
+        this.blockOffsets = new long[blockCount];
+        this.minValues = new long[blockCount];
+
+        // Parse every block's metadata up front; the reader used for that is closed on exit.
+        try (final RandomAccessReader reader = this.file.createReader())
+        {
+            final IndexInputReader input = IndexInputReader.create(reader);
+            SAICodecUtils.validate(input);
+            input.seek(meta.blockMetaOffset);
+
+            for (int block = 0; block < blockCount; ++block)
+            {
+                // Per-block record: a token byte, then an optional minimum, then an optional offset.
+                final int token = input.readByte() & 0xFF;
+                final int bitsPerValue = token >>> AbstractBlockPackedWriter.BPV_SHIFT;
+                if (bitsPerValue > 64)
+                {
+                    throw new CorruptIndexException(String.format("Block %d is corrupted. Bits per value should be no more than 64 and is %d.", block, bitsPerValue), input);
+                }
+
+                // A cleared MIN_VALUE_EQUALS_0 flag means an explicit minimum follows. BlockPackedWriter
+                // stores it as zigZagEncode(min) - 1, hence the 1L added back before decoding.
+                final boolean explicitMin = (token & AbstractBlockPackedWriter.MIN_VALUE_EQUALS_0) == 0;
+                minValues[block] = explicitMin ? zigZagDecode(1L + readVLong(input)) : 0L;
+
+                blockBitsPerValue[block] = (byte) bitsPerValue;
+
+                // An offset is only stored for blocks with a non-zero bit width; -1 marks its absence.
+                blockOffsets[block] = bitsPerValue > 0 ? input.readVLong() : -1;
+            }
+        }
+    }
+
+    /** Opens a reader with row id 0 and no query context. */
+    @VisibleForTesting
+    @Override
+    public LongArray open()
+    {
+        return openTokenReader(0, null);
+    }
+
+    /**
+     * Opens a new input over the file and wraps it in a reader backed by the metadata arrays loaded
+     * at construction. Closing the returned reader closes that input. {@code context} is not used here.
+     */
+    @Override
+    @SuppressWarnings("resource")
+    public LongArray openTokenReader(long sstableRowId, SSTableQueryContext context)
+    {
+        final IndexInput indexInput = components.openInput(file);
+        return new AbstractBlockPackedReader(indexInput, blockBitsPerValue, blockShift, blockMask, sstableRowId, valueCount)
+        {
+            @Override
+            protected long blockOffsetAt(int block)
+            {
+                return blockOffsets[block];
+            }
+
+            @Override
+            long delta(int block, int idx)
+            {
+                // every position in a block shares the block's minimum, so idx is ignored
+                return minValues[block];
+            }
+
+            @Override
+            public void close() throws IOException
+            {
+                indexInput.close();
+            }
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedWriter.java
new file mode 100644
index 000000000000..2309d4bb9286
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/BlockPackedWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static org.apache.lucene.util.BitUtil.zigZagEncode;
+
+/**
+ * A writer for large sequences of longs.
+ *
+ * Modified copy of {@link org.apache.lucene.util.packed.BlockPackedWriter} to use {@link DirectWriter}
+ * for optimised reads that don't require seeking through the whole file to open a thread-exclusive reader.
+ */
+public class BlockPackedWriter extends AbstractBlockPackedWriter
+{
+    public BlockPackedWriter(IndexOutput out, int blockSize)
+    {
+        super(out, blockSize);
+    }
+
+    @Override
+    protected void flush() throws IOException
+    {
+        assert off > 0;
+        long lowest = Long.MAX_VALUE, highest = Long.MIN_VALUE;
+        for (int i = 0; i < off; ++i)
+        {
+            lowest = Math.min(lowest, values[i]);
+            highest = Math.max(highest, values[i]);
+        }
+
+        final long range = highest - lowest;
+        final int bitsRequired = range == 0 ? 0 : DirectWriter.unsignedBitsRequired(range); // 0 when all buffered values are equal
+        final boolean minIsZero = lowest == 0;
+        final int token = (bitsRequired << BPV_SHIFT) | (minIsZero ? MIN_VALUE_EQUALS_0 : 0);
+        blockMetaWriter.writeByte((byte) token);
+
+        if (!minIsZero)
+        {
+            // TODO: the min values can be delta encoded since they are read linearly
+            // TODO: buffer the min values so they may be written as a single block
+            writeVLong(blockMetaWriter, zigZagEncode(lowest) - 1);
+        }
+
+        if (bitsRequired > 0)
+        {
+            if (!minIsZero)
+            {
+                for (int i = 0; i < off; ++i)
+                {
+                    values[i] -= lowest; // store each value as its distance from the block minimum
+                }
+            }
+            blockMetaWriter.writeVLong(out.getFilePointer());
+            writeValues(off, bitsRequired);
+        }
+
+        off = 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/DirectReaders.java b/src/java/org/apache/cassandra/index/sai/disk/v1/DirectReaders.java
new file mode 100644
index 000000000000..420348f62ab5
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/DirectReaders.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.RandomAccessInput;
+
+class DirectReaders // one stateless decoder per supported bit width, selected via getReaderForBitsPerValue
+{
+    interface Reader // decodes a single value from a packed run
+    {
+        long get(RandomAccessInput in, long offset, long index); // offset: byte position where the run starts; index: ordinal of the value within the run
+    }
+
+    static Reader getReaderForBitsPerValue(byte bitsPerValue) // only the widths listed below are supported; any other throws IllegalArgumentException
+    {
+        switch (bitsPerValue)
+        {
+            case 0:
+                return READER_0;
+            case 1:
+                return READER_1;
+            case 2:
+                return READER_2;
+            case 4:
+                return READER_4;
+            case 8:
+                return READER_8;
+            case 12:
+                return READER_12;
+            case 16:
+                return READER_16;
+            case 20:
+                return READER_20;
+            case 24:
+                return READER_24;
+            case 28:
+                return READER_28;
+            case 32:
+                return READER_32;
+            case 40:
+                return READER_40;
+            case 48:
+                return READER_48;
+            case 56:
+                return READER_56;
+            case 64:
+                return READER_64;
+            default:
+                throw new IllegalArgumentException("unsupported bitsPerValue: " + bitsPerValue);
+        }
+    }
+
+    private static final Reader READER_0 = (in, offset, index) -> 0; // always 0; the input is never touched
+
+    private static final Reader READER_1 = (in, offset, index) -> { // eight values per byte; the lowest index sits in the highest bit
+        try
+        {
+            int shift = 7 - (int) (index & 7);
+            return (in.readByte(offset + (index >>> 3)) >>> shift) & 0x1;
+        }
+        catch (IOException e) // Reader.get declares no checked exception, so I/O failures are rethrown unchecked
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_2 = (in, offset, index) -> { // four values per byte; the lowest index sits in the top two bits
+        try
+        {
+            int shift = (3 - (int) (index & 3)) << 1;
+            return (in.readByte(offset + (index >>> 2)) >>> shift) & 0x3;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_4 = (in, offset, index) -> { // two values per byte; even indexes use the high nibble (shift 4), odd the low
+        try
+        {
+            int shift = (int) ((index + 1) & 1) << 2;
+            return (in.readByte(offset + (index >>> 1)) >>> shift) & 0xF;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_8 = (in, offset, index) -> { // one byte per value, masked to an unsigned 0..255
+        try
+        {
+            return in.readByte(offset + index) & 0xFF;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_12 = (in, offset, index) -> { // reads a short at byte (index * 12) / 8; even indexes take its upper 12 bits, odd its lower 12
+        try
+        {
+            long o = (index * 12) >>> 3;
+            int shift = (int) ((index + 1) & 1) << 2;
+            return (in.readShort(offset + o) >>> shift) & 0xFFF;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_16 = (in, offset, index) -> { // one short per value, masked to an unsigned 16-bit result
+        try
+        {
+            return in.readShort(offset + (index << 1)) & 0xFFFF;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_20 = (in, offset, index) -> { // reads an int at byte (index * 20) / 8 and keeps 20 of its upper 24 bits
+        try
+        {
+            long o = (index * 20) >>> 3;
+            int v = in.readInt(offset + o) >>> 8; // drop the low byte, leaving a 24-bit window
+            int shift = (int) ((index + 1) & 1) << 2; // even indexes use the upper 20 bits of the window, odd the lower 20
+            return (v >>> shift) & 0xFFFFF;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_24 = (in, offset, index) -> { // reads an int at byte index * 3 and discards its low 8 bits
+        try
+        {
+            return in.readInt(offset + index * 3) >>> 8; // NOTE(review): reads 1 byte past the value; presumably the writer pads the data - confirm
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_28 = (in, offset, index) -> { // reads an int at byte (index * 28) / 8; even indexes take its upper 28 bits, odd its lower 28
+        try
+        {
+            long o = (index * 28) >>> 3;
+            int shift = (int) ((index + 1) & 1) << 2;
+            return (in.readInt(offset + o) >>> shift) & 0xFFFFFFFL;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_32 = (in, offset, index) -> { // one int per value, masked to an unsigned 32-bit result
+        try
+        {
+            return in.readInt(offset + (index << 2)) & 0xFFFFFFFFL;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_40 = (in, offset, index) -> { // reads a long at byte index * 5 and keeps its upper 40 bits
+        try
+        {
+            return in.readLong(offset + index * 5) >>> 24; // NOTE(review): reads 3 bytes past the value; presumably the writer pads the data - confirm
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_48 = (in, offset, index) -> { // reads a long at byte index * 6 and keeps its upper 48 bits
+        try
+        {
+            return in.readLong(offset + index * 6) >>> 16;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_56 = (in, offset, index) -> { // reads a long at byte index * 7 and keeps its upper 56 bits
+        try
+        {
+            return in.readLong(offset + index * 7) >>> 8;
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+
+    private static final Reader READER_64 = (in, offset, index) -> { // one long per value, returned as read (no masking)
+        try
+        {
+            return in.readLong(offset + (index << 3));
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    };
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/FilteringPostingList.java b/src/java/org/apache/cassandra/index/sai/disk/v1/FilteringPostingList.java
new file mode 100644
index 000000000000..cabf733dcd5d
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/FilteringPostingList.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import com.google.common.base.Preconditions;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.lucene.util.FixedBitSet;
+
+
+/**
+ * A wrapper that iterates over a delegate {@link PostingList}, filtering out postings at
+ * positions that are not present in a provided filter.
+ */
+public class FilteringPostingList implements PostingList
+{
+    private final FixedBitSet filter;
+    private final OrdinalPostingList delegate;
+    // number of set bits in the filter, counted once in the constructor
+    private final int cardinality;
+    // filter index that the next posting pulled by nextPosting() will be tested against
+    private int position = 0;
+
+    /**
+     * @param filter   bit set consulted once per delegate posting; only postings whose bit is set are returned
+     * @param delegate the posting list being filtered
+     * @throws IllegalArgumentException if {@code filter} has no bits set
+     */
+    FilteringPostingList(FixedBitSet filter, OrdinalPostingList delegate)
+    {
+        cardinality = filter.cardinality();
+        Preconditions.checkArgument(cardinality > 0, "Filter must contain at least one match.");
+
+        this.filter = filter;
+        this.delegate = delegate;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        delegate.close();
+    }
+
+    /**
+     * Pulls postings from the delegate until one lands on a position whose filter bit is set.
+     *
+     * @return the segment row ID of the next match, or {@link PostingList#END_OF_STREAM} once the delegate is exhausted
+     */
+    @Override
+    public long nextPosting() throws IOException
+    {
+        long segmentRowId = delegate.nextPosting();
+        // position moves forward once per posting tested; END_OF_STREAM is never tested
+        while (segmentRowId != PostingList.END_OF_STREAM && !filter.get(position++))
+        {
+            segmentRowId = delegate.nextPosting();
+        }
+        return segmentRowId;
+    }
+
+    /** @return the number of set bits in the filter, as counted at construction */
+    @Override
+    public long size()
+    {
+        return cardinality;
+    }
+
+    /**
+     * Advances the delegate to {@code targetRowID} and returns the first posting from there on that passes the filter.
+     *
+     * @return the matching segment row ID, or {@link PostingList#END_OF_STREAM} if the delegate runs out first
+     */
+    @Override
+    public long advance(long targetRowID) throws IOException
+    {
+        final long segmentRowId = delegate.advance(targetRowID);
+        if (segmentRowId != PostingList.END_OF_STREAM)
+        {
+            // these are always for leaf kdtree postings so the max is 1024
+            position = (int) delegate.getOrdinal();
+
+            // The posting just read is tested at position - 1: return it if it passes the filter,
+            // otherwise fall through to the next match.
+            return filter.get(position - 1) ? segmentRowId : nextPosting();
+        }
+
+        return PostingList.END_OF_STREAM;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexWriter.java
new file mode 100644
index 000000000000..3d444244e3c4
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/InvertedIndexWriter.java
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.commons.lang3.mutable.MutableLong;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.TermsIterator;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Builds an on-disk inverted index structure: terms dictionary and postings lists.
+ */
+@NotThreadSafe
+public class InvertedIndexWriter implements Closeable
+{
+    private final TrieTermsDictionaryWriter termsDictionaryWriter;
+    private final PostingsWriter postingsWriter;
+    private long postingsAdded;
+
+    public InvertedIndexWriter(IndexComponents indexComponents, boolean segmented) throws IOException
+    {
+        this.termsDictionaryWriter = new TrieTermsDictionaryWriter(indexComponents, segmented);
+        this.postingsWriter = new PostingsWriter(indexComponents, segmented);
+    }
+
+    /**
+     * Appends a set of terms and associated postings to their respective overall SSTable component files.
+     * A term is only added to the dictionary when the postings writer returns a non-negative offset for it.
+     * @param terms an iterator of terms with their associated postings
+     * @return metadata describing the location of this inverted index in the overall SSTable
+     *         terms and postings component files
+     * @throws IOException if reading the postings or writing either component fails
+     */
+    public SegmentMetadata.ComponentMetadataMap writeAll(TermsIterator terms) throws IOException
+    {
+        // Terms and postings writers are opened in append mode with pointers at the end of their respective files.
+        final long termsOffset = termsDictionaryWriter.getStartOffset();
+        final long postingsOffset = postingsWriter.getStartOffset();
+
+        while (terms.hasNext())
+        {
+            ByteComparable term = terms.next();
+            try (PostingList postings = terms.postings())
+            {
+                final long offset = postingsWriter.write(postings);
+                if (offset >= 0)
+                    termsDictionaryWriter.add(term, offset);
+            }
+        }
+        postingsAdded = postingsWriter.getTotalPostings();
+        MutableLong footerPointer = new MutableLong();
+        long termsRoot = termsDictionaryWriter.complete(footerPointer);
+        postingsWriter.complete();
+        // Both writers are complete, so the lengths below cover everything they appended.
+        final long termsLength = termsDictionaryWriter.getFilePointer() - termsOffset;
+        final long postingsLength = postingsWriter.getFilePointer() - postingsOffset;
+
+        SegmentMetadata.ComponentMetadataMap components = new SegmentMetadata.ComponentMetadataMap();
+
+        Map<String,String> termsAttributes = new HashMap<>(2);
+        termsAttributes.put(SAICodecUtils.FOOTER_POINTER, String.valueOf(footerPointer.getValue()));
+
+        // Postings list file pointers are stored directly in TERMS_DATA, so a root is not needed.
+        components.put(IndexComponents.NDIType.POSTING_LISTS, -1, postingsOffset, postingsLength);
+        components.put(IndexComponents.NDIType.TERMS_DATA, termsRoot, termsOffset, termsLength, termsAttributes);
+
+        return components;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        // Close the terms writer even when closing the postings writer throws, so neither handle leaks.
+        try { postingsWriter.close(); } finally { termsDictionaryWriter.close(); }
+    }
+
+    /**
+     * @return total number of row IDs added to posting lists; only set once {@link #writeAll} has run
+     */
+    public long getPostingsCount()
+    {
+        return postingsAdded;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/LeafOrderMap.java b/src/java/org/apache/cassandra/index/sai/disk/v1/LeafOrderMap.java
new file mode 100644
index 000000000000..a5ca47809c50
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/LeafOrderMap.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.RandomAccessInput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+public class LeafOrderMap
+{
+    public static int getValue(RandomAccessInput in, long offset, int index, DirectReaders.Reader reader)
+    {
+        return Math.toIntExact(reader.get(in, offset, index)); // ArithmeticException if the value overflows an int
+    }
+    /** Packs the first {@code length} entries of {@code array} using the unsigned bit width required for {@code maxValue}. */
+    public static void write(final int[] array, int length, int maxValue, final DataOutput out) throws IOException
+    {
+        final int bitsPerValue = DirectWriter.unsignedBitsRequired(maxValue);
+        final DirectWriter packedWriter = DirectWriter.getInstance(out, length, bitsPerValue);
+        for (int idx = 0; idx < length; idx++)
+        {
+            final int value = array[idx];
+            assert value <= maxValue;
+            packedWriter.add(value);
+        }
+        packedWriter.finish();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MergePostingList.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MergePostingList.java
new file mode 100644
index 000000000000..a7acc55f574c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MergePostingList.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.PriorityQueue;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.io.util.FileUtils;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+/**
+ * Merges multiple {@link PostingList} which individually contain unique items into a single list.
+ */
+@NotThreadSafe
+public class MergePostingList implements PostingList
+{
+    final PriorityQueue<PeekablePostingList> postingLists;
+    final List<PeekablePostingList> temp;
+    final Closeable onClose;
+    final long size;
+    private long lastRowId = -1;
+
+    private MergePostingList(PriorityQueue<PeekablePostingList> postingLists, Closeable onClose)
+    {
+        this.postingLists = postingLists;
+        this.onClose = onClose;
+        this.temp = new ArrayList<>(postingLists.size());
+
+        // The reported size is the plain sum of the inputs' sizes; no de-duplication is attempted here.
+        long total = 0;
+        for (PostingList list : postingLists)
+            total += list.size();
+        this.size = total;
+    }
+    /** A single list is returned as-is, in which case {@code onClose} is never invoked by the result. */
+    public static PostingList merge(PriorityQueue<PeekablePostingList> postings, Closeable onClose)
+    {
+        checkArgument(!postings.isEmpty());
+        return postings.size() == 1 ? postings.poll() : new MergePostingList(postings, onClose);
+    }
+
+    public static PostingList merge(PriorityQueue<PeekablePostingList> postings)
+    {
+        return merge(postings, () -> FileUtils.close(postings));
+    }
+
+    @SuppressWarnings("resource")
+    @Override
+    public long nextPosting() throws IOException
+    {
+        while (!postingLists.isEmpty())
+        {
+            PeekablePostingList head = postingLists.poll();
+            long candidate = head.nextPosting();
+
+            // An exhausted list is dropped by simply not re-queueing it.
+            if (candidate == END_OF_STREAM)
+                continue;
+
+            // A list whose posting is at or beyond the last emitted row ID goes back into the queue.
+            if (candidate >= lastRowId)
+                postingLists.add(head);
+
+            // A posting equal to the last emitted row ID is a duplicate and is not emitted again.
+            if (candidate > lastRowId)
+            {
+                lastRowId = candidate;
+                return candidate;
+            }
+        }
+
+        return PostingList.END_OF_STREAM;
+    }
+
+    @SuppressWarnings("resource")
+    @Override
+    public long advance(long targetRowID) throws IOException
+    {
+        temp.clear();
+
+        // Drain the queue, advancing every list; those that still have postings are parked in temp.
+        for (PeekablePostingList list = postingLists.poll(); list != null; list = postingLists.poll())
+        {
+            list.advanceWithoutConsuming(targetRowID);
+            if (list.peek() != PostingList.END_OF_STREAM)
+                temp.add(list);
+        }
+        // Re-queue the survivors now that their heads may have changed.
+        postingLists.addAll(temp);
+        return nextPosting();
+    }
+
+    @Override
+    public long size()
+    {
+        return size;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        onClose.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataSource.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataSource.java
new file mode 100644
index 000000000000..766c90865f1c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataSource.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.index.sai.disk.format.Version;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.lucene.store.BufferedChecksumIndexInput;
+import org.apache.lucene.store.ByteArrayIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+
+@NotThreadSafe
+public class MetadataSource
+{
+    private final Version version;
+    private final Map<String, BytesRef> components;
+
+    private MetadataSource(Version version, Map<String, BytesRef> components)
+    {
+        this.components = components;
+        this.version = version;
+    }
+
+    public static MetadataSource loadGroupMetadata(IndexComponents components) throws IOException
+    {
+        return load(components.openBlockingInput(IndexComponents.GROUP_META));
+    }
+
+    public static MetadataSource loadColumnMetadata(IndexComponents components) throws IOException
+    {
+        return load(components.openBlockingInput(components.meta));
+    }
+
+    private static MetadataSource load(IndexInput indexInput) throws IOException
+    {
+        final Map<String, BytesRef> loaded = new HashMap<>();
+        Version version;
+
+        try (BufferedChecksumIndexInput input = new BufferedChecksumIndexInput(indexInput))
+        {
+            version = SAICodecUtils.checkHeader(input);
+            final int componentCount = input.readInt();
+
+            for (int i = 0; i < componentCount; i++)
+            {
+                if (input.getFilePointer() == input.length())
+                {
+                    // we should never get here, because we always add footer to the file
+                    throw new IllegalStateException("Unexpected EOF in " + input);
+                }
+
+                // Each entry is laid out as: name, payload length, payload bytes.
+                final String name = input.readString();
+                final byte[] payload = new byte[input.readInt()];
+                input.readBytes(payload, 0, payload.length);
+
+                loaded.put(name, new BytesRef(payload));
+            }
+
+            SAICodecUtils.checkFooter(input);
+        }
+
+        return new MetadataSource(version, loaded);
+    }
+
+    public IndexInput get(String name)
+    {
+        final BytesRef stored = components.get(name);
+
+        // load() wraps whole arrays, so the backing array is exactly the component payload.
+        if (stored != null)
+            return new ByteArrayIndexInput(name, stored.bytes);
+
+        final String message = String.format("Could not find component '%s'. Available properties are %s.",
+                                             name, components.keySet());
+        throw new IllegalArgumentException(message);
+    }
+
+    public Version getVersion()
+    {
+        return version;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataWriter.java
new file mode 100644
index 000000000000..76434fae1637
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MetadataWriter.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+
+@NotThreadSafe
+public class MetadataWriter implements Closeable
+{
+    private final IndexOutput output;
+    private final Map<String, BytesRef> map = new HashMap<>();
+
+    public MetadataWriter(IndexOutput output)
+    {
+        this.output = output;
+    }
+    /** Returns an in-memory output whose content is registered with this writer when the builder is closed. */
+    public Builder builder(String name)
+    {
+        return new Builder(name);
+    }
+
+    public class Builder extends RAMIndexOutput implements Closeable
+    {
+        private Builder(String name)
+        {
+            super(name);
+        }
+
+        @Override
+        public void close()
+        {
+            // Only records the buffered bytes in the enclosing writer's map; the file itself is written by finish().
+            map.put(getName(), new BytesRef(out.getBytes(), 0, out.getPosition()));
+        }
+    }
+    /** Writes the header, the entry count, each entry as (name, length, bytes), and finally the footer. */
+    private void finish() throws IOException
+    {
+        SAICodecUtils.writeHeader(output);
+        output.writeInt(map.size());
+        for (Map.Entry<String, BytesRef> entry : map.entrySet())
+        {
+            output.writeString(entry.getKey());
+            output.writeInt(entry.getValue().length);
+            output.writeBytes(entry.getValue().bytes, entry.getValue().offset, entry.getValue().length);
+        }
+        SAICodecUtils.writeFooter(output);
+    }
+    /** Writes every registered entry to the output, then closes the output even if writing fails. */
+    @Override
+    public void close() throws IOException
+    {
+        try
+        {
+            finish();
+        }
+        finally
+        {
+            output.close();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedReader.java
new file mode 100644
index 000000000000..cbbe18e9c692
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedReader.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexInputReader;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.checkBlockSize;
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.numBlocks;
+
+/**
+ * Provides non-blocking, random access to a stream written with {@link MonotonicBlockPackedWriter}.
+ */
+public class MonotonicBlockPackedReader implements LongArray.Factory
+{
+    private final IndexComponents components;
+    private final FileHandle file;
+    private final int blockShift, blockMask;
+    private final long valueCount;
+    private final byte[] blockBitsPerValue; // one entry per block
+    private final PackedLongValues blockOffsets; // one entry per block; -1 when the block stores no packed values
+    private final PackedLongValues minValues; // per-block origin passed to expected()
+    private final float[] averages; // per-block slope passed to expected()
+
+    public MonotonicBlockPackedReader(FileHandle file, Component component, IndexComponents components, MetadataSource source) throws IOException
+    {
+        this(file, components, new NumericValuesMeta(source.get(component.name())));
+    }
+
+    @SuppressWarnings("resource")
+    public MonotonicBlockPackedReader(FileHandle file, IndexComponents components, NumericValuesMeta meta) throws IOException
+    {
+        this.components = components;
+        this.valueCount = meta.valueCount;
+        blockShift = checkBlockSize(meta.blockSize, AbstractBlockPackedWriter.MIN_BLOCK_SIZE, AbstractBlockPackedWriter.MAX_BLOCK_SIZE);
+        blockMask = meta.blockSize - 1; // presumably blockSize is a power of two, making this a bit mask -- confirm
+        int numBlocks = numBlocks(valueCount, meta.blockSize);
+        PackedLongValues.Builder minValuesBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+        PackedLongValues.Builder blockOffsetsBuilder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+        averages = new float[numBlocks];
+        blockBitsPerValue = new byte[numBlocks];
+        this.file = file;
+        // Block metadata is parsed once, here; the temporary reader is closed as soon as parsing is done.
+        try (final RandomAccessReader reader = this.file.createReader())
+        {
+            final IndexInputReader in = IndexInputReader.create(reader);
+            SAICodecUtils.validate(in);
+            // Per block: min value (zlong), average (float bits), bits per value (vint) and, if non-zero, offset (vlong).
+            in.seek(meta.blockMetaOffset);
+            for (int i = 0; i < numBlocks; ++i)
+            {
+                minValuesBuilder.add(in.readZLong());
+                averages[i] = Float.intBitsToFloat(in.readInt());
+                final int bitsPerValue = in.readVInt();
+                if (bitsPerValue > 64)
+                {
+                    throw new CorruptIndexException(String.format("Block %d is corrupted. Bits per value should be no more than 64 and is %d.", i, bitsPerValue), in);
+                }
+                blockBitsPerValue[i] = (byte) bitsPerValue;
+                // when bitsPerValue is 0, block offset won't be used
+                blockOffsetsBuilder.add(bitsPerValue == 0 ? -1 : in.readVLong());
+            }
+        }
+
+        blockOffsets = blockOffsetsBuilder.build();
+        minValues = minValuesBuilder.build();
+    }
+    /** Opens a new input over the file; the returned array reads this factory's already-parsed block metadata. */
+    @Override
+    @SuppressWarnings("resource")
+    public LongArray open()
+    {
+        final IndexInput indexInput = components.openInput(file);
+        return new AbstractBlockPackedReader(indexInput, blockBitsPerValue, blockShift, blockMask, 0, valueCount)
+        {
+            @Override
+            long delta(int block, int idx)
+            {
+                return expected(minValues.get(block), averages[block], idx);
+            }
+
+            @Override
+            public void close() throws IOException
+            {
+                indexInput.close();
+            }
+
+            @Override
+            protected long blockOffsetAt(int block)
+            {
+                return blockOffsets.get(block);
+            }
+
+            @Override
+            public long findTokenRowID(long targetValue)
+            {
+               throw new UnsupportedOperationException();
+            }
+        };
+    }
+    /** Linear estimate of the value at {@code index}: {@code origin + (long) (average * index)}. */
+    public static long expected(long origin, float average, int index)
+    {
+        return origin + (long) (average * (long) index);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedWriter.java
new file mode 100644
index 000000000000..268daaffbabd
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/MonotonicBlockPackedWriter.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+/**
+ * A writer for large monotonically increasing sequences of positive longs.
+ *
+ * Modified copy of {@link org.apache.lucene.util.packed.MonotonicBlockPackedWriter} to use {@link DirectWriter} for
+ * optimised reads that don't require seeking through the whole file to open a thread-exclusive reader.
+ */
+public class MonotonicBlockPackedWriter extends AbstractBlockPackedWriter
+{
+    public MonotonicBlockPackedWriter(IndexOutput out, int blockSize)
+    {
+        super(out, blockSize);
+    }
+
+    @Override
+    public void add(long l) throws IOException
+    {
+        assert l >= 0;
+        super.add(l);
+    }
+    /** Encodes the buffered values as non-negative deltas from a per-block straight line, then writes the block. */
+    @Override
+    protected void flush() throws IOException
+    {
+        assert off > 0;
+        // Slope of the straight line through the first and last buffered value (0 when there is a single value).
+        final float avg = off == 1 ? 0f : (float) (values[off - 1] - values[0]) / (off - 1);
+        long min = values[0];
+        // adjust min so that all deltas will be positive
+        for (int i = 1; i < off; ++i)
+        {
+            final long actual = values[i];
+            final long expected = MonotonicBlockPackedReader.expected(min, avg, i);
+            if (expected > actual)
+            {
+                min -= (expected - actual);
+            }
+        }
+        // Replace each value with its distance from the line, tracking the largest delta seen.
+        long maxDelta = 0;
+        for (int i = 0; i < off; ++i)
+        {
+            values[i] = values[i] - MonotonicBlockPackedReader.expected(min, avg, i);
+            maxDelta = Math.max(maxDelta, values[i]);
+        }
+        // Block metadata: min, average (as float bits), bits per value and, only when non-zero, the data offset.
+        blockMetaWriter.writeZLong(min);
+        blockMetaWriter.writeInt(Float.floatToIntBits(avg));
+        if (maxDelta == 0)
+        {
+            blockMetaWriter.writeVInt(0);
+        }
+        else
+        {
+            final int bitsRequired = DirectWriter.bitsRequired(maxDelta);
+            blockMetaWriter.writeVInt(bitsRequired);
+            blockMetaWriter.writeVLong(out.getFilePointer());
+            writeValues(off, bitsRequired);
+        }
+        // Start the next block with an empty buffer.
+        off = 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
new file mode 100644
index 000000000000..2928e75a89c9
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.base.MoreObjects;
+
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.MutableOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+import static com.google.common.base.Preconditions.checkArgument;
+
+
+/**
+ * Specialized writer for 1-dim point values, that builds them into a BKD tree with auxiliary posting lists on eligible
+ * tree levels.
+ *
+ * Given sorted input {@link MutablePointValues}, 1-dim case allows to optimise flush process, because we don't need to
+ * buffer all point values to sort them.
+ */
+public class NumericIndexWriter implements Closeable
+{
+    public static final int MAX_POINTS_IN_LEAF_NODE = BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE;
+    private final BKDWriter writer;
+    private final IndexComponents indexComponents;
+    private final int bytesPerDim;
+    private final boolean segmented;
+
+    private final IndexWriterConfig config;
+
+    /**
+     * @param maxSegmentRowId maximum possible segment row ID, used to create `maxDoc` for kd-tree
+     * @param numRows must be greater than number of added rowIds, only used for validation.
+     */
+    public NumericIndexWriter(IndexComponents indexComponents, int bytesPerDim, long maxSegmentRowId, long numRows, IndexWriterConfig config, boolean segmented) throws IOException
+    {
+        this(indexComponents, MAX_POINTS_IN_LEAF_NODE, bytesPerDim, maxSegmentRowId, numRows, config, segmented);
+    }
+    /** As above, with an explicit maximum number of points per leaf node; both numeric bounds must be non-negative. */
+    public NumericIndexWriter(IndexComponents indexComponents, int maxPointsInLeafNode, int bytesPerDim, long maxSegmentRowId, long numRows, IndexWriterConfig config, boolean segmented) throws IOException
+    {
+        checkArgument(maxSegmentRowId >= 0,
+                      "[%s] maxRowId must be non-negative value, but got %s",
+                      config.getIndexName(), maxSegmentRowId);
+        // Guava only substitutes %s placeholders, so the index name must be introduced with "[%s]".
+        checkArgument(numRows >= 0,
+                      "[%s] numRows must be non-negative value, but got %s",
+                      config.getIndexName(), numRows);
+
+        this.indexComponents = indexComponents;
+        this.bytesPerDim = bytesPerDim;
+        this.config = config;
+        this.writer = new BKDWriter(maxSegmentRowId + 1,
+                                    1,
+                                    bytesPerDim,
+                                    maxPointsInLeafNode,
+                                    BKDWriter.DEFAULT_MAX_MB_SORT_IN_HEAP,
+                                    numRows,
+                                    true, null);
+        this.segmented = segmented;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        IOUtils.close(writer);
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("bytesPerDim", bytesPerDim)
+                          .add("bufferedPoints", writer.getPointCount())
+                          .toString();
+    }
+    /** Collects the row IDs of every leaf, in the order the leaves are reported, for the later postings pass. */
+    public static class LeafCallback implements BKDWriter.OneDimensionBKDWriterCallback
+    {
+        final List<PackedLongValues> postings = new ArrayList<>();
+
+        public int numLeaves()
+        {
+            return postings.size();
+        }
+
+        @Override
+        public void writeLeafDocs(int leafNum, BKDWriter.RowIDAndIndex[] sortedByRowID, int offset, int count)
+        {
+            final PackedLongValues.Builder builder = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
+            // NOTE(review): the bound is "count", not "offset + count"; right only if offset is always 0 -- confirm.
+            for (int i = offset; i < count; ++i)
+            {
+                builder.add(sortedByRowID[i].rowID);
+            }
+            postings.add(builder.build());
+        }
+    }
+
+    /**
+     * Writes a k-d tree and posting lists from a {@link MutablePointValues}.
+     *
+     * @param values points to write
+     *
+     * @return metadata describing the location and size of this kd-tree in the overall SSTable kd-tree component file
+     */
+    public SegmentMetadata.ComponentMetadataMap writeAll(MutableOneDimPointValues values) throws IOException
+    {
+        long bkdPosition;
+        final SegmentMetadata.ComponentMetadataMap components = new SegmentMetadata.ComponentMetadataMap();
+
+        final LeafCallback leafCallback = new LeafCallback();
+
+        try (IndexOutput bkdOutput = indexComponents.createOutput(indexComponents.kdTree, true, segmented))
+        {
+            // The SSTable kd-tree component file is opened in append mode, so our offset is the current file pointer.
+            final long bkdOffset = bkdOutput.getFilePointer();
+
+            bkdPosition = writer.writeField(bkdOutput, values, leafCallback);
+
+            final long bkdLength = bkdOutput.getFilePointer() - bkdOffset;
+
+            Map<String, String> attributes = new LinkedHashMap<>();
+            attributes.put("max_points_in_leaf_node", Integer.toString(writer.maxPointsInLeafNode));
+            attributes.put("num_leaves", Integer.toString(leafCallback.numLeaves()));
+            attributes.put("num_points", Long.toString(writer.pointCount));
+            attributes.put("bytes_per_dim", Long.toString(writer.bytesPerDim));
+            attributes.put("num_dims", Long.toString(writer.numDims));
+
+            components.put(IndexComponents.NDIType.KD_TREE, bkdPosition, bkdOffset, bkdLength, attributes);
+        }
+        // Re-open the kd-tree just written and traverse it so the postings writer can use the collected leaf postings.
+        try (TraversingBKDReader reader = new TraversingBKDReader(indexComponents, indexComponents.createFileHandle(indexComponents.kdTree, segmented), bkdPosition);
+             IndexOutput postingsOutput = indexComponents.createOutput(indexComponents.kdTreePostingLists, true, segmented))
+        {
+            final long postingsOffset = postingsOutput.getFilePointer();
+
+            final OneDimBKDPostingsWriter postingsWriter = new OneDimBKDPostingsWriter(leafCallback.postings, config, indexComponents);
+            reader.traverse(postingsWriter);
+
+            // The kd-tree postings writer already writes its own header & footer.
+            final long postingsPosition = postingsWriter.finish(postingsOutput);
+
+            Map<String, String> attributes = new LinkedHashMap<>();
+            attributes.put("num_leaf_postings", Integer.toString(postingsWriter.numLeafPostings));
+            attributes.put("num_non_leaf_postings", Integer.toString(postingsWriter.numNonLeafPostings));
+
+            long postingsLength = postingsOutput.getFilePointer() - postingsOffset;
+            components.put(IndexComponents.NDIType.KD_TREE_POSTING_LISTS, postingsPosition, postingsOffset, postingsLength, attributes);
+        }
+
+        return components;
+    }
+
+    /**
+     * @return number of points added
+     */
+    public long getPointCount()
+    {
+        return writer.getPointCount();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesMeta.java b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesMeta.java
new file mode 100644
index 000000000000..a27047955861
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesMeta.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+public class NumericValuesMeta // immutable (valueCount, blockSize, blockMetaOffset) triple plus its matching read/write code
+{
+    final long valueCount; // serialized first, via writeLong/readLong
+    final int blockSize; // serialized second, via writeInt/readInt
+    final long blockMetaOffset; // serialized last, via writeVLong/readVLong
+
+    NumericValuesMeta(IndexInput input) throws IOException // reads the fields in the same order and encodings that write() emits
+    {
+        valueCount = input.readLong();
+        blockSize = input.readInt();
+        blockMetaOffset = input.readVLong();
+    }
+
+    public NumericValuesMeta(long valueCount, int blockSize, long blockMetaOffset)
+    {
+        this.valueCount = valueCount;
+        this.blockSize = blockSize;
+        this.blockMetaOffset = blockMetaOffset;
+    }
+
+    public void write(IndexOutput out) throws IOException // mirror image of the IndexInput constructor; keep the two in step
+    {
+        out.writeLong(valueCount);
+        out.writeInt(blockSize);
+        out.writeVLong(blockMetaOffset);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesWriter.java
new file mode 100644
index 000000000000..7da8888474ff
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericValuesWriter.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.lucene.store.IndexOutput;
+
+
+public class NumericValuesWriter implements Closeable // writes header, block-packed longs and footer, then a NumericValuesMeta entry on close()
+{
+    public static final int MONOTONIC_BLOCK_SIZE = 16384; // default block size when monotonic is true
+    public static final int BLOCK_SIZE = 128; // default block size when monotonic is false
+
+    private final IndexOutput output; // data output; closed by close()
+    private final AbstractBlockPackedWriter writer; // monotonic or plain block-packed writer wrapping output
+    private final MetadataWriter metadataWriter;
+    private final Component component; // its name keys the metadata entry written in close()
+    private final int blockSize;
+    private long count = 0; // number of add() calls so far; stored as valueCount in the metadata
+
+    public NumericValuesWriter(IndexComponents.IndexComponent component,
+                               IndexOutput indexOutput,
+                               MetadataWriter metadataWriter,
+                               boolean monotonic) throws IOException
+    {
+        this(component, indexOutput, metadataWriter, monotonic, monotonic ? MONOTONIC_BLOCK_SIZE : BLOCK_SIZE);
+    }
+
+    NumericValuesWriter(IndexComponents.IndexComponent component,
+                        IndexComponents indexComponents,
+                        MetadataWriter metadataWriter,
+                        boolean monotonic,
+                        int blockSize) throws IOException
+    {
+        this(component, indexComponents.createOutput(component), metadataWriter, monotonic, blockSize); // opens the output itself
+    }
+
+    private NumericValuesWriter(Component component,
+                                IndexOutput indexOutput,
+                                MetadataWriter metadataWriter,
+                                boolean monotonic, int blockSize) throws IOException
+    {
+        SAICodecUtils.writeHeader(indexOutput); // header goes out eagerly, before any value is added
+        this.writer = monotonic ? new MonotonicBlockPackedWriter(indexOutput, blockSize)
+                                : new BlockPackedWriter(indexOutput, blockSize);
+        this.output = indexOutput;
+        this.component = component;
+        this.metadataWriter = metadataWriter;
+        this.blockSize = blockSize;
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        try (IndexOutput o = metadataWriter.builder(component.name))
+        {
+            final long fp = writer.finish(); // value returned by finish() is recorded as blockMetaOffset
+            SAICodecUtils.writeFooter(output);
+
+            NumericValuesMeta meta = new NumericValuesMeta(count, blockSize, fp);
+            meta.write(o);
+        }
+        finally
+        {
+            output.close(); // always release the data output, even if finishing or writing the metadata failed
+        }
+    }
+
+    public void add(long value) throws IOException
+    {
+        writer.add(value);
+        count++;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriter.java
new file mode 100644
index 000000000000..57a94d739bf6
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriter.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Stopwatch;
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Multimap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.agrona.collections.IntArrayList;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+
+/**
+ * Writes auxiliary posting lists for bkd tree nodes. If a node has a posting list attached, it will contain every
+ * row id from all leaves reachable from that node.
+ * <p>
+ * The writer is stateful, because it first needs to collect data from the bkd index data structure (see
+ * {@link #onLeaf}) to find the set of eligible nodes and the leaf nodes reachable from them, and only then
+ * writes the merged posting lists (see {@link #finish}).
+ * <p>
+ * This is an optimised writer for 1-dim points, where we know that leaf blocks are written in value order (in this
+ * order we pass them to the {@link BKDWriter}). That allows us to skip reading the leaves, instead just order leaf
+ * blocks by their offset in the index file, and correlate them with buffered posting lists. We can't make this
+ * assumption for multi-dim case.
+ */
+public class OneDimBKDPostingsWriter implements TraversingBKDReader.IndexTreeTraversalCallback
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final List<PackedLongValues> postings; // buffered posting lists, paired with leaves by ascending file offset in finish()
+    private final TreeMap<Long, Integer> leafOffsetToNodeID = new TreeMap<>(Long::compareTo); // leaf block file pointer -> leaf node id
+    private final Multimap<Integer, Integer> nodeToChildLeaves = HashMultimap.create(); // eligible ancestor node id -> leaf node ids under it
+
+    private final IndexWriterConfig config;
+    private final IndexComponents components;
+    int numNonLeafPostings = 0; // incremented in finish() for every node that has recorded child leaves
+    int numLeafPostings = 0; // incremented in finish() for every node without recorded child leaves
+
+    OneDimBKDPostingsWriter(List<PackedLongValues> postings, IndexWriterConfig config, IndexComponents indexComponents)
+    {
+        this.postings = postings;
+        this.config = config;
+        this.components = indexComponents;
+    }
+
+    @Override
+    public void onLeaf(int leafNodeID, long leafBlockFP, IntArrayList pathToRoot)
+    {
+        checkArgument(!pathToRoot.containsInt(leafNodeID));
+        checkArgument(pathToRoot.isEmpty() || leafNodeID > pathToRoot.get(pathToRoot.size() - 1));
+
+        leafOffsetToNodeID.put(leafBlockFP, leafNodeID);
+        for (int i = 0; i < pathToRoot.size(); i++)
+        {
+            final int level = i + 1; // pathToRoot index 0 counts as level 1
+            if (isLevelEligibleForPostingList(level))
+            {
+                final int nodeID = pathToRoot.get(i);
+                nodeToChildLeaves.put(nodeID, leafNodeID);
+            }
+        }
+    }
+
+    @SuppressWarnings("resource")
+    public long finish(IndexOutput out) throws IOException
+    {
+        checkState(postings.size() == leafOffsetToNodeID.size(),
+                   "Expected equal number of postings lists (%s) and leaf offsets (%s).",
+                   postings.size(), leafOffsetToNodeID.size());
+
+        final PostingsWriter postingsWriter = new PostingsWriter(out);
+
+        final Iterator<PackedLongValues> postingsIterator = postings.iterator();
+        final Map<Integer, PackedLongValues> leafToPostings = new HashMap<>();
+        leafOffsetToNodeID.forEach((fp, nodeID) -> leafToPostings.put(nodeID, postingsIterator.next())); // i-th leaf by offset gets the i-th posting list
+
+        final long postingsRamBytesUsed = postings.stream()
+                                                  .mapToLong(PackedLongValues::ramBytesUsed)
+                                                  .sum();
+
+        final List<Integer> internalNodeIDs =
+                nodeToChildLeaves.keySet()
+                                 .stream()
+                                 .filter(i -> nodeToChildLeaves.get(i).size() >= config.getBkdPostingsMinLeaves()) // drop nodes covering too few leaves
+                                 .collect(Collectors.toList());
+
+        final Collection<Integer> leafNodeIDs = leafOffsetToNodeID.values();
+
+        logger.debug(components.logMessage("Writing posting lists for {} internal and {} leaf kd-tree nodes. Leaf postings memory usage: {}."),
+                     internalNodeIDs.size(), leafNodeIDs.size(), FBUtilities.prettyPrintMemory(postingsRamBytesUsed));
+
+        final long startFP = out.getFilePointer();
+        final Stopwatch flushTime = Stopwatch.createStarted();
+        final TreeMap<Integer, Long> nodeIDToPostingsFilePointer = new TreeMap<>();
+        for (int nodeID : Iterables.concat(internalNodeIDs, leafNodeIDs))
+        {
+            Collection<Integer> leaves = nodeToChildLeaves.get(nodeID);
+
+            if (leaves.size() == 0) // no recorded children: the node's posting list is just its own
+            {
+                leaves = Collections.singletonList(nodeID);
+                numLeafPostings++;
+            }
+            else
+            {
+                numNonLeafPostings++;
+            }
+
+            final PriorityQueue<PostingList.PeekablePostingList> postingLists = new PriorityQueue<>(100, Comparator.comparingLong(PostingList.PeekablePostingList::peek));
+            for (Integer leaf : leaves)
+                postingLists.add(new PackedLongsPostingList(leafToPostings.get(leaf)).peekable());
+
+            final PostingList mergedPostingList = MergePostingList.merge(postingLists);
+            final long postingFilePosition = postingsWriter.write(mergedPostingList);
+            // During compaction we could end up with an empty postings due to deletions.
+            // The writer will return a fp of -1 if no postings were written.
+            if (postingFilePosition >= 0)
+                nodeIDToPostingsFilePointer.put(nodeID, postingFilePosition);
+        }
+        flushTime.stop();
+        logger.debug(components.logMessage("Flushed {} of posting lists for kd-tree nodes in {} ms."),
+                     FBUtilities.prettyPrintMemory(out.getFilePointer() - startFP),
+                     flushTime.elapsed(TimeUnit.MILLISECONDS));
+
+
+        final long indexFilePointer = out.getFilePointer(); // where the node id -> posting list file pointer map begins
+        writeMap(nodeIDToPostingsFilePointer, out);
+        postingsWriter.complete();
+        return indexFilePointer; // position of the map, not of the first posting list
+    }
+
+    private boolean isLevelEligibleForPostingList(int level)
+    {
+        return level > 1 && level % config.getBkdPostingsSkip() == 0; // never level 1; otherwise only multiples of the configured skip
+    }
+
+    private void writeMap(Map<Integer, Long> map, IndexOutput out) throws IOException
+    {
+        out.writeVInt(map.size()); // layout: vint size, then one (vint key, vlong value) pair per entry in iteration order
+
+        for (Map.Entry<Integer, Long> e : map.entrySet())
+        {
+            out.writeVInt(e.getKey());
+            out.writeVLong(e.getValue());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/OrdinalPostingList.java b/src/java/org/apache/cassandra/index/sai/disk/v1/OrdinalPostingList.java
new file mode 100644
index 000000000000..6e0fb035bf3b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/OrdinalPostingList.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+
+public interface OrdinalPostingList extends PostingList // a PostingList that can also report the ordinal of its next posting
+{
+    /**
+     * Reports the current position of this posting list as an ordinal.
+     * @return the ordinal of the posting that will be returned on the next call to {@link #nextPosting()}
+     */
+    long getOrdinal();
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/PackedLongsPostingList.java b/src/java/org/apache/cassandra/index/sai/disk/v1/PackedLongsPostingList.java
new file mode 100644
index 000000000000..61f9ae9cf450
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/PackedLongsPostingList.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+/**
+ * Adapter exposing {@link PackedLongValues} as a forward-only {@link PostingList}; {@link #advance} is unsupported.
+ */
+public class PackedLongsPostingList implements PostingList
+{
+    private final PackedLongValues.Iterator iterator; // single forward cursor over values, created once in the constructor
+    private final PackedLongValues values; // retained only to answer size()
+
+    PackedLongsPostingList(PackedLongValues values)
+    {
+        this.values = values;
+        iterator = values.iterator();
+    }
+
+    @Override
+    public long nextPosting()
+    {
+        if (iterator.hasNext())
+        {
+            return iterator.next();
+        }
+        else
+        {
+            return PostingList.END_OF_STREAM; // returned on this and every later call once the iterator is exhausted
+        }
+    }
+
+    @Override
+    public long size()
+    {
+        return values.size(); // total number of values, independent of how many were already consumed
+    }
+
+    @Override
+    public long advance(long targetRowID) throws IOException
+    {
+        throw new UnsupportedOperationException(); // skipping is not implemented by this adapter
+    }
+}
+
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
new file mode 100644
index 000000000000..5d97000d59fc
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+
+import java.io.IOException;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.SeekingRandomAccessInput;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.RandomAccessInput;
+
+
+/**
+ * Reads, decompresses and decodes postings lists written by {@link PostingsWriter}.
+ * <p>
+ * Holds exactly one postings block in memory at a time. {@link #advance(long)} does a binary search over the skip
+ * table (the per-block maximum values) to find the postings block to load.
+ */
+@NotThreadSafe
+public class PostingsReader implements OrdinalPostingList
+{
+    protected final IndexInput input;
+    private final int blockSize;
+    private final long numPostings;
+    private final LongArray blockOffsets; // file pointer of each block, indexed by block number (see reBuffer)
+    private final LongArray blockMaxValues; // skip table searched by binarySearchBlock, indexed by block number
+    private final SeekingRandomAccessInput seekingInput;
+    private final QueryEventListener.PostingListEventListener listener;
+
+    // TODO: Expose more things through the summary, now that it's an actual field?
+    private final BlocksSummary summary;
+
+    private int postingsBlockIdx; // index of the NEXT block to load; the block currently buffered is postingsBlockIdx - 1
+    private int blockIdx; // position of the next posting within the current block
+    private long totalPostingsRead; // postings consumed so far; only an upper bound after a skip (see lastPosInBlock)
+    private long actualSegmentRowId; // last row ID returned; values decoded from a block are added to it (see peekNext)
+
+    private long currentPosition; // file pointer just past the current block's bits-per-value byte
+    private DirectReaders.Reader currentFORValues; // null when the current block's bits-per-value byte is 0
+
+    @VisibleForTesting
+    PostingsReader(IndexInput input, long summaryOffset, QueryEventListener.PostingListEventListener listener) throws IOException
+    {
+        this(input, new BlocksSummary(input, summaryOffset, () -> {}), listener); // no-op closer: close() below closes the input itself
+    }
+
+    @VisibleForTesting
+    public PostingsReader(IndexInput input, BlocksSummary summary, QueryEventListener.PostingListEventListener listener) throws IOException
+    {
+        this.input = input;
+        this.seekingInput = new SeekingRandomAccessInput(input);
+        this.blockOffsets = summary.offsets;
+        this.blockSize = summary.blockSize;
+        this.numPostings = summary.numPostings;
+        this.blockMaxValues = summary.maxValues;
+        this.listener = listener;
+
+        this.summary = summary;
+
+        reBuffer(); // eagerly position on the first block
+    }
+
+    @Override
+    public long getOrdinal()
+    {
+        return totalPostingsRead;
+    }
+
+    interface InputCloser
+    {
+        void close() throws IOException;
+    }
+
+    @VisibleForTesting
+    public static class BlocksSummary
+    {
+        final int blockSize;
+        final int numPostings;
+        final LongArray offsets;
+        final LongArray maxValues;
+
+        private final InputCloser runOnClose;
+
+        @VisibleForTesting
+        public BlocksSummary(IndexInput input, long offset) throws IOException
+        {
+            this(input, offset, input::close); // closing this summary closes the input
+        }
+
+        BlocksSummary(IndexInput input, long offset, InputCloser runOnClose) throws IOException
+        {
+            this.runOnClose = runOnClose;
+
+            input.seek(offset);
+            this.blockSize = input.readVInt();
+            //TODO This may need to change, because we can potentially end up with more than Integer.MAX_VALUE postings?
+            this.numPostings = input.readVInt();
+
+            final SeekingRandomAccessInput randomAccessInput = new SeekingRandomAccessInput(input);
+            final int numBlocks = input.readVInt();
+            final long maxBlockValuesLength = input.readVLong();
+            final long maxBlockValuesOffset = input.getFilePointer() + maxBlockValuesLength; // where the max-values section is read from below
+
+            final byte offsetBitsPerValue = input.readByte();
+            if (offsetBitsPerValue > 64)
+            {
+                throw new CorruptIndexException(
+                        String.format("Postings list header is corrupted: Bits per value for block offsets must be no more than 64 and is %d.", offsetBitsPerValue), input);
+            }
+            this.offsets = new LongArrayReader(randomAccessInput, DirectReaders.getReaderForBitsPerValue(offsetBitsPerValue), input.getFilePointer(), numBlocks);
+
+            input.seek(maxBlockValuesOffset);
+            final byte valuesBitsPerValue = input.readByte();
+            if (valuesBitsPerValue > 64)
+            {
+                throw new CorruptIndexException(
+                        String.format("Postings list header is corrupted: Bits per value for values samples must be no more than 64 and is %d.", valuesBitsPerValue), input);
+            }
+            this.maxValues = new LongArrayReader(randomAccessInput, DirectReaders.getReaderForBitsPerValue(valuesBitsPerValue), input.getFilePointer(), numBlocks);
+        }
+
+        void close() throws IOException
+        {
+            runOnClose.close();
+        }
+
+        private static class LongArrayReader implements LongArray // read-only LongArray view decoded lazily from the input on each get()
+        {
+            private final RandomAccessInput input;
+            private final DirectReaders.Reader reader;
+            private final long offset;
+            private final int length;
+
+            private LongArrayReader(RandomAccessInput input, DirectReaders.Reader reader, long offset, int length)
+            {
+                this.input = input;
+                this.reader = reader;
+                this.offset = offset;
+                this.length = length;
+            }
+
+            @Override
+            public long findTokenRowID(long value)
+            {
+                throw new UnsupportedOperationException();
+            }
+
+            @Override
+            public long get(long idx)
+            {
+                return reader.get(input, offset, idx);
+            }
+
+            @Override
+            public long length()
+            {
+                return length;
+            }
+        }
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        try
+        {
+            input.close();
+        }
+        finally
+        {
+            summary.close(); // runs the summary's closer even if closing the input threw
+        }
+    }
+
+    @Override
+    public long size()
+    {
+        return numPostings;
+    }
+
+    /**
+     * Advances to the first row ID beyond the current that is greater than or equal to the
+     * target, and returns that row ID. Exhausts the iterator and returns {@link #END_OF_STREAM} if
+     * the target is greater than the highest row ID.
+     *
+     * Does binary search over the skip table to find the next block to load into memory.
+     *
+     * Note: Callers must use the return value of this method before calling {@link #nextPosting()}, as calling
+     * that method will return the next posting, not the one to which we have just advanced.
+     *
+     * @param targetRowID target row ID to advance to
+     *
+     * @return first segment row ID which is >= the target row ID or {@link PostingList#END_OF_STREAM} if one does not exist
+     */
+    @Override
+    public long advance(long targetRowID) throws IOException
+    {
+        listener.onAdvance();
+        int block = binarySearchBlock(targetRowID);
+
+        if (block < 0)
+        {
+            block = -block - 1; // not found: decode the insertion point returned by binarySearchBlock
+        }
+
+        if (postingsBlockIdx == block + 1)
+        {
+            // we're in the same block, just iterate through
+            return slowAdvance(targetRowID);
+        }
+        assert block > 0; // lastPosInBlock(block - 1) below needs a preceding block
+        // Even if there was an exact match, block might contain duplicates.
+        // We iterate to the target token from the beginning.
+        lastPosInBlock(block - 1);
+        return slowAdvance(targetRowID);
+    }
+
+    private long slowAdvance(long targetRowID) throws IOException
+    {
+        while (totalPostingsRead < numPostings)
+        {
+            long segmentRowId = peekNext();
+
+            advanceOnePosition(segmentRowId); // consume the posting before testing it, so a match is not returned again
+
+            if (segmentRowId >= targetRowID)
+            {
+                return segmentRowId;
+            }
+        }
+        return END_OF_STREAM;
+    }
+
+    private int binarySearchBlock(long targetRowID)
+    {
+        int low = postingsBlockIdx - 1; // start from the currently buffered block; earlier blocks are never revisited
+        int high = Math.toIntExact(blockMaxValues.length()) - 1;
+
+        // in current block
+        if (low <= high && targetRowID <= blockMaxValues.get(low))
+            return low;
+
+        while (low <= high)
+        {
+            int mid = low + ((high - low) >> 1) ;
+
+            long midVal = blockMaxValues.get(mid);
+
+            if (midVal < targetRowID)
+            {
+                low = mid + 1;
+            }
+            else if (midVal > targetRowID)
+            {
+                high = mid - 1;
+            }
+            else
+            {
+                // target found, but we need to check for duplicates
+                if (mid > 0 && blockMaxValues.get(mid - 1L) == targetRowID)
+                {
+                    // there are duplicates, pivot left
+                    high = mid - 1;
+                }
+                else
+                {
+                    // no duplicates
+                    return mid;
+                }
+            }
+        }
+        return -(low + 1);  // target not found; insertion point encoded as -(insertionPoint + 1)
+    }
+
+    private void lastPosInBlock(int block)
+    {
+        // blockMaxValues is integer only
+        actualSegmentRowId = blockMaxValues.get(block);
+        //upper bound, since we might've advanced to the last block, but upper bound is enough
+        totalPostingsRead += (blockSize - blockIdx) + (block - postingsBlockIdx + 1) * blockSize;
+
+        postingsBlockIdx = block + 1;
+        blockIdx = blockSize; // marks the block as exhausted, so the next peekNext() calls reBuffer()
+    }
+
+    @Override
+    public long nextPosting() throws IOException
+    {
+        final long next = peekNext();
+        if (next != END_OF_STREAM)
+        {
+            advanceOnePosition(next);
+        }
+        return next;
+    }
+
+    @VisibleForTesting
+    int getBlockSize()
+    {
+        return blockSize;
+    }
+
+    private long peekNext() throws IOException
+    {
+        if (totalPostingsRead >= numPostings)
+        {
+            return END_OF_STREAM;
+        }
+        if (blockIdx == blockSize)
+        {
+            reBuffer();
+        }
+
+        return actualSegmentRowId + nextRowID(); // decoded value is added to the previously returned row ID
+    }
+
+    private int nextRowID()
+    {
+        // currentFORValues is null when all the values in the block are the same
+        if (currentFORValues == null)
+        {
+            return 0;
+        }
+        else
+        {
+            final long id = currentFORValues.get(seekingInput, currentPosition, blockIdx);
+            listener.onPostingDecoded();
+            return Math.toIntExact(id);
+        }
+    }
+
+    private void advanceOnePosition(long nextRowID)
+    {
+        actualSegmentRowId = nextRowID;
+        totalPostingsRead++;
+        blockIdx++;
+    }
+
+    private void reBuffer() throws IOException
+    {
+        final long pointer = blockOffsets.get(postingsBlockIdx);
+
+        input.seek(pointer);
+
+        final long left = numPostings - totalPostingsRead;
+        assert left > 0; // must not be called once every posting has been consumed
+
+        readFoRBlock(input);
+
+        postingsBlockIdx++;
+        blockIdx = 0;
+    }
+
+    private void readFoRBlock(IndexInput in) throws IOException
+    {
+        final byte bitsPerValue = in.readByte();
+
+        currentPosition = in.getFilePointer();
+
+        if (bitsPerValue == 0)
+        {
+            // currentFORValues is null when all the values in the block are the same
+            currentFORValues = null;
+            return;
+        }
+        else if (bitsPerValue > 64)
+        {
+            throw new CorruptIndexException(
+                    String.format("Postings list #%s block is corrupted. Bits per value should be no more than 64 and is %d.", postingsBlockIdx, bitsPerValue), input);
+        }
+        currentFORValues = DirectReaders.getReaderForBitsPerValue(bitsPerValue);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
new file mode 100644
index 000000000000..2bfe5eca33ee
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
@@ -0,0 +1,313 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+
+import java.io.Closeable;
+import java.io.IOException;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.agrona.collections.IntArrayList;
+import org.agrona.collections.LongArrayList;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static java.lang.Math.max;
+import static org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat.BLOCK_SIZE;
+
+/**
+ * Encodes, compresses and writes postings lists to disk.
+ *
+ * All row IDs in the posting list are delta encoded, then deltas are divided into blocks for compression.
+ * <p>
+ * In packed blocks, longs are encoded with the same bit width (FoR compression). The block size (i.e. number of
+ * longs inside block) is fixed (currently 128). Additionally blocks that are all the same value are encoded in an
+ * optimized way.
+ * </p>
+ * <p>
+ * The last block of a posting list may hold fewer values than the block size; it is written as a packed block too,
+ * sized to the number of values actually buffered.
+ * </p>
+ *
+ * <p>
+ * Packed blocks are used throughout by {@link PostingsWriter}. Take a term with 259 row IDs and a block size of 128
+ * as an example: the first 256 IDs are encoded as two full packed blocks, the remaining 3 as one shorter packed block.
+ * </p>
+ * <p>
+ * Each posting list ends with a meta section and a skip table, which are written right after all postings blocks. The
+ * skip interval is the same as the block size, and each skip entry points to the start of its block. The skip table
+ * consists of the block offsets and the last row ID of each block, compressed as two FoR blocks.
+ * </p>
+ *
+ * Visual representation of the disk format:
+ * <pre>
+ *
+ * +========+========================+=====+==============+===============+============+=====+========================+========+
+ * | HEADER | POSTINGS LIST (TERM 1)                                                   | ... | POSTINGS LIST (TERM N) | FOOTER |
+ * +========+========================+=====+==============+===============+============+=====+========================+========+
+ *          | FOR BLOCK (1)          | ... | FOR BLOCK (N)| BLOCK SUMMARY              |
+ *          +------------------------+-----+--------------+---------------+------------+
+ *                                                        | BLOCK SIZE    |            |
+ *                                                        | LIST SIZE     | SKIP TABLE |
+ *                                                        +---------------+------------+
+ *                                                                        | BLOCKS POS.|
+ *                                                                        | MAX VALUES |
+ *                                                                        +------------+
+ *
+ *  </pre>
+ */
+@NotThreadSafe
+//TODO Review this for DSP-19608
+public class PostingsWriter implements Closeable
+{
+    private final static String POSTINGS_MUST_BE_SORTED_ERROR_MSG = "Postings must be sorted ascending, got [%s] after [%s]";
+
+    private final IndexOutput dataOutput;
+    private final int blockSize;
+    private final long[] deltaBuffer;
+    private final LongArrayList blockOffsets = new LongArrayList();
+    private final LongArrayList blockMaxIDs = new LongArrayList();
+    private final RAMIndexOutput inMemoryOutput = new RAMIndexOutput("blockOffsets");
+
+    private final long startOffset;
+
+    private int bufferUpto;
+    private long lastSegmentRowId;
+    private long maxDelta;
+    private long totalPostings;
+
+    @VisibleForTesting
+    PostingsWriter(IndexComponents components, boolean segmented) throws IOException
+    {
+        this(components, BLOCK_SIZE, segmented);
+    }
+
+    PostingsWriter(IndexOutput dataOutput) throws IOException
+    {
+        this(dataOutput, BLOCK_SIZE);
+    }
+
+    PostingsWriter(IndexComponents components, int blockSize, boolean segmented) throws IOException
+    {
+        this(components.createOutput(components.postingLists, true, segmented), blockSize);
+    }
+
+    private PostingsWriter(IndexOutput dataOutput, int blockSize) throws IOException
+    {
+        this.blockSize = blockSize;
+        this.dataOutput = dataOutput;
+        startOffset = dataOutput.getFilePointer();
+        deltaBuffer = new long[blockSize];
+        SAICodecUtils.writeHeader(dataOutput);
+    }
+
+    /**
+     * @return current file pointer
+     */
+    public long getFilePointer()
+    {
+        return dataOutput.getFilePointer();
+    }
+
+    /**
+     * @return file pointer where index structure begins
+     */
+    public long getStartOffset()
+    {
+        return startOffset;
+    }
+
+    /**
+     * write footer to the postings
+     */
+    public void complete() throws IOException
+    {
+        SAICodecUtils.writeFooter(dataOutput);
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        dataOutput.close();
+    }
+
+    /**
+     * Delta-encodes the given posting list, writes it out block by block and appends its summary.
+     *
+     * @param postings posting list to write; must be non-null and report a positive size
+     * @return file offset of this posting list's summary block, or -1 if the list yielded no postings
+     * @throws IllegalArgumentException if {@code postings} is null, reports a non-positive size or is not sorted
+     */
+    public long write(PostingList postings) throws IOException
+    {
+        checkArgument(postings != null, "Expected non-null posting list.");
+        checkArgument(postings.size() > 0, "Expected non-empty posting list.");
+
+        resetBlockCounters();
+        blockOffsets.clear();
+        blockMaxIDs.clear();
+
+        // For merged posting lists size() is only an upper bound,
+        // so the postings actually written are counted here.
+        int written = 0;
+        for (long rowId = postings.nextPosting(); rowId != PostingList.END_OF_STREAM; rowId = postings.nextPosting())
+        {
+            writePosting(rowId);
+            written++;
+            totalPostings++;
+        }
+
+        if (written == 0)
+            return -1;
+
+        // Flush the trailing partial block, then record where the summary starts.
+        finish();
+        final long summaryOffset = dataOutput.getFilePointer();
+        writeSummary(written);
+        return summaryOffset;
+    }
+
+    public long getTotalPostings()
+    {
+        return totalPostings;
+    }
+
+    private void writePosting(long segmentRowId) throws IOException
+    {
+        if (!(segmentRowId >= lastSegmentRowId || lastSegmentRowId == 0))
+            throw new IllegalArgumentException(String.format(POSTINGS_MUST_BE_SORTED_ERROR_MSG, segmentRowId, lastSegmentRowId));
+        // Buffer the delta to the previous row ID and track the largest delta of the current block.
+        final long delta = segmentRowId - lastSegmentRowId;
+        maxDelta = max(maxDelta, delta);
+        deltaBuffer[bufferUpto++] = delta;
+        // A full buffer is flushed as one block; its skip entry records this row ID as the block maximum.
+        if (bufferUpto == blockSize)
+        {
+            addBlockToSkipTable(segmentRowId);
+            writePostingsBlock(maxDelta, bufferUpto);
+            resetBlockCounters();
+        }
+        lastSegmentRowId = segmentRowId; // after a block flush this replaces the 0 stored by resetBlockCounters()
+    }
+
+    private void finish() throws IOException
+    {
+        // Nothing is buffered, so there is no partial block to flush.
+        if (bufferUpto <= 0)
+            return;
+
+        addBlockToSkipTable(lastSegmentRowId);
+        writePostingsBlock(maxDelta, bufferUpto);
+    }
+
+    private void resetBlockCounters()
+    {
+        bufferUpto = 0;       // no deltas buffered
+        lastSegmentRowId = 0; // baseline for the first delta of a list; writePosting() overwrites it after a block flush
+        maxDelta = 0;         // largest delta seen in the block being filled
+    }
+
+    private void addBlockToSkipTable(long maxSegmentRowID)
+    {
+        blockOffsets.add(dataOutput.getFilePointer()); // current output position; callers invoke this before writing the block
+        blockMaxIDs.add(maxSegmentRowID);              // last row ID of that block
+    }
+
+    private void writeSummary(int exactSize) throws IOException
+    {
+        dataOutput.writeVInt(blockSize); // block size this writer was created with
+        dataOutput.writeVInt(exactSize); // number of postings counted by the caller for this list
+        writeSkipTable();
+    }
+
+    private void writeSkipTable() throws IOException
+    {
+        assert blockOffsets.size() == blockMaxIDs.size();
+        dataOutput.writeVInt(blockOffsets.size());
+        // The offsets are compressed in memory first, so that their exact encoded length (with padding)
+        // can be written ahead of them.
+        inMemoryOutput.reset();
+        writeSortedFoRBlock(blockOffsets, inMemoryOutput);
+        dataOutput.writeVLong(inMemoryOutput.getFilePointer());
+        inMemoryOutput.writeTo(dataOutput);
+        // The per-block maximum row IDs follow directly, without a length prefix.
+        writeSortedFoRBlock(blockMaxIDs, dataOutput);
+    }
+
+    private void writePostingsBlock(long maxValue, int blockSize) throws IOException
+    {
+        // maxValue == 0 is encoded as bit width 0: only the header byte is written, no packed values.
+        final int bitsPerValue = maxValue == 0 ? 0 : DirectWriter.unsignedBitsRequired(maxValue);
+        assert bitsPerValue < Byte.MAX_VALUE;
+        dataOutput.writeByte((byte) bitsPerValue);
+
+        if (bitsPerValue <= 0)
+            return;
+
+        // blockSize is the parameter here (number of buffered values), not the field it shadows.
+        final DirectWriter packer = DirectWriter.getInstance(dataOutput, blockSize, bitsPerValue);
+        for (int idx = 0; idx < blockSize; idx++)
+            packer.add(deltaBuffer[idx]);
+
+        packer.finish();
+    }
+
+    private void writeSortedFoRBlock(LongArrayList values, IndexOutput output) throws IOException
+    {
+        // Precondition first: reading the last element of an empty list would throw before the assertion is reached.
+        assert values.size() > 0;
+        final long maxValue = values.getLong(values.size() - 1); // relies on ascending order: the last value is taken as the maximum
+        final int bitsPerValue = maxValue == 0 ? 0 : DirectWriter.unsignedBitsRequired(maxValue);
+        output.writeByte((byte) bitsPerValue);
+        if (bitsPerValue > 0)
+        {
+            final DirectWriter writer = DirectWriter.getInstance(output, values.size(), bitsPerValue);
+            for (int i = 0; i < values.size(); ++i)
+            {
+                writer.add(values.getLong(i));
+            }
+            writer.finish();
+        }
+    }
+
+    private void writeSortedFoRBlock(IntArrayList values, IndexOutput output) throws IOException
+    {
+        // Precondition first: reading the last element of an empty list would throw before the assertion is reached.
+        assert values.size() > 0;
+        final int maxValue = values.getInt(values.size() - 1); // relies on ascending order: the last value is taken as the maximum
+        final int bitsPerValue = maxValue == 0 ? 0 : DirectWriter.unsignedBitsRequired(maxValue);
+        output.writeByte((byte) bitsPerValue);
+        if (bitsPerValue > 0)
+        {
+            final DirectWriter writer = DirectWriter.getInstance(output, values.size(), bitsPerValue);
+            for (int i = 0; i < values.size(); ++i)
+            {
+                writer.add(values.getInt(i));
+            }
+            writer.finish();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/TermsReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/TermsReader.java
new file mode 100644
index 000000000000..70e0abe5052a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/TermsReader.java
@@ -0,0 +1,306 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.TermsIterator;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.utils.AbortedOperationException;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+import org.apache.lucene.store.IndexInput;
+
+import static org.apache.cassandra.index.sai.utils.SAICodecUtils.validate;
+
+/**
+ * Synchronous reader of terms dictionary and postings lists to produce a {@link PostingList} with matching row ids.
+ *
+ * {@link #exactMatch(ByteComparable, QueryEventListener.TrieIndexEventListener, QueryContext)} does:
+ * <ul>
+ * <li>{@link TermQuery#lookupTermDictionary(ByteComparable)}: does term dictionary lookup to find the posting list file
+ * position</li>
+ * <li>{@link TermQuery#getPostingReader(long)}: reads the posting list block summary and initializes the posting
+ * reader, which reads the first block of the posting list into memory</li>
+ * </ul>
+ */
+public class TermsReader implements Closeable
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    private final IndexComponents indexComponents;
+    private final FileHandle termDictionaryFile;
+    private final FileHandle postingsFile;
+    private final long termDictionaryRoot;
+
+    public TermsReader(IndexComponents components, FileHandle termsData, FileHandle postingLists,
+                       long root, long termsFooterPointer) throws IOException
+    {
+        this.indexComponents = components;
+        termDictionaryFile = termsData;
+        postingsFile = postingLists;
+        termDictionaryRoot = root;
+
+        try (final IndexInput indexInput = indexComponents.openInput(termDictionaryFile))
+        {
+            // if the pointer is -1 then this is a previous version of the index
+            // use the old way to validate the footer
+            // the footer pointer is used due to encrypted indexes padding extra bytes
+            if (termsFooterPointer == -1)
+            {
+                validate(indexInput);
+            }
+            else
+            {
+                validate(indexInput, termsFooterPointer);
+            }
+        }
+
+        try (final IndexInput indexInput = indexComponents.openInput(postingsFile))
+        {
+            validate(indexInput);
+        }
+    }
+
+    public static int openPerIndexFiles()
+    {
+        // terms and postings
+        return 2;
+    }
+
+    @Override
+    public void close()
+    {
+        try
+        {
+            termDictionaryFile.close();
+        }
+        finally
+        {
+            postingsFile.close();
+        }
+    }
+
+    public TermsIterator allTerms(long segmentOffset, QueryEventListener.TrieIndexEventListener listener)
+    {
+        // blocking, since we use it only for segment merging for now
+        return new TermsScanner(segmentOffset, listener);
+    }
+
+    public PostingList exactMatch(ByteComparable term, QueryEventListener.TrieIndexEventListener perQueryEventListener, QueryContext context)
+    {
+        perQueryEventListener.onSegmentHit();
+        return new TermQuery(term, perQueryEventListener, context).execute();
+    }
+
+    @VisibleForTesting
+    public class TermQuery
+    {
+        private final IndexInput postingsInput;
+        private final IndexInput postingsSummaryInput;
+        private final QueryEventListener.TrieIndexEventListener listener;
+        private final long lookupStartTime;
+        private final QueryContext context;
+
+        private ByteComparable term;
+
+        TermQuery(ByteComparable term, QueryEventListener.TrieIndexEventListener listener, QueryContext context)
+        {
+            this.listener = listener;
+            postingsInput = indexComponents.openInput(postingsFile);
+            postingsSummaryInput = indexComponents.openInput(postingsFile);
+            this.term = term;
+            lookupStartTime = System.nanoTime();
+            this.context = context;
+        }
+
+        public PostingList execute()
+        {
+            try
+            {
+                final long postingOffset = lookupTermDictionary(term);
+                if (postingOffset != PostingList.OFFSET_NOT_FOUND)
+                {
+                    context.checkpoint();
+                    // when posting is found, resources will be closed when posting reader is closed.
+                    return getPostingReader(postingOffset);
+                }
+
+                // the term was not found: no posting reader will take over the inputs, so release them now
+                FileUtils.closeQuietly(postingsInput);
+                FileUtils.closeQuietly(postingsSummaryInput);
+                return null;
+            }
+            catch (Throwable e)
+            {
+                //TODO Is there an equivalent of AOE in OS?
+                if (!(e instanceof AbortedOperationException))
+                    logger.error(indexComponents.logMessage("Failed to execute term query"), e);
+
+                closeOnException();
+                throw Throwables.cleaned(e);
+            }
+        }
+
+        private void closeOnException()
+        {
+            FileUtils.closeQuietly(postingsInput);
+            FileUtils.closeQuietly(postingsSummaryInput);
+        }
+
+        public long lookupTermDictionary(ByteComparable term)
+        {
+            try (TrieTermsDictionaryReader reader = new TrieTermsDictionaryReader(termDictionaryFile.instantiateRebufferer(), termDictionaryRoot))
+            {
+                final long found = reader.exactMatch(term);
+
+                // the traversal time is measured from the creation of this query, not from this call
+                final long elapsedNanos = System.nanoTime() - lookupStartTime;
+                listener.onTraversalComplete(elapsedNanos, TimeUnit.NANOSECONDS);
+
+                // translate the dictionary's "not found" marker into the posting list one
+                return found == TrieTermsDictionaryReader.NOT_FOUND ? PostingList.OFFSET_NOT_FOUND : found;
+            }
+        }
+
+        public PostingsReader getPostingReader(long offset) throws IOException
+        {
+            PostingsReader.BlocksSummary header = new PostingsReader.BlocksSummary(postingsSummaryInput, offset);
+            // the reader gets postingsInput, a separate input from the one used for the summary
+            return new PostingsReader(postingsInput, header, listener.postingListEventListener());
+        }
+    }
+
+    // currently only used for testing
+    private class TermsScanner implements TermsIterator
+    {
+        private final long segmentOffset;
+        private final QueryEventListener.TrieIndexEventListener listener;
+        private final TrieTermsDictionaryReader termsDictionaryReader;
+        private final Iterator<Pair<ByteComparable, Long>> iterator;
+        private final ByteBuffer minTerm, maxTerm;
+        private Pair<ByteComparable, Long> entry;
+
+        private TermsScanner(long segmentOffset, QueryEventListener.TrieIndexEventListener listener)
+        {
+            this.termsDictionaryReader = new TrieTermsDictionaryReader(termDictionaryFile.instantiateRebufferer(), termDictionaryRoot);
+
+            this.minTerm = ByteBuffer.wrap(ByteSourceInverse.readBytes(termsDictionaryReader.getMinTerm().asComparableBytes(ByteComparable.Version.OSS41)));
+            this.maxTerm = ByteBuffer.wrap(ByteSourceInverse.readBytes(termsDictionaryReader.getMaxTerm().asComparableBytes(ByteComparable.Version.OSS41)));
+            this.iterator = termsDictionaryReader.iterator();
+            this.listener = listener;
+            this.segmentOffset = segmentOffset;
+        }
+
+        @Override
+        @SuppressWarnings("resource")
+        public PostingList postings() throws IOException
+        {
+            assert entry != null; // entry is only set by next(), so a term must have been returned first
+            final IndexInput input = indexComponents.openInput(postingsFile); // NOTE(review): not closed here if building the readers below throws
+            return new OffsetPostingList(segmentOffset, new PostingsReader(input, new PostingsReader.BlocksSummary(input, entry.right), listener.postingListEventListener()));
+        }
+
+        @Override
+        public void close()
+        {
+            termsDictionaryReader.close();
+        }
+
+        @Override
+        public ByteBuffer getMinTerm()
+        {
+            return minTerm;
+        }
+
+        @Override
+        public ByteBuffer getMaxTerm()
+        {
+            return maxTerm;
+        }
+
+        @Override
+        public ByteComparable next()
+        {
+            // null signals exhaustion; entry then keeps the last pair that was returned
+            if (!iterator.hasNext())
+                return null;
+
+            entry = iterator.next();
+            return entry.left;
+        }
+
+        @Override
+        public boolean hasNext()
+        {
+            return iterator.hasNext();
+        }
+    }
+
+    private static class OffsetPostingList implements PostingList // static: uses no state of the enclosing TermsReader
+    {
+        private final long offset;
+        private final PostingList wrapped;
+
+        OffsetPostingList(long offset, PostingList postingList)
+        {
+            this.offset = offset;
+            this.wrapped = postingList;
+        }
+        // Row IDs from the wrapped list are shifted by offset; END_OF_STREAM is passed through unshifted.
+        @Override
+        public long nextPosting() throws IOException
+        {
+            long next = wrapped.nextPosting();
+            if (next == PostingList.END_OF_STREAM)
+                return next;
+            return next + offset;
+        }
+
+        @Override
+        public long size()
+        {
+            return wrapped.size();
+        }
+        // NOTE(review): targetRowID is handed to the wrapped list as is, without subtracting offset - confirm intended.
+        @Override
+        public long advance(long targetRowID) throws IOException
+        {
+            long next = wrapped.advance(targetRowID);
+            if (next == PostingList.END_OF_STREAM)
+                return next;
+            return next + offset;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/TraversingBKDReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/TraversingBKDReader.java
new file mode 100644
index 000000000000..a92b2808b050
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/TraversingBKDReader.java
@@ -0,0 +1,468 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+
+import org.agrona.collections.IntArrayList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexInputReader;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.ByteArrayDataInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FutureArrays;
+import org.apache.lucene.util.MathUtil;
+
+/**
+ * Base reader for a block KD-tree previously written with {@link BKDWriter}.
+ *
+ * Holds the index tree on heap and enables its traversal via {@link #traverse(IndexTreeTraversalCallback)}.
+ */
+public class TraversingBKDReader implements Closeable
+{
+    final IndexComponents indexComponents;
+    final FileHandle indexFile;
+    final int bytesPerDim;
+    final int numLeaves;
+    final byte[] minPackedValue;
+    final byte[] maxPackedValue;
+    // Packed array of byte[] holding all split values in the full binary tree:
+    final byte[] packedIndex;
+    final long pointCount;
+    final int leafNodeOffset;
+    final int numDims;
+    final int maxPointsInLeafNode;
+    final int packedBytesLength;
+
+    @SuppressWarnings("resource")
+    TraversingBKDReader(IndexComponents indexComponents, FileHandle indexFile, long root)
+    {
+        this.indexComponents = indexComponents;
+        this.indexFile = indexFile;
+
+        try (final RandomAccessReader reader = indexFile.createReader())
+        {
+            final IndexInputReader in = IndexInputReader.create(reader);
+            SAICodecUtils.validate(in);
+            in.seek(root);
+
+            numDims = in.readVInt();
+            maxPointsInLeafNode = in.readVInt();
+            bytesPerDim = in.readVInt();
+            packedBytesLength = numDims * bytesPerDim;
+
+            // Read index:
+            numLeaves = in.readVInt();
+            assert numLeaves > 0;
+            leafNodeOffset = numLeaves;
+
+            minPackedValue = new byte[packedBytesLength];
+            maxPackedValue = new byte[packedBytesLength];
+
+//            if (indexComponents.getEncryptionCompressor() != null)
+//            {
+//                IndexInput cryptoInput = CryptoUtils.uncompress(in, indexComponents.getEncryptionCompressor());
+//                cryptoInput.readBytes(minPackedValue, 0, packedBytesLength);
+//                cryptoInput.readBytes(maxPackedValue, 0, packedBytesLength);
+//            }
+//            else
+//            {
+                in.readBytes(minPackedValue, 0, packedBytesLength);
+                in.readBytes(maxPackedValue, 0, packedBytesLength);
+//            }
+
+            for (int dim = 0; dim < numDims; dim++)
+            {
+                if (FutureArrays.compareUnsigned(minPackedValue, dim * bytesPerDim, dim * bytesPerDim + bytesPerDim, maxPackedValue, dim * bytesPerDim, dim * bytesPerDim + bytesPerDim) > 0)
+                {
+                    String message = String.format("Min packed value %s is > max packed value %s for dimension %d.",
+                                                   new BytesRef(minPackedValue), new BytesRef(maxPackedValue), dim);
+                    throw new CorruptIndexException(message, in);
+                }
+            }
+
+            pointCount = in.readVLong();
+
+            // docCount, unused
+            in.readVInt();
+
+//            ICompressor compressor = indexComponents.getEncryptionCompressor();
+//            if (compressor != null)
+//            {
+//                // TODO: there's extra byte[] allocation here
+//                IndexInput input = CryptoUtils.uncompress(in, compressor);
+//
+//                packedIndex = new byte[(int)input.length()];
+//                input.readBytes(packedIndex, 0, (int)input.length());
+//            }
+//            else
+//            {
+                int numBytes = in.readVInt();
+                packedIndex = new byte[numBytes];
+                in.readBytes(packedIndex, 0, numBytes);
+//            }
+        }
+        catch (Throwable t)
+        {
+            FileUtils.closeQuietly(indexFile);
+            throw Throwables.unchecked(t);
+        }
+    }
+
+    /**
+     * Reads the leading vLong of the packed index and returns it as the minimum leaf block file pointer.
+     *
+     * @throws IllegalStateException if the packed index is not available
+     */
+    public long getMinLeafBlockFP()
+    {
+        if (packedIndex == null)
+            throw new IllegalStateException();
+        return new ByteArrayDataInput(packedIndex).readVLong();
+    }
+
+    public long memoryUsage()
+    {
+        return ObjectSizes.sizeOfArray(packedIndex)
+               + ObjectSizes.sizeOfArray(minPackedValue)
+               + ObjectSizes.sizeOfArray(maxPackedValue);
+    }
+
+    @Override
+    public void close()
+    {
+        indexFile.close();
+    }
+
+    interface IndexTreeTraversalCallback
+    {
+        void onLeaf(int leafNodeID, long leafBlockFP, IntArrayList pathToRoot);
+    }
+
+    /**
+     * Copy of BKDReader.IndexTree
+     */
+    abstract class IndexTree implements Cloneable
+    {
+        protected int nodeID;
+        // level is 1-based so that we can do level-1 w/o checking each time:
+        protected int level;
+        protected int splitDim;
+        protected final byte[][] splitPackedValueStack;
+
+        protected IndexTree()
+        {
+            int treeDepth = getTreeDepth();
+            splitPackedValueStack = new byte[treeDepth + 1][];
+            nodeID = 1;
+            level = 1;
+            splitPackedValueStack[level] = new byte[packedBytesLength];
+        }
+
+        public void pushLeft()
+        {
+            nodeID *= 2;
+            level++;
+            if (splitPackedValueStack[level] == null)
+            {
+                splitPackedValueStack[level] = new byte[packedBytesLength];
+            }
+        }
+
+        /** Clone, but you are not allowed to pop up past the point where the clone happened. */
+        public abstract IndexTree clone();
+
+        public void pushRight()
+        {
+            nodeID = nodeID * 2 + 1;
+            level++;
+            if (splitPackedValueStack[level] == null)
+            {
+                splitPackedValueStack[level] = new byte[packedBytesLength];
+            }
+        }
+
+        public void pop()
+        {
+            nodeID /= 2;
+            level--;
+            splitDim = -1;
+            // splitDim is invalidated here: it is only meaningful after pushLeft() or pushRight()
+        }
+
+        public boolean isLeafNode()
+        {
+            return nodeID >= leafNodeOffset;
+        }
+
+        public boolean nodeExists()
+        {
+            return nodeID - leafNodeOffset < leafNodeOffset;
+        }
+
+        public int getNodeID()
+        {
+            return nodeID;
+        }
+
+        public byte[] getSplitPackedValue()
+        {
+            assert !isLeafNode();
+            assert splitPackedValueStack[level] != null : "level=" + level;
+            return splitPackedValueStack[level];
+        }
+
+        /** Only valid after pushLeft or pushRight, not pop! */
+        public int getSplitDim()
+        {
+            assert !isLeafNode();
+            return splitDim;
+        }
+
+        /** Only valid after pushLeft or pushRight, not pop! */
+        public abstract BytesRef getSplitDimValue();
+
+        /** Only valid after pushLeft or pushRight, not pop! */
+        public abstract long getLeafBlockFP();
+    }
+
+
+    /**
+     * Copy of BKDReader.PackedIndexTree
+     */
+    final class PackedIndexTree extends IndexTree
+    {
+        // used to read the packed byte[]
+        private final ByteArrayDataInput in;
+        // holds the minimum (left most) leaf block file pointer for each level we've recursed to:
+        private final long[] leafBlockFPStack;
+        // holds the address, in the packed byte[] index, of the left-node of each level:
+        private final int[] leftNodePositions;
+        // holds the address, in the packed byte[] index, of the right-node of each level:
+        private final int[] rightNodePositions;
+        // holds the splitDim for each level:
+        private final int[] splitDims;
+        // true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
+        // 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim].  this will be true if the last time we
+        // split on this dimension, we next pushed to the left sub-tree:
+        private final boolean[] negativeDeltas;
+        // holds the packed per-level split values; the run method uses this to save the cell min/max as it recurses:
+        private final byte[][] splitValuesStack;
+        // scratch value to return from getSplitDimValue:
+        private final BytesRef scratch;
+
+        PackedIndexTree()
+        {
+            // every per-level stack gets getTreeDepth() + 1 slots; readNodeData at level L also reads slot L - 1
+            final int slots = getTreeDepth() + 1;
+            leafBlockFPStack = new long[slots];
+            leftNodePositions = new int[slots];
+            rightNodePositions = new int[slots];
+            splitValuesStack = new byte[slots][];
+            splitDims = new int[slots];
+            negativeDeltas = new boolean[numDims * slots];
+            in = new ByteArrayDataInput(packedIndex);
+            splitValuesStack[0] = new byte[packedBytesLength];
+            readNodeData(false); // decode the node the cursor starts on
+            scratch = new BytesRef();
+            scratch.length = bytesPerDim;
+        }
+
+        @Override
+        public PackedIndexTree clone()
+        {
+            final PackedIndexTree copy = new PackedIndexTree(); // fresh cursor; only the current level's state is copied over
+            copy.nodeID = nodeID;
+            copy.level = level;
+            copy.splitDim = splitDim;
+            copy.leafBlockFPStack[level] = leafBlockFPStack[level];
+            copy.leftNodePositions[level] = leftNodePositions[level];
+            copy.rightNodePositions[level] = rightNodePositions[level];
+            copy.splitDims[level] = splitDims[level];
+            copy.splitValuesStack[level] = splitValuesStack[level].clone(); // own buffer, so the two cursors never share split bytes
+            System.arraycopy(negativeDeltas, level * numDims, copy.negativeDeltas, level * numDims, numDims);
+            return copy;
+        }
+
+        @Override
+        public void pushLeft()
+        {
+            int nodePosition = leftNodePositions[level]; // captured before super.pushLeft() moves the cursor to the child
+            super.pushLeft();
+            System.arraycopy(negativeDeltas, (level - 1) * numDims, negativeDeltas, level * numDims, numDims); // child starts from the parent's flags
+            assert splitDim != -1;
+            negativeDeltas[level * numDims + splitDim] = true; // presumably still the parent's split dimension; readNodeData below overwrites splitDim
+            in.setPosition(nodePosition);
+            readNodeData(true); // true: no leaf block FP delta is read for a left child
+        }
+
+        @Override
+        public void pushRight()
+        {
+            int nodePosition = rightNodePositions[level]; // captured before super.pushRight() increments level
+            super.pushRight();
+            System.arraycopy(negativeDeltas, (level - 1) * numDims, negativeDeltas, level * numDims, numDims); // child starts from the parent's flags
+            assert splitDim != -1;
+            negativeDeltas[level * numDims + splitDim] = false; // splitDim is still the parent's split dimension here
+            in.setPosition(nodePosition);
+            readNodeData(false); // false: a leaf block FP delta is read for a right child
+        }
+
+        @Override
+        public void pop()
+        {
+            super.pop(); // steps back one level and resets splitDim to -1
+            splitDim = splitDims[level]; // restore the split dimension recorded for the node we returned to
+        }
+
+        @Override
+        public long getLeafBlockFP()
+        {
+            assert isLeafNode() : "nodeID=" + nodeID + " is not a leaf";
+            return leafBlockFPStack[level]; // value accumulated by readNodeData from the parent's FP plus the stored deltas
+        }
+
+        @Override
+        public BytesRef getSplitDimValue()
+        {
+            assert !isLeafNode();
+            scratch.bytes = splitValuesStack[level]; // aliases this level's buffer, no copy is made
+            scratch.offset = splitDim * bytesPerDim; // start of the split dimension's bytes inside the packed value
+            return scratch; // shared instance: its contents change on the next call
+        }
+
+        private void readNodeData(boolean isLeft)
+        {
+            // decodes the node at the current position of `in` into the per-level stacks; reads slot level - 1 as the parent
+            leafBlockFPStack[level] = leafBlockFPStack[level - 1];
+
+            // only a non-left node stores a leaf block FP delta; a left child keeps the parent's value
+            if (!isLeft)
+            {
+                leafBlockFPStack[level] += in.readVLong();
+            }
+
+            if (isLeafNode())
+            {
+                splitDim = -1; // nothing more is read for a leaf
+            }
+            else
+            {
+                // code = (firstDiffByteDelta * (1 + bytesPerDim) + prefix) * numDims + splitDim
+                // read split dim, prefix, firstDiffByteDelta encoded as int:
+                int code = in.readVInt();
+                splitDim = code % numDims;
+                splitDims[level] = splitDim;
+                code /= numDims;
+                int prefix = code % (1 + bytesPerDim); // leading bytes of the split dimension kept from the parent's value
+                int suffix = bytesPerDim - prefix;
+
+                if (splitValuesStack[level] == null)
+                {
+                    splitValuesStack[level] = new byte[packedBytesLength];
+                }
+                System.arraycopy(splitValuesStack[level - 1], 0, splitValuesStack[level], 0, packedBytesLength); // start from the parent's split values
+                if (suffix > 0)
+                {
+                    int firstDiffByteDelta = code / (1 + bytesPerDim);
+                    if (negativeDeltas[level * numDims + splitDim])
+                    {
+                        firstDiffByteDelta = -firstDiffByteDelta;
+                    }
+                    int oldByte = splitValuesStack[level][splitDim * bytesPerDim + prefix] & 0xFF;
+                    splitValuesStack[level][splitDim * bytesPerDim + prefix] = (byte) (oldByte + firstDiffByteDelta); // first differing byte is stored as a delta
+                    in.readBytes(splitValuesStack[level], splitDim * bytesPerDim + prefix + 1, suffix - 1); // remaining suffix bytes are stored verbatim
+                }
+                else
+                {
+                    // our split value is == last split value in this dim, which can happen when there are many duplicate values
+                }
+
+                int leftNumBytes;
+                if (nodeID * 2 < leafNodeOffset) // a byte count is stored only when node nodeID * 2 is not a leaf
+                {
+                    leftNumBytes = in.readVInt();
+                }
+                else
+                {
+                    leftNumBytes = 0; // nothing is read here, so the right node position equals the left one
+                }
+
+                leftNodePositions[level] = in.getPosition();
+                rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
+            }
+        }
+    }
+
+
+    void traverse(IndexTreeTraversalCallback callback)
+    {
+        // walk the whole tree with a fresh cursor and an initially empty path of ancestor node ids
+        final IndexTree cursor = new PackedIndexTree();
+        traverse(callback, cursor, new IntArrayList());
+    }
+
+    private void traverse(IndexTreeTraversalCallback callback,
+                          IndexTree index,
+                          IntArrayList pathToRoot)
+    {
+        if (index.isLeafNode())
+        {
+            // In the unbalanced case it's possible the left most node only has one child,
+            // so a leaf is reported only when the node actually exists:
+            if (index.nodeExists())
+                callback.onLeaf(index.getNodeID(), index.getLeafBlockFP(), pathToRoot);
+            return;
+        }
+
+        // inner node: hand the subtrees a copy of the path extended with this node's id
+        final int self = index.getNodeID();
+        final IntArrayList pathWithSelf = new IntArrayList();
+        pathWithSelf.addAll(pathToRoot);
+        pathWithSelf.add(self);
+
+        // left subtree first, then right; every push is undone by the matching pop
+        index.pushLeft();
+        traverse(callback, index, pathWithSelf);
+        index.pop();
+
+        index.pushRight();
+        traverse(callback, index, pathWithSelf);
+        index.pop();
+    }
+
+    /**
+     * Copy of BKDReader#getTreeDepth()
+     */
+    private int getTreeDepth()
+    {
+        // MathUtil.log computes the floor of the logarithm, so one level is added for leaf
+        // counts that are not a power of 2; e.g. with 5 leaves you need a depth=4 tree:
+        final int floorLog2 = MathUtil.log(numLeaves, 2);
+
+        // A further level is added because all the non-leaf nodes make another power
+        // of 2; e.g. to have a fully balanced tree with 4 leaves you need a depth=3 tree:
+        return floorLog2 + 2;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryReader.java
new file mode 100644
index 000000000000..c97dd44bc4c4
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryReader.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Iterator;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.collect.AbstractIterator;
+
+import org.apache.cassandra.io.tries.SerializationNode;
+import org.apache.cassandra.io.tries.TrieNode;
+import org.apache.cassandra.io.tries.TrieSerializer;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.SizedInts;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.lucene.util.ArrayUtil;
+
+/**
+ * Page-aware random access reader for a trie terms dictionary written by {@link TrieTermsDictionaryWriter}.
+ */
+@NotThreadSafe
+class TrieTermsDictionaryReader extends Walker<TrieTermsDictionaryReader>
+{
+    static final long NOT_FOUND = -1;
+
+    TrieTermsDictionaryReader(Rebufferer rebufferer, long root)
+    {
+        super(rebufferer, root); // both arguments are handed straight to the Walker constructor
+    }
+
+    static final TrieSerializer<Long, DataOutputPlus> trieSerializer = new TrieSerializer<Long, DataOutputPlus>()
+    {
+        /**
+         * @return the size of the node as laid out by its {@link TrieNode} type plus the size of its payload, if any
+         */
+        @Override
+        public int sizeofNode(SerializationNode<Long> node, long nodePosition)
+        {
+            final int nodeSize = TrieNode.typeFor(node, nodePosition).sizeofNode(node);
+            return nodeSize + sizeof(node.payload());
+        }
+
+        /**
+         * Serializes the node, telling the {@link TrieNode} type how large the payload is (0 when there is
+         * none), and then appends the payload itself using exactly that size.
+         */
+        @Override
+        public void write(DataOutputPlus dest, SerializationNode<Long> node, long nodePosition) throws IOException
+        {
+            final TrieNode type = TrieNode.typeFor(node, nodePosition);
+            final Long payload = node.payload();
+            // same size computation as sizeofNode, so the reported and the written sizes agree
+            final int payloadBits = sizeof(payload);
+            type.serialize(dest, node, payloadBits, nodePosition);
+            if (payload != null)
+                SizedInts.write(dest, payload, payloadBits);
+        }
+
+        /** @return {@link SizedInts#nonZeroSize} of the payload, or 0 when the node has no payload */
+        private int sizeof(Long payload)
+        {
+            return payload == null ? 0 : SizedInts.nonZeroSize(payload);
+        }
+    };
+
+    /**
+     * @return the payload stored for {@code key}, or {@link #NOT_FOUND} when {@code follow} stops with
+     *         anything other than END_OF_STREAM or the node reached carries no payload
+     */
+    long exactMatch(ByteComparable key)
+    {
+        final boolean keyConsumed = follow(key) == ByteSource.END_OF_STREAM;
+        return keyConsumed ? getCurrentPayload() : NOT_FOUND;
+    }
+
+    /** Iterates all (term, payload) pairs of the trie; advancing the iterator repositions this reader. */
+    Iterator<Pair<ByteComparable, Long>> iterator()
+    {
+        return new AbstractIterator<Pair<ByteComparable, Long>>()
+        {
+            final TransitionBytesCollector collector = new TransitionBytesCollector();
+            IterationPosition stack = new IterationPosition(root, -1, null);
+
+            @Override
+            protected Pair<ByteComparable, Long> computeNext()
+            {
+                final long node = advanceNode();
+                if (node == -1)
+                    return endOfData();
+                return Pair.create(collector.toByteComparable(), getCurrentPayload());
+            }
+
+            // moves to the next node that carries a payload and returns it, or -1 once the trie is exhausted
+            private long advanceNode()
+            {
+                long child;
+                int transitionByte;
+
+                go(stack.node);
+                while (true)
+                {
+                    int childIndex = stack.childIndex + 1;
+                    transitionByte = transitionByte(childIndex);
+
+                    if (transitionByte > 256) // treated as "no child at this index": the current node is done
+                    {
+                        // ascend
+                        stack = stack.prev;
+                        if (stack == null)
+                        {
+                            // left the root, which never added a byte to the collector: exhausted whole trie
+                            return -1;
+                        }
+                        collector.pop(); // drop the transition byte of the node just left
+                        go(stack.node);
+                        continue;
+                    }
+
+                    child = transition(childIndex);
+
+                    if (child != -1)
+                    {
+                        assert child >= 0 : String.format("Expected value >= 0 but got %d - %s", child, this);
+
+                        // descend
+                        go(child);
+
+                        stack.childIndex = childIndex;
+                        stack = new IterationPosition(child, -1, stack);
+                        collector.add(transitionByte);
+
+                        if (payloadFlags() != 0)
+                            return child;
+                    }
+                    else
+                    {
+                        stack.childIndex = childIndex;
+                    }
+                }
+            }
+        };
+    }
+
+    /** @return the bytes along the path that takes the last transition of every node, starting at the root. */
+    ByteComparable getMaxTerm()
+    {
+        final TransitionBytesCollector collector = new ImmutableTransitionBytesCollector();
+        go(root);
+        while (true)
+        {
+            int lastIdx = transitionRange() - 1;
+            // a node without transitions ends the path; test this before asking for child -1
+            if (lastIdx < 0)
+                return collector.toByteComparable();
+            long lastChild = transition(lastIdx);
+            collector.add(transitionByte(lastIdx));
+            go(lastChild);
+        }
+    }
+
+    /**
+     * @return the bytes of the path that takes the first transition of each node, from the root down to the
+     *         first node that carries a payload
+     */
+    ByteComparable getMinTerm()
+    {
+        final TransitionBytesCollector collector = new ImmutableTransitionBytesCollector();
+        go(root);
+        while (payloadFlags() <= 0)
+        {
+            collector.add(transitionByte(0));
+            go(transition(0));
+        }
+        return collector.toByteComparable();
+    }
+
+    private long getCurrentPayload()
+    {
+        return getPayload(buf, payloadPosition(), payloadFlags()); // payloadFlags() is passed on as the payload's size in bytes
+    }
+
+    /** Reads the value of size {@code bytes} stored at {@code payloadPos} in {@code contents}. */
+    private long getPayload(ByteBuffer contents, int payloadPos, int bytes)
+    {
+        // a size of zero means there is no payload to read
+        return bytes == 0
+               ? NOT_FOUND
+               : SizedInts.read(contents, payloadPos, bytes);
+    }
+
+    private static class ImmutableTransitionBytesCollector extends TransitionBytesCollector
+    {
+        @Override
+        ByteComparable toByteComparable()
+        {
+            assert pos > 0;
+            final int snapshotLength = pos; // the length is fixed now; the buffer itself is not copied
+            return version -> ByteSource.fixedLength(bytes, 0, snapshotLength);
+        }
+
+        @Override
+        void pop()
+        {
+            throw new UnsupportedOperationException(); // this variant only ever appends
+        }
+    }
+
+    /** Growable byte stack holding the transition bytes collected along the path being walked. */
+    private static class TransitionBytesCollector
+    {
+        protected byte[] bytes = new byte[32];
+        protected int pos = 0; // number of bytes currently collected
+
+        void add(int b)
+        {
+            if (bytes.length == pos)
+                bytes = ArrayUtil.grow(bytes, pos + 1);
+            bytes[pos++] = (byte) b;
+        }
+
+        void pop()
+        {
+            // drop the most recently added byte
+            assert pos >= 0;
+            pos--;
+        }
+
+        /** @return a {@link ByteComparable} over a private copy of the collected bytes. */
+        ByteComparable toByteComparable()
+        {
+            assert pos > 0;
+            final byte[] snapshot = Arrays.copyOf(bytes, pos);
+            return version -> ByteSource.fixedLength(snapshot, 0, snapshot.length);
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[Bytes %s, pos %d]", Arrays.toString(bytes), pos);
+        }
+    }
+
+    private static class IterationPosition
+    {
+        final long node; // position of the trie node this frame belongs to
+        final IterationPosition prev; // frame of the node we descended from; null for the root frame
+        int childIndex; // index of the child tried most recently; -1 before the first
+
+        IterationPosition(long node, int childIndex, IterationPosition prev)
+        {
+            this.node = node;
+            this.childIndex = childIndex;
+            this.prev = prev;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("[Node %d, child %d]", node, childIndex);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryWriter.java
new file mode 100644
index 000000000000..712c669cc283
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryWriter.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.Closeable;
+import java.io.IOException;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.apache.commons.lang3.mutable.MutableLong;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexOutputWriter;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.tries.IncrementalDeepTrieWriterPageAware;
+import org.apache.cassandra.io.tries.IncrementalTrieWriter;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Writes terms dictionary to disk in a trie format (see {@link IncrementalTrieWriter}). Allows for variable-length
+ * keys. Trie values are 64-bit offsets to the posting file, pointing to the beginning of the summary block for
+ * that postings list.
+ */
+@NotThreadSafe
+public class TrieTermsDictionaryWriter implements Closeable
+{
+    private final IncrementalTrieWriter<Long> termsDictionaryWriter;
+    private final IndexOutputWriter termDictionaryOutput;
+    private final long startOffset;
+
+    TrieTermsDictionaryWriter(IndexComponents indexComponents, boolean segmented) throws IOException
+    {
+        termDictionaryOutput = indexComponents.createOutput(indexComponents.termsData, true, segmented);
+        startOffset = termDictionaryOutput.getFilePointer();
+        SAICodecUtils.writeHeader(termDictionaryOutput);
+        // we pass the output as SequentialWriter, but we keep IndexOutputWriter around to write footer on flush
+        termsDictionaryWriter = new IncrementalDeepTrieWriterPageAware<>(TrieTermsDictionaryReader.trieSerializer, termDictionaryOutput.asSequentialWriter());
+    }
+
+    /** Adds a term together with the offset of its postings list to the trie being built. */
+    public void add(ByteComparable term, long postingListOffset) throws IOException
+    {
+        termsDictionaryWriter.add(term, postingListOffset);
+    }
+
+    /** Closes the trie writer and then the output; the output is closed even when closing the trie writer fails. */
+    @Override
+    public void close() throws IOException
+    {
+        try
+        {
+            termsDictionaryWriter.close();
+        }
+        finally
+        {
+            termDictionaryOutput.close();
+        }
+    }
+
+    /**
+     * Completes the trie index, stores the file pointer at which the footer starts in {@code footerPointer}, then writes the footer.
+     * @return the position in the file of the root node.
+     */
+    public long complete(MutableLong footerPointer) throws IOException
+    {
+        long root = termsDictionaryWriter.complete();
+        footerPointer.setValue(termDictionaryOutput.getFilePointer());
+        SAICodecUtils.writeFooter(termDictionaryOutput);
+        return root;
+    }
+
+    /** @return current file pointer */
+    public long getFilePointer()
+    {
+        return termDictionaryOutput.getFilePointer();
+    }
+
+    /** @return file pointer where index structure begins */
+    public long getStartOffset()
+    {
+        return startOffset;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/memory/InMemoryToken.java b/src/java/org/apache/cassandra/index/sai/memory/InMemoryToken.java
new file mode 100644
index 000000000000..3111ae577660
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/InMemoryToken.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.util.Collections;
+import java.util.Iterator;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.Token;
+
+/**
+ * A single-use {@link Token} whose iterator of {@link DecoratedKey} is already materialized on-heap.
+ */
+@NotThreadSafe
+public class InMemoryToken extends Token
+{
+    private final Iterator<DecoratedKey> keys;
+
+    public InMemoryToken(long token, Iterator<DecoratedKey> keys)
+    {
+        super(token);
+        this.keys = keys != null ? keys : Collections.emptyIterator(); // a null iterator is treated as "no keys"
+    }
+
+    public InMemoryToken(long token, DecoratedKey key)
+    {
+        super(token);
+        keys = Iterators.singletonIterator(key);
+    }
+
+    @Override
+    public Iterator<DecoratedKey> keys()
+    {
+        return keys; // the same iterator instance on every call, hence single-use
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/memory/KeyRangeIterator.java b/src/java/org/apache/cassandra/index/sai/memory/KeyRangeIterator.java
new file mode 100644
index 000000000000..972cdcb4e98b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/KeyRangeIterator.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.io.IOException;
+import java.util.PriorityQueue;
+import java.util.SortedSet;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+
+public class KeyRangeIterator extends RangeIterator
+{
+    private final PriorityQueue<DecoratedKey> keys;
+    private final boolean uniqueKeys;
+    private DecoratedKey lastKey;
+
+    /**
+     * An in-memory {@link RangeIterator} that uses a {@link SortedSet} which has no duplication as its backing store.
+     * The set is copied into a private queue; its first and last keys supply the token bounds, so it must not be empty.
+     */
+    public KeyRangeIterator(SortedSet<DecoratedKey> keys)
+    {
+        super((Long) keys.first().getToken().getTokenValue(), (Long) keys.last().getToken().getTokenValue(), keys.size());
+        this.uniqueKeys = true;
+        this.keys = new PriorityQueue<>(keys);
+    }
+
+    /**
+     * An in-memory {@link RangeIterator} that uses a {@link PriorityQueue} which may
+     * contain duplicated keys as its backing store. The queue is used directly and is drained by iteration.
+     */
+    public KeyRangeIterator(Long min, Long max, PriorityQueue<DecoratedKey> keys)
+    {
+        super(min, max, keys.size());
+        this.uniqueKeys = false;
+        this.keys = keys;
+    }
+
+    /** Wraps the next key in an {@link InMemoryToken}, or signals the end of data once no key is left. */
+    protected Token computeNext()
+    {
+        final DecoratedKey key = computeNextKey();
+        if (key == null)
+            return endOfData();
+        return new InMemoryToken(key.getToken().getLongValue(), key);
+    }
+
+    /** @return the next key to emit, or null once the queue is drained */
+    private DecoratedKey computeNextKey()
+    {
+        for (DecoratedKey key = keys.poll(); key != null; key = keys.poll())
+        {
+            // a set-backed queue cannot contain duplicates, so no comparison is needed
+            if (uniqueKeys)
+                return key;
+
+            // skip keys that compare equal to the one emitted last
+            if (lastKey != null && lastKey.compareTo(key) == 0)
+                continue;
+
+            lastKey = key;
+            return key;
+        }
+
+        return null;
+    }
+
+    protected void performSkipTo(Long nextToken)
+    {
+        // consume every queued key whose token is smaller than nextToken
+        while (!keys.isEmpty() && (long) keys.peek().getToken().getTokenValue() < nextToken)
+        {
+            keys.poll();
+        }
+    }
+
+    /** Does nothing. */
+    public void close() throws IOException
+    {}
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java
new file mode 100644
index 000000000000..398c4002191c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/MemoryIndex.java
@@ -0,0 +1,80 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.memory;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.PrimaryKeys;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+public abstract class MemoryIndex
+{
+    protected final ColumnContext columnContext;
+
+    // running minimum and maximum maintained by setMinMaxTerm; both are null before its first call
+    private ByteBuffer minTerm;
+    private ByteBuffer maxTerm;
+
+    protected MemoryIndex(ColumnContext columnContext)
+    {
+        this.columnContext = columnContext;
+    }
+
+    public abstract long add(DecoratedKey key, Clustering clustering, ByteBuffer value);
+
+    public abstract RangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange);
+
+    /** Folds {@code term} into the running min/max, passing the column's validator to {@code TypeUtil.min}/{@code max}. */
+    public void setMinMaxTerm(ByteBuffer term)
+    {
+        assert term != null;
+        // NOTE(review): on the first call minTerm/maxTerm are still null; presumably TypeUtil.min/max accept that - confirm
+        minTerm = TypeUtil.min(term, minTerm, columnContext.getValidator());
+        maxTerm = TypeUtil.max(term, maxTerm, columnContext.getValidator());
+    }
+
+    public ByteBuffer getMinTerm()
+    {
+        return minTerm;
+    }
+
+    public ByteBuffer getMaxTerm()
+    {
+        return maxTerm;
+    }
+
+    /** Iterate all Term->PrimaryKeys mappings in sorted order */
+    public abstract Iterator<Pair<ByteComparable, PrimaryKeys>> iterator();
+}
diff --git a/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java b/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
new file mode 100644
index 000000000000..d77cb7110533
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
@@ -0,0 +1,105 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.memory;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.concurrent.atomic.LongAdder;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.PrimaryKeys;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+public class MemtableIndex
+{
+    private final LongAdder writeCount = new LongAdder();
+    private final LongAdder estimatedMemoryUsed = new LongAdder();
+    private final MemoryIndex index;
+    private final AbstractType<?> validator;
+    private final ClusteringComparator clusteringComparator;
+
+    public MemtableIndex(ColumnContext columnContext, Memtable mt)
+    {
+        this.index = new TrieMemoryIndex(columnContext);
+        this.validator = columnContext.getValidator();
+        this.clusteringComparator = columnContext.clusteringComparator();
+    }
+
+    public long writeCount()
+    {
+        return writeCount.longValue();
+    }
+
+    public long estimatedMemoryUsed()
+    {
+        return estimatedMemoryUsed.longValue();
+    }
+
+    public ByteBuffer getMinTerm()
+    {
+        return index.getMinTerm();
+    }
+
+    public ByteBuffer getMaxTerm()
+    {
+        return index.getMaxTerm();
+    }
+
+    public boolean isEmpty()
+    {
+        return getMinTerm() == null; // empty while the underlying index reports no minimum term
+    }
+
+    public long index(DecoratedKey key, Clustering clustering, ByteBuffer value)
+    {
+        // null or zero-length values are not indexed; 0 is reported for them
+        if (value == null || !value.hasRemaining())
+            return 0;
+        long memoryDelta = index.add(key, clustering, value);
+        writeCount.increment();
+        estimatedMemoryUsed.add(memoryDelta);
+        return memoryDelta;
+    }
+
+    public Iterator<Pair<ByteComparable, PrimaryKeys>> iterator()
+    {
+        return index.iterator();
+    }
+
+    public RangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    {
+        return index.search(expression, keyRange);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java b/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java
new file mode 100644
index 000000000000..9241c7bd9625
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/RowMapping.java
@@ -0,0 +1,205 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.util.Collections;
+import java.util.Iterator;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.db.tries.Trie;
+import org.apache.cassandra.index.sai.disk.SegmentBuilder;
+import org.apache.cassandra.index.sai.utils.AbstractIterator;
+import org.apache.cassandra.index.sai.utils.PrimaryKey;
+import org.apache.cassandra.index.sai.utils.PrimaryKeys;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * In-memory representation of {@link PrimaryKey} to row ID mappings. It only contains
+ * {@link Row}s, regardless of whether they are live or deleted ({@link RangeTombstoneMarker}s are not included).
+ *
+ * For JBOD, we can make use of the sstable min/max partition keys to filter irrelevant {@link MemtableIndex} subranges.
+ * For Tiered Storage, in most cases, it flushes to tier 0.
+ */
+public class RowMapping
+{
+    public static final RowMapping DUMMY = new RowMapping() // no-op mapping: merge() is empty, complete() and add() do nothing
+    {
+        @Override
+        public Iterator<Pair<ByteComparable, IntArrayList>> merge(MemtableIndex index) { return Collections.emptyIterator(); }
+
+        @Override
+        public void complete() {}
+
+        @Override
+        public void add(DecoratedKey key, Unfiltered unfiltered, long sstableRowId) {}
+    };
+
+    private final MemtableTrie<Integer> rowMapping = new MemtableTrie<>(BufferType.OFF_HEAP); // byte-comparable primary key -> segment row ID
+
+    private volatile boolean complete = false;
+
+    public DecoratedKey minKey;
+    public DecoratedKey maxKey;
+
+    public int maxSegmentRowId = -1; // stays -1 until the first row is added
+
+    private RowMapping()
+    {
+    }
+
+    /**
+     * Returns a new mutable mapping for {@link OperationType#FLUSH}; every other operation type gets the shared no-op {@link #DUMMY}.
+     */
+    public static RowMapping create(OperationType opType)
+    {
+        if (opType == OperationType.FLUSH)
+            return new RowMapping();
+        return DUMMY;
+    }
+
+    /**
+     * Merges a {@link MemtableIndex} (index term to PrimaryKeys mappings) with the row mapping of an sstable
+     * (PrimaryKey to row ID mappings). Terms with no primary key present in this mapping are skipped.
+     *
+     * @param index a Memtable-attached column index
+     *
+     * @return iterator of index term to postings (segment row IDs) for the terms that exist in the sstable
+     */
+    public Iterator<Pair<ByteComparable, IntArrayList>> merge(MemtableIndex index)
+    {
+        assert complete : "RowMapping is not built.";
+
+        Iterator<Pair<ByteComparable, PrimaryKeys>> iterator = index.iterator();
+        return new AbstractIterator<Pair<ByteComparable, IntArrayList>>()
+        {
+            @Override
+            protected Pair<ByteComparable, IntArrayList> computeNext()
+            {
+                while (iterator.hasNext())
+                {
+                    Pair<ByteComparable, PrimaryKeys> pair = iterator.next();
+
+                    IntArrayList postings = null; // allocated lazily, only once a key of this term is found in the mapping
+                    Iterator<PrimaryKey> primaryKeys = pair.right.iterator();
+
+                    while (primaryKeys.hasNext())
+                    {
+                        PrimaryKey primaryKey = primaryKeys.next();
+                        ByteComparable byteComparable = asComparableBytes(primaryKey.partitionKey(), primaryKey.clustering());
+                        Integer segmentRowId = rowMapping.get(byteComparable);
+
+                        if (segmentRowId != null)
+                        {
+                            postings = postings == null ? new IntArrayList() : postings;
+                            postings.add(segmentRowId);
+                        }
+                    }
+                    if (postings != null && !postings.isEmpty())
+                        return Pair.create(pair.left, postings);
+                }
+                return endOfData();
+            }
+        };
+    }
+
+    /**
+     * Completes building the in-memory RowMapping and marks it as immutable; asserts that it has not been completed before.
+     */
+    public void complete()
+    {
+        assert !complete : "RowMapping can only be built once.";
+        this.complete = true;
+    }
+
+    /**
+     * Records the PrimaryKey to row ID mapping for a row; range tombstone markers are ignored.
+     */
+    public void add(DecoratedKey key, Unfiltered unfiltered, long sstableRowId)
+    {
+        assert !complete : "Cannot modify built RowMapping.";
+
+        if (unfiltered.isRangeTombstoneMarker())
+        {
+            // range tombstone markers are currently not recorded
+        }
+        else
+        {
+            assert unfiltered.isRow();
+            Row row = (Row) unfiltered;
+
+            ByteComparable byteComparable = asComparableBytes(key, row.clustering());
+            int segmentRowId = SegmentBuilder.castToSegmentRowId(sstableRowId, 0);
+            try
+            {
+                rowMapping.apply(Trie.singleton(byteComparable, segmentRowId), (existing, neww) -> neww); // the newest row ID replaces any existing one
+            }
+            catch (MemtableTrie.SpaceExhaustedException e)
+            {
+                //TODO Work out how to handle this properly; for now the exception is rethrown unchecked
+                throw new RuntimeException(e);
+            }
+
+            maxSegmentRowId = Math.max(maxSegmentRowId, segmentRowId);
+
+            // data is written in token sorted order, so the first key seen is the minimum and the latest is the maximum
+            if (minKey == null)
+                minKey = key;
+            maxKey = key;
+        }
+    }
+
+    public boolean hasRows()
+    {
+        return maxSegmentRowId >= 0;
+    }
+
+    private ByteComparable asComparableBytes(DecoratedKey key, Clustering clustering)
+    {
+        return v -> new ByteSource()
+        {
+            ByteSource source = key.asComparableBytes(v); // component currently being streamed; starts with the partition key
+            int index = -1; // -1 while streaming the partition key, then the index of the current clustering component
+
+            @Override
+            public int next()
+            {
+                if (index == clustering.size()) // the final byte has already been handed out
+                    return END_OF_STREAM;
+
+                int b = source.next();
+                if (b > END_OF_STREAM) // the current component still has bytes
+                    return b;
+
+                if (++index == clustering.size()) // no clustering components left
+                    return v == ByteComparable.Version.LEGACY ? ByteSource.END_OF_STREAM : ByteSource.TERMINATOR;
+                source = ByteSource.of(clustering.accessor(), clustering.get(index), v);
+                return NEXT_COMPONENT; // emitted between the exhausted component and the next one
+            }
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
new file mode 100644
index 000000000000..e47977f8dac2
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java
@@ -0,0 +1,339 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.memory;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.SortedSet;
+import java.util.concurrent.atomic.LongAdder;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import io.netty.util.concurrent.FastThreadLocal;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.db.tries.Trie;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.PrimaryKey;
+import org.apache.cassandra.index.sai.utils.PrimaryKeys;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+
+public class TrieMemoryIndex extends MemoryIndex
+{
+    private static final Logger logger = LoggerFactory.getLogger(TrieMemoryIndex.class);
+    private static final int MINIMUM_QUEUE_SIZE = 128;
+    private static final int MAX_RECURSIVE_KEY_LENGTH = 128; // terms whose limit() exceeds this use apply() instead of putRecursive()
+
+    // encoded term -> primary keys of the rows indexed under that term
+    private final MemtableTrie<PrimaryKeys> data;
+    private final ClusteringComparator clusteringComparator;
+    private final PrimaryKeysReducer primaryKeysReducer;
+    private final AbstractAnalyzer analyzer; // NOTE(review): never read in this class; add() fetches its own analyzer from columnContext
+    private final AbstractType<?> validator;
+    private final boolean isLiteral;
+    private final Object writeLock = new Object(); // held for the whole of add()
+
+    private static final FastThreadLocal<Integer> lastQueueSize = new FastThreadLocal<Integer>()
+    {
+        protected Integer initialValue()
+        {
+            return MINIMUM_QUEUE_SIZE;
+        }
+    };
+
+
+    public TrieMemoryIndex(ColumnContext columnContext)
+    {
+        super(columnContext);
+        //TODO Do we need to follow a setting for this?
+        this.data = new MemtableTrie<>(BufferType.OFF_HEAP);
+        this.clusteringComparator = columnContext.clusteringComparator();
+        this.primaryKeysReducer = new PrimaryKeysReducer();
+        // MemoryIndex is per-core, so analyzer should be thread-safe..
+        this.analyzer = columnContext.getAnalyzer();
+        this.validator = columnContext.getValidator();
+        this.isLiteral = TypeUtil.isLiteral(validator);
+    }
+
+    @Override
+    public long add(DecoratedKey key, Clustering clustering, ByteBuffer value)
+    {
+        synchronized (writeLock)
+        {
+            AbstractAnalyzer analyzer = columnContext.getAnalyzer(); // shadows the 'analyzer' field
+            value = TypeUtil.encode(value, validator);
+            analyzer.reset(value.duplicate());
+            final PrimaryKey primaryKey = PrimaryKey.of(key, clustering);
+            final long initialSizeOnHeap = data.sizeOnHeap();
+            final long initialSizeOffHeap = data.sizeOffHeap();
+            final long reducerHeapSize = primaryKeysReducer.heapAllocations();
+            // the sizes captured above are subtracted on return, so the result is the growth caused by this call
+
+            while (analyzer.hasNext())
+            {
+                final ByteBuffer term = analyzer.next();
+                setMinMaxTerm(term);
+
+                final ByteComparable encodedTerm = encode(term);
+                try
+                {
+                    if (term.limit() <= MAX_RECURSIVE_KEY_LENGTH)
+                    {
+                        data.putRecursive(encodedTerm, primaryKey, primaryKeysReducer);
+                    }
+                    else
+                    {
+                        data.apply(Trie.singleton(encodedTerm, primaryKey), primaryKeysReducer);
+                    }
+                }
+                catch (MemtableTrie.SpaceExhaustedException e)
+                {
+                    //TODO Handle this properly; for now the exception is rethrown unchecked
+                    throw new RuntimeException(e);
+                }
+            }
+            return (data.sizeOnHeap() - initialSizeOnHeap) + (data.sizeOffHeap() - initialSizeOffHeap) + (primaryKeysReducer.heapAllocations() - reducerHeapSize);
+        }
+    }
+
+    @Override
+    public RangeIterator search(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace("Searching memtable index on expression '{}'...", expression);
+
+        switch (expression.getOp())
+        {
+            case MATCH:
+            case EQ:
+            case CONTAINS_KEY:
+            case CONTAINS_VALUE:
+                return exactMatch(expression); // note: keyRange is not passed to exact matches
+            case RANGE:
+                return rangeMatch(expression, keyRange);
+            default:
+                throw new IllegalArgumentException("Unsupported expression: " + expression);
+        }
+    }
+
+    @Override
+    public Iterator<Pair<ByteComparable, PrimaryKeys>> iterator()
+    {
+        Iterator<Map.Entry<ByteComparable, PrimaryKeys>> iterator = data.entrySet().iterator();
+        return new Iterator<Pair<ByteComparable, PrimaryKeys>>()
+        {
+            @Override
+            public boolean hasNext()
+            {
+                return iterator.hasNext();
+            }
+
+            @Override
+            public Pair<ByteComparable, PrimaryKeys> next()
+            {
+                Map.Entry<ByteComparable, PrimaryKeys> entry = iterator.next();
+                return Pair.create(decode(entry.getKey()), entry.getValue()); // keys are stored encoded; hand back the decoded term
+            }
+        };
+    }
+
+    private ByteComparable encode(ByteBuffer input)
+    {
+        return isLiteral ? version -> append(ByteSource.of(input, version), ByteSource.TERMINATOR)
+                         : version -> TypeUtil.asComparableBytes(input, validator, version);
+    }
+
+    private ByteComparable decode(ByteComparable term)
+    {
+        return isLiteral ? version -> ByteSourceInverse.unescape(ByteSource.peekable(term.asComparableBytes(version)))
+                         : term;
+
+    }
+
+    private ByteSource append(ByteSource src, int lastByte)
+    {
+        return new ByteSource()
+        {
+            boolean done = false; // set once lastByte has been emitted
+
+            @Override
+            public int next()
+            {
+                if (done)
+                    return END_OF_STREAM;
+                int n = src.next();
+                if (n != END_OF_STREAM)
+                    return n;
+
+                done = true; // src is exhausted: emit lastByte exactly once, then END_OF_STREAM
+                return lastByte;
+            }
+        };
+    }
+
+    private RangeIterator exactMatch(Expression expression)
+    {
+        final ByteComparable prefix = expression.lower == null ? ByteComparable.EMPTY : encode(expression.lower.value.encoded);
+        final PrimaryKeys primaryKeys = data.get(prefix);
+        if (primaryKeys == null)
+        {
+            return RangeIterator.empty();
+        }
+        return new KeyRangeIterator(primaryKeys.partitionKeys());
+    }
+
+    public static class Collector
+    {
+        long minimumTokenValue = Long.MAX_VALUE;
+        long maximumTokenValue = Long.MIN_VALUE;
+        PriorityQueue<DecoratedKey> mergedKeys = new PriorityQueue<>(lastQueueSize.get(), DecoratedKey.comparator);
+
+        AbstractBounds<PartitionPosition> keyRange;
+
+        public Collector(AbstractBounds<PartitionPosition> keyRange)
+        {
+            this.keyRange = keyRange;
+        }
+
+        public void processContent(PrimaryKeys keys)
+        {
+            if (keys.isEmpty())
+                return;
+
+            SortedSet<DecoratedKey> partitionKeys = keys.partitionKeys();
+
+            // single-key shortcut: avoids creating an iterator over the set
+            if (partitionKeys.size() == 1)
+            {
+                DecoratedKey first = partitionKeys.first();
+                if (keyRange.contains(first))
+                {
+                    mergedKeys.add(first);
+
+                    long currentTokenValue = first.getToken().getLongValue();
+                    minimumTokenValue = Math.min(minimumTokenValue, currentTokenValue);
+                    maximumTokenValue = Math.max(maximumTokenValue, currentTokenValue);
+                }
+
+                return;
+            }
+
+            // skip the whole set when its first key is past the right bound (unless that bound is the minimum) or its last key is before the left bound
+            if (!keyRange.right.isMinimum() && partitionKeys.first().compareTo(keyRange.right) > 0
+                    || partitionKeys.last().compareTo(keyRange.left) < 0)
+                return;
+
+            for (DecoratedKey key : partitionKeys)
+            {
+                if (keyRange.contains(key))
+                {
+                    mergedKeys.add(key);
+
+                    long currentTokenValue = key.getToken().getLongValue();
+                    minimumTokenValue = Math.min(minimumTokenValue, currentTokenValue);
+                    maximumTokenValue = Math.max(maximumTokenValue, currentTokenValue);
+                }
+            }
+            return;
+        }
+    }
+
+    private RangeIterator rangeMatch(Expression expression, AbstractBounds<PartitionPosition> keyRange)
+    {
+        ByteComparable lowerBound, upperBound;
+        boolean lowerInclusive, upperInclusive;
+        if (expression.lower != null)
+        {
+            lowerBound = encode(expression.lower.value.encoded);
+            lowerInclusive = expression.lower.inclusive;
+        }
+        else
+        {
+            lowerBound = ByteComparable.EMPTY; // the expression has no lower bound
+            lowerInclusive = false;
+        }
+
+        if (expression.upper != null)
+        {
+            upperBound = encode(expression.upper.value.encoded);
+            upperInclusive = expression.upper.inclusive;
+        }
+        else
+        {
+            upperBound = null; // the expression has no upper bound
+            upperInclusive = false;
+        }
+
+        Collector cd = new Collector(keyRange);
+
+        data.subtrie(lowerBound, lowerInclusive, upperBound, upperInclusive).values().forEach(pk -> cd.processContent(pk));
+
+        if (cd.mergedKeys.isEmpty())
+        {
+            return RangeIterator.empty();
+        }
+        // remember the queue size (at least MINIMUM_QUEUE_SIZE) as the initial capacity of this thread's next Collector
+        lastQueueSize.set(Math.max(MINIMUM_QUEUE_SIZE, cd.mergedKeys.size()));
+        return new KeyRangeIterator(cd.minimumTokenValue, cd.maximumTokenValue, cd.mergedKeys);
+    }
+
+    private class PrimaryKeysReducer implements MemtableTrie.UpsertTransformer<PrimaryKeys, PrimaryKey>
+    {
+        private final LongAdder heapAllocations = new LongAdder(); // running total of the sizes reported by PrimaryKeys
+
+        @Override
+        public PrimaryKeys apply(PrimaryKeys existing, PrimaryKey neww)
+        {
+            if (existing == null) // first key for this term: create the container and account for it
+            {
+                existing = PrimaryKeys.create(clusteringComparator);
+                heapAllocations.add(existing.unsharedHeapSize());
+            }
+            heapAllocations.add(existing.add(neww));
+            return existing;
+        }
+
+        long heapAllocations()
+        {
+            return heapAllocations.longValue();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/AbstractMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/AbstractMetrics.java
new file mode 100644
index 000000000000..b34cfdde8f2a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/AbstractMetrics.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.metrics.CassandraMetricsRegistry;
+import org.apache.cassandra.metrics.DefaultNameFactory;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public abstract class AbstractMetrics
+{
+    public static final String TYPE = "StorageAttachedIndex";
+
+    protected final TableMetadata table;
+    private final String index;
+    private final String scope;
+    protected final List<CassandraMetricsRegistry.MetricName> tracked = new ArrayList<>();
+
+    AbstractMetrics(TableMetadata table, String scope)
+    {
+        this(table, null, scope);
+    }
+
+    AbstractMetrics(TableMetadata table, String index, String scope)
+    {
+        assert table != null : "SAI metrics must include table metadata";
+        this.table = table;
+        this.index = index;
+        this.scope = scope;
+    }
+
+    public void release()
+    {
+        for (CassandraMetricsRegistry.MetricName metricName : tracked)
+            Metrics.remove(metricName);
+        tracked.clear();
+    }
+
+    protected CassandraMetricsRegistry.MetricName createMetricName(String name)
+    {
+        return createMetricName(name, this.scope);
+    }
+
+    protected CassandraMetricsRegistry.MetricName createMetricName(String name, String scope)
+    {
+        // metric scope layout: <keyspace>.<table>[.<index>].<scope>.<name>
+        StringBuilder qualifiedScope = new StringBuilder().append(table.keyspace).append('.').append(table.name);
+        if (index != null)
+            qualifiedScope.append('.').append(index);
+        qualifiedScope.append('.').append(scope).append('.').append(name);
+        String mbeanName = createMBeanName(name, scope);
+        CassandraMetricsRegistry.MetricName created =
+            new CassandraMetricsRegistry.MetricName(DefaultNameFactory.GROUP_NAME, TYPE, name, qualifiedScope.toString(), mbeanName);
+        tracked.add(created);
+        return created;
+    }
+
+    private String createMBeanName(String name, String scope)
+    {
+        // the "index" property is only emitted when an index name was supplied
+        String indexProperty = index == null ? "" : ",index=" + index;
+
+        return DefaultNameFactory.GROUP_NAME
+               + ":type=" + TYPE
+               + ",keyspace=" + table.keyspace
+               + ",table=" + table.name
+               + indexProperty
+               + ",scope=" + scope
+               + ",name=" + name;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
new file mode 100644
index 000000000000..a0559c58fa04
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+import com.codahale.metrics.Meter;
+import com.codahale.metrics.Timer;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public abstract class ColumnQueryMetrics extends AbstractMetrics
+{
+    private ColumnQueryMetrics(String indexName, TableMetadata table)
+    {
+        super(table, indexName, "ColumnQueryMetrics");
+    }
+
+    public static class TrieIndexMetrics extends ColumnQueryMetrics implements QueryEventListener.TrieIndexEventListener
+    {
+        private static final String TRIE_POSTINGS_TYPE = "Postings";
+
+        /** Timer "TermsLookupLatency", updated with the total time of each completed traversal. */
+        private final Timer termsTraversalTotalTime;
+
+        /** Marks the "PostingDecodes" meter (scope {@link #TRIE_POSTINGS_TYPE}) for every decoded posting. */
+        private final QueryEventListener.PostingListEventListener postingsListener;
+
+        public TrieIndexMetrics(String indexName, TableMetadata table)
+        {
+            super(indexName, table);
+
+            this.termsTraversalTotalTime = Metrics.timer(createMetricName("TermsLookupLatency"));
+            this.postingsListener =
+                new PostingListEventsMetrics(Metrics.meter(createMetricName("PostingDecodes", TRIE_POSTINGS_TYPE)));
+        }
+
+        @Override
+        public QueryEventListener.PostingListEventListener postingListEventListener()
+        {
+            return postingsListener;
+        }
+
+        @Override
+        public void onTraversalComplete(long traversalTotalTime, TimeUnit unit)
+        {
+            termsTraversalTotalTime.update(traversalTotalTime, unit);
+        }
+
+        @Override
+        public void onSegmentHit()
+        {
+            // no metric is recorded for segment hits here
+        }
+    }
+
+    public static class BKDIndexMetrics extends ColumnQueryMetrics implements QueryEventListener.BKDIndexEventListener
+    {
+        private static final String BKD_POSTINGS_TYPE = "KDTreePostings";
+
+        /** Timer "KDTreeIntersectionLatency", updated with the total time of each completed intersection. */
+        private final Timer intersectionLatency;
+        /** Meter "NumPostings" (scope {@link #BKD_POSTINGS_TYPE}), marked with the number of posting lists hit. */
+        private final Meter postingsNumPostings;
+        /** Meter "KDTreeIntersectionEarlyExits", marked once per early exit. */
+        private final Meter intersectionEarlyExits;
+
+        private final QueryEventListener.PostingListEventListener postingsListener;
+
+        public BKDIndexMetrics(String indexName, TableMetadata table)
+        {
+            super(indexName, table);
+
+            this.intersectionLatency = Metrics.timer(createMetricName("KDTreeIntersectionLatency"));
+            this.intersectionEarlyExits = Metrics.meter(createMetricName("KDTreeIntersectionEarlyExits"));
+            this.postingsNumPostings = Metrics.meter(createMetricName("NumPostings", BKD_POSTINGS_TYPE));
+            this.postingsListener =
+                new PostingListEventsMetrics(Metrics.meter(createMetricName("PostingDecodes", BKD_POSTINGS_TYPE)));
+        }
+
+        @Override
+        public QueryEventListener.PostingListEventListener postingListEventListener()
+        {
+            return postingsListener;
+        }
+
+        @Override
+        public void onIntersectionComplete(long intersectionTotalTime, TimeUnit unit)
+        {
+            intersectionLatency.update(intersectionTotalTime, unit);
+        }
+
+        @Override
+        public void onIntersectionEarlyExit()
+        {
+            intersectionEarlyExits.mark();
+        }
+
+        @Override
+        public void postingListsHit(int count)
+        {
+            postingsNumPostings.mark(count);
+        }
+
+        @Override
+        public void onSegmentHit()
+        {
+            // no metric is recorded for segment hits here
+        }
+    }
+
+    private static class PostingListEventsMetrics implements QueryEventListener.PostingListEventListener
+    {
+        private final Meter decodeMeter;
+
+        private PostingListEventsMetrics(Meter postingDecodes)
+        {
+            this.decodeMeter = postingDecodes;
+        }
+
+        @Override
+        public void onPostingDecoded()
+        {
+            decodeMeter.mark();
+        }
+
+        @Override
+        public void onAdvance() { }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/IndexGroupMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/IndexGroupMetrics.java
new file mode 100644
index 000000000000..50ed57b92ecd
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/IndexGroupMetrics.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import com.codahale.metrics.Gauge;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public class IndexGroupMetrics extends AbstractMetrics
+{
+    public final Gauge openIndexFiles; // backed by group::openIndexFiles
+    public final Gauge diskUsedBytes;  // backed by group::diskUsage
+
+    public IndexGroupMetrics(TableMetadata table, StorageAttachedIndexGroup group)
+    {
+        super(table, "IndexGroupMetrics");
+        // Both gauges wrap method references, so the group is queried whenever a gauge value is read.
+        openIndexFiles = Metrics.register(createMetricName("OpenIndexFiles"), group::openIndexFiles);
+        // NOTE(review): the fields use the raw Gauge type; consider Gauge<T> once the accessor return types are confirmed.
+        diskUsedBytes = Metrics.register(createMetricName("DiskUsedBytes"), group::diskUsage);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/IndexMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/IndexMetrics.java
new file mode 100644
index 000000000000..29881ff59740
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/IndexMetrics.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.Histogram;
+import com.codahale.metrics.Timer;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public class IndexMetrics extends AbstractMetrics
+{
+    public final Timer memtableIndexWriteLatency;
+    // Gauges: each one is registered in the constructor with a ColumnContext method reference.
+    public final Gauge ssTableCellCount;
+    public final Gauge liveMemtableIndexWriteCount;
+    public final Gauge diskUsedBytes;
+    public final Gauge memtableIndexBytes;
+    public final Gauge indexFileCacheBytes;
+    // Counters.
+    public final Counter memtableIndexFlushCount;
+    public final Counter compactionCount;
+    public final Counter memtableIndexFlushErrors;
+    public final Counter segmentFlushErrors; // published under the name "CompactionSegmentFlushErrors"
+    // Histograms.
+    public final Histogram memtableFlushCellsPerSecond; // published under the name "MemtableIndexFlushCellsPerSecond"
+    public final Histogram segmentsPerCompaction;
+    public final Histogram compactionSegmentCellsPerSecond;
+    public final Histogram compactionSegmentBytesPerSecond;
+
+    public IndexMetrics(ColumnContext context, TableMetadata table)
+    {
+        super(table, context.getIndexName(), "IndexMetrics");
+        // Every metric name is built with createMetricName(...); the histogram calls pass false as their second argument.
+        memtableIndexWriteLatency = Metrics.timer(createMetricName("MemtableIndexWriteLatency"));
+        compactionSegmentCellsPerSecond = Metrics.histogram(createMetricName("CompactionSegmentCellsPerSecond"), false);
+        compactionSegmentBytesPerSecond = Metrics.histogram(createMetricName("CompactionSegmentBytesPerSecond"), false);
+        memtableFlushCellsPerSecond = Metrics.histogram(createMetricName("MemtableIndexFlushCellsPerSecond"), false);
+        segmentsPerCompaction = Metrics.histogram(createMetricName("SegmentsPerCompaction"), false);
+        ssTableCellCount = Metrics.register(createMetricName("SSTableCellCount"), context::getCellCount);
+        memtableIndexFlushCount = Metrics.counter(createMetricName("MemtableIndexFlushCount"));
+        compactionCount = Metrics.counter(createMetricName("CompactionCount"));
+        memtableIndexFlushErrors = Metrics.counter(createMetricName("MemtableIndexFlushErrors"));
+        segmentFlushErrors = Metrics.counter(createMetricName("CompactionSegmentFlushErrors"));
+        liveMemtableIndexWriteCount = Metrics.register(createMetricName("LiveMemtableIndexWriteCount"), context::liveMemtableWriteCount);
+        memtableIndexBytes = Metrics.register(createMetricName("MemtableIndexBytes"), context::estimatedMemIndexMemoryUsed);
+        diskUsedBytes = Metrics.register(createMetricName("DiskUsedBytes"), context::diskUsage);
+        indexFileCacheBytes = Metrics.register(createMetricName("IndexFileCacheBytes"), context::indexFileCacheSize);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java b/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
new file mode 100644
index 000000000000..f3926dcdfd8e
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.index.sai.QueryContext;
+
+public final class MulticastQueryEventListeners
+{
+    public static QueryEventListener.TrieIndexEventListener of(QueryContext ctx, QueryEventListener.TrieIndexEventListener listener)
+    {
+        return new Multicast2TrieIndexEventListener(ctx, listener);
+    }
+    // Same wrapping for the BKD listener: events update counters on ctx and are then forwarded to listener.
+    public static QueryEventListener.BKDIndexEventListener of(QueryContext ctx, QueryEventListener.BKDIndexEventListener listener)
+    {
+        return new Multicast2BKDIndexEventListener(ctx, listener);
+    }
+    /** Trie listener that bumps the per-query counters on {@code ctx} and then forwards to the wrapped listener. */
+    public static class Multicast2TrieIndexEventListener implements QueryEventListener.TrieIndexEventListener
+    {
+        private final QueryContext ctx;
+        private final QueryEventListener.TrieIndexEventListener listener;
+        private final Multicast2TriePostingListEventListener postingListEventListener;
+
+        private Multicast2TrieIndexEventListener(QueryContext ctx, QueryEventListener.TrieIndexEventListener listener)
+        {
+            this.ctx = ctx;
+            this.listener = listener;
+            this.postingListEventListener = new Multicast2TriePostingListEventListener(ctx, listener.postingListEventListener());
+        }
+
+        @Override
+        public void onSegmentHit()
+        {
+            ctx.segmentsHit++;
+            ctx.trieSegmentsHit++;
+            listener.onSegmentHit();
+        }
+
+        @Override
+        public void onTraversalComplete(long traversalTotalTime, TimeUnit unit)
+        {
+            listener.onTraversalComplete(traversalTotalTime, unit); // forwarded only; no per-query counter is kept for it
+        }
+
+        @Override
+        public QueryEventListener.PostingListEventListener postingListEventListener()
+        {
+            return postingListEventListener;
+        }
+    }
+    /** BKD listener that bumps the per-query counters on {@code ctx} and then forwards to the wrapped listener. */
+    public static class Multicast2BKDIndexEventListener implements QueryEventListener.BKDIndexEventListener
+    {
+        private final QueryContext ctx;
+        private final QueryEventListener.BKDIndexEventListener listener;
+        private final Multicast2BKDPostingListEventListener postingListEventListener;
+
+        private Multicast2BKDIndexEventListener(QueryContext ctx, QueryEventListener.BKDIndexEventListener listener)
+        {
+            this.ctx = ctx;
+            this.listener = listener;
+            this.postingListEventListener = new Multicast2BKDPostingListEventListener(ctx, listener.postingListEventListener());
+        }
+
+        @Override
+        public void onIntersectionComplete(long intersectionTotalTime, TimeUnit unit)
+        {
+            listener.onIntersectionComplete(intersectionTotalTime, unit); // forwarded only
+        }
+
+        @Override
+        public void onIntersectionEarlyExit()
+        {
+            listener.onIntersectionEarlyExit(); // forwarded only
+        }
+
+        @Override
+        public void postingListsHit(int count)
+        {
+            ctx.bkdPostingListsHit += count; // add the reported number of lists, the same amount the wrapped listener receives
+            listener.postingListsHit(count);
+        }
+
+        @Override
+        public void onSegmentHit()
+        {
+            ctx.segmentsHit++;
+            ctx.bkdSegmentsHit++;
+            listener.onSegmentHit();
+        }
+
+        @Override
+        public QueryEventListener.PostingListEventListener postingListEventListener()
+        {
+            return postingListEventListener;
+        }
+    }
+    /** Counts BKD posting-list skips and decodes on {@code ctx}, then forwards to the wrapped listener. */
+    public static class Multicast2BKDPostingListEventListener implements QueryEventListener.PostingListEventListener
+    {
+        private final QueryContext ctx;
+        private final QueryEventListener.PostingListEventListener listener;
+
+        Multicast2BKDPostingListEventListener(QueryContext ctx, QueryEventListener.PostingListEventListener listener)
+        {
+            this.ctx = ctx;
+            this.listener = listener;
+        }
+
+        @Override
+        public void onAdvance()
+        {
+            ctx.bkdPostingsSkips++;
+            listener.onAdvance();
+        }
+
+        @Override
+        public void onPostingDecoded()
+        {
+            ctx.bkdPostingsDecodes++;
+            listener.onPostingDecoded();
+        }
+    }
+    /** Counts trie posting-list skips and decodes on {@code ctx}, then forwards to the wrapped listener. */
+    public static class Multicast2TriePostingListEventListener implements QueryEventListener.PostingListEventListener
+    {
+        private final QueryContext ctx;
+        private final QueryEventListener.PostingListEventListener listener;
+
+        Multicast2TriePostingListEventListener(QueryContext ctx, QueryEventListener.PostingListEventListener listener)
+        {
+            this.ctx = ctx;
+            this.listener = listener;
+        }
+
+        @Override
+        public void onAdvance()
+        {
+            ctx.triePostingsSkips++;
+            listener.onAdvance();
+        }
+
+        @Override
+        public void onPostingDecoded()
+        {
+            ctx.triePostingsDecodes++;
+            listener.onPostingDecoded();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java b/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
new file mode 100644
index 000000000000..134590b5d7fd
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Listener that gets notified during storage-attached index query execution.
+ */
+public interface QueryEventListener
+{
+    /**
+     * Returns listener for bkd index events.
+     */
+    BKDIndexEventListener bkdIndexEventListener();
+
+    /**
+     * Returns listener for trie index events.
+     */
+    TrieIndexEventListener trieIndexEventListener();
+
+    /**
+     * Collector for kd-tree index file related metrics.
+     */
+    interface BKDIndexEventListener
+    {
+        /**
+         * Per-segment kd-tree index intersection time in given units. Recorded when intersection completes.
+         */
+        void onIntersectionComplete(long intersectionTotalTime, TimeUnit unit);
+
+        /**
+         * When an intersection exits early due to the query shape being completely outside the min/max range.
+         */
+        void onIntersectionEarlyExit();
+
+        /**
+         * How many bkd posting lists were matched during the intersection.
+         */
+        void postingListsHit(int count);
+
+        /**
+         * When query potentially matches value range within a segment and we need to do a traversal.
+         */
+        void onSegmentHit();
+
+        /**
+         * Returns events listener for bkd postings.
+         */
+        PostingListEventListener postingListEventListener();
+    }
+    /** Collector for trie index related metrics. */
+    interface TrieIndexEventListener
+    {
+        /**
+         * When query potentially matches value range within a segment and we need to do a traversal.
+         */
+        void onSegmentHit();
+
+        /**
+         * Per-segment trie index traversal time in given units. Recorded when traversal completes.
+         */
+        void onTraversalComplete(long traversalTotalTime, TimeUnit unit);
+
+        /**
+         * Returns events listener for trie postings.
+         */
+        PostingListEventListener postingListEventListener();
+    }
+
+    /**
+     * Collector for posting file related metrics.
+     */
+    interface PostingListEventListener
+    {
+        /**
+         * When an individual posting list is advanced.
+         */
+        void onAdvance();
+
+        /**
+         * When a posting is successfully read from disk and decoded.
+         */
+        void onPostingDecoded();
+        /** Shared listener whose callbacks do nothing. */
+        PostingListEventListener NO_OP = new PostingListEventListener()
+        {
+            @Override
+            public void onAdvance()
+            {
+                // no-op
+            }
+
+            @Override
+            public void onPostingDecoded()
+            {
+                // no-op
+            }
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/TableQueryMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/TableQueryMetrics.java
new file mode 100644
index 000000000000..5b38f3919f1a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/TableQueryMetrics.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.codahale.metrics.Counter;
+import com.codahale.metrics.Histogram;
+import com.codahale.metrics.Meter;
+import com.codahale.metrics.Timer;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.NoSpamLogger;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public class TableQueryMetrics extends AbstractMetrics
+{
+    public static final String TABLE_QUERY_METRIC_TYPE = "TableQueryMetrics";
+    private static final Logger logger = LoggerFactory.getLogger(TableQueryMetrics.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 60, TimeUnit.SECONDS); // NOTE(review): never referenced in this class - confirm it can be removed
+
+    private final PerQueryMetrics perQueryMetrics;
+
+    private final Counter totalQueryTimeouts;
+    private final Counter totalPartitionReads;
+    private final Counter totalRowsFiltered;
+    private final Counter totalQueriesCompleted;
+
+    private final Meter tokenSkippingLookups;
+    private final Meter tokenSkippingCacheHits;
+
+    public TableQueryMetrics(TableMetadata table)
+    {
+        super(table, TABLE_QUERY_METRIC_TYPE);
+
+        perQueryMetrics = new PerQueryMetrics(table);
+
+        totalPartitionReads = Metrics.counter(createMetricName("TotalPartitionReads"));
+        totalRowsFiltered = Metrics.counter(createMetricName("TotalRowsFiltered"));
+        totalQueriesCompleted = Metrics.counter(createMetricName("TotalQueriesCompleted"));
+        totalQueryTimeouts = Metrics.counter(createMetricName("TotalQueryTimeouts"));
+
+        tokenSkippingLookups = Metrics.meter(createMetricName("Lookups", "TokenSkipping"));
+        tokenSkippingCacheHits = Metrics.meter(createMetricName("CacheHits", "TokenSkipping"));
+    }
+    /** Folds the counters gathered in {@code queryContext} into the table-level and per-query metrics. */
+    public void record(QueryContext queryContext)
+    {
+        if (queryContext.queryTimeouts > 0)
+        {
+            assert queryContext.queryTimeouts == 1;
+            // the assert above expects at most one timeout per query, hence a single inc()
+            totalQueryTimeouts.inc();
+        }
+
+        long skippingLookups = queryContext.tokenSkippingLookups;
+        long skippingCacheHits = queryContext.tokenSkippingCacheHits;
+
+        tokenSkippingLookups.mark(skippingLookups);
+        tokenSkippingCacheHits.mark(skippingCacheHits);
+
+        perQueryMetrics.record(queryContext);
+    }
+    /** Calls {@code super.release()} and then releases the nested per-query metrics. */
+    public void release()
+    {
+        super.release();
+        perQueryMetrics.release();
+    }
+    /** Per-query timer and histograms; an inner class, as its record(...) also updates the outer total* counters. */
+    public class PerQueryMetrics extends AbstractMetrics
+    {
+        private final Timer queryLatency;
+
+        /**
+         * Global metrics for all indices hit during the query.
+         */
+        private final Histogram sstablesHit;
+        private final Histogram segmentsHit;
+        private final Histogram partitionReads;
+        private final Histogram rowsFiltered;
+
+        /**
+         * BKD index metrics.
+         */
+        private final Histogram kdTreePostingsNumPostings;
+        /**
+         * BKD index posting lists metrics.
+         */
+        private final Histogram kdTreePostingsSkips;
+        private final Histogram kdTreePostingsDecodes;
+        /**
+         * Trie index posting lists metrics.
+         */
+        private final Histogram postingsSkips;
+        private final Histogram postingsDecodes;
+
+        public PerQueryMetrics(TableMetadata table)
+        {
+            super(table, "PerQuery");
+
+            queryLatency = Metrics.timer(createMetricName("QueryLatency"));
+
+            sstablesHit = Metrics.histogram(createMetricName("SSTableIndexesHit"), false);
+            segmentsHit = Metrics.histogram(createMetricName("IndexSegmentsHit"), false);
+
+            kdTreePostingsSkips = Metrics.histogram(createMetricName("KDTreePostingsSkips"), false);
+
+            kdTreePostingsNumPostings = Metrics.histogram(createMetricName("KDTreePostingsNumPostings"), false);
+            kdTreePostingsDecodes = Metrics.histogram(createMetricName("KDTreePostingsDecodes"), false);
+
+            postingsSkips = Metrics.histogram(createMetricName("PostingsSkips"), false);
+            postingsDecodes = Metrics.histogram(createMetricName("PostingsDecodes"), false);
+
+            partitionReads = Metrics.histogram(createMetricName("PartitionReads"), false);
+            rowsFiltered = Metrics.histogram(createMetricName("RowsFiltered"), false);
+        }
+        /** Despite the name, updates the trie posting-list histograms (PostingsSkips / PostingsDecodes). */
+        private void recordStringIndexCacheMetrics(QueryContext events)
+        {
+            postingsSkips.update(events.triePostingsSkips);
+            postingsDecodes.update(events.triePostingsDecodes);
+        }
+        /** Despite the name, updates the kd-tree posting-list histograms from the query's bkd* counters. */
+        private void recordNumericIndexCacheMetrics(QueryContext events)
+        {
+            kdTreePostingsNumPostings.update(events.bkdPostingListsHit);
+
+            kdTreePostingsSkips.update(events.bkdPostingsSkips);
+            kdTreePostingsDecodes.update(events.bkdPostingsDecodes);
+        }
+        /** Records latency, hit/read/filter counts and posting-list statistics for one finished query. */
+        public void record(QueryContext queryContext)
+        {
+            final long totalQueryTimeNs = queryContext.totalQueryTimeNs();
+            queryLatency.update(totalQueryTimeNs, TimeUnit.NANOSECONDS);
+            final long queryLatencyMicros = TimeUnit.NANOSECONDS.toMicros(totalQueryTimeNs); // only used by the trace message below
+
+            final long ssTablesHit = queryContext.sstablesHit;
+            final long segmentsHit = queryContext.segmentsHit;
+            final long partitionsRead = queryContext.partitionsRead;
+            final long rowsFiltered = queryContext.rowsFiltered;
+
+            sstablesHit.update(ssTablesHit);
+            this.segmentsHit.update(segmentsHit);
+
+            partitionReads.update(partitionsRead);
+            totalPartitionReads.inc(partitionsRead);
+
+            this.rowsFiltered.update(rowsFiltered);
+            totalRowsFiltered.inc(rowsFiltered);
+
+            if (Tracing.isTracing())
+            {
+                Tracing.trace("Index query accessed memtable indexes, {}, and {}, post-filtered {} in {}, and took {} microseconds.",
+                              pluralize(ssTablesHit, "SSTable index", "es"), pluralize(segmentsHit, "segment", "s"),
+                              pluralize(rowsFiltered, "row", "s"), pluralize(partitionsRead, "partition", "s"),
+                              queryLatencyMicros);
+            }
+            // Posting-list histograms are only updated for index kinds that had at least one segment hit.
+            if (queryContext.trieSegmentsHit > 0)
+            {
+                recordStringIndexCacheMetrics(queryContext);
+            }
+
+            if (queryContext.bkdSegmentsHit > 0)
+            {
+                recordNumericIndexCacheMetrics(queryContext);
+            }
+
+            totalQueriesCompleted.inc();
+        }
+    }
+    /** Formats {@code count} followed by {@code root}, appending {@code plural} unless {@code count} is exactly 1. */
+    private String pluralize(long count, String root, String plural)
+    {
+        return count == 1 ? String.format("1 %s", root) : String.format("%d %s%s", count, root, plural);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/TableStateMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/TableStateMetrics.java
new file mode 100644
index 000000000000..9841770f730b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/metrics/TableStateMetrics.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import com.codahale.metrics.Gauge;
+import com.codahale.metrics.RatioGauge;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+public class TableStateMetrics extends AbstractMetrics
+{
+    public static final String TABLE_STATE_METRIC_TYPE = "TableStateMetrics";
+    // Raw Gauge fields; each is registered in the constructor from a StorageAttachedIndexGroup accessor.
+    private final Gauge diskUsageBytes;
+    private final Gauge diskUsagePercentageOfBaseTable;
+    private final Gauge totalIndexCount;
+    private final Gauge totalIndexBuildsInProgress;
+    private final Gauge totalQueryableIndexCount;
+    // NOTE(review): "DiskPercentageOfBaseTable" is a RatioGauge (numerator/denominator) - confirm consumers expect a ratio, not 0-100.
+    public TableStateMetrics(TableMetadata table, StorageAttachedIndexGroup group)
+    {
+        super(table, TABLE_STATE_METRIC_TYPE);
+        totalQueryableIndexCount = Metrics.register(createMetricName("TotalQueryableIndexCount"), group::totalQueryableIndexCount);
+        totalIndexCount = Metrics.register(createMetricName("TotalIndexCount"), group::totalIndexCount);
+        totalIndexBuildsInProgress = Metrics.register(createMetricName("TotalIndexBuildsInProgress"), group::totalIndexBuildsInProgress);
+        diskUsageBytes = Metrics.register(createMetricName("DiskUsedBytes"), group::totalDiskUsage);
+        diskUsagePercentageOfBaseTable = Metrics.register(createMetricName("DiskPercentageOfBaseTable"), new RatioGauge() {
+            @Override
+            protected Ratio getRatio()
+            {
+                return Ratio.of(group.totalDiskUsage(), group.table().metric.liveDiskSpaceUsed.getCount()); // index disk usage over the base table's liveDiskSpaceUsed count
+            }
+        });
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Expression.java b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
new file mode 100644
index 000000000000..20e5f4766832
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
@@ -0,0 +1,425 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Iterators;
+import org.apache.commons.lang3.builder.HashCodeBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class Expression
+{
+    private static final Logger logger = LoggerFactory.getLogger(Expression.class);
+
+    public enum Op
+    {
+        EQ, MATCH, PREFIX, NOT_EQ, RANGE, CONTAINS_KEY, CONTAINS_VALUE;
+        /** Maps a CQL {@link Operator} to an index operation; returns {@code null} for operators with no case below. */
+        public static Op valueOf(Operator operator)
+        {
+            switch (operator)
+            {
+                case EQ:
+                    return EQ;
+
+                case NEQ:
+                    return NOT_EQ;
+
+                case CONTAINS:
+                    return CONTAINS_VALUE; // non-frozen map: value contains term;
+
+                case CONTAINS_KEY:
+                    return CONTAINS_KEY; // non-frozen map: value contains key term;
+
+                case LT:
+                case GT:
+                case LTE:
+                case GTE:
+                    return RANGE; // all four inequality operators collapse into one RANGE operation
+
+                case LIKE_PREFIX:
+                    return PREFIX;
+
+                case LIKE_MATCHES:
+                    return MATCH;
+
+                default:
+                    return null; // callers must be prepared for null here
+            }
+        }
+        /** True for EQ and for both CONTAINS_* operations. */
+        public boolean isEquality()
+        {
+            return this == EQ || this == CONTAINS_KEY || this == CONTAINS_VALUE;
+        }
+        /** True for any equality operation (see isEquality) and for RANGE. */
+        public boolean isEqualityOrRange()
+        {
+            return isEquality() || this == RANGE;
+        }
+    }
+
+    public final AbstractAnalyzer analyzer;
+
+    public final ColumnContext context;
+    public final AbstractType<?> validator;
+
+    @VisibleForTesting
+    protected Op operation;
+
+    public Bound lower, upper;
+    public boolean upperInclusive, lowerInclusive;
+
+    final List<ByteBuffer> exclusions = new ArrayList<>();
+
+    public Expression(ColumnContext columnContext)
+    {
+        this.context = columnContext;
+        this.analyzer = columnContext.getAnalyzer(); // analyzer and validator are both taken from the column context
+        this.validator = columnContext.getValidator();
+    }
+
+    public Expression add(Operator op, ByteBuffer value)
+    {
+        boolean lowerInclusive, upperInclusive; // locals shadowing the same-named fields; they feed the Bound constructors below
+        // If the type supports rounding then we need to make sure that index
+        // range search is always inclusive, otherwise we run the risk of
+        // missing values that are within the exclusive range but are rejected
+        // because their rounded value is the same as the value being queried.
+        lowerInclusive = upperInclusive = TypeUtil.supportsRounding(validator);
+        switch (op)
+        {
+            case LIKE_PREFIX:
+            case LIKE_MATCHES:
+            case EQ:
+            case CONTAINS:
+            case CONTAINS_KEY:
+                lower = new Bound(value, validator, true); // single-value lookup: lower and upper share one inclusive bound
+                upper = lower;
+                operation = Op.valueOf(op);
+                break;
+
+            case NEQ:
+                // index expressions are priority sorted
+                // and NOT_EQ is the lowest priority, which means that operation type
+                // is always going to be set before reaching it in case of RANGE or EQ.
+                if (operation == null)
+                {
+                    operation = Op.NOT_EQ;
+                    lower = new Bound(value, validator, true);
+                    upper = lower;
+                }
+                else
+                    exclusions.add(value);
+                break;
+            // LTE only marks the bound inclusive (field and local), then falls through (no break) into LT, which builds it.
+            case LTE:
+                if (context.getDefinition().isReversedType())
+                {
+                    this.lowerInclusive = true;
+                    lowerInclusive = true;
+                }
+                else
+                {
+                    this.upperInclusive = true;
+                    upperInclusive = true;
+                }
+            case LT:
+                operation = Op.RANGE;
+                if (context.getDefinition().isReversedType()) // for a reversed type the value becomes the lower bound instead
+                    lower = new Bound(value, validator, lowerInclusive);
+                else
+                    upper = new Bound(value, validator, upperInclusive);
+                break;
+            // GTE mirrors LTE: it marks the bound inclusive and falls through (no break) into GT.
+            case GTE:
+                if (context.getDefinition().isReversedType())
+                {
+                    this.upperInclusive = true;
+                    upperInclusive = true;
+                }
+                else
+                {
+                    this.lowerInclusive = true;
+                    lowerInclusive = true;
+                }
+            case GT:
+                operation = Op.RANGE;
+                if (context.getDefinition().isReversedType()) // for a reversed type the value becomes the upper bound instead
+                    upper = new Bound(value, validator,  upperInclusive);
+                else
+                    lower = new Bound(value, validator, lowerInclusive);
+                break;
+        }
+        // Operators without a case above leave this expression unchanged.
+        return this;
+    }
+
+    /**
+     * Post-filters a raw column value against the bounds and exclusions of this expression.
+     * For NOT_EQ the result reports equality with the operand; {@code FilterTree#singletonMatch} inverts it.
+     */
+    public boolean isSatisfiedBy(ByteBuffer columnValue)
+    {
+        if (!TypeUtil.isValid(columnValue, validator))
+        {
+            logger.error(context.logMessage("Value is not valid for indexed column {} with {}"), context.getColumnName(), validator);
+            return false;
+        }
+        Value value = new Value(columnValue, validator);
+
+        if (lower != null)
+        {
+            // literal types are matched term by term through the analyzer
+            if (TypeUtil.isLiteral(validator))
+            {
+                if (!validateStringValue(value.raw, lower.value.raw))
+                    return false;
+            }
+            else
+            {
+                // range or (not-)equals - (mainly) for numeric values
+                int cmp = TypeUtil.comparePostFilter(lower.value, value, validator);
+                // in case of (NOT_)EQ lower == upper: a mismatch is final, whereas a match must
+                // still pass the exclusions check below instead of returning early
+                boolean equality = operation == Op.EQ || operation == Op.CONTAINS_KEY || operation == Op.CONTAINS_VALUE || operation == Op.NOT_EQ;
+                if (equality ? cmp != 0 : (cmp > 0 || (cmp == 0 && !lowerInclusive)))
+                    return false;
+            }
+        }
+
+        if (upper != null && lower != upper)
+        {
+            // string (prefix or suffix) check
+            if (TypeUtil.isLiteral(validator))
+            {
+                if (!validateStringValue(value.raw, upper.value.raw))
+                    return false;
+            }
+            else
+            {
+                // range - mainly for numeric values
+                int cmp = TypeUtil.comparePostFilter(upper.value, value, validator);
+                if (cmp < 0 || (cmp == 0 && !upperInclusive))
+                    return false;
+            }
+        }
+
+        // as a last step let's check exclusions for the given field, this covers EQ/RANGE with exclusions.
+        for (ByteBuffer term : exclusions)
+        {
+            if (TypeUtil.isLiteral(validator) && validateStringValue(value.raw, term) ||
+                TypeUtil.comparePostFilter(new Value(term, validator), value, validator) == 0)
+                return false;
+        }
+
+        return true;
+    }
+
+    private boolean validateStringValue(ByteBuffer columnValue, ByteBuffer requestedValue)
+    {
+        // Tokenize a duplicate of the column value (the caller's buffer position is not moved)
+        // and succeed as soon as a single produced term matches.
+        analyzer.reset(columnValue.duplicate());
+        while (analyzer.hasNext())
+        {
+            ByteBuffer candidate = analyzer.next();
+            switch (operation)
+            {
+                case PREFIX:
+                    if (ByteBufferUtil.startsWith(candidate, requestedValue))
+                        return true;
+                    break;
+
+                case RANGE:
+                    if (isLowerSatisfiedBy(candidate) && isUpperSatisfiedBy(candidate))
+                        return true;
+                    break;
+
+                // NOT_EQ is matched like EQ here, the conclusion on != is drawn by the caller
+                case EQ:
+                case MATCH:
+                case CONTAINS_KEY:
+                case CONTAINS_VALUE:
+                case NOT_EQ:
+                    if (validator.compare(candidate, requestedValue) == 0)
+                        return true;
+                    break;
+            }
+        }
+
+        return false;
+    }
+
+    public Op getOp()
+    {
+        return this.operation; // the operation type assigned by add()
+    }
+
+    private boolean hasLower()
+    {
+        return !(lower == null); // true once a lower bound has been set
+    }
+
+    private boolean hasUpper()
+    {
+        return !(upper == null); // true once an upper bound has been set
+    }
+
+    private boolean isLowerSatisfiedBy(ByteBuffer value)
+    {
+        if (!hasLower())
+            return true; // no lower bound, nothing to violate
+
+        int order = validator.compare(value, lower.value.raw);
+        return order == 0 ? lower.inclusive : order > 0;
+    }
+
+    private boolean isUpperSatisfiedBy(ByteBuffer value)
+    {
+        if (!hasUpper())
+            return true; // no upper bound, nothing to violate
+
+        int order = validator.compare(value, upper.value.raw);
+        return order == 0 ? upper.inclusive : order < 0;
+    }
+
+    public String toString()
+    {
+        // bounds are rendered as (value, inclusive); an absent bound prints as (null, false)
+        String lowerText = lower == null ? "null" : validator.getString(lower.value.raw);
+        String upperText = upper == null ? "null" : validator.getString(upper.value.raw);
+        String excluded = Iterators.toString(Iterators.transform(exclusions.iterator(), validator::getString));
+
+        return String.format("Expression{name: %s, op: %s, lower: (%s, %s), upper: (%s, %s), exclusions: %s}",
+                             context.getColumnName(), operation, lowerText, lower != null && lower.inclusive,
+                             upperText, upper != null && upper.inclusive, excluded);
+    }
+
+    public int hashCode()
+    {
+        // combines the same properties equals() compares: column name, operation, validator, bounds, exclusions
+        HashCodeBuilder hash = new HashCodeBuilder();
+        hash.append(context.getColumnName()).append(operation).append(validator);
+        hash.append(lower).append(upper).append(exclusions);
+        return hash.toHashCode();
+    }
+
+    public boolean equals(Object other)
+    {
+        if (this == other)
+            return true;
+
+        if (!(other instanceof Expression))
+            return false;
+
+        // compares the same properties that hashCode() combines
+        Expression that = (Expression) other;
+        return Objects.equals(context.getColumnName(), that.context.getColumnName())
+               && validator.equals(that.validator)
+               && operation == that.operation
+               && Objects.equals(lower, that.lower)
+               && Objects.equals(upper, that.upper)
+               && exclusions.equals(that.exclusions);
+    }
+
+    /**
+     * A column value kept in both its raw form and the form produced by {@link TypeUtil#encode}.
+     */
+    public static class Value
+    {
+        public final ByteBuffer raw;
+        public final ByteBuffer encoded;
+
+        public Value(ByteBuffer value, AbstractType<?> type)
+        {
+            raw = value;
+            encoded = TypeUtil.encode(value, type);
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            // both representations have to agree
+            if (other instanceof Value)
+            {
+                Value that = (Value) other;
+                return raw.equals(that.raw) && encoded.equals(that.encoded);
+            }
+            return false;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            // hashes the same two fields that equals() compares, in the same order
+            return new HashCodeBuilder().append(raw).append(encoded).toHashCode();
+        }
+    }
+
+    /** One end of a range: the boundary {@link Value} and whether that value itself is included. */
+    public static class Bound
+    {
+        public final Value value;
+        public final boolean inclusive;
+
+        public Bound(ByteBuffer value, AbstractType<?> type, boolean inclusive)
+        {
+            this.inclusive = inclusive;
+            this.value = new Value(value, type);
+        }
+
+        @Override
+        public boolean equals(Object other)
+        {
+            if (other instanceof Bound)
+            {
+                Bound that = (Bound) other;
+                return inclusive == that.inclusive && value.equals(that.value);
+            }
+            return false;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            // append order (value, then inclusive) is kept so hash codes stay the same
+            return new HashCodeBuilder().append(value).append(inclusive).toHashCode();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java b/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
new file mode 100644
index 000000000000..e35edbb9685b
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
@@ -0,0 +1,244 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.plan;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.List;
+import java.util.ListIterator;
+
+import com.google.common.collect.ListMultimap;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.index.sai.plan.Expression.Op;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.ColumnMetadata.Kind;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.index.sai.plan.Operation.OperationType;
+
+/**
+ * Tree-like structure to filter base table data using indexed expressions and non-user-defined filters.
+ *
+ * This is needed because:
+ * 1. SAI doesn't index tombstones, base data may have been shadowed.
+ * 2. SAI indexes partition offset, not all rows in partition match index condition.
+ * 3. Replica filtering protection may fetch data that doesn't match index expressions.
+ */
+public class FilterTree
+{
+    protected final OperationType op;
+    protected final ListMultimap<ColumnMetadata, Expression> expressions;
+
+    protected final FilterTree left;
+    protected final FilterTree right;
+
+    FilterTree(OperationType operation,
+               ListMultimap<ColumnMetadata, Expression> expressions,
+               FilterTree left, FilterTree right)
+    {
+        this.op = operation;
+        this.expressions = expressions;
+        // either child may be null; satisfiedBy() copes with missing sub-trees
+        this.left = left;
+        this.right = right;
+    }
+
+    /**
+     * Recursive "satisfies" checks based on operation
+     * and data from the lower level members using depth-first search
+     * and bubbling the results back to the top level caller.
+     *
+     * Most of the work here is done by localSatisfiedBy(DecoratedKey, Unfiltered, Row),
+     * see its comment for details; if there are no local expressions assigned to this
+     * node it will call satisfiedBy(DecoratedKey, Unfiltered, Row) on its children.
+     *
+     * Query: first_name = X AND (last_name = Y OR address = XYZ AND street = IL AND city = C) OR (state = 'CA' AND country = 'US')
+     * Row: key1: (first_name: X, last_name: Z, address: XYZ, street: IL, city: C, state: NY, country:US)
+     *
+     * #1                       OR
+     *                        /    \
+     * #2       (first_name) AND   AND (state, country)
+     *                          \
+     * #3            (last_name) OR
+     *                             \
+     * #4                          AND (address, street, city)
+     *
+     * A node without local expressions that has a single child returns that child's result unchanged.
+     * Evaluation of the key1 is top-down depth-first search:
+     *
+     * --- going down ---
+     * Level #1 is evaluated, OR expression has to pull results from its children which are at level #2 and OR them together,
+     * Level #2 AND (state, country) could be evaluated right away, AND (first_name) refers to its "right" child from level #3
+     * Level #3 OR (last_name) requests results from level #4
+     * Level #4 AND (address, street, city) does logical AND between its 3 fields, returns result back to level #3.
+     * --- bubbling up ---
+     * Level #3 computes OR between AND (address, street, city) result and its "last_name" expression
+     * Level #2 computes AND between "first_name" and result of level #3, AND (state, country) which is already computed
+     * Level #1 does OR between results of AND (first_name) and AND (state, country) and returns final result.
+     *
+     * @param key The partition key for the row.
+     * @param currentCluster The row cluster to check.
+     * @param staticRow The static row associated with current cluster.
+     * @return true if the given Row satisfied all of the expressions in the tree,
+     *         false otherwise.
+     */
+    public boolean satisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
+    {
+        boolean sideL, sideR;
+
+        if (expressions == null || expressions.isEmpty())
+        {
+            sideL =  left != null &&  left.satisfiedBy(key, currentCluster, staticRow);
+            sideR = right != null && right.satisfiedBy(key, currentCluster, staticRow);
+
+            // one of the sub-trees is absent (e.g. skipped because it had no indexes attached):
+            // the side that is present decides, otherwise AND with a missing right always failed
+            if (left == null || right == null)
+                return left == null ? sideR : sideL;
+        }
+        else
+        {
+            sideL = localSatisfiedBy(key, currentCluster, staticRow);
+
+            // if there is no right it means that this expression
+            // is last in the sequence, we can just return result from local expressions
+            if (right == null)
+                return sideL;
+
+            sideR = right.satisfiedBy(key, currentCluster, staticRow);
+        }
+
+        return op.apply(sideL, sideR);
+    }
+
+    /**
+     * Check every expression in the analyzed list to figure out if the
+     * columns in the given row match all of them based on the operation
+     * set to the current operation node.
+     *
+     * The algorithm is as follows: for every given expression from analyzed
+     * list get corresponding column from the Row:
+     *   - apply {@link Expression#isSatisfiedBy(ByteBuffer)}
+     *     method to figure out if it's satisfied;
+     *   - apply logical operation between boolean accumulator and current boolean result;
+     *   - if result == false and node's operation is AND return right away;
+     *
+     * After all of the expressions have been evaluated return resulting accumulator variable.
+     *
+     * Example:
+     *
+     * Operation = (op: AND, columns: [first_name = p, 5 < age < 7, last_name: y])
+     * Row = (first_name: pavel, last_name: y, age: 6, timestamp: 15)
+     *
+     * #1 get "first_name" = p (expressions)
+     *      - row-get "first_name"                      => "pavel"
+     *      - compare "pavel" against "p"               => true (current)
+     *      - set accumulator current                   => true (because this is expression #1)
+     *
+     * #2 get "last_name" = y (expressions)
+     *      - row-get "last_name"                       => "y"
+     *      - compare "y" against "y"                   => true (current)
+     *      - set accumulator to accumulator & current  => true
+     *
+     * #3 get 5 < "age" < 7 (expressions)
+     *      - row-get "age"                             => "6"
+     *      - compare 5 < 6 < 7                         => true (current)
+     *      - set accumulator to accumulator & current  => true
+     *
+     * #4 return accumulator => true (row satisfied all of the conditions)
+     *
+     * @param key The partition key for the row.
+     * @param currentCluster The row cluster to check.
+     * @param staticRow The static row associated with current cluster.
+     * @return true if the given Row satisfied all of the analyzed expressions,
+     *         false otherwise.
+     */
+    private boolean localSatisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
+    {
+        if (currentCluster == null || !currentCluster.isRow())
+            return false;
+
+        final int now = FBUtilities.nowInSeconds();
+        boolean result = op == OperationType.AND; // neutral element: true for AND, false for OR
+
+        Iterator<ColumnMetadata> columnIterator = expressions.keySet().iterator();
+        while(columnIterator.hasNext())
+        {
+            ColumnMetadata column = columnIterator.next();
+            Row row = column.kind == Kind.STATIC ? staticRow : (Row)currentCluster;
+
+            // If there is a column with multiple expressions that can mean an OR or (in the case of map
+            // collections) it can mean different map indexes.
+            List<Expression> filters = expressions.get(column);
+
+            // We do a reverse iteration over the filters because NOT_EQ operations will be at the end
+            // of the filter list and we want to check them first.
+            ListIterator<Expression> filterIterator = filters.listIterator(filters.size());
+            while(filterIterator.hasPrevious())
+            {
+                Expression filter = filterIterator.previous();
+
+                if (TypeUtil.isNonFrozenCollection(column.type))
+                {
+                    Iterator<ByteBuffer> valueIterator = filter.context.getValuesOf(row, now);
+                    result = op.apply(result, collectionMatch(valueIterator, filter));
+                }
+                else
+                {
+                    ByteBuffer value = filter.context.getValueOf(key, row, now);
+                    result = op.apply(result, singletonMatch(value, filter));
+                }
+
+                // If the operation is an AND then exit early if we get a single false
+                if (op == OperationType.AND && !result)
+                    return false;
+            }
+        }
+        return result;
+    }
+
+    private boolean singletonMatch(ByteBuffer value, Expression filter)
+    {
+        boolean match = value != null && filter.isSatisfiedBy(value); // a missing value never satisfies the filter
+        // If this is a NOT_EQ operation we have to
+        // invert the match flag (to check against other expressions),
+        if (filter.getOp() == Op.NOT_EQ)
+            match = !match;
+        return match;
+    }
+
+    private boolean collectionMatch(Iterator<ByteBuffer> valueIterator, Expression filter)
+    {
+        if (valueIterator == null)
+            return false;
+
+        while (valueIterator.hasNext()) // a single satisfying element is enough
+        {
+            ByteBuffer value = valueIterator.next();
+            if (value == null)
+                continue;
+            if (filter.isSatisfiedBy(value))
+                return true;
+        }
+        return false;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Operation.java b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
new file mode 100644
index 000000000000..2e91a0d9650a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
@@ -0,0 +1,478 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.ListMultimap;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.utils.RangeIntersectionIterator;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.RangeUnionIterator;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.schema.ColumnMetadata;
+
+public class Operation extends RangeIterator
+{
+    public enum OperationType
+    {
+        AND, OR;
+
+        /** Combines two already-evaluated operands using this logical operator. */
+        public boolean apply(boolean a, boolean b)
+        {
+            switch (this)
+            {
+                case AND:
+                    return a && b;
+                case OR:
+                    return a || b;
+                default:
+                    // only reachable if a constant is added without a matching case
+                    throw new AssertionError();
+            }
+        }
+    }
+
+    final FilterTree filterTree;
+    final RangeIterator range;
+
+    final QueryController controller;
+
+    private Operation(RangeIterator range, FilterTree filterTree, QueryController controller)
+    {
+        super(range); // the wrapped range is also handed to the RangeIterator base class
+        this.controller = controller;
+        this.range = range;
+        this.filterTree = filterTree;
+    }
+
+    public boolean satisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
+    {
+        return this.filterTree.satisfiedBy(key, currentCluster, staticRow); // pure delegation to the filter tree
+    }
+
+    @VisibleForTesting
+    protected static ListMultimap<ColumnMetadata, Expression> analyzeGroup(QueryController controller,
+                                                                           OperationType op,
+                                                                           List<RowFilter.Expression> expressions)
+    {
+        ListMultimap<ColumnMetadata, Expression> analyzed = ArrayListMultimap.create();
+
+        // sort all of the expressions in the operation by name and priority of the logical operator
+        // this gives us an efficient way to handle inequality and combining into ranges without extra processing
+        // and converting expressions from one type to another.
+        expressions.sort((a, b) -> { // NOTE: sorts the passed-in list in place
+            int cmp = a.column().compareTo(b.column());
+            return cmp == 0 ? -Integer.compare(getPriority(a.operator()), getPriority(b.operator())) : cmp; // negated: higher priority first
+        });
+
+        for (final RowFilter.Expression e : expressions)
+        {
+            ColumnContext columnContext = controller.getContext(e);
+            List<Expression> perColumn = analyzed.get(e.column()); // view of the multimap: additions below end up in 'analyzed'
+
+            AbstractAnalyzer analyzer = columnContext.getAnalyzer();
+            analyzer.reset(e.getIndexValue().duplicate());
+
+            // EQ/LIKE_*/NOT_EQ can have multiple expressions e.g. text = "Hello World",
+            // becomes text = "Hello" OR text = "World" because "space" is always interpreted as a split point (by analyzer),
+            // CONTAINS/CONTAINS_KEY are always treated as multiple expressions since they currently only target
+            // collections, NOT_EQ is made an independent expression only in case of pre-existing multiple EQ expressions, or
+            // if there is no EQ operations and NOT_EQ is met or a single NOT_EQ expression present,
+            // in such case we know exactly that there would be no more EQ/RANGE expressions for given column
+            // since NOT_EQ has the lowest priority.
+            boolean isMultiExpression = false;
+            switch (e.operator())
+            {
+                case EQ:
+                    // EQ is a multiple expression only for non-frozen collections,
+                    // where it is being used by map entries
+                    isMultiExpression = columnContext.isNonFrozenCollection();
+                    break;
+
+                case CONTAINS:
+                case CONTAINS_KEY:
+                case LIKE_PREFIX:
+                case LIKE_MATCHES:
+                    isMultiExpression = true;
+                    break;
+
+                case NEQ:
+                    isMultiExpression = (perColumn.size() == 0 || perColumn.size() > 1
+                                     || (perColumn.size() == 1 && perColumn.get(0).getOp() == Expression.Op.NOT_EQ));
+                    break;
+            }
+            if (isMultiExpression)
+            {
+                while (analyzer.hasNext()) // one independent Expression per term produced by the analyzer
+                {
+                    final ByteBuffer token = analyzer.next();
+                    perColumn.add(new Expression(columnContext).add(e.operator(), token));
+                }
+            }
+            else
+            // "range" or not-equals operator, combines both bounds together into the single expression,
+            // iff operation of the group is AND, otherwise we are forced to create separate expressions,
+            // not-equals is combined with the range iff operator is AND.
+            {
+                Expression range;
+                if (perColumn.size() == 0 || op != OperationType.AND)
+                {
+                    perColumn.add((range = new Expression(columnContext)));
+                }
+                else
+                {
+                    range = Iterables.getLast(perColumn); // AND: keep extending the most recent expression of this column
+                }
+
+                if (!TypeUtil.isLiteral(columnContext.getValidator()))
+                {
+                    range.add(e.operator(), e.getIndexValue().duplicate()); // non-literal values bypass the analyzer
+                }
+                else
+                {
+                    while (analyzer.hasNext())
+                    {
+                        range.add(e.operator(), analyzer.next());
+                    }
+                }
+            }
+        }
+
+        return analyzed;
+    }
+
+    private static int getPriority(org.apache.cassandra.cql3.Operator op)
+    {
+        switch (op) // higher values sort first in analyzeGroup's comparator, unlisted operators rank last
+        {
+            case NEQ:
+                return 1;
+
+            case LT:
+            case LTE:
+                return 2;
+
+            case GT:
+            case GTE:
+                return 3;
+
+            case LIKE_MATCHES:
+            case LIKE_PREFIX:
+                return 4;
+
+            case CONTAINS_KEY:
+            case CONTAINS:
+            case EQ:
+                return 5;
+
+            default:
+                return 0;
+        }
+    }
+
+    @Override
+    protected Token computeNext()
+    {
+        return (range == null || !range.hasNext()) ? endOfData() : range.next(); // endOfData() once nothing is left
+    }
+
+    @Override
+    protected void performSkipTo(Long nextToken)
+    {
+        if (this.range != null) // without an underlying range there is nothing to advance
+            this.range.skipTo(nextToken);
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        // Release the index references in a finally clause so that an IOException
+        // thrown while closing the range cannot leave them unreleased.
+        try { if (range != null) range.close(); }
+        finally { controller.releaseIndexes(filterTree.expressions); }
+    }
+
+    /**
+     * Creates a {@link TreeBuilder} seeded with every expression the controller exposes.
+     *
+     * @param controller current query controller
+     * @return tree builder with query expressions added from query controller.
+     */
+    static TreeBuilder initTreeBuilder(QueryController controller)
+    {
+        return new TreeBuilder(controller).add(controller.getExpressions());
+    }
+
+    /**
+     * Builds the operation tree: every LIKE expression gets its own {@link OperationType} OR
+     * subtree so that its semantics are kept, all other expressions go into the root AND node.
+     *
+     *  Example:
+     *
+     *   3 Like expressions:
+     *
+     *                    AND (expressions)
+     *                  /   \
+     *                AND   OR (like)
+     *               /   \
+     *      (like) OR   OR (like)
+     *
+     */
+    public static class TreeBuilder
+    {
+        private final QueryController controller;
+        final Builder root;
+        Builder subtree;
+
+        TreeBuilder(QueryController controller)
+        {
+            this.controller = controller;
+            root = new Builder(OperationType.AND, controller);
+            subtree = root;
+        }
+
+        public TreeBuilder add(Collection<RowFilter.Expression> expressions)
+        {
+            if (expressions == null)
+                return this;
+            for (RowFilter.Expression expression : expressions)
+                add(expression);
+            return this;
+        }
+
+        public TreeBuilder add(RowFilter.Expression exp)
+        {
+            if (!exp.operator().isLike())
+                root.add(exp);
+            else
+                addToSubTree(exp);
+            return this;
+        }
+
+        private void addToSubTree(RowFilter.Expression exp)
+        {
+            Builder like = new Builder(OperationType.OR, controller);
+            like.add(exp);
+
+            if (subtree.right == null)
+            {
+                subtree.setRight(like);
+                return;
+            }
+            if (subtree.left != null)
+                throw new IllegalStateException("Both trees are full");
+
+            // right slot taken: hang a new AND node on the left and continue from there
+            Builder chained = new Builder(OperationType.AND, controller);
+            chained.setRight(like);
+            subtree.setLeft(chained);
+            subtree = chained;
+        }
+
+        public Operation complete()
+        {
+            return root.complete();
+        }
+
+        FilterTree completeFilter()
+        {
+            return root.completeFilter();
+        }
+    }
+
+    public static class Builder
+    {
+        private final QueryController controller;
+
+        protected final OperationType op;
+        private final List<RowFilter.Expression> expressions;
+
+        protected Builder left, right;
+
+        public Builder(OperationType operation, QueryController controller, RowFilter.Expression... columns)
+        {
+            this.controller = controller;
+            this.op = operation;
+            this.expressions = new ArrayList<>(columns.length); // sized for the initial expressions, still growable
+            Collections.addAll(this.expressions, columns);
+        }
+
+        public Builder setRight(Builder operation)
+        {
+            right = operation; // replaces whatever right child was attached before
+            return this;
+        }
+
+        public Builder setLeft(Builder operation)
+        {
+            left = operation; // replaces whatever left child was attached before
+            return this;
+        }
+
+        public void add(RowFilter.Expression e)
+        {
+            this.expressions.add(e); // stored as-is; complete() passes the list to analyzeGroup
+        }
+
+        public void add(Collection<RowFilter.Expression> newExpressions)
+        {
+            if (newExpressions != null) // guard the argument; the expressions field is assigned in the constructor
+                expressions.addAll(newExpressions);
+        }
+
+        @SuppressWarnings("resource")
+        public Operation complete()
+        {
+            if (!expressions.isEmpty())
+            {
+                ListMultimap<ColumnMetadata, Expression> analyzedExpressions = analyzeGroup(controller, op, expressions);
+                RangeIterator.Builder range = controller.getIndexes(op, analyzedExpressions.values());
+
+                Operation rightOp = null;
+                if (right != null)
+                {
+                    rightOp = right.complete();
+                    range.add(rightOp);
+                }
+
+                FilterTree filterTree  = new FilterTree(op, analyzedExpressions, null, rightOp != null ? rightOp.filterTree : null);
+                return new Operation(range.build(), filterTree, controller);
+            }
+            else // when OR is used
+            {
+                Operation leftOp = null, rightOp = null;
+                boolean leftIndexes = false, rightIndexes = false;
+
+                if (left != null)
+                {
+                    leftOp = left.complete();
+                    leftIndexes = leftOp != null && leftOp.range != null;
+                }
+
+                if (right != null)
+                {
+                    rightOp = right.complete();
+                    rightIndexes = rightOp != null && rightOp.range != null;
+                }
+
+                RangeIterator join;
+                /**
+                 * Operation should allow one of its sub-trees to wrap no indexes, that is related to the fact that we
+                 * have to accept defined-but-not-indexed columns as well as key range as IndexExpressions.
+                 *
+                 * Two cases are possible:
+                 *
+                 * only left child produced indexed iterators, that could happen when there are two columns
+                 * or key range on the right:
+                 *
+                 *                AND
+                 *              /     \
+                 *            OR       \
+                 *           /   \     AND
+                 *          a     b   /   \
+                 *                  key   key
+                 *
+                 * only right child produced indexed iterators:
+                 *
+                 *               AND
+                 *              /    \
+                 *            AND     a
+                 *           /   \
+                 *         key  key
+                 */
+                if (leftIndexes && !rightIndexes)
+                    join = leftOp;
+                else if (!leftIndexes && rightIndexes)
+                    join = rightOp;
+                else if (leftIndexes)
+                {
+                    RangeIterator.Builder builder = op == OperationType.OR
+                                                                 ? RangeUnionIterator.builder()
+                                                                 : RangeIntersectionIterator.selectiveBuilder();
+
+                    join = builder.add(leftOp).add(rightOp).build();
+                }
+                else
+                    throw new AssertionError("both sub-trees have 0 indexes.");
+
+                return new Operation(join,
+                                     new FilterTree(op, null,
+                                                    leftOp == null ? null : leftOp.filterTree,
+                                                    rightOp == null ? null : rightOp.filterTree),
+                                     controller);
+            }
+        }
+
+        /**
+         * To build a filter tree used to filter data using indexed expressions and non-user-defined expressions.
+         *
+         * Similar to {@link #complete()}, except that this method won't reference {@link SSTableIndex} and avoids
+         * complexity of RangeIterator.
+         *
+         * @return the filter tree
+         */
+        FilterTree completeFilter()
+        {
+            if (!expressions.isEmpty())
+            {
+                ListMultimap<ColumnMetadata, Expression> analyzedExpressions = analyzeGroup(controller, op, expressions);
+                if (right != null)
+                {
+                    FilterTree ro = right.completeFilter();
+                    return new FilterTree(op, analyzedExpressions, null, ro);
+                }
+                return new FilterTree(op, analyzedExpressions, null, null);
+            }
+            else
+            {
+                FilterTree leftOperation = left != null ? left.completeFilter() : null;
+                FilterTree rightOperation = right != null ? right.completeFilter() : null;
+
+                if (leftOperation == null && rightOperation == null)
+                    throw new AssertionError("both sub-trees have 0 indexes.");
+
+                return new FilterTree(op, null, leftOperation, rightOperation);
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
new file mode 100644
index 000000000000..26ee729714b1
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.index.sai.utils.RangeIntersectionIterator;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.index.sai.utils.RangeUnionIterator;
+import org.apache.cassandra.index.sai.utils.TermIterator;
+import org.apache.cassandra.index.sai.view.View;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.concurrent.Ref;
+
+public class QueryController
+{
+    private static final Logger logger = LoggerFactory.getLogger(QueryController.class);
+
+    private final ColumnFamilyStore cfs;
+    private final ReadCommand command;
+    private final Set<Collection<Expression>> resources = new HashSet<>();
+    private final QueryContext queryContext;
+    private final TableQueryMetrics tableQueryMetrics;
+    private final List<RowFilter.Expression> expressions;
+
+    private final List<DataRange> ranges;
+    private final AbstractBounds<PartitionPosition> mergeRange;
+
+    public QueryController(ColumnFamilyStore cfs,
+                           ReadCommand command,
+                           List<RowFilter.Expression> expressions,
+                           QueryContext queryContext,
+                           TableQueryMetrics tableQueryMetrics)
+    {
+        this.cfs = cfs;
+        this.command = command;
+        this.queryContext = queryContext;
+        this.tableQueryMetrics = tableQueryMetrics;
+        this.expressions = expressions;
+
+        this.ranges = dataRanges(command);
+        DataRange first = ranges.get(0);
+        DataRange last = ranges.get(ranges.size() - 1);
+        this.mergeRange = ranges.size() == 1 ? first.keyRange() : first.keyRange().withNewRight(last.keyRange().right);
+    }
+
+    public TableMetadata metadata()
+    {
+        return command.metadata();
+    }
+
+    /**
+     * @return non-user defined expressions used in the read command
+     */
+    List<RowFilter.Expression> getExpressions()
+    {
+        return expressions;
+    }
+
+    /**
+     * @return token ranges used in the read command
+     */
+    List<DataRange> dataRanges()
+    {
+        return ranges;
+    }
+
+    /**
+     * Note: merged range may contain subrange that no longer belongs to the local node after range movement.
+     * It should only be used as an optimization to reduce search space. Use {@link #dataRanges()} instead to filter data.
+     *
+     * @return merged token range
+     */
+    AbstractBounds<PartitionPosition> mergeRange()
+    {
+        return mergeRange;
+    }
+
+    /**
+     * @return indexed {@code ColumnContext} if index is found; otherwise return non-indexed {@code ColumnContext}.
+     */
+    public ColumnContext getContext(RowFilter.Expression expression)
+    {
+        StorageAttachedIndex index = getBestIndexFor(expression);
+
+        return index != null ? index.getContext() : new ColumnContext(cfs.metadata(), expression.column());
+    }
+
+    public StorageAttachedIndex getBestIndexFor(RowFilter.Expression expression)
+    {
+        return cfs.indexManager.getBestIndexFor(expression, StorageAttachedIndex.class).orElse(null);
+    }
+
+    public UnfilteredRowIterator getPartition(DecoratedKey key, ReadExecutionController executionController)
+    {
+        if (key == null)
+            throw new IllegalArgumentException("non-null key required");
+
+        try
+        {
+            SinglePartitionReadCommand partition = SinglePartitionReadCommand.create(cfs.metadata(),
+                                                                                     command.nowInSec(),
+                                                                                     command.columnFilter(),
+                                                                                     RowFilter.NONE,
+                                                                                     DataLimits.NONE,
+                                                                                     key,
+                                                                                     command.clusteringIndexFilter(key));
+
+            return partition.queryMemtableAndDisk(cfs, executionController);
+        }
+        finally
+        {
+            queryContext.checkpoint();
+        }
+    }
+
+    /**
+     * Build a {@link RangeIterator.Builder} from the given list of expressions by applying given operation (OR/AND).
+     * Building of such builder involves index search, results of which are persisted in the internal resources list
+     * and can be released later via {@link QueryController#releaseIndexes(ListMultimap)}.
+     *
+     * @param op The operation type to coalesce expressions with.
+     * @param expressions The expressions to build range iterator from (expressions with no results are ignored).
+     *
+     * @return range iterator builder based on given expressions and operation type.
+     */
+    public RangeIterator.Builder getIndexes(Operation.OperationType op, Collection<Expression> expressions)
+    {
+        if (resources.contains(expressions))
+            throw new IllegalArgumentException("Can't process the same expressions multiple times.");
+
+        boolean defer = op == Operation.OperationType.OR || RangeIntersectionIterator.shouldDefer(expressions.size());
+
+        RangeIterator.Builder builder = op == Operation.OperationType.OR
+                                        ? RangeUnionIterator.builder()
+                                        : RangeIntersectionIterator.selectiveBuilder();
+
+        Set<Map.Entry<Expression, NavigableSet<SSTableIndex>>> view = referenceAndGetView(op, expressions).entrySet();
+
+        try
+        {
+            for (Map.Entry<Expression, NavigableSet<SSTableIndex>> e : view)
+            {
+                @SuppressWarnings("resource") // RangeIterators are closed by releaseIndexes
+                RangeIterator index = TermIterator.build(e.getKey(), e.getValue(), mergeRange, queryContext, defer);
+
+                builder.add(index);
+            }
+        }
+        catch (Throwable t)
+        {
+            // all sstable indexes in view have been referenced; release each quietly so a failing release cannot mask t
+            FileUtils.closeQuietly(builder.ranges());
+            view.forEach(e -> e.getValue().forEach(QueryController::releaseQuietly));
+            throw t;
+        }
+
+        resources.add(expressions);
+        return builder;
+    }
+
+    private static void releaseQuietly(SSTableIndex index)
+    {
+        try
+        {
+            index.release();
+        }
+        catch (Throwable e)
+        {
+            logger.error(index.getColumnContext().logMessage("Failed to release index on SSTable {}"), index.getSSTable().descriptor, e);
+        }
+    }
+
+    public void releaseIndexes(ListMultimap<?, Expression> expressions)
+    {
+        if (expressions != null)
+            resources.remove(expressions.values());
+    }
+
+    /**
+     * Records the query context into the table query metrics (when a metrics instance is present) as the query finishes.
+     */
+    public void finish()
+    {
+        if (tableQueryMetrics != null) tableQueryMetrics.record(queryContext);
+    }
+
+    /**
+     * Try to reference all SSTableIndexes before querying on disk indexes.
+     *
+     * If we attempt to proceed into {@link TermIterator#build(Expression, Set, AbstractBounds, QueryContext, boolean)}
+     * without first referencing all indexes, a concurrent compaction may decrement one or more of their backing
+     * SSTable {@link Ref} instances. This will allow the {@link SSTableIndex} itself to be released and will fail the query.
+     */
+    private Map<Expression, NavigableSet<SSTableIndex>> referenceAndGetView(Operation.OperationType op, Collection<Expression> expressions)
+    {
+        SortedSet<String> indexNames = new TreeSet<>();
+        try
+        {
+            while (true)
+            {
+                List<SSTableIndex> referencedIndexes = new ArrayList<>();
+                boolean failed = false;
+
+                Map<Expression, NavigableSet<SSTableIndex>> view = getView(op, expressions);
+
+                for (SSTableIndex index : view.values().stream().flatMap(Collection::stream).collect(Collectors.toList()))
+                {
+                    indexNames.add(index.getColumnContext().getIndexName());
+
+                    if (index.reference())
+                    {
+                        referencedIndexes.add(index);
+                    }
+                    else
+                    {
+                        failed = true;
+                        break;
+                    }
+                }
+
+                if (failed)
+                {
+                    // TODO: This might be a good candidate for a table/index group metric in the future...
+                    referencedIndexes.forEach(QueryController::releaseQuietly);
+                }
+                else
+                {
+                    return view;
+                }
+            }
+        }
+        finally
+        {
+            Tracing.trace("Querying storage-attached indexes {}", indexNames);
+        }
+    }
+
+    private Map<Expression, NavigableSet<SSTableIndex>> getView(Operation.OperationType op, Collection<Expression> expressions)
+    {
+        // first let's determine the primary expression if op is AND
+        Pair<Expression, NavigableSet<SSTableIndex>> primary = (op == Operation.OperationType.AND) ? calculatePrimary(expressions) : null;
+
+        Map<Expression, NavigableSet<SSTableIndex>> indexes = new HashMap<>();
+        for (Expression e : expressions)
+        {
+            // NOT_EQ and non-indexed column expressions should only act as FILTER BY for the satisfiedBy(Row) method
+            // because otherwise the query is likely to go through the whole index.
+            if (!e.context.isIndexed() || e.getOp() == Expression.Op.NOT_EQ)
+            {
+                continue;
+            }
+
+            // primary expression, we'll have to add as is
+            if (primary != null && e.equals(primary.left))
+            {
+                indexes.put(primary.left, primary.right);
+
+                continue;
+            }
+
+            View view = e.context.getView();
+
+            NavigableSet<SSTableIndex> readers = new TreeSet<>(SSTableIndex.COMPARATOR);
+            if (primary != null && primary.right.size() > 0)
+            {
+                for (SSTableIndex index : primary.right)
+                    readers.addAll(view.match(index.minKey(), index.maxKey()));
+            }
+            else
+            {
+                readers.addAll(applyScope(view.match(e)));
+            }
+
+            indexes.put(e, readers);
+        }
+
+        return indexes;
+    }
+
+    private Pair<Expression, NavigableSet<SSTableIndex>> calculatePrimary(Collection<Expression> expressions)
+    {
+        Expression expression = null;
+        NavigableSet<SSTableIndex> primaryIndexes = null;
+
+        for (Expression e : expressions)
+        {
+            if (!e.context.isIndexed())
+                continue;
+
+            View view = e.context.getView();
+
+            NavigableSet<SSTableIndex> indexes = new TreeSet<>(SSTableIndex.COMPARATOR);
+            indexes.addAll(applyScope(view.match(e)));
+
+            if (expression == null || primaryIndexes.size() > indexes.size())
+            {
+                primaryIndexes = indexes;
+                expression = e;
+            }
+        }
+
+        return expression == null ? null : Pair.create(expression, primaryIndexes);
+    }
+
+    private Set<SSTableIndex> applyScope(Set<SSTableIndex> indexes)
+    {
+        return Sets.filter(indexes, index -> {
+            SSTableReader sstable = index.getSSTable();
+
+            return mergeRange.left.compareTo(sstable.last) <= 0 && (mergeRange.right.isMinimum() || sstable.first.compareTo(mergeRange.right) <= 0);
+        });
+    }
+
+    /**
+     * Returns the {@link DataRange} list covered by the specified {@link ReadCommand}.
+     *
+     * @param command a read command
+     * @return the data ranges covered by {@code command}
+     */
+    private static List<DataRange> dataRanges(ReadCommand command)
+    {
+        if (command instanceof SinglePartitionReadCommand)
+        {
+            SinglePartitionReadCommand cmd = (SinglePartitionReadCommand) command;
+            DecoratedKey key = cmd.partitionKey();
+            return Lists.newArrayList(new DataRange(new Range<>(key, key), cmd.clusteringIndexFilter()));
+        }
+        else if (command instanceof PartitionRangeReadCommand)
+        {
+            PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
+            return Lists.newArrayList(cmd.dataRange());
+        }
+        else
+        {
+            throw new AssertionError("Unsupported read command type: " + command.getClass().getName());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
new file mode 100644
index 000000000000..cbb21784fd32
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.plan;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.schema.TableMetadata;
+
+public class StorageAttachedIndexQueryPlan implements Index.QueryPlan
+{
+    private final ColumnFamilyStore cfs;
+    private final TableQueryMetrics queryMetrics;
+    private final RowFilter postIndexFilter;
+    private final List<RowFilter.Expression> expressions;
+    private final Set<Index> indexes;
+
+    private StorageAttachedIndexQueryPlan(ColumnFamilyStore cfs,
+                                          TableQueryMetrics queryMetrics,
+                                          RowFilter postIndexFilter,
+                                          List<RowFilter.Expression> expressions,
+                                          ImmutableSet<Index> indexes)
+    {
+        this.cfs = cfs;
+        this.queryMetrics = queryMetrics;
+        this.postIndexFilter = postIndexFilter;
+        this.expressions = expressions;
+        this.indexes = indexes;
+    }
+
+    @Nullable
+    public static StorageAttachedIndexQueryPlan create(ColumnFamilyStore cfs,
+                                                       TableQueryMetrics queryMetrics,
+                                                       Set<StorageAttachedIndex> indexes,
+                                                       RowFilter rowFilter)
+    {
+        ImmutableSet.Builder<Index> selectedIndexesBuilder = ImmutableSet.builder();
+        List<RowFilter.Expression> acceptedExpressions = new ArrayList<>();
+
+        for (RowFilter.Expression expression : rowFilter.getExpressions())
+        {
+            // we ignore user-defined expressions here because we don't have a way to translate their #isSatisfiedBy
+            // method, they will be included in the filter returned by QueryPlan#postIndexQueryFilter()
+            if (expression.isUserDefined())
+                continue;
+
+            acceptedExpressions.add(expression);
+            for (StorageAttachedIndex index : indexes)
+            {
+                if (index.supportsExpression(expression.column(), expression.operator()))
+                {
+                    selectedIndexesBuilder.add(index);
+                }
+            }
+        }
+
+        ImmutableSet<Index> selectedIndexes = selectedIndexesBuilder.build();
+        if (selectedIndexes.isEmpty())
+            return null;
+
+        /*
+         * postIndexFilter is comprised of those expressions in the read command row filter that can't be handled by
+         * {@link FilterTree#satisfiedBy(Unfiltered, Row, boolean)}. That includes expressions targeted
+         * at {@link RowFilter.UserExpression}s like those used by RLAC.
+         */
+        RowFilter postIndexFilter = rowFilter.restrict(e -> e.isUserDefined());
+        return new StorageAttachedIndexQueryPlan(cfs, queryMetrics, postIndexFilter, acceptedExpressions, selectedIndexes);
+    }
+
+    @Override
+    public Set<Index> getIndexes()
+    {
+        return indexes;
+    }
+
+    @Override
+    public long getEstimatedResultRows()
+    {
+        // this is temporary (until proper QueryPlan is integrated into Cassandra)
+        // and allows us to prioritize storage-attached indexes, if any are in the query, since they
+        // are going to be more efficient to query and intersect than built-in indexes.
+        return Long.MIN_VALUE;
+    }
+
+    @Override
+    public boolean shouldEstimateInitialConcurrency()
+    {
+        return false;
+    }
+
+    @Override
+    public Index.Searcher searcherFor(ReadCommand command)
+    {
+        return new StorageAttachedIndexSearcher(cfs, queryMetrics, command, expressions, DatabaseDescriptor.getRangeRpcTimeout(TimeUnit.MILLISECONDS));
+    }
+
+    /**
+     * @return a filter with all the expressions that are user-defined or for a non-indexed partition key column
+     *
+     * (currently index on partition columns is not supported, see {@link StorageAttachedIndex#validateOptions(Map, TableMetadata)})
+     */
+    @Override
+    public RowFilter postIndexQueryFilter()
+    {
+        return postIndexFilter;
+    }
+
+    //TODO Do we need to support this
+//    @Override
+//    public boolean supportsMultiRangeReadCommand()
+//    {
+//        return true;
+//    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
new file mode 100644
index 000000000000..9f1c73a42932
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
@@ -0,0 +1,311 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.exceptions.RequestTimeoutException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.AbstractIterator;
+
+public class StorageAttachedIndexSearcher implements Index.Searcher
+{
+    private final ReadCommand command;
+    private final QueryController controller;
+    private final QueryContext queryContext;
+
+    public StorageAttachedIndexSearcher(ColumnFamilyStore cfs,
+                                        TableQueryMetrics tableQueryMetrics,
+                                        ReadCommand command,
+                                        List<RowFilter.Expression> expressions,
+                                        long executionQuotaMs)
+    {
+        this.command = command;
+        this.queryContext = new QueryContext(executionQuotaMs);
+        this.controller = new QueryController(cfs, command, expressions, queryContext, tableQueryMetrics);
+    }
+
+    @Override
+    public UnfilteredPartitionIterator search(ReadExecutionController executionController) throws RequestTimeoutException
+    {
+        return  new ResultRetriever(analyze(), controller, executionController, queryContext);
+    }
+
+//    @Override
+//    public Flow<FlowableUnfilteredPartition> search(ReadExecutionController executionController) throws RequestTimeoutException
+//    {
+//        return analyzeAsync().map(operation -> new ResultRetriever(operation, controller, executionController, queryContext))
+//                             .flatMap(FlowablePartitions::fromPartitions);
+//    }
+
+    /**
+     * Converts expressions into filter tree and reference {@link SSTableIndex}s used for query.
+     *
+     * @return operation
+     */
+    private Operation analyze()
+    {
+        return Operation.initTreeBuilder(controller).complete();
+    }
+
+    /**
+     * Converts expressions into filter tree (which is currently just a single AND).
+     *
+     * Filter tree allows us to do a couple of important optimizations
+     * namely, group flattening for AND operations (query rewrite), expression bounds checks,
+     * "satisfies by" checks for resulting rows with an early exit.
+     *
+     * @return root of the filter tree.
+     */
+    //TODO How does this get applied in OS
+    private FilterTree analyzeFilter()
+    {
+        return Operation.initTreeBuilder(controller).completeFilter();
+    }
+
+    private static class ResultRetriever extends AbstractIterator<UnfilteredRowIterator> implements UnfilteredPartitionIterator
+    {
+        private final PartitionPosition startToken;
+        private final PartitionPosition lastToken;
+        private final Iterator<DataRange> keyRanges;
+        private AbstractBounds<PartitionPosition> current;
+
+        private final Operation operation;
+        private final QueryController controller;
+        private final ReadExecutionController executionController;
+        private final QueryContext queryContext;
+
+        private Iterator<DecoratedKey> currentKeys = null;
+        private DecoratedKey lastKey;
+
+        private ResultRetriever(Operation operation, QueryController controller,
+                                ReadExecutionController executionController, QueryContext queryContext)
+        {
+            this.keyRanges = controller.dataRanges().iterator();
+            this.current = keyRanges.next().keyRange();
+
+            this.operation = operation;
+            this.controller = controller;
+            this.executionController = executionController;
+            this.queryContext = queryContext;
+
+            this.startToken = controller.mergeRange().left;
+            this.lastToken = controller.mergeRange().right;
+        }
+
+        @Override
+        public UnfilteredRowIterator computeNext()
+        {
+            if (operation == null)
+                return endOfData();
+
+            // Position the operation at the start of the range on the first call only ("currentKeys" is null until
+            // the first token is fetched). Repositioning and fetching a new token on every call would silently drop
+            // the unread keys of the previous token whenever several partition keys share the same token.
+            if (currentKeys == null)
+                operation.skipTo(startToken.getToken().getLongValue());
+
+            // IMPORTANT: The correctness of the entire query pipeline relies on the fact that we consume a token
+            // and materialize its keys before moving on to the next token in the flow. This sequence must not be broken
+            // with toList() or similar. (Both the union and intersection flow constructs, to avoid excessive object
+            // allocation, reuse their token mergers as they process individual positions on the ring.)
+            while (true)
+            {
+                while (currentKeys != null && currentKeys.hasNext())
+                {
+                    DecoratedKey key = currentKeys.next();
+
+                    if (!lastToken.isMinimum() && lastToken.compareTo(key) < 0)
+                        return endOfData();
+
+                    while (current != null)
+                    {
+                        if (current.contains(key))
+                        {
+                            UnfilteredRowIterator partition = apply(key);
+                            if (partition != null)
+                                return partition;
+                            break;
+                        }
+                        // bigger than current range
+                        else if (!current.right.isMinimum() && current.right.compareTo(key) <= 0)
+                        {
+                            if (keyRanges.hasNext())
+                                current = keyRanges.next().keyRange();
+                            else
+                                return endOfData();
+                        }
+                        // smaller than current range
+                        else
+                        {
+                            // we already knew that key is not included in "current" abstract bounds,
+                            // so "left" may have the same partition position as "key" when "left" is exclusive.
+                            assert current.left.compareTo(key) >= 0;
+                            operation.skipTo(current.left.getToken().getLongValue());
+                            if (!operation.hasNext())
+                                return endOfData();
+                            currentKeys = operation.next().keys();
+                            break;
+                        }
+                    }
+                }
+                if (!operation.hasNext())
+                    return endOfData();
+                currentKeys = operation.next().keys();
+            }
+        }
+
+        public UnfilteredRowIterator apply(DecoratedKey key)
+        {
+            // Key reads are lazy, delayed all the way to this point. Skip if we've already seen this one:
+            if (key.equals(lastKey))
+                return null;
+
+            lastKey = key;
+
+            // SPRC should only return UnfilteredRowIterator, but it returns UnfilteredPartitionIterator due to Flow.
+            try (UnfilteredRowIterator partition = controller.getPartition(key, executionController))
+            {
+                queryContext.partitionsRead++;
+
+                return applyIndexFilter(key, partition, operation.filterTree, queryContext);
+            }
+        }
+
+        private static UnfilteredRowIterator applyIndexFilter(DecoratedKey key, UnfilteredRowIterator partition, FilterTree tree, QueryContext queryContext)
+        {
+            Row staticRow = partition.staticRow();
+            List<Unfiltered> clusters = new ArrayList<>();
+
+            while (partition.hasNext())
+            {
+                Unfiltered row = partition.next();
+
+                queryContext.rowsFiltered++;
+                if (tree.satisfiedBy(key, row, staticRow))
+                    clusters.add(row);
+            }
+
+            if (clusters.isEmpty())
+            {
+                queryContext.rowsFiltered++;
+                if (tree.satisfiedBy(key, staticRow, staticRow))
+                    clusters.add(staticRow);
+            }
+
+            /*
+             * If {@code clusters} is empty, then either every (clustering row, static row) pair failed the filters,
+             *       or the (static row, static row) pair failed them. In both cases, we should not return any partition.
+             * If {@code clusters} is not empty, then either some (clustering row, static row) pairs match the filters,
+             *       or the (static row, static row) pair matches them. In both cases, we should return a partition with
+             *       the static row, and remove the static row marker from {@code clusters} for the latter case.
+             */
+            if (clusters.isEmpty())
+                return null;
+
+            return new PartitionIterator(partition, staticRow, Iterators.filter(clusters.iterator(), u -> !((Row)u).isStatic()));
+        }
+
+        private static class PartitionIterator extends AbstractUnfilteredRowIterator
+        {
+            private final Iterator<Unfiltered> rows;
+
+            public PartitionIterator(UnfilteredRowIterator partition, Row staticRow, Iterator<Unfiltered> content)
+            {
+                super(partition.metadata(),
+                      partition.partitionKey(),
+                      partition.partitionLevelDeletion(),
+                      partition.columns(),
+                      staticRow,
+                      partition.isReverseOrder(),
+                      partition.stats());
+
+                rows = content;
+            }
+
+            @Override
+            protected Unfiltered computeNext()
+            {
+                return rows.hasNext() ? rows.next() : endOfData();
+            }
+        }
+
+        @Override
+        public TableMetadata metadata()
+        {
+            return controller.metadata();
+        }
+
+        public void close()
+        {
+            FileUtils.closeQuietly(operation);
+            controller.finish();
+        }
+    }
+
+//    /**
+//     * Used by {@link StorageAttachedIndexSearcher#filterReplicaFilteringProtection} which is not ported to OSS yet.
+//     */
+//    private static <U extends Unfiltered, F extends FlowablePartitionBase<U, F>> Flow<F>  applyIndexFilter(Flow<F> fp, FilterTree tree, QueryContext queryContext)
+//    {
+//        return fp.flatMap(partition ->
+//        {
+//            Row staticRow = partition.staticRow();
+//            /*
+//             * If {@code content} is empty, which means either all clustering row and static row pairs failed,
+//             *       or static row and static row pair failed. In both cases, we should not return any partition.
+//             * If {@code content} is not empty, which means either there are some clustering row and static row pairs match the filters,
+//             *       or static row and static row pair matches the filters. In both cases, we should return a partition with static row,
+//             *       and remove the static row marker from the {@code content} for the latter case.
+//             */
+//            Flow<U> content = partition.content()
+//                                       .filter(Unfiltered::isRow)
+//                                       .ifEmpty((U) staticRow)
+//                                       .filter(row ->
+//                                               {
+//                                                   queryContext.rowsFiltered++;
+//                                                   return tree.satisfiedBy(partition.partitionKey(), row, staticRow);
+//                                               });
+//
+//                              return content.skipMapEmpty(c -> partition.withContent(c.filter(unfiltered -> !((Row)unfiltered).isStatic())));
+//                          });
+//    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/AbortedOperationException.java b/src/java/org/apache/cassandra/index/sai/utils/AbortedOperationException.java
new file mode 100644
index 000000000000..072110808721
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/AbortedOperationException.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+
+/**
+ * This exception indicates that a request was aborted, normally because it was taking too much time.
+ *
+ * It is handled in a special way by the verb handlers and the request execute method: it is simply
+ * passed to the onAborted callback without logging any message. Therefore if any logging is required,
+ * it is up to the code raising this exception to log anything.
+ */
+// TODO OSS doesn't support onAbort and timeout response
+public class AbortedOperationException extends RuntimeException
+{
+    public AbortedOperationException()
+    {
+        super();
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/index/sai/utils/AbstractIterator.java b/src/java/org/apache/cassandra/index/sai/utils/AbstractIterator.java
new file mode 100644
index 000000000000..3285d7b748de
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/AbstractIterator.java
@@ -0,0 +1,161 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Copyright (C) 2007 The Guava Authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.NoSuchElementException;
+
+import com.google.common.collect.PeekingIterator;
+
+import static com.google.common.base.Preconditions.checkState;
+
+// This is a fork of the Guava AbstractIterator; the only difference
+// is that the state & next variables are now protected, which is required
+// for SkippableIterator.skipTo(..) to void all previous state.
+public abstract class AbstractIterator<T> implements PeekingIterator<T>
+{
+    protected State state = State.NOT_READY;
+
+    /** Constructor for use by subclasses. */
+    protected AbstractIterator() {}
+
+    protected enum State
+    {
+        /** We have computed the next element and haven't returned it yet. */
+        READY,
+
+        /** We haven't yet computed or have already returned the element. */
+        NOT_READY,
+
+        /** We have reached the end of the data and are finished. */
+        DONE,
+
+        /** We've suffered an exception and are kaput. */
+        FAILED,
+    }
+
+    protected T next;
+
+    /**
+     * Returns the next element. <b>Note:</b> the implementation must call {@link
+     * #endOfData()} when there are no elements left in the iteration. Failure to
+     * do so could result in an infinite loop.
+     *
+     * <p>The initial invocation of {@link #hasNext()} or {@link #next()} calls
+     * this method, as does the first invocation of {@code hasNext} or {@code
+     * next} following each successful call to {@code next}. Once the
+     * implementation either invokes {@code endOfData} or throws an exception,
+     * {@code computeNext} is guaranteed to never be called again.
+     *
+     * <p>If this method throws an exception, it will propagate outward to the
+     * {@code hasNext} or {@code next} invocation that invoked this method. Any
+     * further attempts to use the iterator will result in an {@link
+     * IllegalStateException}.
+     *
+     * <p>The implementation of this method may not invoke the {@code hasNext},
+     * {@code next}, or {@link #peek()} methods on this instance; if it does, an
+     * {@code IllegalStateException} will result.
+     *
+     * @return the next element if there was one. If {@code endOfData} was called
+     *     during execution, the return value will be ignored.
+     * @throws RuntimeException if any unrecoverable error happens. This exception
+     *     will propagate outward to the {@code hasNext()}, {@code next()}, or
+     *     {@code peek()} invocation that invoked this method. Any further
+     *     attempts to use the iterator will result in an
+     *     {@link IllegalStateException}.
+     */
+    protected abstract T computeNext();
+
+    /**
+     * Implementations of {@link #computeNext} <b>must</b> invoke this method when
+     * there are no elements left in the iteration.
+     *
+     * @return {@code null}; a convenience so your {@code computeNext}
+     *     implementation can use the simple statement {@code return endOfData();}
+     */
+    protected final T endOfData()
+    {
+        state = State.DONE;
+        return null;
+    }
+
+    public final boolean hasNext()
+    {
+        checkState(state != State.FAILED);
+
+        switch (state)
+        {
+            case DONE:
+                return false;
+
+            case READY:
+                return true;
+
+            default:
+        }
+
+        return tryToComputeNext();
+    }
+
+    protected boolean tryToComputeNext()
+    {
+        state = State.FAILED; // temporary pessimism
+        next = computeNext();
+
+        if (state != State.DONE)
+        {
+            state = State.READY;
+            return true;
+        }
+
+        return false;
+    }
+
+    public final T next()
+    {
+        if (!hasNext())
+            throw new NoSuchElementException();
+
+        state = State.NOT_READY;
+        return next;
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * Returns the next element in the iteration without advancing the iteration,
+     * according to the contract of {@link PeekingIterator#peek()}.
+     *
+     * <p>Implementations of {@code AbstractIterator} that wish to expose this
+     * functionality should implement {@code PeekingIterator}.
+     */
+    public final T peek()
+    {
+        if (!hasNext())
+            throw new NoSuchElementException();
+
+        return next;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/LongArray.java b/src/java/org/apache/cassandra/index/sai/utils/LongArray.java
new file mode 100644
index 000000000000..f26b75249271
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/LongArray.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.function.Supplier;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.v1.BlockPackedReader;
+
+/**
+ * Abstraction over a long-indexed array of longs.
+ */
+public interface LongArray extends Closeable
+{
+    /**
+     * Get value at {@code idx}.
+     */
+    long get(long idx);
+
+    /**
+     * Get array length.
+     */
+    long length();
+
+    /**
+     * Using the given token returns the first row ID corresponding to the token.
+     * @param targetToken Token to look up; it must not be smaller than the previous value
+     * @return The row ID of the given token or negative value if target token is greater than all tokens
+     */
+    long findTokenRowID(long targetToken);
+
+    @Override
+    default void close() throws IOException { }
+
+    class DeferredLongArray implements LongArray
+    {
+        private Supplier<LongArray> supplier;
+        private LongArray longArray;
+        private boolean opened = false;
+
+        public DeferredLongArray(Supplier<LongArray> supplier)
+        {
+            this.supplier = supplier;
+        }
+
+        @Override
+        public long get(long idx)
+        {
+            open();
+            return longArray.get(idx);
+        }
+
+        @Override
+        public long length()
+        {
+            open();
+            return longArray.length();
+        }
+
+        @Override
+        public long findTokenRowID(long targetToken)
+        {
+            open();
+            return longArray.findTokenRowID(targetToken);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            if (opened)
+                longArray.close();
+        }
+
+        private void open()
+        {
+            if (!opened)
+            {
+                longArray = supplier.get();
+                opened = true;
+            }
+        }
+    }
+
+    interface Factory
+    {
+        LongArray open();
+
+        /**
+         * TODO use a different interface for {@link BlockPackedReader}, as {@link LongArray#findTokenRowID(long)} is
+         * not supported by other implementations.
+         *
+         * @param startingIndex minimum index to be used in {@link LongArray#findTokenRowID(long)}.
+         *                      In {@link org.apache.cassandra.index.sai.disk.PostingListRangeIterator}, a segmentRowId
+         *                      is provided and then in {@link OffsetFactory},
+         *                      segment offset is applied to segmentRowId to create sstableRowId which will be used by
+         *                      {@link BlockPackedReader#openTokenReader(long, SSTableQueryContext)}.
+         * @param context shared between indexed columns for the same sstable in a given query
+         * @return token BlockPackedReader
+         */
+        default LongArray openTokenReader(long startingIndex, SSTableQueryContext context)
+        {
+            return open();
+        }
+
+        default Factory withOffset(long idxOffset)
+        {
+            return new OffsetFactory(this, idxOffset);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/NamedMemoryLimiter.java b/src/java/org/apache/cassandra/index/sai/utils/NamedMemoryLimiter.java
new file mode 100644
index 000000000000..4ffa6d059f4a
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/NamedMemoryLimiter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.concurrent.atomic.AtomicLong;
+import javax.annotation.concurrent.ThreadSafe;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A simple, thread-safe memory usage tracker, named to reflect a particular scope.
+ */
+@ThreadSafe
+public final class NamedMemoryLimiter
+{
+    private static final Logger logger = LoggerFactory.getLogger(NamedMemoryLimiter.class);
+    
+    private final long limitBytes;
+    private final AtomicLong bytesUsed = new AtomicLong(0);
+    private final String scope;
+
+    public NamedMemoryLimiter(long limitBytes, String scope)
+    {
+        this.limitBytes = limitBytes;
+        this.scope = scope;
+
+        logger.info("[{}]: Memory limiter using limit of {}...", scope, FBUtilities.prettyPrintMemory(limitBytes));
+    }
+
+    /**
+     * @return true if the current number of bytes allocated against the tracker has breached the limit, false otherwise
+     */
+    public boolean usageExceedsLimit()
+    {
+        return currentBytesUsed() > limitBytes;
+    }
+
+    public long increment(long bytes)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace("[{}]: Incrementing tracked memory usage by {} bytes from current usage of {}...", scope, bytes, currentBytesUsed());
+        return this.bytesUsed.addAndGet(bytes);
+    }
+
+    public long decrement(long bytes)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace("[{}]: Decrementing tracked memory usage by {} bytes from current usage of {}...", scope, bytes, currentBytesUsed());
+        return this.bytesUsed.addAndGet(-bytes);
+    }
+
+    public long currentBytesUsed()
+    {
+        return this.bytesUsed.get();
+    }
+    
+    public long limitBytes()
+    {
+        return this.limitBytes;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/OffsetFactory.java b/src/java/org/apache/cassandra/index/sai/utils/OffsetFactory.java
new file mode 100644
index 000000000000..a2840705e416
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/OffsetFactory.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+
+public class OffsetFactory implements LongArray.Factory
+{
+    private final LongArray.Factory wrapped;
+    private final long idxOffset;
+
+    OffsetFactory(LongArray.Factory wrapped, long idxOffset)
+    {
+        this.wrapped = wrapped;
+        this.idxOffset = idxOffset;
+    }
+
+    @Override
+    public LongArray open()
+    {
+        return new OffsetLongArray(wrapped.open(), idxOffset);
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    public LongArray openTokenReader(long segmentRowId, SSTableQueryContext context)
+    {
+        // apply segment offset so that `LongArray.findTokenRowID` will start searching tokens from current segment.
+        LongArray reader = wrapped.openTokenReader(segmentRowId + idxOffset, context);
+        return new TokenLongArray(context, reader, idxOffset);
+    }
+
+    private static class OffsetLongArray implements LongArray
+    {
+        private final LongArray wrapped;
+        protected final long idxOffset;
+
+        OffsetLongArray(LongArray wrapped, long idxOffset)
+        {
+            this.wrapped = wrapped;
+            this.idxOffset = idxOffset;
+        }
+
+        /**
+         * Get value at {@code idx}.
+         */
+        @Override
+        public long get(long idx)
+        {
+            return wrapped.get(toSSTableRowId(idx));
+        }
+
+        @Override
+        public long findTokenRowID(long value)
+        {
+            // Subtract the segment offset from the global row ID to provide a segment row ID for the caller:
+            return toSegmentRowId(wrapped.findTokenRowID(value));
+        }
+
+        /**
+         * Get array length.
+         */
+        public long length()
+        {
+            return wrapped.length();
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            wrapped.close();
+        }
+
+        protected long toSSTableRowId(long segmentRowId)
+        {
+            return segmentRowId + idxOffset;
+        }
+
+        protected long toSegmentRowId(long sstableRowId)
+        {
+            return sstableRowId - idxOffset;
+        }
+    }
+
+    /**
+     * Cache the prev token value and prev sstable row id pair, and share it between different indexed columns in the
+     * same query.
+     */
+    static class TokenLongArray extends OffsetLongArray
+    {
+        private final SSTableQueryContext context;
+
+        TokenLongArray(SSTableQueryContext context, LongArray wrapped, long idxOffset)
+        {
+            super(wrapped, idxOffset);
+            this.context = context;
+        }
+
+        @Override
+        public long get(long segmentRowId)
+        {
+            long sstableRowId = toSSTableRowId(segmentRowId);
+            if (sstableRowId == context.prevSSTableRowId)
+            {
+                return context.prevTokenValue;
+            }
+
+            long tokenValue = super.get(segmentRowId);
+
+            // during intersection, the next pair of token and rowId from current indexed column iterator is very likely
+            // to be used to skip another indexed column iterator (aka. used to call findTokenRowId) or used to fetch
+            // token (aka. get) if there is matching row id from posting reader.
+            context.prevTokenValue = tokenValue;
+            context.prevSSTableRowId = sstableRowId;
+
+            return tokenValue;
+        }
+
+        @Override
+        public long findTokenRowID(long tokenValue)
+        {
+            long segmentRowId = toSegmentRowId(context.prevSkipToSSTableRowId);
+
+            // Don't use the cached value from a previous segment when there are duplicated tokens across segments.
+            if (tokenValue == context.prevSkipToTokenValue && segmentRowId >= 0)
+            {
+                context.markTokenSkippingCacheHit();
+            }
+            else
+            {
+                segmentRowId = super.findTokenRowID(tokenValue);
+
+                context.prevSkipToTokenValue = tokenValue;
+                context.prevSkipToSSTableRowId = toSSTableRowId(segmentRowId);
+            }
+            context.markTokenSkippingLookup();
+            return segmentRowId;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java
new file mode 100644
index 000000000000..bb56ee38cd00
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKey.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+import com.google.common.base.Objects;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * The primary key of a row, composed of the partition key and the clustering key.
+ */
+public abstract class PrimaryKey
+{
+    private final DecoratedKey partitionKey;
+
+    /**
+     * Returns a new primary key composed of the specified partition and clustering keys.
+     *
+     * @param partitionKey a partition key
+     * @param clustering a clustering key
+     * @return a new primary key composed of {@code partitionKey} and {@code clustering}
+     */
+    public static PrimaryKey of(DecoratedKey partitionKey, Clustering clustering)
+    {
+        if (clustering == Clustering.EMPTY)
+            return new Skinny(partitionKey);
+        else if (clustering == Clustering.STATIC_CLUSTERING)
+            return new Static(partitionKey);
+        else
+            return new Wide(partitionKey, clustering);
+    }
+
+    private PrimaryKey(DecoratedKey partitionKey)
+    {
+        this.partitionKey = partitionKey;
+    }
+
+    /**
+     * Returns the {@link Token} of the partition key.
+     *
+     * @return the partitioning token of the partition key
+     */
+    public Token token()
+    {
+        return partitionKey.getToken();
+    }
+
+    /**
+     * Returns the partition key.
+     *
+     * @return the partition key
+     */
+    public DecoratedKey partitionKey()
+    {
+        return partitionKey;
+    }
+
+    /**
+     * Returns the clustering key.
+     *
+     * @return the clustering key
+     */
+    public abstract Clustering clustering();
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        PrimaryKey that = (PrimaryKey) o;
+        return Objects.equal(partitionKey, that.partitionKey) && Objects.equal(clustering(), that.clustering());
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(partitionKey, clustering());
+    }
+
+    public String toString(TableMetadata metadata)
+    {
+        return String.format("PrimaryKey: { partition : %s, clustering: %s}",
+                             metadata.partitionKeyType.getString(partitionKey.getKey()),
+                             clustering().toString(metadata));
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("PrimaryKey: { partition : %s, clustering: %s} "+getClass().getSimpleName(),
+                             ByteBufferUtil.bytesToHex(partitionKey.getKey()),
+                             String.join(",", Arrays.stream(clustering().getBufferArray())
+                                                    .map(ByteBufferUtil::bytesToHex)
+                                                    .collect(Collectors.toList())));
+    }
+
+    /**
+     * {@link PrimaryKey} implementation for rows in tables without a defined clustering key.
+     */
+    static class Skinny extends PrimaryKey
+    {
+        Skinny(DecoratedKey partitionKey)
+        {
+            super(partitionKey);
+        }
+
+        @Override
+        public Clustering clustering()
+        {
+            return Clustering.EMPTY;
+        }
+    }
+
+    /**
+     * {@link PrimaryKey} implementation for static rows in tables with a defined clustering key.
+     */
+    static class Static extends PrimaryKey
+    {
+        Static(DecoratedKey partitionKey)
+        {
+            super(partitionKey);
+        }
+
+        @Override
+        public Clustering clustering()
+        {
+            return Clustering.STATIC_CLUSTERING;
+        }
+    }
+
+    /**
+     * {@link PrimaryKey} implementation for regular rows in tables with a defined clustering key.
+     */
+    static class Wide extends PrimaryKey
+    {
+        private final Clustering clustering;
+
+        Wide(DecoratedKey partitionKey, Clustering clustering)
+        {
+            super(partitionKey);
+            this.clustering = clustering;
+        }
+
+        @Override
+        public Clustering clustering()
+        {
+            return clustering;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
new file mode 100644
index 000000000000..f50624004f74
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.Iterator;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.stream.Stream;
+import javax.annotation.concurrent.ThreadSafe;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.utils.ObjectSizes;
+
+/**
+ * A sorted set of {@link PrimaryKey}s.
+ *
+ * The primary keys are sorted first by token, then by partition key value, and then by clustering.
+ */
+public interface PrimaryKeys extends Iterable<PrimaryKey>
+{
+    // from https://github.com/gaul/java-collection-overhead
+    long SET_ENTRY_OVERHEAD = 36;
+    long MAP_ENTRY_OVERHEAD = 36;
+
+    /**
+     * Returns a new empty {@link PrimaryKey} sorted set using the specified clustering comparator.
+     *
+     * @param clusteringComparator a clustering comparator
+     * @return a new empty primary key set
+     */
+    static PrimaryKeys create(ClusteringComparator clusteringComparator)
+    {
+        return clusteringComparator.size() == 0 ? new Skinny() : new Wide(clusteringComparator);
+    }
+
+    /**
+     * Adds the specified {@link PrimaryKey}.
+     *
+     * @param key a primary key
+     */
+    default long add(PrimaryKey key)
+    {
+        return add(key.partitionKey(), key.clustering());
+    }
+
+    /**
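+     * Adds the primary key defined by the specified partition key and clustering, returning the estimated on-heap overhead added by the insertion ({@code 0} if the key was already present).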
+     *
+     * @param key a partition key
+     * @param clustering a clustering key
+     */
+    long add(DecoratedKey key, Clustering clustering);
+
+    /**
+     * Returns all the partition keys.
+     *
+     * @return all the partition keys
+     */
+    SortedSet<DecoratedKey> partitionKeys();
+
+    /**
+     * Returns the number of primary keys.
+     *
+     * @return the number of primary keys
+     */
+    int size();
+
+    /**
+     * Returns if this primary key set is empty.
+     *
+     * @return {@code true} if this is empty, {@code false} otherwise
+     */
+    boolean isEmpty();
+
+    Stream<PrimaryKey> stream();
+
+    long unsharedHeapSize();
+
+    @Override
+    @SuppressWarnings("NullableProblems")
+    default Iterator<PrimaryKey> iterator()
+    {
+        return stream().iterator();
+    }
+
+    /**
+     * A {@link PrimaryKeys} implementation for tables without a defined clustering key,
+     * in which case the clustering key is always {@link Clustering#EMPTY}.
+     */
+    @ThreadSafe
+    class Skinny implements PrimaryKeys
+    {
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new Skinny());
+
+        public final ConcurrentSkipListSet<DecoratedKey> keys;
+
+        private Skinny()
+        {
+            this.keys = new ConcurrentSkipListSet<>();
+        }
+
+        @Override
+        public long add(DecoratedKey key, Clustering clustering)
+        {
+            assert clustering == Clustering.EMPTY;
+            return keys.add(key) ? SET_ENTRY_OVERHEAD : 0;
+        }
+
+        @Override
+        public SortedSet<DecoratedKey> partitionKeys()
+        {
+            return keys;
+        }
+
+        @Override
+        public int size()
+        {
+            return keys.size();
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return keys.isEmpty();
+        }
+
+        @Override
+        public Stream<PrimaryKey> stream()
+        {
+            return keys.stream().map(PrimaryKey.Skinny::new);
+        }
+
+        @Override
+        public long unsharedHeapSize()
+        {
+            return EMPTY_SIZE;
+        }
+    }
+
+    /**
+     * A {@link PrimaryKeys} implementation for tables with a defined clustering key.
+     */
+    @ThreadSafe
+    class Wide implements PrimaryKeys
+    {
+        private static final long EMPTY_SIZE = ObjectSizes.measure(new Wide(null));
+
+        private final ClusteringComparator clusteringComparator;
+        private final ConcurrentSkipListMap<DecoratedKey, ConcurrentSkipListSet<Clustering>> keys;
+
+        private Wide(ClusteringComparator clusteringComparator)
+        {
+            this.clusteringComparator = clusteringComparator;
+            this.keys = new ConcurrentSkipListMap<>();
+        }
+
+        @Override
+        public long add(DecoratedKey key, Clustering clustering)
+        {
+            long onHeapOverhead = 0;
+            ConcurrentSkipListSet<Clustering> keys = this.keys.get(key);
+
+            if (keys == null)
+            {
+                ConcurrentSkipListSet<Clustering> newKeys = new ConcurrentSkipListSet<>(clusteringComparator);
+                keys = this.keys.putIfAbsent(key, newKeys);
+                if (keys == null)
+                {
+                    onHeapOverhead += (ObjectSizes.measure(newKeys) + MAP_ENTRY_OVERHEAD);
+                    keys = newKeys;
+                }
+            }
+
+            return keys.add(clustering) ? onHeapOverhead + SET_ENTRY_OVERHEAD : onHeapOverhead;
+        }
+
+        @Override
+        public SortedSet<DecoratedKey> partitionKeys()
+        {
+            return keys.keySet();
+        }
+
+        @Override
+        public int size()
+        {
+            return keys.values().stream().mapToInt(Set::size).sum();
+        }
+
+        @Override
+        public boolean isEmpty()
+        {
+            return keys.isEmpty();
+        }
+
+        @Override
+        public Stream<PrimaryKey> stream()
+        {
+            return keys.entrySet().stream().flatMap(e -> e.getValue().stream().map(c -> PrimaryKey.of(e.getKey(), c)));
+        }
+
+        @Override
+        public long unsharedHeapSize()
+        {
+            return EMPTY_SIZE;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/RangeConcatIterator.java b/src/java/org/apache/cassandra/index/sai/utils/RangeConcatIterator.java
new file mode 100644
index 000000000000..37d7546e58a4
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/RangeConcatIterator.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.io.util.FileUtils;
+
+/**
+ * {@link RangeConcatIterator} takes a list of sorted range iterators and concatenates them, leaving duplicates in
+ * place, to produce a new stably sorted iterator. Duplicates are eliminated later in
+ * {@link org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher}
+ * as results from multiple SSTable indexes and their respective segments are consumed.
+ *
+ * ex. (1, 2, 3) + (3, 3, 4, 5) -> (1, 2, 3, 3, 3, 4, 5)
+ * ex. (1, 2, 2, 3) + (3, 4, 4, 6, 6, 7) -> (1, 2, 2, 3, 3, 4, 4, 6, 6, 7)
+ *
+ */
+public class RangeConcatIterator extends RangeIterator
+{
+    private final PriorityQueue<RangeIterator> ranges;
+    private final List<RangeIterator> toRelease;
+
+    protected RangeConcatIterator(RangeIterator.Builder.Statistics statistics, PriorityQueue<RangeIterator> ranges)
+    {
+        super(statistics);
+
+        this.ranges = ranges;
+        this.toRelease = new ArrayList<>(ranges);
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    protected void performSkipTo(Long nextToken)
+    {
+        while (!ranges.isEmpty())
+        {
+            if (ranges.peek().getCurrent().compareTo(nextToken) >= 0)
+                break;
+
+            RangeIterator head = ranges.poll();
+
+            if (head.getMaximum().compareTo(nextToken) >= 0)
+            {
+                head.skipTo(nextToken);
+                ranges.add(head);
+                break;
+            }
+        }
+    }
+
+    @Override
+    public void close() throws IOException
+    {
+        // due to lazy key fetching, we cannot close iterator immediately
+        toRelease.forEach(FileUtils::closeQuietly);
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    protected Token computeNext()
+    {
+        while (!ranges.isEmpty())
+        {
+            RangeIterator current = ranges.poll();
+            if (current.hasNext())
+            {
+                Token next = current.next();
+                // hasNext will update RangeIterator's current which is used to sort in PQ
+                if (current.hasNext())
+                    ranges.add(current);
+
+                return next;
+            }
+        }
+
+        return endOfData();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static RangeIterator build(List<RangeIterator> tokens)
+    {
+        return new Builder().add(tokens).build();
+    }
+
+    public static class Builder extends RangeIterator.Builder
+    {
+        public Builder()
+        {
+            super(IteratorType.CONCAT);
+        }
+
+        protected RangeIterator buildIterator()
+        {
+            switch (rangeCount())
+            {
+                case 1:
+                    return ranges.poll();
+
+                default:
+                    return new RangeConcatIterator(statistics, ranges);
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/RangeIntersectionIterator.java b/src/java/org/apache/cassandra/index/sai/utils/RangeIntersectionIterator.java
new file mode 100644
index 000000000000..ba9e5b305cf8
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/RangeIntersectionIterator.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.stream.Collectors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.tracing.Tracing;
+
+/**
+ * Modified from {@link org.apache.cassandra.index.sasi.utils.RangeIntersectionIterator} to support:
+ * 1. no generic type to reduce allocation
+ * 2. support selective intersection to reduce disk io
+ * 3. make sure iterators are closed when intersection ends because of lazy key fetching
+ */
+@SuppressWarnings("resource")
+public class RangeIntersectionIterator
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    // The cassandra.sai.intersection.clause.limit (default: 2) controls the maximum number of range iterators that
+    // will be used in the final intersection of a query operation.
+    private static final int INTERSECTION_CLAUSE_LIMIT = Integer.getInteger("cassandra.sai.intersection.clause.limit", 2);
+
+    static
+    {
+        logger.info(String.format("Storage attached index intersection clause limit is %d", INTERSECTION_CLAUSE_LIMIT));
+    }
+
+    public static boolean shouldDefer(int numberOfExpressions)
+    {
+        return (INTERSECTION_CLAUSE_LIMIT <= 0) || (numberOfExpressions <= INTERSECTION_CLAUSE_LIMIT);
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static Builder selectiveBuilder()
+    {
+        return selectiveBuilder(INTERSECTION_CLAUSE_LIMIT);
+    }
+
+    public static Builder selectiveBuilder(int limit)
+    {
+        return new Builder(limit);
+    }
+
+    public static class Builder extends RangeIterator.Builder
+    {
+        private final int limit;
+
+        public Builder()
+        {
+            super(IteratorType.INTERSECTION);
+            this.limit = Integer.MAX_VALUE;
+        }
+
+        public Builder(int limit)
+        {
+            super(IteratorType.INTERSECTION);
+            this.limit = limit;
+        }
+
+        protected RangeIterator buildIterator()
+        {
+            // all ranges will be included
+            if (limit >= ranges.size() || limit <= 0)
+                return buildIterator(statistics, ranges);
+
+            // Apply only the most selective iterators during intersection, because a larger number of iterators results in lots of disk seeks.
+            List<RangeIterator> selectiveIterator = new ArrayList<>(ranges);
+            selectiveIterator.sort(Comparator.comparingLong(RangeIterator::getCount));
+
+            Statistics selectiveStatistics = new Statistics(IteratorType.INTERSECTION);
+            for (int i = selectiveIterator.size() - 1; i >= 0 && i >= limit; i--)
+                FileUtils.closeQuietly(selectiveIterator.remove(i));
+
+            for (RangeIterator iterator : selectiveIterator)
+                selectiveStatistics.update(iterator);
+
+            if (Tracing.isTracing())
+                Tracing.trace("Selecting {} {} of {} out of {} indexes",
+                              selectiveIterator.size(),
+                              selectiveIterator.size() > 1 ? "indexes with cardinalities" : "index with cardinality",
+                              selectiveIterator.stream().map(RangeIterator::getCount).map(Object::toString).collect(Collectors.joining(", ")),
+                              ranges.size());
+
+            PriorityQueue<RangeIterator> selectiveRanges = new PriorityQueue<>(limit, Comparator.comparing(RangeIterator::getCurrent));
+            selectiveRanges.addAll(selectiveIterator);
+
+            return buildIterator(selectiveStatistics, selectiveRanges);
+        }
+
+        private static RangeIterator buildIterator(Statistics statistics, PriorityQueue<RangeIterator> ranges)
+        {
+            // if the range is disjoint or we have an intersection with an empty set,
+            // we can simply return an empty iterator, because it's not going to produce any results.
+            if (statistics.isDisjoint())
+            {
+                // release posting lists
+                FileUtils.closeQuietly(ranges);
+                return RangeIterator.empty();
+            }
+
+            if (ranges.size() == 1)
+                return ranges.poll();
+
+            return new BounceIntersectionIterator(statistics, ranges);
+        }
+    }
+
+    /**
+     * Iterator which performs intersection of multiple ranges by using the bouncing (merge-join) technique to identify
+     * common elements in the given ranges. The aforementioned "bounce" works as follows: the range queue is polled for
+     * the range with the smallest current token (main loop), that token is used to {@link RangeIterator#skipTo(Long)}
+     * other ranges, if the token produced by {@link RangeIterator#skipTo(Long)} is equal to the current "candidate"
+     * token, both get merged together and the same operation is repeated for the next range from the queue, if the
+     * returned token is not equal to the candidate, the candidate's range gets put back into the queue and the main
+     * loop gets repeated until the next intersection token is found or at least one iterator runs out of tokens.
+     *
+     * This technique is very efficient at jumping over gaps in the ranges.
+     */
+    private static class BounceIntersectionIterator extends RangeIterator
+    {
+        private final PriorityQueue<RangeIterator> ranges;
+        private final Token.TokenMerger merger;
+        private final List<RangeIterator> toRelease;
+        private final List<RangeIterator> processedRanges;
+
+        private BounceIntersectionIterator(Builder.Statistics statistics, PriorityQueue<RangeIterator> ranges)
+        {
+            super(statistics);
+            this.ranges = ranges;
+            this.toRelease = new ArrayList<>(ranges);
+            this.merger = new Token.ReusableTokenMerger(ranges.size());
+            this.processedRanges = new ArrayList<>(ranges.size());
+        }
+
+        protected Token computeNext()
+        {
+            RangeIterator head = ranges.poll();
+
+            if (head == null)
+                return endOfData();
+
+            // jump right to the beginning of the intersection or return next element
+            if (head.getCurrent().compareTo(getMinimum()) < 0)
+                head.skipTo(getMinimum());
+
+            Token candidate = head.hasNext() ? head.next() : null;
+
+            if (candidate == null || candidate.get() > getMaximum())
+                return endOfData();
+
+            merger.reset();
+            merger.add(candidate);
+
+            processedRanges.clear();
+
+            boolean intersectsAll = true;
+            while (!ranges.isEmpty())
+            {
+                RangeIterator range = ranges.poll();
+
+                // found a range which doesn't overlap with one (or possibly more) other range(s)
+                // or the range is exhausted
+                if (!isOverlapping(head, range) || (range.skipTo(candidate.get()) == null))
+                {
+                    intersectsAll = false;
+                    break;
+                }
+
+                int cmp = Long.compare(candidate.get(), range.getCurrent());
+
+                if (cmp == 0)
+                {
+                    merger.add(range.next());
+                    // advance skipped range to the next element if any
+                    range.hasNext();
+                    processedRanges.add(range);
+                }
+                else if (cmp < 0)
+                {
+                    // the candidate is less than the current value in the next range
+                    // so make the next range the candidate and start again
+                    candidate = range.next();
+                    merger.reset();
+                    merger.add(candidate);
+                    ranges.add(head);
+                    ranges.addAll(processedRanges);
+                    processedRanges.clear();
+                    head = range;
+                }
+                else
+                {
+                    intersectsAll = false;
+                    break;
+                }
+            }
+
+            if (intersectsAll)
+            {
+                ranges.add(head);
+                ranges.addAll(processedRanges);
+                return merger.merge();
+            }
+
+            return endOfData();
+        }
+
+        protected void performSkipTo(Long nextToken)
+        {
+            List<RangeIterator> skipped = new ArrayList<>();
+
+            while (!ranges.isEmpty())
+            {
+                RangeIterator range = ranges.poll();
+                range.skipTo(nextToken);
+                skipped.add(range);
+            }
+
+            for (RangeIterator range : skipped)
+                ranges.add(range);
+        }
+
+        public void close() throws IOException
+        {
+            toRelease.forEach(FileUtils::closeQuietly);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/RangeIterator.java b/src/java/org/apache/cassandra/index/sai/utils/RangeIterator.java
new file mode 100644
index 000000000000..33abd6bff9fb
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/RangeIterator.java
@@ -0,0 +1,370 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.Closeable;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.io.util.FileUtils;
+
+/**
+ * Modified from {@link org.apache.cassandra.index.sasi.utils.RangeIterator} to support:
+ * 1. no generic type to reduce allocation
+ * 2. CONCAT iterator type
+ */
+public abstract class RangeIterator extends AbstractIterator<Token> implements Closeable
+{
+    private static final Builder.EmptyRangeIterator EMPTY = new Builder.EmptyRangeIterator();
+
+    private final Long min, max;
+    private final long count;
+    private Long current;
+
+    protected RangeIterator(Builder.Statistics statistics)
+    {
+        this(statistics.min, statistics.max, statistics.tokenCount);
+    }
+
+    public RangeIterator(RangeIterator range)
+    {
+        this(range == null ? null : range.min, range == null ? null : range.max, range == null ? -1 : range.count);
+    }
+
+    public RangeIterator(Long min, Long max, long count)
+    {
+        if (min == null || max == null || count == 0)
+            assert min == null && max == null && (count == 0 || count == -1) : min + " - " + max + " " + count;
+
+        this.min = min;
+        this.current = min;
+        this.max = max;
+        this.count = count;
+    }
+
+    public final Long getMinimum()
+    {
+        return min;
+    }
+
+    public final Long getCurrent()
+    {
+        return current;
+    }
+
+    public final Long getMaximum()
+    {
+        return max;
+    }
+
+    public final long getCount()
+    {
+        return count;
+    }
+
+    /**
+     * When called, this iterator's current position should
+     * be skipped forwards until finding either:
+     *   1) an element equal to or bigger than nextToken
+     *   2) the end of the iterator
+     *
+     * @param nextToken value to skip the iterator forward until matching
+     *
+     * @return The next current token after the skip was performed
+     */
+    public final Token skipTo(Long nextToken)
+    {
+        if (min == null || max == null)
+            return endOfData();
+
+        // In the case of deferred iterators the current value may not accurately
+        // reflect the next value so we need to check that as well
+        if (current.compareTo(nextToken) >= 0)
+        {
+            next = next == null ? recomputeNext() : next;
+            if (next == null)
+                return endOfData();
+            else if (next.get().compareTo(nextToken) >= 0)
+                return next;
+        }
+
+        if (max.compareTo(nextToken) < 0)
+            return endOfData();
+
+        performSkipTo(nextToken);
+        return recomputeNext();
+    }
+
+    protected abstract void performSkipTo(Long nextToken);
+
+    protected Token recomputeNext()
+    {
+        return tryToComputeNext() ? peek() : endOfData();
+    }
+
+    protected boolean tryToComputeNext()
+    {
+        boolean hasNext = super.tryToComputeNext();
+        current = hasNext ? next.get() : getMaximum();
+        return hasNext;
+    }
+
+    public static RangeIterator empty()
+    {
+        return EMPTY;
+    }
+
+    public static abstract class Builder
+    {
+        public enum IteratorType
+        {
+            CONCAT,
+            UNION,
+            INTERSECTION;
+        }
+
+        @VisibleForTesting
+        protected final Statistics statistics;
+
+        @VisibleForTesting
+        protected final PriorityQueue<RangeIterator> ranges;
+
+        public Builder(IteratorType type)
+        {
+            statistics = new Statistics(type);
+            ranges = new PriorityQueue<>(16, Comparator.comparingLong(RangeIterator::getCurrent));
+        }
+
+        public Long getMinimum()
+        {
+            return statistics.min;
+        }
+
+        public Long getMaximum()
+        {
+            return statistics.max;
+        }
+
+        public long getTokenCount()
+        {
+            return statistics.tokenCount;
+        }
+
+        public int rangeCount()
+        {
+            return ranges.size();
+        }
+
+        public Collection<RangeIterator> ranges()
+        {
+            return ranges;
+        }
+
+        public Builder add(RangeIterator range)
+        {
+            if (range != null)
+            {
+                // empty iterators are closed right away, yet still reported to the statistics
+                if (range.getCount() <= 0)
+                    FileUtils.closeQuietly(range);
+                else
+                    ranges.add(range);
+                statistics.update(range);
+            }
+            return this;
+        }
+
+        public Builder add(List<RangeIterator> ranges)
+        {
+            // the parameter shadows the builder's own queue; each element goes through add(RangeIterator)
+            if (ranges != null)
+                for (RangeIterator range : ranges)
+                    add(range);
+            return this;
+        }
+
+        public final RangeIterator build()
+        {
+            if (rangeCount() == 0)
+                return empty();
+            else
+                return buildIterator();
+        }
+
+        public static class EmptyRangeIterator extends RangeIterator
+        {
+            EmptyRangeIterator() { super(null, null, 0); }
+            public Token computeNext() { return endOfData(); }
+            protected void performSkipTo(Long nextToken) { }
+            public void close() { }
+        }
+
+        protected abstract RangeIterator buildIterator();
+
+        public static class Statistics
+        {
+            protected final IteratorType iteratorType;
+
+            protected Long min, max;
+            protected long tokenCount;
+
+            // iterator with the least number of items
+            protected RangeIterator minRange;
+            // iterator with the most number of items
+            protected RangeIterator maxRange;
+
+            // tracks if all of the added ranges overlap, which is useful in case of intersection,
+            // as it gives a direct answer as to whether such an iterator is going to produce any results.
+            private boolean isOverlapping = true;
+
+            public Statistics(IteratorType iteratorType)
+            {
+                this.iteratorType = iteratorType;
+            }
+
+            /**
+             * Folds the given range into the running statistics.
+             *
+             * Recomputes the combined min/max according to the iterator type, updates the overlap
+             * flag and the token count, and tracks the ranges with the least/most number of tokens.
+             *
+             * @param range The range to update statistics with.
+             */
+            public void update(RangeIterator range)
+            {
+                switch (iteratorType)
+                {
+                    case CONCAT:
+                        // ranges must arrive in sorted order: the previous max must not be greater than the next min.
+                        if (range.getCount() > 0)
+                        {
+                            if (tokenCount == 0) // first non-empty range fixes the overall minimum
+                            {
+                                min = range.getMinimum();
+                            }
+                            else if (tokenCount > 0 && max > range.getMinimum())
+                            {
+                                throw new IllegalArgumentException("RangeIterator must be sorted, previous max: " + max + ", next min: " + range.getMinimum());
+                            }
+
+                            max = range.getMaximum();
+                        }
+                        break;
+
+                    case UNION:
+                        min = nullSafeMin(min, range.getMinimum());
+                        max = nullSafeMax(max, range.getMaximum());
+                        break;
+
+                    case INTERSECTION:
+                        // minimum of the intersection is the biggest minimum of individual iterators
+                        min = nullSafeMax(min, range.getMinimum());
+                        // maximum of the intersection is the smallest maximum of individual iterators
+                        max = nullSafeMin(max, range.getMaximum());
+                        break;
+
+                    default:
+                        throw new IllegalStateException("Unknown iterator type: " + iteratorType);
+                }
+
+                // check if the new range is disjoint from the combined min/max computed above; for an intersection
+                // that means no results can be produced, so range storage can be cleaned up and nothing more added to it.
+                isOverlapping &= isOverlapping(min, max, range);
+
+                minRange = minRange == null ? range : min(minRange, range);
+                maxRange = maxRange == null ? range : max(maxRange, range);
+
+                tokenCount += range.getCount();
+            }
+
+            private RangeIterator min(RangeIterator a, RangeIterator b)
+            {
+                return a.getCount() > b.getCount() ? b : a;
+            }
+
+            private RangeIterator max(RangeIterator a, RangeIterator b)
+            {
+                return a.getCount() > b.getCount() ? a : b;
+            }
+
+            public boolean isDisjoint()
+            {
+                return !isOverlapping;
+            }
+
+            public double sizeRatio()
+            {
+                return minRange.getCount() * 1d / maxRange.getCount();
+            }
+        }
+    }
+
+    @VisibleForTesting
+    protected static boolean isOverlapping(RangeIterator a, RangeIterator b)
+    {
+        return isOverlapping(a.getCurrent(), a.getMaximum(), b);
+    }
+
+    /**
+     * Ranges are overlapping in the following cases:
+     *
+     *   * When they have a common subrange:
+     *
+     *   min       b.current      max          b.max
+     *   +---------|--------------+------------|
+     *
+     *   b.current      min       max          b.max
+     *   |--------------+---------+------------|
+     *
+     *   min        b.current     b.max        max
+     *   +----------|-------------|------------+
+     *
+     *
+     *  If either range is empty, they're disjoint.
+     */
+    @VisibleForTesting
+    protected static boolean isOverlapping(Long min, Long max, RangeIterator b)
+    {
+        return (min != null && max != null) &&
+               b.getCount() != 0 &&
+               (min.compareTo(b.getMaximum()) <= 0 && b.getCurrent().compareTo(max) <= 0);
+    }
+
+    /** Returns the smaller argument; a {@code null} argument yields the other one (so {@code null} only if both are). */
+    private static <T extends Comparable<? super T>> T nullSafeMin(T a, T b)
+    {
+        if (a == null) return b;
+        if (b == null) return a;
+
+        return a.compareTo(b) > 0 ? b : a;
+    }
+
+    /** Returns the larger argument; a {@code null} argument yields the other one (so {@code null} only if both are). */
+    private static <T extends Comparable<? super T>> T nullSafeMax(T a, T b)
+    {
+        if (a == null) return b;
+        if (b == null) return a;
+
+        return a.compareTo(b) > 0 ? a : b;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/RangeUnionIterator.java b/src/java/org/apache/cassandra/index/sai/utils/RangeUnionIterator.java
new file mode 100644
index 000000000000..84686e8b815e
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/RangeUnionIterator.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.io.util.FileUtils;
+
+/**
+ * Range Union Iterator is used to return sorted stream of elements from multiple RangeIterator instances.
+ *
+ * PriorityQueue is used as a sorting mechanism for the ranges, where each computeNext() operation would poll
+ * from the queue (and push when done), which returns range that contains the smallest element, because
+ * sorting is done on the moving window of range iteration {@link RangeIterator#getCurrent()}. Once retrieved
+ * the smallest element (return candidate) is attempted to be merged with other ranges, because there could
+ * be equal elements in adjacent ranges, such ranges are poll'ed only if their {@link RangeIterator#getCurrent()}
+ * equals to the return candidate.
+ *
+ * Modified from {@link org.apache.cassandra.index.sasi.utils.RangeUnionIterator} to support:
+ * 1. no generic type to reduce allocation
+ * 2. make sure iterators are closed when intersection ends because of lazy key fetching
+ */
+@SuppressWarnings("resource")
+public class RangeUnionIterator extends RangeIterator
+{
+    // Due to lazy key fetching, we cannot close iterator immediately
+    private final PriorityQueue<RangeIterator> ranges;
+
+    // If the ranges are deferred then the ranges queue is not
+    // necessarily in order so we need to maintain a separate queue
+    // of candidate tokens until the ranges queue is ordered correctly
+    private final PriorityQueue<Token> candidates;
+
+    private final Token.TokenMerger merger;
+    private final List<RangeIterator> toRelease;
+
+    private RangeUnionIterator(Builder.Statistics statistics, PriorityQueue<RangeIterator> ranges)
+    {
+        super(statistics);
+        this.ranges = ranges;
+        // Don't use Comparator.comparing here, it auto-boxes the longs
+        this.candidates = new PriorityQueue<>(ranges.size(), (t1, t2) -> Long.compare(t1.getLong(), t2.getLong()));
+        this.merger = new Token.ReusableTokenMerger(ranges.size());
+        this.toRelease = new ArrayList<>(ranges);
+    }
+
+    public Token computeNext()
+    {
+        Token candidate;
+        List<RangeIterator> processedRanges = new ArrayList<>(ranges.size()); // ranges polled in this call, re-queued before returning a token
+
+        // Only poll the ranges for a new candidate if the candidates queue is empty.
+        // Otherwise, always start with a candidate from the candidates queue until
+        // it is empty.
+        if (candidates.isEmpty())
+        {
+            RangeIterator head = null;
+
+            while (!ranges.isEmpty())
+            {
+                head = ranges.poll(); // heads without a next token are dropped from the queue
+                if (head.hasNext())
+                    break;
+            }
+
+            if (head == null || !head.hasNext())
+                return endOfData();
+
+            candidate = head.next();
+
+            if (head.hasNext())
+                processedRanges.add(head);
+        }
+        else
+        {
+            candidate = candidates.poll();
+            // the candidates queue may hold duplicates of this token, so discard them before continuing
+            while (!candidates.isEmpty())
+            {
+                if (candidate.get() < candidates.peek().get())
+                    break;
+                candidates.poll();
+            }
+        }
+
+        merger.reset();
+        merger.add(candidate);
+
+        long minCurrent = ranges.stream().mapToLong(RangeIterator::getCurrent).min().orElse(Long.MAX_VALUE);
+
+        if (candidate.get() < minCurrent) // candidate is below every queued range's current value: emit it as is
+        {
+            ranges.addAll(processedRanges);
+            return merger.merge();
+        }
+
+        while (!ranges.isEmpty())
+        {
+            RangeIterator range = ranges.poll();
+
+            if (!range.hasNext())
+                continue; // exhausted range: not re-queued
+
+            int cmp = Long.compare(candidate.get(), range.getCurrent());
+
+            if (cmp == 0)
+            {
+                // If the next token is the same then consume and merge it
+                merger.add(range.next());
+            }
+            else if (cmp > 0) // range holds a smaller token: it becomes the candidate, the old one is buffered
+            {
+                candidates.add(candidate);
+                candidate = range.next();
+                merger.reset();
+                merger.add(candidate);
+            }
+            else // range is ahead of the candidate: buffer its token for a later call
+            {
+                candidates.add(range.next());
+            }
+
+            processedRanges.add(range);
+        }
+
+        ranges.addAll(processedRanges);
+        return merger.merge();
+    }
+
+    /**
+     * Discards buffered candidates smaller than {@code nextToken}, then advances the queued
+     * ranges whose current value is still below it.
+     *
+     * @param nextToken the token to skip forward to
+     */
+    protected void performSkipTo(Long nextToken)
+    {
+        // buffered candidates below the target are no longer wanted
+        while (!candidates.isEmpty() && candidates.peek().get() < nextToken)
+            candidates.poll();
+
+        // stop as soon as the head of the queue is at or beyond the target
+        while (!ranges.isEmpty() && ranges.peek().getCurrent().compareTo(nextToken) < 0)
+        {
+            RangeIterator lagging = ranges.poll();
+
+            // a range whose maximum is below the target is dropped without being skipped
+            if (lagging.getMaximum().compareTo(nextToken) < 0)
+                continue;
+
+            lagging.skipTo(nextToken);
+            // only ranges that still have tokens go back on the queue
+            if (lagging.hasNext())
+                ranges.add(lagging);
+        }
+    }
+
+    public void close() throws IOException
+    {
+        // Due to lazy key fetching, ranges cannot be closed as soon as they are drained, so all of them are closed here.
+        // toRelease snapshots every range at construction and the queue only ever holds a subset of it: closing the queue too would close iterators twice.
+        toRelease.forEach(FileUtils::closeQuietly);
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static RangeIterator build(List<RangeIterator> tokens)
+    {
+        return new Builder().add(tokens).build();
+    }
+
+    public static class Builder extends RangeIterator.Builder
+    {
+        public Builder()
+        {
+            super(IteratorType.UNION);
+        }
+
+        /**
+         * Creates the iterator over the collected ranges.
+         *
+         * @return the queued range itself when exactly one was added, otherwise a new {@link RangeUnionIterator}
+         */
+        protected RangeIterator buildIterator()
+        {
+            if (rangeCount() == 1)
+                return ranges.poll();
+            return new RangeUnionIterator(statistics, ranges);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/SAICodecUtils.java b/src/java/org/apache/cassandra/index/sai/utils/SAICodecUtils.java
new file mode 100644
index 000000000000..a588f4cc59c6
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/SAICodecUtils.java
@@ -0,0 +1,266 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.apache.cassandra.index.sai.disk.format.Version;
+import org.apache.cassandra.io.compress.CorruptBlockException;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+import static org.apache.lucene.codecs.CodecUtil.CODEC_MAGIC;
+import static org.apache.lucene.codecs.CodecUtil.FOOTER_MAGIC;
+
+public class SAICodecUtils
+{
+    public static final String FOOTER_POINTER = "footerPointer";
+
+    public static void writeHeader(IndexOutput out) throws IOException
+    {
+        out.writeInt(CODEC_MAGIC);
+        out.writeString(Version.LATEST.toString());
+    }
+
+    public static void writeFooter(IndexOutput out) throws IOException
+    {
+        out.writeInt(FOOTER_MAGIC);
+        out.writeInt(0);
+        writeCRC(out);
+    }
+
+    public static Version checkHeader(DataInput in) throws IOException
+    {
+        try
+        {
+            final int actualMagic = in.readInt(); // the header starts with the codec magic written by writeHeader
+            if (actualMagic != CODEC_MAGIC)
+            {
+                throw new CorruptIndexException("codec header mismatch: actual header=" + actualMagic + " vs expected header=" + CODEC_MAGIC, in);
+            }
+            final Version actualVersion = Version.parse(in.readString()); // followed by the version string
+            if (!actualVersion.onOrAfter(Version.EARLIEST)) // versions that are not on or after Version.EARLIEST are rejected
+            {
+                throw new IOException("Unsupported version: " + actualVersion);
+            }
+            return actualVersion;
+        }
+        catch (Throwable th)
+        {
+            if (th.getCause() instanceof CorruptBlockException) // surface block-level corruption as index corruption, keeping the cause
+            {
+                throw new CorruptIndexException("corrupted", in, th.getCause());
+            }
+            else
+            {
+                throw th; // anything else propagates unchanged
+            }
+        }
+    }
+
+    public static long checkFooter(ChecksumIndexInput in) throws IOException
+    {
+        validateFooter(in, false);
+        // sample the input's checksum first, then read the CRC recorded in the footer
+        final long computed = in.getChecksum();
+        final long recorded = readCRC(in);
+        if (recorded == computed)
+            return computed;
+
+        throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + Long.toHexString(recorded) +
+                                        " actual=" + Long.toHexString(computed), in);
+    }
+
+    public static void validate(IndexInput input) throws IOException
+    {
+        checkHeader(input);
+        validateFooterAndResetPosition(input);
+    }
+
+    public static void validate(IndexInput input, long footerPointer) throws IOException
+    {
+        checkHeader(input);
+
+        long current = input.getFilePointer();
+        input.seek(footerPointer);
+        validateFooter(input, true);
+
+        input.seek(current);
+    }
+
+    public static void validateFooterAndResetPosition(IndexInput in) throws IOException
+    {
+        long position = in.getFilePointer();
+        long fileLength = in.length();
+        long footerLength = CodecUtil.footerLength();
+        long footerPosition = fileLength - footerLength;
+
+        if (footerPosition < 0)
+        {
+            throw new CorruptIndexException("invalid codec footer (file truncated?): file length=" + fileLength + ", footer length=" + footerLength, in);
+        }
+
+        in.seek(footerPosition);
+        validateFooter(in, false);
+        in.seek(position);
+    }
+
+    public static void validateChecksum(IndexInput input) throws IOException
+    {
+        final long startingPointer = input.getFilePointer();
+        final long recorded = CodecUtil.retrieveChecksum(input);
+        // return to where the caller left the input before recomputing the checksum over the file
+        input.seek(startingPointer);
+        final long recomputed = CodecUtil.checksumEntireFile(input);
+        if (recorded != recomputed)
+            throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + Long.toHexString(recorded) + " actual=" + Long.toHexString(recomputed), input);
+    }
+
+    /**
+     * Copied from org.apache.lucene.codecs.CodecUtil.validateFooter(IndexInput)
+     */
+    public static void validateFooter(IndexInput in, boolean padded) throws IOException
+    {
+        long remaining = in.length() - in.getFilePointer();
+        long expected = CodecUtil.footerLength();
+
+        if (!padded)
+        {
+            if (remaining < expected)
+            {
+                throw new CorruptIndexException("misplaced codec footer (file truncated?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
+            }
+            else if (remaining > expected)
+            {
+                throw new CorruptIndexException("misplaced codec footer (file extended?): remaining=" + remaining + ", expected=" + expected + ", fp=" + in.getFilePointer(), in);
+            }
+        }
+
+        final int magic = in.readInt();
+
+        if (magic != FOOTER_MAGIC)
+        {
+            throw new CorruptIndexException("codec footer mismatch (file truncated?): actual footer=" + magic + " vs expected footer=" + FOOTER_MAGIC, in);
+        }
+
+        final int algorithmID = in.readInt();
+
+        if (algorithmID != 0)
+        {
+            throw new CorruptIndexException("codec footer mismatch: unknown algorithmID: " + algorithmID, in);
+        }
+    }
+
+
+    // Copied from Lucene CodecUtil as they are not public
+
+    /**
+     * Writes CRC32 value as a 64-bit long to the output.
+     * @throws IllegalStateException if CRC is formatted incorrectly (wrong bits set)
+     * @throws IOException if an i/o error occurs
+     */
+    static void writeCRC(IndexOutput output) throws IOException {
+        long value = output.getChecksum();
+        if ((value & 0xFFFFFFFF00000000L) != 0) {
+            throw new IllegalStateException("Illegal CRC-32 checksum: " + value + " (resource=" + output + ")");
+        }
+        output.writeLong(value);
+    }
+
+    /**
+     * Reads CRC32 value as a 64-bit long from the input.
+     * @throws CorruptIndexException if CRC is formatted incorrectly (wrong bits set)
+     * @throws IOException if an i/o error occurs
+     */
+    static long readCRC(IndexInput input) throws IOException {
+        long value = input.readLong();
+        if ((value & 0xFFFFFFFF00000000L) != 0) {
+            throw new CorruptIndexException("Illegal CRC-32 checksum: " + value, input);
+        }
+        return value;
+    }
+
+    // Copied from Lucene PackedInts as they are not public
+
+    /** Checks that {@code blockSize} lies within the bounds and is a power of two; returns its number of trailing zero bits. */
+    public static int checkBlockSize(int blockSize, int minBlockSize, int maxBlockSize) {
+        if (blockSize < minBlockSize || blockSize > maxBlockSize) {
+            throw new IllegalArgumentException("blockSize must be >= " + minBlockSize + " and <= " + maxBlockSize + ", got " + blockSize);
+        }
+        // a power of two has a single bit set, so clearing its lowest set bit must leave zero
+        if ((blockSize & (blockSize - 1)) != 0) {
+            throw new IllegalArgumentException("blockSize must be a power of two, got " + blockSize);
+        }
+        return Integer.numberOfTrailingZeros(blockSize);
+    }
+
+    /** Returns {@code size / blockSize}, plus one when the division leaves a remainder, narrowed to an int. */
+    public static int numBlocks(long size, int blockSize) {
+        final int blocks = (int) (size / blockSize) + (size % blockSize != 0 ? 1 : 0);
+        if ((long) blocks * blockSize < size) { // the int result cannot cover size
+            throw new IllegalArgumentException("size is too large for this block size");
+        }
+        return blocks;
+    }
+
+    // Copied from Lucene BlockPackedReaderIterator as they are not public
+
+    /**
+     * Same as DataInput.readVLong but supports negative values: reads up to nine bytes, seven payload bits from each of the first eight and all eight bits from the ninth.
+     */
+    public static long readVLong(DataInput in) throws IOException
+    {
+        byte b = in.readByte();
+        if (b >= 0) return b; // high bit clear: the value fits in this single byte
+        long i = b & 0x7FL;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 7;
+        if (b >= 0) return i; // a byte with its high bit clear ends the value
+        b = in.readByte();
+        i |= (b & 0x7FL) << 14;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 21;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 28;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 35;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 42;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0x7FL) << 49;
+        if (b >= 0) return i;
+        b = in.readByte();
+        i |= (b & 0xFFL) << 56; // ninth byte: all eight bits are kept and fill bits 56-63, sign bit included
+        return i;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/SeekingRandomAccessInput.java b/src/java/org/apache/cassandra/index/sai/utils/SeekingRandomAccessInput.java
new file mode 100644
index 000000000000..64d66daebfdc
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/SeekingRandomAccessInput.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.RandomAccessInput;
+
+/**
+ * {@link IndexInput} adapter that exposes it as a {@link RandomAccessInput} type; every read first seeks the wrapped input to {@code pos}, so its file pointer moves.
+ */
+public class SeekingRandomAccessInput implements RandomAccessInput
+{
+    private final IndexInput in; // the wrapped input; repositioned by each read below
+
+    public SeekingRandomAccessInput(IndexInput in)
+    {
+        this.in = in;
+    }
+
+    @Override
+    public byte readByte(long pos) throws IOException
+    {
+        in.seek(pos);
+        return in.readByte();
+    }
+
+    @Override
+    public short readShort(long pos) throws IOException
+    {
+        in.seek(pos);
+        return in.readShort();
+    }
+
+    @Override
+    public int readInt(long pos) throws IOException
+    {
+        in.seek(pos);
+        return in.readInt();
+    }
+
+    @Override
+    public long readLong(long pos) throws IOException
+    {
+        in.seek(pos);
+        return in.readLong();
+    }
+
+    @Override
+    public String toString()
+    {
+        return "SeekingRandomAccessInput(" + in + ")";
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/TermIterator.java b/src/java/org/apache/cassandra/index/sai/utils/TermIterator.java
new file mode 100644
index 000000000000..c58ad339571f
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/TermIterator.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.utils.Throwables;
+
+public class TermIterator extends RangeIterator
+{
+    private static final Logger logger = LoggerFactory.getLogger(TermIterator.class);
+
+    private final QueryContext context;
+
+    private final RangeIterator union;
+    private final Set<SSTableIndex> referencedIndexes;
+
+    private TermIterator(RangeIterator union, Set<SSTableIndex> referencedIndexes, QueryContext queryContext)
+    {
+        super(union.getMinimum(), union.getMaximum(), union.getCount());
+
+        this.union = union;
+        this.referencedIndexes = referencedIndexes;
+        this.context = queryContext;
+    }
+
+    @SuppressWarnings("resource")
+    public static TermIterator build(final Expression e, Set<SSTableIndex> perSSTableIndexes, AbstractBounds<PartitionPosition> keyRange, QueryContext queryContext, boolean defer)
+    {
+        final List<RangeIterator> tokens = new ArrayList<>(1 + perSSTableIndexes.size());
+
+        RangeIterator memtableIterator = e.context.searchMemtable(e, keyRange);
+        if (memtableIterator != null)
+            tokens.add(memtableIterator);
+
+        for (final SSTableIndex index : perSSTableIndexes)
+        {
+            try
+            {
+                queryContext.checkpoint();
+                queryContext.incSstablesHit();
+                assert !index.isReleased();
+
+                SSTableQueryContext context = queryContext.getSSTableQueryContext(index.getSSTable());
+                RangeIterator keyIterator = index.search(e, keyRange, context, defer);
+                // a null result means this index contributes nothing to the union
+                if (keyIterator != null)
+                    tokens.add(keyIterator);
+            }
+            catch (Throwable e1)
+            {
+                if (logger.isDebugEnabled() && !(e1 instanceof AbortedOperationException))
+                    logger.debug(String.format("Failed search an index %s, skipping.", index.getSSTable()), e1);
+
+                // no TermIterator will ever own the iterators opened so far, so close them here to avoid a leak
+                tokens.forEach(FileUtils::closeQuietly);
+                throw Throwables.cleaned(e1);
+            }
+        }
+
+        RangeIterator ranges = RangeUnionIterator.build(tokens);
+        return new TermIterator(ranges, perSSTableIndexes, queryContext);
+    }
+
+    protected Token computeNext()
+    {
+        try
+        {
+            return union.hasNext() ? union.next() : endOfData();
+        }
+        finally
+        {
+            context.checkpoint();
+        }
+    }
+
+    protected void performSkipTo(Long nextToken)
+    {
+        try
+        {
+            union.skipTo(nextToken);
+        }
+        finally
+        {
+            context.checkpoint();
+        }
+    }
+
+    public void close()
+    {
+        FileUtils.closeQuietly(union);
+        referencedIndexes.forEach(TermIterator::releaseQuietly);
+        referencedIndexes.clear();
+    }
+
+    private static void releaseQuietly(SSTableIndex index)
+    {
+        try
+        {
+            index.release();
+        }
+        catch (Throwable e)
+        {
+            logger.error(String.format("Failed to release index %s", index.getSSTable()), e);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java b/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
new file mode 100644
index 000000000000..70b54cde69be
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
@@ -0,0 +1,533 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.Set;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+import com.googlecode.concurrenttrees.radix.ConcurrentRadixTree;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.db.marshal.CollectionType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.InetAddressType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FastByteOperations;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+
+public class TypeUtil
+{
+    private static final byte[] IPV4_PREFIX = new byte[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 };
+
+    /**
+     * DecimalType / BigDecimal values are indexed by truncating their asComparableBytes representation to this size,
+     * padding on the right with zero-value-bytes until this size is reached (if necessary).  This causes
+     * false-positives that must be filtered in a separate step after hitting the index and reading the associated
+     * (full) values.
+     */
+    public static final int DECIMAL_APPROXIMATION_BYTES = 24;
+
+    private TypeUtil() {}
+
+    /**
+     * Tells whether {@code term} would pass {@link AbstractType#validate(ByteBuffer)} on {@code validator}.
+     */
+    public static boolean isValid(ByteBuffer term, AbstractType<?> validator)
+    {
+        boolean valid = true;
+        try
+        {
+            validator.validate(term);
+        }
+        catch (MarshalException rejected)
+        {
+            valid = false;
+        }
+        return valid;
+    }
+
+    /**
+     * Indicates whether the index encoding of the given type may round the raw value,
+     * which is the case for the {@link IntegerType} and {@link DecimalType} encodings.
+     *
+     * Range searches over such types must be made inclusive when hitting the indexes so that
+     * rounded values are not missed; values outside the requested range are dropped by post-filtering.
+     */
+    public static boolean supportsRounding(AbstractType<?> type)
+    {
+        return isBigDecimal(type) || isBigInteger(type);
+    }
+
+    /**
+     * Returns the smaller of two {@code ByteBuffer} values according to
+     * {@link #compare(ByteBuffer, ByteBuffer, AbstractType)}; a {@code null} argument yields the other one.
+     */
+    public static ByteBuffer min(ByteBuffer a, ByteBuffer b, AbstractType<?> type)
+    {
+        return a == null ? b : (b == null || compare(b, a, type) > 0 ? a : b);
+    }
+
+    /**
+     * Returns the greater of two {@code ByteBuffer} values according to
+     * {@link #compare(ByteBuffer, ByteBuffer, AbstractType)}; a {@code null} argument yields the other one.
+     */
+    public static ByteBuffer max(ByteBuffer a, ByteBuffer b, AbstractType<?> type)
+    {
+        return a == null ? b : (b == null || compare(b, a, type) < 0 ? a : b);
+    }
+
+    /**
+     * Returns the lesser of two {@code ByteComparable} values according to {@link
+     * ByteComparable#compare(ByteComparable, ByteComparable, ByteComparable.Version)}; a {@code null} argument yields the other one.
+     */
+    public static ByteComparable min(ByteComparable a, ByteComparable b)
+    {
+        return a == null ? b : (b == null || ByteComparable.compare(b, a, ByteComparable.Version.OSS41) > 0 ? a : b);
+    }
+
+    /**
+     * Returns the greater of two {@code ByteComparable} values according to {@link
+     * ByteComparable#compare(ByteComparable, ByteComparable, ByteComparable.Version)}; a {@code null} argument yields the other one.
+     */
+    public static ByteComparable max(ByteComparable a, ByteComparable b)
+    {
+        return a == null ? b : (b == null || ByteComparable.compare(b, a, ByteComparable.Version.OSS41) < 0 ? a : b);
+    }
+
+    /**
+     * Returns the value length for the given {@link AbstractType}, selecting 16 for types
+     * that officially use VARIABLE_LENGTH but are, in fact, of a fixed length.
+     */
+    public static int fixedSizeOf(AbstractType<?> type)
+    {
+        if (type.isValueLengthFixed())
+            return type.valueLengthIfFixed();
+        else if (isInetAddress(type))
+            return 16;
+        else if (isBigInteger(type))
+            return 20;
+        else if (type instanceof DecimalType)
+            return DECIMAL_APPROXIMATION_BYTES;
+        return 16;
+    }
+
+    /** Resolves the indexed term type: the element type of a non-frozen collection per target kind, else the column type. */
+    public static AbstractType<?> cellValueType(Pair<ColumnMetadata, IndexTarget.Type> target)
+    {
+        AbstractType<?> type = target.left.type;
+        if (!isNonFrozenCollection(type))
+            return type;
+        CollectionType<?> collection = (CollectionType<?>) type;
+        switch (collection.kind)
+        {
+            case LIST:
+                return collection.valueComparator();
+            case SET:
+                return collection.nameComparator();
+            case MAP:
+                switch (target.right)
+                {
+                    case KEYS:
+                        return collection.nameComparator();
+                    case VALUES:
+                        return collection.valueComparator();
+                    case KEYS_AND_VALUES:
+                        return CompositeType.getInstance(collection.nameComparator(), collection.valueComparator());
+                }
+        }
+        return type;
+    }
+
+    /**
+     * Renders a term as a string, overriding the default getString method for {@link CompositeType}.
+     * The {@link ConcurrentRadixTree} requires string keys, but the getString method of
+     * {@link CompositeType} does not return a string that compares in the same order as the
+     * underlying {@link ByteBuffer}, so composite values are rendered as a hex string instead.
+     * @param value the term to render
+     * @param type the type of the term
+     * @return the hex form for composite types, otherwise {@code type.getString(value)}
+     */
+    public static String getString(ByteBuffer value, AbstractType<?> type)
+    {
+        return isComposite(type) ? ByteBufferUtil.bytesToHex(value) : type.getString(value);
+    }
+
+    /**
+     * The inverse of {@link #getString(ByteBuffer, AbstractType)}: for {@link CompositeType} the string is
+     * read as hex and converted back to bytes; every other type uses its own fromString.
+     *
+     * @return the term bytes parsed from {@code value}
+     */
+    public static ByteBuffer fromString(String value, AbstractType<?> type)
+    {
+        return isComposite(type) ? ByteBufferUtil.hexToBytes(value) : type.fromString(value);
+    }
+
+    public static ByteSource asComparableBytes(ByteBuffer value, AbstractType<?> type, ByteComparable.Version version)
+    {
+        if (type instanceof InetAddressType || type instanceof IntegerType || type instanceof DecimalType)
+            return ByteSource.optionalFixedLength(ByteBufferAccessor.instance, value);
+        return type.asComparableBytes(value, version);
+    }
+
+    /**
+     * Fills a byte array with the comparable bytes for a type.
+     * <p>
+     * This method expects a {@code value} parameter generated by calling {@link #encode(ByteBuffer, AbstractType)}.
+     * It is not generally safe to pass the output of other serialization methods to this method.  For instance, it is
+     * not generally safe to pass the output of {@link AbstractType#decompose(Object)} as the {@code value} parameter
+     * (there are certain types for which this is technically OK, but that doesn't hold for all types).
+     *
+     * @param value a value buffer returned by {@link #encode(ByteBuffer, AbstractType)}
+     * @param type the type associated with the encoded {@code value} parameter
+     * @param bytes this method's output
+     */
+    public static void toComparableBytes(ByteBuffer value, AbstractType<?> type, byte[] bytes)
+    {
+        if (isInetAddress(type))
+            ByteBufferUtil.arrayCopy(value, value.hasArray() ? value.arrayOffset() + value.position() : value.position(), bytes, 0, 16);
+        else if (isBigInteger(type))
+            ByteBufferUtil.arrayCopy(value, value.hasArray() ? value.arrayOffset() + value.position() : value.position(), bytes, 0, 20);
+        else if (type instanceof DecimalType)
+            ByteBufferUtil.arrayCopy(value, value.hasArray() ? value.arrayOffset() + value.position() : value.position(), bytes, 0, DECIMAL_APPROXIMATION_BYTES);
+        else
+            ByteBufferUtil.toBytes(type.asComparableBytes(value, ByteComparable.Version.OSS41), bytes);
+    }
+
+    /**
+     * Encode an external term from a memtable index or a compaction. The purpose of this is to
+     * allow terms of particular types to be handled differently and not use the default
+     * {@link ByteComparable} encoding.
+     */
+    public static ByteBuffer encode(ByteBuffer value, AbstractType<?> type)
+    {
+        if (value == null)
+            return null;
+
+        if (isInetAddress(type))
+            return encodeInetAddress(value);
+        else if (isBigInteger(type))
+            return encodeBigInteger(value);
+        else if (type instanceof DecimalType)
+            return encodeDecimal(value);
+        return value;
+    }
+
+    /**
+     * Compare two terms based on their type. This is used in place of {@link AbstractType#compare(ByteBuffer, ByteBuffer)}
+     * so that the default comparison can be overridden for specific types.
+     *
+     * Note: This should be used for all term comparison
+     */
+    public static int compare(ByteBuffer b1, ByteBuffer b2, AbstractType<?> type)
+    {
+        if (isInetAddress(type))
+            return compareInet(b1, b2);
+        // BigInteger values, frozen types and composite types (map entries) use compareUnsigned to maintain
+        // a consistent order between the in-memory index and the on-disk index.
+        else if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozenCollection(type))
+            return FastByteOperations.compareUnsigned(b1, b2);
+
+        return type.compare(b1, b2 );
+    }
+
+    /**
+     * This is used for value comparison in post-filtering - {@link Expression#isSatisfiedBy(ByteBuffer)}.
+     *
+     * This allows types to decide whether they should be compared based on their encoded value or their
+     * raw value. At present only {@link InetAddressType} values are compared by their encoded values to
+     * allow for ipv4 -> ipv6 equivalency in searches.
+     */
+    public static int comparePostFilter(Expression.Value requestedValue, Expression.Value columnValue, AbstractType<?> type)
+    {
+        if (isInetAddress(type))
+            return compareInet(requestedValue.encoded, columnValue.encoded);
+        // Override comparisons for frozen collections and composite types (map entries)
+        else if (isCompositeOrFrozenCollection(type))
+            return FastByteOperations.compareUnsigned(requestedValue.raw, columnValue.raw);
+
+        return type.compare(requestedValue.raw, columnValue.raw);
+    }
+
+    public static Iterator<ByteBuffer> collectionIterator(AbstractType<?> validator,
+                                                          ComplexColumnData cellData,
+                                                          Pair<ColumnMetadata, IndexTarget.Type> target,
+                                                          int nowInSecs)
+    {
+        if (cellData == null)
+            return null;
+        // Keep only live cells and project each one to the value that gets indexed for this target.
+        Stream<ByteBuffer> values = StreamSupport.stream(cellData.spliterator(), false)
+                                                 .filter(cell -> cell != null && cell.isLive(nowInSecs))
+                                                 .map(cell -> cellValue(target, cell));
+        // Inet address values are ordered by their encoded form.
+        if (isInetAddress(validator))
+            values = values.sorted((c1, c2) -> compareInet(encodeInetAddress(c1), encodeInetAddress(c2)));
+        return values.iterator();
+    }
+
+    public static Comparator<ByteBuffer> comparator(AbstractType<?> type)
+    {
+        // Override the comparator for BigInteger, frozen collections and composite types
+        if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozenCollection(type))
+            return FastByteOperations::compareUnsigned;
+
+        return type;
+    }
+
+    private static ByteBuffer cellValue(Pair<ColumnMetadata, IndexTarget.Type> target, Cell cell)
+    {
+        AbstractType<?> columnType = target.left.type;
+        if (!columnType.isCollection() || !columnType.isMultiCell())
+            return cell.buffer();
+        switch (((CollectionType<?>) columnType).kind)
+        {
+            case LIST:
+                //TODO Is there any optimisation can be done here with cell values?
+                return cell.buffer();
+            case SET:
+                return cell.path().get(0);
+            case MAP:
+                switch (target.right)
+                {
+                    case KEYS:
+                        return cell.path().get(0);
+                    case VALUES:
+                        return cell.buffer();
+                    case KEYS_AND_VALUES:
+                        return CompositeType.build(ByteBufferAccessor.instance, cell.path().get(0), cell.buffer());
+                }
+        }
+        return cell.buffer();
+    }
+
+    /**
+     * Compares 2 InetAddress terms as unsigned bytes. Both terms must already be in the
+     * 16 byte (ipv6) form; this is checked by assertion only.
+     */
+    private static int compareInet(ByteBuffer b1, ByteBuffer b2)
+    {
+        assert isIPv6(b1) && isIPv6(b2);
+
+        return FastByteOperations.compareUnsigned(b1, b2);
+    }
+
+    private static boolean isIPv6(ByteBuffer address)
+    {
+        return address.remaining() == 16; // only the length is checked: 16 remaining bytes counts as ipv6
+    }
+
+    /**
+     * Encodes an {@link InetAddress} value as a fixed width 16 byte value that is byte comparable
+     * and prefix compressible.
+     *
+     * A 4 byte (ipv4) value is copied into a new buffer behind the ipv4-mapped ipv6 prefix
+     * {@code IPV4_PREFIX}; a value of any other length is returned as is.
+     */
+    private static ByteBuffer encodeInetAddress(ByteBuffer value)
+    {
+        if (value.remaining() != 4)
+            return value;
+
+        // NOTE(review): confirm ByteBufferUtil.arrayCopy expects a source index that already includes arrayOffset().
+        int position = value.hasArray() ? value.arrayOffset() + value.position() : value.position();
+        ByteBuffer mapped = ByteBuffer.allocate(16);
+        System.arraycopy(IPV4_PREFIX, 0, mapped.array(), 0, IPV4_PREFIX.length);
+        ByteBufferUtil.arrayCopy(value, position, mapped, IPV4_PREFIX.length, value.remaining());
+        return mapped;
+    }
+
+    /**
+     * Encode a {@link BigInteger} into a fixed width 20 byte encoded value.
+     *
+     * The encoded value is byte comparable and prefix compressible.
+     *
+     * The format of the encoding is:
+     *
+     *  The first 4 bytes contain the length of the {@link BigInteger} byte array as a big-endian int,
+     *  negated for negative values, with the top bit then flipped.
+     *
+     *  The remaining 16 bytes contain the 16 most significant bytes of the
+     *  {@link BigInteger} byte array, sign extended when that array is shorter than 16 bytes.
+     *
+     *  An empty buffer carries no sign byte; it is encoded as a zero-length, non-negative value.
+     * @param value the serialized value; its position and limit are left untouched
+     * @return a new 20 byte heap buffer holding the encoded value
+     */
+    public static ByteBuffer encodeBigInteger(ByteBuffer value)
+    {
+        int size = value.remaining();
+        byte[] bytes = new byte[20];
+        // Relative bulk get on a duplicate: reads from the current position without moving the caller's buffer.
+        ByteBuffer source = value.duplicate();
+        if (size < 16)
+        {
+            int start = bytes.length - size;
+            source.get(bytes, start, size);
+            // Guard size == 0: there is no sign byte to inspect and bytes[start] would be out of bounds.
+            boolean negative = size > 0 && (bytes[start] & 0x80) != 0;
+            Arrays.fill(bytes, 4, start, negative ? (byte) 0xff : (byte) 0x00);
+        }
+        else
+        {
+            source.get(bytes, 4, 16);
+        }
+        if ((bytes[4] & 0x80) != 0)
+            size = -size;
+        bytes[0] = (byte)(size >> 24 & 0xff);
+        bytes[1] = (byte)(size >> 16 & 0xff);
+        bytes[2] = (byte)(size >> 8 & 0xff);
+        bytes[3] = (byte)(size & 0xff);
+        bytes[0] ^= 0x80;
+        return ByteBuffer.wrap(bytes);
+    }
+
+    /* Type comparison to get rid of ReversedType */
+
+    /**
+     * Tells whether values of the given {@link AbstractType} should be indexed as literals.
+     */
+    public static boolean isLiteral(AbstractType<?> type)
+    {
+        return isCompositeOrFrozenCollection(type) || isUTF8OrAscii(type);
+    }
+
+    /**
+     * Returns <code>true</code> if given {@link AbstractType} is UTF8 or Ascii
+     */
+    public static boolean isUTF8OrAscii(AbstractType<?> type)
+    {
+        type = baseType(type);
+        return type instanceof UTF8Type || type instanceof AsciiType;
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is a
+     * {@link CompositeType} (map entry) or a frozen collection.
+     */
+    public static boolean isCompositeOrFrozenCollection(AbstractType<?> type)
+    {
+        return isComposite(type) || isFrozenCollection(type);
+    }
+
+    /**
+     * Returns <code>true</code> if given {@link AbstractType} is frozen-collection.
+     */
+    public static boolean isFrozenCollection(AbstractType<?> type)
+    {
+        type = baseType(type);
+        return type.isCollection() && !type.isMultiCell();
+    }
+
+    /**
+     * Returns <code>true</code> if given {@link AbstractType} is non-frozen-collection.
+     */
+    public static boolean isNonFrozenCollection(AbstractType<?> type)
+    {
+        type = baseType(type);
+        return type.isCollection() && type.isMultiCell();
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is contained in
+     * the supplied set of types.
+     */
+    public static boolean isIn(AbstractType<?> type, Set<AbstractType<?>> types)
+    {
+        return types.contains(baseType(type));
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is an
+     * {@link InetAddressType}.
+     */
+    private static boolean isInetAddress(AbstractType<?> type)
+    {
+        return baseType(type) instanceof InetAddressType;
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is an
+     * {@link IntegerType}.
+     */
+    private static boolean isBigInteger(AbstractType<?> type)
+    {
+        return baseType(type) instanceof IntegerType;
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is a
+     * {@link DecimalType}.
+     */
+    private static boolean isBigDecimal(AbstractType<?> type)
+    {
+        return baseType(type) instanceof DecimalType;
+    }
+
+    /**
+     * Tells whether the given type, with any {@link ReversedType} wrapper removed, is a
+     * {@link CompositeType}.
+     */
+    public static boolean isComposite(AbstractType<?> type)
+    {
+        return baseType(type) instanceof CompositeType;
+    }
+
+    /**
+     * @return the wrapped base type when the given type is reversed, otherwise the type itself
+     */
+    private static AbstractType<?> baseType(AbstractType<?> type)
+    {
+        return !type.isReversed() ? type : ((ReversedType<?>) type).baseType;
+    }
+
+    public static ByteBuffer encodeDecimal(ByteBuffer value)
+    {
+        // The comparable form is cut, or right-padded with zero bytes, to exactly DECIMAL_APPROXIMATION_BYTES bytes.
+        ByteSource sized = ByteSource.cutOrRightPad(DecimalType.instance.asComparableBytes(value, ByteComparable.Version.OSS41), DECIMAL_APPROXIMATION_BYTES, 0);
+        return ByteBuffer.wrap(ByteSourceInverse.readBytes(sized, DECIMAL_APPROXIMATION_BYTES));
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/view/IndexViewManager.java b/src/java/org/apache/cassandra/index/sai/view/IndexViewManager.java
new file mode 100644
index 000000000000..d69227c37770
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/view/IndexViewManager.java
@@ -0,0 +1,161 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.view;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Maintains an atomic view for read requests, so that requests can read all data during concurrent compactions.
+ *
+ * All per-column {@link SSTableIndex} updates should be proxied by {@link StorageAttachedIndexGroup} to make
+ * sure per-sstable {@link SSTableContext} are in-sync.
+ */
+public class IndexViewManager
+{
+    private static final Logger logger = LoggerFactory.getLogger(IndexViewManager.class);
+    
+    private final ColumnContext context;
+    private final AtomicReference<View> view = new AtomicReference<>();
+
+    public IndexViewManager(ColumnContext context)
+    {
+        this(context, Collections.emptySet());
+    }
+
+    @VisibleForTesting
+    IndexViewManager(ColumnContext context, Collection<SSTableIndex> indices)
+    {
+        this.context = context;
+        this.view.set(new View(context, indices));
+    }
+
+    public View getView()
+    {
+        return view.get(); // the View currently held by the AtomicReference
+    }
+
+    /**
+     * Replaces old SSTables with new ones by atomically swapping in a newly built view.
+     *
+     * @param oldSSTables A set of SSTables to remove.
+     * @param newSSTableContexts A set of SSTableContexts to add to tracker.
+     * @param validate if true, per-column index files' header and footer will be validated.
+     * @param rename if true check whether the per-column index components need renaming
+     *
+     * @return A set of SSTable contexts which have attached to them invalid index components.
+     */
+    public Set<SSTableContext> update(Collection<SSTableReader> oldSSTables, Collection<SSTableContext> newSSTableContexts, boolean validate, boolean rename)
+    {
+        // Valid indexes on the left and invalid SSTable contexts on the right...
+        Pair<Set<SSTableIndex>, Set<SSTableContext>> indexes = context.getBuiltIndexes(newSSTableContexts, validate, rename);
+
+        View currentView, newView;
+        Collection<SSTableIndex> newViewIndexes = new HashSet<>();
+        Collection<SSTableIndex> releasableIndexes = new ArrayList<>();
+        Collection<SSTableReader> toRemove = new HashSet<>(oldSSTables);
+        // The loop below is retried until compareAndSet installs the new view over the one it was built from.
+        do
+        {
+            currentView = view.get();
+            newViewIndexes.clear(); // both work lists are rebuilt from scratch on every attempt
+            releasableIndexes.clear();
+
+            for (SSTableIndex sstableIndex : currentView)
+            {
+                // When aborting early open transaction, toRemove may have the same sstable files as newSSTableContexts,
+                // but different SSTableReader java objects with different start positions. So we need to release them
+                // from existing view.  see DSP-19677
+                SSTableReader sstable = sstableIndex.getSSTable();
+                if (toRemove.contains(sstable) || newViewIndexes.contains(sstableIndex))
+                    releasableIndexes.add(sstableIndex);
+                else
+                    newViewIndexes.add(sstableIndex);
+            }
+
+            for (SSTableIndex sstableIndex : indexes.left) // newly built indexes; one already in the view is released instead
+            {
+                if (newViewIndexes.contains(sstableIndex))
+                    releasableIndexes.add(sstableIndex);
+                else
+                    newViewIndexes.add(sstableIndex);
+            }
+
+            newView = new View(context, newViewIndexes);
+        }
+        while (!view.compareAndSet(currentView, newView));
+
+        releasableIndexes.forEach(SSTableIndex::release); // released only after the swap has succeeded
+
+        if (logger.isTraceEnabled())
+            logger.trace(context.logMessage("There are now {} active SSTable indexes."), view.get().getIndexes().size());
+
+        return indexes.right;
+    }
+
+    /**
+     * Marks the indexes of the given SSTables obsolete, then removes them from the view via
+     * {@link #update(Collection, Collection, boolean, boolean)}.
+     */
+    public void drop(Collection<SSTableReader> sstablesToRebuild)
+    {
+        View snapshot = view.get();
+        Set<SSTableReader> doomed = new HashSet<>(sstablesToRebuild);
+        for (SSTableIndex candidate : snapshot)
+        {
+            if (doomed.contains(candidate.getSSTable()))
+                candidate.markObsolete();
+        }
+
+        update(doomed, Collections.emptyList(), false, false);
+    }
+
+    /**
+     * Marks every index in the current view obsolete and then installs an empty view.
+     */
+    public void invalidate()
+    {
+        // Only the indexes of the view read here are marked.
+        view.get().forEach(SSTableIndex::markObsolete);
+
+        // The replacement view starts with no indexes at all.
+        view.set(new View(context, Collections.emptyList()));
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/view/RangeTermTree.java b/src/java/org/apache/cassandra/index/sai/view/RangeTermTree.java
new file mode 100644
index 000000000000..d2f6375b0ceb
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/view/RangeTermTree.java
@@ -0,0 +1,124 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.view;
+
+import java.lang.invoke.MethodHandles;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.utils.Interval;
+import org.apache.cassandra.utils.IntervalTree;
+
+public class RangeTermTree implements TermTree
+{
+    private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    protected final ByteBuffer min, max;
+    protected final AbstractType<?> comparator;
+    
+    private final IntervalTree<Term, SSTableIndex, Interval<Term, SSTableIndex>> rangeTree;
+
+    private RangeTermTree(ByteBuffer min, ByteBuffer max, IntervalTree<Term, SSTableIndex, Interval<Term, SSTableIndex>> rangeTree, AbstractType<?> comparator)
+    {
+        this.min = min;
+        this.max = max;
+        this.rangeTree = rangeTree;
+        this.comparator = comparator;
+    }
+
+    public Set<SSTableIndex> search(Expression e)
+    {
+        // An unbounded side of the expression falls back to the tree-wide min/max term.
+        Term from = new Term(e.lower == null ? min : e.lower.value.encoded, comparator);
+        Term to = new Term(e.upper == null ? max : e.upper.value.encoded, comparator);
+        Interval<Term, SSTableIndex> probe = Interval.create(from, to, null);
+
+        return new HashSet<>(rangeTree.search(probe));
+    }
+
+    /** Collects one term interval per {@link SSTableIndex} and builds the resulting {@link RangeTermTree}. */
+    static class Builder extends TermTree.Builder
+    {
+        final List<Interval<Term, SSTableIndex>> intervals = new ArrayList<>();
+
+        protected Builder(AbstractType<?> comparator)
+        {
+            super(comparator);
+        }
+
+        public void addIndex(SSTableIndex index)
+        {
+            // The interval spans the index's smallest and largest term and carries the index as its payload.
+            ByteBuffer lowest = index.minTerm();
+            ByteBuffer highest = index.maxTerm();
+            Interval<Term, SSTableIndex> span = Interval.create(new Term(lowest, comparator), new Term(highest, comparator), index);
+
+            if (logger.isTraceEnabled())
+            {
+                logger.trace(index.getColumnContext().logMessage("Adding index for SSTable {} with minTerm={} and maxTerm={}..."),
+                             index.getSSTable().descriptor, comparator.compose(lowest), comparator.compose(highest));
+            }
+
+            intervals.add(span);
+        }
+
+        public TermTree build()
+        {
+            return new RangeTermTree(min, max, IntervalTree.build(intervals), comparator);
+        }
+    }
+
+    /**
+     * Comparable wrapper around a "raw" term. It is required because IntervalTree doesn't support custom
+     * Comparator implementations and relies on its items being comparable, which raw terms are not.
+     */
+    protected static class Term implements Comparable<Term>
+    {
+        private final ByteBuffer term;
+        private final AbstractType<?> comparator;
+
+        Term(ByteBuffer term, AbstractType<?> comparator)
+        {
+            this.term = term;
+            this.comparator = comparator;
+        }
+        // Ordering is delegated to TypeUtil.compare using this term's own comparator type.
+        public int compareTo(Term other)
+        {
+            return TypeUtil.compare(this.term, other.term, this.comparator);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/view/TermTree.java b/src/java/org/apache/cassandra/index/sai/view/TermTree.java
new file mode 100644
index 000000000000..d2041b29f83c
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/view/TermTree.java
@@ -0,0 +1,61 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.view;
+
+import java.nio.ByteBuffer;
+import java.util.Set;
+
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+public interface TermTree
+{
+    Set<SSTableIndex> search(Expression e);
+
+    abstract class Builder
+    {
+        protected final AbstractType<?> comparator;
+        protected ByteBuffer min, max; // running term bounds over the indexes added so far; null before the first add
+
+        protected Builder(AbstractType<?> comparator)
+        {
+            this.comparator = comparator;
+        }
+
+        public final void add(SSTableIndex index)
+        {
+            addIndex(index); // subclass hook runs first; the bounds below are only touched if it returns normally
+            if (min == null || TypeUtil.compare(min, index.minTerm(), comparator) > 0)
+                min = index.minTerm();
+            if (max == null || TypeUtil.compare(max, index.maxTerm(), comparator) < 0)
+                max = index.maxTerm();
+        }
+
+        protected abstract void addIndex(SSTableIndex index);
+        public abstract TermTree build();
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/view/View.java b/src/java/org/apache/cassandra/index/sai/view/View.java
new file mode 100644
index 000000000000..d10ac1589c6d
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/view/View.java
@@ -0,0 +1,129 @@
+/*
+ * All changes to the original code are Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.view;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Interval;
+import org.apache.cassandra.utils.IntervalTree;
+
+public class View implements Iterable<SSTableIndex>
+{
+    // Every SSTable index in this view, keyed by the descriptor of the SSTable it belongs to.
+    private final Map<Descriptor, SSTableIndex> view;
+    private final TermTree termTree;
+    private final AbstractType<?> keyValidator;
+    private final IntervalTree<Key, SSTableIndex, Interval<Key, SSTableIndex>> keyIntervalTree;
+
+    public View(ColumnContext context, Collection<SSTableIndex> indexes)
+    {
+        this.view = new HashMap<>();
+        this.keyValidator = context.keyValidator();
+
+        TermTree.Builder terms = new RangeTermTree.Builder(context.getValidator());
+        List<Interval<Key, SSTableIndex>> keyRanges = new ArrayList<>();
+
+        // A single pass files each index by descriptor, by term range and by key range.
+        for (SSTableIndex index : indexes)
+        {
+            view.put(index.getSSTable().descriptor, index);
+            terms.add(index);
+            Key first = new Key(index.minKey());
+            Key last = new Key(index.maxKey());
+            keyRanges.add(Interval.create(first, last, index));
+        }
+
+        this.termTree = terms.build();
+        this.keyIntervalTree = IntervalTree.build(keyRanges);
+    }
+
+    public Set<SSTableIndex> match(Expression expression)
+    {
+        return termTree.search(expression); // lookup by term, delegated to the term tree
+    }
+
+    public List<SSTableIndex> match(DecoratedKey minKey, DecoratedKey maxKey)
+    {
+        return keyIntervalTree.search(Interval.create(new Key(minKey), new Key(maxKey), null));
+    }
+
+    public Iterator<SSTableIndex> iterator()
+    {
+        return view.values().iterator();
+    }
+
+    public Collection<SSTableIndex> getIndexes()
+    {
+        return view.values(); // live view of the backing map's values, not a copy
+    }
+
+    public boolean containsSSTable(SSTableReader sstable)
+    {
+        return view.containsKey(sstable.descriptor);
+    }
+
+    public int size()
+    {
+        return view.size();
+    }
+
+    /**
+     * Wrapper that lets "raw" keys serve as IntervalTree bounds: IntervalTree doesn't support custom
+     * Comparator implementations and relies on its items being Comparable, which raw keys are not.
+     */
+    private static class Key implements Comparable<Key>
+    {
+        private final DecoratedKey key;
+
+        public Key(DecoratedKey key)
+        {
+            this.key = key;
+        }
+
+        public int compareTo(Key o)
+        {
+            return key.compareTo(o.key);
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("View{view=%s, keyValidator=%s, keyIntervalTree=%s}", view, keyValidator, keyIntervalTree);
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
new file mode 100644
index 000000000000..f001fdf63b80
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import java.util.function.Consumer;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.virtual.AbstractVirtualTable;
+import org.apache.cassandra.db.virtual.SimpleDataSet;
+import org.apache.cassandra.db.virtual.VirtualTable;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.index.sai.view.View;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * A {@link VirtualTable} providing a system view of per-column storage-attached index metadata.
+ */
+public class IndexesSystemView extends AbstractVirtualTable
+{
+    private static final Logger logger = LoggerFactory.getLogger(IndexesSystemView.class);
+
+    static final String NAME = "indexes";
+
+    static final String KEYSPACE_NAME = "keyspace_name";
+    static final String INDEX_NAME = "index_name";
+    static final String TABLE_NAME = "table_name";
+    static final String COLUMN_NAME = "column_name";
+    static final String IS_QUERYABLE = "is_queryable";
+    static final String IS_BUILDING = "is_building";
+    static final String IS_STRING = "is_string";
+    static final String ANALYZER = "analyzer";
+    static final String INDEXED_SSTABLE_COUNT = "indexed_sstable_count";
+    static final String CELL_COUNT = "cell_count";
+    static final String PER_TABLE_DISK_SIZE = "per_table_disk_size";
+    static final String PER_COLUMN_DISK_SIZE = "per_column_disk_size";
+    static final String PER_TABLE_FILE_CACHE_SIZE = "per_table_file_cache_size"; // not part of the schema built below
+    static final String PER_COLUMN_FILE_CACHE_SIZE = "per_column_file_cache_size"; // not part of the schema built below
+
+    public IndexesSystemView(String keyspace)
+    {
+        super(TableMetadata.builder(keyspace, NAME)
+                           .partitioner(new LocalPartitioner(UTF8Type.instance))
+                           .comment("Storage-attached column index metadata")
+                           .kind(TableMetadata.Kind.VIRTUAL)
+                           .addPartitionKeyColumn(KEYSPACE_NAME, UTF8Type.instance)
+                           .addClusteringColumn(INDEX_NAME, UTF8Type.instance)
+                           .addRegularColumn(TABLE_NAME, UTF8Type.instance)
+                           .addRegularColumn(COLUMN_NAME, UTF8Type.instance)
+                           .addRegularColumn(IS_QUERYABLE, BooleanType.instance)
+                           .addRegularColumn(IS_BUILDING, BooleanType.instance)
+                           .addRegularColumn(IS_STRING, BooleanType.instance)
+                           .addRegularColumn(ANALYZER, UTF8Type.instance)
+                           .addRegularColumn(INDEXED_SSTABLE_COUNT, Int32Type.instance)
+                           .addRegularColumn(CELL_COUNT, LongType.instance)
+                           .addRegularColumn(PER_TABLE_DISK_SIZE, LongType.instance)
+                           .addRegularColumn(PER_COLUMN_DISK_SIZE, LongType.instance)
+                           .build());
+    }
+
+    // This table is read-only for now: every write is rejected.
+    @Override
+    public void apply(PartitionUpdate update)
+    {
+        // TODO port DataSet. Now we can't change index queryability via system view
+        throw new InvalidRequestException("Modification is not supported by table " + metadata);
+    }
+
+    @Override
+    public DataSet data()
+    {
+        SimpleDataSet rows = new SimpleDataSet(metadata());
+
+        for (String ks : Schema.instance.getUserKeyspaces())
+        {
+            Keyspace keyspace = Schema.instance.getKeyspaceInstance(ks);
+            if (keyspace == null)
+                throw new IllegalArgumentException("Unknown keyspace " + ks);
+
+            for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+            {
+                SecondaryIndexManager manager = cfs.indexManager;
+                StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+
+                if (group == null)
+                    continue; // no storage-attached index group for this table
+
+                for (StorageAttachedIndex index : group)
+                {
+                    ColumnContext context = index.getContext();
+                    String indexName = context.getIndexName();
+                    View view = context.getView();
+
+                    rows.row(ks, indexName)
+                        .column(TABLE_NAME, cfs.name)
+                        .column(COLUMN_NAME, context.getColumnName())
+                        .column(IS_QUERYABLE, manager.isIndexQueryable(index))
+                        .column(IS_BUILDING, manager.isIndexBuilding(indexName))
+                        .column(IS_STRING, context.isLiteral())
+                        .column(ANALYZER, context.getAnalyzer().toString())
+                        .column(INDEXED_SSTABLE_COUNT, view.size())
+                        .column(CELL_COUNT, context.getCellCount())
+                        .column(PER_TABLE_DISK_SIZE, group.diskUsage())
+                        .column(PER_COLUMN_DISK_SIZE, context.diskUsage());
+                }
+            }
+        }
+
+        return rows;
+    }
+
+    // Not called anywhere in this class; presumably kept for the queryability update in the apply() TODO.
+    private static Consumer<Boolean> isQueryableUpdateConsumer(SecondaryIndexManager manager, StorageAttachedIndex index)
+    {
+        return queryable -> {
+            logger.debug(index.getContext().logMessage("Index is now {}queryable."), queryable ? "" : "non-");
+            if (queryable)
+                manager.makeIndexQueryable(index, Index.Status.BUILD_SUCCEEDED);
+            else
+                manager.makeIndexNonQueryable(index, Index.Status.BUILD_FAILED);
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
new file mode 100644
index 000000000000..50f7c055b5d4
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.virtual.AbstractVirtualTable;
+import org.apache.cassandra.db.virtual.SimpleDataSet;
+import org.apache.cassandra.db.virtual.VirtualTable;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * A {@link VirtualTable} providing a system view of SSTable index metadata.
+ */
+public class SSTablesSystemView extends AbstractVirtualTable
+{
+    static final String NAME = "sstable_indexes";
+
+    static final String KEYSPACE_NAME = "keyspace_name";
+    static final String INDEX_NAME = "index_name";
+    static final String SSTABLE_NAME = "sstable_name";
+    static final String TABLE_NAME = "table_name";
+    static final String COLUMN_NAME = "column_name";
+    static final String FORMAT_VERSION = "format_version";
+    static final String CELL_COUNT = "cell_count";
+    static final String MIN_ROW_ID = "min_row_id";
+    static final String MAX_ROW_ID = "max_row_id";
+    static final String START_TOKEN = "start_token";
+    static final String END_TOKEN = "end_token";
+    static final String PER_TABLE_DISK_SIZE = "per_table_disk_size";
+    static final String PER_COLUMN_DISK_SIZE = "per_column_disk_size";
+
+    public SSTablesSystemView(String keyspace)
+    {
+        super(TableMetadata.builder(keyspace, NAME)
+                           .partitioner(new LocalPartitioner(UTF8Type.instance))
+                           .comment("SSTable index metadata")
+                           .kind(TableMetadata.Kind.VIRTUAL)
+                           .addPartitionKeyColumn(KEYSPACE_NAME, UTF8Type.instance)
+                           .addClusteringColumn(INDEX_NAME, UTF8Type.instance)
+                           .addClusteringColumn(SSTABLE_NAME, UTF8Type.instance)
+                           .addRegularColumn(TABLE_NAME, UTF8Type.instance)
+                           .addRegularColumn(COLUMN_NAME, UTF8Type.instance)
+                           .addRegularColumn(FORMAT_VERSION, UTF8Type.instance)
+                           .addRegularColumn(CELL_COUNT, LongType.instance)
+                           .addRegularColumn(MIN_ROW_ID, LongType.instance)
+                           .addRegularColumn(MAX_ROW_ID, LongType.instance)
+                           .addRegularColumn(START_TOKEN, UTF8Type.instance)
+                           .addRegularColumn(END_TOKEN, UTF8Type.instance)
+                           .addRegularColumn(PER_TABLE_DISK_SIZE, LongType.instance)
+                           .addRegularColumn(PER_COLUMN_DISK_SIZE, LongType.instance)
+                           .build());
+    }
+
+    @Override
+    public DataSet data()
+    {
+        SimpleDataSet rows = new SimpleDataSet(metadata());
+
+        for (String ks : Schema.instance.getUserKeyspaces())
+        {
+            Keyspace keyspace = Schema.instance.getKeyspaceInstance(ks);
+            if (keyspace == null)
+                throw new IllegalArgumentException("Unknown keyspace " + ks);
+
+            for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+            {
+                StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+
+                if (group == null)
+                    continue; // no storage-attached index group for this table
+
+                Token.TokenFactory tokens = cfs.metadata().partitioner.getTokenFactory();
+
+                for (StorageAttachedIndex index : group)
+                {
+                    ColumnContext context = index.getContext();
+
+                    for (SSTableIndex sstableIndex : context.getView()) // one row() call per SSTable index in the view
+                    {
+                        SSTableReader sstable = sstableIndex.getSSTable();
+                        Descriptor descriptor = sstable.descriptor;
+                        AbstractBounds<Token> bounds = sstable.getBounds();
+
+                        rows.row(ks, context.getIndexName(), sstable.getFilename())
+                            .column(TABLE_NAME, descriptor.cfname)
+                            .column(COLUMN_NAME, context.getColumnName())
+                            .column(FORMAT_VERSION, sstableIndex.getVersion().toString())
+                            .column(CELL_COUNT, sstableIndex.getRowCount())
+                            .column(MIN_ROW_ID, sstableIndex.minSSTableRowId())
+                            .column(MAX_ROW_ID, sstableIndex.maxSSTableRowId())
+                            .column(START_TOKEN, tokens.toString(bounds.left))
+                            .column(END_TOKEN, tokens.toString(bounds.right))
+                            .column(PER_TABLE_DISK_SIZE, sstableIndex.getSSTableContext().diskUsage())
+                            .column(PER_COLUMN_DISK_SIZE, sstableIndex.sizeOfPerColumnComponents());
+                    }
+                }
+            }
+        }
+
+        return rows;
+    }
+}
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
new file mode 100644
index 000000000000..27315ce707ba
--- /dev/null
+++ b/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import java.util.List;
+import java.util.function.Consumer;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.virtual.AbstractVirtualTable;
+import org.apache.cassandra.db.virtual.SimpleDataSet;
+import org.apache.cassandra.db.virtual.VirtualTable;
+import org.apache.cassandra.dht.LocalPartitioner;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * A {@link VirtualTable} providing a system view of SSTable index segment metadata.
+ */
+public class SegmentsSystemView extends AbstractVirtualTable
+{
+    static final String NAME = "sstable_index_segments";
+
+    static final String KEYSPACE_NAME = "keyspace_name";
+    static final String INDEX_NAME = "index_name";
+    static final String SSTABLE_NAME = "sstable_name";
+    static final String TABLE_NAME = "table_name";
+    static final String COLUMN_NAME = "column_name";
+    static final String CELL_COUNT = "cell_count";
+    static final String SEGMENT_ROW_ID_OFFSET = "segment_row_id_offset";
+    static final String MIN_SSTABLE_ROW_ID = "min_sstable_row_id";
+    static final String MAX_SSTABLE_ROW_ID = "max_sstable_row_id";
+    static final String START_TOKEN = "start_token";
+    static final String END_TOKEN = "end_token";
+    static final String MIN_TERM = "min_term";
+    static final String MAX_TERM = "max_term";
+    static final String COMPONENT_METADATA = "component_metadata";
+
+    public SegmentsSystemView(String keyspace)
+    {
+        super(TableMetadata.builder(keyspace, NAME)
+                           .partitioner(new LocalPartitioner(UTF8Type.instance))
+                           .comment("SSTable index segment metadata")
+                           .kind(TableMetadata.Kind.VIRTUAL)
+                           .addPartitionKeyColumn(KEYSPACE_NAME, UTF8Type.instance)
+                           .addClusteringColumn(INDEX_NAME, UTF8Type.instance)
+                           .addClusteringColumn(SSTABLE_NAME, UTF8Type.instance)
+                           .addClusteringColumn(SEGMENT_ROW_ID_OFFSET, LongType.instance)
+                           .addRegularColumn(TABLE_NAME, UTF8Type.instance)
+                           .addRegularColumn(COLUMN_NAME, UTF8Type.instance)
+                           .addRegularColumn(CELL_COUNT, LongType.instance)
+                           .addRegularColumn(MIN_SSTABLE_ROW_ID, LongType.instance)
+                           .addRegularColumn(MAX_SSTABLE_ROW_ID, LongType.instance)
+                           .addRegularColumn(START_TOKEN, UTF8Type.instance)
+                           .addRegularColumn(END_TOKEN, UTF8Type.instance)
+                           .addRegularColumn(MIN_TERM, UTF8Type.instance)
+                           .addRegularColumn(MAX_TERM, UTF8Type.instance)
+                           .addRegularColumn(COMPONENT_METADATA,
+                                             MapType.getInstance(UTF8Type.instance,
+                                                                 MapType.getInstance(UTF8Type.instance, UTF8Type.instance, false),
+                                                                 false))
+                           .build());
+    }
+
+    @Override
+    public DataSet data()
+    {
+        SimpleDataSet rows = new SimpleDataSet(metadata());
+
+        forEachIndex(context -> {
+            for (SSTableIndex sstableIndex : context.getView())
+            {
+                SSTableReader sstable = sstableIndex.getSSTable();
+                List<SegmentMetadata> segments = sstableIndex.segments();
+                Descriptor descriptor = sstable.descriptor;
+                Token.TokenFactory tokens = sstable.metadata().partitioner.getTokenFactory();
+
+                for (SegmentMetadata segment : segments) // one row() call per segment of this SSTable index
+                {
+                    rows.row(sstable.metadata().keyspace, context.getIndexName(), sstable.getFilename(), segment.segmentRowIdOffset)
+                        .column(TABLE_NAME, descriptor.cfname)
+                        .column(COLUMN_NAME, context.getColumnName())
+                        .column(CELL_COUNT, segment.numRows)
+                        .column(MIN_SSTABLE_ROW_ID, segment.minSSTableRowId)
+                        .column(MAX_SSTABLE_ROW_ID, segment.maxSSTableRowId)
+                        .column(START_TOKEN, tokens.toString(segment.minKey.getToken()))
+                        .column(END_TOKEN, tokens.toString(segment.maxKey.getToken()))
+                        .column(MIN_TERM, context.getValidator().getSerializer().deserialize(segment.minTerm).toString())
+                        .column(MAX_TERM, context.getValidator().getSerializer().deserialize(segment.maxTerm).toString())
+                        .column(COMPONENT_METADATA, segment.componentMetadatas.asMap());
+                }
+            }
+        });
+
+        return rows;
+    }
+
+    private void forEachIndex(Consumer<ColumnContext> process)
+    {
+        for (String ks : Schema.instance.getUserKeyspaces())
+        {
+            Keyspace keyspace = Schema.instance.getKeyspaceInstance(ks);
+            if (keyspace == null)
+                throw new IllegalArgumentException("Unknown keyspace " + ks);
+
+            for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
+            {
+                StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+
+                if (group == null)
+                    continue; // no storage-attached index group for this table
+
+                for (StorageAttachedIndex index : group)
+                {
+                    process.accept(index.getContext());
+                }
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/schema/IndexMetadata.java b/src/java/org/apache/cassandra/schema/IndexMetadata.java
index d4188aa2c9f6..9f7104d6d130 100644
--- a/src/java/org/apache/cassandra/schema/IndexMetadata.java
+++ b/src/java/org/apache/cassandra/schema/IndexMetadata.java
@@ -40,6 +40,7 @@
 import org.apache.cassandra.exceptions.UnknownIndexException;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.internal.CassandraIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.utils.FBUtilities;
@@ -63,6 +64,11 @@ public final class IndexMetadata
      */
     private static final Map<String, String> indexNameAliases = new ConcurrentHashMap<>();
 
+    static
+    {
+        indexNameAliases.put(StorageAttachedIndex.class.getSimpleName(), StorageAttachedIndex.class.getCanonicalName()); // alias: simple class name -> canonical name
+    }
+
     public enum Kind
     {
         KEYS, CUSTOM, COMPOSITES
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
index ae355bcd5eaf..f9ad9694085a 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
@@ -108,7 +108,7 @@ static float estimateResultsPerRange(PartitionRangeReadCommand command, Keyspace
         Index index = command.getIndex(cfs);
         float maxExpectedResults = index == null
                                    ? command.limits().estimateTotalResults(cfs)
-                                   : index.getEstimatedResultRows();
+                                   : command.indexQueryPlan().getEstimatedResultRows();
 
         // adjust maxExpectedResults by the number of tokens this node has and the replication factor for this ks
         return (maxExpectedResults / DatabaseDescriptor.getNumTokens())
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
index a08375fd67f6..c03bb09a2425 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
@@ -92,6 +92,7 @@ public class DatabaseDescriptorRefTest
     "org.apache.cassandra.config.YamlConfigurationLoader$PropertiesChecker$1",
     "org.apache.cassandra.config.YamlConfigurationLoader$CustomConstructor",
     "org.apache.cassandra.config.TransparentDataEncryptionOptions",
+    "org.apache.cassandra.config.StorageAttachedIndexOptions",
     "org.apache.cassandra.db.ConsistencyLevel",
     "org.apache.cassandra.db.commitlog.CommitLogSegmentManagerFactory",
     "org.apache.cassandra.db.commitlog.DefaultCommitLogSegmentMgrFactory",
diff --git a/test/unit/org/apache/cassandra/index/sai/IndexingSchemaLoader.java b/test/unit/org/apache/cassandra/index/sai/IndexingSchemaLoader.java
new file mode 100644
index 000000000000..c6fab79ee259
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/IndexingSchemaLoader.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.marshal.DoubleType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Indexes;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * Test schema helper producing {@link TableMetadata} builders for tables whose columns carry
+ * {@link StorageAttachedIndex} indexes.
+ */
+public class IndexingSchemaLoader extends SchemaLoader
+{
+    /**
+     * Builds a table keyed by {@code id} in which every regular column except {@code height} has a
+     * {@link StorageAttachedIndex} named {@code <cfName>_<suffix>}.
+     */
+    public static TableMetadata.Builder ndiCFMD(String ksName, String cfName)
+    {
+        TableMetadata.Builder builder =
+                TableMetadata.builder(ksName, cfName)
+                             .addPartitionKeyColumn("id", UTF8Type.instance)
+                             .addRegularColumn("first_name", UTF8Type.instance)
+                             .addRegularColumn("last_name", UTF8Type.instance)
+                             .addRegularColumn("age", Int32Type.instance)
+                             .addRegularColumn("height", Int32Type.instance)
+                             .addRegularColumn("timestamp", LongType.instance)
+                             .addRegularColumn("address", UTF8Type.instance)
+                             .addRegularColumn("score", DoubleType.instance)
+                             .addRegularColumn("comment", UTF8Type.instance)
+                             .addRegularColumn("comment_suffix_split", UTF8Type.instance)
+                             .addRegularColumn("/output/full-name/", UTF8Type.instance)
+                             .addRegularColumn("/data/output/id", UTF8Type.instance)
+                             .addRegularColumn("first_name_prefix", UTF8Type.instance);
+
+        Indexes.Builder indexes = Indexes.builder();
+
+        indexes.add(saiIndex(cfName, "first_name", "first_name"))
+               .add(saiIndex(cfName, "last_name", "last_name"))
+               .add(saiIndex(cfName, "age", "age"))
+               .add(saiIndex(cfName, "timestamp", "timestamp"))
+               .add(saiIndex(cfName, "address", "address", "case_sensitive", "false"))
+               .add(saiIndex(cfName, "score", "score"))
+               .add(saiIndex(cfName, "comment", "comment", "case_sensitive", "true"))
+               .add(saiIndex(cfName, "comment_suffix_split", "comment_suffix_split"))
+               .add(saiIndex(cfName, "output_full_name", "/output/full-name/", "case_sensitive", "false"))
+               .add(saiIndex(cfName, "data_output_id", "/data/output/id"))
+               .add(saiIndex(cfName, "first_name_prefix", "first_name_prefix"));
+
+        return builder.indexes(indexes.build());
+    }
+
+    /** Same table as the varargs overload, indexing {@code location}, {@code age}, {@code height} and {@code score}. */
+    public static TableMetadata.Builder clusteringNDICFMD(String ksName, String cfName)
+    {
+        return clusteringNDICFMD(ksName, cfName, "location", "age", "height", "score");
+    }
+
+    /**
+     * Builds a table with partition key {@code name}, clustering columns {@code location} and {@code age}, regular
+     * columns {@code height} and {@code score} and static column {@code nickname}, indexing each listed column.
+     */
+    public static TableMetadata.Builder clusteringNDICFMD(String ksName, String cfName, String...indexedColumns)
+    {
+        Indexes.Builder indexes = Indexes.builder();
+        for (String indexedColumn : indexedColumns)
+            indexes.add(saiIndex(cfName, indexedColumn, indexedColumn));
+
+        return TableMetadata.builder(ksName, cfName)
+                            .addPartitionKeyColumn("name", UTF8Type.instance)
+                            .addClusteringColumn("location", UTF8Type.instance)
+                            .addClusteringColumn("age", Int32Type.instance)
+                            .addRegularColumn("height", Int32Type.instance)
+                            .addRegularColumn("score", DoubleType.instance)
+                            .addStaticColumn("nickname", UTF8Type.instance)
+                            .indexes(indexes.build());
+    }
+
+    /** Builds a sensor table with indexes on the static {@code sensor_type} and the regular {@code value} and {@code variance}. */
+    public static TableMetadata.Builder staticNDICFMD(String ksName, String cfName)
+    {
+        TableMetadata.Builder builder =
+                TableMetadata.builder(ksName, cfName)
+                             .addPartitionKeyColumn("sensor_id", Int32Type.instance)
+                             .addStaticColumn("sensor_type", UTF8Type.instance)
+                             .addClusteringColumn("date", LongType.instance)
+                             .addRegularColumn("value", DoubleType.instance)
+                             .addRegularColumn("variance", Int32Type.instance);
+
+        Indexes.Builder indexes = Indexes.builder();
+
+        indexes.add(saiIndex(cfName, "sensor_type", "sensor_type", "case_sensitive", "false"))
+               .add(saiIndex(cfName, "value", "value"))
+               .add(saiIndex(cfName, "variance", "variance"));
+
+        return builder.indexes(indexes.build());
+    }
+
+    /** Builds a song table keyed by {@code song_id} with indexes on {@code title} and {@code artist}. */
+    public static TableMetadata.Builder fullTextSearchNDICFMD(String ksName, String cfName)
+    {
+        TableMetadata.Builder builder =
+                TableMetadata.builder(ksName, cfName)
+                             .addPartitionKeyColumn("song_id", UUIDType.instance)
+                             .addRegularColumn("title", UTF8Type.instance)
+                             .addRegularColumn("artist", UTF8Type.instance);
+
+        Indexes.Builder indexes = Indexes.builder();
+
+        indexes.add(saiIndex(cfName, "title", "title"))
+               .add(saiIndex(cfName, "artist", "artist", "case_sensitive", "false"));
+
+        return builder.indexes(indexes.build());
+    }
+
+    /**
+     * Builds the metadata of a custom {@link StorageAttachedIndex} named {@code <cfName>_<suffix>}.
+     *
+     * @param cfName       table name, used as the prefix of the index name
+     * @param suffix       suffix of the index name, appended after an underscore
+     * @param target       value stored under {@link IndexTarget#TARGET_OPTION_NAME}
+     * @param extraOptions further index options as alternating key/value pairs
+     */
+    private static IndexMetadata saiIndex(String cfName, String suffix, String target, String... extraOptions)
+    {
+        if (extraOptions.length % 2 != 0)
+            throw new IllegalArgumentException("extraOptions must be key/value pairs, got " + extraOptions.length + " elements");
+
+        Map<String, String> options = new HashMap<>();
+        options.put(IndexTarget.CUSTOM_INDEX_OPTION_NAME, StorageAttachedIndex.class.getName());
+        options.put(IndexTarget.TARGET_OPTION_NAME, target);
+        for (int i = 0; i < extraOptions.length; i += 2)
+            options.put(extraOptions[i], extraOptions[i + 1]);
+
+        return IndexMetadata.fromSchemaMetadata(cfName + "_" + suffix, IndexMetadata.Kind.CUSTOM, options);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/SAITester.java b/test/unit/org/apache/cassandra/index/sai/SAITester.java
new file mode 100644
index 000000000000..945253b2502e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/SAITester.java
@@ -0,0 +1,750 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.channels.FileChannel;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Objects;
+import java.util.Random;
+import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import javax.management.AttributeNotFoundException;
+import javax.management.MalformedObjectNameException;
+import javax.management.ObjectName;
+
+import com.google.common.base.Predicates;
+import com.google.common.collect.Sets;
+import org.junit.After;
+import org.junit.Assert;
+
+import com.datastax.driver.core.QueryTrace;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.exceptions.ReadFailureException;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.exceptions.RequestFailureReason;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.lucene.codecs.CodecUtil;
+import org.awaitility.Awaitility;
+
+import static org.apache.cassandra.inject.ActionBuilder.newActionBuilder;
+import static org.apache.cassandra.inject.Expression.quote;
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class SAITester extends CQLTester
+{
+    protected static final String CREATE_KEYSPACE_TEMPLATE = "CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}";
+    // %s is the table placeholder; CREATE_INDEX_TEMPLATE escapes it as %%s so the column name can be formatted in first.
+    protected static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s (id1 TEXT PRIMARY KEY, v1 INT, v2 TEXT) WITH compaction = " +
+            "{'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }";
+    protected static final String CREATE_INDEX_TEMPLATE = "CREATE CUSTOM INDEX IF NOT EXISTS ON %%s(%s) USING 'StorageAttachedIndex'";
+    // Default timeout, in seconds, applied by waitForAssert(Runnable).
+    protected static int ASSERTION_TIMEOUT_SECONDS = 15;
+    // Injection counter attached to CompactionManager.submitIndexBuild(SecondaryIndexBuilder, ActiveCompactionsTracker).
+    protected static final Injections.Counter INDEX_BUILD_COUNTER = Injections.newCounter("IndexBuildCounter")
+                                                                              .add(newInvokePoint().onClass(CompactionManager.class)
+                                                                                                   .onMethod("submitIndexBuild", "SecondaryIndexBuilder", "ActiveCompactionsTracker"))
+                                                                              .build();
+    // Injection counter attached to IndexComponents.validatePerSSTableComponents().
+    protected static final Injections.Counter perSSTableValidationCounter = Injections.newCounter("PerSSTableValidationCounter")
+                                                                                      .add(newInvokePoint().onClass(IndexComponents.class)
+                                                                                                           .onMethod("validatePerSSTableComponents"))
+                                                                                      .build();
+    // Injection counter attached to IndexComponents.validatePerColumnComponents(boolean).
+    protected static final Injections.Counter perColumnValidationCounter = Injections.newCounter("PerColumnValidationCounter")
+                                                                                     .add(newInvokePoint().onClass(IndexComponents.class)
+                                                                                                          .onMethod("validatePerColumnComponents", "boolean"))
+                                                                                     .build();
+    // Interned identifiers named after the v1 and v2 columns declared in CREATE_TABLE_TEMPLATE.
+    protected static ColumnIdentifier V1_COLUMN_IDENTIFIER = ColumnIdentifier.getInterned("v1", true);
+    protected static ColumnIdentifier V2_COLUMN_IDENTIFIER = ColumnIdentifier.getInterned("v2", true);
+
+    public enum CorruptionType
+    {
+        REMOVED // deletes the file, failing if the deletion is refused
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        if (!file.delete())
+                            throw new IOException("Unable to delete file: " + file);
+                    }
+                },
+        EMPTY_FILE // truncates the file to zero length
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        try (FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE)) { channel.truncate(0); }
+                    }
+                },
+        TRUNCATED_HEADER // keeps only the first 2 bytes of the file
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        try (FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE)) { channel.truncate(2); }
+                    }
+                },
+        TRUNCATED_DATA // cuts the file 2 bytes before the position where its footer starts
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        // header length is not fixed, use footer length to navigate a given data position
+                        try (FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE)) { channel.truncate(file.length() - CodecUtil.footerLength() - 2); }
+                    }
+                },
+        TRUNCATED_FOOTER // cuts the file 2 bytes into its footer
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        try (FileChannel channel = FileChannel.open(file.toPath(), StandardOpenOption.WRITE)) { channel.truncate(file.length() - CodecUtil.footerLength() + 2); }
+                    }
+                },
+        APPENDED_DATA // appends 100 random bytes to the file
+                {
+                    @Override
+                    public void corrupt(File file) throws IOException
+                    {
+                        try (RandomAccessFile raf = new RandomAccessFile(file, "rw"))
+                        {
+                            raf.seek(file.length());
+                            // position is now at the end of the existing content, so the write below extends the file
+                            byte[] corruptedData = new byte[100];
+                            new Random().nextBytes(corruptedData);
+                            raf.write(corruptedData);
+                        }
+                    }
+                };
+        // Damages the given file in place in the way the constant describes; the channel is always closed, even on failure.
+        public abstract void corrupt(File file) throws IOException;
+    }
+
+    @After
+    public void removeAllInjections()
+    {
+        Injections.deleteAll(); // invoked after each test through JUnit's @After
+    }
+
+    /**
+     * Creates a test {@link ColumnContext} whose index carries the same name as the indexed column.
+     *
+     * @param name      used as both the column name and the index name
+     * @param validator type of the indexed column
+     */
+    public static ColumnContext createColumnContext(String name, AbstractType<?> validator)
+    {
+        return createColumnContext(name, name, validator);
+    }
+
+    /** Builds a test {@link ColumnContext} for a regular column, using an explicitly chosen index name. */
+    public static ColumnContext createColumnContext(String columnName, String indexName, AbstractType<?> validator)
+    {
+        // Only the column name, index name and column type vary; the remaining arguments are fixed test values.
+        ColumnMetadata column = ColumnMetadata.regularColumn("sai", "internal", columnName, validator);
+        IndexMetadata index = IndexMetadata.fromSchemaMetadata(indexName, IndexMetadata.Kind.CUSTOM, null);
+
+        return new ColumnContext("test_ks", "test_cf", UTF8Type.instance, new ClusteringComparator(), column, index,
+                                 IndexWriterConfig.emptyConfig());
+    }
+
+    protected void simulateNodeRestart()
+    {
+        simulateNodeRestart(true); // true: also wait until the indexes are queryable again
+    }
+
+    // Resets the SAI group of every index on the current table, rebuilds each index and runs the pre-join
+    // tasks; when wait is true, additionally blocks until all of the table's indexes are queryable.
+    protected void simulateNodeRestart(boolean wait)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+
+        store.indexManager.listIndexes()
+                          .forEach(index -> ((StorageAttachedIndexGroup) store.indexManager.getIndexGroup(index)).reset());
+        store.indexManager.listIndexes().forEach(index -> store.indexManager.buildIndex(index));
+        store.indexManager.executePreJoinTasksBlocking(true);
+
+        if (wait) waitForIndexQueryable();
+    }
+
+    /** Applies {@code corruptionType} to the file of {@code ndiComponent} in every live SSTable of the current table. */
+    protected void corruptNDIComponent(Component ndiComponent, CorruptionType corruptionType) throws Exception
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+
+        for (SSTableReader reader : store.getLiveSSTables())
+        {
+            corruptionType.corrupt(reader.descriptor.fileFor(ndiComponent));
+        }
+    }
+
+    protected void waitForAssert(Runnable runnableAssert, long timeout, TimeUnit unit) // re-runs the assertion until it passes or the timeout expires
+    {
+        Awaitility.await().dontCatchUncaughtExceptions().atMost(timeout, unit).untilAsserted(runnableAssert::run);
+    }
+
+    protected void waitForAssert(Runnable assertion)
+    {
+        waitForAssert(assertion, ASSERTION_TIMEOUT_SECONDS, TimeUnit.SECONDS); // same wait, bounded by the default timeout
+    }
+
+    protected boolean indexNeedsFullRebuild(String index)
+    {
+        // Answered by the index manager of the current test table.
+        return Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).indexManager.needsFullRebuild(index);
+    }
+
+    protected boolean isIndexQueryable()
+    {
+        return isIndexQueryable(KEYSPACE, currentTable()); // checks the current table in KEYSPACE
+    }
+
+    // True only when every index registered on the given table is queryable; a table without indexes
+    // therefore counts as queryable.
+    protected boolean isIndexQueryable(String keyspace, String table)
+    {
+        ColumnFamilyStore store = Keyspace.open(keyspace).getColumnFamilyStore(table);
+
+        return store.indexManager.listIndexes()
+                                 .stream()
+                                 .allMatch(index -> store.indexManager.isIndexQueryable(index));
+    }
+
+    protected void verifyInitialIndexFailed(String indexName)
+    {
+        // The initial build counts as failed once the index is reported as needing a full rebuild; wait for that.
+        waitForAssert(() -> assertTrue(indexNeedsFullRebuild(indexName)));
+    }
+
+    protected boolean verifyChecksum(ColumnContext context)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        for (SSTableReader reader : store.getLiveSSTables())
+        {
+            IndexComponents indexComponents = IndexComponents.create(context.getIndexName(), reader);
+            if (!(indexComponents.validatePerSSTableComponentsChecksum()
+                  && indexComponents.validatePerColumnComponentsChecksum(context.isLiteral())))
+                return false; // the first live SSTable with an invalid checksum decides the result
+        }
+        return true;
+    }
+
+    protected static void assertFailureReason(ReadFailureException e, RequestFailureReason reason)
+    {
+        int expected = reason.codeForNativeProtocol();
+        int actual = e.getFailuresMap().get(FBUtilities.getBroadcastAddressAndPort().address); // entry for this node's broadcast address
+        assertEquals(expected, actual);
+    }
+
+    protected Object getMBeanAttribute(ObjectName name, String attribute) throws Exception
+    {
+        return jmxConnection.getAttribute(name, attribute); // read through the inherited jmxConnection
+    }
+
+    protected Object getMetricValue(ObjectName metricObjectName)
+    {
+        // Gauges publish their reading under the "Value" attribute whereas counters publish it under "Count",
+        // so "Value" is tried first and "Count" is the fallback when that attribute does not exist.
+        try
+        {
+            try
+            {
+                return getMBeanAttribute(metricObjectName, "Value");
+            }
+            catch (AttributeNotFoundException ignored)
+            {
+                return getMBeanAttribute(metricObjectName, "Count");
+            }
+        }
+        catch (Exception e)
+        {
+            // Any other failure, including one raised by the "Count" fallback, is rethrown unchecked.
+            throw new RuntimeException(e);
+        }
+    }
+
+    public void waitForIndexQueryable()
+    {
+        waitForIndexQueryable(KEYSPACE, currentTable()); // current table in KEYSPACE
+    }
+
+    public void waitForIndexQueryable(String keyspace, String table)
+    {
+        waitForAssert(() -> assertTrue(isIndexQueryable(keyspace, table)), 60, TimeUnit.SECONDS); // allows up to a minute
+    }
+
+    // Submits a maximal compaction for the current table; the value returned by the submission is not used here.
+    protected void startCompaction() throws Throwable
+    {
+        for (ColumnFamilyStore store : StorageService.instance.getValidColumnFamilies(true, false, KEYSPACE, currentTable()))
+        {
+            int gcBefore = CompactionManager.getDefaultGcBefore(store, FBUtilities.nowInSeconds());
+            CompactionManager.instance.submitMaximal(store, gcBefore, false);
+        }
+    }
+
+    public void waitForCompactions() // waits up to 10 seconds until isCompacting is false across all column family stores
+    {
+        waitForAssert(() -> assertFalse(CompactionManager.instance.isCompacting(ColumnFamilyStore.all(), Predicates.alwaysTrue())), 10, TimeUnit.SECONDS);
+    }
+
+    protected void waitForCompactionsFinished() // waits up to 10 seconds for getCompactionTasks() to reach zero
+    {
+        waitForAssert(() -> assertEquals(0, getCompactionTasks()), 10, TimeUnit.SECONDS);
+    }
+
+    protected void waitForEquals(ObjectName name, ObjectName name2)
+    {
+        // Waits up to 10 seconds for the first metric to equal the second metric plus two.
+        waitForAssert(() -> {
+            long first = ((Number) getMetricValue(name)).longValue();
+            // add 2 for the first 2 queries in setupCluster
+            long secondAdjusted = ((Number) getMetricValue(name2)).longValue() + 2;
+
+            assertEquals(first, secondAdjusted);
+        }, 10, TimeUnit.SECONDS);
+    }
+
+    protected void waitForEquals(ObjectName name, long value) // waits up to 10 seconds for the metric to equal value
+    {
+        waitForAssert(() -> assertEquals(value, ((Number) getMetricValue(name)).longValue()), 10, TimeUnit.SECONDS);
+    }
+
+    protected ObjectName objectName(String name, String keyspace, String table, String index, String type)
+    {
+        String pattern = "org.apache.cassandra.metrics:type=StorageAttachedIndex,keyspace=%s,table=%s,index=%s,scope=%s,name=%s";
+        try
+        {
+            return new ObjectName(String.format(pattern, keyspace, table, index, type, name)); // "type" fills the scope key
+        }
+        catch (Throwable ex)
+        {
+            throw Throwables.unchecked(ex); // keeps the method free of checked exceptions
+        }
+    }
+
+    protected ObjectName objectNameNoIndex(String name, String keyspace, String table, String type)
+    {
+        String pattern = "org.apache.cassandra.metrics:type=StorageAttachedIndex,keyspace=%s,table=%s,scope=%s,name=%s";
+        try
+        {
+            return new ObjectName(String.format(pattern, keyspace, table, type, name)); // "type" fills the scope key
+        }
+        catch (Throwable ex)
+        {
+            throw Throwables.unchecked(ex); // keeps the method free of checked exceptions
+        }
+    }
+
+    protected void upgradeSSTables()
+    {
+        try
+        {
+            StorageService.instance.upgradeSSTables(KEYSPACE, false, currentTable()); // restricted to the current table
+        }
+        catch (Throwable e)
+        {
+            throw new RuntimeException(e); // wrapped so callers need not declare anything
+        }
+    }
+
+    protected long totalDiskSpaceUsed()
+    {
+        // Current count of the totalDiskSpaceUsed table metric of the current test table.
+        return Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).metric.totalDiskSpaceUsed.getCount();
+    }
+
+    protected long indexDiskSpaceUse()
+    {
+        // Fails with a NullPointerException when the current table has no StorageAttachedIndexGroup.
+        return Objects.requireNonNull(StorageAttachedIndexGroup.getIndexGroup(Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()))).totalDiskUsage();
+    }
+
+    protected int getOpenIndexFiles()
+    {
+        // Reported by the SAI index group of the current table, resolved through the Schema singleton.
+        return StorageAttachedIndexGroup.getIndexGroup(Schema.instance.getKeyspaceInstance(KEYSPACE).getColumnFamilyStore(currentTable())).openIndexFiles();
+    }
+
+    protected long getDiskUsage()
+    {
+        // Reported by the SAI index group of the current table, resolved through the Schema singleton.
+        return StorageAttachedIndexGroup.getIndexGroup(Schema.instance.getKeyspaceInstance(KEYSPACE).getColumnFamilyStore(currentTable())).diskUsage();
+    }
+
+    protected void verifyIndexFiles(int numericFiles, int stringFiles) // per-SSTable count = max of both, completion markers = sum of both
+    {
+        verifyIndexFiles(Math.max(numericFiles, stringFiles), numericFiles, stringFiles, numericFiles + stringFiles);
+    }
+
+    protected void verifyIndexFiles(int perSSTableFiles, int numericFiles, int stringFiles, int completionFiles)
+    {
+        Set<File> allIndexFiles = indexFiles();
+        // Every per-SSTable component must be present perSSTableFiles times.
+        for (Component perSSTableComponent : IndexComponents.PER_SSTABLE_COMPONENTS)
+        {
+            Set<File> found = componentFiles(allIndexFiles, perSSTableComponent);
+            assertEquals(found.toString(), perSSTableFiles, found.size());
+        }
+        // Every string index component must be present stringFiles times.
+        for (IndexComponents.NDIType stringType : IndexComponents.STRING_COMPONENTS)
+        {
+            Set<File> found = componentFiles(allIndexFiles, stringType.name);
+            assertEquals(found.toString(), stringFiles, found.size());
+        }
+        // The KD_TREE component must be present numericFiles times.
+        Set<File> kdTrees = componentFiles(allIndexFiles, IndexComponents.NDIType.KD_TREE.name);
+        assertEquals(kdTrees.toString(), numericFiles, kdTrees.size());
+        // The META component must be present once per numeric file and once per string file.
+        Set<File> metas = componentFiles(allIndexFiles, IndexComponents.NDIType.META.name);
+        assertEquals(metas.toString(), numericFiles + stringFiles, metas.size());
+        // Column completion markers are checked against their own expected count.
+        Set<File> markers = componentFiles(allIndexFiles, IndexComponents.NDIType.COLUMN_COMPLETION_MARKER.name);
+        assertEquals(markers.toString(), completionFiles, markers.size());
+    }
+
+    // Returns the regular files in the current table's data directories whose names end with the name of any
+    // component exposed by the table's StorageAttachedIndexGroup instances; each directory is listed once.
+    protected Set<File> indexFiles()
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        Set<String> componentNames = cfs.indexManager.listIndexGroups().stream()
+                                                     .filter(g -> g instanceof StorageAttachedIndexGroup)
+                                                     .map(Index.Group::getComponents)
+                                                     .flatMap(Set::stream)
+                                                     .map(component -> component.name)
+                                                     .collect(Collectors.toSet());
+        Set<File> indexFiles = new HashSet<>();
+        for (File dir : cfs.getDirectories().getCFDirectories())
+        {
+            File[] children = dir.listFiles();
+            if (children == null)
+                continue; // listFiles() yields null for a missing or unreadable directory
+            for (File child : children)
+                if (child.isFile() && componentNames.stream().anyMatch(suffix -> child.getName().endsWith(suffix)))
+                    indexFiles.add(child);
+        }
+        return indexFiles;
+    }
+
+    protected ObjectName bufferSpaceObjectName(String name) throws MalformedObjectNameException // name carries no keyspace, table or index key
+    {
+        return new ObjectName(String.format("org.apache.cassandra.metrics:type=StorageAttachedIndex,name=%s", name));
+    }
+
+    protected long getSegmentBufferSpaceLimit() throws Exception
+    {
+        Long limit = (Long) getMetricValue(bufferSpaceObjectName("SegmentBufferSpaceLimitBytes"));
+        return limit; // unboxed to the primitive return type
+    }
+
+    protected Object getSegmentBufferUsedBytes() throws Exception
+    {
+        // the metric value is returned as-is, without casting it to a specific type
+        return getMetricValue(bufferSpaceObjectName("SegmentBufferSpaceUsedBytes"));
+    }
+
+    protected Object getColumnIndexBuildsInProgress() throws Exception
+    {
+        // the metric value is returned as-is, without casting it to a specific type
+        return getMetricValue(bufferSpaceObjectName("ColumnIndexBuildsInProgress"));
+    }
+
+    /**
+     * Verifies the index when the same number of SSTable contexts and SSTable indexes is expected.
+     * Delegates to the three-argument overload, which declares no checked exception, so no wrapping is needed.
+     *
+     * @param indexName name of the index to check
+     * @param count expected number of both SSTable contexts and SSTable indexes
+     */
+    protected void verifySSTableIndexes(String indexName, int count)
+    {
+        verifySSTableIndexes(indexName, count, count);
+    }
+
+    protected void verifySSTableIndexes(String indexName, int sstableContextCount, int sstableIndexCount)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        int actualContexts = StorageAttachedIndexGroup.getIndexGroup(store).sstableContextManager().size();
+        assertEquals("Expected " + sstableContextCount +" SSTableContexts, but got " + actualContexts, sstableContextCount, actualContexts);
+
+        // an index that cannot be found by name is treated as having no SSTable indexes
+        StorageAttachedIndex index = (StorageAttachedIndex) store.indexManager.getIndexByName(indexName);
+        Collection<SSTableIndex> actualIndexes = index == null ? Collections.emptyList() : index.getContext().getView().getIndexes();
+        assertEquals("Expected " + sstableIndexCount +" SSTableIndexes, but got " + actualIndexes.toString(), sstableIndexCount, actualIndexes.size());
+    }
+
+    protected void truncate(boolean snapshot)
+    {
+        // truncates the current table; the flag is handed straight to truncateBlocking
+        Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).truncateBlocking(snapshot);
+    }
+
+    protected void rebuildIndexes(String... indexes)
+    {
+        ColumnFamilyStore.rebuildSecondaryIndex(KEYSPACE, currentTable(), indexes); // rebuilds the named indexes of the current table
+    }
+
+    protected void reloadSSTableIndex()
+    {
+        // asks the storage-attached index group of the current table to reload
+        StorageAttachedIndexGroup.getIndexGroup(Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable())).unsafeReload();
+    }
+
+    protected void runInitializationTask() throws Exception
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        for (Index index : store.indexManager.listIndexes())
+        {
+            assert index instanceof StorageAttachedIndex;
+            store.indexManager.makeIndexNonQueryable(index, Index.Status.BUILD_FAILED); // flag as BUILD_FAILED before building again
+            store.indexManager.buildIndex(index).get(); // block on the result of the build
+        }
+    }
+
+    protected int getCompactionTasks()
+    {
+        return CompactionManager.instance.getActiveCompactions() + CompactionManager.instance.getPendingTasks(); // active plus pending
+    }
+
+    protected String getSingleTraceStatement(Session session, String query, String contains) throws Throwable
+    {
+        String cql = String.format(query, KEYSPACE + "." + currentTable());
+        QueryTrace trace = session.execute(session.prepare(cql).bind().enableTracing()).getExecutionInfo().getQueryTrace();
+        waitForTracingEvents();
+        for (QueryTrace.Event event : trace.getEvents())
+        {
+            String description = event.getDescription();
+            if (description.contains(contains))
+                return description; // first matching event wins
+        }
+        return null; // no trace event contained the requested text
+    }
+
+    protected void assertNumRows(int expected, String query, Object... args) throws Throwable
+    {
+        int actual = executeNet(String.format(query, args)).all().size(); // number of rows returned by the formatted query
+        assertEquals(expected, actual);
+    }
+
+    protected static Injection newFailureOnEntry(String name, Class<?> invokeClass, String method, Class<? extends Throwable> exception)
+    {
+        return Injections.newCustom(name)
+                         .add(newInvokePoint().onClass(invokeClass).onMethod(method)) // where: the given method of the given class
+                         .add(newActionBuilder().actions().doThrow(exception, quote("Injected failure!"))) // what: throw the given exception type
+                         .build();
+    }
+
+    protected int snapshot(String snapshotName)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        // the snapshot call returns the SSTables it covered; only their number is reported
+        return store.snapshot(snapshotName).size();
+    }
+
+    protected List<String> restoreSnapshot(String snapshot)
+    {
+        ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        // lister restricted to the named snapshot, created with OnTxnErr.IGNORE
+        return restore(store, store.getDirectories().sstableLister(Directories.OnTxnErr.IGNORE).snapshots(snapshot));
+    }
+
+    protected List<String> restore(ColumnFamilyStore cfs, Directories.SSTableLister lister)
+    {
+        String targetPrefix = cfs.getDirectories().getDirectoryForNewSSTables().getAbsoluteFile() + File.separator;
+
+        // move each listed file into the directory for new SSTables, remembering only the names that were moved
+        List<String> moved = new ArrayList<>();
+        for (File source : lister.listFiles())
+        {
+            String name = source.getName();
+            if (source.renameTo(new File(targetPrefix + name)))
+                moved.add(name);
+        }
+        cfs.loadNewSSTables();
+        return moved;
+    }
+
+    protected void assertValidationCount(int perSSTable, int perColumn)
+    {
+        assertEquals(perSSTable, perSSTableValidationCounter.get()); // current value of the per-SSTable validation counter
+        assertEquals(perColumn, perColumnValidationCounter.get()); // current value of the per-column validation counter
+    }
+
+    protected void resetValidationCount()
+    {
+        perSSTableValidationCounter.reset(); // both validation counters are reset together
+        perColumnValidationCounter.reset();
+    }
+
+    protected long indexFilesLastModified()
+    {
+        return indexFiles().stream().mapToLong(File::lastModified).max().orElse(0L); // 0 when there are no index files
+    }
+
+    protected void verifyIndexComponentsIncludedInSSTable() throws Exception
+    {
+        verifySSTableComponents(currentTable(), true); // true: every live SSTable must track all index components
+    }
+
+    protected void verifyIndexComponentsNotIncludedInSSTable() throws Exception
+    {
+        verifySSTableComponents(currentTable(), false); // false: no live SSTable may list index components
+    }
+
+    private void verifySSTableComponents(String table, boolean indexComponentsExist) throws Exception
+    {
+        ColumnFamilyStore cfs = Objects.requireNonNull(Schema.instance.getKeyspaceInstance(KEYSPACE)).getColumnFamilyStore(table);
+        for (SSTable sstable : cfs.getLiveSSTables())
+        {
+            Set<Component> components = sstable.components;
+            StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+            Set<Component> ndiComponents = group == null ? Collections.emptySet() : group.getComponents();
+            // index components of the group that this SSTable does not list among its own components
+            Set<Component> diff = Sets.difference(ndiComponents, components);
+            if (indexComponentsExist)
+                assertTrue("Expect all index components are tracked by SSTable, but " + diff + " are not included.",
+                           !ndiComponents.isEmpty() && diff.isEmpty());
+            else
+                assertFalse("Expect no index components, but got " + components, components.toString().contains(IndexComponents.TYPE_PREFIX));
+            // in both modes the TOC read back for the SSTable must equal sstable.components
+            Set<Component> tocContents = SSTable.readTOC(sstable.descriptor);
+            assertEquals(components, tocContents);
+        }
+    }
+
+    private Set<File> componentFiles(Collection<File> indexFiles, Component component)
+    {
+        return indexFiles.stream().filter(file -> file.getName().endsWith(component.name)).collect(Collectors.toSet()); // files whose name ends with the component name
+    }
+
+    private Set<File> componentFiles(Collection<File> indexFiles, String shortName)
+    {
+        String expectedEnding = String.format("_%s.db", shortName); // keeps files whose name ends with _<shortName>.db
+        return indexFiles.stream().filter(file -> file.getName().endsWith(expectedEnding)).collect(Collectors.toSet());
+    }
+
+    /**
+     * Run repeated verification task concurrently with target test; start() rethrows any verification failure
+     */
+    protected static class TestWithConcurrentVerification
+    {
+        private final Runnable verificationTask;
+        private final CountDownLatch verificationStarted = new CountDownLatch(1);
+        private final Runnable targetTask;
+        private final CountDownLatch taskCompleted = new CountDownLatch(1);
+        private final int verificationIntervalInMs;
+        private final int verificationMaxInMs = 300_000; // 300s
+        private volatile Throwable verificationFailure; // set by the verification thread, read by start()
+
+        public TestWithConcurrentVerification(Runnable verificationTask, Runnable targetTask)
+        {
+            this(verificationTask, targetTask, 10);
+        }
+
+        /**
+         * @param verificationTask to be run concurrently with target task
+         * @param targetTask task to be performed once
+         * @param verificationIntervalInMs interval between each verification task, -1 to run verification task once
+         */
+        public TestWithConcurrentVerification(Runnable verificationTask, Runnable targetTask, int verificationIntervalInMs)
+        {
+            this.verificationTask = verificationTask;
+            this.targetTask = targetTask;
+            this.verificationIntervalInMs = verificationIntervalInMs;
+        }
+
+        public void start()
+        {
+            Thread verificationThread = new Thread(() -> {
+                verificationStarted.countDown();
+                try
+                {
+                    while (true)
+                    {
+                        verificationTask.run();
+                        if (verificationIntervalInMs < 0 || taskCompleted.await(verificationIntervalInMs, TimeUnit.MILLISECONDS))
+                            break;
+                    }
+                }
+                catch (Throwable e)
+                {
+                    verificationFailure = e; // an exception thrown here would die with this thread, so hand it to start()
+                }
+            });
+            try
+            {
+                verificationThread.start();
+                verificationStarted.await();
+                targetTask.run();
+                taskCompleted.countDown();
+                verificationThread.join(verificationMaxInMs);
+            }
+            catch (InterruptedException e)
+            {
+                throw Throwables.unchecked(e);
+            }
+            finally
+            {
+                taskCompleted.countDown(); // no-op after a normal run; stops the verifier if the target task threw
+            }
+            if (verificationFailure != null)
+                throw Throwables.unchecked(verificationFailure);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java b/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
new file mode 100644
index 000000000000..6d108e320f3d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/analyzer/NonTokenizingAnalyzerTest.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+/**
+ * Tests the case-sensitivity and ASCII options of {@link NonTokenizingAnalyzer} on UTF-8 encoded input.
+ */
+public class NonTokenizingAnalyzerTest
+{
+    @Test
+    public void asciiAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(false);
+        options.setAscii(true);
+        NonTokenizingAnalyzer analyzer = new NonTokenizingAnalyzer(UTF8Type.instance, options);
+
+        // encode as UTF-8 explicitly (the result is decoded as UTF-8 below) rather than with the platform charset
+        String testString = "Éppinger";
+        ByteBuffer toAnalyze = ByteBufferUtil.bytes(testString);
+        analyzer.reset(toAnalyze);
+        ByteBuffer analyzed = null;
+
+        while (analyzer.hasNext())
+        {
+            analyzed = analyzer.next();
+        }
+
+        // case-insensitive with the ascii option: the accented capital is expected back as a plain lower-case 'e'
+        String good = "eppinger";
+        String result = ByteBufferUtil.string(analyzed);
+        assertEquals(good, result);
+    }
+
+    @Test
+    public void asciiAnalyzerFalse() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(true);
+        options.setAscii(false);
+        NonTokenizingAnalyzer analyzer = new NonTokenizingAnalyzer(UTF8Type.instance, options);
+
+        // encode as UTF-8 explicitly (the result is decoded as UTF-8 below) rather than with the platform charset
+        String testString = "Éppinger";
+        ByteBuffer toAnalyze = ByteBufferUtil.bytes(testString);
+        analyzer.reset(toAnalyze);
+        ByteBuffer analyzed = null;
+
+        while (analyzer.hasNext())
+        {
+            analyzed = analyzer.next();
+        }
+
+        // case-sensitive without the ascii option: the input is expected back unchanged
+        String good = "Éppinger";
+        String result = ByteBufferUtil.string(analyzed);
+        assertEquals(good, result);
+    }
+
+    @Test
+    public void caseInsensitiveAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        options.setCaseSensitive(false);
+        NonTokenizingAnalyzer analyzer = new NonTokenizingAnalyzer(UTF8Type.instance, options);
+
+        String testString = "Nip it in the bud";
+        ByteBuffer toAnalyze = ByteBufferUtil.bytes(testString);
+        analyzer.reset(toAnalyze);
+        ByteBuffer analyzed = null;
+
+        while (analyzer.hasNext())
+        {
+            analyzed = analyzer.next();
+        }
+        // the last term produced is expected to be the lower-cased input
+        assertEquals(testString.toLowerCase(), ByteBufferUtil.string(analyzed));
+    }
+
+    @Test
+    public void caseSensitiveAnalyzer() throws Exception
+    {
+        NonTokenizingOptions options = NonTokenizingOptions.getDefaultOptions();
+        NonTokenizingAnalyzer analyzer = new NonTokenizingAnalyzer(UTF8Type.instance, options);
+
+        String testString = "Nip it in the bud";
+        ByteBuffer toAnalyze = ByteBufferUtil.bytes(testString);
+        analyzer.reset(toAnalyze);
+        ByteBuffer analyzed = null;
+
+        while (analyzer.hasNext())
+        {
+            analyzed = analyzer.next();
+        }
+        // with the default options the last term is expected not to have been lower-cased
+        assertNotEquals(testString.toLowerCase(), ByteBufferUtil.string(analyzed));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFiltersTest.java b/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFiltersTest.java
new file mode 100644
index 000000000000..1340f93c9c15
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/analyzer/filter/BasicResultFiltersTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.analyzer.filter;
+
+import java.text.Normalizer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+
+import static org.junit.Assert.assertEquals;
+
+public class BasicResultFiltersTest
+{
+    @Test
+    public void testLowerCase()
+    {
+        BasicResultFilters.LowerCase lowerCase = new BasicResultFilters.LowerCase();
+        int iterations = CQLTester.getRandom().nextIntBetween(100, 1000); // drawn once so the loop bound stays fixed
+        for (int count = 0; count < iterations; count++)
+        {
+            String actual = CQLTester.getRandom().nextTextString(10, 50);
+            assertEquals(actual.toLowerCase(), lowerCase.process(actual));
+        }
+    }
+
+    @Test
+    public void testNormalize()
+    {
+        BasicResultFilters.Normalize normalize = new BasicResultFilters.Normalize();
+        int iterations = CQLTester.getRandom().nextIntBetween(100, 1000); // drawn once so the loop bound stays fixed
+        for (int count = 0; count < iterations; count++)
+        {
+            String actual = CQLTester.getRandom().nextTextString(10, 50);
+            assertEquals(Normalizer.normalize(actual, Normalizer.Form.NFC), normalize.process(actual));
+        }
+    }
+
+    @Test
+    public void testAscii()
+    {
+        BasicResultFilters.Ascii ascii = new BasicResultFilters.Ascii();
+        int iterations = CQLTester.getRandom().nextIntBetween(100, 1000); // drawn once so the loop bound stays fixed
+        for (int count = 0; count < iterations; count++)
+        {
+            String actual = CQLTester.getRandom().nextTextString(100, 5000);
+            // expected value comes from calling foldToASCII directly, with an output buffer four times the input length
+            char[] actualChars = actual.toCharArray();
+            char[] expectedChars = new char[actualChars.length * 4];
+            int expectedSize = BasicResultFilters.foldToASCII(actualChars, 0, expectedChars, 0, actualChars.length);
+            String expected = new String(expectedChars, 0, expectedSize);
+
+            assertEquals(expected, ascii.process(actual));
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
new file mode 100644
index 000000000000..1c98f9ae14e4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
@@ -0,0 +1,437 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Tests that {@code ALLOW FILTERING} is required only if needed.
+ */
+public class AllowFilteringTest extends SAITester
+{
+    @Test
+    public void testAllowFilteringOnFirstClusteringKeyColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, c3 int, v1 int, " +
+                    "PRIMARY KEY ((k1, k2), c1, c2, c3))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c1) USING '%s'", StorageAttachedIndex.class.getName())); // index on c1, the first clustering column
+        waitForIndexQueryable(); // NOTE(review): the boolean given to test(...) presumably means "requires ALLOW FILTERING" - confirm
+
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE c1=0", false);
+        test("SELECT * FROM %s WHERE c1>0", false);
+        test("SELECT * FROM %s WHERE c1>0 AND c1<1", false);
+        
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND c3=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND v1=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE c1=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c1=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c1=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0", false);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND v1=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c3=0", true);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0", false);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0 AND v1=0", true);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0 AND c3>0", false);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0 AND c3>0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0 AND c3=0", false);
+        test("SELECT * FROM %s WHERE c1=0 AND k1=0 AND k2=0 AND c2=0 AND c3=0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3) = (0, 0, 0)", false);
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3) = (0, 0, 0) AND v1=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnNotFirstClusteringKeyColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, c3 int, c4 int, v1 int, " +
+                    "PRIMARY KEY ((k1, k2), c1, c2, c3, c4))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c3) USING '%s'", StorageAttachedIndex.class.getName())); // index on c3, the third of four clustering columns
+        waitForIndexQueryable(); // NOTE(review): the boolean given to test(...) presumably means "requires ALLOW FILTERING" - confirm
+
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE c3=0", false);
+        test("SELECT * FROM %s WHERE c3>0", false);
+        test("SELECT * FROM %s WHERE c3>0 AND c3<1", false);
+        
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND c4=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND v1=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE c3=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c3=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c3=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0", false);
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND v1=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c4=0", true);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c1=0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0 AND c4=0", false);
+        test("SELECT * FROM %s WHERE c3=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0 AND c4=0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0)", false);
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0) AND v1=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnMultipleClusteringKeyColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, c3 int, c4 int, v1 int, " +
+                    "PRIMARY KEY ((k1, k2), c1, c2, c3, c4))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c2) USING '%s'", StorageAttachedIndex.class.getName())); // indexes on two non-adjacent clustering columns: c2 ...
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c4) USING '%s'", StorageAttachedIndex.class.getName())); // ... and c4
+        waitForIndexQueryable(); // NOTE(review): the boolean given to test(...) presumably means "requires ALLOW FILTERING" - confirm
+        
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4>0", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4>0 AND c4<1", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c4=0", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c2<1 AND c4=0", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c4>0", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c2<1 AND c4>0 AND c4<1", false);
+
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND c3=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0 AND k2=0", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0 AND k2=0 AND c3=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0 AND k2=0 AND v1=0", true);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE k1=2 AND k2=3 AND c1=4 AND c2=0 AND c4=1", true);
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND c1=0 AND c2=0 AND c4=0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0 AND k2=0 AND c1=0 AND c3=0", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND k1=0 AND k2=0 AND c1=0 AND c3=0 AND v1=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0)", false);
+        test("SELECT * FROM %s WHERE k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0) AND v1=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnSingleRegularColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, v1 int, v2 int, PRIMARY KEY ((k1, k2), c1, c2))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName())); // index on the regular column v1; v2 stays unindexed
+        waitForIndexQueryable(); // NOTE(review): the boolean given to test(...) presumably means "requires ALLOW FILTERING" - confirm
+
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE v1=0", false);
+        test("SELECT * FROM %s WHERE v1>0", false);
+        test("SELECT * FROM %s WHERE v1>0 AND v1<1", false);
+
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE v1=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND v2=0", true);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND c1=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND c1=0 AND v2=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0 AND v2=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND (c1, c2) = (0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND k1=0 AND k2=0 AND (c1, c2) = (0, 0) AND v2=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnMultipleRegularColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, v1 int, v2 int, v3 int, " +
+                    "PRIMARY KEY ((k1, k2), c1, c2))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName())); // indexes on the regular columns v1 ...
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", StorageAttachedIndex.class.getName())); // ... and v2; v3 stays unindexed
+        waitForIndexQueryable(); // NOTE(review): the boolean given to test(...) presumably means "requires ALLOW FILTERING" - confirm
+
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0", false);
+        test("SELECT * FROM %s WHERE v1>0 AND v2=0", false);
+        test("SELECT * FROM %s WHERE v1>0 AND v1<1 AND v2=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2>0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2>0 AND v2<1", false);
+        test("SELECT * FROM %s WHERE v1>0 AND v1<1 AND v2>0 AND v2<1", false);
+
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND v3=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND c2=0", true);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND v3=0", true);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND v3=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND c2=0 AND v3=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND (c1, c2) = (0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND (c1, c2) = (0, 0) AND v3=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnClusteringAndRegularColumns() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, c3 int, c4 int, v1 int, v2 int, v3 int, PRIMARY KEY ((k1, k2), c1, c2, c3, c4))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c2) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c4) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", StorageAttachedIndex.class.getName()));
+        waitForIndexQueryable(); // same guard the sibling tests use before querying freshly created indexes
+
+        // with only index restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c4>0 AND v1>0 AND v2>0", false);
+        test("SELECT * FROM %s WHERE c2>0 AND c2<1 AND c4>0 AND c4<1 AND v1>0 AND v1<1 AND v2>0 AND v2<1", false);
+
+        // with additional simple filtering restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k2=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND c3=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND v3=0", true);
+
+        // with token restrictions
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND token(k1, k2) = token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND token(k1, k2) > token(0, 0)", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND token(k1, k2) > token(0, 0) AND token(k1, k2) <= token(1, 1)", false);
+
+        // with restriction on partition key
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0 AND k2=0", false);
+
+        // with restriction on partition key and clustering key prefix
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0", true);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND v3=0", true);
+
+        // with restriction on partition key and full clustering key
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND c3=0", false);
+        test("SELECT * FROM %s WHERE c2=0 AND c4=0 AND v1=0 AND v2=0 AND k1=0 AND k2=0 AND c1=0 AND c3=0 AND v3=0", true);
+
+        // with restriction on partition key and full clustering key, multicolumn format
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0)", false);
+        test("SELECT * FROM %s WHERE v1=0 AND v2=0 AND k1=0 AND k2=0 AND (c1, c2, c3, c4) = (0, 0, 0, 0) AND v3=0", true);
+    }
+
+    @Test
+    public void testAllowFilteringOnCollectionColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, c1 int, c2 int, l list<int>, s set<int>, m_k map<int,int>,"
+                    + " m_v map<int,int>, m_en map<int, int>, not_indexed list<int>, PRIMARY KEY ((k1, k2), c1, c2))");
+        createIndex("CREATE CUSTOM INDEX ON %s(l) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(s) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(keys(m_k)) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(values(m_v)) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(entries(m_en)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        // a single CONTAINS / CONTAINS KEY / map-entry restriction on an indexed collection
+        test("SELECT * FROM %s WHERE l contains 1", false);
+        test("SELECT * FROM %s WHERE s contains 1", false);
+        test("SELECT * FROM %s WHERE m_k contains key 1", false);
+        test("SELECT * FROM %s WHERE m_v contains 1", false);
+        test("SELECT * FROM %s WHERE m_en[1] = 1", false);
+
+        // multiple contains on different indexed columns
+        test("SELECT * FROM %s WHERE l contains 1 and s contains 2", false);
+        test("SELECT * FROM %s WHERE l contains 1 and m_k contains key 2", false);
+        test("SELECT * FROM %s WHERE l contains 1 and m_v contains 2", false);
+        test("SELECT * FROM %s WHERE l contains 1 and m_en[2] = 2", false);
+        test("SELECT * FROM %s WHERE s contains 1 and l contains 2", false);
+        test("SELECT * FROM %s WHERE s contains 1 and m_k contains key 2", false);
+        test("SELECT * FROM %s WHERE s contains 1 and m_v contains 2", false);
+        test("SELECT * FROM %s WHERE s contains 1 and m_en[2] = 2", false);
+
+        // multiple contains on the same indexed column
+        test("SELECT * FROM %s WHERE l contains 1 and l contains 2", false);
+        test("SELECT * FROM %s WHERE s contains 1 and s contains 2", false);
+        test("SELECT * FROM %s WHERE m_k contains key 1 and m_k contains key 2", false);
+        test("SELECT * FROM %s WHERE m_v contains 1 and m_v contains 2", false);
+        test("SELECT * FROM %s WHERE m_en[1] = 1 and m_en[2] = 2", false);
+
+        // an indexed restriction combined with CONTAINS on the not-indexed list: requires ALLOW FILTERING
+        test("SELECT * FROM %s WHERE l contains 1 and not_indexed contains 2", true);
+        test("SELECT * FROM %s WHERE s contains 1 and not_indexed contains 2", true);
+        test("SELECT * FROM %s WHERE m_k contains key 1 and not_indexed contains 2", true);
+        test("SELECT * FROM %s WHERE m_v contains 1 and not_indexed contains 2", true);
+        test("SELECT * FROM %s WHERE m_en[1] = 1 and not_indexed contains 2", true);
+    }
+
+    private void test(String query, boolean requiresAllowFiltering) throws Throwable
+    {
+        if (!requiresAllowFiltering)
+            assertNotNull(execute(query));
+        else
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE, query);
+        // whatever the flag says, the ALLOW FILTERING form of the query has to execute
+        assertNotNull(execute(query + " ALLOW FILTERING"));
+    }
+
+    @Test
+    public void testUnsupportedIndexRestrictions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, c text, d text, PRIMARY KEY (a, b))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(b) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(d) USING '%s'", StorageAttachedIndex.class.getName()));
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (a, b, c, d) VALUES ('Test1', 'Test1', 'Test1', 'Test1')");
+        execute("INSERT INTO %s (a, b, c, d) VALUES ('Test2', 'Test2', 'Test2', 'Test2')");
+        execute("INSERT INTO %s (a, b, c, d) VALUES ('Test3', 'Test3', 'Test3', 'Test3')");
+
+        // Single restriction: a range (>) predicate on each indexed text column alone is rejected
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"), "SELECT * FROM %s WHERE b > 'Test'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"), "SELECT * FROM %s WHERE c > 'Test'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "d"), "SELECT * FROM %s WHERE d > 'Test'");
+
+        // Supported (=) and unsupported (>) restriction: only the column with the range predicate is reported
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"), "SELECT * FROM %s WHERE b > 'Test' AND c = 'Test1'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"), "SELECT * FROM %s WHERE c > 'Test' AND d = 'Test1'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "d"), "SELECT * FROM %s WHERE d > 'Test' AND b = 'Test1'");
+
+        // Two unsupported restrictions: on one column it is reported alone, on two columns both are listed
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"), "SELECT * FROM %s WHERE b > 'Test' AND b < 'Test3'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, "[b, c]"), "SELECT * FROM %s WHERE c > 'Test' AND b < 'Test3'");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, "[b, d]"), "SELECT * FROM %s WHERE d > 'Test' AND b < 'Test3'");
+
+        // IN restriction: rejected on each indexed column with the single-column message
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"), "SELECT * FROM %s WHERE b IN ('Test1', 'Test2')");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"), "SELECT * FROM %s WHERE c IN ('Test1', 'Test2')");
+        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "d"), "SELECT * FROM %s WHERE d IN ('Test1', 'Test2')");
+
+        // The same queries with ALLOW FILTERING should work and return the matching rows inserted above
+
+        // Single restriction: all three rows are greater than 'Test'
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE b > 'Test' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                    row("Test2", "Test2", "Test2", "Test2"),
+                                                                                                    row("Test3", "Test3", "Test3", "Test3"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c > 'Test' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                    row("Test2", "Test2", "Test2", "Test2"),
+                                                                                                    row("Test3", "Test3", "Test3", "Test3"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d > 'Test' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                    row("Test2", "Test2", "Test2", "Test2"),
+                                                                                                    row("Test3", "Test3", "Test3", "Test3"));
+
+        // Supported and unsupported restriction: the equality narrows the result to the 'Test1' row
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE b > 'Test' AND c = 'Test1' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c > 'Test' AND d = 'Test1' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d > 'Test' AND b = 'Test1' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"));
+
+        // Two unsupported restrictions: the upper bound b < 'Test3' excludes the 'Test3' row
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE b > 'Test' AND b < 'Test3' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                    row("Test2", "Test2", "Test2", "Test2"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c > 'Test' AND b < 'Test3' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                    row("Test2", "Test2", "Test2", "Test2"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d > 'Test' AND b < 'Test3' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                    row("Test2", "Test2", "Test2", "Test2"));
+
+        // IN restriction: exactly the two listed values match
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE b IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
+                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
+    }
+
+    @Test
+    public void testIndexedColumnDoesNotSupportLikeRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b text, c text, d text, PRIMARY KEY (a, b))");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(b) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(d) USING '%s'", StorageAttachedIndex.class.getName()));
+
+        // LIKE restriction: expected to be rejected on each of the three indexed columns, naming that column
+        assertInvalidMessage(String.format(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, "b"), "SELECT * FROM %s WHERE b LIKE 'Test'");
+        assertInvalidMessage(String.format(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, "c"), "SELECT * FROM %s WHERE c LIKE 'Test'");
+        assertInvalidMessage(String.format(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, "d"), "SELECT * FROM %s WHERE d LIKE 'Test'");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/ClusteringKeyIndexTest.java b/test/unit/org/apache/cassandra/index/sai/cql/ClusteringKeyIndexTest.java
new file mode 100644
index 000000000000..44887644844b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/ClusteringKeyIndexTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class ClusteringKeyIndexTest extends SAITester
+{
+    @Before
+    public void createTableAndIndex()
+    {
+        createTable("CREATE TABLE %s (pk1 int, pk2 text, val int, PRIMARY KEY((pk1), pk2)) WITH CLUSTERING ORDER BY (pk2 DESC)");
+        createIndex("CREATE CUSTOM INDEX pk2_idx ON %s(pk2) USING 'StorageAttachedIndex'");
+        // compaction is switched off here; only queryFromCompactedSSTable compacts, and it does so explicitly
+        disableCompaction();
+    }
+
+    @Test
+    public void queryFromMemtable() throws Throwable
+    {
+        insertFirstBatch();
+        insertSecondBatch();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromSingleSSTable() throws Throwable
+    {
+        insertFirstBatch();
+        insertSecondBatch();
+        flush();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromMultipleSSTables() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        flush();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromMemtableAndSSTables() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromCompactedSSTable() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        flush();
+        compact();
+        runQueries();
+    }
+
+    private void insertFirstBatch() throws Throwable
+    {
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (1, '1', 1)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (2, '2', 2)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (3, '3', 3)");
+    }
+
+    private void insertSecondBatch() throws Throwable
+    {
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (4, '4', 4)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (5, '5', 5)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (6, '6', 6)");
+    }
+
+    private Object[] expectedRow(int n)
+    {
+        return row(n, String.valueOf(n), n);
+    }
+
+    private void runQueries() throws Throwable
+    {
+        // restriction on the partition key only
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 = 2"), expectedRow(2));
+        // restriction on the indexed clustering column only
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk2 = '2'"), expectedRow(2));
+        // partition key -1 is never inserted, so no rows are expected
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 = -1 AND pk2 = '2'"));
+        // val carries no index: the query must fail with a message pointing at ALLOW FILTERING
+        assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE pk1 = -1 AND val = 2")).hasMessageContaining("use ALLOW FILTERING");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/CollectionIndexingTest.java b/test/unit/org/apache/cassandra/index/sai/cql/CollectionIndexingTest.java
new file mode 100644
index 000000000000..5fcb0e384956
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/CollectionIndexingTest.java
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.Arrays;
+import java.util.HashMap;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.junit.Assert.assertEquals;
+
+// Covers basic functionality plus edge and error conditions of StorageAttachedIndex on map, frozen map
+// and frozen list columns. Comprehensive type testing of collections is in the cql/types/collections package.
+// TODO Sort out statement restrictions assertion: assertUnsupportedIndexOperator() at the bottom of this
+// class is currently a no-op, so only the ALLOW FILTERING half of the tests that call it is verified.
+public class CollectionIndexingTest extends SAITester
+{
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+    }
+
+    @Test
+    public void indexMap() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(2, execute("SELECT * FROM %s WHERE value CONTAINS 'v1'").size());
+    }
+
+    @Test
+    public void indexMapKeys() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(KEYS(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(2, execute("SELECT * FROM %s WHERE value CONTAINS KEY 1").size());
+    }
+
+    @Test
+    public void indexMapValues() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(2, execute("SELECT * FROM %s WHERE value CONTAINS 'v1'").size());
+    }
+
+    @Test
+    public void indexMapEntries() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(ENTRIES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(2, execute("SELECT * FROM %s WHERE value[1] = 'v1'").size());
+        assertEquals(1, execute("SELECT * FROM %s WHERE value[1] = 'v1' AND value[2] = 'v2'").size());
+    }
+
+    @Test
+    public void indexFrozenList() throws Throwable
+    {
+        createPopulatedFrozenList();
+        createIndex("CREATE CUSTOM INDEX ON %s(FULL(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(2, execute("SELECT * FROM %s WHERE value = ?", Arrays.asList(1, 2, 3)).size());
+    }
+
+    @Test
+    public void indexFrozenMap() throws Throwable
+    {
+        createPopulatedFrozenMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(FULL(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertEquals(1, execute("SELECT * FROM %s WHERE value = ?", new HashMap<Integer, String>() {{
+            put(1, "v1");
+            put(2, "v2");
+        }}).size());
+
+    }
+
+    @Test
+    public void indexFrozenMapQueryKeys() throws Throwable
+    {
+        createPopulatedFrozenMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(FULL(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains key 1");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains key 1 ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexFrozenMapQueryValues() throws Throwable
+    {
+        createPopulatedFrozenMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(FULL(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains 'v1'");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains 'v1' ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexFrozenMapQueryEntries() throws Throwable
+    {
+        createPopulatedFrozenMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(FULL(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertInvalidMessage("Map-entry equality predicates on frozen map column value are not supported",
+                "SELECT * FROM %s WHERE value[1] = 'v1'");
+    }
+
+    @Test
+    public void indexMapEntriesQueryEq() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(ENTRIES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertInvalidMessage("Collection column 'value' (map<int, text>) cannot be restricted by a '=' relation",
+                "SELECT * FROM %s WHERE value = ?", Arrays.asList(1, 2));
+    }
+
+    @Test
+    public void indexMapEntriesQueryKeys() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(ENTRIES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains key 1");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains key 1 ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexMapEntriesQueryValues() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(ENTRIES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains 'v1'");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains 'v1' ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexMapKeysQueryEq() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(KEYS(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertInvalidMessage("Collection column 'value' (map<int, text>) cannot be restricted by a '=' relation",
+                "SELECT * FROM %s WHERE value = ?", Arrays.asList(1, 2));
+    }
+
+    @Test
+    public void indexMapKeysQueryValues() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(KEYS(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains 'v1'");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains 'v1' ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexMapKeysQueryEntries() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(KEYS(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value[1] = 'v1'");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value[1] = 'v1' ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexMapValuesQueryEq() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertInvalidMessage("Collection column 'value' (map<int, text>) cannot be restricted by a '=' relation",
+                "SELECT * FROM %s WHERE value = ?", Arrays.asList(1, 2));
+    }
+
+    @Test
+    public void indexMapValuesQueryKeys() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value contains key 1");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value contains key 1 ALLOW FILTERING").size());
+    }
+
+    @Test
+    public void indexMapValuesQueryEntries() throws Throwable
+    {
+        createPopulatedMap();
+        createIndex("CREATE CUSTOM INDEX ON %s(VALUES(value)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        assertUnsupportedIndexOperator("SELECT * FROM %s WHERE value[1] = 'v1'");
+        assertEquals(2, execute("SELECT * FROM %s WHERE value[1] = 'v1' ALLOW FILTERING").size());
+    }
+
+    private void createPopulatedMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int primary key, value map<int, text>)");
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 1, new HashMap<Integer, String>() {{
+            put(1, "v1");
+            put(2, "v2");
+        }});
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 2, new HashMap<Integer, String>() {{
+            put(1, "v1");
+            put(2, "v3");
+        }});
+    }
+
+    private void createPopulatedFrozenMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int primary key, value frozen<map<int, text>>)");
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 1, new HashMap<Integer, String>() {{
+            put(1, "v1");
+            put(2, "v2");
+        }});
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 2, new HashMap<Integer, String>() {{
+            put(1, "v1");
+            put(2, "v3");
+        }});
+    }
+
+    private void createPopulatedFrozenList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int primary key, value frozen<list<int>>)");
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 1, Arrays.asList(1, 2, 3));
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 2, Arrays.asList(1, 2, 3));
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 3, Arrays.asList(4, 5, 6));
+        execute("INSERT INTO %s (pk, value) VALUES (?, ?)", 4, Arrays.asList(1, 2, 7));
+    }
+
+    private void assertUnsupportedIndexOperator(String query, Object... values) throws Throwable
+    {
+        // Deliberately empty for now (see the TODO on the class): query and values are ignored. Disabled check:
+        // assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "value"), query, values);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java b/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java
new file mode 100644
index 000000000000..52c74c506839
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/CompositePartitionKeyIndexTest.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Verifies SAI queries on both columns of a composite partition key across memtable and SSTable layouts. */
+public class CompositePartitionKeyIndexTest extends SAITester
+{
+    @Before
+    public void createTableAndIndex()
+    {
+        createTable("CREATE TABLE %s (pk1 int, pk2 text, val int, PRIMARY KEY((pk1, pk2)))");
+        createIndex("CREATE CUSTOM INDEX ON %s(pk1) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(pk2) USING 'StorageAttachedIndex'");
+
+        disableCompaction();
+    }
+
+    // Rows 1 to 3: pk1, pk2 and val of row i all carry the value i.
+    private void insertFirstBatch() throws Throwable
+    {
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (1, '1', 1)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (2, '2', 2)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (3, '3', 3)");
+    }
+
+    // Rows 4 to 6, following the same pattern as the first batch.
+    private void insertSecondBatch() throws Throwable
+    {
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (4, '4', 4)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (5, '5', 5)");
+        execute("INSERT INTO %s (pk1, pk2, val) VALUES (6, '6', 6)");
+    }
+
+    @Test
+    public void queryFromMemtable() throws Throwable
+    {
+        insertFirstBatch();
+        insertSecondBatch();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromSingleSSTable() throws Throwable
+    {
+        insertFirstBatch();
+        insertSecondBatch();
+        flush();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromMultipleSSTables() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        flush();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromMemtableAndSSTables() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        runQueries();
+    }
+
+    @Test
+    public void queryFromCompactedSSTable() throws Throwable
+    {
+        insertFirstBatch();
+        flush();
+        insertSecondBatch();
+        flush();
+        compact();
+        runQueries();
+    }
+
+    private Object[] expectedRow(int index)
+    {
+        return row(index, Integer.toString(index), index);
+    }
+
+    /** Assertions shared by every test; they expect rows 1 to 6 to have been inserted beforehand. */
+    private void runQueries() throws Throwable
+    {
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 = 2"), expectedRow(2));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 > 1"),
+                                expectedRow(2),
+                                expectedRow(3),
+                                expectedRow(4),
+                                expectedRow(5),
+                                expectedRow(6));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 >= 3"),
+                                expectedRow(3),
+                                expectedRow(4),
+                                expectedRow(5),
+                                expectedRow(6));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 < 3"), expectedRow(1), expectedRow(2));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 <= 3"),
+                                expectedRow(1),
+                                expectedRow(2),
+                                expectedRow(3));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk2 = '2'"), expectedRow(2));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 > 1 AND pk2 = '2'"), expectedRow(2));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE pk1 = -1 AND pk2 = '2'"));
+
+        // val is not indexed in this test, so the query is expected to be rejected with an ALLOW FILTERING hint
+        assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE pk1 = -1 AND val = 2"))
+                .hasMessageContaining("use ALLOW FILTERING");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
new file mode 100644
index 000000000000..5793f9d9d4e2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
@@ -0,0 +1,612 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.ForwardingList;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.SimpleStatement;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.cql3.CQLTester.KEYSPACE;
+
+public interface DataModel
+{
+    String SIMPLE_SELECT_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? LIMIT ?"; // "%%s" yields a literal "%s" after String.format(); presumably the table-name slot
+    String SIMPLE_SELECT_WITH_FILTERING_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? LIMIT ? ALLOW FILTERING";
+    String TWO_CLAUSE_AND_QUERY_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? AND %s %s ? LIMIT ?";
+    String TWO_CLAUSE_AND_QUERY_FILTERING_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? AND %s %s ? LIMIT ? ALLOW FILTERING";
+    String THREE_CLAUSE_AND_QUERY_FILTERING_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? AND %s %s ? AND %s %s ? LIMIT ? ALLOW FILTERING";
+
+    String ASCII_COLUMN = "abbreviation"; // the column names below are paired with their CQL types in NORMAL_COLUMNS
+    String BIGINT_COLUMN = "gdp";
+    String BOOLEAN_COLUMN = "active";
+    String DATE_COLUMN = "visited";
+    String DOUBLE_COLUMN = "area_sq_miles";
+    String FLOAT_COLUMN = "murder_rate";
+    String INET_COLUMN = "ip";
+    String INT_COLUMN = "population";
+    String SMALLINT_COLUMN = "murders_per_year";
+    String TINYINT_COLUMN = "tiny_murders_per_year";
+    String TEXT_COLUMN = "name";
+    String TIME_COLUMN = "avg_dmv_wait";
+    String TIMESTAMP_COLUMN = "visited_timestamp";
+    String UUID_COLUMN = "id";
+    String TIMEUUID_COLUMN = "temporal_id";
+    String NON_INDEXED_COLUMN = "non_indexed";
+
+    Set<String> skipColumns = Sets.newHashSet(NON_INDEXED_COLUMN, BOOLEAN_COLUMN); // columns for which BaseDataModel.createIndexes() creates no index
+
+    int DEFAULT_TTL_SECONDS = 10; // TTL that BaseDataModel.insertRowsWithTTL() puts on the rows selected by deletable()
+
+    List<Pair<String, String>> NORMAL_COLUMNS = // (column name, CQL type name) pairs, in table-definition order
+            ImmutableList.<Pair<String, String>>builder()
+                    .add(Pair.create(ASCII_COLUMN, CQL3Type.Native.ASCII.toString()))
+                    .add(Pair.create(BIGINT_COLUMN, CQL3Type.Native.BIGINT.toString()))
+                    .add(Pair.create(BOOLEAN_COLUMN, CQL3Type.Native.BOOLEAN.toString()))
+                    .add(Pair.create(DATE_COLUMN, CQL3Type.Native.DATE.toString()))
+                    .add(Pair.create(DOUBLE_COLUMN, CQL3Type.Native.DOUBLE.toString()))
+                    .add(Pair.create(FLOAT_COLUMN, CQL3Type.Native.FLOAT.toString()))
+                    .add(Pair.create(INET_COLUMN, CQL3Type.Native.INET.toString()))
+                    .add(Pair.create(INT_COLUMN, CQL3Type.Native.INT.toString()))
+                    .add(Pair.create(SMALLINT_COLUMN, CQL3Type.Native.SMALLINT.toString()))
+                    .add(Pair.create(TINYINT_COLUMN, CQL3Type.Native.TINYINT.toString()))
+                    .add(Pair.create(TEXT_COLUMN, CQL3Type.Native.TEXT.toString()))
+                    .add(Pair.create(TIME_COLUMN, CQL3Type.Native.TIME.toString()))
+                    .add(Pair.create(TIMESTAMP_COLUMN, CQL3Type.Native.TIMESTAMP.toString()))
+                    .add(Pair.create(UUID_COLUMN, CQL3Type.Native.UUID.toString()))
+                    .add(Pair.create(TIMEUUID_COLUMN, CQL3Type.Native.TIMEUUID.toString()))
+                    .add(Pair.create(NON_INDEXED_COLUMN, CQL3Type.Native.INT.toString()))
+                    .build();
+
+    List<String> NORMAL_COLUMN_DATA = // one string of comma-separated CQL literals per row, in NORMAL_COLUMNS order
+            ImmutableList.<String>builder()
+                    .add("'AK',   500000000,  true, '2009-07-15', 570640.95,  7.7,   '158.145.20.64',   737709,  164,  16,        'Alaska', '00:18:20', '2009-07-15T00:00:00', e37394dc-d17b-11e8-a8d5-f2801f1b9fd1, acfe5ada-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'AL',  1000000000,  true, '2011-09-13',  50645.33,  7.0,   '206.16.212.91',  4853875,   57,   5,       'Alabama', '01:04:00', '2011-09-13T00:00:00', b7373af6-d7c1-45ae-b145-5bf4b5cdd00c, c592c37e-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'AR',  2000000000, false, '2013-06-17', 113594.08,  5.5,  '170.94.194.134',  2977853,   99,   9,      'Arkansas', '00:55:23', '2013-06-17T00:00:00', a0daaeb4-c8a2-4c68-9899-e32d08238550, cfaae67a-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'CA',  3000000000,  true, '2012-06-17', 155779.22,  4.8,    '67.157.98.46', 38993940, 1861, 117,    'California', '01:30:45', '2012-06-17T00:00:00', 96232af0-0af7-438b-9049-c5a5a944ff93, d7e80692-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'DE',  4000000000, false, '2013-06-17',   1948.54,  6.7,   '167.21.128.20',   944076,   63,   6,      'Delaware', '00:23:45', '2013-06-17T00:00:00', b2a0a879-5223-40d2-9671-775ee209b6f2, dd10a5b6-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'ID',  4500000000, false, '2015-06-18',  82643.12,  1.8,   '164.165.67.10',  1652828,   30,   3,         'Idaho', '00:18:45', '2015-06-18T00:00:00', c6eec0b0-0eef-40e8-ac38-3a82110443e4, e2788780-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'KY',  4750000000, false, '2018-03-12',  39486.34,  4.7,  '205.204.196.64',  4424611,  209,  20,      'Kentucky', '00:45:00', '2018-03-12T00:00:00', 752355f8-405b-4d94-88f3-9992cda30f1e, e7c4e1d4-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'LA',  4800000000,  true, '2013-06-10',  43203.90, 10.2,  '204.196.242.71',  4668960,  474,  47,     'Louisiana', '00:56:07', '2013-06-10T00:00:00', 17be691a-c1a4-4467-a4ad-64605c74fb1c, ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1, 1")
+                    .add("'MA',  5000000000,  true,  '2010-07-04',   7800.06,  1.9,   '170.63.206.57',  6784240,  126,  12, 'Massachusetts', '01:01:34', '2010-07-04T00:00:00', e8a3c287-78cf-46b5-b554-42562e7dcfb3, f57a3b62-d17c-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'MI',  6000000000, false, '2011-09-13',  56538.90,  5.8,    '23.72.184.64',  9917715,  571,  57,      'Michigan', '00:43:09', '2011-09-13T00:00:00', a0daaeb4-c8a2-4c68-9899-e32d08238550, 0497b886-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'MS',  7000000000,  true, '2013-06-17',  46923.27,  5.3,   '192.251.58.38',  2989390,  159,  15,   'Mississippi', '01:04:23', '2013-06-17T00:00:00', 96232af0-0af7-438b-9049-c5a5a944ff93, 0b0205e6-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'TN',  8000000000, false, '2018-03-10',  41234.90,  6.1, '170.141.221.177',  6595056,  402,  40,     'Tennessee', '00:39:45', '2018-03-10T00:00:00', b2a0a879-5223-40d2-9671-775ee209b6f2, 105dc746-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'TX',  9000000000,  true, '2014-06-17', 261231.71,  4.7,   '204.66.40.181', 27429639, 1276, 107,         'Texas', '00:38:13', '2014-06-17T00:00:00', c6eec0b0-0eef-40e8-ac38-3a82110443e4, 155b6bcc-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'UT',  9250000000,  true, '2014-06-20',  82169.62,  1.8,   '204.113.13.48',  2990632,   54,   5,          'Utah', '00:25:00', '2014-06-20T00:00:00', 752355f8-405b-4d94-88f3-9992cda30f1e, 1a267c50-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'VA',  9500000000,  true, '2018-06-19',  39490.09,  4.6,  '152.130.96.221',  8367587,  383,  38,      'Virginia', '00:43:07', '2018-06-19T00:00:00', 17be691a-c1a4-4467-a4ad-64605c74fb1c, 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .add("'WY', 10000000000, false, '2015-06-17',  97093.14,  2.7,  '192.146.215.91',   586107,   57,   5,       'Wyoming', '00:15:50', '2015-06-17T00:00:00', e8a3c287-78cf-46b5-b554-42562e7dcfb3, 2576612e-d17d-11e8-a8d5-f2801f1b9fd1, 2")
+                    .build();
+
+    String STATIC_INT_COLUMN = "entered";
+
+    List<Pair<String, String>> STATIC_COLUMNS = // the static int column followed by every entry of NORMAL_COLUMNS
+            ImmutableList.<Pair<String, String>>builder().add(Pair.create(STATIC_INT_COLUMN, CQL3Type.Native.INT.toString() + " static"))
+                                                         .addAll(NORMAL_COLUMNS).build();
+
+    List<String> STATIC_COLUMN_DATA = ImmutableList.of("1819, " + NORMAL_COLUMN_DATA.get(0), // each row is prefixed with its static value; consecutive row pairs share one value
+                                                       "1819, " + NORMAL_COLUMN_DATA.get(1),
+                                                       "1850, " + NORMAL_COLUMN_DATA.get(2),
+                                                       "1850, " + NORMAL_COLUMN_DATA.get(3),
+                                                       "1910, " + NORMAL_COLUMN_DATA.get(4),
+                                                       "1910, " + NORMAL_COLUMN_DATA.get(5),
+                                                       "1792, " + NORMAL_COLUMN_DATA.get(6),
+                                                       "1792, " + NORMAL_COLUMN_DATA.get(7),
+                                                       "1788, " + NORMAL_COLUMN_DATA.get(8),
+                                                       "1788, " + NORMAL_COLUMN_DATA.get(9),
+                                                       "1817, " + NORMAL_COLUMN_DATA.get(10),
+                                                       "1817, " + NORMAL_COLUMN_DATA.get(11),
+                                                       "1896, " + NORMAL_COLUMN_DATA.get(12),
+                                                       "1896, " + NORMAL_COLUMN_DATA.get(13),
+                                                       "1845, " + NORMAL_COLUMN_DATA.get(14),
+                                                       "1845, " + NORMAL_COLUMN_DATA.get(15));
+
+    DataModel withTableOptions(String tableOptions) throws Throwable; // BaseDataModel appends the options to its CREATE TABLE statement and returns itself
+
+    List<Pair<String, String>> keyColumns(); // (name, CQL type) pairs of the primary key columns
+
+    void createTables(SAITester tester) throws Throwable; // BaseDataModel creates an indexed and a non-indexed table from one definition
+
+    void createIndexes(SAITester tester) throws Throwable; // BaseDataModel indexes each column that is not in skipColumns
+
+    void flush(SAITester tester) throws Throwable; // flush, disableCompaction and compact act on both tables in BaseDataModel
+
+    void disableCompaction(SAITester tester) throws Throwable;
+
+    void compact(SAITester tester) throws Throwable;
+
+    void truncateTables(SAITester tester) throws Throwable;
+
+    void insertRows(SAITester tester) throws Throwable; // the write methods below are applied to both tables by BaseDataModel
+
+    void insertRowsWithTTL(SAITester tester) throws Throwable; // BaseDataModel adds a TTL only to the rows selected by deletable()
+
+    void updateCells(SAITester tester) throws Throwable;
+
+    void deleteCells(SAITester tester) throws Throwable;
+
+    void deleteRows(SAITester tester) throws Throwable;
+
+    ResultSet executeIndexed(SAITester tester, String query, Object... values) throws Throwable; // BaseDataModel runs the query on the indexed table via tester.sessionNet()
+
+    ResultSet executeIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable; // same, with the statement's fetch size set
+
+    ResultSet executeNonIndexed(SAITester tester, String query, Object... values) throws Throwable; // BaseDataModel runs the query on the non-indexed table
+
+    ResultSet executeNonIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable; // same, with the statement's fetch size set
+
+    class BaseDataModel implements DataModel
+    {
+        final List<Pair<String, String>> columns;
+        final String columnNames;
+        final List<String> rows;
+
+        String tableOptions = "";
+        String indexedTable;
+        String nonIndexedTable;
+
+        List<Pair<String, String>> keyColumns;
+        String primaryKey;
+        List<String> keys;
+
+        BaseDataModel(List<Pair<String, String>> columns, List<String> rows)
+        {
+            this.keyColumns = ImmutableList.of(Pair.create("p", "int"));
+            this.primaryKey = keyColumns.stream().map(pair -> pair.left).collect(Collectors.joining(", "));
+
+            this.columns = columns;
+            this.columnNames = columns.stream().map(pair -> pair.left).collect(Collectors.joining(", "));
+            this.rows = rows;
+
+            this.keys = new SimplePrimaryKeyList(rows.size());
+        }
+
+        public DataModel withTableOptions(String tableOptions)
+        {
+            this.tableOptions = tableOptions;
+            return this;
+        }
+
+        public List<Pair<String, String>> keyColumns()
+        {
+            return keyColumns;
+        }
+
+        public void createTables(SAITester tester)
+        {
+            String keyColumnDefs = keyColumns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
+            String normalColumnDefs = columns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
+
+            String template = "CREATE TABLE %%s (%s, %s, PRIMARY KEY (%s))" + tableOptions;
+            indexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+            nonIndexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+        }
+
+        public void truncateTables(SAITester tester) throws Throwable
+        {
+            executeLocal(tester, "TRUNCATE TABLE %s");
+            executeLocal(tester, "TRUNCATE TABLE %s");
+        }
+
+        public void createIndexes(SAITester tester) throws Throwable
+        {
+            String template = "CREATE CUSTOM INDEX ndi_%s_index_%s ON %%s (%s) USING 'StorageAttachedIndex'";
+
+            for (Pair<String, String> column : columns)
+            {
+                if (!skipColumns.contains(column.left))
+                {
+                    executeLocalIndexed(tester, String.format(template, column.left, indexedTable, column.left));
+                    tester.waitForIndexQueryable();
+                }
+            }
+        }
+
+        public void flush(SAITester tester) throws Throwable
+        {
+            tester.flush(KEYSPACE, indexedTable);
+            tester.flush(KEYSPACE, nonIndexedTable);
+        }
+
+        public void disableCompaction(SAITester tester) throws Throwable
+        {
+            tester.disableCompaction(KEYSPACE, indexedTable);
+            tester.disableCompaction(KEYSPACE, nonIndexedTable);
+        }
+
+        public void compact(SAITester tester) throws Throwable
+        {
+            tester.compact(KEYSPACE, indexedTable);
+            tester.compact(KEYSPACE, nonIndexedTable);
+        }
+
+        public void insertRows(SAITester tester) throws Throwable
+        {
+            String template = "INSERT INTO %%s (%s, %s) VALUES (%s, %s)";
+
+            for (int i = 0; i < keys.size(); i++)
+            {
+                executeLocal(tester, String.format(template, primaryKey, columnNames, keys.get(i), rows.get(i)));
+            }
+        }
+
+        public void insertRowsWithTTL(SAITester tester) throws Throwable
+        {
+            String template = "INSERT INTO %%s (%s, %s) VALUES (%s, %s)%s";
+
+            for (int i = 0; i < keys.size(); i++)
+            {
+                String ttl = deletable().contains(i) ? " USING TTL " + DEFAULT_TTL_SECONDS : "";
+                executeLocal(tester, String.format(template, primaryKey, columnNames, keys.get(i), rows.get(i), ttl));
+            }
+        }
+
+        public void updateCells(SAITester tester) throws Throwable
+        {
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0", BIGINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 1", BOOLEAN_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2018-03-10' WHERE p = 2", DATE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 8788.06 WHERE p = 3", DOUBLE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 2.9 WHERE p = 4", FLOAT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '205.204.196.65' WHERE p = 5", INET_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 27429638 WHERE p = 6", INT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 31 WHERE p = 7", SMALLINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 116 WHERE p = 8", TINYINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 'State of Michigan' WHERE p = 9", TEXT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '00:20:26' WHERE p = 10", TIME_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2009-07-16T00:00:00' WHERE p = 11", TIMESTAMP_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = e37394dc-d17b-11e8-a8d5-f2801f1b9fd1 WHERE p = 12", UUID_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1 WHERE p = 13", TIMEUUID_COLUMN));
+        }
+
+        public void deleteCells(SAITester tester) throws Throwable
+        {
+            for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
+            {
+                executeLocal(tester, String.format("DELETE %s FROM %%s WHERE p = %s", NORMAL_COLUMNS.get(i).left, i));
+            }
+        }
+
+        public void deleteRows(SAITester tester) throws Throwable
+        {
+            String template = "DELETE FROM %%s WHERE p = %d";
+
+            for (int deleted : deletable())
+            {
+                executeLocal(tester, String.format(template, deleted));
+            }
+        }
+
+        public void executeLocal(SAITester tester, String query, Object... values) throws Throwable
+        {
+            tester.executeFormattedQuery(formatIndexedQuery(query), values);
+            tester.executeFormattedQuery(formatNonIndexedQuery(query), values);
+        }
+
+        public UntypedResultSet executeLocalIndexed(SAITester tester, String query, Object... values) throws Throwable
+        {
+            return tester.executeFormattedQuery(formatIndexedQuery(query), values);
+        }
+
+        public UntypedResultSet executeLocalNonIndexed(SAITester tester, String query, Object... values) throws Throwable
+        {
+            return tester.executeFormattedQuery(formatNonIndexedQuery(query), values);
+        }
+
+        public ResultSet executeIndexed(SAITester tester, String query, Object... values) throws Throwable
+        {
+            return tester.sessionNet().execute(new SimpleStatement(formatIndexedQuery(query), values));
+        }
+
+        public ResultSet executeIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable
+        {
+            SimpleStatement statement = new SimpleStatement(formatIndexedQuery(query), values);
+            statement.setFetchSize(fetchSize);
+            return tester.sessionNet().execute(statement);
+        }
+
+        public ResultSet executeNonIndexed(SAITester tester, String query, Object... values) throws Throwable
+        {
+            return tester.sessionNet().execute(new SimpleStatement(formatNonIndexedQuery(query), values));
+        }
+
+        public ResultSet executeNonIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable
+        {
+            SimpleStatement statement = new SimpleStatement(formatNonIndexedQuery(query), values);
+            statement.setFetchSize(fetchSize);
+            return tester.sessionNet().execute(statement);
+        }
+
+        protected Set<Integer> deletable()
+        {
+            return Sets.newHashSet(3, 7, 9, 12);
+        }
+
+        private String formatIndexedQuery(String query)
+        {
+            return indexedTable == null ? query : String.format(query, KEYSPACE + "." + indexedTable);
+        }
+
+        private String formatNonIndexedQuery(String query)
+        {
+            return nonIndexedTable == null ? query : String.format(query, KEYSPACE + "." + nonIndexedTable);
+        }
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this).add("primaryKey", primaryKey).toString();
+        }
+    }
+
+    /** Extends the compound-key model with writes to the static column, including a partition (p = 100) that only receives a static value. */
+    class CompoundKeyWithStaticsDataModel extends CompoundKeyDataModel
+    {
+        CompoundKeyWithStaticsDataModel(List<Pair<String, String>> columns, List<String> rows)
+        {
+            super(columns, rows);
+            this.keys = new CompoundPrimaryKeyList(rows.size(), 2);
+        }
+
+        @Override
+        public void insertRows(SAITester tester) throws Throwable
+        {
+            super.insertRows(tester);
+            // extra partition that carries a static value and no regular row
+            executeLocal(tester, String.format("INSERT INTO %%s (p, %s) VALUES(100, 2019)", STATIC_INT_COLUMN));
+        }
+
+        @Override
+        public void updateCells(SAITester tester) throws Throwable
+        {
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0 AND c = 0", BIGINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 0 AND c = 1", BOOLEAN_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2018-03-10' WHERE p = 1 AND c = 0", DATE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 8788.06 WHERE p = 1 AND c = 1", DOUBLE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 2.9 WHERE p = 2 AND c = 0", FLOAT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '205.204.196.65' WHERE p = 2 AND c = 1", INET_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 27429638 WHERE p = 3 AND c = 0", INT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 31 WHERE p = 3 AND c = 1", SMALLINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 116 WHERE p = 4 AND c = 0", TINYINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 'State of Michigan' WHERE p = 4 AND c = 1", TEXT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '00:20:26' WHERE p = 5 AND c = 0", TIME_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2009-07-16T00:00:00' WHERE p = 5 AND c = 1", TIMESTAMP_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = e37394dc-d17b-11e8-a8d5-f2801f1b9fd1 WHERE p = 6 AND c = 0", UUID_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1 WHERE p = 6 AND c = 1", TIMEUUID_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 1896 WHERE p = 7", STATIC_INT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 2020 WHERE p = 100", STATIC_INT_COLUMN)); // the static-only partition
+        }
+
+        @Override
+        public void deleteCells(SAITester tester) throws Throwable
+        {
+            for (int column = 0; column < NORMAL_COLUMNS.size(); column++)
+            {
+                String[] keyParts = keys.get(column).split(",");
+                executeLocal(tester, String.format("DELETE %s FROM %%s WHERE p = %s AND c = %s", NORMAL_COLUMNS.get(column).left, keyParts[0], keyParts[1]));
+            }
+        }
+
+        @Override
+        public void deleteRows(SAITester tester) throws Throwable
+        {
+            executeLocal(tester, "DELETE FROM %s WHERE p = 2 AND c = 0");
+            executeLocal(tester, "DELETE FROM %s WHERE p = 4 AND c = 0");
+            executeLocal(tester, "DELETE FROM %s WHERE p = 6");
+        }
+
+        @Override
+        protected Set<Integer> deletable()
+        {
+            return Sets.newHashSet(4, 8, 12, 13, 100);
+        }
+    }
+
+    class CompositePartitionKeyDataModel extends BaseDataModel
+    {
+        CompositePartitionKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
+        {
+            super(columns, rows);
+
+            this.keyColumns = ImmutableList.of(Pair.create("p1", "int"), Pair.create("p2", "int"));
+            this.primaryKey = keyColumns.stream().map(pair -> pair.left).collect(Collectors.joining(", "));
+            this.keys = new CompoundPrimaryKeyList(rows.size(), 2);
+        }
+
+        @Override
+        public void createTables(SAITester tester)
+        {
+            String keyColumnDefs = keyColumns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
+            String normalColumnDefs = columns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
+
+            String template = "CREATE TABLE %%s (%s, %s, PRIMARY KEY ((%s)))" + tableOptions;
+            indexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+            nonIndexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+        }
+
+        @Override
+        public void createIndexes(SAITester tester) throws Throwable
+        {
+            super.createIndexes(tester);
+            String template = "CREATE CUSTOM INDEX ndi_%s_index_%s ON %%s (%s) USING 'StorageAttachedIndex'";
+
+            for (Pair<String, String> column : keyColumns)
+            {
+                if (!skipColumns.contains(column.left))
+                {
+                    executeLocalIndexed(tester, String.format(template, column.left, indexedTable, column.left));
+                    tester.waitForIndexQueryable();
+                }
+            }
+        }
+
+        @Override
+        public void insertRows(SAITester tester) throws Throwable
+        {
+            super.insertRows(tester);
+        }
+
+        @Override
+        public void updateCells(SAITester tester) throws Throwable
+        { // issues one UPDATE per column constant, each against a single (p1, p2) key
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p1 = 0 AND p2 = 0", BIGINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p1 = 0 AND p2 = 1", BOOLEAN_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2018-03-10' WHERE p1 = 1 AND p2 = 0", DATE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 8788.06 WHERE p1 = 1 AND p2 = 1", DOUBLE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 2.9 WHERE p1 = 2 AND p2 = 0", FLOAT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '205.204.196.65' WHERE p1 = 2 AND p2 = 1", INET_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 27429638 WHERE p1 = 3 AND p2 = 0", INT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 31 WHERE p1 = 3 AND p2 = 1", SMALLINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 116 WHERE p1 = 4 AND p2 = 0", TINYINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 'State of Michigan' WHERE p1 = 4 AND p2 = 2", TEXT_COLUMN)); // NOTE(review): p2 = 2 here and p2 = 3 on the next line break the 0/1 pattern of the other rows - confirm intended
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '00:20:26' WHERE p1 = 5 AND p2 = 3", TIME_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2009-07-16T00:00:00' WHERE p1 = 5 AND p2 = 1", TIMESTAMP_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = e37394dc-d17b-11e8-a8d5-f2801f1b9fd1 WHERE p1 = 6 AND p2 = 0", UUID_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1 WHERE p1 = 6 AND p2 = 1", TIMEUUID_COLUMN));
+        }
+
+        @Override
+        public void deleteCells(SAITester tester) throws Throwable
+        {
+            for (int index = 0; index < NORMAL_COLUMNS.size(); index++)
+            {
+                String[] components = keys.get(index).split(","); // key string split into its p1 and p2 parts
+                String cql = String.format("DELETE %s FROM %%s WHERE p1 = %s AND p2 = %s", NORMAL_COLUMNS.get(index).left, components[0], components[1]);
+                executeLocal(tester, cql);
+            }
+        }
+
+        @Override
+        public void deleteRows(SAITester tester) throws Throwable
+        { // deletes four whole rows, each addressed by both key columns p1 and p2
+            executeLocal(tester, "DELETE FROM %s WHERE p1 = 2 AND p2 = 0");
+            executeLocal(tester, "DELETE FROM %s WHERE p1 = 4 AND p2 = 1");
+            executeLocal(tester, "DELETE FROM %s WHERE p1 = 6 AND p2 = 2");
+            executeLocal(tester, "DELETE FROM %s WHERE p1 = 8 AND p2 = 0");
+        }
+
+        @Override
+        protected Set<Integer> deletable()
+        {
+            // this model already overrides {@code deleteRows()}, so an empty set is returned here
+            return Collections.emptySet();
+        }
+    }
+
+    class CompoundKeyDataModel extends BaseDataModel
+    {
+        CompoundKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
+        {
+            super(columns, rows);
+            // key columns become (p int, c int); keys are generated as "<i>, 0" because rowsPerPartition is 1
+            this.keyColumns = ImmutableList.of(Pair.create("p", "int"), Pair.create("c", "int"));
+            this.primaryKey = keyColumns.stream().map(pair -> pair.left).collect(Collectors.joining(", "));
+            this.keys = new CompoundPrimaryKeyList(rows.size(), 1);
+        }
+        // overwrites one cell per column constant, in the rows (p = 0..13, c = 0)
+        @Override
+        public void updateCells(SAITester tester) throws Throwable
+        {
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0 AND c = 0", BIGINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 1 AND c = 0", BOOLEAN_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2018-03-10' WHERE p = 2 AND c = 0", DATE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 8788.06 WHERE p = 3 AND c = 0", DOUBLE_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 2.9 WHERE p = 4 AND c = 0", FLOAT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '205.204.196.65' WHERE p = 5 AND c = 0", INET_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 27429638 WHERE p = 6 AND c = 0", INT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 31 WHERE p = 7 AND c = 0", SMALLINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 116 WHERE p = 8 AND c = 0", TINYINT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 'State of Michigan' WHERE p = 9 AND c = 0", TEXT_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '00:20:26' WHERE p = 10 AND c = 0", TIME_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = '2009-07-16T00:00:00' WHERE p = 11 AND c = 0", TIMESTAMP_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = e37394dc-d17b-11e8-a8d5-f2801f1b9fd1 WHERE p = 12 AND c = 0", UUID_COLUMN));
+            executeLocal(tester, String.format("UPDATE %%s SET %s = 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1 WHERE p = 13 AND c = 0", TIMEUUID_COLUMN));
+        }
+        // for each index i of NORMAL_COLUMNS, deletes that column's cell in the row (p = i, c = 0)
+        @Override
+        public void deleteCells(SAITester tester) throws Throwable
+        {
+            for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
+            {
+                executeLocal(tester, String.format("DELETE %s FROM %%s WHERE p = %s AND c = 0", NORMAL_COLUMNS.get(i).left, i));
+            }
+        }
+    }
+
+    class SimplePrimaryKeyList extends ForwardingList<String>
+    {
+        private final List<String> primaryKeys; // the decimal strings "0" .. "rows - 1", in ascending order
+
+        SimplePrimaryKeyList(int rows)
+        {
+            primaryKeys = IntStream.range(0, rows).mapToObj(index -> Integer.toString(index)).collect(Collectors.toList());
+        }
+
+        @Override
+        protected List<String> delegate()
+        {
+            return this.primaryKeys;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("SimplePrimaryKeyList[rows: %d]", delegate().size());
+        }
+    }
+
+    class CompoundPrimaryKeyList extends ForwardingList<String>
+    {
+        private final List<String> primaryKeys; // entry i is "<i / rowsPerPartition>, <i % rowsPerPartition>"
+        private final int rowsPerPartition;
+
+        CompoundPrimaryKeyList(int rows, int rowsPerPartition)
+        {
+            this.rowsPerPartition = rowsPerPartition;
+            this.primaryKeys = IntStream.range(0, rows).mapToObj(row -> (row / rowsPerPartition) + ", " + (row % rowsPerPartition)).collect(Collectors.toList());
+        }
+
+        @Override
+        protected List<String> delegate()
+        {
+            return this.primaryKeys;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("CompoundPrimaryKeyList[rows: %d, partition size: %d]", delegate().size(), rowsPerPartition);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/DecimalLargeValueTest.java b/test/unit/org/apache/cassandra/index/sai/cql/DecimalLargeValueTest.java
new file mode 100644
index 000000000000..37ccda942102
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/DecimalLargeValueTest.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.math.BigDecimal;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.StringUtils;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+public class DecimalLargeValueTest extends SAITester
+{
+    @Before
+    public void createTableAndIndex()
+    {
+        requireNetwork();
+        // one decimal column; the (pk, ck) primary key lets several rows carry the same decimal value
+        createTable("CREATE TABLE %s (pk int, ck int, dec decimal, PRIMARY KEY (pk, ck))");
+        // storage-attached index on the decimal column that every query in this class filters on
+        createIndex("CREATE CUSTOM INDEX ON %s(dec) USING 'StorageAttachedIndex'");
+        // NOTE(review): presumably applies to the table created above - confirm against SAITester
+        disableCompaction();
+    }
+
+    /**
+     * This test tries to induce rounding errors involving decimal values with wide significands.
+     *
+     * Two values that first differ at the 512th significant digit are indexed:
+     * <ul>
+     * <li>1.0</li>
+     * <li>1.(510 zeros)1, i.e. a significand of 512 decimal digits</li>
+     * </ul>
+     */
+    @Test
+    public void runQueriesWithDecimalValueCollision() throws Throwable
+    {
+        final int significandSizeInDecimalDigits = 512;
+        // String.repeat(int) exists in JDK 11 and later, but this line was introduced on JDK 8
+        String wideDecimalString = "1." + StringUtils.repeat('0', significandSizeInDecimalDigits - 2) + "1";
+        BigDecimal wideDecimal = new BigDecimal(wideDecimalString);
+        // Sanity checks that this value was actually constructed as intended
+        Preconditions.checkState(wideDecimal.precision() == significandSizeInDecimalDigits,
+                                 "expected precision %s, but got %s; string representation is \"%s\"",
+                                 significandSizeInDecimalDigits, wideDecimal.precision(), wideDecimalString);
+        Preconditions.checkState(wideDecimalString.equals(wideDecimal.toPlainString()),
+                                 "expected: %s; actual: %s", wideDecimalString, wideDecimal.toPlainString());
+
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (0, 1, 1.0)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (2, 0, " + wideDecimalString + ")");
+
+        // EQ queries
+        assertRows(execute("SELECT * FROM %s WHERE dec = 1.0"),
+                row(0, 1, BigDecimal.valueOf(1.0D)));
+
+        assertRows(execute("SELECT * FROM %s WHERE dec = " + wideDecimalString),
+                row(2, 0, wideDecimal));
+
+        // LT/LTE queries
+        assertRows(execute("SELECT * FROM %s WHERE dec < " + wideDecimalString),
+                row(0, 1, BigDecimal.valueOf(1.0D)));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec <= " + wideDecimalString),
+                row(0, 1, BigDecimal.valueOf(1.0D)),
+                row(2, 0, wideDecimal));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE dec < 1.0"));
+
+        assertRows(execute("SELECT * FROM %s WHERE dec <= 1.0"),
+                row(0, 1, BigDecimal.valueOf(1.0D)));
+
+        // GT/GTE queries
+        assertRows(execute("SELECT * FROM %s WHERE dec > 1.0"),
+                row(2, 0, wideDecimal));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec >= " + wideDecimalString),
+                row(2, 0, wideDecimal));
+
+        assertEmpty(execute("SELECT * FROM %s WHERE dec > " + wideDecimalString));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec >= 1.0"),
+                row(0, 1, BigDecimal.valueOf(1.0D)),
+                row(2, 0, wideDecimal));
+    }
+    /**
+     * Control test: the same operators on small values (at most two significant digits), some shared by several rows.
+     */
+    @Test
+    public void runQueriesWithoutCollisions() throws Throwable
+    {
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (-2, 1, 2.2)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (-2, 2, 2.2)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (-1, 1, 1.1)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (0, 1, 0)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (1, 1, 1.1)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (2, 1, 2.2)");
+        execute("INSERT INTO %s (pk, ck, dec) VALUES (2, 2, 2.2)");
+
+        // EQ queries
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec = 1.1"),
+                row(-1, 1, BigDecimal.valueOf(1.1D)),
+                row(1, 1, BigDecimal.valueOf(1.1D)));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec = 2.2"),
+                row(-2, 1, BigDecimal.valueOf(2.2D)),
+                row(-2, 2, BigDecimal.valueOf(2.2D)),
+                row(2, 1, BigDecimal.valueOf(2.2D)),
+                row(2, 2, BigDecimal.valueOf(2.2D)));
+
+        // LT/LTE queries
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec < 1.1"),
+                row(0, 1, BigDecimal.valueOf(0)));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec <= 1.1"),
+                row(-1, 1, BigDecimal.valueOf(1.1D)),
+                row(0, 1, BigDecimal.valueOf(0)),
+                row(1, 1, BigDecimal.valueOf(1.1D)));
+
+        // GT/GTE queries
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec > 1.1"),
+                row(-2, 1, BigDecimal.valueOf(2.2D)),
+                row(-2, 2, BigDecimal.valueOf(2.2D)),
+                row(2, 1, BigDecimal.valueOf(2.2D)),
+                row(2, 2, BigDecimal.valueOf(2.2D)));
+
+        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE dec >= 1.1"),
+                row(-2, 1, BigDecimal.valueOf(2.2D)),
+                row(-2, 2, BigDecimal.valueOf(2.2D)),
+                row(-1, 1, BigDecimal.valueOf(1.1D)),
+                row(1, 1, BigDecimal.valueOf(1.1D)),
+                row(2, 1, BigDecimal.valueOf(2.2D)),
+                row(2, 2, BigDecimal.valueOf(2.2D)));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/DuplicateRowIDTest.java b/test/unit/org/apache/cassandra/index/sai/cql/DuplicateRowIDTest.java
new file mode 100644
index 000000000000..3ffa1b3e41dd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/DuplicateRowIDTest.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Row;
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.junit.Assert.assertEquals;
+
+public class DuplicateRowIDTest extends SAITester
+{
+    @BeforeClass
+    public static void setupCluster()
+    {
+        requireNetwork(); // the test in this class reads its results through executeNet(...)
+    }
+
+    @Test
+    public void shouldTolerateDuplicatedRowIDsAfterMemtableUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id1 TEXT PRIMARY KEY, v1 INT)");
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+
+        // the same two statements are reused by every phase below
+        String insert = "INSERT INTO %s (id1, v1) VALUES (?, ?)";
+        String select = "SELECT * FROM %s WHERE v1 > 0";
+
+        // 2048 writes spread over only 10 partition keys, meant to fill 2 bkd leaves
+        for (int value = 0; value < 2048; value++)
+            execute(insert, String.valueOf(value % 10), value);
+
+        // memtable only: duplicates must still collapse to the 10 distinct rows
+        List<Row> matches = executeNet(select).all();
+        assertEquals(10, matches.size());
+
+        flush();
+
+        // 1 sstable: the query will match both leaves, one entirely and one with filtering,
+        // as it contains a single entry with v1 == 0
+        matches = executeNet(select).all();
+        assertEquals(10, matches.size());
+
+        // write the same 2048 entries once more, filling 2 bkd leaves again
+        for (int value = 0; value < 2048; value++)
+            execute(insert, String.valueOf(value % 10), value);
+
+        // duplicates spread across the memtable and the sstable
+        matches = executeNet(select).all();
+        assertEquals(10, matches.size());
+
+        flush();
+
+        // duplicates spread across 2 sstables
+        matches = executeNet(select).all();
+        assertEquals(10, matches.size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java b/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
new file mode 100644
index 000000000000..e41bcfce5a48
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.utils.Pair;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class IndexOperatorSupportTest extends SAITester
+{
+    @Test
+    public void shouldRejectAllQueries() throws Throwable
+    {
+        requireNetwork();
+
+        DataModel model = new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA);
+        model.createTables(this);
+        model.createIndexes(this);
+        // each scenario is { description, column, query template }; as() must precede the assertion or AssertJ ignores it
+        for (String[] scenario : scenarios())
+        {
+            assertThatThrownBy(() -> model.executeIndexed(this, String.format(scenario[2], scenario[1]))).as(scenario[0]).isInstanceOf(InvalidQueryException.class);
+        }
+    }
+
+    private List<String[]> scenarios()
+    {
+        List<String[]> scenarios = new LinkedList<>();
+        // each entry is { failure description, column name, query template taking the column name via String.format }
+        for (Pair<String, String> column : DataModel.NORMAL_COLUMNS)
+        {
+            scenarios.add(new String[]{ "Should reject LIKE query for " + column.left, column.left, "SELECT * FROM %%s WHERE %s LIKE 'foo%%%%'" });
+        }
+
+        scenarios.add(new String[] { "Should reject range query for " + DataModel.ASCII_COLUMN, DataModel.ASCII_COLUMN, "SELECT * FROM %%s WHERE %s > 'foo'" });
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.ASCII_COLUMN, DataModel.ASCII_COLUMN, "SELECT * FROM %%s WHERE %s != 'foo'" });
+
+        scenarios.add(new String[] { "Should reject range query for " + DataModel.TEXT_COLUMN, DataModel.TEXT_COLUMN, "SELECT * FROM %%s WHERE %s > 'foo'" });
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.TEXT_COLUMN, DataModel.TEXT_COLUMN, "SELECT * FROM %%s WHERE %s != 'foo'" });
+
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.TINYINT_COLUMN, DataModel.TINYINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.SMALLINT_COLUMN, DataModel.SMALLINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.INT_COLUMN, DataModel.INT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
+        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.BIGINT_COLUMN, DataModel.BIGINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
+
+        scenarios.add(new String[] { "Should reject range query for " + DataModel.BOOLEAN_COLUMN, DataModel.BOOLEAN_COLUMN, "SELECT * FROM %%s WHERE %s > true" });
+
+        scenarios.add(new String[] { "Should reject range query for " + DataModel.UUID_COLUMN, DataModel.UUID_COLUMN, "SELECT * FROM %%s WHERE %s > e37394dc-d17b-11e8-a8d5-f2801f1b9fd1" });
+
+        return scenarios;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
new file mode 100644
index 000000000000..9ab0879bb119
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
@@ -0,0 +1,631 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.Arrays;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+import org.junit.Before;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.marshal.InetAddressType;
+import org.apache.cassandra.db.marshal.SimpleDateType;
+import org.apache.cassandra.db.marshal.TimeType;
+import org.apache.cassandra.db.marshal.TimestampType;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.utils.Pair;
+import org.hamcrest.Matchers;
+
+import static org.apache.cassandra.index.sai.cql.DataModel.INET_COLUMN;
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+
+/**
+ * A CQL-based test framework for simulating queries across as much of the index state space as possible.
+ *
+ * This includes, but need not be limited to...
+ *
+ * 1.) ...queries on the same data as it migrates through the write path and storage engine.
+ * 2.) ...queries across all supported native data types.
+ * 3.) ...queries for all supported operators and value boundaries.
+ * 4.) ...queries for varying write, update, delete, and TTL workloads.
+ * 5.) ...queries across varying primary key and table structures.
+ * 6.) ...queries across static, normal, and clustering column types.
+ * 7.) ...queries across various paging and limit settings.
+ */
+@RunWith(Parameterized.class)
+public abstract class IndexQuerySupport extends SAITester
+{
+    static List<BaseQuerySet> BASE_QUERY_SETS = ImmutableList.of(new BaseQuerySet(10, 5), // arguments are (limit, fetchSize)
+                                                                 new BaseQuerySet(10, 9),
+                                                                 new BaseQuerySet(10, 10),
+                                                                 new BaseQuerySet(10, Integer.MAX_VALUE),
+                                                                 new BaseQuerySet(24, 10),
+                                                                 new BaseQuerySet(24, 100),
+                                                                 new BaseQuerySet(24, Integer.MAX_VALUE));
+    // the same (limit, fetchSize) grid without (10, 9), built from CompositePartitionQuerySet
+    static List<BaseQuerySet> COMPOSITE_PARTITION_QUERY_SETS = ImmutableList.of(new CompositePartitionQuerySet(10, 5),
+                                                                                new CompositePartitionQuerySet(10, 10),
+                                                                                new CompositePartitionQuerySet(10, Integer.MAX_VALUE),
+                                                                                new CompositePartitionQuerySet(24, 10),
+                                                                                new CompositePartitionQuerySet(24, 100),
+                                                                                new CompositePartitionQuerySet(24, Integer.MAX_VALUE));
+    // the same (limit, fetchSize) grid without (10, 9), built from StaticColumnQuerySet
+    static List<BaseQuerySet> STATIC_QUERY_SETS = ImmutableList.of(new StaticColumnQuerySet(10, 5),
+                                                                   new StaticColumnQuerySet(10, 10),
+                                                                   new StaticColumnQuerySet(10, Integer.MAX_VALUE),
+                                                                   new StaticColumnQuerySet(24, 10),
+                                                                   new StaticColumnQuerySet(24, 100),
+                                                                   new StaticColumnQuerySet(24, Integer.MAX_VALUE));
+    // counter attached to StorageAttachedIndexSearcher.search; setup() injects it (presumably it counts invocations)
+    static final Injections.Counter INDEX_QUERY_COUNTER = Injections.newCounter("IndexQueryCounter")
+                                                                    .add(newInvokePoint().onClass(StorageAttachedIndexSearcher.class).onMethod("search"))
+                                                                    .build();
+
+    @Parameterized.Parameter(0)
+    public DataModel dataModel; // element 0 of each Object[] produced by params()
+    @Parameterized.Parameter(1)
+    public List<BaseQuerySet> sets; // element 1 of each Object[] produced by params()
+
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+        // attach the counter declared above to StorageAttachedIndexSearcher.search before every test
+        Injections.inject(INDEX_QUERY_COUNTER);
+    }
+
+    protected void writeLifecycle() throws Throwable
+    {
+        dataModel.createTables(this);
+        // presumably so that only the explicit compact() calls below merge sstables - TODO confirm
+        dataModel.disableCompaction(this);
+
+        dataModel.createIndexes(this);
+
+        // queries against Memtable-adjacent in-memory indexes
+        dataModel.insertRows(this);
+        executeQueries(dataModel, sets);
+
+        // queries with Memtable flushed to SSTable on disk
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // queries across memory and disk indexes
+        dataModel.insertRows(this);
+        executeQueries(dataModel, sets);
+
+        // queries w/ multiple SSTable indexes
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // queries after compacting to a single SSTable index
+        dataModel.compact(this);
+        executeQueries(dataModel, sets);
+
+        // queries against Memtable updates and the existing SSTable index
+        dataModel.updateCells(this);
+        executeQueries(dataModel, sets);
+
+        // queries against the newly flushed SSTable index and the existing SSTable index
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // queries after compacting updates into a single SSTable index
+        dataModel.compact(this);
+        executeQueries(dataModel, sets);
+    }
+
+    public void rowDeletions() throws Throwable
+    {
+        dataModel.createTables(this);
+
+        dataModel.disableCompaction(this);
+        // start from indexed data that has been flushed and compacted
+        dataModel.createIndexes(this);
+        dataModel.insertRows(this);
+        dataModel.flush(this);
+        dataModel.compact(this);
+
+        // baseline queries
+        executeQueries(dataModel, sets);
+
+        // queries against Memtable deletes and the existing SSTable index
+        dataModel.deleteRows(this);
+        executeQueries(dataModel, sets);
+
+        // queries against the newly flushed SSTable index and the existing SSTable index
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // queries after compacting deletes into a single SSTable index
+        dataModel.compact(this);
+        executeQueries(dataModel, sets);
+
+        // truncate, reload, and verify that the load is clean
+        dataModel.truncateTables(this);
+        dataModel.insertRows(this);
+        executeQueries(dataModel, sets);
+    }
+
+    public void cellDeletions() throws Throwable
+    {
+        dataModel.createTables(this);
+
+        dataModel.disableCompaction(this);
+        // start from indexed data that has been flushed and compacted
+        dataModel.createIndexes(this);
+        dataModel.insertRows(this);
+        dataModel.flush(this);
+        dataModel.compact(this);
+
+        // baseline queries
+        executeQueries(dataModel, sets);
+
+        // queries against Memtable deletes and the existing SSTable index
+        dataModel.deleteCells(this);
+        executeQueries(dataModel, sets);
+
+        // queries against the newly flushed SSTable index and the existing SSTable index
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // queries after compacting deletes into a single SSTable index
+        dataModel.compact(this);
+        executeQueries(dataModel, sets);
+    }
+
+    public void timeToLive() throws Throwable
+    {
+        dataModel.createTables(this);
+
+        dataModel.disableCompaction(this);
+
+        dataModel.createIndexes(this);
+        dataModel.insertRowsWithTTL(this);
+
+        // Wait for the TTL to elapse (assumes insertRowsWithTTL uses DataModel.DEFAULT_TTL_SECONDS - TODO confirm):
+        TimeUnit.SECONDS.sleep(DataModel.DEFAULT_TTL_SECONDS);
+
+        // Make sure TTLs are reflected in our query results from the Memtable:
+        executeQueries(dataModel, sets);
+
+        // Make sure TTLs are reflected in our query results from SSTables:
+        dataModel.flush(this);
+        executeQueries(dataModel, sets);
+
+        // Make sure fresh overwrites invalidate TTLs:
+        dataModel.insertRows(this);
+        executeQueries(dataModel, sets);
+    }
+
+    @SuppressWarnings("unused")
+    @Parameterized.Parameters(name = "{0}")
+    public static List<Object[]> params() throws Throwable
+    {
+        // Each entry pairs a data model (parameter 0) with the query sets to run against it (parameter 1).
+        List<Object[]> parameters = new LinkedList<>();
+        DataModel simple = new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA);
+        parameters.add(new Object[]{ simple, BASE_QUERY_SETS });
+        DataModel compound = new DataModel.CompoundKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA);
+        parameters.add(new Object[]{ compound, BASE_QUERY_SETS });
+        DataModel statics = new DataModel.CompoundKeyWithStaticsDataModel(DataModel.STATIC_COLUMNS, DataModel.STATIC_COLUMN_DATA);
+        parameters.add(new Object[]{ statics, STATIC_QUERY_SETS });
+        DataModel composite = new DataModel.CompositePartitionKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA);
+        Object compositeSets = ImmutableList.builder().addAll(BASE_QUERY_SETS).addAll(COMPOSITE_PARTITION_QUERY_SETS).build();
+        parameters.add(new Object[]{ composite, compositeSets });
+        return parameters;
+    }
+
+    static String randomPostfix()
+    {
+        return UUID.randomUUID().toString().replace('-', '_'); // a random UUID with every dash turned into an underscore
+    }
+
+    private void executeQueries(DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
+    {
+        // Run the sets in list order against the given model;
+        // the first failure propagates and skips the remaining sets.
+        for (BaseQuerySet querySet : sets)
+            querySet.execute(this, dataModel);
+    }
+
+    static class StaticColumnQuerySet extends BaseQuerySet
+    {
+        StaticColumnQuerySet(int limit, int fetchSize)
+        {
+            super(limit, fetchSize);
+        }
+        // NOTE(review): not annotated, but presumably overrides BaseQuerySet.execute (it calls super.execute first) - confirm
+        public void execute(SAITester tester, DataModel model) throws Throwable
+        {
+            super.execute(tester, model);
+            // extra single-operator queries on the static int column, using the literals 1845, 1909, 1787 and 1910
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.EQ, 1845);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.LT, 1845);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.LTE, 1845);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.GT, 1845);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.GTE, 1845);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.EQ, 1909);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.LT, 1787);
+            query(tester, model, DataModel.STATIC_INT_COLUMN, Operator.GT, 1910);
+            // range between 1845 and 1909; whether the bounds are inclusive is decided inside rangeQuery
+            rangeQuery(tester, model, DataModel.STATIC_INT_COLUMN, 1845, 1909);
+        }
+    }
+
+    static class CompositePartitionQuerySet extends BaseQuerySet
+    {
+        CompositePartitionQuerySet(int limit, int fetchSize)
+        {
+            super(limit, fetchSize);
+        }
+
+        @Override
+        public void execute(SAITester tester, DataModel model) throws Throwable
+        {
+            super.execute(tester, model);
+
+            // Each key column is restricted on its own first.
+            DataModel.BaseDataModel keyedModel = (DataModel.BaseDataModel) model;
+            for (Pair<String, String> keyColumn : keyedModel.keyColumns)
+            {
+                String keyName = keyColumn.left;
+                query(tester, model, keyName, Operator.EQ, 0);
+                query(tester, model, keyName, Operator.GT, 0);
+                query(tester, model, keyName, Operator.LTE, 2);
+                query(tester, model, keyName, Operator.GTE, -1);
+                query(tester, model, keyName, Operator.LT, 50);
+                query(tester, model, keyName, Operator.GT, 0);
+            }
+
+            // The first two key columns are then combined under every operator pair, in nested-loop (cartesian
+            // product) order. EQ/EQ is skipped: if both are EQ the entire partition is specified.
+            String firstPartitionKey = keyedModel.keyColumns.get(0).left;
+            String secondPartitionKey = keyedModel.keyColumns.get(1).left;
+
+            Operator[] numericOperators = { Operator.EQ, Operator.GT, Operator.LT, Operator.GTE, Operator.LTE };
+            for (Operator first : numericOperators)
+            {
+                for (Operator second : numericOperators)
+                    if (first != Operator.EQ || second != Operator.EQ)
+                        andQuery(tester, model, firstPartitionKey, first, 2, secondPartitionKey, second, 2, false);
+            }
+        }
+    }
+
+    private static class BaseQuerySet
+    {
+        final int limit;
+        final int fetchSize;
+
+        BaseQuerySet(int limit, int fetchSize)
+        {
+            this.limit = limit; // passed as the last bind value of every query this set builds
+            this.fetchSize = fetchSize; // handed to executeIndexed/executeNonIndexed and used for the page-count bound in validate
+        }
+
+        void execute(SAITester tester, DataModel model) throws Throwable
+        {
+            query(tester, model, DataModel.ASCII_COLUMN, Operator.EQ, "MA");
+            query(tester, model, DataModel.ASCII_COLUMN, Operator.EQ, "LA");
+            query(tester, model, DataModel.ASCII_COLUMN, Operator.EQ, "XX");
+            // Pattern from here on: single-predicate queries per column, then (where present) an exclusive range query.
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.EQ, 4800000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.EQ, 5000000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.LT, 5000000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.LTE, 5000000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.GT, 5000000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.GTE, 5000000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.EQ, 22L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.LT, 400000000L);
+            query(tester, model, DataModel.BIGINT_COLUMN, Operator.GT, 10000000000L);
+
+            rangeQuery(tester, model, DataModel.BIGINT_COLUMN, 3000000000L, 7000000000L);
+
+            query(tester, model, DataModel.DATE_COLUMN, Operator.EQ, SimpleDateType.instance.fromString("2013-06-10"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.EQ, SimpleDateType.instance.fromString("2013-06-17"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.LT, SimpleDateType.instance.fromString("2013-06-17"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.LTE, SimpleDateType.instance.fromString("2013-06-17"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.GT, SimpleDateType.instance.fromString("2013-06-17"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.GTE, SimpleDateType.instance.fromString("2013-06-17"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.EQ, SimpleDateType.instance.fromString("2017-01-01"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.LT, SimpleDateType.instance.fromString("2000-01-01"));
+            query(tester, model, DataModel.DATE_COLUMN, Operator.GT, SimpleDateType.instance.fromString("2020-01-01"));
+
+            rangeQuery(tester, model, DataModel.DATE_COLUMN, SimpleDateType.instance.fromString("2013-06-17"), SimpleDateType.instance.fromString("2018-06-19"));
+
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.EQ, 43203.90);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.EQ, 7800.06);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.LT, 82169.62);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.LTE, 82169.62);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.GT, 82169.62);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.GTE, 82169.62);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.EQ, 82169.60);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.LT, 1948.54);
+            query(tester, model, DataModel.DOUBLE_COLUMN, Operator.GT, 570640.95);
+
+            rangeQuery(tester, model, DataModel.DOUBLE_COLUMN, 56538.90, 113594.08);
+
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.EQ, 10.2f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.EQ, 1.9f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.LT, 5.3f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.LTE, 5.3f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.GT, 5.3f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.GTE, 5.3f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.EQ, 5.9f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.LT, 1.8f);
+            query(tester, model, DataModel.FLOAT_COLUMN, Operator.GT, 10.2f);
+
+            rangeQuery(tester, model, DataModel.FLOAT_COLUMN, 4.6f, 6.7f);
+
+            query(tester, model, INET_COLUMN, Operator.EQ, InetAddressType.instance.fromString("170.63.206.57"));
+            query(tester, model, INET_COLUMN, Operator.EQ, InetAddressType.instance.fromString("170.63.206.56"));
+            query(tester, model, INET_COLUMN, Operator.EQ, InetAddressType.instance.fromString("205.204.196.65"));
+            query(tester, model, INET_COLUMN, Operator.EQ, InetAddressType.instance.fromString("164.165.67.10"));
+            query(tester, model, INET_COLUMN, Operator.EQ, InetAddressType.instance.fromString("204.196.242.71"));
+            // Note: the inet column gets equality queries only, and the int column gets only the range query below.
+            rangeQuery(tester, model, DataModel.INT_COLUMN, 2977853, 6784240);
+
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.EQ, (short) 164);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.LT, (short) 164);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.LTE, (short) 164);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.GT, (short) 164);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.GTE, (short) 164);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.EQ, (short) 2);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.LT, (short) 30);
+            query(tester, model, DataModel.SMALLINT_COLUMN, Operator.GT, (short) 1861);
+
+            rangeQuery(tester, model, DataModel.SMALLINT_COLUMN, (short) 126, (short) 383);
+
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.EQ, (byte) 16);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.LT, (byte) 16);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.LTE, (byte) 16);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.GT, (byte) 16);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.GTE, (byte) 16);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.EQ, (byte) 1);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.LT, (byte) 2);
+            query(tester, model, DataModel.TINYINT_COLUMN, Operator.GT, (byte) 117);
+
+            rangeQuery(tester, model, DataModel.TINYINT_COLUMN, (byte) 12, (byte) 47);
+
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Alaska");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Wyoming");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Franklin");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "State of Michigan");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Michigan");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Louisiana");
+            query(tester, model, DataModel.TEXT_COLUMN, Operator.EQ, "Massachusetts");
+
+            query(tester, model, DataModel.TIME_COLUMN, Operator.EQ, TimeType.instance.fromString("00:43:07"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.LT, TimeType.instance.fromString("00:43:07"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.LTE, TimeType.instance.fromString("00:43:07"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.GT, TimeType.instance.fromString("00:43:07"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.GTE, TimeType.instance.fromString("00:43:07"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.EQ, TimeType.instance.fromString("00:15:57"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.LT, TimeType.instance.fromString("00:15:50"));
+            query(tester, model, DataModel.TIME_COLUMN, Operator.GT, TimeType.instance.fromString("01:30:45"));
+
+            rangeQuery(tester, model, DataModel.TIME_COLUMN, TimeType.instance.fromString("00:38:13"), TimeType.instance.fromString("00:56:07"));
+
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.EQ, TimestampType.instance.fromString("2013-06-17T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.LT, TimestampType.instance.fromString("2013-06-17T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.LTE, TimestampType.instance.fromString("2013-06-17T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.GT, TimestampType.instance.fromString("2013-06-17T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.GTE, TimestampType.instance.fromString("2013-06-17T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.EQ, TimestampType.instance.fromString("2017-01-01T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.LT, TimestampType.instance.fromString("2000-01-01T00:00:00"));
+            query(tester, model, DataModel.TIMESTAMP_COLUMN, Operator.GT, TimestampType.instance.fromString("2020-01-01T00:00:00"));
+
+            rangeQuery(tester, model, DataModel.TIMESTAMP_COLUMN,
+                       TimestampType.instance.fromString("2013-6-17T00:00:00"),
+                       TimestampType.instance.fromString("2018-6-19T00:00:00"));
+
+            query(tester, model, DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("e37394dc-d17b-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("752355f8-405b-4d94-88f3-9992cda30f1e"));
+            query(tester, model, DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("ac0aa734-d17f-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("c6eec0b0-0eef-40e8-ac38-3a82110443e4"));
+            query(tester, model, DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("e37394dc-d17b-11e8-a8d5-f2801f1b9fd1")); // same value as the first UUID query
+
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.LT, UUIDType.instance.fromString("ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.LTE, UUIDType.instance.fromString("ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.GT, UUIDType.instance.fromString("ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.GTE, UUIDType.instance.fromString("ee6136d2-d17c-11e8-a8d5-f2801f1b9fd1"));
+            query(tester, model, DataModel.TIMEUUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("2a421a68-d182-11e8-a8d5-f2801f1b9fd1"));
+            // Two-clause AND queries; the trailing boolean selects the filtering template for the query under test.
+            andQuery(tester, model,
+                     DataModel.TIMESTAMP_COLUMN, Operator.GTE, TimestampType.instance.fromString("2013-06-20T00:00:00"),
+                     DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("752355f8-405b-4d94-88f3-9992cda30f1e"),
+                     false);
+
+            andQuery(tester, model,
+                     DataModel.TIMESTAMP_COLUMN, Operator.GTE, TimestampType.instance.fromString("2018-06-20T00:00:00"),
+                     DataModel.TEXT_COLUMN, Operator.EQ, "Texas",
+                     false);
+
+            andQuery(tester, model,
+                     DataModel.SMALLINT_COLUMN, Operator.LTE, (short) 126,
+                     DataModel.TINYINT_COLUMN, Operator.LTE, (byte) 9,
+                     false);
+
+            andQuery(tester, model,
+                     DataModel.SMALLINT_COLUMN, Operator.LTE, (short) 126,
+                     DataModel.NON_INDEXED_COLUMN, Operator.GT, 0,
+                     true);
+
+            andQuery(tester, model,
+                     DataModel.TEXT_COLUMN, Operator.EQ, "Alaska",
+                     DataModel.NON_INDEXED_COLUMN, Operator.EQ, 2,
+                     true);
+
+
+            andQuery(tester, model,
+                     DataModel.UUID_COLUMN, Operator.EQ, UUIDType.instance.fromString("e37394dc-d17b-11e8-a8d5-f2801f1b9fd1"),
+                     DataModel.NON_INDEXED_COLUMN, Operator.LT, 3,
+                     true);
+
+            // with partition column filtering
+            String firstPartitionKey = model.keyColumns().get(0).left;
+
+            andQuery(tester, model,
+                     DataModel.TEXT_COLUMN, Operator.EQ, "Alaska",
+                     firstPartitionKey, Operator.EQ, 0,
+                     true);
+
+            andQuery(tester, model,
+                     DataModel.TEXT_COLUMN, Operator.EQ, "Kentucky",
+                     firstPartitionKey, Operator.GT, 4,
+                     true);
+
+            andQuery(tester, model,
+                     DataModel.TEXT_COLUMN, Operator.EQ, "Wyoming",
+                     firstPartitionKey, Operator.LT, 200,
+                     true);
+            // Only models with more than one key column also restrict the second one.
+            if (model.keyColumns().size() > 1)
+            {
+                String secondPrimaryKey = model.keyColumns().get(1).left;
+
+                andQuery(tester, model,
+                         DataModel.BIGINT_COLUMN, Operator.EQ, 4800000000L,
+                         secondPrimaryKey, Operator.EQ, 0,
+                         true);
+
+                andQuery(tester, model,
+                         DataModel.DOUBLE_COLUMN, Operator.EQ, 82169.60,
+                         secondPrimaryKey, Operator.GT, 0,
+                         true);
+
+                andQuery(tester, model,
+                         DataModel.DOUBLE_COLUMN, Operator.LT, 1948.54,
+                         secondPrimaryKey, Operator.LTE, 2,
+                         true);
+
+                andQuery(tester, model,
+                         DataModel.TEXT_COLUMN, Operator.EQ, "Alaska",
+                         firstPartitionKey, Operator.EQ, 0,
+                         secondPrimaryKey, Operator.GTE, -1);
+
+                andQuery(tester, model,
+                         DataModel.TEXT_COLUMN, Operator.EQ, "Kentucky",
+                         firstPartitionKey, Operator.GT, 4,
+                         secondPrimaryKey, Operator.LT, 50);
+
+                andQuery(tester, model,
+                         DataModel.TEXT_COLUMN, Operator.EQ, "Wyoming",
+                         firstPartitionKey, Operator.LT, 200,
+                         secondPrimaryKey, Operator.GT, 0);
+            }
+        }
+
+        void query(SAITester tester, DataModel model, String column, Operator operator, Object value) throws Throwable
+        {
+            // The query under test and its validator take the same format arguments; only the template differs.
+            Object[] templateArgs = { DataModel.ASCII_COLUMN, column, operator };
+            validate(tester, model, String.format(DataModel.SIMPLE_SELECT_TEMPLATE, templateArgs), String.format(DataModel.SIMPLE_SELECT_WITH_FILTERING_TEMPLATE, templateArgs), value, limit);
+        }
+
+        void andQuery(SAITester tester, DataModel model,
+                      String column1, Operator operator1, Object value1,
+                      String column2, Operator operator2, Object value2,
+                      boolean filtering) throws Throwable
+        {
+            // The validator always uses the filtering template; the query under test only does so when asked to.
+            Object[] templateArgs = { DataModel.ASCII_COLUMN, column1, operator1, column2, operator2 };
+            String queryTemplate = DataModel.TWO_CLAUSE_AND_QUERY_TEMPLATE;
+            if (filtering)
+                queryTemplate = DataModel.TWO_CLAUSE_AND_QUERY_FILTERING_TEMPLATE;
+
+            validate(tester, model, String.format(queryTemplate, templateArgs), String.format(DataModel.TWO_CLAUSE_AND_QUERY_FILTERING_TEMPLATE, templateArgs), value1, value2, limit);
+        }
+
+        void andQuery(SAITester tester, DataModel model,
+                      String column1, Operator operator1, Object value1,
+                      String column2, Operator operator2, Object value2,
+                      String column3, Operator operator3, Object value3) throws Throwable
+        {
+            // TODO: If we support indexes in all columns, ALLOW FILTERING might go away here...
+            // Until then the query under test and its validator are formatted from the same filtering template.
+            String template = DataModel.THREE_CLAUSE_AND_QUERY_FILTERING_TEMPLATE;
+            Object[] templateArgs = { DataModel.ASCII_COLUMN, column1, operator1, column2, operator2, column3, operator3 };
+
+            String query = String.format(template, templateArgs);
+            String queryValidator = String.format(template, templateArgs);
+            validate(tester, model, query, queryValidator, value1, value2, value3, limit);
+        }
+
+        void rangeQuery(SAITester tester, DataModel model, String column, Object value1, Object value2) throws Throwable
+        {
+            // value1 and value2 are bound as the exclusive lower and upper bounds of the range, followed by the limit.
+            Object[] templateArgs = { DataModel.ASCII_COLUMN, column, column };
+            String indexed = String.format("SELECT %s FROM %%s WHERE %s > ? AND %s < ? LIMIT ?", templateArgs);
+            String filtered = String.format("SELECT %s FROM %%s WHERE %s > ? AND %s < ? LIMIT ? ALLOW FILTERING", templateArgs);
+
+            validate(tester, model, indexed, filtered, value1, value2, limit);
+        }
+
+        private List<Object> validate(SAITester tester, DataModel model, String query, String validator, Object... values) throws Throwable
+        {
+            try
+            {
+                INDEX_QUERY_COUNTER.reset();
+                List<Object> indexedResults = model.executeIndexed(tester, query, fetchSize, values).all()
+                                                   .stream().map(result -> result.getObject(0)).collect(Collectors.toList());
+                // This could be more strict, but it serves as a reasonable paging-aware lower bound. For an empty result
+                // the division is 0 / 0.0 = NaN, which the int cast turns into 0, so one call is still required below.
+                int resultCount = indexedResults.size();
+                int pageCount = (int) Math.ceil(resultCount / (double) Math.min(resultCount, fetchSize));
+                assertThat("Expected more calls to " + StorageAttachedIndexSearcher.class, INDEX_QUERY_COUNTER.get(), Matchers.greaterThanOrEqualTo((long) Math.max(1, pageCount)));
+
+                List<Object> filteredResults = model.executeNonIndexed(tester, validator, fetchSize, values).all()
+                                                    .stream().map(result -> result.getObject(0)).collect(Collectors.toList());
+                assertEquals(filteredResults, indexedResults);
+                return filteredResults;
+            }
+            catch (Throwable failure)
+            {
+                // When thrown here, AssertionError does not seem to produce a stack trace, so it's logged explicitly:
+                logger.error("Validation failed while executing query: " + query + ", exception message: " + failure.getMessage(), failure);
+                throw failure;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            return MoreObjects.toStringHelper(this).add("limit", limit).add("fetchSize", fetchSize).toString(); // both fields are reported by name
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/InetAddressTypeEquivalencyTest.java b/test/unit/org/apache/cassandra/index/sai/cql/InetAddressTypeEquivalencyTest.java
new file mode 100644
index 000000000000..12ebb54d587c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/InetAddressTypeEquivalencyTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.net.InetAddress;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.cql.types.InetTest;
+
+/**
+ * This is testing that we can query ipv4 addresses using ipv6 equivalent addresses.
+ *
+ * The remaining InetAddressType tests are now handled by {@link InetTest}
+ */
+public class InetAddressTypeEquivalencyTest extends SAITester
+{
+    @Before
+    public void createTableAndIndex() throws Throwable
+    {
+        requireNetwork();
+        // Despite the method name, only the table is created here; the index is created by the test method itself.
+        createTable("CREATE TABLE %s (pk int, ck int, ip inet, PRIMARY KEY(pk, ck ))");
+        // NOTE(review): presumably compaction is disabled so the separately flushed data is not merged - confirm.
+        disableCompaction();
+    }
+
+    @Test
+    public void mixedWorkloadQueryTest() throws Throwable
+    {
+        createIndex("CREATE CUSTOM INDEX ON %s(ip) USING 'StorageAttachedIndex'");
+        // First batch: three ipv4 literals plus one ipv4-mapped ipv6 literal (::ffff:7f00:3), flushed together.
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 1, '127.0.0.1')");
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 2, '127.0.0.1')");
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 3, '127.0.0.2')");
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 4, '::ffff:7f00:3')");
+
+        flush();
+        // Second batch: two rows sharing one ipv6 address, flushed separately from the first batch.
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 5, '2002:4559:1fe2::4559:1fe2')");
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 6, '2002:4559:1fe2::4559:1fe2')");
+
+        flush();
+        // Third batch: no flush follows these inserts, so they are still unflushed when the queries run.
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 7, '2002:4559:1fe2::4559:1fe2')");
+        execute("INSERT INTO %s (pk, ck, ip) VALUES (1, 8, '2002:4559:1fe2::4559:1fe3')");
+
+        runQueries();
+    }
+
+    private void runQueries() throws Throwable
+    {
+        // EQ single ipv4 address
+        assertRows(execute("SELECT * FROM %s WHERE ip = '127.0.0.1'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")));
+
+        // EQ ipv4-mapped ipv6 address: expects the same rows as the plain ipv4 query above
+        assertRows(execute("SELECT * FROM %s WHERE ip = '::ffff:7f00:1'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")));
+
+        // EQ ipv6 address
+        assertRows(execute("SELECT * FROM %s WHERE ip = '2002:4559:1fe2::4559:1fe2'"),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")));
+
+        // GT ipv4 address
+        assertRows(execute("SELECT * FROM %s WHERE ip > '127.0.0.1'"),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // GT ipv4-mapped ipv6 address: expects the same rows as the plain ipv4 query above
+        assertRows(execute("SELECT * FROM %s WHERE ip > '::ffff:7f00:1'"),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // GT ipv6 address
+        assertRows(execute("SELECT * FROM %s WHERE ip > '2002:4559:1fe2::4559:1fe2'"),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // LT ipv4 address
+        assertRows(execute("SELECT * FROM %s WHERE ip < '127.0.0.3'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")));
+
+        // LT ipv4-mapped ipv6 address: expects the same rows as the plain ipv4 query above
+        assertRows(execute("SELECT * FROM %s WHERE ip < '::ffff:7f00:3'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")));
+
+        // LT ipv6 address
+        assertRows(execute("SELECT * FROM %s WHERE ip < '2002:4559:1fe2::4559:1fe3'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")));
+
+        // GE ipv4 address
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '127.0.0.2'"),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // GE ipv4-mapped ipv6 address: expects the same rows as the plain ipv4 query above
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '::ffff:7f00:2'"),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // GE ipv6 address
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '2002:4559:1fe2::4559:1fe3'"),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // LE ipv4 address
+        assertRows(execute("SELECT * FROM %s WHERE ip <= '127.0.0.2'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")));
+
+        // LE ipv4-mapped ipv6 address: expects the same rows as the plain ipv4 query above
+        assertRows(execute("SELECT * FROM %s WHERE ip <= '::ffff:7f00:2'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")));
+
+        // LE ipv6 address
+        assertRows(execute("SELECT * FROM %s WHERE ip <= '2002:4559:1fe2::4559:1fe2'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")));
+
+        // ipv4 range (row 4 was inserted as '::ffff:7f00:3'; getByName resolves that literal and '127.0.0.3' to equal addresses)
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '127.0.0.1' AND ip <= '127.0.0.3'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("127.0.0.3")));
+
+        // range with an ipv4 lower bound and an ipv4-mapped ipv6 upper bound
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '127.0.0.1' AND ip <= '::ffff:7f00:3'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("127.0.0.3")));
+
+        // ipv6 range
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '2002:4559:1fe2::4559:1fe2' AND ip <= '2002:4559:1fe2::4559:1fe3'"),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+
+        // Full ipv6 range: expects every inserted row, the ipv4 ones included
+        assertRows(execute("SELECT * FROM %s WHERE ip >= '::' AND ip <= 'ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff'"),
+                row(1, 1, InetAddress.getByName("127.0.0.1")),
+                row(1, 2, InetAddress.getByName("127.0.0.1")),
+                row(1, 3, InetAddress.getByName("127.0.0.2")),
+                row(1, 4, InetAddress.getByName("::ffff:7f00:3")),
+                row(1, 5, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 6, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 7, InetAddress.getByName("2002:4559:1fe2::4559:1fe2")),
+                row(1, 8, InetAddress.getByName("2002:4559:1fe2::4559:1fe3")));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/MixedIndexImplementationsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/MixedIndexImplementationsTest.java
new file mode 100644
index 000000000000..73a4652d21ad
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/MixedIndexImplementationsTest.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.index.ExpressionFilteringIndex;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Tests behaviour when there are non-storage-attached indexes in the table.
+ */
+public class MixedIndexImplementationsTest extends SAITester
+{
+    /**
+     * Tests that storage-attached indexes can be dropped when there are other indexes in the same table, and vice versa.
+     */
+    @Test
+    public void shouldDropOtherIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)");
+
+        String legacyIndex = createIndex("CREATE INDEX ON %s(v1)");
+        String saiIndex = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", StorageAttachedIndex.class.getName()));
+
+        // drop non-storage-attached index when a SAI index exists
+        dropIndex("DROP INDEX %s." + legacyIndex);
+
+        // drop storage-attached index when a non-SAI exists
+        createIndex("CREATE INDEX ON %s(v1)");
+        dropIndex("DROP INDEX %s." + saiIndex);
+    }
+
+    /**
+     * Tests that storage-attached index queries can include restrictions over columns indexed by other indexes.
+     */
+    @Test
+    public void shouldAcceptColumnsWithOtherIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)");
+
+        createIndex("CREATE INDEX ON %s(v1)");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", StorageAttachedIndex.class.getName()));
+
+        String insert = "INSERT INTO %s(k, v1, v2) VALUES (?, ?, ?)";
+        execute(insert, 0, 0, 0);
+        execute(insert, 1, 0, 1);
+        execute(insert, 2, 1, 0);
+        execute(insert, 3, 1, 1);
+
+        waitForIndexQueryable();
+
+        String legacySelect = "SELECT * FROM %s WHERE v1 = ?";
+        assertRowsIgnoringOrder(execute(legacySelect, 0), new Object[][]{{0, 0, 0}, {1, 0, 1}});
+        assertRowsIgnoringOrder(execute(legacySelect, 1), new Object[][]{{2, 1, 0}, {3, 1, 1}});
+
+        String mixedSelect = "SELECT * FROM %s WHERE v1 = ? AND v2 = ? ALLOW FILTERING";
+        assertRowsIgnoringOrder(execute(mixedSelect, 0, 0), new Object[]{0, 0, 0});
+        assertRowsIgnoringOrder(execute(mixedSelect, 0, 1), new Object[]{1, 0, 1});
+        assertRowsIgnoringOrder(execute(mixedSelect, 1, 0), new Object[]{2, 1, 0});
+        assertRowsIgnoringOrder(execute(mixedSelect, 1, 1), new Object[]{3, 1, 1});
+    }
+
+    /**
+     * Tests that storage-attached indexes are not selected when the query contains a custom expression targeted to another index.
+     */
+    @Test
+    public void shouldNotBeSelectedForCustomExpressions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)");
+
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName()));
+        String indexName = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", ExpressionFilteringIndex.class.getName()));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        ExpressionFilteringIndex customIndex = (ExpressionFilteringIndex) cfs.indexManager.getIndexByName(indexName);
+
+        String insert = "INSERT INTO %s(k, v1, v2) VALUES (?, ?, ?)";
+        execute(insert, 0, 0, 0);
+        execute(insert, 1, 0, 1);
+        execute(insert, 2, 1, 0);
+        execute(insert, 3, 1, 1);
+
+        waitForIndexQueryable();
+
+        String saiSelect = "SELECT * FROM %s WHERE v1 = ?";
+        assertRowsIgnoringOrder(execute(saiSelect, 0), new Object[][]{{0, 0, 0}, {1, 0, 1}});
+        assertRowsIgnoringOrder(execute(saiSelect, 1), new Object[][]{{2, 1, 0}, {3, 1, 1}});
+        Assert.assertEquals(0, customIndex.searches.get());
+
+        String mixedSelect = "SELECT * FROM %s WHERE v1 = ? AND v2 = ? ALLOW FILTERING";
+        assertRowsIgnoringOrder(execute(mixedSelect, 0, 0), new Object[]{0, 0, 0});
+        assertRowsIgnoringOrder(execute(mixedSelect, 0, 1), new Object[]{1, 0, 1});
+        assertRowsIgnoringOrder(execute(mixedSelect, 1, 0), new Object[]{2, 1, 0});
+        assertRowsIgnoringOrder(execute(mixedSelect, 1, 1), new Object[]{3, 1, 1});
+        Assert.assertEquals(0, customIndex.searches.get());
+
+        String exprSelect = String.format("SELECT * FROM %%s WHERE v1 = ? AND expr(%s, ?) ALLOW FILTERING", indexName);
+        assertRowsIgnoringOrder(execute(exprSelect, 0, 0), new Object[]{0, 0, 0});
+        assertRowsIgnoringOrder(execute(exprSelect, 0, 1), new Object[]{1, 0, 1});
+        assertRowsIgnoringOrder(execute(exprSelect, 1, 0), new Object[]{2, 1, 0});
+        assertRowsIgnoringOrder(execute(exprSelect, 1, 1), new Object[]{3, 1, 1});
+        Assert.assertEquals(4, customIndex.searches.get());
+    }
+
+    /**
+     * Tests that ALLOW FILTERING is required exactly when a query mixes a column with a non-SAI index and SAI-indexed columns.
+     */
+    @Test
+    public void shouldRequireAllowFilteringWithOtherIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 int, s1 int static, c1 int, c2 int, c3 int, c4 int," +
+                    "r1 int, r2 int, r3 int, PRIMARY KEY((k1, k2), c1, c2, c3, c4))");
+
+        createIndex("CREATE INDEX ON %s(k1)");
+        createIndex("CREATE INDEX ON %s(c4)");
+        createIndex("CREATE INDEX ON %s(r3)");
+        createIndex("CREATE INDEX ON %s(s1)");
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c2) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(c3) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(r1) USING '%s'", StorageAttachedIndex.class.getName()));
+        createIndex(String.format("CREATE CUSTOM INDEX ON %%s(r2) USING '%s'", StorageAttachedIndex.class.getName()));
+
+        // without using the not-SAI index
+        testAllowFiltering("SELECT * FROM %s", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c3=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE r1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND c3=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND r1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c3=0 AND r1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c3=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE r1=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND c3=0 AND r1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND c3=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND r1=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c3=0 AND r1=0 AND r2=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c2=0 AND c3=0 AND r1=0 AND r2=0", false);
+
+        // using the not-SAI index on partition key
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND c3=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c3=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE k1=0 AND c2=0 AND c3=0 AND r1=0 AND r2=0", true);
+
+        // using the not-SAI index on clustering key
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND c3=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c3=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE c4=0 AND c2=0 AND c3=0 AND r1=0 AND r2=0", true);
+
+        // using the not-SAI index on regular column
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND c3=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c3=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE r3=0 AND c2=0 AND c3=0 AND r1=0 AND r2=0", true);
+
+        // using the not-SAI index on static column
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0", false);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND c3=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND c3=0 AND r1=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND c3=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c3=0 AND r1=0 AND r2=0", true);
+        testAllowFiltering("SELECT * FROM %s WHERE s1=0 AND c2=0 AND c3=0 AND r1=0 AND r2=0", true);
+    }
+
+    /**
+     * Asserts whether {@code query} is rejected unless ALLOW FILTERING is appended, and that it always succeeds with it.
+     */
+    private void testAllowFiltering(String query, boolean requiresAllowFiltering) throws Throwable
+    {
+        if (requiresAllowFiltering)
+            assertInvalidMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE, query);
+        else
+            assertNotNull(execute(query));
+
+        assertNotNull(execute(query + " ALLOW FILTERING"));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/MultipleColumnIndexTest.java b/test/unit/org/apache/cassandra/index/sai/cql/MultipleColumnIndexTest.java
new file mode 100644
index 000000000000..3dad3d305215
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/MultipleColumnIndexTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+
+public class MultipleColumnIndexTest extends SAITester
+{
+    // Only checks that KEYS, VALUES and ENTRIES indexes can be created side by side on one map column.
+    // Full testing of multiple map index types is done in the
+    // types/collections/maps/MultiMap*Test tests
+    @Test
+    public void canCreateMultipleMapIndexesOnSameColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, value map<int,int>, PRIMARY KEY(pk, ck))");
+        String template = "CREATE CUSTOM INDEX ON %%s(%s(value)) USING 'StorageAttachedIndex'";
+        for (String target : new String[]{ "KEYS", "VALUES", "ENTRIES" })
+            createIndex(String.format(template, target));
+    }
+
+    @Test
+    public void cannotHaveMultipleLiteralIndexesWithDifferentOptions() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, value text, PRIMARY KEY(pk, ck))");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : true }");
+        String conflicting = "CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : false }";
+        assertThatThrownBy(() -> createIndex(conflicting)).isInstanceOf(RuntimeException.class).hasCauseInstanceOf(InvalidRequestException.class);
+    }
+
+    @Test
+    public void indexNamedAsColumnWillCoExistWithGeneratedIndexNames() throws Throwable
+    {
+        createTable("CREATE TABLE %s(id int PRIMARY KEY, text_map map<text, text>)");
+        execute("INSERT INTO %s(id, text_map) values (1, {'k1':'v1', 'k2':'v2'})");
+        execute("INSERT INTO %s(id, text_map) values (2, {'k1':'v1', 'k3':'v3'})");
+        execute("INSERT INTO %s(id, text_map) values (3, {'k4':'v4', 'k5':'v5'})");
+        flush();
+
+        // the first index is explicitly named after the column; the other two statements give no index name
+        createIndex("CREATE CUSTOM INDEX text_map ON %s(keys(text_map)) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(values(text_map)) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(entries(text_map)) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        String select = "SELECT * FROM %s WHERE ";
+        assertEquals(1, execute(select + "text_map['k1'] = 'v1' AND text_map['k2'] = 'v2'").size());
+        assertEquals(2, execute(select + "text_map CONTAINS 'v1'").size());
+        assertEquals(2, execute(select + "text_map CONTAINS KEY 'k1'").size());
+        assertEquals(1, execute(select + "text_map CONTAINS KEY 'k1' AND text_map CONTAINS KEY 'k2'").size());
+        assertEquals(2, execute(select + "text_map['k1'] = 'v1' AND text_map CONTAINS KEY 'k1' AND text_map CONTAINS 'v1'").size());
+        assertEquals(1, execute(select + "text_map['k1'] = 'v1' AND text_map CONTAINS KEY 'k1' AND text_map CONTAINS KEY 'k2' AND text_map CONTAINS 'v1'").size());
+        assertEquals(0, execute(select + "text_map['k1'] = 'v1' AND text_map CONTAINS KEY 'k1' AND text_map CONTAINS KEY 'k4'").size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java b/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
new file mode 100644
index 000000000000..687841cc0015
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
@@ -0,0 +1,1348 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.cql;
+
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.exceptions.InvalidConfigurationInQueryException;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import com.datastax.driver.core.exceptions.ReadFailureException;
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.restrictions.IndexRestrictions;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexBuilder;
+import org.apache.cassandra.index.sai.disk.SegmentBuilder;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.NumericValuesWriter;
+import org.apache.cassandra.index.sai.view.View;
+import org.apache.cassandra.inject.ActionBuilder;
+import org.apache.cassandra.inject.Expression;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.utils.Throwables;
+import org.mockito.Mockito;
+
+import static java.util.Collections.singletonList;
+import static junit.framework.TestCase.fail;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.when;
+
+public class NativeIndexDDLTest extends SAITester
+{
+    private static final Injections.Counter NDI_CREATION_COUNTER = Injections.newCounter("IndexCreationCounter")
+                                                                             .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("register"))
+                                                                             .build(); // counter attached to StorageAttachedIndex#register
+    // Injection that throws a RuntimeException at StorageAttachedIndexBuilder#build.
+    private static final Injection failNDIInitialializaion = Injections.newCustom("fail_ndi_initialization")
+                                                                       .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndexBuilder.class).onMethod("build"))
+                                                                       .add(ActionBuilder.newActionBuilder().actions().doThrow(RuntimeException.class, Expression.quote("Injected failure!")))
+                                                                       .build();
+    // Pause injection at ColumnFamilyStore#forceBlockingFlush (30_000 is presumably milliseconds - TODO confirm).
+    private static final Injection forceFlushPause = Injections.newPause("force_flush_pause", 30_000)
+                                                               .add(InvokePointBuilder.newInvokePoint().onClass(ColumnFamilyStore.class).onMethod("forceBlockingFlush"))
+                                                               .build();
+    // Injection that throws a RuntimeException at SegmentBuilder#flush.
+    private static final Injection failPerIndexMetaCompletion = Injections.newCustom("fail_index_meta_completion")
+                                                                          .add(InvokePointBuilder.newInvokePoint().onClass(SegmentBuilder.class).onMethod("flush"))
+                                                                          .add(ActionBuilder.newActionBuilder().actions().doThrow(RuntimeException.class, Expression.quote("Injected failure!")))
+                                                                          .build();
+    // Injection that throws an IOException at NumericValuesWriter#add.
+    private static final Injection failPerSSTableTokenAdd = Injections.newCustom("fail_token_writer")
+                                                                      .add(InvokePointBuilder.newInvokePoint().onClass(NumericValuesWriter.class).onMethod("add"))
+                                                                      .add(ActionBuilder.newActionBuilder().actions().doThrow(IOException.class, Expression.quote("Injected failure!")))
+                                                                      .build();
+    // Injection that throws a RuntimeException in the constructor of SecondaryIndexManager's nested IndexGCTransaction class.
+    private static final Injection FAIL_INDEX_GC_TRANSACTION = Injections.newCustom("fail_index_gc_transaction")
+                                                                         .add(InvokePointBuilder.newInvokePoint().onClass("org.apache.cassandra.index.SecondaryIndexManager$IndexGCTransaction")
+                                                                                                .onMethod("<init>"))
+                                                                         .add(ActionBuilder.newActionBuilder().actions().doThrow(RuntimeException.class, Expression.quote("Injected failure!")))
+                                                                         .build();
+
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+
+        startJMXServer();
+
+        createMBeanServerConnection();
+        // install both counters and the index GC transaction failure injection before every test
+        Injections.inject(NDI_CREATION_COUNTER, INDEX_BUILD_COUNTER, FAIL_INDEX_GC_TRANSACTION);
+        // reset both counters before each test
+        NDI_CREATION_COUNTER.reset();
+        INDEX_BUILD_COUNTER.reset();
+    }
+
+    @After
+    public void removeInjections()
+    {
+        Injections.deleteAll(); // after each test; presumably uninstalls everything injected, including what setup() added
+    }
+
+    @Test
+    public void shouldFailUnsupportedType() throws Throwable
+    {
+        // every native type gets its own table, whose single regular column is named after the type
+        for (CQL3Type.Native type : CQL3Type.Native.values())
+        {
+            if (type == CQL3Type.Native.EMPTY)
+                continue;
+
+            createTable(String.format("CREATE TABLE %%s (id text PRIMARY KEY, %s %s)", type, type));
+            String indexCql = String.format("CREATE CUSTOM INDEX ON %%s(%s) USING 'StorageAttachedIndex'", type);
+            boolean supported = StorageAttachedIndex.SUPPORTED_TYPES.contains(type);
+
+            try
+            {
+                executeNet(indexCql);
+                assertTrue("Index creation on unsupported type " + type + " should have failed.", supported);
+            }
+            catch (RuntimeException failure)
+            {
+                assertFalse("Index creation on supported type " + type + " should have succeeded.", supported);
+                // InvalidConfigurationInQueryException is a subclass of InvalidQueryException
+                assertTrue(Throwables.isCausedBy(failure, InvalidQueryException.class));
+            }
+        }
+    }
+
+    @Test
+    public void shouldFailCreationOnPartitionKey()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        // 'id' is the table's only partition key column, so the index creation has to be rejected
+        String indexOnKey = "CREATE CUSTOM INDEX ON %s(id) USING 'StorageAttachedIndex'";
+        assertThatThrownBy(() -> executeNet(indexOnKey)).isInstanceOf(InvalidQueryException.class).hasMessageContaining("Cannot create secondary index on the only partition key column id");
+    }
+
+    @Test
+    public void shouldFailCreationUsingMode()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        // the 'mode' option must be rejected as an invalid index configuration
+        String withMode = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'mode' : 'CONTAINS' }";
+        assertThatThrownBy(() -> executeNet(withMode)).isInstanceOf(InvalidConfigurationInQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreateSpecifyingAnalyzerClass()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        // naming an analyzer class through 'analyzer_class' must be rejected as an invalid index configuration
+        String withAnalyzerClass = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                   "WITH OPTIONS = { 'analyzer_class' : 'org.apache.cassandra.index.sai.analyzer.NonTokenizingAnalyzer' }";
+        assertThatThrownBy(() -> executeNet(withAnalyzerClass)).isInstanceOf(InvalidConfigurationInQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreateWithMisspelledOption()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        // the option is spelled with a hyphen here, unlike the 'case_sensitive' spelling used by the other tests
+        String misspelled = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case-sensitive' : true }";
+        assertThatThrownBy(() -> executeNet(misspelled)).isInstanceOf(InvalidConfigurationInQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCaseSensitiveWithNonText()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+
+        // 'val' is an int column, so the 'case_sensitive' option must be rejected
+        String caseSensitiveOnInt = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : true }";
+        assertThatThrownBy(() -> executeNet(caseSensitiveOnInt)).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailOnNormalizeWithNonText()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+
+        // 'val' is an int column, so the 'normalize' option must be rejected
+        String normalizeOnInt = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : true }";
+        assertThatThrownBy(() -> executeNet(normalizeOnInt)).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreateWithTupleType()
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val tuple<text, int, double>)");
+        // creating the index on the tuple column must be rejected
+        String indexOnTuple = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'";
+        assertThatThrownBy(() -> executeNet(indexOnTuple)).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreateWithUserType()
+    {
+        String udt = createType("CREATE TYPE %s (a text, b int, c double)");
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val frozen<" + udt + ">)");
+        // creating the index on the frozen user-defined type column must be rejected
+        String indexOnUdt = "CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'";
+        assertThatThrownBy(() -> executeNet(indexOnUdt)).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreateWithInvalidCharactersInColumnName()
+    {
+        String column = "/invalid";
+        createTable(String.format("CREATE TABLE %%s (id text PRIMARY KEY, \"%s\" text)", column));
+
+        // the quoted column is accepted by CREATE TABLE above, but indexing it must fail with this exact message
+        String indexCql = String.format("CREATE CUSTOM INDEX ON %%s(\"%s\") USING 'StorageAttachedIndex'", column);
+        String expectedMessage = String.format("Column '%s' is longer than the permissible name length of %d characters or" +
+                                               " contains non-alphanumeric-underscore characters", column, SchemaConstants.NAME_LENGTH);
+        assertThatThrownBy(() -> executeNet(indexCql)).isInstanceOf(InvalidQueryException.class).hasMessage(expectedMessage);
+    }
+
+    @Test
+    public void shouldCreateIndexIfExists() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        // the same IF NOT EXISTS statement is issued twice; the creation counter must report a single registration
+        String createIfAbsent = "CREATE CUSTOM INDEX IF NOT EXISTS ON %s(val) USING 'StorageAttachedIndex' ";
+        createIndex(createIfAbsent);
+        createIndex(createIfAbsent);
+        assertEquals(1, NDI_CREATION_COUNTER.get());
+    }
+
+    @Test
+    public void shouldBeCaseSensitiveByDefault() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        // no index options given: only the exact-case term is expected to match
+        String byVal = "SELECT id FROM %s WHERE val = ";
+        assertEquals(1, execute(byVal + "'Camel'").size());
+        assertEquals(0, execute(byVal + "'camel'").size());
+    }
+
+    @Test
+    public void shouldEnableCaseSensitiveSearch() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : true }");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        // 'case_sensitive' : true given explicitly: only the exact-case term is expected to match
+        String byVal = "SELECT id FROM %s WHERE val = ";
+        assertEquals(1, execute(byVal + "'Camel'").size());
+        assertEquals(0, execute(byVal + "'camel'").size());
+    }
+
+    @Test
+    public void shouldEnableCaseInsensitiveSearch() throws Throwable
+    { // With 'case_sensitive' : false, the stored value 'Camel' is found by querying 'camel'.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : false }");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'camel'").size());
+    }
+
+    @Test
+    public void shouldBeNonNormalizedByDefault() throws Throwable
+    { // With no index options, the precomposed and decomposed spellings of the same visible text do not match each other.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'Cam\u00E1l'").size()); // same code points as stored
+
+        // Both \u00E1 and \u0061\u0301 are visible as the character á, but without NFC normalization, they won't match.
+        assertEquals(0, execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size());
+    }
+
+    @Test
+    public void shouldEnableNonNormalizedSearch() throws Throwable
+    { // With 'normalize' : false, the precomposed and decomposed spellings of the same visible text do not match each other.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : false }");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')");
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'Cam\u00E1l'").size()); // same code points as stored
+
+        // Both \u00E1 and \u0061\u0301 are visible as the character á, but without NFC normalization, they won't match.
+        assertEquals(0, execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size());
+    }
+
+    @Test
+    public void shouldEnableNormalizedSearch() throws Throwable
+    { // With 'normalize' : true, a value stored with \u00E1 is found by a query spelled with \u0061\u0301.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : true }");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')"); // stored in precomposed form
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'Cam\u0061\u0301l'").size()); // queried in decomposed form
+    }
+
+    @Test
+    public void shouldEnableNormalizedCaseInsensitiveSearch() throws Throwable
+    { // Combines 'normalize' : true with 'case_sensitive' : false; the query differs from the stored value in both case and composition.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'normalize' : true, 'case_sensitive' : false}");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Cam\u00E1l')"); // upper-case C, precomposed \u00E1
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'cam\u0061\u0301l'").size()); // lower-case c, decomposed \u0061\u0301
+    }
+
+    @Test
+    public void shouldEnableAsciiSearch() throws Throwable
+    { // With 'ascii' : true and 'case_sensitive' : false, the stored value 'Éppinger' is found by querying 'eppinger'.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = { 'ascii' : true, 'case_sensitive' : false}");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Éppinger')");
+
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = 'eppinger'").size());
+    }
+
+    @Test
+    public void shouldCreateIndexOnReversedType() throws Throwable
+    { // Indexes two clustering columns declared with descending order, queries before and after flush(), then inspects each index's ColumnContext.
+        createTable("CREATE TABLE %s (id text, ck1 text, ck2 int, val text, PRIMARY KEY (id,ck1,ck2)) WITH CLUSTERING ORDER BY (ck1 desc, ck2 desc)");
+
+        String indexNameCk1 = createIndex("CREATE CUSTOM INDEX ON %s(ck1) USING 'StorageAttachedIndex'");
+        String indexNameCk2 = createIndex("CREATE CUSTOM INDEX ON %s(ck2) USING 'StorageAttachedIndex'");
+
+        execute("insert into %s(id, ck1, ck2, val) values('1', '2', 3, '3')");
+        execute("insert into %s(id, ck1, ck2, val) values('1', '3', 4, '4')");
+        assertEquals(1, executeNet("SELECT * FROM %s WHERE ck1='3'").all().size()); // queries issued before flush()
+        assertEquals(2, executeNet("SELECT * FROM %s WHERE ck2>=0").all().size());
+        assertEquals(2, executeNet("SELECT * FROM %s WHERE ck2<=4").all().size());
+
+        flush();
+        assertEquals(1, executeNet("SELECT * FROM %s WHERE ck1='2'").all().size()); // queries issued after flush()
+        assertEquals(2, executeNet("SELECT * FROM %s WHERE ck2>=3").all().size());
+        assertEquals(2, executeNet("SELECT * FROM %s WHERE ck2<=4").all().size());
+
+        SecondaryIndexManager sim = getCurrentColumnFamilyStore().indexManager;
+        StorageAttachedIndex index = (StorageAttachedIndex) sim.getIndexByName(indexNameCk1);
+        ColumnContext context = index.getContext();
+        assertTrue(context.isLiteral()); // ck1 is declared as text
+        assertTrue(context.getValidator() instanceof ReversedType);
+
+        index = (StorageAttachedIndex) sim.getIndexByName(indexNameCk2);
+        context = index.getContext();
+        assertFalse(context.isLiteral()); // ck2 is declared as int
+        assertTrue(context.getValidator() instanceof ReversedType);
+    }
+
+    @Test
+    public void shouldCreateIndexWithAlias() throws Throwable
+    { // Creates one index and expects NDI_CREATION_COUNTER to read 1.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'"); // NOTE(review): presumably the short name 'StorageAttachedIndex' is the alias under test - confirm
+
+        assertEquals(1, NDI_CREATION_COUNTER.get());
+    }
+
+    /**
+     * Verify SASI can be created and queried with NDI dependencies.
+     * Not putting in {@link MixedIndexImplementationsTest} because it uses CQLTester which doesn't load NDI dependency.
+     */
+    @Test
+    public void shouldCreateSASI() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'org.apache.cassandra.index.sasi.SASIIndex'"); // SASI index with no options
+        createIndex("CREATE CUSTOM INDEX ON %s(v2) USING 'org.apache.cassandra.index.sasi.SASIIndex' WITH OPTIONS = {'mode': 'CONTAINS',\n" +
+                    "'analyzer_class': 'org.apache.cassandra.index.sasi.analyzer.StandardAnalyzer',\n" +
+                    "'tokenization_enable_stemming': 'true',\n" +
+                    "'tokenization_locale': 'en',\n" +
+                    "'tokenization_skip_stop_words': 'true',\n" +
+                    "'analyzed': 'true',\n" +
+                    "'tokenization_normalize_lowercase': 'true'};");
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0"); // query served by the v1 index
+        assertEquals(1, rows.all().size());
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v2 like '0'"); // LIKE query against the v2 index
+        assertEquals(1, rows.all().size());
+    }
+
+    @Test
+    public void shouldCreateNumericIndexWithBkdPostingsSkipAndMinLeaves() throws Throwable
+    { // An index on an int column accepts both 'bkd_postings_skip' and 'bkd_postings_min_leaves'.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = {'bkd_postings_skip' : 3, 'bkd_postings_min_leaves' : 32}");
+
+        assertEquals(1, NDI_CREATION_COUNTER.get());
+    }
+
+    @Test
+    public void shouldCreateNumericIndexWithBkdPostingsSkipOnly() throws Throwable
+    { // An index on an int column accepts 'bkd_postings_skip' on its own.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = {'bkd_postings_skip' : 3}");
+
+        assertEquals(1, NDI_CREATION_COUNTER.get());
+    }
+
+    @Test
+    public void shouldCreateNumericIndexWithBkdPostingsMinLeavesOnly() throws Throwable
+    { // An index on an int column accepts 'bkd_postings_min_leaves' on its own.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' WITH OPTIONS = {'bkd_postings_min_leaves': 32}");
+
+        assertEquals(1, NDI_CREATION_COUNTER.get());
+    }
+
+    @Test
+    public void shouldFailToCreateNumericIndexWithTooLowBkdPostingsSkip() throws Throwable
+    { // 'bkd_postings_skip' : 0 on an int column must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'bkd_postings_skip' : 0, 'bkd_postings_min_leaves' : 32}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailToCreateNumericIndexWithTooLowBkdPostingsMinLeaves() throws Throwable
+    { // 'bkd_postings_min_leaves' : 0 on an int column must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val int)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'bkd_postings_skip' : 3, 'bkd_postings_min_leaves' : 0}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailToCreateStringIndexWithBkdPostingsSkip() throws Throwable
+    { // 'bkd_postings_skip' on a text column must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'bkd_postings_skip' : 3}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailToCreateStringIndexWithBkdPostingsMinLeaves() throws Throwable
+    { // 'bkd_postings_min_leaves' on a text column must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'bkd_postings_min_leaves' : 9}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailToCreateInvalidBooleanOption() throws Throwable
+    { // 'case_sensitive' set to the string 'NOTVALID' must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'case_sensitive': 'NOTVALID'}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailToCreateEmptyBooleanOption() throws Throwable
+    { // 'case_sensitive' set to the empty string must be rejected with InvalidQueryException.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex' " +
+                                            "WITH OPTIONS = {'case_sensitive': ''}")).isInstanceOf(InvalidQueryException.class);
+    }
+
+    @Test
+    public void shouldFailCreationOnMultipleColumns() throws Throwable
+    { // A single CREATE CUSTOM INDEX naming two target columns must be rejected with the message asserted below.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val1 text, val2 text)");
+
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val1, val2) USING 'StorageAttachedIndex'"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessageContaining("storage-attached index cannot be created over multiple columns");
+    }
+
+    @Test
+    public void shouldFailCreationMultipleIndexesOnSimpleColumn() throws Throwable
+    { // After index_1 exists on v1, three further creation attempts on v1 must each fail; the original index must still answer queries.
+        createTable("CREATE TABLE %s (id int PRIMARY KEY, v1 TEXT)");
+        execute("INSERT INTO %s (id, v1) VALUES(1, '1')");
+        flush();
+
+        executeNet("CREATE CUSTOM INDEX index_1 ON %s(v1) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        // same name
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX index_1 ON %s(v1) USING 'StorageAttachedIndex'"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessageContaining("Index 'index_1' already exists");
+
+        // different name, same option
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX index_2 ON %s(v1) USING 'StorageAttachedIndex'"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessageContaining("Index index_2 is a duplicate of existing index index_1");
+
+        // different name, different option, same target.
+        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(v1) USING 'StorageAttachedIndex' WITH OPTIONS = { 'case_sensitive' : true }"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessageContaining("Cannot create more than one storage-attached index on the same column: v1" );
+
+        ResultSet rows = executeNet("SELECT id FROM %s WHERE v1 = '1'"); // the row inserted before index_1 was created is still returned
+        assertEquals(1, rows.all().size());
+    }
+
+    @Test
+    public void shouldIndexBuildingWithInMemoryData() throws Throwable
+    { // Inserts rows, creates the index afterwards with no flush() in between, and expects every row to be returned.
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        int rowCount = 10;
+        for (int i = 0; i < rowCount; i++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES ('" + i + "', " + i + ", '0')"); // v1 takes the values 0..rowCount-1
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0"); // v1>=0 covers every inserted value
+        assertEquals(rowCount, rows.all().size());
+    }
+
+    @Test
+    public void shouldIndexExistingMemtableOnCreationWithConcurrentFlush() throws Throwable
+    { // Holds the initial index build at a barrier, flushes the memtable meanwhile, then releases the build and checks the row is found.
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val text)");
+        execute("INSERT INTO %s (id, val) VALUES ('1', 'Camel')");
+
+        Injections.Barrier delayInitializationTask =
+                Injections.newBarrier("delayInitializationTask", 2, false)
+                          .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild"))
+                          .build(); // NOTE(review): presumably the 2 parties are the build thread and this test's countDown() - confirm
+
+        // Create the index, but do not allow the initial index build to begin:
+        Injections.inject(delayInitializationTask);
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'"); // returned index name is not needed here
+
+        // Flush the Memtable's contents, which will feed data to the index as the SSTable is written:
+        flush();
+
+        // Allow the initialization task, which builds the index, to continue:
+        delayInitializationTask.countDown();
+
+        waitForIndexQueryable();
+
+        ResultSet rows = executeNet("SELECT id FROM %s WHERE val = 'Camel'");
+        assertEquals(1, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void shouldRejectQueriesBeforeIndexInitializationFinished() throws Throwable
+    { // Queries the indexed column right after createIndex, without waiting for the index to be queryable, and expects ReadFailureException.
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        int rowCount = 10;
+        for (int i = 0; i < rowCount; i++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES ('" + i + "', " + i + ", '0')");
+
+        Injections.inject(forceFlushPause); // NOTE(review): forceFlushPause is defined outside this method; presumably it stalls the build's flush - confirm
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        assertThatThrownBy(() -> executeNet("SELECT id1 FROM %s WHERE v1>=0")).isInstanceOf(ReadFailureException.class);
+    }
+
+    @Test
+    public void shouldRejectQueriesOnIndexInitializationFailure() throws Throwable
+    { // Injects failNDIInitialializaion before creating the index and expects queries on the indexed column to fail with ReadFailureException.
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        int rowCount = 10;
+        for (int i = 0; i < rowCount; i++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES ('" + i + "', " + i + ", '0')");
+        flush();
+
+        Injections.inject(failNDIInitialializaion); // injection defined outside this method
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForAssert(() -> assertEquals(1, INDEX_BUILD_COUNTER.get())); // wait until exactly one build has been counted
+        waitForCompactions();
+
+        assertThatThrownBy(() -> executeNet("SELECT id1 FROM %s WHERE v1>=0")).isInstanceOf(ReadFailureException.class);
+    }
+
+    @Test
+    public void shouldReleaseIndexFilesAfterDroppingLastIndex() throws Throwable
+    { // Tracks index file counts across flushes while the v1 index and then the v2 index are dropped; ends with no files and no buffered memory.
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        verifyIndexFiles(0, 0); // NOTE(review): arguments are presumably (v1/numeric, v2/literal) file counts - confirm against verifyIndexFiles
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+        flush();
+        verifyIndexFiles(1, 1);
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(1, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(1, rows.all().size());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0')");
+        flush();
+        verifyIndexFiles(2, 2);
+        verifySSTableIndexes(v1IndexName, 2, 2);
+        verifySSTableIndexes(v2IndexName, 2, 2);
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        dropIndex("DROP INDEX %s." + v1IndexName); // first drop: the v2 index remains
+        verifyIndexFiles(0, 2);
+        verifySSTableIndexes(v1IndexName, 2, 0);
+        verifySSTableIndexes(v2IndexName, 2, 2);
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('2', 2, '0')");
+        flush();
+        verifyIndexFiles(0, 3);
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(3, rows.all().size());
+
+        dropIndex("DROP INDEX %s." + v2IndexName); // second drop: no index is left on the table
+        verifyIndexFiles(0, 0);
+        verifySSTableIndexes(v1IndexName, 0);
+        verifySSTableIndexes(v2IndexName, 0);
+
+        assertEquals("Segment memory limiter should revert to zero on drop.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void shouldCreateIndexFilesAfterMultipleConcurrentIndexCreation() throws Throwable
+    { // Flushes two rows separately, then creates both indexes back to back and waits once before verifying files and query results.
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+        verifyIndexFiles(0, 0); // no index exists yet, so flushing produces no index files
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush();
+        verifyIndexFiles(0, 0);
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2")); // issued without waiting for the v1 index first
+        waitForIndexQueryable();
+        verifyIndexFiles(2, 2);
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void shouldCreateIndexFilesAfterMultipleSequentialIndexCreation() throws Throwable
+    { // Flushes two rows separately, then creates the v1 and v2 indexes one after the other, waiting and verifying after each.
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+        verifyIndexFiles(0, 0); // no index exists yet, so flushing produces no index files
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush();
+        verifyIndexFiles(0, 0);
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+        verifyIndexFiles(2, 0); // only the v1 index has been created so far
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+        verifyIndexFiles(2, 2);
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void shouldReleaseIndexFilesAfterCompaction() throws Throwable
+    { // With both indexes in place, two flushes are followed by compact(); file counts are expected to drop from (2, 2) to (1, 1) with results unchanged.
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        verifyIndexFiles(0, 0);
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+        verifyIndexFiles(1, 1);
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(1, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(1, rows.all().size());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush();
+        verifyIndexFiles(2, 2);
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        compact(); // explicit compaction
+        waitForAssert(() -> verifyIndexFiles(1, 1)); // retried via waitForAssert rather than checked once
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(2, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(2, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero after compaction.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void truncateWithBuiltIndexes() throws Throwable
+    { // Runs the shared truncate scenario with the indexes created before any data is written.
+        verifyTruncateWithIndex(false);
+    }
+
+    @Test
+    public void concurrentTruncateWithIndexBuilding() throws Throwable
+    { // Runs the shared truncate scenario with the indexes created after the data, immediately before truncating.
+        verifyTruncateWithIndex(true);
+    }
+
+    private void verifyTruncateWithIndex(boolean concurrentTruncate) throws Throwable // true: create the indexes after the data and truncate right away
+    { // Writes 100 rows, truncates, and checks that no index files, SSTable indexes or buffered segment memory remain.
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        if (!concurrentTruncate)
+        {
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        }
+
+        // create 100 rows, half in sstable and half in memtable
+        int num = 100;
+        for (int i = 0; i < num; i++)
+        {
+            if (i == num / 2)
+                flush();
+            execute("INSERT INTO %s (id1, v1, v2) VALUES ('" + i + "', 0, '0');");
+        }
+
+        if (concurrentTruncate)
+        {
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1")); // returned names are unused; the checks below regenerate the default names
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+            truncate(true);
+            waitForIndexQueryable();
+        }
+        else
+        {
+            truncate(true);
+        }
+
+        waitForAssert(() -> verifyIndexFiles(0, 0));
+
+        // verify index-view-manager has been cleaned up
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V1_COLUMN_IDENTIFIER), 0);
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V2_COLUMN_IDENTIFIER), 0);
+
+        assertEquals("Segment memory limiter should revert to zero after truncate.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    @Test
+    public void verifyRebuildCorruptedFiles() throws Throwable
+    { // For every CorruptionType, runs the corruption/recovery helper once with rebuild == false and once with rebuild == true.
+        // prepare schema and data
+        createTable(CREATE_TABLE_TEMPLATE);
+        String numericIndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String stringIndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush(); // a single flush after both inserts
+
+        for (CorruptionType corruptionType : CorruptionType.values())
+        {
+            verifyRebuildCorruptedFiles(numericIndexName, stringIndexName, corruptionType, false);
+            verifyRebuildCorruptedFiles(numericIndexName, stringIndexName, corruptionType, true);
+        }
+
+        assertEquals("Segment memory limiter should revert to zero following rebuild.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    private void verifyRebuildCorruptedFiles(String numericIndexName,
+                                             String stringIndexName,
+                                             CorruptionType corruptionType,
+                                             boolean rebuild) throws Throwable
+    { // Applies one corruption type to each per-SSTable component, each numeric-index component, and each string-index component in turn.
+        for (IndexComponents.IndexComponent component : IndexComponents.PER_SSTABLE_COMPONENTS)
+            verifyRebuildIndexComponent(numericIndexName, stringIndexName, component, corruptionType, true, true, rebuild); // both indexes expected to fail
+
+        for (IndexComponents.IndexComponent component : IndexComponents.perColumnComponents(numericIndexName, false))
+            verifyRebuildIndexComponent(numericIndexName, stringIndexName, component, corruptionType, false, true, rebuild); // only the numeric index expected to fail
+
+        for (IndexComponents.IndexComponent component : IndexComponents.perColumnComponents(stringIndexName, true))
+            verifyRebuildIndexComponent(numericIndexName, stringIndexName, component, corruptionType, true, false, rebuild); // only the string index expected to fail
+    }
+
+    private void verifyRebuildIndexComponent(String numericIndexName,
+                                             String stringIndexName,
+                                             IndexComponents.IndexComponent component, // the component to corrupt
+                                             CorruptionType corruptionType,
+                                             boolean failedStringIndex, // whether the corruption is expected to break the string (v2) index
+                                             boolean failedNumericIndex, // whether the corruption is expected to break the numeric (v1) index
+                                             boolean rebuild) throws Throwable // true: recover via rebuildIndexes; false: reload, observe, then run the initialization task
+    { // Corrupts one component, checks that checksum validation reflects it, recovers, and verifies both indexes answer queries again.
+        boolean encrypted = Boolean.parseBoolean(System.getProperty("cassandra.test.encryption", "false"));
+
+        // The completion markers are valid if they exist on the file system so we only need to test
+        // their removal. If we are testing with encryption then we don't want to test any components
+        // that are encryptable unless they have been removed because encrypted components aren't
+        // checksum validated.
+        if ((component.ndiType.completionMarker() || (encrypted && component.ndiType.encryptable())) && (corruptionType != CorruptionType.REMOVED))
+            return;
+
+        int rowCount = 2;
+
+        // initial verification
+        verifySSTableIndexes(numericIndexName, 1);
+        verifySSTableIndexes(stringIndexName, 1);
+        verifyIndexFiles(1, 1, 1, 2);
+        assertTrue(verifyChecksum(createColumnContext("v1", numericIndexName, Int32Type.instance)));
+        assertTrue(verifyChecksum(createColumnContext("v2", stringIndexName, UTF8Type.instance)));
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(rowCount, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(rowCount, rows.all().size());
+
+        // corrupt file
+        corruptNDIComponent(component, corruptionType);
+
+        // If we are removing completion markers then the rest of the components should still have
+        // valid checksums.
+        boolean expectedNumericState = !failedNumericIndex || component.ndiType.completionMarker();
+        boolean expectedLiteralState = !failedStringIndex || component.ndiType.completionMarker();
+
+        assertEquals(expectedNumericState, verifyChecksum(createColumnContext("v1", numericIndexName, Int32Type.instance)));
+        assertEquals(expectedLiteralState, verifyChecksum(createColumnContext("v2", stringIndexName, UTF8Type.instance)));
+
+        if (rebuild)
+        {
+            rebuildIndexes(numericIndexName, stringIndexName);
+        }
+        else
+        {
+            // Reload all SSTable indexes to manifest the corruption:
+            reloadSSTableIndex();
+
+            // Verify the index cannot be read:
+            verifySSTableIndexes(numericIndexName, component.ndiType.perSSTable() ? 0 : 1, failedNumericIndex ? 0 : 1);
+            verifySSTableIndexes(stringIndexName, component.ndiType.perSSTable() ? 0 : 1, failedStringIndex ? 0 : 1);
+
+            try
+            {
+                // If the corruption is that a file is missing entirely, the index won't be marked non-queryable...
+                rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+                assertEquals(failedNumericIndex ? 0 : rowCount, rows.all().size());
+            }
+            catch (ReadFailureException e)
+            {
+                // ...but most kinds of corruption will result in the index being non-queryable.
+            }
+
+            try
+            {
+                // If the corruption is that a file is missing entirely, the index won't be marked non-queryable...
+                rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+                assertEquals(failedStringIndex ? 0 : rowCount, rows.all().size());
+            }
+            catch (ReadFailureException e)
+            {
+                // ...but most kinds of corruption will result in the index being non-queryable.
+            }
+
+            // Simulate the index repair that would occur on restart:
+            runInitializationTask();
+        }
+
+        // verify indexes are recovered
+        verifySSTableIndexes(numericIndexName, 1);
+        verifySSTableIndexes(stringIndexName, 1);
+        verifyIndexFiles(1, 1, 1, 2);
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(rowCount, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(rowCount, rows.all().size());
+    }
+
+    @Test
+    public void verifyCleanupFailedPerIndexFiles() throws Throwable
+    { // With failPerIndexMetaCompletion enabled, builds an index over two flushed SSTables and checks which files remain afterwards.
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush();
+
+        // Inject failure
+        Injections.inject(failPerIndexMetaCompletion);
+        failPerIndexMetaCompletion.enable();
+
+        try
+        {
+            // Create a new index, which will actuate a build compaction and fail, but leave the node running...
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+            // two index builders running in different compaction threads because of parallelised index initial build
+            waitForAssert(() -> assertEquals(2, INDEX_BUILD_COUNTER.get()));
+            waitForCompactionsFinished();
+
+            // Only token/offset files for the first SSTable in the compaction task should exist, while column-specific files are blown away:
+            verifyIndexFiles(2, 0, 0, 0);
+
+            assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+            assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+        }
+        finally
+        {
+            failPerIndexMetaCompletion.disable(); // disabled even when an assertion above fails
+        }
+    }
+
+    @Test
+    public void verifyCleanupFailedTokenOffsetFiles() throws Throwable
+    { // With failPerSSTableTokenAdd enabled, builds an index over two flushed SSTables and checks that no index files remain afterwards.
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '0');");
+        flush();
+
+        // Inject failure
+        Injections.inject(failPerSSTableTokenAdd);
+        failPerSSTableTokenAdd.enable();
+
+        try
+        {
+            // Create a new index, which will actuate a build compaction and fail, but leave the node running...
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+            // two index builders running in different compaction threads because of parallelised index initial build
+            waitForAssert(() -> assertEquals(2, INDEX_BUILD_COUNTER.get()));
+            waitForAssert(() -> assertEquals(0, getCompactionTasks())); // wait until getCompactionTasks() reports 0
+
+            // SSTable-level token/offset file(s) should be removed, while column-specific files never existed:
+            verifyIndexFiles(0, 0);
+
+            assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+            assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+        }
+        finally
+        {
+            failPerSSTableTokenAdd.disable(); // disabled even when an assertion above fails
+        }
+    }
+
+    /** Flushes and compacts SSTables whose rows set only id1, checking index files and that v1/v2 queries return no rows. */
+    @Test
+    public void verifyFlushAndCompactEmptyIndex() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        // flush empty index: both rows set only id1, so neither v1 nor v2 has a value to index
+        execute("INSERT INTO %s (id1) VALUES ('0');");
+        flush();
+        execute("INSERT INTO %s (id1) VALUES ('1');");
+        flush();
+
+        verifyIndexFiles(2, 0, 0, 4);
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(0, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(0, rows.all().size());
+
+        // compact empty index; the expected file counts drop from (2, 0, 0, 4) to (1, 0, 0, 2)
+        compact();
+        waitForAssert(() -> verifyIndexFiles(1, 0, 0, 2));
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(0, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(0, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    /** Runs the shared verifyFlushAndCompactEmptyIndexes scenario with rows that set only the id1 column. */
+    @Test
+    public void verifyFlushAndCompactNonIndexableRows() throws Throwable
+    {
+        // valid row ids, but no valid indexable content
+        Runnable populateData = () -> {
+            try
+            {
+                execute("INSERT INTO %s (id1) VALUES ('0');");
+                flush();
+
+                execute("INSERT INTO %s (id1) VALUES ('1');");
+                flush();
+            }
+            catch (Throwable e)
+            {
+                throw Throwables.unchecked(e); // Runnable.run() cannot declare checked exceptions
+            }
+        };
+
+        verifyFlushAndCompactEmptyIndexes(populateData);
+    }
+
+    @Test
+    public void verifyFlushAndCompactTombstones() throws Throwable
+    {
+        // no valid row ids: only DELETE statements are written before each flush
+        Runnable populateData = () -> {
+            try
+            {
+                execute("DELETE FROM %s WHERE id1 = '0'");
+                flush();
+
+                execute("DELETE FROM %s WHERE id1 = '1'");
+                flush();
+            }
+            catch (Throwable e)
+            {
+                throw Throwables.unchecked(e); // Runnable.run() cannot declare checked exceptions
+            }
+        };
+
+        verifyFlushAndCompactEmptyIndexes(populateData); // shared scenario: index files and empty query results
+    }
+
+    /** Shared scenario: creates v1 and v2 indexes, runs populateData, then checks index files and empty query results before and after compaction. */
+    private void verifyFlushAndCompactEmptyIndexes(Runnable populateData) throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+
+        populateData.run(); // both callers in this class write and flush twice
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V1_COLUMN_IDENTIFIER), 2, 0);
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V2_COLUMN_IDENTIFIER), 2, 0);
+        verifyIndexFiles(2, 0, 0, 4);
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(0, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(0, rows.all().size());
+
+        // compact empty index; the expected counts drop from 2 to 1 sstable indexes and from (2, 0, 0, 4) to (1, 0, 0, 2) files
+        compact();
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V1_COLUMN_IDENTIFIER), 1, 0);
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V2_COLUMN_IDENTIFIER), 1, 0);
+        waitForAssert(() -> verifyIndexFiles(1, 0, 0, 2));
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(0, rows.all().size());
+        rows = executeNet("SELECT id1 FROM %s WHERE v2='0'");
+        assertEquals(0, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+    }
+
+    /** Drops an index while its initial build is held at an injected barrier, then checks that the index can be created again and queried. */
+    @Test
+    public void droppingIndexStopInitialIndexBuild() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        int num = 100;
+        for (int i = 0; i < num; i++)
+        {
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+        }
+        flush();
+
+        Injections.Barrier delayIndexBuilderCompletion = Injections.newBarrier("delayIndexBuilder", 2, false)
+                                                                   .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndexBuilder.class).onMethod("build"))
+                                                                   .build();
+
+        Injections.inject(delayIndexBuilderCompletion);
+        String indexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForAssert(() -> assertEquals(1, delayIndexBuilderCompletion.getCount()));
+
+        dropIndex("DROP INDEX %s." + indexName);
+
+        // let the blocked builder continue
+        delayIndexBuilderCompletion.countDown();
+        waitForCompactions();
+        delayIndexBuilderCompletion.disable(); // NOTE(review): not in a finally block, unlike the failure-injection tests above
+
+        verifySSTableIndexes(indexName, 0);
+        assertFalse("Expect index not built", SystemKeyspace.isIndexBuilt(KEYSPACE, indexName));
+
+        // create index again, it should succeed
+        indexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+        verifySSTableIndexes(indexName, 1);
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(num, rows.all().size());
+    }
+
+    /** Interrupts the initial index build via CompactionManager.stopCompaction, checks the index is left non-queryable and empty, then rebuilds it. */
+    @Test
+    public void nodetoolStopInitialIndexBuild() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        disableCompaction(KEYSPACE);
+
+        // write 100 rows and flush them into 1 sstable
+        int num = 100;
+        int sstable = 1;
+        for (int i = 0; i < num; i++)
+        {
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+        }
+        flush();
+
+        Injections.Barrier delayIndexBuilderCompletion = Injections.newBarrierAwait("delayIndexBuilder", 1, true)
+                                                                   .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndexBuilder.class).onMethod("build"))
+                                                                   .build();
+
+        Injections.inject(delayIndexBuilderCompletion);
+        String indexv1Name = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        // Stop initial index build by interrupting active and pending compactions (at most 10 attempts, 3 seconds apart)
+        int attempt = 10;
+        while (getCompactionTasks() > 0 && attempt > 0)
+        {
+            // only interrupts active compactions, not pending compactions.
+            CompactionManager.instance.stopCompaction(OperationType.INDEX_BUILD.name());
+            // let the blocked builder continue, but still block pending builder threads
+            delayIndexBuilderCompletion.reset();
+            Thread.sleep(3000);
+            attempt--;
+        }
+        if (getCompactionTasks() > 0)
+            fail("Compaction tasks are not interrupted.");
+
+        delayIndexBuilderCompletion.disable();
+
+        // initial index builder should have stopped abruptly resulting in the index not being queryable
+        verifyInitialIndexFailed(indexv1Name);
+        assertFalse(isIndexQueryable());
+
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        for (Index i : cfs.indexManager.listIndexes())
+        {
+            StorageAttachedIndex index = (StorageAttachedIndex) i;
+            assertTrue(index.getContext().getLiveMemtables().isEmpty());
+
+            View view = index.getContext().getView();
+            assertTrue("Expect index build stopped", view.getIndexes().isEmpty());
+        }
+
+        assertEquals("Segment memory limiter should revert to zero on interrupted compactions.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        // rebuild index
+        ColumnFamilyStore.rebuildSecondaryIndex(KEYSPACE, currentTable(), indexv1Name);
+
+        verifyIndexFiles(sstable, 0);
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(num, rows.all().size());
+
+        assertEquals("Segment memory limiter should revert to zero following rebuild.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        assertTrue(verifyChecksum(createColumnContext("v1", indexv1Name, Int32Type.instance)));
+    }
+
+    @Test
+    public void shouldRejectQueriesWithCustomExpressions() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String indexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        // %%s is escaped so that a literal %s is still present in the query text after this format call
+        String customExpressionQuery = String.format("SELECT * FROM %%s WHERE expr(%s, 0)", indexName);
+        assertThatThrownBy(() -> executeNet(customExpressionQuery))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessage(String.format(IndexRestrictions.CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName));
+    }
+
+    /** Checks how StorageAttachedIndex.groupBySize splits ten mocked sstables for several values of its second argument. */
+    @Test
+    public void testInitialBuildParallelism()
+    {
+        Function<Long, SSTableReader> createMockSSTable = onDiskLength -> {
+            SSTableReader reader = Mockito.mock(SSTableReader.class);
+            when(reader.onDiskLength()).thenReturn(onDiskLength);
+            return reader;
+        };
+        Function<List<SSTableReader>, List<Long>> toSize = sstables -> sstables.stream().map(SSTableReader::onDiskLength).collect(Collectors.toList());
+
+        // ten mock sstables with on-disk lengths 1..10, total size = 55
+        List<SSTableReader> sstables = LongStream.range(1, 11).boxed().map(createMockSSTable).collect(Collectors.toList());
+
+        // avg = 55, i.e. the total size
+        List<List<SSTableReader>> groups = StorageAttachedIndex.groupBySize(sstables, 1);
+        Iterator<List<SSTableReader>> iterator = groups.iterator();
+        assertEquals(1, groups.size());
+        assertEquals(Arrays.asList(10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), toSize.apply(iterator.next())); // size = 55
+
+        // avg = 27.5
+        groups = StorageAttachedIndex.groupBySize(sstables, 2);
+        iterator = groups.iterator();
+        assertEquals(2, groups.size());
+        assertEquals(Arrays.asList(10L, 9L, 8L, 7L), toSize.apply(iterator.next())); // size = 34
+        assertEquals(Arrays.asList(6L, 5L, 4L, 3L, 2L, 1L), toSize.apply(iterator.next())); // size = 21
+
+        // avg = 18.333
+        groups = StorageAttachedIndex.groupBySize(sstables, 3);
+        iterator = groups.iterator();
+        assertEquals(3, groups.size());
+        assertEquals(Arrays.asList(10L, 9L), toSize.apply(iterator.next())); // size = 19
+        assertEquals(Arrays.asList(8L, 7L, 6L), toSize.apply(iterator.next())); // size = 21
+        assertEquals(Arrays.asList(5L, 4L, 3L, 2L, 1L), toSize.apply(iterator.next())); // size = 15
+
+        // avg = 11
+        groups = StorageAttachedIndex.groupBySize(sstables, 5);
+        iterator = groups.iterator();
+        assertEquals(4, groups.size());
+        assertEquals(Arrays.asList(10L, 9L), toSize.apply(iterator.next())); // size = 19
+        assertEquals(Arrays.asList(8L, 7L), toSize.apply(iterator.next())); // size = 15
+        assertEquals(Arrays.asList(6L, 5L), toSize.apply(iterator.next())); // size = 11
+        assertEquals(Arrays.asList(4L, 3L, 2L, 1L), toSize.apply(iterator.next())); // size = 10
+
+        // avg = 5.5
+        groups = StorageAttachedIndex.groupBySize(sstables, 10);
+        iterator = groups.iterator();
+        assertEquals(7, groups.size());
+        assertEquals(singletonList(10L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(9L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(8L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(7L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(6L), toSize.apply(iterator.next()));
+        assertEquals(Arrays.asList(5L, 4L), toSize.apply(iterator.next()));
+        assertEquals(Arrays.asList(3L, 2L, 1L), toSize.apply(iterator.next()));
+
+        // avg = 2.75
+        groups = StorageAttachedIndex.groupBySize(sstables, 20);
+        iterator = groups.iterator();
+        assertEquals(9, groups.size());
+        assertEquals(singletonList(10L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(9L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(8L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(7L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(6L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(5L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(4L), toSize.apply(iterator.next()));
+        assertEquals(singletonList(3L), toSize.apply(iterator.next()));
+        assertEquals(Arrays.asList(2L, 1L), toSize.apply(iterator.next()));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/PartitionRestrictedQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/PartitionRestrictedQueryTest.java
new file mode 100644
index 000000000000..07efe20106c0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/PartitionRestrictedQueryTest.java
@@ -0,0 +1,211 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Random;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.utils.Pair;
+
+import static org.junit.Assert.assertEquals;
+
+/** Compares single-partition range queries on an indexed table with equivalent ALLOW FILTERING queries on an unindexed reference table. */
+public class PartitionRestrictedQueryTest extends SAITester
+{
+    private static final int LARGE_ROWS = 4096;
+    private static final int LARGE_PARTITIONS = 64;
+    private static final int LARGE_ROWS_PER_PARTITION = LARGE_ROWS / LARGE_PARTITIONS;
+    private static final int LARGE_PARTITIONS_QUERIED = 16;
+
+    private static final int SMALL_ROWS = 512;
+    private static final int SMALL_PARTITIONS = 16;
+    private static final int SMALL_ROWS_PER_PARTITION = SMALL_ROWS / SMALL_PARTITIONS;
+    private static final int SMALL_PARTITIONS_QUERIED = 8;
+
+    private static final String QUERY_TEMPLATE = "SELECT * FROM %s WHERE pk = %d AND value >= %d AND value < %d LIMIT %d";
+    private static final String FILTERING_TEMPLATE = "SELECT * FROM %s WHERE pk = %d AND value >= %d AND value < %d LIMIT %d ALLOW FILTERING";
+    private static final int DEFAULT_LIMIT = 10;
+
+    private static final Random RANDOM = new Random(System.currentTimeMillis()); // NOTE(review): the time-based seed is never logged, so a failing partition choice cannot be replayed
+
+    private String largeTable;
+    private String largeReferenceTable;
+    private String smallTable;
+    private String smallReferenceTable;
+
+    @Before
+    public void setup() throws Throwable
+    {
+        largeTable = createTable("CREATE TABLE %s (pk int, ck int, value int, PRIMARY KEY (pk, ck))");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex' WITH OPTIONS = {'bkd_postings_skip' : 1, 'bkd_postings_min_leaves' : 2}");
+        largeReferenceTable = createTable("CREATE TABLE %s (pk int, ck int, value int, PRIMARY KEY (pk, ck))");
+
+        smallTable = createTable("CREATE TABLE %s (pk int, ck int, value int, PRIMARY KEY (pk, ck))");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex' WITH OPTIONS = {'bkd_postings_skip' : 1, 'bkd_postings_min_leaves' : 2}");
+        smallReferenceTable = createTable("CREATE TABLE %s (pk int, ck int, value int, PRIMARY KEY (pk, ck))");
+
+        String template = "INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)";
+
+        // Stripe the values from 0 -> (LARGE_ROWS - 1) across LARGE_PARTITIONS partitions. This makes it
+        // possible to easily query slices that span all partitions.
+        for (int pk = 0; pk < LARGE_PARTITIONS; pk++)
+        {
+            for (int ck = 0; ck < LARGE_ROWS_PER_PARTITION; ck++)
+            {
+                int value = pk + (ck * LARGE_PARTITIONS);
+
+                // Write to both the indexed and reference tables:
+                execute(String.format(template, KEYSPACE + "." + largeTable), pk, ck, value);
+                execute(String.format(template, KEYSPACE + "." + largeReferenceTable), pk, ck, value);
+            }
+        }
+
+        // Stripe the values from 0 -> (SMALL_ROWS - 1) across SMALL_PARTITIONS partitions. This makes it
+        // possible to easily query slices that span all partitions.
+        for (int pk = 0; pk < SMALL_PARTITIONS; pk++)
+        {
+            for (int ck = 0; ck < SMALL_ROWS_PER_PARTITION; ck++)
+            {
+                int value = pk + (ck * SMALL_PARTITIONS);
+
+                // Write to both the indexed and reference tables:
+                execute(String.format(template, KEYSPACE + "." + smallTable), pk, ck, value);
+                execute(String.format(template, KEYSPACE + "." + smallReferenceTable), pk, ck, value);
+            }
+        }
+    }
+
+    @Test
+    public void shouldQueryLargeNumericRangeInSinglePartition() throws Throwable
+    {
+        for (Pair<Integer, Integer> scenario : buildScenarios(LARGE_ROWS))
+        {
+            for (int i = 0; i < LARGE_PARTITIONS_QUERIED; i++)
+            {
+                verifyPartition(largeTable, largeReferenceTable, scenario.right, scenario.left, LARGE_PARTITIONS, LARGE_ROWS_PER_PARTITION);
+            }
+
+            flush(KEYSPACE, largeTable);
+
+            for (int i = 0; i < LARGE_PARTITIONS_QUERIED; i++)
+            {
+                verifyPartition(largeTable, largeReferenceTable, scenario.right, scenario.left, LARGE_PARTITIONS, LARGE_ROWS_PER_PARTITION);
+            }
+        }
+    }
+
+    @Test
+    public void shouldQuerySmallNumericRangeInSinglePartition() throws Throwable
+    {
+        for (Pair<Integer, Integer> scenario : buildScenarios(SMALL_ROWS))
+        {
+            for (int i = 0; i < SMALL_PARTITIONS_QUERIED; i++)
+            {
+                verifyPartition(smallTable, smallReferenceTable, scenario.right, scenario.left, SMALL_PARTITIONS, SMALL_ROWS_PER_PARTITION);
+            }
+
+            flush(KEYSPACE, smallTable);
+
+            for (int i = 0; i < SMALL_PARTITIONS_QUERIED; i++)
+            {
+                verifyPartition(smallTable, smallReferenceTable, scenario.right, scenario.left, SMALL_PARTITIONS, SMALL_ROWS_PER_PARTITION);
+            }
+        }
+    }
+
+    @Test
+    public void testCount() throws Throwable
+    {
+        ResultSet indexedRows = executeNet(String.format("SELECT count(*) FROM %s WHERE pk = %d AND value >= %d", KEYSPACE + "." + smallTable, 0, SMALL_ROWS / 2));
+        ResultSet filteredRows = executeNet(String.format("SELECT count(*) FROM %s WHERE pk = %d AND value >= %d ALLOW FILTERING", KEYSPACE + "." + smallReferenceTable, 0, SMALL_ROWS / 2));
+        assertEquals(filteredRows.one().getLong(0), indexedRows.one().getLong(0));
+    }
+
+    @Test
+    public void testSum() throws Throwable
+    {
+        ResultSet indexedRows = executeNet(String.format("SELECT sum(value) FROM %s WHERE pk = %d AND value >= %d", KEYSPACE + "." + smallTable, 0, SMALL_ROWS / 2));
+        ResultSet filteredRows = executeNet(String.format("SELECT sum(value) FROM %s WHERE pk = %d AND value >= %d ALLOW FILTERING", KEYSPACE + "." + smallReferenceTable, 0, SMALL_ROWS / 2));
+        assertEquals(filteredRows.one().getInt(0), indexedRows.one().getInt(0));
+    }
+
+    @Test
+    public void testAverage() throws Throwable
+    {
+        ResultSet indexedRows = executeNet(String.format("SELECT avg(value) FROM %s WHERE pk = %d AND value >= %d", KEYSPACE + "." + smallTable, 0, SMALL_ROWS / 2));
+        ResultSet filteredRows = executeNet(String.format("SELECT avg(value) FROM %s WHERE pk = %d AND value >= %d ALLOW FILTERING", KEYSPACE + "." + smallReferenceTable, 0, SMALL_ROWS / 2));
+        assertEquals(filteredRows.one().getInt(0), indexedRows.one().getInt(0));
+    }
+
+    private void verifyPartition(String table, String referenceTable, int max, int min, int numPartitions, int rowsPerPartition) throws Throwable
+    {
+        int pk = RANDOM.nextInt(numPartitions);
+
+        // Compare the index result w/ the result of an equivalent ALLOW FILTERING query:
+        ResultSet indexedRows = executeNet(String.format(QUERY_TEMPLATE, KEYSPACE + "." + table, pk, min, max, DEFAULT_LIMIT));
+        ResultSet filteredRows = executeNet(String.format(FILTERING_TEMPLATE, KEYSPACE + "." + referenceTable, pk, min, max, DEFAULT_LIMIT));
+        verifyRowValues(filteredRows.all(), indexedRows.all());
+
+        // Then do the same thing with a LIMIT of rowsPerPartition + 1, which is high enough to exhaust the partition:
+        indexedRows = executeNet(String.format(QUERY_TEMPLATE, KEYSPACE + "." + table, pk, min, max, rowsPerPartition + 1));
+        filteredRows = executeNet(String.format(FILTERING_TEMPLATE, KEYSPACE + "." + referenceTable, pk, min, max, rowsPerPartition + 1));
+        verifyRowValues(filteredRows.all(), indexedRows.all());
+    }
+
+    private void verifyRowValues(List<Row> expected, List<Row> actual)
+    {
+        assertEquals(expected.size(), actual.size());
+
+        for (int i = 0; i < expected.size(); i++)
+        {
+            assertEquals(expected.get(i).getInt("pk"), actual.get(i).getInt("pk"));
+            assertEquals(expected.get(i).getInt("ck"), actual.get(i).getInt("ck"));
+            assertEquals(expected.get(i).getInt("value"), actual.get(i).getInt("value"));
+        }
+    }
+
+    private List<Pair<Integer, Integer>> buildScenarios(int numRows)
+    {
+        // Each pair is (inclusive lower bound, exclusive upper bound); the tests pass .left as min and .right as max
+        List<Pair<Integer, Integer>> scenarios = new LinkedList<>();
+
+        scenarios.add(Pair.create(numRows / 16, numRows));
+        scenarios.add(Pair.create(numRows / 8, numRows));
+        scenarios.add(Pair.create(numRows / 4, numRows));
+        scenarios.add(Pair.create(numRows / 2, numRows));
+
+        scenarios.add(Pair.create(0, numRows / 16));
+        scenarios.add(Pair.create(0, numRows / 8));
+        scenarios.add(Pair.create(0, numRows / 4));
+        scenarios.add(Pair.create(0, numRows / 2));
+
+        scenarios.add(Pair.create(0, numRows));
+
+        return scenarios;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
new file mode 100644
index 000000000000..41a35a228e8a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+public class QueryCellDeletionsTest extends IndexQuerySupport
+{
+    @Test
+    public void testCellDeletions() throws Throwable
+    {
+        cellDeletions(); // inherited scenario; not defined in this class
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
new file mode 100644
index 000000000000..3099c2c7fafd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+public class QueryRowDeletionsTest extends IndexQuerySupport
+{
+    @Test
+    public void testRowDeletions() throws Throwable
+    {
+        rowDeletions(); // inherited scenario; not defined in this class
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
new file mode 100644
index 000000000000..f37db42b0038
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+public class QueryTimeToLiveTest extends IndexQuerySupport
+{
+    @Test
+    public void testTimeToLive() throws Throwable
+    {
+        timeToLive(); // inherited scenario; not defined in this class
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeoutTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeoutTest.java
new file mode 100644
index 000000000000..a4d66ce90f83
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeoutTest.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import javax.management.ObjectName;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.ReadTimeoutException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.PostingListRangeIterator;
+import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Injects pauses longer than the read/range timeouts into index query paths and expects ReadTimeoutException from each query. */
+public class QueryTimeoutTest extends SAITester
+{
+    private static final int TIMEOUT = 5000;
+    private static final int DELAY = TIMEOUT + (TIMEOUT / 2); // 1.5x TIMEOUT, so an injected pause outlasts the timeouts set in setup()
+
+    private ObjectName queryCountName, queryTimeoutsName;
+
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+        startJMXServer();
+        createMBeanServerConnection();
+
+        DatabaseDescriptor.setRangeRpcTimeout(TIMEOUT);
+        DatabaseDescriptor.setReadRpcTimeout(TIMEOUT);
+
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        // Resolve the metric names before the early return below, so the tests never see them as null
+        queryCountName = objectNameNoIndex("TotalQueriesCompleted", CQLTester.KEYSPACE, currentTable(), TableQueryMetrics.TABLE_QUERY_METRIC_TYPE);
+        queryTimeoutsName = objectNameNoIndex("TotalQueryTimeouts", CQLTester.KEYSPACE, currentTable(), TableQueryMetrics.TABLE_QUERY_METRIC_TYPE);
+
+        if (execute("SELECT * FROM %s").size() > 0)
+        {
+            return;
+        }
+
+        for (int i = 0; i < 100; ++i)
+        {
+            execute("INSERT INTO %s(id1,v1,v2) VALUES (?, ?, ?)", i, i, Integer.toString(i % 5)); // NOTE(review): id1 is bound as an int; other tests using CREATE_TABLE_TEMPLATE write it as a string — confirm
+        }
+        flush();
+
+        execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 10000");
+        execute("SELECT * FROM %s WHERE v2 = '0'");
+    }
+
+    @After
+    public void removeInjections() throws Exception
+    {
+        Injections.deleteAll();
+    }
+
+    @Test
+    public void delayDuringKDTreeIntersectionShouldProvokeTimeoutInReader() throws Throwable
+    {
+        // Pause at the QueryContext.checkpoint invocation inside BKDReader$Intersection.collectPostingLists
+        Injection kdtreeIntersectionDelay = Injections.newPause("kdtree_intersection_delay", DELAY)
+                                                      .add(newInvokePoint().onClass("org.apache.cassandra.index.sai.disk.v1.BKDReader$Intersection")
+                                                                           .onMethod("collectPostingLists")
+                                                                           .at("INVOKE QueryContext.checkpoint"))
+                                                      .build();
+
+        Injections.inject(kdtreeIntersectionDelay);
+
+        assertThatThrownBy(() -> executeNet("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 10000")).isInstanceOf(ReadTimeoutException.class);
+
+        waitForEquals(queryCountName, queryTimeoutsName);
+    }
+
+    @Test
+    public void delayDuringTermsReaderMatchShouldProvokeTimeoutInReader() throws Throwable
+    {
+        Injection termsMatchDelay = Injections.newPause("terms_match_delay", DELAY)
+                                              .add(newInvokePoint().onClass("org.apache.cassandra.index.sai.disk.v1.TermsReader$TermQuery")
+                                                                   .onMethod("execute")
+                                                                   .at("INVOKE QueryContext.checkpoint"))
+                                              .build();
+
+        Injections.inject(termsMatchDelay);
+
+        assertThatThrownBy(() -> executeNet("SELECT * FROM %s WHERE v2 = '1'")).isInstanceOf(ReadTimeoutException.class);
+
+        waitForEquals(queryCountName, queryTimeoutsName);
+    }
+
+    @Test
+    public void delayDuringTokenLookupShouldProvokeTimeoutInRangeIterator() throws Throwable
+    {
+        Injection tokenLookupDelay = Injections.newPause("token_lookup_delay", DELAY)
+                                               .add(newInvokePoint().onClass(PostingListRangeIterator.class)
+                                                                    .onMethod("computeNext")
+                                                                    .at("INVOKE QueryContext.checkpoint"))
+                                               .build();
+
+        Injections.inject(tokenLookupDelay);
+
+        assertThatThrownBy(() -> executeNet("SELECT * FROM %s WHERE v2 = '1'")).isInstanceOf(ReadTimeoutException.class);
+
+        waitForEquals(queryCountName, queryTimeoutsName);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
new file mode 100644
index 000000000000..bc58a8370209
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+/** Runs the write-lifecycle scenario; {@code writeLifecycle()} is not declared here but comes from the {@link IndexQuerySupport} hierarchy. */
+public class QueryWriteLifecycleTest extends IndexQuerySupport
+{
+    @Test
+    public void testWriteLifecycle() throws Throwable
+    {
+        writeLifecycle();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
new file mode 100644
index 000000000000..908c8617ae3b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+/**
+ * Forces segments to be generated on compaction by setting the SAI segment write buffer space to zero (a tiny RAM budget), to test segment splitting.
+ */
+public class TinySegmentQueryCellDeletionsTest extends IndexQuerySupport
+{
+    @Before
+    public void setSegmentWriteBufferSpace() throws Throwable
+    {
+        DatabaseDescriptor.setSAISegmentWriteBufferSpace(0); // NOTE(review): never restored in this class - confirm a base class resets it
+    }
+
+    @Test
+    public void testCellDeletions() throws Throwable
+    {
+        cellDeletions(); // scenario body is not declared here; it comes from the IndexQuerySupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
new file mode 100644
index 000000000000..0ac830494c14
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+/**
+ * Forces segments to be generated on compaction by setting the SAI segment write buffer space to zero (a tiny RAM budget), to test segment splitting.
+ */
+public class TinySegmentQueryRowDeletionsTest extends IndexQuerySupport
+{
+    @Before
+    public void setSegmentWriteBufferSpace() throws Throwable
+    {
+        DatabaseDescriptor.setSAISegmentWriteBufferSpace(0); // NOTE(review): never restored in this class - confirm a base class resets it
+    }
+
+    @Test
+    public void testRowDeletions() throws Throwable
+    {
+        rowDeletions(); // scenario body is not declared here; it comes from the IndexQuerySupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
new file mode 100644
index 000000000000..0b1388de9158
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+/**
+ * Forces segments to be generated on compaction by setting the SAI segment write buffer space to zero (a tiny RAM budget), to test segment splitting.
+ */
+public class TinySegmentQueryTimeToLiveTest extends IndexQuerySupport
+{
+    @Before
+    public void setSegmentWriteBufferSpace() throws Throwable
+    {
+        DatabaseDescriptor.setSAISegmentWriteBufferSpace(0); // NOTE(review): never restored in this class - confirm a base class resets it
+    }
+
+    @Test
+    public void testTimeToLive() throws Throwable
+    {
+        timeToLive(); // scenario body is not declared here; it comes from the IndexQuerySupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
new file mode 100644
index 000000000000..4999fafa51ef
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+/**
+ * Forces segments to be generated on compaction by setting the SAI segment write buffer space to zero (a tiny RAM budget), to test segment splitting.
+ */
+public class TinySegmentQueryWriteLifecycleTest extends IndexQuerySupport
+{
+    @Before
+    public void setSegmentWriteBufferSpace() throws Throwable
+    {
+        DatabaseDescriptor.setSAISegmentWriteBufferSpace(0); // NOTE(review): never restored in this class - confirm a base class resets it
+    }
+
+    @Test
+    public void testWriteLifecycle() throws Throwable
+    {
+        writeLifecycle(); // scenario body is not declared here; it comes from the IndexQuerySupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TokenCollisionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TokenCollisionTest.java
new file mode 100644
index 000000000000..3b8a1cd65f46
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TokenCollisionTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.List;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.Row;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.LengthPartitioner;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.service.StorageService;
+
+import static org.junit.Assert.assertEquals;
+
+public class TokenCollisionTest extends SAITester
+{
+    @BeforeClass
+    public static void setupCQLTester()
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(LengthPartitioner.instance);
+        StorageService.instance.setPartitionerUnsafe(LengthPartitioner.instance);
+    }
+
+    @Before
+    public void setup()
+    {
+        requireNetwork();
+    }
+
+    @Test
+    public void testSkippingWhenTokensCollide() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk text, value text, PRIMARY KEY (pk))");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+
+        // 640 rows = 5 blocks x 128 postings, so the postings skip table will hold 5 entries.
+        final int rowCount = 640;
+        for (int row = 0; row < rowCount; row++)
+            execute("INSERT INTO %s (pk, value) VALUES (?, ?)", String.format("%3d", row), "abc");
+        flush();
+
+        // Every key is padded to 3 characters, and all indexed rows end up sharing one token. The index
+        // therefore advances the token flow to that shared token, which triggers a binary search over a
+        // postings skip table that looks like [3, 3, 3, 3, 3].
+        String query = "SELECT * FROM %s WHERE token(pk) >= token('000') AND value = 'abc'";
+        List<Row> matched = executeNet(query).all();
+        // No row may be skipped: all of them have to come back.
+        assertEquals(rowCount, matched.size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/AsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/AsciiTest.java
new file mode 100644
index 000000000000..82f5f745d2ce
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/AsciiTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class AsciiTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} expands to the n-th parameter value
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new DataSet.AsciiDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public AsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; comes from the IndexingTypeSupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/BigintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/BigintTest.java
new file mode 100644
index 000000000000..a361acd5211e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/BigintTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class BigintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} expands to the n-th parameter value
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new DataSet.BigintDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public BigintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; comes from the IndexingTypeSupport hierarchy
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
new file mode 100644
index 000000000000..99232c2118ee
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
@@ -0,0 +1,679 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.time.Instant;
+import java.time.temporal.ChronoUnit;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Date;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.marshal.InetAddressType;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.serializers.SimpleDateSerializer;
+import org.apache.cassandra.serializers.TimeSerializer;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public abstract class DataSet<T> extends CQLTester
+{
+    public T[] values;
+
+    public abstract QuerySet querySet();
+
+    public Collection<String> decorateIndexColumn(String column)
+    {
+        return Arrays.asList(column); // this base implementation returns the column unchanged, as a one-element list
+    }
+
+    public static abstract class NumericDataSet<T extends Number> extends DataSet<T>
+    {
+        NumericDataSet()
+        {
+            values = emptyValues();
+            // View backed by the array: contains() sees each slot as soon as it is assigned (the rest are null).
+            List<T> taken = Arrays.asList(values);
+            for (int slot = 0; slot < values.length; slot += 2)
+            {
+                T base, successor;
+                do
+                {
+                    T candidate = nextValue();
+                    base = getRandom().nextBoolean() ? negate(candidate) : abs(candidate);
+                    successor = increment(base);
+                }
+                while (taken.contains(base) || taken.contains(successor));
+                values[slot] = base;
+                values[slot + 1] = successor;
+            }
+            Arrays.sort(values);
+        }
+
+        abstract T[] emptyValues(); // backing array; its length fixes how many values are generated
+
+        abstract T nextValue(); // raw random candidate, before the sign is forced
+
+        abstract T negate(T value);
+
+        abstract T abs(T value);
+
+        abstract T increment(T value); // yields the second value of each generated pair
+
+        public QuerySet querySet()
+        {
+            return new QuerySet.NumericQuerySet(this);
+        }
+    }
+
+    public static class IntDataSet extends NumericDataSet<Integer>
+    {
+        @Override
+        Integer[] emptyValues()
+        {
+            return new Integer[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Integer nextValue()
+        {
+            return getRandom().nextInt();
+        }
+
+        @Override
+        Integer negate(Integer value)
+        {
+            return value <= 0 ? value : -value; // non-positive input is returned as is
+        }
+
+        @Override
+        Integer abs(Integer value)
+        {
+            return value >= 0 ? value : Math.abs(value); // Integer.MIN_VALUE has no positive counterpart and stays negative
+        }
+
+        @Override
+        Integer increment(Integer value)
+        {
+            return value + 1; // wraps around at Integer.MAX_VALUE
+        }
+
+        public String toString()
+        {
+            return "int";
+        }
+    }
+
+    public static class BigintDataSet extends NumericDataSet<Long>
+    {
+        @Override
+        Long[] emptyValues()
+        {
+            return new Long[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Long nextValue()
+        {
+            return getRandom().nextLong();
+        }
+
+        @Override
+        Long negate(Long value)
+        {
+            return value <= 0 ? value : -value; // non-positive input is returned as is
+        }
+
+        @Override
+        Long abs(Long value)
+        {
+            return value >= 0 ? value : Math.abs(value); // Long.MIN_VALUE has no positive counterpart and stays negative
+        }
+
+        @Override
+        Long increment(Long value)
+        {
+            return value + 1; // wraps around at Long.MAX_VALUE
+        }
+
+        public String toString()
+        {
+            return "bigint";
+        }
+    }
+
+    public static class SmallintDataSet extends NumericDataSet<Short>
+    {
+        @Override
+        Short[] emptyValues()
+        {
+            return new Short[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Short nextValue()
+        {
+            return getRandom().nextShort();
+        }
+
+        @Override
+        Short negate(Short value)
+        {
+            return value <= 0 ? value : (short) -value; // non-positive input is returned as is
+        }
+
+        @Override
+        Short abs(Short value)
+        {
+            return value >= 0 ? value : (short) Math.abs(value); // Short.MIN_VALUE narrows back to itself and stays negative
+        }
+
+        @Override
+        Short increment(Short value)
+        {
+            return (short) (value + 1); // wraps around at Short.MAX_VALUE
+        }
+
+        public String toString()
+        {
+            return "smallint";
+        }
+    }
+
+    public static class TinyintDataSet extends NumericDataSet<Byte>
+    {
+        @Override
+        Byte[] emptyValues()
+        {
+            return new Byte[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Byte nextValue()
+        {
+            return getRandom().nextByte();
+        }
+
+        @Override
+        Byte negate(Byte value)
+        {
+            return value <= 0 ? value : (byte) -value; // non-positive input is returned as is
+        }
+
+        @Override
+        Byte abs(Byte value)
+        {
+            return value >= 0 ? value : (byte) Math.abs(value); // Byte.MIN_VALUE narrows back to itself and stays negative
+        }
+
+        @Override
+        Byte increment(Byte value)
+        {
+            return (byte) (value + 1); // wraps around at Byte.MAX_VALUE
+        }
+
+        public String toString()
+        {
+            return "tinyint";
+        }
+    }
+
+    public static class VarintDataSet extends NumericDataSet<BigInteger>
+    {
+        @Override
+        BigInteger[] emptyValues()
+        {
+            return new BigInteger[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        BigInteger nextValue()
+        {
+            return getRandom().nextBigInteger(16, 512);
+        }
+
+        @Override
+        BigInteger negate(BigInteger value)
+        {
+            return value.signum() <= 0 ? value : value.negate(); // zero and negatives are returned as is
+        }
+
+        @Override
+        BigInteger abs(BigInteger value)
+        {
+            return value.signum() >= 0 ? value : value.abs(); // zero and positives are returned as is
+        }
+
+        @Override
+        BigInteger increment(BigInteger value)
+        {
+            return value.add(BigInteger.ONE);
+        }
+
+        public String toString()
+        {
+            return "varint";
+        }
+    }
+
+    public static class DecimalDataSet extends NumericDataSet<BigDecimal>
+    {
+        @Override
+        BigDecimal[] emptyValues()
+        {
+            return new BigDecimal[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        BigDecimal nextValue()
+        {
+            return getRandom().nextBigDecimal(-1_000_000, 1_000_000, -64, 64);
+        }
+
+        @Override
+        BigDecimal negate(BigDecimal value)
+        {
+            return value.signum() <= 0 ? value : value.negate(); // zero and negatives are returned as is
+        }
+
+        @Override
+        BigDecimal abs(BigDecimal value)
+        {
+            return value.signum() >= 0 ? value : value.abs(); // zero and positives are returned as is
+        }
+
+        @Override
+        BigDecimal increment(BigDecimal value)
+        {
+            return value.add(BigDecimal.ONE);
+        }
+
+        public String toString()
+        {
+            return "decimal";
+        }
+    }
+
+
+    public static class FloatDataSet extends NumericDataSet<Float>
+    {
+        @Override
+        Float[] emptyValues()
+        {
+            return new Float[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Float nextValue()
+        {
+            return getRandom().nextFloat();
+        }
+
+        @Override
+        Float negate(Float value)
+        {
+            return value > 0.0f ? -value : value; // only strictly positive input changes sign
+        }
+
+        @Override
+        Float abs(Float value)
+        {
+            return value < 0.0f ? Math.abs(value) : value; // only strictly negative input changes sign
+        }
+
+        @Override
+        Float increment(Float value)
+        {
+            return value + 1.0f;
+        }
+
+        public String toString()
+        {
+            return "float";
+        }
+    }
+
+    public static class DoubleDataSet extends NumericDataSet<Double>
+    {
+        @Override
+        Double[] emptyValues()
+        {
+            return new Double[NUMBER_OF_VALUES];
+        }
+
+        @Override
+        Double nextValue()
+        {
+            return getRandom().nextDouble();
+        }
+
+        @Override
+        Double negate(Double value)
+        {
+            return value > 0.0 ? -value : value; // only strictly positive input changes sign
+        }
+
+        @Override
+        Double abs(Double value)
+        {
+            return value < 0.0 ? Math.abs(value) : value; // only strictly negative input changes sign
+        }
+
+        @Override
+        Double increment(Double value)
+        {
+            return value + 1.0;
+        }
+
+        public String toString()
+        {
+            return "double";
+        }
+    }
+
+    public static class AsciiDataSet extends DataSet<String>
+    {
+        public AsciiDataSet()
+        {
+            values = new String[NUMBER_OF_VALUES];
+            // View backed by the array, so contains() sees each slot as soon as it is assigned.
+            List<String> taken = Arrays.asList(values);
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                String candidate;
+                do
+                {
+                    candidate = getRandom().nextAsciiString(8, 256);
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "ascii";
+        }
+    }
+
+    public static class TextDataSet extends DataSet<String>
+    {
+        public TextDataSet()
+        {
+            values = new String[NUMBER_OF_VALUES];
+            // View backed by the array, so contains() sees each slot as soon as it is assigned.
+            List<String> taken = Arrays.asList(values);
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                String candidate;
+                do
+                {
+                    candidate = getRandom().nextTextString(8, 256);
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "text";
+        }
+    }
+
+    public static class DateDataSet extends DataSet<Integer>
+    {
+        public DateDataSet()
+        {
+            values = new Integer[NUMBER_OF_VALUES];
+            List<Integer> taken = Arrays.asList(values); // array-backed view: reflects slots as they are assigned
+            long earliest = TimeUnit.DAYS.toMillis(Integer.MIN_VALUE);
+            long latest = TimeUnit.DAYS.toMillis(Integer.MAX_VALUE);
+            long span = latest - earliest;
+
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                Integer candidate;
+                do
+                {
+                    long millis = earliest + Math.round(getRandom().nextDouble() * span);
+                    candidate = SimpleDateSerializer.timeInMillisToDay(millis);
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "date";
+        }
+    }
+
+    public static class TimeDataSet extends DataSet<Long>
+    {
+        public TimeDataSet()
+        {
+            values = new Long[NUMBER_OF_VALUES];
+            List<Long> list = Arrays.asList(values); // array-backed view: reflects slots as they are assigned
+            for (int index = 0; index < values.length; index++)
+            {
+                Long value;
+                do
+                {
+                    int hours = getRandom().nextIntBetween(0, 23);
+                    int minutes = getRandom().nextIntBetween(0, 59);
+                    int seconds = getRandom().nextIntBetween(0, 59);
+                    // Bounds are used inclusively (see 0-23, 0-59); hh:mm:ss[.fffffffff] allows at most 9 fraction digits.
+                    long nanos = getRandom().nextIntBetween(0, 999999999);
+                    value = TimeSerializer.timeStringToLong(String.format("%s:%s:%s.%s", hours, minutes, seconds, nanos));
+                }
+                while (list.contains(value)); // retry until the value is not already in the array
+                values[index] = value;
+            }
+            Arrays.sort(values);
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.NumericQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "time";
+        }
+    }
+
+    public static class TimestampDataSet extends DataSet<Date>
+    {
+        public TimestampDataSet()
+        {
+            values = new Date[NUMBER_OF_VALUES];
+            List<Date> taken = Arrays.asList(values); // array-backed view: reflects slots as they are assigned
+            long earliest = Instant.EPOCH.getEpochSecond();
+            long latest = Instant.EPOCH.plus(100 * 365, ChronoUnit.DAYS).getEpochSecond();
+            long span = latest - earliest;
+
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                Date candidate;
+                do
+                {
+                    long epochSecond = earliest + Math.round(getRandom().nextDouble() * span);
+                    candidate = Date.from(Instant.ofEpochSecond(epochSecond));
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "timestamp";
+        }
+    }
+
+    public static class UuidDataSet extends DataSet<UUID>
+    {
+        public UuidDataSet()
+        {
+            values = new UUID[NUMBER_OF_VALUES];
+            // View backed by the array, so contains() sees each slot as soon as it is assigned.
+            List<UUID> taken = Arrays.asList(values);
+
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                UUID candidate;
+                do
+                {
+                    candidate = UUID.randomUUID();
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "uuid";
+        }
+    }
+
+    public static class TimeuuidDataSet extends DataSet<UUID>
+    {
+        public TimeuuidDataSet()
+        {
+            values = new UUID[NUMBER_OF_VALUES];
+            // View backed by the array, so contains() sees each slot as soon as it is assigned.
+            List<UUID> taken = Arrays.asList(values);
+
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                UUID candidate;
+                do
+                {
+                    candidate = UUIDGen.getTimeUUID();
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.LiteralQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "timeuuid";
+        }
+    }
+
+    public static class InetDataSet extends DataSet<InetAddress>
+    {
+        public InetDataSet()
+        {
+            values = new InetAddress[NUMBER_OF_VALUES];
+            // View backed by the array, so contains() sees each slot as soon as it is assigned.
+            List<InetAddress> taken = Arrays.asList(values);
+
+            for (int slot = 0; slot < values.length; slot++)
+            {
+                InetAddress candidate;
+                do
+                {
+                    // Coin flip between a 4-byte (IPv4) and a 16-byte (IPv6) raw address.
+                    byte[] raw = new byte[getRandom().nextBoolean() ? 4 : 16];
+                    getRandom().nextBytes(raw);
+                    try
+                    {
+                        candidate = InetAddress.getByAddress(raw);
+                    }
+                    catch (UnknownHostException e)
+                    {
+                        // getByAddress only rejects illegal lengths, so this is not expected for 4 or 16 bytes.
+                        throw new RuntimeException(e);
+                    }
+                }
+                while (taken.contains(candidate));
+                values[slot] = candidate;
+            }
+
+            // Sort by TypeUtil's comparison of the addresses as encoded for InetAddressType.
+            // NOTE(review): presumably this mirrors the order used by the index - confirm.
+            Arrays.sort(values, (left, right) ->
+                TypeUtil.compare(TypeUtil.encode(ByteBuffer.wrap(left.getAddress()), InetAddressType.instance),
+                                 TypeUtil.encode(ByteBuffer.wrap(right.getAddress()), InetAddressType.instance),
+                                 InetAddressType.instance));
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.NumericQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "inet";
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/DateTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/DateTest.java
new file mode 100644
index 000000000000..11978d14201f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/DateTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class DateTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.DateDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.DateDataSet());
+    }
+
+    public DateTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/DecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/DecimalTest.java
new file mode 100644
index 000000000000..a2552ba6b725
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/DecimalTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class DecimalTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.DecimalDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.DecimalDataSet());
+    }
+
+    public DecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/DoubleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/DoubleTest.java
new file mode 100644
index 000000000000..f60fe6897ff8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/DoubleTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class DoubleTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.DoubleDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.DoubleDataSet());
+    }
+
+    public DoubleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/FloatTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/FloatTest.java
new file mode 100644
index 000000000000..df9045c5ee98
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/FloatTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class FloatTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.FloatDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.FloatDataSet());
+    }
+
+    public FloatTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java b/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
new file mode 100644
index 000000000000..c4677294715a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Before;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.index.sai.SAITester;
+
+/** Shared driver for the parameterized SAI type tests: creates a (pk, ck, value) table, loads it per scenario and queries it. */
+public abstract class IndexingTypeSupport extends SAITester
+{
+    public static final int NUMBER_OF_VALUES = 64;
+
+    protected final DataSet<?> dataset;
+
+    private final boolean widePartitions;
+    private final Scenario scenario;
+    private Object[][] allRows;
+
+    /** Selects when the rows are flushed or compacted and whether the index is created before or after they are written. */
+    public enum Scenario
+    {
+        MEMTABLE_QUERY,
+        SSTABLE_QUERY,
+        MIXED_QUERY,
+        COMPACTED_QUERY,
+        POST_BUILD_QUERY
+    }
+
+    /** Returns the Parameterized rows: every scenario with wide partitions first, then again with narrow ones. */
+    protected static Collection<Object[]> generateParameters(DataSet<?> dataset)
+    {
+        return Arrays.asList(new Object[][]
+        {
+            { dataset, true, Scenario.MEMTABLE_QUERY },
+            { dataset, true, Scenario.SSTABLE_QUERY },
+            { dataset, true, Scenario.COMPACTED_QUERY },
+            { dataset, true, Scenario.MIXED_QUERY },
+            { dataset, true, Scenario.POST_BUILD_QUERY },
+            { dataset, false, Scenario.MEMTABLE_QUERY },
+            { dataset, false, Scenario.SSTABLE_QUERY },
+            { dataset, false, Scenario.COMPACTED_QUERY },
+            { dataset, false, Scenario.MIXED_QUERY },
+            { dataset, false, Scenario.POST_BUILD_QUERY }
+        });
+    }
+
+    public IndexingTypeSupport(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        this.dataset = dataset;
+        this.widePartitions = widePartitions;
+        this.scenario = scenario;
+    }
+
+    /** Creates the table (the value column type is the data set's toString()), disables compaction and generates the rows. */
+    @Before
+    public void createTable()
+    {
+        createTable(String.format("CREATE TABLE %%s (pk int, ck int, value %s, PRIMARY KEY(pk, ck))", dataset));
+        disableCompaction();
+        allRows = generateRows(dataset, widePartitions);
+    }
+
+    /** Indexes and loads the rows, flushing or compacting as the scenario dictates, then runs the data set's query set. */
+    protected void runIndexQueryScenarios() throws Throwable
+    {
+        if (scenario != Scenario.POST_BUILD_QUERY)
+            createIndexesOnValue();
+
+        insertData(this, allRows, scenario);
+
+        switch (scenario)
+        {
+            case SSTABLE_QUERY:
+                flush();
+                break;
+            case COMPACTED_QUERY:
+                flush();
+                compact();
+                break;
+            case POST_BUILD_QUERY:
+                flush();
+                createIndexesOnValue();
+                break;
+        }
+
+        dataset.querySet().runQueries(this, allRows);
+    }
+
+    /** Creates a StorageAttachedIndex on each target from decorateIndexColumn("value"), then calls waitForIndexQueryable(). */
+    private void createIndexesOnValue() throws Throwable
+    {
+        for (String target : dataset.decorateIndexColumn("value"))
+            createIndex(String.format("CREATE CUSTOM INDEX ON %%s(%s) USING 'StorageAttachedIndex'", target));
+        waitForIndexQueryable();
+    }
+
+    /** Inserts every row through the tester, flushing after each NUMBER_OF_VALUES / 8 inserts unless the scenario is MEMTABLE_QUERY. */
+    public static void insertData(CQLTester tester, Object[][] allRows, Scenario scenario) throws Throwable
+    {
+        // NOTE(review): when allRows.length is a multiple of the interval the last insert flushes too, so MIXED_QUERY leaves nothing unflushed - confirm intended
+        int flushInterval = NUMBER_OF_VALUES / 8;
+        int inserted = 0;
+        for (Object[] row : allRows)
+        {
+            tester.execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", row[0], row[1], row[2]);
+            if (scenario != Scenario.MEMTABLE_QUERY && ++inserted % flushInterval == 0)
+                tester.flush();
+        }
+    }
+
+    /** Builds one (pk, ck, value) row per data set value: NUMBER_OF_VALUES / 16 rows per partition when wide, else one row per partition with ck 0. */
+    public static Object[][] generateRows(DataSet<?> dataset, boolean widePartitions)
+    {
+        Object[][] rows = new Object[dataset.values.length][];
+        int rowsPerPartition = NUMBER_OF_VALUES / 16;
+        int clustering = 0;
+        int partition = 1;
+        for (int index = 0; index < rows.length; index++)
+        {
+            rows[index] = row(partition, clustering, dataset.values[index]);
+            if (!widePartitions)
+                partition++;
+            else if (++clustering == rowsPerPartition)
+            {
+                partition++;
+                clustering = 0;
+            }
+        }
+        return rows;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/InetTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/InetTest.java
new file mode 100644
index 000000000000..90d929c5f398
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/InetTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class InetTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.InetDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.InetDataSet());
+    }
+
+    public InetTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/IntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/IntTest.java
new file mode 100644
index 000000000000..29f3e387eb21
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/IntTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class IntTest extends IndexingTypeSupport // runs the shared SAI indexing scenarios against DataSet.IntDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // rows are built by IndexingTypeSupport.generateParameters(DataSet)
+    {
+        return generateParameters(new DataSet.IntDataSet());
+    }
+
+    public IntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // inherited: index, load and query the rows for the configured scenario
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/NumericTypeSortingTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/NumericTypeSortingTest.java
new file mode 100644
index 000000000000..5103bac863bb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/NumericTypeSortingTest.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.math.RoundingMode;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+
+/**
+ * Checks that values encoded with {@link TypeUtil#encode} and compared with {@link TypeUtil#compare} are never
+ * ordered against the natural ordering of randomly generated {@link BigDecimal} and {@link BigInteger} values.
+ */
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+public class NumericTypeSortingTest extends RandomizedTest
+{
+    /** Sorts 10000 random decimals and verifies each adjacent pair keeps its order under the DecimalType encoding. */
+    @Test
+    public void testBigDecimalEncoding()
+    {
+        BigDecimal[] decimals = new BigDecimal[10000];
+        for (int i = 0; i < decimals.length; i++)
+        {
+            // new BigInteger(bits, rnd) is never negative, so adding ONE keeps the divisor non-zero
+            BigDecimal divisor = new BigDecimal(new BigInteger(randomInt(1000), getRandom()).add(BigInteger.ONE));
+            BigDecimal quotient = new BigDecimal(new BigInteger(randomInt(1000), getRandom())).divide(divisor, RoundingMode.HALF_DOWN);
+            decimals[i] = randomBoolean() ? quotient.negate() : quotient;
+        }
+
+        Arrays.sort(decimals, BigDecimal::compareTo);
+
+        // Each adjacent pair must keep its natural order after encoding
+        for (int i = 1; i < decimals.length; i++)
+        {
+            BigDecimal previous = decimals[i - 1];
+            BigDecimal current = decimals[i];
+            assertTrue(previous + " <= " + current, previous.compareTo(current) <= 0);
+
+            ByteBuffer encodedPrevious = TypeUtil.encode(DecimalType.instance.decompose(previous), DecimalType.instance);
+            ByteBuffer encodedCurrent = TypeUtil.encode(DecimalType.instance.decompose(current), DecimalType.instance);
+            assertTrue(previous + " <= " + current, TypeUtil.compare(encodedPrevious, encodedCurrent, DecimalType.instance) <= 0);
+        }
+    }
+
+    /** Sorts 10000 random integers and verifies each adjacent pair keeps its order under the IntegerType encoding. */
+    @Test
+    public void testBigIntEncoding()
+    {
+        BigInteger[] integers = new BigInteger[10000];
+        for (int i = 0; i < integers.length; i++)
+        {
+            // new BigInteger(bits, rnd) is never negative, so adding ONE keeps the divisor non-zero
+            BigInteger divisor = new BigInteger(randomInt(1000), getRandom()).add(BigInteger.ONE);
+            BigInteger quotient = new BigInteger(randomInt(1000), getRandom()).divide(divisor);
+            integers[i] = randomBoolean() ? quotient.negate() : quotient;
+        }
+
+        Arrays.sort(integers, BigInteger::compareTo);
+
+        // Each adjacent pair must keep its natural order after encoding
+        for (int i = 1; i < integers.length; i++)
+        {
+            BigInteger previous = integers[i - 1];
+            BigInteger current = integers[i];
+            assertTrue(previous + " <= " + current, previous.compareTo(current) <= 0);
+
+            ByteBuffer encodedPrevious = TypeUtil.encode(IntegerType.instance.decompose(previous), IntegerType.instance);
+            ByteBuffer encodedCurrent = TypeUtil.encode(IntegerType.instance.decompose(current), IntegerType.instance);
+            assertTrue(previous + " <= " + current, TypeUtil.compare(encodedPrevious, encodedCurrent, IntegerType.instance) <= 0);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
new file mode 100644
index 000000000000..cd2f29c31634
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
@@ -0,0 +1,500 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public abstract class QuerySet extends CQLTester
+{
+    final DataSet<?> dataset;
+
+    QuerySet(DataSet<?> dataset) // package-private; records the data set this query set was created for
+    {
+        this.dataset = dataset;
+    }
+
+    public abstract void runQueries(SAITester tester, Object[][] allRows) throws Throwable; // implementations execute queries via tester and assert the results against allRows
+
+    /** Query set for range-capable types; expected results are slices of allRows, so rows must be ordered by value. */
+    public static class NumericQuerySet extends QuerySet
+    {
+        NumericQuerySet(DataSet<?> dataset)
+        {
+            super(dataset);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            // Query each value for all operators
+            for (int index = 0; index < allRows.length; index++)
+            {
+                assertRows(tester.execute("SELECT * FROM %s WHERE value = ?", allRows[index][2]), new Object[][] { allRows[index] });
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value > ?", allRows[index][2]), Arrays.copyOfRange(allRows, index + 1, allRows.length));
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value >= ?", allRows[index][2]), Arrays.copyOfRange(allRows, index, allRows.length));
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value < ?", allRows[index][2]), Arrays.copyOfRange(allRows, 0, index));
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value <= ?", allRows[index][2]), Arrays.copyOfRange(allRows, 0, index + 1));
+            }
+
+            // Query full range; the upper bound is the last row supplied rather than a fixed NUMBER_OF_VALUES slot
+            assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value >= ? AND value <= ?", allRows[0][2], allRows[allRows.length - 1][2]), allRows);
+
+            // Query random ranges. This selects a series of random ranges and tests the different possible inclusivity
+            // on them. This loops a reasonable number of times to cover as many ranges as possible without taking too long
+            for (int range = 0; range < allRows.length / 4; range++)
+            {
+                int index1 = 0, index2 = 0;
+                while (index1 == index2)
+                {
+                    index1 = getRandom().nextIntBetween(0, allRows.length - 1);
+                    index2 = getRandom().nextIntBetween(0, allRows.length - 1);
+                }
+
+                int min = Math.min(index1, index2);
+                int max = Math.max(index1, index2);
+
+                // lower exclusive -> upper exclusive
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value > ? AND value < ?", allRows[min][2], allRows[max][2]),
+                        Arrays.copyOfRange(allRows, min + 1, max));
+
+                // lower inclusive -> upper exclusive
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value >= ? AND value < ?", allRows[min][2], allRows[max][2]),
+                        Arrays.copyOfRange(allRows, min, max));
+
+                // lower exclusive -> upper inclusive
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value > ? AND value <= ?", allRows[min][2], allRows[max][2]),
+                        Arrays.copyOfRange(allRows, min + 1, max + 1));
+
+                // lower inclusive -> upper inclusive
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value >= ? AND value <= ?", allRows[min][2], allRows[max][2]),
+                        Arrays.copyOfRange(allRows, min, max + 1));
+            }
+        }
+    }
+
+    /** Query set that only issues equality queries, expecting each value to match exactly its own row. */
+    public static class LiteralQuerySet extends QuerySet
+    {
+        LiteralQuerySet(DataSet<?> dataSet)
+        {
+            super(dataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            // Query each value for EQ operator
+            for (Object[] row : allRows)
+                assertRows(tester.execute("SELECT * FROM %s WHERE value = ?", row[2]),
+                           new Object[][] { row });
+        }
+    }
+
+    public static class CollectionQuerySet extends QuerySet
+    {
+        protected DataSet<?> elementDataSet;
+
+        public CollectionQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
+        {
+            super(dataSet);
+            this.elementDataSet = elementDataSet;
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            for (int index = 0; index < allRows.length; index++)
+            {
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ?",
+                        elementDataSet.values[index]), getExpectedRows(elementDataSet.values[index], allRows));
+            }
+
+            for (int and = 0; and < allRows.length / 4; and++)
+            {
+                int index = getRandom().nextIntBetween(0, allRows.length - 1);
+                Iterator valueIterator = ((Collection) allRows[index][2]).iterator();
+                Object value1 = valueIterator.next();
+                Object value2 = valueIterator.hasNext() ? valueIterator.next() : value1;
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ? AND value CONTAINS ?",
+                    value1, value2), getExpectedRows(value1, value2, allRows));
+            }
+        }
+
+        protected Object[][] getExpectedRows(Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Collection)row[2]).contains(value))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedRows(Object value1, Object value2, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Collection)row[2]).contains(value1) && ((Collection)row[2]).contains(value2))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
+    /** Query set that matches whole column values with the EQ operator and tolerates several rows sharing a value. */
+    public static class FrozenCollectionQuerySet extends QuerySet
+    {
+        public FrozenCollectionQuerySet(DataSet<?> dataset)
+        {
+            super(dataset);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            // Every row's value is queried in turn; the expectation is recomputed per value because
+            // more than one row may hold an equal value
+            for (Object[] row : allRows)
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value = ?", row[2]),
+                                        getExpectedRows(row[2], allRows));
+        }
+
+        /** Returns the rows whose value (column index 2) equals {@code value}. */
+        protected Object[][] getExpectedRows(Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+                if (row[2].equals(value))
+                    expected.add(row);
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
+    /** Query set for map columns indexed on their values: CONTAINS is checked against the values of each row's map. */
+    public static class MapValuesQuerySet extends CollectionQuerySet
+    {
+        public MapValuesQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
+        {
+            super(dataSet, elementDataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            for (int index = 0; index < allRows.length; index++)
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ?", elementDataSet.values[index]),
+                                        getExpectedRows(elementDataSet.values[index], allRows));
+
+            for (int and = 0; and < allRows.length / 4; and++)
+            {
+                int index = getRandom().nextIntBetween(0, allRows.length - 1);
+                Object[] candidates = ((Map<?, ?>) allRows[index][2]).values().toArray();
+                // Both operands are drawn from the map's values (not its keys), matching what getExpectedRows checks
+                Object value1 = candidates[getRandom().nextIntBetween(0, candidates.length - 1)];
+                Object value2 = candidates[getRandom().nextIntBetween(0, candidates.length - 1)];
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ? AND value CONTAINS ?", value1, value2),
+                                        getExpectedRows(value1, value2, allRows));
+            }
+        }
+
+        /** Returns the rows whose map (column index 2) has {@code value} among its values. */
+        @Override
+        protected Object[][] getExpectedRows(Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+                if (((Map<?, ?>) row[2]).values().contains(value))
+                    expected.add(row);
+            return expected.toArray(new Object[][]{});
+        }
+
+        /** Returns the rows whose map (column index 2) has both {@code value1} and {@code value2} among its values. */
+        @Override
+        protected Object[][] getExpectedRows(Object value1, Object value2, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+                if (((Map<?, ?>) row[2]).values().contains(value1) && ((Map<?, ?>) row[2]).values().contains(value2))
+                    expected.add(row);
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
+    /** Query set for map columns indexed on their keys: CONTAINS KEY is checked against the key set of each row's map. */
+    public static class MapKeysQuerySet extends CollectionQuerySet
+    {
+        public MapKeysQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
+        {
+            super(dataSet, elementDataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            for (int index = 0; index < allRows.length; index++)
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS KEY ?", elementDataSet.values[index]),
+                                        getExpectedRows(elementDataSet.values[index], allRows));
+
+            for (int and = 0; and < allRows.length / 4; and++)
+            {
+                int index = getRandom().nextIntBetween(0, allRows.length - 1);
+                Object[] keys = ((Map<?, ?>) allRows[index][2]).keySet().toArray();
+                // Two independent random picks from the same map; they may turn out to be the same key
+                Object key1 = keys[getRandom().nextIntBetween(0, keys.length - 1)];
+                Object key2 = keys[getRandom().nextIntBetween(0, keys.length - 1)];
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS KEY ? AND value CONTAINS KEY ?", key1, key2),
+                                        getExpectedRows(key1, key2, allRows));
+            }
+        }
+
+        /** Returns the rows whose map (column index 2) has {@code value} among its keys. */
+        @Override
+        protected Object[][] getExpectedRows(Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+                if (((Map<?, ?>) row[2]).keySet().contains(value))
+                    expected.add(row);
+            return expected.toArray(new Object[][]{});
+        }
+
+        /** Returns the rows whose map (column index 2) has both {@code value1} and {@code value2} among its keys. */
+        @Override
+        protected Object[][] getExpectedRows(Object value1, Object value2, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+                if (((Map<?, ?>) row[2]).keySet().contains(value1) && ((Map<?, ?>) row[2]).keySet().contains(value2))
+                    expected.add(row);
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
+    public static class MapEntriesQuerySet extends CollectionQuerySet
+    {
+        public MapEntriesQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
+        {
+            super(dataSet, elementDataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            for (int index = 0; index < allRows.length; index++)
+            {
+                Map map = (Map)allRows[index][2];
+                Object key = map.keySet().toArray()[0];
+                Object value = map.get(key);
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value[?] = ?",
+                        key, value), getExpectedRows(key, value, allRows));
+            }
+            for (int and = 0; and < allRows.length / 4; and++)
+            {
+                int index = getRandom().nextIntBetween(0, allRows.length - 1);
+                Map map = (Map)allRows[index][2];
+                Object key1 = map.keySet().toArray()[getRandom().nextIntBetween(0, map.keySet().size() - 1)];
+                Object value1 = map.get(key1);
+                Object key2 = map.keySet().toArray()[getRandom().nextIntBetween(0, map.keySet().size() - 1)];
+                Object value2 = map.get(key2);
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value[?] = ? AND value[?] = ?",
+                        key1, value1, key2, value2), getExpectedRows(key1, value1, key2, value2, allRows));
+            }
+        }
+
+        protected Object[][] getExpectedRows(Object key, Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                Map rowMap = (Map)row[2];
+                if (rowMap.containsKey(key))
+                {
+                    if (rowMap.get(key).equals(value))
+                        expected.add(row);
+                }
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedRows(Object key1, Object value1, Object key2, Object value2, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                Map rowMap = (Map)row[2];
+                if (rowMap.containsKey(key1) && rowMap.containsKey(key2))
+                {
+                    if (rowMap.get(key1).equals(value1) && rowMap.get(key2).equals(value2))
+                        expected.add(row);
+                }
+            }
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
+    public static class MultiMapQuerySet extends CollectionQuerySet // runs CONTAINS KEY, CONTAINS and value[k] = v queries against one map column
+    {
+        public MultiMapQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
+        {
+            super(dataSet, elementDataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            for (int index = 0; index < allRows.length; index++) // single-restriction queries, three per row
+            {
+                Map map = (Map)allRows[index][2]; // column 2 of every row holds the map under test
+                Object key = map.keySet().toArray()[0]; // first key returned by the map's key set
+                Object value = map.get(key);
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS KEY ?", key),
+                        getExpectedKeyRows(key, allRows));
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ?", value),
+                        getExpectedValueRows(value, allRows));
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value[?] = ?", key, value),
+                        getExpectedEntryRows(key, value, allRows));
+            }
+            for (int and = 0; and < allRows.length / 4; and++) // ANDed queries, allRows.length / 4 rounds
+            {
+                int index = getRandom().nextIntBetween(0, allRows.length - 1); // one random row supplies both entries
+                Map map = (Map)allRows[index][2];
+                Object key1 = map.keySet().toArray()[getRandom().nextIntBetween(0, map.keySet().size() - 1)];
+                Object value1 = map.get(key1);
+                Object key2 = map.keySet().toArray()[getRandom().nextIntBetween(0, map.keySet().size() - 1)]; // may be the same key as key1
+                Object value2 = map.get(key2);
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS KEY ? AND value CONTAINS KEY ?", key1, key2),
+                                        getExpectedKeyRows(key1, key2, allRows));
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value CONTAINS ? AND value CONTAINS ?", value1, value2),
+                                        getExpectedValueRows(value1, value2, allRows));
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value[?] = ? AND value[?] = ?", key1, value1, key2, value2),
+                                        getExpectedEntryRows(key1, value1, key2, value2, allRows));
+
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value[?] = ? AND value CONTAINS KEY ? AND value CONTAINS ?", key1, value1, key2, value2),
+                        getExpectedMixedRows(key1, value1, key2, value2, allRows));
+            }
+        }
+
+        protected Object[][] getExpectedKeyRows(Object value, Object[][] allRows) // rows whose map has the argument as a key
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Map)row[2]).keySet().contains(value))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedValueRows(Object value, Object[][] allRows) // rows whose map has the argument as a value
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Map)row[2]).values().contains(value))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedEntryRows(Object key, Object value, Object[][] allRows) // rows whose map maps key to an equal value
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                Map rowMap = (Map)row[2];
+                if (rowMap.containsKey(key)) // guard so get(key) is only dereferenced for present keys
+                {
+                    if (rowMap.get(key).equals(value))
+                        expected.add(row);
+                }
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedKeyRows(Object value1, Object value2, Object[][] allRows) // rows whose map has both arguments as keys
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Map)row[2]).keySet().contains(value1) && ((Map)row[2]).keySet().contains(value2))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedValueRows(Object value1, Object value2, Object[][] allRows) // rows whose map has both arguments as values
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (((Map)row[2]).values().contains(value1) && ((Map)row[2]).values().contains(value2))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedEntryRows(Object key1, Object value1, Object key2, Object value2, Object[][] allRows) // rows matching both entries
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                Map rowMap = (Map)row[2];
+                if (rowMap.containsKey(key1) && rowMap.containsKey(key2)) // both keys must exist before their values are compared
+                {
+                    if (rowMap.get(key1).equals(value1) && rowMap.get(key2).equals(value2))
+                        expected.add(row);
+                }
+            }
+            return expected.toArray(new Object[][]{});
+        }
+
+        protected Object[][] getExpectedMixedRows(Object key1, Object value1, Object key2, Object value2, Object[][] allRows) // entry + key + value match
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                Map rowMap = (Map)row[2];
+                if (rowMap.containsKey(key1) && rowMap.containsKey(key2) && rowMap.containsValue(value2)) // key2 and value2 are checked independently
+                {
+                    if (rowMap.get(key1).equals(value1)) // only the first pair is checked as an entry
+                        expected.add(row);
+                }
+            }
+            return expected.toArray(new Object[][]{});
+        }
+    }
+}
+
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/SmallintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/SmallintTest.java
new file mode 100644
index 000000000000..0fe667b9e939
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/SmallintTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class SmallintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a SmallintDataSet
+    {
+        return generateParameters(new DataSet.SmallintDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public SmallintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/TextTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/TextTest.java
new file mode 100644
index 000000000000..c1614472525e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/TextTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class TextTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a TextDataSet
+    {
+        return generateParameters(new DataSet.TextDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public TextTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/TimeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/TimeTest.java
new file mode 100644
index 000000000000..16e117ce0f9d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/TimeTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class TimeTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a TimeDataSet
+    {
+        return generateParameters(new DataSet.TimeDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public TimeTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/TimestampTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/TimestampTest.java
new file mode 100644
index 000000000000..74dc979f4e29
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/TimestampTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class TimestampTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a TimestampDataSet
+    {
+        return generateParameters(new DataSet.TimestampDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public TimestampTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/TimeuuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/TimeuuidTest.java
new file mode 100644
index 000000000000..85b8424ae1ff
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/TimeuuidTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class TimeuuidTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a TimeuuidDataSet
+    {
+        return generateParameters(new DataSet.TimeuuidDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public TimeuuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/TinyintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/TinyintTest.java
new file mode 100644
index 000000000000..cb9ea1c2c1dc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/TinyintTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class TinyintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a TinyintDataSet
+    {
+        return generateParameters(new DataSet.TinyintDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public TinyintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/UuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/UuidTest.java
new file mode 100644
index 000000000000..9793d92a2a33
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/UuidTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class UuidTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a UuidDataSet
+    {
+        return generateParameters(new DataSet.UuidDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public UuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/VarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/VarintTest.java
new file mode 100644
index 000000000000..1f3c826b16bd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/VarintTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class) // JUnit instantiates the class once per parameter row
+public class VarintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter rows built around a VarintDataSet
+    {
+        return generateParameters(new DataSet.VarintDataSet()); // one-argument overload is not declared in this class
+    }
+
+    public VarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // receives one parameter row
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared here; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/CollectionDataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/CollectionDataSet.java
new file mode 100644
index 000000000000..c79d9c98a3fd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/CollectionDataSet.java
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.QuerySet;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public abstract class CollectionDataSet<T> extends DataSet<T>
+{
+    public static class SetDataSet<T> extends CollectionDataSet<Set<T>> // NUMBER_OF_VALUES sets filled with elements of another data set
+    {
+        protected DataSet<T> elementDataSet; // source of the elements; also handed to the query set
+
+        public SetDataSet(DataSet<T> elementDataSet)
+        {
+            values = new Set[NUMBER_OF_VALUES];
+            this.elementDataSet = elementDataSet;
+            for (int index = 0; index < NUMBER_OF_VALUES; index++)
+            {
+                values[index] = new HashSet<>();
+                // Draw the insertion count once per set: a bound re-drawn on every pass biases sets towards small sizes.
+                for (int element = 0, size = getRandom().nextIntBetween(2, 8); element < size; element++)
+                    values[index].add(elementDataSet.values[getRandom().nextIntBetween(0, elementDataSet.values.length - 1)]); // duplicates collapse
+            }
+        }
+
+        @Override
+        public QuerySet querySet() // element-level queries need both this data set and its element data set
+        {
+            return new QuerySet.CollectionQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public String toString() // set<...> text built from the element data set's own toString
+        {
+            return String.format("set<%s>", elementDataSet);
+        }
+    }
+
+    public static class FrozenSetDataSet<T> extends SetDataSet<T> // same generated sets, described as frozen<set<...>>
+    {
+        public FrozenSetDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // FrozenCollectionQuerySet is built from this data set alone
+        {
+            return new QuerySet.FrozenCollectionQuerySet(this);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: FULL(column)
+        {
+            return Arrays.asList(String.format("FULL(%s)", column));
+        }
+
+        public String toString() // frozen<set<...>> text built from the element data set's own toString
+        {
+            return String.format("frozen<set<%s>>", elementDataSet);
+        }
+    }
+
+    public static class ListDataSet<T> extends CollectionDataSet<List<T>> // NUMBER_OF_VALUES lists filled with elements of another data set
+    {
+        protected DataSet<T> elementDataSet; // source of the elements; also handed to the query set
+
+        public ListDataSet(DataSet<T> elementDataSet)
+        {
+            values = new List[NUMBER_OF_VALUES];
+            this.elementDataSet = elementDataSet;
+            for (int index = 0; index < NUMBER_OF_VALUES; index++)
+            {
+                values[index] = new ArrayList<>();
+                // Draw the list length once per list: a bound re-drawn on every pass biases lists towards short lengths.
+                for (int element = 0, size = getRandom().nextIntBetween(2, 8); element < size; element++)
+                    values[index].add(elementDataSet.values[getRandom().nextIntBetween(0, elementDataSet.values.length - 1)]); // repeats are kept
+            }
+        }
+
+        @Override
+        public QuerySet querySet() // element-level queries need both this data set and its element data set
+        {
+            return new QuerySet.CollectionQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public String toString() // list<...> text built from the element data set's own toString
+        {
+            return String.format("list<%s>", elementDataSet);
+        }
+    }
+
+    public static class FrozenListDataSet<T> extends ListDataSet<T> // same generated lists, described as frozen<list<...>>
+    {
+        public FrozenListDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // FrozenCollectionQuerySet is built from this data set alone
+        {
+            return new QuerySet.FrozenCollectionQuerySet(this);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: FULL(column)
+        {
+            return Arrays.asList(String.format("FULL(%s)", column));
+        }
+
+        public String toString() // frozen<list<...>> text built from the element data set's own toString
+        {
+            return String.format("frozen<list<%s>>", elementDataSet);
+        }
+    }
+
+    public static class MapDataSet<T> extends CollectionDataSet<Map<T, T>> // NUMBER_OF_VALUES maps whose keys and values come from one data set
+    {
+        protected DataSet<T> elementDataSet; // source of both keys and values; also handed to the query set
+
+        public MapDataSet(DataSet<T> elementDataSet)
+        {
+            values = new Map[NUMBER_OF_VALUES];
+            this.elementDataSet = elementDataSet;
+            for (int index = 0; index < NUMBER_OF_VALUES; index++)
+            {
+                values[index] = new HashMap<>();
+                // Draw the number of puts once per map: a bound re-drawn on every pass biases maps towards
+                // small sizes. Arguments are evaluated left to right, so the key is still drawn before the value.
+                for (int element = 0, size = getRandom().nextIntBetween(2, 8); element < size; element++)
+                    values[index].put(elementDataSet.values[getRandom().nextIntBetween(0, elementDataSet.values.length - 1)],  // key
+                                      elementDataSet.values[getRandom().nextIntBetween(0, elementDataSet.values.length - 1)]); // value
+            }
+        }
+
+        @Override
+        public QuerySet querySet() // the plain map data set is queried by value
+        {
+            return new QuerySet.MapValuesQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public String toString() // map<...,...> text built from the element data set's own toString
+        {
+            return String.format("map<%s,%s>", elementDataSet, elementDataSet);
+        }
+    }
+
+    public static class FrozenMapValuesDataSet<T> extends MapDataSet<T> // same generated maps, described as frozen<map<...>>
+    {
+        public FrozenMapValuesDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // FrozenCollectionQuerySet is built from this data set alone
+        {
+            return new QuerySet.FrozenCollectionQuerySet(this);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: FULL(column)
+        {
+            return Arrays.asList(String.format("FULL(%s)", column));
+        }
+
+        public String toString() // frozen<map<...,...>> text built from the element data set's own toString
+        {
+            return String.format("frozen<map<%s,%s>>", elementDataSet, elementDataSet);
+        }
+    }
+
+    public static class MapKeysDataSet<T> extends MapDataSet<T> // map data set paired with key queries and a KEYS(...) index target
+    {
+        public MapKeysDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // replaces the MapValuesQuerySet returned by MapDataSet
+        {
+            return new QuerySet.MapKeysQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: KEYS(column)
+        {
+            return Arrays.asList(String.format("KEYS(%s)", column));
+        }
+    }
+
+    public static class MapValuesDataSet<T> extends MapDataSet<T> // map data set paired with value queries and a VALUES(...) index target
+    {
+        public MapValuesDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // same query set type that MapDataSet.querySet() returns
+        {
+            return new QuerySet.MapValuesQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: VALUES(column)
+        {
+            return Arrays.asList(String.format("VALUES(%s)", column));
+        }
+    }
+
+    public static class MapEntriesDataSet<T> extends MapDataSet<T> // map data set paired with entry queries and an ENTRIES(...) index target
+    {
+        public MapEntriesDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // replaces the MapValuesQuerySet returned by MapDataSet
+        {
+            return new QuerySet.MapEntriesQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // single entry: ENTRIES(column)
+        {
+            return Arrays.asList(String.format("ENTRIES(%s)", column));
+        }
+    }
+
+    public static class MultiMapDataSet<T> extends MapDataSet<T> // map data set that yields three index targets for one column
+    {
+        public MultiMapDataSet(DataSet<T> elementDataSet)
+        {
+            super(elementDataSet);
+        }
+
+        @Override
+        public QuerySet querySet() // replaces the MapValuesQuerySet returned by MapDataSet
+        {
+            return new QuerySet.MultiMapQuerySet(this, elementDataSet);
+        }
+
+        @Override
+        public Collection<String> decorateIndexColumn(String column) // three entries, in this order: KEYS, VALUES, ENTRIES
+        {
+            return Arrays.asList(String.format("KEYS(%s)", column),
+                                 String.format("VALUES(%s)", column),
+                                 String.format("ENTRIES(%s)", column));
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListAsciiTest.java
new file mode 100644
index 000000000000..cdccb5eefd71
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenListAsciiTest extends IndexingTypeSupport
+{
+    public FrozenListAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a FrozenListDataSet wrapping an AsciiDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.FrozenListDataSet<>(new DataSet.AsciiDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListDecimalTest.java
new file mode 100644
index 000000000000..369dac9946f5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListDecimalTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenListDecimalTest extends IndexingTypeSupport
+{
+    public FrozenListDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a FrozenListDataSet wrapping a DecimalDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.FrozenListDataSet<>(new DataSet.DecimalDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListIntTest.java
new file mode 100644
index 000000000000..eb213af9df7d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenListIntTest extends IndexingTypeSupport
+{
+    public FrozenListIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a FrozenListDataSet wrapping an IntDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.FrozenListDataSet<>(new DataSet.IntDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListVarintTest.java
new file mode 100644
index 000000000000..a9aa6f55d78f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/FrozenListVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenListVarintTest extends IndexingTypeSupport
+{
+    public FrozenListVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a FrozenListDataSet wrapping a VarintDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.FrozenListDataSet<>(new DataSet.VarintDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListAsciiTest.java
new file mode 100644
index 000000000000..cd3bb1564c0d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListAsciiTest extends IndexingTypeSupport
+{
+    public ListAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping an AsciiDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.AsciiDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListBigintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListBigintTest.java
new file mode 100644
index 000000000000..f90ba655e1c0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListBigintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListBigintTest extends IndexingTypeSupport
+{
+    public ListBigintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a BigintDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.BigintDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDateTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDateTest.java
new file mode 100644
index 000000000000..b1082cfa751a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDateTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListDateTest extends IndexingTypeSupport
+{
+    public ListDateTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a DateDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.DateDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDecimalTest.java
new file mode 100644
index 000000000000..14462024bcf1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDecimalTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListDecimalTest extends IndexingTypeSupport
+{
+    public ListDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a DecimalDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.DecimalDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDoubleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDoubleTest.java
new file mode 100644
index 000000000000..1ad2f938119c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListDoubleTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListDoubleTest extends IndexingTypeSupport
+{
+    public ListDoubleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a DoubleDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.DoubleDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFloatTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFloatTest.java
new file mode 100644
index 000000000000..e3c557fa44c8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFloatTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListFloatTest extends IndexingTypeSupport
+{
+    public ListFloatTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a FloatDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.FloatDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFrozenCollectionTest.java
new file mode 100644
index 000000000000..6a61ce2ad819
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListFrozenCollectionTest extends IndexingTypeSupport
+{
+    public ListFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a FrozenListDataSet of IntDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        DataSet<List<Integer>> frozenIntLists = new CollectionDataSet.FrozenListDataSet<>(new DataSet.IntDataSet());
+        return generateParameters(new CollectionDataSet.ListDataSet<>(frozenIntLists));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListInetTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListInetTest.java
new file mode 100644
index 000000000000..6bb494962cf4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListInetTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListInetTest extends IndexingTypeSupport
+{
+    public ListInetTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping an InetDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.InetDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListIntTest.java
new file mode 100644
index 000000000000..24939fb98893
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListIntTest extends IndexingTypeSupport
+{
+    public ListIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping an IntDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.IntDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListSmallintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListSmallintTest.java
new file mode 100644
index 000000000000..b98965b8a260
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListSmallintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListSmallintTest extends IndexingTypeSupport
+{
+    public ListSmallintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+    /** Parameter rows for the runner, built from a ListDataSet wrapping a SmallintDataSet. */
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.SmallintDataSet()));
+    }
+    /** Sole test case; delegates to the inherited runIndexQueryScenarios(). */
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTextTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTextTest.java
new file mode 100644
index 000000000000..148816b5a0f5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTextTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListTextTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a TextDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.TextDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListTextTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeTest.java
new file mode 100644
index 000000000000..6815fd1200b4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListTimeTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a TimeDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.TimeDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListTimeTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimestampTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimestampTest.java
new file mode 100644
index 000000000000..5d621af3fec6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimestampTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListTimestampTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a TimestampDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.TimestampDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListTimestampTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeuuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeuuidTest.java
new file mode 100644
index 000000000000..de99a61709ee
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTimeuuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListTimeuuidTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a TimeuuidDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.TimeuuidDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListTimeuuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTinyintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTinyintTest.java
new file mode 100644
index 000000000000..abeb2a160f3c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListTinyintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListTinyintTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a TinyintDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.TinyintDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListTinyintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListUuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListUuidTest.java
new file mode 100644
index 000000000000..439d95ce8f98
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListUuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListUuidTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a UuidDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.UuidDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListUuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListVarintTest.java
new file mode 100644
index 000000000000..0dcb16994aa2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/lists/ListVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.lists;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class ListVarintTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a ListDataSet wrapping a VarintDataSet
+    {
+        return generateParameters(new CollectionDataSet.ListDataSet<>(new DataSet.VarintDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public ListVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapAsciiTest.java
new file mode 100644
index 000000000000..be631586c150
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenMapAsciiTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a FrozenMapValuesDataSet wrapping an AsciiDataSet
+    {
+        return generateParameters(new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.AsciiDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public FrozenMapAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapDecimalTest.java
new file mode 100644
index 000000000000..7fcf255bc295
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapDecimalTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenMapDecimalTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a FrozenMapValuesDataSet wrapping a DecimalDataSet
+    {
+        return generateParameters(new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.DecimalDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public FrozenMapDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapIntTest.java
new file mode 100644
index 000000000000..aeea575bbb70
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenMapIntTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a FrozenMapValuesDataSet wrapping an IntDataSet
+    {
+        return generateParameters(new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.IntDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public FrozenMapIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapVarintTest.java
new file mode 100644
index 000000000000..4102bc31e81e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/FrozenMapVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenMapVarintTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a FrozenMapValuesDataSet wrapping a VarintDataSet
+    {
+        return generateParameters(new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.VarintDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public FrozenMapVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapAsciiTest.java
new file mode 100644
index 000000000000..6853c65e7638
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapAsciiTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a MapDataSet wrapping an AsciiDataSet
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.AsciiDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public MapAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapBigintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapBigintTest.java
new file mode 100644
index 000000000000..ad58d4d3017c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapBigintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapBigintTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a MapDataSet wrapping a BigintDataSet
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.BigintDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public MapBigintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDateTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDateTest.java
new file mode 100644
index 000000000000..8e0ec50f6c79
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDateTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapDateTest extends IndexingTypeSupport // parameterized test: each run gets one (dataset, widePartitions, scenario) tuple
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters() // parameter source: a MapDataSet wrapping a DateDataSet
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.DateDataSet())); // one-arg overload is not declared here; presumably inherited
+    }
+
+    public MapDateTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // all three values are forwarded unchanged to the superclass
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably inherited from IndexingTypeSupport
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDecimalTest.java
new file mode 100644
index 000000000000..d3aa8b98d0ca
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDecimalTest.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapDecimalTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping DecimalDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.DecimalDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
+
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDoubleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDoubleTest.java
new file mode 100644
index 000000000000..488c5b8c4d6a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapDoubleTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapDoubleTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping DoubleDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.DoubleDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapDoubleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesAsciiTest.java
new file mode 100644
index 000000000000..ad13985c7e3d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapEntriesAsciiTest extends IndexingTypeSupport // parameterized over a MapEntriesDataSet wrapping AsciiDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapEntriesDataSet<>(new DataSet.AsciiDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapEntriesAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesFrozenCollectionTest.java
new file mode 100644
index 000000000000..f5405275cd98
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapEntriesFrozenCollectionTest extends IndexingTypeSupport // parameterized over a MapEntriesDataSet wrapping a FrozenMapValuesDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        DataSet<Map<Integer, Integer>> frozen = new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.IntDataSet()); // inner data set, declared as DataSet of Map<Integer, Integer>
+        return generateParameters(new CollectionDataSet.MapEntriesDataSet<>(frozen)); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapEntriesFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesIntTest.java
new file mode 100644
index 000000000000..1aec8d2931af
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapEntriesIntTest extends IndexingTypeSupport // parameterized over a MapEntriesDataSet wrapping IntDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapEntriesDataSet<>(new DataSet.IntDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapEntriesIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesVarintTest.java
new file mode 100644
index 000000000000..cf2d23799517
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapEntriesVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapEntriesVarintTest extends IndexingTypeSupport // parameterized over a MapEntriesDataSet wrapping VarintDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapEntriesDataSet<>(new DataSet.VarintDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapEntriesVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFloatTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFloatTest.java
new file mode 100644
index 000000000000..fdacd52247fe
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFloatTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapFloatTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping FloatDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.FloatDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapFloatTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFrozenCollectionTest.java
new file mode 100644
index 000000000000..b11f9975a58a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapFrozenCollectionTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping a FrozenMapValuesDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        DataSet<Map<Integer, Integer>> frozen = new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.IntDataSet()); // inner data set, declared as DataSet of Map<Integer, Integer>
+        return generateParameters(new CollectionDataSet.MapDataSet<>(frozen)); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapInetTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapInetTest.java
new file mode 100644
index 000000000000..e32c75f5595e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapInetTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapInetTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping InetDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.InetDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapInetTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapIntTest.java
new file mode 100644
index 000000000000..c88e13f248ab
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapIntTest extends IndexingTypeSupport // parameterized over a MapDataSet wrapping IntDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.IntDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysAsciiTest.java
new file mode 100644
index 000000000000..840ac7f53781
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapKeysAsciiTest extends IndexingTypeSupport // parameterized over a MapKeysDataSet wrapping AsciiDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapKeysDataSet<>(new DataSet.AsciiDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapKeysAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysFrozenCollectionTest.java
new file mode 100644
index 000000000000..9566b54174d1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapKeysFrozenCollectionTest extends IndexingTypeSupport // parameterized over a MapKeysDataSet wrapping a FrozenMapValuesDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        DataSet<Map<Integer, Integer>> frozen = new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.IntDataSet()); // inner data set, declared as DataSet of Map<Integer, Integer>
+        return generateParameters(new CollectionDataSet.MapKeysDataSet<>(frozen)); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapKeysFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysIntTest.java
new file mode 100644
index 000000000000..5ed3c673b7d4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapKeysIntTest extends IndexingTypeSupport // parameterized over a MapKeysDataSet wrapping IntDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapKeysDataSet<>(new DataSet.IntDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapKeysIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysVarintTest.java
new file mode 100644
index 000000000000..8c8dc9671e69
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapKeysVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // JUnit 4 runner: the test runs once per Object[] row from generateParameters()
+public class MapKeysVarintTest extends IndexingTypeSupport // parameterized over a MapKeysDataSet wrapping VarintDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {0}..{2} are the row's values, shown in the test name
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapKeysDataSet<>(new DataSet.VarintDataSet())); // one-arg overload is not declared here; presumably inherited -- confirm in IndexingTypeSupport
+    }
+
+    public MapKeysVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario) // the runner passes each row's values to this constructor
+    {
+        super(dataset, widePartitions, scenario); // forwarded unchanged to the base class
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios(); // not declared in this class; presumably the inherited scenario driver -- TODO confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapSmallintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapSmallintTest.java
new file mode 100644
index 000000000000..5454a02404cd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapSmallintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapSmallintTest extends IndexingTypeSupport
+{
+    public MapSmallintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping SmallintDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.SmallintDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTextTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTextTest.java
new file mode 100644
index 000000000000..0f9aaeb17a9f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTextTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapTextTest extends IndexingTypeSupport
+{
+    public MapTextTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping TextDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.TextDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeTest.java
new file mode 100644
index 000000000000..7aa3c1cfec3f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapTimeTest extends IndexingTypeSupport
+{
+    public MapTimeTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping TimeDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.TimeDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimestampTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimestampTest.java
new file mode 100644
index 000000000000..10569f6d8dd1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimestampTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapTimestampTest extends IndexingTypeSupport
+{
+    public MapTimestampTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping TimestampDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.TimestampDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeuuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeuuidTest.java
new file mode 100644
index 000000000000..560bbb24d14d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTimeuuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapTimeuuidTest extends IndexingTypeSupport
+{
+    public MapTimeuuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping TimeuuidDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.TimeuuidDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTinyintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTinyintTest.java
new file mode 100644
index 000000000000..15826328f11c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapTinyintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapTinyintTest extends IndexingTypeSupport
+{
+    public MapTinyintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping TinyintDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.TinyintDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapUuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapUuidTest.java
new file mode 100644
index 000000000000..f1c0290c2304
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapUuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapUuidTest extends IndexingTypeSupport
+{
+    public MapUuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping UuidDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.UuidDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesAsciiTest.java
new file mode 100644
index 000000000000..0f50c7a63e4e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapValuesAsciiTest extends IndexingTypeSupport
+{
+    public MapValuesAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapValuesDataSet wrapping AsciiDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapValuesDataSet<>(new DataSet.AsciiDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesFrozenCollectionTest.java
new file mode 100644
index 000000000000..ccae63b59d64
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+import java.util.Map;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapValuesFrozenCollectionTest extends IndexingTypeSupport
+{
+    public MapValuesFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: a MapValuesDataSet whose element data set is a FrozenMapValuesDataSet built over IntDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        DataSet<Map<Integer, Integer>> frozenIntMap = new CollectionDataSet.FrozenMapValuesDataSet<>(new DataSet.IntDataSet());
+        return generateParameters(new CollectionDataSet.MapValuesDataSet<>(frozenIntMap));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesIntTest.java
new file mode 100644
index 000000000000..c6a55911da22
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapValuesIntTest extends IndexingTypeSupport
+{
+    public MapValuesIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapValuesDataSet wrapping IntDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapValuesDataSet<>(new DataSet.IntDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesVarintTest.java
new file mode 100644
index 000000000000..0f7d4a18e4fc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapValuesVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapValuesVarintTest extends IndexingTypeSupport
+{
+    public MapValuesVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: MapValuesDataSet over VarintDataSet (AsciiDataSet is MapValuesAsciiTest's job, not this test's).
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapValuesDataSet<>(new DataSet.VarintDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapVarintTest.java
new file mode 100644
index 000000000000..3ad3a1ec7eb3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MapVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MapVarintTest extends IndexingTypeSupport
+{
+    public MapVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MapDataSet wrapping VarintDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MapDataSet<>(new DataSet.VarintDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapAsciiTest.java
new file mode 100644
index 000000000000..db8ce55ec526
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MultiMapAsciiTest extends IndexingTypeSupport
+{
+    public MultiMapAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MultiMapDataSet wrapping AsciiDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MultiMapDataSet<>(new DataSet.AsciiDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapIntTest.java
new file mode 100644
index 000000000000..575945b9afcf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class MultiMapIntTest extends IndexingTypeSupport
+{
+    public MultiMapIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // supplied per run by the Parameterized runner
+    }
+    // Parameters: inherited generateParameters(...) applied to a MultiMapDataSet wrapping IntDataSet.
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new CollectionDataSet.MultiMapDataSet<>(new DataSet.IntDataSet()));
+    }
+    // Nothing type-specific here: the run is delegated entirely to the inherited runIndexQueryScenarios().
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapVarintTest.java
new file mode 100644
index 000000000000..1142dd711373
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/maps/MultiMapVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.maps;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class MultiMapVarintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a MultiMapDataSet built from VarintDataSet
+        return generateParameters(new CollectionDataSet.MultiMapDataSet<>(new DataSet.VarintDataSet()));
+    }
+
+    public MultiMapVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetAsciiTest.java
new file mode 100644
index 000000000000..48f8bf7a03f7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class FrozenSetAsciiTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a FrozenSetDataSet built from AsciiDataSet
+        return generateParameters(new CollectionDataSet.FrozenSetDataSet<>(new DataSet.AsciiDataSet()));
+    }
+
+    public FrozenSetAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetDecimalTest.java
new file mode 100644
index 000000000000..be4f9c1d9675
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetDecimalTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class FrozenSetDecimalTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a FrozenSetDataSet built from DecimalDataSet
+        return generateParameters(new CollectionDataSet.FrozenSetDataSet<>(new DataSet.DecimalDataSet()));
+    }
+
+    public FrozenSetDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetIntTest.java
new file mode 100644
index 000000000000..8164ac3467fe
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class FrozenSetIntTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // frozen set built from IntDataSet, as the class name indicates (not AsciiDataSet)
+        return generateParameters(new CollectionDataSet.FrozenSetDataSet<>(new DataSet.IntDataSet()));
+    }
+
+    public FrozenSetIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetVarintTest.java
new file mode 100644
index 000000000000..3b13a39cdd46
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/FrozenSetVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class FrozenSetVarintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a FrozenSetDataSet built from VarintDataSet
+        return generateParameters(new CollectionDataSet.FrozenSetDataSet<>(new DataSet.VarintDataSet()));
+    }
+
+    public FrozenSetVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetAsciiTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetAsciiTest.java
new file mode 100644
index 000000000000..77653149a4e4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetAsciiTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetAsciiTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from AsciiDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.AsciiDataSet()));
+    }
+
+    public SetAsciiTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetBigintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetBigintTest.java
new file mode 100644
index 000000000000..055829afd1ca
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetBigintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetBigintTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from BigintDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.BigintDataSet()));
+    }
+
+    public SetBigintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDateTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDateTest.java
new file mode 100644
index 000000000000..81b52b52c3e9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDateTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetDateTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from DateDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.DateDataSet()));
+    }
+
+    public SetDateTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDecimalTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDecimalTest.java
new file mode 100644
index 000000000000..89e7cb70f676
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDecimalTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetDecimalTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from DecimalDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.DecimalDataSet()));
+    }
+
+    public SetDecimalTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDoubleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDoubleTest.java
new file mode 100644
index 000000000000..5fe7d088cc73
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetDoubleTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetDoubleTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from DoubleDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.DoubleDataSet()));
+    }
+
+    public SetDoubleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFloatTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFloatTest.java
new file mode 100644
index 000000000000..56f53bbee968
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFloatTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetFloatTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from FloatDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.FloatDataSet()));
+    }
+
+    public SetFloatTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFrozenCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFrozenCollectionTest.java
new file mode 100644
index 000000000000..07905cdaaeb2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetFrozenCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+import java.util.Set;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetFrozenCollectionTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // nested case: a SetDataSet constructed from a FrozenSetDataSet, itself constructed from IntDataSet
+        DataSet<Set<Integer>> frozen = new CollectionDataSet.FrozenSetDataSet<>(new DataSet.IntDataSet());
+        return generateParameters(new CollectionDataSet.SetDataSet<>(frozen));
+    }
+
+    public SetFrozenCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetInetTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetInetTest.java
new file mode 100644
index 000000000000..af6d4f03100d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetInetTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetInetTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from InetDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.InetDataSet()));
+    }
+
+    public SetInetTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetIntTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetIntTest.java
new file mode 100644
index 000000000000..94348c478644
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetIntTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per Object[] returned by generateParameters()
+public class SetIntTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    { // delegates to the inherited one-argument overload with a SetDataSet built from IntDataSet
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.IntDataSet()));
+    }
+
+    public SetIntTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    { // the three run parameters are forwarded unchanged to IndexingTypeSupport
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    { // the scenario logic itself lives in the inherited runIndexQueryScenarios()
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetSmallintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetSmallintTest.java
new file mode 100644
index 000000000000..690b7bc6fe90
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetSmallintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetSmallintTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a SmallintDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.SmallintDataSet()));
+    }
+
+    public SetSmallintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTextTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTextTest.java
new file mode 100644
index 000000000000..ea86dcd16c32
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTextTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetTextTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a TextDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.TextDataSet()));
+    }
+
+    public SetTextTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeTest.java
new file mode 100644
index 000000000000..fed485ab52fb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetTimeTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a TimeDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.TimeDataSet()));
+    }
+
+    public SetTimeTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimestampTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimestampTest.java
new file mode 100644
index 000000000000..adb0dfc5e19f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimestampTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetTimestampTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a TimestampDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.TimestampDataSet()));
+    }
+
+    public SetTimestampTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeuuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeuuidTest.java
new file mode 100644
index 000000000000..dfd9d6499bc3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTimeuuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetTimeuuidTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a TimeuuidDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.TimeuuidDataSet()));
+    }
+
+    public SetTimeuuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTinyintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTinyintTest.java
new file mode 100644
index 000000000000..ab5f84e8d0f0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetTinyintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetTinyintTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a TinyintDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.TinyintDataSet()));
+    }
+
+    public SetTinyintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetUuidTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetUuidTest.java
new file mode 100644
index 000000000000..e40318dbb804
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetUuidTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetUuidTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a UuidDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.UuidDataSet()));
+    }
+
+    public SetUuidTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetVarintTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetVarintTest.java
new file mode 100644
index 000000000000..a6211c4dacad
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/collections/sets/SetVarintTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.cql.types.collections.sets;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class) // one test instance per row returned by generateParameters()
+public class SetVarintTest extends IndexingTypeSupport // scenarios over a SetDataSet wrapping a VarintDataSet
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}") // {n} = n-th value of a row
+    public static Collection<Object[]> generateParameters() // rows come from a one-arg overload not defined in this class
+    {
+        return generateParameters(new CollectionDataSet.SetDataSet<>(new DataSet.VarintDataSet()));
+    }
+
+    public SetVarintTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario); // row values forwarded unchanged
+    }
+
+    @Test // runIndexQueryScenarios() is not defined in this class (presumably inherited from IndexingTypeSupport)
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/BKDQueriesTest.java b/test/unit/org/apache/cassandra/index/sai/disk/BKDQueriesTest.java
new file mode 100644
index 000000000000..848d074b6ef5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/BKDQueriesTest.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+import static org.apache.lucene.index.PointValues.Relation.CELL_CROSSES_QUERY;
+import static org.apache.lucene.index.PointValues.Relation.CELL_INSIDE_QUERY;
+import static org.apache.lucene.index.PointValues.Relation.CELL_OUTSIDE_QUERY;
+
+public class BKDQueriesTest extends NdiRandomizedTest // checks BKDQueries.bkdQueryFrom on int32 range expressions
+{
+    @Test // value >= lowerBound
+    public void testInclusiveLowerBound()
+    {
+        final int lowerBound = between(-10, 10);
+        final Expression expression = buildExpression(Operator.GTE, lowerBound);
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+        // visit(): does a single encoded value match? (every test below follows this same two-part pattern)
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertTrue(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+        // compare(min, max): is the cell [min, max] outside, inside, or straddling the matching range?
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 2), toSortableBytes(lowerBound - 1)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(lowerBound + 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+    }
+
+    @Test // value > lowerBound: the bound itself must not match
+    public void testExclusiveLowerBound()
+    {
+        final int lowerBound = between(-10, 10);
+        final Expression expression = buildExpression(Operator.GT, lowerBound);
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertFalse(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound + 1), toSortableBytes(lowerBound + 2)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(lowerBound + 1)));
+    }
+
+    @Test // value <= upperBound
+    public void testInclusiveUpperBound()
+    {
+        final int upperBound = between(-10, 10);
+        final Expression expression = buildExpression(Operator.LTE, upperBound);
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertTrue(query.visit(toSortableBytes(upperBound - 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound)));
+        assertFalse(query.visit(toSortableBytes(upperBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upperBound + 1), toSortableBytes(upperBound + 2)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(upperBound - 1), toSortableBytes(upperBound)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upperBound), toSortableBytes(upperBound + 1)));
+    }
+
+    @Test // value < upper: the bound itself must not match
+    public void testExclusiveUpperBound()
+    {
+        final int upper = between(-10, 10);
+        final Expression expression = buildExpression(Operator.LT, upper);
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertTrue(query.visit(toSortableBytes(upper - 1)));
+        assertFalse(query.visit(toSortableBytes(upper)));
+        assertFalse(query.visit(toSortableBytes(upper + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upper), toSortableBytes(upper + 1)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(upper - 2), toSortableBytes(upper - 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upper - 1), toSortableBytes(upper)));
+    }
+
+    @Test // lowerBound <= value <= upperBound, with upperBound = lowerBound + 5
+    public void testInclusiveLowerAndUpperBound()
+    {
+        final int lowerBound = between(-15, 15);
+        final int upperBound = lowerBound + 5;
+        final Expression expression = buildExpression(Operator.GTE, lowerBound)
+                .add(Operator.LTE, Int32Type.instance.decompose(upperBound));
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertTrue(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound - 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound)));
+        assertFalse(query.visit(toSortableBytes(upperBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 2), toSortableBytes(lowerBound - 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(upperBound)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upperBound), toSortableBytes(upperBound + 1)));
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upperBound + 1), toSortableBytes(upperBound + 2)));
+    }
+
+    @Test // lowerBound < value < upperBound, with upperBound = lowerBound + 5
+    public void testExclusiveLowerAndUpperBound()
+    {
+        final int lowerBound = between(-15, 15);
+        final int upperBound = lowerBound + 5;
+        final Expression expression = buildExpression(Operator.GT, lowerBound)
+                .add(Operator.LT, Int32Type.instance.decompose(upperBound));
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertFalse(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound - 1)));
+        assertFalse(query.visit(toSortableBytes(upperBound)));
+        assertFalse(query.visit(toSortableBytes(upperBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(lowerBound + 1)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound + 1), toSortableBytes(upperBound - 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upperBound - 1), toSortableBytes(upperBound)));
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upperBound), toSortableBytes(upperBound + 1)));
+    }
+
+    @Test // lowerBound < value <= upperBound, with upperBound = lowerBound + 5
+    public void testExclusiveLowerAndInclusiveUpperBound()
+    {
+        final int lowerBound = between(-15, 15);
+        final int upperBound = lowerBound + 5;
+        final Expression expression = buildExpression(Operator.GT, lowerBound)
+                .add(Operator.LTE, Int32Type.instance.decompose(upperBound));
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertFalse(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound - 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound)));
+        assertFalse(query.visit(toSortableBytes(upperBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(lowerBound + 1)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound + 1), toSortableBytes(upperBound)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upperBound), toSortableBytes(upperBound + 1)));
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upperBound + 1), toSortableBytes(upperBound + 2)));
+    }
+
+    @Test // lowerBound <= value < upperBound, with upperBound = lowerBound + 5
+    public void testInclusiveLowerAndExclusiveUpperBound()
+    {
+        final int lowerBound = between(-15, 15);
+        final int upperBound = lowerBound + 5;
+        final Expression expression = buildExpression(Operator.GTE, lowerBound)
+                .add(Operator.LT, Int32Type.instance.decompose(upperBound));
+        final BKDReader.IntersectVisitor query = BKDQueries.bkdQueryFrom(expression, 1, 4);
+
+        assertFalse(query.visit(toSortableBytes(lowerBound - 1)));
+        assertTrue(query.visit(toSortableBytes(lowerBound)));
+        assertTrue(query.visit(toSortableBytes(lowerBound + 1)));
+        assertTrue(query.visit(toSortableBytes(upperBound - 1)));
+        assertFalse(query.visit(toSortableBytes(upperBound)));
+        assertFalse(query.visit(toSortableBytes(upperBound + 1)));
+
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(lowerBound - 2), toSortableBytes(lowerBound - 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(lowerBound - 1), toSortableBytes(lowerBound)));
+        assertEquals(CELL_INSIDE_QUERY, query.compare(toSortableBytes(lowerBound), toSortableBytes(upperBound - 1)));
+        assertEquals(CELL_CROSSES_QUERY, query.compare(toSortableBytes(upperBound - 1), toSortableBytes(upperBound)));
+        assertEquals(CELL_OUTSIDE_QUERY, query.compare(toSortableBytes(upperBound), toSortableBytes(upperBound + 1)));
+    }
+
+    private byte[] toSortableBytes(int value) // Int32Type's byte-comparable (Version.OSS41) form of value
+    {
+        byte[] buffer = new byte[4]; // destination for the encoded int32
+        ByteSource source = Int32Type.instance.asComparableBytes(Int32Type.instance.decompose(value), ByteComparable.Version.OSS41);
+        ByteBufferUtil.toBytes(source, buffer);
+        return buffer;
+    }
+
+    private Expression buildExpression(Operator op, int value) // one-operator expression on an int32 column context named "meh"
+    {
+        final Expression expression = new Expression(SAITester.createColumnContext("meh", Int32Type.instance));
+        expression.add(op, Int32Type.instance.decompose(value));
+        return expression;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValuesTest.java b/test/unit/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValuesTest.java
new file mode 100644
index 000000000000..e410706e32fe
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/ImmutableOneDimPointValuesTest.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.index.sai.utils.AbstractIterator;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.lucene.util.bkd.MutablePointsReaderUtils;
+
+import static org.junit.Assert.assertEquals;
+
+public class ImmutableOneDimPointValuesTest
+{
+    @Rule
+    public ExpectedException expectedException = ExpectedException.none();
+
+    @Test
+    public void shouldTraversePointsInTermEnumOrder() throws IOException
+    {
+        final int minTerm = 0, maxTerm = 10;
+        final TermsIterator termEnum = buildDescTermEnum(minTerm, maxTerm);
+        final ImmutableOneDimPointValues pointValues = ImmutableOneDimPointValues
+                .fromTermEnum(termEnum, Int32Type.instance);
+
+        pointValues.intersect(assertingVisitor(minTerm));
+    }
+
+    @Test
+    public void shouldFailOnSorting()
+    {
+        final int minTerm = 3, maxTerm = 13;
+        final TermsIterator termEnum = buildDescTermEnum(minTerm, maxTerm);
+        final ImmutableOneDimPointValues pointValues = ImmutableOneDimPointValues
+                .fromTermEnum(termEnum, Int32Type.instance);
+
+        expectedException.expect(IllegalStateException.class);
+        pointValues.swap(0, 1);
+    }
+
+    @Test
+    public void shouldSkipLuceneSorting() throws IOException
+    {
+        final int minTerm = 2, maxTerm = 7;
+        final TermsIterator termEnum = buildDescTermEnum(minTerm, maxTerm);
+        final ImmutableOneDimPointValues pointValues = ImmutableOneDimPointValues.fromTermEnum(termEnum, Int32Type.instance);
+
+        MutablePointsReaderUtils.sort(2, Int32Type.instance.valueLengthIfFixed(), pointValues, 0, Math.toIntExact(pointValues.size()));
+
+        pointValues.intersect(assertingVisitor(minTerm));
+    }
+
+    private MutableOneDimPointValues.IntersectVisitor assertingVisitor(int minTerm)
+    {
+        return new MutableOneDimPointValues.IntersectVisitor()
+        {
+            int term = minTerm;
+            int postingCounter = 0;
+
+            @Override
+            public void visit(long docID, byte[] packedValue)
+            {
+                final ByteComparable actualTerm = ByteComparable.fixedLength(packedValue);
+                final ByteComparable expectedTerm = ByteComparable.of(term);
+
+                assertEquals(0, ByteComparable.compare(actualTerm, expectedTerm, ByteComparable.Version.OSS41));
+                assertEquals(postingCounter, docID);
+
+                if (postingCounter >= 2)
+                {
+                    postingCounter = 0;
+                    term++;
+                }
+                else
+                {
+                    postingCounter++;
+                }
+            }
+        };
+    }
+
+    private TermsIterator buildDescTermEnum(int from, int to)
+    {
+        final ByteBuffer minTerm = Int32Type.instance.decompose(from);
+        final ByteBuffer maxTerm = Int32Type.instance.decompose(to);
+
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> iterator = new AbstractIterator<Pair<ByteComparable, IntArrayList>>()
+        {
+            private int currentTerm = from;
+
+            @Override
+            protected Pair<ByteComparable, IntArrayList> computeNext()
+            {
+                if (currentTerm <= to)
+                {
+                    return endOfData();
+                }
+                final ByteBuffer term = Int32Type.instance.decompose(currentTerm++);
+                IntArrayList postings = new IntArrayList();
+                postings.add(0, 1, 2);
+                return Pair.create(ByteComparable.fixedLength(term), postings);
+            }
+        };
+
+        return new MemtableTermsIterator(minTerm, maxTerm, iterator);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexBuilder.java b/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexBuilder.java
new file mode 100644
index 000000000000..5308947cfe0b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexBuilder.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.IntSupplier;
+import java.util.function.Supplier;
+import java.util.stream.IntStream;
+import java.util.stream.Stream;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static java.util.stream.Collectors.toList;
+
+public class InvertedIndexBuilder
+{
+    /** Pairs {@code terms} distinct generated strings, in sorted order, with {@code postings} distinct sorted postings each. */
+    public static List<Pair<ByteComparable, IntArrayList>> buildStringTermsEnum(int terms, int postings, Supplier<String> termsGenerator, IntSupplier postingsGenerator)
+    {
+        final List<String> orderedTerms = Stream.generate(termsGenerator)
+                                                .distinct()
+                                                .limit(terms)
+                                                .sorted()
+                                                .collect(toList());
+
+        final List<Pair<ByteComparable, IntArrayList>> result = new ArrayList<>();
+        for (String rawTerm : orderedTerms)
+        {
+            final int[] rowIds = IntStream.generate(postingsGenerator).distinct().limit(postings).sorted().toArray();
+
+            final IntArrayList postingsList = new IntArrayList();
+            for (int rowId : rowIds)
+            {
+                postingsList.add(rowId);
+            }
+
+            final ByteComparable term = ByteComparable.fixedLength(UTF8Type.instance.decompose(rawTerm));
+            result.add(Pair.create(term, postingsList));
+        }
+        return result;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexSearcherTest.java b/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexSearcherTest.java
new file mode 100644
index 000000000000..539a03339290
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/InvertedIndexSearcherTest.java
@@ -0,0 +1,203 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.List;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.InvertedIndexWriter;
+import org.apache.cassandra.index.sai.metrics.QueryEventListeners;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.LongArrays;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+
+import static org.apache.cassandra.Util.dk;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
+
+public class InvertedIndexSearcherTest extends NdiRandomizedTest
+{
+    private static final SSTableContext.KeyFetcher KEY_FETCHER = new SSTableContext.KeyFetcher() { // test stub: keys are synthesised, no reader is involved
+        @Override
+        public DecoratedKey apply(RandomAccessReader reader, long keyOffset)
+        {
+            return dk(String.format("pkvalue_%07d", keyOffset)); // key is derived from the offset alone; reader is ignored
+        }
+
+        @Override
+        public RandomAccessReader createReader()
+        {
+            return null; // no reader is supplied; apply() does not use one
+        }
+    };
+
+    @BeforeClass
+    public static void setupCQLTester()
+    {
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance); // DatabaseDescriptor.getPartitioner() is used when the segment metadata is built
+        StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance); // same partitioner instance for StorageService
+    }
+
+    @Test
+    public void testEqQueriesAgainstStringIndex() throws Exception
+    {
+        doTestEqQueriesAgainstStringIndex(); // all set-up and assertions live in the helper
+    }
+
+    private void doTestEqQueriesAgainstStringIndex() throws Exception
+    {
+        final int numTerms = randomIntBetween(64, 512), numPostings = randomIntBetween(256, 1024);
+        final List<Pair<ByteComparable, IntArrayList>> termsEnum = buildTermsEnum(numTerms, numPostings);
+        // every indexed term is queried twice: once reading all of its postings, once after skipping close to the end
+        try (IndexSearcher searcher = buildIndexAndOpenSearcher(numTerms, numPostings, termsEnum))
+        {
+            for (int t = 0; t < numTerms; ++t)
+            {
+                try (RangeIterator results = searcher.search(new Expression(SAITester.createColumnContext("meh", UTF8Type.instance))
+                        .add(Operator.EQ, wrap(termsEnum.get(t).left)), SSTableQueryContext.forTest(), false))
+                {
+                    assertEquals(results.getMinimum(), results.getCurrent());
+                    assertTrue(results.hasNext());
+
+                    for (int p = 0; p < numPostings; ++p)
+                    {
+                        final long expectedToken = termsEnum.get(t).right.get(p);
+                        assertTrue(results.hasNext());
+                        final long actualToken = results.next().get();
+                        assertEquals(expectedToken, actualToken);
+                    }
+                    assertFalse(results.hasNext());
+                }
+
+                try (RangeIterator results = searcher.search(new Expression(SAITester.createColumnContext("meh", UTF8Type.instance))
+                        .add(Operator.EQ, wrap(termsEnum.get(t).left)), SSTableQueryContext.forTest(), false))
+                {
+                    assertEquals(results.getMinimum(), results.getCurrent());
+                    assertTrue(results.hasNext());
+
+                    // test skipping to the last block
+                    final int idxToSkip = numPostings - 7;
+                    // tokens are equal to their corresponding row IDs
+                    final long tokenToSkip = termsEnum.get(t).right.get(idxToSkip);
+                    results.skipTo(tokenToSkip);
+
+                    for (int p = idxToSkip; p < numPostings; ++p)
+                    {
+                        final long expectedToken = termsEnum.get(t).right.get(p);
+                        final long actualToken = results.next().get();
+                        assertEquals(expectedToken, actualToken);
+                    }
+                }
+            }
+
+            // terms that weren't indexed (too long, then too short) must yield empty results; these iterators are closed too
+            for (int[] lengths : new int[][]{ { 10, 12 }, { 1, 2 } })
+            {
+                final String missingTerm = randomSimpleString(lengths[0], lengths[1]);
+                try (RangeIterator results = searcher.search(new Expression(SAITester.createColumnContext("meh", UTF8Type.instance))
+                        .add(Operator.EQ, UTF8Type.instance.decompose(missingTerm)), SSTableQueryContext.forTest(), false))
+                {
+                    assertFalse(results.hasNext());
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testUnsupportedOperator() throws Exception
+    {
+        final int numTerms = randomIntBetween(5, 15);
+        final int numPostings = randomIntBetween(5, 20);
+        final List<Pair<ByteComparable, IntArrayList>> termsEnum = buildTermsEnum(numTerms, numPostings);
+        try (IndexSearcher searcher = buildIndexAndOpenSearcher(numTerms, numPostings, termsEnum))
+        {
+            final Expression greaterThan = new Expression(SAITester.createColumnContext("meh", UTF8Type.instance)).add(Operator.GT, UTF8Type.instance.decompose("a"));
+            searcher.search(greaterThan, SSTableQueryContext.forTest(), false);
+            fail("Expect IllegalArgumentException thrown, but didn't");
+        }
+        catch (IllegalArgumentException e)
+        {
+            // expected: the GT search above is supposed to be rejected
+        }
+    }
+
+    private IndexSearcher buildIndexAndOpenSearcher(int terms, int postings, List<Pair<ByteComparable, IntArrayList>> termsEnum) throws IOException
+    {
+        final int size = terms * postings;
+        final IndexComponents indexComponents = newIndexComponents();
+        // write every term/postings pair through the inverted index writer and keep the metadata it returns
+        SegmentMetadata.ComponentMetadataMap indexMetas;
+        try (InvertedIndexWriter writer = new InvertedIndexWriter(indexComponents, false))
+        {
+            indexMetas = writer.writeAll(new MemtableTermsIterator(null, null, termsEnum.iterator()));
+        }
+        // segment metadata built from the first and last terms of termsEnum plus the component metadata just written
+        final SegmentMetadata segmentMetadata = new SegmentMetadata(0,
+                                                                    size,
+                                                                    0,
+                                                                    Long.MAX_VALUE,
+                                                                    new BufferDecoratedKey(DatabaseDescriptor.getPartitioner().getMinimumToken(), ByteBufferUtil.bytes(1)),
+                                                                    new BufferDecoratedKey(DatabaseDescriptor.getPartitioner().getMaximumToken(), ByteBufferUtil.bytes(2)),
+                                                                    wrap(termsEnum.get(0).left),
+                                                                    wrap(termsEnum.get(terms - 1).left),
+                                                                    indexMetas);
+        // NOTE(review): indexFiles is closed when this method returns while the searcher escapes - presumably the searcher keeps its own handles; confirm
+        try (SSTableIndex.PerIndexFiles indexFiles = new SSTableIndex.PerIndexFiles(indexComponents, true))
+        {
+            final LongArray.Factory factory = () -> LongArrays.identity().build();
+            Segment segment = new Segment(factory, factory, KEY_FETCHER, indexFiles, segmentMetadata, UTF8Type.instance);
+            final IndexSearcher searcher = IndexSearcher.open(segment, QueryEventListeners.NO_OP_TRIE_LISTENER);
+            assertThat(searcher, is(instanceOf(InvertedIndexSearcher.class)));
+            return searcher;
+        }
+    }
+
+    private List<Pair<ByteComparable, IntArrayList>> buildTermsEnum(int terms, int postings)
+    {
+        return InvertedIndexBuilder.buildStringTermsEnum(terms, postings, () -> randomSimpleString(3, 5), () -> nextInt(0, Integer.MAX_VALUE)); // terms come from randomSimpleString(3, 5), postings from nextInt(0, Integer.MAX_VALUE)
+    }
+
+    private ByteBuffer wrap(ByteComparable bc)
+    {
+        return ByteBuffer.wrap(ByteSourceInverse.readBytes(bc.asComparableBytes(ByteComparable.Version.OSS41))); // reads the OSS41 comparable bytes of bc and wraps them in a buffer
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexBuilder.java b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexBuilder.java
new file mode 100644
index 000000000000..e6d6f346568a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexBuilder.java
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+import java.util.stream.LongStream;
+import java.util.stream.Stream;
+
+import org.junit.Assert;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.ShortType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+import org.apache.cassandra.index.sai.metrics.QueryEventListeners;
+import org.apache.cassandra.index.sai.utils.AbstractIterator;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.LongArrays;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+import static org.apache.cassandra.Util.dk;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+public class KDTreeIndexBuilder
+{
+    private final IndexComponents indexComponents;
+    private final AbstractType<?> type;
+    private final AbstractIterator<Pair<ByteComparable, IntArrayList>> terms; // term/postings pairs written out by flushAndOpen()
+    private final int size; // passed to both the writer and the segment metadata in flushAndOpen()
+    private final int minSegmentRowId;
+    private final int maxSegmentRowId;
+    private final LongArray segmentRowIdToToken = LongArrays.identity().build(); // supplied to the Segment built in flushAndOpen()
+    private final LongArray segmentRowIdToOffset = LongArrays.identity().build(); // supplied to the Segment built in flushAndOpen()
+    private final SSTableContext.KeyFetcher keyFetcher = new SSTableContext.KeyFetcher() { // test stub: keys are synthesised, no reader is involved
+        @Override
+        public DecoratedKey apply(RandomAccessReader reader, long keyOffset)
+        {
+            return dk(String.format("pkvalue_%07d", keyOffset)); // key is derived from the offset alone; reader is ignored
+        }
+
+        @Override
+        public RandomAccessReader createReader()
+        {
+            return null; // no reader is supplied; apply() does not use one
+        }
+    };
+    private static final BigDecimal ONE_TENTH = BigDecimal.valueOf(1, 1); // unscaled value 1 with scale 1, i.e. 0.1: the step used by decimalRange()
+
+    public KDTreeIndexBuilder(IndexComponents indexComponents,
+                              AbstractType<?> type,
+                              AbstractIterator<Pair<ByteComparable, IntArrayList>> terms,
+                              int size,
+                              int minSegmentRowId,
+                              int maxSegmentRowId)
+    {
+        this.indexComponents = indexComponents; // the constructor only stores its arguments; all work happens in flushAndOpen()
+        this.type = type;
+        this.terms = terms;
+        this.size = size;
+        this.minSegmentRowId = minSegmentRowId;
+        this.maxSegmentRowId = maxSegmentRowId;
+    }
+
+    KDTreeIndexSearcher flushAndOpen() throws IOException
+    {
+        final TermsIterator termEnum = new MemtableTermsIterator(null, null, terms);
+        final ImmutableOneDimPointValues pointValues = ImmutableOneDimPointValues.fromTermEnum(termEnum, type);
+        // flush the terms through the numeric index writer; the metadata describes the components just written
+        final SegmentMetadata metadata;
+        try (NumericIndexWriter writer = new NumericIndexWriter(indexComponents, TypeUtil.fixedSizeOf(type), maxSegmentRowId, size, IndexWriterConfig.defaultConfig("test"), false))
+        {
+            final SegmentMetadata.ComponentMetadataMap indexMetas = writer.writeAll(pointValues);
+            metadata = new SegmentMetadata(0,
+                                           size,
+                                           minSegmentRowId,
+                                           maxSegmentRowId,
+                                           // min/max is unused for now
+                                           Murmur3Partitioner.instance.decorateKey(UTF8Type.instance.fromString("a")),
+                                           Murmur3Partitioner.instance.decorateKey(UTF8Type.instance.fromString("b")),
+                                           UTF8Type.instance.fromString("c"),
+                                           UTF8Type.instance.fromString("d"),
+                                           indexMetas);
+        }
+        // IndexSearcher.open yields the general searcher type: check the concrete class first, then narrow it
+        try (SSTableIndex.PerIndexFiles indexFiles = new SSTableIndex.PerIndexFiles(indexComponents, false))
+        {
+            Segment segment = new Segment(() -> segmentRowIdToToken, () -> segmentRowIdToOffset, keyFetcher, indexFiles, metadata, type);
+            IndexSearcher searcher = IndexSearcher.open(segment, QueryEventListeners.NO_OP_BKD_LISTENER);
+            assertThat(searcher, is(instanceOf(KDTreeIndexSearcher.class)));
+            return (KDTreeIndexSearcher) searcher;
+        }
+    }
+
+    /**
+     * Builds and opens a k-d tree index over 32-bit terms where:
+     * 1. the terms are the consecutive ints from {@code startTermInclusive} up to, but excluding, {@code endTermExclusive};
+     * 2. every term has a single posting whose segment row id equals the term value, since row ids are also
+     *    numbered from {@code startTermInclusive};
+     * 3. tokens and offsets are equal to row id;
+     */
+    public static IndexSearcher buildInt32Searcher(IndexComponents indexComponents, int startTermInclusive, int endTermExclusive)
+            throws IOException
+    {
+        final int size = endTermExclusive - startTermInclusive;
+        Assert.assertTrue(size > 0);
+
+        final Iterator<ByteBuffer> encodedTerms = int32Range(startTermInclusive, endTermExclusive);
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> termsWithPostings =
+                singleOrd(encodedTerms, Int32Type.instance, startTermInclusive, size);
+        return new KDTreeIndexBuilder(indexComponents, Int32Type.instance, termsWithPostings, size, startTermInclusive, endTermExclusive)
+                       .flushAndOpen();
+    }
+
+    /** Decimal variant: terms advance in steps of one tenth, so ten terms (and row ids) cover each whole unit. */
+    public static IndexSearcher buildDecimalSearcher(IndexComponents indexComponents, BigDecimal startTermInclusive, BigDecimal endTermExclusive)
+            throws IOException
+    {
+        final int size = endTermExclusive.subtract(startTermInclusive).intValueExact() * 10;
+        Assert.assertTrue(size > 0);
+
+        final Iterator<ByteBuffer> encodedTerms = decimalRange(startTermInclusive, endTermExclusive);
+        final int firstRowId = startTermInclusive.intValueExact() * 10;
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> termsWithPostings = singleOrd(encodedTerms, DecimalType.instance, firstRowId, size);
+        final int lastRowId = endTermExclusive.intValueExact() * 10;
+
+        return new KDTreeIndexBuilder(indexComponents, DecimalType.instance, termsWithPostings, size, firstRowId, lastRowId).flushAndOpen();
+    }
+
+    /** Arbitrary-precision integer variant: consecutive terms, each posted under a row id equal to the term value. */
+    public static IndexSearcher buildBigIntegerSearcher(IndexComponents indexComponents, BigInteger startTermInclusive, BigInteger endTermExclusive)
+            throws IOException
+    {
+        final int size = endTermExclusive.subtract(startTermInclusive).intValueExact();
+        Assert.assertTrue(size > 0);
+
+        final Iterator<ByteBuffer> encodedTerms = bigIntegerRange(startTermInclusive, endTermExclusive);
+        final int firstRowId = startTermInclusive.intValueExact();
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> termsWithPostings = singleOrd(encodedTerms, IntegerType.instance, firstRowId, size);
+        final int lastRowId = endTermExclusive.intValueExact();
+
+        return new KDTreeIndexBuilder(indexComponents, IntegerType.instance, termsWithPostings, size, firstRowId, lastRowId).flushAndOpen();
+    }
+
+    /**
+     * Builds and opens a k-d tree index over 64-bit terms where:
+     * 1. the terms are the consecutive longs from {@code startTermInclusive} up to, but excluding, {@code endTermExclusive};
+     * 2. every term has a single posting whose segment row id equals the term value, so both bounds must fit in an int;
+     * 3. tokens and offsets are equal to row id;
+     */
+    public static IndexSearcher buildLongSearcher(IndexComponents indexComponents, long startTermInclusive, long endTermExclusive)
+            throws IOException
+    {
+        final long size = endTermExclusive - startTermInclusive;
+        Assert.assertTrue(size > 0);
+
+        final Iterator<ByteBuffer> encodedTerms = longRange(startTermInclusive, endTermExclusive);
+        final int firstRowId = Math.toIntExact(startTermInclusive);
+        final int rowCount = Math.toIntExact(size);
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> termsWithPostings = singleOrd(encodedTerms, LongType.instance, firstRowId, rowCount);
+        final int lastRowId = Math.toIntExact(endTermExclusive);
+        return new KDTreeIndexBuilder(indexComponents, LongType.instance, termsWithPostings, rowCount, firstRowId, lastRowId).flushAndOpen();
+    }
+
+    /**
+     * Builds and opens a k-d tree index over 16-bit terms where:
+     * 1. the terms are the consecutive shorts from {@code startTermInclusive} up to, but excluding, {@code endTermExclusive};
+     * 2. every term has a single posting whose segment row id equals the term value, since row ids are also
+     *    numbered from {@code startTermInclusive};
+     * 3. tokens and offsets are equal to row id;
+     */
+    public static IndexSearcher buildShortSearcher(IndexComponents indexComponents, short startTermInclusive, short endTermExclusive)
+            throws IOException
+    {
+        final int size = endTermExclusive - startTermInclusive;
+        Assert.assertTrue(size > 0);
+
+        final Iterator<ByteBuffer> encodedTerms = shortRange(startTermInclusive, endTermExclusive);
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> termsWithPostings =
+                singleOrd(encodedTerms, ShortType.instance, startTermInclusive, size);
+        return new KDTreeIndexBuilder(indexComponents, ShortType.instance, termsWithPostings, size, startTermInclusive, endTermExclusive)
+                       .flushAndOpen();
+    }
+
+    /**
+     * Returns an inverted index in which every term has a single posting: the term's ordinal number plus
+     * {@code segmentRowIdOffset}. At most {@code size} entries are produced.
+     */
+    public static AbstractIterator<Pair<ByteComparable, IntArrayList>> singleOrd(Iterator<ByteBuffer> terms, AbstractType<?> type, int segmentRowIdOffset, int size)
+    {
+        return new AbstractIterator<Pair<ByteComparable, IntArrayList>>()
+        {
+            private long produced = 0;
+            private int nextRowId = segmentRowIdOffset;
+
+            @Override
+            protected Pair<ByteComparable, IntArrayList> computeNext()
+            {
+                if (produced++ >= size)
+                {
+                    return endOfData();
+                }
+                final IntArrayList rowIds = new IntArrayList();
+                rowIds.add(nextRowId++);
+                // the supplied terms must not run out before size entries have been produced
+                assertTrue(terms.hasNext());
+                // NOTE(review): the lambda returns the same ByteSource instance on every call - presumably each term is read only once; confirm
+                final ByteSource comparableBytes = TypeUtil.asComparableBytes(terms.next(), type, ByteComparable.Version.OSS41);
+                return Pair.create(version -> comparableBytes, rowIds);
+            }
+        };
+    }
+
+    /**
+     * Encodes, in ascending order, every int from {@code startInclusive} (inclusive) up to
+     * {@code endExclusive} (exclusive) with {@link Int32Type} and returns an iterator over the results.
+     */
+    public static Iterator<ByteBuffer> int32Range(int startInclusive, int endExclusive)
+    {
+        final Stream<ByteBuffer> encoded = IntStream.range(startInclusive, endExclusive)
+                                                    .mapToObj(value -> Int32Type.instance.decompose(value));
+
+        return encoded.collect(Collectors.toList()).iterator();
+    }
+
+    /**
+     * Encodes, in ascending order, every long from {@code startInclusive} (inclusive) up to
+     * {@code endExclusive} (exclusive) with {@link LongType} and returns an iterator over the results.
+     */
+    public static Iterator<ByteBuffer> longRange(long startInclusive, long endExclusive)
+    {
+        final Stream<ByteBuffer> encoded = LongStream.range(startInclusive, endExclusive)
+                                                     .mapToObj(value -> LongType.instance.decompose(value));
+
+        return encoded.collect(Collectors.toList()).iterator();
+    }
+
+    /**
+     * Returns the encoded decimals that start at {@code startInclusive} and grow by one tenth per element,
+     * stopping before {@code endExclusive}.
+     *
+     * @throws ArithmeticException if the distance between the bounds is not a whole number that fits in an int
+     */
+    public static Iterator<ByteBuffer> decimalRange(final BigDecimal startInclusive, final BigDecimal endExclusive)
+    {
+        final int count = endExclusive.subtract(startInclusive).intValueExact() * 10;
+
+        // yields startInclusive, then each previous element plus one tenth
+        final Stream<BigDecimal> values = Stream.iterate(startInclusive, current -> current.add(ONE_TENTH))
+                                                .limit(count);
+
+        final Stream<ByteBuffer> encoded = values.map(DecimalType.instance::decompose)
+                                                 .map(buffer -> TypeUtil.encodeDecimal(buffer));
+
+        return encoded.collect(Collectors.toList())
+                      .iterator();
+    }
+
+
+    /**
+     * Returns the encoded integers that start at {@code startInclusive} and grow by one per element,
+     * stopping before {@code endExclusive}.
+     *
+     * @throws ArithmeticException if the distance between the bounds does not fit in an int
+     */
+    public static Iterator<ByteBuffer> bigIntegerRange(final BigInteger startInclusive, final BigInteger endExclusive)
+    {
+        final int count = endExclusive.subtract(startInclusive).intValueExact();
+
+        // yields startInclusive, then each previous element plus one
+        final Stream<BigInteger> values = Stream.iterate(startInclusive, current -> current.add(BigInteger.ONE))
+                                                .limit(count);
+
+        final Stream<ByteBuffer> encoded = values.map(IntegerType.instance::decompose)
+                                                 .map(buffer -> TypeUtil.encodeBigInteger(buffer));
+
+        return encoded.collect(Collectors.toList())
+                      .iterator();
+    }
+
+
+
+    /**
+     * Encodes, in ascending order, every short from {@code startInclusive} (inclusive) up to
+     * {@code endExclusive} (exclusive) with {@link ShortType} and returns an iterator over the results.
+     */
+    public static Iterator<ByteBuffer> shortRange(short startInclusive, short endExclusive)
+    {
+        final Stream<ByteBuffer> encoded = IntStream.range(startInclusive, endExclusive)
+                                                    .mapToObj(value -> ShortType.instance.decompose((short) value));
+
+        return encoded.collect(Collectors.toList()).iterator();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcherTest.java b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcherTest.java
new file mode 100644
index 000000000000..339633321627
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeIndexSearcherTest.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+import java.util.List;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import java.util.stream.LongStream;
+
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.DecimalType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.NumberType;
+import org.apache.cassandra.db.marshal.ShortType;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+
+/** Runs EQ and RANGE queries, plus one unsupported operator, against searchers built by {@code KDTreeIndexBuilder} for several numeric types. */
+public class KDTreeIndexSearcherTest extends NdiRandomizedTest
+{
+    private static final short EQ_TEST_LOWER_BOUND_INCLUSIVE = 0;
+    private static final short EQ_TEST_UPPER_BOUND_EXCLUSIVE = 3;
+
+    private static final short RANGE_TEST_LOWER_BOUND_INCLUSIVE = 0;
+    private static final short RANGE_TEST_UPPER_BOUND_EXCLUSIVE = 10;
+
+    @Test
+    public void testRangeQueriesAgainstInt32Index() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildInt32Searcher(newIndexComponents(), 0, 10);
+        testRangeQueries(indexSearcher, Int32Type.instance, Int32Type.instance, Integer::valueOf);
+    }
+
+    @Test
+    public void testEqQueriesAgainstInt32Index() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildInt32Searcher(newIndexComponents(),
+                                                                            EQ_TEST_LOWER_BOUND_INCLUSIVE, EQ_TEST_UPPER_BOUND_EXCLUSIVE);
+        testEqQueries(indexSearcher, Int32Type.instance, Int32Type.instance, Integer::valueOf);
+    }
+
+    @Test
+    public void testRangeQueriesAgainstLongIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildLongSearcher(newIndexComponents(), 0, 10);
+        testRangeQueries(indexSearcher, LongType.instance, Int32Type.instance, Long::valueOf);
+    }
+
+    @Test
+    public void testEqQueriesAgainstLongIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildLongSearcher(newIndexComponents(),
+                                                                           EQ_TEST_LOWER_BOUND_INCLUSIVE, EQ_TEST_UPPER_BOUND_EXCLUSIVE);
+        testEqQueries(indexSearcher, LongType.instance, Int32Type.instance, Long::valueOf);
+    }
+
+    @Test
+    public void testRangeQueriesAgainstShortIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildShortSearcher(newIndexComponents(), (short) 0, (short) 10);
+        testRangeQueries(indexSearcher, ShortType.instance, Int32Type.instance, Function.identity());
+    }
+
+    @Test
+    public void testEqQueriesAgainstShortIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildShortSearcher(newIndexComponents(),
+                                                                            EQ_TEST_LOWER_BOUND_INCLUSIVE, EQ_TEST_UPPER_BOUND_EXCLUSIVE);
+        testEqQueries(indexSearcher, ShortType.instance, Int32Type.instance, Function.identity());
+    }
+
+    @Test
+    public void testRangeQueriesAgainstDecimalIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildDecimalSearcher(newIndexComponents(),
+                                                                              BigDecimal.ZERO, BigDecimal.valueOf(10L));
+        testRangeQueries(indexSearcher, DecimalType.instance, DecimalType.instance, BigDecimal::valueOf,
+                         getLongsOnInterval(21L, 70L));
+    }
+
+    /** Returns every long from {@code lowerInclusive} to {@code upperInclusive}, both included, in ascending order. */
+    private List<Long> getLongsOnInterval(long lowerInclusive, long upperInclusive)
+    {
+        return LongStream.range(lowerInclusive, upperInclusive + 1L).boxed().collect(Collectors.toList());
+    }
+
+    @Test
+    public void testEqQueriesAgainstDecimalIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildDecimalSearcher(newIndexComponents(),
+                                                                              BigDecimal.valueOf(EQ_TEST_LOWER_BOUND_INCLUSIVE), BigDecimal.valueOf(EQ_TEST_UPPER_BOUND_EXCLUSIVE));
+        testEqQueries(indexSearcher, DecimalType.instance, DecimalType.instance, BigDecimal::valueOf);
+    }
+
+    @Test
+    public void testEqQueriesAgainstBigIntegerIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildBigIntegerSearcher(newIndexComponents(),
+                                                                                 BigInteger.valueOf(EQ_TEST_LOWER_BOUND_INCLUSIVE), BigInteger.valueOf(EQ_TEST_UPPER_BOUND_EXCLUSIVE));
+        testEqQueries(indexSearcher, IntegerType.instance, IntegerType.instance, BigInteger::valueOf);
+    }
+
+    @Test
+    public void testRangeQueriesAgainstBigIntegerIndex() throws Exception
+    {
+        IndexSearcher indexSearcher = KDTreeIndexBuilder.buildBigIntegerSearcher(newIndexComponents(),
+                                                                                 BigInteger.ZERO, BigInteger.valueOf(10L));
+        testRangeQueries(indexSearcher, IntegerType.instance, IntegerType.instance, BigInteger::valueOf);
+    }
+
+    @Test
+    public void testUnsupportedOperator() throws Exception
+    {
+        final IndexSearcher indexSearcher = KDTreeIndexBuilder.buildShortSearcher(newIndexComponents(), (short) 0, (short) 3);
+        try (AutoCloseable searcherCloser = indexSearcher::close) // releases the searcher whether or not search() throws
+        {
+            indexSearcher.search(new Expression(SAITester.createColumnContext("meh", ShortType.instance))
+            {{
+                operation = Op.NOT_EQ;
+                lower = upper = new Bound(ShortType.instance.decompose((short) 0), Int32Type.instance, true);
+            }}, SSTableQueryContext.forTest(), false);
+
+            fail("Expect IllegalArgumentException thrown, but didn't");
+        }
+        catch (IllegalArgumentException e)
+        {
+            // expected
+        }
+    }
+
+    /** Expects the first EQ result at the lower bound to be 0 and no result at the exclusive upper bound; the searcher is closed last, after its iterators, even on failure. */
+    private <T extends Number> void testEqQueries(final IndexSearcher indexSearcher,
+                                                  final NumberType<T> rawType, final NumberType<?> encodedType,
+                                                  final Function<Short, T> rawValueProducer) throws Exception
+    {
+        try (AutoCloseable searcherCloser = indexSearcher::close)
+        {
+            try (RangeIterator results = indexSearcher.search(new Expression(SAITester.createColumnContext("meh", rawType))
+            {{
+                operation = Op.EQ;
+                lower = upper = new Bound(rawType.decompose(rawValueProducer.apply(EQ_TEST_LOWER_BOUND_INCLUSIVE)), encodedType, true);
+            }}, SSTableQueryContext.forTest(), false))
+            {
+                assertEquals(results.getMinimum(), results.getCurrent());
+                assertTrue(results.hasNext());
+
+                assertEquals(0L, results.next().getLong());
+            }
+
+            try (RangeIterator results = indexSearcher.search(new Expression(SAITester.createColumnContext("meh", rawType))
+            {{
+                operation = Op.EQ;
+                lower = upper = new Bound(rawType.decompose(rawValueProducer.apply(EQ_TEST_UPPER_BOUND_EXCLUSIVE)), encodedType, true);
+            }}, SSTableQueryContext.forTest(), false))
+            {
+                assertFalse(results.hasNext());
+            }
+        }
+    }
+
+    /** Runs the range scenario with the default expectation that the results are 3 through 7. */
+    private <T extends Number> void testRangeQueries(final IndexSearcher indexSearcher,
+                                                     final NumberType<T> rawType, final NumberType<?> encodedType,
+                                                     final Function<Short, T> rawValueProducer) throws Exception
+    {
+        testRangeQueries(indexSearcher, rawType, encodedType, rawValueProducer, getLongsOnInterval(3L, 7L));
+    }
+
+    /** Checks the results for bounds 2 and 7 against {@code expectedTokenList}, then that bounds outside the indexed range match nothing; the searcher is closed last, even on failure. */
+    private <T extends Number> void testRangeQueries(final IndexSearcher indexSearcher,
+                                                     final NumberType<T> rawType, final NumberType<?> encodedType,
+                                                     final Function<Short, T> rawValueProducer, List<Long> expectedTokenList) throws Exception
+    {
+        try (AutoCloseable searcherCloser = indexSearcher::close)
+        {
+            try (RangeIterator results = indexSearcher.search(new Expression(SAITester.createColumnContext("meh", rawType))
+            {{
+                operation = Op.RANGE;
+
+                lower = new Bound(rawType.decompose(rawValueProducer.apply((short)2)), encodedType, false);
+                upper = new Bound(rawType.decompose(rawValueProducer.apply((short)7)), encodedType, true);
+            }}, SSTableQueryContext.forTest(), false))
+            {
+                assertEquals(results.getMinimum(), results.getCurrent());
+                assertTrue(results.hasNext());
+
+                List<Long> actualTokenList = Lists.newArrayList(Iterators.transform(results, Token::get));
+                assertEquals(expectedTokenList, actualTokenList);
+            }
+
+            try (RangeIterator results = indexSearcher.search(new Expression(SAITester.createColumnContext("meh", rawType))
+            {{
+                operation = Op.RANGE;
+                lower = new Bound(rawType.decompose(rawValueProducer.apply(RANGE_TEST_UPPER_BOUND_EXCLUSIVE)), encodedType, true);
+            }}, SSTableQueryContext.forTest(), false))
+            {
+                assertFalse(results.hasNext());
+            }
+
+            try (RangeIterator results = indexSearcher.search(new Expression(SAITester.createColumnContext("meh", rawType))
+            {{
+                operation = Op.RANGE;
+                upper = new Bound(rawType.decompose(rawValueProducer.apply(RANGE_TEST_LOWER_BOUND_INCLUSIVE)), encodedType, false);
+            }}, SSTableQueryContext.forTest(), false))
+            {
+                assertFalse(results.hasNext());
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/KDTreeSegmentMergerTest.java b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeSegmentMergerTest.java
new file mode 100644
index 000000000000..dd5399bec819
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/KDTreeSegmentMergerTest.java
@@ -0,0 +1,277 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.BKDReader;
+import org.apache.cassandra.index.sai.disk.v1.BKDTreeRamBuffer;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NumericUtils;
+
+import static org.apache.cassandra.index.sai.disk.QueryEventListeners.NO_OP_BKD_LISTENER;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertThat;
+
+/**
+ * Writes several one-dimensional KD-tree segments of random int terms, merges them with
+ * {@link MergeOneDimPointValues} and verifies that every term still maps to the row ids it was written with,
+ * both as emitted by the merger itself and after the merged tree has been written out and queried.
+ */
+public class KDTreeSegmentMergerTest extends SAITester
+{
+    /** Created and deleted by hand in {@link #setup()} / {@link #teardown()} rather than being used as a rule. */
+    private TemporaryFolder temporaryFolder = new TemporaryFolder();
+    /** Row ids written for each term, in the order they were added to the segments. */
+    private Map<Integer, List<Long>> expected;
+    /** Row ids read back for each term, either from the merger or from the merged index. */
+    private Map<Integer, List<Long>> actual;
+
+    @BeforeClass
+    public static void dbSetup() throws Throwable
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @Before
+    public void setup() throws Throwable
+    {
+        temporaryFolder.create();
+        expected = new HashMap<>();
+        actual = new HashMap<>();
+    }
+
+    @After
+    public void teardown() throws Throwable
+    {
+        temporaryFolder.delete();
+    }
+
+    /** Merges segments written under distinct generations and checks what the merger emits. */
+    @Test
+    public void compactionMergerTest() throws Throwable
+    {
+        performMerger(getRandom().nextIntBetween(1000, 15000), getRandom().nextIntBetween(2, 10), true);
+
+        assertPostingsMatch();
+    }
+
+    /** Merges segments that all share one generation and checks what the merger emits. */
+    @Test
+    public void postBuildMergerTest() throws Throwable
+    {
+        performMerger(getRandom().nextIntBetween(1000, 15000), getRandom().nextIntBetween(2, 10), false);
+
+        assertPostingsMatch();
+    }
+
+    /** Merges segments written under distinct generations, writes the merged tree and checks it by querying. */
+    @Test
+    public void compactionQueryTest() throws Throwable
+    {
+        performCompaction(getRandom().nextIntBetween(1000, 15000), getRandom().nextIntBetween(2, 10), true);
+
+        assertPostingsMatch();
+    }
+
+    /** Merges segments that all share one generation, writes the merged tree and checks it by querying. */
+    @Test
+    public void postBuildQueryTest() throws Throwable
+    {
+        performCompaction(getRandom().nextIntBetween(1000, 15000), getRandom().nextIntBetween(2, 10), false);
+
+        assertPostingsMatch();
+    }
+
+    /**
+     * Asserts that {@link #actual} holds exactly the terms of {@link #expected}, each with an equal posting list.
+     * The key sets are compared first so that a term present only in the output is reported as well.
+     */
+    private void assertPostingsMatch()
+    {
+        assertThat(actual.keySet(), is(expected.keySet()));
+        expected.forEach((term, postings) -> assertThat(actual.get(term), is(postings)));
+    }
+
+    /**
+     * Writes {@code segments} segments of {@code segmentSize} rows each. Row ids run consecutively from 0 across
+     * the segments and every row gets a random term drawn with {@code nextIntBetween(0, 100)}; each (term, row id)
+     * pair is recorded in {@link #expected}.
+     *
+     * @param compaction if {@code true} every segment is written under its own generation, otherwise all
+     *                   segments share generation 1
+     * @return the iterator state of a reader opened over each written segment, in segment order
+     */
+    private List<BKDReader.IteratorState> writeSegments(int segmentSize, int segments, boolean compaction) throws Throwable
+    {
+        final List<BKDReader.IteratorState> segmentIterators = new ArrayList<>();
+        final byte[] scratch = new byte[Integer.BYTES];
+
+        int maxSegmentRowId = 0;
+        int generation = 1;
+
+        for (int segment = 0; segment < segments; segment++)
+        {
+            BKDTreeRamBuffer buffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+
+            for (int docID = segmentSize * segment; docID < (segmentSize * segment) + segmentSize; docID++)
+            {
+                int value = getRandom().nextIntBetween(0, 100);
+                NumericUtils.intToSortableBytes(value, scratch, 0);
+                buffer.addPackedValue(docID, new BytesRef(scratch));
+                maxSegmentRowId = docID;
+                expected.computeIfAbsent(value, key -> new ArrayList<>()).add((long) docID);
+            }
+            segmentIterators.add(createReader(buffer, maxSegmentRowId, generation).iteratorState());
+            if (compaction)
+                generation++;
+        }
+
+        return segmentIterators;
+    }
+
+    /**
+     * Merges freshly written segments and collects every (term, row id) pair emitted by the merger into
+     * {@link #actual}.
+     */
+    private void performMerger(int segmentSize, int segments, boolean compaction) throws Throwable
+    {
+        List<BKDReader.IteratorState> segmentIterators = writeSegments(segmentSize, segments, compaction);
+        MergeOneDimPointValues merger = new MergeOneDimPointValues(segmentIterators, Integer.BYTES);
+
+        merger.intersect((rowId, packedValue) -> {
+            int value = NumericUtils.sortableBytesToInt(packedValue, 0);
+            actual.computeIfAbsent(value, key -> new ArrayList<>()).add(rowId);
+        });
+    }
+
+    /**
+     * Merges freshly written segments, writes the merged tree through a {@link NumericIndexWriter} and then queries
+     * the result once per expected term, collecting the row ids returned into {@link #actual}.
+     */
+    private void performCompaction(int segmentSize, int segments, boolean compaction) throws Throwable
+    {
+        List<BKDReader.IteratorState> segmentIterators = writeSegments(segmentSize, segments, compaction);
+        MergeOneDimPointValues merger = new MergeOneDimPointValues(segmentIterators, Integer.BYTES);
+
+        // Row ids are assigned consecutively from 0, so both totals follow directly from the segment layout.
+        final int totalRows = segmentSize * segments;
+        final int maxSegmentRowId = Math.max(0, totalRows - 1);
+
+        IndexComponents components = IndexComponents.create("test", new Descriptor(temporaryFolder.newFolder(), "test", "test", 20), null);
+
+        try (NumericIndexWriter indexWriter = new NumericIndexWriter(components, Integer.BYTES, maxSegmentRowId, totalRows, IndexWriterConfig.defaultConfig("test"), false))
+        {
+            SegmentMetadata.ComponentMetadataMap metadata = indexWriter.writeAll(merger);
+            final long bkdPosition = metadata.get(IndexComponents.NDIType.KD_TREE).root;
+            final long postingsPosition = metadata.get(IndexComponents.NDIType.KD_TREE_POSTING_LISTS).root;
+
+            // NOTE(review): the reader and its file handles are never closed here; if these types are closeable
+            // they should be released once the queries are done - confirm against BKDReader / FileHandle.
+            FileHandle kdtree = components.createFileHandle(components.kdTree);
+            FileHandle kdtreePostings = components.createFileHandle(components.kdTreePostingLists);
+            BKDReader reader = new BKDReader(components, kdtree, bkdPosition, kdtreePostings, postingsPosition);
+
+            for (int term : expected.keySet())
+            {
+                PostingList postingList = reader.intersect(buildQuery(term, term), NO_OP_BKD_LISTENER, new QueryContext());
+
+                for (long rowId = postingList.nextPosting(); rowId != PostingList.END_OF_STREAM; rowId = postingList.nextPosting())
+                    actual.computeIfAbsent(term, key -> new ArrayList<>()).add(rowId);
+            }
+        }
+    }
+
+    /**
+     * Writes {@code buffer} as a KD-tree index in a new temporary folder under the given {@code generation} and
+     * returns a {@link BKDReader} opened over the files just written. The writer is closed before returning.
+     */
+    private BKDReader createReader(BKDTreeRamBuffer buffer, int maxSegmentRowId, int generation) throws Throwable
+    {
+        IndexComponents components = IndexComponents.create("test", new Descriptor(temporaryFolder.newFolder(), "test", "test", generation), null);
+
+        // The writer is closeable (performCompaction uses it in try-with-resources), so release it here as well.
+        try (NumericIndexWriter writer = new NumericIndexWriter(components, Integer.BYTES, maxSegmentRowId, buffer.numRows(), IndexWriterConfig.defaultConfig("test"), false))
+        {
+            final SegmentMetadata.ComponentMetadataMap metadata = writer.writeAll(buffer.asPointValues());
+            final long bkdPosition = metadata.get(IndexComponents.NDIType.KD_TREE).root;
+            final long postingsPosition = metadata.get(IndexComponents.NDIType.KD_TREE_POSTING_LISTS).root;
+
+            FileHandle kdtree = components.createFileHandle(components.kdTree);
+            FileHandle kdtreePostings = components.createFileHandle(components.kdTreePostingLists);
+            return new BKDReader(components, kdtree, bkdPosition, kdtreePostings, postingsPosition);
+        }
+    }
+
+    /**
+     * Builds a visitor matching every packed int value in the closed range {@code [queryMin, queryMax]}.
+     * {@code visit} tests a single value, while {@code compare} classifies a cell by its min/max bounds as
+     * lying outside, inside or across that range.
+     */
+    private BKDReader.IntersectVisitor buildQuery(int queryMin, int queryMax)
+    {
+        return new BKDReader.IntersectVisitor()
+        {
+            @Override
+            public boolean visit(byte[] packedValue)
+            {
+                int x = NumericUtils.sortableBytesToInt(packedValue, 0);
+                return x >= queryMin && x <= queryMax;
+            }
+
+            @Override
+            public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+            {
+                int min = NumericUtils.sortableBytesToInt(minPackedValue, 0);
+                int max = NumericUtils.sortableBytesToInt(maxPackedValue, 0);
+                assert max >= min;
+
+                // Disjoint from the range -> outside; fully contained -> inside; anything else only
+                // partially overlaps, so the cell crosses the range.
+                if (max < queryMin || min > queryMax)
+                {
+                    return PointValues.Relation.CELL_OUTSIDE_QUERY;
+                }
+                else if (min >= queryMin && max <= queryMax)
+                {
+                    return PointValues.Relation.CELL_INSIDE_QUERY;
+                }
+                else
+                {
+                    return PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+            }
+        };
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/NodeStartupTest.java b/test/unit/org/apache/cassandra/index/sai/disk/NodeStartupTest.java
new file mode 100644
index 000000000000..00e1bb1fc00e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/NodeStartupTest.java
@@ -0,0 +1,369 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Objects;
+import java.util.Set;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ObjectArrays;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexBuilder;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.schema.Schema;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+@RunWith(Parameterized.class)
+public class NodeStartupTest extends SAITester
+{
+    private static final int DOCS = 100;
+
+    private static final Injections.Barrier preJoinWaitsForBuild = Injections.newBarrierAwait("pre_join_build", 1, false)
+                                                                             .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startPreJoinTask").atEntry())
+                                                                             .build();
+
+    private static final Injections.Barrier buildReleasesPreJoin = Injections.newBarrierCountDown("pre_join_build", 1, false)
+                                                                             .add(InvokePointBuilder.newInvokePoint().onClass(SecondaryIndexManager.class).onMethod("markIndexBuilt").atExit())
+                                                                             .build();
+
+    private static final Injections.Barrier buildWaitsForPreJoin = Injections.newBarrierAwait("build_pre_join", 1, false)
+                                                                             .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild").atEntry())
+                                                                             .build();
+
+    private static final Injections.Barrier preJoinReleasesBuild = Injections.newBarrierCountDown("build_pre_join", 1, false)
+                                                                             .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startPreJoinTask").atExit())
+                                                                             .build();
+
+    private static final Injections.Barrier preJoinStartWaitsMidBuild = Injections.newBarrierAwait("pre_join_mid_build", 1, false)
+                                                                                  .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startPreJoinTask").atEntry())
+                                                                                  .build();
+
+    private static final Injections.Barrier midBuildReleasesPreJoinStart = Injections.newBarrierCountDown("pre_join_mid_build", 1, false)
+                                                                                     .add(InvokePointBuilder.newInvokePoint().onClass(SSTableIndexWriter.class).onMethod("addRow").atEntry())
+                                                                                     .build();
+
+    private static final Injections.Barrier midBuildWaitsPreJoinFinish = Injections.newBarrierAwait("mid_build_pre_join", 1, false)
+                                                                                   .add(InvokePointBuilder.newInvokePoint().onClass(SSTableIndexWriter.class).onMethod("addRow").atExit())
+                                                                                   .build();
+
+    private static final Injections.Barrier preJoinFinishReleasesMidBuild = Injections.newBarrierCountDown("mid_build_pre_join", 1, false)
+                                                                                      .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startPreJoinTask").atExit())
+                                                                                      .build();
+
+    private static final Injections.Barrier[] barriers = new Injections.Barrier[] { preJoinWaitsForBuild, buildReleasesPreJoin, buildWaitsForPreJoin,
+                                                                                    preJoinReleasesBuild, preJoinStartWaitsMidBuild, midBuildReleasesPreJoinStart, midBuildWaitsPreJoinFinish, preJoinFinishReleasesMidBuild
+    };
+
+    private static final Injections.Counter buildCounter = Injections.newCounter("buildCounter")
+                                                                     .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndexBuilder.class).onMethod("build").atEntry())
+                                                                     .build();
+
+    private static final Injections.Counter deleteComponentCounter = Injections.newCounter("deletedComponentCounter")
+                                                                               .add(InvokePointBuilder.newInvokePoint().onClass(IndexComponents.class).onMethod("deleteComponent").atEntry())
+                                                                               .build();
+
+    private static final Injections.Counter[] counters = new Injections.Counter[] { buildCounter, deleteComponentCounter };
+
+    private static Throwable error = null;
+
+    private String indexName = null;
+
+    enum Populator
+    {
+        INDEXABLE_ROWS("populateIndexableRows"),
+        NON_INDEXABLE_ROWS("populateNonIndexableRows"),
+        TOMBSTONES("populateTombstones");
+
+        /** Name of the public no-argument method looked up on the test instance to load the data. */
+        private final String populator;
+
+        Populator(String populator)
+        {
+            this.populator = populator;
+        }
+
+        /** Invokes the populator method reflectively on {@code test}; any failure, or a non-null {@code error}, fails the test with its cause attached. */
+        public void populate(NodeStartupTest test)
+        {
+            try
+            {
+                test.getClass().getMethod(populator).invoke(test);
+            }
+            catch (Exception e)
+            {
+                Throwable cause = e.getCause() != null ? e.getCause() : e; // invoke() wraps what the populator threw
+                throw new AssertionError("Populator " + name() + " failed because " + cause, cause);
+            }
+            if (error != null)
+                throw new AssertionError("Populator " + name() + " failed because " + error, error);
+        }
+    }
+
+    enum IndexStateOnRestart // index state variants; supplied as the second parameter of every startup scenario
+    {
+        VALID,
+        ALL_EMPTY,
+        PER_SSTABLE_INCOMPLETE,
+        PER_COLUMN_INCOMPLETE,
+        PER_SSTABLE_CORRUPT,
+        PER_COLUMN_CORRUPT;
+    }
+
+    enum StartupTaskRunOrder // each constant bundles the barrier injections that are enabled together
+    {
+        PRE_JOIN_RUNS_AFTER_BUILD(preJoinWaitsForBuild, buildReleasesPreJoin),
+        PRE_JOIN_RUNS_BEFORE_BUILD(buildWaitsForPreJoin, preJoinReleasesBuild),
+        PRE_JOIN_RUNS_MID_BUILD(preJoinStartWaitsMidBuild, midBuildReleasesPreJoinStart, midBuildWaitsPreJoinFinish, preJoinFinishReleasesMidBuild);
+
+        private final Injection[] injections;
+
+        StartupTaskRunOrder(Injections.Barrier... barriers)
+        {
+            injections = barriers;
+        }
+
+        public void enable() // switches on every injection of this constant, in declaration order
+        {
+            Stream.of(injections).forEach(injection -> injection.enable());
+        }
+    }
+
+    @Before
+    public void setup() throws Throwable // per-test fixture: fresh table and index, injections registered and reset
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, v1 int)");
+        indexName = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName())); // %%s is left as %s by format(); presumably filled in by createIndex
+        Injections.inject(ObjectArrays.concat(barriers, counters, Injection.class)); // registers all barriers and counters in one call
+        Stream.of(barriers).forEach(Injections.Barrier::reset);
+        Stream.of(barriers).forEach(Injections.Barrier::disable); // every barrier starts disabled; StartupTaskRunOrder.enable() turns on one set
+        Stream.of(counters).forEach(Injections.Counter::reset);
+        Stream.of(counters).forEach(Injection::enable); // counters, unlike barriers, are enabled for every test
+        error = null; // static field, so it is cleared before each run
+    }
+
+    @Parameterized.Parameter(0)
+    public Populator populator; // fills the table at the start of startupOrderingTest
+    @Parameterized.Parameter(1)
+    public IndexStateOnRestart state; // passed to setState() before the restart
+    @Parameterized.Parameter(2)
+    public StartupTaskRunOrder order; // barrier set enabled before the restart
+    @Parameterized.Parameter(3)
+    public int builds; // expected buildCounter value at the end of the test
+    @Parameterized.Parameter(4)
+    public int deletedComponents; // expected deleteComponentCounter value at the end of the test
+    @Parameterized.Parameter(5)
+    public int expectedDocuments; // expected row count of the v1 >= 0 query, before and after the restart
+
+    @SuppressWarnings("unused")
+    @Parameterized.Parameters(name = "{0} {1} {2}")
+    public static List<Object[]> startupScenarios()
+    {
+        List<Object[]> cases = new LinkedList<>();
+        // columns: populator, state on restart, task run order, expected builds, expected deleted components, expected documents
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 0, 0, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 0, 0, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.ALL_EMPTY, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 1, 2, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.ALL_EMPTY, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 1, 2, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.ALL_EMPTY, StartupTaskRunOrder.PRE_JOIN_RUNS_MID_BUILD, 1, 2, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 1, 9, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 1, 9, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_MID_BUILD, 1, 9, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 1, 5, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 1, 5, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_INCOMPLETE, StartupTaskRunOrder.PRE_JOIN_RUNS_MID_BUILD, 1, 5, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 1, 10, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 1, 10, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_SSTABLE_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_MID_BUILD, 1, 10, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 1, 6, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 1, 6, DOCS });
+        cases.add(new Object[]{ Populator.INDEXABLE_ROWS, IndexStateOnRestart.PER_COLUMN_CORRUPT, StartupTaskRunOrder.PRE_JOIN_RUNS_MID_BUILD, 1, 6, DOCS });
+        cases.add(new Object[]{ Populator.NON_INDEXABLE_ROWS, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 0, 0, 0 });
+        cases.add(new Object[]{ Populator.NON_INDEXABLE_ROWS, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 0, 0, 0 });
+        cases.add(new Object[]{ Populator.TOMBSTONES, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_BEFORE_BUILD, 0, 0, 0 });
+        cases.add(new Object[]{ Populator.TOMBSTONES, IndexStateOnRestart.VALID, StartupTaskRunOrder.PRE_JOIN_RUNS_AFTER_BUILD, 0, 0, 0 });
+
+        return cases;
+    }
+
+    @Test
+    public void startupOrderingTest() throws Throwable
+    {
+        final String indexedRowsQuery = "SELECT * FROM %s WHERE v1 >= 0";
+        populator.populate(this);
+        // Before anything is tampered with, the index must be complete and return the expected rows.
+        assertTrue(isIndexQueryable());
+        assertTrue(isGroupIndexComplete());
+        assertTrue(isColumnIndexComplete());
+        Assert.assertEquals(expectedDocuments, execute(indexedRowsQuery).size());
+        // Put the index files into the scenario's state, enable its barriers, then restart.
+        // NOTE(review): failures that remove()/corrupt() record in `error` are not checked here - confirm that is intended.
+        setState(state);
+        order.enable();
+        simulateNodeRestart();
+        // Whatever the starting state, the index must again be complete and return the same rows.
+        assertTrue(isIndexQueryable());
+        assertTrue(isGroupIndexComplete());
+        assertTrue(isColumnIndexComplete());
+        Assert.assertEquals(expectedDocuments, execute(indexedRowsQuery).size());
+        // The counters must report the expected number of builds and component deletions.
+        Assert.assertEquals(builds, buildCounter.get());
+        Assert.assertEquals(deletedComponents, deleteComponentCounter.get());
+    }
+
+    public void populateIndexableRows()
+    {
+        // Presumably invoked reflectively through Populator.populate; failures are parked in `error` instead of thrown.
+        try
+        {
+            int id = 0;
+            while (id < DOCS)
+                execute("INSERT INTO %s (id, v1) VALUES (?, 0)", id++);
+            flush();
+        }
+        catch (Throwable failure)
+        {
+            failure.printStackTrace();
+            error = failure;
+        }
+    }
+
+    public void populateNonIndexableRows()
+    {
+        // Writes DOCS rows that carry only the key, so no row has a value in the indexed column v1.
+        try
+        {
+            int id = 0;
+            while (id < DOCS)
+                execute("INSERT INTO %s (id) VALUES (?)", id++);
+            flush();
+        }
+        catch (Throwable failure)
+        {
+            failure.printStackTrace();
+            error = failure;
+        }
+    }
+
+    public void populateTombstones()
+    {
+        // Issues DOCS partition deletions and flushes them; no live rows are written.
+        try
+        {
+            int id = 0;
+            while (id < DOCS)
+                execute("DELETE FROM %s WHERE id=?", id++);
+            flush();
+        }
+        catch (Throwable failure)
+        {
+            failure.printStackTrace();
+            error = failure;
+        }
+    }
+
+    private boolean isGroupIndexComplete() throws Exception // true when no live sstable fails the group-index completeness check
+    {
+        ColumnFamilyStore store = Objects.requireNonNull(Schema.instance.getKeyspaceInstance(KEYSPACE)).getColumnFamilyStore(currentTable());
+        return store.getLiveSSTables().stream().noneMatch(reader -> !IndexComponents.isGroupIndexComplete(reader.descriptor));
+    }
+
+    private boolean isColumnIndexComplete() throws Exception // true when no live sstable fails the column-index completeness check for indexName
+    {
+        ColumnFamilyStore store = Objects.requireNonNull(Schema.instance.getKeyspaceInstance(KEYSPACE)).getColumnFamilyStore(currentTable());
+        return store.getLiveSSTables().stream().noneMatch(reader -> !IndexComponents.isColumnIndexComplete(reader.descriptor, indexName));
+    }
+
+    private void setState(IndexStateOnRestart state) // applies the requested damage to the index components
+    {
+        switch (state)
+        {
+            case VALID: // leave everything as it is
+                return;
+            case ALL_EMPTY: // every per-sstable and per-column component
+                allIndexComponents().forEach(component -> remove(component));
+                return;
+            case PER_SSTABLE_INCOMPLETE:
+                remove(IndexComponents.GROUP_COMPLETION_MARKER);
+                return;
+            case PER_COLUMN_INCOMPLETE:
+                remove(IndexComponents.NDIType.COLUMN_COMPLETION_MARKER.newComponent(indexName));
+                return;
+            case PER_SSTABLE_CORRUPT:
+                corrupt(IndexComponents.GROUP_META);
+                return;
+            case PER_COLUMN_CORRUPT:
+                corrupt(IndexComponents.NDIType.META.newComponent(indexName));
+                return;
+        }
+    }
+
+    private Set<Component> allIndexComponents()
+    {
+        // Union of the per-sstable components and the per-column components of this test's index.
+        Set<Component> union = new HashSet<>(IndexComponents.PER_SSTABLE_COMPONENTS);
+        union.addAll(IndexComponents.perColumnComponents(indexName, false));
+        return union;
+    }
+
+    private void remove(Component component) // applies CorruptionType.REMOVED; a failure is recorded in `error`, not thrown
+    {
+        try
+        {
+            corruptNDIComponent(component, CorruptionType.REMOVED);
+        }
+        catch (Exception failure)
+        {
+            failure.printStackTrace();
+            error = failure;
+        }
+    }
+
+    private void corrupt(Component component) // applies CorruptionType.TRUNCATED_HEADER; a failure is recorded in `error`, not thrown
+    {
+        try
+        {
+            corruptNDIComponent(component, CorruptionType.TRUNCATED_HEADER);
+        }
+        catch (Exception failure)
+        {
+            failure.printStackTrace();
+            error = failure;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/RAMPostingSlicesTest.java b/test/unit/org/apache/cassandra/index/sai/disk/RAMPostingSlicesTest.java
new file mode 100644
index 000000000000..210cae4a5260
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/RAMPostingSlicesTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.FixedBitSet;
+
+public class RAMPostingSlicesTest extends NdiRandomizedTest
+{
+    /**
+     * Appends one million postings, spread randomly over 1024 terms, to a {@link RAMPostingSlices}
+     * and then reads every term's slice back, checking each decoded row id against a bit set of
+     * the ids that were written for that term.
+     */
+    @Test
+    public void testRAMPostingSlices() throws Exception
+    {
+        RAMPostingSlices slices = new RAMPostingSlices(Counter.newCounter());
+
+        // Last row id written per term; -1 means no slice has been created for the term yet.
+        int[] lastRowIdByTerm = new int[1024];
+        Arrays.fill(lastRowIdByTerm, -1);
+        FixedBitSet[] writtenByTerm = new FixedBitSet[lastRowIdByTerm.length];
+
+        for (int written = 0; written < 1_000_000; written++)
+        {
+            int termID = nextInt(lastRowIdByTerm.length);
+
+            // A term is seen for the first time exactly when its counter is still -1, so the
+            // slice and the tracking bit set are created together.
+            if (lastRowIdByTerm[termID] == -1)
+            {
+                slices.createNewSlice(termID);
+                writtenByTerm[termID] = new FixedBitSet(1_000_000);
+            }
+
+            int rowId = ++lastRowIdByTerm[termID];
+            writtenByTerm[termID].set(rowId);
+            slices.writeVInt(termID, rowId);
+        }
+
+        for (int termID = 0; termID < lastRowIdByTerm.length; termID++)
+        {
+            ByteSliceReader reader = new ByteSliceReader();
+            slices.initReader(reader, termID);
+
+            int lastRead = -1;
+            while (!reader.eof())
+            {
+                lastRead = reader.readVInt();
+                assertTrue("termID=" + termID + " segmentRowId=" + lastRead, writtenByTerm[termID].get(lastRead));
+            }
+            assertEquals(lastRowIdByTerm[termID], lastRead); // expected value first, so a failure message reads correctly
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/RAMStringIndexerTest.java b/test/unit/org/apache/cassandra/index/sai/disk/RAMStringIndexerTest.java
new file mode 100644
index 000000000000..a4b37fbd43d7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/RAMStringIndexerTest.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+import org.apache.lucene.util.BytesRef;
+
+import static org.apache.cassandra.utils.ByteBufferUtil.string;
+
+public class RAMStringIndexerTest extends NdiRandomizedTest
+{
+    /** Indexes two terms with interleaved postings and checks the posting list returned for each term. */
+    @Test
+    public void test() throws Exception
+    {
+        RAMStringIndexer indexer = new RAMStringIndexer(UTF8Type.instance);
+
+        indexer.add(new BytesRef("0"), 100);
+        indexer.add(new BytesRef("2"), 102);
+        indexer.add(new BytesRef("0"), 200);
+        indexer.add(new BytesRef("2"), 202);
+        indexer.add(new BytesRef("2"), 302);
+
+        List<List<Long>> matches = Arrays.asList(Arrays.asList(100L, 200L), Arrays.asList(102L, 202L, 302L));
+
+        try (TermsIterator terms = indexer.getTermsWithPostings())
+        {
+            int ord = 0;
+            while (terms.hasNext())
+            {
+                terms.next();
+                try (PostingList postings = terms.postings())
+                {
+                    List<Long> results = new ArrayList<>();
+                    long segmentRowId;
+                    while ((segmentRowId = postings.nextPosting()) != PostingList.END_OF_STREAM)
+                    {
+                        results.add(segmentRowId);
+                    }
+                    assertEquals(matches.get(ord++), results);
+                }
+            }
+            // Without this check an iterator that yields too few terms would pass silently.
+            assertEquals(matches.size(), ord);
+        }
+    }
+
+    /**
+     * Indexes a random number of zero-padded terms, each with the same run of postings, and
+     * checks that every term and every posting comes back, in order.
+     */
+    @Test
+    public void testLargeSegment() throws Exception
+    {
+        final RAMStringIndexer indexer = new RAMStringIndexer(UTF8Type.instance);
+        final int numTerms = between(1 << 10, 1 << 13);
+        final int numPostings = between(1 << 5, 1 << 10);
+
+        for (int id = 0; id < numTerms; ++id)
+        {
+            final BytesRef term = new BytesRef(String.format("%04d", id));
+            for (int posting = 0; posting < numPostings; ++posting)
+                indexer.add(term, posting);
+        }
+
+        long termOrd = 0L;
+        // The iterator is a closeable resource (test() treats it as one), so release it even when an assertion fails.
+        try (TermsIterator terms = indexer.getTermsWithPostings())
+        {
+            while (terms.hasNext())
+            {
+                final ByteComparable term = terms.next();
+                final ByteBuffer decoded = ByteBuffer.wrap(ByteSourceInverse.readBytes(term.asComparableBytes(ByteComparable.Version.OSS41)));
+                assertEquals(String.format("%04d", termOrd), string(decoded));
+                try (PostingList postingList = terms.postings())
+                {
+                    assertEquals(numPostings, postingList.size());
+                    for (int i = 0; i < numPostings; ++i)
+                        assertEquals(i, postingList.nextPosting());
+                    assertEquals(PostingList.END_OF_STREAM, postingList.nextPosting());
+                }
+                termOrd++;
+            }
+        }
+        assertEquals(numTerms, termOrd);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/SegmentFlushTest.java b/test/unit/org/apache/cassandra/index/sai/disk/SegmentFlushTest.java
new file mode 100644
index 000000000000..af544f721ed6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/SegmentFlushTest.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.disk.v1.TermsReader;
+import org.apache.cassandra.index.sai.metrics.QueryEventListeners;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.Util.dk;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+// TODO: This test needs rethinking because we now always end up with a single segment after a flush
+// and we are not restricted to Integer.MAX_VALUE in the segments
+public class SegmentFlushTest
+{
+    private static long segmentRowIdOffset; // expected segment values, filled in by testFlushBetweenRowIds(long, long, int)
+    private static int posting1;
+    private static int posting2;
+    private static DecoratedKey minKey;
+    private static DecoratedKey maxKey;
+    private static ByteBuffer minTerm;
+    private static ByteBuffer maxTerm;
+    private static int numRows;
+
+    @BeforeClass
+    public static void init()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @After
+    public void reset()
+    {
+        SegmentBuilder.updateLastValidSegmentRowId(-1); // reset
+    }
+
+    @Test
+    public void testFlushBetweenRowIds() throws Exception
+    {
+        // exceeds max rowId per segment
+        testFlushBetweenRowIds(0, Integer.MAX_VALUE, 1);
+        testFlushBetweenRowIds(0, Long.MAX_VALUE - 1, 1);
+        testFlushBetweenRowIds(0, SegmentBuilder.LAST_VALID_SEGMENT_ROW_ID + 1, 1);
+        testFlushBetweenRowIds(Integer.MAX_VALUE - SegmentBuilder.LAST_VALID_SEGMENT_ROW_ID - 1, Integer.MAX_VALUE - 1, 1);
+        testFlushBetweenRowIds(Long.MAX_VALUE - SegmentBuilder.LAST_VALID_SEGMENT_ROW_ID - 1, Long.MAX_VALUE - 1, 1);
+    }
+
+    @Test
+    public void testNoFlushBetweenRowIds() throws Exception
+    {
+        // does not exceed max rowId per segment
+        testFlushBetweenRowIds(0, SegmentBuilder.LAST_VALID_SEGMENT_ROW_ID, 1);
+        testFlushBetweenRowIds(Long.MAX_VALUE - SegmentBuilder.LAST_VALID_SEGMENT_ROW_ID, Long.MAX_VALUE - 1, 1);
+    }
+
+    private void testFlushBetweenRowIds(long sstableRowId1, long sstableRowId2, int segments) throws Exception
+    {
+        // Writes two rows with the given sstable row ids through an SSTableIndexWriter, flushes, and checks
+        // the number of segments in the column metadata plus the contents of the first segment.
+        Path tmpDir = Files.createTempDirectory("SegmentFlushTest");
+        Descriptor descriptor = new Descriptor(tmpDir.toFile(), "ks", "cf", 1);
+
+        ColumnMetadata column = ColumnMetadata.regularColumn("sai", "internal", "column", UTF8Type.instance);
+        IndexMetadata config = IndexMetadata.fromSchemaMetadata("index_name", IndexMetadata.Kind.CUSTOM, null);
+        ColumnContext context = new ColumnContext("ks", "cf",
+                                                  UTF8Type.instance, new ClusteringComparator(),
+                                                  column, config, IndexWriterConfig.defaultConfig("test"));
+        SSTableIndexWriter writer = new SSTableIndexWriter(descriptor, context, StorageAttachedIndex.SEGMENT_BUILD_MEMORY_LIMITER, () -> true, null);
+
+        List<DecoratedKey> keys = Arrays.asList(dk("1"), dk("2"));
+        Collections.sort(keys);
+
+        DecoratedKey key1 = keys.get(0);
+        ByteBuffer term1 = UTF8Type.instance.decompose("a");
+        Row row1 = createRow(column, term1);
+        writer.addRow(key1, sstableRowId1, row1);
+
+        // expect a flush if exceed max rowId per segment
+        DecoratedKey key2 = keys.get(1);
+        ByteBuffer term2 = UTF8Type.instance.decompose("b");
+        Row row2 = createRow(column, term2);
+        writer.addRow(key2, sstableRowId2, row2);
+
+        writer.flush();
+
+        IndexComponents components = IndexComponents.create(context.getIndexName(), descriptor, null);
+        MetadataSource source = MetadataSource.loadColumnMetadata(components);
+
+        // verify segment count
+        List<SegmentMetadata> segmentMetadatas = SegmentMetadata.load(source, null);
+        assertEquals(segments, segmentMetadatas.size());
+
+        // verify segment metadata
+        SegmentMetadata segmentMetadata = segmentMetadatas.get(0);
+        segmentRowIdOffset = 0;
+        posting1 = 0;
+        posting2 = (int) (sstableRowId2 - segmentRowIdOffset);
+        minKey = key1;
+        maxKey = key2;
+        minTerm = term1;
+        maxTerm = term2;
+        numRows = 2;
+        verifySegmentMetadata(segmentMetadata);
+        verifyStringIndex(components, segmentMetadata);
+    }
+
+    private void verifyStringIndex(IndexComponents components, SegmentMetadata segmentMetadata) throws Exception
+    {
+        FileHandle termsData = components.createFileHandle(components.termsData);
+        FileHandle postingLists = components.createFileHandle(components.postingLists);
+
+        long termsFooterPointer = Long.parseLong(segmentMetadata.componentMetadatas.get(IndexComponents.NDIType.TERMS_DATA).attributes.get(SAICodecUtils.FOOTER_POINTER));
+
+        // The terms iterator is closeable too, so it is opened as a resource next to the reader instead of being leaked.
+        try (TermsReader reader = new TermsReader(components, termsData, postingLists,
+                                                  segmentMetadata.componentMetadatas.get(components.termsData.ndiType).root, termsFooterPointer);
+             TermsIterator iterator = reader.allTerms(0, QueryEventListeners.NO_OP_TRIE_LISTENER))
+        {
+            assertEquals(minTerm, iterator.getMinTerm());
+            assertEquals(maxTerm, iterator.getMaxTerm());
+
+            verifyTermPostings(iterator, minTerm, posting1, posting1);
+            if (numRows > 1)
+            {
+                verifyTermPostings(iterator, maxTerm, posting2, posting2);
+            }
+
+            assertFalse(iterator.hasNext());
+        }
+    }
+
+    private void verifyTermPostings(TermsIterator iterator, ByteBuffer expectedTerm, int minSegmentRowId, int maxSegmentRowId) throws IOException
+    {
+        assertEquals(0, ByteComparable.compare(iterator.next(), ByteComparable.fixedLength(expectedTerm), ByteComparable.Version.OSS41));
+        try (PostingList postings = iterator.postings()) // released once its size has been checked
+        {
+            assertEquals(minSegmentRowId == maxSegmentRowId ? 1 : 2, postings.size());
+        }
+    }
+
+    private void verifySegmentMetadata(SegmentMetadata segmentMetadata)
+    {
+        assertEquals(segmentRowIdOffset, segmentMetadata.segmentRowIdOffset);
+        assertEquals(minKey, segmentMetadata.minKey);
+        assertEquals(maxKey, segmentMetadata.maxKey);
+        assertEquals(minTerm, segmentMetadata.minTerm);
+        assertEquals(maxTerm, segmentMetadata.maxTerm);
+        assertEquals(numRows, segmentMetadata.numRows);
+    }
+
+    private Row createRow(ColumnMetadata column, ByteBuffer value)
+    {
+        Row.Builder builder = BTreeRow.sortedBuilder(); // single live cell for the column, at timestamp 0
+        builder.newRow(Clustering.EMPTY);
+        builder.addCell(BufferCell.live(column, 0, value));
+        return builder.build();
+    }
+
+    private void assertOverflow(long sstableRowId1, long sstableRowId2) throws Exception
+    {
+        try
+        {
+            testFlushBetweenRowIds(sstableRowId1, sstableRowId2, 0);
+            fail("Expect integer overflow, but didn't");
+        }
+        catch (ArithmeticException e)
+        {
+            assertTrue(e.getMessage().contains("integer overflow"));
+        }
+    }
+
+    private void assertIllegalEndOfStream(long sstableRowId1, long sstableRowId2) throws Exception
+    {
+        try
+        {
+            testFlushBetweenRowIds(sstableRowId1, sstableRowId2, 0);
+            fail("Expect END_OF_STREAM IllegalArgumentException, but didn't"); // was a copy of the overflow message
+        }
+        catch (IllegalArgumentException e)
+        {
+            assertTrue(e.getMessage().contains("END_OF_STREAM"));
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/SegmentMergerTest.java b/test/unit/org/apache/cassandra/index/sai/disk/SegmentMergerTest.java
new file mode 100644
index 000000000000..b07eeb2495c1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/SegmentMergerTest.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.hamcrest.CoreMatchers.is;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Creates a storage-attached index over an sstable that has already been flushed, with the
+ * {@code cassandra.test.sai.segment_build_memory_limit} property set to 70000, and checks that the
+ * build used more than one segment builder, that the finished index metadata holds a single segment,
+ * and that an equality query on every inserted value returns exactly the rows holding that value.
+ */
+public class SegmentMergerTest extends SAITester
+{
+    // Counter attached to SSTableIndexWriter.newSegmentBuilder; the tests inject it before inserting data.
+    protected static final Injections.Counter SEGMENT_BUILD_COUNTER = Injections.newCounter("SegmentBuildCounter")
+                                                                                .add(newInvokePoint().onClass(SSTableIndexWriter.class).onMethod("newSegmentBuilder"))
+                                                                                .build();
+
+    @Before
+    public void setup() throws Throwable
+    {
+        // NOTE(review): this property is never restored in this class, so it stays set for later tests in the same JVM - confirm that is acceptable.
+        System.setProperty("cassandra.test.sai.segment_build_memory_limit", "70000");
+        requireNetwork();
+        SEGMENT_BUILD_COUNTER.reset();
+    }
+
+    /**
+     * Index build over a {@code text} column holding the decimal strings of random numbers.
+     */
+    @Test
+    public void literalIndexTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, value text, PRIMARY KEY(pk))");
+        disableCompaction();
+
+        Injections.inject(SEGMENT_BUILD_COUNTER);
+
+        // Insert sufficient rows to make sure more than 1 segments are created before segment compaction
+        Map<String, List<Integer>> expected = new HashMap<>();
+
+        // Draw the row count once: a random bound in the loop condition is re-drawn on every
+        // iteration, so the loop tends to stop just above the lower bound.
+        int numRows = getRandom().nextIntBetween(50000, 100000);
+        for (int rowId = 0; rowId < numRows; rowId++)
+        {
+            String value = Integer.toString(getRandom().nextIntBetween(0, 1000));
+            execute("INSERT INTO %s (pk, value) VALUES (?, ?)", rowId, value);
+            expected.computeIfAbsent(value, ignored -> new ArrayList<>()).add(rowId);
+        }
+        flush();
+
+        String indexName = createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        // All we are interested in is that before the segment compaction there were more than 1 segment created
+        assertTrue(SEGMENT_BUILD_COUNTER.get() > 1);
+
+        List<SegmentMetadata> segments = getSegments(indexName, 1);
+
+        // Post-build the index only has 1 segment
+        assertEquals(1, segments.size());
+
+        Map<String, List<Integer>> actual = new HashMap<>();
+
+        for (String term : expected.keySet())
+        {
+            UntypedResultSet results = execute("SELECT * FROM %s WHERE value = ?", term);
+            List<Integer> postings = actual.computeIfAbsent(term, ignored -> new ArrayList<>());
+            results.forEach(row -> postings.add(row.getInt("pk")));
+            postings.sort(Integer::compareTo);
+        }
+
+        // assertThat takes the actual value first; the matcher carries the expected value
+        for (Map.Entry<String, List<Integer>> entry : expected.entrySet())
+            assertThat("Postings comparison failed for term = " + entry.getKey(), actual.get(entry.getKey()), is(entry.getValue()));
+    }
+
+    /**
+     * Index build over an {@code int} column holding random numbers.
+     */
+    @Test
+    public void numericIndexTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, value int, PRIMARY KEY(pk))");
+        disableCompaction();
+
+        Injections.inject(SEGMENT_BUILD_COUNTER);
+
+        // Insert sufficient rows to make sure more than 1 segments are created before segment compaction
+        Map<Integer, List<Integer>> expected = new HashMap<>();
+
+        // Draw the row count once: a random bound in the loop condition is re-drawn on every
+        // iteration, so the loop tends to stop just above the lower bound.
+        int numRows = getRandom().nextIntBetween(10000, 50000);
+        for (int rowId = 0; rowId < numRows; rowId++)
+        {
+            int value = getRandom().nextIntBetween(0, 1000);
+            execute("INSERT INTO %s (pk, value) VALUES (?, ?)", rowId, value);
+            expected.computeIfAbsent(value, ignored -> new ArrayList<>()).add(rowId);
+        }
+        flush();
+
+        String indexName = createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        // All we are interested in is that before the segment compaction there were more than 1 segment created
+        assertTrue(SEGMENT_BUILD_COUNTER.get() > 1);
+
+        List<SegmentMetadata> segments = getSegments(indexName, 1);
+
+        // Post-build the index only has 1 segment
+        assertEquals(1, segments.size());
+
+        Map<Integer, List<Integer>> actual = new HashMap<>();
+
+        for (int term : expected.keySet())
+        {
+            UntypedResultSet results = execute("SELECT * FROM %s WHERE value = ?", term);
+            List<Integer> postings = actual.computeIfAbsent(term, ignored -> new ArrayList<>());
+            results.forEach(row -> postings.add(row.getInt("pk")));
+            postings.sort(Integer::compareTo);
+        }
+
+        // assertThat takes the actual value first; the matcher carries the expected value
+        for (Map.Entry<Integer, List<Integer>> entry : expected.entrySet())
+            assertThat("Postings comparison failed for term = " + entry.getKey(), actual.get(entry.getKey()), is(entry.getValue()));
+    }
+
+    /**
+     * Loads the segment metadata written for the given index on one sstable of the current table,
+     * asserting first that the sstable's group index and column index are both complete.
+     *
+     * @param indexName  name of the index to inspect
+     * @param generation generation of the sstable whose index metadata is read
+     * @return the segments listed in the column metadata of that sstable's index
+     */
+    private List<SegmentMetadata> getSegments(String indexName, int generation) throws Throwable
+    {
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        File dataFolder = new Directories(cfs.metadata()).getDirectoryForNewSSTables();
+        Descriptor descriptor = new Descriptor(dataFolder, cfs.keyspace.getName(), cfs.getTableName(), generation, SSTableFormat.Type.current());
+        TableMetadata table = currentTableMetadata();
+        assertTrue(IndexComponents.isGroupIndexComplete(descriptor));
+        IndexMetadata index = table.indexes.get(indexName).get();
+        ColumnContext context = new ColumnContext(table, index);
+        assertTrue(IndexComponents.isColumnIndexComplete(descriptor, context.getIndexName()));
+        IndexComponents components = IndexComponents.create(context.getIndexName(), descriptor, table.params.compression);
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+        return SegmentMetadata.load(source, null);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/SegmentTest.java b/test/unit/org/apache/cassandra/index/sai/disk/SegmentTest.java
new file mode 100644
index 000000000000..35426e1d0832
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/SegmentTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class SegmentTest
+{
+    private static IPartitioner partitioner;
+    private static Token min, max;
+    private static List<Token> tokens;
+
+    @BeforeClass
+    public static void init()
+    {
+        DatabaseDescriptor.daemonInitialization();
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        partitioner = DatabaseDescriptor.getPartitioner();
+        min = partitioner.getMinimumToken();
+        max = partitioner.getMaximumToken();
+        // random tokens, de-duplicated and sorted ascending, so that a larger index means a larger token
+        tokens = IntStream.rangeClosed(0, 10).mapToObj(ignored -> partitioner.getRandomToken())
+                          .distinct().sorted().collect(Collectors.toList());
+    }
+
+    @Test
+    public void testNoOverlapping()
+    {
+        // wrap around
+        AbstractBounds<PartitionPosition> wrapping = wrapAround(token(7), token(2));
+        assertNoOverlapping(seg(token(5), token(6)), wrapping);
+        wrapping = wrapAround(token(7), min);
+        assertNoOverlapping(seg(token(5), token(6)), wrapping);
+
+        // exclusive intersection
+        assertNoOverlapping(seg(min, token(0)), inclusiveRight(token(0), token(1)));
+        assertNoOverlapping(seg(token(1), token(2)), exclusive(token(0), token(1)));
+        assertNoOverlapping(seg(token(3), max), inclusiveLeft(token(0), token(3)));
+
+        // disjoint
+        assertNoOverlapping(seg(min, token(0)), inclusiveRight(token(7), token(9)));
+        assertNoOverlapping(seg(token(2), token(4)), inclusiveRight(token(5), token(6)));
+        assertNoOverlapping(seg(token(3), max), inclusiveLeft(token(0), token(2)));
+    }
+
+    @Test
+    public void testOverlapping()
+    {
+        // wrap around
+        AbstractBounds<PartitionPosition> wrapping = wrapAround(token(7), token(7));
+        assertOverlapping(seg(token(5), token(6)), wrapping);
+        wrapping = wrapAround(token(7), min);
+        assertOverlapping(seg(token(7), token(8)), wrapping);
+        wrapping = wrapAround(token(7), token(5));
+        assertOverlapping(seg(token(1), token(2)), wrapping);
+        wrapping = wrapAround(min, min);
+        assertOverlapping(seg(token(1), token(2)), wrapping);
+
+        // inclusive intersection
+        assertOverlapping(seg(min, token(0)), inclusiveLeft(token(0), token(1)));
+        assertOverlapping(seg(token(1), token(2)), inclusive(token(0), token(1)));
+        assertOverlapping(seg(token(3), max), inclusiveRight(token(0), token(3)));
+
+        // intersect
+        assertOverlapping(seg(min, token(7)), exclusive(token(5), token(9)));
+        assertOverlapping(seg(token(5), token(7)), exclusive(token(5), token(6)));
+
+        // contains
+        assertOverlapping(seg(token(2), token(6)), inclusiveRight(token(4), token(5)));
+        assertOverlapping(seg(token(3), max), inclusiveLeft(token(5), token(8)));
+        assertOverlapping(seg(token(3), token(5)), inclusiveLeft(token(1), token(6)));
+    }
+
+    private static void assertNoOverlapping(Segment segment, AbstractBounds<PartitionPosition> keyRange)
+    {
+        assertFalse("Expect no overlapping", segment.intersects(keyRange));
+    }
+
+    private static void assertOverlapping(Segment segment, AbstractBounds<PartitionPosition> keyRange)
+    {
+        assertTrue("Expect overlapping", segment.intersects(keyRange));
+    }
+
+    /** Builds the left-exclusive, right-inclusive key range and verifies that it is a wrap-around {@link Range}. */
+    private static AbstractBounds<PartitionPosition> wrapAround(Token left, Token right)
+    {
+        AbstractBounds<PartitionPosition> keyRange = inclusiveRight(left, right);
+        assertTrue("Expect wrap around range, but it's not", keyRange instanceof Range && ((Range<?>) keyRange).isWrapAround());
+        return keyRange;
+    }
+
+    /** Shorthand for the token at the given position of the sorted random tokens created in {@link #init()}. */
+    private static Token token(int index)
+    {
+        return tokens.get(index);
+    }
+
+    private static Segment seg(Token left, Token right)
+    {
+        return new Segment(left, right);
+    }
+
+    private static AbstractBounds<PartitionPosition> inclusiveLeft(Token left, Token right)
+    {
+        return keyRange(left, true, right, false);
+    }
+
+    private static AbstractBounds<PartitionPosition> inclusiveRight(Token left, Token right)
+    {
+        return keyRange(left, false, right, true);
+    }
+
+    private static AbstractBounds<PartitionPosition> inclusive(Token left, Token right)
+    {
+        return keyRange(left, true, right, true);
+    }
+
+    private static AbstractBounds<PartitionPosition> exclusive(Token left, Token right)
+    {
+        return keyRange(left, false, right, false);
+    }
+
+    private static AbstractBounds<PartitionPosition> keyRange(Token left, boolean inclusiveLeft, Token right, boolean inclusiveRight)
+    {
+        DecoratedKey start = new BufferDecoratedKey(left, ByteBufferUtil.bytes(0));
+        DecoratedKey end = new BufferDecoratedKey(right, ByteBufferUtil.bytes(0));
+        return Bounds.bounds(start, inclusiveLeft, end, inclusiveRight);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/SelectiveIntersectionTest.java b/test/unit/org/apache/cassandra/index/sai/disk/SelectiveIntersectionTest.java
new file mode 100644
index 000000000000..ae0dda9fcbf3
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/SelectiveIntersectionTest.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.Session;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.v1.PostingsReader;
+import org.apache.cassandra.index.sai.utils.RangeIntersectionIterator;
+import org.apache.cassandra.inject.Injections;
+
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.junit.Assert.assertEquals;
+
+public class SelectiveIntersectionTest extends SAITester
+{
+    private static final Injections.Counter INTERSECTION_FLOW_COUNTER = Injections.newCounter("IntersectionFlowCounter")
+                                                                                  .add(newInvokePoint().onClass("org.apache.cassandra.index.sai.utils.RangeIntersectionIterator$BounceIntersectionIterator").onMethod("<init>"))
+                                                                                  .build();
+
+    private static final Injections.Counter POSTINGS_READER_OPEN_COUNTER = Injections.newCounter("PostingsReaderOpenCounter")
+                                                                                     .add(newInvokePoint().onClass(PostingsReader.class).onMethod("<init>"))
+                                                                                     .build();
+
+    private static final Injections.Counter POSTINGS_READER_CLOSE_COUNTER = Injections.newCounter("PostingsReaderCloseCounter")
+                                                                                      .add(newInvokePoint().onClass(PostingsReader.class).onMethod("close"))
+                                                                                      .build();
+
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+
+        Injections.inject(INTERSECTION_FLOW_COUNTER, POSTINGS_READER_OPEN_COUNTER, POSTINGS_READER_CLOSE_COUNTER);
+
+        setLimits(2);
+
+        createTable("CREATE TABLE %s (pk int primary key, v1 int, v2 text, v3 text)");
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v3"));
+
+        // 100 rows: v1 is unique per row, v2 takes 5 distinct values (20 rows each), v3 takes 10 (10 rows each)
+        for (int i = 0; i < 100; ++i)
+            execute("INSERT INTO %s(pk,v1,v2,v3) VALUES (?, ?, ?, ?)", i, i, Integer.toString(i / 20), Integer.toString(i % 10));
+        flush();
+    }
+
+    @After
+    public void resetCounters()
+    {
+        INTERSECTION_FLOW_COUNTER.reset();
+        POSTINGS_READER_OPEN_COUNTER.reset();
+        POSTINGS_READER_CLOSE_COUNTER.reset();
+    }
+
+    @Test
+    public void queryBelowSelectiveLimitUsesDeferredFlows() throws Throwable
+    {
+        assertEquals(50, execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50").size());
+        Assert.assertEquals(0, INTERSECTION_FLOW_COUNTER.get());
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    @Test
+    public void queryAtSelectiveLimitUsesDeferredFlows() throws Throwable
+    {
+        assertEquals(20, execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1'").size());
+        Assert.assertEquals(1, INTERSECTION_FLOW_COUNTER.get());
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    @Test
+    public void queryAboveSelectiveLimitUsesDirectFlows() throws Throwable
+    {
+        assertEquals(2, execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1' AND v3 = '9'").size());
+        Assert.assertEquals(1, INTERSECTION_FLOW_COUNTER.get());
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    @Test
+    public void selectivityOfOneWillNotIntersect() throws Throwable
+    {
+        setLimits(1);
+
+        assertEquals(2, execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1' AND v3 = '9'").size());
+        Assert.assertEquals(0, INTERSECTION_FLOW_COUNTER.get());
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    @Test
+    public void selectivityLimitOfZeroDisablesSelectivityTest() throws Throwable
+    {
+        setLimits(0);
+
+        assertEquals(2, execute("SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1' AND v3 = '9'").size());
+        Assert.assertEquals(1, INTERSECTION_FLOW_COUNTER.get());
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    @Test
+    public void tracingIsCorrectlyReported() throws Throwable
+    {
+        Session session = sessionNet();
+
+        String trace = getSingleTraceStatement(session, "SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1' AND v3 = '9'", "Selecting");
+
+        assertEquals("Selecting 2 indexes with cardinalities of 10, 20 out of 3 indexes", trace);
+
+        setLimits(1);
+
+        trace = getSingleTraceStatement(session, "SELECT * FROM %s WHERE v1 >= 0 AND v1 < 50 AND v2 = '1' AND v3 = '9'", "Selecting");
+
+        assertEquals("Selecting 1 index with cardinality of 10 out of 3 indexes", trace);
+        // PostingsReader constructions and close() calls must balance
+        Assert.assertEquals(POSTINGS_READER_OPEN_COUNTER.get(), POSTINGS_READER_CLOSE_COUNTER.get());
+    }
+
+    // Reflectively overwrites the static RangeIntersectionIterator.INTERSECTION_CLAUSE_LIMIT after clearing its FINAL flag.
+    // NOTE(review): the Field.modifiers lookup is filtered by core reflection on JDK 12+ and will throw there.
+    private static void setLimits(final int selectivityLimit) throws Exception
+    {
+        Field selectivity = RangeIntersectionIterator.class.getDeclaredField("INTERSECTION_CLAUSE_LIMIT");
+        selectivity.setAccessible(true);
+        Field modifiersField = Field.class.getDeclaredField("modifiers");
+        modifiersField.setAccessible(true);
+        modifiersField.setInt(selectivity, selectivity.getModifiers() & ~Modifier.FINAL);
+        selectivity.set(null, selectivityLimit);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/SingleNodeQueryFailureTest.java b/test/unit/org/apache/cassandra/index/sai/disk/SingleNodeQueryFailureTest.java
new file mode 100644
index 000000000000..253e3ab324e1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/SingleNodeQueryFailureTest.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.ReadFailureException;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.disk.v1.PostingsReader;
+import org.apache.cassandra.index.sai.disk.v1.TermsReader;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.inject.ActionBuilder.newActionBuilder;
+import static org.apache.cassandra.inject.Expression.quote;
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class SingleNodeQueryFailureTest extends SAITester
+{
+    private static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s (id text PRIMARY KEY, v1 int, v2 text) WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }";
+
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+    }
+
+    @After
+    public void teardown()
+    {
+        Injections.deleteAll();
+    }
+
+    @Test
+    public void testFailedRangeIteratorOnMultiIndexesQuery() throws Throwable
+    {
+        testFailedMultiIndexesQuery("range_iterator", PostingListRangeIterator.class, "getNextSegmentRowId");
+    }
+
+    @Test
+    public void testFailedTermsReaderOnMultiIndexesQuery() throws Throwable
+    {
+        testFailedMultiIndexesQuery("terms_reader", TermsReader.TermQuery.class, "lookupTermDictionary");
+    }
+
+    @Test
+    public void testFailedBkdReaderOnMultiIndexesQuery() throws Throwable
+    {
+        testFailedMultiIndexesQuery("bkd_reader", PostingsReader.class, "<init>");
+    }
+
+    @Test
+    public void testFailedKeyFetcherOnMultiIndexesQuery() throws Throwable
+    {
+        testFailedMultiIndexesQuery("key_fetcher", SSTableContext.DecoratedKeyFetcher.class, "apply");
+    }
+
+    @Test
+    public void testFailedKeyReaderOnMultiIndexesQuery() throws Throwable
+    {
+        testFailedMultiIndexesQuery("key_reader", SSTableContext.DecoratedKeyFetcher.class, "createReader");
+    }
+
+    // Makes targetClass#targetMethod throw, expects the two-index queries to fail with ReadFailureException while the
+    // injection is active, and finally verifies that single-index queries return all rows once it is disabled.
+    private void testFailedMultiIndexesQuery(String name, Class<?> targetClass, String targetMethod) throws Throwable
+    {
+        Injection injection = Injections.newCustom(name)
+                                        .add(newInvokePoint().onClass(targetClass).onMethod(targetMethod))
+                                        .add(newActionBuilder().actions().doThrow(RuntimeException.class, quote("Injected failure!")))
+                                        .build();
+
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, v1, v2) VALUES ('1', 0, '0')");
+        flush();
+        execute("INSERT INTO %s (id, v1, v2) VALUES ('2', 1, '1')");
+        flush();
+        execute("INSERT INTO %s (id, v1, v2) VALUES ('3', 2, '2')");
+        flush();
+
+        try
+        {
+            Injections.inject(injection);
+
+            assertThatThrownBy(() -> executeNet("SELECT id FROM %s WHERE v1 < 1 and v2 = '0'"))
+                    .isInstanceOf(ReadFailureException.class);
+
+            assertThatThrownBy(() -> executeNet("SELECT id FROM %s WHERE v1 >= 1 and v2 = '1'"))
+                    .isInstanceOf(ReadFailureException.class);
+
+            assertThatThrownBy(() -> executeNet("SELECT id FROM %s WHERE v1 >= 2 and v2 = '2'"))
+                    .isInstanceOf(ReadFailureException.class);
+        }
+        catch (Exception e)
+        {
+            throw Throwables.unchecked(e);
+        }
+        finally
+        {
+            injection.disable();
+        }
+
+        Assert.assertEquals(3, executeNet("SELECT id FROM %s WHERE v1 >= 0").all().size());
+        Assert.assertEquals(1, executeNet("SELECT id FROM %s WHERE v2 = '0'").all().size());
+        Assert.assertEquals(1, executeNet("SELECT id FROM %s WHERE v2 = '1'").all().size());
+        Assert.assertEquals(1, executeNet("SELECT id FROM %s WHERE v2 = '2'").all().size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/TermsIteratorMergerTest.java b/test/unit/org/apache/cassandra/index/sai/disk/TermsIteratorMergerTest.java
new file mode 100644
index 000000000000..387e193d1620
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/TermsIteratorMergerTest.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertTrue;
+
+public class TermsIteratorMergerTest extends SAITester
+{
+    @Test
+    public void testMerger() throws Exception
+    {
+        final TermsIterator[] iterators = new TermsIterator[getRandom().nextIntBetween(2, 50)];
+        // reference data: every random term maps to its sorted postings; each loop bound is drawn exactly once
+        TreeMap<String, List<Long>> expected = new TreeMap<>();
+        for (int termCount = 0, terms = getRandom().nextIntBetween(10, 50); termCount < terms; termCount++)
+        {
+            List<Long> postings = new ArrayList<>();
+            for (int postingsCount = 0, size = getRandom().nextIntBetween(200, 500); postingsCount < size; postingsCount++)
+                postings.add(Long.valueOf(getRandom().nextIntBetween(0, 10000)));
+            postings.sort(Long::compareTo);
+            expected.put(getRandom().nextAsciiString(5, 20), postings);
+        }
+        // each iterator serves randomly picked terms, each with a distinct, sorted selection of that term's postings
+        String[] expectedTerms = expected.keySet().toArray(new String[0]);
+        for (int termIteratorCount = 0; termIteratorCount < iterators.length; termIteratorCount++)
+        {
+            TreeMap<ByteBuffer, Long[]> termMap = new TreeMap<>();
+            for (int termCount = 0, terms = getRandom().nextIntBetween(2, expected.size() - 1); termCount < terms; termCount++)
+            {
+                String term = expectedTerms[getRandom().nextIntBetween(0, expected.size() - 1)];
+                Long[] expectedPostings = expected.get(term).toArray(new Long[0]);
+                List<Long> postings = new ArrayList<>();
+                for (int postingsCount = 0, size = getRandom().nextIntBetween(50, 150); postingsCount < size; postingsCount++)
+                {
+                    long posting;
+                    do
+                    {
+                        posting = expectedPostings[getRandom().nextIntBetween(0, expectedPostings.length - 1)];
+                    }
+                    while (postings.contains(posting));
+                    postings.add(posting);
+                }
+                postings.sort(Long::compareTo);
+                termMap.put(UTF8Type.instance.decompose(term), postings.toArray(new Long[0]));
+            }
+            // build the iterator only once the map is complete: its entry iterator is created in the constructor
+            iterators[termIteratorCount] = new TermsIteratorImpl(termMap);
+        }
+
+        TermsIteratorMerger merger = new TermsIteratorMerger(iterators, UTF8Type.instance);
+        while (merger.hasNext())
+        {
+            String term = UTF8Type.instance.compose(UTF8Type.instance.fromComparableBytes(merger.next().asPeekableBytes(ByteComparable.Version.OSS41), ByteComparable.Version.OSS41));
+            assertTrue(expected.containsKey(term));
+            List<Long> expectedPostings = expected.get(term);
+            PostingList postings = merger.postings();
+            long lastRowId = Long.MIN_VALUE;
+            while (true)
+            {
+                long rowId = postings.nextPosting();
+                if (rowId == PostingList.END_OF_STREAM) break;
+                assertTrue(rowId > lastRowId);
+                lastRowId = rowId;
+                assertTrue(expectedPostings.contains(rowId));
+            }
+        }
+    }
+
+    public static class TermsIteratorImpl implements TermsIterator
+    {
+        final TreeMap<ByteBuffer, Long[]> map;
+        final Iterator<Map.Entry<ByteBuffer, Long[]>> iterator;
+        Map.Entry<ByteBuffer, Long[]> current;
+
+        public TermsIteratorImpl(TreeMap<ByteBuffer, Long[]> map)
+        {
+            this.map = map;
+            iterator = map.entrySet().iterator();
+        }
+
+        @Override
+        public PostingList postings() throws IOException
+        {
+            return new PostingList()
+            {
+                int index = 0;
+                Long[] postings = current.getValue();
+
+                @Override
+                public long nextPosting() throws IOException
+                {
+                    if (index == postings.length)
+                        return END_OF_STREAM;
+                    return postings[index++];
+                }
+
+                @Override
+                public long size()
+                {
+                    return postings.length;
+                }
+
+                @Override
+                public long advance(long targetRowID) throws IOException
+                {
+                    throw new UnsupportedOperationException();
+                }
+            };
+        }
+
+        @Override
+        public ByteBuffer getMinTerm()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ByteBuffer getMaxTerm()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public void close() throws IOException {}
+
+        @Override
+        public boolean hasNext()
+        {
+            return iterator.hasNext();
+        }
+
+        @Override
+        public ByteComparable next()
+        {
+            current = iterator.next();
+            // capture the key eagerly: a lambda reading 'current' would observe later calls to next()
+            final ByteBuffer term = current.getKey();
+            return version -> UTF8Type.instance.asComparableBytes(term, version);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java b/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
new file mode 100644
index 000000000000..8dce481a1ea9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk;
+
+import java.math.BigInteger;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.function.BiConsumer;
+import java.util.function.BiFunction;
+
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.ReversedType;
+import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+public class TypeUtilTest extends NdiRandomizedTest
+{
+    @Test
+    public void testSimpleType()
+    {
+        for (CQL3Type supported : StorageAttachedIndex.SUPPORTED_TYPES)
+        {
+            AbstractType<?> forward = supported.getType();
+            AbstractType<?> reversed = ReversedType.getInstance(forward);
+            // only the string types are expected to be literals, with or without the ReversedType wrapper
+            boolean expectLiteral = supported == CQL3Type.Native.ASCII || supported == CQL3Type.Native.TEXT || supported == CQL3Type.Native.VARCHAR;
+            assertEquals(expectLiteral, TypeUtil.isLiteral(forward));
+            assertEquals(TypeUtil.isLiteral(forward), TypeUtil.isLiteral(reversed));
+            assertEquals(expectLiteral, TypeUtil.isUTF8OrAscii(forward));
+            assertEquals(TypeUtil.isUTF8OrAscii(forward), TypeUtil.isUTF8OrAscii(reversed));
+            assertEquals(TypeUtil.isIn(forward, AbstractAnalyzer.ANALYZABLE_TYPES),
+                         TypeUtil.isIn(reversed, AbstractAnalyzer.ANALYZABLE_TYPES));
+        }
+    }
+
+    @Test
+    public void testMapType()
+    {
+        for (CQL3Type supportedKey : StorageAttachedIndex.SUPPORTED_TYPES)
+        {
+            AbstractType<?> keyType = supportedKey.getType();
+            // a multi-cell map must expose its key, value or composite entry type depending on the index target
+            testCollectionType((valueType, multiCell) -> MapType.getInstance(keyType, valueType, multiCell),
+                               (valueType, map) -> {
+                assertEquals(keyType, cellValueType(map, IndexTarget.Type.KEYS));
+                assertEquals(valueType, cellValueType(map, IndexTarget.Type.VALUES));
+                AbstractType<?> entries = cellValueType(map, IndexTarget.Type.KEYS_AND_VALUES);
+                assertEquals(CompositeType.getInstance(keyType, valueType), entries);
+                assertTrue(TypeUtil.isLiteral(entries));
+            });
+        }
+    }
+
+    @Test
+    public void testSetType()
+    {
+        testCollectionType(SetType::getInstance, (elementType, set) -> { /* nothing set-specific to verify */ });
+    }
+
+    @Test
+    public void testListType()
+    {
+        testCollectionType(ListType::getInstance, (elementType, list) -> { /* nothing list-specific to verify */ });
+    }
+
+    private static void testCollectionType(BiFunction<AbstractType<?>, Boolean, AbstractType<?>> init,
+                                           BiConsumer<AbstractType<?>, AbstractType<?>> nonFrozenCollectionTester)
+    {
+        for (CQL3Type supported : StorageAttachedIndex.SUPPORTED_TYPES)
+        {
+            AbstractType<?> element = supported.getType();
+            AbstractType<?> frozen = init.apply(element, false);
+            AbstractType<?> reversedFrozen = ReversedType.getInstance(frozen);
+            // a FULL target on a frozen collection must yield a frozen-collection literal that keeps the reversed flag
+            AbstractType<?> indexed = cellValueType(frozen, IndexTarget.Type.FULL);
+            assertTrue(TypeUtil.isFrozenCollection(indexed));
+            assertTrue(TypeUtil.isLiteral(indexed));
+            assertFalse(indexed.isReversed());
+            indexed = cellValueType(reversedFrozen, IndexTarget.Type.FULL);
+            assertTrue(TypeUtil.isFrozenCollection(indexed));
+            assertTrue(TypeUtil.isLiteral(indexed));
+            assertTrue(indexed.isReversed());
+            // a multi-cell collection with a VALUES target must yield the element type
+            AbstractType<?> multiCell = init.apply(element, true);
+            assertEquals(element, cellValueType(multiCell, IndexTarget.Type.VALUES));
+            nonFrozenCollectionTester.accept(element, multiCell);
+        }
+    }
+
+    private static AbstractType<?> cellValueType(AbstractType<?> type, IndexTarget.Type indexType)
+    {
+        return TypeUtil.cellValueType(Pair.create(column(type), indexType));
+    }
+
+    private static Pair<ColumnMetadata, IndexTarget.Type> target(AbstractType<?> type, IndexTarget.Type indexType)
+    {
+        return Pair.create(ColumnMetadata.regularColumn("ks", "cf", "col", type), indexType);
+    }
+
+    private static ColumnMetadata column(AbstractType<?> type)
+    {
+        return ColumnMetadata.regularColumn("ks", "cf", "col", type); // fixed literal names: only the type varies between callers
+    }
+
+    @Test
+    public void shouldCompareByteBuffers()
+    {
+        final ByteBuffer one = Int32Type.instance.decompose(1);
+        final ByteBuffer two = Int32Type.instance.decompose(2);
+        // min picks the smaller value; a null operand yields the other operand
+        assertEquals(one, TypeUtil.min(one, two, Int32Type.instance));
+        assertEquals(one, TypeUtil.min(two, one, Int32Type.instance));
+        assertEquals(one, TypeUtil.min(one, one, Int32Type.instance));
+        assertEquals(two, TypeUtil.min(two, two, Int32Type.instance));
+        assertEquals(two, TypeUtil.min(null, two, Int32Type.instance));
+        assertEquals(one, TypeUtil.min(one, null, Int32Type.instance));
+        // max picks the larger value; a null operand yields the other operand
+        assertEquals(two, TypeUtil.max(two, one, Int32Type.instance));
+        assertEquals(two, TypeUtil.max(one, two, Int32Type.instance));
+        assertEquals(one, TypeUtil.max(one, one, Int32Type.instance));
+        assertEquals(two, TypeUtil.max(two, two, Int32Type.instance));
+        assertEquals(two, TypeUtil.max(null, two, Int32Type.instance));
+        assertEquals(one, TypeUtil.max(one, null, Int32Type.instance));
+    }
+
+    @Test
+    public void testBigIntegerEncoding()
+    {
+        // fixed seed keeps the generated values reproducible
+        Random random = new Random(-9078270684023566599L);
+
+        BigInteger[] values = new BigInteger[10000];
+        for (int idx = 0; idx < values.length; idx++)
+        {
+            // up to 999 random bits, with the sign decided by a coin flip
+            BigInteger magnitude = new BigInteger(random.nextInt(1000), random);
+            values[idx] = random.nextBoolean() ? magnitude.negate() : magnitude;
+        }
+
+        Arrays.sort(values);
+
+        // encoded neighbours must compare in the same order as the sorted BigIntegers
+        for (int idx = 1; idx < values.length; idx++)
+        {
+            BigInteger previous = values[idx - 1];
+            BigInteger current = values[idx];
+            assertTrue("#" + idx, previous.compareTo(current) <= 0);
+
+            ByteBuffer encodedPrevious = TypeUtil.encode(ByteBuffer.wrap(previous.toByteArray()), IntegerType.instance);
+            ByteBuffer encodedCurrent = TypeUtil.encode(ByteBuffer.wrap(current.toByteArray()), IntegerType.instance);
+            assertTrue("#" + idx, TypeUtil.compare(encodedPrevious, encodedCurrent, IntegerType.instance) <= 0);
+        }
+    }
+
+    @Test
+    public void testMapEntryEncoding()
+    {
+        Random rng = new Random(-9078270684023566599L);
+        CompositeType type = CompositeType.getInstance(UTF8Type.instance, Int32Type.instance);
+
+        // simulate: index memtable insertion
+        String[] data = new String[10000];
+        byte[] temp = new byte[100];
+        for (int i = 0; i < data.length; i++)
+        {
+            rng.nextBytes(temp);
+            // explicit charset: new String(byte[]) would decode with the platform default and vary per environment
+            String v1 = new String(temp, java.nio.charset.StandardCharsets.UTF_8);
+            int v2 = rng.nextInt();
+            data[i] = TypeUtil.getString(type.decompose(v1, v2), type);
+        }
+
+        Arrays.sort(data, String::compareTo);
+
+        for (int i = 1; i < data.length; i++)
+        {
+            // simulate: index memtable flush
+            ByteBuffer b0 = TypeUtil.fromString(data[i - 1], type);
+            ByteBuffer b1 = TypeUtil.fromString(data[i], type);
+            assertTrue("#" + i, TypeUtil.compare(b0, b1, type) <= 0);
+
+            // simulate: saving into on-disk trie
+            ByteComparable t0 = ByteComparable.fixedLength(b0);
+            ByteComparable t1 = ByteComparable.fixedLength(b1);
+            assertTrue("#" + i, ByteComparable.compare(t0, t1, ByteComparable.Version.OSS41) <= 0);
+        }
+    }
+
+
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/format/VersionTest.java b/test/unit/org/apache/cassandra/index/sai/disk/format/VersionTest.java
new file mode 100644
index 000000000000..01595e847528
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/format/VersionTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.format;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Unit tests for {@link Version}: ordering through {@code onOrAfter}, formatting and parsing.
+ */
+public class VersionTest
+{
+    @Rule
+    public ExpectedException expectedException = ExpectedException.none();
+
+    /** Checks {@code onOrAfter} for all sixteen ordered pairs of aa, ab, ba and bb. */
+    @Test
+    public void shouldCompareVersions()
+    {
+        // ascending order: versions[i].onOrAfter(versions[j]) is expected to hold exactly when i >= j
+        final Version[] versions = {
+            new Version('a', 'a'),
+            new Version('a', 'b'),
+            new Version('b', 'a'),
+            new Version('b', 'b')
+        };
+
+        // same evaluation order as spelling the pairs out: newest receiver first, oldest argument first
+        for (int i = versions.length - 1; i >= 0; i--)
+        {
+            for (int j = 0; j < versions.length; j++)
+            {
+                assertEquals("versions[" + i + "].onOrAfter(versions[" + j + "])", i >= j, versions[i].onOrAfter(versions[j]));
+            }
+        }
+    }
+
+    /** The string form is the two version characters, in constructor order. */
+    @Test
+    public void shouldFormatVersion()
+    {
+        assertEquals("ac", new Version('a', 'c').toString());
+        assertEquals("ce", new Version('c', 'e').toString());
+    }
+
+    /** Parsing a two-character string round-trips through {@code toString}. */
+    @Test
+    public void shouldParseVersion()
+    {
+        assertEquals("ac", Version.parse("ac").toString());
+        assertEquals("ce", Version.parse("ce").toString());
+    }
+
+    /** A one-character string is rejected with {@link IllegalArgumentException}. */
+    @Test
+    public void shouldNotParseTooShortVersion()
+    {
+        expectedException.expect(IllegalArgumentException.class);
+        Version.parse("a");
+    }
+
+    /** A three-character string is rejected with {@link IllegalArgumentException}. */
+    @Test
+    public void shouldNotParseTooLongVersion()
+    {
+        expectedException.expect(IllegalArgumentException.class);
+        Version.parse("aaa");
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectory.java b/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectory.java
new file mode 100644
index 000000000000..3b1dac431da8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectory.java
@@ -0,0 +1,152 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Lock;
+
+import static org.apache.cassandra.index.sai.disk.io.IndexComponents.PER_COLUMN_FILE_NAME_FORMAT;
+
+/**
+ * Minimal Lucene {@link Directory} for writing the KD-Tree: only temp-file creation, reading, sizing and deletion are supported.
+ */
+public class BKDTempFilesDirectory extends Directory
+{
+    private static final Logger logger = LoggerFactory.getLogger(BKDTempFilesDirectory.class);
+
+    private final AtomicLong tempFileSequence;
+    private final IndexComponents components;
+
+    public BKDTempFilesDirectory(IndexComponents delegate, long seed)
+    {
+        // SequentialWriter#openChannel doesn't fail when asked to create a file that already exists,
+        // so tests running concurrently could collide on a tmp file name without noticing.
+        // Starting each directory's counter from its own seed keeps the generated names apart.
+        this.tempFileSequence = new AtomicLong(seed);
+        this.components = delegate;
+    }
+
+    @Override
+    public IndexOutput createTempOutput(String prefix, String suffix, IOContext context)
+    {
+        // <prefix>_<counter in radix 36>_<suffix>, wrapped into a per-column CUSTOM component of this index
+        final String sequence = Long.toString(tempFileSequence.getAndIncrement(), Character.MAX_RADIX);
+        final String tempName = prefix + "_" + sequence + "_" + suffix;
+        final String componentName = String.format(PER_COLUMN_FILE_NAME_FORMAT, components.indexName, tempName);
+        final Component component = new Component(Component.Type.CUSTOM, componentName);
+        return components.createOutput(components.descriptor.tmpFileFor(component));
+    }
+
+    @Override
+    public IndexInput openInput(String name, IOContext context)
+    {
+        final String path = getTmpFileByName(name).getPath();
+
+        // the builder is closed on leaving the try block, while closing the handle is delegated
+        // to the handle::close callback given to the returned input
+        try (FileHandle.Builder handleBuilder = new FileHandle.Builder(path))
+        {
+            final FileHandle fileHandle = handleBuilder.complete();
+            return IndexInputReader.create(fileHandle.createReader(), fileHandle::close);
+        }
+    }
+
+    @Override
+    public long fileLength(String name)
+    {
+        return getTmpFileByName(name).length();
+    }
+
+    @Override
+    public void deleteFile(String name)
+    {
+        final File tmpFile = getTmpFileByName(name);
+        // a failed deletion is only logged, never propagated to the caller
+        if (tmpFile.delete())
+            return;
+        logger.warn(components.logMessage("Unable to delete file {}"), tmpFile.getAbsolutePath());
+    }
+
+    @Override
+    public void close()
+    {
+        // intentionally empty
+    }
+
+    @Override
+    public void syncMetaData()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public String[] listAll()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void rename(String source, String dest)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public IndexOutput createOutput(String name, IOContext context)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void sync(Collection<String> collection)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public Lock obtainLock(String s)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    private File getTmpFileByName(String name)
+    {
+        assert name.endsWith(Descriptor.TMP_EXT);
+        // the name is used as a path as-is; it is not resolved against the descriptor's directory
+        final File candidate = new File(name);
+        if (!candidate.exists())
+        {
+            throw new IllegalStateException(components.logMessage("unrecognised file: " + name));
+        }
+        return candidate;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectoryTest.java b/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectoryTest.java
new file mode 100644
index 000000000000..73d63afe8534
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/io/BKDTempFilesDirectoryTest.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FilterDirectory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.bkd.BKDReader;
+import org.apache.lucene.util.bkd.BKDWriter;
+
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.is;
+
+public class BKDTempFilesDirectoryTest extends NdiRandomizedTest
+{
+    @Test
+    public void shouldSortPointsOnDisk() throws IOException
+    {
+        final int pointCount = between(300_000, 500_000);
+        final IndexComponents components = newIndexComponents();
+        final BKDTempFilesDirectory tempDirectory = new BKDTempFilesDirectory(components, randomLong());
+        final TempFileTrackingDirectoryWrapper trackingDirectory = new TempFileTrackingDirectoryWrapper(tempDirectory);
+
+        try (final BKDWriter writer = new BKDWriter(pointCount,
+                                                    trackingDirectory,
+                                                    "tmp",
+                                                    1,
+                                                    4,
+                                                    BKDWriter.DEFAULT_MAX_POINTS_IN_LEAF_NODE,
+                                                    // low threshold, presumably so that sorting spills to temp files
+                                                    1.0,
+                                                    pointCount,
+                                                    true))
+        {
+            // every point is a 4-byte sortable int whose value equals its row id
+            final byte[] packedValue = new byte[4];
+            for (int rowId = 0; rowId < pointCount; rowId++)
+            {
+                NumericUtils.intToSortableBytes(rowId, packedValue, 0);
+                writer.add(packedValue, rowId);
+            }
+
+            long treeFilePointer;
+            try (IndexOutput treeOutput = components.createOutput(components.kdTree))
+            {
+                treeFilePointer = writer.finish(treeOutput);
+            }
+
+            // at least one temp file must have been requested from the tracked directory
+            assertThat(trackingDirectory.createdTempFiles.size(), is(greaterThan(0)));
+
+            try (final IndexInput treeInput = components.openBlockingInput(components.kdTree))
+            {
+                treeInput.seek(treeFilePointer);
+                assertEquals(pointCount, new BKDReader(treeInput).getDocCount());
+            }
+        }
+    }
+
+    private static class TempFileTrackingDirectoryWrapper extends FilterDirectory
+    {
+        private final Set<String> createdTempFiles = new HashSet<>();
+
+        TempFileTrackingDirectoryWrapper(Directory in)
+        {
+            super(in);
+        }
+
+        @Override
+        public IndexOutput createTempOutput(String prefix, String suffix, IOContext context) throws IOException
+        {
+            final IndexOutput tempOutput = super.createTempOutput(prefix, suffix, context);
+            createdTempFiles.add(tempOutput.getName());
+            return tempOutput;
+        }
+
+        @Override
+        public void deleteFile(String name) throws IOException
+        {
+            // only names recorded by createTempOutput may be deleted through this wrapper
+            assertTrue(createdTempFiles.contains(name));
+            super.deleteFile(name);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/io/BytesRefUtilTest.java b/test/unit/org/apache/cassandra/index/sai/disk/io/BytesRefUtilTest.java
new file mode 100644
index 000000000000..08b058e64393
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/io/BytesRefUtilTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.junit.Test;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+
+public class BytesRefUtilTest
+{
+    @Test
+    public void shouldCopyBufferToBytesRef()
+    {
+        // the payload is random, so the seed is reported on failure to make the run reproducible
+        final long seed = System.nanoTime();
+        final byte[] expectedBytes = new byte[21];
+        new Random(seed).nextBytes(expectedBytes);
+        final BytesRefBuilder refBuilder = new BytesRefBuilder();
+        BytesRefUtil.copyBufferToBytesRef(ByteBuffer.wrap(expectedBytes), refBuilder);
+        final BytesRef actualBytesRef = refBuilder.get();
+
+        assertEquals("seed=" + seed, expectedBytes.length, actualBytesRef.length);
+        final byte[] actualBytes = new byte[actualBytesRef.length];
+        System.arraycopy(actualBytesRef.bytes, actualBytesRef.offset, actualBytes, 0, actualBytesRef.length);
+        assertArrayEquals("seed=" + seed, expectedBytes, actualBytes);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/io/TrackingIndexComponents.java b/test/unit/org/apache/cassandra/index/sai/disk/io/TrackingIndexComponents.java
new file mode 100644
index 000000000000..dc3f45578f32
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/io/TrackingIndexComponents.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.io;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.base.Throwables;
+
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.lucene.store.IndexInput;
+
+import static org.junit.Assert.assertNotNull;
+
+public class TrackingIndexComponents extends IndexComponents
+{
+    private final Map<TrackingIndexInput, String> openInputs = Collections.synchronizedMap(new HashMap<>()); // still-open input -> creation stack trace
+
+    public TrackingIndexComponents(String indexName, Descriptor descriptor, SequentialWriterOption sequentialWriterOption, final CompressionParams compressionParams)
+    {
+        super(indexName, descriptor, sequentialWriterOption, compressionParams);
+    }
+
+    @Override
+    public IndexInput openBlockingInput(IndexComponent component)
+    {
+        TrackingIndexInput input = new TrackingIndexInput(super.openBlockingInput(component));
+        openInputs.put(input, Throwables.getStackTraceAsString(new RuntimeException("Blocking input created")));
+        return input;
+    }
+
+    @Override
+    public IndexInput openInput(FileHandle handle)
+    {
+        TrackingIndexInput input = new TrackingIndexInput(super.openInput(handle));
+        openInputs.put(input, Throwables.getStackTraceAsString(new RuntimeException("Input created")));
+        return input;
+    }
+
+    public Map<IndexInput, String> getOpenInputs()
+    {
+        // the copy iterates the map, and iterating a Collections.synchronizedMap requires holding its monitor
+        synchronized (openInputs) { return new HashMap<>(openInputs); }
+    }
+
+    public class TrackingIndexInput extends FilterIndexInput
+    {
+        TrackingIndexInput(IndexInput delegate)
+        {
+            super(delegate);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            super.close();
+            final String creationStackTrace = openInputs.remove(this);
+            assertNotNull("Closed unregistered input: " + this, creationStackTrace);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDReaderTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDReaderTest.java
new file mode 100644
index 000000000000..64d4b18b8b28
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDReaderTest.java
@@ -0,0 +1,395 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.MergeOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.MutableOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.lucene.index.PointValues.Relation;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NumericUtils;
+import org.assertj.core.util.Lists;
+
+import static org.apache.cassandra.index.sai.metrics.QueryEventListeners.NO_OP_BKD_LISTENER;
+import static org.apache.lucene.index.PointValues.Relation.CELL_CROSSES_QUERY;
+import static org.apache.lucene.index.PointValues.Relation.CELL_INSIDE_QUERY;
+import static org.apache.lucene.index.PointValues.Relation.CELL_OUTSIDE_QUERY;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.instanceOf;
+import static org.hamcrest.Matchers.is;
+
+public class BKDReaderTest extends NdiRandomizedTest
+{
+    private final BKDReader.IntersectVisitor NONE_MATCH = new BKDReader.IntersectVisitor()
+    {
+        @Override
+        public Relation compare(byte[] lowerBound, byte[] upperBound)
+        {
+            return CELL_OUTSIDE_QUERY; // every cell is reported as outside the query
+        }
+
+        @Override
+        public boolean visit(byte[] value)
+        {
+            return false; // and no individual value is accepted
+        }
+    };
+
+    private final BKDReader.IntersectVisitor ALL_MATCH = new BKDReader.IntersectVisitor()
+    {
+        @Override
+        public Relation compare(byte[] lowerBound, byte[] upperBound)
+        {
+            return CELL_INSIDE_QUERY; // every cell is reported as fully inside the query
+        }
+
+        @Override
+        public boolean visit(byte[] value)
+        {
+            return true; // and every individual value is accepted
+        }
+    };
+
+    private final BKDReader.IntersectVisitor ALL_MATCH_WITH_FILTERING = new BKDReader.IntersectVisitor()
+    {
+        @Override
+        public Relation compare(byte[] lowerBound, byte[] upperBound)
+        {
+            return CELL_CROSSES_QUERY; // every cell is reported as crossing the query boundary
+        }
+
+        @Override
+        public boolean visit(byte[] value)
+        {
+            return true; // while every individual value is still accepted
+        }
+    };
+
+    @Test
+    public void testInts1D() throws IOException
+    {
+        doTestInts1D(); // all assertions live in the helper
+    }
+
+    @Test
+    public void testMerge() throws Exception
+    {
+        // Start by testing that the iteratorState returns rowIds in order
+        BKDReader reader1 = createReader(10);
+        BKDReader.IteratorState it1 = reader1.iteratorState();
+        Long expectedRowId = 0L;
+        while (it1.hasNext())
+        {
+            assertEquals(expectedRowId++, it1.next());
+        }
+        it1.close();
+        // the loops in this test would pass vacuously on truncated output, so every count is pinned explicitly
+        assertEquals(Long.valueOf(10), expectedRowId);
+
+        // Next test that an intersection only returns the query values
+        List<Long> expected = Lists.list(8L, 9L);
+        int expectedCount = 0;
+        PostingList intersection = reader1.intersect(buildQuery(8, 9), NO_OP_BKD_LISTENER, new QueryContext());
+        for (Long id = intersection.nextPosting(); id != PostingList.END_OF_STREAM; id = intersection.nextPosting())
+        {
+            assertEquals(expected.get(expectedCount++), id);
+        }
+        assertEquals(expected.size(), expectedCount);
+        intersection.close();
+        reader1.close();
+
+        // Finally test that merger returns the correct values
+        expected = Lists.list(8L, 9L, 18L, 19L);
+        expectedCount = 0;
+
+        reader1 = createReader(10);
+        BKDReader reader2 = createReader(10);
+
+        final IndexComponents indexComponents = newIndexComponents();
+
+        // reader2's row ids are shifted by 10, so the merged tree sees them as rows 10..19
+        List<BKDReader.IteratorState> iterators = ImmutableList.of(reader1.iteratorState(), reader2.iteratorState((rowID) -> rowID + 10));
+        MergeOneDimPointValues merger = new MergeOneDimPointValues(iterators, Int32Type.instance);
+        final BKDReader reader = finishAndOpenReaderOneDim(2, merger, 20, indexComponents);
+
+        intersection = reader.intersect(buildQuery(8, 9), NO_OP_BKD_LISTENER, new QueryContext());
+        for (Long id = intersection.nextPosting(); id != PostingList.END_OF_STREAM; id = intersection.nextPosting())
+        {
+            assertEquals(expected.get(expectedCount++), id);
+        }
+        assertEquals(expected.size(), expectedCount);
+        intersection.close();
+
+        for (BKDReader.IteratorState iterator : iterators)
+        {
+            iterator.close();
+        }
+
+        reader1.close();
+        reader2.close();
+        reader.close();
+    }
+
+    private BKDReader createReader(int numRows) throws IOException
+    {
+        final IndexComponents components = newIndexComponents();
+        final BKDTreeRamBuffer ramBuffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        final byte[] packedValue = new byte[Integer.BYTES];
+        for (int rowId = 0; rowId < numRows; rowId++) // each row stores its own id as the indexed value
+        {
+            NumericUtils.intToSortableBytes(rowId, packedValue, 0);
+            ramBuffer.addPackedValue(rowId, new BytesRef(packedValue));
+        }
+        return finishAndOpenReaderOneDim(2, ramBuffer, components);
+    }
+
+    private void doTestInts1D() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final int numRows = between(100, 400);
+        final BKDTreeRamBuffer buffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        // the value indexed for each row is the row id itself
+        byte[] scratch = new byte[4];
+        for (int docID = 0; docID < numRows; docID++)
+        {
+            NumericUtils.intToSortableBytes(docID, scratch, 0);
+            buffer.addPackedValue(docID, new BytesRef(scratch));
+        }
+
+        final BKDReader reader = finishAndOpenReaderOneDim(2, buffer, indexComponents);
+
+        try (BKDReader.IteratorState iterator = reader.iteratorState())
+        {
+            while (iterator.hasNext())
+            {
+                // decode the current term without printing it; the value itself is not checked here
+                NumericUtils.sortableBytesToInt(iterator.scratch, 0);
+                iterator.next();
+            }
+        }
+
+        try (PostingList intersection = reader.intersect(NONE_MATCH, NO_OP_BKD_LISTENER, new QueryContext()))
+        {
+            assertNull(intersection);
+        }
+
+        try (PostingList collectAllIntersection = reader.intersect(ALL_MATCH, NO_OP_BKD_LISTENER, new QueryContext());
+             PostingList filteringIntersection = reader.intersect(ALL_MATCH_WITH_FILTERING, NO_OP_BKD_LISTENER, new QueryContext()))
+        {
+            assertEquals(numRows, collectAllIntersection.size());
+            assertEquals(numRows, filteringIntersection.size());
+
+            for (int docID = 0; docID < numRows; docID++)
+            {
+                assertEquals(docID, collectAllIntersection.nextPosting());
+                assertEquals(docID, filteringIntersection.nextPosting());
+            }
+
+            assertEquals(PostingList.END_OF_STREAM, collectAllIntersection.nextPosting());
+            assertEquals(PostingList.END_OF_STREAM, filteringIntersection.nextPosting());
+        }
+
+        // Simple 1D range query:
+        final int queryMin = 42;
+        final int queryMax = 87;
+
+        // try-with-resources so the posting list is released even when an assertion below fails
+        try (PostingList intersection = reader.intersect(buildQuery(queryMin, queryMax), NO_OP_BKD_LISTENER, new QueryContext()))
+        {
+            assertThat(intersection, is(instanceOf(MergePostingList.class)));
+            long expectedRowID = queryMin;
+            for (long id = intersection.nextPosting(); id != PostingList.END_OF_STREAM; id = intersection.nextPosting())
+            {
+                assertEquals(expectedRowID++, id);
+            }
+            assertEquals(queryMax - queryMin + 1, intersection.size());
+        }
+        reader.close();
+    }
+
+    @Test
+    public void testAdvance() throws IOException
+    {
+        doTestAdvance(false); // runs the shared advance scenario with the crypto flag off
+    }
+
+    @Test
+    public void testAdvanceCrypto() throws IOException
+    {
+        doTestAdvance(true); // NOTE(review): doTestAdvance never reads this flag, so this mirrors testAdvance - confirm intent
+    }
+
+    private void doTestAdvance(boolean crypto) throws IOException
+    {
+        // Index 0..numRows-1 so each value equals its row ID; the crypto flag is not read here.
+        final IndexComponents indexComponents = newIndexComponents();
+        final int numRows = between(1000, 2000);
+        final BKDTreeRamBuffer buffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        byte[] scratch = new byte[4];
+        for (int docID = 0; docID < numRows; docID++)
+        {
+            NumericUtils.intToSortableBytes(docID, scratch, 0);
+            buffer.addPackedValue(docID, new BytesRef(scratch));
+        }
+
+        final BKDReader reader = finishAndOpenReaderOneDim(2, buffer, indexComponents);
+        try
+        {
+            assertNull(reader.intersect(NONE_MATCH, NO_OP_BKD_LISTENER, new QueryContext()));
+
+            try (PostingList intersection = reader.intersect(ALL_MATCH, NO_OP_BKD_LISTENER, new QueryContext()))
+            {
+                assertEquals(numRows, intersection.size());
+                for (int target = 100; target <= 400; target += 100)
+                    assertEquals(target, intersection.advance(target));
+                assertEquals(401, intersection.advance(401));
+                // nextPosting() must resume right after the last advance target.
+                for (long expectedRowID = 402; expectedRowID < 500; expectedRowID++)
+                    assertEquals(expectedRowID, intersection.nextPosting());
+                assertEquals(PostingList.END_OF_STREAM, intersection.advance(numRows + 1));
+            }
+        }
+        finally
+        {
+            reader.close(); // release the reader even when an assertion above fails
+        }
+    }
+
+    @Test
+    public void testResourcesReleaseWhenQueryDoesntMatchAnything() throws Exception
+    {
+        final IndexComponents components = newIndexComponents();
+        final BKDTreeRamBuffer ramBuffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        final byte[] packed = new byte[Integer.BYTES];
+
+        // Rows 0..999 store their own row ID; rows 1000..1999 store row ID + 100,
+        // so no indexed value falls in the range [1000, 1099].
+        for (int rowId = 0; rowId < 2000; rowId++)
+        {
+            final int value = rowId < 1000 ? rowId : rowId + 100;
+            NumericUtils.intToSortableBytes(value, packed, 0);
+            ramBuffer.addPackedValue(rowId, new BytesRef(packed));
+        }
+
+        final BKDReader reader = finishAndOpenReaderOneDim(50, ramBuffer, components);
+
+        // The queried range lies entirely inside that gap, so nothing can match.
+        final BKDReader.IntersectVisitor gapQuery = buildQuery(1017, 1096);
+        final PostingList matches = reader.intersect(gapQuery, NO_OP_BKD_LISTENER, new QueryContext());
+        assertNull(matches);
+    }
+
+    private BKDReader.IntersectVisitor buildQuery(int queryMin, int queryMax)
+    {
+        // Visitor for the closed range [queryMin, queryMax] over sortable-encoded ints.
+        return new BKDReader.IntersectVisitor()
+        {
+            @Override
+            public boolean visit(byte[] packedValue)
+            {
+                // Accept a value only when it lies inside the range; no console output here.
+                final int value = NumericUtils.sortableBytesToInt(packedValue, 0);
+                return value >= queryMin && value <= queryMax;
+            }
+
+            @Override
+            public Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+            {
+                final int min = NumericUtils.sortableBytesToInt(minPackedValue, 0);
+                final int max = NumericUtils.sortableBytesToInt(maxPackedValue, 0);
+                assert max >= min;
+
+                if (max < queryMin || min > queryMax)
+                {
+                    return Relation.CELL_OUTSIDE_QUERY;
+                }
+                else if (min >= queryMin && max <= queryMax)
+                {
+                    return Relation.CELL_INSIDE_QUERY;
+                }
+                else
+                {
+                    return Relation.CELL_CROSSES_QUERY;
+                }
+            }
+        };
+    }
+
+    private BKDReader finishAndOpenReaderOneDim(int maxPointsPerLeaf, BKDTreeRamBuffer buffer, IndexComponents indexComponents) throws IOException
+    {
+        // Write the buffered points through a NumericIndexWriter, then reopen the
+        // written components with a BKDReader.
+        final NumericIndexWriter indexWriter = new NumericIndexWriter(
+            indexComponents, maxPointsPerLeaf, Integer.BYTES,
+            Math.toIntExact(buffer.numRows()), buffer.numRows(),
+            new IndexWriterConfig("test", 2, 8), false);
+
+        final SegmentMetadata.ComponentMetadataMap written = indexWriter.writeAll(buffer.asPointValues());
+
+        // Each component must report a root position greater than zero.
+        final long treeRoot = written.get(IndexComponents.NDIType.KD_TREE).root;
+        assertThat(treeRoot, is(greaterThan(0L)));
+        final long postingsRoot = written.get(IndexComponents.NDIType.KD_TREE_POSTING_LISTS).root;
+        assertThat(postingsRoot, is(greaterThan(0L)));
+
+        // Open a reader over the components that were just written.
+        final FileHandle treeHandle = indexComponents.createFileHandle(indexComponents.kdTree);
+        final FileHandle postingsHandle = indexComponents.createFileHandle(indexComponents.kdTreePostingLists);
+
+        return new BKDReader(indexComponents, treeHandle, treeRoot,
+                             postingsHandle, postingsRoot);
+    }
+
+    private BKDReader finishAndOpenReaderOneDim(int maxPointsPerLeaf, MutableOneDimPointValues values, int numRows, IndexComponents indexComponents) throws IOException
+    {
+        // Write the supplied point values through a NumericIndexWriter, then reopen
+        // the written components with a BKDReader.
+        final NumericIndexWriter indexWriter = new NumericIndexWriter(
+            indexComponents, maxPointsPerLeaf, Integer.BYTES,
+            Math.toIntExact(numRows), numRows,
+            new IndexWriterConfig("test", 2, 8), false);
+
+        final SegmentMetadata.ComponentMetadataMap written = indexWriter.writeAll(values);
+
+        // Each component must report a root position greater than zero.
+        final long treeRoot = written.get(IndexComponents.NDIType.KD_TREE).root;
+        assertThat(treeRoot, is(greaterThan(0L)));
+        final long postingsRoot = written.get(IndexComponents.NDIType.KD_TREE_POSTING_LISTS).root;
+        assertThat(postingsRoot, is(greaterThan(0L)));
+
+        // Open a reader over the components that were just written.
+        final FileHandle treeHandle = indexComponents.createFileHandle(indexComponents.kdTree);
+        final FileHandle postingsHandle = indexComponents.createFileHandle(indexComponents.kdTreePostingLists);
+
+        return new BKDReader(indexComponents, treeHandle, treeRoot,
+                             postingsHandle, postingsRoot);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBufferTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBufferTest.java
new file mode 100644
index 000000000000..0edf48e793b7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/BKDTreeRamBufferTest.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.lucene.codecs.MutablePointValues;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.bkd.MutablePointsReaderUtils;
+
+public class BKDTreeRamBufferTest
+{
+    @Test
+    public void shouldKeepInsertionOrder()
+    {
+        // Row i receives the value 202 - i: values descend while row IDs ascend.
+        final BKDTreeRamBuffer ramBuffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        for (int row = 0; row < 100; row++)
+        {
+            final byte[] encoded = new byte[Integer.BYTES];
+            NumericUtils.intToSortableBytes(202 - row, encoded, 0);
+            ramBuffer.addPackedValue(row, new BytesRef(encoded));
+        }
+
+        final MutablePointValues points = ramBuffer.asPointValues();
+
+        for (int slot = 0; slot < 100; slot++)
+        {
+            // Unsorted, slot i must still hold the i-th inserted row and its value.
+            Assert.assertEquals(slot, points.getDocID(slot));
+            final BytesRef stored = new BytesRef();
+            points.getValue(slot, stored);
+            Assert.assertEquals(202 - slot, NumericUtils.sortableBytesToInt(stored.bytes, stored.offset));
+        }
+    }
+
+    @Test
+    public void shouldBeSortable()
+    {
+        // Row i receives the value 301 - i, so the values span 202..301 in descending order.
+        final BKDTreeRamBuffer ramBuffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        for (int row = 0; row < 100; row++)
+        {
+            final byte[] encoded = new byte[Integer.BYTES];
+            NumericUtils.intToSortableBytes(301 - row, encoded, 0);
+            ramBuffer.addPackedValue(row, new BytesRef(encoded));
+        }
+
+        final MutablePointValues points = ramBuffer.asPointValues();
+        final int pointCount = Math.toIntExact(points.size());
+        MutablePointsReaderUtils.sort(100, Integer.BYTES, points, 0, pointCount);
+
+        for (int slot = 0; slot < 100; slot++)
+        {
+            // Sorting by value reverses the insertion order: slot i holds row 99 - i.
+            Assert.assertEquals(99 - slot, points.getDocID(slot));
+            final BytesRef stored = new BytesRef();
+            points.getValue(slot, stored);
+            Assert.assertEquals(202 + slot, NumericUtils.sortableBytesToInt(stored.bytes, stored.offset));
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/FilteringPostingListTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/FilteringPostingListTest.java
new file mode 100644
index 000000000000..966720745f76
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/FilteringPostingListTest.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.lucene.util.FixedBitSet;
+
+public class FilteringPostingListTest extends NdiRandomizedTest
+{
+    // Every test filters the six consecutive postings 0..5.
+    private static final int POSTING_COUNT = 6;
+
+    @Test
+    public void shouldMatchAllWithoutAdvance() throws IOException
+    {
+        verifyFilteringWithoutAdvance(samplePostings(), 0, POSTING_COUNT);
+    }
+
+    @Test
+    public void shouldMatchEndRangeWithoutAdvance() throws IOException
+    {
+        verifyFilteringWithoutAdvance(samplePostings(), 2, POSTING_COUNT);
+    }
+
+    @Test
+    public void shouldMatchStartRangeWithoutAdvance() throws IOException
+    {
+        verifyFilteringWithoutAdvance(samplePostings(), 0, 3);
+    }
+
+    @Test
+    public void shouldMatchMiddleRangeWithoutAdvance() throws IOException
+    {
+        verifyFilteringWithoutAdvance(samplePostings(), 2, 4);
+    }
+
+    @Test
+    public void shouldAdvanceToMiddleMatchingAll() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 0, POSTING_COUNT, 2);
+    }
+
+    @Test
+    public void shouldAdvanceToStartMatchingAll() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 0, POSTING_COUNT, 0);
+    }
+
+    @Test
+    public void shouldAdvanceToEndMatchingAll() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 0, POSTING_COUNT, 5);
+    }
+
+    @Test
+    public void shouldAdvancePastEndMatchingAll() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 0, POSTING_COUNT, 6);
+    }
+
+    @Test
+    public void shouldAdvanceToBeforeMatchStart() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 3, POSTING_COUNT, 1);
+    }
+
+    @Test
+    public void shouldAdvanceToAfterMatchEnd() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 0, 2, 3);
+    }
+
+    @Test
+    public void shouldAdvanceToExactMatchStart() throws IOException
+    {
+        verifyFilteringWithAdvance(samplePostings(), 2, POSTING_COUNT, 2);
+    }
+
+    // Returns a fresh array holding the postings 0..5.
+    private static int[] samplePostings()
+    {
+        return new int[]{ 0, 1, 2, 3, 4, 5 };
+    }
+
+    // Wraps the postings in a FilteringPostingList whose bit set covers [from, toExclusive).
+    private FilteringPostingList filtered(int[] postingsArray, int from, int toExclusive) throws IOException
+    {
+        final ArrayPostingList source = new ArrayPostingList(postingsArray);
+        final FixedBitSet mask = new FixedBitSet((int) source.size());
+        mask.set(from, toExclusive);
+        return new FilteringPostingList(mask, source);
+    }
+
+    // Plain iteration over the filtered list must yield exactly postingsArray[from..toExclusive).
+    private void verifyFilteringWithoutAdvance(int[] postingsArray, int from, int toExclusive) throws IOException
+    {
+        final FilteringPostingList actual = filtered(postingsArray, from, toExclusive);
+        final int[] matching = Arrays.copyOfRange(postingsArray, from, toExclusive);
+        assertPostingListEquals(new ArrayPostingList(matching), actual);
+    }
+
+    // Advances the filtered list to target, checks where it lands, then compares the
+    // remainder of the stream with a plain ArrayPostingList positioned at the same spot.
+    private void verifyFilteringWithAdvance(int[] postingsArray, int from, int toExclusive, int target) throws IOException
+    {
+        final FilteringPostingList actual = filtered(postingsArray, from, toExclusive);
+
+        // Make sure the expected advance ID is either in the range of matches or the sentinel value.
+        final long expectedAdvanceTo;
+        if (target < from)
+            expectedAdvanceTo = from;
+        else if (target >= toExclusive)
+            expectedAdvanceTo = PostingList.END_OF_STREAM;
+        else
+            expectedAdvanceTo = target;
+
+        long id;
+        try
+        {
+            id = actual.advance(target);
+        }
+        catch (Exception e)
+        {
+            // Retry exactly once when the first attempt throws; a second failure
+            // propagates to the caller.
+            id = actual.advance(target);
+        }
+
+        assertEquals(expectedAdvanceTo, id);
+        if (id == PostingList.END_OF_STREAM)
+        {
+            return;
+        }
+
+        final ArrayPostingList expected = new ArrayPostingList(postingsArray);
+        expected.advance(target);
+
+        // Advance to the first actual match...
+        while (expected.getOrdinal() <= from)
+        {
+            expected.nextPosting();
+        }
+
+        assertPostingListEquals(expected, actual);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/LeafOrderMapTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/LeafOrderMapTest.java
new file mode 100644
index 000000000000..2b5dc85d496a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/LeafOrderMapTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.io.RAMIndexOutput;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.SeekingRandomAccessInput;
+import org.apache.lucene.store.ByteArrayIndexInput;
+import org.apache.lucene.util.packed.DirectWriter;
+
+public class LeafOrderMapTest extends NdiRandomizedTest
+{
+    @Test
+    public void test() throws Exception
+    {
+        // Start from the identity mapping over 1024 entries, then shuffle it.
+        final int[] order = new int[1024];
+        for (int i = 0; i < order.length; i++)
+            order[i] = i;
+        shuffle(order);
+
+        final int lastIndex = order.length - 1;
+        final RAMIndexOutput written = new RAMIndexOutput("");
+        LeafOrderMap.write(order, order.length, lastIndex, written);
+
+        final ByteArrayIndexInput source = new ByteArrayIndexInput("", written.getBytes(), 0, (int) written.getFilePointer());
+
+        // The reader uses the bit width required by the same bound that was passed to write().
+        final byte bitsPerValue = (byte) DirectWriter.unsignedBitsRequired(lastIndex);
+        final DirectReaders.Reader decoder = DirectReaders.getReaderForBitsPerValue(bitsPerValue);
+
+        // Every position must decode to the value stored at that position.
+        for (int i = 0; i < order.length; i++)
+        {
+            final int decoded = LeafOrderMap.getValue(new SeekingRandomAccessInput(source), 0, i, decoder);
+            assertEquals(order[i], decoded);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/MergePostingListTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/MergePostingListTest.java
new file mode 100644
index 000000000000..1ada0aea3065
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/MergePostingListTest.java
@@ -0,0 +1,337 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.google.common.primitives.Ints;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+
+public class MergePostingListTest extends NdiRandomizedTest
+{
+    @Test
+    public void shouldMergeInterleavedPostingLists() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 4, 6 }),
+                new ArrayPostingList(new int[]{ 2, 3, 4 }),
+                new ArrayPostingList(new int[]{ 1, 6 }),
+                new ArrayPostingList(new int[]{ 2, 5 }),
+                new ArrayPostingList(new int[]{ 3, 6 }),
+                new ArrayPostingList(new int[]{ 3, 5, 6 }));
+        // Row IDs shared by several children (e.g. 1, 3, 6) must come out exactly once.
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertPostingListEquals(new ArrayPostingList(new int[]{ 1, 2, 3, 4, 5, 6 }), merged);
+    }
+
+    @Test
+    public void shouldMergeDisjointPostingLists() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 6 }),
+                new ArrayPostingList(new int[]{ 8, 9, 11 }),
+                new ArrayPostingList(new int[]{ 15 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertPostingListEquals(new ArrayPostingList(new int[]{ 1, 6, 8, 9, 11, 15 }), merged);
+    }
+
+    @Test
+    public void shouldMergeSinglePostingList() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(new ArrayPostingList(new int[]{ 1, 4, 6 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertPostingListEquals(new ArrayPostingList(new int[]{ 1, 4, 6 }), merged);
+    }
+
+    @Test
+    public void shouldMergeSamePostingLists() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(new ArrayPostingList(new int[]{ 0 }),
+                                                                                      new ArrayPostingList(new int[]{ 0 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertPostingListEquals(new ArrayPostingList(new int[]{ 0 }), merged);
+    }
+
+    @Test
+    public void shouldAdvanceAllMergedLists() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 5, 10 }),
+                new ArrayPostingList(new int[]{ 2, 3, 8 }),
+                new ArrayPostingList(new int[]{ 3, 5, 9 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+        final PostingList expected = new ArrayPostingList(new int[]{ 1, 2, 3, 5, 8, 9, 10 });
+
+        assertEquals(expected.advance(9),
+                     merged.advance(9));
+
+        assertPostingListEquals(expected, merged);
+    }
+
+    // 2 and 4 each occur in more than one child list; after advance(4) only 5 and 6 may remain.
+    @Test
+    public void shouldConsumeDuplicatedPostingOnAdvance() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 4, 6 }),
+                new ArrayPostingList(new int[]{ 2, 3, 4 }),
+                new ArrayPostingList(new int[]{ 1, 6 }),
+                new ArrayPostingList(new int[]{ 2, 5 }),
+                new ArrayPostingList(new int[]{ 3, 6 }),
+                new ArrayPostingList(new int[]{ 3, 5, 6 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertEquals(2, merged.advance(2));
+        assertEquals(4, merged.advance(4));
+        assertPostingListEquals(new ArrayPostingList(new int[]{ 5, 6 }), merged);
+    }
+
+    @Test
+    public void shouldInterleaveNextAndAdvance() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 4, 6 }),
+                new ArrayPostingList(new int[]{ 2, 3, 4 }),
+                new ArrayPostingList(new int[]{ 1, 6 }),
+                new ArrayPostingList(new int[]{ 2, 5 }),
+                new ArrayPostingList(new int[]{ 3, 6 }),
+                new ArrayPostingList(new int[]{ 3, 5, 6 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        assertEquals(2, merged.advance(2));
+        assertEquals(3, merged.nextPosting());
+        assertEquals(5, merged.advance(5));
+        assertEquals(6, merged.nextPosting());
+    }
+
+    @Test
+    public void shouldAdvanceToAllElementsWithoutFailures()
+    {
+        testAdvancingToAllElements();
+    }
+
+    @Test
+    public void shouldNotSkipUnconsumedElementOnAdvance() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 1, 2 }),
+                new ArrayPostingList(new int[]{ 3 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+        assertEquals(1, merged.nextPosting());
+        assertEquals(2, merged.advance(2));
+        assertEquals(3, merged.nextPosting());
+    }
+
+    @Test
+    public void shouldNotReadFromExhaustedChild() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(
+                new ArrayPostingList(new int[]{ 2 }),
+                new ArrayPostingList(new int[]{ 1, 3, 4 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+        assertEquals(1, merged.nextPosting());
+        assertEquals(3, merged.advance(3));
+        assertEquals(4, merged.advance(4));
+    }
+
+    @Test
+    public void shouldSkipDuplicates() throws IOException
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(new ArrayPostingList(new int[]{ 1, 1, 2, 2, 2, 2, 5, 5 }),
+                                                                                      new ArrayPostingList(new int[]{ 1, 2, 2, 3, 3, 4, 4, 5 }));
+
+        final PostingList merged = MergePostingList.merge(lists);
+        assertEquals(1, merged.nextPosting());
+        assertEquals(2, merged.nextPosting());
+        assertEquals(3, merged.advance(3));
+        assertEquals(4, merged.advance(4));
+        assertEquals(5, merged.nextPosting());
+        assertEquals(PostingList.END_OF_STREAM, merged.nextPosting());
+    }
+
+    @Test
+    public void shouldInterleaveNextAndAdvanceOnRandom() throws IOException
+    {
+        for (int i = 0; i < 1000; ++i)
+        {
+            testAdvancingOnRandom();
+        }
+    }
+
+    private PriorityQueue<PostingList.PeekablePostingList> newPriorityQueue(PostingList...postingLists)
+    {
+        PriorityQueue<PostingList.PeekablePostingList> queue = new PriorityQueue<>(postingLists.length, Comparator.comparingLong(PostingList.PeekablePostingList::peek));
+        for (PostingList postingList : postingLists)
+            queue.add(postingList.peekable()); // the queue orders the children by their peek() value
+        return queue;
+    }
+    
+    private void testAdvancingOnRandom() throws IOException
+    {
+        final int postingsCount = nextInt(1, 50_000);
+        final int postingListCount = nextInt(5, 50);
+        // Row IDs accumulate random increments; repeats are stripped below for the expected list.
+        final AtomicInteger rowId = new AtomicInteger();
+        final int[] postings = IntStream.generate(() -> rowId.addAndGet(nextInt(0, 10)))
+                                        .limit(postingsCount)
+                                        .toArray();
+        final int[] postingsWithoutDuplicates = IntStream.of(postings)
+                                                         .distinct()
+                                                         .toArray();
+
+        // split postings into multiple lists
+        final Map<Integer, List<Integer>> splitPostings = Arrays.stream(postings)
+                                                                .boxed()
+                                                                .collect(Collectors.groupingBy(it -> nextInt(postingListCount)));
+
+        final PriorityQueue<PostingList.PeekablePostingList> splitPostingLists = new PriorityQueue<>(splitPostings.size(), Comparator.comparingLong(PostingList.PeekablePostingList::peek));
+        for (List<Integer> split : splitPostings.values())
+        {
+            splitPostingLists.add(new ArrayPostingList(Ints.toArray(split)).peekable());
+        }
+
+        final PostingList merge = MergePostingList.merge(splitPostingLists);
+        final PostingList expected = new ArrayPostingList(postingsWithoutDuplicates);
+        // Script a random mix of nextPosting() and advance() calls to replay against both lists.
+        final List<PostingListAdvance> actions = new ArrayList<>();
+        for (int idx = 0; idx < postingsWithoutDuplicates.length; idx++)
+        {
+            if (nextInt(0, 8) == 0)
+            {
+                actions.add((postingList) -> {
+                    try
+                    {
+                        return postingList.nextPosting();
+                    }
+                    catch (IOException e)
+                    {
+                        fail(e.getMessage());
+                        throw new RuntimeException(e);
+                    }
+                });
+            }
+            else
+            {
+                final int skips = nextInt(0, 10);
+                idx = Math.min(idx + skips, postingsWithoutDuplicates.length - 1);
+                final int rowID = postingsWithoutDuplicates[idx];
+                actions.add((postingList) -> {
+                    while (true)
+                    {
+                        try
+                        {
+                            return postingList.advance(rowID);
+                        }
+                        catch (ArrayPostingList.LookupException ignore)
+                        {
+                            // fall through: the loop retries the same advance
+                        }
+                        catch (Exception e)
+                        {
+                            fail();
+                        }
+                    }
+                });
+            }
+        }
+        // Replay every scripted action on the reference list and on the merged list; results must agree.
+        for (PostingListAdvance action : actions)
+        {
+            long expectedResult = action.advance(expected);
+            long actualResult = action.advance(merge);
+
+            assertEquals(expectedResult, actualResult);
+        }
+    }
+
+    private void testAdvancingToAllElements()
+    {
+        final int[] postings1 = randomPostings();
+        final int[] postings2 = randomPostings();
+        // Reference result: the sorted, de-duplicated union of both inputs.
+        final int[] mergedPostings = IntStream.concat(IntStream.of(postings1), IntStream.of(postings2))
+                                              .distinct()
+                                              .sorted()
+                                              .toArray();
+
+        final PriorityQueue<PostingList.PeekablePostingList> lists = newPriorityQueue(new ArrayPostingList(postings1), new ArrayPostingList(postings2));
+
+        final PostingList merged = MergePostingList.merge(lists);
+
+        // tokens are equal row IDs in this test case
+        for (int targetRowID : mergedPostings)
+        {
+            long rowID;
+            while (true)
+            {
+                try
+                {
+                    rowID = merged.advance(targetRowID);
+                    break;
+                }
+                catch (ArrayPostingList.LookupException ignore)
+                {
+                    // fall through: the loop retries the same advance
+                }
+                catch (Exception e)
+                {
+                    fail();
+                }
+            }
+            assertEquals(targetRowID, rowID);
+        }
+    }
+
+    private int[] randomPostings()
+    {
+        final AtomicInteger rowId = new AtomicInteger();
+        return IntStream.generate(() -> rowId.getAndAdd(randomIntBetween(0, 5)))
+                        .limit(randomIntBetween(1 << 10, 1 << 12))
+                        .toArray();
+    }
+
+    private interface PostingListAdvance
+    {
+        long advance(PostingList list) throws IOException; // performs one scripted call and returns its result
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/MetadataTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/MetadataTest.java
new file mode 100644
index 000000000000..ef86b487fcc5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/MetadataTest.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexOutputWriter;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.store.IndexInput;
+
+public class MetadataTest extends NdiRandomizedTest
+{
+    @Rule
+    public final ExpectedException expectedException = ExpectedException.none();
+
+    @Test
+    public void shouldReadWrittenMetadata() throws Exception
+    {
+        // Round trip: write a random number of randomly sized named entries, then read each one back.
+        final IndexComponents components = newIndexComponents();
+        final Map<String, byte[]> written = new HashMap<>();
+        try (MetadataWriter writer = new MetadataWriter(components.createOutput(components.meta)))
+        {
+            final int entryCount = nextInt(1, 50);
+            for (int i = 0; i < entryCount; i++)
+            {
+                final byte[] payload = nextBytes(0, 1024);
+                final String key = UUID.randomUUID().toString();
+                written.put(key, payload);
+                try (MetadataWriter.Builder builder = writer.builder(key))
+                {
+                    builder.writeBytes(payload, 0, payload.length);
+                }
+            }
+        }
+
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+        // every entry written above must come back under the same name with identical bytes
+        for (Map.Entry<String, byte[]> pair : written.entrySet())
+        {
+            final byte[] expected = pair.getValue();
+            final IndexInput input = source.get(pair.getKey());
+            assertNotNull(input);
+            assertEquals(expected.length, input.length());
+            final byte[] actual = new byte[expected.length];
+            input.readBytes(actual, 0, expected.length);
+            assertArrayEquals(expected, actual);
+        }
+    }
+
+    @Test
+    public void shouldFailWhenFileHasNoHeader() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        try (IndexOutputWriter rawOutput = indexComponents.createOutput(indexComponents.meta))
+        {
+            final byte[] garbage = nextBytes(13, 29);
+            rawOutput.writeBytes(garbage, garbage.length);
+        }
+        // the file now holds only raw random bytes (no MetadataWriter was involved), so loading must reject it
+        expectedException.expect(CorruptIndexException.class);
+        expectedException.expectMessage("codec header mismatch");
+        MetadataSource.loadColumnMetadata(indexComponents);
+    }
+
+    @Test
+    public void shouldFailCrcCheckWhenFileIsTruncated() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final String metaPath = writeRandomBytes(indexComponents).getPath();
+
+        // Move the valid file aside so that a shortened copy can be written at the original path.
+        final File original = new File(metaPath);
+        final long originalLength = original.length();
+        assertTrue(originalLength > 0);
+        final File backup = temporaryFolder.newFile();
+        FileUtils.renameWithConfirm(original, backup);
+        assertFalse(new File(metaPath).exists());
+        try (FileOutputStream truncated = new FileOutputStream(metaPath);
+             RandomAccessFile source = new RandomAccessFile(backup, "r"))
+        {
+            // copy everything except the final byte
+            FileUtils.copyTo(source, truncated, Math.toIntExact(originalLength - 1));
+        }
+
+        expectedException.expect(CorruptIndexException.class);
+        expectedException.expectMessage("misplaced codec footer (file truncated?)");
+        MetadataSource.loadColumnMetadata(indexComponents);
+    }
+
+    @Test
+    public void shouldFailCrcCheckWhenFileIsCorrupted() throws IOException
+    {
+        final IndexComponents components = newIndexComponents();
+        final IndexOutputWriter output = writeRandomBytes(components);
+        // Rebuild the file at its original path with the last byte before the footer inverted.
+        final File indexFile = new File(output.getPath());
+        final long length = indexFile.length();
+        assertTrue(length > 0);
+        final File renamed = temporaryFolder.newFile();
+        FileUtils.renameWithConfirm(indexFile, renamed);
+        assertFalse(new File(output.getPath()).exists());
+
+        try (FileOutputStream outputStream = new FileOutputStream(output.getPath());
+             RandomAccessFile file = new RandomAccessFile(renamed, "r"))
+        {
+            // copy most of the file untouched; readFully because read(byte[]) may legally return fewer bytes
+            final byte[] buffer = new byte[Math.toIntExact(length - 1 - CodecUtil.footerLength())];
+            file.readFully(buffer);
+            outputStream.write(buffer);
+
+            // corrupt a single byte at the end (write(int) emits only the low eight bits of ~last)
+            final byte last = (byte) file.read();
+            outputStream.write(~last);
+
+            // copy the footer verbatim so that only the checksum comparison can fail
+            final byte[] footer = new byte[CodecUtil.footerLength()];
+            file.readFully(footer);
+            outputStream.write(footer);
+        }
+
+        expectedException.expect(CorruptIndexException.class);
+        expectedException.expectMessage("checksum failed");
+        MetadataSource.loadColumnMetadata(components);
+    }
+
+    private IndexOutputWriter writeRandomBytes(IndexComponents indexComponents) throws IOException
+    {
+        final IndexOutputWriter metaOutput = indexComponents.createOutput(indexComponents.meta);
+        try (MetadataWriter metadataWriter = new MetadataWriter(metaOutput))
+        {
+            final byte[] payload = nextBytes(11, 1024);
+            // a single entry called "name" holding random bytes is all the corruption tests need
+            try (MetadataWriter.Builder entry = metadataWriter.builder("name"))
+            {
+                entry.writeBytes(payload, 0, payload.length);
+            }
+        }
+        return metaOutput; // callers in this class only use it for getPath()
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriterTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriterTest.java
new file mode 100644
index 000000000000..9c320e2c37d0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriterTest.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.disk.ImmutableOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.MemtableTermsIterator;
+import org.apache.cassandra.index.sai.disk.MutableOneDimPointValues;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.TermsIterator;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.QueryEventListeners;
+import org.apache.cassandra.index.sai.utils.AbstractIterator;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.NumericUtils;
+
+public class NumericIndexWriterTest extends NdiRandomizedTest
+{
+    @Test
+    public void shouldFlushFromRamBuffer() throws Exception
+    {
+        doShouldFlushFromRamBuffer(); // the whole scenario lives in the private helper
+    }
+
+    private void doShouldFlushFromRamBuffer() throws Exception
+    {
+        // Buffer the values 120 down to 1 (one per row ID), flush them, then walk the tree that was written.
+        final int rowCount = 120;
+        final BKDTreeRamBuffer buffer = new BKDTreeRamBuffer(1, Integer.BYTES);
+        for (int rowId = 0, value = rowCount; rowId < rowCount; ++rowId, --value)
+        {
+            final byte[] packed = new byte[Integer.BYTES];
+            NumericUtils.intToSortableBytes(value, packed, 0);
+            buffer.addPackedValue(rowId, new BytesRef(packed));
+        }
+
+        final MutableOneDimPointValues points = buffer.asPointValues();
+        final IndexComponents components = newIndexComponents();
+        final int docs = points.getDocCount();
+
+        SegmentMetadata.ComponentMetadataMap metas;
+        try (NumericIndexWriter indexWriter = new NumericIndexWriter(components,
+                                                                     Integer.BYTES,
+                                                                     docs, docs,
+                                                                     IndexWriterConfig.defaultConfig("test"),
+                                                                     false))
+        {
+            metas = indexWriter.writeAll(points);
+        }
+
+        // re-open the two on-disk components produced by the writer
+        final FileHandle treeHandle = components.createFileHandle(components.kdTree);
+        final FileHandle postingsHandle = components.createFileHandle(components.kdTreePostingLists);
+
+        try (BKDReader bkdReader = new BKDReader(components,
+                                                 treeHandle,
+                                                 metas.get(components.kdTree.ndiType).root,
+                                                 postingsHandle,
+                                                 metas.get(components.kdTreePostingLists.ndiType).root))
+        {
+            final Counter seen = Counter.newCounter();
+            final BKDReader.IntersectVisitor visitor = new BKDReader.IntersectVisitor()
+            {
+                @Override
+                public boolean visit(byte[] packedValue)
+                {
+                    // inserted in descending order, but the values must be handed back ascending from 1
+                    assertEquals(1 + seen.get(), NumericUtils.sortableBytesToInt(packedValue, 0));
+                    seen.addAndGet(1);
+                    return true;
+                }
+
+                @Override
+                public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+                {
+                    return PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+            };
+
+            try (PostingList ignored = bkdReader.intersect(visitor, QueryEventListeners.NO_OP_BKD_LISTENER, new QueryContext()))
+            {
+                assertEquals(rowCount, seen.get());
+            }
+        }
+    }
+
+    @Test
+    public void shouldFlushFromMemtable() throws Exception
+    {
+        // Terms 0..99, one row each, flow from a memtable-style iterator through the writer and back out.
+        final int termCount = 100;
+        final ImmutableOneDimPointValues points =
+                ImmutableOneDimPointValues.fromTermEnum(buildTermEnum(0, termCount), Int32Type.instance);
+        final IndexComponents components = newIndexComponents();
+
+        SegmentMetadata.ComponentMetadataMap metas;
+        try (NumericIndexWriter indexWriter = new NumericIndexWriter(components,
+                                                                     TypeUtil.fixedSizeOf(Int32Type.instance),
+                                                                     termCount, termCount,
+                                                                     IndexWriterConfig.defaultConfig("test"),
+                                                                     false))
+        {
+            metas = indexWriter.writeAll(points);
+        }
+
+        final FileHandle treeHandle = components.createFileHandle(components.kdTree);
+        final FileHandle postingsHandle = components.createFileHandle(components.kdTreePostingLists);
+
+        try (BKDReader bkdReader = new BKDReader(components,
+                                                 treeHandle,
+                                                 metas.get(components.kdTree.ndiType).root,
+                                                 postingsHandle,
+                                                 metas.get(components.kdTreePostingLists.ndiType).root))
+        {
+            final Counter seen = Counter.newCounter();
+            final BKDReader.IntersectVisitor visitor = new BKDReader.IntersectVisitor()
+            {
+                @Override
+                public boolean visit(byte[] packedValue)
+                {
+                    // the n-th visited point (counting from zero) must encode the term n
+                    final ByteComparable actual = ByteComparable.fixedLength(packedValue);
+                    final ByteComparable expected = ByteComparable.of(Math.toIntExact(seen.get()));
+                    final int order = ByteComparable.compare(actual, expected, ByteComparable.Version.OSS41);
+                    assertEquals("Point value mismatch after visiting " + seen.get() + " entries.", 0, order);
+                    seen.addAndGet(1);
+                    return true;
+                }
+
+                @Override
+                public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue)
+                {
+                    return PointValues.Relation.CELL_CROSSES_QUERY;
+                }
+            };
+            try (PostingList ignored = bkdReader.intersect(visitor, QueryEventListeners.NO_OP_BKD_LISTENER, new QueryContext()))
+            {
+                assertEquals(termCount, seen.get());
+            }
+        }
+    }
+
+    private TermsIterator buildTermEnum(int startTermInclusive, int endTermExclusive)
+    {
+        final ByteBuffer minTerm = Int32Type.instance.decompose(startTermInclusive);
+        final ByteBuffer maxTerm = Int32Type.instance.decompose(endTermExclusive);
+        // Yields one (term, [rowId]) pair per integer in [start, end), with row IDs counting up from 0.
+        final AbstractIterator<Pair<ByteComparable, IntArrayList>> iterator = new AbstractIterator<Pair<ByteComparable, IntArrayList>>()
+        {
+            private int currentTerm = startTermInclusive;
+            private int currentRowId = 0;
+
+            @Override
+            protected Pair<ByteComparable, IntArrayList> computeNext()
+            {
+                if (currentTerm >= endTermExclusive)
+                {
+                    return endOfData();
+                }
+                final ByteBuffer term = Int32Type.instance.decompose(currentTerm++);
+                final IntArrayList postings = new IntArrayList();
+                postings.add(currentRowId++);
+                // Encode on demand: a ByteSource is consumed as it is read, so every read needs a fresh one.
+                return Pair.create(v -> Int32Type.instance.asComparableBytes(term, ByteComparable.Version.OSS41), postings);
+            }
+        };
+
+        return new MemtableTermsIterator(minTerm, maxTerm, iterator);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericValuesTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericValuesTest.java
new file mode 100644
index 000000000000..746f842ad9ae
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/NumericValuesTest.java
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+
+import java.util.Arrays;
+import java.util.function.LongFunction;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.io.util.FileHandle;
+
+public class NumericValuesTest extends NdiRandomizedTest
+{
+    @Test
+    public void testMonotonic() throws Exception
+    {
+        doTest(true); // round trip read back through MonotonicBlockPackedReader
+    }
+
+    @Test
+    public void testRegular() throws Exception
+    {
+        doTest(false); // round trip read back through BlockPackedReader
+    }
+
+    @Test
+    public void testRepeatsMonotonicValues() throws Exception
+    {
+        testRepeatedNumericValues(true); // 64k copies of one value, monotonic writer and reader
+    }
+
+    @Test
+    public void testRepeatsRegularValues() throws Exception
+    {
+        testRepeatedNumericValues(false); // 64k copies of one value, regular writer and reader
+    }
+
+    private void testRepeatedNumericValues(boolean monotonic) throws Exception
+    {
+        int length = 64_000;
+        final IndexComponents components = newIndexComponents();
+        writeTokens(monotonic, components, new long[length], prev -> 1000L);
+        // every one of the written values is 1000, so every index must read back as 1000
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+
+        try (FileHandle fileHandle = components.createFileHandle(IndexComponents.TOKEN_VALUES);
+             LongArray reader = monotonic ? new MonotonicBlockPackedReader(fileHandle, IndexComponents.TOKEN_VALUES, components, source).open()
+                                          : new BlockPackedReader(fileHandle, IndexComponents.TOKEN_VALUES, components, source).open())
+        {
+            for (int x = 0; x < length; x++)
+            {
+                assertEquals("value at index " + x, 1000L, reader.get(x)); // expected value first, then actual
+            }
+        }
+    }
+
+    @Test
+    public void testRepeatsRegularValuesFindTokenRowID() throws Exception
+    {
+        testRepeatedNumericValuesFindTokenRowID(); // token lookups over 64k identical tokens
+    }
+
+    @Test
+    public void testTokenFind() throws Exception
+    {
+        final long[] tokens = new long[64_000];
+        final IndexComponents components = newIndexComponents();
+        // steps come from nextInt(2, 100) - presumably never below 2 - so (token - 1) is never itself a written token
+        writeTokens(false, components, tokens, prev -> prev + nextInt(2, 100));
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+
+        // exact match
+        try (FileHandle handle = components.createFileHandle(IndexComponents.TOKEN_VALUES);
+             LongArray reader = new BlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source).open())
+        {
+            assertEquals(tokens.length, reader.length());
+            for (int i = 0; i < tokens.length; i++)
+            {
+                final long found = reader.findTokenRowID(tokens[i]);
+                assertEquals("rowID=" + i + " token=" + tokens[i], i, found);
+                assertEquals(found, reader.findTokenRowID(tokens[i]));
+            }
+        }
+
+        // non-exact match
+        try (FileHandle handle = components.createFileHandle(IndexComponents.TOKEN_VALUES);
+             LongArray reader = new BlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source).open())
+        {
+            assertEquals(tokens.length, reader.length());
+            for (int i = 0; i < tokens.length; i++)
+            {
+                final long target = tokens[i] - 1;
+                final long found = reader.findTokenRowID(target);
+                assertEquals("rowID=" + i + " matched token=" + tokens[i] + " target token=" + target, i, found);
+                assertEquals(found, reader.findTokenRowID(target));
+            }
+        }
+    }
+
+    private void testRepeatedNumericValuesFindTokenRowID() throws Exception
+    {
+        final int tokenCount = 64_000;
+        final long repeatedToken = 1000L;
+        final IndexComponents components = newIndexComponents();
+        writeTokens(false, components, new long[tokenCount], prev -> repeatedToken);
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+
+        try (FileHandle handle = components.createFileHandle(IndexComponents.TOKEN_VALUES);
+             LongArray reader = new BlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source).open())
+        {
+            // with every row holding the same token, each lookup must resolve to the first row
+            for (int attempt = 0; attempt < tokenCount; attempt++)
+            {
+                assertEquals(0, reader.findTokenRowID(repeatedToken));
+            }
+        }
+    }
+
+    @Test
+    public void testMultiSegmentFindTokenRowId() throws Exception
+    {
+        // One token file is opened at several segment offsets; lookups must be reported relative to
+        // the start of whichever segment the reader was opened for.
+
+        final IndexComponents components = newIndexComponents();
+        final int tokenCount = 64_000;
+        final long[] tokens = new long[tokenCount];
+        writeTokens(false, components, tokens, prev -> prev + nextInt(1, 100));
+
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+
+        try (FileHandle handle = components.createFileHandle(IndexComponents.TOKEN_VALUES))
+        {
+            final LongArray.Factory wholeFile = new BlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source);
+
+            // three fixed offsets plus one random offset
+            for (int offset : Arrays.asList(0, 33, 123, nextInt(tokenCount)))
+            {
+                final LongArray.Factory segment = wholeFile.withOffset(offset);
+                try (LongArray reader = segment.openTokenReader(0, SSTableQueryContext.forTest()))
+                {
+                    for (int row = 0; row < tokenCount; row++)
+                    {
+                        final long segmentRowId = reader.findTokenRowID(tokens[row]);
+
+                        // tokens smaller than the first token in the segment resolve to segment row 0;
+                        // tokens within the segment resolve to their position inside it
+                        final long expected = row < offset ? 0 : row - offset;
+                        assertEquals(expected, segmentRowId);
+                    }
+                }
+            }
+        }
+    }
+
+    private void doTest(boolean monotonic) throws Exception
+    {
+        // Write 64k values (running sums when monotonic, independent draws otherwise) and read each back by index.
+        final long[] expected = new long[64_000];
+        final IndexComponents components = newIndexComponents();
+        writeTokens(monotonic, components, expected, last -> monotonic ? last + nextInt(100) : nextInt(100));
+
+        final MetadataSource source = MetadataSource.loadColumnMetadata(components);
+        try (FileHandle handle = components.createFileHandle(IndexComponents.TOKEN_VALUES);
+             LongArray reader = monotonic ? new MonotonicBlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source).open()
+                                          : new BlockPackedReader(handle, IndexComponents.TOKEN_VALUES, components, source).open())
+        {
+            assertEquals(expected.length, reader.length());
+            for (int index = 0; index < expected.length; index++)
+            {
+                assertEquals(expected[index], reader.get(index));
+            }
+        }
+    }
+
+    private void writeTokens(boolean monotonic, IndexComponents components, long[] array, LongFunction<Long> generator) throws Exception
+    {
+        // Feeds the generator its own previous output (starting from 0), hands each value to the writer,
+        // and mirrors it into `array` so that callers know what to expect when reading back.
+        final int blockSize = 1 << nextInt(8, 15);
+        try (MetadataWriter metadataWriter = new MetadataWriter(components.createOutput(components.meta));
+             NumericValuesWriter valuesWriter = new NumericValuesWriter(IndexComponents.TOKEN_VALUES,
+                                                                        components,
+                                                                        metadataWriter,
+                                                                        monotonic,
+                                                                        blockSize))
+        {
+            long previous = 0;
+            for (int slot = 0; slot < array.length; slot++)
+            {
+                final long value = generator.apply(previous);
+                valuesWriter.add(value);
+                array[slot] = value;
+                previous = value;
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriterTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriterTest.java
new file mode 100644
index 000000000000..727eb01f7192
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/OneDimBKDPostingsWriterTest.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.junit.Test;
+
+import org.agrona.collections.IntArrayList;
+import org.apache.cassandra.index.sai.disk.IndexWriterConfig;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.IndexOutputWriter;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PackedLongValues;
+
+import static org.apache.cassandra.index.sai.metrics.QueryEventListeners.NO_OP_POSTINGS_LISTENER;
+
+public class OneDimBKDPostingsWriterTest extends NdiRandomizedTest
+{
+    @Test
+    public void shouldWritePostingsForEligibleNodes() throws IOException
+    {
+        final int[] leaf64 = { 1, 5, 7 };
+        final int[] leaf80 = { 3, 4, 6 };
+        final int[] leaf96 = { 2, 8, 10 };
+        final int[] leaf112 = { 11, 12, 13 };
+        final List<PackedLongValues> leaves = Arrays.asList(postings(leaf64), postings(leaf80), postings(leaf96), postings(leaf112));
+        final OneDimBKDPostingsWriter writer = new OneDimBKDPostingsWriter(leaves, new IndexWriterConfig("test", 2, 1), newIndexComponents());
+
+        // should build postings for nodes 2 & 3 (lvl 2) and 8, 10, 12, 14 (lvl 4)
+        writer.onLeaf(64, 1, pathToRoot(1, 2, 4, 8, 16));
+        writer.onLeaf(80, 2, pathToRoot(1, 2, 5, 10, 20));
+        writer.onLeaf(96, 3, pathToRoot(1, 3, 6, 12, 24));
+        writer.onLeaf(112, 4, pathToRoot(1, 3, 7, 14, 28));
+
+        final IndexComponents indexComponents = newIndexComponents();
+        long fp;
+        try (IndexOutputWriter output = indexComponents.createOutput(indexComponents.kdTreePostingLists))
+        {
+            fp = writer.finish(output);
+        }
+
+        final BKDPostingsIndex postingsIndex = new BKDPostingsIndex(indexComponents.createFileHandle(indexComponents.kdTreePostingLists), fp);
+        // six internal nodes plus four leaves
+        assertEquals(10, postingsIndex.size());
+
+        // Internal postings...
+        for (int nodeID : new int[]{ 2, 3, 8, 10, 12, 14 })
+        {
+            assertTrue(postingsIndex.exists(nodeID));
+        }
+        // level-2 nodes carry the sorted union of the two leaves beneath them; level-4 nodes mirror a single leaf
+        assertPostingReaderEquals(indexComponents, postingsIndex, 2, new int[]{ 1, 3, 4, 5, 6, 7 });
+        assertPostingReaderEquals(indexComponents, postingsIndex, 3, new int[]{ 2, 8, 10, 11, 12, 13 });
+        assertPostingReaderEquals(indexComponents, postingsIndex, 8, leaf64);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 10, leaf80);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 12, leaf96);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 14, leaf112);
+
+        // Leaf postings...
+        for (int leafID : new int[]{ 64, 80, 96, 112 })
+        {
+            assertTrue(postingsIndex.exists(leafID));
+        }
+        assertPostingReaderEquals(indexComponents, postingsIndex, 64, leaf64);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 80, leaf80);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 96, leaf96);
+        assertPostingReaderEquals(indexComponents, postingsIndex, 112, leaf112);
+    }
+
+    @Test
+    public void shouldSkipPostingListWhenSamplingMisses() throws IOException
+    {
+        final List<PackedLongValues> singleLeaf = Collections.singletonList(postings(1, 2, 3));
+        final OneDimBKDPostingsWriter postingsWriter = new OneDimBKDPostingsWriter(singleLeaf, new IndexWriterConfig("test", 5, 1), newIndexComponents());
+
+        // The tree is too short to have any internal posting lists.
+        postingsWriter.onLeaf(16, 1, pathToRoot(1, 2, 4, 8));
+
+        final IndexComponents components = newIndexComponents();
+        long indexFilePointer;
+        try (IndexOutputWriter out = components.createOutput(components.kdTreePostingLists))
+        {
+            indexFilePointer = postingsWriter.finish(out);
+        }
+
+        // Only the leaf's own posting list is expected in the index.
+        final BKDPostingsIndex index = new BKDPostingsIndex(components.createFileHandle(components.kdTreePostingLists), indexFilePointer);
+        assertEquals(1, index.size());
+    }
+
+    @Test
+    public void shouldSkipPostingListWhenTooFewLeaves() throws IOException
+    {
+        final List<PackedLongValues> singleLeaf = Collections.singletonList(postings(1, 2, 3));
+        final OneDimBKDPostingsWriter postingsWriter = new OneDimBKDPostingsWriter(singleLeaf, new IndexWriterConfig("test", 2, 2), newIndexComponents());
+
+        // NOTE(review): presumably the single leaf is below the configured minimum (third config argument) - confirm
+        postingsWriter.onLeaf(16, 1, pathToRoot(1, 2, 4, 8));
+
+        final IndexComponents components = newIndexComponents();
+        long indexFilePointer;
+        try (IndexOutputWriter out = components.createOutput(components.kdTreePostingLists))
+        {
+            indexFilePointer = postingsWriter.finish(out);
+        }
+
+        // Only the leaf's own posting list is expected in the index.
+        final BKDPostingsIndex index = new BKDPostingsIndex(components.createFileHandle(components.kdTreePostingLists), indexFilePointer);
+        assertEquals(1, index.size());
+    }
+
+    private void assertPostingReaderEquals(IndexComponents indexComponents, BKDPostingsIndex postingsIndex, int nodeID, int[] postings) throws IOException
+    {
+        final IndexInput postingsInput = indexComponents.openBlockingInput(indexComponents.kdTreePostingLists);
+        final long filePointer = postingsIndex.getPostingsFilePointer(nodeID); // where this node's postings start
+        assertPostingReaderEquals(postingsInput, filePointer, new ArrayPostingList(postings));
+    }
+
+    private void assertPostingReaderEquals(IndexInput input, long offset, PostingList expected) throws IOException
+    {
+        try (PostingsReader actual = new PostingsReader(input, offset, NO_OP_POSTINGS_LISTENER)) // reader is closed on exit
+        {
+            assertPostingListEquals(expected, actual);
+        }
+    }
+
+    private PackedLongValues postings(int... postings)
+    {
+        // Packs the given row IDs, in the order supplied, into the delta-packed form that
+        // OneDimBKDPostingsWriter takes for its leaves.
+        final PackedLongValues.Builder packed = PackedLongValues.deltaPackedBuilder(PackedInts.COMPACT);
+
+        Arrays.stream(postings).forEach(packed::add);
+        return packed.build();
+    }
+
+    private IntArrayList pathToRoot(int... nodes)
+    {
+        final IntArrayList nodePath = new IntArrayList(); // node IDs in exactly the order given by the caller
+        for (int i = 0; i < nodes.length; i++)
+        {
+            nodePath.add(nodes[i]);
+        }
+        return nodePath;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
new file mode 100644
index 000000000000..f39443e3e550
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
@@ -0,0 +1,349 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.IntStream;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.lucene.store.IndexInput;
+
+public class PostingsTest extends NdiRandomizedTest
+{
+    @Rule
+    public final ExpectedException expectedException = ExpectedException.none();
+
+    @Test
+    public void testSingleBlockPostingList() throws Exception
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final int blockSize = 1 << between(3, 8);
+        final ArrayPostingList expectedPostingList = new ArrayPostingList(new int[]{ 10, 20, 30, 40, 50, 60 });
+        // Write the six postings as one list; the writer returns the pointer used to read it back.
+        long postingPointer;
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, blockSize, false))
+        {
+            postingPointer = writer.write(expectedPostingList);
+            writer.complete();
+        }
+        // Validate the written component, then position the input at the returned pointer.
+        IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists);
+        SAICodecUtils.validate(input);
+        input.seek(postingPointer);
+        // The summary read at that position must describe exactly one block.
+        final PostingsReader.BlocksSummary summary = assertBlockSummary(blockSize, expectedPostingList, input);
+        assertEquals(1, summary.offsets.length());
+        // First pass: plain iteration must reproduce every posting and ordinal without any advance event.
+        CountingPostingListEventListener listener = new CountingPostingListEventListener();
+        try (PostingsReader reader = new PostingsReader(input, postingPointer, listener))
+        {
+            expectedPostingList.reset();
+            assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
+            assertEquals(expectedPostingList.size(), reader.size());
+
+            long actualRowID;
+            while ((actualRowID = reader.nextPosting()) != PostingList.END_OF_STREAM)
+            {
+                assertEquals(expectedPostingList.nextPosting(), actualRowID);
+                assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
+            }
+            assertEquals(PostingList.END_OF_STREAM, expectedPostingList.nextPosting());
+            assertEquals(0, listener.advances);
+            assertEquals(reader.size(), listener.decodes);
+        }
+        // Second pass on a fresh input and listener: exercise advance() and watch the decode counter.
+        input = indexComponents.openBlockingInput(indexComponents.postingLists);
+        listener = new CountingPostingListEventListener();
+        try (PostingsReader reader = new PostingsReader(input, postingPointer, listener))
+        {
+            assertEquals(0, listener.decodes); // nothing is decoded up-front
+            assertEquals(50, reader.advance(45));
+            assertEquals(5, listener.decodes); // slow advance also decodes
+            assertEquals(60, reader.advance(60));
+            assertEquals(6, listener.decodes); // slow advance also decodes
+            assertEquals(PostingList.END_OF_STREAM, reader.nextPosting());
+            assertEquals(reader.size(), listener.decodes); // nothing more was decoded
+            assertEquals(2, listener.advances);
+        }
+    }
+
+    @Test
+    public void testMultiBlockPostingList() throws Exception
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final int numPostingLists = 1 << between(1, 5);
+        final int blockSize = 1 << between(5, 10);
+        final int numPostings = between(1 << 11, 1 << 15);
+        final ArrayPostingList[] expected = new ArrayPostingList[numPostingLists];
+        final long[] postingPointers = new long[numPostingLists];
+        // Write every randomly generated list with one writer, recording the pointer returned for each.
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, blockSize, false))
+        {
+            for (int i = 0; i < numPostingLists; ++i)
+            {
+                final int[] postings = randomPostings(numPostings);
+                final ArrayPostingList postingList = new ArrayPostingList(postings);
+                expected[i] = postingList;
+                postingPointers[i] = writer.write(postingList);
+            }
+            writer.complete();
+        }
+        // Validate the written postings component once before reading the lists back.
+        try (IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists))
+        {
+            SAICodecUtils.validate(input);
+        }
+
+        for (int i = 0; i < numPostingLists; ++i)
+        {
+            IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists);
+            input.seek(postingPointers[i]);
+            final ArrayPostingList expectedPostingList = expected[i];
+            final PostingsReader.BlocksSummary summary = assertBlockSummary(blockSize, expectedPostingList, input);
+            assertTrue(summary.offsets.length() > 1);
+            // The same listener instance is shared by both readers below, so its counters accumulate.
+            final CountingPostingListEventListener listener = new CountingPostingListEventListener();
+            try (PostingsReader reader = new PostingsReader(input, postingPointers[i], listener))
+            {
+                expectedPostingList.reset();
+                assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
+                assertEquals(expectedPostingList.size(), reader.size());
+
+                assertPostingListEquals(expectedPostingList, reader);
+                assertEquals(0, listener.advances);
+            }
+
+            // test skipping towards the end of the list (intended to reach the last block)
+            input = indexComponents.openBlockingInput(indexComponents.postingLists);
+            try (PostingsReader reader = new PostingsReader(input, postingPointers[i], listener))
+            {
+                long tokenToAdvance = -1;
+                expectedPostingList.reset();
+                for (int p = 0; p < numPostings - 7; ++p)
+                {
+                    tokenToAdvance = expectedPostingList.nextPosting();
+                }
+                // Rewind the reference list and advance both to that value; they must return the same posting.
+                expectedPostingList.reset();
+                assertEquals(expectedPostingList.advance(tokenToAdvance),
+                             reader.advance(tokenToAdvance));
+
+                assertPostingListEquals(expectedPostingList, reader);
+                assertEquals(1, listener.advances);
+            }
+        }
+    }
+
+    @Test
+    public void testAdvance() throws Exception
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final int blockSize = 4; // 4 postings per FoR block
+        final int maxSegmentRowID = 30;
+        final int[] postings = IntStream.range(0, maxSegmentRowID).toArray(); // 30 postings = 7 FoR blocks + 1 VLong block
+        final ArrayPostingList expected = new ArrayPostingList(postings);
+        // Write the consecutive row IDs 0..29 and keep the pointer returned by the writer.
+        long fp;
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, blockSize, false))
+        {
+            fp = writer.write(expected);
+            writer.complete();
+        }
+        // The block summary must report ceil(30 / 4) blocks, each with the expected maximum value.
+        try (IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists))
+        {
+            SAICodecUtils.validate(input);
+            input.seek(fp);
+
+            final PostingsReader.BlocksSummary summary = assertBlockSummary(blockSize, expected, input);
+            assertEquals((int) Math.ceil((double) maxSegmentRowID / blockSize), summary.offsets.length());
+
+            for (int i = 0; i < summary.maxValues.length(); i++)
+            {
+                assertEquals(Math.min(maxSegmentRowID - 1, (i + 1) * blockSize - 1), summary.maxValues.get(i));
+            }
+        }
+        // Each call below opens its own reader and advances through the given targets in order.
+        // exact advance
+        testAdvance(indexComponents, fp, expected, new int[]{ 3, 7, 11, 15, 19 });
+        // non-exact advance
+        testAdvance(indexComponents, fp, expected, new int[]{ 2, 6, 12, 17, 25 });
+
+        // exact advance
+        testAdvance(indexComponents, fp, expected, new int[]{ 3, 5, 7, 12 });
+        // non-exact advance
+        testAdvance(indexComponents, fp, expected, new int[]{ 2, 7, 9, 11 });
+    }
+
+    @Test
+    public void testAdvanceOnRandomizedData() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final int blockSize = 4;
+        final int numPostings = nextInt(64, 64_000);
+        final int[] postings = randomPostings(numPostings);
+
+        final ArrayPostingList expected = new ArrayPostingList(postings);
+        // Write the random list and keep the pointer returned by the writer.
+        long fp;
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, blockSize, false))
+        {
+            fp = writer.write(expected);
+            writer.complete();
+        }
+        // The summary must report ceil(numPostings / blockSize) blocks whose max values match the input array.
+        try (IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists))
+        {
+            SAICodecUtils.validate(input);
+            input.seek(fp);
+
+            final PostingsReader.BlocksSummary summary = assertBlockSummary(blockSize, expected, input);
+            assertEquals((int) Math.ceil((double) numPostings / blockSize), summary.offsets.length());
+
+            for (int i = 0; i < summary.maxValues.length(); i++)
+            {
+                assertEquals(postings[Math.min(numPostings - 1, (i + 1) * blockSize - 1)], summary.maxValues.get(i));
+            }
+        }
+        // Every written posting, in order, is used as an advance target.
+        testAdvance(indexComponents, fp, expected, postings);
+    }
+
+    @Test
+    public void testNullPostingList() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, false))
+        {
+            expectedException.expect(IllegalArgumentException.class);
+            writer.write(null);
+            // no complete() call: the write above is expected to throw, as in the other negative tests of this class
+        }
+    }
+
+    @Test
+    public void testEmptyPostingList() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, false))
+        {
+            expectedException.expect(IllegalArgumentException.class);
+            writer.write(new ArrayPostingList(new int[0])); // a list with no postings is expected to be rejected
+        }
+    }
+
+    @Test
+    public void testNonAscendingPostingList() throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, false))
+        {
+            expectedException.expect(IllegalArgumentException.class);
+            writer.write(new ArrayPostingList(new int[]{ 1, 0 })); // descending row IDs are expected to be rejected
+        }
+    }
+
+    private void testAdvance(IndexComponents indexComponents, long fp, ArrayPostingList expected, int[] targetIDs) throws IOException
+    {
+        expected.reset();
+        final CountingPostingListEventListener listener = new CountingPostingListEventListener();
+        try (PostingsReader reader = openReader(indexComponents, fp, listener))
+        {
+            for (int i = 0; i < 2; ++i) // consume two postings before the first advance
+            {
+                assertEquals(expected.nextPosting(), reader.nextPosting());
+                assertEquals(expected.getOrdinal(), reader.getOrdinal());
+            }
+
+            // If all postings in a block have the same value, we don't actually decode any deltas ;)
+            if (expected.getPostingAt(0) != expected.getPostingAt(reader.getBlockSize() - 1))
+            {
+                assertEquals(2, listener.decodes);
+            }
+            // Advance to each target in turn; row ID and ordinal must match the reference list every time.
+            for (int target : targetIDs)
+            {
+                final long actualRowId = reader.advance(target);
+                final long expectedRowId = expected.advance(target);
+
+                assertEquals(expectedRowId, actualRowId);
+
+                assertEquals(expected.getOrdinal(), reader.getOrdinal());
+            }
+
+            // check if iterator is correctly positioned
+            assertPostingListEquals(expected, reader);
+            // check if reader emitted all events
+            assertEquals(targetIDs.length, listener.advances);
+        }
+    }
+
+    private PostingsReader openReader(IndexComponents indexComponents, long fp, QueryEventListener.PostingListEventListener listener) throws IOException
+    {
+        IndexInput input = indexComponents.openBlockingInput(indexComponents.postingLists);
+        input.seek(fp); // position the fresh input at the list's file pointer
+        return new PostingsReader(input, fp, listener); // NOTE(review): presumably the reader owns and closes the input - confirm
+    }
+
+    private PostingsReader.BlocksSummary assertBlockSummary(int blockSize, PostingList expected, IndexInput input) throws IOException
+    {
+        final PostingsReader.BlocksSummary summary = new PostingsReader.BlocksSummary(input, input.getFilePointer()); // read at the input's current position
+        assertEquals(blockSize, summary.blockSize);
+        assertEquals(expected.size(), summary.numPostings);
+        assertTrue(summary.offsets.length() > 0);
+        assertEquals(summary.offsets.length(), summary.maxValues.length()); // one max value per block offset
+        return summary; // returned so callers can make further assertions
+    }
+
+    private int[] randomPostings(int numPostings)
+    {
+        final AtomicInteger rowId = new AtomicInteger();
+        // starts at 0 and grows by a random step from randomIntBetween(0, 4); a step of 0 yields duplicate postings
+        return IntStream.generate(() -> rowId.getAndAdd(randomIntBetween(0, 4)))
+                        .limit(numPostings)
+                        .toArray();
+    }
+
+    static class CountingPostingListEventListener implements QueryEventListener.PostingListEventListener
+    {
+        int advances; // incremented once per onAdvance() callback
+        int decodes; // incremented once per onPostingDecoded() callback
+
+        @Override
+        public void onAdvance()
+        {
+            advances++;
+        }
+
+        @Override
+        public void onPostingDecoded()
+        {
+            decodes++;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/SorterTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/SorterTest.java
new file mode 100644
index 000000000000..3f5bc781986e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/SorterTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.lucene.util.IntroSorter;
+import org.apache.lucene.util.Sorter;
+
+import static org.junit.Assert.assertTrue;
+
+/** Checks that Lucene's {@link IntroSorter} restores a shuffled identity array to ascending order. */
+public class SorterTest
+{
+    @Test
+    public void test() throws Exception
+    {
+        final int[] array = new int[100];
+        for (int x=0; x < array.length; x++)
+        {
+            array[x] = x;
+        }
+
+        int[] sortedArray = Arrays.copyOf(array, array.length);
+
+        NdiRandomizedTest.shuffle(array);
+
+        System.out.println("shuffle array="+ Arrays.toString(array));
+
+        final Sorter sorter = new IntroSorter() {
+            int pivotDoc;
+
+            @Override
+            protected void swap(int i, int j) {
+                int o = array[i];
+                array[i] = array[j];
+                array[j] = o;
+            }
+
+            @Override
+            protected void setPivot(int i)
+            {
+                pivotDoc = array[i];
+            }
+
+            @Override
+            protected int comparePivot(int j) {
+                // Integer.compare instead of subtraction: the difference of two ints can overflow
+                return Integer.compare(pivotDoc, array[j]);
+            }
+        };
+
+        sorter.sort(0, array.length);
+
+        System.out.println("sorted array="+ Arrays.toString(array));
+
+        assertTrue(Arrays.equals(sortedArray, array));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/TermsReaderTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/TermsReaderTest.java
new file mode 100644
index 000000000000..900be8d8638c
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/TermsReaderTest.java
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.Test;
+
+import com.carrotsearch.hppc.IntArrayList;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.disk.MemtableTermsIterator;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.TermsIterator;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.index.sai.utils.SAICodecUtils;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;
+
+import static org.apache.cassandra.index.sai.disk.InvertedIndexBuilder.buildStringTermsEnum;
+import static org.apache.cassandra.index.sai.metrics.QueryEventListeners.NO_OP_TRIE_LISTENER;
+
+public class TermsReaderTest extends NdiRandomizedTest
+{
+    @Test
+    public void testTermQueriesAgainstShortPostingLists() throws IOException
+    {
+        testTermQueries(randomIntBetween(5, 10), randomIntBetween(5, 10)); // arguments are (numTerms, numPostings)
+    }
+
+    @Test
+    public void testTermQueriesAgainstLongPostingLists() throws  IOException
+    {
+        testTermQueries(randomIntBetween(512, 1024), randomIntBetween(1024, 2048)); // arguments are (numTerms, numPostings)
+    }
+
+    @Test
+    public void testTermsIteration() throws IOException
+    {
+        doTestTermsIteration(); // the scenario itself lives in the private helper
+    }
+
+    private void doTestTermsIteration() throws IOException
+    {
+        final int terms = 70, postings = 2;
+        final IndexComponents indexComponents = newIndexComponents();
+        final List<Pair<ByteComparable, IntArrayList>> termsEnum = buildTermsEnum(terms, postings);
+        // Write all generated terms through the inverted index writer and keep the returned metadata.
+        SegmentMetadata.ComponentMetadataMap indexMetas;
+        try (InvertedIndexWriter writer = new InvertedIndexWriter(indexComponents, false))
+        {
+            indexMetas = writer.writeAll(new MemtableTermsIterator(null, null, termsEnum.iterator()));
+        }
+
+        FileHandle termsData = indexComponents.createFileHandle(indexComponents.termsData);
+        FileHandle postingLists = indexComponents.createFileHandle(indexComponents.postingLists);
+        long termsFooterPointer = Long.parseLong(indexMetas.get(IndexComponents.NDIType.TERMS_DATA).attributes.get(SAICodecUtils.FOOTER_POINTER));
+
+        try (TermsReader reader = new TermsReader(indexComponents, termsData, postingLists,
+                                                  indexMetas.get(indexComponents.termsData.ndiType).root, termsFooterPointer))
+        {
+            try (TermsIterator actualTermsEnum = reader.allTerms(0, NO_OP_TRIE_LISTENER))
+            {
+                int i = 0;
+                for (ByteComparable term = actualTermsEnum.next(); term != null; term = actualTermsEnum.next())
+                {
+                    final ByteComparable expected = termsEnum.get(i++).left;
+                    assertEquals(0, ByteComparable.compare(expected, term, ByteComparable.Version.OSS41));
+                }
+                assertEquals("terms returned by the iterator", termsEnum.size(), i); // fails if iteration stops early
+            }
+        }
+    }
+
+    private void testTermQueries(int numTerms, int numPostings) throws IOException
+    {
+        final IndexComponents indexComponents = newIndexComponents();
+        final List<Pair<ByteComparable, IntArrayList>> termsEnum = buildTermsEnum(numTerms, numPostings);
+        // Write all generated terms through the inverted index writer and keep the returned metadata.
+        SegmentMetadata.ComponentMetadataMap indexMetas;
+        try (InvertedIndexWriter writer = new InvertedIndexWriter(indexComponents, false))
+        {
+            indexMetas = writer.writeAll(new MemtableTermsIterator(null, null, termsEnum.iterator()));
+        }
+
+        FileHandle termsData = indexComponents.createFileHandle(indexComponents.termsData);
+        FileHandle postingLists = indexComponents.createFileHandle(indexComponents.postingLists);
+        // The footer pointer is stored as a string attribute in the terms-data component metadata.
+        long termsFooterPointer = Long.parseLong(indexMetas.get(IndexComponents.NDIType.TERMS_DATA).attributes.get(SAICodecUtils.FOOTER_POINTER));
+
+        try (TermsReader reader = new TermsReader(indexComponents, termsData, postingLists,
+                                                  indexMetas.get(indexComponents.termsData.ndiType).root, termsFooterPointer))
+        {
+            for (Pair<ByteComparable, IntArrayList> pair : termsEnum)
+            {
+                final byte[] bytes = ByteSourceInverse.readBytes(pair.left.asComparableBytes(ByteComparable.Version.OSS41));
+                try (PostingList actualPostingList = reader.exactMatch(ByteComparable.fixedLength(bytes), NO_OP_TRIE_LISTENER, new QueryContext()))
+                {
+                    final IntArrayList expectedPostingList = pair.right;
+                    // full iteration: same size, same postings in the same order, then END_OF_STREAM
+                    assertNotNull(actualPostingList);
+                    assertEquals(expectedPostingList.size(), actualPostingList.size());
+
+                    for (int i = 0; i < expectedPostingList.size(); ++i)
+                    {
+                        final long expectedRowID = expectedPostingList.get(i);
+                        long result = actualPostingList.nextPosting();
+                        assertEquals(expectedRowID, result);
+                    }
+
+                    long lastResult = actualPostingList.nextPosting();
+                    assertEquals(PostingList.END_OF_STREAM, lastResult);
+                }
+
+                // test skipping
+                try (PostingList actualPostingList = reader.exactMatch(ByteComparable.fixedLength(bytes), NO_OP_TRIE_LISTENER, new QueryContext()))
+                {
+                    final IntArrayList expectedPostingList = pair.right;
+                    // test skipping to the last block
+                    final int idxToSkip = numPostings - 2;
+                    // tokens are equal to their corresponding row IDs
+                    final int tokenToSkip = expectedPostingList.get(idxToSkip);
+
+                    long advanceResult = actualPostingList.advance(tokenToSkip);
+                    assertEquals(tokenToSkip, advanceResult);
+                    // everything after the advance target must still be returned in order
+                    for (int i = idxToSkip + 1; i < expectedPostingList.size(); ++i)
+                    {
+                        final long expectedRowID = expectedPostingList.get(i);
+                        long result = actualPostingList.nextPosting();
+                        assertEquals(expectedRowID, result);
+                    }
+
+                    long lastResult = actualPostingList.nextPosting();
+                    assertEquals(PostingList.END_OF_STREAM, lastResult);
+                }
+            }
+        }
+    }
+
+    private List<Pair<ByteComparable, IntArrayList>> buildTermsEnum(int terms, int postings)
+    {
+        return buildStringTermsEnum(terms, postings, () -> randomSimpleString(4, 10), () -> nextInt(0, Integer.MAX_VALUE)); // NOTE(review): presumably term and row-ID suppliers - confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
new file mode 100644
index 000000000000..fa743a7735e4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.disk.v1;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.apache.commons.lang3.mutable.MutableLong;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.apache.cassandra.utils.bytecomparable.ByteComparable.Version.OSS41;
+import static org.apache.cassandra.utils.bytecomparable.ByteComparable.compare;
+
+public class TrieTermsDictionaryTest extends NdiRandomizedTest
+{
+    @Test
+    public void testExactMatch() throws Exception
+    {
+        doTestExactMatch(); // the scenario itself lives in the private helper
+    }
+
+    private void doTestExactMatch() throws Exception
+    {
+        final IndexComponents components = newIndexComponents();
+        long fp;
+        try (TrieTermsDictionaryWriter writer = new TrieTermsDictionaryWriter(components, false))
+        {
+            writer.add(asByteComparable("ab"), 0);
+            writer.add(asByteComparable("abb"), 1);
+            writer.add(asByteComparable("abc"), 2);
+            writer.add(asByteComparable("abcd"), 3);
+            writer.add(asByteComparable("abd"), 4);
+            fp = writer.complete(new MutableLong());
+        }
+        try (FileHandle input = components.createFileHandle(components.termsData);
+             TrieTermsDictionaryReader reader = new TrieTermsDictionaryReader(input.instantiateRebufferer(), fp))
+        {
+            assertEquals(TrieTermsDictionaryReader.NOT_FOUND, reader.exactMatch(asByteComparable("a")));
+            assertEquals(0, reader.exactMatch(asByteComparable("ab")));
+            assertEquals(2, reader.exactMatch(asByteComparable("abc")));
+            assertEquals(TrieTermsDictionaryReader.NOT_FOUND, reader.exactMatch(asByteComparable("abca")));
+            assertEquals(1, reader.exactMatch(asByteComparable("abb")));
+            assertEquals(TrieTermsDictionaryReader.NOT_FOUND, reader.exactMatch(asByteComparable("abba")));
+            assertEquals(3, reader.exactMatch(asByteComparable("abcd"))); // longest written term
+            assertEquals(4, reader.exactMatch(asByteComparable("abd"))); // last written term
+        }
+    }
+
+    @Test
+    public void testTermEnum() throws IOException
+    {
+        final IndexComponents components = newIndexComponents();
+        final int numKeys = randomIntBetween(16, 512);
+        final List<ByteComparable> byteComparables = Stream.generate(() -> randomSimpleString(4, 48))
+                                                           .limit(numKeys)
+                                                           .sorted()
+                                                           .map(this::asByteComparable)
+                                                           .collect(Collectors.toList());
+        // Add the sorted terms, each with its list index as the associated value.
+        long fp;
+        try (TrieTermsDictionaryWriter writer = new TrieTermsDictionaryWriter(components, false))
+        {
+            for (int i = 0; i < byteComparables.size(); ++i)
+            {
+                writer.add(byteComparables.get(i), i);
+            }
+            fp = writer.complete(new MutableLong());
+        }
+        // Iterating the reader must yield the same terms in the same order with values 0, 1, 2, ...
+        try (FileHandle input = components.createFileHandle(components.termsData);
+             TrieTermsDictionaryReader reader = new TrieTermsDictionaryReader(input.instantiateRebufferer(), fp))
+        {
+            final Iterator<Pair<ByteComparable, Long>> iterator = reader.iterator();
+            final Iterator<ByteComparable> expected = byteComparables.iterator();
+            int offset = 0;
+            while (iterator.hasNext())
+            {
+                assertTrue(expected.hasNext());
+                final Pair<ByteComparable, Long> actual = iterator.next();
+
+                assertEquals(0, compare(expected.next(), actual.left, OSS41));
+                assertEquals(offset++, actual.right.longValue());
+            }
+            assertFalse(expected.hasNext()); // the reader must not stop before the expected terms run out
+        }
+    }
+
+    @Test
+    public void testMinMaxTerm() throws IOException
+    {
+        final IndexComponents components = newIndexComponents();
+        final int numKeys = randomIntBetween(16, 512);
+        final List<ByteComparable> byteComparables = Stream.generate(() -> randomSimpleString(4, 48))
+                                                           .limit(numKeys)
+                                                           .sorted()
+                                                           .map(this::asByteComparable)
+                                                           .collect(Collectors.toList());
+        // Add the sorted terms, each with its list index as the associated value.
+        long fp;
+        try (TrieTermsDictionaryWriter writer = new TrieTermsDictionaryWriter(components, false))
+        {
+            for (int i = 0; i < byteComparables.size(); ++i)
+            {
+                writer.add(byteComparables.get(i), i);
+            }
+            fp = writer.complete(new MutableLong());
+        }
+        // The list is sorted, so its last and first elements are the expected max and min terms.
+        try (FileHandle input = components.createFileHandle(components.termsData);
+             TrieTermsDictionaryReader reader = new TrieTermsDictionaryReader(input.instantiateRebufferer(), fp))
+        {
+            final ByteComparable expectedMaxTerm = byteComparables.get(byteComparables.size() - 1);
+            final ByteComparable actualMaxTerm = reader.getMaxTerm();
+            assertEquals(0, compare(expectedMaxTerm, actualMaxTerm, OSS41));
+
+            final ByteComparable expectedMinTerm = byteComparables.get(0);
+            final ByteComparable actualMinTerm = reader.getMinTerm();
+            assertEquals(0, compare(expectedMinTerm, actualMinTerm, OSS41));
+        }
+    }
+
+    private ByteComparable asByteComparable(String s)
+    {
+        return ByteComparable.fixedLength(ByteBufferUtil.bytes(s)); // wraps the string's bytes as a fixed-length comparable
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java b/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
new file mode 100644
index 000000000000..1e1e2138aa21
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
@@ -0,0 +1,362 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Objects;
+import java.util.Set;
+import java.util.UUID;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.CompactionInterruptedException;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.index.sai.disk.SSTableIndexWriter;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.inject.ActionBuilder;
+import org.apache.cassandra.inject.Expression;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.locator.RangesAtEndpoint;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.streaming.PreviewKind;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.Refs;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/** Functional tests for storage-attached index behaviour around anti-compaction, compaction and concurrent index build/drop. */
+public class CompactionTest extends SAITester
+{
+    // Anti-compaction that splits one SSTable in two must leave both resulting SSTables with queryable index files.
+    @Test
+    public void testAntiCompaction() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String indexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        verifyIndexFiles(0, 0);
+
+        // create 100 rows in 1 sstable
+        int num = 100;
+        for (int i = 0; i < num; i++)
+            execute("INSERT INTO %s (id1, v1) VALUES (?, 0)", Integer.toString(i));
+        flush();
+
+        // verify 1 sstable index
+        assertNumRows(num, "SELECT * FROM %%s WHERE v1 >= 0");
+        verifyIndexFiles(1, 0);
+        verifySSTableIndexes(indexName, 1);
+
+        // split sstable into repaired and unrepaired
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
+        Range<Token> range = new Range<>(DatabaseDescriptor.getPartitioner().getMinimumToken(),
+                                         DatabaseDescriptor.getPartitioner().getToken(ByteBufferUtil.bytes("30")));
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
+        try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
+             Refs<SSTableReader> refs = Refs.ref(sstables))
+        {
+            InetAddressAndPort endpoint = InetAddressAndPort.getByName("10.0.0.1");
+            UUID parentRepairSession = UUID.randomUUID();
+            ActiveRepairService.instance.registerParentRepairSession(parentRepairSession,
+                                                                     endpoint,
+                                                                     Lists.newArrayList(cfs),
+                                                                     Collections.singleton(range),
+                                                                     true,
+                                                                     1000,
+                                                                     false,
+                                                                     PreviewKind.NONE);
+            RangesAtEndpoint replicas = RangesAtEndpoint.builder(endpoint).add(Replica.fullReplica(endpoint, range)).build();
+            CompactionManager.instance.performAnticompaction(cfs, replicas, refs, txn, parentRepairSession, () -> false);
+        }
+
+        // verify 2 sstable indexes
+        assertNumRows(num, "SELECT * FROM %%s WHERE v1 >= 0");
+        waitForAssert(() -> verifyIndexFiles(2, 0));
+        verifySSTableIndexes(indexName, 2);
+
+        // index components are included after anti-compaction
+        verifyIndexComponentsIncludedInSSTable();
+    }
+
+    // Index queries issued while upgradeSSTables() rewrites the SSTables must keep returning every row.
+    @Test
+    public void testConcurrentQueryWithCompaction() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+
+        int num = 10;
+        for (int i = 0; i < num; i++)
+        {
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+            flush();
+        }
+
+        TestWithConcurrentVerification compactionTest = new TestWithConcurrentVerification(() -> {
+            for (int i = 0; i < 30; i++)
+            {
+                try
+                {
+                    assertNumRows(num, "SELECT id1 FROM %%s WHERE v1>=0");
+                    assertNumRows(num, "SELECT id1 FROM %%s WHERE v2='0'");
+                }
+                catch (Throwable e)
+                {
+                    throw new RuntimeException(e);
+                }
+            }
+        }, () -> upgradeSSTables());
+
+        compactionTest.start();
+
+        verifySSTableIndexes(v1IndexName, num);
+        verifySSTableIndexes(v2IndexName, num);
+    }
+
+    // A compaction aborted by an injected failure after early open must leave the original SSTable indexes queryable.
+    @Test
+    public void testAbortCompactionWithEarlyOpenSSTables() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        int sstables = 2;
+        int num = 10;
+        for (int i = 0; i < num; i++)
+        {
+            if (i == num / sstables)
+                flush();
+
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+        }
+        flush();
+
+        // make sure early open is triggered
+        Injections.Counter earlyOpenCounter = Injections.newCounter("early_open_counter")
+                                                        .add(InvokePointBuilder.newInvokePoint().onClass(LifecycleTransaction.class).onMethod("checkpoint"))
+                                                        .build();
+
+        // abort compaction
+        String errMessage = "Injected failure!";
+        Injection failSSTableCompaction = Injections.newCustom("fail_sstable_compaction")
+                                                    .add(InvokePointBuilder.newInvokePoint().onClass(SSTableWriter.class).onMethod("prepareToCommit"))
+                                                    .add(ActionBuilder.newActionBuilder().actions().doThrow(RuntimeException.class, Expression.quote(errMessage)))
+                                                    .build();
+
+        try
+        {
+            Injections.inject(failSSTableCompaction, earlyOpenCounter);
+
+            compact();
+            fail("Expected compaction being interrupted");
+        }
+        catch (Throwable e)
+        {
+            while (e.getCause() != null)
+                e = e.getCause();
+
+            assertTrue(String.format("Expected %s, but got %s", errMessage, e.getMessage()), e.getMessage().contains(errMessage));
+        }
+        finally
+        {
+            earlyOpenCounter.disable();
+            failSSTableCompaction.disable();
+        }
+        assertNotEquals(0, earlyOpenCounter.get());
+
+        // verify indexes are working
+        assertNumRows(num, "SELECT id1 FROM %%s WHERE v1=0");
+        assertNumRows(num, "SELECT id1 FROM %%s WHERE v2='0'");
+        verifySSTableIndexes(v1IndexName, sstables);
+        verifySSTableIndexes(v2IndexName, sstables);
+    }
+
+    // Building indexes while an SSTable rewrite is paused interrupts that rewrite; the new indexes must still cover all rows.
+    @Test
+    public void testConcurrentIndexBuildWithCompaction() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        // prepare data into one sstable
+        int sstables = 1;
+        int num = 100;
+        for (int i = 0; i < num; i++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+        flush();
+
+        Injections.Barrier compactionLatch =
+        Injections.newBarrier("pause_compaction", 2, false)
+                  .add(InvokePointBuilder.newInvokePoint().onClass(BigTableWriter.class).onMethod("afterAppend"))
+                  .build();
+
+        try
+        {
+            // stop in-progress compaction
+            Injections.inject(compactionLatch);
+
+            TestWithConcurrentVerification compactionTask = new TestWithConcurrentVerification(
+                    () -> {
+                        try
+                        {
+                            upgradeSSTables();
+                            fail("Expected CompactionInterruptedException");
+                        }
+                        catch (Exception e)
+                        {
+                            assertTrue("Expected CompactionInterruptedException, but got " + e,
+                                       Throwables.isCausedBy(e, CompactionInterruptedException.class));
+                        }
+                    },
+                    () -> {
+                        try
+                        {
+                            waitForAssert(() -> Assert.assertEquals(1, compactionLatch.getCount()));
+
+                            // build indexes on SSTables that will be compacted soon
+                            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+                            createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+                            waitForIndexQueryable();
+
+                            // continue in-progress compaction
+                            compactionLatch.countDown();
+                        }
+                        catch (Exception e)
+                        {
+                            throw new RuntimeException(e);
+                        }
+                    }, -1 // run verification task once
+            );
+
+            compactionTask.start();
+        }
+        finally
+        {
+            compactionLatch.disable();
+        }
+
+        assertNumRows(num, "SELECT id1 FROM %%s WHERE v1>=0");
+        assertNumRows(num, "SELECT id1 FROM %%s WHERE v2='0'");
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V1_COLUMN_IDENTIFIER), sstables);
+        verifySSTableIndexes(IndexMetadata.generateDefaultIndexName(currentTable(), V2_COLUMN_IDENTIFIER), sstables);
+    }
+
+    // Dropping the indexes while a compaction is paused in the index writer must release all index files and disk usage.
+    @Test
+    public void testConcurrentIndexDropWithCompaction() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        // Load data into a single SSTable...
+        int num = 100;
+        for (int i = 0; i < num; i++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 0, '0')", Integer.toString(i));
+        flush();
+
+        assertNotEquals(0, getOpenIndexFiles());
+        assertNotEquals(0, getDiskUsage());
+
+        Injections.Barrier compactionLatch =
+                Injections.newBarrier("pause_compaction_for_drop", 2, false)
+                          .add(InvokePointBuilder.newInvokePoint().onClass(SSTableIndexWriter.class).onMethod("addRow"))
+                          .build();
+        try
+        {
+            // pause in-progress compaction
+            Injections.inject(compactionLatch);
+
+            TestWithConcurrentVerification compactionTask = new TestWithConcurrentVerification(
+                    () -> upgradeSSTables(),
+                    () -> {
+                        try
+                        {
+                            waitForAssert(() -> Assert.assertEquals(1, compactionLatch.getCount()));
+
+                            // drop all indexes
+                            dropIndex("DROP INDEX %s." + v1IndexName);
+                            dropIndex("DROP INDEX %s." + v2IndexName);
+
+                            // continue in-progress compaction
+                            compactionLatch.countDown();
+                        }
+                        catch (Throwable e)
+                        {
+                            throw new RuntimeException(e);
+                        }
+                    }, -1 // run verification task once
+            );
+
+            compactionTask.start();
+            waitForCompactionsFinished();
+        }
+        finally
+        {
+            compactionLatch.disable();
+        }
+
+        // verify index group metrics are cleared.
+        assertEquals(0, getOpenIndexFiles());
+        assertEquals(0, getDiskUsage());
+
+        // verify indexes are dropped: without them the restricted queries are rejected as requiring ALLOW FILTERING
+        assertThatThrownBy(() -> executeNet("SELECT id1 FROM %s WHERE v1>=0"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+        assertThatThrownBy(() -> executeNet("SELECT id1 FROM %s WHERE v2='0'"))
+                .isInstanceOf(InvalidQueryException.class)
+                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/DiskSpaceTest.java b/test/unit/org/apache/cassandra/index/sai/functional/DiskSpaceTest.java
new file mode 100644
index 000000000000..25032b61c182
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/DiskSpaceTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks that the table's reported disk usage grows by the index size when an index is created and shrinks back on drop. */
+public class DiskSpaceTest extends SAITester
+{
+    // Total disk space must equal the SSTable data alone without an index, and data plus index size with one.
+    @Test
+    public void testTableTotalDiskSpaceUsed() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        final int rowCount = 1000;
+        for (int row = 0; row < rowCount; row++)
+            execute("INSERT INTO %s (id1, v1) VALUES (?, 1)", Integer.toString(row));
+        flush();
+
+        final long baselineBytes = totalDiskSpaceUsed();
+
+        // once the index is built, the total must grow by exactly the reported index size
+        final String v1Index = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+
+        final long indexBytes = indexDiskSpaceUse();
+        final long bytesWithIndex = totalDiskSpaceUsed();
+        assertEquals(baselineBytes + indexBytes, bytesWithIndex);
+        verifyIndexComponentsIncludedInSSTable();
+
+        // once the index is dropped, the total must fall back to the baseline and the SSTable must no longer list index components
+        dropIndex("DROP INDEX %s." + v1Index);
+        assertEquals(baselineBytes, totalDiskSpaceUsed());
+        verifyIndexComponentsNotIncludedInSSTable();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/DropTableTest.java b/test/unit/org/apache/cassandra/index/sai/functional/DropTableTest.java
new file mode 100644
index 000000000000..cf65b1cebdeb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/DropTableTest.java
@@ -0,0 +1,108 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Objects;
+
+import com.google.common.collect.Iterables;
+import org.junit.Test;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.inject.ActionBuilder;
+import org.apache.cassandra.inject.Expression;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.Schema;
+
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/** Verifies that dropping a table with storage-attached indexes removes every SSTable component file from disk. */
+public class DropTableTest extends SAITester
+{
+    // Dropping the table must delete all component files without going through SSTable#unregisterComponents.
+    @Test
+    public void testDropTableLifecycle() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+
+        int rows = 100;
+        for (int j = 0; j < rows; j++)
+            execute("INSERT INTO %s (id1, v1, v2) VALUES (?, 1 , '1')", Integer.toString(j));
+        flush();
+
+        verifyIndexComponentsIncludedInSSTable();
+
+        ColumnFamilyStore cfs = Objects.requireNonNull(Schema.instance.getKeyspaceInstance(KEYSPACE)).getColumnFamilyStore(currentTable());
+        SSTableReader sstable = Iterables.getOnlyElement(cfs.getLiveSSTables());
+
+        // remember the path of every component file that currently exists on disk
+        List<String> files = new ArrayList<>();
+        for (Component component : sstable.components)
+        {
+            File file = sstable.descriptor.fileFor(component);
+            if (file.exists())
+                files.add(file.getPath());
+        }
+
+        Injection failUnregisterComponents = Injections.newCustom("fail_unregister_components")
+                                                       .add(InvokePointBuilder.newInvokePoint().onClass(SSTable.class).onMethod("unregisterComponents"))
+                                                       .add(ActionBuilder.newActionBuilder().actions().doThrow(RuntimeException.class, Expression.quote("Injected failure!")))
+                                                       .build();
+        assertAllFileExists(files);
+
+        Injections.inject(failUnregisterComponents);
+
+        // drop table: on-disk files should be removed, and `SSTable#unregisterComponents` (made to throw above) should not be called
+        dropTable("DROP TABLE %s");
+
+        assertAllFileRemoved(files);
+    }
+
+    /**
+     * Asserts that every path in {@code filePaths} refers to a file that exists, failing on the first missing one.
+     */
+    void assertAllFileExists(List<String> filePaths) throws Exception
+    {
+        for (String path : filePaths)
+            assertTrue("Expect file exists, but it's removed: " + path, new File(path).exists());
+    }
+
+    /**
+     * Asserts that no path in {@code filePaths} refers to an existing file, failing on the first one still present.
+     */
+    void assertAllFileRemoved(List<String> filePaths) throws Exception
+    {
+        for (String path : filePaths)
+            assertFalse("Expect file being removed, but it still exists: " + path, new File(path).exists());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java b/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
new file mode 100644
index 000000000000..bfa776a89846
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
@@ -0,0 +1,123 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.IndexNotAvailableException;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.assertj.core.api.Assertions;
+
+import static org.junit.Assert.assertEquals;
+
+/** Verifies that a failure while creating an SSTableContext leaves the affected index non-queryable. */
+public class FailureTest extends SAITester
+{
+    @Test
+    public void shouldMakeIndexNonQueryableOnSSTableContextFailureDuringFlush() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 1)");
+        execute("INSERT INTO %s (id1, v1) VALUES ('2', 2)");
+        flush();
+
+        assertEquals(1, execute("SELECT id1 FROM %s WHERE v1 > 1").size());
+
+        verifyIndexFiles(1, 1, 0, 1);
+        verifySSTableIndexes(v1IndexName, 1, 1);
+
+        execute("INSERT INTO %s (id1, v1) VALUES ('3', 3)");
+
+        Injection ssTableContextCreationFailure = newFailureOnEntry("context_failure_on_flush", SSTableContext.class, "create", RuntimeException.class);
+        Injections.inject(ssTableContextCreationFailure);
+
+        flush();
+
+        // Verify that, while the node is still operational, the index is not.
+        Assertions.assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v1 > 1"))
+                  .isInstanceOf(IndexNotAvailableException.class);
+
+        ssTableContextCreationFailure.disable();
+
+        // Now verify that a restart actually repairs the index...
+        simulateNodeRestart();
+
+        verifyIndexFiles(2, 0);
+        verifySSTableIndexes(v1IndexName, 2, 2);
+
+        assertEquals(2, execute("SELECT id1 FROM %s WHERE v1 > 1").size());
+    }
+
+    @Test
+    public void shouldMakeIndexNonQueryableOnSSTableContextFailureDuringCompaction() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 1)");
+        flush();
+
+        execute("INSERT INTO %s (id1, v1) VALUES ('2', 2)");
+        flush();
+
+        assertEquals(1, execute("SELECT id1 FROM %s WHERE v1 > 1").size());
+
+        verifyIndexFiles(2, 2, 0, 2);
+        verifySSTableIndexes(v1IndexName, 2, 2);
+
+        Injection ssTableContextCreationFailure = newFailureOnEntry("context_failure_on_compaction", SSTableContext.class, "create", RuntimeException.class);
+        Injections.inject(ssTableContextCreationFailure);
+
+        compact();
+
+        // Verify that the index is not available.
+        Assertions.assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v1 > 1"))
+                  .isInstanceOf(IndexNotAvailableException.class);
+    }
+
+    @Test
+    public void shouldMakeIndexNonQueryableOnSSTableContextFailureDuringCreation() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 1)");
+        execute("INSERT INTO %s (id1, v1) VALUES ('2', 2)");
+
+        Injection ssTableContextCreationFailure = newFailureOnEntry("context_failure_on_creation", SSTableContext.class, "create", RuntimeException.class);
+        Injections.inject(ssTableContextCreationFailure);
+
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        // Verify that the initial index build fails...
+        verifyInitialIndexFailed(v1IndexName);
+        verifyIndexFiles(0, 0, 0, 0);
+        verifySSTableIndexes(v1IndexName, 0);
+
+        // ...and then verify that, while the node is still operational, the index is not.
+        Assertions.assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v1 > 1"))
+                  .isInstanceOf(IndexNotAvailableException.class);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/FlushingTest.java b/test/unit/org/apache/cassandra/index/sai/functional/FlushingTest.java
new file mode 100644
index 000000000000..0e8000c694a4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/FlushingTest.java
@@ -0,0 +1,80 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.v1.NumericIndexWriter;
+
+import static org.junit.Assert.assertEquals;
+
+/** Functional tests for flushing storage-attached index data when rows were overwritten or deleted before the flush. */
+public class FlushingTest extends SAITester
+{
+    // Overwriting one row more times than a leaf node holds points must still flush an index that returns that single row.
+    @Test
+    public void testFlushingLargeStaleMemtableIndex() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        // BKDWriter#valueCount is updated when leaf values are written at BKDWriter#writeLeafBlock on every
+        // BKDWriter#DEFAULT_MAX_POINTS_IN_LEAF_NODE (1024) number of points, see LUCENE-8765
+        int overwrites = NumericIndexWriter.MAX_POINTS_IN_LEAF_NODE + 1;
+        for (int j = 0; j < overwrites; j++)
+            execute("INSERT INTO %s (id1, v1) VALUES ('1', ?)", j);
+
+        flush();
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1>=0");
+        assertEquals(1, rows.all().size());
+    }
+
+    // Rows deleted before each flush must never be returned by the index, neither before nor after compaction.
+    @Test
+    public void testFlushingOverwriteDelete() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        int sstables = 3;
+        for (int j = 0; j < sstables; j++)
+        {
+            execute("INSERT INTO %s (id1, v1) VALUES (?, 1)", Integer.toString(j));
+            execute("DELETE FROM %s WHERE id1 = ?", Integer.toString(j));
+            flush();
+        }
+
+        ResultSet rows = executeNet("SELECT id1 FROM %s WHERE v1 >= 0");
+        assertEquals(0, rows.all().size());
+        verifyIndexFiles(sstables, 0, 0, sstables);
+        verifySSTableIndexes(v1IndexName, sstables, 0);
+
+        compact();
+        waitForAssert(() -> verifyIndexFiles(1, 0, 0, 1));
+
+        rows = executeNet("SELECT id1 FROM %s WHERE v1 >= 0");
+        assertEquals(0, rows.all().size());
+        verifySSTableIndexes(v1IndexName, 1, 0);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/NodeRestartTest.java b/test/unit/org/apache/cassandra/index/sai/functional/NodeRestartTest.java
new file mode 100644
index 000000000000..c22b57b6d444
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/NodeRestartTest.java
@@ -0,0 +1,181 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+
+import static org.junit.Assert.assertFalse;
+
+public class NodeRestartTest extends SAITester
+{
+    // Failure during the pre-join and initialization tasks shouldn't fail node restart.
+    @Test
+    public void shouldSurviveRestartWithPreJoinAndInitFailures() throws Throwable
+    {
+        createSingleRowIndex();
+        // Inject a RuntimeException failure on entry to StorageAttachedIndex#findNonIndexedSSTables:
+        Injection ssTableIndexValidationError = newFailureOnEntry("error_at_sstable_index_validation",
+                                                                  StorageAttachedIndex.class,
+                                                                  "findNonIndexedSSTables",
+                                                                  RuntimeException.class);
+
+        // This barrier is attached to the exceptional exit of startInitialBuild, so we can wait for the failing init task:
+        Injections.Barrier initTaskLatch =
+                Injections.newBarrier("failing_init_task_barrier", 1, false)
+                          .add(InvokePointBuilder.newInvokePoint().atExceptionExit().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild"))
+                          .build();
+
+        Injections.inject(ssTableIndexValidationError, initTaskLatch, perSSTableValidationCounter, perColumnValidationCounter);
+
+        simulateNodeRestart(false);
+
+        // Wait until the init task runs and fails...
+        initTaskLatch.await();
+
+        // The node should accept a simple query:
+        assertNumRows(1, "SELECT * FROM %%s");
+
+        // We should have completed no actual SSTable validations:
+        assertValidationCount(0, 0);
+        // ...and the index must not be reported as queryable after the failed initialization:
+        assertFalse(isIndexQueryable());
+    }
+
+    // We don't allow the node to actually join the ring before a valid index is ready to accept queries.
+    @Test
+    public void shouldQueryAfterRestartButBeforeInitializationTask() throws Throwable
+    {
+        createSingleRowIndex();
+
+        // This barrier prevents the index initialization task, which makes the index queryable, from running:
+        Injections.Barrier initTaskLatch =
+                Injections.newBarrier("pause_init_task_entry", 2, false)
+                          .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild"))
+                          .build();
+        // A second barrier, on the exit of the same method, lets us wait until the init task has finished:
+        Injections.Barrier initTaskLatchExit =
+                Injections.newBarrier("pause_init_task_exit", 1, false)
+                          .add(InvokePointBuilder.newInvokePoint().atExit().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild"))
+                          .build();
+
+        // Make sure we re-introduce existing counter injections...
+        Injections.inject(initTaskLatch, initTaskLatchExit, perSSTableValidationCounter, perColumnValidationCounter);
+
+        simulateNodeRestart(false);
+        // The entry barrier starts at 2; wait until its count has dropped to 1, i.e. the init task has reached it:
+        waitForAssert(() -> Assert.assertEquals(1, initTaskLatch.getCount()));
+
+        // If we do not make the index queryable before it starts accepting queries, this will fail:
+        assertNumRows(1, "SELECT * FROM %%s WHERE v1 >= 0");
+
+        // Allow the init task to run, and then wait for it to finish...
+        initTaskLatch.countDown();
+        initTaskLatchExit.await();
+
+        // This will fail if the init task doesn't skip validation (after the pre-join task has already run):
+        assertValidationCount(1, 1);
+    }
+
+    @Test
+    public void shouldRestartWithExistingNDIComponents() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+        // Flush one row before creating the indexes, so both are built over an already existing SSTable:
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0');");
+        flush();
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+        verifyIndexFiles(1, 1);
+        assertNumRows(1, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertNumRows(1, "SELECT * FROM %%s WHERE v2 = '0'");
+        assertValidationCount(0, 0);
+
+        simulateNodeRestart();
+        // The same index files and query results are expected once the node is back:
+        verifyIndexFiles(1, 1);
+
+        assertNumRows(1, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertNumRows(1, "SELECT * FROM %%s WHERE v2 = '0'");
+
+        waitForIndexQueryable();
+
+        // index components are included after restart
+        verifyIndexComponentsIncludedInSSTable();
+    }
+
+    // We skip validation in the pre-join task if the initialization task has already run and made the index queryable.
+    @Test
+    public void shouldAvoidPreJoinValidationIfInitTaskHasRun() throws Throwable
+    {
+        createSingleRowIndex();
+
+        //TODO We should be able to use a latch here to avoid having a pause
+        Injection preJoinPause =
+                Injections.newPause("pause_pre_join_task", 5000)
+                          .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("startPreJoinTask"))
+                          .build();
+
+        // Delay the pre-join task, thereby allowing the initialization task to run first:
+        Injections.Barrier preJoinTaskLatch =
+                Injections.newBarrierAwait("init_task_barrier", 1, false)
+                          .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class).onMethod("getPreJoinTask"))
+                          .build();
+
+        // Counts down "init_task_barrier" when startInitialBuild exits, i.e. once the initialization task has run:
+        Injections.Barrier initTaskLatch =
+                Injections.newBarrierCountDown("init_task_barrier", 1, false)
+                          .add(InvokePointBuilder.newInvokePoint().atExit().onClass(StorageAttachedIndex.class).onMethod("startInitialBuild"))
+                          .build();
+        // Make sure we re-introduce existing counter injections...
+        Injections.inject(preJoinPause, preJoinTaskLatch, initTaskLatch, perSSTableValidationCounter, perColumnValidationCounter);
+
+        simulateNodeRestart(false);
+
+        // This will fail if the pre-join task doesn't skip validation (after the init task has already run):
+        assertValidationCount(0, 0);
+        assertNumRows(1, "SELECT * FROM %%s WHERE v1 >= 0");
+    }
+
+    void createSingleRowIndex() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+        // Write and flush a single row before the index exists, then build the v1 index over that SSTable:
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+        flush();
+
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+        verifyIndexFiles(1, 0);
+        assertNumRows(1, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(0, 0);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/SnapshotTest.java b/test/unit/org/apache/cassandra/index/sai/functional/SnapshotTest.java
new file mode 100644
index 000000000000..5f6ff738a87f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/SnapshotTest.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.functional;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.inject.Injections;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+public class SnapshotTest extends SAITester
+{
+    @Before
+    public void injectCounters() throws Throwable
+    {
+        Injections.inject(perSSTableValidationCounter, perColumnValidationCounter);
+    }
+
+    @After
+    public void resetCounters() throws Throwable
+    {
+        resetValidationCount();
+    }
+
+    @Test
+    public void shouldTakeAndRestoreSnapshots() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+
+        // Insert some initial data and create the index over it
+        execute("INSERT INTO %s (id1, v1) VALUES ('0', 0);");
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+        flush();
+        verifyIndexFiles(1, 0);
+        assertValidationCount(0, 0);
+        resetValidationCount();
+
+        // Add some data into a second sstable
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 0);");
+        flush();
+        verifyIndexFiles(2, 0);
+        assertValidationCount(0, 0);
+
+        // Take a snapshot, recording the last-modified date of the index files
+        String snapshot = "s";
+        int numSnapshottedSSTables = snapshot(snapshot);
+        assertEquals(2, numSnapshottedSSTables);
+        long snapshotLastModified = indexFilesLastModified();
+
+        // File.lastModified can be truncated to one-second resolution, which may be coarser than the index build
+        // time, so we sleep for a full second to guarantee that the modification date of any overwritten index
+        // file will differ from that of the original file
+        Thread.sleep(1000);
+
+        // Add some data into a third sstable, out of the scope of our snapshot
+        execute("INSERT INTO %s (id1, v1) VALUES ('2', 0);");
+        flush();
+        verifyIndexFiles(3, 0);
+        assertNumRows(3, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(0, 0);
+
+        // Truncate the table
+        truncate(false);
+        waitForAssert(() -> verifyIndexFiles(0, 0));
+        assertNumRows(0, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(0, 0);
+
+        // Restore the snapshot; only the first two sstables should be restored
+        restoreSnapshot(snapshot);
+        verifyIndexFiles(2, 0);
+        assertEquals(snapshotLastModified, indexFilesLastModified());
+        assertNumRows(2, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(2, 2); // newly loaded
+
+        // index components are included after restore
+        verifyIndexComponentsIncludedInSSTable();
+
+        // Rebuild the index to verify that the index files are overwritten
+        rebuildIndexes(v1IndexName);
+        verifyIndexFiles(2, 0);
+        assertNotEquals(snapshotLastModified, indexFilesLastModified());
+        assertNumRows(2, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(2, 2); // counts unchanged: the rebuild should not validate
+
+        // index components are included after rebuild
+        verifyIndexComponentsIncludedInSSTable();
+    }
+
+    @Test
+    public void shouldSnapshotAfterIndexBuild() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        verifyIndexFiles(0, 0);
+
+        // Insert some initial data
+        execute("INSERT INTO %s (id1, v1) VALUES ('0', 0);");
+        flush();
+
+        // Add some data into a second sstable
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 0);");
+        flush();
+
+        // index components are not included
+        verifyIndexComponentsNotIncludedInSSTable();
+
+        // create index
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        waitForIndexQueryable();
+        verifyIndexFiles(2, 0);
+        assertValidationCount(0, 0);
+
+        // index components are included after initial build
+        verifyIndexComponentsIncludedInSSTable();
+
+        // Take a snapshot, recording the last-modified date of the index files
+        String snapshot = "s";
+        int numSnapshottedSSTables = snapshot(snapshot);
+        assertEquals(2, numSnapshottedSSTables);
+        long snapshotLastModified = indexFilesLastModified();
+
+        // File.lastModified can be truncated to one-second resolution, which may be coarser than the index build
+        // time, so we sleep for a full second to guarantee that the modification date of any overwritten index
+        // file will differ from that of the original file
+        Thread.sleep(1000);
+
+        // Truncate the table
+        truncate(false);
+        waitForAssert(() -> verifyIndexFiles(0, 0));
+        assertNumRows(0, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(0, 0);
+
+        // Restore the snapshot
+        restoreSnapshot(snapshot);
+        verifyIndexFiles(2, 0);
+        assertEquals(snapshotLastModified, indexFilesLastModified());
+        assertNumRows(2, "SELECT * FROM %%s WHERE v1 >= 0");
+        assertValidationCount(2, 2); // newly loaded
+
+        // index components are included after restore snapshot
+        verifyIndexComponentsIncludedInSSTable();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/memory/AbstractKeyRangeIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/memory/AbstractKeyRangeIteratorTest.java
new file mode 100644
index 000000000000..033b4c2b4972
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/memory/AbstractKeyRangeIteratorTest.java
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+@Ignore
+public abstract class AbstractKeyRangeIteratorTest
+{
+    private static final ByteBuffer KEY_BUFFER = ByteBufferUtil.bytes(0L);
+
+    @Test
+    public void singleTokenIsReturned() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 1, 1);
+
+        assertIterator(iterator, 1);
+    }
+
+    @Test
+    public void duplicateSingleTokenIsReturned() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 1, 1, 1);
+
+        assertIterator(iterator, 1);
+    }
+
+    @Test
+    public void withoutSkipAllTokensAreReturnedInTokenOrder() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void tokensAddedOutOfOrderAreReturnedInOrder() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 3, 2, 1);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void matchingTokensAreIgnoredAtStart() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 1, 2, 3);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void matchingTokensAreIgnoredInMiddle() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 2, 3);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void matchingTokensAreIgnoredAtEnd() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3, 3);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void skipToTokenBeforeFirstTokenWillReturnAllTokens() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        iterator.skipTo(0L);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void skipToFirstTokenWillReturnAllTokens() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        iterator.skipTo(1L);
+
+        assertIterator(iterator, 1, 2, 3);
+    }
+
+    @Test
+    public void skipToMiddleTokenWillReturnRemainingTokens() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        iterator.skipTo(2L);
+
+        assertIterator(iterator, 2, 3);
+    }
+
+    @Test
+    public void skipToLastTokenWillReturnLastToken() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        iterator.skipTo(3L);
+
+        assertIterator(iterator, 3);
+    }
+
+    @Test
+    public void skipToAfterLastTokenWillReturnNoTokens() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 2, 3);
+
+        iterator.skipTo(4L);
+
+        assertIterator(iterator);
+    }
+
+    @Test
+    public void skipToWithMatchingTokensWithReturnCorrectTokens() throws Exception
+    {
+        RangeIterator iterator= makeIterator(1, 3, 1, 1, 2, 2, 3, 3);
+
+        iterator.skipTo(2L);
+
+        assertIterator(iterator, 2, 3);
+    }
+
+    private void assertIterator(RangeIterator iterator, long... tokens) throws Exception
+    {
+        for(long token : tokens)
+        {
+            assertEquals(token, iterator.next().getLong());
+        }
+        assertFalse(iterator.hasNext());
+    }
+
+
+    protected abstract RangeIterator makeIterator(long minimumTokenValue, long maximumTokenValue, long... tokens);
+
+    protected DecoratedKey keyForToken(long token)
+    {
+        return new BufferDecoratedKey(new Murmur3Partitioner.LongToken(token), KEY_BUFFER);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/memory/KeyRangeIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/memory/KeyRangeIteratorTest.java
new file mode 100644
index 000000000000..618dc2304e9e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/memory/KeyRangeIteratorTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.util.Arrays;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+
+public class KeyRangeIteratorTest extends AbstractKeyRangeIteratorTest
+{
+    @Override
+    protected RangeIterator makeIterator(long minimumTokenValue, long maximumTokenValue, long... tokens)
+    {
+        SortedSet<DecoratedKey> sortedKeys = new TreeSet<>(DecoratedKey.comparator);
+        for (long token : tokens)
+            sortedKeys.add(keyForToken(token));
+        // Only the sorted set is handed over; the token bounds are not forwarded to this constructor.
+        return new KeyRangeIterator(sortedKeys);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/memory/PriorityKeyRangeIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/memory/PriorityKeyRangeIteratorTest.java
new file mode 100644
index 000000000000..c9e971502285
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/memory/PriorityKeyRangeIteratorTest.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.util.Arrays;
+import java.util.PriorityQueue;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
+
+public class PriorityKeyRangeIteratorTest extends AbstractKeyRangeIteratorTest
+{
+    @Override
+    protected RangeIterator makeIterator(long minimumTokenValue, long maximumTokenValue, long... tokens)
+    {
+        // PriorityQueue rejects an initial capacity below 1, so clamp it: an empty token list must not throw here.
+        PriorityQueue<DecoratedKey> queue = new PriorityQueue<>(Math.max(1, tokens.length), DecoratedKey.comparator);
+        Arrays.stream(tokens).forEach(t -> queue.add(keyForToken(t)));
+
+        return new KeyRangeIterator(minimumTokenValue, maximumTokenValue, queue);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java b/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java
new file mode 100644
index 000000000000..031a501a5be8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/memory/TrieMemoryIndexTest.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.memory;
+
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.function.IntFunction;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.utils.PrimaryKeys;
+import org.apache.cassandra.index.sai.utils.TypeUtil;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertEquals;
+
+//TODO Rework this test
+public class TrieMemoryIndexTest
+{
+    private static final String KEYSPACE = "test_keyspace";
+    private static final String TABLE = "test_table";
+    private static final String PART_KEY_COL = "key";
+    private static final String REG_COL = "col";
+    // Shared partition key used for every indexed value; final so that no test can swap it out.
+    private static final DecoratedKey key = Murmur3Partitioner.instance.decorateKey(ByteBufferUtil.bytes("key"));
+
+    @Before
+    public void setup()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+//    @Test
+//    public void shouldEncodeAndDecodeStrings()
+//    {
+//        for (int i = 0; i < 31; ++i)
+//        {
+//            final UTF8Type type = UTF8Type.instance;
+//            String input = String.valueOf(i);
+//
+//            final TrieMemoryIndex.Encoder encoder = TrieMemoryIndex.encoderFor(type);
+//            final ByteComparable encodedAndTerminated = encoder.encode(type.decompose(input));
+//            // Note: For string, we unescape the prefix-free encoding, so the comparable contains exactly the bytes of the source.
+//            final ByteBuffer decoded = ByteBuffer.wrap(ByteSourceUtil.readBytes(encoder.decodeEntry(new AbstractMap.SimpleEntry<>(encodedAndTerminated, null)).left.asComparableBytes(ByteComparable.Version.DSE68)));
+//            final String output = type.compose(decoded);
+//
+//            assertEquals(input, output);
+//        }
+//    }
+//
+//    @Test
+//    public void shouldEncodeAndDecodeLongs()
+//    {
+//        for (int i = -31; i < 31; ++i)
+//        {
+//            final LongType type = LongType.instance;
+//            long input = i;
+//
+//            final TrieMemoryIndex.Encoder encoder = TrieMemoryIndex.encoderFor(type);
+//            final ByteComparable encodedAndTerminated = encoder.encode(type.decompose(input));
+//            // Note: For long, we return the prefix-free encoding which must be converted to the type.
+//            final ByteBuffer decoded = type.fromComparableBytes(encoder.decodeEntry(new AbstractMap.SimpleEntry<>(encodedAndTerminated, null)).left.asPeekableBytes(ByteComparable.Version.DSE68),
+//                                                                ByteComparable.Version.DSE68);
+//            final long output = type.compose(decoded);
+//
+//            assertEquals(input, output);
+//        }
+//    }
+
+    @Test
+    public void shouldAcceptPrefixValues()
+    {
+        shouldAcceptPrefixValuesForType(UTF8Type.instance, i -> UTF8Type.instance.decompose(String.format("%03d", i)));
+        shouldAcceptPrefixValuesForType(Int32Type.instance, Int32Type.instance::decompose);
+    }
+
+    private void shouldAcceptPrefixValuesForType(AbstractType<?> type, IntFunction<ByteBuffer> decompose)
+    {
+        final TrieMemoryIndex index = newTrieMemoryIndex(type);
+        for (int i = 0; i < 99; ++i)
+        {
+            index.add(key, Clustering.EMPTY, decompose.apply(i));
+        }
+        // The i-th entry returned by the index iterator is expected to hold the i-th inserted value (0..98).
+        final Iterator<Pair<ByteComparable, PrimaryKeys>> iterator = index.iterator();
+        int i = 0;
+        while (iterator.hasNext())
+        {
+            Pair<ByteComparable, PrimaryKeys> pair = iterator.next();
+            assertEquals(1, pair.right.size());
+            // Literal types are compared via a fixed-length wrapper; other types via the type's comparable bytes.
+            final int rowId = i;
+            final ByteComparable expectedByteComparable = TypeUtil.isLiteral(type)
+                                                          ? ByteComparable.fixedLength(decompose.apply(rowId))
+                                                          : version -> type.asComparableBytes(decompose.apply(rowId), version);
+            final ByteComparable actualByteComparable = pair.left;
+            assertEquals("Mismatch at: " + i, 0, ByteComparable.compare(expectedByteComparable, actualByteComparable, ByteComparable.Version.OSS41));
+
+            i++;
+        }
+        assertEquals(99, i);
+    }
+
+    private TrieMemoryIndex newTrieMemoryIndex(AbstractType<?> columnType)
+    {
+        ColumnMetadata column = ColumnMetadata.regularColumn(KEYSPACE, TABLE, REG_COL, columnType);
+        TableMetadata metadata = TableMetadata.builder(KEYSPACE, TABLE)
+                                              .addPartitionKeyColumn(PART_KEY_COL, UTF8Type.instance)
+                                              .addRegularColumn(REG_COL, columnType)
+                                              .partitioner(Murmur3Partitioner.instance)
+                                              .caching(CachingParams.CACHE_NOTHING)
+                                              .build();
+
+        Map<String, String> options = new HashMap<>();
+        options.put(IndexTarget.CUSTOM_INDEX_OPTION_NAME, StorageAttachedIndex.class.getCanonicalName());
+        options.put("target", REG_COL);
+
+        IndexMetadata indexMetadata = IndexMetadata.fromSchemaMetadata("col_index", IndexMetadata.Kind.CUSTOM, options);
+        ColumnContext ci = new ColumnContext(metadata, indexMetadata);
+        return new TrieMemoryIndex(ci);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/AbstractMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/AbstractMetricsTest.java
new file mode 100644
index 000000000000..fa86ce68f952
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/AbstractMetricsTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+import javax.management.ObjectName;
+
+import org.junit.Before;
+import org.junit.Ignore;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/** Base for the SAI metrics tests: sets up JMX access before each test and provides wait-for-assert helpers. */
+@Ignore
+public abstract class AbstractMetricsTest extends SAITester
+{
+    @Before
+    public void initializeTest() throws Throwable
+    {
+        requireNetwork();
+        startJMXServer();
+        createMBeanServerConnection();
+    }
+
+    // Each helper below wraps its check in waitForAssert and converts anything the check throws with Throwables.unchecked.
+    protected void waitForIndexCompaction(String keyspace, String table, String index)
+    {
+        waitForAssert(() -> {
+            try
+            {
+                assertEquals(1L, getMetricValue(objectName("CompactionCount", keyspace, table, index, "IndexMetrics")));
+            }
+            catch (Throwable failure)
+            {
+                throw Throwables.unchecked(failure);
+            }
+        }, 60, TimeUnit.SECONDS);
+    }
+
+    protected void waitForVerifyHistogram(ObjectName name, long count)
+    {
+        waitForAssert(() -> {
+            try
+            {
+                assertEquals(count, jmxConnection.getAttribute(name, "Count"));
+            }
+            catch (Throwable failure)
+            {
+                throw Throwables.unchecked(failure);
+            }
+        }, 10, TimeUnit.SECONDS);
+    }
+
+    protected void waitForGreaterThanZero(ObjectName name)
+    {
+        waitForAssert(() -> {
+            try
+            {
+                assertTrue(((Number) getMetricValue(name)).doubleValue() > 0);
+            }
+            catch (Throwable failure)
+            {
+                throw Throwables.unchecked(failure);
+            }
+        }, 160, TimeUnit.SECONDS);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/FinalSegmentFlushingFailureTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/FinalSegmentFlushingFailureTest.java
new file mode 100644
index 000000000000..e52a2cf0ac04
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/FinalSegmentFlushingFailureTest.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+public class FinalSegmentFlushingFailureTest extends SegmentFlushingFailureTest // concrete variant; everything but the limit is inherited
+{
+    @Override
+    protected long expectedBytesLimit()
+    {
+        return DEFAULT_BYTES_LIMIT; // inherited constant (not declared in this file); presumably defined by SegmentFlushingFailureTest - confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/IndexGroupMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/IndexGroupMetricsTest.java
new file mode 100644
index 000000000000..15a7f0ef10c7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/IndexGroupMetricsTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.disk.InvertedIndexSearcher;
+import org.apache.cassandra.index.sai.disk.KDTreeIndexSearcher;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+
+public class IndexGroupMetricsTest extends AbstractMetricsTest
+{
+    // NOTE(review): AbstractMetricsTest#initializeTest is also a @Before that makes these same three calls, so they
+    // appear to run twice per test; confirm they are idempotent or drop this method together with its import.
+    @Before
+    public void setup() throws Exception
+    {
+        requireNetwork();
+        startJMXServer();
+        createMBeanServerConnection();
+    }
+
+    @Test
+    public void verifyIndexGroupMetrics() throws Throwable
+    {
+        // create first index
+        createTable(CREATE_TABLE_TEMPLATE);
+        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        // no open files
+        assertEquals(0, getOpenIndexFiles());
+        assertEquals(0, getDiskUsage());
+
+        int sstables = 10;
+        for (int i = 0; i < sstables; i++)
+        {
+            execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+            flush();
+        }
+
+        // with 10 sstables: each contributes its per-sstable files plus the numeric (KD-tree) index files
+        int openFilesWithOnlyNumeric = getOpenIndexFiles();
+        assertEquals(sstables * (SSTableContext.openFilesPerSSTable() + KDTreeIndexSearcher.openPerIndexFiles()), openFilesWithOnlyNumeric);
+
+        long diskUsageWithOnlyNumeric = getDiskUsage();
+        assertNotEquals(0, diskUsageWithOnlyNumeric);
+
+        // create second index
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+        waitForIndexQueryable();
+
+        // same number of sstables, but more string index files.
+        int stringIndexOpenFileCount = sstables * InvertedIndexSearcher.openPerIndexFiles();
+        assertEquals(openFilesWithOnlyNumeric, getOpenIndexFiles() - stringIndexOpenFileCount);
+
+        // Index Group disk usage doesn't change with more indexes; the earlier reading is the expected value
+        long diskUsageWithBothIndexes = getDiskUsage();
+        assertEquals(diskUsageWithOnlyNumeric, diskUsageWithBothIndexes);
+
+        // compaction should reduce open files
+        compact();
+
+        long perSSTableFileDiskUsage = getDiskUsage();
+        assertEquals(SSTableContext.openFilesPerSSTable() + KDTreeIndexSearcher.openPerIndexFiles() + InvertedIndexSearcher.openPerIndexFiles(), getOpenIndexFiles());
+
+        // drop string index, reduce open string index files, per-sstable file disk usage remains the same
+        dropIndex("DROP INDEX %s." + v2IndexName);
+        assertEquals(SSTableContext.openFilesPerSSTable() + KDTreeIndexSearcher.openPerIndexFiles(), getOpenIndexFiles());
+        assertEquals(perSSTableFileDiskUsage, getDiskUsage());
+
+        // drop last index, no open index files
+        dropIndex("DROP INDEX %s." + v1IndexName);
+        assertEquals(0, getOpenIndexFiles());
+        assertEquals(0, getDiskUsage());
+    }
+
+    // Both readers query the IndexGroupMetrics MBean through objectNameNoIndex and cast the value to a primitive.
+    protected int getOpenIndexFiles()
+    {
+        return (int) getMetricValue(objectNameNoIndex("OpenIndexFiles", KEYSPACE, currentTable(), "IndexGroupMetrics"));
+    }
+
+    protected long getDiskUsage()
+    {
+        return (long) getMetricValue(objectNameNoIndex("DiskUsedBytes", KEYSPACE, currentTable(), "IndexGroupMetrics"));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/IndexMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/IndexMetricsTest.java
new file mode 100644
index 000000000000..405a9f676243
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/IndexMetricsTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class IndexMetricsTest extends AbstractMetricsTest
+{
+    private static final String TABLE = "table_name";
+    private static final String INDEX = "table_name_index";
+    // CREATE_TABLE_TEMPLATE is formatted with the keyspace; CREATE_INDEX_TEMPLATE with the keyspace and the indexed column.
+    private static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s." + TABLE + " (ID1 TEXT PRIMARY KEY, v1 INT, v2 TEXT) WITH compaction = " +
+                                                        "{'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }";
+    private static final String CREATE_INDEX_TEMPLATE = "CREATE CUSTOM INDEX IF NOT EXISTS " + INDEX + " ON %s." + TABLE + "(%s) USING 'StorageAttachedIndex'";
+
+    @Test
+    public void testSameIndexNameAcrossKeyspaces() throws Throwable // same table/index names in two keyspaces must report separate counts
+    {
+        String keyspace1 = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+        String keyspace2 = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace1));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, keyspace1, "v1"));
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace2));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, keyspace2, "v1"));
+
+        execute("INSERT INTO " + keyspace1 + "." + TABLE + " (id1, v1, v2) VALUES ('0', 0, '0')");
+        // only keyspace1 has received a write so far
+        assertEquals(1L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace1, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(0L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace2, TABLE, INDEX, "IndexMetrics")));
+
+        execute("INSERT INTO " + keyspace2 + "." + TABLE + " (id1, v1, v2) VALUES ('0', 0, '0')");
+        execute("INSERT INTO " + keyspace2 + "." + TABLE + " (id1, v1, v2) VALUES ('1', 1, '1')");
+        // the two writes to keyspace2 must leave keyspace1's count unchanged
+        assertEquals(1L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace1, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(2L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace2, TABLE, INDEX, "IndexMetrics")));
+    }
+
+    @Test
+    public void testMetricRelease() throws Throwable // the index metric must stop being readable once the index is dropped
+    {
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, keyspace, "v1"));
+
+        execute("INSERT INTO " + keyspace + "." + TABLE + " (id1, v1, v2) VALUES ('0', 0, '0')");
+        assertEquals(1L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+
+        dropIndex(String.format("DROP INDEX %s." + INDEX, keyspace));
+
+        // once the index is dropped, make sure MBeans are no longer accessible
+        assertThatThrownBy(() -> getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace, TABLE, INDEX, "IndexMetrics")))
+                .hasCauseInstanceOf(javax.management.InstanceNotFoundException.class);
+    }
+
+    @Test
+    public void testMetricsThroughWriteLifecycle() throws Throwable // checks the index metrics after writes, after a flush and after a compaction
+    {
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, keyspace, "v1"));
+
+        int rowCount = 10;
+        for (int i = 0; i < rowCount; i++)
+            execute("INSERT INTO " + keyspace + "." + TABLE + "(id1, v1, v2) VALUES (?, ?, '0')", Integer.toString(i), i);
+        // before any flush: every write is counted against the live memtable index and nothing has been flushed
+        assertEquals(10L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertTrue((Long)getMetricValue(objectName("MemtableIndexBytes", keyspace, TABLE, INDEX, "IndexMetrics")) > 0);
+        assertEquals(0L, getMetricValue(objectName("MemtableIndexFlushCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+
+        waitForAssert(() -> {
+            try
+            {
+                assertEquals(10L, getMBeanAttribute(objectName("MemtableIndexWriteLatency", keyspace, TABLE, INDEX, "IndexMetrics"), "Count"));
+            }
+            catch (Throwable ex)
+            {
+                throw Throwables.unchecked(ex);
+            }
+        }, 60, TimeUnit.SECONDS);
+        // the sstable-side and compaction metrics are still zero at this point
+        assertEquals(0L, getMetricValue(objectName("SSTableCellCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(0L, getMetricValue(objectName("DiskUsedBytes", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(0L, getMetricValue(objectName("CompactionCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+
+        waitForVerifyHistogram(objectName("MemtableIndexFlushCellsPerSecond", keyspace, TABLE, INDEX, "IndexMetrics"), 0);
+
+        flush(keyspace, TABLE);
+        // after the flush: memtable metrics are back to zero, one flush is recorded and the sstable metrics are populated
+        assertEquals(0L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(0L, getMetricValue(objectName("MemtableIndexBytes", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(1L, getMetricValue(objectName("MemtableIndexFlushCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(10L, getMetricValue(objectName("SSTableCellCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertTrue((Long)getMetricValue(objectName("DiskUsedBytes", keyspace, TABLE, INDEX, "IndexMetrics")) > 0);
+        assertEquals(0L, getMetricValue(objectName("CompactionCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+
+        waitForVerifyHistogram(objectName("MemtableIndexFlushCellsPerSecond", keyspace, TABLE, INDEX, "IndexMetrics"), 1);
+
+        compact(keyspace, TABLE);
+
+        waitForIndexCompaction(keyspace, TABLE, INDEX);
+
+        waitForIndexQueryable(keyspace, TABLE);
+        ResultSet rows = executeNet(String.format("SELECT id1 FROM %s.%s WHERE v1 >= 0", keyspace, TABLE));
+        assertEquals(rowCount, rows.all().size());
+        // after the compaction: flush and cell counts are unchanged and exactly one compaction is recorded
+        assertEquals(0L, getMetricValue(objectName("LiveMemtableIndexWriteCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(1L, getMetricValue(objectName("MemtableIndexFlushCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertEquals(10L, getMetricValue(objectName("SSTableCellCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+        assertTrue((Long)getMetricValue(objectName("DiskUsedBytes", keyspace, TABLE, INDEX, "IndexMetrics")) > 0);
+        assertEquals(1L, getMetricValue(objectName("CompactionCount", keyspace, TABLE, INDEX, "IndexMetrics")));
+
+        waitForVerifyHistogram(objectName("CompactionSegmentCellsPerSecond", keyspace, TABLE, INDEX, "IndexMetrics"), 1);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java b/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
new file mode 100644
index 000000000000..3da309c65a61
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.TimeUnit;
+
+/** No-op {@link QueryEventListener} singletons: every callback is empty and nested listeners are no-ops too. */
+public class QueryEventListeners
+{
+    public static final QueryEventListener NO_OP = new BaseQueryEventListener();
+    public static final QueryEventListener.BKDIndexEventListener NO_OP_BKD_LISTENER = NO_OP.bkdIndexEventListener();
+    public static final QueryEventListener.TrieIndexEventListener NO_OP_TRIE_LISTENER = NO_OP.trieIndexEventListener();
+    public static final QueryEventListener.PostingListEventListener NO_OP_POSTINGS_LISTENER = new NoOpPostingListEventListener();
+
+    // Root listener: hands out the enum singletons declared inside it.
+    private static class BaseQueryEventListener implements QueryEventListener
+    {
+        @Override
+        public BKDIndexEventListener bkdIndexEventListener()
+        {
+            return NoOpBKDIndexEventListener.INSTANCE;
+        }
+
+        @Override
+        public TrieIndexEventListener trieIndexEventListener()
+        {
+            return NoOpTrieIndexEventListener.INSTANCE;
+        }
+
+        private enum NoOpBKDIndexEventListener implements BKDIndexEventListener
+        {
+            INSTANCE;
+
+            @Override
+            public void onSegmentHit() { }
+
+            @Override
+            public void postingListsHit(int count) { }
+
+            @Override
+            public void onIntersectionEarlyExit() { }
+
+            @Override
+            public void onIntersectionComplete(long intersectionTotalTime, TimeUnit unit) { }
+
+            @Override
+            public PostingListEventListener postingListEventListener()
+            {
+                return NO_OP_POSTINGS_LISTENER;
+            }
+        }
+
+        private enum NoOpTrieIndexEventListener implements TrieIndexEventListener
+        {
+            INSTANCE;
+
+            @Override
+            public void onTraversalComplete(long traversalTotalTime, TimeUnit unit) { }
+
+            @Override
+            public void onSegmentHit() { }
+
+            @Override
+            public PostingListEventListener postingListEventListener()
+            {
+                return NO_OP_POSTINGS_LISTENER;
+            }
+        }
+    }
+
+    // Posting-list listener whose callbacks do nothing; shared through NO_OP_POSTINGS_LISTENER.
+    public static class NoOpPostingListEventListener implements QueryEventListener.PostingListEventListener
+    {
+        @Override
+        public void onPostingDecoded() { }
+
+        @Override
+        public void onAdvance() { }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java
new file mode 100644
index 000000000000..5b7869d76768
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/QueryMetricsTest.java
@@ -0,0 +1,357 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.util.concurrent.ThreadLocalRandom;
+import javax.management.InstanceNotFoundException;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import com.datastax.driver.core.ResultSet;
+
+import static org.apache.cassandra.index.sai.metrics.TableQueryMetrics.TABLE_QUERY_METRIC_TYPE;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class QueryMetricsTest extends AbstractMetricsTest
+{
+    private static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s.%s (id1 TEXT PRIMARY KEY, v1 INT, v2 TEXT) WITH compaction = " +
+                                                        "{'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }"; // format args: keyspace, table
+    private static final String CREATE_INDEX_TEMPLATE = "CREATE CUSTOM INDEX IF NOT EXISTS %s ON %s.%s(%s) USING 'StorageAttachedIndex'"; // format args: index, keyspace, table, column
+    // Metric type names passed as the last argument to objectName / objectNameNoIndex
+    private static final String PER_QUERY_METRIC_TYPE = "PerQuery";
+    private static final String GLOBAL_METRIC_TYPE = "ColumnQueryMetrics";
+
+    @Rule
+    public ExpectedException exception = ExpectedException.none();
+
+    @Test
+    public void testSameIndexNameAcrossKeyspaces() throws Throwable // completed-query counts must be tracked per keyspace
+    {
+        String table = "test_same_index_name_across_keyspaces";
+        String index = "test_same_index_name_across_keyspaces_index";
+        // the same table and index names are created in two separate keyspaces
+        String keyspace1 = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+        String keyspace2 = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace1, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace1, table, "v1"));
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace2, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace2, table, "v1"));
+
+        execute("INSERT INTO " + keyspace1 + "." + table + " (id1, v1, v2) VALUES ('0', 0, '0')");
+
+        ResultSet rows = executeNet("SELECT id1 FROM " + keyspace1 + "." + table + " WHERE v1 = 0");
+        assertEquals(1, rows.all().size());
+        // only keyspace1 has served a query so far
+        assertEquals(1L, getTableQueryMetrics(keyspace1, table, "TotalQueriesCompleted"));
+        assertEquals(0L, getTableQueryMetrics(keyspace2, table, "TotalQueriesCompleted"));
+
+        execute("INSERT INTO " + keyspace2 + "." + table + " (id1, v1, v2) VALUES ('0', 0, '0')");
+        execute("INSERT INTO " + keyspace2 + "." + table + " (id1, v1, v2) VALUES ('1', 1, '1')");
+
+        rows = executeNet("SELECT id1 FROM " + keyspace1 + "." + table + " WHERE v1 = 0");
+        assertEquals(1, rows.all().size());
+
+        rows = executeNet("SELECT id1 FROM " + keyspace2 + "." + table + " WHERE v1 = 1");
+        assertEquals(1, rows.all().size());
+        // keyspace1 has now served two queries and keyspace2 one
+        assertEquals(2L, getTableQueryMetrics(keyspace1, table, "TotalQueriesCompleted"));
+        assertEquals(1L, getTableQueryMetrics(keyspace2, table, "TotalQueriesCompleted"));
+    }
+
+    @Test
+    public void testMetricRelease() throws Throwable // table-level query metrics outlive the index but not the table
+    {
+        String table = "test_metric_release";
+        String index = "test_metric_release_index";
+
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v1"));
+        waitForIndexQueryable(keyspace, table);
+
+        execute("INSERT INTO " + keyspace + "." + table + " (id1, v1, v2) VALUES ('0', 0, '0')");
+
+        ResultSet rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 = 0");
+        assertEquals(1, rows.all().size());
+        // exactly one query has completed against this table
+        assertEquals(1L, getTableQueryMetrics(keyspace, table, "TotalQueriesCompleted"));
+
+        // Even if we drop the last index on the table, table-level metrics should still be visible:
+        dropIndex(String.format("DROP INDEX %s." + index, keyspace));
+        assertEquals(1L, getTableQueryMetrics(keyspace, table, "TotalQueriesCompleted"));
+
+        // When the whole table is dropped, we should finally fail to find table-level metrics:
+        dropTable(String.format("DROP TABLE %s." + table, keyspace));
+        assertThatThrownBy(() -> getTableQueryMetrics(keyspace, table, "TotalQueriesCompleted")).hasCauseInstanceOf(InstanceNotFoundException.class);
+    }
+
+    @Test
+    public void testKDTreeQueryMetricsWithSingleIndex() throws Throwable // query metrics for a single index on the int column v1
+    {
+        String table = "test_metrics_through_write_lifecycle";
+        String index = "test_metrics_through_write_lifecycle_index";
+
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v1"));
+        // running totals of rows returned and queries issued; both are compared with metrics at the end
+        int resultCounter = 0;
+        int queryCounter = 0;
+
+        int rowsWritten = 10;
+
+        for (int i = 0; i < rowsWritten; i++)
+        {
+            execute("INSERT INTO " + keyspace + "." + table + " (id1, v1, v2) VALUES (?, ?, '0')", Integer.toString(i), i);
+        }
+        // flush, compact and wait for the index compaction to be reported before querying
+        flush(keyspace, table);
+        compact(keyspace, table);
+        waitForIndexCompaction(keyspace, table, index);
+
+        waitForIndexQueryable(keyspace, table);
+
+        ResultSet rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 >= 0");
+
+        int actualRows = rows.all().size();
+        assertEquals(rowsWritten, actualRows);
+        resultCounter += actualRows;
+        queryCounter++;
+
+        rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 >= 5");
+
+        actualRows = rows.all().size();
+        assertEquals(5, actualRows);
+        resultCounter += actualRows;
+        queryCounter++;
+        // two queries have run so far
+        assertEquals(2L, getPerQueryMetrics(keyspace, table, "SSTableIndexesHit"));
+        assertEquals(2L, getPerQueryMetrics(keyspace, table, "IndexSegmentsHit"));
+        assertEquals(2L, getTableQueryMetrics(keyspace, table, "TotalQueriesCompleted"));
+
+        // run several times to get buffer faults across the metrics
+        for (int x = 0; x < 20; x++)
+        {
+            rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 >= 5");
+
+            actualRows = rows.all().size();
+            assertEquals(5, actualRows);
+            resultCounter += actualRows;
+            queryCounter++;
+        }
+
+        // column metrics
+
+        waitForGreaterThanZero(objectNameNoIndex("QueryLatency", keyspace, table, PER_QUERY_METRIC_TYPE));
+        // partition reads are expected to match the rows returned, and the intersection latency metric the query count
+        waitForEquals(objectNameNoIndex("TotalPartitionReads", keyspace, table, TableQueryMetrics.TABLE_QUERY_METRIC_TYPE), resultCounter);
+        waitForEquals(objectName("KDTreeIntersectionLatency", keyspace, table, index, GLOBAL_METRIC_TYPE), queryCounter);
+    }
+
+    @Test
+    public void testKDTreePostingsQueryMetricsWithSingleIndex() throws Throwable
+    {
+        String table = "test_kdtree_postings_metrics_through_write_lifecycle";
+        String v1Index = "test_kdtree_postings_metrics_through_write_lifecycle_v1_index";
+        String v2Index = "test_kdtree_postings_metrics_through_write_lifecycle_v2_index";
+
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        // v1 is indexed with bkd_postings_min_leaves = 1; v2 is indexed without extra options
+        createIndex(String.format(CREATE_INDEX_TEMPLATE + " WITH OPTIONS = {'bkd_postings_min_leaves' : 1}", v1Index, keyspace, table, "v1"));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, v2Index, keyspace, table, "v2"));
+
+        int rowsWritten = 50;
+
+        // v2 is the text form of v1, so every row has a distinct value in both indexed columns
+        for (int i = 0; i < rowsWritten; i++)
+        {
+            execute("INSERT INTO " + keyspace + "." + table + " (id1, v1, v2) VALUES (?, ?, ?)", Integer.toString(i), i, Integer.toString(i));
+        }
+
+        flush(keyspace, table);
+        compact(keyspace, table);
+        waitForIndexCompaction(keyspace, table, v1Index);
+
+        waitForIndexQueryable(keyspace, table);
+
+        ResultSet rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 >= 0");
+
+        int actualRows = rows.all().size();
+        assertEquals(rowsWritten, actualRows);
+        // NumPostings for the v1 index must be positive at this point
+        assertTrue(((Number) getMetricValue(objectName("NumPostings", keyspace, table, v1Index, "KDTreePostings"))).longValue() > 0);
+
+        waitForVerifyHistogram(objectNameNoIndex("KDTreePostingsNumPostings", keyspace, table, PER_QUERY_METRIC_TYPE), 1);
+
+        // V2 index is very selective, so it should lead the union merge process, causing V1 index to skip/advance
+        execute("SELECT id1 FROM " + keyspace + "." + table + " WHERE v1 >= 0 AND v1 <= 1000 AND v2 = '5' ALLOW FILTERING");
+
+        waitForVerifyHistogram(objectNameNoIndex("KDTreePostingsSkips", keyspace, table, PER_QUERY_METRIC_TYPE), 2);
+    }
+
+    // Covers the query metrics for a single index on the text column v2: hit counts, completed queries,
+    // terms-lookup and query latency, and total partition reads.
+    @Test
+    public void testInvertedIndexQueryMetricsWithSingleIndex() throws Throwable
+    {
+        String table = "test_invertedindex_metrics_through_write_lifecycle";
+        String index = "test_invertedindex_metrics_through_write_lifecycle_index";
+
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v2"));
+
+        // rows returned across all queries below; compared with TotalPartitionReads at the end
+        int resultCounter = 0;
+
+        int rowsWritten = 10;
+
+        for (int i = 0; i < rowsWritten; i++)
+        {
+            execute("INSERT INTO " + keyspace + "." + table + " (id1, v1, v2) VALUES (?, ?, ?)", Integer.toString(i), i, Integer.toString(i));
+        }
+
+        flush(keyspace, table);
+        compact(keyspace, table);
+        waitForIndexCompaction(keyspace, table, index);
+
+        waitForIndexQueryable(keyspace, table);
+
+        ResultSet rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v2 = '0'");
+
+        int actualRows = rows.all().size();
+        assertEquals(1, actualRows);
+        resultCounter += actualRows;
+
+        rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v2 = '5'");
+
+        actualRows = rows.all().size();
+        assertEquals(1, actualRows);
+        resultCounter += actualRows;
+
+        assertEquals(2L, getPerQueryMetrics(keyspace, table, "SSTableIndexesHit"));
+        assertEquals(2L, getPerQueryMetrics(keyspace, table, "IndexSegmentsHit"));
+        assertEquals(2L, getTableQueryMetrics(keyspace, table, "TotalQueriesCompleted"));
+
+        // run several times to get buffer faults across the metrics
+        // nextInt(0, 9) excludes its upper bound, so only '0'..'8' are queried; each value matches exactly one row
+        for (int x = 0; x < 20; x++)
+        {
+            rows = executeNet("SELECT id1 FROM " + keyspace + "." + table + " WHERE v2 = '" + ThreadLocalRandom.current().nextInt(0, 9) + "'");
+
+            actualRows = rows.all().size();
+            assertEquals(1, actualRows);
+            resultCounter += actualRows;
+        }
+
+        waitForGreaterThanZero(objectName("TermsLookupLatency", keyspace, table, index, GLOBAL_METRIC_TYPE));
+
+        waitForGreaterThanZero(objectNameNoIndex("QueryLatency", keyspace, table, PER_QUERY_METRIC_TYPE));
+
+        // every query returned exactly one row, so partition reads are expected to equal the rows returned
+        waitForEquals(objectNameNoIndex("TotalPartitionReads", keyspace, table, TableQueryMetrics.TABLE_QUERY_METRIC_TYPE), resultCounter);
+    }
+
+    /** One range query on the v1 index is expected to report 2 partition reads and 3 filtered rows. */
+    @Test
+    public void testKDTreePartitionsReadAndRowsFiltered() throws Throwable
+    {
+        String table = "test_rows_filtered_large_partition";
+        String index = "test_rows_filtered_large_partition_index";
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        String createTableCql = String.format("CREATE TABLE %s.%s (pk int, ck int, v1 int, PRIMARY KEY (pk, ck)) " +
+                                              "WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }", keyspace, table);
+        createTable(createTableCql);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v1"));
+        waitForIndexQueryable(keyspace, table);
+
+        // Rows span three partitions; pk=0 holds the only row that "v1 > 0" excludes.
+        String insertPrefix = "INSERT INTO " + keyspace + "." + table + "(pk, ck, v1) VALUES ";
+        execute(insertPrefix + "(0, 0, 0)");
+        execute(insertPrefix + "(1, 1, 1)");
+        execute(insertPrefix + "(1, 2, 2)");
+        execute(insertPrefix + "(2, 1, 3)");
+
+        flush(keyspace, table);
+
+        int matched = executeNet("SELECT pk, ck FROM " + keyspace + "." + table + " WHERE v1 > 0").all().size();
+        assertEquals(3, matched);
+
+        waitForEquals(objectNameNoIndex("TotalPartitionReads", keyspace, table, TABLE_QUERY_METRIC_TYPE), 2);
+        waitForVerifyHistogram(objectNameNoIndex("RowsFiltered", keyspace, table, PER_QUERY_METRIC_TYPE), 1);
+        waitForEquals(objectNameNoIndex("TotalRowsFiltered", keyspace, table, TABLE_QUERY_METRIC_TYPE), 3);
+    }
+
+    /**
+     * After two range queries that match nothing, the test expects 2 early exits and a KDTreeIntersectionLatency
+     * reading of 0; after one matching range query it expects a reading of 1 with the early exits unchanged.
+     */
+    @Test
+    public void testKDTreeQueryEarlyExit() throws Throwable
+    {
+        String table = "test_queries_exited_early";
+        String index = "test_queries_exited_early_index";
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+
+        createTable(String.format("CREATE TABLE %s.%s (pk int, ck int, v1 int, PRIMARY KEY (pk, ck)) " +
+                                  "WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }", keyspace, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v1"));
+        waitForIndexQueryable(keyspace, table);
+
+        String insertPrefix = "INSERT INTO " + keyspace + "." + table + "(pk, ck, v1) VALUES ";
+        execute(insertPrefix + "(0, 0, 0)");
+        execute(insertPrefix + "(1, 1, 1)");
+        execute(insertPrefix + "(1, 2, 2)");
+        flush(keyspace, table);
+
+        // The written v1 values are 0, 1 and 2, so neither of these ranges can match a row.
+        String selectPrefix = "SELECT pk, ck FROM " + keyspace + "." + table + " WHERE v1 ";
+        assertEquals(0, executeNet(selectPrefix + "> 2").all().size());
+        assertEquals(0, executeNet(selectPrefix + "< 0").all().size());
+
+        waitForEquals(objectName("KDTreeIntersectionLatency", keyspace, table, index, GLOBAL_METRIC_TYPE), 0L);
+        waitForEquals(objectName("KDTreeIntersectionEarlyExits", keyspace, table, index, GLOBAL_METRIC_TYPE), 2L);
+
+        // A range that overlaps the written values returns rows and must not add another early exit.
+        assertEquals(2, executeNet(selectPrefix + "> 0").all().size());
+
+        waitForEquals(objectName("KDTreeIntersectionLatency", keyspace, table, index, GLOBAL_METRIC_TYPE), 1L);
+        waitForEquals(objectName("KDTreeIntersectionEarlyExits", keyspace, table, index, GLOBAL_METRIC_TYPE), 2L);
+    }
+
+    private long getPerQueryMetrics(String keyspace, String table, String metricsName) throws Exception
+    {
+        return (long) getMetricValue(objectNameNoIndex(metricsName, keyspace, table, PER_QUERY_METRIC_TYPE)); // per-query metric of the table, looked up by name and cast to long
+    }
+
+    private long getTableQueryMetrics(String keyspace, String table, String metricsName) throws Exception
+    {
+        return (long) getMetricValue(objectNameNoIndex(metricsName, keyspace, table, TableQueryMetrics.TABLE_QUERY_METRIC_TYPE)); // table-level query metric, looked up by name and cast to long
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/SegmentFlushingFailureTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/SegmentFlushingFailureTest.java
new file mode 100644
index 000000000000..f007af84803b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/SegmentFlushingFailureTest.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.exceptions.ReadFailureException;
+import org.apache.cassandra.config.StorageAttachedIndexOptions;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.SSTableComponentsWriter;
+import org.apache.cassandra.index.sai.disk.SSTableIndexWriter;
+import org.apache.cassandra.index.sai.disk.SegmentBuilder;
+import org.apache.cassandra.index.sai.utils.NamedMemoryLimiter;
+import org.apache.cassandra.inject.Injection;
+import org.apache.cassandra.inject.Injections;
+
+import static org.apache.cassandra.inject.Injections.newCounter;
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+@Ignore
+public abstract class SegmentFlushingFailureTest extends SAITester
+{
+    static final long DEFAULT_BYTES_LIMIT = 1024L * 1024L * StorageAttachedIndexOptions.DEFAULT_SEGMENT_BUFFER_MB; // default segment buffer size, converted from MB to bytes
+
+    @Before
+    public void initialize() throws Throwable
+    {
+        requireNetwork();
+        // NOTE(review): presumably the getSegmentBuffer*/getColumnIndexBuildsInProgress helpers read over this JMX connection - confirm in SAITester.
+        startJMXServer();
+
+        createMBeanServerConnection();
+        // Install and enable both invocation counters before each test; resetCounters() clears them afterwards.
+        Injections.inject(memoryTrackingCounter, writerAbortCounter);
+        memoryTrackingCounter.enable();
+        writerAbortCounter.enable();
+    }
+
+    private static final Injections.Counter memoryTrackingCounter =
+            newCounter("memoryTrackingCounter").add(newInvokePoint()
+                                               .onClass(NamedMemoryLimiter.class)
+                                               .onMethod("increment")
+                                               .atEntry()).build(); // counts entries into NamedMemoryLimiter.increment
+    // Counts entries into SSTableIndexWriter.abort; the tests compare it with the expected number of aborts.
+    private static final Injections.Counter writerAbortCounter =
+            newCounter("writerAbortCounter").add(newInvokePoint()
+                                            .onClass(SSTableIndexWriter.class)
+                                            .onMethod("abort")
+                                            .atEntry()).build();
+    // Presumably throws a RuntimeException on entry to SSTableComponentsWriter.complete (see newFailureOnEntry).
+    private static final Injection sstableComponentsWriterFailure =
+            newFailureOnEntry("sstableComponentsWriterFailure", SSTableComponentsWriter.class, "complete", RuntimeException.class);
+    // Same kind of failure, targeting SegmentBuilder.flush with a RuntimeException.
+    private static final Injection segmentFlushFailure =
+            newFailureOnEntry("segmentFlushFailure", SegmentBuilder.class, "flush", RuntimeException.class);
+    // Targets SegmentBuilder.flush as well, but with an IOException.
+    private static final Injection segmentFlushIOFailure =
+            newFailureOnEntry("segmentFlushIOFailure", SegmentBuilder.class, "flush", IOException.class);
+    // Targets only SegmentBuilder.KDTreeSegmentBuilder.flushInternal, with an IOException.
+    private static final Injection kdTreeSegmentFlushFailure =
+            newFailureOnEntry("kdTreeSegmentFlushFailure", SegmentBuilder.KDTreeSegmentBuilder.class, "flushInternal", IOException.class);
+
+    @After
+    public void resetCounters()
+    {
+        memoryTrackingCounter.reset(); // the counters are static, so clear them after every test (also called between scenarios inside one test)
+        writerAbortCounter.reset();
+    }
+
+    protected abstract long expectedBytesLimit(); // value each subclass expects getSegmentBufferSpaceLimit() to report
+
+    /**
+     * Happy path: a successful compaction aborts no writer and returns the segment buffer accounting to zero.
+     */
+    @Test
+    public void testSegmentMemoryTrackerLifecycle() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        assertEquals(expectedBytesLimit(), getSegmentBufferSpaceLimit());
+        assertEquals("Segment buffer memory tracker should start at zero!", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+        flush();
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '1')");
+        flush();
+
+        String query = "SELECT * FROM %s WHERE v1 = 0";
+        assertEquals(1, executeNet(query).all().size());
+
+        compact();
+
+        // No abort was recorded, i.e. the compaction completed successfully:
+        Assert.assertEquals(0, writerAbortCounter.get());
+        // A non-zero count is the proxy for "something was actually tracked":
+        assertTrue(memoryTrackingCounter.get() > 0);
+        assertEquals("Global memory tracker should have reverted to zero.", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        assertEquals(1, executeNet(query).all().size());
+    }
+
+    @Test
+    public void shouldZeroMemoryTrackerOnOffsetsRuntimeFailure() throws Throwable
+    {
+        shouldZeroMemoryTrackerOnFailure(sstableComponentsWriterFailure, "v1");
+        resetCounters(); // clear the counters before the second run, which creates a new table indexed on v2
+        shouldZeroMemoryTrackerOnFailure(sstableComponentsWriterFailure, "v2");
+    }
+
+    @Test
+    public void shouldZeroMemoryTrackerOnSegmentFlushIOFailure() throws Throwable
+    {
+        shouldZeroMemoryTrackerOnFailure(segmentFlushIOFailure, "v1");
+        resetCounters(); // clear the counters before the second run, which creates a new table indexed on v2
+        shouldZeroMemoryTrackerOnFailure(segmentFlushIOFailure, "v2");
+    }
+
+    @Test
+    public void shouldZeroMemoryTrackerOnSegmentFlushRuntimeFailure() throws Throwable
+    {
+        shouldZeroMemoryTrackerOnFailure(segmentFlushFailure, "v1");
+        resetCounters(); // clear the counters before the second run, which creates a new table indexed on v2
+        shouldZeroMemoryTrackerOnFailure(segmentFlushFailure, "v2");
+    }
+
+    /** Compacts a fresh table indexed on {@code column} with {@code failure} enabled: one abort, zeroed tracker, failing index read. */
+    private void shouldZeroMemoryTrackerOnFailure(Injection failure, String column) throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, column));
+
+        assertEquals(expectedBytesLimit(), getSegmentBufferSpaceLimit());
+        assertEquals("Segment buffer memory tracker should start at zero!", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+        flush();
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '1')");
+        flush();
+
+        verifyCompactionIndexBuilds(1, failure, currentTable());
+
+        String literal = column.equals("v1") ? "0" : "'0'";
+        String select = String.format("SELECT * FROM %%s WHERE %s = %s", column, literal);
+        assertThatThrownBy(() -> executeNet(select)).isInstanceOf(ReadFailureException.class);
+    }
+
+    /** Injects the KD-tree flush failure on a table with indexes on v1 and v2 and expects two writer aborts. */
+    @Test
+    public void shouldZeroMemoryAfterOneOfTwoIndexesFail() throws Throwable
+    {
+        createTable(CREATE_TABLE_TEMPLATE);
+        for (String column : Arrays.asList("v1", "v2"))
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, column));
+
+        assertEquals(expectedBytesLimit(), getSegmentBufferSpaceLimit());
+        assertEquals("Segment buffer memory tracker should start at zero!", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('0', 0, '0')");
+        flush();
+        execute("INSERT INTO %s (id1, v1, v2) VALUES ('1', 1, '1')");
+        flush();
+
+        // Both index writers are expected to abort, and the memory tracker must return to zero:
+        verifyCompactionIndexBuilds(2, kdTreeSegmentFlushFailure, currentTable());
+
+        assertThatThrownBy(() -> executeNet("SELECT * FROM %s WHERE V1 = 0")).isInstanceOf(ReadFailureException.class);
+    }
+
+    /**
+     * Compacts two tables, each indexed on v1, while segment flushing fails: two aborts in total and failing reads on both.
+     */
+    @Test
+    public void shouldZeroMemoryAfterConcurrentIndexFailures() throws Throwable
+    {
+        String table1 = createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String table2 = createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+
+        assertEquals(expectedBytesLimit(), getSegmentBufferSpaceLimit());
+        assertEquals("Segment buffer memory tracker should start at zero!", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        for (String table : Arrays.asList(table1, table2))
+        {
+            String insertPrefix = "INSERT INTO " + KEYSPACE + "." + table + "(id1, v1, v2) VALUES ";
+            execute(insertPrefix + "('0', 0, '0')");
+            flush(KEYSPACE, table);
+            execute(insertPrefix + "('1', 1, '1')");
+            flush(KEYSPACE, table);
+        }
+
+        // Start compaction against both tables/indexes and verify that they are aborted safely:
+        verifyCompactionIndexBuilds(2, segmentFlushFailure, table1, table2);
+
+        for (String table : Arrays.asList(table1, table2))
+            assertThatThrownBy(() -> executeNet(String.format("SELECT * FROM %s WHERE v1 = 0", KEYSPACE + "." + table)))
+                    .isInstanceOf(ReadFailureException.class);
+    }
+
+    /**
+     * With only the KD-tree flush failing, the v1 index on the first table must fail reads while the v2 index on the second still answers.
+     */
+    @Test
+    public void shouldLeaveOnlyFailedIndexNonQueryable() throws Throwable
+    {
+        String table1 = createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String table2 = createTable(CREATE_TABLE_TEMPLATE);
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
+
+        assertEquals(expectedBytesLimit(), getSegmentBufferSpaceLimit());
+        assertEquals("Segment buffer memory tracker should start at zero!", 0L, getSegmentBufferUsedBytes());
+        assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+
+        for (String table : Arrays.asList(table1, table2))
+        {
+            String insertPrefix = "INSERT INTO " + KEYSPACE + "." + table + "(id1, v1, v2) VALUES ";
+            execute(insertPrefix + "('0', 0, '0')");
+            flush(KEYSPACE, table);
+            execute(insertPrefix + "('1', 1, '1')");
+            flush(KEYSPACE, table);
+        }
+
+        // Start compaction against both tables/indexes, and verify only the numeric index is aborted:
+        verifyCompactionIndexBuilds(1, kdTreeSegmentFlushFailure, table1, table2);
+
+        assertThatThrownBy(() -> executeNet(String.format("SELECT * FROM %s WHERE v1 = 0", KEYSPACE + "." + table1))).isInstanceOf(ReadFailureException.class);
+
+        assertEquals(1, executeNet(String.format("SELECT * FROM %s WHERE v2 = '0'", KEYSPACE + "." + table2)).all().size());
+    }
+
+    /** Compacts {@code tables} with {@code failure} enabled, checks the abort count and zeroed tracker, then disables the failure. */
+    private void verifyCompactionIndexBuilds(int aborts, Injection failure, String... tables) throws Throwable
+    {
+        Injections.inject(failure);
+        failure.enable();
+        try
+        {
+            for (String table : tables)
+                compact(KEYSPACE, table);
+
+            Assert.assertEquals(aborts, writerAbortCounter.get());
+            assertEquals("Global memory tracker should have reverted to zero.", 0L, getSegmentBufferUsedBytes());
+            assertEquals("There should be no segment builders in progress.", 0L, getColumnIndexBuildsInProgress());
+        }
+        finally
+        {
+            failure.disable();
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/StateMetricsTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/StateMetricsTest.java
new file mode 100644
index 000000000000..4bd2db0ee73f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/StateMetricsTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import javax.management.InstanceNotFoundException;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import com.datastax.driver.core.ResultSet;
+
+import static org.apache.cassandra.index.sai.metrics.TableStateMetrics.TABLE_STATE_METRIC_TYPE;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+
+public class StateMetricsTest extends AbstractMetricsTest
+{
+    private static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s.%s (id1 TEXT PRIMARY KEY, v1 INT, v2 TEXT) WITH compaction = " +
+                                                        "{'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }"; // placeholders: keyspace, table
+    private static final String CREATE_INDEX_TEMPLATE = "CREATE CUSTOM INDEX IF NOT EXISTS %s ON %s.%s(%s) USING 'StorageAttachedIndex'"; // placeholders: index, keyspace, table, column
+    // NOTE(review): no test in this class references this rule; exceptions are checked with assertThatThrownBy instead.
+    @Rule
+    public ExpectedException exception = ExpectedException.none();
+
+    /** Table-level state metrics must survive dropping the table's last index and vanish only with the table. */
+    @Test
+    public void testMetricRelease() throws Throwable
+    {
+        String table = "test_metric_release";
+        String index = "test_metric_release_index";
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+        String qualifiedTable = keyspace + "." + table;
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        createIndex(String.format(CREATE_INDEX_TEMPLATE, index, keyspace, table, "v1"));
+        waitForIndexQueryable(keyspace, table);
+
+        // A single row is enough for the index-backed query below to return a result.
+        execute("INSERT INTO " + qualifiedTable + " (id1, v1, v2) VALUES ('0', 0, '0')");
+        assertEquals(1, executeNet("SELECT id1 FROM " + qualifiedTable + " WHERE v1 = 0").all().size());
+        assertEquals(1L, getTableStateMetrics(keyspace, table, "TotalIndexCount"));
+
+        // Dropping the last index must leave the table-level state metrics registered:
+        dropIndex(String.format("DROP INDEX %s." + index, keyspace));
+        assertEquals(0L, getTableStateMetrics(keyspace, table, "TotalIndexCount"));
+
+        // Only once the table itself is gone should the metric lookup fail:
+        dropTable(String.format("DROP TABLE %s." + table, keyspace));
+        assertThatThrownBy(() -> getTableStateMetrics(keyspace, table, "TotalIndexCount")).hasCauseInstanceOf(InstanceNotFoundException.class);
+    }
+
+    /**
+     * Creates indexes on v1 and v2 of one table and checks the table-level state metrics after a flush and a query.
+     */
+    @Test
+    public void testMetricCreation() throws Throwable
+    {
+        String table = "test_table";
+        String index = "test_index";
+        String keyspace = createKeyspace(CREATE_KEYSPACE_TEMPLATE);
+        String qualifiedTable = keyspace + "." + table;
+
+        createTable(String.format(CREATE_TABLE_TEMPLATE, keyspace, table));
+        for (String column : new String[]{ "v1", "v2" })
+            createIndex(String.format(CREATE_INDEX_TEMPLATE, index + "_" + column, keyspace, table, column));
+        waitForIndexQueryable(keyspace, table);
+
+        // Four rows: id1 = 'i', v1 = i, v2 = 'i' for i in 0..3.
+        for (int i = 0; i < 4; i++)
+            execute("INSERT INTO " + qualifiedTable + " (id1, v1, v2) VALUES ('" + i + "', " + i + ", '" + i + "')");
+        flush(keyspace, table);
+
+        int actualRows = executeNet("SELECT id1, v1, v2 FROM " + qualifiedTable + " WHERE v1 >= 0").all().size();
+        assertEquals(4, actualRows);
+
+        waitForGreaterThanZero(objectNameNoIndex("DiskPercentageOfBaseTable", keyspace, table, TABLE_STATE_METRIC_TYPE));
+        waitForGreaterThanZero(objectNameNoIndex("DiskUsedBytes", keyspace, table, TABLE_STATE_METRIC_TYPE));
+        waitForEquals(objectNameNoIndex("TotalIndexCount", keyspace, table, TABLE_STATE_METRIC_TYPE), 2);
+        waitForEquals(objectNameNoIndex("TotalIndexBuildsInProgress", keyspace, table, TABLE_STATE_METRIC_TYPE), 0);
+        waitForEquals(objectNameNoIndex("TotalQueryableIndexCount", keyspace, table, TABLE_STATE_METRIC_TYPE), 2);
+    }
+
+    private int getTableStateMetrics(String keyspace, String table, String metricsName) throws Exception
+    {
+        return (int) getMetricValue(objectNameNoIndex(metricsName, keyspace, table, TABLE_STATE_METRIC_TYPE)); // table-level state metric, looked up by name and cast to int
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/TinySegmentFlushingFailureTest.java b/test/unit/org/apache/cassandra/index/sai/metrics/TinySegmentFlushingFailureTest.java
new file mode 100644
index 000000000000..669807c9a5dc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/TinySegmentFlushingFailureTest.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.metrics;
+
+import org.junit.Before;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+/** Runs the inherited segment flushing failure tests with the SAI segment write buffer space set to zero. */
+public class TinySegmentFlushingFailureTest extends SegmentFlushingFailureTest
+{
+    @Before
+    public void setSegmentBufferSpace()
+    {
+        DatabaseDescriptor.setSAISegmentWriteBufferSpace(0);
+    }
+
+    @Override
+    protected long expectedBytesLimit()
+    {
+        return 0L;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/plan/ExpressionTest.java b/test/unit/org/apache/cassandra/index/sai/plan/ExpressionTest.java
new file mode 100644
index 000000000000..f34f5325f1a4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/plan/ExpressionTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+/** Unit tests for the equals/hashCode behaviour of {@link Expression.Bound}. */
+public class ExpressionTest
+{
+    @Test
+    public void testBoundHashCode()
+    {
+        ByteBuffer firstValue = UTF8Type.instance.decompose("blah");
+        Expression.Bound first = new Expression.Bound(firstValue, UTF8Type.instance, true);
+        ByteBuffer secondValue = UTF8Type.instance.decompose("blah");
+        Expression.Bound second = new Expression.Bound(secondValue, UTF8Type.instance, true);
+        assertEquals(first, second);
+        assertEquals(first.hashCode(), second.hashCode());
+    }
+
+    @Test
+    public void testNotMatchingBoundHashCode()
+    {
+        ByteBuffer firstValue = UTF8Type.instance.decompose("blah");
+        Expression.Bound first = new Expression.Bound(firstValue, UTF8Type.instance, true);
+        ByteBuffer secondValue = UTF8Type.instance.decompose("blah2");
+        Expression.Bound second = new Expression.Bound(secondValue, UTF8Type.instance, true);
+        assertNotEquals(first, second);
+        assertNotEquals(first.hashCode(), second.hashCode()); // NOTE(review): unequal objects may legally share a hash code; this relies on no collision for these two values
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java b/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
new file mode 100644
index 000000000000..a6de29535aaf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
@@ -0,0 +1,713 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.plan;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ListMultimap;
+import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.DoubleType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.rows.BTreeRow;
+import org.apache.cassandra.db.rows.BufferCell;
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.index.sai.IndexingSchemaLoader;
+import org.apache.cassandra.index.sai.QueryContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.db.marshal.Int32Type.instance;
+
+public class OperationTest extends IndexingSchemaLoader
+{
+    private static final String KS_NAME = "sai"; // keyspace created in loadSchema()
+    private static final String CF_NAME = "test_cf";
+    private static final String CLUSTERING_CF_NAME = "clustering_test_cf";
+    private static final String STATIC_CF_NAME = "static_ndi_test_cf";
+    // Stores for the three tables named above; assigned once in loadSchema().
+    private static ColumnFamilyStore BACKEND;
+    private static ColumnFamilyStore CLUSTERING_BACKEND;
+    private static ColumnFamilyStore STATIC_BACKEND;
+
+    // Query controllers, one per store; re-created before every test in beforeTest().
+    private QueryController controller;
+    private QueryController controllerClustering;
+    private QueryController controllerStatic;
+
+
+    /** Points cassandra.config at cassandra-murmur.yaml, loads the schema, creates the test keyspace and caches its stores. */
+    @BeforeClass
+    public static void loadSchema() throws ConfigurationException
+    {
+        System.setProperty("cassandra.config", "cassandra-murmur.yaml");
+        IndexingSchemaLoader.loadSchema();
+
+        IndexingSchemaLoader.createKeyspace(KS_NAME, KeyspaceParams.simpleTransient(1),
+                                            IndexingSchemaLoader.ndiCFMD(KS_NAME, CF_NAME),
+                                            IndexingSchemaLoader.clusteringNDICFMD(KS_NAME, CLUSTERING_CF_NAME),
+                                            IndexingSchemaLoader.staticNDICFMD(KS_NAME, STATIC_CF_NAME));
+
+        final Keyspace keyspace = Keyspace.open(KS_NAME);
+        BACKEND = keyspace.getColumnFamilyStore(CF_NAME);
+        CLUSTERING_BACKEND = keyspace.getColumnFamilyStore(CLUSTERING_CF_NAME);
+        STATIC_BACKEND = keyspace.getColumnFamilyStore(STATIC_CF_NAME);
+    }
+
+
+    /** Builds, for each store, an all-data range read over its metadata and a fresh QueryController around it. */
+    @Before
+    public void beforeTest()
+    {
+        // every controller gets its own read command and its own QueryContext
+        final ReadCommand plainRead = PartitionRangeReadCommand.allDataRead(BACKEND.metadata(), FBUtilities.nowInSeconds());
+        controller = new QueryController(BACKEND, plainRead, Collections.emptyList(), new QueryContext(), null);
+        final ReadCommand clusteringRead = PartitionRangeReadCommand.allDataRead(CLUSTERING_BACKEND.metadata(), FBUtilities.nowInSeconds());
+        controllerClustering = new QueryController(CLUSTERING_BACKEND, clusteringRead, Collections.emptyList(), new QueryContext(), null);
+        final ReadCommand staticRead = PartitionRangeReadCommand.allDataRead(STATIC_BACKEND.metadata(), FBUtilities.nowInSeconds());
+        controllerStatic = new QueryController(STATIC_BACKEND, staticRead, Collections.emptyList(), new QueryContext(), null);
+    }
+
+    @After
+    public void afterTest() // no-op: nothing is torn down after a test
+    {
+    }
+
+    /** Verifies how Operation.analyzeGroup merges (AND) or keeps apart (OR) the simple expressions of a group. */
+    @Test
+    public void testAnalyze()
+    {
+        final ColumnMetadata firstName = getColumn(UTF8Type.instance.decompose("first_name"));
+        final ColumnMetadata age = getColumn(UTF8Type.instance.decompose("age"));
+        final ColumnMetadata comment = getColumn(UTF8Type.instance.decompose("comment"));
+        // age != 5 AND age > 1 AND age != 6 AND age <= 10
+        Map<Expression.Op, Expression> expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.AND,
+                                                                                    Arrays.asList(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)),
+                                                                                              new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(1)),
+                                                                                              new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(6)),
+                                                                                              new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(10)))));
+
+        Expression expected = new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+        {{
+            operation = Op.RANGE;
+            lower = new Bound(Int32Type.instance.decompose(1), Int32Type.instance, false);
+            upper = new Bound(Int32Type.instance.decompose(10), Int32Type.instance, true);
+
+            exclusions.add(Int32Type.instance.decompose(5));
+            exclusions.add(Int32Type.instance.decompose(6));
+        }};
+
+        Assert.assertEquals(1, expressions.size());
+        Assert.assertEquals(expected, expressions.get(Expression.Op.RANGE));
+
+        // age != 5 OR age >= 7
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                                     Arrays.asList(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)),
+                                                                  new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(7)))));
+        Assert.assertEquals(2, expressions.size());
+
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                            {{
+                                    operation = Op.NOT_EQ;
+                                    lower = new Bound(Int32Type.instance.decompose(5), Int32Type.instance, true);
+                                    upper = lower;
+                            }}, expressions.get(Expression.Op.NOT_EQ));
+
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                            {{
+                                    operation = Op.RANGE;
+                                    lower = new Bound(Int32Type.instance.decompose(7), Int32Type.instance, true);
+                            }}, expressions.get(Expression.Op.RANGE));
+
+        // age != 5 OR age < 7
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                                     Arrays.asList(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)),
+                                                                  new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(7)))));
+
+        Assert.assertEquals(2, expressions.size());
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                            {{
+                                    operation = Op.RANGE;
+                                    upper = new Bound(Int32Type.instance.decompose(7), Int32Type.instance, false);
+                            }}, expressions.get(Expression.Op.RANGE));
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                            {{
+                                    operation = Op.NOT_EQ;
+                                    lower = new Bound(Int32Type.instance.decompose(5), Int32Type.instance, true);
+                                    upper = lower;
+                            }}, expressions.get(Expression.Op.NOT_EQ));
+
+        // age > 1 AND age < 7
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.AND,
+                                                     Arrays.asList(new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(1)),
+                                                                  new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(7)))));
+
+        Assert.assertEquals(1, expressions.size());
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                            {{
+                                    operation = Op.RANGE;
+                                    lower = new Bound(Int32Type.instance.decompose(1), Int32Type.instance, false);
+                                    upper = new Bound(Int32Type.instance.decompose(7), Int32Type.instance, false);
+                            }}, expressions.get(Expression.Op.RANGE));
+
+        // first_name = 'a' OR first_name != 'b'
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                                     Arrays.asList(new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("a")),
+                                                                  new SimpleExpression(firstName, Operator.NEQ, UTF8Type.instance.decompose("b")))));
+
+        Assert.assertEquals(2, expressions.size());
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("first_name", UTF8Type.instance))
+                            {{
+                                    operation = Op.NOT_EQ;
+                                    lower = new Bound(UTF8Type.instance.decompose("b"), UTF8Type.instance, true);
+                                    upper = lower;
+                            }}, expressions.get(Expression.Op.NOT_EQ));
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("first_name", UTF8Type.instance))
+                            {{
+                                    operation = Op.EQ;
+                                    lower = upper = new Bound(UTF8Type.instance.decompose("a"), UTF8Type.instance, true);
+                            }}, expressions.get(Expression.Op.EQ));
+
+        // comment LIKE_MATCHES 'soft eng' OR comment != 'likes do'
+        ListMultimap<ColumnMetadata, Expression> e = Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                                                            Arrays.asList(new SimpleExpression(comment, Operator.LIKE_MATCHES, UTF8Type.instance.decompose("soft eng")),
+                                                                  new SimpleExpression(comment, Operator.NEQ, UTF8Type.instance.decompose("likes do"))));
+
+        List<Expression> expectedExpressions = new ArrayList<Expression>(2)
+        {{
+                add(new Expression(SAITester.createColumnContext("comment", UTF8Type.instance))
+                {{
+                        operation = Op.MATCH;
+                        lower = new Bound(UTF8Type.instance.decompose("soft eng"), UTF8Type.instance, true);
+                        upper = lower;
+                }});
+
+                add(new Expression(SAITester.createColumnContext("comment", UTF8Type.instance))
+                {{
+                        operation = Op.NOT_EQ;
+                        lower = new Bound(UTF8Type.instance.decompose("likes do"), UTF8Type.instance, true);
+                        upper = lower;
+                }});
+        }};
+
+        Assert.assertEquals(expectedExpressions, e.get(comment));
+
+        // comment != 'likes do' OR first_name = 'j' (only the comment column is checked)
+        e = Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                   Arrays.asList(new SimpleExpression(comment, Operator.NEQ, UTF8Type.instance.decompose("likes do")),
+                                      new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("j"))));
+
+        expectedExpressions = new ArrayList<Expression>(2)
+        {{
+                add(new Expression(SAITester.createColumnContext("comment", UTF8Type.instance))
+                {{
+                        operation = Op.NOT_EQ;
+                        lower = new Bound(UTF8Type.instance.decompose("likes do"), UTF8Type.instance, true);
+                        upper = lower;
+                }});
+        }};
+
+        Assert.assertEquals(expectedExpressions, e.get(comment));
+
+        // age != 27 OR first_name = 'j' OR age != 25 (only the age column is checked)
+        e = Operation.analyzeGroup(controller, Operation.OperationType.OR,
+                                   Arrays.asList(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(27)),
+                                      new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("j")),
+                                      new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(25))));
+
+        expectedExpressions = new ArrayList<Expression>(2)
+        {{
+                add(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                {{
+                        operation = Op.NOT_EQ;
+                        lower = new Bound(Int32Type.instance.decompose(27), Int32Type.instance, true);
+                        upper = lower;
+                }});
+
+                add(new Expression(SAITester.createColumnContext("age", Int32Type.instance))
+                {{
+                        operation = Op.NOT_EQ;
+                        lower = new Bound(Int32Type.instance.decompose(25), Int32Type.instance, true);
+                        upper = lower;
+                }});
+        }};
+
+        Assert.assertEquals(expectedExpressions, e.get(age));
+    }
+
+    /** Exercises Operation.satisfiedBy on hand-built rows: single predicates, ranges with exclusions, AND/OR groups, a nested right child, and null, deleted or missing data. */
+    @Test
+    public void testSatisfiedBy()
+    {
+        final ColumnMetadata timestamp = getColumn(UTF8Type.instance.decompose("timestamp"));
+        final ColumnMetadata age = getColumn(UTF8Type.instance.decompose("age"));
+        // a single predicate first: age != 5
+        Operation.Builder builder = new Operation.Builder(Operation.OperationType.AND, controller, new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)));
+        Operation op = builder.complete();
+
+        DecoratedKey key = buildKey("0");
+        Unfiltered row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()));
+        Row staticRow = buildRow(Clustering.STATIC_CLUSTERING);
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()));
+
+        // and reject incorrect value
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()));
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        // range with exclusions - age != 5 AND age > 1 AND age != 6 AND age <= 10
+        builder = new Operation.Builder(Operation.OperationType.AND, controller,
+                                        new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)),
+                                        new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(1)),
+                                        new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(6)),
+                                        new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(10)));
+        op = builder.complete();
+        // every i listed in exclusions must be rejected by the operation, every other i accepted
+        Set<Integer> exclusions = Sets.newHashSet(0, 1, 5, 6, 11);
+        for (int i = 0; i <= 11; i++)
+        {
+            row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
+
+            boolean result = op.satisfiedBy(key, row, staticRow);
+            Assert.assertTrue(exclusions.contains(i) != result);
+        }
+
+        // now let's do something more complex - age = 5 OR age = 6
+        builder = new Operation.Builder(Operation.OperationType.OR, controller,
+                                        new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5)),
+                                        new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(6)));
+
+        op = builder.complete();
+
+        exclusions = Sets.newHashSet(0, 1, 2, 3, 4, 7, 8, 9, 10);
+        for (int i = 0; i <= 10; i++)
+        {
+            row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
+
+            boolean result = op.satisfiedBy(key, row, staticRow);
+            Assert.assertTrue(exclusions.contains(i) != result);
+        }
+
+        // now let's test aggregated AND commands
+        builder = new Operation.Builder(Operation.OperationType.AND, controller);
+
+        // logical should be ignored by analyzer, but we still want to make sure that it is
+        //IndexExpression logical = new IndexExpression(ByteBufferUtil.EMPTY_BYTE_BUFFER, IndexOperator.EQ, ByteBufferUtil.EMPTY_BYTE_BUFFER);
+        //logical.setLogicalOp(LogicalIndexOperator.AND);
+        //builder.add(logical);
+        builder.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(0)));
+        builder.add(new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10)));
+        builder.add(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(7)));
+
+        op = builder.complete();
+
+        exclusions = Sets.newHashSet(7);
+        for (int i = 0; i < 10; i++)
+        {
+            row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
+
+            boolean result = op.satisfiedBy(key, row, staticRow);
+            Assert.assertTrue(exclusions.contains(i) != result);
+        }
+
+        // multiple analyzed expressions in the Operation timestamp >= 10 AND age = 5
+        builder = new Operation.Builder(Operation.OperationType.AND, controller);
+        builder.add(new SimpleExpression(timestamp, Operator.GTE, LongType.instance.decompose(10L)));
+        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5)));
+
+        op = builder.complete();
+
+        row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(11L), System.currentTimeMillis()));
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(22L), System.currentTimeMillis()));
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        // operation with internal expressions and right child
+        builder = new Operation.Builder(Operation.OperationType.OR, controller,
+                                        new SimpleExpression(timestamp, Operator.GT, LongType.instance.decompose(10L)));
+        builder.setRight(new Operation.Builder(Operation.OperationType.AND, controller,
+                                               new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(0)),
+                                               new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10))));
+        op = builder.complete();
+
+        row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(20), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(11L), System.currentTimeMillis()));
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(buildCell(age, instance.decompose(0), System.currentTimeMillis()),
+                                  buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        // and for dessert let's try out null and deleted rows etc.
+        builder = new Operation.Builder(Operation.OperationType.AND, controller);
+        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(30)));
+        op = builder.complete();
+
+        Assert.assertFalse(op.satisfiedBy(key, null, staticRow));
+        Assert.assertFalse(op.satisfiedBy(key, row, null));
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        long now = System.currentTimeMillis();
+
+        row = OperationTest.buildRow(
+        Row.Deletion.regular(new DeletionTime(now - 10, (int) (now / 1000))),
+        buildCell(age, instance.decompose(6), System.currentTimeMillis()));
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        row = buildRow(deletedCell(age, System.currentTimeMillis(), FBUtilities.nowInSeconds()));
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        try
+        {
+            Assert.assertFalse(op.satisfiedBy(key, buildRow(), staticRow));
+        }
+        catch (IllegalStateException e)
+        {
+            Assert.fail("IllegalStateException should not be thrown when missing column");
+        }
+    }
+
+    /** Analyzes AND groups that mix a first_name predicate with height predicates and checks the expression built for height. */
+    @Test
+    public void testAnalyzeNotIndexedButDefinedColumn()
+    {
+        final ColumnMetadata firstName = getColumn(UTF8Type.instance.decompose("first_name"));
+        final ColumnMetadata height = getColumn(UTF8Type.instance.decompose("height"));
+        // first_name = 'a' AND height != 5
+        Map<Expression.Op, Expression> expressions;
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.AND,
+                                                     Arrays.asList(new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("a")),
+                              new SimpleExpression(height, Operator.NEQ, Int32Type.instance.decompose(5)))));
+
+        Assert.assertEquals(2, expressions.size());
+
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("height", Int32Type.instance))
+        {{
+                operation = Op.NOT_EQ;
+                lower = new Bound(Int32Type.instance.decompose(5), Int32Type.instance, true);
+                upper = lower;
+        }}, expressions.get(Expression.Op.NOT_EQ));
+        // first_name = 'a' AND height > 0 AND height != 5
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.AND,
+                                                     Arrays.asList(new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("a")),
+                              new SimpleExpression(height, Operator.GT, Int32Type.instance.decompose(0)),
+                              new SimpleExpression(height, Operator.NEQ, Int32Type.instance.decompose(5)))));
+
+        Assert.assertEquals(2, expressions.size());
+
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("height", Int32Type.instance))
+        {{
+            operation = Op.RANGE;
+            lower = new Bound(Int32Type.instance.decompose(0), Int32Type.instance, false);
+            exclusions.add(Int32Type.instance.decompose(5));
+        }}, expressions.get(Expression.Op.RANGE));
+        // first_name = 'a' AND height != 5 AND height >= 0 AND height < 10
+        expressions = convert(Operation.analyzeGroup(controller, Operation.OperationType.AND,
+                                                     Arrays.asList(new SimpleExpression(firstName, Operator.EQ, UTF8Type.instance.decompose("a")),
+                              new SimpleExpression(height, Operator.NEQ, Int32Type.instance.decompose(5)),
+                              new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(0)),
+                              new SimpleExpression(height, Operator.LT, Int32Type.instance.decompose(10)))));
+
+        Assert.assertEquals(2, expressions.size());
+
+        Assert.assertEquals(new Expression(SAITester.createColumnContext("height", Int32Type.instance))
+        {{
+                operation = Op.RANGE;
+                lower = new Bound(Int32Type.instance.decompose(0), Int32Type.instance, true);
+                upper = new Bound(Int32Type.instance.decompose(10), Int32Type.instance, false);
+                exclusions.add(Int32Type.instance.decompose(5));
+        }}, expressions.get(Expression.Op.RANGE));
+    }
+
+    /** Runs satisfiedBy over one row with clustering ("US", 27), height 182 and score 1.0, using AND groups over location, age, height and score. */
+    @Test
+    public void testSatisfiedByWithClustering()
+    {
+        ColumnMetadata location = getColumn(CLUSTERING_BACKEND, UTF8Type.instance.decompose("location"));
+        ColumnMetadata age = getColumn(CLUSTERING_BACKEND, UTF8Type.instance.decompose("age"));
+        ColumnMetadata height = getColumn(CLUSTERING_BACKEND, UTF8Type.instance.decompose("height"));
+        ColumnMetadata score = getColumn(CLUSTERING_BACKEND, UTF8Type.instance.decompose("score"));
+
+        DecoratedKey partitionKey = buildKey(CLUSTERING_BACKEND, "0");
+        Unfiltered clusteredRow = buildRow(Clustering.make(UTF8Type.instance.fromString("US"), Int32Type.instance.decompose(27)),
+                                           buildCell(height, Int32Type.instance.decompose(182), System.currentTimeMillis()),
+                                           buildCell(score, DoubleType.instance.decompose(1.0d), System.currentTimeMillis()));
+        Row noStatics = buildRow(Clustering.STATIC_CLUSTERING);
+
+        // age = 27 AND height = 182
+        Operation.Builder query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(27)));
+        query.add(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182)));
+        Assert.assertTrue(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // age = 28 AND height = 182
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(28)));
+        query.add(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182)));
+        Assert.assertFalse(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // location = 'US' AND age >= 27
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
+        query.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(27)));
+        Assert.assertTrue(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // location = 'BY' AND age >= 28
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("BY")));
+        query.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(28)));
+        Assert.assertFalse(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // location = 'US' AND age <= 27 AND height >= 182
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
+        query.add(new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(27)));
+        query.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
+        Assert.assertTrue(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // location = 'US' AND height >= 182 AND score = 1.0
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
+        query.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
+        query.add(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d)));
+        Assert.assertTrue(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+
+        // height >= 182 AND score = 1.0
+        query = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        query.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
+        query.add(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d)));
+        Assert.assertTrue(query.complete().satisfiedBy(partitionKey, clusteredRow, noStatics));
+    }
+
+    /** Re-keys the analyzed expressions by operation, asserting (when assertions are enabled) that no operation occurs twice. */
+    private Map<Expression.Op, Expression> convert(Multimap<ColumnMetadata, Expression> expressions)
+    {
+        final Map<Expression.Op, Expression> byOperation = new HashMap<>();
+        for (Expression candidate : expressions.values())
+        {
+            final Expression.Op op = candidate.getOp();
+            assert !byOperation.containsKey(op); // sanity check
+            byOperation.put(op, candidate);
+        }
+        return byOperation;
+    }
+
+    @Test
+    public void testSatisfiedByWithStatic()
+    {
+        final ColumnMetadata sensorType = getColumn(STATIC_BACKEND, UTF8Type.instance.decompose("sensor_type"));
+        final ColumnMetadata value = getColumn(STATIC_BACKEND, UTF8Type.instance.decompose("value"));
+
+        DecoratedKey key = buildKey(STATIC_BACKEND, 0);
+        Unfiltered row = buildRow(Clustering.make(UTF8Type.instance.fromString("date"), LongType.instance.decompose(20160401L)),
+                                  buildCell(value, DoubleType.instance.decompose(24.56), System.currentTimeMillis()));
+        Row staticRow = buildRow(Clustering.STATIC_CLUSTERING,
+                                 buildCell(sensorType, UTF8Type.instance.decompose("TEMPERATURE"), System.currentTimeMillis()));
+
+        // sensor_type ='TEMPERATURE' AND value = 24.56
+        Operation op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
+                                             new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
+                                             new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        // sensor_type ='TEMPERATURE' AND value = 30
+        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
+                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
+                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))).complete();
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        // sensor_type ='TEMPERATURE' OR value = 24.56 (NOTE(review): 'PRESSURE' was probably intended here - confirm)
+        op = new Operation.Builder(Operation.OperationType.OR, controllerStatic,
+                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
+                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        // sensor_type ='PRESSURE' AND value = 30 (NOTE(review): the builder uses AND although OR was probably intended - confirm)
+        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
+                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE")),
+                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))).complete();
+
+        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+
+        // (sensor_type = 'TEMPERATURE' OR sensor_type = 'PRESSURE') AND value = 24.56
+        op = new Operation.Builder(Operation.OperationType.OR, controllerStatic,
+                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
+                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE")))
+             .setRight(new Operation.Builder(Operation.OperationType.AND, controllerStatic,
+                                             new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56)))).complete();
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+
+        // sensor_type LIKE 'TEMP%' AND value = 24.56
+        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
+                                   new SimpleExpression(sensorType, Operator.LIKE_PREFIX, UTF8Type.instance.decompose("TEMP")),
+                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+
+        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+    }
+
+    private static class SimpleExpression extends RowFilter.Expression
+    {
+        SimpleExpression(ColumnMetadata column, Operator operator, ByteBuffer value)
+        {
+            super(column, operator, value);
+        }
+
+        @Override
+        public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey, Row row)
+        {
+            throw new UnsupportedOperationException(); // this stub does not support row-level evaluation
+        }
+
+        @Override
+        public Kind kind()
+        {
+            return Kind.SIMPLE;
+        }
+    }
+
+    private static DecoratedKey buildKey(Object... key) {
+        return buildKey(BACKEND, key); // delegates to the two-argument overload with BACKEND as the table
+    }
+
+    private static DecoratedKey buildKey(ColumnFamilyStore cfs, Object... key) {
+        AbstractType<?> keyType = cfs.metadata().partitionKeyType;
+        boolean composite = keyType instanceof CompositeType;
+
+        // A composite key needs one component per sub-type; a simple key exactly one.
+        Preconditions.checkArgument(key.length == (composite ? keyType.subTypes().size() : 1));
+
+        ByteBuffer serialized;
+        if (composite)
+            serialized = ((CompositeType) keyType).decompose(key);
+        else
+            serialized = ((AbstractType) keyType).decompose(key[0]);
+
+        return Murmur3Partitioner.instance.decorateKey(serialized);
+    }
+
+    private static Unfiltered buildRow(Cell... cells)
+    {
+        return buildRow(Clustering.EMPTY, null, cells); // empty clustering, no row deletion
+    }
+
+    private static Row buildRow(Row.Deletion deletion, Cell... cells)
+    {
+        return buildRow(Clustering.EMPTY, deletion, cells); // empty clustering with the given row deletion
+    }
+
+    private static Row buildRow(Clustering clustering, Cell... cells)
+    {
+        return buildRow(clustering, null, cells); // null deletion: no row deletion is added
+    }
+
+    private static Row buildRow(Clustering clustering, Row.Deletion deletion, Cell... cells)
+    {
+        Row.Builder builder = BTreeRow.sortedBuilder();
+        builder.newRow(clustering);
+        for (Cell cell : cells)
+            builder.addCell(cell);
+
+        // A null deletion means the row carries no row-level deletion.
+        if (null != deletion)
+            builder.addRowDeletion(deletion);
+        return builder.build();
+    }
+
+    private static Cell buildCell(ColumnMetadata column, ByteBuffer value, long timestamp)
+    {
+        return BufferCell.live(column, timestamp, value); // note: live() takes timestamp before value
+    }
+
+    private static Cell deletedCell(ColumnMetadata column, long timestamp, int nowInSeconds)
+    {
+        return BufferCell.tombstone(column, timestamp, nowInSeconds); // NOTE(review): nowInSeconds presumably is the local deletion time - confirm
+    }
+
+    private static ColumnMetadata getColumn(ByteBuffer name)
+    {
+        return getColumn(BACKEND, name); // looks the column up in BACKEND's metadata
+    }
+
+    private static ColumnMetadata getColumn(ColumnFamilyStore cfs, ByteBuffer name)
+    {
+        return cfs.metadata().getColumn(name); // NOTE(review): presumably null when no such column exists - confirm
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/AbstractRangeIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/AbstractRangeIteratorTest.java
new file mode 100644
index 000000000000..ba2328ea41f9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/AbstractRangeIteratorTest.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.Arrays;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class AbstractRangeIteratorTest extends NdiRandomizedTest
+{
+    protected long[] arr(long... longArray)
+    {
+        return longArray;
+    }
+
+    protected long[] arr(int... intArray)
+    {
+        return Arrays.stream(intArray).asLongStream().toArray();
+    }
+
+    void assertOnError(RangeIterator range)
+    {
+        assertThatThrownBy(() -> LongIterator.convert(range)).isInstanceOf(RuntimeException.class);
+    }
+
+    final RangeIterator buildIntersection(RangeIterator... ranges)
+    {
+        return RangeIntersectionIterator.builder().add(Arrays.asList(ranges)).build();
+    }
+
+    final RangeIterator buildSelectiveIntersection(int limit, RangeIterator... ranges)
+    {
+        return RangeIntersectionIterator.selectiveBuilder(limit).add(Arrays.asList(ranges)).build();
+    }
+
+    final RangeIterator buildIntersection(long[]... ranges)
+    {
+        return buildIntersection(toRangeIterator(ranges));
+    }
+
+    final RangeIterator buildSelectiveIntersection(int limit, long[]... ranges)
+    {
+        return buildSelectiveIntersection(limit, toRangeIterator(ranges));
+    }
+
+    final RangeIterator buildUnion(RangeIterator... ranges)
+    {
+        return RangeUnionIterator.builder().add(Arrays.asList(ranges)).build();
+    }
+
+    final RangeIterator buildUnion(long[]... ranges)
+    {
+        return buildUnion(toRangeIterator(ranges));
+    }
+
+    final RangeIterator buildConcat(RangeIterator... ranges)
+    {
+        return RangeConcatIterator.builder().add(Arrays.asList(ranges)).build();
+    }
+
+    final RangeIterator buildConcat(long[]... ranges)
+    {
+        return buildConcat(toRangeIterator(ranges));
+    }
+
+    private RangeIterator[] toRangeIterator(long[]... ranges)
+    {
+        RangeIterator[] iterators = new RangeIterator[ranges.length];
+        for (int i = 0; i < ranges.length; i++)
+            iterators[i] = build(ranges[i]);
+        return iterators;
+    }
+
+    protected LongIterator build(long... tokens)
+    {
+        return build(tokens, false);
+    }
+
+    protected LongIterator build(long[] tokensA, boolean onErrorA)
+    {
+        // throwsException() arms the iterator and hands back the same instance.
+        LongIterator iterator = new LongIterator(tokensA);
+        return onErrorA ? iterator.throwsException() : iterator;
+    }
+
+    protected RangeIterator buildOnError(RangeIterator.Builder.IteratorType type, long[] tokensA, long[] tokensB)
+    {
+        return build(type, tokensA, true, tokensB, true);
+    }
+
+    protected RangeIterator buildOnErrorA(RangeIterator.Builder.IteratorType type, long[] tokensA, long[] tokensB)
+    {
+        return build(type, tokensA, true, tokensB, false);
+    }
+
+    protected RangeIterator buildOnErrorB(RangeIterator.Builder.IteratorType type, long[] tokensA, long[] tokensB)
+    {
+        return build(type, tokensA, false, tokensB, true);
+    }
+
+    protected RangeIterator build(RangeIterator.Builder.IteratorType type, long[] tokensA, long[] tokensB)
+    {
+        return build(type, tokensA, false, tokensB, false);
+    }
+
+    protected RangeIterator build(RangeIterator.Builder.IteratorType type, long[] tokensA, boolean onErrorA, long[] tokensB, boolean onErrorB)
+    {
+        RangeIterator first = build(tokensA, onErrorA);
+        RangeIterator second = build(tokensB, onErrorB);
+
+        switch (type)
+        {
+            case INTERSECTION:
+                return buildIntersection(first, second);
+            case UNION:
+                return buildUnion(first, second);
+            case CONCAT:
+                return buildConcat(first, second);
+            default:
+                throw new IllegalArgumentException("unknown type: " + type);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingList.java b/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingList.java
new file mode 100644
index 000000000000..96fda022ff49
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingList.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import com.google.common.base.MoreObjects;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.v1.OrdinalPostingList;
+
+//TODO Change this whole lot to use longs
+/**
+ * Test-only posting list over an in-memory {@code int[]}; {@code idx} counts the postings consumed so far.
+ */
+public class ArrayPostingList implements OrdinalPostingList
+{
+    private final int[] postings;
+    private int idx = 0;
+
+    public ArrayPostingList(int[] postings)
+    {
+        this.postings = postings;
+    }
+
+    /** Returns the number of postings consumed so far. */
+    @Override
+    public long getOrdinal()
+    {
+        return idx;
+    }
+
+    @Override
+    public long nextPosting()
+    {
+        return idx < postings.length ? postings[idx++] : PostingList.END_OF_STREAM;
+    }
+
+    @Override
+    public long size()
+    {
+        return postings.length;
+    }
+
+    @Override
+    public long advance(long targetRowID)
+    {
+        // Consume postings until one reaches the target; idx ends just past the match.
+        while (idx < postings.length)
+        {
+            final int candidate = getPostingAt(idx);
+            idx++;
+            if (candidate >= targetRowID)
+                return candidate;
+        }
+        return PostingList.END_OF_STREAM;
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                          .add("idx", idx)
+                          .add("hashCode", Integer.toHexString(hashCode()))
+                          .toString();
+    }
+
+    /** Rewinds the cursor so iteration restarts from the first posting. */
+    public void reset()
+    {
+        idx = 0;
+    }
+
+    /** Returns the posting stored at array position {@code i}, without moving the cursor. */
+    public int getPostingAt(int i)
+    {
+        return postings[i];
+    }
+
+    /** Unchecked exception whose message names the failing lookup index. */
+    public static class LookupException extends RuntimeException
+    {
+        public LookupException(long idx)
+        {
+            super("Failed on lookup at index " + idx + "!");
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingListTest.java b/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingListTest.java
new file mode 100644
index 000000000000..e31eb6592969
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/ArrayPostingListTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+
+public class ArrayPostingListTest extends NdiRandomizedTest
+{
+    @Test
+    public void testArrayPostingList() throws Exception
+    {
+        // Sequential iteration returns every posting in order, then END_OF_STREAM.
+        ArrayPostingList sequential = new ArrayPostingList(new int[]{ 1, 2, 3 });
+        assertEquals(3, sequential.size());
+        for (int expected = 1; expected <= 3; expected++)
+            assertEquals(expected, sequential.nextPosting());
+        assertEquals(PostingList.END_OF_STREAM, sequential.nextPosting());
+
+        ArrayPostingList skipping = new ArrayPostingList(new int[]{ 10, 20, 30, 40, 50, 60 });
+        assertEquals(50, skipping.advance(45)); // first posting >= 45
+        assertEquals(60, skipping.advance(60)); // an exact match is returned as-is
+        assertEquals(PostingList.END_OF_STREAM, skipping.nextPosting());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/DeferredRangeIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/DeferredRangeIteratorTest.java
new file mode 100644
index 000000000000..b9d831980650
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/DeferredRangeIteratorTest.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.Token;
+
+import static org.apache.cassandra.index.sai.utils.LongIterator.convert;
+
+// Test that the different iterators cope with handling incoming ranges that are out of order
+public class DeferredRangeIteratorTest
+{
+    @Test
+    public void rangeUnionIteratorTest() throws Throwable
+    {
+        RangeUnionIterator.Builder union = RangeUnionIterator.builder();
+
+        union.add(createDeferred(8, 20, 30, 50, 60, 60, 80));
+        union.add(createDeferred(10, 10, 30, 30, 60, 70, 80));
+        union.add(createDeferred(5, 30, 40, 60, 80, 80, 90, 100));
+
+        Assert.assertEquals(convert(10L, 20L, 30L, 40L, 50L, 60L, 70L, 80L, 90L, 100L), convert(union.build()));
+    }
+
+    @Test
+    public void rangeIntersectionIteratorTest() throws Throwable
+    {
+        RangeIterator.Builder intersection = RangeIntersectionIterator.selectiveBuilder();
+
+        intersection.add(createDeferred(8, 20, 30, 50, 60, 60, 80));
+        intersection.add(createDeferred(10, 10, 30, 30, 60, 70, 80));
+        intersection.add(createDeferred(5, 30, 40, 60, 80, 80, 90, 100));
+
+        Assert.assertEquals(convert(30L, 60L, 80L), convert(intersection.build()));
+    }
+
+    private RangeIterator createDeferred(long min, long... tokens)
+    {
+        return new DeferredRangeIterator(min, tokens);
+    }
+
+    /** Range iterator whose declared minimum is supplied independently of the tokens it returns. */
+    private static class DeferredRangeIterator extends RangeIterator
+    {
+        private final List<LongIterator.LongToken> tokens;
+        private int currentIdx = 0;
+
+        public DeferredRangeIterator(long min, long[] tokens)
+        {
+            // NOTE(review): presumably (min, max, count); none of them is derived from the tokens - confirm
+            super(min, min + 100, 1000);
+            this.tokens = new ArrayList<>(tokens.length);
+            for (long token : tokens)
+                this.tokens.add(new LongIterator.LongToken(token, token));
+        }
+
+        @Override
+        protected Token computeNext()
+        {
+            // endOfData() is returned once every token has been handed out.
+            return currentIdx < tokens.size() ? tokens.get(currentIdx++) : endOfData();
+        }
+
+        @Override
+        protected void performSkipTo(Long nextToken)
+        {
+            // Rescan from the most recently returned token (if any) for the first token
+            // at or beyond the target; when none qualifies the cursor is left untouched.
+            int candidate = Math.max(currentIdx - 1, 0);
+            while (candidate < tokens.size() && tokens.get(candidate).get() < nextToken)
+                candidate++;
+
+            if (candidate < tokens.size())
+                currentIdx = candidate;
+        }
+
+        @Override
+        public void close() throws IOException
+        {}
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/IndexComponentsLeakDetector.java b/test/unit/org/apache/cassandra/index/sai/utils/IndexComponentsLeakDetector.java
new file mode 100644
index 000000000000..e5a9484881a7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/IndexComponentsLeakDetector.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.carrotsearch.randomizedtesting.rules.TestRuleAdapter;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.io.TrackingIndexComponents;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.lucene.store.IndexInput;
+
+import static org.junit.Assert.assertTrue;
+
+public class IndexComponentsLeakDetector extends TestRuleAdapter
+{
+    private static final Set<TrackingIndexComponents> trackedIndexComponents = Collections.synchronizedSet(new HashSet<>());
+
+    public IndexComponents newIndexComponents(String column, Descriptor descriptor, SequentialWriterOption sequentialWriterOption,
+                                              CompressionParams params)
+    {
+        final TrackingIndexComponents components = new TrackingIndexComponents(column, descriptor, sequentialWriterOption, params);
+        trackedIndexComponents.add(components); // remembered so open inputs can be checked after the test
+        return components;
+    }
+
+    @Override
+    protected void afterIfSuccessful()
+    {
+        for (TrackingIndexComponents components : trackedIndexComponents.toArray(new TrackingIndexComponents[0]))
+        {   // toArray() copies under the set's lock; a direct for-each over a synchronizedSet is unguarded
+            final Map<IndexInput, String> openInputs = components.getOpenInputs();
+            assertTrue("Index components have open inputs: " + openInputs, openInputs.isEmpty());
+        }
+    }
+
+    @Override
+    protected void afterAlways(List<Throwable> errors)
+    {
+        trackedIndexComponents.clear(); // forget everything tracked so far
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/LongArrays.java b/test/unit/org/apache/cassandra/index/sai/utils/LongArrays.java
new file mode 100644
index 000000000000..a37cd2647997
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/LongArrays.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.Arrays;
+
+public final class LongArrays
+{
+    private static final LongArray IDENTITY = new LongArray()
+    {
+        @Override
+        public long get(long index)
+        {
+            return index; // every index maps to itself
+        }
+
+        @Override
+        public long length()
+        {
+            return Long.MAX_VALUE;
+        }
+
+        @Override
+        public long findTokenRowID(long value)
+        {
+            return value; // mirror of get(): the value is its own index
+        }
+    };
+
+    public static LongArrayDecorator identity()
+    {
+        return new LongArrayDecorator(IDENTITY);
+    }
+
+    public static LongArrayDecorator from(long[] values)
+    {
+        return new LongArrayDecorator(new PrimitiveLongArray(values));
+    }
+
+    public static class LongArrayDecorator
+    {
+        private final LongArray delegate;
+
+        private LongArrayDecorator(LongArray delegate)
+        {
+            this.delegate = delegate;
+        }
+
+        public LongArray build()
+        {
+            return delegate; // handed back as-is; nothing is wrapped around it
+        }
+    }
+
+    private static class PrimitiveLongArray implements LongArray
+    {
+        private final long[] values;
+
+        private PrimitiveLongArray(long[] values)
+        {
+            this.values = values;
+        }
+
+        @Override
+        public long get(long idx)
+        {
+            return values[Math.toIntExact(idx)]; // toIntExact rejects indexes beyond the int range
+        }
+
+        @Override
+        public long length()
+        {
+            return values.length;
+        }
+
+        @Override
+        public long findTokenRowID(long value)
+        {
+            return Arrays.binarySearch(values, value); // negative when the value is absent
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/LongIterator.java b/test/unit/org/apache/cassandra/index/sai/utils/LongIterator.java
new file mode 100644
index 000000000000..47fed5f0addb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/LongIterator.java
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.function.LongFunction;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.Token;
+import org.apache.cassandra.index.sai.memory.InMemoryToken;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * {@link RangeIterator} over a fixed array of tokens that can be told to throw while iterating.
+ */
+public class LongIterator extends RangeIterator
+{
+    private final List<LongToken> tokens;
+    private int currentIdx = 0;
+
+    /**
+     * whether LongIterator should throw exception during iteration.
+     */
+    private boolean shouldThrow = false;
+    private final Random random = new Random();
+
+    public LongIterator(long[] tokens)
+    {
+        this(tokens, t -> t);
+    }
+
+    public LongIterator(long[] tokens, LongFunction<Long> toOffset)
+    {
+        super(tokens.length == 0 ? null : tokens[0], tokens.length == 0 ? null: tokens[tokens.length - 1], tokens.length);
+
+        this.tokens = new ArrayList<>(tokens.length);
+        for (long token : tokens)
+            this.tokens.add(new LongToken(token, toOffset.apply(token)));
+    }
+
+    public LongIterator throwsException()
+    {
+        this.shouldThrow = true;
+        return this;
+    }
+
+    @Override
+    protected Token computeNext()
+    {
+        // throws exception if it's last element or chosen 1 out of n
+        if (shouldThrow && (currentIdx >= tokens.size() - 1 || random.nextInt(tokens.size()) == 0))
+            throw new RuntimeException("injected exception");
+
+        return currentIdx < tokens.size() ? tokens.get(currentIdx++) : endOfData();
+    }
+
+    @Override
+    protected void performSkipTo(Long nextToken)
+    {
+        // Rescan from the most recently returned token (if any) for the first token
+        // at or beyond the target; when none qualifies the cursor is left untouched.
+        int candidate = Math.max(currentIdx - 1, 0);
+        while (candidate < tokens.size() && tokens.get(candidate).get() < nextToken)
+            candidate++;
+
+        if (candidate < tokens.size())
+            currentIdx = candidate;
+    }
+
+    @Override
+    public void close()
+    {}
+
+    public static class LongToken extends Token
+    {
+        public final Set<Long> offsets = new TreeSet<>();
+
+        LongToken(long token, long offset)
+        {
+            super(token);
+            offsets.add(offset);
+        }
+
+        @Override
+        public Iterator<DecoratedKey> keys()
+        {
+            return new AbstractIterator<DecoratedKey>()
+            {
+                Iterator<Long> iterator = offsets.iterator();
+
+                @Override
+                protected DecoratedKey computeNext()
+                {
+                    if (!iterator.hasNext())
+                        return endOfData();
+
+                    long offset = iterator.next();
+                    return new BufferDecoratedKey(new Murmur3Partitioner.LongToken(offset), ByteBufferUtil.bytes(offset));
+                }
+            };
+        }
+    }
+
+    public static List<Long> convert(RangeIterator tokens)
+    {
+        List<Long> results = new ArrayList<>();
+        while (tokens.hasNext())
+            results.add(tokens.next().get());
+
+        return results;
+    }
+
+    public static List<Long> convert(final long... nums)
+    {
+        // Plain ArrayList: double-brace initialization would create an anonymous subclass.
+        List<Long> boxed = new ArrayList<>(nums.length);
+        for (long n : nums)
+            boxed.add(n);
+        return boxed;
+    }
+
+    static List<Long> convertOffsets(RangeIterator tokens)
+    {
+        List<Long> results = new ArrayList<>();
+        while (tokens.hasNext())
+        {
+            Token token = tokens.next();
+            if (token instanceof LongToken)
+            {
+                LongToken longToken = (LongToken) token;
+                results.addAll(longToken.offsets);
+            }
+            else
+            {
+                // extract the fake key and token from LongToken#keys
+                InMemoryToken inMemoryToken = (InMemoryToken) token;
+                inMemoryToken.keys().forEachRemaining(key -> results.add(key.getToken().getLongValue()));
+            }
+        }
+
+        return results;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/LongIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/LongIteratorTest.java
new file mode 100644
index 000000000000..a22bc62064eb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/LongIteratorTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import static org.apache.cassandra.index.sai.utils.LongIterator.convert;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class LongIteratorTest
+{
+    @Test
+    public void testEmptyIterator() throws IOException
+    {
+        LongIterator it = new LongIterator(new long[] { });
+        // An iterator built from no values reports a zero count, no current/min/max, and nothing to iterate
+        Assert.assertEquals(0, it.getCount());
+        Assert.assertNull(it.getCurrent());
+        Assert.assertNull(it.getMaximum());
+        Assert.assertNull(it.getMinimum());
+        Assert.assertFalse(it.hasNext());
+
+        it.close();
+    }
+
+    @Test
+    public void testBasicITerator() throws IOException
+    {
+        LongIterator it = new LongIterator(new long[] { 2L, 3L, 5L, 6L });
+
+        Assert.assertEquals(4L, (long) it.getCount());
+        Assert.assertEquals(2L, (long) it.getCurrent());
+        Assert.assertEquals(6L, (long) it.getMaximum());
+        Assert.assertEquals(2L, (long) it.getMinimum());
+
+        Assert.assertEquals(2L, (long) it.next().get());
+        Assert.assertEquals(3L, (long) it.next().get());
+
+        it.close();
+    }
+
+    @Test
+    public void testOnError()
+    {
+        assertThatThrownBy(() -> convert(new LongIterator(new long[]{}).throwsException())).isInstanceOf(RuntimeException.class);
+        assertThatThrownBy(() -> convert(new LongIterator(new long[]{ 1L}).throwsException())).isInstanceOf(RuntimeException.class);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/NamedMemoryLimiterTest.java b/test/unit/org/apache/cassandra/index/sai/utils/NamedMemoryLimiterTest.java
new file mode 100644
index 000000000000..e0855bebbea1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/NamedMemoryLimiterTest.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class NamedMemoryLimiterTest
+{
+    @Rule
+    public final ExpectedException exception = ExpectedException.none();
+    
+    @Test
+    public void shouldStartAtZeroUsage()
+    {
+        NamedMemoryLimiter limiter = new NamedMemoryLimiter(9, "Test");
+        assertEquals(0, limiter.currentBytesUsed());
+        assertFalse(limiter.usageExceedsLimit());
+    }
+
+    @Test
+    public void shouldRegisterUsageBelowLimit()
+    {
+        NamedMemoryLimiter limiter = new NamedMemoryLimiter(9, "Test");
+        limiter.increment(4);
+        assertEquals(4, limiter.currentBytesUsed());
+        assertFalse(limiter.usageExceedsLimit());
+    }
+
+    @Test
+    public void shouldRegisterUsageExceedingLimit()
+    {
+        NamedMemoryLimiter limiter = new NamedMemoryLimiter(9, "Test");
+        limiter.increment(10);
+        assertEquals(10, limiter.currentBytesUsed());
+        assertTrue(limiter.usageExceedsLimit());
+    }
+
+    @Test
+    public void shouldReturnBelowLimit()
+    {
+        NamedMemoryLimiter limiter = new NamedMemoryLimiter(9, "Test");
+        
+        limiter.increment(10);
+        assertEquals(10, limiter.currentBytesUsed());
+        assertTrue(limiter.usageExceedsLimit());
+
+        limiter.decrement(3);
+        assertEquals(7, limiter.currentBytesUsed());
+        assertFalse(limiter.usageExceedsLimit());
+    }
+
+    @Test
+    public void shouldZeroTrackerAfterFlush()
+    {
+        NamedMemoryLimiter limiter = new NamedMemoryLimiter(9, "Test");
+        limiter.increment(5);
+        limiter.decrement(5);
+        assertEquals(0, limiter.currentBytesUsed());
+        assertFalse(limiter.usageExceedsLimit());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/NdiRandomizedTest.java b/test/unit/org/apache/cassandra/index/sai/utils/NdiRandomizedTest.java
new file mode 100644
index 000000000000..adecf1e7dab2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/NdiRandomizedTest.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.util.Random;
+
+import com.google.common.base.Preconditions;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.ClassRule;
+import org.junit.rules.RuleChain;
+import org.junit.rules.TemporaryFolder;
+import org.junit.rules.TestRule;
+
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.carrotsearch.randomizedtesting.annotations.ThreadLeakScope;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+
+@ThreadLeakScope(ThreadLeakScope.Scope.NONE)
+public class NdiRandomizedTest extends RandomizedTest
+{
+    private static Thread.UncaughtExceptionHandler handler;
+
+    @SuppressWarnings("unused")
+    @BeforeClass
+    private static void saveUncaughtExceptionHandler()
+    {
+        handler = Thread.getDefaultUncaughtExceptionHandler();
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @SuppressWarnings("unused")
+    @AfterClass
+    private static void restoreUncaughtExceptionHandler()
+    {
+        Thread.setDefaultUncaughtExceptionHandler(handler);
+    }
+
+    private static IndexComponentsLeakDetector indexComponentsLeakDetector;
+
+    protected static TemporaryFolder temporaryFolder;
+
+    @ClassRule
+    public static TestRule classRules = RuleChain.outerRule(indexComponentsLeakDetector = new IndexComponentsLeakDetector())
+                                                 .around(temporaryFolder = new TemporaryFolder());
+
+    public IndexComponents newIndexComponents() throws IOException
+    {
+        return indexComponentsLeakDetector.newIndexComponents(randomSimpleString(7, 29),
+                                                              new Descriptor(temporaryFolder.newFolder(),
+                                                                             randomSimpleString(5, 13),
+                                                                             randomSimpleString(3, 17),
+                                                                             randomIntBetween(0, 128)),
+                                                              SequentialWriterOption.newBuilder()
+                                                                                    .bufferSize(randomIntBetween(17, 1 << 13))
+                                                                                    .bufferType(randomBoolean() ? BufferType.ON_HEAP : BufferType.OFF_HEAP)
+                                                                                    .trickleFsync(randomBoolean())
+                                                                                    .trickleFsyncByteInterval(nextInt(1 << 10, 1 << 16))
+                                                                                    .finishOnClose(true)
+                                                                                    .build(), null);
+    }
+
+    /**
+     * Load a byte array with random bytes. Shortcut for getRandom() method
+     */
+    public static void nextBytes(byte[] bytes)
+    {
+        getRandom().nextBytes(bytes);
+    }
+
+    public static byte[] nextBytes(int min, int max)
+    {
+        byte[] bytes = new byte[nextInt(min, max)];
+        nextBytes(bytes);
+        return bytes;
+    }
+
+    //
+    // Note: The nextXXX methods maintain the contract of ThreadLocalRandom
+    // where the max value is exclusive. The between methods maintain
+    // the contract of RandomizedTest where the max value is inclusive
+    //
+
+    public static int nextInt(int max)
+    {
+        return nextInt(0, max);
+    }
+
+    public static int nextInt(int min, int max)
+    {
+        return between(min, max - 1);
+    }
+
+    public static long nextLong(long min, long max)
+    {
+        return between(min, max - 1);
+    }
+
+    public static long between(long min, long max)
+    {
+        return randomLongBetween(min, max);
+    }
+
+    public static int randomIntBetween(int min, int max) // random int from min towards max; both bounds are validated below
+    {
+        if (min < 0) throw new IllegalArgumentException("min must be >= 0: " + min);
+        if (min > max) throw new IllegalArgumentException("max must be >= min: " + min + ", " + max);
+        return min == max ? min : (int) randomDoubleBetween((double) min, (double) max); // NOTE(review): the cast truncates, so max is only reachable if randomDouble() can yield 1.0 (or via rounding) - confirm the "inclusive" intent
+    }
+
+    public static long randomLongBetween(long min, long max) // random long from min towards max; both bounds are validated below
+    {
+        if (min < 0) throw new IllegalArgumentException("min must be >= 0: " + min);
+        if (min > max) throw new IllegalArgumentException("max must be >= min: " + min + ", " + max);
+        return min == max ? min : (long) randomDoubleBetween((double) min, (double) max); // NOTE(review): the cast truncates, so max is only reachable if randomDouble() can yield 1.0 (or via rounding) - confirm the "inclusive" intent
+    }
+
+    public static double randomDoubleBetween(double min, double max)
+    {
+        if (min < 0) throw new IllegalArgumentException("min must be >= 0: " + min);
+        if (min > max) throw new IllegalArgumentException("max must be >= min: " + min + ", " + max);
+
+        return min == max ? min : min + (max - min) * randomDouble();
+    }
+
+    public static long scaledRandomLongBetween(long min, long max)
+    {
+        if (min < 0) throw new IllegalArgumentException("min must be >= 0: " + min);
+        if (min > max) throw new IllegalArgumentException("max must be >= min: " + min + ", " + max);
+
+        double point = Math.min(1, Math.abs(randomGaussian()) * 0.3) * multiplier();
+        double range = max - min;
+        long scaled = Math.round(Math.min(point * range, range));
+        return isNightly() ? max - scaled : min + scaled;
+    }
+
+    public static String randomSimpleString(int minLength, int maxLength) // builds a random string of lowercase ASCII letters
+    {
+        Preconditions.checkArgument(minLength >= 0);
+        Preconditions.checkArgument(maxLength >= 0);
+        final int end = nextInt(minLength, maxLength); // NOTE(review): nextInt's max is exclusive, so the length stays below maxLength - confirm intended
+        if (end == 0)
+        {
+            // allow 0 length
+            return "";
+        }
+        final char[] buffer = new char[end];
+        for (int i = 0; i < end; i++)
+        {
+            buffer[i] = (char) nextInt('a', 'z'); // NOTE(review): exclusive max means 'z' is presumably never generated - confirm intended
+        }
+        return new String(buffer, 0, end);
+    }
+
+    public static void assertPostingListEquals(PostingList expected, PostingList actual) throws IOException
+    {
+        long actualRowID, rowCounter = 0;
+        while ((actualRowID = actual.nextPosting()) != PostingList.END_OF_STREAM)
+        {
+            assertEquals("Mismatch at pos: " + rowCounter, expected.nextPosting(), actualRowID);
+            rowCounter++;
+        }
+        assertEquals(PostingList.END_OF_STREAM, expected.nextPosting());
+    }
+
+    public static int[] shuffle(int[] array)
+    {
+        // Shuffles the array in place and returns the same instance.
+        Random rgen = new Random();
+        // Fisher-Yates: drawing from [0, i] (not the whole array) keeps every permutation equally likely
+        for (int i = array.length - 1; i > 0; i--)
+        {
+            int randomPosition = rgen.nextInt(i + 1);
+            int temp = array[i];
+            array[i] = array[randomPosition];
+            array[randomPosition] = temp;
+        }
+        return array;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/OffsetFactoryTest.java b/test/unit/org/apache/cassandra/index/sai/utils/OffsetFactoryTest.java
new file mode 100644
index 000000000000..bd782e737428
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/OffsetFactoryTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.SSTableQueryContext;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.assertEquals;
+
+public class OffsetFactoryTest
+{
+    @BeforeClass
+    public static void init()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @Test
+    public void testTokenSkippingCacheWidePartition() throws IOException
+    {
+        // wide partition: row id to token value mapping
+        LongArray token = LongArrays.from(new long[]{
+                -10L, // 0
+                -10L, // 1
+                -10L, // 2
+                -9L,  // 3
+                -9L,  // 4
+                -8L,  // 5 : pk = 1, ck = 1, v1 = 11, v2 = 101
+                -8L,  // 6 : pk = 1, ck = 2, v1 = 11, v2 = 102
+        }).build();
+
+        try (OffsetFactory.TokenLongArray offsetLongArray = new OffsetFactory.TokenLongArray(SSTableQueryContext.forTest(), token, 0))
+        {
+            // for PostingListRangeIterator of column v1 = 11:
+            //    find the next matched row id and its token, -8 is used to skip column v2.
+            //    RangeIntersectionIterator will call another getNext() via hasNext after getting first match from getNext()..
+            long tokenValue = offsetLongArray.get(5);
+            assertEquals(-8L, tokenValue);
+            tokenValue = offsetLongArray.get(6);
+            assertEquals(-8L, tokenValue);
+
+            // for PostingListRangeIterator of column v2 = 101: use -8 token to skip
+            long rowId = offsetLongArray.findTokenRowID(-8);
+            assertEquals(5, rowId);
+        }
+    }
+
+    @Test
+    public void testTokenSkippingOnLargeSSTable() throws Throwable
+    {
+        LongArray tokens = Mockito.mock(LongArray.class);
+
+        long segmentRowIdOffset = Long.MAX_VALUE - 100L;
+
+        Mockito.when(tokens.findTokenRowID(Mockito.anyLong())).thenReturn(segmentRowIdOffset + 1);
+
+        SSTableQueryContext context = SSTableQueryContext.forTest();
+        // try-with-resources closes the array even if the assertion fails, as testTokenSkippingCacheWidePartition does
+        try (OffsetFactory.TokenLongArray offsetLongArray = new OffsetFactory.TokenLongArray(context, tokens, segmentRowIdOffset))
+        {
+            long rowId = offsetLongArray.findTokenRowID(1L);
+
+            assertEquals(1L, rowId);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/PostingListTest.java b/test/unit/org/apache/cassandra/index/sai/utils/PostingListTest.java
new file mode 100644
index 000000000000..b5f88c60c31d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/PostingListTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+
+
+public class PostingListTest extends NdiRandomizedTest
+{
+    @Test
+    public void testPeekablePostingList() throws IOException
+    {
+        PostingList.PeekablePostingList postingList = new ArrayPostingList(new int[]{ 1, 2, 3 }).peekable();
+        assertEquals(3, postingList.size());
+        assertEquals(1, postingList.peek());
+        assertEquals(1, postingList.nextPosting());
+        assertEquals(2, postingList.peek());
+        assertEquals(2, postingList.nextPosting());
+        assertEquals(3, postingList.peek());
+        assertEquals(3, postingList.nextPosting());
+        assertEquals(PostingList.END_OF_STREAM, postingList.peek());
+        assertEquals(PostingList.END_OF_STREAM, postingList.nextPosting());
+
+        postingList = new ArrayPostingList(new int[]{ 10, 20, 30, 40, 50, 60 }).peekable();
+        assertEquals(10, postingList.peek());
+        assertEquals(50, postingList.advance(45));
+        assertEquals(60, postingList.peek());
+        assertEquals(60, postingList.advance(60));
+        assertEquals(PostingList.END_OF_STREAM, postingList.advance(60));
+        assertEquals(PostingList.END_OF_STREAM, postingList.peek());
+        assertEquals(PostingList.END_OF_STREAM, postingList.nextPosting());
+
+
+        postingList = new ArrayPostingList(new int[]{ 10, 20, 30, 40, 50, 60 }).peekable();
+        assertEquals(10, postingList.peek());
+        assertEquals(50, postingList.advanceWithoutConsuming(45));
+        assertEquals(50, postingList.peek());
+        assertEquals(50, postingList.advance(45));
+        assertEquals(60, postingList.advanceWithoutConsuming(60));
+        assertEquals(60, postingList.advance(60));
+        assertEquals(PostingList.END_OF_STREAM, postingList.peek());
+        assertEquals(PostingList.END_OF_STREAM, postingList.nextPosting());
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/RangeConcatIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/RangeConcatIteratorTest.java
new file mode 100644
index 000000000000..bc176437d8bc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/RangeConcatIteratorTest.java
@@ -0,0 +1,437 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.function.Supplier;
+import java.util.stream.IntStream;
+
+import org.junit.Test;
+
+import static org.apache.cassandra.index.sai.utils.LongIterator.convert;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.CONCAT;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.INTERSECTION;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.UNION;
+
+public class RangeConcatIteratorTest extends AbstractRangeIteratorTest
+{
+    @Test
+    public void testValidation()
+    {
+        try
+        {
+            buildConcat(build(1L, 4L), build(2L, 3L));
+            fail("Flows for a merging concatenation must not contain one another.");
+        }
+        catch (IllegalArgumentException ignored)
+        {
+        }
+
+        try
+        {
+            buildConcat(build(1L, 4L), build(2L, 5L));
+            fail("Minimum for flow must not be included in exclusive range of previous flow.");
+        }
+        catch (IllegalArgumentException ignored)
+        {
+        }
+
+        // allow min boundary included
+        RangeIterator concat = buildConcat(build(1L, 4L), build(4L, 5L));
+        assertEquals(convert(1L, 4L, 4L, 5L), convert(concat));
+
+        try
+        {
+            buildConcat(build(1L, 4L), build(0L, 3L));
+            fail("Maximum for flow must not be included in exclusive range of previous flow.");
+        }
+        catch (IllegalArgumentException ignored)
+        {
+        }
+
+        try
+        {
+            buildConcat(build(2L, 4L), build(0L, 1L));
+            fail("Flows for merging concatenation must be sorted.");
+        }
+        catch (IllegalArgumentException ignored)
+        {
+        }
+
+        // with empty flow
+        concat = buildConcat(build(), build(0L, 1L));
+        assertEquals(convert(0L, 1L), convert(concat));
+
+        concat = buildConcat(build(0L, 1L), build());
+        assertEquals(convert(0L, 1L), convert(concat));
+
+        concat = buildConcat(build(), build(0L, 1L), build());
+        assertEquals(convert(0L, 1L), convert(concat));
+
+        concat = buildConcat(build(), build(0L, 1L),
+                              build(), build(2L, 3L));
+        assertEquals(convert(0L, 1L, 2L, 3L), convert(concat));
+    }
+
+    @Test
+    public void testSingleIterator()
+    {
+        RangeIterator origin = build(1L, 2L, 4L, 9L );
+        RangeIterator concat = buildConcat(origin);
+        assertSame(origin, concat);
+        assertEquals(convert(1L, 2L, 4L, 9L), convert(concat));
+    }
+
+    @Test
+    public void testNoOverlappingSortedRanges()
+    {
+        RangeIterator concat = buildConcat(build(1L, 2L, 3L),
+                                           build(4L, 5L),
+                                           build(7L, 8L, 9L, 10L));
+
+        assertEquals(convert(1L, 2L, 3L, 4L, 5L, 7L, 8L, 9L, 10L), convert(concat));
+    }
+
+    @Test
+    public void testMinMaxAndCount()
+    {
+        RangeIterator.Builder builder = getConcatBuilder();
+
+        builder.add(build(1L, 2L, 3L));
+        builder.add(build(4L, 5L, 6L));
+        builder.add(build(7L, 8L, 9L));
+
+        assertEquals(9L, (long) builder.getMaximum());
+        assertEquals(9L, builder.getTokenCount());
+
+        RangeIterator tokens = builder.build();
+
+        assertNotNull(tokens);
+        assertEquals(1L, (long)  tokens.getMinimum());
+        assertEquals(9L, (long)  tokens.getMaximum());
+        assertEquals(9L, tokens.getCount());
+
+        for (long i = 1; i < 10; i++)
+        {
+            assertTrue(tokens.hasNext());
+            assertEquals(i, tokens.next().getLong());
+        }
+
+        assertFalse(tokens.hasNext());
+        assertEquals(1L, (long) tokens.getMinimum());
+    }
+
+    @Test
+    public void testSkipTo()
+    {
+        // flow is single use..
+        Supplier<RangeIterator> init = () ->  buildConcat(build(1L, 2L, 3L),
+                                                          build( 4L, 5L, 6L),
+                                                          build( 7L, 8L, 9L));
+
+        RangeIterator tokens;
+
+        tokens = init.get();
+        tokens.skipTo(5L);
+        assertTrue(tokens.hasNext());
+        assertEquals(5L, tokens.next().getLong());
+
+        tokens = init.get();
+        tokens.skipTo(7L);
+        assertTrue(tokens.hasNext());
+        assertEquals(7L, tokens.next().getLong());
+
+        tokens = init.get();
+        tokens.skipTo(2L);
+        tokens.skipTo(5L);
+        tokens.skipTo(10L);
+        assertFalse(tokens.hasNext());
+        assertEquals(1L, (long) tokens.getMinimum());
+        assertEquals(9L, (long) tokens.getMaximum());
+    }
+
+    @Test
+    public void testSkipToWithGaps()
+    {
+        // flow is single use..
+        Supplier<RangeIterator> init = () ->  buildConcat(build(1L, 2L, 3L), build(4L, 6L), build(8L, 9L));
+
+        RangeIterator tokens;
+
+        tokens = init.get();
+        tokens.skipTo(5L);
+        assertTrue(tokens.hasNext());
+        assertEquals(6L, tokens.next().getLong());
+
+        tokens = init.get();
+        tokens.skipTo(7L);
+        assertTrue(tokens.hasNext());
+        assertEquals(8L, tokens.next().getLong());
+
+        tokens = init.get();
+        tokens.skipTo(2L);
+        tokens.skipTo(5L);
+        tokens.skipTo(10L);
+        assertFalse(tokens.hasNext());
+        assertEquals(1L, (long) tokens.getMinimum());
+        assertEquals(9L, (long) tokens.getMaximum());
+    }
+
+    @Test
+    public void testMergingMultipleIterators()
+    {
+        RangeIterator concatA = buildConcat(build(1L, 3L, 5L), build(8L, 10L, 12L));
+        RangeIterator concatB = buildConcat(build(7L, 9L, 11L), build(12L, 14L, 16L));
+
+        assertEquals(convert(1L, 3L, 5L, 7L, 8L, 9L, 10L, 11L, 12L, 14L, 16L), convert(buildUnion(concatA, concatB)));
+    }
+
+    @Test
+    public void testEmptyThenManyNonEmpty()
+    {
+        final RangeIterator.Builder builder = getConcatBuilder();
+
+        builder.add(build());
+        IntStream.range(10, 20).forEach(value -> builder.add(build(value)));
+
+        RangeIterator range = builder.build();
+
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(19L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(10, range.getCount());
+    }
+
+    @Test
+    public void testEmptyThenSingleNonEmpty()
+    {
+        RangeIterator.Builder builder = getConcatBuilder();
+
+        builder.add(build());
+        builder.add(build(10));
+
+        RangeIterator range = builder.build();
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(10L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(1, range.getCount());
+    }
+
+    @Test
+    public void testManyNonEmptyThenEmpty()
+    {
+        final RangeIterator.Builder builder = getConcatBuilder();
+
+        IntStream.range(10, 20).forEach(value -> builder.add(build(value)));
+        builder.add(build());
+
+        RangeIterator range = builder.build();
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(19L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(10, range.getCount());
+    }
+
+    @Test
+    public void testSingleNonEmptyThenEmpty()
+    {
+        RangeIterator.Builder builder = getConcatBuilder();
+
+        builder.add(build(10));
+        builder.add(build());
+
+        RangeIterator range = builder.build();
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(10L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(1, range.getCount());
+    }
+
+    @Test
+    public void testEmptyNonEmptyEmpty()
+    {
+        final RangeIterator.Builder builder = getConcatBuilder();
+
+        builder.add(build());
+        IntStream.range(10, 20).forEach(value -> builder.add(build(value)));
+        builder.add(build());
+
+        RangeIterator range = builder.build();
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(19L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(10, range.getCount());
+    }
+
+    @Test
+    public void testNonEmptyEmptyNonEmpty()
+    {
+        final RangeIterator.Builder builder = getConcatBuilder();
+
+        IntStream.range(10, 15).forEach(value -> builder.add(build(value)));
+        builder.add(build());
+        IntStream.range(15, 20).forEach(value -> builder.add(build(value)));
+
+        RangeIterator range = builder.build();
+        assertEquals(10L, (long) range.getMinimum());
+        assertEquals(19L, (long) range.getMaximum());
+        assertTrue(range.hasNext());
+        assertEquals(10, range.getCount());
+    }
+
+    @Test
+    public void testConcatOfIntersection()
+    {
+        // concat of two empty intersections (neither pair of inputs shares a value)
+        RangeIterator intersectionA = buildIntersection(build(1L, 2L, 3L), build(4L, 5L, 6L));
+        RangeIterator intersectionB = buildIntersection(build(6L, 7L, 8L), build(9L, 10L, 11L));
+        assertEquals(convert(), convert(buildConcat(intersectionA, intersectionB)));
+
+        // concat of two non-empty intersections: {2, 3} followed by {7, 8}
+        intersectionA = buildIntersection(build( 1L, 2L, 3L), build( 2L, 3L, 4L));
+        intersectionB = buildIntersection(build( 6L, 7L, 8L), build( 7L, 8L, 9L));
+        assertEquals(convert(2L, 3L, 7L, 8L), convert(buildConcat(intersectionA, intersectionB)));
+
+        // concat of a non-empty intersection ({2, 3}) followed by an empty one
+        intersectionA = buildIntersection(build( 1L, 2L, 3L), build( 2L, 3L, 4L));
+        intersectionB = buildIntersection(build( 6L, 7L, 8L), build( 10L));
+        assertEquals(convert(2L, 3L), convert(buildConcat(intersectionA, intersectionB)));
+
+        // concat of an empty intersection followed by a non-empty one ({2, 3})
+        intersectionA = buildIntersection(build( 6L, 7L, 8L), build( 10L));
+        intersectionB = buildIntersection(build( 1L, 2L, 3L), build( 2L, 3L, 4L));
+        assertEquals(convert(2L, 3L), convert(buildConcat(intersectionA, intersectionB)));
+    }
+
+
+    @Test
+    public void testIntersectionOfConcat()
+    {
+        RangeIterator rangeA = build(1L, 2L, 3L);
+        RangeIterator rangeB = build(4L, 5L, 6L);
+        RangeIterator rangeC = build(7L);
+        RangeIterator rangeD = build(8L);
+        RangeIterator rangeE = build(9L);
+        RangeIterator concatA = buildConcat(rangeA, rangeB, rangeC, rangeD, rangeE);
+        // rangeA and rangeB are reassigned to form the second operand; concatA was already built from the ranges above
+        rangeA = build( 1L, 3L);
+        rangeB = build( 5L, 7L, 9L);
+        RangeIterator concatB = buildConcat(rangeA, rangeB);
+        // every value given to concatB (1, 3, 5, 7, 9) is also among the values given to concatA (1..9)
+        assertEquals(convert(1L, 3L, 5L, 7L, 9L), convert(buildIntersection(concatA, concatB)));
+    }
+
+    @Test
+    public void testConcatOnError() // NOTE(review): buildOnErrorA/B presumably yield iterators that fail during iteration - confirm against the helpers
+    {
+        assertOnError(buildOnErrorA(CONCAT, arr(1L, 2L, 3L), arr(4L, 5L, 6L)));
+        assertOnError(buildOnErrorB(CONCAT, arr( 1L, 2L, 3L), arr(4L)));
+    }
+
+    @Test
+    public void testConcatOfUnionsOnError()
+    {
+        RangeIterator unionA = buildUnion(arr(1L, 2L, 3L), arr(4L));
+        RangeIterator unionB = buildOnErrorB(UNION, arr(6L), arr(8L, 9L)); // the buildOnErrorB union is the second concat operand
+        assertOnError(buildConcat(unionA, unionB));
+
+        unionA = buildOnErrorA(UNION, arr( 1L, 2L, 3L), arr( 4L)); // the buildOnErrorA union is the first concat operand
+        unionB = buildUnion(arr( 5L), arr( 5L, 6L));
+        assertOnError(buildConcat(unionA, unionB));
+    }
+
+    @Test
+    public void testConcatOfIntersectionsOnError()
+    {
+        RangeIterator intersectionA = buildOnErrorA(INTERSECTION, arr(1L, 2L, 3L), arr(2L, 3L, 4L)); // buildOnErrorA intersection as first operand
+        RangeIterator intersectionB = buildIntersection(arr(6L, 7L, 8L), arr(7L, 8L, 9L));
+        assertOnError(buildConcat(intersectionA, intersectionB));
+
+        intersectionA = buildIntersection(arr( 1L, 2L, 3L), arr( 2L, 3L, 4L));
+        intersectionB = buildOnErrorB(INTERSECTION, arr( 6L, 7L, 8L, 9L, 10L), arr(  7L, 8L, 9L)); // buildOnErrorB intersection as second operand
+        assertOnError(buildConcat(intersectionA, intersectionB));
+    }
+
+    @Test
+    public void testDuplicatedElementsInTheSameFlow()
+    {
+        // In a real case, we should not have duplicated elements from the same PostingListRangeIterator
+        RangeIterator rangeA = build(1L, 2L, 3L, 3L, 4L, 4L);
+        RangeIterator rangeB = build(6L, 6L, 7L, 7L);
+        RangeIterator rangeC = build(8L, 8L);
+        RangeIterator concatA = buildConcat(rangeA, rangeB, rangeC);
+        // the expected output keeps every duplicate, in input order
+        assertEquals(convert(1L, 2L, 3L, 3L, 4L, 4L, 6L, 6L, 7L, 7L, 8L, 8L), convert(concatA));
+    }
+
+    @Test
+    public void testOverlappingBoundaries()
+    {
+        RangeIterator rangeA = build(1L, 2L, 3L);
+        RangeIterator rangeB = build(3L, 4L, 6L, 7L);
+        RangeIterator rangeC = build(7L, 8L);
+        RangeIterator rangeD = build(8L, 9L);
+        RangeIterator rangeE = build(9L);
+        RangeIterator rangeF = build(9L);
+        RangeIterator rangeG = build(9L, 10L);
+        RangeIterator concatA = buildConcat(rangeA, rangeB, rangeC, rangeD, rangeE, rangeF, rangeG);
+        // boundary values shared by consecutive ranges (3, 7, 8, 9) are expected once per range that contains them
+        assertEquals(convert(1L, 2L, 3L, 3L, 4L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 9L, 9L, 10L), convert(concatA));
+    }
+
+    @Test
+    public void testDuplicatedElementsAndOverlappingBoundaries()
+    {
+        RangeIterator rangeA = build(1L, 2L, 2L, 3L);
+        RangeIterator rangeB = build(3L, 4L, 4L, 6L, 6L, 7L);
+        assertEquals(convert(1L, 2L, 2L, 3L, 3L, 4L, 4L, 6L, 6L, 7L), convert(buildConcat(rangeA, rangeB)));
+        // second scenario: six ranges, each starting at the value the previous one ends with
+        rangeA = build(1L, 2L, 2L, 3L);
+        rangeB = build(3L);
+        RangeIterator rangeC = build(3L, 4L, 4L, 6L, 6L, 7L);
+        RangeIterator rangeD = build(7L, 7L, 8L);
+        RangeIterator rangeE = build(8L, 9L, 9L);
+        RangeIterator rangeF = build(9L, 10L);
+        RangeIterator concatA = buildConcat(rangeA, rangeB, rangeC, rangeD, rangeE, rangeF);
+        // all 19 input elements are expected in the output, repeated values included
+        assertEquals(convert(1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 9L, 9L, 9L, 10L), convert(concatA));
+    }
+
+    @Test
+    public void testDuplicateElementsAtBoundary()
+    {
+        // Boundary value 3 is duplicated inside the right-hand range:
+        RangeIterator rangeA = build(1L, 2L, 3L);
+        RangeIterator rangeB = build(3L, 3L, 4L, 5L);
+        assertEquals(convert(1L, 2L, 3L, 3L, 3L, 4L, 5L), convert(buildConcat(rangeA, rangeB)));
+
+        // Boundary value 3 is duplicated inside the left-hand range:
+        rangeA = build(1L, 2L, 3L, 3L);
+        rangeB = build(3L, 4L, 5L);
+        assertEquals(convert(1L, 2L, 3L, 3L, 3L, 4L, 5L), convert(buildConcat(rangeA, rangeB)));
+
+        // Boundary value 3 is duplicated inside both ranges (four occurrences expected):
+        rangeA = build(1L, 2L, 3L, 3L);
+        rangeB = build(3L, 3L, 4L, 5L);
+        assertEquals(convert(1L, 2L, 3L, 3L, 3L, 3L, 4L, 5L), convert(buildConcat(rangeA, rangeB)));
+    }
+
+    private RangeIterator.Builder getConcatBuilder() // returns whatever RangeConcatIterator.builder() provides, typed as the base Builder
+    {
+        return RangeConcatIterator.builder();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/RangeIntersectionIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/RangeIntersectionIteratorTest.java
new file mode 100644
index 000000000000..86a710865834
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/RangeIntersectionIteratorTest.java
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.carrotsearch.hppc.LongHashSet;
+import com.carrotsearch.hppc.LongSet;
+import org.apache.cassandra.io.util.FileUtils;
+
+import static org.apache.cassandra.index.sai.utils.LongIterator.convert;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.INTERSECTION;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.UNION;
+
+public class RangeIntersectionIteratorTest extends AbstractRangeIteratorTest
+{
+    @Test
+    public void testNoOverlappingValues()
+    {
+        RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+        // three ranges that have no value in common
+        builder.add(new LongIterator(new long[] { 2L, 3L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 1L, 7L }));
+        builder.add(new LongIterator(new long[] { 4L, 8L, 9L, 10L }));
+
+        Assert.assertEquals(convert(), convert(builder.build()));
+
+        builder = RangeIntersectionIterator.builder();
+        // both ranges overlap by min/max (6 lies within [1, 9]) but not by value
+        builder.add(new LongIterator(new long[] { 1L, 5L, 7L, 9L }));
+        builder.add(new LongIterator(new long[] { 6L }));
+
+        RangeIterator range = builder.build();
+
+        Assert.assertNotNull(range);
+        Assert.assertFalse(range.hasNext());
+
+        builder = RangeIntersectionIterator.builder();
+        // both ranges overlap by min/max ([1, 9] lies within [0, 12]) but not by value
+        builder.add(new LongIterator(new long[] { 1L, 5L, 7L, 9L }));
+        builder.add(new LongIterator(new long[] { 0L, 10L, 12L }));
+
+        range = builder.build();
+
+        Assert.assertNotNull(range);
+        Assert.assertFalse(range.hasNext());
+    }
+
+    @Test
+    public void testOverlappingValues()
+    {
+        RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+        // 4 and 6 are the only values present in all three ranges
+        builder.add(new LongIterator(new long[] { 1L, 4L, 6L, 7L }));
+        builder.add(new LongIterator(new long[] { 2L, 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 4L, 6L, 8L, 9L, 10L }));
+
+        Assert.assertEquals(convert(4L, 6L), convert(builder.build()));
+    }
+
+    @Test
+    public void testSingleIterator()
+    {
+        RangeIntersectionIterator.Builder builder = RangeIntersectionIterator.builder();
+        // with a single input, the built iterator is expected to yield exactly that input's values
+        builder.add(new LongIterator(new long[] { 1L, 2L, 4L, 9L }));
+
+        Assert.assertEquals(convert(1L, 2L, 4L, 9L), convert(builder.build()));
+    }
+
+    @Test
+    public void testSkipTo()
+    {
+        RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+        // values common to all three ranges: 4, 6, 7, 10
+        builder.add(new LongIterator(new long[] { 1L, 4L, 6L, 7L, 9L, 10L }));
+        builder.add(new LongIterator(new long[] { 2L, 4L, 5L, 6L, 7L, 10L, 12L }));
+        builder.add(new LongIterator(new long[] { 4L, 6L, 7L, 9L, 10L }));
+
+        RangeIterator range = builder.build();
+        Assert.assertNotNull(range);
+
+        // first let's skipTo something before the first common value (4)
+        Assert.assertEquals(4L, (long) range.skipTo(3L).get());
+        Assert.assertEquals(4L, (long) range.getCurrent());
+
+        // now let's skipTo a value between two common values; the next common value (6) is expected
+        Assert.assertEquals(6L, (long) range.skipTo(5L).get());
+        Assert.assertEquals(6L, (long) range.getCurrent());
+
+        // now skipTo exactly an existing common value (7), then consume it with next()
+        Assert.assertEquals(7L, (long) range.skipTo(7L).get());
+        Assert.assertEquals(7L, (long) range.getCurrent());
+        Assert.assertEquals(7L, (long) range.next().get());
+        // 9 is missing from the second range, so 10 is the expected current value after hasNext()
+        Assert.assertTrue(range.hasNext());
+        Assert.assertEquals(10L, (long) range.getCurrent());
+
+        // now right after the last element: skipTo is expected to return null
+        Assert.assertNull(range.skipTo(11L));
+        Assert.assertFalse(range.hasNext());
+    }
+
+    @Test
+    public void testMinMaxAndCount()
+    {
+        RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+        // the three ranges have only the value 9 in common
+        builder.add(new LongIterator(new long[]{ 1L, 2L, 9L}));
+        builder.add(new LongIterator(new long[]{ 4L, 5L, 9L}));
+        builder.add(new LongIterator(new long[]{ 7L, 8L, 9L}));
+        // expected token count is 9, presumably the sum of the input sizes (3 + 3 + 3)
+        Assert.assertEquals(9L, (long) builder.getMaximum());
+        Assert.assertEquals(9L, builder.getTokenCount());
+
+        RangeIterator tokens = builder.build();
+        // expected minimum is 7, the largest of the input minimums (1, 4, 7)
+        Assert.assertNotNull(tokens);
+        Assert.assertEquals(7L, (long) tokens.getMinimum());
+        Assert.assertEquals(9L, (long) tokens.getMaximum());
+        Assert.assertEquals(9L, tokens.getCount());
+        // note: build() is invoked a second time on the same builder here
+        Assert.assertEquals(convert(9L), convert(builder.build()));
+    }
+
+    @Test
+    public void testBuilder()
+    {
+        RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+        // a builder with no ranges reports no bounds and zero counts
+        Assert.assertNull(builder.getMinimum());
+        Assert.assertNull(builder.getMaximum());
+        Assert.assertEquals(0L, builder.getTokenCount());
+        Assert.assertEquals(0L, builder.rangeCount());
+
+        builder.add(new LongIterator(new long[] { 1L, 2L, 6L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 6L, 8L, 9L }));
+        // expected bounds: the largest input minimum (6) and the smallest input maximum (6); 9 tokens across 3 ranges
+        Assert.assertEquals(6L, (long) builder.getMinimum());
+        Assert.assertEquals(6L, (long) builder.getMaximum());
+        Assert.assertEquals(9L, builder.getTokenCount());
+        Assert.assertEquals(3L, builder.rangeCount());
+        Assert.assertFalse(builder.statistics.isDisjoint());
+        // the ranges are expected to come back ordered by their minimum
+        Assert.assertEquals(1L, (long) builder.ranges.poll().getMinimum());
+        Assert.assertEquals(4L, (long) builder.ranges.poll().getMinimum());
+        Assert.assertEquals(6L, (long) builder.ranges.poll().getMinimum());
+        // the poll() calls above presumably drained builder.ranges, so the same inputs are added again before building
+        builder.add(new LongIterator(new long[] { 1L, 2L, 6L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 6L, 8L, 9L }));
+
+        Assert.assertEquals(convert(6L), convert(builder.build()));
+
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[]{ 1L, 5L, 6L }));
+        builder.add(new LongIterator(new long[]{ 3L, 5L, 6L }));
+
+        RangeIterator tokens = builder.build();
+
+        Assert.assertEquals(convert(5L, 6L), convert(tokens));
+
+        FileUtils.closeQuietly(tokens);
+        // building with no ranges at all is expected to give an iterator whose count is 0
+        RangeIterator emptyTokens = RangeIntersectionIterator.builder().build();
+        Assert.assertEquals(0, emptyTokens.getCount());
+        // null iterators, null lists and empty iterators are all expected to leave the counts at 0
+        builder = RangeIntersectionIterator.builder();
+        Assert.assertEquals(0L, builder.add((RangeIterator) null).rangeCount());
+        Assert.assertEquals(0L, builder.add((List<RangeIterator>) null).getTokenCount());
+        Assert.assertEquals(0L, builder.add(new LongIterator(new long[] {})).rangeCount());
+
+        RangeIterator single = new LongIterator(new long[] { 1L, 2L, 3L });
+        RangeIterator range = RangeIntersectionIterator.builder().add(single).build();
+
+        // because build should return first element if it's only one instead of building yet another iterator
+        Assert.assertEquals(range, single);
+
+        // An empty range is not counted by rangeCount(), yet the built intersection is still expected to be empty.
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] {}));
+        Assert.assertEquals(0L, builder.rangeCount());
+        builder.add(single);
+        Assert.assertEquals(1L, builder.rangeCount());
+        range = builder.build();
+        Assert.assertEquals(0, range.getCount());
+
+        // disjoint case: [1, 3] and [4, 6] do not overlap, so the result must exist but be empty
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] { 1L, 2L, 3L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+
+        Assert.assertTrue(builder.statistics.isDisjoint());
+
+        RangeIterator disjointIntersection = builder.build();
+        Assert.assertNotNull(disjointIntersection);
+        Assert.assertFalse(disjointIntersection.hasNext());
+
+    }
+
+    @Test
+    public void emptyRangeTest() // every scenario includes at least one empty input and expects assertEmpty to hold for the result
+    {
+        RangeIterator.Builder builder;
+
+        // empty, then non-empty (one range, then ten ranges)
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] {}));
+        builder.add(new LongIterator(new long[] { 10}));
+        assertEmpty(builder.build());
+
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] {}));
+        for (int i = 0; i < 10; i++)
+            builder.add(new LongIterator(new long[] { 0, i + 10}));
+        assertEmpty(builder.build());
+
+        // non-empty, then empty (one range, then ten ranges)
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] { 10}));
+        builder.add(new LongIterator(new long[] {}));
+        assertEmpty(builder.build());
+
+        builder = RangeIntersectionIterator.builder();
+        for (int i = 0; i < 10; i++)
+            builder.add(new LongIterator(new long[] { 0, i + 10}));
+
+        builder.add(new LongIterator(new long[] {}));
+        assertEmpty(builder.build());
+
+        // empty, then non-empty then empty again
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] {}));
+        builder.add(new LongIterator(new long[] { 0, 10}));
+        builder.add(new LongIterator(new long[] {}));
+        assertEmpty(builder.build());
+
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] {}));
+        for (int i = 0; i < 10; i++)
+            builder.add(new LongIterator(new long[] { 0, i + 10}));
+        builder.add(new LongIterator(new long[] {}));
+        assertEmpty(builder.build());
+
+        // non-empty, empty, then non-empty again
+        builder = RangeIntersectionIterator.builder();
+        builder.add(new LongIterator(new long[] { 0, 10}));
+        builder.add(new LongIterator(new long[] {}));
+        builder.add(new LongIterator(new long[] { 0, 10}));
+        assertEmpty(builder.build());
+
+        builder = RangeIntersectionIterator.builder();
+        for (int i = 0; i < 5; i++)
+            builder.add(new LongIterator(new long[] { 0, i + 10}));
+        builder.add(new LongIterator(new long[] {}));
+        for (int i = 5; i < 10; i++)
+            builder.add(new LongIterator(new long[] { 0, i + 10}));
+        assertEmpty(builder.build());
+    }
+
+    public static void assertEmpty(RangeIterator range) // asserts null bounds, no next element and a count of 0
+    {
+        Assert.assertNull(range.getMinimum());
+        Assert.assertNull(range.getMaximum());
+        Assert.assertFalse(range.hasNext());
+        Assert.assertEquals(0, range.getCount());
+    }
+
+    @Test
+    public void testClose() throws IOException // passes as long as build() returns non-null and close() does not throw
+    {
+        RangeIterator tokens = RangeIntersectionIterator.builder()
+                                                        .add(new LongIterator(new long[] { 1L, 2L, 3L }))
+                                                        .build();
+
+        Assert.assertNotNull(tokens);
+        tokens.close();
+    }
+
+    @Test
+    public void testIsOverlapping()
+    {
+        RangeIterator rangeA, rangeB;
+        // [1, 5] and [5, 9] share only the boundary value 5
+        rangeA = new LongIterator(new long[] { 1L, 5L });
+        rangeB = new LongIterator(new long[] { 5L, 9L });
+        Assert.assertTrue(RangeIterator.isOverlapping(rangeA, rangeB));
+        // [5, 9] and [1, 6] overlap partially, with the later-starting range passed first
+        rangeA = new LongIterator(new long[] { 5L, 9L });
+        rangeB = new LongIterator(new long[] { 1L, 6L });
+        Assert.assertTrue(RangeIterator.isOverlapping(rangeA, rangeB));
+        // identical bounds
+        rangeA = new LongIterator(new long[] { 5L, 9L });
+        rangeB = new LongIterator(new long[] { 5L, 9L });
+        Assert.assertTrue(RangeIterator.isOverlapping(rangeA, rangeB));
+        // disjoint: the first argument ends before the second begins
+        rangeA = new LongIterator(new long[] { 1L, 4L });
+        rangeB = new LongIterator(new long[] { 5L, 9L });
+        Assert.assertFalse(RangeIterator.isOverlapping(rangeA, rangeB));
+        // disjoint: the first argument begins after the second ends
+        rangeA = new LongIterator(new long[] { 6L, 9L });
+        rangeB = new LongIterator(new long[] { 1L, 4L });
+        Assert.assertFalse(RangeIterator.isOverlapping(rangeA, rangeB));
+    }
+
+    @Test
+    public void testIntersectionOfRandomRanges()
+    {
+        for (int attempt = 0; attempt < 16; attempt++)
+        {
+            final int maxRanges = nextInt(2, 16);
+
+            // generate randomized ranges: each one is a sorted array of unique values drawn via nextLong(0, 100)
+            long[][] ranges = new long[maxRanges][];
+            for (int i = 0; i < ranges.length; i++)
+            {
+                int rangeSize = nextInt(16, 512);
+                LongSet range = new LongHashSet(rangeSize);
+
+                for (int j = 0; j < rangeSize; j++)
+                    range.add(nextLong(0, 100));
+
+                ranges[i] = range.toArray();
+                Arrays.sort(ranges[i]);
+            }
+
+            List<Long> expected = new ArrayList<>();
+            // determine unique tokens which intersect every range (binary search works because each range is sorted)
+            for (long token : ranges[0])
+            {
+                boolean intersectsAll = true;
+                for (int i = 1; i < ranges.length; i++)
+                {
+                    if (Arrays.binarySearch(ranges[i], token) < 0)
+                    {
+                        intersectsAll = false;
+                        break;
+                    }
+                }
+
+                if (intersectsAll)
+                    expected.add(token);
+            }
+            // build the real intersection from the same arrays and compare it with the reference result
+            RangeIterator.Builder builder = RangeIntersectionIterator.builder();
+            for (long[] range : ranges)
+                builder.add(new LongIterator(range));
+
+            Assert.assertEquals(expected, convert(builder.build()));
+        }
+    }
+
+    // SAI specific tests
+    @Test
+    public void testMergingSameToken()
+    {
+        RangeIterator intersection =
+                buildIntersection(new LongIterator(arr(1L, 2L, 3L)), // token = offset
+                                  new LongIterator(arr(2L, 3L, 4), token -> -token), // negate token = offset
+                                  new LongIterator(arr(3L, 4L, 5L), token -> token * token)); // token^2 = offset
+        // 3 is the only token common to all three inputs; per the mappings above its offsets are 3, -3 and 9
+        assertEquals(convert(-3L, 3L, 9L), LongIterator.convertOffsets(intersection));
+    }
+
+    @Test
+    public void testSelectiveIntersection() // NOTE(review): the first argument presumably caps how many ranges are intersected - confirm against buildSelectiveIntersection
+    {
+        RangeIterator intersection = buildSelectiveIntersection(2,
+                                                                arr(1L, 4L, 6L, 7L),
+                                                                arr(1L, 4L, 5L, 6L),
+                                                                arr(4L, 6L, 8L, 9L, 10L)); // skipped
+        // 1, 4 and 6 are the values shared by the first two arrays; the third array does not contain 1
+        assertEquals(convert(1L, 4L, 6L), convert(intersection));
+
+        intersection = buildSelectiveIntersection(1,
+                                                  arr(2L, 4L, 6L),
+                                                  arr(1L, 4L, 5L, 6L),       // skipped
+                                                  arr(4L, 6L, 8L, 9L, 10L)); // skipped
+        // the expected result equals the first array unchanged
+        assertEquals(convert(2L, 4L, 6L), convert(intersection));
+    }
+
+    @Test
+    public void testIntersectionOfUnionsOnError()
+    {
+        // intersection of two unions, the first of which comes from buildOnErrorB
+        RangeIterator unionA = buildOnErrorB(UNION, arr(1L, 2L, 3L), arr(5L, 6L, 7L));
+        RangeIterator unionB = buildUnion(arr(2L, 4L, 6L), arr(5L, 7L, 9L));
+        assertOnError(buildIntersection(unionA, unionB));
+
+        // intersection of a buildOnErrorB union and a regular intersection
+        RangeIterator unionC = buildOnErrorB(UNION, arr(2L, 4L, 6L), arr(5L, 6L, 9L));
+        RangeIterator intersectionA = buildIntersection(arr(3L, 4L, 6L, 9L), arr(2L, 3L, 6L, 9L));
+        assertOnError(buildIntersection(unionC, intersectionA));
+    }
+
+    @Test
+    public void testIntersectionOfIntersectionsOnError()
+    {
+        RangeIterator intersectionA = buildIntersection(arr(1L, 2L, 3L, 6L), arr(2L, 3L, 6L));
+        RangeIterator intersectionB = buildOnErrorA(INTERSECTION, arr(2L, 4L, 6L), arr(5L, 6L, 7L, 9L)); // buildOnErrorA result as second operand
+        assertOnError(buildIntersection(intersectionA, intersectionB));
+
+        intersectionA = buildOnErrorB(INTERSECTION, arr(1L, 2L, 3L, 4L, 5L), arr(2L, 3L, 4L)); // buildOnErrorB result as first operand
+        intersectionB = buildIntersection(arr(1L, 2L, 3L, 4L, 6L), arr(2L, 3L, 4L, 7L, 9L));
+        assertOnError(buildIntersection(intersectionA, intersectionB));
+
+        intersectionA = buildOnError(INTERSECTION, arr(1L, 2L, 3L, 5L), arr( 3L, 4L)); // buildOnError (no A/B suffix) result as first operand
+        intersectionB = buildIntersection(arr(1L, 2L, 3L, 4L, 6L), arr(2L, 3L, 4L, 7L, 9L));
+        assertOnError(buildIntersection(intersectionA, intersectionB));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/utils/RangeUnionIteratorTest.java b/test/unit/org/apache/cassandra/index/sai/utils/RangeUnionIteratorTest.java
new file mode 100644
index 000000000000..15c3e8f2ffb5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/utils/RangeUnionIteratorTest.java
@@ -0,0 +1,451 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.io.util.FileUtils;
+
+import static org.apache.cassandra.index.sai.utils.LongIterator.convert;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.CONCAT;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.INTERSECTION;
+import static org.apache.cassandra.index.sai.utils.RangeIterator.Builder.IteratorType.UNION;
+
+public class RangeUnionIteratorTest extends AbstractRangeIteratorTest
+{
+    @Test
+    public void testNoOverlappingValues()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // the three inputs interleave but share no value; together they hold 1..10
+        builder.add(new LongIterator(new long[] { 2L, 3L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 1L, 7L }));
+        builder.add(new LongIterator(new long[] { 4L, 8L, 9L, 10L }));
+
+        Assert.assertEquals(convert(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), convert(builder.build()));
+    }
+
+    @Test
+    public void testSingleIterator()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // with a single input, the built iterator is expected to yield exactly that input's values
+        builder.add(new LongIterator(new long[] { 1L, 2L, 4L, 9L }));
+
+        Assert.assertEquals(convert(1L, 2L, 4L, 9L), convert(builder.build()));
+    }
+
+    @Test
+    public void testOverlappingValues()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // 4 and 6 occur in more than one input but are expected only once in the result
+        builder.add(new LongIterator(new long[] { 1L, 4L, 6L, 7L }));
+        builder.add(new LongIterator(new long[] { 2L, 3L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 4L, 6L, 8L, 9L, 10L }));
+
+        List<Long> values = convert(builder.build());
+        // values.toString() is passed as the assertion failure message
+        Assert.assertEquals(values.toString(), convert(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), values);
+    }
+
+    @Test
+    public void testNoOverlappingRanges()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // three back-to-back inputs whose bounds do not overlap: [1, 3], [4, 6], [7, 9]
+        builder.add(new LongIterator(new long[] { 1L, 2L, 3L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 7L, 8L, 9L }));
+
+        Assert.assertEquals(convert(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), convert(builder.build()));
+    }
+
+    @Test
+    public void testTwoIteratorsWithSingleValues()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // the same single value in both inputs is expected exactly once in the result
+        builder.add(new LongIterator(new long[] { 1L }));
+        builder.add(new LongIterator(new long[] { 1L }));
+
+        Assert.assertEquals(convert(1L), convert(builder.build()));
+    }
+
+    @Test
+    public void testDifferentSizeIterators()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // inputs of 6, 4 and 5 values; 5 occurs in two of them and 11 in none
+        builder.add(new LongIterator(new long[] { 2L, 3L, 5L, 6L, 12L, 13L }));
+        builder.add(new LongIterator(new long[] { 1L, 7L, 14L, 15 }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 8L, 9L, 10L }));
+
+        Assert.assertEquals(convert(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 15L), convert(builder.build()));
+    }
+
+    @Test
+    public void testRandomSequences()
+    {
+        ThreadLocalRandom random = ThreadLocalRandom.current();
+        // the number of parts is chosen once and reused by every round; the contents are regenerated per round
+        long[][] values = new long[random.nextInt(1, 20)][];
+        int numTests = random.nextInt(10, 20);
+
+        for (int tests = 0; tests < numTests; tests++)
+        {
+            RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+            int totalCount = 0;
+
+            for (int i = 0; i < values.length; i++)
+            {
+                long[] part = new long[random.nextInt(1, 500)];
+                for (int j = 0; j < part.length; j++)
+                    part[j] = random.nextLong();
+
+                // all of the parts have to be sorted to mimic SSTable
+                Arrays.sort(part);
+
+                values[i] = part;
+                builder.add(new LongIterator(part));
+                totalCount += part.length;
+            }
+            // reference result: every generated value flattened into one array and sorted
+            long[] totalOrdering = new long[totalCount];
+            int index = 0;
+
+            for (long[] part : values)
+            {
+                for (long value : part)
+                    totalOrdering[index++] = value;
+            }
+
+            Arrays.sort(totalOrdering);
+
+            int count = 0;
+            RangeIterator tokens = builder.build();
+
+            Assert.assertNotNull(tokens);
+            while (tokens.hasNext())
+                Assert.assertEquals(totalOrdering[count++], (long) tokens.next().get());
+            // NOTE(review): relies on random.nextLong() producing no repeated values; a repeat would presumably be merged by the union
+            Assert.assertEquals(totalCount, count);
+        }
+    }
+
+    @Test
+    public void testMinMaxAndCount()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // three 3-value inputs covering 1..9 without repeats
+        builder.add(new LongIterator(new long[] { 1L, 2L, 3L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 7L, 8L, 9L }));
+
+        Assert.assertEquals(9L, (long) builder.getMaximum());
+        Assert.assertEquals(9L, builder.getTokenCount());
+
+        RangeIterator tokens = builder.build();
+        // the built iterator is expected to report the same bounds and count as the builder
+        Assert.assertNotNull(tokens);
+        Assert.assertEquals(1L, (long) tokens.getMinimum());
+        Assert.assertEquals(9L, (long) tokens.getMaximum());
+        Assert.assertEquals(9L, tokens.getCount());
+        // iterate over the expected values 1..9 in order
+        for (long i = 1; i < 10; i++)
+        {
+            Assert.assertTrue(tokens.hasNext());
+            Assert.assertEquals(i, (long) tokens.next().get());
+        }
+        // once exhausted, getMinimum() is still expected to return the original minimum
+        Assert.assertFalse(tokens.hasNext());
+        Assert.assertEquals(1L, (long) tokens.getMinimum());
+    }
+
+    @Test
+    public void testBuilder()
+    {
+        RangeUnionIterator.Builder builder = RangeUnionIterator.builder();
+        // a builder with no ranges reports no bounds and zero counts
+        Assert.assertNull(builder.getMinimum());
+        Assert.assertNull(builder.getMaximum());
+        Assert.assertEquals(0L, builder.getTokenCount());
+        Assert.assertEquals(0L, builder.rangeCount());
+
+        builder.add(new LongIterator(new long[] { 1L, 2L, 3L }));
+        builder.add(new LongIterator(new long[] { 4L, 5L, 6L }));
+        builder.add(new LongIterator(new long[] { 7L, 8L, 9L }));
+        // after adding three 3-value ranges: overall bounds 1..9, 9 tokens, 3 ranges
+        Assert.assertEquals(1L, (long) builder.getMinimum());
+        Assert.assertEquals(9L, (long) builder.getMaximum());
+        Assert.assertEquals(9L, builder.getTokenCount());
+        Assert.assertEquals(3L, builder.rangeCount());
+        Assert.assertFalse(builder.statistics.isDisjoint());
+        // the ranges are expected to come back ordered by their minimum
+        Assert.assertEquals(1L, (long) builder.ranges.poll().getMinimum());
+        Assert.assertEquals(4L, (long) builder.ranges.poll().getMinimum());
+        Assert.assertEquals(7L, (long) builder.ranges.poll().getMinimum());
+
+        RangeIterator tokens = RangeUnionIterator.build(new ArrayList<RangeIterator>()
+        {{
+            add(new LongIterator(new long[]{1L, 2L, 4L}));
+            add(new LongIterator(new long[]{3L, 5L, 6L}));
+        }});
+
+        Assert.assertEquals(convert(1L, 2L, 3L, 4L, 5L, 6L), convert(tokens));
+
+        FileUtils.closeQuietly(tokens);
+        // building with no ranges at all is expected to give an iterator whose count is 0
+        RangeIterator emptyTokens = RangeUnionIterator.builder().build();
+        Assert.assertEquals(0, emptyTokens.getCount());
+        // null iterators, null lists and empty iterators are all expected to leave the counts at 0
+        builder = RangeUnionIterator.builder();
+        Assert.assertEquals(0L, builder.add((RangeIterator) null).rangeCount());
+        Assert.assertEquals(0L, builder.add((List<RangeIterator>) null).getTokenCount());
+        Assert.assertEquals(0L, builder.add(new LongIterator(new long[] {})).rangeCount());
+        // single-range case: use the union builder so this test covers RangeUnionIterator, not the intersection builder
+        RangeIterator single = new LongIterator(new long[] { 1L, 2L, 3L });
+        RangeIterator range = RangeUnionIterator.builder().add(single).build();
+
+        // because build should return first element if it's only one instead of building yet another iterator
+        Assert.assertEquals(range, single);
+    }
+
+    @Test
+    public void testSkipTo()
+    {
+        // union of three non-overlapping ranges that together hold the tokens 1 to 9
+        RangeIterator union = RangeUnionIterator.builder()
+                                                .add(new LongIterator(new long[]{1L, 2L, 3L}))
+                                                .add(new LongIterator(new long[]{4L, 5L, 6L}))
+                                                .add(new LongIterator(new long[]{7L, 8L, 9L}))
+                                                .build();
+        Assert.assertNotNull(union);
+
+        // a target in the middle of the second range is returned as the next token
+        union.skipTo(5L);
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(5L, (long) union.next().get());
+        // a target equal to the first token of the third range
+        union.skipTo(7L);
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(7L, (long) union.next().get());
+        // a target past the last token exhausts the union; the reported bounds are still 1 and 9
+        union.skipTo(10L);
+        Assert.assertFalse(union.hasNext());
+        Assert.assertEquals(1L, (long) union.getMinimum());
+        Assert.assertEquals(9L, (long) union.getMaximum());
+    }
+
+    @Test
+    public void testMergingMultipleIterators()
+    {
+        // first nested union: {1, 3, 5} followed by {8, 10, 12}
+        RangeIterator unionA = RangeUnionIterator.builder()
+                                                 .add(new LongIterator(new long[] { 1L, 3L, 5L }))
+                                                 .add(new LongIterator(new long[] { 8L, 10L, 12L }))
+                                                 .build();
+        // second nested union: its ranges are added out of token order, {7, 9, 11} before {2, 4, 6}
+        RangeIterator unionB = RangeUnionIterator.builder()
+                                                 .add(new LongIterator(new long[] { 7L, 9L, 11L }))
+                                                 .add(new LongIterator(new long[] { 2L, 4L, 6L }))
+                                                 .build();
+        RangeIterator merged = RangeUnionIterator.build(Arrays.asList(unionA, unionB));
+        Assert.assertEquals(convert(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L), convert(merged));
+    }
+
+    @Test
+    public void testRangeIterator()
+    {
+        // plain iteration over a range without gaps
+        LongIterator dense = new LongIterator(new long[] { 0L, 1L, 2L, 3L });
+        Assert.assertEquals(0L, (long) dense.getMinimum());
+        Assert.assertEquals(3L, (long) dense.getMaximum());
+
+        for (long expected = 0L; expected <= 3L; expected++)
+        {
+            Assert.assertTrue(dense.hasNext());
+            Assert.assertEquals(expected, (long) dense.getCurrent());
+            Assert.assertEquals(expected, (long) dense.next().get());
+        }
+
+        LongIterator sparse = new LongIterator(new long[] { 0L, 1L, 3L, 5L });
+        // 2 is not part of the range, so skipTo is expected to land on the next larger token
+        Assert.assertEquals(3L, (long) sparse.skipTo(2L).get());
+        Assert.assertTrue(sparse.hasNext());
+        Assert.assertEquals(3L, (long) sparse.getCurrent());
+        Assert.assertEquals(3L, (long) sparse.next().get());
+        // 5 is part of the range, so skipTo is expected to land exactly on it
+        Assert.assertEquals(5L, (long) sparse.skipTo(5L).get());
+        Assert.assertTrue(sparse.hasNext());
+        Assert.assertEquals(5L, (long) sparse.getCurrent());
+        Assert.assertEquals(5L, (long) sparse.next().get());
+
+        // skipTo on an iterator without any token is expected to return null
+        LongIterator empty = new LongIterator(new long[0]);
+        Assert.assertNull(empty.skipTo(3L));
+        Assert.assertFalse(empty.hasNext());
+    }
+
+    @Test
+    public void emptyRangeTest()
+    {
+        // in every scenario the empty inputs must leave the bounds and the count of the union untouched
+        // empty, then non-empty
+        RangeIterator.Builder unionBuilder = RangeUnionIterator.builder();
+        unionBuilder.add(new LongIterator(new long[] {}));
+        for (long token = 10; token < 20; token++)
+            unionBuilder.add(new LongIterator(new long[] {token}));
+        RangeIterator union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(19), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(10, union.getCount());
+
+        unionBuilder = RangeUnionIterator.builder();
+        unionBuilder.add(new LongIterator(new long[] {}));
+        unionBuilder.add(new LongIterator(new long[] {10}));
+        union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(10), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(1, union.getCount());
+
+        // non-empty, then empty
+        unionBuilder = RangeUnionIterator.builder();
+        for (long token = 10; token < 20; token++)
+            unionBuilder.add(new LongIterator(new long[] {token}));
+        unionBuilder.add(new LongIterator(new long[] {}));
+        union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(19), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(10, union.getCount());
+
+        unionBuilder = RangeUnionIterator.builder();
+        unionBuilder.add(new LongIterator(new long[] {10}));
+        unionBuilder.add(new LongIterator(new long[] {}));
+        union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(10), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(1, union.getCount());
+
+        // empty, then non-empty then empty again
+        unionBuilder = RangeUnionIterator.builder();
+        unionBuilder.add(new LongIterator(new long[] {}));
+        for (long token = 10; token < 20; token++)
+            unionBuilder.add(new LongIterator(new long[] {token}));
+        unionBuilder.add(new LongIterator(new long[] {}));
+        union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(19), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(10, union.getCount());
+
+        // non-empty, empty, then non-empty again
+        unionBuilder = RangeUnionIterator.builder();
+        for (long token = 10; token < 15; token++)
+            unionBuilder.add(new LongIterator(new long[] {token}));
+        unionBuilder.add(new LongIterator(new long[] {}));
+        for (long token = 15; token < 20; token++)
+            unionBuilder.add(new LongIterator(new long[] {token}));
+        union = unionBuilder.build();
+        Assert.assertEquals(Long.valueOf(10), union.getMinimum());
+        Assert.assertEquals(Long.valueOf(19), union.getMaximum());
+        Assert.assertTrue(union.hasNext());
+        Assert.assertEquals(10, union.getCount());
+    }
+
+    // SAI specific tests
+    @Test
+    public void testUnionOfIntersection()
+    {
+        // union of two non-intersected intersections
+        RangeIterator emptyA = buildIntersection(arr(1L, 2L, 3L), arr(4L, 5L, 6L));
+        RangeIterator emptyB = buildIntersection(arr(6L, 7L, 8L), arr(9L, 10L, 11L));
+        // neither intersection has a common token, so the union is expected to yield nothing
+        RangeIterator unionOfEmpty = buildUnion(emptyA, emptyB);
+        assertEquals(convert(), convert(unionOfEmpty));
+
+        // union of two intersected intersections
+        RangeIterator matchingA = buildIntersection(arr(1L, 2L, 3L), arr(2L, 3L, 4L));
+        RangeIterator matchingB = buildIntersection(arr(6L, 7L, 8L), arr(7L, 8L, 9L));
+        // both operands yield tokens; the result is also checked to be an actual RangeUnionIterator
+        RangeIterator unionOfMatching = buildUnion(matchingA, matchingB);
+        assertEquals(convert(2L, 3L, 7L, 8L), convert(unionOfMatching));
+        assertEquals(RangeUnionIterator.class, unionOfMatching.getClass());
+
+        // union of one intersected intersection and one non-intersected intersection
+        RangeIterator matching = buildIntersection(arr(1L, 2L, 3L), arr(2L, 3L, 4L ));
+        RangeIterator nonMatching = buildIntersection(arr(6L, 7L, 8L), arr(10L ));
+        // only the tokens of the intersected operand are expected in the union
+        RangeIterator unionOfMixed = buildUnion(matching, nonMatching);
+        assertEquals(convert(2L, 3L), convert(unionOfMixed));
+    }
+
+    @Test
+    public void testUnionOnError()
+    {
+        assertOnError(buildOnError(UNION, arr(1L, 3L, 4L ), arr(7L, 8L)));  // inputs without a common token
+        assertOnError(buildOnErrorA(UNION, arr(1L, 3L, 4L ), arr(4L, 5L))); // inputs sharing the token 4
+        assertOnError(buildOnErrorB(UNION, arr(1L), arr(2)));               // single-token inputs
+    }
+
+    @Test
+    public void testUnionOfIntersectionsOnError()
+    {
+        RangeIterator plainFirst = buildIntersection(arr(1L, 2L, 3L, 6L), arr(2L, 3L, 6L));
+        RangeIterator onErrorSecond = buildOnErrorA(INTERSECTION, arr(2L, 4L, 6L), arr(5L, 6L, 7L, 9L));
+        assertOnError(buildUnion(plainFirst, onErrorSecond));
+        // mirrored case: the buildOnError* intersection is now the first operand of the union
+        RangeIterator onErrorFirst = buildOnErrorB(INTERSECTION, arr(1L, 2L, 3L, 4L, 5L), arr(2L, 3L, 5L));
+        RangeIterator plainSecond = buildIntersection(arr(2L, 4L, 5L), arr(5L, 6L, 7L));
+        assertOnError(buildUnion(onErrorFirst, plainSecond));
+    }
+
+    @Test
+    public void testUnionOfUnionsOnError()
+    {
+        RangeIterator plainFirst = buildUnion(arr(1L, 2L, 3L, 6L), arr(6L, 7L, 8L));
+        RangeIterator onErrorSecond = buildOnErrorA(UNION, arr(2L, 4L, 6L), arr(6L, 7L, 9L));
+        assertOnError(buildUnion(plainFirst, onErrorSecond));
+        // mirrored case: the buildOnError* union is now the first operand of the outer union
+        RangeIterator onErrorFirst = buildOnErrorB(UNION, arr(1L, 2L, 3L), arr(3L, 7L, 8L));
+        RangeIterator plainSecond = buildUnion(arr(2L, 4L, 5L), arr(5L, 7L, 9L));
+        assertOnError(buildUnion(onErrorFirst, plainSecond));
+    }
+
+    @Test
+    public void testUnionOfMergingOnError()
+    {
+        RangeIterator plainFirst = buildConcat(arr(1L, 2L, 3L, 6L), arr(6L, 7L, 8L));
+        RangeIterator onErrorSecond = buildOnErrorA(CONCAT, arr(2L, 4L, 6L), arr(6L, 7L, 9L));
+        assertOnError(buildUnion(plainFirst, onErrorSecond));
+        // mirrored case: the buildOnError* concatenation is now the first operand of the union
+        RangeIterator onErrorFirst = buildOnErrorB(CONCAT, arr(1L, 2L, 3L), arr(3L, 7L, 8L));
+        RangeIterator plainSecond = buildConcat(arr(2L, 4L, 5L), arr(5L, 7L, 9L));
+        assertOnError(buildUnion(onErrorFirst, plainSecond));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java b/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
new file mode 100644
index 000000000000..794738a1f794
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
@@ -0,0 +1,252 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.view;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableContext;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+public class IndexViewManagerTest extends SAITester
+{
+    private static final int CONCURRENT_UPDATES = 100; // iterations of the flush/compaction race in testConcurrentUpdate
+
+    @BeforeClass
+    public static void setupVersionBarrier()
+    {
+        requireNetwork();
+    }
+
+    @Test
+    public void testUpdateFromFlush() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT)");
+        String indexName = createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        // capture the view before anything has been flushed, to compare it with the view after the flush
+        ColumnContext columnContext = columnIndex(getCurrentColumnFamilyStore(), indexName);
+        View initialView = columnContext.getView();
+
+        execute("INSERT INTO %s(k, v) VALUES (1, 10)");
+        execute("INSERT INTO %s(k, v) VALUES (2, 20)");
+        flush();
+        // the flush is expected to publish a new view holding the index of the single flushed sstable
+        View updatedView = columnContext.getView();
+        assertNotEquals(initialView, updatedView);
+        assertEquals(1, updatedView.getIndexes().size());
+    }
+
+    @Test
+    public void testUpdateFromCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT)");
+        String indexName = createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        ColumnFamilyStore store = getCurrentColumnFamilyStore();
+        ColumnContext columnContext = columnIndex(store, indexName);
+        store.disableAutoCompaction();
+        // with auto compaction disabled, the two flushes below leave two sstables, each with its own index
+        execute("INSERT INTO %s(k, v) VALUES (1, 10)");
+        execute("INSERT INTO %s(k, v) VALUES (2, 20)");
+        flush();
+
+        execute("INSERT INTO %s(k, v) VALUES (3, 30)");
+        execute("INSERT INTO %s(k, v) VALUES (4, 40)");
+        flush();
+
+        View initialView = columnContext.getView();
+        assertEquals(2, initialView.getIndexes().size());
+        // the explicit maximal compaction is expected to replace both indexes with a single one
+        CompactionManager.instance.performMaximal(store, false);
+
+        View updatedView = columnContext.getView();
+        assertNotEquals(initialView, updatedView);
+        assertEquals(1, updatedView.getIndexes().size());
+    }
+
+    /**
+     * Tests concurrent sstable updates from flush and compaction, see CASSANDRA-14207.
+     */
+    @Test
+    public void testConcurrentUpdate() throws Throwable
+    {
+        String tableName = createTable("CREATE TABLE %s (k INT PRIMARY KEY, v INT)");
+        String indexName = createIndex("CREATE CUSTOM INDEX ON %s(v) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        ColumnFamilyStore store = getCurrentColumnFamilyStore();
+        ColumnContext columnContext = columnIndex(store, indexName);
+        Path tmpDir = Files.createTempDirectory("IndexViewManagerTest");
+        store.disableAutoCompaction();
+
+        // create sstable 1 from flush
+        execute("INSERT INTO %s(k, v) VALUES (1, 10)");
+        execute("INSERT INTO %s(k, v) VALUES (2, 20)");
+        flush();
+
+        // create sstable 2 from flush
+        execute("INSERT INTO %s(k, v) VALUES (3, 30)");
+        execute("INSERT INTO %s(k, v) VALUES (4, 40)");
+        flush();
+
+        // save sstables 1 and 2 and create sstable 3 from compaction
+        assertEquals(2, store.getLiveSSTables().size());
+        store.getLiveSSTables().forEach(reader -> copySSTable(reader, tmpDir));
+        CompactionManager.instance.performMaximal(store, false);
+
+        // create sstable 4 from flush
+        execute("INSERT INTO %s(k, v) VALUES (5, 50)");
+        execute("INSERT INTO %s(k, v) VALUES (6, 60)");
+        flush();
+
+        // save sstables 3 and 4
+        store.getLiveSSTables().forEach(reader -> copySSTable(reader, tmpDir));
+        // open readers over the four copies saved in tmpDir, numbered 1 to 4
+        List<SSTableReader> sstables = IntStream.rangeClosed(1, 4)
+                                                .mapToObj(i -> new Descriptor(tmpDir.toFile(), KEYSPACE, tableName, i))
+                                                .map(SSTableReader::open)
+                                                .collect(Collectors.toList());
+
+        List<SSTableReader> none = Collections.emptyList();
+        List<SSTableReader> initial = sstables.stream().limit(2).collect(Collectors.toList());
+
+        ExecutorService executor = Executors.newFixedThreadPool(2);
+        for (int i = 0; i < CONCURRENT_UPDATES; i++)
+        {
+            // mock the initial view indexes to track the number of releases
+            List<SSTableContext> initialContexts = sstables.stream().limit(2).map(SSTableContext::create).collect(Collectors.toList());
+            List<SSTableIndex> initialIndexes = new ArrayList<>();
+
+            for (SSTableContext initialContext : initialContexts)
+            {
+                MockSSTableIndex mockSSTableIndex = new MockSSTableIndex(initialContext, columnContext);
+                initialIndexes.add(mockSSTableIndex);
+            }
+
+            IndexViewManager tracker = new IndexViewManager(columnContext, initialIndexes);
+            View initialView = tracker.getView();
+            assertEquals(2, initialView.size());
+
+            List<SSTableContext> compacted = sstables.stream().skip(2).limit(1).map(SSTableContext::create).collect(Collectors.toList());
+            List<SSTableContext> flushed = sstables.stream().skip(3).limit(1).map(SSTableContext::create).collect(Collectors.toList());
+
+            // concurrently update from both flush and compaction
+            Future<?> compaction = executor.submit(() -> tracker.update(initial, compacted, true, false));
+            Future<?> flush = executor.submit(() -> tracker.update(none, flushed, true, false));
+
+            FBUtilities.waitOnFutures(Arrays.asList(compaction, flush));
+            // whatever the interleaving, two indexes are expected in the final view and one release per initial index
+            View updatedView = tracker.getView();
+            assertNotEquals(initialView, updatedView);
+            assertEquals(2, updatedView.getIndexes().size());
+
+            for (SSTableIndex index : initialIndexes)
+            {
+                assertEquals(1, ((MockSSTableIndex) index).releaseCount);
+            }
+
+            // release original SSTableContext objects.
+            // shared copies are already released when compacted and flushed are added.
+            initialContexts.forEach(SSTableContext::close);
+            initialContexts.forEach(group -> assertTrue(group.isCleanedUp()));
+
+            // release compacted and flushed SSTableContext original and shared copies
+            compacted.forEach(SSTableContext::close);
+            flushed.forEach(SSTableContext::close);
+            tracker.getView().getIndexes().forEach(SSTableIndex::release);
+            compacted.forEach(group -> assertTrue(group.isCleanedUp()));
+            flushed.forEach(group -> assertTrue(group.isCleanedUp()));
+        }
+        executor.shutdown();
+        assertTrue("executor did not terminate", executor.awaitTermination(1, TimeUnit.MINUTES));
+    }
+
+    private ColumnContext columnIndex(ColumnFamilyStore store, String indexName)
+    {
+        assert store.indexManager != null;
+        StorageAttachedIndex sai = (StorageAttachedIndex) store.indexManager.getIndexByName(indexName);
+        return sai.getContext();
+    }
+
+    public static class MockSSTableIndex extends SSTableIndex
+    {
+        int releaseCount = 0; // incremented on every call to release()
+
+        MockSSTableIndex(SSTableContext group, ColumnContext context) throws IOException
+        {
+            super(group, context, IndexComponents.create(context.getIndexName(), group.descriptor(), CryptoUtils.getCompressionParams(group.sstable())));
+        }
+
+        @Override
+        public void release()
+        {
+            super.release();
+            releaseCount++;
+        }
+    }
+
+    private static void copySSTable(SSTableReader table, Path destDir)
+    {
+        for (Component component : SSTable.componentsFor(table.descriptor))
+        {
+            Path src = table.descriptor.fileFor(component).toPath();
+            Path dst = destDir.resolve(src.getFileName()); // same file name, inside destDir
+            try
+            {
+                Files.copy(src, dst, StandardCopyOption.REPLACE_EXISTING);
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java b/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
new file mode 100644
index 000000000000..8f4bbcca80c8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/virtual/IndexesSystemViewTest.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.virtual.VirtualKeyspace;
+import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.inject.Injections;
+import org.apache.cassandra.inject.InvokePointBuilder;
+import org.apache.cassandra.schema.SchemaConstants;
+
+/**
+ * Tests the virtual table exposing storage-attached column index metadata.
+ */
+public class IndexesSystemViewTest extends SAITester
+{
+    private static final String SELECT = String.format("SELECT %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s FROM %s.%s WHERE %s = '%s'",
+                                                       IndexesSystemView.INDEX_NAME,
+                                                       IndexesSystemView.TABLE_NAME,
+                                                       IndexesSystemView.COLUMN_NAME,
+                                                       IndexesSystemView.IS_QUERYABLE,
+                                                       IndexesSystemView.IS_BUILDING,
+                                                       IndexesSystemView.IS_STRING,
+                                                       IndexesSystemView.ANALYZER,
+                                                       IndexesSystemView.INDEXED_SSTABLE_COUNT,
+                                                       IndexesSystemView.CELL_COUNT,
+                                                       IndexesSystemView.PER_TABLE_DISK_SIZE,
+                                                       IndexesSystemView.PER_COLUMN_DISK_SIZE,
+                                                       SchemaConstants.VIRTUAL_VIEWS,
+                                                       IndexesSystemView.NAME,
+                                                       IndexesSystemView.KEYSPACE_NAME,
+                                                       KEYSPACE);
+
+    private final Injections.Barrier blockIndexBuild = Injections.newBarrier("block_index_build", 2, false)
+                                                                 .add(InvokePointBuilder.newInvokePoint().onClass(StorageAttachedIndex.class)
+                                                                                        .onMethod("startInitialBuild"))
+                                                                 .build();
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(SchemaConstants.VIRTUAL_VIEWS, ImmutableList.of(new IndexesSystemView(SchemaConstants.VIRTUAL_VIEWS))));
+
+        CQLTester.setUpClass();
+    }
+
+    @Test
+    public void testVirtualTableThroughIndexLifeCycle() throws Throwable
+    {
+        // create the table and verify that the virtual table is empty before creating any indexes
+        assertEmpty(execute(SELECT));
+        createTable("CREATE TABLE %s (k int, c int, v1 int, v2 text, PRIMARY KEY (k, c))");
+
+        // create the index simulating a long build and verify that there is an empty record in the virtual table
+        Injections.inject(blockIndexBuild);
+        String v1IndexName = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v1) USING '%s'", StorageAttachedIndex.class.getName()));
+
+        assertRows(execute(SELECT), row(v1IndexName, "v1", false, true, false, 0, 0L));
+
+        // unblock the long build and verify that there is a finished empty record in the virtual table
+        blockIndexBuild.countDown();
+        blockIndexBuild.disable();
+        waitForIndexQueryable();
+        assertRows(execute(SELECT), row(v1IndexName, "v1", true, false, false, 0, 0L));
+
+        // insert some data and verify that virtual table record is still empty since we haven't flushed yet
+        execute("INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)", 1, 10, 100, "1000");
+        execute("INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)", 2, 20, 200, "2000");
+        assertRows(execute(SELECT), row(v1IndexName, "v1", true, false, false, 0, 0L));
+
+        // flush the memtable and verify the not-empty record in the virtual table
+        flush();
+        assertRows(execute(SELECT), row(v1IndexName, "v1", true, false, false, 1, 2L));
+
+        // flush a second memtable and verify the updated record in the virtual table
+        execute("INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)", 3, 30, 300, "3000");
+        flush();
+        assertRows(execute(SELECT), row(v1IndexName, "v1", true, false, false, 2, 3L));
+
+        // create a second index, this should create a new additional entry in the table
+        String v2IndexName = createIndex(String.format("CREATE CUSTOM INDEX ON %%s(v2) USING '%s'", StorageAttachedIndex.class.getName()));
+        waitForIndexQueryable();
+        assertRows(execute(SELECT),
+                   row(v1IndexName, "v1", true, false, false, 2, 3L),
+                   row(v2IndexName, "v2", true, false, true, 2, 3L));
+
+        // update some of the existing rows, this should increase the cell count due to the multiple versions
+        execute("INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)", 1, 10, 111, "1111");
+        execute("INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)", 2, 20, 222, "2222");
+        flush();
+        assertRowsIgnoringOrderAndExtra(execute(SELECT),
+                                        row(v1IndexName, "v1", true, false, false, 3, 5L),
+                                        row(v2IndexName, "v2", true, false, true, 3, 5L));
+
+        // compact and verify that the cell count decreases
+        compact();
+        waitForCompactionsFinished();
+
+        assertRowsIgnoringOrderAndExtra(execute(SELECT),
+                                        row(v1IndexName, "v1", true, false, false, 1, 3L),
+                                        row(v2IndexName, "v2", true, false, true, 1, 3L));
+
+        // drop the second index and verify that there is no entry for it in the virtual table
+        dropIndex("DROP INDEX %s." + v2IndexName);
+        assertRowsIgnoringOrderAndExtra(execute(SELECT), row(v1IndexName, "v1", true, false, false, 1, 3L));
+
+        // truncate the base table and verify that there is still an entry in the virtual table and it's empty
+        truncate(false);
+        assertRowsIgnoringOrderAndExtra(execute(SELECT), row(v1IndexName, "v1", true, false, false, 0, 0L));
+
+        // drop the base table and verify that the virtual table is empty
+        dropTable("DROP TABLE %s");
+        assertEmpty(execute(SELECT));
+    }
+
+    /**
+     * Builds the expected virtual table row for the index with the given name. The analyzer and both disk sizes
+     * are read from the live index and its group, so callers only state the values that are under test.
+     */
+    private Object[] row(String indexName,
+                         String columnName,
+                         boolean isQueryable,
+                         boolean isBuilding,
+                         boolean isString,
+                         int sstableCount,
+                         long cellCount) throws Exception
+    {
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+        StorageAttachedIndex sai = (StorageAttachedIndex) cfs.indexManager.getIndexByName(indexName);
+        ColumnContext context = sai.getContext();
+
+        return row(indexName,
+                   currentTable(),
+                   columnName,
+                   isQueryable,
+                   isBuilding,
+                   isString,
+                   context.getAnalyzer().toString(),
+                   sstableCount,
+                   cellCount,
+                   group.diskUsage(),
+                   context.diskUsage());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/virtual/SSTablesSystemViewTest.java b/test/unit/org/apache/cassandra/index/sai/virtual/SSTablesSystemViewTest.java
new file mode 100644
index 000000000000..dc753bc18772
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/virtual/SSTablesSystemViewTest.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.virtual.VirtualKeyspace;
+import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.SchemaConstants;
+
+/**
+ * Tests the virtual table exposing SSTable index metadata.
+ */
+public class SSTablesSystemViewTest extends SAITester
+{
+    private static final String SELECT = String.format("SELECT %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s " +
+                                                       "FROM %s.%s WHERE %s = '%s'",
+                                                       SSTablesSystemView.INDEX_NAME, // selected columns, in the order built by row(...)
+                                                       SSTablesSystemView.SSTABLE_NAME,
+                                                       SSTablesSystemView.TABLE_NAME,
+                                                       SSTablesSystemView.COLUMN_NAME,
+                                                       SSTablesSystemView.FORMAT_VERSION,
+                                                       SSTablesSystemView.CELL_COUNT,
+                                                       SSTablesSystemView.MIN_ROW_ID,
+                                                       SSTablesSystemView.MAX_ROW_ID,
+                                                       SSTablesSystemView.START_TOKEN,
+                                                       SSTablesSystemView.END_TOKEN,
+                                                       SSTablesSystemView.PER_TABLE_DISK_SIZE,
+                                                       SSTablesSystemView.PER_COLUMN_DISK_SIZE,
+                                                       SchemaConstants.VIRTUAL_VIEWS, // keyspace of the virtual table
+                                                       SSTablesSystemView.NAME, // name of the virtual table
+                                                       SSTablesSystemView.KEYSPACE_NAME, // only rows of the test keyspace are selected
+                                                       KEYSPACE);
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SSTablesSystemView view = new SSTablesSystemView(SchemaConstants.VIRTUAL_VIEWS); // the view under test
+        VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(SchemaConstants.VIRTUAL_VIEWS, ImmutableList.of(view)));
+        CQLTester.setUpClass();
+    }
+
+    @Test
+    public void testVirtualTableThroughIndexLifeCycle() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, v1 int, v2 int, PRIMARY KEY (k, c))");
+        String v1IndexName = createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        String insert = "INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)";
+
+        // the virtual table should be empty before adding contents
+        assertEmpty(execute(SELECT));
+
+        // insert a row and verify that the virtual table is still empty before flushing
+        execute(insert, 1, 10, 100, 1000);
+        assertEmpty(execute(SELECT));
+
+        // flush the memtable and verify the new record: sstable generation 1, 1 cell, sstable row ids 0 to 0
+        flush();
+        Object[] row1 = row(v1IndexName, 1, "v1", 1L, 0L, 0L);
+        assertRows(execute(SELECT), row1);
+
+        // flush a second memtable with two rows and verify both the old and the new record (generation 2, 2 cells)
+        execute(insert, 2, 20, 200, 2000);
+        execute(insert, 3, 30, 300, 3000);
+        flush();
+        Object[] row2 = row(v1IndexName, 2, "v1", 2L, 0L, 1L);
+        assertRows(execute(SELECT), row1, row2);
+
+        // create a second index, this should create one additional entry in the table for each existing sstable
+        String v2IndexName = createIndex("CREATE CUSTOM INDEX ON %s(v2) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        Object[] row3 = row(v2IndexName, 1, "v2", 1L, 0L, 0L);
+        Object[] row4 = row(v2IndexName, 2, "v2", 2L, 0L, 1L);
+        assertRows(execute(SELECT), row1, row2, row3, row4);
+
+        // create a new sstable that only contains data for the second index (v1 is null), adding only one new entry
+        execute(insert, 4, 40, null, 4000);
+        flush();
+        Object[] row5 = row(v2IndexName, 3, "v2", 1L, 0L, 0L);
+        assertRows(execute(SELECT), row1, row2, row3, row4, row5);
+
+        // create a new sstable where each row has a value for only one of the two indexes (1 cell per index)
+        execute(insert, 5, 50, 500, null);
+        execute(insert, 6, 60, null, 6000);
+        flush();
+        Object[] row6 = row(v1IndexName, 4, "v1", 1L, 0L, 0L);
+        Object[] row7 = row(v2IndexName, 4, "v2", 1L, 1L, 1L);
+        assertRows(execute(SELECT), row1, row2, row6, row3, row4, row5, row7);
+
+        // compact the table and verify that the virtual table has a single entry per index
+        compact();
+        waitForCompactions();
+        // Compaction may result in an sstable with generation 5 or 6, so both generations are tried.
+        // keys 4 and 6 were inserted with a null v1, hence 4 cells for v1
+        Object[] row8 = row(v1IndexName, 5, 6, "v1", 4L, 0L, 5L);
+        // key 5 was inserted with a null v2, hence 5 cells for v2
+        Object[] row9 = row(v2IndexName, 5, 6, "v2", 5L, 1L, 5L);
+        assertRows(execute(SELECT), row8, row9);
+
+        // drop the first index and verify that there are no entries left for it in the table
+        dropIndex("DROP INDEX %s." + v1IndexName);
+        assertRows(execute(SELECT), row9);
+
+        // drop the base table and verify that the virtual table is empty
+        dropTable("DROP TABLE %s");
+        assertEmpty(execute(SELECT));
+    }
+
+    // Expected row for the first sstable with a generation in [generationMin, generationMax] that is indexed
+    // by the given index. Fails with a descriptive error when the index has no sstable in that range, instead
+    // of returning a null row for the caller to pass on to assertRows.
+    private Object[] row(String indexName, int generationMin, int generationMax, String columnName,
+                         long cellCount, long minSSTableRowId, long maxSSTableRowId) throws Exception
+    {
+        for (int generation = generationMin; generation <= generationMax; ++generation)
+        {
+            Object[] row = row(indexName, generation, columnName, cellCount, minSSTableRowId, maxSSTableRowId);
+            if (row != null)
+                return row;
+        }
+
+        throw new AssertionError(String.format("Index %s has no sstable with a generation between %d and %d",
+                                               indexName, generationMin, generationMax));
+    }
+
+    // Expected row for the sstable with the given generation as indexed by the named index of the current
+    // table, or null if the index view has no sstable with that generation. The sstable file name, format
+    // version, token bounds and disk sizes are read from the live index; the rest comes from the caller.
+    private Object[] row(String indexName,
+                         int generation,
+                         String columnName,
+                         long cellCount,
+                         long minSSTableRowId,
+                         long maxSSTableRowId) throws Exception
+    {
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StorageAttachedIndex sai = (StorageAttachedIndex) cfs.indexManager.getIndexByName(indexName);
+
+        for (SSTableIndex sstableIndex : sai.getContext().getView())
+        {
+            SSTableReader sstable = sstableIndex.getSSTable();
+            if (sstable.descriptor.generation != generation)
+                continue;
+
+            Token.TokenFactory tokenFactory = cfs.metadata().partitioner.getTokenFactory();
+            AbstractBounds<Token> bounds = sstable.getBounds();
+
+            return row(indexName,
+                       sstable.getFilename(),
+                       currentTable(),
+                       columnName,
+                       sstableIndex.getVersion().toString(),
+                       cellCount,
+                       minSSTableRowId,
+                       maxSSTableRowId,
+                       tokenFactory.toString(bounds.left),
+                       tokenFactory.toString(bounds.right),
+                       sstableIndex.getSSTableContext().diskUsage(),
+                       sstableIndex.sizeOfPerColumnComponents());
+        }
+        return null;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/virtual/SegmentsSystemViewTest.java b/test/unit/org/apache/cassandra/index/sai/virtual/SegmentsSystemViewTest.java
new file mode 100644
index 000000000000..e31a63975e19
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/virtual/SegmentsSystemViewTest.java
@@ -0,0 +1,231 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.index.sai.virtual;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.marshal.MapType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.virtual.VirtualKeyspace;
+import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.SSTableIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.disk.SegmentBuilder;
+import org.apache.cassandra.index.sai.disk.SegmentMetadata;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.service.StorageService;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests the virtual table exposing SSTable index segment metadata.
+ */
+public class SegmentsSystemViewTest extends SAITester
+{
+    private static final String SELECT = String.format("SELECT %s, %s, %s, %s " +
+                                                       "FROM %s.%s WHERE %s = '%s' AND %s = ?",
+                                                       SegmentsSystemView.SEGMENT_ROW_ID_OFFSET,
+                                                       SegmentsSystemView.CELL_COUNT,
+                                                       SegmentsSystemView.MIN_SSTABLE_ROW_ID,
+                                                       SegmentsSystemView.MAX_SSTABLE_ROW_ID,
+                                                       SchemaConstants.VIRTUAL_VIEWS,
+                                                       SegmentsSystemView.NAME,
+                                                       SegmentsSystemView.KEYSPACE_NAME,
+                                                       KEYSPACE,
+                                                       SegmentsSystemView.INDEX_NAME); // the index name is bound per query
+
+    // component metadata and term bounds of every row of the view for the test keyspace, whatever the index
+    private static final String SELECT_INDEX_METADATA = String.format("SELECT %s, %s, %s " +
+                                                                      "FROM %s.%s WHERE %s = '%s'",
+                                                                      SegmentsSystemView.COMPONENT_METADATA,
+                                                                      SegmentsSystemView.MIN_TERM,
+                                                                      SegmentsSystemView.MAX_TERM,
+                                                                      SchemaConstants.VIRTUAL_VIEWS,
+                                                                      SegmentsSystemView.NAME,
+                                                                      SegmentsSystemView.KEYSPACE_NAME,
+                                                                      KEYSPACE);
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SegmentsSystemView view = new SegmentsSystemView(SchemaConstants.VIRTUAL_VIEWS); // the view under test
+        VirtualKeyspaceRegistry.instance.register(new VirtualKeyspace(SchemaConstants.VIRTUAL_VIEWS, ImmutableList.of(view)));
+        requireNetwork();
+    }
+
+    @Test
+    public void testSegmentsMetadata() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, v1 int, v2 text, PRIMARY KEY (k, c))");
+        String numericIndex = createIndex("CREATE CUSTOM INDEX ON %s(v1) USING 'StorageAttachedIndex'");
+        String stringIndex = createIndex("CREATE CUSTOM INDEX ON %s(v2) USING 'StorageAttachedIndex'");
+
+        int num = 100;
+
+        String insert = "INSERT INTO %s(k, c, v1, v2) VALUES (?, ?, ?, ?)";
+
+        // the virtual table should be empty before adding contents
+        assertEmpty(execute(SELECT, numericIndex));
+        assertEmpty(execute(SELECT, stringIndex));
+
+        // insert the first half of the rows and verify that the virtual table is still empty before flushing
+        for (int i = 0; i < num / 2; i++)
+            execute(insert, i, 10, 100, "1000");
+        assertEmpty(execute(SELECT, numericIndex));
+        assertEmpty(execute(SELECT, stringIndex));
+
+        // flush the memtable and verify the new record: offset 0, num / 2 cells, sstable row ids 0 to num / 2 - 1
+        flush();
+        Object[] row1 = row(0L, (long)(num / 2), 0L, (long)(num / 2 - 1));
+        assertRows(execute(SELECT, numericIndex), row1);
+        assertRows(execute(SELECT, stringIndex), row1);
+
+        // flush a second memtable and verify both records; the new one has the same expected values as the old one
+        for (int i = num / 2; i < num; i++)
+            execute(insert, i, 20, 200, "2000");
+        flush();
+        Object[] row2 = row(0L, (long)(num / 2), 0L, (long)(num / 2 - 1));
+        assertRows(execute(SELECT, numericIndex), row1, row2);
+        assertRows(execute(SELECT, stringIndex), row1, row2);
+
+        // force compaction, after which a single record covering all the rows is expected per index
+        compact();
+        waitForCompactions();
+        Object[] row3 = row(0L, (long)num, 0L, (long)(num - 1));
+        assertRows(execute(SELECT, numericIndex), row3);
+        assertRows(execute(SELECT, stringIndex), row3);
+
+        for (int lastValidSegmentRowId : Arrays.asList(0, 1, 2, 3, 5, 9, 25, 49, 59, 99, 101))
+        {
+            SegmentBuilder.updateLastValidSegmentRowId(lastValidSegmentRowId); // NOTE(review): static setting, not restored by this test
+
+            // upgradeSSTables is used here to rewrite the sstables and, with them, the index segments
+            StorageService.instance.upgradeSSTables(KEYSPACE, false, new String[] { currentTable() });
+            // however many segments we create during the build we should always end up with
+            // just 1 segment with all the rows in it
+            Object[] segmentRow = row(0L, (long)num, 0L, (long)(num - 1));
+            assertRows(execute(SELECT, numericIndex), segmentRow);
+            assertRows(execute(SELECT, stringIndex), segmentRow);
+
+            // sum the component lengths reported by the virtual table, per index component type
+            Map<String, Long> indexLengths = new HashMap<>();
+            for (UntypedResultSet.Row row : execute(SELECT_INDEX_METADATA))
+            {
+                int minTerm = Integer.parseInt(row.getString(SegmentsSystemView.MIN_TERM));
+                int maxTerm = Integer.parseInt(row.getString(SegmentsSystemView.MAX_TERM));
+
+                assertTrue(minTerm >= 100); // smallest inserted value (v1 = 100)
+                assertTrue(maxTerm <= 2000); // largest inserted value (v2 = "2000")
+
+                Map<String, Map<String, String>> indexMetadatas = row.getMap(SegmentsSystemView.COMPONENT_METADATA,
+                                                                             UTF8Type.instance,
+                                                                             MapType.getInstance(UTF8Type.instance, UTF8Type.instance, true));
+
+                for (Map.Entry<String, Map<String, String>> entry : indexMetadatas.entrySet())
+                {
+                    final String indexType = entry.getKey();
+                    final String str = entry.getValue().getOrDefault(SegmentMetadata.ComponentMetadata.LENGTH, "0");
+
+                    if (indexType.equals(IndexComponents.NDIType.KD_TREE.toString()))
+                    {
+                        int maxPointsInLeafNode = Integer.parseInt(entry.getValue().get("max_points_in_leaf_node"));
+
+                        assertEquals(1024, maxPointsInLeafNode);
+                    }
+                    else if (indexType.equals(IndexComponents.NDIType.KD_TREE_POSTING_LISTS.toString()))
+                    {
+                        int numLeafPostings = Integer.parseInt(entry.getValue().get("num_leaf_postings"));
+
+                        assertTrue(numLeafPostings > 0);
+                    }
+
+                    final long length = Long.parseLong(str);
+
+                    final long value = indexLengths.getOrDefault(indexType, 0L);
+                    indexLengths.put(indexType, value + length);
+                }
+            }
+            if (!Boolean.parseBoolean(System.getProperty("cassandra.test.encryption", "false")))
+                assertEquals(indexFileLengths(currentTable()), indexLengths); // skipped when cassandra.test.encryption is true
+        }
+
+        // drop the numeric index and verify that there are no entries left for it in the table
+        dropIndex("DROP INDEX %s." + numericIndex);
+        assertEmpty(execute(SELECT, numericIndex));
+        assertNotEquals(0, execute(SELECT, stringIndex).size());
+
+        // drop the string index and verify that there are no entries left for either index in the table
+        dropIndex("DROP INDEX %s." +  stringIndex);
+        assertEmpty(execute(SELECT, numericIndex));
+        assertEmpty(execute(SELECT, stringIndex));
+    }
+
+    // Sums the sizes of the per-column index components of every sstable index of the current table, keyed by
+    // component type name: terms data and posting lists for literal columns, kd-tree and kd-tree posting
+    // lists otherwise. The table argument is not used; the current table of the test is always inspected.
+    private HashMap<String, Long> indexFileLengths(String table) throws Exception
+    {
+        HashMap<String, Long> totals = new HashMap<>();
+
+        for (Index candidate : getCurrentColumnFamilyStore().indexManager.listIndexes())
+        {
+            // every index of the table is cast to StorageAttachedIndex, so any other kind fails the test
+            StorageAttachedIndex sai = (StorageAttachedIndex) candidate;
+
+            for (SSTableIndex perSSTable : sai.getContext().getView().getIndexes())
+            {
+                SSTableReader reader = perSSTable.getSSTable();
+                IndexComponents components = IndexComponents.create(perSSTable.getColumnContext().getIndexName(), reader);
+                boolean literal = perSSTable.getColumnContext().isLiteral();
+
+                // two components are counted per sstable index, picked by whether the column is literal
+                IndexComponents.IndexComponent first = literal ? components.termsData : components.kdTree;
+                IndexComponents.IndexComponent second = literal ? components.postingLists
+                                                                : components.kdTreePostingLists;
+
+                addComponentSizeToMap(totals, first, components);
+                addComponentSizeToMap(totals, second, components);
+            }
+        }
+
+        return totals;
+    }
+
+    // Adds the size that indexComponents reports for the given component to the total kept for its type name.
+    private void addComponentSizeToMap(HashMap<String, Long> map, IndexComponents.IndexComponent key, IndexComponents indexComponents)
+    {
+        String typeName = key.ndiType.name;
+        final long componentSize = indexComponents.sizeOf(Collections.singleton(key));
+        map.merge(typeName, componentSize, Long::sum);
+    }
+}

From 8d187208d907bed215fc5baf5814f2125b540fc0 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Fri, 19 Mar 2021 14:40:50 +0000
Subject: [PATCH 045/151] STAR-347: Fix FailureTest failure

(cherry picked from commit f090eb6dce1e0e425a3d0ba209f8cb42e0ac540a)
(cherry picked from commit ea105ce23fb105b0383c485617b5ddcec81a5a9d)
---
 .../index/sai/functional/FailureTest.java      | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java b/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
index bfa776a89846..c33899e363c1 100644
--- a/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/functional/FailureTest.java
@@ -102,22 +102,26 @@ public void shouldMakeIndexNonQueryableOnSSTableContextFailureDuringCreation() t
     {
         createTable(CREATE_TABLE_TEMPLATE);
 
-        execute("INSERT INTO %s (id, v1) VALUES ('1', 1)");
-        execute("INSERT INTO %s (id, v1) VALUES ('2', 2)");
+        execute("INSERT INTO %s (id1, v1) VALUES ('1', 1)");
+        execute("INSERT INTO %s (id1, v1) VALUES ('2', 2)");
+
+        // We need to reference SSTableContext first or the failure injection fails
+        // because byteman can't find the class.
+        SSTableContext.openFilesPerSSTable();
 
         Injection ssTableContextCreationFailure = newFailureOnEntry("context_failure_on_creation", SSTableContext.class, "create", RuntimeException.class);
         Injections.inject(ssTableContextCreationFailure);
 
-        String v1IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v1"));
+        String v2IndexName = createIndex(String.format(CREATE_INDEX_TEMPLATE, "v2"));
 
         // Verify that the initial index build fails...
-        verifyInitialIndexFailed(v1IndexName);
+        verifyInitialIndexFailed(v2IndexName);
 
         verifyIndexFiles(0, 0, 0, 0);
-        verifySSTableIndexes(v1IndexName, 0);
+        verifySSTableIndexes(v2IndexName, 0);
 
         // ...and then verify that, while the node is still operational, the index is not.
-        Assertions.assertThatThrownBy(() -> executeNet("SELECT * FROM %s WHERE v1 > 1"))
-                  .isInstanceOf(ReadFailureException.class);
+        Assertions.assertThatThrownBy(() -> execute("SELECT * FROM %s WHERE v2 = '1'"))
+                  .isInstanceOf(IndexNotAvailableException.class);
     }
 }

From efc66d1c1f7a729ad14a359963d968290286b88c Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 25 Mar 2021 18:06:11 +0000
Subject: [PATCH 046/151] STAR-361: Query all ranges at once for SAI
 distributed queries (#85)

(cherry picked from commit 5ba3c6bd57ea437e670c9f3734d36cd5f75822db)
(cherry picked from commit c7e186dd5648ac96472c1bf519d35082c37d6d33)
---
 .../index/sai/utils/PrimaryKeys.java          |   2 +-
 .../cassandra/service/StorageProxy.java       |   2 -
 .../reads/range/RangeCommandIterator.java     |   8 +-
 .../service/reads/range/RangeCommands.java    |  31 ++--
 .../test/sai/ConcurrencyFactorTest.java       | 136 ++++++++++++++++++
 5 files changed, 164 insertions(+), 15 deletions(-)
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/ConcurrencyFactorTest.java

diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
index f50624004f74..2828ac769c6c 100644
--- a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
+++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
@@ -121,7 +121,7 @@ private Skinny()
         @Override
         public long add(DecoratedKey key, Clustering clustering)
         {
-            assert clustering == Clustering.EMPTY;
+            assert clustering.isEmpty();
             return keys.add(key) ? SET_ENTRY_OVERHEAD : 0;
         }
 
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index ac0f83c1d706..72801a9d129b 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -102,7 +102,6 @@
 import org.apache.cassandra.locator.Replicas;
 import org.apache.cassandra.metrics.CASClientRequestMetrics;
 import org.apache.cassandra.metrics.CASClientWriteRequestMetrics;
-import org.apache.cassandra.metrics.ClientRangeRequestMetrics;
 import org.apache.cassandra.metrics.ClientRequestMetrics;
 import org.apache.cassandra.metrics.ClientWriteRequestMetrics;
 import org.apache.cassandra.metrics.ReadRepairMetrics;
@@ -168,7 +167,6 @@ public AtomicInteger load(InetAddressAndPort inetAddress)
         }
     };
     private static final ClientRequestMetrics readMetrics = new ClientRequestMetrics("Read");
-    public static final ClientRangeRequestMetrics rangeMetrics = new ClientRangeRequestMetrics("RangeSlice");
     private static final ClientWriteRequestMetrics writeMetrics = new ClientWriteRequestMetrics("Write");
     private static final CASClientWriteRequestMetrics casWriteMetrics = new CASClientWriteRequestMetrics("CASWrite");
     private static final CASClientRequestMetrics casReadMetrics = new CASClientRequestMetrics("CASRead");
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
index ae7ee60666db..a345c6091e01 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
@@ -40,6 +40,7 @@
 import org.apache.cassandra.locator.EndpointsForRange;
 import org.apache.cassandra.locator.Replica;
 import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.metrics.ClientRangeRequestMetrics;
 import org.apache.cassandra.metrics.ClientRequestMetrics;
 import org.apache.cassandra.net.Message;
 import org.apache.cassandra.net.MessagingService;
@@ -51,11 +52,13 @@
 import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.CloseableIterator;
 
-class RangeCommandIterator extends AbstractIterator<RowIterator> implements PartitionIterator
+@VisibleForTesting
+public class RangeCommandIterator extends AbstractIterator<RowIterator> implements PartitionIterator
 {
     private static final Logger logger = LoggerFactory.getLogger(RangeCommandIterator.class);
 
-    private static final ClientRequestMetrics rangeMetrics = new ClientRequestMetrics("RangeSlice");
+    @VisibleForTesting
+    public static final ClientRangeRequestMetrics rangeMetrics = new ClientRangeRequestMetrics("RangeSlice");
 
     private final CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans;
     private final int totalRangeCount;
@@ -259,6 +262,7 @@ public void close()
         }
         finally
         {
+            rangeMetrics.roundTrips.update(batchesRequested);
             long latency = System.nanoTime() - startTime;
             rangeMetrics.addNano(latency);
             Keyspace.openAndGetStore(command.metadata()).metric.coordinatorScanLatency.update(latency, TimeUnit.NANOSECONDS);
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
index f9ad9694085a..433af022e198 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
@@ -72,19 +72,30 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma
         Keyspace keyspace = Keyspace.open(command.metadata().keyspace);
         ReplicaPlanIterator replicaPlans = new ReplicaPlanIterator(command.dataRange().keyRange(), command.indexQueryPlan(), keyspace, consistencyLevel);
 
-        // our estimate of how many result rows there will be per-range
-        float resultsPerRange = estimateResultsPerRange(command, keyspace);
-        // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
-        // fetch enough rows in the first round
-        resultsPerRange -= resultsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
         int maxConcurrencyFactor = Math.min(replicaPlans.size(), MAX_CONCURRENT_RANGE_REQUESTS);
-        int concurrencyFactor = resultsPerRange == 0.0
+        int concurrencyFactor = maxConcurrencyFactor;
+        Index.QueryPlan queryPlan = command.indexQueryPlan();
+        if ( queryPlan == null || queryPlan.shouldEstimateInitialConcurrency())
+        {
+            // our estimate of how many result rows there will be per-range
+            float resultsPerRange = estimateResultsPerRange(command, keyspace);
+            // underestimate how many rows we will get per-range in order to increase the likelihood that we'll
+            // fetch enough rows in the first round
+            resultsPerRange -= resultsPerRange * CONCURRENT_SUBREQUESTS_MARGIN;
+            concurrencyFactor = resultsPerRange == 0.0
                                 ? 1
                                 : Math.max(1, Math.min(maxConcurrencyFactor, (int) Math.ceil(command.limits().count() / resultsPerRange)));
-        logger.trace("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
-                     resultsPerRange, command.limits().count(), replicaPlans.size(), concurrencyFactor);
-        Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)",
-                      replicaPlans.size(), concurrencyFactor, resultsPerRange);
+            logger.trace("Estimated result rows per range: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
+                         resultsPerRange, command.limits().count(), replicaPlans.size(), concurrencyFactor);
+            Tracing.trace("Submitting range requests on {} ranges with a concurrency of {} ({} rows per range expected)",
+                          replicaPlans.size(), concurrencyFactor, resultsPerRange);
+        }
+        else
+        {
+            logger.trace("Max concurrent range requests: {}; requested rows: {}, ranges.size(): {}; concurrent range requests: {}",
+                         MAX_CONCURRENT_RANGE_REQUESTS, command.limits().count(), replicaPlans.size(), concurrencyFactor);
+            Tracing.trace("Submitting range requests on {} ranges with a concurrency of {}", replicaPlans.size(), concurrencyFactor);
+        }
 
         ReplicaPlanMerger mergedReplicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, consistencyLevel);
         return new RangeCommandIterator(mergedReplicaPlans,
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/ConcurrencyFactorTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/ConcurrencyFactorTest.java
new file mode 100644
index 000000000000..007e907b8ef7
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/ConcurrencyFactorTest.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.service.reads.range.RangeCommandIterator;
+
+import static junit.framework.TestCase.assertEquals;
+import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
+import static org.apache.cassandra.distributed.api.Feature.NETWORK;
+
+public class ConcurrencyFactorTest extends TestBaseImpl
+{
+    private static final String SAI_TABLE = "sai_simple_primary_key";
+
+    private static final int nodes = 3;
+
+    private org.apache.cassandra.distributed.Cluster cluster;
+
+    @Before
+    public void init() throws IOException
+    {
+        cluster = init(Cluster.build(nodes).withTokenSupplier(node -> {
+            switch (node)
+            {
+                case 1: return -9223372036854775808L;
+                case 2: return -3074457345618258602L;
+                case 3: return 3074457345618258603L;
+                default: throw new IllegalArgumentException();
+            }
+        }).withConfig(config -> config.with(NETWORK).with(GOSSIP)).start());
+    }
+
+    @After
+    public void cleanup()
+    {
+        cluster.close();
+    }
+
+    private void insertRows(long startVal, long endVal, long increment)
+    {
+        String template = "INSERT INTO %s.%s (pk, state, gdp) VALUES (%s, %s)";
+        Random rnd = new Random();
+        String fakeState, rowData;
+        int i = 0;
+        for (long val = startVal; val <= endVal; val += increment)
+        {
+            fakeState = String.format("%c%c", (char)(rnd.nextInt(26) + 'A'), (char)(rnd.nextInt(26) + 'A'));
+            rowData = String.format("'%s', %s", fakeState, val);
+            cluster.coordinator(1).execute(String.format(template, KEYSPACE, SAI_TABLE, i++, rowData), ConsistencyLevel.LOCAL_ONE);
+        }
+    }
+
+    @Test
+    public void testInitialConcurrencySelection()
+    {
+        cluster.schemaChange(String.format("CREATE TABLE %s.%s (pk int, state ascii, gdp bigint, PRIMARY KEY (pk)) WITH compaction = " +
+                                           " {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }", KEYSPACE, SAI_TABLE));
+        cluster.schemaChange(String.format("CREATE CUSTOM INDEX ON %s.%s (gdp) USING 'StorageAttachedIndex'", KEYSPACE, SAI_TABLE));
+
+        insertRows(1_000_000_000L, 16_000_000_000L, 1_000_000_000L);
+
+        // flush all nodes, expected row distribution by partition key value
+        // node0: 9, 14, 12, 3
+        // node1: 5, 10, 13, 11, 1, 8, 0, 2
+        // node2: 4, 15, 7, 6
+        cluster.forEach((node) -> node.flush(KEYSPACE));
+
+        // we expect to use StorageProxy#RangeCommandIterator and the hit count to increase
+        String query = String.format("SELECT state FROM %s.%s WHERE gdp > ? AND gdp < ? LIMIT 20", KEYSPACE, SAI_TABLE);
+        int prevHistCount = getRangeReadCount();
+        runAndValidate(prevHistCount, 1, query, 3_000_000_000L, 7_000_000_000L);
+
+        // partition-restricted query
+        // we don't expect to use StorageProxy#RangeCommandIterator so previous hit count remains the same
+        query = String.format("SELECT state FROM %s.%s WHERE pk = ?", KEYSPACE, SAI_TABLE);
+        runAndValidate(prevHistCount, 1, query, 0);
+
+        // token-restricted query
+        // we expect StorageProxy#RangeCommandIterator to be used so reset previous hit count
+        query = String.format("SELECT * FROM %s.%s WHERE token(pk) > 0", KEYSPACE, SAI_TABLE);
+        prevHistCount = getRangeReadCount();
+        runAndValidate(prevHistCount, 1, query);
+
+        // token-restricted query and index
+        // we expect StorageProxy#RangeCommandIterator to be used so reset previous hit count
+        query = String.format("SELECT * FROM %s.%s WHERE token(pk) > 0 AND gdp > ?", KEYSPACE, SAI_TABLE);
+        prevHistCount = getRangeReadCount();
+        runAndValidate(prevHistCount, 1, query, 3_000_000_000L);
+    }
+
+    /*
+        Run the given query, check the hit count, check the max round trips.
+     */
+    private void runAndValidate(int prevHistCount, int maxRoundTrips, String query, Object... bondValues)
+    {
+        cluster.coordinator(1).execute(query, ConsistencyLevel.ALL, bondValues);
+        assertEquals(prevHistCount + 1,  getRangeReadCount());
+        assertEquals(maxRoundTrips, getMaxRoundTrips());
+    }
+
+    private int getRangeReadCount()
+    {
+        return cluster.get(1).callOnInstance(() -> Math.toIntExact(RangeCommandIterator.rangeMetrics.roundTrips.getCount()));
+    }
+
+    private int getMaxRoundTrips()
+    {
+        return cluster.get(1).callOnInstance(() -> Math.toIntExact(RangeCommandIterator.rangeMetrics.roundTrips.getSnapshot().getMax()));
+    }
+}

From 641a5cfaa403fc60ec45110518ebc30810b5eec6 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Mon, 22 Mar 2021 12:10:05 +0000
Subject: [PATCH 047/151] STAR-360: Change compaction pause to be more specific
 to rewrite

(cherry picked from commit 16b3e35db2a8d6c66bd5fead3c1240116a3ae8c4)
(cherry picked from commit b913058c6cfe47624ced944255877630eb8e3c3f)
---
 .../apache/cassandra/index/sai/functional/CompactionTest.java   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java b/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
index 1e1e2138aa21..d929e5abe694 100644
--- a/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/functional/CompactionTest.java
@@ -235,7 +235,7 @@ public void testConcurrentIndexBuildWithCompaction() throws Throwable
 
         Injections.Barrier compactionLatch =
         Injections.newBarrier("pause_compaction", 2, false)
-                  .add(InvokePointBuilder.newInvokePoint().onClass(BigTableWriter.class).onMethod("afterAppend"))
+                  .add(InvokePointBuilder.newInvokePoint().onClass(CompactionManager.class).onMethod("performSSTableRewrite"))
                   .build();
 
         try

From e7984601cb62f6369148c81a0383fcdefbba4661 Mon Sep 17 00:00:00 2001
From: Aleksandr Sorokoumov <918393+Gerrrr@users.noreply.github.com>
Date: Mon, 29 Mar 2021 14:13:30 +0200
Subject: [PATCH 048/151] STAR-374: Avoid redundant calls to RateLimiter in
 CompactionTask (#92)

+ throttle RateLimiter acquire in compaction operations

Co-authored-by: nitsanw <nitsanw@yahoo.com>
(cherry picked from commit a90d75cda3045929c98c30b2a00ce178e5b77227)
(cherry picked from commit d70f083dd6db19eb814d1fbfce6af26579b68922)
---
 .../db/compaction/CompactionManager.java      | 27 ++++++++++++++-----
 .../db/compaction/CompactionTask.java         |  7 +++--
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index 7d503892c136..2349689b330d 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -33,6 +33,7 @@
 import com.google.common.collect.*;
 import com.google.common.util.concurrent.*;
 
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.locator.RangesAtEndpoint;
 import org.slf4j.Logger;
@@ -115,6 +116,8 @@ protected Boolean initialValue()
             return false;
         }
     };
+    private static final int ACQUIRE_GRANULARITY =
+    Integer.getInteger(Config.PROPERTY_PREFIX + "compaction_rate_limit_granularity_in_kb", 128) * 1024;
 
     static
     {
@@ -1269,9 +1272,8 @@ private void doCleanupOne(final ColumnFamilyStore cfs,
 
                     long bytesScanned = scanner.getBytesScanned();
 
-                    compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio);
-
-                    lastBytesScanned = bytesScanned;
+                    if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                        lastBytesScanned = bytesScanned;
                 }
             }
 
@@ -1296,9 +1298,21 @@ private void doCleanupOne(final ColumnFamilyStore cfs,
 
     }
 
-    static void compactionRateLimiterAcquire(RateLimiter limiter, long bytesScanned, long lastBytesScanned, double compressionRatio)
+    static boolean compactionRateLimiterAcquire(RateLimiter limiter, long bytesScanned, long lastBytesScanned, double compressionRatio)
     {
+        if (DatabaseDescriptor.getCompactionThroughputMbPerSec() == 0)
+            return false;
+
         long lengthRead = (long) ((bytesScanned - lastBytesScanned) * compressionRatio) + 1;
+        // Acquire at 128k granularity. At worst we'll exceed the limit a bit, but acquire is quite expensive.
+        if (lengthRead < ACQUIRE_GRANULARITY)
+            return false;
+
+        return actuallyAcquire(limiter, lengthRead);
+    }
+
+    private static boolean actuallyAcquire(RateLimiter limiter, long lengthRead)
+    {
         while (lengthRead >= Integer.MAX_VALUE)
         {
             limiter.acquire(Integer.MAX_VALUE);
@@ -1308,6 +1322,7 @@ static void compactionRateLimiterAcquire(RateLimiter limiter, long bytesScanned,
         {
             limiter.acquire((int) lengthRead);
         }
+        return true;
     }
 
     private static abstract class CleanupStrategy
@@ -1623,8 +1638,8 @@ else if (transChecker.test(token))
                         unrepairedWriter.append(partition);
                     }
                     long bytesScanned = scanners.getTotalBytesScanned();
-                    compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio);
-                    lastBytesScanned = bytesScanned;
+                    if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                        lastBytesScanned = bytesScanned;
                 }
             }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index 13c97253bcf6..bfb840c166ce 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -196,10 +196,9 @@ public boolean apply(SSTableReader sstable)
 
                         long bytesScanned = scanners.getTotalBytesScanned();
 
-                        //Rate limit the scanners, and account for compression
-                        CompactionManager.compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio);
-
-                        lastBytesScanned = bytesScanned;
+                        // Rate limit the scanners, and account for compression
+                        if (CompactionManager.compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                            lastBytesScanned = bytesScanned;
 
                         if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
                         {

From cf7714e5be99b2ec77d34b44b4aff551516d7d09 Mon Sep 17 00:00:00 2001
From: jasonrutherglen <jason.rutherglen@gmail.com>
Date: Mon, 29 Mar 2021 12:52:20 -0500
Subject: [PATCH 049/151] STAR-348 moved postings decoded to mark on postings
 reader close (#87)

fixed up PostingsTest and others

removed new line

(cherry picked from commit fbdbec306ebd383aa3ac05236e93c21ce869cbce)
(cherry picked from commit fabfca15c68c05ae9a5903767ca150db03b1a2d5)
---
 .../index/sai/disk/QueryEventListeners.java   |  2 +-
 .../index/sai/disk/v1/PostingsReader.java     |  4 +-
 .../index/sai/metrics/ColumnQueryMetrics.java |  4 +-
 .../metrics/MulticastQueryEventListeners.java | 12 +--
 .../index/sai/metrics/QueryEventListener.java |  4 +-
 .../index/sai/disk/v1/PostingsTest.java       | 91 +++++++++----------
 .../sai/metrics/QueryEventListeners.java      |  2 +-
 7 files changed, 57 insertions(+), 62 deletions(-)

diff --git a/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java b/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
index 711b543eed8e..ba6a80265c66 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/QueryEventListeners.java
@@ -92,6 +92,6 @@ public static class NoOpPostingListEventListener implements QueryEventListener.P
         public void onAdvance() { }
 
         @Override
-        public void onPostingDecoded() { }
+        public void postingDecoded(long postingsDecoded) { }
     }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
index 5d97000d59fc..f2df0941e4ac 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsReader.java
@@ -59,6 +59,7 @@ public class PostingsReader implements OrdinalPostingList
 
     private long currentPosition;
     private DirectReaders.Reader currentFORValues;
+    private long postingsDecoded = 0;
 
     @VisibleForTesting
     PostingsReader(IndexInput input, long summaryOffset, QueryEventListener.PostingListEventListener listener) throws IOException
@@ -184,6 +185,7 @@ public long length()
     @Override
     public void close() throws IOException
     {
+        listener.postingDecoded(postingsDecoded);
         try
         {
             input.close();
@@ -346,7 +348,7 @@ private int nextRowID()
         else
         {
             final long id = currentFORValues.get(seekingInput, currentPosition, blockIdx);
-            listener.onPostingDecoded();
+            postingsDecoded++;
             return Math.toIntExact(id);
         }
     }
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java b/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
index a0559c58fa04..4cbb9124ff0e 100644
--- a/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
+++ b/src/java/org/apache/cassandra/index/sai/metrics/ColumnQueryMetrics.java
@@ -138,9 +138,9 @@ private PostingListEventsMetrics(Meter postingDecodes)
         public void onAdvance() { }
 
         @Override
-        public void onPostingDecoded()
+        public void postingDecoded(long postingsDecoded)
         {
-            postingDecodes.mark();
+            postingDecodes.mark(postingsDecoded);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java b/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
index f3926dcdfd8e..2096ef0fd902 100644
--- a/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
+++ b/src/java/org/apache/cassandra/index/sai/metrics/MulticastQueryEventListeners.java
@@ -133,10 +133,10 @@ public void onAdvance()
         }
 
         @Override
-        public void onPostingDecoded()
+        public void postingDecoded(long postingDecoded)
         {
-            ctx.bkdPostingsDecodes++;
-            listener.onPostingDecoded();
+            ctx.bkdPostingsDecodes += postingDecoded;
+            listener.postingDecoded(postingDecoded);
         }
     }
 
@@ -159,10 +159,10 @@ public void onAdvance()
         }
 
         @Override
-        public void onPostingDecoded()
+        public void postingDecoded(long postingDecoded)
         {
-            ctx.triePostingsDecodes++;
-            listener.onPostingDecoded();
+            ctx.triePostingsDecodes += postingDecoded;
+            listener.postingDecoded(postingDecoded);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java b/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
index 134590b5d7fd..4f410fa97e66 100644
--- a/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
+++ b/src/java/org/apache/cassandra/index/sai/metrics/QueryEventListener.java
@@ -96,7 +96,7 @@ interface PostingListEventListener
         /**
          * When a posting is successfully read from disk and decoded.
          */
-        void onPostingDecoded();
+        void postingDecoded(long postingsDecoded);
 
         PostingListEventListener NO_OP = new PostingListEventListener()
         {
@@ -107,7 +107,7 @@ public void onAdvance()
             }
 
             @Override
-            public void onPostingDecoded()
+            public void postingDecoded(long postingsDecoded)
             {
 
             }
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
index f39443e3e550..e164fe2c89a6 100644
--- a/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/PostingsTest.java
@@ -60,36 +60,35 @@ public void testSingleBlockPostingList() throws Exception
         assertEquals(1, summary.offsets.length());
 
         CountingPostingListEventListener listener = new CountingPostingListEventListener();
-        try (PostingsReader reader = new PostingsReader(input, postingPointer, listener))
+        PostingsReader reader = new PostingsReader(input, postingPointer, listener);
+
+        expectedPostingList.reset();
+        assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
+        assertEquals(expectedPostingList.size(), reader.size());
+
+        long actualRowID;
+        while ((actualRowID = reader.nextPosting()) != PostingList.END_OF_STREAM)
         {
-            expectedPostingList.reset();
+            assertEquals(expectedPostingList.nextPosting(), actualRowID);
             assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
-            assertEquals(expectedPostingList.size(), reader.size());
-
-            long actualRowID;
-            while ((actualRowID = reader.nextPosting()) != PostingList.END_OF_STREAM)
-            {
-                assertEquals(expectedPostingList.nextPosting(), actualRowID);
-                assertEquals(expectedPostingList.getOrdinal(), reader.getOrdinal());
-            }
-            assertEquals(PostingList.END_OF_STREAM, expectedPostingList.nextPosting());
-            assertEquals(0, listener.advances);
-            assertEquals(reader.size(), listener.decodes);
         }
+        assertEquals(PostingList.END_OF_STREAM, expectedPostingList.nextPosting());
+        assertEquals(0, listener.advances);
+        reader.close();
+        assertEquals(reader.size(), listener.decodes);
 
         input = indexComponents.openBlockingInput(indexComponents.postingLists);
         listener = new CountingPostingListEventListener();
-        try (PostingsReader reader = new PostingsReader(input, postingPointer, listener))
-        {
-            assertEquals(0, listener.decodes); // nothing is decoded up-front
-            assertEquals(50, reader.advance(45));
-            assertEquals(5, listener.decodes); // slow advance also decodes
-            assertEquals(60, reader.advance(60));
-            assertEquals(6, listener.decodes); // slow advance also decodes
-            assertEquals(PostingList.END_OF_STREAM, reader.nextPosting());
-            assertEquals(reader.size(), listener.decodes); // nothing more was decoded
-            assertEquals(2, listener.advances);
-        }
+        reader = new PostingsReader(input, postingPointer, listener);
+
+        assertEquals(50, reader.advance(45));
+
+        assertEquals(60, reader.advance(60));
+        assertEquals(PostingList.END_OF_STREAM, reader.nextPosting());
+        assertEquals(2, listener.advances);
+        reader.close();
+
+        assertEquals(reader.size(), listener.decodes); // nothing more was decoded
     }
 
     @Test
@@ -272,35 +271,29 @@ private void testAdvance(IndexComponents indexComponents, long fp, ArrayPostingL
     {
         expected.reset();
         final CountingPostingListEventListener listener = new CountingPostingListEventListener();
-        try (PostingsReader reader = openReader(indexComponents, fp, listener))
+        PostingsReader reader = openReader(indexComponents, fp, listener);
+        for (int i = 0; i < 2; ++i)
         {
-            for (int i = 0; i < 2; ++i)
-            {
-                assertEquals(expected.nextPosting(), reader.nextPosting());
-                assertEquals(expected.getOrdinal(), reader.getOrdinal());
-            }
+            assertEquals(expected.nextPosting(), reader.nextPosting());
+            assertEquals(expected.getOrdinal(), reader.getOrdinal());
+        }
 
-            // If all postings in a block have the same value, we don't actually decode any deltas ;)
-            if (expected.getPostingAt(0) != expected.getPostingAt(reader.getBlockSize() - 1))
-            {
-                assertEquals(2, listener.decodes);
-            }
+        for (int target : targetIDs)
+        {
+            final long actualRowId = reader.advance(target);
+            final long expectedRowId = expected.advance(target);
 
-            for (int target : targetIDs)
-            {
-                final long actualRowId = reader.advance(target);
-                final long expectedRowId = expected.advance(target);
+            assertEquals(expectedRowId, actualRowId);
 
-                assertEquals(expectedRowId, actualRowId);
+            assertEquals(expected.getOrdinal(), reader.getOrdinal());
+        }
 
-                assertEquals(expected.getOrdinal(), reader.getOrdinal());
-            }
+        // check if iterator is correctly positioned
+        assertPostingListEquals(expected, reader);
+        // check if reader emitted all events
+        assertEquals(targetIDs.length, listener.advances);
 
-            // check if iterator is correctly positioned
-            assertPostingListEquals(expected, reader);
-            // check if reader emitted all events
-            assertEquals(targetIDs.length, listener.advances);
-        }
+        reader.close();
     }
 
     private PostingsReader openReader(IndexComponents indexComponents, long fp, QueryEventListener.PostingListEventListener listener) throws IOException
@@ -341,9 +334,9 @@ public void onAdvance()
         }
 
         @Override
-        public void onPostingDecoded()
+        public void postingDecoded(long postingsDecoded)
         {
-            decodes++;
+            this.decodes += postingsDecoded;
         }
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java b/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
index 3da309c65a61..030a8e4803d2 100644
--- a/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
+++ b/test/unit/org/apache/cassandra/index/sai/metrics/QueryEventListeners.java
@@ -90,6 +90,6 @@ public static class NoOpPostingListEventListener implements QueryEventListener.P
         public void onAdvance() { }
 
         @Override
-        public void onPostingDecoded() { }
+        public void postingDecoded(long postingsDecoded) { }
     }
 }

From 7d40a6c62ea527faf2ead0e9530d6bc0d749270c Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 9 Jun 2021 10:59:30 +0200
Subject: [PATCH 050/151] STAR-14: Memtable API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Defines a pluggable memtable API which permits selection of memtable implementation, which can be selected in cassandra.yaml or individually per table. In addition to replacements of the typical memtable functionality, this also exposes some control over flushing and commit log operation that should make it possible to use persistent memtables.

The main API is in Memtable.java, it includes support for altering the implementation of the current memtable functionality, but also expands it with control over flushing and commit log (where support for changed-data-capture and point-in-time restore, which rely on commit log, can be turned on or off), plus implementation of functionality that normally requires a flush (streaming/repair).

The API comes with three implementations: the legacy ConcurrentSkipListMap, an initial blocking version of a trie-based one, plus a skeleton for a persistent memory solution that demonstrates how one could be attached to effectively replace the storage subsystem.

Also adds logged FlushReason to every flush request and improves JMH benchmarks.

patch by Branimir Lambov; reviewed by Jakub Żytka

(cherry picked from commit 04b02431b97beaa33366af25e03ce977e09d4c5c)
(cherry picked from commit f0c0b5f5db1f2ad8dc663ae8bb0e0c7c50e15145)
---
 build.xml                                     |    4 +-
 .../org/apache/cassandra/config/Config.java   |    1 +
 .../cassandra/config/DatabaseDescriptor.java  |    5 +
 .../statements/schema/TableAttributes.java    |    4 +
 .../cassandra/db/BufferDecoratedKey.java      |    6 +
 .../db/CassandraKeyspaceWriteHandler.java     |   44 +-
 .../cassandra/db/ColumnFamilyStore.java       |  489 ++++----
 .../org/apache/cassandra/db/DecoratedKey.java |    1 +
 .../org/apache/cassandra/db/Keyspace.java     |    6 +-
 .../org/apache/cassandra/db/Memtable.java     |  757 ------------
 .../cassandra/db/NativeDecoratedKey.java      |    6 +
 .../db/PartitionRangeReadCommand.java         |    7 +-
 .../db/SinglePartitionReadCommand.java        |    1 +
 .../apache/cassandra/db/SystemKeyspace.java   |    4 +-
 .../AbstractCommitLogSegmentManager.java      |   17 +-
 .../db/commitlog/CommitLogPosition.java       |    6 -
 .../db/commitlog/CommitLogReplayer.java       |   17 +-
 .../db/compaction/CompactionController.java   |    1 +
 .../db/compaction/CompactionTask.java         |    2 +-
 .../db/lifecycle/LifecycleTransaction.java    |    4 +-
 .../cassandra/db/lifecycle/Tracker.java       |   12 +-
 .../apache/cassandra/db/lifecycle/View.java   |    2 +-
 .../memtable/AbstractAllocatorMemtable.java   |  303 +++++
 .../db/memtable/AbstractMemtable.java         |  208 ++++
 .../AbstractMemtableWithCommitlog.java        |  124 ++
 .../db/memtable/DefaultMemtableFactory.java   |   30 +
 .../cassandra/db/memtable/Flushing.java       |  300 +++++
 .../cassandra/db/memtable/Memtable.java       |  426 +++++++
 .../db/memtable/PersistentMemoryMemtable.java |  261 +++++
 .../db/memtable/SkipListMemtable.java         |  341 ++++++
 .../cassandra/db/memtable/TrieMemtable.java   |  408 +++++++
 .../db/partitions/AbstractBTreePartition.java |   56 +-
 .../db/partitions/AtomicBTreePartition.java   |  220 +---
 .../db/partitions/BTreePartitionData.java     |   58 +
 .../db/partitions/BTreePartitionUpdater.java  |  168 +++
 .../db/partitions/CachedBTreePartition.java   |    6 +-
 .../partitions/ImmutableBTreePartition.java   |    8 +-
 .../cassandra/db/partitions/Partition.java    |    5 +
 .../db/partitions/PartitionUpdate.java        |   30 +-
 .../repair/CassandraValidationIterator.java   |   11 +-
 .../db/repair/PendingAntiCompaction.java      |    2 +-
 .../org/apache/cassandra/db/rows/Rows.java    |    2 +-
 .../db/streaming/CassandraStreamManager.java  |    3 +
 .../db/streaming/CassandraStreamReceiver.java |    6 +-
 .../cassandra/db/tries/MemtableTrie.java      |    4 +-
 .../apache/cassandra/db/view/TableViews.java  |    8 +-
 .../apache/cassandra/db/view/ViewBuilder.java |    2 +-
 .../org/apache/cassandra/index/Index.java     |    1 +
 .../apache/cassandra/index/IndexRegistry.java |    1 +
 .../index/SecondaryIndexManager.java          |    5 +-
 .../cassandra/index/SingletonIndexGroup.java  |    2 +-
 .../index/internal/CassandraIndex.java        |    9 +-
 .../cassandra/index/sai/ColumnContext.java    |    4 +-
 .../index/sai/StorageAttachedIndex.java       |    6 +-
 .../index/sai/StorageAttachedIndexGroup.java  |    2 +-
 .../index/sai/memory/MemtableIndex.java       |    2 +-
 .../cassandra/index/sasi/SASIIndex.java       |    3 +-
 .../index/sasi/conf/ColumnIndex.java          |    2 +-
 .../io/sstable/format/SSTableWriter.java      |    9 +
 .../io/sstable/format/big/BigFormat.java      |    9 +
 .../cassandra/metrics/TableMetrics.java       |   25 +-
 .../MemtableDiscardedNotification.java        |    2 +-
 .../MemtableRenewedNotification.java          |    2 +-
 .../MemtableSwitchedNotification.java         |    2 +-
 .../SSTableAddedNotification.java             |    2 +-
 .../repair/SystemDistributedKeyspace.java     |    9 +-
 .../repair/consistent/LocalSessions.java      |    2 +-
 .../cassandra/schema/MemtableParams.java      |  153 +++
 .../apache/cassandra/schema/SchemaEvent.java  |    9 +
 .../cassandra/schema/SchemaKeyspace.java      |    8 +-
 .../apache/cassandra/schema/TableParams.java  |   19 +
 .../cassandra/service/StorageService.java     |    6 +-
 .../cassandra/streaming/StreamSession.java    |    3 +-
 .../cassandra/utils/memory/EnsureOnHeap.java  |    2 +
 .../cassandra/utils/memory/HeapPool.java      |    3 +-
 .../cassandra/distributed/impl/Instance.java  |    8 +-
 .../distributed/test/FailingRepairTest.java   |    2 +-
 ...yInspectorCorruptSSTableExceptionTest.java |    2 +-
 .../apache/cassandra/cql3/ViewLongTest.java   |    5 +-
 .../db/compaction/LongCompactionsTest.java    |    2 +-
 .../LongLeveledCompactionStrategyCQLTest.java |    3 +-
 .../LongLeveledCompactionStrategyTest.java    |    4 +-
 .../compaction/CompactionAllocationTest.java  |    6 +-
 .../test/microbench/CompactionBench.java      |    4 +-
 .../ZeroCopyStreamingBenchmark.java           |    2 +-
 .../test/microbench/instance/ReadTest.java    |  198 ++--
 .../test/microbench/instance/WriteTest.java   |  250 ++++
 .../tries/MemtableTrieWriteBench.java         |    2 +-
 .../org/apache/cassandra/ServerTestUtils.java |    3 +-
 test/unit/org/apache/cassandra/Util.java      |    2 +-
 .../batchlog/BatchlogManagerTest.java         |    6 +-
 .../cassandra/cache/AutoSavingCacheTest.java  |    2 +-
 .../org/apache/cassandra/cql3/CQLTester.java  |    2 +-
 .../cassandra/cql3/GcCompactionTest.java      |    4 +-
 .../cassandra/cql3/KeyCacheCqlTest.java       |    5 +-
 .../cassandra/cql3/MemtableQuickTest.java     |  142 +++
 .../cassandra/cql3/MemtableSizeTest.java      |   34 +-
 .../apache/cassandra/cql3/OutOfSpaceTest.java |    6 +-
 .../cql3/ViewComplexDeletionsTest.java        |   47 +-
 .../cql3/ViewComplexLivenessTest.java         |    9 +-
 .../cassandra/cql3/ViewComplexTTLTest.java    |   23 +-
 .../cassandra/cql3/ViewComplexTest.java       |   23 +-
 .../cql3/ViewComplexUpdatesTest.java          |   49 +-
 .../cassandra/cql3/ViewFilteringTest.java     |   52 +-
 .../org/apache/cassandra/cql3/ViewTest.java   |   26 +-
 .../statements/DescribeStatementTest.java     |    1 +
 .../miscellaneous/CrcCheckChanceTest.java     |   10 +-
 .../SSTableMetadataTrackingTest.java          |   14 +-
 .../cql3/validation/operations/AlterTest.java |   80 ++
 .../validation/operations/CreateTest.java     |  130 +++
 .../cql3/validation/operations/TTLTest.java   |    2 +-
 .../org/apache/cassandra/db/CleanupTest.java  |    4 +-
 .../cassandra/db/CleanupTransientTest.java    |    2 +-
 .../cassandra/db/ColumnFamilyMetricTest.java  |    6 +-
 .../cassandra/db/ColumnFamilyStoreTest.java   |   22 +-
 .../cassandra/db/DeletePartitionTest.java     |    4 +-
 .../org/apache/cassandra/db/ImportTest.java   |   41 +-
 .../org/apache/cassandra/db/KeyCacheTest.java |    8 +-
 .../org/apache/cassandra/db/KeyspaceTest.java |   22 +-
 .../cassandra/db/MultiKeyspaceTest.java       |    5 +-
 .../org/apache/cassandra/db/NameSortTest.java |    2 +-
 .../cassandra/db/PartitionRangeReadTest.java  |    6 +-
 .../cassandra/db/RangeTombstoneTest.java      |   46 +-
 .../apache/cassandra/db/ReadCommandTest.java  |   44 +-
 .../db/RecoveryManagerFlushedTest.java        |    6 +-
 .../apache/cassandra/db/RemoveCellTest.java   |    2 +-
 .../org/apache/cassandra/db/RowCacheTest.java |    2 +-
 .../apache/cassandra/db/RowIterationTest.java |    8 +-
 .../cassandra/db/SchemaCQLHelperTest.java     |    1 +
 .../org/apache/cassandra/db/ScrubTest.java    |   12 +-
 .../cassandra/db/SecondaryIndexTest.java      |    8 +-
 .../db/SinglePartitionReadCommandCQLTest.java |    4 +-
 .../db/SinglePartitionSliceCommandTest.java   |   24 +-
 .../org/apache/cassandra/db/TimeSortTest.java |    4 +-
 .../org/apache/cassandra/db/VerifyTest.java   |    6 +-
 .../db/commitlog/CommitLogCQLTest.java        |    2 +-
 .../db/commitlog/CommitLogReaderTest.java     |    5 +-
 .../CommitLogSegmentManagerCDCTest.java       |    5 +-
 .../cassandra/db/commitlog/CommitLogTest.java |   27 +-
 .../db/commitlog/SnapshotDeletingTest.java    |    2 +-
 .../AbstractCompactionStrategyTest.java       |    2 +-
 .../compaction/AbstractPendingRepairTest.java |    2 +-
 .../db/compaction/ActiveCompactionsTest.java  |   13 +-
 .../compaction/AntiCompactionBytemanTest.java |    2 +-
 .../db/compaction/AntiCompactionTest.java     |    4 +-
 .../db/compaction/CancelCompactionsTest.java  |    2 +-
 .../compaction/CompactionAwareWriterTest.java |    2 +-
 .../compaction/CompactionControllerTest.java  |   12 +-
 .../db/compaction/CompactionIteratorTest.java |    2 +-
 .../CompactionStrategyManagerTest.java        |    2 +-
 .../db/compaction/CompactionTaskTest.java     |   12 +-
 .../db/compaction/CompactionsBytemanTest.java |    8 +-
 .../db/compaction/CompactionsCQLTest.java     |   32 +-
 .../db/compaction/CompactionsPurgeTest.java   |   52 +-
 .../db/compaction/CompactionsTest.java        |   16 +-
 .../CorruptedSSTablesCompactionsTest.java     |    2 +-
 .../DateTieredCompactionStrategyTest.java     |   16 +-
 .../LeveledCompactionStrategyTest.java        |   22 +-
 .../db/compaction/NeverPurgeTest.java         |    6 +-
 .../db/compaction/OneCompactionTest.java      |    2 +-
 .../compaction/SingleSSTableLCSTaskTest.java  |    6 +-
 .../SizeTieredCompactionStrategyTest.java     |    4 +-
 .../db/compaction/TTLExpiryTest.java          |   28 +-
 .../TimeWindowCompactionStrategyTest.java     |   16 +-
 .../lifecycle/LifecycleTransactionTest.java   |   10 +-
 .../cassandra/db/lifecycle/TrackerTest.java   |   21 +-
 .../cassandra/db/lifecycle/ViewTest.java      |    2 +-
 .../FlushingTest.java}                        |   44 +-
 .../AbstractPendingAntiCompactionTest.java    |    2 +-
 ...onManagerGetSSTablesForValidationTest.java |    2 +-
 .../db/repair/PendingAntiCompactionTest.java  |    4 +-
 .../rows/ThrottledUnfilteredIteratorTest.java |    8 +-
 ...assandraEntireSSTableStreamWriterTest.java |    2 +-
 .../streaming/CassandraOutgoingFileTest.java  |    2 +-
 .../streaming/CassandraStreamHeaderTest.java  |    2 +-
 .../streaming/CassandraStreamManagerTest.java |    2 +-
 ...StreamConcurrentComponentMutationTest.java |    2 +-
 .../db/transform/DuplicateRowCheckerTest.java |    2 +-
 .../db/view/ViewBuilderTaskTest.java          |    4 +-
 .../cassandra/index/CustomIndexTest.java      |   19 +-
 .../org/apache/cassandra/index/StubIndex.java |    1 +
 .../cassandra/index/StubIndexGroup.java       |    2 +-
 .../index/internal/CustomCassandraIndex.java  |    9 +-
 .../cassandra/index/sasi/SASIIndexTest.java   |   38 +-
 .../cassandra/io/DiskSpaceMetricsTest.java    |    2 +-
 .../CompressedSequentialWriterReopenTest.java |    4 +-
 .../io/sstable/IndexSummaryManagerTest.java   |    6 +-
 .../IndexSummaryRedistributionTest.java       |    2 +-
 .../io/sstable/ReducingKeyIteratorTest.java   |    2 +-
 .../SSTableCorruptionDetectionTest.java       |    2 +-
 .../io/sstable/SSTableLoaderTest.java         |   10 +-
 .../io/sstable/SSTableMetadataTest.java       |   20 +-
 .../io/sstable/SSTableReaderTest.java         |   30 +-
 .../io/sstable/SSTableRewriterTest.java       |    4 +-
 .../io/sstable/SSTableScannerTest.java        |    6 +-
 .../big/BigTableZeroCopyWriterTest.java       |    2 +-
 .../SSTableReverseIteratorTest.java           |    2 +-
 .../cassandra/repair/ValidatorTest.java       |    6 +-
 .../consistent/PendingRepairStatTest.java     |    2 +-
 .../schema/MigrationManagerTest.java          |   12 +-
 .../apache/cassandra/schema/MockSchema.java   |    4 +-
 .../service/ActiveRepairServiceTest.java      |    2 +-
 .../cassandra/service/ClientWarningsTest.java |    4 +-
 .../reads/range/RangeCommandIteratorTest.java |    2 +-
 ...SSTableStreamingCorrectFilesCountTest.java |    2 +-
 .../streaming/StreamTransferTaskTest.java     |    4 +-
 .../streaming/StreamingTransferTest.java      |    6 +-
 .../StandaloneSplitterWithCQLTesterTest.java  |    2 +-
 .../StandaloneUpgraderOnSStablesTest.java     |    2 +-
 .../53-f0c0b5f5db STAR-14: Memtable API       | 1038 +++++++++++++++++
 210 files changed, 5844 insertions(+), 1955 deletions(-)
 delete mode 100644 src/java/org/apache/cassandra/db/Memtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/Flushing.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/Memtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
 create mode 100644 src/java/org/apache/cassandra/db/partitions/BTreePartitionData.java
 create mode 100644 src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java
 create mode 100644 src/java/org/apache/cassandra/schema/MemtableParams.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
 create mode 100644 test/unit/org/apache/cassandra/cql3/MemtableQuickTest.java
 rename test/unit/org/apache/cassandra/db/{MemtableTest.java => memtable/FlushingTest.java} (76%)
 create mode 100644 update-history/STAR-801/53-f0c0b5f5db STAR-14: Memtable API

diff --git a/build.xml b/build.xml
index 479c3fb99fa1..a1c8e0a5d4e6 100644
--- a/build.xml
+++ b/build.xml
@@ -648,7 +648,7 @@
           <dependency groupId="com.google.j2objc" artifactId="j2objc-annotations" version="1.3"/>
           <!-- adding this dependency is necessary for assertj. When updating assertj, need to also update the version of
              this that the new assertj's `assertj-parent-pom` depends on. -->
-          <dependency groupId="org.junit" artifactId="junit-bom" version="5.6.0" type="pom" scope="test"/>
+          <dependency groupId="org.junit" artifactId="junit-bom" type="pom" scope="test"/>
           <!-- when updating assertj, make sure to also update the corresponding junit-bom dependency -->
           <dependency groupId="org.assertj" artifactId="assertj-core" version="3.15.0" scope="provided"/>
           <dependency groupId="org.awaitility" artifactId="awaitility" version="4.0.3"  scope="test">
@@ -728,7 +728,7 @@
         <dependency groupId="org.apache.ant" artifactId="ant-junit"/>
         <!-- adding this dependency is necessary for assertj. When updating assertj, need to also update the version of
              this that the new assertj's `assertj-parent-pom` depends on. -->
-        <dependency groupId="org.junit" artifactId="junit-bom" type="pom"/>
+        <dependency groupId="org.junit" artifactId="junit-bom" version="5.6.0" type="pom"/>
         <dependency groupId="org.awaitility" artifactId="awaitility"/>
         <dependency groupId="org.hamcrest" artifactId="hamcrest"/>
         <!-- coverage debs -->
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 764656abcd7d..490627f58187 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -126,6 +126,7 @@ public class Config
     public Integer memtable_heap_space_in_mb;
     public Integer memtable_offheap_space_in_mb;
     public Float memtable_cleanup_threshold = null;
+    public Map<String, String> memtable = null;
 
     // Limit the maximum depth of repair session merkle trees
     @Deprecated
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index d49195a191c6..90f4d48710fa 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -2849,6 +2849,11 @@ public static Float getMemtableCleanupThreshold()
         return conf.memtable_cleanup_threshold;
     }
 
+    public static Map<String, String> getMemtableOptions()
+    {
+        return conf.memtable;
+    }
+
     public static int getIndexSummaryResizeIntervalInMinutes()
     {
         return conf.index_summary_resize_interval_in_minutes;
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 126e6d7857b1..686729548583 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -28,6 +28,7 @@
 import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.MemtableParams;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableParams;
 import org.apache.cassandra.schema.TableParams.Option;
@@ -110,6 +111,9 @@ private TableParams build(TableParams.Builder builder)
             builder.compression(CompressionParams.fromMap(getMap(Option.COMPRESSION)));
         }
 
+        if (hasOption(Option.MEMTABLE))
+            builder.memtable(MemtableParams.fromMap(getMap(Option.MEMTABLE)));
+
         if (hasOption(Option.DEFAULT_TIME_TO_LIVE))
             builder.defaultTimeToLive(getInt(Option.DEFAULT_TIME_TO_LIVE));
 
diff --git a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
index ae3e9d44e08a..07f610fd7ca4 100644
--- a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java
@@ -39,6 +39,12 @@ public ByteBuffer getKey()
         return key;
     }
 
+    @Override
+    public int getKeyLength()
+    {
+        return key.remaining();
+    }
+
     /**
      * A factory method that translates the given byte-comparable representation to a {@link BufferDecoratedKey}
      * instance. If the given byte comparable doesn't represent the encoding of a buffer decorated key, anything from a
diff --git a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java
index efba11f1a4ff..570a2898d1cf 100644
--- a/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java
+++ b/src/java/org/apache/cassandra/db/CassandraKeyspaceWriteHandler.java
@@ -18,9 +18,13 @@
 
 package org.apache.cassandra.db;
 
+import java.util.HashSet;
+import java.util.Set;
+
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
@@ -46,8 +50,7 @@ public WriteContext beginWrite(Mutation mutation, boolean makeDurable) throws Re
             CommitLogPosition position = null;
             if (makeDurable)
             {
-                Tracing.trace("Appending to commitlog");
-                position = CommitLog.instance.add(mutation);
+                position = addToCommitLog(mutation);
             }
             return new CassandraWriteContext(group, position);
         }
@@ -61,6 +64,43 @@ public WriteContext beginWrite(Mutation mutation, boolean makeDurable) throws Re
         }
     }
 
+    private CommitLogPosition addToCommitLog(Mutation mutation)
+    {
+        CommitLogPosition position;
+        // Usually one of these will be true, so first check if that's the case.
+        boolean allSkipCommitlog = true;
+        boolean noneSkipCommitlog = true;
+        for (TableId id : mutation.getTableIds())
+        {
+            if (keyspace.getColumnFamilyStore(id).writesShouldSkipCommitLog())
+                noneSkipCommitlog = false;
+            else
+                allSkipCommitlog = false;
+        }
+
+        if (!noneSkipCommitlog)
+        {
+            if (allSkipCommitlog)
+                return null;
+            else
+            {
+                Set<TableId> ids = new HashSet<>();
+                for (TableId id : mutation.getTableIds())
+                {
+                    if (keyspace.getColumnFamilyStore(id).writesShouldSkipCommitLog())
+                        ids.add(id);
+                }
+                mutation = mutation.without(ids);
+            }
+        }
+        // Note: It may be a good idea to precalculate none/all for the set of all tables in the keyspace,
+        // or memoize the mutation.getTableIds()->ids map (needs invalidation on schema version change).
+
+        Tracing.trace("Appending to commitlog");
+        position = CommitLog.instance.add(mutation);
+        return position;
+    }
+
     @SuppressWarnings("resource") // group is closed when CassandraWriteContext is closed
     private WriteContext createEmptyContext()
     {
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 99aca2b3b7e4..6f24b77c748b 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -49,6 +49,8 @@
 import org.apache.cassandra.db.compaction.*;
 import org.apache.cassandra.db.filter.ClusteringIndexFilter;
 import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.memtable.Flushing;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.streaming.CassandraStreamManager;
 import org.apache.cassandra.db.repair.CassandraTableRepairManager;
 import org.apache.cassandra.db.view.TableViews;
@@ -67,6 +69,7 @@
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.*;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
@@ -87,7 +90,6 @@
 import org.apache.cassandra.utils.*;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.concurrent.Refs;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
 import org.json.simple.JSONArray;
 import org.json.simple.JSONObject;
 
@@ -97,7 +99,7 @@
 import static org.apache.cassandra.utils.Throwables.merge;
 import static org.apache.cassandra.utils.Throwables.perform;
 
-public class ColumnFamilyStore implements ColumnFamilyStoreMBean
+public class ColumnFamilyStore implements ColumnFamilyStoreMBean, Memtable.Owner
 {
     private static final Logger logger = LoggerFactory.getLogger(ColumnFamilyStore.class);
 
@@ -135,6 +137,33 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
                                                                                                new NamedThreadFactory("MemtableReclaimMemory"),
                                                                                                "internal");
 
+    /**
+     * Reason for initiating a memtable flush.
+     */
+    public enum FlushReason
+    {
+        COMMITLOG_DIRTY,
+        MEMTABLE_LIMIT,
+        MEMTABLE_PERIOD_EXPIRED,
+        INDEX_BUILD_STARTED,
+        INDEX_BUILD_COMPLETED,
+        INDEX_REMOVED,
+        INDEX_TABLE_FLUSH,
+        VIEW_BUILD_STARTED,
+        INTERNALLY_FORCED,  // explicitly requested flush, necessary for the operation of an internal table
+        USER_FORCED, // flush explicitly requested by the user (e.g. nodetool flush)
+        STARTUP,
+        SHUTDOWN,
+        SNAPSHOT,
+        TRUNCATE,
+        DROP,
+        STREAMING,
+        STREAMS_RECEIVED,
+        REPAIR,
+        SCHEMA_CHANGE,
+        UNIT_TESTS; // explicitly requested flush needed for a test
+    }
+
     private static final String[] COUNTER_NAMES = new String[]{"table", "count", "error", "value"};
     private static final String[] COUNTER_DESCS = new String[]
     { "keyspace.tablename",
@@ -168,6 +197,8 @@ public class ColumnFamilyStore implements ColumnFamilyStoreMBean
     private final String oldMBeanName;
     private volatile boolean valid = true;
 
+    private Memtable.Factory memtableFactory;
+
     /**
      * Memtables and SSTables on disk for this column family.
      *
@@ -245,48 +276,14 @@ public void reload()
 
         compactionStrategyManager.maybeReload(metadata());
 
-        scheduleFlush();
-
         indexManager.reload();
 
-        // If the CF comparator has changed, we need to change the memtable,
-        // because the old one still aliases the previous comparator.
-        if (data.getView().getCurrentMemtable().initialComparator != metadata().comparator)
-            switchMemtable();
-    }
-
-    void scheduleFlush()
-    {
-        int period = metadata().params.memtableFlushPeriodInMs;
-        if (period > 0)
-        {
-            logger.trace("scheduling flush in {} ms", period);
-            WrappedRunnable runnable = new WrappedRunnable()
-            {
-                protected void runMayThrow()
-                {
-                    synchronized (data)
-                    {
-                        Memtable current = data.getView().getCurrentMemtable();
-                        // if we're not expired, we've been hit by a scheduled flush for an already flushed memtable, so ignore
-                        if (current.isExpired())
-                        {
-                            if (current.isClean())
-                            {
-                                // if we're still clean, instead of swapping just reschedule a flush for later
-                                scheduleFlush();
-                            }
-                            else
-                            {
-                                // we'll be rescheduled by the constructor of the Memtable.
-                                forceFlush();
-                            }
-                        }
-                    }
-                }
-            };
-            ScheduledExecutors.scheduledTasks.schedule(runnable, period, TimeUnit.MILLISECONDS);
-        }
+        memtableFactory = metadata().params.memtable.factory;
+        Memtable currentMemtable = data.getView().getCurrentMemtable();
+        if (currentMemtable.shouldSwitch(FlushReason.SCHEMA_CHANGE))
+            switchMemtableIfCurrent(currentMemtable, FlushReason.SCHEMA_CHANGE);
+        else
+            currentMemtable.metadataUpdated();
     }
 
     public static Runnable getBackgroundCompactionTaskSubmitter()
@@ -382,14 +379,15 @@ public ColumnFamilyStore(Keyspace keyspace,
         fileIndexGenerator.set(generation);
         sampleReadLatencyNanos = DatabaseDescriptor.getReadRpcTimeout(NANOSECONDS) / 2;
         additionalWriteLatencyNanos = DatabaseDescriptor.getWriteRpcTimeout(NANOSECONDS) / 2;
+        memtableFactory = metadata.get().params.memtable.factory;
 
         logger.info("Initializing {}.{}", keyspace.getName(), name);
 
         // Create Memtable only on online
         Memtable initialMemtable = null;
         if (DatabaseDescriptor.isDaemonInitialized())
-            initialMemtable = new Memtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition()), this);
-        data = new Tracker(initialMemtable, loadSSTables);
+            initialMemtable = createMemtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
+        data = new Tracker(this, initialMemtable, loadSSTables);
 
         // Note that this needs to happen before we load the first sstables, or the global sstable tracker will not
         // be notified on the initial loading.
@@ -511,6 +509,26 @@ public List<String> getDataPaths() throws IOException
         return dataPaths;
     }
 
+    public boolean writesShouldSkipCommitLog()
+    {
+        return memtableFactory.writesShouldSkipCommitLog();
+    }
+
+    public boolean memtableWritesAreDurable()
+    {
+        return memtableFactory.writesAreDurable();
+    }
+
+    public boolean streamToMemtable()
+    {
+        return memtableFactory.streamToMemtable();
+    }
+
+    public boolean streamFromMemtable()
+    {
+        return memtableFactory.streamFromMemtable();
+    }
+
     public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, UUID pendingRepair, boolean isTransient, int sstableLevel, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
         MetadataCollector collector = new MetadataCollector(metadata().comparator).sstableLevel(sstableLevel);
@@ -825,12 +843,12 @@ public Descriptor newSSTableDescriptor(File directory, Version version, SSTableF
      *
      * @param memtable
      */
-    public ListenableFuture<CommitLogPosition> switchMemtableIfCurrent(Memtable memtable)
+    public ListenableFuture<CommitLogPosition> switchMemtableIfCurrent(Memtable memtable, FlushReason reason)
     {
         synchronized (data)
         {
             if (data.getView().getCurrentMemtable() == memtable)
-                return switchMemtable();
+                return switchMemtable(reason);
         }
         logger.debug("Memtable is no longer current, returning future that completes when current flushing operation completes");
         return waitForFlushes();
@@ -843,11 +861,12 @@ public ListenableFuture<CommitLogPosition> switchMemtableIfCurrent(Memtable memt
      * not complete until the Memtable (and all prior Memtables) have been successfully flushed, and the CL
      * marked clean up to the position owned by the Memtable.
      */
-    public ListenableFuture<CommitLogPosition> switchMemtable()
+    @VisibleForTesting
+    public ListenableFuture<CommitLogPosition> switchMemtable(FlushReason reason)
     {
         synchronized (data)
         {
-            logFlush();
+            logFlush(reason);
             Flush flush = new Flush(false);
             flushExecutor.execute(flush);
             postFlushExecutor.execute(flush.postFlushTask);
@@ -856,33 +875,19 @@ public ListenableFuture<CommitLogPosition> switchMemtable()
     }
 
     // print out size of all memtables we're enqueuing
-    private void logFlush()
+    private void logFlush(FlushReason reason)
     {
         // reclaiming includes that which we are GC-ing;
-        float onHeapRatio = 0, offHeapRatio = 0;
-        long onHeapTotal = 0, offHeapTotal = 0;
-        Memtable memtable = getTracker().getView().getCurrentMemtable();
-        onHeapRatio +=  memtable.getAllocator().onHeap().ownershipRatio();
-        offHeapRatio += memtable.getAllocator().offHeap().ownershipRatio();
-        onHeapTotal += memtable.getAllocator().onHeap().owns();
-        offHeapTotal += memtable.getAllocator().offHeap().owns();
+        Memtable.MemoryUsage usage = Memtable.newMemoryUsage();
+        getTracker().getView().getCurrentMemtable().addMemoryUsageTo(usage);
 
         for (ColumnFamilyStore indexCfs : indexManager.getAllIndexColumnFamilyStores())
-        {
-            MemtableAllocator allocator = indexCfs.getTracker().getView().getCurrentMemtable().getAllocator();
-            onHeapRatio += allocator.onHeap().ownershipRatio();
-            offHeapRatio += allocator.offHeap().ownershipRatio();
-            onHeapTotal += allocator.onHeap().owns();
-            offHeapTotal += allocator.offHeap().owns();
-        }
+            indexCfs.getTracker().getView().getCurrentMemtable().addMemoryUsageTo(usage);
 
-        logger.info("Enqueuing flush of {}: {}",
+        logger.info("Enqueuing flush of {} ({}): {}",
                      name,
-                     String.format("%s (%.0f%%) on-heap, %s (%.0f%%) off-heap",
-                                   FBUtilities.prettyPrintMemory(onHeapTotal),
-                                   onHeapRatio * 100,
-                                   FBUtilities.prettyPrintMemory(offHeapTotal),
-                                   offHeapRatio * 100));
+                     reason,
+                     usage);
     }
 
 
@@ -892,14 +897,14 @@ private void logFlush()
      * @return a Future yielding the commit log position that can be guaranteed to have been successfully written
      *         to sstables for this table once the future completes
      */
-    public ListenableFuture<CommitLogPosition> forceFlush()
+    public ListenableFuture<CommitLogPosition> forceFlush(FlushReason reason)
     {
         synchronized (data)
         {
             Memtable current = data.getView().getCurrentMemtable();
             for (ColumnFamilyStore cfs : concatWithIndexes())
                 if (!cfs.data.getView().getCurrentMemtable().isClean())
-                    return switchMemtableIfCurrent(current);
+                    return flushMemtable(current, reason);
             return waitForFlushes();
         }
     }
@@ -917,10 +922,18 @@ public ListenableFuture<?> forceFlush(CommitLogPosition flushIfDirtyBefore)
         // and this does not vary between a table and its table-backed indexes
         Memtable current = data.getView().getCurrentMemtable();
         if (current.mayContainDataBefore(flushIfDirtyBefore))
-            return switchMemtableIfCurrent(current);
+            return flushMemtable(current, FlushReason.COMMITLOG_DIRTY);
         return waitForFlushes();
     }
 
+    private ListenableFuture<CommitLogPosition> flushMemtable(Memtable current, FlushReason reason)
+    {
+        if (current.shouldSwitch(reason))
+            return switchMemtableIfCurrent(current, reason);
+        else
+            return waitForFlushes();
+    }
+
     /**
      * @return a Future yielding the commit log position that can be guaranteed to have been successfully written
      *         to sstables for this table once the future completes
@@ -930,17 +943,14 @@ private ListenableFuture<CommitLogPosition> waitForFlushes()
         // we grab the current memtable; once any preceding memtables have flushed, we know its
         // commitLogLowerBound has been set (as this it is set with the upper bound of the preceding memtable)
         final Memtable current = data.getView().getCurrentMemtable();
-        ListenableFutureTask<CommitLogPosition> task = ListenableFutureTask.create(() -> {
-            logger.debug("forceFlush requested but everything is clean in {}", name);
-            return current.getCommitLogLowerBound();
-        });
+        ListenableFutureTask<CommitLogPosition> task = ListenableFutureTask.create(current::getCommitLogLowerBound);
         postFlushExecutor.execute(task);
         return task;
     }
 
-    public CommitLogPosition forceBlockingFlush()
+    public CommitLogPosition forceBlockingFlush(FlushReason reason)
     {
-        return FBUtilities.waitOnFuture(forceFlush());
+        return FBUtilities.waitOnFuture(forceFlush(reason));
     }
 
     /**
@@ -950,12 +960,12 @@ public CommitLogPosition forceBlockingFlush()
     private final class PostFlush implements Callable<CommitLogPosition>
     {
         final CountDownLatch latch = new CountDownLatch(1);
-        final List<Memtable> memtables;
+        final Memtable mainMemtable;
         volatile Throwable flushFailure = null;
 
-        private PostFlush(List<Memtable> memtables)
+        private PostFlush(Memtable mainMemtable)
         {
-            this.memtables = memtables;
+            this.mainMemtable = mainMemtable;
         }
 
         public CommitLogPosition call()
@@ -973,11 +983,10 @@ public CommitLogPosition call()
 
             CommitLogPosition commitLogUpperBound = CommitLogPosition.NONE;
             // If a flush errored out but the error was ignored, make sure we don't discard the commit log.
-            if (flushFailure == null && !memtables.isEmpty())
+            if (flushFailure == null && mainMemtable != null)
             {
-                Memtable memtable = memtables.get(0);
-                commitLogUpperBound = memtable.getCommitLogUpperBound();
-                CommitLog.instance.discardCompletedSegments(metadata.id, memtable.getCommitLogLowerBound(), commitLogUpperBound);
+                commitLogUpperBound = mainMemtable.getCommitLogUpperBound();
+                CommitLog.instance.discardCompletedSegments(metadata.id, mainMemtable.getCommitLogLowerBound(), commitLogUpperBound);
             }
 
             metric.pendingFlushes.dec();
@@ -1000,7 +1009,7 @@ public CommitLogPosition call()
     private final class Flush implements Runnable
     {
         final OpOrder.Barrier writeBarrier;
-        final List<Memtable> memtables = new ArrayList<>();
+        final Map<ColumnFamilyStore, Memtable> memtables;
         final ListenableFutureTask<CommitLogPosition> postFlushTask;
         final PostFlush postFlush;
         final boolean truncate;
@@ -1024,6 +1033,8 @@ private Flush(boolean truncate)
              */
             writeBarrier = Keyspace.writeOrder.newBarrier();
 
+            memtables = new LinkedHashMap<>();
+
             // submit flushes for the memtable for any indexed sub-cfses, and our own
             AtomicReference<CommitLogPosition> commitLogUpperBound = new AtomicReference<>();
             for (ColumnFamilyStore cfs : concatWithIndexes())
@@ -1031,10 +1042,10 @@ private Flush(boolean truncate)
                 // switch all memtables, regardless of their dirty status, setting the barrier
                 // so that we can reach a coordinated decision about cleanliness once they
                 // are no longer possible to be modified
-                Memtable newMemtable = new Memtable(commitLogUpperBound, cfs);
+                Memtable newMemtable = cfs.createMemtable(commitLogUpperBound);
                 Memtable oldMemtable = cfs.data.switchMemtable(truncate, newMemtable);
-                oldMemtable.setDiscarding(writeBarrier, commitLogUpperBound);
-                memtables.add(oldMemtable);
+                oldMemtable.switchOut(writeBarrier, commitLogUpperBound);
+                memtables.put(cfs, oldMemtable);
             }
 
             // we then ensure an atomic decision is made about the upper bound of the continuous range of commit log
@@ -1045,7 +1056,7 @@ private Flush(boolean truncate)
             // since this happens after wiring up the commitLogUpperBound, we also know all operations with earlier
             // commit log segment position have also completed, i.e. the memtables are done and ready to flush
             writeBarrier.issue();
-            postFlush = new PostFlush(memtables);
+            postFlush = new PostFlush(Iterables.get(memtables.values(), 0, null));
             postFlushTask = ListenableFutureTask.create(postFlush);
         }
 
@@ -1065,17 +1076,20 @@ public void run()
                 logger.trace("Flush task for task {}@{} waited {} ms at the barrier", hashCode(), name, TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start));
 
             // mark all memtables as flushing, removing them from the live memtable list
-            for (Memtable memtable : memtables)
-                memtable.cfs.data.markFlushing(memtable);
+            for (Map.Entry<ColumnFamilyStore, Memtable> entry : memtables.entrySet())
+                entry.getKey().data.markFlushing(entry.getValue());
 
             metric.memtableSwitchCount.inc();
 
             try
             {
+                boolean first = true;
                 // Flush "data" memtable with non-cf 2i first;
-                flushMemtable(memtables.get(0), true);
-                for (int i = 1; i < memtables.size(); i++)
-                    flushMemtable(memtables.get(i), false);
+                for (Map.Entry<ColumnFamilyStore, Memtable> entry : memtables.entrySet())
+                {
+                    flushMemtable(entry.getKey(), entry.getValue(), first);
+                    first = false;
+                }
             }
             catch (Throwable t)
             {
@@ -1093,14 +1107,14 @@ public void run()
                 logger.trace("Flush task task {}@{} finished", hashCode(), name);
         }
 
-        public Collection<SSTableReader> flushMemtable(Memtable memtable, boolean flushNonCf2i)
+        public Collection<SSTableReader> flushMemtable(ColumnFamilyStore cfs, Memtable memtable, boolean flushNonCf2i)
         {
             if (logger.isTraceEnabled())
                 logger.trace("Flush task task {}@{} flushing memtable {}", hashCode(), name, memtable);
 
             if (memtable.isClean() || truncate)
             {
-                memtable.cfs.replaceFlushed(memtable, Collections.emptyList());
+                cfs.replaceFlushed(memtable, Collections.emptyList());
                 reclaim(memtable);
                 return Collections.emptyList();
             }
@@ -1112,13 +1126,13 @@ public Collection<SSTableReader> flushMemtable(Memtable memtable, boolean flushN
             List<SSTableReader> sstables = new ArrayList<>();
             try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
             {
-                List<Memtable.FlushRunnable> flushRunnables = null;
+                List<Flushing.FlushRunnable> flushRunnables = null;
                 List<SSTableMultiWriter> flushResults = null;
 
                 try
                 {
                     // flush the memtable
-                    flushRunnables = memtable.flushRunnables(txn);
+                    flushRunnables = Flushing.flushRunnables(cfs, memtable, txn);
                     ExecutorService[] executors = perDiskflushExecutors.getExecutorsFor(keyspace.getName(), name);
 
                     for (int i = 0; i < flushRunnables.size(); i++)
@@ -1140,11 +1154,7 @@ public Collection<SSTableReader> flushMemtable(Memtable memtable, boolean flushN
                 catch (Throwable t)
                 {
                     logger.error("Flushing {} failed with error", memtable.toString(), t);
-                    if (flushRunnables != null)
-                    {
-                        for (Memtable.FlushRunnable runnable : flushRunnables)
-                            t = runnable.abort(t);
-                    }
+                    t = Flushing.abortRunnables(flushRunnables, t);
 
                     // wait for any flush runnables that were submitted (after aborting they should complete immediately)
                     // this ensures that the writers are aborted by FlushRunnable.writeSortedContents(), in the worst
@@ -1209,9 +1219,9 @@ public Collection<SSTableReader> flushMemtable(Memtable memtable, boolean flushN
                     }
                 }
             }
-            memtable.cfs.replaceFlushed(memtable, sstables);
+            cfs.replaceFlushed(memtable, sstables);
             reclaim(memtable);
-            memtable.cfs.compactionStrategyManager.compactionLogger.flush(sstables);
+            cfs.compactionStrategyManager.compactionLogger.flush(sstables);
             logger.debug("Flushed to {} ({} sstables, {}), biggest {}, smallest {}",
                          sstables,
                          sstables.size(),
@@ -1231,12 +1241,25 @@ private void reclaim(final Memtable memtable)
                 public void runMayThrow()
                 {
                     readBarrier.await();
-                    memtable.setDiscarded();
+                    memtable.discard();
                 }
             }, reclaimExecutor);
         }
     }
 
+    /**
+     * Creates a new memtable for this table by delegating to this store's {@code memtableFactory}.
+     *
+     * @param commitLogUpperBound commit log position reference handed to the factory; the flush path shares
+     *                            this same reference with the memtable being switched out, where it serves
+     *                            as that memtable's upper bound
+     * @return the memtable produced by the factory
+     */
+    public Memtable createMemtable(AtomicReference<CommitLogPosition> commitLogUpperBound)
+    {
+        return memtableFactory.create(commitLogUpperBound, metadata, this);
+    }
+
     // atomically set the upper bound for the commit log
     private static void setCommitLogUpperBound(AtomicReference<CommitLogPosition> commitLogUpperBound)
     {
@@ -1254,86 +1269,27 @@ private static void setCommitLogUpperBound(AtomicReference<CommitLogPosition> co
         }
     }
 
-    /**
-     * Finds the largest memtable, as a percentage of *either* on- or off-heap memory limits, and immediately
-     * queues it for flushing. If the memtable selected is flushed before this completes, no work is done.
-     */
-    public static CompletableFuture<Boolean> flushLargestMemtable()
+    public ListenableFuture<CommitLogPosition> signalFlushRequired(Memtable memtable, FlushReason reason)
     {
-        float largestRatio = 0f;
-        Memtable largest = null;
-        float liveOnHeap = 0, liveOffHeap = 0;
-        for (ColumnFamilyStore cfs : ColumnFamilyStore.all())
-        {
-            // we take a reference to the current main memtable for the CF prior to snapping its ownership ratios
-            // to ensure we have some ordering guarantee for performing the switchMemtableIf(), i.e. we will only
-            // swap if the memtables we are measuring here haven't already been swapped by the time we try to swap them
-            Memtable current = cfs.getTracker().getView().getCurrentMemtable();
-
-            // find the total ownership ratio for the memtable and all SecondaryIndexes owned by this CF,
-            // both on- and off-heap, and select the largest of the two ratios to weight this CF
-            float onHeap = 0f, offHeap = 0f;
-            onHeap += current.getAllocator().onHeap().ownershipRatio();
-            offHeap += current.getAllocator().offHeap().ownershipRatio();
-
-            for (ColumnFamilyStore indexCfs : cfs.indexManager.getAllIndexColumnFamilyStores())
-            {
-                MemtableAllocator allocator = indexCfs.getTracker().getView().getCurrentMemtable().getAllocator();
-                onHeap += allocator.onHeap().ownershipRatio();
-                offHeap += allocator.offHeap().ownershipRatio();
-            }
-
-            float ratio = Math.max(onHeap, offHeap);
-            if (ratio > largestRatio)
-            {
-                largest = current;
-                largestRatio = ratio;
-            }
-
-            liveOnHeap += onHeap;
-            liveOffHeap += offHeap;
-        }
-
-        CompletableFuture<Boolean> returnFuture = new CompletableFuture<>();
-
-        if (largest != null)
-        {
-            float usedOnHeap = Memtable.MEMORY_POOL.onHeap.usedRatio();
-            float usedOffHeap = Memtable.MEMORY_POOL.offHeap.usedRatio();
-            float flushingOnHeap = Memtable.MEMORY_POOL.onHeap.reclaimingRatio();
-            float flushingOffHeap = Memtable.MEMORY_POOL.offHeap.reclaimingRatio();
-            float thisOnHeap = largest.getAllocator().onHeap().ownershipRatio();
-            float thisOffHeap = largest.getAllocator().offHeap().ownershipRatio();
-            logger.debug("Flushing largest {} to free up room. Used total: {}, live: {}, flushing: {}, this: {}",
-                         largest.cfs, ratio(usedOnHeap, usedOffHeap), ratio(liveOnHeap, liveOffHeap),
-                         ratio(flushingOnHeap, flushingOffHeap), ratio(thisOnHeap, thisOffHeap));
-
-            ListenableFuture<CommitLogPosition> flushFuture = largest.cfs.switchMemtableIfCurrent(largest);
-            flushFuture.addListener(() -> {
-                try
-                {
-                    flushFuture.get();
-                    returnFuture.complete(true);
-                }
-                catch (Throwable t)
-                {
-                    returnFuture.completeExceptionally(t);
-                }
-            }, MoreExecutors.directExecutor());
-        }
-        else
-        {
-            logger.debug("Flushing of largest memtable, not done, no memtable found");
+        return switchMemtableIfCurrent(memtable, reason);
+    }
 
-            returnFuture.complete(false);
-        }
+    @Override
+    public Memtable getCurrentMemtable()
+    {
+        return data.getView().getCurrentMemtable();
+    }
 
-        return returnFuture;
+    public static Iterable<Memtable> activeMemtables()
+    {
+        return Iterables.transform(ColumnFamilyStore.all(),
+                                   cfs -> cfs.getTracker().getView().getCurrentMemtable());
     }
 
-    private static String ratio(float onHeap, float offHeap)
+    public Iterable<Memtable> getIndexMemtables()
     {
-        return String.format("%.2f/%.2f", onHeap, offHeap);
+        return Iterables.transform(indexManager.getAllIndexColumnFamilyStores(),
+                                   cfs -> cfs.getTracker().getView().getCurrentMemtable());
     }
 
     /**
@@ -1550,7 +1506,7 @@ public CompactionManager.AllSSTableOpStatus scrub(boolean disableSnapshot, boole
     {
         // skip snapshot creation during scrub, SEE JIRA 5891
         if(!disableSnapshot)
-            snapshotWithoutFlush("pre-scrub-" + System.currentTimeMillis());
+            snapshotWithoutMemtable("pre-scrub-" + System.currentTimeMillis());
 
         try
         {
@@ -1852,15 +1808,15 @@ public ClusteringComparator getComparator()
         return metadata().comparator;
     }
 
-    public void snapshotWithoutFlush(String snapshotName)
+    public void snapshotWithoutMemtable(String snapshotName)
     {
-        snapshotWithoutFlush(snapshotName, null, false, null);
+        snapshotWithoutMemtable(snapshotName, null, false, null);
     }
 
     /**
      * @param ephemeral If this flag is set to true, the snapshot will be cleaned during next startup
      */
-    public Set<SSTableReader> snapshotWithoutFlush(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, RateLimiter rateLimiter)
+    public Set<SSTableReader> snapshotWithoutMemtable(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, RateLimiter rateLimiter)
     {
         if (rateLimiter == null)
             rateLimiter = DatabaseDescriptor.getSnapshotRateLimiter();
@@ -2025,36 +1981,43 @@ public Set<SSTableReader> snapshot(String snapshotName)
      * Take a snap shot of this columnfamily store.
      *
      * @param snapshotName the name of the associated with the snapshot
-     * @param skipFlush Skip blocking flush of memtable
+     * @param skipMemtable Skip flushing the memtable
      * @param rateLimiter Rate limiter for hardlinks-per-second
      */
-    public Set<SSTableReader> snapshot(String snapshotName, boolean skipFlush, RateLimiter rateLimiter)
+    public Set<SSTableReader> snapshot(String snapshotName, boolean skipMemtable, RateLimiter rateLimiter)
     {
-        return snapshot(snapshotName, null, false, skipFlush, rateLimiter);
+        return snapshot(snapshotName, null, false, skipMemtable, rateLimiter);
     }
 
 
     /**
      * @param ephemeral If this flag is set to true, the snapshot will be cleaned up during next startup
-     * @param skipFlush Skip blocking flush of memtable
+     * @param skipMemtable Skip flushing the memtable
      */
-    public Set<SSTableReader> snapshot(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, boolean skipFlush)
+    public Set<SSTableReader> snapshot(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, boolean skipMemtable)
     {
-        return snapshot(snapshotName, predicate, ephemeral, skipFlush, null);
+        return snapshot(snapshotName, predicate, ephemeral, skipMemtable, null);
     }
 
     /**
      * @param ephemeral If this flag is set to true, the snapshot will be cleaned up during next startup
-     * @param skipFlush Skip blocking flush of memtable
+     * @param skipMemtable Skip flushing the memtable
      * @param rateLimiter Rate limiter for hardlinks-per-second
      */
-    public Set<SSTableReader> snapshot(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, boolean skipFlush, RateLimiter rateLimiter)
+    public Set<SSTableReader> snapshot(String snapshotName, Predicate<SSTableReader> predicate, boolean ephemeral, boolean skipMemtable, RateLimiter rateLimiter)
     {
-        if (!skipFlush)
+        if (!skipMemtable)
         {
-            forceBlockingFlush();
+            Memtable current = getTracker().getView().getCurrentMemtable();
+            if (!current.isClean())
+            {
+                if (current.shouldSwitch(FlushReason.SNAPSHOT))
+                    FBUtilities.waitOnFuture(switchMemtableIfCurrent(current, FlushReason.SNAPSHOT));
+                else
+                    current.performSnapshot(snapshotName);
+            }
         }
-        return snapshotWithoutFlush(snapshotName, predicate, ephemeral, rateLimiter);
+        return snapshotWithoutMemtable(snapshotName, predicate, ephemeral, rateLimiter);
     }
 
     public boolean snapshotExists(String snapshotName)
@@ -2233,6 +2196,131 @@ public long estimatedKeysForRange(Range<Token> range)
         }
     }
 
+    /**
+     * Writes the content of the current memtable that falls in the supplied ranges to sstables and adds
+     * references to those sstables to {@code placeIntoRefs}. Nothing is added when
+     * {@code writeMemtableRanges} returns no writer.
+     *
+     * @param repairSessionID passed through to the sstable writer
+     * @param rangesSupplier  supplies the partition position ranges to write out
+     * @param placeIntoRefs   receives the references to the written sstables
+     */
+    public void writeAndAddMemtableRanges(UUID repairSessionID,
+                                          Supplier<Collection<Range<PartitionPosition>>> rangesSupplier,
+                                          Refs<SSTableReader> placeIntoRefs)
+    {
+        @SuppressWarnings("resource") // closed by finish or on exception
+        SSTableMultiWriter memtableContent = writeMemtableRanges(rangesSupplier, repairSessionID);
+        if (memtableContent != null)
+        {
+            try
+            {
+                Collection<SSTableReader> sstables = memtableContent.finish(true);
+                try (Refs<SSTableReader> sstableReferences = Refs.ref(sstables))
+                {
+                    // This moves all references to placeIntoRefs, clearing sstableReferences
+                    placeIntoRefs.addAll(sstableReferences);
+                }
+
+                // Release the reference any written sstables start with.
+                for (SSTableReader rdr : sstables)
+                {
+                    rdr.selfRef().release();
+                    logger.info("Memtable ranges (keys {} size {}) written in {}",
+                                rdr.estimatedKeys(),
+                                rdr.getDataChannel().size(),
+                                rdr);
+                }
+            }
+            catch (Throwable t)
+            {
+                memtableContent.close();
+                Throwables.propagate(t);
+            }
+        }
+    }
+
+    /**
+     * Writes the partitions of the current memtable that fall in the supplied ranges into a single sstable
+     * writer created with the no-op {@code DO_NOT_TRACK} tracker. The write runs on the calling thread.
+     *
+     * @param rangesSupplier  supplies the ranges to write; only consulted if {@code streamFromMemtable()} is true
+     * @param repairSessionID passed through to the sstable writer
+     * @return the writer holding the written data, which the caller must finish or close; {@code null} if
+     *         {@code streamFromMemtable()} is false, the current memtable is clean, or the ranges hold no partitions
+     */
+    private SSTableMultiWriter writeMemtableRanges(Supplier<Collection<Range<PartitionPosition>>> rangesSupplier,
+                                                   UUID repairSessionID)
+    {
+        if (!streamFromMemtable())
+            return null;
+
+        Collection<Range<PartitionPosition>> ranges = rangesSupplier.get();
+        Memtable current = getTracker().getView().getCurrentMemtable();
+        if (current.isClean())
+            return null;
+
+        List<Memtable.FlushCollection<?>> dataSets = new ArrayList<>(ranges.size());
+        long keys = 0;
+        for (Range<PartitionPosition> range : ranges)
+        {
+            Memtable.FlushCollection<?> dataSet = current.getFlushSet(range.left, range.right);
+            dataSets.add(dataSet);
+            keys += dataSet.partitionCount();
+        }
+        if (keys == 0)
+            return null;
+
+        // TODO: Can we write directly to stream, skipping disk?
+        // keys > 0 guarantees at least one data set; the header is built from the first one only.
+        Memtable.FlushCollection<?> firstDataSet = dataSets.get(0);
+        SSTableMultiWriter writer = createSSTableMultiWriter(newSSTableDescriptor(directories.getDirectoryForNewSSTables()),
+                                                             keys,
+                                                             0,
+                                                             repairSessionID,
+                                                             false,
+                                                             0,
+                                                             new SerializationHeader(true,
+                                                                                     firstDataSet.metadata(),
+                                                                                     firstDataSet.columns(),
+                                                                                     firstDataSet.encodingStats()),
+                                                             DO_NOT_TRACK);
+        try
+        {
+            for (Memtable.FlushCollection<?> dataSet : dataSets)
+                new Flushing.FlushRunnable(dataSet, writer, metric, false).call();  // executes on this thread
+
+            return writer;
+        }
+        catch (Error | RuntimeException t)
+        {
+            writer.abort(t);
+            throw t;
+        }
+    }
+
+    /** Tracker whose track/untrack callbacks do nothing; reports {@link OperationType#FLUSH} as its operation type. */
+    private static final LifecycleNewTracker DO_NOT_TRACK = new LifecycleNewTracker()
+    {
+        @Override
+        public void trackNew(SSTable table)
+        {
+            // not tracking
+        }
+
+        @Override
+        public void untrackNew(SSTable table)
+        {
+            // not tracking
+        }
+
+        @Override
+        public OperationType opType()
+        {
+            return OperationType.FLUSH;
+        }
+    };
+
     /**
      * For testing.  No effort is made to clear historical or even the current memtables, nor for
      * thread safety.  All we do is wipe the sstable containers clean, while leaving the actual
@@ -2244,7 +2309,7 @@ public void clearUnsafe()
         for (final ColumnFamilyStore cfs : concatWithIndexes())
         {
             cfs.runWithCompactionsDisabled((Callable<Void>) () -> {
-                cfs.data.reset(new Memtable(new AtomicReference<>(CommitLogPosition.NONE), cfs));
+                cfs.data.reset(memtableFactory.create(new AtomicReference<>(CommitLogPosition.NONE), cfs.metadata, cfs));
                 return null;
             }, true, false);
         }
@@ -2282,23 +2347,19 @@ public void truncateBlocking(boolean snapshot)
         final long truncatedAt;
         final CommitLogPosition replayAfter;
 
-        if (keyspace.getMetadata().params.durableWrites || snapshot)
+        if ((keyspace.getMetadata().params.durableWrites && !memtableWritesAreDurable())  // need to clear dirty regions
+            || snapshot) // need sstable for snapshot
         {
-            replayAfter = forceBlockingFlush();
-            viewManager.forceBlockingFlush();
+            replayAfter = forceBlockingFlush(FlushReason.TRUNCATE);
+            viewManager.forceBlockingFlush(FlushReason.TRUNCATE);
         }
         else
         {
             // just nuke the memtable data w/o writing to disk first
-            viewManager.dumpMemtables();
-            try
-            {
-                replayAfter = dumpMemtable().get();
-            }
-            catch (Exception e)
-            {
-                throw new RuntimeException(e);
-            }
+            // note: this does not wait for the switch to complete, but because the post-flush processing is serial,
+            // the call below does.
+            viewManager.dumpMemtables(FlushReason.TRUNCATE);
+            replayAfter = FBUtilities.waitOnFuture(dumpMemtable(FlushReason.TRUNCATE));
         }
 
         long now = System.currentTimeMillis();
@@ -2344,7 +2405,7 @@ public void run()
     /**
      * Drops current memtable without flushing to disk. This should only be called when truncating a column family which is not durable.
      */
-    public Future<CommitLogPosition> dumpMemtable()
+    public Future<CommitLogPosition> dumpMemtable(FlushReason reason)
     {
         synchronized (data)
         {
@@ -2355,6 +2416,19 @@ public Future<CommitLogPosition> dumpMemtable()
         }
     }
 
+    /**
+     * Gets rid of the current memtable before this store is unloaded: flushes it (blocking) when the keyspace
+     * uses durable writes and {@code memtableWritesAreDurable()} is false, otherwise drops its content
+     * without writing it to disk and waits for that to complete. Both paths use {@link FlushReason#DROP}.
+     */
+    public void unloadCf()
+    {
+        if (keyspace.getMetadata().params.durableWrites && !memtableWritesAreDurable())  // need to clear dirty regions
+            forceBlockingFlush(FlushReason.DROP);
+        else
+            FBUtilities.waitOnFuture(dumpMemtable(FlushReason.DROP));
+    }
+
     public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation, boolean interruptViews)
     {
         return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true);
diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java
index b21df8f17d33..ce022bc074ed 100644
--- a/src/java/org/apache/cassandra/db/DecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/DecoratedKey.java
@@ -151,6 +151,7 @@ public Token getToken()
     }
 
     public abstract ByteBuffer getKey();
+    public abstract int getKeyLength();
 
     public void filterHash(long[] dest)
     {
diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index 9c32f4a67797..0d4376426596 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java
@@ -413,7 +413,7 @@ public void dropCf(TableId tableId)
     // disassociate a cfs from this keyspace instance.
     private void unloadCf(ColumnFamilyStore cfs)
     {
-        cfs.forceBlockingFlush();
+        cfs.unloadCf();
         cfs.invalidate();
     }
 
@@ -681,11 +681,11 @@ public AbstractReplicationStrategy getReplicationStrategy()
         return replicationStrategy;
     }
 
-    public List<Future<?>> flush()
+    public List<Future<?>> flush(ColumnFamilyStore.FlushReason reason)
     {
         List<Future<?>> futures = new ArrayList<>(columnFamilyStores.size());
         for (ColumnFamilyStore cfs : columnFamilyStores.values())
-            futures.add(cfs.forceFlush());
+            futures.add(cfs.forceFlush(reason));
         return futures;
     }
 
diff --git a/src/java/org/apache/cassandra/db/Memtable.java b/src/java/org/apache/cassandra/db/Memtable.java
deleted file mode 100644
index 4da1c26503e7..000000000000
--- a/src/java/org/apache/cassandra/db/Memtable.java
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ConcurrentNavigableMap;
-import java.util.concurrent.ConcurrentSkipListMap;
-import java.util.concurrent.ConcurrentSkipListSet;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicBoolean;
-import java.util.concurrent.atomic.AtomicLong;
-import java.util.concurrent.atomic.AtomicReference;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-import com.google.common.base.Throwables;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.commitlog.CommitLog;
-import org.apache.cassandra.db.commitlog.CommitLogPosition;
-import org.apache.cassandra.db.commitlog.IntervalSet;
-import org.apache.cassandra.db.filter.ClusteringIndexFilter;
-import org.apache.cassandra.db.filter.ColumnFilter;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.db.partitions.AbstractBTreePartition;
-import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator;
-import org.apache.cassandra.db.partitions.AtomicBTreePartition;
-import org.apache.cassandra.db.partitions.Partition;
-import org.apache.cassandra.db.partitions.PartitionUpdate;
-import org.apache.cassandra.db.rows.EncodingStats;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
-import org.apache.cassandra.dht.AbstractBounds;
-import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.IncludingExcludingBounds;
-import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.index.transactions.UpdateTransaction;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMultiWriter;
-import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.SchemaConstants;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.HeapPool;
-import org.apache.cassandra.utils.memory.MemtableAllocator;
-import org.apache.cassandra.utils.memory.MemtableCleaner;
-import org.apache.cassandra.utils.memory.MemtablePool;
-import org.apache.cassandra.utils.memory.NativePool;
-import org.apache.cassandra.utils.memory.SlabPool;
-
-import static org.apache.cassandra.utils.Throwables.maybeFail;
-
-public class Memtable implements Comparable<Memtable>
-{
-    private static final Logger logger = LoggerFactory.getLogger(Memtable.class);
-
-    public static final MemtablePool MEMORY_POOL = createMemtableAllocatorPool();
-
-    private static MemtablePool createMemtableAllocatorPool()
-    {
-        long heapLimit = DatabaseDescriptor.getMemtableHeapSpaceInMb() << 20;
-        long offHeapLimit = DatabaseDescriptor.getMemtableOffheapSpaceInMb() << 20;
-        final float cleaningThreshold = DatabaseDescriptor.getMemtableCleanupThreshold();
-        final MemtableCleaner cleaner = ColumnFamilyStore::flushLargestMemtable;
-        switch (DatabaseDescriptor.getMemtableAllocationType())
-        {
-            case unslabbed_heap_buffers:
-                return new HeapPool(heapLimit, cleaningThreshold, cleaner);
-            case heap_buffers:
-                return new SlabPool(heapLimit, 0, cleaningThreshold, cleaner);
-            case offheap_buffers:
-                return new SlabPool(heapLimit, offHeapLimit, cleaningThreshold, cleaner);
-            case offheap_objects:
-                return new NativePool(heapLimit, offHeapLimit, cleaningThreshold, cleaner);
-            default:
-                throw new AssertionError();
-        }
-    }
-
-    private static final int ROW_OVERHEAD_HEAP_SIZE = estimateRowOverhead(Integer.parseInt(System.getProperty("cassandra.memtable_row_overhead_computation_step", "100000")));
-
-    private final MemtableAllocator allocator;
-    private final AtomicLong liveDataSize = new AtomicLong(0);
-    private final AtomicLong currentOperations = new AtomicLong(0);
-
-    // Allows us to find a Memtable by its tracker
-    private volatile LifecycleNewTracker tracker;
-    // the write barrier for directing writes to this memtable or the next during a switch
-    private volatile OpOrder.Barrier writeBarrier;
-    // the precise upper bound of CommitLogPosition owned by this memtable
-    private volatile AtomicReference<CommitLogPosition> commitLogUpperBound;
-    // the precise lower bound of CommitLogPosition owned by this memtable; equal to its predecessor's commitLogUpperBound
-    private AtomicReference<CommitLogPosition> commitLogLowerBound;
-
-    // The approximate lower bound by this memtable; must be <= commitLogLowerBound once our predecessor
-    // has been finalised, and this is enforced in the ColumnFamilyStore.setCommitLogUpperBound
-    private final CommitLogPosition approximateCommitLogLowerBound = CommitLog.instance.getCurrentPosition();
-
-    public int compareTo(Memtable that)
-    {
-        return this.approximateCommitLogLowerBound.compareTo(that.approximateCommitLogLowerBound);
-    }
-
-    public static final class LastCommitLogPosition extends CommitLogPosition
-    {
-        public LastCommitLogPosition(CommitLogPosition copy)
-        {
-            super(copy.segmentId, copy.position);
-        }
-    }
-
-    // We index the memtable by PartitionPosition only for the purpose of being able
-    // to select key range using Token.KeyBound. However put() ensures that we
-    // actually only store DecoratedKey.
-    private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> partitions = new ConcurrentSkipListMap<>();
-    public final ColumnFamilyStore cfs;
-    private final long creationNano = System.nanoTime();
-
-    // The smallest timestamp for all partitions stored in this memtable
-    private long minTimestamp = Long.MAX_VALUE;
-
-    // Record the comparator of the CFS at the creation of the memtable. This
-    // is only used when a user update the CF comparator, to know if the
-    // memtable was created with the new or old comparator.
-    public final ClusteringComparator initialComparator;
-
-    private final ColumnsCollector columnsCollector;
-    private final StatsCollector statsCollector = new StatsCollector();
-
-    // only to be used by init(), to setup the very first memtable for the cfs
-    public Memtable(AtomicReference<CommitLogPosition> commitLogLowerBound, ColumnFamilyStore cfs)
-    {
-        this.cfs = cfs;
-        this.commitLogLowerBound = commitLogLowerBound;
-        this.allocator = MEMORY_POOL.newAllocator();
-        this.initialComparator = cfs.metadata().comparator;
-        this.cfs.scheduleFlush();
-        this.columnsCollector = new ColumnsCollector(cfs.metadata().regularAndStaticColumns());
-    }
-
-    // ONLY to be used for testing, to create a mock Memtable
-    @VisibleForTesting
-    public Memtable(TableMetadata metadata)
-    {
-        this.initialComparator = metadata.comparator;
-        this.cfs = null;
-        this.allocator = null;
-        this.columnsCollector = new ColumnsCollector(metadata.regularAndStaticColumns());
-    }
-
-    public MemtableAllocator getAllocator()
-    {
-        return allocator;
-    }
-
-    public void allocateExtraOnHeap(long additionalSpace, OpOrder.Group opGroup)
-    {
-        getAllocator().onHeap().allocate(additionalSpace, opGroup);
-    }
-
-    public long getLiveDataSize()
-    {
-        return liveDataSize.get();
-    }
-
-    public long getOperations()
-    {
-        return currentOperations.get();
-    }
-
-    @VisibleForTesting
-    public void setDiscarding(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound)
-    {
-        assert this.writeBarrier == null;
-        this.commitLogUpperBound = commitLogUpperBound;
-        this.writeBarrier = writeBarrier;
-        allocator.setDiscarding();
-    }
-
-    void setDiscarded()
-    {
-        allocator.setDiscarded();
-    }
-
-    // decide if this memtable should take the write, or if it should go to the next memtable
-    public boolean accepts(OpOrder.Group opGroup, CommitLogPosition commitLogPosition)
-    {
-        // if the barrier hasn't been set yet, then this memtable is still taking ALL writes
-        OpOrder.Barrier barrier = this.writeBarrier;
-        if (barrier == null)
-            return true;
-        // if the barrier has been set, but is in the past, we are definitely destined for a future memtable
-        if (!barrier.isAfter(opGroup))
-            return false;
-        // if we aren't durable we are directed only by the barrier
-        if (commitLogPosition == null)
-            return true;
-        while (true)
-        {
-            // otherwise we check if we are in the past/future wrt the CL boundary;
-            // if the boundary hasn't been finalised yet, we simply update it to the max of
-            // its current value and ours; if it HAS been finalised, we simply accept its judgement
-            // this permits us to coordinate a safe boundary, as the boundary choice is made
-            // atomically wrt our max() maintenance, so an operation cannot sneak into the past
-            CommitLogPosition currentLast = commitLogUpperBound.get();
-            if (currentLast instanceof LastCommitLogPosition)
-                return currentLast.compareTo(commitLogPosition) >= 0;
-            if (currentLast != null && currentLast.compareTo(commitLogPosition) >= 0)
-                return true;
-            if (commitLogUpperBound.compareAndSet(currentLast, commitLogPosition))
-                return true;
-        }
-    }
-
-    public CommitLogPosition getCommitLogLowerBound()
-    {
-        return commitLogLowerBound.get();
-    }
-
-    public CommitLogPosition getCommitLogUpperBound()
-    {
-        return commitLogUpperBound.get();
-    }
-
-    public boolean isLive()
-    {
-        return allocator.isLive();
-    }
-
-    public boolean isClean()
-    {
-        return partitions.isEmpty();
-    }
-
-    public boolean mayContainDataBefore(CommitLogPosition position)
-    {
-        return approximateCommitLogLowerBound.compareTo(position) < 0;
-    }
-
-    /**
-     * @return true if this memtable is expired. Expiration time is determined by CF's memtable_flush_period_in_ms.
-     */
-    public boolean isExpired()
-    {
-        int period = cfs.metadata().params.memtableFlushPeriodInMs;
-        return period > 0 && (System.nanoTime() - creationNano >= TimeUnit.MILLISECONDS.toNanos(period));
-    }
-
-    /**
-     * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate
-     * OpOrdering.
-     *
-     * commitLogSegmentPosition should only be null if this is a secondary index, in which case it is *expected* to be null
-     */
-    long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
-    {
-        AtomicBTreePartition previous = partitions.get(update.partitionKey());
-
-        long initialSize = 0;
-        if (previous == null)
-        {
-            final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
-            AtomicBTreePartition empty = new AtomicBTreePartition(cfs.metadata, cloneKey, allocator);
-            // We'll add the columns later. This avoids wasting works if we get beaten in the putIfAbsent
-            previous = partitions.putIfAbsent(cloneKey, empty);
-            if (previous == null)
-            {
-                previous = empty;
-                // allocate the row overhead after the fact; this saves over allocating and having to free after, but
-                // means we can overshoot our declared limit.
-                int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
-                allocator.onHeap().allocate(overhead, opGroup);
-                initialSize = 8;
-            }
-        }
-
-        long[] pair = previous.addAllWithSizeDelta(update, opGroup, indexer);
-        minTimestamp = Math.min(minTimestamp, previous.stats().minTimestamp);
-        liveDataSize.addAndGet(initialSize + pair[0]);
-        columnsCollector.update(update.columns());
-        statsCollector.update(update.stats());
-        currentOperations.addAndGet(update.operationCount());
-        return pair[1];
-    }
-
-    public int partitionCount()
-    {
-        return partitions.size();
-    }
-
-    public LifecycleNewTracker tracker()
-    {
-        return tracker;
-    }
-
-    public List<FlushRunnable> flushRunnables(LifecycleTransaction txn)
-    {
-        Preconditions.checkState(this.tracker == null, "Attempted to flush Memtable more than once on %s.%s", cfs.keyspace.getName(), cfs.name);
-        this.tracker = txn;
-
-        return createFlushRunnables(txn);
-    }
-
-    private List<FlushRunnable> createFlushRunnables(LifecycleTransaction txn)
-    {
-        DiskBoundaries diskBoundaries = cfs.getDiskBoundaries();
-        List<PartitionPosition> boundaries = diskBoundaries.positions;
-        List<Directories.DataDirectory> locations = diskBoundaries.directories;
-        if (boundaries == null)
-            return Collections.singletonList(new FlushRunnable(txn));
-
-        return createFlushRunnables(boundaries, locations, txn);
-    }
-
-    @VisibleForTesting
-    List<FlushRunnable> createFlushRunnables(List<PartitionPosition> boundaries, List<Directories.DataDirectory> locations, LifecycleTransaction txn)
-    {
-        List<FlushRunnable> runnables = new ArrayList<>(boundaries.size());
-        PartitionPosition rangeStart = cfs.getPartitioner().getMinimumToken().minKeyBound();
-        try
-        {
-            for (int i = 0; i < boundaries.size(); i++)
-            {
-                PartitionPosition t = boundaries.get(i);
-                runnables.add(new FlushRunnable(rangeStart, t, locations.get(i), txn));
-                rangeStart = t;
-            }
-            return runnables;
-        }
-        catch (Throwable e)
-        {
-            for (Memtable.FlushRunnable runnable : runnables)
-                e = runnable.abort(e);
-
-            throw Throwables.propagate(e);
-        }
-    }
-
-    public String toString()
-    {
-        return String.format("Memtable-%s@%s(%s serialized bytes, %s ops, %.0f%%/%.0f%% of on/off-heap limit)",
-                             cfs.name, hashCode(), FBUtilities.prettyPrintMemory(liveDataSize.get()), currentOperations,
-                             100 * allocator.onHeap().ownershipRatio(), 100 * allocator.offHeap().ownershipRatio());
-    }
-
-    public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter, final DataRange dataRange)
-    {
-        AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();
-
-        boolean startIsMin = keyRange.left.isMinimum();
-        boolean stopIsMin = keyRange.right.isMinimum();
-
-        boolean isBound = keyRange instanceof Bounds;
-        boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
-        boolean includeStop = isBound || keyRange instanceof Range;
-        Map<PartitionPosition, AtomicBTreePartition> subMap;
-        if (startIsMin)
-            subMap = stopIsMin ? partitions : partitions.headMap(keyRange.right, includeStop);
-        else
-            subMap = stopIsMin
-                   ? partitions.tailMap(keyRange.left, includeStart)
-                   : partitions.subMap(keyRange.left, includeStart, keyRange.right, includeStop);
-
-        int minLocalDeletionTime = Integer.MAX_VALUE;
-
-        // avoid iterating over the memtable if we purge all tombstones
-        if (cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones())
-            minLocalDeletionTime = findMinLocalDeletionTime(subMap.entrySet().iterator());
-
-        final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter = subMap.entrySet().iterator();
-
-        return new MemtableUnfilteredPartitionIterator(cfs, iter, minLocalDeletionTime, columnFilter, dataRange);
-    }
-
-    private int findMinLocalDeletionTime(Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iterator)
-    {
-        int minLocalDeletionTime = Integer.MAX_VALUE;
-        while (iterator.hasNext())
-        {
-            Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iterator.next();
-            minLocalDeletionTime = Math.min(minLocalDeletionTime, entry.getValue().stats().minLocalDeletionTime);
-        }
-        return minLocalDeletionTime;
-    }
-
-    public Partition getPartition(DecoratedKey key)
-    {
-        return partitions.get(key);
-    }
-
-    public long getMinTimestamp()
-    {
-        return minTimestamp;
-    }
-
-    /**
-     * For testing only. Give this memtable too big a size to make it always fail flushing.
-     */
-    @VisibleForTesting
-    public void makeUnflushable()
-    {
-        liveDataSize.addAndGet((long) 1024 * 1024 * 1024 * 1024 * 1024);
-    }
-
-    /**
-     * The valid states for {@link FlushRunnable} writers. The thread writing the contents
-     * will transition from IDLE -> RUNNING and back to IDLE when finished using the writer
-     * or from ABORTING -> ABORTED if another thread has transitioned from RUNNING -> ABORTING.
-     * We can also transition directly from IDLE -> ABORTED. Whichever threads transitions
-     * to ABORTED is responsible to abort the writer.
-     */
-    @VisibleForTesting
-    enum FlushRunnableWriterState
-    {
-        IDLE, // the runnable is idle, either not yet started or completed but with the writer waiting to be committed
-        RUNNING, // the runnable is executing, therefore the writer cannot be aborted or else a SEGV may ensue
-        ABORTING, // an abort request has been issued, this only happens if abort() is called whilst RUNNING
-        ABORTED  // the writer has been aborted, no resources will be leaked
-    }
-
-    class FlushRunnable implements Callable<SSTableMultiWriter>
-    {
-        private final long estimatedSize;
-        private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> toFlush;
-
-        private final boolean isBatchLogTable;
-        private final SSTableMultiWriter writer;
-
-        // keeping these to be able to log what we are actually flushing
-        private final PartitionPosition from;
-        private final PartitionPosition to;
-
-        private final AtomicReference<FlushRunnableWriterState> state;
-
-        FlushRunnable(PartitionPosition from, PartitionPosition to, Directories.DataDirectory flushLocation, LifecycleTransaction txn)
-        {
-            this(partitions.subMap(from, to), flushLocation, from, to, txn);
-        }
-
-        FlushRunnable(LifecycleTransaction txn)
-        {
-            this(partitions, null, null, null, txn);
-        }
-
-        FlushRunnable(ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> toFlush, Directories.DataDirectory flushLocation, PartitionPosition from, PartitionPosition to, LifecycleTransaction txn)
-        {
-            this.toFlush = toFlush;
-            this.from = from;
-            this.to = to;
-            long keySize = 0;
-            state = new AtomicReference<>(FlushRunnableWriterState.IDLE);
-
-            for (PartitionPosition key : toFlush.keySet())
-            {
-                //  make sure we don't write non-sensical keys
-                assert key instanceof DecoratedKey;
-                keySize += ((DecoratedKey) key).getKey().remaining();
-            }
-            estimatedSize = (long) ((keySize // index entries
-                                    + keySize // keys in data file
-                                    + liveDataSize.get()) // data
-                                    * 1.2); // bloom filter and row index overhead
-
-            this.isBatchLogTable = cfs.name.equals(SystemKeyspace.BATCHES) && cfs.keyspace.getName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME);
-
-            if (flushLocation == null)
-                writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getWriteableLocationAsFile(estimatedSize)), columnsCollector.get(), statsCollector.get());
-            else
-                writer = createFlushWriter(txn, cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(flushLocation)), columnsCollector.get(), statsCollector.get());
-        }
-
-        protected Directories getDirectories()
-        {
-            return cfs.getDirectories();
-        }
-
-        private void writeSortedContents()
-        {
-            if (!state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.RUNNING))
-            {
-                logger.debug("Failed to write {}, flushed range = ({}, {}], state: {}",
-                             Memtable.this.toString(), from, to, state);
-                return;
-            }
-
-            logger.info("Writing {}, flushed range = ({}, {}], state: {}",
-                        Memtable.this.toString(), from, to, state);
-
-            int heavilyContendedRowCount = 0;
-            try
-            {
-                boolean trackContention = logger.isTraceEnabled();
-                // (we can't clear out the map as-we-go to free up memory,
-                //  since the memtable is being used for queries in the "pending flush" category)
-                for (AtomicBTreePartition partition : toFlush.values())
-                {
-                    if (state.get() == FlushRunnableWriterState.ABORTING)
-                        break;
-
-                    // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
-                    // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local,
-                    // we don't need to preserve tombstones for repair. So if both operation are in this
-                    // memtable (which will almost always be the case if there is no ongoing failure), we can
-                    // just skip the entry (CASSANDRA-4667).
-                    if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
-                        continue;
-
-                    if (trackContention && partition.useLock())
-                        heavilyContendedRowCount++;
-
-                    if (!partition.isEmpty())
-                    {
-                        try (UnfilteredRowIterator iter = partition.unfilteredIterator())
-                        {
-                            writer.append(iter);
-                        }
-                    }
-                }
-            }
-            finally
-            {
-                while (true)
-                {
-                    if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.IDLE))
-                    {
-                        long bytesFlushed = writer.getFilePointer();
-                        logger.info("Completed flushing {} ({}) for commitlog position {}",
-                                    writer.getFilename(),
-                                    FBUtilities.prettyPrintMemory(bytesFlushed),
-                                    commitLogUpperBound);
-                        // Update the metrics
-                        cfs.metric.bytesFlushed.inc(bytesFlushed);
-
-                        if (heavilyContendedRowCount > 0)
-                            logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), Memtable.this);
-                        break;
-                    }
-                    else if (state.compareAndSet(FlushRunnableWriterState.ABORTING, FlushRunnableWriterState.ABORTED))
-                    {
-                        logger.debug("Flushing of {} aborted", writer.getFilename());
-                        maybeFail(writer.abort(null));
-                        break;
-                    }
-                }
-            }
-        }
-
-        public Throwable abort(Throwable throwable)
-        {
-            while (true)
-            {
-                if (state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.ABORTED))
-                {
-                    logger.debug("Flushing of {} aborted", writer.getFilename());
-                    return writer.abort(throwable);
-                }
-                else if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.ABORTING))
-                {
-                    // thread currently executing writeSortedContents() will take care of aborting and throw any exceptions
-                    return throwable;
-                }
-            }
-        }
-
-        @VisibleForTesting
-        FlushRunnableWriterState state()
-        {
-            return state.get();
-        }
-
-        public SSTableMultiWriter createFlushWriter(LifecycleTransaction txn,
-                                                    Descriptor descriptor,
-                                                    RegularAndStaticColumns columns,
-                                                    EncodingStats stats)
-        {
-            MetadataCollector sstableMetadataCollector = new MetadataCollector(cfs.metadata().comparator)
-                    .commitLogIntervals(new IntervalSet<>(commitLogLowerBound.get(), commitLogUpperBound.get()));
-
-            return cfs.createSSTableMultiWriter(descriptor,
-                                                toFlush.size(),
-                                                ActiveRepairService.UNREPAIRED_SSTABLE,
-                                                ActiveRepairService.NO_PENDING_REPAIR,
-                                                false,
-                                                sstableMetadataCollector,
-                                                new SerializationHeader(true, cfs.metadata(), columns, stats), txn);
-        }
-
-        @Override
-        public SSTableMultiWriter call()
-        {
-            writeSortedContents();
-            return writer;
-        }
-    }
-
-    private static int estimateRowOverhead(final int count)
-    {
-        // calculate row overhead
-        try (final OpOrder.Group group = new OpOrder().start())
-        {
-            int rowOverhead;
-            MemtableAllocator allocator = MEMORY_POOL.newAllocator();
-            ConcurrentNavigableMap<PartitionPosition, Object> partitions = new ConcurrentSkipListMap<>();
-            final Object val = new Object();
-            for (int i = 0 ; i < count ; i++)
-                partitions.put(allocator.clone(new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);
-            double avgSize = ObjectSizes.measureDeep(partitions) / (double) count;
-            rowOverhead = (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
-            rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
-            rowOverhead += AtomicBTreePartition.EMPTY_SIZE;
-            rowOverhead += AbstractBTreePartition.HOLDER_UNSHARED_HEAP_SIZE;
-            allocator.setDiscarding();
-            allocator.setDiscarded();
-            return rowOverhead;
-        }
-    }
-
-    public static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator
-    {
-        private final ColumnFamilyStore cfs;
-        private final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter;
-        private final int minLocalDeletionTime;
-        private final ColumnFilter columnFilter;
-        private final DataRange dataRange;
-
-        public MemtableUnfilteredPartitionIterator(ColumnFamilyStore cfs, Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter, int minLocalDeletionTime, ColumnFilter columnFilter, DataRange dataRange)
-        {
-            this.cfs = cfs;
-            this.iter = iter;
-            this.minLocalDeletionTime = minLocalDeletionTime;
-            this.columnFilter = columnFilter;
-            this.dataRange = dataRange;
-        }
-
-        public int getMinLocalDeletionTime()
-        {
-            return minLocalDeletionTime;
-        }
-
-        public TableMetadata metadata()
-        {
-            return cfs.metadata();
-        }
-
-        public boolean hasNext()
-        {
-            return iter.hasNext();
-        }
-
-        public UnfilteredRowIterator next()
-        {
-            Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iter.next();
-            // Actual stored key should be true DecoratedKey
-            assert entry.getKey() instanceof DecoratedKey;
-            DecoratedKey key = (DecoratedKey)entry.getKey();
-            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
-
-            return filter.getUnfilteredRowIterator(columnFilter, entry.getValue());
-        }
-    }
-
-    private static class ColumnsCollector
-    {
-        private final HashMap<ColumnMetadata, AtomicBoolean> predefined = new HashMap<>();
-        private final ConcurrentSkipListSet<ColumnMetadata> extra = new ConcurrentSkipListSet<>();
-        ColumnsCollector(RegularAndStaticColumns columns)
-        {
-            for (ColumnMetadata def : columns.statics)
-                predefined.put(def, new AtomicBoolean());
-            for (ColumnMetadata def : columns.regulars)
-                predefined.put(def, new AtomicBoolean());
-        }
-
-        public void update(RegularAndStaticColumns columns)
-        {
-            for (ColumnMetadata s : columns.statics)
-                update(s);
-            for (ColumnMetadata r : columns.regulars)
-                update(r);
-        }
-
-        private void update(ColumnMetadata definition)
-        {
-            AtomicBoolean present = predefined.get(definition);
-            if (present != null)
-            {
-                if (!present.get())
-                    present.set(true);
-            }
-            else
-            {
-                extra.add(definition);
-            }
-        }
-
-        public RegularAndStaticColumns get()
-        {
-            RegularAndStaticColumns.Builder builder = RegularAndStaticColumns.builder();
-            for (Map.Entry<ColumnMetadata, AtomicBoolean> e : predefined.entrySet())
-                if (e.getValue().get())
-                    builder.add(e.getKey());
-            return builder.addAll(extra).build();
-        }
-    }
-
-    private static class StatsCollector
-    {
-        private final AtomicReference<EncodingStats> stats = new AtomicReference<>(EncodingStats.NO_STATS);
-
-        public void update(EncodingStats newStats)
-        {
-            while (true)
-            {
-                EncodingStats current = stats.get();
-                EncodingStats updated = current.mergeWith(newStats);
-                if (stats.compareAndSet(current, updated))
-                    return;
-            }
-        }
-
-        public EncodingStats get()
-        {
-            return stats.get();
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
index e5517ad48c6a..461a25a4f54a 100644
--- a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
+++ b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java
@@ -74,6 +74,12 @@ public ByteBuffer getKey()
         return MemoryUtil.getByteBuffer(peer + 4, MemoryUtil.getInt(peer), ByteOrder.BIG_ENDIAN);
     }
 
+    @Override
+    public int getKeyLength()
+    {
+        return MemoryUtil.getInt(peer);
+    }
+
     @Override
     protected ByteSource keyComparableBytes(Version version)
     {
diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
index a1f88f9b761d..434cb692c555 100644
--- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
@@ -22,6 +22,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -262,11 +263,15 @@ public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, Rea
         InputCollector<UnfilteredPartitionIterator> inputCollector = iteratorsForRange(view);
         try
         {
+            // avoid iterating over the memtable if we purge all tombstones
+            boolean useMinLocalDeletionTime = cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones();
+
             for (Memtable memtable : view.memtables)
             {
                 @SuppressWarnings("resource") // We close on exception and on closing the result returned by this method
                 Memtable.MemtableUnfilteredPartitionIterator iter = memtable.makePartitionIterator(columnFilter(), dataRange());
-                oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.getMinLocalDeletionTime());
+                if (useMinLocalDeletionTime)
+                    oldestUnrepairedTombstone = Math.min(oldestUnrepairedTombstone, iter.getMinLocalDeletionTime());
                 inputCollector.addMemtableIterator(RTBoundValidator.validate(iter, RTBoundValidator.Stage.MEMTABLE, false));
             }
 
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 7b82929e1032..5baa6d7e1ad8 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.lifecycle.*;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.transform.RTBoundValidator;
diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java
index cbb708465c5e..3b89ae35dfac 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspace.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java
@@ -830,7 +830,9 @@ public static void forceBlockingFlush(String ...cfnames)
 
             for (String cfname : cfnames)
             {
-                futures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(cfname).forceFlush());
+                futures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME)
+                                    .getColumnFamilyStore(cfname)
+                                    .forceFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED));
             }
             FBUtilities.waitOnFutures(futures);
         }
diff --git a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java
index 848c6e021851..803e880ea5e9 100755
--- a/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java
+++ b/src/java/org/apache/cassandra/db/commitlog/AbstractCommitLogSegmentManager.java
@@ -403,9 +403,20 @@ private Future<?> flushDataFrom(List<CommitLogSegment> segments, boolean force)
                 else if (!flushes.containsKey(dirtyTableId))
                 {
                     final ColumnFamilyStore cfs = Keyspace.open(metadata.keyspace).getColumnFamilyStore(dirtyTableId);
-                    // can safely call forceFlush here as we will only ever block (briefly) for other attempts to flush,
-                    // no deadlock possibility since switchLock removal
-                    flushes.put(dirtyTableId, force ? cfs.forceFlush() : cfs.forceFlush(maxCommitLogPosition));
+
+                    if (cfs.memtableWritesAreDurable())
+                    {
+                        // The memtable does not need this data to be preserved (we only wrote it for PITR and CDC)
+                        segment.markClean(dirtyTableId, CommitLogPosition.NONE, segment.getCurrentCommitLogPosition());
+                    }
+                    else
+                    {
+                        // can safely call forceFlush here as we will only ever block (briefly) for other attempts to flush,
+                        // no deadlock possibility since switchLock removal
+                        flushes.put(dirtyTableId, force
+                                                  ? cfs.forceFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED)
+                                                  : cfs.forceFlush(maxCommitLogPosition));
+                    }
                 }
             }
         }
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogPosition.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogPosition.java
index 3ffb04ceae57..39f1005a0c6d 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogPosition.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogPosition.java
@@ -95,12 +95,6 @@ public String toString()
                ')';
     }
 
-    public CommitLogPosition clone()
-    {
-        return new CommitLogPosition(segmentId, position);
-    }
-
-
     public static class CommitLogPositionSerializer implements ISerializer<CommitLogPosition>
     {
         public void serialize(CommitLogPosition clsp, DataOutputPlus out) throws IOException
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
index 39777ec46ca1..e83217cbf119 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogReplayer.java
@@ -130,7 +130,16 @@ public static CommitLogReplayer construct(CommitLog commitLog, UUID localHostId)
                 }
             }
 
-            IntervalSet<CommitLogPosition> filter = persistedIntervals(cfs.getLiveSSTables(), truncatedAt, localHostId);
+            IntervalSet<CommitLogPosition> filter;
+            if (!cfs.memtableWritesAreDurable())
+            {
+                filter = persistedIntervals(cfs.getLiveSSTables(), truncatedAt, localHostId);
+            }
+            else
+            {
+                // everything is persisted and restored by the memtable itself
+                filter = new IntervalSet<>(CommitLogPosition.NONE, CommitLog.instance.getCurrentPosition());
+            }
             cfPersisted.put(cfs.metadata.id, filter);
         }
         CommitLogPosition globalPosition = firstNotCovered(cfPersisted.values());
@@ -212,12 +221,14 @@ public int blockForWrites()
             if (keyspace.getName().equals(SchemaConstants.SYSTEM_KEYSPACE_NAME))
                 flushingSystem = true;
 
-            futures.addAll(keyspace.flush());
+            futures.addAll(keyspace.flush(ColumnFamilyStore.FlushReason.STARTUP));
         }
 
         // also flush batchlog incase of any MV updates
         if (!flushingSystem)
-            futures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceFlush());
+            futures.add(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME)
+                                .getColumnFamilyStore(SystemKeyspace.BATCHES)
+                                .forceFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED));
 
         FBUtilities.waitOnFutures(futures);
 
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index 6078dabac2e6..a0ab3fdd6727 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
@@ -28,6 +28,7 @@
 
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index bfb840c166ce..d0ffc64b504b 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -115,7 +115,7 @@ protected void runMayThrow() throws Exception
         CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
 
         if (DatabaseDescriptor.isSnapshotBeforeCompaction())
-            cfs.snapshotWithoutFlush(System.currentTimeMillis() + "-compact-" + cfs.name);
+            cfs.snapshotWithoutMemtable(System.currentTimeMillis() + "-compact-" + cfs.name);
 
         try (CompactionController controller = getCompactionController(transaction.originals()))
         {
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
index 574c6a449936..8b7550d804df 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
@@ -142,7 +142,7 @@ public static LifecycleTransaction offline(OperationType operationType, SSTableR
     public static LifecycleTransaction offline(OperationType operationType, Iterable<SSTableReader> readers)
     {
         // if offline, for simplicity we just use a dummy tracker
-        Tracker dummy = new Tracker(null, false);
+        Tracker dummy = Tracker.newDummyTracker();
         dummy.addInitialSSTables(readers);
         dummy.apply(updateCompacting(emptySet(), readers));
         return new LifecycleTransaction(dummy, operationType, readers);
@@ -154,7 +154,7 @@ public static LifecycleTransaction offline(OperationType operationType, Iterable
     @SuppressWarnings("resource") // log closed during postCleanup
     public static LifecycleTransaction offline(OperationType operationType)
     {
-        Tracker dummy = new Tracker(null, false);
+        Tracker dummy = Tracker.newDummyTracker();
         return new LifecycleTransaction(dummy, new LogTransaction(operationType, dummy), Collections.emptyList());
     }
 
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index a0cbd8b8b66b..949ca79db17d 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
@@ -30,7 +30,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -74,17 +74,23 @@ public class Tracker
     public final boolean loadsstables;
 
     /**
+     * @param columnFamilyStore the store this tracker belongs to. Can be null (see newDummyTracker).
      * @param memtable Initial Memtable. Can be null.
      * @param loadsstables true to indicate to load SSTables (TODO: remove as this is only accessed from 2i)
      */
-    public Tracker(Memtable memtable, boolean loadsstables)
+    public Tracker(ColumnFamilyStore columnFamilyStore, Memtable memtable, boolean loadsstables)
     {
-        this.cfstore = memtable != null ? memtable.cfs : null;
+        this.cfstore = columnFamilyStore;
         this.view = new AtomicReference<>();
         this.loadsstables = loadsstables;
         this.reset(memtable);
     }
 
+    public static Tracker newDummyTracker()
+    {
+        return new Tracker(null, null, false);
+    }
+
     public LifecycleTransaction tryModify(SSTableReader sstable, OperationType operationType)
     {
         return tryModify(singleton(sstable), operationType);
diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java
index e2f09b7791d4..4aad49ed832f 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/View.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/View.java
@@ -26,7 +26,7 @@
 import com.google.common.collect.*;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
new file mode 100644
index 000000000000..e13fc6543362
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.util.concurrent.ListenableFuture;
+import com.google.common.util.concurrent.MoreExecutors;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.WrappedRunnable;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapPool;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+import org.apache.cassandra.utils.memory.MemtableCleaner;
+import org.apache.cassandra.utils.memory.MemtablePool;
+import org.apache.cassandra.utils.memory.NativePool;
+import org.apache.cassandra.utils.memory.SlabPool;
+
+/**
+ * A memtable that uses memory tracked and maybe allocated via a MemtableAllocator from a MemtablePool.
+ * Provides methods of memory tracking and triggering flushes when the relevant limits are reached.
+ */
+public abstract class AbstractAllocatorMemtable extends AbstractMemtableWithCommitlog
+{
+    private static final Logger logger = LoggerFactory.getLogger(AbstractAllocatorMemtable.class);
+
+    public static final MemtablePool MEMORY_POOL = AbstractAllocatorMemtable.createMemtableAllocatorPool();
+
+    protected final Owner owner;
+    protected final MemtableAllocator allocator;
+
+    // Record the comparator of the CFS at the creation of the memtable. This
+    // is only used when a user updates the CF comparator, to know if the
+    // memtable was created with the new or old comparator.
+    public final ClusteringComparator initialComparator;
+
+    private final long creationNano = System.nanoTime();
+
+    private static MemtablePool createMemtableAllocatorPool()
+    {
+        long heapLimit = DatabaseDescriptor.getMemtableHeapSpaceInMb() << 20;
+        long offHeapLimit = DatabaseDescriptor.getMemtableOffheapSpaceInMb() << 20;
+        float memtableCleanupThreshold = DatabaseDescriptor.getMemtableCleanupThreshold();
+        MemtableCleaner cleaner = AbstractAllocatorMemtable::flushLargestMemtable;
+        switch (DatabaseDescriptor.getMemtableAllocationType())
+        {
+        case unslabbed_heap_buffers:
+            logger.debug("Memtables allocating with on-heap buffers");
+            return new HeapPool(heapLimit, memtableCleanupThreshold, cleaner);
+        case heap_buffers:
+            logger.debug("Memtables allocating with on-heap slabs");
+            return new SlabPool(heapLimit, 0, memtableCleanupThreshold, cleaner);
+        case offheap_buffers:
+            logger.debug("Memtables allocating with off-heap buffers");
+            return new SlabPool(heapLimit, offHeapLimit, memtableCleanupThreshold, cleaner);
+        case offheap_objects:
+            logger.debug("Memtables allocating with off-heap objects");
+            return new NativePool(heapLimit, offHeapLimit, memtableCleanupThreshold, cleaner);
+        default:
+            throw new AssertionError();
+        }
+    }
+
+    // Sole constructor: runs for every new memtable of this kind and schedules the periodic flush, if one is configured
+    public AbstractAllocatorMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
+    {
+        super(metadataRef, commitLogLowerBound);
+        this.allocator = MEMORY_POOL.newAllocator();
+        this.initialComparator = metadata.get().comparator;
+        this.owner = owner;
+        scheduleFlush();
+    }
+
+    public MemtableAllocator getAllocator()
+    {
+        return allocator;
+    }
+
+    public boolean shouldSwitch(ColumnFamilyStore.FlushReason reason)
+    {
+        switch (reason)
+        {
+        case SCHEMA_CHANGE:
+            return initialComparator != metadata().comparator // If the CF comparator has changed, because our partitions reference the old one
+                   || metadata().params.memtable.factory != factory(); // If a different type of memtable is requested
+        default:
+            return true;
+        }
+    }
+
+    public void metadataUpdated()
+    {
+        scheduleFlush();
+    }
+
+    public void performSnapshot(String snapshotName)
+    {
+        // unless shouldSwitch(SNAPSHOT) returns false, this cannot be called.
+        throw new AssertionError();
+    }
+
+    protected abstract Factory factory();
+
+    public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound)
+    {
+        super.switchOut(writeBarrier, commitLogUpperBound);
+        allocator.setDiscarding();
+    }
+
+    public void discard()
+    {
+        super.discard();
+        allocator.setDiscarded();
+    }
+
+    public String toString()
+    {
+        return String.format("Memtable-%s@%s(%s serialized bytes, %s ops, %.0f%%/%.0f%% of on/off-heap limit)",
+                             metadata.get().name,
+                             hashCode(),
+                             FBUtilities.prettyPrintMemory(liveDataSize.get()),
+                             currentOperations,
+                             100 * allocator.onHeap().ownershipRatio(),
+                             100 * allocator.offHeap().ownershipRatio());
+    }
+
+    /**
+     * For testing only. Give this memtable too big a size to make it always fail flushing.
+     */
+    @VisibleForTesting
+    public void makeUnflushable()
+    {
+        liveDataSize.addAndGet(1024L * 1024 * 1024 * 1024 * 1024);
+    }
+
+    public void addMemoryUsageTo(MemoryUsage stats)
+    {
+        stats.ownershipRatioOnHeap += getAllocator().onHeap().ownershipRatio();
+        stats.ownershipRatioOffHeap += getAllocator().offHeap().ownershipRatio();
+        stats.ownsOnHeap += getAllocator().onHeap().owns();
+        stats.ownsOffHeap += getAllocator().offHeap().owns();
+    }
+
+    public void markExtraOnHeapUsed(long additionalSpace, OpOrder.Group opGroup)
+    {
+        getAllocator().onHeap().allocate(additionalSpace, opGroup);
+    }
+
+    public void markExtraOffHeapUsed(long additionalSpace, OpOrder.Group opGroup)
+    {
+        getAllocator().offHeap().allocate(additionalSpace, opGroup);
+    }
+
+    void scheduleFlush()
+    {
+        int period = metadata().params.memtableFlushPeriodInMs;
+        if (period > 0)
+            scheduleFlush(owner, period);
+    }
+
+    private static void scheduleFlush(Owner owner, int period)
+    {
+        logger.trace("scheduling flush in {} ms", period);
+        WrappedRunnable runnable = new WrappedRunnable()
+        {
+            protected void runMayThrow()
+            {
+                Memtable current = owner.getCurrentMemtable();
+                if (current instanceof AbstractAllocatorMemtable)
+                    ((AbstractAllocatorMemtable) current).flushIfPeriodExpired();
+            }
+        };
+        ScheduledExecutors.scheduledTasks.schedule(runnable, period, TimeUnit.MILLISECONDS);
+    }
+
+    private void flushIfPeriodExpired()
+    {
+        int period = metadata().params.memtableFlushPeriodInMs;
+        if (period > 0 && (System.nanoTime() - creationNano >= TimeUnit.MILLISECONDS.toNanos(period)))
+        {
+            if (isClean())
+            {
+                // if we're still clean, instead of swapping just reschedule a flush for later
+                scheduleFlush(owner, period);
+            }
+            else
+            {
+                // we'll be rescheduled by the constructor of the Memtable.
+                owner.signalFlushRequired(AbstractAllocatorMemtable.this,
+                                          ColumnFamilyStore.FlushReason.MEMTABLE_PERIOD_EXPIRED);
+            }
+        }
+    }
+
+    /**
+     * Finds the largest memtable, as a percentage of *either* on- or off-heap memory limits, and immediately
+     * queues it for flushing. If the memtable selected is flushed before this completes, no work is done.
+     */
+    public static CompletableFuture<Boolean> flushLargestMemtable()
+    {
+        float largestRatio = 0f;
+        AbstractAllocatorMemtable largestMemtable = null;
+        Memtable.MemoryUsage largestUsage = null;
+        float liveOnHeap = 0, liveOffHeap = 0;
+        // we take a reference to the current main memtable for the CF prior to snapping its ownership ratios
+        // to ensure we have some ordering guarantee for performing the switchMemtableIf(), i.e. we will only
+        // swap if the memtables we are measuring here haven't already been swapped by the time we try to swap them
+        for (Memtable currentMemtable : ColumnFamilyStore.activeMemtables())
+        {
+            if (!(currentMemtable instanceof AbstractAllocatorMemtable))
+                continue;
+            AbstractAllocatorMemtable current = (AbstractAllocatorMemtable) currentMemtable;
+
+            // find the total ownership ratio for the memtable and all SecondaryIndexes owned by this CF,
+            // both on- and off-heap, and select the largest of the two ratios to weight this CF
+            MemoryUsage usage = Memtable.newMemoryUsage();
+            current.addMemoryUsageTo(usage);
+
+            for (Memtable indexMemtable : current.owner.getIndexMemtables())
+                if (indexMemtable instanceof AbstractAllocatorMemtable)
+                    indexMemtable.addMemoryUsageTo(usage);
+
+            float ratio = Math.max(usage.ownershipRatioOnHeap, usage.ownershipRatioOffHeap);
+            if (ratio > largestRatio)
+            {
+                largestMemtable = current;
+                largestUsage = usage;
+                largestRatio = ratio;
+            }
+
+            liveOnHeap += usage.ownershipRatioOnHeap;
+            liveOffHeap += usage.ownershipRatioOffHeap;
+        }
+
+        CompletableFuture<Boolean> returnFuture = new CompletableFuture<>();
+
+        if (largestMemtable != null)
+        {
+            float usedOnHeap = MEMORY_POOL.onHeap.usedRatio();
+            float usedOffHeap = MEMORY_POOL.offHeap.usedRatio();
+            float flushingOnHeap = MEMORY_POOL.onHeap.reclaimingRatio();
+            float flushingOffHeap = MEMORY_POOL.offHeap.reclaimingRatio();
+            logger.debug("Flushing largest {} to free up room. Used total: {}, live: {}, flushing: {}, this: {}",
+                         largestMemtable.owner, ratio(usedOnHeap, usedOffHeap), ratio(liveOnHeap, liveOffHeap),
+                         ratio(flushingOnHeap, flushingOffHeap), ratio(largestUsage.ownershipRatioOnHeap, largestUsage.ownershipRatioOffHeap));
+
+            ListenableFuture<CommitLogPosition> flushFuture = largestMemtable.owner.signalFlushRequired(largestMemtable, ColumnFamilyStore.FlushReason.MEMTABLE_LIMIT);
+            flushFuture.addListener(() -> {
+                try
+                {
+                    flushFuture.get();
+                    returnFuture.complete(true);
+                }
+                catch (Throwable t)
+                {
+                    returnFuture.completeExceptionally(t);
+                }
+            }, MoreExecutors.directExecutor());
+        }
+        else
+        {
+            logger.debug("Flushing of largest memtable, not done, no memtable found");
+
+            returnFuture.complete(false);
+        }
+
+        return returnFuture;
+    }
+
+    private static String ratio(float onHeap, float offHeap)
+    {
+        return String.format("%.2f/%.2f", onHeap, offHeap);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
new file mode 100644
index 000000000000..53e52ed95ebf
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+public abstract class AbstractMemtable implements Memtable
+{
+    protected final AtomicLong liveDataSize = new AtomicLong(0);
+    protected final AtomicLong currentOperations = new AtomicLong(0);
+    protected final ColumnsCollector columnsCollector;
+    protected final StatsCollector statsCollector = new StatsCollector();
+    // The smallest timestamp for all partitions stored in this memtable
+    protected AtomicLong minTimestamp = new AtomicLong(Long.MAX_VALUE);
+    private final AtomicReference<LifecycleTransaction> flushTransaction = new AtomicReference<>(null);
+    protected TableMetadataRef metadata;
+
+    public AbstractMemtable(TableMetadataRef metadataRef)
+    {
+        this.metadata = metadataRef;
+        this.columnsCollector = new ColumnsCollector(metadata.get().regularAndStaticColumns());
+    }
+
+    public TableMetadata metadata()
+    {
+        return metadata.get();
+    }
+
+    public long getLiveDataSize()
+    {
+        return liveDataSize.get();
+    }
+
+    public long getOperations()
+    {
+        return currentOperations.get();
+    }
+
+    public long getMinTimestamp()
+    {
+        return minTimestamp.get();
+    }
+
+    protected void updateMin(AtomicLong minTracker, long newValue)
+    {
+        while (true)
+        {
+            long memtableMinTimestamp = minTracker.get();
+            if (memtableMinTimestamp <= newValue)
+                break;
+            if (minTracker.compareAndSet(memtableMinTimestamp, newValue))
+                break;
+        }
+    }
+
+    RegularAndStaticColumns columns()
+    {
+        return columnsCollector.get();
+    }
+
+    EncodingStats encodingStats()
+    {
+        return statsCollector.get();
+    }
+
+    public LifecycleTransaction getFlushTransaction()
+    {
+        return flushTransaction.get();
+    }
+
+    public LifecycleTransaction setFlushTransaction(LifecycleTransaction flushTransaction)
+    {
+        return this.flushTransaction.getAndSet(flushTransaction);
+    }
+
+    protected static class ColumnsCollector
+    {
+        private final HashMap<ColumnMetadata, AtomicBoolean> predefined = new HashMap<>();
+        private final ConcurrentSkipListSet<ColumnMetadata> extra = new ConcurrentSkipListSet<>();
+
+        ColumnsCollector(RegularAndStaticColumns columns)
+        {
+            for (ColumnMetadata def : columns.statics)
+                predefined.put(def, new AtomicBoolean());
+            for (ColumnMetadata def : columns.regulars)
+                predefined.put(def, new AtomicBoolean());
+        }
+
+        public void update(RegularAndStaticColumns columns)
+        {
+            for (ColumnMetadata s : columns.statics)
+                update(s);
+            for (ColumnMetadata r : columns.regulars)
+                update(r);
+        }
+
+        private void update(ColumnMetadata definition)
+        {
+            AtomicBoolean present = predefined.get(definition);
+            if (present != null)
+            {
+                if (!present.get())
+                    present.set(true);
+            }
+            else
+            {
+                extra.add(definition);
+            }
+        }
+
+        /**
+         * Get the current state of the columns set.
+         *
+         * Note: If this is executed while mutations are still being performed on the table (e.g. to prepare
+         * an sstable for streaming when Memtable.Factory.streamFromMemtable() is true), the resulting view may be
+         * in a somewhat inconsistent state (it may include partial updates, as well as miss updates older than
+         * ones it does include).
+         */
+        public RegularAndStaticColumns get()
+        {
+            RegularAndStaticColumns.Builder builder = RegularAndStaticColumns.builder();
+            for (Map.Entry<ColumnMetadata, AtomicBoolean> e : predefined.entrySet())
+                if (e.getValue().get())
+                    builder.add(e.getKey());
+            return builder.addAll(extra).build();
+        }
+    }
+
+    protected static class StatsCollector
+    {
+        private final AtomicReference<EncodingStats> stats = new AtomicReference<>(EncodingStats.NO_STATS);
+
+        public void update(EncodingStats newStats)
+        {
+            while (true)
+            {
+                EncodingStats current = stats.get();
+                EncodingStats updated = current.mergeWith(newStats);
+                if (stats.compareAndSet(current, updated))
+                    return;
+            }
+        }
+
+        public EncodingStats get()
+        {
+            return stats.get();
+        }
+    }
+
+    protected abstract class AbstractFlushCollection<P extends Partition> implements FlushCollection<P>
+    {
+        public long dataSize()
+        {
+            return getLiveDataSize();
+        }
+
+        public CommitLogPosition commitLogLowerBound()
+        {
+            return AbstractMemtable.this.getCommitLogLowerBound();
+        }
+
+        public CommitLogPosition commitLogUpperBound()
+        {
+            return AbstractMemtable.this.getCommitLogUpperBound();
+        }
+
+        public EncodingStats encodingStats()
+        {
+            return AbstractMemtable.this.encodingStats();
+        }
+
+        public RegularAndStaticColumns columns()
+        {
+            return AbstractMemtable.this.columns();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
new file mode 100644
index 000000000000..55f08a42a152
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Memtable that uses a commit log for persistence. Provides methods of tracking the commit log positions covered by
+ * it and safely switching between memtables.
+ */
+public abstract class AbstractMemtableWithCommitlog extends AbstractMemtable
+{
+    // The approximate commit log lower bound of this memtable; must be <= commitLogLowerBound once our predecessor
+    // has been finalised, and this is enforced in the ColumnFamilyStore.setCommitLogUpperBound
+    private final CommitLogPosition approximateCommitLogLowerBound = CommitLog.instance.getCurrentPosition();
+    // the precise lower bound of CommitLogPosition owned by this memtable; equal to its predecessor's commitLogUpperBound
+    private final AtomicReference<CommitLogPosition> commitLogLowerBound;
+    // the write barrier for directing writes to this memtable or the next during a switch
+    private volatile OpOrder.Barrier writeBarrier;
+    // the precise upper bound of CommitLogPosition owned by this memtable
+    private volatile AtomicReference<CommitLogPosition> commitLogUpperBound;
+
+    public AbstractMemtableWithCommitlog(TableMetadataRef metadataRef, AtomicReference<CommitLogPosition> commitLogLowerBound)
+    {
+        super(metadataRef);
+        this.commitLogLowerBound = commitLogLowerBound;
+    }
+
+    public CommitLogPosition getApproximateCommitLogLowerBound()
+    {
+        return approximateCommitLogLowerBound;
+    }
+
+    public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound)
+    {
+        // This can prepare the memtable data for deletion; it will still be used while the flush is proceeding.
+        // A discard() call will follow.
+        assert this.writeBarrier == null;
+        this.writeBarrier = writeBarrier;
+        this.commitLogUpperBound = commitLogUpperBound;
+    }
+
+    public void discard()
+    {
+        assert writeBarrier != null : "Memtable must be switched out before being discarded.";
+    }
+
+    // decide if this memtable should take the write, or if it should go to the next memtable
+    public boolean accepts(OpOrder.Group opGroup, CommitLogPosition commitLogPosition)
+    {
+        // if the barrier hasn't been set yet, then this memtable is still the newest and is taking ALL writes.
+        OpOrder.Barrier barrier = this.writeBarrier;
+        if (barrier == null)
+            return true;
+        // Note that if this races with taking the barrier the opGroup and commit log position we were given must
+        // necessarily be before the barrier and any LastCommitLogPosition is set, thus this function will return true
+        // and no update to commitLogUpperBound is necessary.
+
+        // If the barrier has been set and issued, but is in the past, we are definitely destined for a future memtable.
+        // Because we issue the barrier after taking LastCommitLogPosition and mutations take their position after
+        // taking the opGroup, this condition also ensures the given commit log position is greater than the chosen
+        // upper bound.
+        if (!barrier.isAfter(opGroup))
+            return false;
+        // We are in the window between the barrier being constructed (and the memtable being switched out)
+        // and the barrier being issued.
+        // if we aren't durable we are directed only by the barrier
+        if (commitLogPosition == null)
+            return true;
+        while (true)
+        {
+            // If the CL boundary has been set, the mutation can be accepted depending on whether it falls before it.
+            // However, if it has not been set, the old memtable must still accept writes but we must also ensure that
+            // their positions are accounted for in the boundary (as there may be a delay between taking the log
+            // position for the boundary and setting it where a mutation sneaks in).
+            // Thus, if the boundary hasn't been finalised yet, we simply update it to the max of its current value and
+            // ours; this permits us to coordinate a safe boundary, as the boundary choice is made atomically wrt our
+            // max() maintenance, so an operation cannot sneak into the past.
+            CommitLogPosition currentLast = commitLogUpperBound.get();
+            if (currentLast instanceof LastCommitLogPosition)
+                return currentLast.compareTo(commitLogPosition) >= 0;
+            if (currentLast != null && currentLast.compareTo(commitLogPosition) >= 0)
+                return true;
+            if (commitLogUpperBound.compareAndSet(currentLast, commitLogPosition))
+                return true;
+        }
+    }
+
+    public CommitLogPosition getCommitLogLowerBound()
+    {
+        return commitLogLowerBound.get();
+    }
+
+    public CommitLogPosition getCommitLogUpperBound()
+    {
+        return commitLogUpperBound.get();
+    }
+
+    public boolean mayContainDataBefore(CommitLogPosition position)
+    {
+        return approximateCommitLogLowerBound.compareTo(position) < 0;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java b/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
new file mode 100644
index 000000000000..9213fab91f85
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+/**
+ * This class exists solely to avoid initialization of the default memtable class.
+ * Some tests want to set up table parameters before initializing DatabaseDescriptor -- this allows them to do so.
+ */
+public class DefaultMemtableFactory
+{
+    // We can't use TrieMemtable.FACTORY as that requires DatabaseDescriptor to have been initialized.
+    public static final Memtable.Factory INSTANCE = TrieMemtable::new;
+    // Alternative default, kept for reference: SkipListMemtable::new
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/Flushing.java b/src/java/org/apache/cassandra/db/memtable/Flushing.java
new file mode 100644
index 000000000000..55677b0552c8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/Flushing.java
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.Callable;
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Throwables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.DiskBoundaries;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.commitlog.IntervalSet;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.AtomicBTreePartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.utils.Throwables.maybeFail;
+
+public class Flushing
+{
+    private static final Logger logger = LoggerFactory.getLogger(Flushing.class);
+
+    /** Registers txn as the memtable's flush transaction and creates the runnables that will write it out. */
+    public static List<FlushRunnable> flushRunnables(ColumnFamilyStore cfs,
+                                                     Memtable memtable,
+                                                     LifecycleTransaction txn)
+    {
+        // A non-null result from setFlushTransaction is treated as proof that a flush was already started.
+        LifecycleTransaction previous = memtable.setFlushTransaction(txn);
+        Preconditions.checkState(previous == null,
+                                 "Attempted to flush Memtable more than once on %s.%s",
+                                 cfs.keyspace.getName(), cfs.name);
+
+        // The flush is split along the table's current disk boundaries (if it has any).
+        DiskBoundaries disks = cfs.getDiskBoundaries();
+        return flushRunnables(cfs, memtable, disks.positions, disks.directories, txn);
+    }
+
+    @VisibleForTesting
+    static List<FlushRunnable> flushRunnables(ColumnFamilyStore cfs,
+                                              Memtable memtable,
+                                              List<PartitionPosition> boundaries,
+                                              List<Directories.DataDirectory> locations,
+                                              LifecycleTransaction txn)
+    {
+        // No boundaries: one runnable with no range limits and no preselected flush location.
+        if (boundaries == null)
+            return Collections.singletonList(flushRunnable(cfs, memtable, null, null, txn, null));
+
+        int count = boundaries.size();
+        List<FlushRunnable> created = new ArrayList<>(count);
+        // The first range starts at the partitioner's minimum token; each following one starts at the
+        // previous boundary.
+        PartitionPosition start = boundaries.get(0).getPartitioner().getMinimumToken().minKeyBound();
+        try
+        {
+            int index = 0;
+            for (PartitionPosition end : boundaries)
+            {
+                created.add(flushRunnable(cfs, memtable, start, end, txn, locations.get(index++)));
+                start = end;
+            }
+            return created;
+        }
+        catch (Throwable failure)
+        {
+            // Abort whatever was created so far, then rethrow the throwable returned by the aborts.
+            throw Throwables.propagate(abortRunnables(created, failure));
+        }
+    }
+
+    @SuppressWarnings("resource")   // writer owned by runnable, to be closed or aborted by its caller
+    static FlushRunnable flushRunnable(ColumnFamilyStore cfs,
+                                       Memtable memtable,
+                                       PartitionPosition from,
+                                       PartitionPosition to,
+                                       LifecycleTransaction txn,
+                                       Directories.DataDirectory flushLocation)
+    {
+        Memtable.FlushCollection<?> toFlush = memtable.getFlushSet(from, to);
+        SSTableFormat.Type format = SSTableFormat.Type.current();
+        long expectedBytes = format.info.getWriterFactory().estimateSize(toFlush);
+
+        // With no explicit location, ask the directories for a writeable one, passing the size estimate.
+        Descriptor descriptor;
+        if (flushLocation != null)
+            descriptor = cfs.newSSTableDescriptor(cfs.getDirectories().getLocationForDisk(flushLocation), format);
+        else
+            descriptor = cfs.newSSTableDescriptor(cfs.getDirectories().getWriteableLocationAsFile(expectedBytes), format);
+
+        SSTableMultiWriter flushWriter = createFlushWriter(cfs, toFlush, txn, descriptor, toFlush.partitionCount());
+
+        // Completion logging is always switched on for runnables built here.
+        return new FlushRunnable(toFlush, flushWriter, cfs.metric, true);
+    }
+
+    public static Throwable abortRunnables(List<FlushRunnable> runnables, Throwable t)
+    {
+        Throwable accumulated = t;
+        for (FlushRunnable pending : runnables == null ? Collections.<FlushRunnable>emptyList() : runnables)
+            accumulated = pending.abort(accumulated);   // abort() hands back the throwable to carry forward
+        return accumulated;
+    }
+
+    /**
+     * The valid states for {@link FlushRunnable} writers. The thread writing the contents
+     * will transition from IDLE -> RUNNING and back to IDLE when finished using the writer
+     * or from ABORTING -> ABORTED if another thread has transitioned from RUNNING -> ABORTING.
+     * We can also transition directly from IDLE -> ABORTED. Whichever thread transitions
+     * to ABORTED is responsible for aborting the writer.
+     */
+    @VisibleForTesting
+    enum FlushRunnableWriterState
+    {
+        IDLE, // the runnable is idle, either not yet started or completed but with the writer waiting to be committed
+        RUNNING, // the runnable is executing, therefore the writer cannot be aborted or else a SEGV may ensue
+        ABORTING, // an abort request has been issued, this only happens if abort() is called whilst RUNNING
+        ABORTED  // the writer has been aborted, no resources will be leaked
+    }
+
+    /** Callable that writes one {@link Memtable.FlushCollection} to its sstable writer; abortable from another thread. */
+    public static class FlushRunnable implements Callable<SSTableMultiWriter>
+    {
+        private final Memtable.FlushCollection<?> toFlush;
+        private final SSTableMultiWriter writer;
+        private final TableMetrics metrics;
+        private final boolean isBatchLogTable;
+        private final boolean logCompletion;
+        private final AtomicReference<FlushRunnableWriterState> state;
+
+        public FlushRunnable(Memtable.FlushCollection<?> flushSet,
+                             SSTableMultiWriter writer,
+                             TableMetrics metrics,
+                             boolean logCompletion)
+        {
+            this.toFlush = flushSet;
+            this.writer = writer;
+            this.metrics = metrics;
+            this.isBatchLogTable = toFlush.metadata() == SystemKeyspace.Batches;
+            this.logCompletion = logCompletion;
+            this.state = new AtomicReference<>(FlushRunnableWriterState.IDLE);
+        }
+
+        /** Appends every non-skipped partition to the writer, stopping early if an abort is requested meanwhile. */
+        private void writeSortedContents()
+        {
+            if (!state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.RUNNING))
+            {
+                logger.debug("Failed to write {}, flushed range = ({}, {}], state: {}",
+                             toFlush.memtable().toString(), toFlush.from(), toFlush.to(), state);
+                return;
+            }
+
+            logger.debug("Writing {}, flushed range = ({}, {}], state: {}",
+                         toFlush.memtable().toString(), toFlush.from(), toFlush.to(), state);
+            try
+            {
+                // (we can't clear out the map as-we-go to free up memory,
+                //  since the memtable is being used for queries in the "pending flush" category)
+                for (Partition partition : toFlush)
+                {
+                    if (state.get() == FlushRunnableWriterState.ABORTING)
+                        break;
+
+                    // Each batchlog partition is a separate entry in the log. And for an entry, we only do 2
+                    // operations: 1) we insert the entry and 2) we delete it. Further, BL data is strictly local,
+                    // we don't need to preserve tombstones for repair. So if both operations are in this
+                    // memtable (which will almost always be the case if there is no ongoing failure), we can
+                    // just skip the entry (CASSANDRA-4667).
+                    if (isBatchLogTable && !partition.partitionLevelDeletion().isLive() && partition.hasRows())
+                        continue;
+
+                    if (!partition.isEmpty())
+                    {
+                        try (UnfilteredRowIterator iter = partition.unfilteredIterator())
+                        {
+                            writer.append(iter);
+                        }
+                    }
+                }
+            }
+            finally
+            {
+                // Runs on failure too: either hand the writer back (IDLE) or honour a pending abort request.
+                while (true)
+                {
+                    if (state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.IDLE))
+                    {
+                        if (logCompletion)
+                        {
+                            long bytesFlushed = writer.getFilePointer();
+                            logger.info("Completed flushing {} ({}) for commitlog position {}",
+                                        writer.getFilename(),
+                                        FBUtilities.prettyPrintMemory(bytesFlushed),
+                                        toFlush.memtable().getCommitLogUpperBound());
+                            metrics.bytesFlushed.inc(bytesFlushed); // Update the metrics
+                        }
+                        break;
+                    }
+                    else if (state.compareAndSet(FlushRunnableWriterState.ABORTING, FlushRunnableWriterState.ABORTED))
+                    {
+                        logger.debug("Flushing of {} aborted", writer.getFilename());
+                        maybeFail(writer.abort(null));
+                        break;
+                    }
+                }
+            }
+        }
+
+        @Override
+        public SSTableMultiWriter call()
+        {
+            // We don't close the writer on error as the caller aborts all runnables if one happens.
+            writeSortedContents();
+            return writer;
+        }
+
+        /** Aborts the writer if idle, or asks the writing thread to abort it; returns at once if already aborted. */
+        public Throwable abort(Throwable throwable)
+        {
+            while (true)
+            {
+                if (state.compareAndSet(FlushRunnableWriterState.IDLE, FlushRunnableWriterState.ABORTED))
+                {
+                    logger.debug("Flushing of {} aborted", writer.getFilename());
+                    return writer.abort(throwable);
+                }
+                // ABORTING and ABORTED are never left for IDLE/RUNNING: a repeated abort must return, not spin forever.
+                // After RUNNING -> ABORTING the thread executing writeSortedContents() aborts and throws any exceptions.
+                FlushRunnableWriterState current = state.get();
+                if (current == FlushRunnableWriterState.ABORTING || current == FlushRunnableWriterState.ABORTED
+                    || state.compareAndSet(FlushRunnableWriterState.RUNNING, FlushRunnableWriterState.ABORTING))
+                    return throwable;
+            }
+        }
+
+        @VisibleForTesting
+        FlushRunnableWriterState state()
+        {
+            return state.get();
+        }
+    }
+
+    public static SSTableMultiWriter createFlushWriter(ColumnFamilyStore cfs,
+                                                       Memtable.FlushCollection<?> flushSet,
+                                                       LifecycleTransaction txn,
+                                                       Descriptor descriptor,
+                                                       long partitionCount)
+    {
+        // The metadata collector is seeded with the commit log span reported by the flush set.
+        MetadataCollector collector = new MetadataCollector(flushSet.metadata().comparator)
+                                      .commitLogIntervals(new IntervalSet<>(flushSet.commitLogLowerBound(),
+                                                                            flushSet.commitLogUpperBound()));
+
+        // Header built from the flush set's own metadata, written columns and encoding stats.
+        SerializationHeader header = new SerializationHeader(true,
+                                                             flushSet.metadata(),
+                                                             flushSet.columns(),
+                                                             flushSet.encodingStats());
+
+        // The writer is created as unrepaired and with no pending repair.
+        return cfs.createSSTableMultiWriter(descriptor, partitionCount,
+                                            ActiveRepairService.UNREPAIRED_SSTABLE, ActiveRepairService.NO_PENDING_REPAIR,
+                                            false, collector, header, txn);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/Memtable.java b/src/java/org/apache/cassandra/db/memtable/Memtable.java
new file mode 100644
index 000000000000..8513ec684a4b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/Memtable.java
@@ -0,0 +1,426 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.util.concurrent.ListenableFuture;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Memtable interface. This defines the operations the ColumnFamilyStore can perform with memtables.
+ * They are of several types:
+ * - construction factory interface
+ * - write and read operations: put, getPartition and makePartitionIterator
+ * - statistics and features, including partition counts, data size, encoding stats, written columns
+ * - memory usage tracking, including methods of retrieval and of adding extra allocated space (used by non-CFS
+ *   secondary indexes)
+ * - flush functionality, preparing the set of partitions to flush for given ranges
+ * - lifecycle management, i.e. operations that prepare and execute switch to a different memtable, together
+ *   with ways of tracking the affected commit log spans
+ */
+public interface Memtable extends Comparable<Memtable>
+{
+    // Construction
+
+    /**
+     * Factory interface for constructing memtables, and querying write durability features.
+     *
+     * The factory is chosen using the MemtableParams class (passed as argument to
+     * {@code CREATE TABLE ... WITH memtable = {...}} or in the memtable options in cassandra.yaml). To make that
+     * possible, implementations must provide either a static {@code FACTORY} field (if they accept no further option)
+     * or a static {@code factory(Map<String, String>)} method. In the latter case, the method should avoid creating
+     * multiple instances of the factory for the same parameters, or factories should at least implement hashCode and
+     * equals.
+     */
+    interface Factory
+    {
+        /**
+         * Create a memtable.
+         *
+         * @param commitLogLowerBound A commit log lower bound for the new memtable. This will be equal to the previous
+         *                            memtable's upper bound and defines the span of positions that any flushed sstable
+         *                            will cover.
+         * @param metadaRef Pointer to the up-to-date table metadata.
+         * @param owner Owning objects that will receive flush requests triggered by the memtable (e.g. on expiration).
+         */
+        Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadaRef, Owner owner);
+
+        /**
+         * If the memtable can achieve write durability directly (i.e. using some feature other than the commitlog, e.g.
+         * persistent memory), it can return true here, in which case the commit log will not store mutations in this
+         * table.
+         * Note that doing so will prevent point-in-time restores and change data capture, thus a durable memtable must
+         * allow the option of turning commit log writing on even if it does not need it.
+         */
+        default boolean writesShouldSkipCommitLog()
+        {
+            return false;
+        }
+
+        /**
+         * This should be true if the memtable can achieve write durability for crash recovery directly (i.e. using some
+         * feature other than the commitlog, e.g. persistent memory).
+         * Setting this flag to true means that the commitlog should not replay mutations for this table on restart,
+         * and that it should not try to preserve segments that contain relevant data.
+         * Unless writesShouldSkipCommitLog() is also true, writes will be recorded in the commit log as they may be
+         * needed for change data capture or point-in-time restore.
+         */
+        default boolean writesAreDurable()
+        {
+            return false;
+        }
+
+        /**
+         * Normally we can receive streamed sstables directly, skipping the memtable stage (zero-copy-streaming). When
+         * the memtable is the primary data store (e.g. persistent memtables), it will usually prefer to receive the
+         * data instead.
+         *
+         * If this returns true, the content of all streamed sstables will be read and replayed as mutations, disabling
+         * zero-copy streaming.
+         */
+        default boolean streamToMemtable()
+        {
+            return false;
+        }
+
+        /**
+         * When we need to stream data, we usually flush and stream the resulting sstables. This will not work correctly
+         * if the memtable does not want to flush for streaming (e.g. persistent memtables acting as primary data
+         * store), because data (not just recent) will be missing from the streamed view. Such memtables must present
+         * their data separately for streaming.
+         * In other words if the memtable returns false on shouldSwitch(STREAMING/REPAIR), its factory must return true
+         * here.
+         *
+         * If this flag returns true, streaming will write the relevant content that resides in the memtable to
+         * temporary sstables, stream these sstables and then delete them.
+         */
+        default boolean streamFromMemtable()
+        {
+            return false;
+        }
+    }
+
+    /**
+     * Interface for providing signals back to the owner.
+     */
+    interface Owner
+    {
+        /** Signal to the owner that a flush is required (e.g. in response to hitting space limits) */
+        ListenableFuture<CommitLogPosition> signalFlushRequired(Memtable memtable, ColumnFamilyStore.FlushReason reason);
+
+        /** Get the current memtable for this owner. Used to avoid capturing memtable in scheduled flush tasks. */
+        Memtable getCurrentMemtable();
+
+        /**
+         * Collect the index memtables flushed together with this. Used to accurately calculate memory that would be
+         * freed by a flush.
+         */
+        Iterable<Memtable> getIndexMemtables();
+    }
+
+
+    // Main write and read operations
+
+    /**
+     * Put new data in the memtable. This operation may block until enough memory is available in the memory pool.
+     *
+     * @param update the partition update, may be a new partition or an update to an existing one
+     * @param indexer receives information about the update's effect
+     * @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
+     *                flush to free space.
+     *
+     * @return the smallest timestamp delta between corresponding rows from existing and update. A
+     * timestamp delta being computed as the difference between the cells and DeletionTimes from any existing partition
+     * and those in {@code update}. See CASSANDRA-7979.
+     */
+    long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup);
+
+    /**
+     * Get the partition for the specified key. Returns null if no such partition is present.
+     */
+    Partition getPartition(DecoratedKey key);
+
+    /**
+     * Returns a partition iterator for the given data range.
+     *
+     * @param columnFilter filter to apply to all returned partitions
+     * @param dataRange the partition and clustering range queried
+     */
+    MemtableUnfilteredPartitionIterator makePartitionIterator(ColumnFilter columnFilter,
+                                                              DataRange dataRange);
+
+    interface MemtableUnfilteredPartitionIterator extends UnfilteredPartitionIterator
+    {
+        /**
+         * Returns the minimum local deletion time for all partitions in the range.
+         * Required for the efficiency of partition range read commands.
+         */
+        int getMinLocalDeletionTime();
+    }
+
+
+    // Statistics
+
+    /** Number of partitions stored in the memtable */
+    long partitionCount();
+
+    /** Size of the data not accounting for any metadata / mapping overheads */
+    long getLiveDataSize();
+
+    /**
+     * Number of "operations" (in the sense defined in {@link PartitionUpdate#operationCount()}) the memtable has
+     * executed.
+     */
+    long getOperations();
+
+    /** Minimum timestamp of all stored data */
+    long getMinTimestamp();
+
+    /**
+     * The table's definition metadata.
+     *
+     * Note that this tracks the current state of the table and is not necessarily the same as what was used to create
+     * the memtable.
+     */
+    TableMetadata metadata();
+
+
+    // Memory usage tracking
+
+    /**
+     * Add this memtable's used memory to the given usage object. This can be used to retrieve a single memtable's usage
+     * as well as to combine the usage of related memtables (e.g. a table and its table-based secondary indexes).
+     */
+    void addMemoryUsageTo(MemoryUsage usage);
+
+
+    /**
+     * Creates a holder for memory usage collection.
+     *
+     * This is used to track on- and off-heap memory, as well as the ratio to the total permitted memtable memory.
+     */
+    static MemoryUsage newMemoryUsage()
+    {
+        return new MemoryUsage();
+    }
+
+    /**
+     * Shorthand for getting a given memtable's memory usage.
+     * Implemented as a static to prevent implementations altering expectations by e.g. returning a cached object.
+     */
+    static MemoryUsage getMemoryUsage(Memtable memtable)
+    {
+        MemoryUsage collected = newMemoryUsage();
+        memtable.addMemoryUsageTo(collected);
+        return collected;
+    }
+
+    class MemoryUsage
+    {
+        /** Bytes of on-heap memory in use */
+        public long ownsOnHeap = 0;
+        /** Bytes of off-heap memory in use */
+        public long ownsOffHeap = 0;
+        /** On-heap usage as a ratio of the permitted memtable space */
+        public float ownershipRatioOnHeap = 0.0f;
+        /** Off-heap usage as a ratio of the permitted memtable space */
+        public float ownershipRatioOffHeap = 0.0f;
+
+        /** Renders each usage as a pretty-printed size followed by its ratio as a whole percentage. */
+        @Override
+        public String toString()
+        {
+            return String.format("%s (%.0f%%) on-heap, %s (%.0f%%) off-heap",
+                                 FBUtilities.prettyPrintMemory(ownsOnHeap), ownershipRatioOnHeap * 100,
+                                 FBUtilities.prettyPrintMemory(ownsOffHeap), ownershipRatioOffHeap * 100);
+        }
+    }
+
+    /**
+     * Adjust the used on-heap space by the given size (e.g. to reflect memory used by a non-table-based index).
+     * This operation may block until enough memory is available in the memory pool.
+     *
+     * @param additionalSpace the number of allocated bytes
+     * @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
+     *                flush to free space.
+     */
+    void markExtraOnHeapUsed(long additionalSpace, OpOrder.Group opGroup);
+
+    /**
+     * Adjust the used off-heap space by the given size (e.g. to reflect memory used by a non-table-based index).
+     * This operation may block until enough memory is available in the memory pool.
+     *
+     * @param additionalSpace the number of allocated bytes
+     * @param opGroup write operation group, used to permit the operation to complete if it is needed to complete a
+     *                flush to free space.
+     */
+    void markExtraOffHeapUsed(long additionalSpace, OpOrder.Group opGroup);
+
+
+    // Flushing
+
+    /**
+     * Get the collection of data between the given partition boundaries in a form suitable for flushing.
+     */
+    FlushCollection<?> getFlushSet(PartitionPosition from, PartitionPosition to);
+
+    /**
+     * A collection of partitions for flushing plus some information required for writing an sstable.
+     *
+     * Note that the listed entries must conform with the specified metadata. In particular, if the memtable is still
+     * being written to, care must be taken to not list newer items as they may violate the bounds collected by the
+     * encoding stats or refer to columns that don't exist in the collected columns set.
+     */
+    interface FlushCollection<P extends Partition> extends Iterable<P>, SSTableWriter.SSTableSizeParameters
+    {
+        Memtable memtable();
+
+        PartitionPosition from();
+        PartitionPosition to();
+
+        /** The commit log position at the time that this memtable was created */
+        CommitLogPosition commitLogLowerBound();
+        /** The commit log position at the time that this memtable was switched out */
+        CommitLogPosition commitLogUpperBound();
+
+        /** The set of all columns that have been written */
+        RegularAndStaticColumns columns();
+        /** Statistics required for writing an sstable efficiently */
+        EncodingStats encodingStats();
+
+        default TableMetadata metadata()
+        {
+            return memtable().metadata();   // by default, the metadata of the memtable behind this collection
+        }
+
+        long partitionCount();
+        default boolean isEmpty()
+        {
+            return partitionCount() == 0;   // empty means the partition count is zero
+        }
+    }
+
+
+    // Lifecycle management
+
+    /**
+     * Called to tell the memtable that it is being switched out and will be flushed (or dropped) and discarded.
+     * Will be followed by a {@link #getFlushSet} call (if the table is not truncated or dropped), and a
+     * {@link #discard}.
+     *
+     * @param writeBarrier The barrier that will signal that all writes to this memtable have completed. That is, the
+     *                     point after which writes cannot be accepted by this memtable (it is permitted for writes
+     *                     before this barrier to go into the next; see {@link #accepts}).
+     * @param commitLogUpperBound The upper commit log position for this memtable. The value may be modified after this
+     *                            call and will match the next memtable's lower commit log bound.
+     */
+    void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound);
+
+    /**
+     * This memtable is no longer in use or required for outstanding flushes or operations.
+     * All held memory must be released.
+     */
+    void discard();
+
+    /**
+     * Decide if this memtable should take a write with the given parameters, or if the write should go to the next
+     * memtable. This enforces that no writes after the barrier set by {@link #switchOut} can be accepted, and
+     * is also used to define a shared commit log bound as the upper for this memtable and lower for the next.
+     */
+    boolean accepts(OpOrder.Group opGroup, CommitLogPosition commitLogPosition);
+
+    /** Approximate commit log lower bound, <= getCommitLogLowerBound, used as a time stamp for ordering */
+    CommitLogPosition getApproximateCommitLogLowerBound();
+
+    /** The commit log position at the time that this memtable was created */
+    CommitLogPosition getCommitLogLowerBound();
+
+    /** The commit log position at the time that this memtable was switched out */
+    CommitLogPosition getCommitLogUpperBound();
+
+    /** True if the memtable can contain any data that was written before the given commit log position */
+    boolean mayContainDataBefore(CommitLogPosition position);
+
+    /** True if the memtable contains no data */
+    boolean isClean();
+
+    /** These two methods provide a way of tracking on-going flushes */
+    public LifecycleTransaction setFlushTransaction(LifecycleTransaction transaction);
+    public LifecycleTransaction getFlushTransaction();
+
+    /** Order memtables by time as reflected in the commit log position at time of construction */
+    default int compareTo(Memtable that)
+    {
+        return this.getApproximateCommitLogLowerBound().compareTo(that.getApproximateCommitLogLowerBound());
+    }
+
+    /**
+     * Decides whether the memtable should be switched/flushed for the passed reason.
+     * Normally this will return true, but e.g. persistent memtables may choose not to flush. Returning false will
+     * trigger further action for some reasons:
+     * - SCHEMA_CHANGE will be followed by metadataUpdated().
+     * - SNAPSHOT will be followed by performSnapshot().
+     * - STREAMING/REPAIR will be followed by creating a FlushSet for the streamed/repaired ranges. This data will be
+     *   used to create sstables, which will be streamed and then deleted.
+     * This will not be called if the memtable is switched because of truncation or drop.
+     */
+    boolean shouldSwitch(ColumnFamilyStore.FlushReason reason);
+
+    /**
+     * Called when the table's metadata is updated. The memtable's metadata reference now points to the new version.
+     */
+    void metadataUpdated();
+
+    /**
+     * If the memtable needs to do some special action for snapshots (e.g. because it is persistent and does not want
+     * to flush), it should return false on the above with reason SNAPSHOT and implement this method.
+     */
+    void performSnapshot(String snapshotName);
+
+    /**
+     * Special commit log position marker used in the upper bound marker setting process
+     * (see {@link org.apache.cassandra.db.ColumnFamilyStore#setCommitLogUpperBound} and {@link AbstractMemtable#accepts})
+     */
+    public static final class LastCommitLogPosition extends CommitLogPosition
+    {
+        public LastCommitLogPosition(CommitLogPosition copy)
+        {
+            super(copy.segmentId, copy.position);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java b/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
new file mode 100644
index 000000000000..eaa235218b95
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+
+/**
+ * Skeleton for a persistent memory memtable. It currently extends SkipListMemtable only to exercise the framework.
+ */
+public class PersistentMemoryMemtable
+//extends AbstractMemtable
+extends SkipListMemtable        // to test framework
+{
+    public PersistentMemoryMemtable(TableMetadataRef metadataRef, Owner owner)
+    {
+        super(null, metadataRef, owner); // null: no commit log lower bound reference is passed to the superclass
+        // We should possibly link the persistent data of this memtable
+    }
+
+    public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+    {
+        // TODO: implement (delegates to the skip-list superclass for now)
+        return super.put(update, indexer, opGroup);
+    }
+
+    public MemtableUnfilteredPartitionIterator makePartitionIterator(ColumnFilter columnFilter, DataRange dataRange)
+    {
+        // TODO: implement (delegates to the skip-list superclass for now)
+        return super.makePartitionIterator(columnFilter, dataRange);
+    }
+
+    public Partition getPartition(DecoratedKey key)
+    {
+        // TODO: implement (delegates to the skip-list superclass for now)
+        return super.getPartition(key);
+    }
+
+    public long partitionCount()
+    {
+        // TODO: implement (delegates to the skip-list superclass for now)
+        return super.partitionCount();
+    }
+
+    public FlushCollection<?> getFlushSet(PartitionPosition from, PartitionPosition to)
+    {
+        // TODO: implement
+        // FIXME: If the memtable can still be written to, this uses a view of the metadata that may not be up-to-date
+        // with the content. This may cause streaming to fail e.g. if a new column appears and is added to some row in
+        // the memtable between the time that this is constructed and the relevant row is written. Such failures should
+        // be recoverable by redoing the stream.
+        // If an implementation can produce a view/snapshot of the data at a point before the features were collected,
+        // this problem will not occur.
+        return super.getFlushSet(from, to);
+    }
+
+    public boolean shouldSwitch(ColumnFamilyStore.FlushReason reason)
+    {
+        // We want to avoid all flushing.
+        switch (reason)
+        {
+        case STARTUP: // Called after reading and replaying the commit log.
+        case SHUTDOWN: // Called to flush data before shutdown.
+        case INTERNALLY_FORCED: // Called to ensure ordering and persistence of system table events.
+        case MEMTABLE_PERIOD_EXPIRED: // The specified memtable expiration time elapsed.
+        case INDEX_TABLE_FLUSH: // Flush requested on index table because main table is flushing.
+        case STREAMS_RECEIVED: // Flush to save streamed data that was written to memtable.
+            return false;   // do not do anything
+
+        case INDEX_BUILD_COMPLETED:
+        case INDEX_REMOVED:
+            // Both of these are needed as safepoints for index management. Nothing to do.
+            return false;
+
+        case VIEW_BUILD_STARTED:
+        case INDEX_BUILD_STARTED:
+            // TODO: Figure out secondary indexes and views.
+            return false;
+
+        case SCHEMA_CHANGE:
+            if (!(metadata().params.memtable.factory instanceof Factory))
+                return true;    // User has switched to a different memtable class. Flush and release all held data.
+            // Otherwise, assuming we can handle the change, don't switch.
+            // TODO: Handle
+            return false;
+
+        case STREAMING: // Called to flush data so it can be streamed. TODO: How do we stream?
+        case REPAIR: // Called to flush data for repair. TODO: How do we repair?
+            // ColumnFamilyStore will create sstables of the affected ranges which will not be consulted on reads and
+            // will be deleted after streaming.
+            return false;
+
+        case SNAPSHOT:
+            // We don't flush for this. Returning false will trigger a performSnapshot call.
+            return false;
+
+        case DROP: // Called when a table is dropped. This memtable is no longer necessary.
+        case TRUNCATE: // The data is being deleted, but the table remains.
+            // Returning true asks the ColumnFamilyStore to replace this memtable object without flushing.
+            // This will call discard() below to delete all held data.
+            return true;
+
+        case MEMTABLE_LIMIT: // The memtable size limit is reached, and this table was selected for flushing.
+                             // Also passed if we call owner.signalLimitReached()
+        case COMMITLOG_DIRTY: // Commitlog thinks it needs to keep data from this table.
+            // Neither of the above should happen as we specify writesAreDurable and don't use an allocator/cleaner.
+            throw new AssertionError("Unexpected flush reason for a durable memtable: " + reason);
+
+        case USER_FORCED:
+        case UNIT_TESTS:
+            return false;
+        default:
+            throw new AssertionError("Unknown flush reason: " + reason);
+        }
+    }
+
+    public void metadataUpdated()
+    {
+        // TODO: handle
+    }
+
+    public void performSnapshot(String snapshotName)
+    {
+        // TODO: implement. Figure out how to restore snapshot (with external tools).
+    }
+
+    public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound)
+    {
+        super.switchOut(writeBarrier, commitLogUpperBound);
+        // This can prepare the memtable data for deletion; it will still be used while the flush is proceeding.
+        // A discard call will follow.
+    }
+
+    public void discard()
+    {
+        // This will be called to release/delete all held data because the memtable is switched, due to having
+        // its data flushed, due to a truncate/drop, or due to a schema change to a different memtable class.
+
+        // TODO: Implement. This should delete all memtable data from pmem.
+        super.discard();
+    }
+
+    public CommitLogPosition getApproximateCommitLogLowerBound()
+    {
+        // We don't maintain commit log positions
+        return CommitLogPosition.NONE;
+    }
+
+    public CommitLogPosition getCommitLogLowerBound()
+    {
+        // We don't maintain commit log positions
+        return CommitLogPosition.NONE;
+    }
+
+    public CommitLogPosition getCommitLogUpperBound()
+    {
+        // We don't maintain commit log positions
+        return CommitLogPosition.NONE;
+    }
+
+    public boolean isClean()
+    {
+        return partitionCount() == 0;
+    }
+
+    public boolean mayContainDataBefore(CommitLogPosition position)
+    {
+        // We don't track commit log positions, so if we are dirty, we may.
+        return !isClean();
+    }
+
+    public void addMemoryUsageTo(MemoryUsage stats)
+    {
+        // our memory usage is not counted
+    }
+
+    public void markExtraOnHeapUsed(long additionalSpace, OpOrder.Group opGroup)
+    {
+        // we don't track this
+    }
+
+    public void markExtraOffHeapUsed(long additionalSpace, OpOrder.Group opGroup)
+    {
+        // we don't track this
+    }
+
+    public static Factory factory(Map<String, String> furtherOptions)
+    {
+        boolean skipOption = Boolean.parseBoolean(furtherOptions.remove("skipCommitLog")); // removed from map; absent => false
+        return skipOption ? commitLogSkippingFactory : commitLogWritingFactory;
+    }
+
+    private static final Factory commitLogSkippingFactory = new Factory(true);
+    private static final Factory commitLogWritingFactory = new Factory(false);
+
+    static class Factory implements Memtable.Factory
+    {
+        private final boolean skipCommitLog;
+
+        public Factory(boolean skipCommitLog)
+        {
+            this.skipCommitLog = skipCommitLog;
+        }
+
+        public Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound,
+                               TableMetadataRef metadataRef,
+                               Owner owner)
+        {
+            return new PersistentMemoryMemtable(metadataRef, owner); // commitLogLowerBound is intentionally unused
+        }
+
+        public boolean writesShouldSkipCommitLog()
+        {
+            return skipCommitLog;
+        }
+
+        public boolean writesAreDurable()
+        {
+            return true;
+        }
+
+        public boolean streamToMemtable()
+        {
+            return true;
+        }
+
+        public boolean streamFromMemtable()
+        {
+            return true;
+        }
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
new file mode 100644
index 000000000000..cd2b7eaee634
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.memtable;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.ConcurrentNavigableMap;
+import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.util.concurrent.ListenableFuture;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.AtomicBTreePartition;
+import org.apache.cassandra.db.partitions.BTreePartitionData;
+import org.apache.cassandra.db.partitions.BTreePartitionUpdater;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.IncludingExcludingBounds;
+import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+public class SkipListMemtable extends AbstractAllocatorMemtable
+{
+    private static final Logger logger = LoggerFactory.getLogger(SkipListMemtable.class);
+
+    public static final Factory FACTORY = SkipListMemtable::new;
+
+    private static final int ROW_OVERHEAD_HEAP_SIZE = estimateRowOverhead(Integer.parseInt(System.getProperty("cassandra.memtable_row_overhead_computation_step", "100000")));
+
+    // We index the memtable by PartitionPosition only for the purpose of being able
+    // to select key range using Token.KeyBound. However put() ensures that we
+    // actually only store DecoratedKey.
+    private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> partitions = new ConcurrentSkipListMap<>();
+
+    SkipListMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
+    {
+        super(commitLogLowerBound, metadataRef, owner);
+    }
+
+    // Only for testing
+    @VisibleForTesting
+    public SkipListMemtable(TableMetadataRef metadataRef)
+    {
+        this(null, metadataRef, new Owner()
+        {
+            @Override
+            public ListenableFuture<CommitLogPosition> signalFlushRequired(Memtable memtable, ColumnFamilyStore.FlushReason reason)
+            {
+                return null;
+            }
+
+            @Override
+            public Memtable getCurrentMemtable()
+            {
+                return null;
+            }
+
+            @Override
+            public Iterable<Memtable> getIndexMemtables()
+            {
+                return Collections.emptyList();
+            }
+        });
+    }
+
+    protected Factory factory()
+    {
+        return FACTORY;
+    }
+
+    public boolean isClean()
+    {
+        return partitions.isEmpty();
+    }
+
+    /**
+     * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate
+     * OpOrdering.
+     *
+     * Returns the column update time delta reported by the partition updater ({@code colUpdateTimeDelta}).
+     */
+    public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+    {
+        AtomicBTreePartition previous = partitions.get(update.partitionKey());
+
+        long initialSize = 0;
+        if (previous == null)
+        {
+            final DecoratedKey cloneKey = allocator.clone(update.partitionKey(), opGroup);
+            AtomicBTreePartition empty = new AtomicBTreePartition(metadata, cloneKey, allocator);
+            // We'll add the columns later. This avoids wasted work if we get beaten in the putIfAbsent
+            previous = partitions.putIfAbsent(cloneKey, empty);
+            if (previous == null)
+            {
+                previous = empty;
+                // allocate the row overhead after the fact; this saves over allocating and having to free after, but
+                // means we can overshoot our declared limit.
+                int overhead = (int) (cloneKey.getToken().getHeapSize() + ROW_OVERHEAD_HEAP_SIZE);
+                allocator.onHeap().allocate(overhead, opGroup);
+                initialSize = 8;
+            }
+        }
+
+        BTreePartitionUpdater updater = previous.addAll(update, opGroup, indexer);
+        updateMin(minTimestamp, previous.stats().minTimestamp);
+        liveDataSize.addAndGet(initialSize + updater.dataSize);
+        columnsCollector.update(update.columns());
+        statsCollector.update(update.stats());
+        currentOperations.addAndGet(update.operationCount());
+        return updater.colUpdateTimeDelta;
+    }
+
+    public long partitionCount()
+    {
+        return partitions.size();
+    }
+
+    public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter,
+                                                                     final DataRange dataRange)
+    {
+        AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();
+
+        PartitionPosition left = keyRange.left;
+        PartitionPosition right = keyRange.right;
+
+        boolean isBound = keyRange instanceof Bounds;
+        boolean includeLeft = isBound || keyRange instanceof IncludingExcludingBounds;
+        boolean includeRight = isBound || keyRange instanceof Range;
+        Map<PartitionPosition, AtomicBTreePartition> subMap = getPartitionsSubMap(left,
+                                                                                  includeLeft,
+                                                                                  right,
+                                                                                  includeRight);
+
+        return new MemtableUnfilteredPartitionIterator(metadata.get(), subMap, columnFilter, dataRange);
+    }
+
+    private Map<PartitionPosition, AtomicBTreePartition> getPartitionsSubMap(PartitionPosition left,
+                                                                             boolean includeLeft,
+                                                                             PartitionPosition right,
+                                                                             boolean includeRight)
+    {
+        if (left != null && left.isMinimum())
+            left = null; // a null bound means that side of the range is left open below
+        if (right != null && right.isMinimum())
+            right = null;
+
+        try
+        {
+            if (left == null)
+                return right == null ? partitions : partitions.headMap(right, includeRight);
+            else
+                return right == null
+                       ? partitions.tailMap(left, includeLeft)
+                       : partitions.subMap(left, includeLeft, right, includeRight);
+        }
+        catch (IllegalArgumentException e)
+        {
+            logger.error("Invalid range requested {} - {}", left, right);
+            throw e;
+        }
+    }
+
+    public Partition getPartition(DecoratedKey key)
+    {
+        return partitions.get(key);
+    }
+
+    private static int estimateRowOverhead(final int count)
+    {
+        // calculate row overhead
+        try (final OpOrder.Group group = new OpOrder().start())
+        {
+            int rowOverhead;
+            MemtableAllocator allocator = MEMORY_POOL.newAllocator();
+            ConcurrentNavigableMap<PartitionPosition, Object> partitions = new ConcurrentSkipListMap<>();
+            final Object val = new Object();
+            for (int i = 0 ; i < count ; i++)
+                partitions.put(allocator.clone(new BufferDecoratedKey(new LongToken(i), ByteBufferUtil.EMPTY_BYTE_BUFFER), group), val);
+            double avgSize = ObjectSizes.measureDeep(partitions) / (double) count;
+            rowOverhead = (int) ((avgSize - Math.floor(avgSize)) < 0.05 ? Math.floor(avgSize) : Math.ceil(avgSize));
+            rowOverhead -= ObjectSizes.measureDeep(new LongToken(0));
+            rowOverhead += AtomicBTreePartition.EMPTY_SIZE;
+            rowOverhead += BTreePartitionData.UNSHARED_HEAP_SIZE;
+            allocator.setDiscarding();
+            allocator.setDiscarded();
+            return rowOverhead;
+        }
+    }
+
+    public FlushCollection<?> getFlushSet(PartitionPosition from, PartitionPosition to)
+    {
+        Map<PartitionPosition, AtomicBTreePartition> toFlush = getPartitionsSubMap(from, true, to, false);
+        long keySize = 0;
+
+        boolean trackContention = logger.isTraceEnabled();
+        if (trackContention)
+        {
+            int heavilyContendedRowCount = 0;
+
+            for (AtomicBTreePartition partition : toFlush.values())
+            {
+                keySize += partition.partitionKey().getKey().remaining();
+                if (partition.useLock())
+                    heavilyContendedRowCount++;
+            }
+
+            if (heavilyContendedRowCount > 0)
+                logger.trace("High update contention in {}/{} partitions of {} ", heavilyContendedRowCount, toFlush.size(), SkipListMemtable.this);
+        }
+        else
+        {
+            for (PartitionPosition key : toFlush.keySet())
+            {
+                //  make sure we don't write nonsensical keys
+                assert key instanceof DecoratedKey;
+                keySize += ((DecoratedKey) key).getKey().remaining();
+            }
+        }
+        final long partitionKeySize = keySize;
+
+        return new AbstractFlushCollection<AtomicBTreePartition>()
+        {
+            public Memtable memtable()
+            {
+                return SkipListMemtable.this;
+            }
+
+            public PartitionPosition from()
+            {
+                return from;
+            }
+
+            public PartitionPosition to()
+            {
+                return to;
+            }
+
+            public long partitionCount()
+            {
+                return toFlush.size();
+            }
+
+            public Iterator<AtomicBTreePartition> iterator()
+            {
+                return toFlush.values().iterator();
+            }
+
+            public long partitionKeySize()
+            {
+                return partitionKeySize;
+            }
+        };
+    }
+
+
+    public static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator
+    {
+        private final TableMetadata metadata;
+        private final Iterator<Map.Entry<PartitionPosition, AtomicBTreePartition>> iter;
+        private final Map<PartitionPosition, AtomicBTreePartition> source;
+        private final ColumnFilter columnFilter;
+        private final DataRange dataRange;
+
+        public MemtableUnfilteredPartitionIterator(TableMetadata metadata, Map<PartitionPosition, AtomicBTreePartition> map, ColumnFilter columnFilter, DataRange dataRange)
+        {
+            this.metadata = metadata;
+            this.source = map;
+            this.iter = map.entrySet().iterator();
+            this.columnFilter = columnFilter;
+            this.dataRange = dataRange;
+        }
+
+        public int getMinLocalDeletionTime()
+        {
+            int minLocalDeletionTime = Integer.MAX_VALUE;
+            for (AtomicBTreePartition partition : source.values())
+                minLocalDeletionTime = Math.min(minLocalDeletionTime, partition.stats().minLocalDeletionTime);
+
+            return minLocalDeletionTime;
+        }
+
+        public TableMetadata metadata()
+        {
+            return metadata;
+        }
+
+        public boolean hasNext()
+        {
+            return iter.hasNext();
+        }
+
+        public UnfilteredRowIterator next()
+        {
+            Map.Entry<PartitionPosition, AtomicBTreePartition> entry = iter.next();
+            // Actual stored key should be true DecoratedKey
+            assert entry.getKey() instanceof DecoratedKey;
+            DecoratedKey key = (DecoratedKey)entry.getKey();
+            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
+
+            return filter.getUnfilteredRowIterator(columnFilter, entry.getValue());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
new file mode 100644
index 000000000000..ebbfe7cd78d6
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
@@ -0,0 +1,408 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.memtable;
+
+import java.util.Iterator;
+import java.util.Map;
+import java.util.NavigableSet;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicReference;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Throwables;
+import com.google.common.collect.Iterators;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.BTreePartitionData;
+import org.apache.cassandra.db.partitions.BTreePartitionUpdater;
+import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
+import org.apache.cassandra.db.partitions.Partition;
+import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.tries.MemtableTrie;
+import org.apache.cassandra.db.tries.Trie;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.IncludingExcludingBounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.EnsureOnHeap;
+
+public class TrieMemtable extends AbstractAllocatorMemtable
+{
+    private static final Logger logger = LoggerFactory.getLogger(TrieMemtable.class);
+
+    public static final Factory FACTORY = TrieMemtable::new;
+
+    /** Buffer type to use for memtable tries (on- vs off-heap), chosen once from the configured allocation type. */
+    public static final BufferType BUFFER_TYPE;
+    static
+    {
+        switch (DatabaseDescriptor.getMemtableAllocationType())
+        {
+        case unslabbed_heap_buffers:
+        case heap_buffers:
+            BUFFER_TYPE = BufferType.ON_HEAP;
+            break;
+        case offheap_buffers:
+        case offheap_objects:
+            BUFFER_TYPE = BufferType.OFF_HEAP;
+            break;
+        default:
+            throw new AssertionError("Unsupported memtable allocation type: " + DatabaseDescriptor.getMemtableAllocationType());
+        }
+    }
+
+    /** If a key is shorter than this length, we will use a recursive procedure for inserting data in the memtable trie. */
+    @VisibleForTesting
+    public static final int MAX_RECURSIVE_KEY_LENGTH = 128;
+
+    /** The byte-ordering conversion version to use for memtables. */
+    public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS41;
+
+    // Set to true when the memtable requests a switch (e.g. for trie size limit being reached) to ensure only one
+    // thread calls cfs.switchMemtableIfCurrent. The reference itself never changes; only the flag it holds does.
+    private final AtomicBoolean switchRequested = new AtomicBoolean(false);
+
+    // We index the memtable by PartitionPosition only for the purpose of being able
+    // to select key range using Token.KeyBound. However put() ensures that we
+    // actually only store DecoratedKey.
+    private final MemtableTrie<BTreePartitionData> partitions = new MemtableTrie<>(BUFFER_TYPE);
+
+    // only to be used by init(), to set up the very first memtable for the cfs (NOTE(review): FACTORY looks bound here too)
+    TrieMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
+    {
+        super(commitLogLowerBound, metadataRef, owner);
+    }
+
+    protected Factory factory()
+    {
+        return FACTORY; // the shared TrieMemtable::new factory constant
+    }
+
+    public boolean isClean()
+    {
+        return partitions.isEmpty(); // clean == the partitions trie reports itself empty
+    }
+
+    /**
+     * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate
+     * OpOrdering.
+     *
+     * Returns the column update time delta reported by the partition updater ({@code colUpdateTimeDelta}).
+     */
+    public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+    {
+        BTreePartitionUpdater updater = new BTreePartitionUpdater(allocator, opGroup, indexer);
+        DecoratedKey key = update.partitionKey();
+
+        // TODO: Improve locking.
+        synchronized (this)
+        {
+            long onHeap = partitions.sizeOnHeap(); // trie sizes before the insert; the deltas are charged to the allocator below
+            long offHeap = partitions.sizeOffHeap();
+
+            try
+            {
+                partitions.putSingleton(key, update, updater::mergePartitions, key.getKeyLength() < MAX_RECURSIVE_KEY_LENGTH);
+            }
+            catch (MemtableTrie.SpaceExhaustedException e)
+            {
+                // This should never really happen as a flush would be triggered long before this limit is reached.
+                throw Throwables.propagate(e);
+            }
+
+            allocator.offHeap().adjust(partitions.sizeOffHeap() - offHeap, opGroup);
+            allocator.onHeap().adjust(partitions.sizeOnHeap() - onHeap, opGroup);
+            updateMin(minTimestamp, update.stats().minTimestamp);
+            liveDataSize.addAndGet(updater.dataSize);
+            columnsCollector.update(update.columns());
+            statsCollector.update(update.stats());
+            currentOperations.addAndGet(update.operationCount());
+        }
+
+        if (partitions.reachedAllocatedSizeThreshold() && !switchRequested.getAndSet(true)) // only the first caller signals
+        {
+            logger.info("Scheduling flush due to trie size limit reached.");
+            owner.signalFlushRequired(this, ColumnFamilyStore.FlushReason.MEMTABLE_LIMIT);
+        }
+
+        return updater.colUpdateTimeDelta;
+    }
+
+    public long partitionCount()
+    {
+        return partitions.valuesCount(); // delegates to the trie's own value count
+    }
+
+    public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter, final DataRange dataRange)
+    {
+        AbstractBounds<PartitionPosition> keyRange = dataRange.keyRange();
+
+        PartitionPosition left = keyRange.left;
+        PartitionPosition right = keyRange.right;
+        if (left.isMinimum())
+            left = null; // a minimum bound is handed to subtrie() as null (presumably meaning "open on this side")
+        if (right.isMinimum())
+            right = null;
+
+        boolean isBound = keyRange instanceof Bounds; // Bounds: both ends are treated as inclusive below
+        boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
+        boolean includeStop = isBound || keyRange instanceof Range;
+
+        Trie<BTreePartitionData> subMap = partitions.subtrie(left, includeStart, right, includeStop);
+
+        return new MemtableUnfilteredPartitionIterator(metadata(),
+                                                       allocator.ensureOnHeap(),
+                                                       subMap,
+                                                       columnFilter,
+                                                       dataRange);
+    }
+
+    public Partition getPartition(DecoratedKey key)
+    {
+        BTreePartitionData data = partitions.get(key);
+        if (data != null)
+            return createPartition(metadata(), allocator.ensureOnHeap(), key, data);
+        else
+            return null;
+    }
+
+    private static MemtablePartition createPartition(TableMetadata metadata, EnsureOnHeap ensureOnHeap, DecoratedKey key, BTreePartitionData data)
+    {
+        return new MemtablePartition(metadata, ensureOnHeap, key, data);
+    }
+
+    private static MemtablePartition getPartitionFromTrieEntry(TableMetadata metadata, EnsureOnHeap ensureOnHeap, Map.Entry<ByteComparable, BTreePartitionData> en)
+    {
+        DecoratedKey key = BufferDecoratedKey.fromByteComparable(en.getKey(),
+                                                                 BYTE_COMPARABLE_VERSION,
+                                                                 metadata.partitioner);
+        return createPartition(metadata, ensureOnHeap, key, en.getValue());
+    }
+
+
+    public FlushCollection<MemtablePartition> getFlushSet(PartitionPosition from, PartitionPosition to)
+    {
+        Trie<BTreePartitionData> toFlush = partitions.subtrie(from, true, to, false);
+        long keySize = 0;
+        int keyCount = 0;
+
+        for (Iterator<Map.Entry<ByteComparable, BTreePartitionData>> it = toFlush.entryIterator(); it.hasNext(); )
+        {
+            Map.Entry<ByteComparable, BTreePartitionData> en = it.next();
+            ByteComparable byteComparable = v -> en.getKey().asPeekableBytes(BYTE_COMPARABLE_VERSION);
+            byte[] keyBytes = DecoratedKey.keyFromByteComparable(byteComparable, BYTE_COMPARABLE_VERSION, metadata().partitioner);
+            keySize += keyBytes.length;
+            keyCount++;
+        }
+        long partitionKeySize = keySize;
+        int partitionCount = keyCount;
+
+        return new AbstractFlushCollection<MemtablePartition>()
+        {
+            public Memtable memtable()
+            {
+                return TrieMemtable.this;
+            }
+
+            public PartitionPosition from()
+            {
+                return from;
+            }
+
+            public PartitionPosition to()
+            {
+                return to;
+            }
+
+            public long partitionCount()
+            {
+                return partitionCount;
+            }
+
+            public Iterator<MemtablePartition> iterator()
+            {
+                return Iterators.transform(toFlush.entryIterator(),
+                                           // TODO: During flushing we shouldn't need to copy partition data on heap because the memtable can't
+                                           // disappear until we are done with the flush. Figure out why EnsureOnHeap.NOOP doesn't work.
+                                           entry -> getPartitionFromTrieEntry(metadata(), allocator.ensureOnHeap(), entry));
+            }
+
+            public long partitionKeySize()
+            {
+                return partitionKeySize;
+            }
+        };
+    }
+
+    static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator
+    {
+        private final TableMetadata metadata;
+        private final EnsureOnHeap ensureOnHeap;
+        private final Trie<BTreePartitionData> source;
+        private final Iterator<Map.Entry<ByteComparable, BTreePartitionData>> iter;
+        private final ColumnFilter columnFilter;
+        private final DataRange dataRange;
+
+        public MemtableUnfilteredPartitionIterator(TableMetadata metadata,
+                                                   EnsureOnHeap ensureOnHeap,
+                                                   Trie<BTreePartitionData> source,
+                                                   ColumnFilter columnFilter,
+                                                   DataRange dataRange)
+        {
+            this.metadata = metadata;
+            this.ensureOnHeap = ensureOnHeap;
+            this.iter = source.entryIterator();
+            this.source = source;
+            this.columnFilter = columnFilter;
+            this.dataRange = dataRange;
+        }
+
+        public int getMinLocalDeletionTime()
+        {
+            int minLocalDeletionTime = Integer.MAX_VALUE;
+            for (BTreePartitionData partition : source.values())
+                minLocalDeletionTime = Math.min(minLocalDeletionTime, partition.stats.minLocalDeletionTime);
+
+            return minLocalDeletionTime;
+        }
+
+        public TableMetadata metadata()
+        {
+            return metadata;
+        }
+
+        public boolean hasNext()
+        {
+            return iter.hasNext();
+        }
+
+        public UnfilteredRowIterator next()
+        {
+            Partition partition = getPartitionFromTrieEntry(metadata(), ensureOnHeap, iter.next());
+            DecoratedKey key = partition.partitionKey();
+            ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(key);
+
+            return filter.getUnfilteredRowIterator(columnFilter, partition);
+        }
+    }
+
+    static class MemtablePartition extends ImmutableBTreePartition
+    {
+
+        private final EnsureOnHeap ensureOnHeap;
+
+        private MemtablePartition(TableMetadata table, EnsureOnHeap ensureOnHeap, DecoratedKey key, BTreePartitionData data)
+        {
+            super(table, key, data);
+            this.ensureOnHeap = ensureOnHeap;
+        }
+
+        @Override
+        protected boolean canHaveShadowedData()
+        {
+            // The BTreePartitionData we store in the memtable are built iteratively by BTreePartitionData.add(), which
+            // doesn't make sure there isn't shadowed data, so we'll need to eliminate any.
+            return true;
+        }
+
+
+        @Override
+        public DeletionInfo deletionInfo()
+        {
+            return ensureOnHeap.applyToDeletionInfo(super.deletionInfo());
+        }
+
+        @Override
+        public Row staticRow()
+        {
+            return ensureOnHeap.applyToStatic(super.staticRow());
+        }
+
+        @Override
+        public DecoratedKey partitionKey()
+        {
+            return ensureOnHeap.applyToPartitionKey(super.partitionKey());
+        }
+
+        @Override
+        public Row getRow(Clustering<?> clustering)
+        {
+            return ensureOnHeap.applyToRow(super.getRow(clustering));
+        }
+
+        @Override
+        public Row lastRow()
+        {
+            return ensureOnHeap.applyToRow(super.lastRow());
+        }
+
+        @Override
+        public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed)
+        {
+            return ensureOnHeap.applyToPartition(super.unfilteredIterator(selection, slices, reversed));
+        }
+
+        @Override
+        public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, NavigableSet<Clustering<?>> clusteringsInQueryOrder, boolean reversed)
+        {
+            return ensureOnHeap
+                            .applyToPartition(super.unfilteredIterator(selection, clusteringsInQueryOrder, reversed));
+        }
+
+        @Override
+        public UnfilteredRowIterator unfilteredIterator()
+        {
+            return ensureOnHeap.applyToPartition(super.unfilteredIterator());
+        }
+
+        @Override
+        public UnfilteredRowIterator unfilteredIterator(BTreePartitionData current, ColumnFilter selection, Slices slices, boolean reversed)
+        {
+            return ensureOnHeap
+                            .applyToPartition(super.unfilteredIterator(current, selection, slices, reversed));
+        }
+
+        @Override
+        public Iterator<Row> iterator()
+        {
+            return ensureOnHeap.applyToPartition(super.iterator());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java
index 1d6603eec2f2..7e3fdc5eec4f 100644
--- a/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java
+++ b/src/java/org/apache/cassandra/db/partitions/AbstractBTreePartition.java
@@ -36,12 +36,9 @@
 
 public abstract class AbstractBTreePartition implements Partition, Iterable<Row>
 {
-    protected static final Holder EMPTY = new Holder(RegularAndStaticColumns.NONE, BTree.empty(), DeletionInfo.LIVE, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
-    public static final long HOLDER_UNSHARED_HEAP_SIZE = ObjectSizes.measure(EMPTY);
-
     protected final DecoratedKey partitionKey;
 
-    protected abstract Holder holder();
+    protected abstract BTreePartitionData holder();
     protected abstract boolean canHaveShadowedData();
 
     protected AbstractBTreePartition(DecoratedKey partitionKey)
@@ -49,25 +46,6 @@ protected AbstractBTreePartition(DecoratedKey partitionKey)
         this.partitionKey = partitionKey;
     }
 
-    protected static final class Holder
-    {
-        final RegularAndStaticColumns columns;
-        final DeletionInfo deletionInfo;
-        // the btree of rows
-        final Object[] tree;
-        final Row staticRow;
-        final EncodingStats stats;
-
-        Holder(RegularAndStaticColumns columns, Object[] tree, DeletionInfo deletionInfo, Row staticRow, EncodingStats stats)
-        {
-            this.columns = columns;
-            this.tree = tree;
-            this.deletionInfo = deletionInfo;
-            this.staticRow = staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow;
-            this.stats = stats;
-        }
-    }
-
     public DeletionInfo deletionInfo()
     {
         return holder().deletionInfo;
@@ -80,13 +58,13 @@ public Row staticRow()
 
     public boolean isEmpty()
     {
-        Holder holder = holder();
+        BTreePartitionData holder = holder();
         return holder.deletionInfo.isLive() && BTree.isEmpty(holder.tree) && holder.staticRow.isEmpty();
     }
 
     public boolean hasRows()
     {
-        Holder holder = holder();
+        BTreePartitionData holder = holder();
         return !BTree.isEmpty(holder.tree);
     }
 
@@ -115,7 +93,7 @@ public EncodingStats stats()
     public Row getRow(Clustering<?> clustering)
     {
         ColumnFilter columns = ColumnFilter.selection(columns());
-        Holder holder = holder();
+        BTreePartitionData holder = holder();
 
         if (clustering == Clustering.STATIC_CLUSTERING)
         {
@@ -145,7 +123,7 @@ public Row getRow(Clustering<?> clustering)
         return row.filter(columns, activeDeletion, true, metadata());
     }
 
-    private Row staticRow(Holder current, ColumnFilter columns, boolean setActiveDeletionToRow)
+    private Row staticRow(BTreePartitionData current, ColumnFilter columns, boolean setActiveDeletionToRow)
     {
         DeletionTime partitionDeletion = current.deletionInfo.getPartitionDeletion();
         if (columns.fetchedColumns().statics.isEmpty() || (current.staticRow.isEmpty() && partitionDeletion.isLive()))
@@ -178,7 +156,7 @@ public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices s
         return unfilteredIterator(holder(), selection, slices, reversed);
     }
 
-    public UnfilteredRowIterator unfilteredIterator(Holder current, ColumnFilter selection, Slices slices, boolean reversed)
+    public UnfilteredRowIterator unfilteredIterator(BTreePartitionData current, ColumnFilter selection, Slices slices, boolean reversed)
     {
         Row staticRow = staticRow(current, selection, false);
         if (slices.size() == 0)
@@ -192,7 +170,7 @@ public UnfilteredRowIterator unfilteredIterator(Holder current, ColumnFilter sel
                : new SlicesIterator(selection, slices, reversed, current, staticRow);
     }
 
-    private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice, boolean reversed, Holder current, Row staticRow)
+    private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice, boolean reversed, BTreePartitionData current, Row staticRow)
     {
         ClusteringBound<?> start = slice.start().isBottom() ? null : slice.start();
         ClusteringBound<?> end = slice.end().isTop() ? null : slice.end();
@@ -202,7 +180,7 @@ private UnfilteredRowIterator sliceIterator(ColumnFilter selection, Slice slice,
     }
 
     private RowAndDeletionMergeIterator merge(Iterator<Row> rowIter, Iterator<RangeTombstone> deleteIter,
-                                              ColumnFilter selection, boolean reversed, Holder current, Row staticRow)
+                                              ColumnFilter selection, boolean reversed, BTreePartitionData current, Row staticRow)
     {
         return new RowAndDeletionMergeIterator(metadata(), partitionKey(), current.deletionInfo.getPartitionDeletion(),
                                                selection, staticRow, reversed, current.stats,
@@ -212,10 +190,10 @@ private RowAndDeletionMergeIterator merge(Iterator<Row> rowIter, Iterator<RangeT
 
     private abstract class AbstractIterator extends AbstractUnfilteredRowIterator
     {
-        final Holder current;
+        final BTreePartitionData current;
         final ColumnFilter selection;
 
-        private AbstractIterator(Holder current, Row staticRow, ColumnFilter selection, boolean isReversed)
+        private AbstractIterator(BTreePartitionData current, Row staticRow, ColumnFilter selection, boolean isReversed)
         {
             super(AbstractBTreePartition.this.metadata(),
                   AbstractBTreePartition.this.partitionKey(),
@@ -238,7 +216,7 @@ private class SlicesIterator extends AbstractIterator
         private int idx;
         private Iterator<Unfiltered> currentSlice;
 
-        private SlicesIterator(ColumnFilter selection, Slices slices, boolean isReversed, Holder current, Row staticRow)
+        private SlicesIterator(ColumnFilter selection, Slices slices, boolean isReversed, BTreePartitionData current, Row staticRow)
         {
             super(current, staticRow, selection, isReversed);
             this.slices = slices;
@@ -276,7 +254,7 @@ private class ClusteringsIterator extends AbstractIterator
         private ClusteringsIterator(ColumnFilter selection,
                                     NavigableSet<Clustering<?>> clusteringsInQueryOrder,
                                     boolean isReversed,
-                                    Holder current,
+                                    BTreePartitionData current,
                                     Row staticRow)
         {
             super(current, staticRow, selection, isReversed);
@@ -319,12 +297,12 @@ private Iterator<Unfiltered> nextIterator(Clustering<?> next)
         }
     }
 
-    protected static Holder build(UnfilteredRowIterator iterator, int initialRowCapacity)
+    protected static BTreePartitionData build(UnfilteredRowIterator iterator, int initialRowCapacity)
     {
         return build(iterator, initialRowCapacity, true);
     }
 
-    protected static Holder build(UnfilteredRowIterator iterator, int initialRowCapacity, boolean ordered)
+    protected static BTreePartitionData build(UnfilteredRowIterator iterator, int initialRowCapacity, boolean ordered)
     {
         TableMetadata metadata = iterator.metadata();
         RegularAndStaticColumns columns = iterator.columns();
@@ -346,12 +324,12 @@ protected static Holder build(UnfilteredRowIterator iterator, int initialRowCapa
         if (reversed)
             builder.reverse();
 
-        return new Holder(columns, builder.build(), deletionBuilder.build(), iterator.staticRow(), iterator.stats());
+        return new BTreePartitionData(columns, builder.build(), deletionBuilder.build(), iterator.staticRow(), iterator.stats());
     }
 
     // Note that when building with a RowIterator, deletion will generally be LIVE, but we allow to pass it nonetheless because PartitionUpdate
     // passes a MutableDeletionInfo that it mutates later.
-    protected static Holder build(RowIterator rows, DeletionInfo deletion, boolean buildEncodingStats, int initialRowCapacity)
+    protected static BTreePartitionData build(RowIterator rows, DeletionInfo deletion, boolean buildEncodingStats, int initialRowCapacity)
     {
         TableMetadata metadata = rows.metadata();
         RegularAndStaticColumns columns = rows.columns();
@@ -369,7 +347,7 @@ protected static Holder build(RowIterator rows, DeletionInfo deletion, boolean b
         Object[] tree = builder.build();
         EncodingStats stats = buildEncodingStats ? EncodingStats.Collector.collect(staticRow, BTree.iterator(tree), deletion)
                                                  : EncodingStats.NO_STATS;
-        return new Holder(columns, tree, deletion, staticRow, stats);
+        return new BTreePartitionData(columns, tree, deletion, staticRow, stats);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java
index 801d9e2338f7..d5c82845f15c 100644
--- a/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java
+++ b/src/java/org/apache/cassandra/db/partitions/AtomicBTreePartition.java
@@ -18,25 +18,20 @@
 package org.apache.cassandra.db.partitions;
 
 import java.nio.ByteBuffer;
-import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import java.util.NavigableSet;
 import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
 import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
 
+import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.*;
-import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.utils.ObjectSizes;
-import org.apache.cassandra.utils.btree.BTree;
-import org.apache.cassandra.utils.btree.UpdateFunction;
 import org.apache.cassandra.utils.concurrent.OpOrder;
-import org.apache.cassandra.utils.memory.HeapAllocator;
 import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 /**
@@ -67,7 +62,7 @@ public final class AtomicBTreePartition extends AbstractBTreePartition
     // CLOCK_GRANULARITY = 1^9ns >> CLOCK_SHIFT == 132us == (1/7.63)ms
 
     private static final AtomicIntegerFieldUpdater<AtomicBTreePartition> wasteTrackerUpdater = AtomicIntegerFieldUpdater.newUpdater(AtomicBTreePartition.class, "wasteTracker");
-    private static final AtomicReferenceFieldUpdater<AtomicBTreePartition, Holder> refUpdater = AtomicReferenceFieldUpdater.newUpdater(AtomicBTreePartition.class, Holder.class, "ref");
+    private static final AtomicReferenceFieldUpdater<AtomicBTreePartition, BTreePartitionData> refUpdater = AtomicReferenceFieldUpdater.newUpdater(AtomicBTreePartition.class, BTreePartitionData.class, "ref");
 
     /**
      * (clock + allocation) granularity are combined to give us an acceptable (waste) allocation rate that is defined by
@@ -80,7 +75,7 @@ public final class AtomicBTreePartition extends AbstractBTreePartition
     private volatile int wasteTracker = TRACKER_NEVER_WASTED;
 
     private final MemtableAllocator allocator;
-    private volatile Holder ref;
+    private volatile BTreePartitionData ref;
 
     private final TableMetadataRef metadata;
 
@@ -90,10 +85,10 @@ public AtomicBTreePartition(TableMetadataRef metadata, DecoratedKey partitionKey
         super(partitionKey);
         this.metadata = metadata;
         this.allocator = allocator;
-        this.ref = EMPTY;
+        this.ref = BTreePartitionData.EMPTY;
     }
 
-    protected Holder holder()
+    protected BTreePartitionData holder()
     {
         return ref;
     }
@@ -108,90 +103,71 @@ protected boolean canHaveShadowedData()
         return true;
     }
 
-    private long[] addAllWithSizeDeltaInternal(RowUpdater updater, PartitionUpdate update, UpdateTransaction indexer)
-    {
-        Holder current = ref;
-        updater.ref = current;
-        updater.reset();
-
-        if (!update.deletionInfo().getPartitionDeletion().isLive())
-            indexer.onPartitionDeletion(update.deletionInfo().getPartitionDeletion());
-
-        if (update.deletionInfo().hasRanges())
-            update.deletionInfo().rangeIterator(false).forEachRemaining(indexer::onRangeTombstone);
-
-        DeletionInfo deletionInfo;
-        if (update.deletionInfo().mayModify(current.deletionInfo))
-        {
-            if (updater.inputDeletionInfoCopy == null)
-                updater.inputDeletionInfoCopy = update.deletionInfo().copy(HeapAllocator.instance);
-
-            deletionInfo = current.deletionInfo.mutableCopy().add(updater.inputDeletionInfoCopy);
-            updater.allocated(deletionInfo.unsharedHeapSize() - current.deletionInfo.unsharedHeapSize());
-        }
-        else
-        {
-            deletionInfo = current.deletionInfo;
-        }
-
-        RegularAndStaticColumns columns = update.columns().mergeTo(current.columns);
-        updater.allocated(columns.unsharedHeapSize() - current.columns.unsharedHeapSize());
-        Row newStatic = update.staticRow();
-        Row staticRow = newStatic.isEmpty()
-                        ? current.staticRow
-                        : (current.staticRow.isEmpty() ? updater.apply(newStatic) : updater.apply(current.staticRow, newStatic));
-        Object[] tree = BTree.update(current.tree, update.metadata().comparator, update, update.rowCount(), updater);
-        EncodingStats newStats = current.stats.mergeWith(update.stats());
-        updater.allocated(newStats.unsharedHeapSize() - current.stats.unsharedHeapSize());
-
-        if (tree != null && refUpdater.compareAndSet(this, current, new Holder(columns, tree, deletionInfo, staticRow, newStats)))
-        {
-            updater.finish();
-            return new long[]{ updater.dataSize, updater.colUpdateTimeDelta };
-        }
-        else
-        {
-            return null;
-        }
-    }
     /**
      * Adds a given update to this in-memtable partition.
      *
      * @return an array containing first the difference in size seen after merging the updates, and second the minimum
      * time detla between updates.
      */
-    public long[] addAllWithSizeDelta(final PartitionUpdate update, OpOrder.Group writeOp, UpdateTransaction indexer)
+    public BTreePartitionUpdater addAll(final PartitionUpdate update, OpOrder.Group writeOp, UpdateTransaction indexer)
     {
-        RowUpdater updater = new RowUpdater(this, allocator, writeOp, indexer);
-        try
+        return new Updater(allocator, writeOp, indexer).addAll(update);
+    }
+
+    class Updater extends BTreePartitionUpdater
+    {
+        BTreePartitionData current;
+
+        public Updater(MemtableAllocator allocator, OpOrder.Group writeOp, UpdateTransaction indexer)
         {
-            boolean shouldLock = shouldLock(writeOp);
-            indexer.start();
+            super(allocator, writeOp, indexer);
+        }
 
-            while (true)
+        Updater addAll(final PartitionUpdate update)
+        {
+            try
             {
-                if (shouldLock)
+                boolean shouldLock = shouldLock(writeOp);
+                indexer.start();
+
+                while (true)
                 {
-                    synchronized (this)
+                    if (shouldLock)
                     {
-                        long[] result = addAllWithSizeDeltaInternal(updater, update, indexer);
-                        if (result != null)
-                            return result;
+                        synchronized (this)
+                        {
+                            if (tryUpdateData(update))
+                                return this;
+                        }
                     }
-                }
-                else
-                {
-                    long[] result = addAllWithSizeDeltaInternal(updater, update, indexer);
-                    if (result != null)
-                        return result;
+                    else
+                    {
+                        if (tryUpdateData(update))
+                            return this;
 
-                    shouldLock = shouldLock(updater.heapSize, writeOp);
+                        shouldLock = shouldLock(heapSize, writeOp);
+                    }
                 }
             }
+            finally
+            {
+                indexer.commit();
+                reportAllocatedMemory();
+            }
         }
-        finally
+
+        private boolean tryUpdateData(PartitionUpdate update)
         {
-            indexer.commit();
+            current = ref;
+            this.dataSize = 0;
+            this.heapSize = 0;
+            BTreePartitionData result = makeMergedPartition(current, update);
+            return refUpdater.compareAndSet(AtomicBTreePartition.this, current, result);
+        }
+
+        public boolean abortEarly()
+        {
+            return ref != current;
         }
     }
 
@@ -244,7 +220,7 @@ public UnfilteredRowIterator unfilteredIterator()
     }
 
     @Override
-    public UnfilteredRowIterator unfilteredIterator(Holder current, ColumnFilter selection, Slices slices, boolean reversed)
+    public UnfilteredRowIterator unfilteredIterator(BTreePartitionData current, ColumnFilter selection, Slices slices, boolean reversed)
     {
         return allocator.ensureOnHeap().applyToPartition(super.unfilteredIterator(current, selection, slices, reversed));
     }
@@ -328,94 +304,4 @@ private static int avoidReservedValues(int wasteTracker)
             return wasteTracker + 1;
         return wasteTracker;
     }
-
-    // the function we provide to the btree utilities to perform any column replacements
-    private static final class RowUpdater implements UpdateFunction<Row, Row>
-    {
-        final AtomicBTreePartition updating;
-        final MemtableAllocator allocator;
-        final OpOrder.Group writeOp;
-        final UpdateTransaction indexer;
-        Holder ref;
-        Row.Builder regularBuilder;
-        long dataSize;
-        long heapSize;
-        long colUpdateTimeDelta = Long.MAX_VALUE;
-        List<Row> inserted; // TODO: replace with walk of aborted BTree
-
-        DeletionInfo inputDeletionInfoCopy = null;
-
-        private RowUpdater(AtomicBTreePartition updating, MemtableAllocator allocator, OpOrder.Group writeOp, UpdateTransaction indexer)
-        {
-            this.updating = updating;
-            this.allocator = allocator;
-            this.writeOp = writeOp;
-            this.indexer = indexer;
-        }
-
-        private Row.Builder builder(Clustering<?> clustering)
-        {
-            boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
-            // We know we only insert/update one static per PartitionUpdate, so no point in saving the builder
-            if (isStatic)
-                return allocator.rowBuilder(writeOp);
-
-            if (regularBuilder == null)
-                regularBuilder = allocator.rowBuilder(writeOp);
-            return regularBuilder;
-        }
-
-        public Row apply(Row insert)
-        {
-            Row data = Rows.copy(insert, builder(insert.clustering())).build();
-            indexer.onInserted(insert);
-
-            this.dataSize += data.dataSize();
-            allocated(data.unsharedHeapSizeExcludingData());
-            if (inserted == null)
-                inserted = new ArrayList<>();
-            inserted.add(data);
-            return data;
-        }
-
-        public Row apply(Row existing, Row update)
-        {
-            Row.Builder builder = builder(existing.clustering());
-            colUpdateTimeDelta = Math.min(colUpdateTimeDelta, Rows.merge(existing, update, builder));
-
-            Row reconciled = builder.build();
-
-            indexer.onUpdated(existing, reconciled);
-
-            dataSize += reconciled.dataSize() - existing.dataSize();
-            allocated(reconciled.unsharedHeapSizeExcludingData() - existing.unsharedHeapSizeExcludingData());
-            if (inserted == null)
-                inserted = new ArrayList<>();
-            inserted.add(reconciled);
-
-            return reconciled;
-        }
-
-        protected void reset()
-        {
-            this.dataSize = 0;
-            this.heapSize = 0;
-            if (inserted != null)
-                inserted.clear();
-        }
-        public boolean abortEarly()
-        {
-            return updating.ref != ref;
-        }
-
-        public void allocated(long heapSize)
-        {
-            this.heapSize += heapSize;
-        }
-
-        protected void finish()
-        {
-            allocator.onHeap().adjust(heapSize, writeOp);
-        }
-    }
 }
diff --git a/src/java/org/apache/cassandra/db/partitions/BTreePartitionData.java b/src/java/org/apache/cassandra/db/partitions/BTreePartitionData.java
new file mode 100644
index 000000000000..1f0320d37fdc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/BTreePartitionData.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.utils.ObjectSizes;
+import org.apache.cassandra.utils.btree.BTree;
+
+/**
+ * Holder of the content of a partition, see AbstractBTreePartition.
+ * When updating a partition one holder is swapped for another atomically.
+ */
+public final class BTreePartitionData
+{
+    public static final BTreePartitionData EMPTY = new BTreePartitionData(RegularAndStaticColumns.NONE, BTree.empty(), DeletionInfo.LIVE, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+    public static final long UNSHARED_HEAP_SIZE = ObjectSizes.measure(EMPTY);
+
+
+    final RegularAndStaticColumns columns;
+    final DeletionInfo deletionInfo;
+    // the btree of rows
+    final Object[] tree;
+    final Row staticRow;
+    public final EncodingStats stats;
+
+    BTreePartitionData(RegularAndStaticColumns columns,
+                       Object[] tree,
+                       DeletionInfo deletionInfo,
+                       Row staticRow,
+                       EncodingStats stats)
+    {
+        this.columns = columns;
+        this.tree = tree;
+        this.deletionInfo = deletionInfo;
+        this.staticRow = staticRow == null ? Rows.EMPTY_STATIC_ROW : staticRow;
+        this.stats = stats;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java
new file mode 100644
index 000000000000..75253aa24019
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/partitions/BTreePartitionUpdater.java
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.partitions;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DeletionInfo;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.index.transactions.UpdateTransaction;
+import org.apache.cassandra.utils.btree.BTree;
+import org.apache.cassandra.utils.btree.UpdateFunction;
+import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapAllocator;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
+
+/**
+ * The function we provide to the trie and btree utilities to perform any row and column replacements.
+ */
+public class BTreePartitionUpdater implements UpdateFunction<Row, Row>
+{
+    final MemtableAllocator allocator;
+    final OpOrder.Group writeOp;
+    final UpdateTransaction indexer;
+    Row.Builder regularBuilder; // created on first use by builder() and reused for every non-static row
+    public long dataSize; // running total of the Row.dataSize() changes made by the row apply() methods
+    long heapSize; // heap delta gathered through allocated(); handed to the allocator in reportAllocatedMemory()
+    public long colUpdateTimeDelta = Long.MAX_VALUE; // smallest value returned by Rows.merge(); stays MAX_VALUE if no rows were merged
+
+    public BTreePartitionUpdater(MemtableAllocator allocator, OpOrder.Group writeOp, UpdateTransaction indexer)
+    {
+        this.allocator = allocator;
+        this.writeOp = writeOp;
+        this.indexer = indexer;
+        this.heapSize = 0;
+        this.dataSize = 0;
+    }
+
+    private Row.Builder builder(Clustering<?> clustering) // selects the row builder to use for the given clustering
+    {
+        boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
+        // We know we only insert/update one static per PartitionUpdate, so no point in saving the builder
+        if (isStatic)
+            return allocator.rowBuilder(writeOp);
+
+        if (regularBuilder == null)
+            regularBuilder = allocator.rowBuilder(writeOp);
+        return regularBuilder;
+    }
+
+    public Row apply(Row insert) // insert path: copy the row, notify the indexer, record its data and heap size
+    {
+        Row data = Rows.copy(insert, builder(insert.clustering())).build(); // copy built with the allocator-provided builder
+        indexer.onInserted(insert);
+
+        this.dataSize += data.dataSize();
+        allocated(data.unsharedHeapSizeExcludingData());
+        return data;
+    }
+
+    public Row apply(Row existing, Row update) // merge path: reconcile update into existing and record the size deltas
+    {
+        Row.Builder builder = builder(existing.clustering());
+        colUpdateTimeDelta = Math.min(colUpdateTimeDelta, Rows.merge(existing, update, builder)); // keep the minimum delta seen so far
+
+        Row reconciled = builder.build();
+
+        indexer.onUpdated(existing, reconciled);
+
+        dataSize += reconciled.dataSize() - existing.dataSize(); // account only for the change relative to the existing row
+        allocated(reconciled.unsharedHeapSizeExcludingData() - existing.unsharedHeapSizeExcludingData());
+
+        return reconciled;
+    }
+
+    private DeletionInfo apply(DeletionInfo existing, DeletionInfo update) // merges deletion info, notifying the indexer of deletions
+    {
+        if (update.isLive() || !update.mayModify(existing)) // update is live or cannot modify the existing info: keep it as is
+            return existing;
+
+        if (!update.getPartitionDeletion().isLive())
+            indexer.onPartitionDeletion(update.getPartitionDeletion());
+
+        if (update.hasRanges())
+            update.rangeIterator(false).forEachRemaining(indexer::onRangeTombstone);
+
+        // Like for rows, we have to clone the update in case internal buffers (when it has range tombstones) reference
+        // memory we shouldn't hold on to. But we don't ever store this off-heap currently so we just default to the
+        // HeapAllocator (rather than using 'allocator').
+        DeletionInfo newInfo = existing.mutableCopy().add(update.copy(HeapAllocator.instance));
+        allocated(newInfo.unsharedHeapSize() - existing.unsharedHeapSize());
+        return newInfo;
+    }
+
+    public BTreePartitionData mergePartitions(BTreePartitionData current, final PartitionUpdate update) // entry point: merges update into current
+    {
+        if (current == null) // no existing holder: start from EMPTY and charge the heap size of a holder
+        {
+            current = BTreePartitionData.EMPTY;
+            this.allocated(BTreePartitionData.UNSHARED_HEAP_SIZE);
+        }
+
+        try
+        {
+            indexer.start();
+
+            return makeMergedPartition(current, update);
+        }
+        finally // runs even if the merge throws: the indexer is committed and the accumulated heap delta is reported
+        {
+            indexer.commit();
+            reportAllocatedMemory();
+        }
+    }
+
+    protected BTreePartitionData makeMergedPartition(BTreePartitionData current, PartitionUpdate update) // builds the merged holder
+    {
+        DeletionInfo newDeletionInfo = apply(current.deletionInfo, update.deletionInfo());
+
+        RegularAndStaticColumns columns = current.columns;
+        RegularAndStaticColumns newColumns = update.columns().mergeTo(columns);
+        allocated(newColumns.unsharedHeapSize() - columns.unsharedHeapSize());
+        Row newStatic = update.staticRow();
+        newStatic = newStatic.isEmpty()
+                    ? current.staticRow // update has no static row: keep the current one
+                    : (current.staticRow.isEmpty()
+                       ? this.apply(newStatic) // no current static row: insert path
+                       : this.apply(current.staticRow, newStatic)); // both present: merge path
+
+        Object[] tree = BTree.update(current.tree, update.metadata().comparator, update, update.rowCount(), this); // this object is the UpdateFunction
+        EncodingStats newStats = current.stats.mergeWith(update.stats());
+        allocated(newStats.unsharedHeapSize() - current.stats.unsharedHeapSize());
+
+        return new BTreePartitionData(newColumns, tree, newDeletionInfo, newStatic, newStats);
+    }
+
+    public boolean abortEarly()
+    {
+        return false; // this updater never asks for the update to be aborted
+    }
+
+    public void allocated(long heapSize) // accumulates a heap-size delta; nothing is reported to the allocator here
+    {
+        this.heapSize += heapSize;
+    }
+
+    public void reportAllocatedMemory() // passes the accumulated heap delta to the allocator's on-heap pool
+    {
+        allocator.onHeap().adjust(heapSize, writeOp);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java
index 2183a9852a49..f09f75aa5864 100644
--- a/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java
+++ b/src/java/org/apache/cassandra/db/partitions/CachedBTreePartition.java
@@ -40,7 +40,7 @@ public class CachedBTreePartition extends ImmutableBTreePartition implements Cac
 
     private CachedBTreePartition(TableMetadata metadata,
                                  DecoratedKey partitionKey,
-                                 Holder holder,
+                                 BTreePartitionData holder,
                                  int createdAtInSec,
                                  int cachedLiveRows,
                                  int rowsWithNonExpiringCells)
@@ -80,7 +80,7 @@ public static CachedBTreePartition create(UnfilteredRowIterator iterator, int no
      */
     public static CachedBTreePartition create(UnfilteredRowIterator iterator, int initialRowCapacity, int nowInSec)
     {
-        Holder holder = ImmutableBTreePartition.build(iterator, initialRowCapacity);
+        BTreePartitionData holder = ImmutableBTreePartition.build(iterator, initialRowCapacity);
 
         int cachedLiveRows = 0;
         int rowsWithNonExpiringCells = 0;
@@ -180,7 +180,7 @@ public CachedPartition deserialize(DataInputPlus in) throws IOException
             UnfilteredRowIteratorSerializer.Header header = UnfilteredRowIteratorSerializer.serializer.deserializeHeader(metadata, null, in, version, DeserializationHelper.Flag.LOCAL);
             assert !header.isReversed && header.rowEstimate >= 0;
 
-            Holder holder;
+            BTreePartitionData holder;
             try (UnfilteredRowIterator partition = UnfilteredRowIteratorSerializer.serializer.deserialize(in, version, metadata, DeserializationHelper.Flag.LOCAL, header))
             {
                 holder = ImmutableBTreePartition.build(partition, header.rowEstimate);
diff --git a/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java b/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java
index 5139d40134b9..661725566002 100644
--- a/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java
+++ b/src/java/org/apache/cassandra/db/partitions/ImmutableBTreePartition.java
@@ -27,7 +27,7 @@
 public class ImmutableBTreePartition extends AbstractBTreePartition
 {
 
-    protected final Holder holder;
+    protected final BTreePartitionData holder;
     protected final TableMetadata metadata;
 
     public ImmutableBTreePartition(TableMetadata metadata,
@@ -40,12 +40,12 @@ public ImmutableBTreePartition(TableMetadata metadata,
     {
         super(partitionKey);
         this.metadata = metadata;
-        this.holder = new Holder(columns, tree, deletionInfo, staticRow, stats);
+        this.holder = new BTreePartitionData(columns, tree, deletionInfo, staticRow, stats);
     }
 
     protected ImmutableBTreePartition(TableMetadata metadata,
                                       DecoratedKey partitionKey,
-                                      Holder holder)
+                                      BTreePartitionData holder)
     {
         super(partitionKey);
         this.metadata = metadata;
@@ -119,7 +119,7 @@ public TableMetadata metadata()
         return metadata;
     }
 
-    protected Holder holder()
+    protected BTreePartitionData holder()
     {
         return holder;
     }
diff --git a/src/java/org/apache/cassandra/db/partitions/Partition.java b/src/java/org/apache/cassandra/db/partitions/Partition.java
index b6297a1c3979..4fb5148cb129 100644
--- a/src/java/org/apache/cassandra/db/partitions/Partition.java
+++ b/src/java/org/apache/cassandra/db/partitions/Partition.java
@@ -50,6 +50,11 @@ public interface Partition
      */
     public boolean isEmpty();
 
+    /**
+     * Whether the partition object has rows. This may be false but the partition may still be non-empty if it has a deletion.
+     */
+    boolean hasRows();
+
     /**
      * Returns the row corresponding to the provided clustering, or null if there is not such row.
      *
diff --git a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java
index ce1a8508c203..8dffa5e17a27 100644
--- a/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java
+++ b/src/java/org/apache/cassandra/db/partitions/PartitionUpdate.java
@@ -61,7 +61,7 @@ public class PartitionUpdate extends AbstractBTreePartition
 
     public static final PartitionUpdateSerializer serializer = new PartitionUpdateSerializer();
 
-    private final Holder holder;
+    private final BTreePartitionData holder;
     private final DeletionInfo deletionInfo;
     private final TableMetadata metadata;
 
@@ -69,7 +69,7 @@ public class PartitionUpdate extends AbstractBTreePartition
 
     private PartitionUpdate(TableMetadata metadata,
                             DecoratedKey key,
-                            Holder holder,
+                            BTreePartitionData holder,
                             MutableDeletionInfo deletionInfo,
                             boolean canHaveShadowedData)
     {
@@ -91,7 +91,7 @@ private PartitionUpdate(TableMetadata metadata,
     public static PartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey key)
     {
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
-        Holder holder = new Holder(RegularAndStaticColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+        BTreePartitionData holder = new BTreePartitionData(RegularAndStaticColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
         return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
     }
 
@@ -108,7 +108,7 @@ public static PartitionUpdate emptyUpdate(TableMetadata metadata, DecoratedKey k
     public static PartitionUpdate fullPartitionDelete(TableMetadata metadata, DecoratedKey key, long timestamp, int nowInSec)
     {
         MutableDeletionInfo deletionInfo = new MutableDeletionInfo(timestamp, nowInSec);
-        Holder holder = new Holder(RegularAndStaticColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
+        BTreePartitionData holder = new BTreePartitionData(RegularAndStaticColumns.NONE, BTree.empty(), deletionInfo, Rows.EMPTY_STATIC_ROW, EncodingStats.NO_STATS);
         return new PartitionUpdate(metadata, key, holder, deletionInfo, false);
     }
 
@@ -125,7 +125,7 @@ public static PartitionUpdate fullPartitionDelete(TableMetadata metadata, Decora
     public static PartitionUpdate singleRowUpdate(TableMetadata metadata, DecoratedKey key, Row row, Row staticRow)
     {
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
-        Holder holder = new Holder(
+        BTreePartitionData holder = new BTreePartitionData(
             new RegularAndStaticColumns(
                 staticRow == null ? Columns.NONE : Columns.from(staticRow.columns()),
                 row == null ? Columns.NONE : Columns.from(row.columns())
@@ -181,7 +181,7 @@ public static PartitionUpdate singleRowUpdate(TableMetadata metadata, ByteBuffer
     public static PartitionUpdate fromIterator(UnfilteredRowIterator iterator, ColumnFilter filter)
     {
         iterator = UnfilteredRowIterators.withOnlyQueriedData(iterator, filter);
-        Holder holder = build(iterator, 16);
+        BTreePartitionData holder = build(iterator, 16);
         MutableDeletionInfo deletionInfo = (MutableDeletionInfo) holder.deletionInfo;
         return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
     }
@@ -202,7 +202,7 @@ public static PartitionUpdate fromIterator(RowIterator iterator, ColumnFilter fi
     {
         iterator = RowIterators.withOnlyQueriedData(iterator, filter);
         MutableDeletionInfo deletionInfo = MutableDeletionInfo.live();
-        Holder holder = build(iterator, deletionInfo, true, 16);
+        BTreePartitionData holder = build(iterator, deletionInfo, true, 16);
         return new PartitionUpdate(iterator.metadata(), iterator.partitionKey(), holder, deletionInfo, false);
     }
 
@@ -357,7 +357,7 @@ public RegularAndStaticColumns columns()
         return holder.columns;
     }
 
-    protected Holder holder()
+    protected BTreePartitionData holder()
     {
         return holder;
     }
@@ -670,7 +670,7 @@ public PartitionUpdate deserialize(DataInputPlus in, int version, Deserializatio
             MutableDeletionInfo deletionInfo = deletionBuilder.build();
             return new PartitionUpdate(metadata,
                                        header.key,
-                                       new Holder(header.sHeader.columns(), rows.build(), deletionInfo, header.staticRow, header.sHeader.stats()),
+                                       new BTreePartitionData(header.sHeader.columns(), rows.build(), deletionInfo, header.staticRow, header.sHeader.stats()),
                                        deletionInfo,
                                        false);
         }
@@ -765,7 +765,7 @@ private Builder(TableMetadata metadata,
                        RegularAndStaticColumns columns,
                        int initialRowCapacity,
                        boolean canHaveShadowedData,
-                       Holder holder)
+                       BTreePartitionData holder)
         {
             this(metadata, key, columns, initialRowCapacity, canHaveShadowedData, holder.staticRow, holder.deletionInfo, holder.tree);
         }
@@ -874,11 +874,11 @@ public PartitionUpdate build()
             isBuilt = true;
             return new PartitionUpdate(metadata,
                                        partitionKey(),
-                                       new Holder(columns,
-                                                  merged,
-                                                  deletionInfo,
-                                                  staticRow,
-                                                  newStats),
+                                       new BTreePartitionData(columns,
+                                                              merged,
+                                                              deletionInfo,
+                                                              staticRow,
+                                                              newStats),
                                        deletionInfo,
                                        canHaveShadowedData);
         }
diff --git a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
index 6f2256f370b8..45089e2441b6 100644
--- a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
+++ b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
@@ -30,6 +30,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.Collections2;
 import com.google.common.collect.Maps;
 
 import org.slf4j.Logger;
@@ -54,7 +55,6 @@
 import org.apache.cassandra.repair.ValidationPartitionIterator;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Refs;
 
@@ -198,12 +198,19 @@ public CassandraValidationIterator(ColumnFamilyStore cfs, Collection<Range<Token
             if (!isIncremental)
             {
                 // flush first so everyone is validating data that is as similar as possible
-                StorageService.instance.forceKeyspaceFlush(cfs.keyspace.getName(), cfs.name);
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.REPAIR);
+                // Note: we also flush for incremental repair during the anti-compaction process.
             }
             sstables = getSSTablesToValidate(cfs, ranges, parentId, isIncremental);
         }
 
+        // Persistent memtables will not flush or snapshot to sstables, so make an sstable with their data.
+        cfs.writeAndAddMemtableRanges(parentId,
+                                      () -> Collections2.transform(Range.normalize(ranges), Range::makeRowRange),
+                                      sstables);
+
         Preconditions.checkArgument(sstables != null);
+
         ActiveRepairService.ParentRepairSession prs = ActiveRepairService.instance.getParentRepairSession(parentId);
         if (prs != null)
         {
diff --git a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
index e0ee68d955b2..f6762556e985 100644
--- a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
+++ b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
@@ -364,7 +364,7 @@ public ListenableFuture run()
         List<ListenableFutureTask<AcquireResult>> tasks = new ArrayList<>(tables.size());
         for (ColumnFamilyStore cfs : tables)
         {
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.REPAIR);
             ListenableFutureTask<AcquireResult> task = ListenableFutureTask.create(getAcquisitionCallable(cfs, tokenRanges.ranges(), prsId, acquireRetrySeconds, acquireSleepMillis));
             executor.submit(task);
             tasks.add(task);
diff --git a/src/java/org/apache/cassandra/db/rows/Rows.java b/src/java/org/apache/cassandra/db/rows/Rows.java
index 9ce45e68150f..873f4760416f 100644
--- a/src/java/org/apache/cassandra/db/rows/Rows.java
+++ b/src/java/org/apache/cassandra/db/rows/Rows.java
@@ -273,7 +273,7 @@ public static Row merge(Row row1, Row row2)
      *
      * @return the smallest timestamp delta between corresponding rows from existing and update. A
      * timestamp delta being computed as the difference between the cells and DeletionTimes from {@code existing}
-     * and those in {@code existing}.
+     * and those in {@code update}.
      */
     public static long merge(Row existing,
                              Row update,
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
index a84fd2764b55..2cdcd49bcde9 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
@@ -81,6 +81,7 @@ public StreamReceiver createStreamReceiver(StreamSession session, int totalStrea
         return new CassandraStreamReceiver(cfs, session, totalStreams);
     }
 
+    @SuppressWarnings("resource")   // references placed onto returned collection or closed on error
     @Override
     public Collection<OutgoingStream> createOutgoingStreams(StreamSession session, RangesAtEndpoint replicas, UUID pendingRepair, PreviewKind previewKind)
     {
@@ -126,6 +127,8 @@ else if (pendingRepair == ActiveRepairService.NO_PENDING_REPAIR)
                 return sstables;
             }).refs);
 
+            // Persistent memtables will not flush, so make an sstable with their data.
+            cfs.writeAndAddMemtableRanges(session.getPendingRepair(), () -> Range.normalize(keyRanges), refs);
 
             List<Range<Token>> normalizedFullRanges = Range.normalize(replicas.onlyFull().ranges());
             List<Range<Token>> normalizedAllRanges = Range.normalize(replicas.ranges());
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java
index b2b2ce5cf093..5749156036a3 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java
@@ -28,7 +28,7 @@
 
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.streaming.StreamReceiveTask;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -182,7 +182,7 @@ private boolean hasCDC(ColumnFamilyStore cfs)
      * can be archived by the CDC process on discard.
      */
     private boolean requiresWritePath(ColumnFamilyStore cfs) {
-        return hasCDC(cfs) || (session.streamOperation().requiresViewBuild() && hasViews(cfs));
+        return hasCDC(cfs) || cfs.streamToMemtable() || (session.streamOperation().requiresViewBuild() && hasViews(cfs));
     }
 
     private void sendThroughWritePath(ColumnFamilyStore cfs, Collection<SSTableReader> readers) {
@@ -273,7 +273,7 @@ public void cleanup()
         // the streamed sstables.
         if (requiresWritePath)
         {
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.STREAMS_RECEIVED);
             abort();
         }
     }
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index dde198313d07..b492805690b1 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -867,14 +867,14 @@ int advanceAllocatedPos(int wantedPos) throws SpaceExhaustedException
     /** Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content. */
     public long sizeOffHeap()
     {
-        return bufferType == BufferType.ON_HEAP ? 0 : allocatedPos;
+        return bufferType == BufferType.ON_HEAP ? 0 : buffer.capacity();
     }
 
     /** Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content. */
     public long sizeOnHeap()
     {
         return contentCount * MemoryLayoutSpecification.SPEC.getReferenceSize() +
-               (bufferType == BufferType.ON_HEAP ? allocatedPos + EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP);
+               (bufferType == BufferType.ON_HEAP ? buffer.capacity() + EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/view/TableViews.java b/src/java/org/apache/cassandra/db/view/TableViews.java
index cc58dc19e740..421aa44719ba 100644
--- a/src/java/org/apache/cassandra/db/view/TableViews.java
+++ b/src/java/org/apache/cassandra/db/view/TableViews.java
@@ -102,16 +102,16 @@ public void stopBuild()
         views.forEach(View::stopBuild);
     }
 
-    public void forceBlockingFlush()
+    public void forceBlockingFlush(ColumnFamilyStore.FlushReason reason)
     {
         for (ColumnFamilyStore viewCfs : allViewsCfs())
-            viewCfs.forceBlockingFlush();
+            viewCfs.forceBlockingFlush(reason);
     }
 
-    public void dumpMemtables()
+    public void dumpMemtables(ColumnFamilyStore.FlushReason reason)
     {
         for (ColumnFamilyStore viewCfs : allViewsCfs())
-            viewCfs.dumpMemtable();
+            viewCfs.dumpMemtable(reason);
     }
 
     public void truncateBlocking(CommitLogPosition replayAfter, long truncatedAt)
diff --git a/src/java/org/apache/cassandra/db/view/ViewBuilder.java b/src/java/org/apache/cassandra/db/view/ViewBuilder.java
index a88ffbecc3fe..27ff53118f7b 100644
--- a/src/java/org/apache/cassandra/db/view/ViewBuilder.java
+++ b/src/java/org/apache/cassandra/db/view/ViewBuilder.java
@@ -96,7 +96,7 @@ public void start()
 
             logger.debug("Starting build of view({}.{}). Flushing base table {}.{}",
                          ksName, view.name, ksName, baseCfs.name);
-            baseCfs.forceBlockingFlush();
+            baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.VIEW_BUILD_STARTED);
 
             loadStatusAndBuild();
         }
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
index fb2d248c2900..223e68b68e6f 100644
--- a/src/java/org/apache/cassandra/index/Index.java
+++ b/src/java/org/apache/cassandra/index/Index.java
@@ -37,6 +37,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
diff --git a/src/java/org/apache/cassandra/index/IndexRegistry.java b/src/java/org/apache/cassandra/index/IndexRegistry.java
index 7ff4799693c0..c193c4cef01f 100644
--- a/src/java/org/apache/cassandra/index/IndexRegistry.java
+++ b/src/java/org/apache/cassandra/index/IndexRegistry.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
index 95d7f67143ff..b65827b000f4 100644
--- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
@@ -59,6 +59,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -427,7 +428,7 @@ public void rebuildIndexesBlocking(Set<String> indexNames)
 
         // Once we are tracking new writes, flush any memtable contents to not miss them from the sstable-based rebuild
         if (needsFlush)
-            baseCfs.forceBlockingFlush();
+            baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_STARTED);
 
         // Now that we are tracking new writes and we haven't left untracked contents on the memtables, we are ready to
         // index the sstables
@@ -897,7 +898,7 @@ private void flushIndexesBlocking(Set<Index> indexes, FutureCallback<Object> cal
         {
             indexes.forEach(index ->
                             index.getBackingTable()
-                                 .map(cfs -> wait.add(cfs.forceFlush()))
+                                 .map(cfs -> wait.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_COMPLETED)))
                                  .orElseGet(() -> nonCfsIndexes.add(index)));
         }
 
diff --git a/src/java/org/apache/cassandra/index/SingletonIndexGroup.java b/src/java/org/apache/cassandra/index/SingletonIndexGroup.java
index 94fb482e6bb8..c70b996f4ca8 100644
--- a/src/java/org/apache/cassandra/index/SingletonIndexGroup.java
+++ b/src/java/org/apache/cassandra/index/SingletonIndexGroup.java
@@ -26,11 +26,11 @@
 import java.util.function.Predicate;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.RegularAndStaticColumns;
 import org.apache.cassandra.db.WriteContext;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.index.transactions.IndexTransaction;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
index 68db86da069c..ae5ae7a778d1 100644
--- a/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndex.java
@@ -33,6 +33,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -184,7 +185,7 @@ public Optional<ColumnFamilyStore> getBackingTable()
     public Callable<Void> getBlockingFlushTask()
     {
         return () -> {
-            indexCfs.forceBlockingFlush();
+            indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_TABLE_FLUSH);
             return null;
         };
     }
@@ -655,7 +656,7 @@ private void invalidate()
         CompactionManager.instance.interruptCompactionForCFs(cfss, (sstable) -> true, true);
         CompactionManager.instance.waitForCessation(cfss, (sstable) -> true);
         Keyspace.writeOrder.awaitNewBarrier();
-        indexCfs.forceBlockingFlush();
+        indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_REMOVED);
         indexCfs.readOrdering.awaitNewBarrier();
         indexCfs.invalidate();
     }
@@ -681,7 +682,7 @@ private Callable<?> getBuildIndexTask()
     @SuppressWarnings("resource")
     private void buildBlocking()
     {
-        baseCfs.forceBlockingFlush();
+        baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_STARTED);
 
         try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
              Refs<SSTableReader> sstables = viewFragment.refs)
@@ -705,7 +706,7 @@ private void buildBlocking()
                                                                          ImmutableSet.copyOf(sstables));
             Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
             FBUtilities.waitOnFuture(future);
-            indexCfs.forceBlockingFlush();
+            indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_COMPLETED);
         }
         logger.info("Index build of {} complete", metadata.name);
     }
diff --git a/src/java/org/apache/cassandra/index/sai/ColumnContext.java b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
index 1f73ef426125..7b93acbe1aff 100644
--- a/src/java/org/apache/cassandra/index/sai/ColumnContext.java
+++ b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
@@ -46,7 +46,6 @@
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -55,6 +54,7 @@
 import org.apache.cassandra.db.marshal.CompositeType;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.ComplexColumnData;
 import org.apache.cassandra.db.rows.Row;
@@ -246,7 +246,7 @@ public void discardMemtable(Memtable discarded)
     public MemtableIndex getPendingMemtableIndex(LifecycleNewTracker tracker)
     {
         return liveMemtables.keySet().stream()
-                            .filter(m -> tracker.equals(m.tracker()))
+                            .filter(m -> tracker.equals(m.getFlushTransaction()))
                             .findFirst()
                             .map(liveMemtables::get)
                             .orElse(null);
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
index 036d46b75710..1f4847531219 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
@@ -61,7 +61,6 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.DeletionTime;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.RangeTombstone;
 import org.apache.cassandra.db.ReadCommand;
 import org.apache.cassandra.db.RegularAndStaticColumns;
@@ -71,6 +70,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
@@ -350,7 +350,7 @@ private Future<?> startInitialBuild(ColumnFamilyStore baseCfs, boolean validate)
         // In case of offline scrub, there is no live memtables.
         if (!baseCfs.getTracker().getView().liveMemtables.isEmpty())
         {
-            baseCfs.forceBlockingFlush();
+            baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.INDEX_BUILD_STARTED);
         }
 
         // It is now safe to flush indexes directly from flushing Memtables.
@@ -611,7 +611,7 @@ public void updateRow(Row oldRow, Row newRow)
 
         void adjustMemtableSize(long additionalSpace, OpOrder.Group opGroup)
         {
-            mt.allocateExtraOnHeap(additionalSpace, opGroup);
+            mt.markExtraOnHeapUsed(additionalSpace, opGroup);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
index 6355fe875708..a5dd9c6909ad 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
@@ -36,12 +36,12 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.RegularAndStaticColumns;
 import org.apache.cassandra.db.WriteContext;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.Tracker;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.sai.disk.StorageAttachedIndexWriter;
diff --git a/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java b/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
index d77cb7110533..320f3b13ced5 100644
--- a/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
+++ b/src/java/org/apache/cassandra/index/sai/memory/MemtableIndex.java
@@ -31,9 +31,9 @@
 import org.apache.cassandra.db.Clustering;
 import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.index.sai.ColumnContext;
 import org.apache.cassandra.index.sai.plan.Expression;
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
index fdcf9e427b09..7bd4d10f360e 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
@@ -33,6 +33,7 @@
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.Tracker;
 import org.apache.cassandra.db.marshal.AbstractType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.db.rows.Row;
@@ -296,7 +297,7 @@ private boolean isNewData()
 
             public void adjustMemtableSize(long additionalSpace, OpOrder.Group opGroup)
             {
-                baseCfs.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().allocate(additionalSpace, opGroup);
+                baseCfs.getTracker().getView().getCurrentMemtable().markExtraOnHeapUsed(additionalSpace, opGroup);
             }
         };
     }
diff --git a/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java b/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java
index 4c9c59e19916..fc24ce341456 100644
--- a/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/index/sasi/conf/ColumnIndex.java
@@ -31,7 +31,7 @@
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.db.marshal.UTF8Type;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 263b150135c7..14d976b7b4bc 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -404,8 +404,17 @@ public static void hardlink(Descriptor tmpdesc, Descriptor newdesc, Set<Componen
         FileUtils.createHardLinkWithoutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
     }
 
+    public interface SSTableSizeParameters
+    {
+        long partitionCount();
+        long partitionKeySize();
+        long dataSize();
+    }
+
     public static abstract class Factory
     {
+        public abstract long estimateSize(SSTableSizeParameters parameters);
+
         public abstract SSTableWriter open(Descriptor descriptor,
                                            long keyCount,
                                            long repairedAt,
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 4dea115bc205..5e5888480a8f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -82,6 +82,15 @@ public SSTableReader.Factory getReaderFactory()
 
     static class WriterFactory extends SSTableWriter.Factory
     {
+        @Override
+        public long estimateSize(SSTableWriter.SSTableSizeParameters parameters)
+        {
+            return (long) ((parameters.partitionCount() // index entries
+                            + parameters.partitionCount() // keys in data file
+                            + parameters.dataSize()) // data
+                           * 1.2); // bloom filter and row index overhead
+        }
+
         @Override
         public SSTableWriter open(Descriptor descriptor,
                                   long keyCount,
diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java
index 09f41a15ee0f..8605c71007e5 100644
--- a/src/java/org/apache/cassandra/metrics/TableMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java
@@ -37,7 +37,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.index.SecondaryIndexManager;
@@ -429,12 +429,12 @@ public String toString(String value)
 
         // MemtableOnHeapSize naming deprecated in 4.0
         memtableOnHeapDataSize = createTableGaugeWithDeprecation("MemtableOnHeapDataSize", "MemtableOnHeapSize", 
-                                                                 () -> cfs.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns(), 
+                                                                 () -> Memtable.getMemoryUsage(cfs.getTracker().getView().getCurrentMemtable()).ownsOnHeap,
                                                                  new GlobalTableGauge("MemtableOnHeapDataSize"));
 
         // MemtableOffHeapSize naming deprecated in 4.0
         memtableOffHeapDataSize = createTableGaugeWithDeprecation("MemtableOffHeapDataSize", "MemtableOffHeapSize", 
-                                                                  () -> cfs.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns(), 
+                                                                  () -> Memtable.getMemoryUsage(cfs.getTracker().getView().getCurrentMemtable()).ownsOffHeap,
                                                                   new GlobalTableGauge("MemtableOnHeapDataSize"));
         
         memtableLiveDataSize = createTableGauge("MemtableLiveDataSize", 
@@ -445,10 +445,7 @@ public String toString(String value)
         {
             public Long getValue()
             {
-                long size = 0;
-                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
-                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().onHeap().owns();
-                return size;
+                return getMemoryUsageWithIndexes(cfs).ownsOnHeap;
             }
         }, new GlobalTableGauge("AllMemtablesOnHeapDataSize"));
 
@@ -457,10 +454,7 @@ public Long getValue()
         {
             public Long getValue()
             {
-                long size = 0;
-                for (ColumnFamilyStore cfs2 : cfs.concatWithIndexes())
-                    size += cfs2.getTracker().getView().getCurrentMemtable().getAllocator().offHeap().owns();
-                return size;
+                return getMemoryUsageWithIndexes(cfs).ownsOffHeap;
             }
         }, new GlobalTableGauge("AllMemtablesOffHeapDataSize"));
         allMemtablesLiveDataSize = createTableGauge("AllMemtablesLiveDataSize", new Gauge<Long>()
@@ -915,6 +909,15 @@ protected double getDenominator()
         });
     }
 
+    private Memtable.MemoryUsage getMemoryUsageWithIndexes(ColumnFamilyStore cfs)
+    {
+        Memtable.MemoryUsage usage = Memtable.newMemoryUsage();
+        cfs.getTracker().getView().getCurrentMemtable().addMemoryUsageTo(usage);
+        for (ColumnFamilyStore indexCfs : cfs.indexManager.getAllIndexColumnFamilyStores())
+            indexCfs.getTracker().getView().getCurrentMemtable().addMemoryUsageTo(usage);
+        return usage;
+    }
+
     public void updateSSTableIterated(int count)
     {
         sstablesPerReadHistogram.update(count);
diff --git a/src/java/org/apache/cassandra/notifications/MemtableDiscardedNotification.java b/src/java/org/apache/cassandra/notifications/MemtableDiscardedNotification.java
index 778cad06c025..849b2f698aa4 100644
--- a/src/java/org/apache/cassandra/notifications/MemtableDiscardedNotification.java
+++ b/src/java/org/apache/cassandra/notifications/MemtableDiscardedNotification.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.notifications;
 
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 
 public class MemtableDiscardedNotification implements INotification
 {
diff --git a/src/java/org/apache/cassandra/notifications/MemtableRenewedNotification.java b/src/java/org/apache/cassandra/notifications/MemtableRenewedNotification.java
index 4c7e6c5b0524..776c9da161f3 100644
--- a/src/java/org/apache/cassandra/notifications/MemtableRenewedNotification.java
+++ b/src/java/org/apache/cassandra/notifications/MemtableRenewedNotification.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.notifications;
 
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 
 public class MemtableRenewedNotification implements INotification
 {
diff --git a/src/java/org/apache/cassandra/notifications/MemtableSwitchedNotification.java b/src/java/org/apache/cassandra/notifications/MemtableSwitchedNotification.java
index 946de4ee8447..b1737bebfd11 100644
--- a/src/java/org/apache/cassandra/notifications/MemtableSwitchedNotification.java
+++ b/src/java/org/apache/cassandra/notifications/MemtableSwitchedNotification.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.notifications;
 
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 
 public class MemtableSwitchedNotification implements INotification
 {
diff --git a/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java b/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java
index 9c95a182de18..857af698473d 100644
--- a/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java
+++ b/src/java/org/apache/cassandra/notifications/SSTableAddedNotification.java
@@ -21,7 +21,7 @@
 
 import javax.annotation.Nullable;
 
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
 /**
diff --git a/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java b/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java
index 7e8d8bc385d4..1e4226f68916 100644
--- a/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java
+++ b/src/java/org/apache/cassandra/repair/SystemDistributedKeyspace.java
@@ -42,6 +42,7 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.dht.Range;
@@ -353,7 +354,7 @@ public static void setViewRemoved(String keyspaceName, String viewName)
     {
         String buildReq = "DELETE FROM %s.%s WHERE keyspace_name = ? AND view_name = ?";
         QueryProcessor.executeInternal(format(buildReq, SchemaConstants.DISTRIBUTED_KEYSPACE_NAME, VIEW_BUILD_STATUS), keyspaceName, viewName);
-        forceBlockingFlush(VIEW_BUILD_STATUS);
+        forceBlockingFlush(VIEW_BUILD_STATUS, ColumnFamilyStore.FlushReason.INTERNALLY_FORCED);
     }
 
     private static void processSilent(String fmtQry, String... values)
@@ -373,10 +374,12 @@ private static void processSilent(String fmtQry, String... values)
         }
     }
 
-    public static void forceBlockingFlush(String table)
+    public static void forceBlockingFlush(String table, ColumnFamilyStore.FlushReason reason)
     {
         if (!DatabaseDescriptor.isUnsafeSystem())
-            FBUtilities.waitOnFuture(Keyspace.open(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME).getColumnFamilyStore(table).forceFlush());
+            FBUtilities.waitOnFuture(Keyspace.open(SchemaConstants.DISTRIBUTED_KEYSPACE_NAME)
+                                             .getColumnFamilyStore(table)
+                                             .forceFlush(reason));
     }
 
     private enum RepairState
diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
index e6ca3ee4d8bb..35e69e29988c 100644
--- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
+++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
@@ -587,7 +587,7 @@ private void syncTable()
     {
         TableId tid = Schema.instance.getTableMetadata(keyspace, table).id;
         ColumnFamilyStore cfm = Schema.instance.getColumnFamilyStoreInstance(tid);
-        cfm.forceBlockingFlush();
+        cfm.forceBlockingFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/schema/MemtableParams.java b/src/java/org/apache/cassandra/schema/MemtableParams.java
new file mode 100644
index 000000000000..f9430c22564d
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/MemtableParams.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.schema;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.Map;
+
+import com.google.common.collect.ImmutableMap;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.memtable.DefaultMemtableFactory;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+/**
+ * Memtable types and options are specified with these parameters. Memtable classes must either contain a static FACTORY
+ * field (if they take no arguments other than class), or implement a factory(Map<String, String>) method.
+ *
+ * The latter should consume any further options (using map.remove).
+ *
+ *
+ * CQL: {'class' : 'SkipListMemtable'}
+ */
+public final class MemtableParams
+{
+    public enum Option
+    {
+        CLASS;
+
+        @Override
+        public String toString()
+        {
+            return name().toLowerCase();
+        }
+    }
+
+    public static final MemtableParams DEFAULT = new MemtableParams();
+
+    public final Memtable.Factory factory;
+    public final ImmutableMap<String, String> options;
+
+    private MemtableParams()
+    {
+        this.options = ImmutableMap.of();
+        this.factory = DefaultMemtableFactory.INSTANCE;
+    }
+
+    public MemtableParams(Map<String, String> options)
+    {
+        this.options = ImmutableMap.copyOf(options);
+        this.factory = getMemtableFactory(options);
+    }
+
+    private static Memtable.Factory getMemtableFactory(Map<String, String> options) // resolves the factory named by the 'class' option
+    {
+        Map<String, String> copy = new HashMap<>(options); // mutable copy: consumed options are removed from it, not from the caller's map
+        String className = copy.remove(Option.CLASS.toString());
+        if (className == null || className.isEmpty()) // null first: remove() returns null when 'class' is absent, which must not NPE
+            throw new ConfigurationException(
+            "The 'class' option must not be empty. To use default implementation, remove option.");
+
+        className = className.contains(".") ? className : "org.apache.cassandra.db.memtable." + className; // unqualified names use the default package
+        try
+        {
+            Memtable.Factory factory;
+            Class<?> clazz = Class.forName(className);
+            try
+            {
+                Method factoryMethod = clazz.getDeclaredMethod("factory", Map.class); // preferred: static factory(Map) that may consume options
+                factory = (Memtable.Factory) factoryMethod.invoke(null, copy);
+            }
+            catch (NoSuchMethodException e)
+            {
+                // no factory(Map) method: fall back to the static FACTORY field
+                Field factoryField = clazz.getDeclaredField("FACTORY");
+                factory = (Memtable.Factory) factoryField.get(null);
+            }
+            if (!copy.isEmpty()) // anything still left in the copy was not consumed, so it is rejected
+                throw new ConfigurationException("Memtable class " + className + " does not accept any futher parameters, but " +
+                                                 copy + " were given.");
+            return factory;
+        }
+        catch (NoSuchFieldException | ClassNotFoundException | IllegalAccessException | InvocationTargetException | ClassCastException e)
+        {
+            if (e.getCause() instanceof ConfigurationException) // rethrow a ConfigurationException raised inside the invoked factory method as-is
+                throw (ConfigurationException) e.getCause();
+            throw new ConfigurationException("Could not create memtable factory for type " + className +
+                                             " and options " + copy, e);
+        }
+    }
+
+    public static MemtableParams fromMap(Map<String, String> map)
+    {
+        if (map == null || map.isEmpty())
+        {
+            map = DatabaseDescriptor.getMemtableOptions();
+            if (map == null || map.isEmpty())
+                return DEFAULT;
+        }
+
+        return new MemtableParams(map);
+    }
+
+    public Map<String, String> asMap()
+    {
+        // options is an immutable map, ok to share
+        return options;
+    }
+
+    @Override
+    public String toString()
+    {
+        return options.toString();
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o)
+            return true;
+
+        if (!(o instanceof MemtableParams))
+            return false;
+
+        MemtableParams c = (MemtableParams) o;
+
+        return factory.equals(c.factory);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return factory.hashCode();
+    }
+}
diff --git a/src/java/org/apache/cassandra/schema/SchemaEvent.java b/src/java/org/apache/cassandra/schema/SchemaEvent.java
index d163a11167af..f808dc89d8ca 100644
--- a/src/java/org/apache/cassandra/schema/SchemaEvent.java
+++ b/src/java/org/apache/cassandra/schema/SchemaEvent.java
@@ -225,6 +225,7 @@ private HashMap<String, Serializable> repr(TableParams params)
         ret.put("caching", repr(params.caching));
         ret.put("compaction", repr(params.compaction));
         ret.put("compression", repr(params.compression));
+        ret.put("memtable", repr(params.memtable));
         if (params.speculativeRetry != null) ret.put("speculativeRetry", params.speculativeRetry.kind().name());
         return ret;
     }
@@ -253,6 +254,14 @@ private HashMap<String, Serializable> repr(CompressionParams compr)
         return ret;
     }
 
+    private HashMap<String, Serializable> repr(MemtableParams params)
+    {
+        HashMap<String, Serializable> ret = new HashMap<>();
+        if (params == null) return ret;
+        ret.putAll(params.asMap());
+        return ret;
+    }
+
     private HashMap<String, Serializable> repr(IndexMetadata index)
     {
         HashMap<String, Serializable> ret = new HashMap<>();
diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
index fdb209e06ca0..fecfbbc95b56 100644
--- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
+++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
@@ -40,7 +40,6 @@
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
-import org.apache.cassandra.exceptions.StartupException;
 import org.apache.cassandra.service.reads.SpeculativeRetryPolicy;
 import org.apache.cassandra.schema.ColumnMetadata.ClusteringOrder;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
@@ -54,7 +53,6 @@
 import static java.util.stream.Collectors.toList;
 import static java.util.stream.Collectors.toSet;
 
-import static org.apache.cassandra.cql3.ColumnIdentifier.maybeQuote;
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal;
 
@@ -125,6 +123,7 @@ private SchemaKeyspace()
               + "comment text,"
               + "compaction frozen<map<text, text>>,"
               + "compression frozen<map<text, text>>,"
+              + "memtable frozen<map<text, text>>,"
               + "crc_check_chance double,"
               + "dclocal_read_repair_chance double," // no longer used, left for drivers' sake
               + "default_time_to_live int,"
@@ -192,6 +191,7 @@ private SchemaKeyspace()
               + "comment text,"
               + "compaction frozen<map<text, text>>,"
               + "compression frozen<map<text, text>>,"
+              + "memtable frozen<map<text, text>>,"
               + "crc_check_chance double,"
               + "dclocal_read_repair_chance double," // no longer used, left for drivers' sake
               + "default_time_to_live int,"
@@ -345,7 +345,7 @@ public static void truncate()
     private static void flush()
     {
         if (!DatabaseDescriptor.isUnsafeSystem())
-            ALL.forEach(table -> FBUtilities.waitOnFuture(getSchemaCFS(table).forceFlush()));
+            ALL.forEach(table -> FBUtilities.waitOnFuture(getSchemaCFS(table).forceFlush(ColumnFamilyStore.FlushReason.INTERNALLY_FORCED)));
     }
 
     /**
@@ -564,6 +564,7 @@ private static void addTableParamsToRowBuilder(TableParams params, Row.SimpleBui
                .add("caching", params.caching.asMap())
                .add("compaction", params.compaction.asMap())
                .add("compression", params.compression.asMap())
+               .add("memtable", params.memtable.asMap())
                .add("read_repair", params.readRepair.toString())
                .add("extensions", params.extensions);
 
@@ -974,6 +975,7 @@ static TableParams createTableParamsFromRow(UntypedResultSet.Row row)
                           .comment(row.getString("comment"))
                           .compaction(CompactionParams.fromMap(row.getFrozenTextMap("compaction")))
                           .compression(CompressionParams.fromMap(row.getFrozenTextMap("compression")))
+                          .memtable(MemtableParams.fromMap(row.getFrozenTextMap("memtable")))
                           .defaultTimeToLive(row.getInt("default_time_to_live"))
                           .extensions(row.getFrozenMap("extensions", UTF8Type.instance, BytesType.instance))
                           .gcGraceSeconds(row.getInt("gc_grace_seconds"))
diff --git a/src/java/org/apache/cassandra/schema/TableParams.java b/src/java/org/apache/cassandra/schema/TableParams.java
index 2be8a40c5a9f..3ca6e26eafd2 100644
--- a/src/java/org/apache/cassandra/schema/TableParams.java
+++ b/src/java/org/apache/cassandra/schema/TableParams.java
@@ -46,6 +46,7 @@ public enum Option
         COMMENT,
         COMPACTION,
         COMPRESSION,
+        MEMTABLE,
         DEFAULT_TIME_TO_LIVE,
         EXTENSIONS,
         GC_GRACE_SECONDS,
@@ -78,6 +79,7 @@ public String toString()
     public final CachingParams caching;
     public final CompactionParams compaction;
     public final CompressionParams compression;
+    public final MemtableParams memtable;
     public final ImmutableMap<String, ByteBuffer> extensions;
     public final boolean cdc;
     public final ReadRepairStrategy readRepair;
@@ -99,6 +101,7 @@ private TableParams(Builder builder)
         caching = builder.caching;
         compaction = builder.compaction;
         compression = builder.compression;
+        memtable = builder.memtable;
         extensions = builder.extensions;
         cdc = builder.cdc;
         readRepair = builder.readRepair;
@@ -116,6 +119,7 @@ public static Builder builder(TableParams params)
                             .comment(params.comment)
                             .compaction(params.compaction)
                             .compression(params.compression)
+                            .memtable(params.memtable)
                             .crcCheckChance(params.crcCheckChance)
                             .defaultTimeToLive(params.defaultTimeToLive)
                             .gcGraceSeconds(params.gcGraceSeconds)
@@ -178,6 +182,9 @@ public void validate()
 
         if (memtableFlushPeriodInMs < 0)
             fail("%s must be greater than or equal to 0 (got %s)", Option.MEMTABLE_FLUSH_PERIOD_IN_MS, memtableFlushPeriodInMs);
+
+        if (cdc && memtable.factory.writesShouldSkipCommitLog())
+            fail("CDC cannot work if writes skip the commit log. Check your memtable configuration.");
     }
 
     private static void fail(String format, Object... args)
@@ -208,6 +215,7 @@ public boolean equals(Object o)
             && caching.equals(p.caching)
             && compaction.equals(p.compaction)
             && compression.equals(p.compression)
+            && memtable.equals(p.memtable)
             && extensions.equals(p.extensions)
             && cdc == p.cdc
             && readRepair == p.readRepair;
@@ -228,6 +236,7 @@ public int hashCode()
                                 caching,
                                 compaction,
                                 compression,
+                                memtable,
                                 extensions,
                                 cdc,
                                 readRepair);
@@ -249,6 +258,7 @@ public String toString()
                           .add(Option.CACHING.toString(), caching)
                           .add(Option.COMPACTION.toString(), compaction)
                           .add(Option.COMPRESSION.toString(), compression)
+                          .add(Option.MEMTABLE.toString(), memtable)
                           .add(Option.EXTENSIONS.toString(), extensions)
                           .add(Option.CDC.toString(), cdc)
                           .add(Option.READ_REPAIR.toString(), readRepair)
@@ -272,6 +282,8 @@ public void appendCqlTo(CqlBuilder builder)
                .newLine()
                .append("AND compression = ").append(compression.asMap())
                .newLine()
+               .append("AND memtable = ").append(memtable.asMap())
+               .newLine()
                .append("AND crc_check_chance = ").append(crcCheckChance)
                .newLine()
                .append("AND default_time_to_live = ").append(defaultTimeToLive)
@@ -311,6 +323,7 @@ public static final class Builder
         private CachingParams caching = CachingParams.DEFAULT;
         private CompactionParams compaction = CompactionParams.DEFAULT;
         private CompressionParams compression = CompressionParams.DEFAULT;
+        private MemtableParams memtable = MemtableParams.DEFAULT;
         private ImmutableMap<String, ByteBuffer> extensions = ImmutableMap.of();
         private boolean cdc;
         private ReadRepairStrategy readRepair = ReadRepairStrategy.BLOCKING;
@@ -396,6 +409,12 @@ public Builder compaction(CompactionParams val)
             return this;
         }
 
+        public Builder memtable(MemtableParams val) // stores the memtable params on this builder
+        {
+            memtable = val;
+            return this; // returns the builder so calls can be chained
+        }
+
         public Builder compression(CompressionParams val)
         {
             compression = val;
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 9e1afa743b45..0d517b15e66d 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -3979,7 +3979,7 @@ public void forceKeyspaceFlush(String keyspaceName, String... tableNames) throws
         for (ColumnFamilyStore cfStore : getValidColumnFamilies(true, false, keyspaceName, tableNames))
         {
             logger.debug("Forcing flush on keyspace {}, CF {}", keyspaceName, cfStore.name);
-            cfStore.forceBlockingFlush();
+            cfStore.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
         }
     }
 
@@ -4934,7 +4934,7 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I
             for (Keyspace keyspace : Keyspace.nonSystem())
             {
                 for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
-                    flushes.add(cfs.forceFlush());
+                    flushes.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.SHUTDOWN));
             }
             // wait for the flushes.
             // TODO this is a godawful way to track progress, since they flush in parallel.  a long one could
@@ -4966,7 +4966,7 @@ protected synchronized void drain(boolean isFinalShutdown) throws IOException, I
             for (Keyspace keyspace : Keyspace.system())
             {
                 for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
-                    flushes.add(cfs.forceFlush());
+                    flushes.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.SHUTDOWN));
             }
             FBUtilities.waitOnFutures(flushes);
 
diff --git a/src/java/org/apache/cassandra/streaming/StreamSession.java b/src/java/org/apache/cassandra/streaming/StreamSession.java
index 3a32834719ac..46a528655ce2 100644
--- a/src/java/org/apache/cassandra/streaming/StreamSession.java
+++ b/src/java/org/apache/cassandra/streaming/StreamSession.java
@@ -27,6 +27,7 @@
 
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.locator.RangesAtEndpoint;
 
 import org.slf4j.Logger;
@@ -952,7 +953,7 @@ private void flushSSTables(Iterable<ColumnFamilyStore> stores)
     {
         List<Future<?>> flushes = new ArrayList<>();
         for (ColumnFamilyStore cfs : stores)
-            flushes.add(cfs.forceFlush());
+            flushes.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.STREAMING));
         FBUtilities.waitOnFutures(flushes);
     }
 
diff --git a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
index 6893fb0ac985..3ab9b7f878d0 100644
--- a/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
+++ b/src/java/org/apache/cassandra/utils/memory/EnsureOnHeap.java
@@ -32,6 +32,8 @@
 
 public abstract class EnsureOnHeap extends Transformation
 {
+    public static final EnsureOnHeap NOOP = new NoOp();
+
     public abstract DecoratedKey applyToPartitionKey(DecoratedKey key);
     public abstract UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition);
     public abstract SearchIterator<Clustering<?>, Row> applyToPartition(SearchIterator<Clustering<?>, Row> partition);
diff --git a/src/java/org/apache/cassandra/utils/memory/HeapPool.java b/src/java/org/apache/cassandra/utils/memory/HeapPool.java
index 6371bdaf7dac..4ae19abe2449 100644
--- a/src/java/org/apache/cassandra/utils/memory/HeapPool.java
+++ b/src/java/org/apache/cassandra/utils/memory/HeapPool.java
@@ -36,7 +36,6 @@ public MemtableAllocator newAllocator()
 
     private static class Allocator extends MemtableBufferAllocator
     {
-        private static final EnsureOnHeap ENSURE_NOOP = new EnsureOnHeap.NoOp();
         Allocator(HeapPool pool)
         {
             super(pool.onHeap.newAllocator(), pool.offHeap.newAllocator());
@@ -50,7 +49,7 @@ public ByteBuffer allocate(int size, OpOrder.Group opGroup)
 
         public EnsureOnHeap ensureOnHeap()
         {
-            return ENSURE_NOOP;
+            return EnsureOnHeap.NOOP;
         }
     }
 }
diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
index d772d5175886..95767965efe1 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
@@ -60,11 +60,11 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.SystemKeyspaceMigrator40;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.memtable.AbstractAllocatorMemtable;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.distributed.Cluster;
@@ -130,8 +130,8 @@
 import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.UUIDSerializer;
 import org.apache.cassandra.utils.concurrent.Ref;
-import org.apache.cassandra.utils.progress.jmx.JMXBroadcastExecutor;
 import org.apache.cassandra.utils.memory.BufferPools;
+import org.apache.cassandra.utils.progress.jmx.JMXBroadcastExecutor;
 
 import static java.util.concurrent.TimeUnit.MINUTES;
 import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
@@ -433,7 +433,7 @@ public String getReleaseVersionString()
 
     public void flush(String keyspace)
     {
-        runOnInstance(() -> FBUtilities.waitOnFutures(Keyspace.open(keyspace).flush()));
+        runOnInstance(() -> FBUtilities.waitOnFutures(Keyspace.open(keyspace).flush(ColumnFamilyStore.FlushReason.UNIT_TESTS)));
     }
 
     public void forceCompact(String keyspace, String table)
@@ -719,7 +719,7 @@ public Future<Void> shutdown(boolean graceful)
                                 () -> PendingRangeCalculatorService.instance.shutdownAndWait(1L, MINUTES),
                                 () -> BufferPools.shutdownLocalCleaner(1L, MINUTES),
                                 () -> Ref.shutdownReferenceReaper(1L, MINUTES),
-                                () -> Memtable.MEMORY_POOL.shutdownAndWait(1L, MINUTES),
+                                () -> AbstractAllocatorMemtable.MEMORY_POOL.shutdownAndWait(1L, MINUTES),
                                 () -> DiagnosticSnapshotService.instance.shutdownAndWait(1L, MINUTES),
                                 () -> ScheduledExecutors.shutdownAndWait(1L, MINUTES),
                                 () -> SSTableReader.shutdownBlocking(1L, MINUTES),
diff --git a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
index e6ccbc2b49d5..e6d8f83ec394 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
@@ -111,7 +111,7 @@ private static SerializableRunnable failingReaders(Verb type, RepairParallelism
         return () -> {
             String cfName = getCfName(type, parallelism, withTracing);
             ColumnFamilyStore cf = Keyspace.open(KEYSPACE).getColumnFamilyStore(cfName);
-            cf.forceBlockingFlush();
+            cf.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             Set<SSTableReader> remove = cf.getLiveSSTables();
             Set<SSTableReader> replace = new HashSet<>();
             if (type == Verb.VALIDATION_REQ)
diff --git a/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java b/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
index 6ee1b2d016dc..7ea82572ab04 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/JVMStabilityInspectorCorruptSSTableExceptionTest.java
@@ -159,7 +159,7 @@ private static void corruptTable(IInvokableInstance node, String keyspace, Strin
     {
         node.runOnInstance(() -> {
             ColumnFamilyStore cf = Keyspace.open(keyspace).getColumnFamilyStore(table);
-            cf.forceBlockingFlush();
+            cf.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             Set<SSTableReader> remove = cf.getLiveSSTables();
             Set<SSTableReader> replace = new HashSet<>();
diff --git a/test/long/org/apache/cassandra/cql3/ViewLongTest.java b/test/long/org/apache/cassandra/cql3/ViewLongTest.java
index de888d4df6bd..dccc089f291f 100644
--- a/test/long/org/apache/cassandra/cql3/ViewLongTest.java
+++ b/test/long/org/apache/cassandra/cql3/ViewLongTest.java
@@ -40,6 +40,7 @@
 import com.datastax.driver.core.exceptions.WriteTimeoutException;
 import org.apache.cassandra.batchlog.BatchlogManager;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.concurrent.SEPExecutor;
 import org.apache.cassandra.concurrent.Stage;
@@ -392,7 +393,7 @@ private void testExpiredLivenessInfoWithUnselectedColumnAndDefaultTTL(boolean fl
 
             updateViewWithFlush("UPDATE %s USING TTL 90  SET b = null WHERE k = 1 AND c = 2", flush);
             if (flush)
-                FBUtilities.waitOnFutures(Keyspace.open(keyspace()).flush());
+                FBUtilities.waitOnFutures(Keyspace.open(keyspace()).flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
             assertRows(execute("select k,c,a,b from %s"));
             assertRows(execute("select k,c,a from mv"));
 
@@ -437,6 +438,6 @@ private void updateViewWithFlush(String query, boolean flush, Object... params)
             Thread.sleep(1);
         }
         if (flush)
-            Keyspace.open(keyspace()).flush();
+            Keyspace.open(keyspace()).flush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 }
diff --git a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
index fe8cdc2fd92b..5bc4768be63e 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
@@ -165,7 +165,7 @@ public void testStandardColumnCompactions()
 
                 inserted.add(key);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
 
             assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyCQLTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyCQLTest.java
index 9bfa3808b886..fed145039d78 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyCQLTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyCQLTest.java
@@ -32,6 +32,7 @@
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.Hex;
 
@@ -70,7 +71,7 @@ public void stressTestCompactionStrategyManager() throws ExecutionException, Int
                             throw new RuntimeException(throwable);
                         }
                     }
-                    getCurrentColumnFamilyStore().forceBlockingFlush();
+                    getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                     Uninterruptibles.sleepUninterruptibly(r.nextInt(200), TimeUnit.MILLISECONDS);
                 }
             });
diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
index 3de764ad7db2..a72e75e4c423 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
@@ -164,7 +164,7 @@ public void testLeveledScanner() throws Exception
         }
 
         //Flush sstable
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         store.runWithCompactionsDisabled(new Callable<Void>()
         {
@@ -263,7 +263,7 @@ private void populateSSTables(ColumnFamilyStore store)
 
             Mutation rm = new Mutation(builder.build());
             rm.apply();
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 }
diff --git a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
index a58303da6032..ece90e017b4a 100644
--- a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
+++ b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
@@ -491,7 +491,7 @@ public void setup()
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
-                    cfs.forceBlockingFlush();
+                    cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                 }
 
                 Assert.assertEquals(numSSTable, cfs.getLiveSSTables().size());
@@ -606,7 +606,7 @@ public void setup()
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
-                    cfs.forceBlockingFlush();
+                    cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                 }
 
                 Assert.assertEquals(numSSTable, cfs.getLiveSSTables().size());
@@ -706,7 +706,7 @@ public void setup()
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
-                    cfs.forceBlockingFlush();
+                    cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                 }
 
                 Assert.assertEquals(numSSTable, cfs.getLiveSSTables().size());
diff --git a/test/microbench/org/apache/cassandra/test/microbench/CompactionBench.java b/test/microbench/org/apache/cassandra/test/microbench/CompactionBench.java
index 41220a2a655c..b2d011d391f3 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/CompactionBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/CompactionBench.java
@@ -70,13 +70,13 @@ public void setup() throws Throwable
             execute(writeStatement, i, i, i );
 
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
 
         System.err.println("Writing 50k again...");
         for (long i = 0; i < 50000; i++)
             execute(writeStatement, i, i, i );
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
 
         cfs.snapshot("originals");
 
diff --git a/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBenchmark.java
index 9e84e0ae9dce..53cc1a533bcf 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBenchmark.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/ZeroCopyStreamingBenchmark.java
@@ -202,7 +202,7 @@ private void generateData()
                 .build()
                 .applyUnsafe();
             }
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
             CompactionManager.instance.performMaximal(store, false);
         }
 
diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadTest.java b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadTest.java
index 789ca008a6a6..c39c0594dd46 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/instance/ReadTest.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/instance/ReadTest.java
@@ -19,10 +19,14 @@
 package org.apache.cassandra.test.microbench.instance;
 
 
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
-import java.util.stream.IntStream;
 
 import com.google.common.base.Throwables;
 
@@ -30,15 +34,15 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.utils.FBUtilities;
 import org.openjdk.jmh.annotations.*;
 
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
-@Warmup(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
-@Measurement(iterations = 15, time = 2, timeUnit = TimeUnit.SECONDS)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
 @Fork(value = 1)
 @Threads(1)
 @State(Scope.Benchmark)
@@ -63,64 +67,73 @@ public enum Flush
     @Param({"INMEM", "YES"})
     Flush flush = Flush.INMEM;
 
-    public enum Execution
-    {
-        SERIAL,
-        SERIAL_NET,
-        PARALLEL,
-        PARALLEL_NET,
-    }
+    @Param({""})
+    String memtableClass = "";
+
+    @Param({"false"})
+    boolean useNet = false;
 
-    @Param({"PARALLEL"})
-    Execution async = Execution.PARALLEL;
+    @Param({"1"})
+    int threadCount = 1;
+
+    ExecutorService executorService;
 
     @Setup(Level.Trial)
     public void setup() throws Throwable
     {
         rand = new Random(1);
+        executorService = Executors.newFixedThreadPool(threadCount);
         CQLTester.setUpClass();
         CQLTester.prepareServer();
+        DatabaseDescriptor.setAutoSnapshot(false);
         System.err.println("setupClass done.");
-        keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
-        table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid)) with compression = {'enabled': false}");
-        execute("use "+keyspace+";");
-        switch (async)
+        String memtableSetup = "";
+        if (!memtableClass.isEmpty())
+            memtableSetup = String.format(" AND memtable = { 'class': '%s' }", memtableClass);
+        keyspace = createKeyspace(
+        "CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
+        table = createTable(keyspace,
+                            "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid)) with compression = {'enabled': false}" +
+                            memtableSetup);
+        execute("use " + keyspace + ";");
+        if (useNet)
         {
-            case SERIAL_NET:
-            case PARALLEL_NET:
-                CQLTester.requireNetwork();
-                executeNet(getDefaultVersion(), "use " + keyspace + ";");
+            CQLTester.requireNetwork();
+            executeNet(getDefaultVersion(), "use " + keyspace + ";");
         }
-        String writeStatement = "INSERT INTO "+table+"(userid,picid,commentid)VALUES(?,?,?)";
-        System.err.println("Prepared, batch " + BATCH + " flush " + flush);
-        System.err.println("Disk access mode " + DatabaseDescriptor.getDiskAccessMode() + " index " + DatabaseDescriptor.getIndexAccessMode());
+        String writeStatement = "INSERT INTO " + table + "(userid,picid,commentid)VALUES(?,?,?)";
+        System.err.println("Prepared, batch " + BATCH + " threads " + threadCount + " flush " + flush);
+        System.err.println("Disk access mode " + DatabaseDescriptor.getDiskAccessMode() +
+                           " index " + DatabaseDescriptor.getIndexAccessMode());
 
         cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
         cfs.disableAutoCompaction();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
 
         //Warm up
+        long writeStart = System.currentTimeMillis();
         System.err.println("Writing " + count);
         long i;
         for (i = 0; i <= count - BATCH; i += BATCH)
             performWrite(writeStatement, i, BATCH);
         if (i < count)
             performWrite(writeStatement, i, count - i);
+        long writeLength = System.currentTimeMillis() - writeStart;
+        System.err.format("... done in %.3f s.\n", writeLength / 1000.0);
 
         Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
-        System.err.format("Memtable in %s mode: %d ops, %s serialized bytes, %s (%.0f%%) on heap, %s (%.0f%%) off-heap\n",
+        Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
+        System.err.format("%s in %s mode: %d ops, %s serialized bytes, %s\n",
+                          memtable.getClass().getSimpleName(),
                           DatabaseDescriptor.getMemtableAllocationType(),
                           memtable.getOperations(),
                           FBUtilities.prettyPrintMemory(memtable.getLiveDataSize()),
-                          FBUtilities.prettyPrintMemory(memtable.getAllocator().onHeap().owns()),
-                          100 * memtable.getAllocator().onHeap().ownershipRatio(),
-                          FBUtilities.prettyPrintMemory(memtable.getAllocator().offHeap().owns()),
-                          100 * memtable.getAllocator().offHeap().ownershipRatio());
+                          usage);
 
         switch (flush)
         {
         case YES:
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
             break;
         case INMEM:
             if (!cfs.getLiveSSTables().isEmpty())
@@ -140,11 +153,43 @@ public void setup() throws Throwable
     abstract Object[] writeArguments(long i);
 
     public void performWrite(String writeStatement, long ofs, long count) throws Throwable
+    {
+        if (threadCount == 1) // dispatch on the configured thread count
+            performWriteSerial(writeStatement, ofs, count); // writes inline on the calling thread
+        else
+            performWriteThreads(writeStatement, ofs, count); // one executorService task per write
+    }
+
+    public void performWriteSerial(String writeStatement, long ofs, long count) throws Throwable // runs writeStatement once per index in [ofs, ofs + count)
     {
         for (long i = ofs; i < ofs + count; ++i)
             execute(writeStatement, writeArguments(i));
     }
 
+    public void performWriteThreads(String writeStatement, long ofs, long count) throws Throwable // same index range as performWriteSerial, one executor task per index
+    {
+        List<Future<Integer>> futures = new ArrayList<>(); // one future per submitted write
+        for (long i = 0; i < count; ++i)
+        {
+            long pos = ofs + i; // effectively-final copy so the lambda can capture it
+            futures.add(executorService.submit(() ->
+                                               {
+                                                   try
+                                                   {
+                                                       execute(writeStatement, writeArguments(pos));
+                                                       return 1; // each completed write contributes 1 to the tally below
+                                                   }
+                                                   catch (Throwable throwable)
+                                                   {
+                                                       throw Throwables.propagate(throwable); // rethrown unchecked; reaches the caller through Future.get()
+                                                   }
+                                               }));
+        }
+        long done = 0;
+        for (Future<Integer> f : futures)
+            done += f.get(); // blocks until the task finishes; a failed task throws ExecutionException here
+        assert count == done; // sanity check, only evaluated when assertions are enabled
+    }
 
     @TearDown(Level.Trial)
     public void teardown() throws InterruptedException
@@ -152,8 +197,11 @@ public void teardown() throws InterruptedException
         if (flush == Flush.INMEM && !cfs.getLiveSSTables().isEmpty())
             throw new AssertionError("SSTables created for INMEM test.");
 
+        executorService.shutdown();
+        executorService.awaitTermination(15, TimeUnit.SECONDS);
+
         // do a flush to print sizes
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
 
         CommitLog.instance.shutdownBlocking();
         CQLTester.tearDownClass();
@@ -170,20 +218,25 @@ public Object performReadSerial(String readStatement, Supplier<Object[]> supplie
 
     public Object performReadThreads(String readStatement, Supplier<Object[]> supplier) throws Throwable
     {
-        return IntStream.range(0, BATCH)
-                        .parallel()
-                        .mapToLong(i ->
-                                   {
-                                       try
-                                       {
-                                           return execute(readStatement, supplier.get()).size();
-                                       }
-                                       catch (Throwable throwable)
-                                       {
-                                           throw Throwables.propagate(throwable);
-                                       }
-                                   })
-                        .sum();
+        List<Future<Integer>> futures = new ArrayList<>(); // one future per submitted read
+        for (long i = 0; i < BATCH; ++i) // submits BATCH reads to executorService
+        {
+            futures.add(executorService.submit(() ->
+                                               {
+                                                   try
+                                                   {
+                                                       return execute(readStatement, supplier.get()).size(); // NOTE(review): supplier.get() runs on an executor thread - confirm suppliers are thread-safe
+                                                   }
+                                                   catch (Throwable throwable)
+                                                   {
+                                                       throw Throwables.propagate(throwable); // rethrown unchecked; reaches the caller through Future.get()
+                                                   }
+                                               }));
+        }
+        long done = 0;
+        for (Future<Integer> f : futures)
+            done += f.get(); // waits for each read and adds its result size
+        return done; // sum of size() over all reads in the batch
     }
 
     public Object performReadSerialNet(String readStatement, Supplier<Object[]> supplier) throws Throwable
@@ -197,37 +250,44 @@ public Object performReadSerialNet(String readStatement, Supplier<Object[]> supp
 
     public long performReadThreadsNet(String readStatement, Supplier<Object[]> supplier) throws Throwable
     {
-        return IntStream.range(0, BATCH)
-                        .parallel()
-                        .mapToLong(i ->
-                                   {
-                                       try
-                                       {
-                                           return executeNet(getDefaultVersion(), readStatement, supplier.get())
-                                                          .getAvailableWithoutFetching();
-                                       }
-                                       catch (Throwable throwable)
-                                       {
-                                           throw Throwables.propagate(throwable);
-                                       }
-                                   })
-                        .sum();
+        List<Future<Integer>> futures = new ArrayList<>(); // one future per submitted read
+        for (long i = 0; i < BATCH; ++i) // submits BATCH reads to executorService
+        {
+            futures.add(executorService.submit(() ->
+                                               {
+                                                   try
+                                                   {
+                                                       return executeNet(getDefaultVersion(), readStatement, supplier.get()) // NOTE(review): supplier.get() runs on an executor thread - confirm suppliers are thread-safe
+                                                              .getAvailableWithoutFetching();
+                                                   }
+                                                   catch (Throwable throwable)
+                                                   {
+                                                       throw Throwables.propagate(throwable); // rethrown unchecked; reaches the caller through Future.get()
+                                                   }
+                                               }));
+        }
+        long done = 0;
+        for (Future<Integer> f : futures)
+            done += f.get(); // waits for each read and adds its getAvailableWithoutFetching() value
+        return done; // sum over all reads in the batch
     }
 
 
     public Object performRead(String readStatement, Supplier<Object[]> supplier) throws Throwable
     {
-        switch (async)
+        if (useNet) // useNet selects the *Net read variants
         {
-            case SERIAL:
-                return performReadSerial(readStatement, supplier);
-            case SERIAL_NET:
+            if (threadCount == 1) // a single thread uses the serial variant
                 return performReadSerialNet(readStatement, supplier);
-            case PARALLEL:
-                return performReadThreads(readStatement, supplier);
-            case PARALLEL_NET:
+            else
                 return performReadThreadsNet(readStatement, supplier);
         }
-        return null;
+        else
+        {
+            if (threadCount == 1) // a single thread uses the serial variant
+                return performReadSerial(readStatement, supplier);
+            else
+                return performReadThreads(readStatement, supplier); // multi-threaded reads go through executorService
+        }
     }
 }
diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
new file mode 100644
index 000000000000..e7a98d882e29
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
@@ -0,0 +1,250 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.instance;
+
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Random;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.base.Throwables;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.utils.FBUtilities;
+import org.openjdk.jmh.annotations.*;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS)
+@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS)
+@Fork(value = 1)
+@Threads(1)
+@State(Scope.Benchmark)
+public class WriteTest extends CQLTester
+{
+    static String keyspace;
+    String table;
+    ColumnFamilyStore cfs;
+    Random rand;
+
+    @Param({"1000"})
+    int BATCH = 1_000;
+
+    public enum EndOp
+    {
+        INMEM, TRUNCATE, FLUSH
+    }
+
+    @Param({"1000000"})
+    int count = 1_000_000;
+
+    @Param({"INMEM", "TRUNCATE", "FLUSH"})
+    EndOp flush = EndOp.INMEM;
+
+    @Param({""})
+    String memtableClass = "";
+
+    @Param({"false"})
+    boolean useNet = false;
+
+    String writeStatement;
+
+    @Param({"1"})
+    int threadCount = 1;
+
+    ExecutorService executorService;
+
+    @Setup(Level.Trial)
+    public void setup() throws Throwable
+    {
+        rand = new Random(1);
+        executorService = Executors.newFixedThreadPool(threadCount);
+        CQLTester.setUpClass();
+        CQLTester.prepareServer();
+        DatabaseDescriptor.setAutoSnapshot(false);
+        System.err.println("setupClass done.");
+        String memtableSetup = "";
+        if (!memtableClass.isEmpty())
+            memtableSetup = String.format(" AND memtable = { 'class': '%s' }", memtableClass);
+        keyspace = createKeyspace(
+        "CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
+        table = createTable(keyspace,
+                            "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid)) with compression = {'enabled': false}" +
+                            memtableSetup);
+        execute("use " + keyspace + ";");
+        if (useNet)
+        {
+            CQLTester.requireNetwork();
+            executeNet(getDefaultVersion(), "use " + keyspace + ";");
+        }
+        writeStatement = "INSERT INTO " + table + "(userid,picid,commentid)VALUES(?,?,?)";
+        System.err.println("Prepared, batch " + BATCH + " threads " + threadCount + " flush " + flush);
+        System.err.println("Disk access mode " + DatabaseDescriptor.getDiskAccessMode() +
+                           " index " + DatabaseDescriptor.getIndexAccessMode());
+
+        cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+        cfs.disableAutoCompaction();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
+    }
+
+    @Benchmark
+    public void writeTable() throws Throwable
+    {
+        long i;
+        for (i = 0; i <= count - BATCH; i += BATCH)
+            performWrite(i, BATCH);
+        if (i < count)
+            performWrite(i, Math.toIntExact(count - i));
+
+        switch (flush)
+        {
+        case FLUSH:
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
+            // if we flush we also must truncate to avoid accumulating sstables
+        case TRUNCATE:
+            execute("TRUNCATE TABLE " + table);
+            // note: we turn snapshotting and durable writes (which would have caused a flush) off for this benchmark
+            break;
+        case INMEM:
+            if (!cfs.getLiveSSTables().isEmpty())
+                throw new AssertionError("SSTables created for INMEM test.");
+            // leave unflushed, i.e. next iteration will overwrite data
+        default:
+        }
+    }
+
+    public Object[] writeArguments(long i)
+    {
+        return new Object[] { i, i, i };
+    }
+
+    public void performWrite(long ofs, long count) throws Throwable
+    {
+        if (useNet)
+        {
+            if (threadCount == 1)
+                performWriteSerialNet(ofs, count);
+            else
+                performWriteThreadsNet(ofs, count);
+        }
+        else
+        {
+            if (threadCount == 1)
+                performWriteSerial(ofs, count);
+            else
+                performWriteThreads(ofs, count);
+        }
+    }
+
+    public void performWriteSerial(long ofs, long count) throws Throwable
+    {
+        for (long i = ofs; i < ofs + count; ++i)
+            execute(writeStatement, writeArguments(i));
+    }
+
+    public void performWriteThreads(long ofs, long count) throws Throwable
+    {
+        List<Future<Integer>> futures = new ArrayList<>();
+        for (long i = 0; i < count; ++i)
+        {
+            long pos = ofs + i;
+            futures.add(executorService.submit(() ->
+            {
+                try
+                {
+                    execute(writeStatement, writeArguments(pos));
+                    return 1;
+                }
+                catch (Throwable throwable)
+                {
+                    throw Throwables.propagate(throwable);
+                }
+            }));
+        }
+        long done = 0;
+        for (Future<Integer> f : futures)
+            done += f.get();
+        assert count == done;
+    }
+
+    public void performWriteSerialNet(long ofs, long count) throws Throwable
+    {
+        for (long i = ofs; i < ofs + count; ++i)
+            sessionNet().execute(writeStatement, writeArguments(i));
+    }
+
+    public void performWriteThreadsNet(long ofs, long count) throws Throwable
+    {
+        List<Future<Integer>> futures = new ArrayList<>();
+        for (long i = 0; i < count; ++i)
+        {
+            long pos = ofs + i;
+            futures.add(executorService.submit(() ->
+                                               {
+                                                   try
+                                                   {
+                                                       sessionNet().execute(writeStatement, writeArguments(pos));
+                                                       return 1;
+                                                   }
+                                                   catch (Throwable throwable)
+                                                   {
+                                                       throw Throwables.propagate(throwable);
+                                                   }
+                                               }));
+        }
+        long done = 0;
+        for (Future<Integer> f : futures)
+            done += f.get();
+        assert count == done;
+    }
+
+    @TearDown(Level.Trial)
+    public void teardown() throws InterruptedException
+    {
+        executorService.shutdown();
+        executorService.awaitTermination(15, TimeUnit.SECONDS);
+        Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
+        Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
+        System.err.format("\n%s in %s mode: %d ops, %s serialized bytes, %s\n",
+                          memtable.getClass().getSimpleName(),
+                          DatabaseDescriptor.getMemtableAllocationType(),
+                          memtable.getOperations(),
+                          FBUtilities.prettyPrintMemory(memtable.getLiveDataSize()),
+                          usage);
+
+        if (flush == EndOp.INMEM && !cfs.getLiveSSTables().isEmpty())
+            throw new AssertionError("SSTables created for INMEM test.");
+
+        // do a flush to print sizes
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.USER_FORCED);
+
+        CommitLog.instance.shutdownBlocking();
+        CQLTester.tearDownClass();
+        CQLTester.cleanup();
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
index 8b99ac95eb07..71600812dfac 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
@@ -44,7 +44,7 @@ public class MemtableTrieWriteBench
     @Param({"8"})
     int keyLength = 8;
 
-    final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> x;
+    final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> y;
 
     @Benchmark
     public void putSequential() throws MemtableTrie.SpaceExhaustedException
diff --git a/test/unit/org/apache/cassandra/ServerTestUtils.java b/test/unit/org/apache/cassandra/ServerTestUtils.java
index 221a23a526eb..56afb8413156 100644
--- a/test/unit/org/apache/cassandra/ServerTestUtils.java
+++ b/test/unit/org/apache/cassandra/ServerTestUtils.java
@@ -173,7 +173,8 @@ private static void cleanupDirectory(File directory)
 
     private static void cleanupDirectory(String dirName)
     {
-        cleanupDirectory(new File(dirName));
+        if (dirName != null)
+            cleanupDirectory(new File(dirName));
     }
 
     /**
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index c26b339dc5b7..c103f4b62e03 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -200,7 +200,7 @@ public static ColumnFamilyStore writeColumnFamily(List<Mutation> mutations)
             rm.applyUnsafe();
 
         ColumnFamilyStore store = Keyspace.open(keyspaceName).getColumnFamilyStore(tableId);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         return store;
     }
 
diff --git a/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java
index b86b0d34dfa6..237ff93ac81c 100644
--- a/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java
+++ b/test/unit/org/apache/cassandra/batchlog/BatchlogManagerTest.java
@@ -153,7 +153,7 @@ public void testReplay() throws Exception
         }
 
         // Flush the batchlog to disk (see CASSANDRA-6822).
-        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(100, BatchlogManager.instance.countAllBatches() - initialAllBatches);
         assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
@@ -237,7 +237,7 @@ public void testTruncatedReplay() throws InterruptedException, ExecutionExceptio
         }
 
         // Flush the batchlog to disk (see CASSANDRA-6822).
-        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Force batchlog replay and wait for it to complete.
         BatchlogManager.instance.startBatchlogReplay().get();
@@ -365,7 +365,7 @@ public void testReplayWithNoPeers() throws Exception
         assertEquals(1, BatchlogManager.instance.countAllBatches() - initialAllBatches);
 
         // Flush the batchlog to disk (see CASSANDRA-6822).
-        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush();
+        Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStore(SystemKeyspace.BATCHES).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, BatchlogManager.instance.countAllBatches() - initialAllBatches);
         assertEquals(0, BatchlogManager.instance.getTotalBatchesReplayed() - initialReplayedBatches);
diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
index baec51a58ed0..d07837788f85 100644
--- a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
+++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
@@ -75,7 +75,7 @@ private static void doTestSerializeAndLoadKeyCache() throws Exception
             RowUpdateBuilder rowBuilder = new RowUpdateBuilder(cfs.metadata(), System.currentTimeMillis(), "key1");
             rowBuilder.add(colDef, "val1");
             rowBuilder.build().apply();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         Assert.assertEquals(2, cfs.getLiveSSTables().size());
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 00e5218a08a8..27df5bd298bd 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -559,7 +559,7 @@ public void flush(String keyspace, String table)
     {
         ColumnFamilyStore store = getColumnFamilyStore(keyspace, table);
         if (store != null)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     public void disableCompaction(String keyspace)
diff --git a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
index 144c3f9375ba..62762d36904f 100644
--- a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
@@ -407,11 +407,11 @@ public void testLocalDeletionTime() throws Throwable
         createTable("create table %s (k int, c1 int, primary key (k, c1)) with compaction = {'class': 'SizeTieredCompactionStrategy', 'provide_overlapping_tombstones':'row'}");
         execute("delete from %s where k = 1");
         Set<SSTableReader> readers = new HashSet<>(getCurrentColumnFamilyStore().getLiveSSTables());
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         SSTableReader oldSSTable = getNewTable(readers);
         Thread.sleep(2000);
         execute("delete from %s where k = 1");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         SSTableReader newTable = getNewTable(readers);
 
         CompactionManager.instance.forceUserDefinedCompaction(oldSSTable.getFilename());
diff --git a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
index 122f97a339fa..98f15b3db8df 100644
--- a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
+++ b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
@@ -30,6 +30,7 @@
 
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.index.Index;
@@ -543,7 +544,7 @@ private void insertData(String table, String index, boolean withClustering) thro
 
             if (i % 10 == 9)
             {
-                Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush().get();
+                Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS).get();
                 if (index != null)
                     triggerBlockingFlush(Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).indexManager.getIndexByName(index));
             }
@@ -553,7 +554,7 @@ private void insertData(String table, String index, boolean withClustering) thro
     private static void prepareTable(String table) throws IOException, InterruptedException, java.util.concurrent.ExecutionException
     {
         StorageService.instance.disableAutoCompaction(KEYSPACE_PER_TEST, table);
-        Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush().get();
+        Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS).get();
         Keyspace.open(KEYSPACE_PER_TEST).getColumnFamilyStore(table).truncateBlocking();
     }
 
diff --git a/test/unit/org/apache/cassandra/cql3/MemtableQuickTest.java b/test/unit/org/apache/cassandra/cql3/MemtableQuickTest.java
new file mode 100644
index 000000000000..f2bf4434255f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/MemtableQuickTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.ObjectSizes;
+
+@RunWith(Parameterized.class)
+public class MemtableQuickTest extends CQLTester
+{
+    static String keyspace;
+    String table;
+    ColumnFamilyStore cfs;
+
+    int partitions = 50_000;
+    int rowsPerPartition = 4;
+
+    int deletedPartitionsStart = 20_000;
+    int deletedPartitionsEnd = deletedPartitionsStart + 10_000;
+
+    int deletedRowsStart = 40_000;
+    int deletedRowsEnd = deletedRowsStart + 5_000;
+
+    @Parameterized.Parameter()
+    public String memtableClass;
+
+    @Parameterized.Parameters(name = "{0}")
+    public static List<Object> parameters()
+    {
+        return ImmutableList.of("SkipListMemtable",
+                                "TrieMemtable",
+                                "PersistentMemoryMemtable");
+    }
+
+    @BeforeClass
+    public static void setUp()
+    {
+        CQLTester.setUpClass();
+        CQLTester.prepareServer();
+        CQLTester.disablePreparedReuseForTest();
+        System.err.println("setupClass done.");
+    }
+
+    @Test
+    public void testMemtable() throws Throwable
+    {
+        keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
+        table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid))" +
+                                      " with compression = {'enabled': false}" +
+                                      " and memtable = { 'class': '" + memtableClass + "'}");
+        execute("use " + keyspace + ';');
+
+        String writeStatement = "INSERT INTO "+table+"(userid,picid,commentid)VALUES(?,?,?)";
+
+        cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+        cfs.disableAutoCompaction();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        long i;
+        long limit = partitions;
+        System.out.println("Writing " + partitions + " partitions of " + rowsPerPartition + " rows");
+        for (i = 0; i < limit; ++i)
+        {
+            for (long j = 0; j < rowsPerPartition; ++j)
+                execute(writeStatement, i, j, i + j);
+        }
+
+        System.out.println("Deleting partitions between " + deletedPartitionsStart + " and " + deletedPartitionsEnd);
+        for (i = deletedPartitionsStart; i < deletedPartitionsEnd; ++i)
+        {
+            // the partition was written above; this delete adds a partition tombstone over it
+            execute("DELETE FROM " + table + " WHERE userid = ?", i);
+        }
+
+        System.out.println("Deleting rows between " + deletedRowsStart + " and " + deletedRowsEnd);
+        for (i = deletedRowsStart; i < deletedRowsEnd; ++i)
+        {
+            // the row with picid 0 was written above; this delete adds a row tombstone over it
+            execute("DELETE FROM " + table + " WHERE userid = ? AND picid = ?", i, 0L);
+        }
+
+        System.out.println("Reading " + partitions + " partitions");
+        for (i = 0; i < limit; ++i)
+        {
+            UntypedResultSet result = execute("SELECT * FROM " + table + " WHERE userid = ?", i);
+            if (i >= deletedPartitionsStart && i < deletedPartitionsEnd)
+                assertEmpty(result);
+            else
+            {
+                int start = 0;
+                if (i >= deletedRowsStart && i < deletedRowsEnd)
+                    start = 1;
+                Object[][] rows = new Object[rowsPerPartition - start][];
+                for (long j = start; j < rowsPerPartition; ++j)
+                    rows[(int) (j - start)] = row(i, j, i + j);
+                assertRows(result, rows);
+            }
+        }
+
+
+        int deletedPartitions = deletedPartitionsEnd - deletedPartitionsStart;
+        int deletedRows = deletedRowsEnd - deletedRowsStart;
+        System.out.println("Selecting *");
+        UntypedResultSet result = execute("SELECT * FROM " + table);
+        assertRowCount(result, rowsPerPartition * (partitions - deletedPartitions) - deletedRows);
+
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        System.out.println("Selecting *");
+        result = execute("SELECT * FROM " + table);
+        assertRowCount(result, rowsPerPartition * (partitions - deletedPartitions) - deletedRows);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java b/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java
index 7d6c6dd3040a..80176543d571 100644
--- a/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java
@@ -18,19 +18,25 @@
 
 package org.apache.cassandra.cql3;
 
+import java.util.List;
+
 import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableList;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.Memtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
 
+@RunWith(Parameterized.class)
 public class MemtableSizeTest extends CQLTester
 {
     static String keyspace;
@@ -43,6 +49,16 @@ public class MemtableSizeTest extends CQLTester
     int deletedPartitions = 10_000;
     int deletedRows = 5_000;
 
+    @Parameterized.Parameter()
+    public String memtableClass;
+
+    @Parameterized.Parameters(name = "{0}")
+    public static List<Object> parameters()
+    {
+        return ImmutableList.of("SkipListMemtable",
+                                "TrieMemtable");
+    }
+
     // must be within 50 bytes per partition of the actual size
     final int MAX_DIFFERENCE = (partitions + deletedPartitions + deletedRows) * 50;
 
@@ -66,14 +82,16 @@ private void testSize()
         try
         {
             keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
-            table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid)) with compression = {'enabled': false}");
+            table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid))" +
+                                      " with compression = {'enabled': false}" +
+                                      " and memtable = { 'class': '" + memtableClass + "'}");
             execute("use " + keyspace + ';');
 
             String writeStatement = "INSERT INTO " + table + "(userid,picid,commentid)VALUES(?,?,?)";
 
             cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
             cfs.disableAutoCompaction();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             long deepSizeBefore = ObjectSizes.measureDeep(cfs.getTracker().getView().getCurrentMemtable());
             System.out.printf("Memtable deep size before %s\n%n",
@@ -108,15 +126,13 @@ private void testSize()
                 System.out.println("Warning: " + cfs.getLiveSSTables().size() + " sstables created.");
 
             Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
-            long actualHeap = memtable.getAllocator().onHeap().owns();
-            System.out.printf("Memtable in %s mode: %d ops, %s serialized bytes, %s (%.0f%%) on heap, %s (%.0f%%) off-heap%n",
+            Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
+        long actualHeap = usage.ownsOnHeap;
+            System.out.printf("Memtable in %s mode: %d ops, %s serialized bytes, %s %n",
                               DatabaseDescriptor.getMemtableAllocationType(),
                               memtable.getOperations(),
                               FBUtilities.prettyPrintMemory(memtable.getLiveDataSize()),
-                              FBUtilities.prettyPrintMemory(actualHeap),
-                              100 * memtable.getAllocator().onHeap().ownershipRatio(),
-                              FBUtilities.prettyPrintMemory(memtable.getAllocator().offHeap().owns()),
-                              100 * memtable.getAllocator().offHeap().ownershipRatio());
+                              usage);
 
             long deepSizeAfter = ObjectSizes.measureDeep(memtable);
             System.out.printf("Memtable deep size %s\n%n",
diff --git a/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java b/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java
index b4fe0f5fd2f3..41fb6e814a84 100644
--- a/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java
+++ b/test/unit/org/apache/cassandra/cql3/OutOfSpaceTest.java
@@ -28,6 +28,7 @@
 import org.apache.cassandra.Util;
 import org.apache.cassandra.config.Config.DiskFailurePolicy;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogSegment;
 import org.apache.cassandra.db.Keyspace;
@@ -115,7 +116,10 @@ public void flushAndExpectError() throws InterruptedException, ExecutionExceptio
     {
         try
         {
-            Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable()).forceFlush().get();
+            Keyspace.open(KEYSPACE)
+                    .getColumnFamilyStore(currentTable())
+                    .forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS)
+                    .get();
             fail("FSWriteError expected.");
         }
         catch (ExecutionException e)
diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
index e130378d4fe7..e39acdb176dd 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
@@ -42,6 +42,7 @@
 import static org.apache.cassandra.cql3.ViewComplexTest.createView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
 import static org.junit.Assert.assertEquals;
 
 /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
@@ -103,17 +104,17 @@ public void testPartialDeleteUnselectedColumn() throws Throwable
 
         updateView("UPDATE %s USING TIMESTAMP 10 SET b=1 WHERE k=1 AND c=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRows(execute("SELECT * from %s"), row(1, 1, null, 1));
         assertRows(execute("SELECT * from mv"), row(1, 1));
         updateView("DELETE b FROM %s USING TIMESTAMP 11 WHERE k=1 AND c=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertEmpty(execute("SELECT * from %s"));
         assertEmpty(execute("SELECT * from mv"));
         updateView("UPDATE %s USING TIMESTAMP 1 SET a=1 WHERE k=1 AND c=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRows(execute("SELECT * from %s"), row(1, 1, 1, null));
         assertRows(execute("SELECT * from mv"), row(1, 1));
 
@@ -255,27 +256,27 @@ public void testRangeDeletion(boolean flush) throws Throwable
 
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 0", 1, 1, 1, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
 
         // remove view row
         updateView("UPDATE %s using timestamp 1 set b = null WHERE a=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
         // remove base row, no view updated generated.
         updateView("DELETE FROM %s using timestamp 2 where a=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
 
         // restor view row with b,c column. d is still tombstone
         updateView("UPDATE %s using timestamp 3 set b = 1,c = 1 where a=1", version, this); // upsert
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, null));
     }
@@ -314,35 +315,35 @@ private void testCommutativeRowDeletion(boolean flush) throws Throwable
         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
         // sstable-2
         updateView("Delete from %s using timestamp 2 where p = 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"));
         // sstable-3
         updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
         // sstable-4
         updateView("UPdate %s using timestamp 4 set v1 = 2 where p = 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
         // sstable-5
         updateView("UPdate %s using timestamp 5 set v1 = 1 where p = 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
 
@@ -396,7 +397,7 @@ private void complexTimestampWithbasePKColumnsInViewPKDeletionTest(boolean flush
         updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
                                 row(3, 4, 1L));
@@ -404,14 +405,14 @@ private void complexTimestampWithbasePKColumnsInViewPKDeletionTest(boolean flush
         updateView("Delete from %s using timestamp 2 where p1 = 1 and p2 = 2;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // view are empty
         assertRowsIgnoringOrder(execute("SELECT * from mv2"));
         // insert PK with TS=3
         updateView("Insert into %s (p1, p2) values (1, 2) using timestamp 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // deleted column in MV remained dead
         assertRowsIgnoringOrder(execute("SELECT * from mv2"), row(2, 1, null, null));
 
@@ -421,21 +422,21 @@ private void complexTimestampWithbasePKColumnsInViewPKDeletionTest(boolean flush
         // reset values
         updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 10;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
                                 row(3, 4, 10L));
 
         updateView("UPDATE %s using timestamp 20 SET v2 = 5 WHERE p1 = 1 and p2 = 2", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
                                 row(3, 5, 20L));
 
         updateView("DELETE FROM %s using timestamp 10 WHERE p1 = 1 and p2 = 2", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
                                 row(null, 5, 20L));
@@ -460,21 +461,21 @@ public void complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(boolean flu
         updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(5, 1L));
         // remove row/mv TS=2
         updateView("Delete from %s using timestamp 2 where p = 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // view are empty
         assertRowsIgnoringOrder(execute("SELECT * from mv"));
         // insert PK with TS=3
         updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // deleted column in MV remained dead
         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
 
@@ -482,7 +483,7 @@ public void complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(boolean flu
         updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 2;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // deleted column in MV remained dead
         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
         assertRowsIgnoringOrder(execute("SELECT * from mv limit 1"), row(1, 3, null));
@@ -491,7 +492,7 @@ public void complexTimestampWithbaseNonPKColumnsInViewPKDeletionTest(boolean flu
         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET v2 = ? WHERE p = ?", 4, 3);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 4, 3L));
 
diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
index 6999bef2d661..db9c2051cc0b 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
@@ -38,6 +38,7 @@
 import static org.apache.cassandra.cql3.ViewComplexTest.createView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
 import static org.junit.Assert.assertEquals;
 
 /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
@@ -181,8 +182,8 @@ private void testExpiredLivenessLimit(boolean flush) throws Throwable
         }
         if (flush)
         {
-            ks.getColumnFamilyStore("mv1").forceBlockingFlush();
-            ks.getColumnFamilyStore("mv2").forceBlockingFlush();
+            ks.getColumnFamilyStore("mv1").forceBlockingFlush(UNIT_TESTS);
+            ks.getColumnFamilyStore("mv2").forceBlockingFlush(UNIT_TESTS);
         }
 
         for (String view : Arrays.asList("mv1", "mv2"))
@@ -224,7 +225,7 @@ public void testStrictLivenessTombstone() throws Throwable
         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
 
         updateView("Update %s set v1 = null WHERE p = 1", version, this);
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"));
 
         cfs.forceMajorCompaction(); // before gc grace second, strict-liveness tombstoned dead row remains
@@ -237,7 +238,7 @@ public void testStrictLivenessTombstone() throws Throwable
         assertEquals(0, cfs.getLiveSSTables().size());
 
         updateView("Update %s using ttl 5 set v1 = 1 WHERE p = 1", version, this);
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
 
         cfs.forceMajorCompaction(); // before ttl+gc_grace_second, strict-liveness ttled dead row remains
diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
index 76a8933fb71f..6eb16ef29e5e 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
@@ -36,6 +36,7 @@
 import static org.apache.cassandra.cql3.ViewComplexTest.createView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
 import static org.junit.Assert.assertTrue;
 
 /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
@@ -111,7 +112,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("UPDATE %s SET a = 1 WHERE k = 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"), row(1, 1, null));
         assertRows(execute("SELECT * from mv"), row(1, 1, null));
@@ -119,7 +120,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("DELETE a FROM %s WHERE k = 1", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"));
         assertEmpty(execute("SELECT * from mv"));
@@ -127,7 +128,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("INSERT INTO %s (k) VALUES (1);", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"), row(1, null, null));
         assertEmpty(execute("SELECT * from mv"));
@@ -135,7 +136,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("UPDATE %s USING TTL 5 SET a = 10 WHERE k = 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"), row(1, 10, null));
         assertRows(execute("SELECT * from mv"), row(10, 1, null));
@@ -143,7 +144,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("UPDATE %s SET b = 100 WHERE k = 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"), row(1, 10, 100));
         assertRows(execute("SELECT * from mv"), row(10, 1, 100));
@@ -158,7 +159,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("DELETE b FROM %s WHERE k=1", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRows(execute("SELECT * from %s"), row(1, null, null));
         assertEmpty(execute("SELECT * from mv"));
@@ -166,7 +167,7 @@ private void testUpdateColumnInViewPKWithTTL(boolean flush) throws Throwable
         updateView("DELETE FROM %s WHERE k=1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertEmpty(execute("SELECT * from %s"));
         assertEmpty(execute("SELECT * from mv"));
@@ -250,11 +251,11 @@ public void testBaseTTLWithSameTimestampTest() throws Throwable
 
         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) using timestamp 1;", version, this);
 
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;", version, this);
 
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         Thread.sleep(4000);
 
@@ -265,11 +266,11 @@ public void testBaseTTLWithSameTimestampTest() throws Throwable
 
         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;", version, this);
 
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING timestamp 1;", version, this);
 
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         Thread.sleep(4000);
 
diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
index ea05eef50c38..7ff7cd246567 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
@@ -47,6 +47,7 @@
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.*;
 import static org.junit.Assert.fail;
 
 /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
@@ -133,7 +134,7 @@ public static void updateViewWithFlush(String query, boolean flush, ProtocolVers
             Thread.sleep(1);
         }
         if (flush)
-            Keyspace.open(cqlTester.keyspace()).flush();
+            Keyspace.open(cqlTester.keyspace()).flush(UNIT_TESTS);
     }
 
     @Test
@@ -167,37 +168,37 @@ public void testNonBaseColumnInViewPk(boolean flush) throws Throwable
 
         updateView("UPDATE %s USING TIMESTAMP 1 set v1 =1 where p1 = 1 AND p2 = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, 1, null));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, 1, null));
 
         updateView("UPDATE %s USING TIMESTAMP 2 set v1 = null, v2 = 1 where p1 = 1 AND p2 = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
 
         updateView("UPDATE %s USING TIMESTAMP 2 set v2 = null where p1 = 1 AND p2 = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
 
         updateView("INSERT INTO %s (p1,p2) VALUES(1,1) USING TIMESTAMP 3;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, null));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, null));
 
         updateView("DELETE FROM %s USING TIMESTAMP 4 WHERE p1 =1 AND p2 = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
 
         updateView("UPDATE %s USING TIMESTAMP 5 set v2 = 1 where p1 = 1 AND p2 = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
     }
@@ -233,14 +234,14 @@ private void testCellTombstoneAndShadowableTombstones(boolean flush) throws Thro
         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
         // sstable 2
         updateView("UPdate %s using timestamp 2 set v2 = null where p = 3", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3),
                                 row(null, null));
@@ -248,14 +249,14 @@ private void testCellTombstoneAndShadowableTombstones(boolean flush) throws Thro
         updateView("UPdate %s using timestamp 3 set v1 = 2 where p = 3", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
         // sstable 4
         updateView("UPdate %s using timestamp 4 set v1 = 1 where p = 3", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
 
diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
index f2a627df56dd..5c973a100c44 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
@@ -36,6 +36,7 @@
 
 import static org.apache.cassandra.cql3.ViewComplexTest.createView;
 import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
 
 /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
  * Any changes here check if they apply to the other classes:
@@ -109,7 +110,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("UPDATE %s USING TIMESTAMP 0 SET v1 = 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, 1, null));
         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
@@ -117,7 +118,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("DELETE v1 FROM %s USING TIMESTAMP 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
@@ -126,7 +127,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("UPDATE %s USING TIMESTAMP 1 SET v1 = 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
@@ -134,7 +135,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("UPDATE %s USING TIMESTAMP 2 SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
@@ -142,7 +143,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("DELETE v1 FROM %s USING TIMESTAMP 3 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
@@ -150,7 +151,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("DELETE v2 FROM %s USING TIMESTAMP 4 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
@@ -158,7 +159,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("UPDATE %s USING TTL 3 SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
@@ -171,7 +172,7 @@ private void testUpdateColumnNotInView(boolean flush) throws Throwable
         updateView("UPDATE %s SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
@@ -212,23 +213,23 @@ public void testPartialUpdateWithUnselectedCollections(boolean flush) throws Thr
 
         updateView("UPDATE %s SET l=l+[1,2,3] WHERE k = 1 AND c = 1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
 
         updateView("UPDATE %s SET l=l-[1,2] WHERE k = 1 AND c = 1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
 
         updateView("UPDATE %s SET b=3 WHERE k=1 AND c=1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRows(execute("SELECT * from mv"), row(1, 1, null, 3));
 
         updateView("UPDATE %s SET b=null, l=l-[3], s=s-{3} WHERE k = 1 AND c = 1", version, this);
         if (flush)
         {
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
             ks.getColumnFamilyStore("mv").forceMajorCompaction();
         }
         assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"));
@@ -236,7 +237,7 @@ public void testPartialUpdateWithUnselectedCollections(boolean flush) throws Thr
 
         updateView("UPDATE %s SET m=m+{3:3}, l=l-[1], s=s-{2} WHERE k = 1 AND c = 1", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"), row(1, 1, null, null));
         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 1, null, null));
 
@@ -279,23 +280,23 @@ public void testUpdateWithColumnTimestampSmallerThanPk(boolean flush) throws Thr
         // reset value
         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 6;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
         // increase pk's timestamp to 20
         updateView("Insert into %s (p) values (3) using timestamp 20;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
         // change v1's to 2 and remove existing view row with ts7
         updateView("UPdate %s using timestamp 7 set v1 = 2 where p = 3;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, 3, 6L));
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv limit 1"), row(2, 3, 3, 6L));
         // change v1's to 1 and remove existing view row with ts8
         updateView("UPdate %s using timestamp 8 set v1 = 1 where p = 3;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
     }
 
@@ -330,41 +331,41 @@ public void testUpdateWithColumnTimestampBiggerThanPk(boolean flush) throws Thro
         ks.getColumnFamilyStore("mv").disableAutoCompaction();
         updateView("DELETE FROM %s USING TIMESTAMP 0 WHERE k = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         // sstable-1, Set initial values TS=1
         updateView("INSERT INTO %s(k, a, b) VALUES (1, 1, 1) USING TIMESTAMP 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 1));
         updateView("UPDATE %s USING TIMESTAMP 10 SET b = 2 WHERE k = 1;", version, this);
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
         updateView("UPDATE %s USING TIMESTAMP 2 SET a = 2 WHERE k = 1;", version, this);
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         ks.getColumnFamilyStore("mv").forceMajorCompaction();
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv limit 1"), row(1, 2, 2));
         updateView("UPDATE %s USING TIMESTAMP 11 SET a = 1 WHERE k = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
 
         // set non-key base column as tombstone, view row is removed with shadowable
         updateView("UPDATE %s USING TIMESTAMP 12 SET a = null WHERE k = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, null, 2));
 
         // column b should be alive
         updateView("UPDATE %s USING TIMESTAMP 13 SET a = 1 WHERE k = 1;", version, this);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
 
diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
index 374f79f27cf6..2d4cbb65ab30 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
@@ -41,6 +41,8 @@
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
+
 /* ViewFilteringTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
  * Any changes here check if they apply to the other classes
  * - ViewFilteringPKTest
@@ -203,7 +205,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 0", 1, 1, 1, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         // views should be updated.
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
@@ -215,7 +217,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 1 set c = ? WHERE a=?", 0, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -226,7 +228,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 2 set c = ? WHERE a=?", 1, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         // row should be back in views.
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
@@ -238,7 +240,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 3 set d = ? WHERE a=?", 0, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 0));
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -249,7 +251,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 4 set c = ? WHERE a=?", 0, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -260,7 +262,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 5 set d = ? WHERE a=?", 1, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         // should not update as c=0
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
@@ -283,7 +285,7 @@ public void testViewFiltering(boolean flush) throws Throwable
         updateView("UPDATE %s using timestamp 7 set b = ? WHERE a=?", 2, 1);
         if (flush)
         {
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
             for (String view : views)
                 ks.getColumnFamilyStore(view).forceMajorCompaction();
         }
@@ -297,7 +299,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("DELETE b, c FROM %s using timestamp 6 WHERE a=?", 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM %s"), row(1, 2, null, 1));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
@@ -309,7 +311,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("DELETE FROM %s using timestamp 8 where a=?", 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -320,7 +322,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("UPDATE %s using timestamp 9 set b = ?,c = ? where a=?", 1, 1, 1); // upsert
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, null));
         assertRows(execute("SELECT * FROM mv_test2"));
@@ -331,7 +333,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("DELETE FROM %s using timestamp 10 where a=?", 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -342,7 +344,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 11", 1, 1, 1, 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         // row should be back in views.
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
@@ -354,7 +356,7 @@ public void testViewFiltering(boolean flush) throws Throwable
 
         updateView("DELETE FROM %s using timestamp 12 where a=?", 1);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
@@ -415,7 +417,7 @@ public void testMVFilteringWithComplexColumn() throws Throwable
                 list(1, 1, 2),
                 set(1, 2),
                 map(1, 1, 2, 2));
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test2"), row(1, 1));
@@ -423,7 +425,7 @@ public void testMVFilteringWithComplexColumn() throws Throwable
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
 
         execute("UPDATE %s SET l=l-[1] WHERE a = 1 AND b = 1" );
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test2"));
@@ -431,7 +433,7 @@ public void testMVFilteringWithComplexColumn() throws Throwable
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
 
         execute("UPDATE %s SET s=s-{2}, m=m-{2} WHERE a = 1 AND b = 1");
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
@@ -440,7 +442,7 @@ public void testMVFilteringWithComplexColumn() throws Throwable
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
 
         execute("UPDATE %s SET  m=m-{1} WHERE a = 1 AND b = 1");
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
@@ -450,7 +452,7 @@ public void testMVFilteringWithComplexColumn() throws Throwable
 
         // filter conditions result not changed
         execute("UPDATE %s SET  l=l+[2], s=s-{0}, m=m+{3:3} WHERE a = 1 AND b = 1");
-        FBUtilities.waitOnFutures(ks.flush());
+        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
@@ -892,14 +894,14 @@ public void complexRestrictedTimestampUpdateTest(boolean flush) throws Throwable
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         //update c's timestamp TS=2
         executeNet(version, "UPDATE %s USING TIMESTAMP 2 SET c = ? WHERE a = ? and b = ? ", 1, 0, 0);
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         //change c's value and TS=3, tombstones c=1 and adds c=0 record
         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE a = ? and b = ? ", 0, 0, 0);
@@ -908,7 +910,7 @@ public void complexRestrictedTimestampUpdateTest(boolean flush) throws Throwable
         if(flush)
         {
             ks.getColumnFamilyStore("mv").forceMajorCompaction();
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         }
 
         //change c's value back to 1 with TS=4, check we can see d
@@ -916,7 +918,7 @@ public void complexRestrictedTimestampUpdateTest(boolean flush) throws Throwable
         if (flush)
         {
             ks.getColumnFamilyStore("mv").forceMajorCompaction();
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
         }
 
         assertRows(execute("SELECT d, e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, null));
@@ -926,14 +928,14 @@ public void complexRestrictedTimestampUpdateTest(boolean flush) throws Throwable
         assertRows(execute("SELECT d, e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, 1));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         //Change d value @ TS=2
         executeNet(version, "UPDATE %s USING TIMESTAMP 2 SET d = ? WHERE a = ? and b = ? ", 2, 0, 0);
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(2));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
 
         //Change d value @ TS=3
         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET d = ? WHERE a = ? and b = ? ", 1, 0, 0);
@@ -1022,7 +1024,7 @@ public void testOldTimestampsWithRestrictions() throws Throwable
         for (int i = 0; i < 100; i++)
             updateView("INSERT into %s (k,c,val)VALUES(?,?,?)", 0, i % 2, "baz");
 
-        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush(UNIT_TESTS);
 
         Assert.assertEquals(2, execute("select * from %s").size());
         Assert.assertEquals(2, execute("select * from mv_tstest").size());
diff --git a/test/unit/org/apache/cassandra/cql3/ViewTest.java b/test/unit/org/apache/cassandra/cql3/ViewTest.java
index 804022952eb5..23b9195e44ab 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewTest.java
@@ -153,7 +153,7 @@ public void testExistingRangeTombstone(boolean flush) throws Throwable
         updateView("DELETE FROM %s USING TIMESTAMP 10 WHERE k1 = 1 and c1=1");
 
         if (flush)
-            Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+            Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         String table = KEYSPACE + "." + currentTable();
         updateView("BEGIN BATCH " +
@@ -363,7 +363,7 @@ public void testOldTimestamps() throws Throwable
         for (int i = 0; i < 100; i++)
             updateView("INSERT into %s (k,c,val)VALUES(?,?,?)", 0, i % 2, "baz");
 
-        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Assert.assertEquals(2, execute("select * from %s").size());
         Assert.assertEquals(2, execute("select * from mv_tstest").size());
@@ -489,18 +489,18 @@ public void complexTimestampUpdateTest(boolean flush) throws Throwable
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
 
         // change c's value and TS=3, tombstones c=1 and adds c=0 record
         executeNet("UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE a = ? and b = ? ", 0, 0, 0);
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0));
 
         if(flush)
         {
             ks.getColumnFamilyStore("mv").forceMajorCompaction();
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
         }
 
 
@@ -509,7 +509,7 @@ public void complexTimestampUpdateTest(boolean flush) throws Throwable
         if (flush)
         {
             ks.getColumnFamilyStore("mv").forceMajorCompaction();
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
         }
 
         assertRows(execute("SELECT d,e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, null));
@@ -520,7 +520,7 @@ public void complexTimestampUpdateTest(boolean flush) throws Throwable
         assertRows(execute("SELECT d,e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, 1));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
 
 
         //Change d value @ TS=2
@@ -528,7 +528,7 @@ public void complexTimestampUpdateTest(boolean flush) throws Throwable
         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(2));
 
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
 
 
         //Change d value @ TS=3
@@ -951,7 +951,7 @@ public void testIgnoreUpdate() throws Throwable
         assertRows(execute("SELECT a, b, c from mv WHERE b = ?", 1), row(0, 1, null));
 
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore("mv");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Assert.assertEquals(1, cfs.getLiveSSTables().size());
     }
 
@@ -1364,22 +1364,22 @@ private void testViewBuilderResume(int concurrentViewBuilders) throws Throwable
         for (int i = 0; i < 1024; i++)
             execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (int i = 0; i < 1024; i++)
             execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (int i = 0; i < 1024; i++)
             execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (int i = 0; i < 1024; i++)
             execute("INSERT into %s (k,c,val)VALUES(?,?,?)", i, i, ""+i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         String viewName1 = "mv_test_" + concurrentViewBuilders;
         createView(viewName1, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
diff --git a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
index 60dd0395987f..f90bce6b2868 100644
--- a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
+++ b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
@@ -791,6 +791,7 @@ private static String tableParametersCql()
                "    AND comment = ''\n" +
                "    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}\n" +
                "    AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}\n" +
+               "    AND memtable = {}\n" +
                "    AND crc_check_chance = 1.0\n" +
                "    AND default_time_to_live = 0\n" +
                "    AND extensions = {}\n" +
diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java
index 246f512f66b5..837f562ced2a 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/CrcCheckChanceTest.java
@@ -68,7 +68,7 @@ public void testChangingCrcCheckChance(boolean newFormat) throws Throwable
 
         ColumnFamilyStore cfs = Keyspace.open(CQLTester.KEYSPACE).getColumnFamilyStore(currentTable());
         ColumnFamilyStore indexCfs = cfs.indexManager.getAllIndexColumnFamilyStores().iterator().next();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Assert.assertEquals(0.99, cfs.getCrcCheckChance(), 0.0);
         Assert.assertEquals(0.99, cfs.getLiveSSTables().iterator().next().getCrcCheckChance(), 0.0);
@@ -96,19 +96,19 @@ public void testChangingCrcCheckChance(boolean newFormat) throws Throwable
         execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
         execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
         execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
         execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         execute("INSERT INTO %s(p, c, v, s) values (?, ?, ?, ?)", "p1", "k1", "v1", "sv1");
         execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
         execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.forceMajorCompaction();
 
         //Now let's change via JMX
@@ -182,7 +182,7 @@ public void testDropDuringCompaction() throws Throwable
             execute("INSERT INTO %s(p, c, v) values (?, ?, ?)", "p1", "k2", "v2");
             execute("INSERT INTO %s(p, s) values (?, ?)", "p2", "sv2");
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         DatabaseDescriptor.setCompactionThroughputMbPerSec(1);
diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java
index 5d367de49c09..83489df15dc3 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTableMetadataTrackingTest.java
@@ -40,7 +40,7 @@ public void baseCheck() throws Throwable
         createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b))");
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 9999");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(Integer.MAX_VALUE, metadata.maxLocalDeletionTime);
@@ -57,7 +57,7 @@ public void testMinMaxtimestampRange() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
@@ -76,7 +76,7 @@ public void testMinMaxtimestampRow() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("INSERT INTO %s (a,b,c) VALUES (1,1,'1') using timestamp 10000");
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
         assertEquals(10000, metadata.maxTimestamp);
@@ -95,7 +95,7 @@ public void testTrackMetadata_rangeTombstone() throws Throwable
         createTable("CREATE TABLE %s (a int, b int, c text, PRIMARY KEY (a, b)) WITH gc_grace_seconds = 10000");
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1 and b = 1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
@@ -115,7 +115,7 @@ public void testTrackMetadata_rowTombstone() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a = 1");
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
@@ -135,7 +135,7 @@ public void testTrackMetadata_rowMarker() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("INSERT INTO %s (a) VALUES (1) USING TIMESTAMP 9999");
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
@@ -154,7 +154,7 @@ public void testTrackMetadata_rowMarkerDelete() throws Throwable
         createTable("CREATE TABLE %s (a int, PRIMARY KEY (a))");
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
         execute("DELETE FROM %s USING TIMESTAMP 9999 WHERE a=1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
         StatsMetadata metadata = cfs.getLiveSSTables().iterator().next().getSSTableMetadata();
         assertEquals(9999, metadata.minTimestamp);
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
index 23d0ae7c89bb..3ccac4034202 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
@@ -28,11 +28,14 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.memtable.SkipListMemtable;
+import org.apache.cassandra.db.memtable.TrieMemtable;
 import org.apache.cassandra.dht.OrderPreservingPartitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.schema.MemtableParams;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.SchemaKeyspace;
 import org.apache.cassandra.service.ClientWarn;
@@ -43,6 +46,7 @@
 import static java.lang.String.format;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -519,6 +523,82 @@ public void testDoubleWith() throws Throwable
         }
     }
 
+    // Walks ALTER TABLE ... WITH memtable through short, qualified and custom class names, a reset via {}, and rejected options.
+    @Test
+    public void testAlterTableWithMemtable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))");
+        assertSame(MemtableParams.DEFAULT.factory, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map()));
+
+        alterTable("ALTER TABLE %s WITH memtable = { 'class' : 'SkipListMemtable' };");
+        assertSame(SkipListMemtable.FACTORY, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof SkipListMemtable);
+
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("class", "SkipListMemtable")));
+
+        alterTable("ALTER TABLE %s"
+                    + " WITH memtable = { 'class' : 'org.apache.cassandra.db.memtable.TrieMemtable' };");
+        assertSame(TrieMemtable.FACTORY, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof TrieMemtable);
+
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("class", "org.apache.cassandra.db.memtable.TrieMemtable")));
+
+        alterTable("ALTER TABLE %s"
+                    + " WITH memtable = { 'class' : '" + CreateTest.TestMemtableFactory.class.getName() + "', 'skiplist' : 'true' };");
+        assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof SkipListMemtable);
+
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map("class", CreateTest.TestMemtableFactory.class.getName(),
+                           "skiplist", "true")));
+
+        alterTable("ALTER TABLE %s"
+                    + " WITH memtable = {  };");
+        assertSame(MemtableParams.DEFAULT.factory, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(map()));
+
+        assertAlterTableThrowsException(ConfigurationException.class,
+                                        "The 'class' option must not be empty. To use default implementation, remove option.",
+                                        "ALTER TABLE %s"
+                                        + " WITH memtable = { 'class' : '' };");
+
+        assertAlterTableThrowsException(ConfigurationException.class,
+                                        "Could not create memtable factory for type org.apache.cassandra.db.memtable.NotExisting and options {}",
+                                        "ALTER TABLE %s"
+                                        + " WITH memtable = { 'class' : 'NotExisting'};");
+
+        assertAlterTableThrowsException(ConfigurationException.class,
+                                        "Options {invalid=throw} not expected.",
+                                        "ALTER TABLE %s"
+                                        + " WITH memtable = { 'class' : '" + CreateTest.TestMemtableFactory.class.getName() + "', 'invalid' : 'throw' };");
+    }
+
     @Test
     public void testAlterTableWithCompression() throws Throwable
     {
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java
index 3d5c680adfec..8d2417bd2518 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/CreateTest.java
@@ -21,14 +21,19 @@
 import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Map;
 import java.util.UUID;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.Duration;
 import org.apache.cassandra.db.Mutation;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.db.memtable.SkipListMemtable;
+import org.apache.cassandra.db.memtable.TrieMemtable;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -38,6 +43,7 @@
 import org.apache.cassandra.locator.IEndpointSnitch;
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.schema.MemtableParams;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.SchemaKeyspace;
@@ -53,6 +59,7 @@
 import static org.apache.cassandra.cql3.Duration.NANOS_PER_MINUTE;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -555,6 +562,129 @@ public void testDoubleWith() throws Throwable
             assertInvalidSyntaxMessage("no viable alternative at input 'WITH'", stmt);
     }
 
+    /** Memtable provider used by the memtable option tests; it understands a single option, 'skiplist'. */
+    public static class TestMemtableFactory
+    {
+        /** Removes 'skiplist' from {@code options}, rejects whatever remains, then picks the factory. */
+        public static Memtable.Factory factory(Map<String, String> options)
+        {
+            String useSkipList = options.remove("skiplist");
+            if (!options.isEmpty())
+                throw new ConfigurationException("Options " + options + " not expected.");
+
+            return Boolean.parseBoolean(useSkipList) ? SkipListMemtable.FACTORY : TrieMemtable.FACTORY;
+        }
+    }
+
+    public static class InvalidMemtableFactoryMethod // negative fixture for the memtable 'class' option
+    {
+        public static String factory(Map<String, String> options) // returns a String, not a Memtable.Factory
+        {
+            return "invalid";
+        }
+    }
+
+    public static class InvalidMemtableFactoryField // negative fixture for the memtable 'class' option
+    {
+        public static String FACTORY = "invalid"; // a String, not a Memtable.Factory
+    }
+
+    /**
+     * Verifies how CREATE TABLE handles the {@code memtable} option. For each accepted form it checks the
+     * factory and/or the memtable instance the table ends up with, and that the schema tables store the
+     * option map exactly as it was given:
+     * <ul>
+     *   <li>option absent or an empty map: {@link MemtableParams#DEFAULT} factory, empty map in the schema;</li>
+     *   <li>{@code 'class'} given as a short or as a fully qualified name;</li>
+     *   <li>{@code 'class'} naming {@link TestMemtableFactory}, without and with its {@code 'skiplist'} option.</li>
+     * </ul>
+     * Invalid definitions (empty or unknown class, unexpected options, classes whose {@code factory} method
+     * or {@code FACTORY} field has the wrong type) are expected to fail with the listed messages.
+     */
+    @Test
+    public void testCreateTableWithMemtable() throws Throwable
+    {
+        // no memtable option: default factory, empty option map in the schema
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))");
+        assertSame(MemtableParams.DEFAULT.factory, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        assertMemtableOptionsInSchema(map());
+
+        // short class name
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                    + " WITH memtable = { 'class' : 'SkipListMemtable' };");
+        assertSame(SkipListMemtable.FACTORY, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        Assert.assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof SkipListMemtable);
+        assertMemtableOptionsInSchema(map("class", "SkipListMemtable"));
+
+        // fully qualified class name
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                    + " WITH memtable = { 'class' : 'org.apache.cassandra.db.memtable.TrieMemtable' };");
+        assertSame(TrieMemtable.FACTORY, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        Assert.assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof TrieMemtable);
+        assertMemtableOptionsInSchema(map("class", "org.apache.cassandra.db.memtable.TrieMemtable"));
+
+        // class exposing a static factory(Map) method, no further options
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                    + " WITH memtable = { 'class' : '" + TestMemtableFactory.class.getName() + "' };");
+        Assert.assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof TrieMemtable);
+        assertMemtableOptionsInSchema(map("class", TestMemtableFactory.class.getName()));
+
+        // same class, with the one option its factory method accepts
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                    + " WITH memtable = { 'class' : '" + TestMemtableFactory.class.getName() + "', 'skiplist' : 'true' };");
+        Assert.assertTrue(getCurrentColumnFamilyStore().getTracker().getView().getCurrentMemtable() instanceof SkipListMemtable);
+        assertMemtableOptionsInSchema(map("class", TestMemtableFactory.class.getName(), "skiplist", "true"));
+
+        // empty option map: same outcome as leaving the option out
+        createTable("CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                    + " WITH memtable = {  };");
+        assertSame(MemtableParams.DEFAULT.factory, getCurrentColumnFamilyStore().metadata().params.memtable.factory);
+        assertMemtableOptionsInSchema(map());
+
+        // invalid definitions; the expected messages must match the thrown ones verbatim
+        assertThrowsConfigurationException("The 'class' option must not be empty. To use default implementation, remove option.",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : '' };");
+
+        assertThrowsConfigurationException("Memtable class org.apache.cassandra.db.memtable.TrieMemtable does not accept any futher parameters, but {invalid=throw} were given.",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : 'TrieMemtable', 'invalid' : 'throw' };");
+
+        assertThrowsConfigurationException("Could not create memtable factory for type org.apache.cassandra.db.memtable.NotExisting and options {}",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : 'NotExisting'};");
+
+        assertThrowsConfigurationException("Options {invalid=throw} not expected.",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : '" + TestMemtableFactory.class.getName() + "', 'invalid' : 'throw' };");
+
+        assertThrowsConfigurationException("Could not create memtable factory for type " + InvalidMemtableFactoryMethod.class.getName() + " and options {}",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : '" + InvalidMemtableFactoryMethod.class.getName() + "' };");
+
+        assertThrowsConfigurationException("Could not create memtable factory for type " + InvalidMemtableFactoryField.class.getName() + " and options {}",
+                                           "CREATE TABLE %s (a text, b int, c int, primary key (a, b))"
+                                           + " WITH memtable = { 'class' : '" + InvalidMemtableFactoryField.class.getName() + "' };");
+    }
+
+    /**
+     * Reads the {@code memtable} column of the current table's entry in the schema tables and asserts
+     * that the result matches a single row holding the expected options.
+     *
+     * @param expectedOptions the expected column value, typically built with {@code map(...)};
+     *                        {@code map()} when no options are expected to be stored
+     * @throws Throwable if the query fails or the assertion does not hold
+     */
+    private void assertMemtableOptionsInSchema(Object expectedOptions) throws Throwable
+    {
+        assertRows(execute(format("SELECT memtable FROM %s.%s WHERE keyspace_name = ? and table_name = ?;",
+                                  SchemaConstants.SCHEMA_KEYSPACE_NAME,
+                                  SchemaKeyspace.TABLES),
+                           KEYSPACE,
+                           currentTable()),
+                   row(expectedOptions));
+    }
+
     @Test
     public void testCreateTableWithCompression() throws Throwable
     {
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java
index 13cbdf3f552a..1780c7834acb 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/TTLTest.java
@@ -279,7 +279,7 @@ public void testCapExpirationDateOverflowPolicy(boolean simple, boolean clusteri
         // Maybe Flush
         Keyspace ks = Keyspace.open(keyspace());
         if (flush)
-            FBUtilities.waitOnFutures(ks.flush());
+            FBUtilities.waitOnFutures(ks.flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
 
         // Verify data
         verifyData(simple);
diff --git a/test/unit/org/apache/cassandra/db/CleanupTest.java b/test/unit/org/apache/cassandra/db/CleanupTest.java
index 996536126bbf..d53241949d01 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTest.java
@@ -272,7 +272,7 @@ public void testCleanupSkippingSSTables() throws UnknownHostException, Execution
             .add("val", VALUE)
             .build()
             .applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         Set<SSTableReader> beforeFirstCleanup = Sets.newHashSet(cfs.getLiveSSTables());
@@ -407,7 +407,7 @@ protected void fillCF(ColumnFamilyStore cfs, String colName, int rowsPerSSTable)
                     .applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     protected List<Long> getMaxTimestampList(ColumnFamilyStore cfs)
diff --git a/test/unit/org/apache/cassandra/db/CleanupTransientTest.java b/test/unit/org/apache/cassandra/db/CleanupTransientTest.java
index 9789183dc14b..c9510a6cbdf6 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTransientTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTransientTest.java
@@ -182,7 +182,7 @@ protected void fillCF(ColumnFamilyStore cfs, String colName, int rowsPerSSTable)
                     .applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     protected List<Long> getMaxTimestampList(ColumnFamilyStore cfs)
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java
index 21417ed78795..a1e4d955727d 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyMetricTest.java
@@ -70,7 +70,7 @@ public void testSizeMetric()
         {
             applyMutation(cfs.metadata(), String.valueOf(j), ByteBufferUtil.EMPTY_BYTE_BUFFER, FBUtilities.timestampMicros());
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Collection<SSTableReader> sstables = cfs.getLiveSSTables();
         long size = 0;
         for (SSTableReader reader : sstables)
@@ -152,7 +152,7 @@ public void testEstimatedColumnCountHistogramAndEstimatedRowSizeHistogram()
             applyMutation(store.metadata(), "1", bytes(1), FBUtilities.timestampMicros());
 
             // Flushing first SSTable
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             long[] estimatedColumnCountHistogram = store.metric.estimatedColumnCountHistogram.getValue();
             assertNumberOfNonZeroValue(estimatedColumnCountHistogram, 1);
@@ -165,7 +165,7 @@ public void testEstimatedColumnCountHistogramAndEstimatedRowSizeHistogram()
             applyMutation(store.metadata(), "2", bytes(2), FBUtilities.timestampMicros());
 
             // Flushing second SSTable
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             estimatedColumnCountHistogram = store.metric.estimatedColumnCountHistogram.getValue();
             assertNumberOfNonZeroValue(estimatedColumnCountHistogram, 1);
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
index 48ef580dd476..479c215db8a6 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
@@ -102,14 +102,14 @@ public void testTimeSortedQuery()
                 .add("val", "asdf")
                 .build()
                 .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, "key1")
                 .clustering("Column1")
                 .add("val", "asdf")
                 .build()
                 .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear(); // resets counts
         Util.getAll(Util.cmd(cfs, "key1").includeRow("c1").build());
@@ -178,7 +178,7 @@ public void testDeleteStandardRowSticksAfterFlush() throws Throwable
         assertRangeCount(cfs, col, val, 2);
 
         // flush.
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // insert, don't flush
         new RowUpdateBuilder(cfs.metadata(), 1, "key3").clustering("Column1").add("val", "val1").build().applyUnsafe();
@@ -193,7 +193,7 @@ public void testDeleteStandardRowSticksAfterFlush() throws Throwable
         assertRangeCount(cfs, col, val, 2);
 
         // flush
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // re-verify delete. // first breakage is right here because of CASSANDRA-1837.
         assertRangeCount(cfs, col, val, 2);
@@ -211,7 +211,7 @@ public void testDeleteStandardRowSticksAfterFlush() throws Throwable
         assertRangeCount(cfs, col, val, 4);
 
         // and it remains so after flush. (this wasn't failing before, but it's good to check.)
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertRangeCount(cfs, col, val, 4);
     }
 
@@ -260,9 +260,9 @@ public void testBackupAfterFlush() throws Throwable
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1);
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key1")).clustering("Column1").add("val", "asdf").build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key2")).clustering("Column1").add("val", "asdf").build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (int version = 1; version <= 2; ++version)
         {
@@ -401,7 +401,7 @@ public void testBackupAfterFlush() throws Throwable
     public void reTest(ColumnFamilyStore cfs, Runnable verify) throws Exception
     {
         verify.run();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         verify.run();
     }
 
@@ -440,10 +440,10 @@ public void testSnapshotWithoutFlushWithSecondaryIndexes() throws Exception
                                              .add("birthdate", 1L)
                                              .add("notbirthdate", 2L);
         new Mutation(builder.build()).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         String snapshotName = "newSnapshot";
-        cfs.snapshotWithoutFlush(snapshotName);
+        cfs.snapshotWithoutMemtable(snapshotName);
 
         File snapshotManifestFile = cfs.getDirectories().getSnapshotManifestFile(snapshotName);
         JSONParser parser = new JSONParser();
@@ -485,7 +485,7 @@ public void testScrubDataDirectories() throws Throwable
         ColumnFamilyStore.scrubDataDirectories(cfs.metadata());
 
         new RowUpdateBuilder(cfs.metadata(), 2, "key").clustering("name").add("val", "2").build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Nuke the metadata and reload that sstable
         Collection<SSTableReader> ssTables = cfs.getLiveSSTables();
diff --git a/test/unit/org/apache/cassandra/db/DeletePartitionTest.java b/test/unit/org/apache/cassandra/db/DeletePartitionTest.java
index 6ed43f726250..19ad578a6042 100644
--- a/test/unit/org/apache/cassandra/db/DeletePartitionTest.java
+++ b/test/unit/org/apache/cassandra/db/DeletePartitionTest.java
@@ -75,7 +75,7 @@ public void testDeletePartition(DecoratedKey key, boolean flushBeforeRemove, boo
         assertTrue(r.getCell(column).value().equals(ByteBufferUtil.bytes("asdf")));
 
         if (flushBeforeRemove)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // delete the partition
         new Mutation.PartitionUpdateCollector(KEYSPACE1, key)
@@ -84,7 +84,7 @@ public void testDeletePartition(DecoratedKey key, boolean flushBeforeRemove, boo
                 .applyUnsafe();
 
         if (flushAfterRemove)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // validate removal
         ImmutableBTreePartition partitionUnfiltered = Util.getOnlyPartitionUnfiltered(Util.cmd(store, key).build());
diff --git a/test/unit/org/apache/cassandra/db/ImportTest.java b/test/unit/org/apache/cassandra/db/ImportTest.java
index c0c3799b858b..92e44b4fa9c7 100644
--- a/test/unit/org/apache/cassandra/db/ImportTest.java
+++ b/test/unit/org/apache/cassandra/db/ImportTest.java
@@ -84,8 +84,7 @@ private File prepareBasicImporting() throws Throwable
         {
             execute("insert into %s (id, d) values (?, ?)", i, i);
         }
-
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -110,14 +109,14 @@ public void basicImportMultiDirTest() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
 
         File backupdir = moveToBackupDir(sstables);
         for (int i = 10; i < 20; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
 
@@ -141,7 +140,7 @@ public void refreshTest() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
         sstables.forEach(s -> s.selfRef().release());
@@ -156,7 +155,7 @@ public void importResetLevelTest() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
         for (SSTableReader sstable : sstables)
@@ -193,7 +192,7 @@ public void importClearRepairedTest() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
         for (SSTableReader sstable : sstables)
@@ -275,7 +274,7 @@ public void testGetCorrectDirectory() throws Throwable
         for (int i = 0; i < 10; i++)
         {
             execute("insert into %s (id, d) values (?, ?)", i, i);
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
 
         Set<SSTableReader> toMove = getCurrentColumnFamilyStore().getLiveSSTables();
@@ -304,11 +303,11 @@ private void testCorruptHelper(boolean verify, boolean copy) throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         SSTableReader sstableToCorrupt = getCurrentColumnFamilyStore().getLiveSSTables().iterator().next();
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i + 10, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
 
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -325,7 +324,7 @@ private void testCorruptHelper(boolean verify, boolean copy) throws Throwable
         // now move a correct sstable to another directory to make sure that directory gets properly imported
         for (int i = 100; i < 130; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> correctSSTables = getCurrentColumnFamilyStore().getLiveSSTables();
 
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -402,7 +401,7 @@ public void testImportOutOfRange() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 1000; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
 
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -447,7 +446,7 @@ public void testImportOutOfRangeExtendedVerify() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 1000; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
 
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -483,7 +482,7 @@ public void testImportInvalidateCache() throws Throwable
         createTable("create table %s (id int primary key, d int) WITH caching = { 'keys': 'NONE', 'rows_per_partition': 'ALL' }");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         CacheService.instance.setRowCacheCapacityInMB(1);
 
         Set<RowCacheKey> keysToInvalidate = new HashSet<>();
@@ -504,7 +503,7 @@ public void testImportInvalidateCache() throws Throwable
 
         for (int i = 10; i < 20; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         Set<RowCacheKey> allCachedKeys = new HashSet<>();
 
@@ -551,7 +550,7 @@ public void testImportCacheEnabledWithoutSrcDir() throws Throwable
         createTable("create table %s (id int primary key, d int) WITH caching = { 'keys': 'NONE', 'rows_per_partition': 'ALL' }");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         CacheService.instance.setRowCacheCapacityInMB(1);
         getCurrentColumnFamilyStore().clearUnsafe();
@@ -568,7 +567,7 @@ public void testRefreshCorrupt() throws Throwable
         createTable("create table %s (id int primary key, d int) WITH caching = { 'keys': 'NONE', 'rows_per_partition': 'ALL' }");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
         sstables.forEach(s -> s.selfRef().release());
@@ -583,10 +582,10 @@ public void testRefreshCorrupt() throws Throwable
 
         for (int i = 10; i < 20; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         for (int i = 20; i < 30; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         Set<SSTableReader> expectedFiles = new HashSet<>(getCurrentColumnFamilyStore().getLiveSSTables());
 
@@ -632,14 +631,14 @@ public void importBadDirectoryTest() throws Throwable
         createTable("create table %s (id int primary key, d int)");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
 
         File backupdir = moveToBackupDir(sstables);
         for (int i = 10; i < 20; i++)
             execute("insert into %s (id, d) values (?, ?)", i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         getCurrentColumnFamilyStore().clearUnsafe();
 
diff --git a/test/unit/org/apache/cassandra/db/KeyCacheTest.java b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
index 31bdf83f3f9f..1155cc446b45 100644
--- a/test/unit/org/apache/cassandra/db/KeyCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
@@ -103,7 +103,7 @@ private void testKeyCacheLoad(String cf) throws Exception
 
         // insert data and force to disk
         SchemaLoader.insertData(KEYSPACE1, cf, 0, 100);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // populate the cache
         readData(KEYSPACE1, cf, 0, 100);
@@ -203,7 +203,7 @@ private void testKeyCacheLoadWithLostTable(String cf) throws Exception
 
         // insert data and force to disk
         SchemaLoader.insertData(KEYSPACE1, cf, 0, 100);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Collection<SSTableReader> firstFlushTables = ImmutableList.copyOf(store.getLiveSSTables());
 
@@ -213,7 +213,7 @@ private void testKeyCacheLoadWithLostTable(String cf) throws Exception
 
         // insert some new data and force to disk
         SchemaLoader.insertData(KEYSPACE1, cf, 100, 50);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // check that it's fine
         readData(KEYSPACE1, cf, 100, 50);
@@ -274,7 +274,7 @@ private void testKeyCache(String cf) throws ExecutionException, InterruptedExcep
         new RowUpdateBuilder(cfs.metadata(), 0, "key2").clustering("2").build().applyUnsafe();
 
         // to make sure we have SSTable
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // reads to cache key position
         Util.getAll(Util.cmd(cfs, "key1").build());
diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index c3980b66648f..aaf28db817a6 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
@@ -87,7 +87,7 @@ public void testGetRowNoColumns() throws Throwable
             Util.assertEmpty(Util.cmd(cfs, "0").columns("c").includeRow(1).build());
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -122,7 +122,7 @@ public void testGetRowSingleColumn() throws Throwable
             }
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -139,7 +139,7 @@ public void testGetSliceBloomFilterFalsePositive() throws Throwable
         for (String key : new String[]{"0", "2"})
             Util.assertEmpty(Util.cmd(cfs, key).build());
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (String key : new String[]{"0", "2"})
             Util.assertEmpty(Util.cmd(cfs, key).build());
@@ -211,7 +211,7 @@ public void testGetSliceWithCutoff() throws Throwable
             assertRowsInSlice(cfs, "0", 288, 299, 12, true, prefix);
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -224,7 +224,7 @@ public void testReversedWithFlushing() throws Throwable
         for (int i = 0; i < 10; i++)
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (int i = 10; i < 20; i++)
         {
@@ -338,7 +338,7 @@ public void testGetSliceFromBasic() throws Throwable
             assertRowsInResult(cfs, command);
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -361,7 +361,7 @@ public void testGetSliceWithExpiration() throws Throwable
             assertRowsInResult(cfs, command, 1);
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -374,7 +374,7 @@ public void testGetSliceFromAdvanced() throws Throwable
         for (int i = 1; i < 7; i++)
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // overwrite three rows with -1
         for (int i = 1; i < 4; i++)
@@ -386,7 +386,7 @@ public void testGetSliceFromAdvanced() throws Throwable
             assertRowsInResult(cfs, command, -1, -1, 4);
 
             if (round == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
@@ -399,7 +399,7 @@ public void testGetSliceFromLarge() throws Throwable
         for (int i = 1000; i < 2000; i++)
             execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", "0", i, i);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         validateSliceLarge(cfs);
 
@@ -427,7 +427,7 @@ public void testLimitSSTables() throws Throwable
             for (int i = 1000 + (j*100); i < 1000 + ((j+1)*100); i++)
                 execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", "0", i, i, (long)i);
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         ((ClearableHistogram)cfs.metric.sstablesPerReadHistogram.cf).clear();
diff --git a/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java b/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java
index d69025372be1..fff835c71e95 100644
--- a/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/MultiKeyspaceTest.java
@@ -21,6 +21,7 @@
  */
 
 import org.apache.cassandra.cql3.CQLTester;
+
 import org.junit.Test;
 
 
@@ -38,8 +39,8 @@ public void testSameTableNames() throws Throwable
         execute("INSERT INTO multikstest1.standard1 (a, b) VALUES (0, 0)");
         execute("INSERT INTO multikstest2.standard1 (a, b) VALUES (0, 0)");
 
-        Keyspace.open("multikstest1").flush();
-        Keyspace.open("multikstest2").flush();
+        Keyspace.open("multikstest1").flush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        Keyspace.open("multikstest2").flush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertRows(execute("SELECT * FROM multikstest1.standard1"),
                    row(0, 0));
diff --git a/test/unit/org/apache/cassandra/db/NameSortTest.java b/test/unit/org/apache/cassandra/db/NameSortTest.java
index 0d7b09c8fc50..2fdd73530077 100644
--- a/test/unit/org/apache/cassandra/db/NameSortTest.java
+++ b/test/unit/org/apache/cassandra/db/NameSortTest.java
@@ -84,7 +84,7 @@ private void testNameSort(int N) throws IOException
             rub.build().applyUnsafe();
         }
         validateNameSort(cfs);
-        keyspace.getColumnFamilyStore("Standard1").forceBlockingFlush();
+        keyspace.getColumnFamilyStore("Standard1").forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         validateNameSort(cfs);
     }
 
diff --git a/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java b/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java
index 986a1251d55c..0fd3cc2fff77 100644
--- a/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java
+++ b/test/unit/org/apache/cassandra/db/PartitionRangeReadTest.java
@@ -100,14 +100,14 @@ public void testCassandra6778()
                 .add("val", "val1")
                 .build()
                 .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, "k1")
                 .clustering(new BigInteger(new byte[]{0, 0, 1}))
                 .add("val", "val2")
                 .build()
                 .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // fetch by the first column name; we should get the second version of the column value
         Row row = Util.getOnlyRow(Util.cmd(cfs, "k1").includeRow(new BigInteger(new byte[]{1})).build());
@@ -157,7 +157,7 @@ public void testRangeSliceInclusionExclusion()
             builder.build().applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         ColumnMetadata cDef = cfs.metadata().getColumn(ByteBufferUtil.bytes("val"));
 
diff --git a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
index 755302d1bdfa..819e60b1e4bd 100644
--- a/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
+++ b/test/unit/org/apache/cassandra/db/RangeTombstoneTest.java
@@ -85,7 +85,7 @@ public void simpleQueryWithRangeTombstoneTest() throws Exception
         for (int i = 0; i < 40; i += 2)
             builder.newRow(i).add("val", i);
         builder.applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(10, 22).build().applyUnsafe();
 
@@ -235,7 +235,7 @@ public void testTrackTimesPartitionTombstone() throws ExecutionException, Interr
 
         int nowInSec = FBUtilities.nowInSeconds();
         new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata(), Util.dk(key), 1000, nowInSec)).apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
@@ -257,7 +257,7 @@ public void testTrackTimesPartitionTombstoneWithData() throws ExecutionException
         key = "rt_times2";
         int nowInSec = FBUtilities.nowInSeconds();
         new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata(), Util.dk(key), 1000, nowInSec)).apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
@@ -276,7 +276,7 @@ public void testTrackTimesRangeTombstone() throws ExecutionException, Interrupte
 
         int nowInSec = FBUtilities.nowInSeconds();
         new RowUpdateBuilder(cfs.metadata(), nowInSec, 1000L, key).addRangeTombstone(1, 2).build().apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 1000, 1000, nowInSec);
@@ -298,9 +298,9 @@ public void testTrackTimesRangeTombstoneWithData() throws ExecutionException, In
         key = "rt_times2";
         int nowInSec = FBUtilities.nowInSeconds();
         new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata(), Util.dk(key), 1000, nowInSec)).apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         assertTimes(sstable.getSSTableMetadata(), 999, 1000, Integer.MAX_VALUE);
         cfs.forceMajorCompaction();
@@ -328,10 +328,10 @@ public void test7810() throws ExecutionException, InterruptedException
         for (int i = 10; i < 20; i ++)
             builder.newRow(i).add("val", i);
         builder.apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(10, 11).build().apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Thread.sleep(5);
         cfs.forceMajorCompaction();
@@ -350,10 +350,10 @@ public void test7808_1() throws ExecutionException, InterruptedException
         for (int i = 0; i < 40; i += 2)
             builder.newRow(i).add("val", i);
         builder.apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata(), Util.dk(key), 1, 1)).apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Thread.sleep(5);
         cfs.forceMajorCompaction();
     }
@@ -370,13 +370,13 @@ public void test7808_2() throws ExecutionException, InterruptedException
         for (int i = 10; i < 20; i ++)
             builder.newRow(i).add("val", i);
         builder.apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new Mutation(PartitionUpdate.fullPartitionDelete(cfs.metadata(), Util.dk(key), 0, 0)).apply();
 
         UpdateBuilder.create(cfs.metadata(), key).withTimestamp(1).newRow(5).add("val", 5).apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Thread.sleep(5);
         cfs.forceMajorCompaction();
         assertEquals(1, Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).rowCount());
@@ -396,16 +396,16 @@ public void overlappingRangeTest() throws Exception
         for (int i = 0; i < 20; i++)
             builder.newRow(i).add("val", i);
         builder.applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(5, 15).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(5, 10).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 2, key).addRangeTombstone(5, 8).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Partition partition = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build());
         int nowInSec = FBUtilities.nowInSeconds();
@@ -447,11 +447,11 @@ public void reverseQueryTest() throws Exception
         String key = "k3";
 
         UpdateBuilder.create(cfs.metadata(), key).withTimestamp(0).newRow(2).add("val", 2).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(0, 10).build().applyUnsafe();
         UpdateBuilder.create(cfs.metadata(), key).withTimestamp(2).newRow(1).add("val", 1).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Get the last value of the row
         FilteredPartition partition = Util.getOnlyPartition(Util.cmd(cfs, key).build());
@@ -508,10 +508,10 @@ public void testRowWithRangeTombstonesUpdatesSecondaryIndex() throws Exception
         for (int i = 0; i < 10; i++)
             builder.newRow(i).add("val", i);
         builder.applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 0, key).addRangeTombstone(0, 7).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(10, index.rowsInserted.size());
 
@@ -538,10 +538,10 @@ public void testRangeTombstoneCompaction() throws Exception
         for (int i = 0; i < 10; i += 2)
             builder.newRow(i).add("val", i);
         builder.applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 0, key).addRangeTombstone(0, 7).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // there should be 2 sstables
         assertEquals(2, cfs.getLiveSSTables().size());
@@ -614,7 +614,7 @@ public void testOverwritesToDeletedColumns() throws Exception
         // now re-insert that column
         UpdateBuilder.create(cfs.metadata(), key).withTimestamp(2).newRow(1).add("val", 1).applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // We should have 1 insert and 1 update to the indexed "1" column
         // CASSANDRA-6640 changed index update to just update, not insert then delete
diff --git a/test/unit/org/apache/cassandra/db/ReadCommandTest.java b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
index dcd6331c3b6f..fc069450767d 100644
--- a/test/unit/org/apache/cassandra/db/ReadCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
@@ -218,7 +218,7 @@ public void testPartitionRangeAbort() throws Exception
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key2"))
                 .clustering("Column1")
@@ -246,7 +246,7 @@ public void testSinglePartitionSliceAbort() throws Exception
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key"))
                 .clustering("dd")
@@ -277,7 +277,7 @@ public void testSinglePartitionNamesAbort() throws Exception
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key"))
                 .clustering("dd")
@@ -356,7 +356,7 @@ public void testSinglePartitionGroupMerge() throws Exception
                 commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter, DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             ReadQuery query = new SinglePartitionReadCommand.Group(commands, DataLimits.NONE);
 
@@ -526,7 +526,7 @@ public void testCountDeletedRows() throws Exception
                         DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             ReadQuery query = new SinglePartitionReadCommand.Group(commands, DataLimits.NONE);
 
@@ -602,7 +602,7 @@ public void testCountWithNoDeletedRow() throws Exception
                         DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             ReadQuery query = new SinglePartitionReadCommand.Group(commands, DataLimits.NONE);
 
@@ -661,7 +661,7 @@ public void testSinglePartitionNamesSkipsOptimisationsIfTrackingRepairedData()
             .build()
             .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, ByteBufferUtil.bytes("key"))
             .clustering("dd")
@@ -669,7 +669,7 @@ public void testSinglePartitionNamesSkipsOptimisationsIfTrackingRepairedData()
             .build()
             .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         assertEquals(2, sstables.size());
         Collections.sort(sstables, SSTableReader.maxTimestampDescending);
@@ -709,7 +709,7 @@ public void dontIncludeLegacyCounterContextInDigest() throws IOException
                 .addLegacyCounterCell("c", 0L)
                 .build()
                 .apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         // execute a read and capture the digest
@@ -726,7 +726,7 @@ public void dontIncludeLegacyCounterContextInDigest() throws IOException
                 .addLegacyCounterCell("c", 1L)
                 .build()
                 .apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         ByteBuffer digestWithLegacyCounter1 = performReadAndVerifyRepairedInfo(readCommand, 1, 1, true);
@@ -741,7 +741,7 @@ public void dontIncludeLegacyCounterContextInDigest() throws IOException
                 .add("c", 1L)
                 .build()
                 .apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         ByteBuffer digestWithCounterCell = performReadAndVerifyRepairedInfo(readCommand, 1, 1, true);
@@ -785,7 +785,7 @@ public void purgeGCableTombstonesBeforeCalculatingDigest() throws Exception
         // Partition with 2 rows, one fully deleted
         new RowUpdateBuilder(cfs.metadata.get(), 0, keys[3]).clustering("bb").add("a", ByteBufferUtil.bytes("a")).delete("b").build().apply();
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, keys[3], "cc").apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         Map<DecoratedKey, ByteBuffer> digestsWithTombstones = new HashMap<>();
@@ -848,7 +848,7 @@ public void testRepairedDataOverreadMetrics()
                                                         .build());
         // Insert and repair
         insert(cfs, IntStream.range(0, 10), () -> IntStream.range(0, 10));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
         // Insert and leave unrepaired
         insert(cfs, IntStream.range(0, 10), () -> IntStream.range(10, 20));
@@ -983,7 +983,7 @@ private void fullyPurgedPartitionCreatesEmptyDigest(ColumnFamilyStore cfs, ReadC
         // Partition with a fully deleted static row and a single, fully deleted regular row
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, ByteBufferUtil.bytes("key")).apply();
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, ByteBufferUtil.bytes("key"), "cc").apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         command.trackRepairedStatus();
@@ -1029,12 +1029,12 @@ public void mixedPurgedAndNonPurgedPartitions()
 
         // Live partition in a repaired sstable, so included in the digest calculation
         new RowUpdateBuilder(cfs.metadata.get(), 0, ByteBufferUtil.bytes("key-0")).clustering("cc").add("a", ByteBufferUtil.bytes("a")).build().apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
         // Fully deleted partition (static and regular rows) in an unrepaired sstable, so not included in the intial digest
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, ByteBufferUtil.bytes("key-1")).apply();
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, ByteBufferUtil.bytes("key-1"), "cc").apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         command.trackRepairedStatus();
         List<ImmutableBTreePartition> partitions = Util.getAllUnfiltered(command);
@@ -1071,11 +1071,11 @@ public void purgingConsidersRepairedDataOnly() throws Exception
         DecoratedKey key = Util.dk("key");
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, key).apply();
         RowUpdateBuilder.deleteRow(cfs.metadata(), 0, key, "cc").apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.getLiveSSTables().forEach(sstable -> mutateRepaired(cfs, sstable, 111, null));
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).clustering("cc").add("a", ByteBufferUtil.bytes("a")).build().apply();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         int nowInSec = FBUtilities.nowInSeconds() + 10;
         ReadCommand cmd = Util.cmd(cfs, key).withNowInSeconds(nowInSec).build();
@@ -1113,7 +1113,7 @@ public void skipRowCacheIfTrackingRepairedData()
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         ReadCommand readCommand = Util.cmd(cfs, Util.dk("key")).build();
         assertTrue(cfs.isRowCacheEnabled());
@@ -1186,7 +1186,7 @@ private void testRepairedDataTracking(ColumnFamilyStore cfs, ReadCommand readCom
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 1, ByteBufferUtil.bytes("key"))
                 .clustering("dd")
@@ -1194,7 +1194,7 @@ private void testRepairedDataTracking(ColumnFamilyStore cfs, ReadCommand readCom
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         assertEquals(2, sstables.size());
         sstables.forEach(sstable -> assertFalse(sstable.isRepaired() || sstable.isPendingRepair()));
@@ -1251,7 +1251,7 @@ private void testRepairedDataTracking(ColumnFamilyStore cfs, ReadCommand readCom
             assertEquals(EMPTY_BYTE_BUFFER, digest);
 
             // now flush so we have an unrepaired table with the deletion and repeat the check
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             digest = performReadAndVerifyRepairedInfo(readCommand, 0, rowsPerPartition, false);
             assertEquals(EMPTY_BYTE_BUFFER, digest);
         }
diff --git a/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java
index d3954394f070..9b2d1375b9b2 100644
--- a/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java
+++ b/test/unit/org/apache/cassandra/db/RecoveryManagerFlushedTest.java
@@ -101,8 +101,8 @@ public static void defineSchema() throws ConfigurationException
     public void testWithFlush() throws Exception
     {
         // Flush everything that may be in the commit log now to start fresh
-        FBUtilities.waitOnFutures(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).flush());
-        FBUtilities.waitOnFutures(Keyspace.open(SchemaConstants.SCHEMA_KEYSPACE_NAME).flush());
+        FBUtilities.waitOnFutures(Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
+        FBUtilities.waitOnFutures(Keyspace.open(SchemaConstants.SCHEMA_KEYSPACE_NAME).flush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
 
 
         CompactionManager.instance.disableAutoCompaction();
@@ -119,7 +119,7 @@ public void testWithFlush() throws Exception
         Keyspace keyspace1 = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore cfs = keyspace1.getColumnFamilyStore("Standard1");
         logger.debug("forcing flush");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         logger.debug("begin manual replay");
         // replay the commit log (nothing on Standard1 should be replayed since everything was flushed, so only the row on Standard2
diff --git a/test/unit/org/apache/cassandra/db/RemoveCellTest.java b/test/unit/org/apache/cassandra/db/RemoveCellTest.java
index 01fe2551f4b9..f09d9c84e5d2 100644
--- a/test/unit/org/apache/cassandra/db/RemoveCellTest.java
+++ b/test/unit/org/apache/cassandra/db/RemoveCellTest.java
@@ -30,7 +30,7 @@ public void testDeleteCell() throws Throwable
         String tableName = createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY (a, b))");
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
         execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 0, 0, 0L);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         execute("DELETE c FROM %s USING TIMESTAMP ? WHERE a = ? AND b = ?", 1L, 0, 0);
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b = ?", 0, 0), row(0, 0, null));
         assertRows(execute("SELECT c FROM %s WHERE a = ? AND b = ?", 0, 0), row(new Object[]{null}));
diff --git a/test/unit/org/apache/cassandra/db/RowCacheTest.java b/test/unit/org/apache/cassandra/db/RowCacheTest.java
index 59ce21345425..8b715916ece7 100644
--- a/test/unit/org/apache/cassandra/db/RowCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/RowCacheTest.java
@@ -491,7 +491,7 @@ public void testSSTablesPerReadHistogramWhenRowCache()
         SchemaLoader.insertData(KEYSPACE_CACHED, CF_CACHED, 0, 100);
 
         //force flush for confidence that SSTables exists
-        cachedStore.forceBlockingFlush();
+        cachedStore.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         ((ClearableHistogram)cachedStore.metric.sstablesPerReadHistogram.cf).clear();
 
diff --git a/test/unit/org/apache/cassandra/db/RowIterationTest.java b/test/unit/org/apache/cassandra/db/RowIterationTest.java
index b0cd4fc1ca40..1b9230ec05a0 100644
--- a/test/unit/org/apache/cassandra/db/RowIterationTest.java
+++ b/test/unit/org/apache/cassandra/db/RowIterationTest.java
@@ -36,7 +36,7 @@ public void testRowIteration() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
         for (int i = 0; i < 10; i++)
             execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) USING TIMESTAMP ?", i, 0, i, i, (long)i);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(10, execute("SELECT * FROM %s").size());
     }
 
@@ -49,7 +49,7 @@ public void testRowIterationDeletionTime() throws Throwable
         execute("INSERT INTO %s (a, b) VALUES (?, ?) USING TIMESTAMP ?", 0, 0, 0L);
         execute("DELETE FROM %s USING TIMESTAMP ? WHERE a = ?", 0L, 0);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Delete row in second sstable with higher timestamp
         execute("INSERT INTO %s (a, b) VALUES (?, ?) USING TIMESTAMP ?", 0, 0, 1L);
@@ -57,7 +57,7 @@ public void testRowIterationDeletionTime() throws Throwable
 
         int localDeletionTime = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).partitionLevelDeletion().localDeletionTime();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         DeletionTime dt = Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).partitionLevelDeletion();
         assertEquals(1L, dt.markedForDeleteAt());
@@ -72,7 +72,7 @@ public void testRowIterationDeletion() throws Throwable
 
         // Delete a row in first sstable
         execute("DELETE FROM %s USING TIMESTAMP ? WHERE a = ?", 0L, 0);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs).build()).isEmpty());
     }
diff --git a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
index b928ebfe6d07..d801f80ddc10 100644
--- a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
+++ b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
@@ -303,6 +303,7 @@ public void testCfmOptionsCQL()
                             "    AND comment = 'comment'\n" +
                             "    AND compaction = {'class': 'org.apache.cassandra.db.compaction.LeveledCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4', 'sstable_size_in_mb': '1'}\n" +
                             "    AND compression = {'chunk_length_in_kb': '64', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor', 'min_compress_ratio': '2.0'}\n" +
+                            "    AND memtable = {}\n" +
                             "    AND crc_check_chance = 0.3\n" +
                             "    AND default_time_to_live = 4\n" +
                             "    AND extensions = {'ext1': 0x76616c31}\n" +
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 502f6182cce0..11a20f7f8bb6 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -575,7 +575,7 @@ protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable)
             new Mutation(update).applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long... values)
@@ -599,7 +599,7 @@ public static void fillIndexCF(ColumnFamilyStore cfs, boolean composite, long...
             new Mutation(builder.build()).applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     protected String[] fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
@@ -616,7 +616,7 @@ protected String[] fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable
             new CounterMutation(new Mutation(update), ConsistencyLevel.ONE).apply();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         return tokenSorted.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
     }
@@ -629,14 +629,14 @@ public void testScrubColumnValidation() throws InterruptedException, RequestExec
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("test_compact_static_columns");
 
         QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".test_compact_static_columns (a, b, c, d) VALUES (123, c3db07e8-b602-11e3-bc6b-e0b9a54a6d93, true, 'foobar')", ksName));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
         QueryProcessor.process(String.format("CREATE TABLE \"%s\".test_scrub_validation (a text primary key, b int)", ksName), ConsistencyLevel.ONE);
         ColumnFamilyStore cfs2 = keyspace.getColumnFamilyStore("test_scrub_validation");
 
         new Mutation(UpdateBuilder.create(cfs2.metadata(), "key").newRow().add("b", Int32Type.instance.decompose(1)).build()).apply();
-        cfs2.forceBlockingFlush();
+        cfs2.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         CompactionManager.instance.performScrub(cfs2, false, false, 2);
     }
@@ -655,7 +655,7 @@ public void testValidationCompactStorage() throws Exception
         QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'a', 'foo')", ksName));
         QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'b', 'bar')", ksName));
         QueryProcessor.executeInternal(String.format("INSERT INTO \"%s\".test_compact_dynamic_columns (a, b, c) VALUES (0, 'c', 'boo')", ksName));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performScrub(cfs, true, true, 2);
 
         // Scrub is silent, but it will remove broken records. So reading everything back to make sure nothing to "scrubbed away"
diff --git a/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
index 3ec0c6838fe8..edcf4cd01122 100644
--- a/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/db/SecondaryIndexTest.java
@@ -300,7 +300,7 @@ public void testDeleteOfInconsistentValuesInKeysIndex() throws Exception
         new RowUpdateBuilder(cfs.metadata(), 1, "k1").noRowMarker().add("birthdate", 1L).build().applyUnsafe();
 
         // force a flush, so our index isn't being read from a memtable
-        keyspace.getColumnFamilyStore(WITH_KEYS_INDEX).forceBlockingFlush();
+        keyspace.getColumnFamilyStore(WITH_KEYS_INDEX).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // now apply another update, but force the index update to be skipped
         keyspace.apply(new RowUpdateBuilder(cfs.metadata(), 2, "k1").noRowMarker().add("birthdate", 2L).build(),
@@ -356,7 +356,7 @@ private void runDeleteOfInconsistentValuesFromCompositeIndexTest(boolean isStati
         assertIndexedOne(cfs, col, 10l);
 
         // force a flush and retry the query, so our index isn't being read from a memtable
-        keyspace.getColumnFamilyStore(cfName).forceBlockingFlush();
+        keyspace.getColumnFamilyStore(cfName).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertIndexedOne(cfs, col, 10l);
 
         // now apply another update, but force the index update to be skipped
@@ -522,7 +522,7 @@ public void testKeysSearcherSimple() throws Exception
             new RowUpdateBuilder(cfs.metadata(), 0, "k" + i).noRowMarker().add("birthdate", 1l).build().applyUnsafe();
 
         assertIndexedCount(cfs, ByteBufferUtil.bytes("birthdate"), 1l, 10);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertIndexedCount(cfs, ByteBufferUtil.bytes("birthdate"), 1l, 10);
     }
 
@@ -537,7 +537,7 @@ public void testSelectivityWithMultipleIndexes()
         new RowUpdateBuilder(cfs.metadata(), 0, "k3").clustering("c").add("birthdate", 1L).add("notbirthdate", 3L).build().applyUnsafe();
         new RowUpdateBuilder(cfs.metadata(), 0, "k4").clustering("c").add("birthdate", 1L).add("notbirthdate", 3L).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         ReadCommand rc = Util.cmd(cfs)
                              .fromKeyIncl("k1")
                              .toKeyIncl("k3")
diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java
index 1c891ec2b2c6..47e866c0dd4d 100644
--- a/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionReadCommandCQLTest.java
@@ -31,10 +31,10 @@ public void partitionLevelDeletionTest() throws Throwable
     {
         createTable("CREATE TABLE %s (bucket_id TEXT,name TEXT,data TEXT,PRIMARY KEY (bucket_id, name))");
         execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c18', 'test', 'hello')");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         execute("insert into %s (bucket_id, name, data) values ('8772618c9009cf8f5a5e0c19', 'test2', 'hello');");
         execute("delete from %s where bucket_id = '8772618c9009cf8f5a5e0c18'");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         UntypedResultSet res = execute("select * from %s where bucket_id = '8772618c9009cf8f5a5e0c18' and name = 'test'");
         assertTrue(res.isEmpty());
     }
diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
index ffa44228e3bc..960ee2abf97c 100644
--- a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
@@ -183,7 +183,7 @@ private void testMultiNamesOrSlicesCommand(boolean flush, boolean isSlice)
                                                      ck1));
 
         if (flush)
-            Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_SCLICES).forceBlockingFlush();
+            Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE_SCLICES).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         AbstractClusteringIndexFilter clusteringFilter = createClusteringFilter(uniqueCk1, uniqueCk2, isSlice);
         ReadCommand cmd = SinglePartitionReadCommand.create(CFM_SLICES,
@@ -282,7 +282,7 @@ public void staticColumnsAreReturned() throws IOException
         }
 
         // check (de)serialized iterator for sstable static cell
-        Schema.instance.getColumnFamilyStoreInstance(metadata.id).forceBlockingFlush();
+        Schema.instance.getColumnFamilyStoreInstance(metadata.id).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         try (ReadExecutionController executionController = cmd.executionController(); UnfilteredPartitionIterator pi = cmd.executeLocally(executionController))
         {
             response = ReadResponse.createDataResponse(pi, cmd);
@@ -321,7 +321,7 @@ public void testReadOnRangeTombstoneMarker()
             QueryProcessor.executeOnceInternal("DELETE FROM ks.test_read_rt USING TIMESTAMP 10 WHERE k=1 AND c1=1");
 
             List<Unfiltered> memtableUnfiltereds = assertQueryReturnsSingleRT(query);
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             List<Unfiltered> sstableUnfiltereds = assertQueryReturnsSingleRT(query);
 
             String errorMessage = String.format("Expected %s but got %s with postfix '%s'",
@@ -359,17 +359,17 @@ public void testPartitionDeletionRowDeletionTie()
                                                                   timestamp,
                                                                   "DELETE FROM ks.partition_row_deletion USING TIMESTAMP 10 WHERE k=1");
             if (flush && multiSSTable)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             QueryProcessor.executeOnceInternalWithNowAndTimestamp(nowInSec,
                                                                   timestamp,
                                                                   "DELETE FROM ks.partition_row_deletion USING TIMESTAMP 10 WHERE k=1 and c=1");
             if (flush)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             QueryProcessor.executeOnceInternal("INSERT INTO ks.partition_row_deletion(k,c,v) VALUES(1,1,1) using timestamp 11");
             if (flush)
             {
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                 try
                 {
                     cfs.forceMajorCompaction();
@@ -427,17 +427,17 @@ public void testPartitionDeletionRangeDeletionTie()
                                                                   timestamp,
                                                                   "DELETE FROM ks.partition_range_deletion USING TIMESTAMP 10 WHERE k=1");
             if (flush && multiSSTable)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             QueryProcessor.executeOnceInternalWithNowAndTimestamp(nowInSec,
                                                                   timestamp,
                                                                   "DELETE FROM ks.partition_range_deletion USING TIMESTAMP 10 WHERE k=1 and c1=1");
             if (flush)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             QueryProcessor.executeOnceInternal("INSERT INTO ks.partition_range_deletion(k,c1,c2,v) VALUES(1,1,1,1) using timestamp 11");
             if (flush)
             {
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                 try
                 {
                     cfs.forceMajorCompaction();
@@ -547,7 +547,7 @@ public void sstableFiltering()
         QueryProcessor.executeOnceInternal("INSERT INTO ks.legacy_mc_inaccurate_min_max (k, c1, c2, c3, v) VALUES (100, 2, 2, 2, 2)");
         QueryProcessor.executeOnceInternal("DELETE FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1");
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1");
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=1 AND c2=1 AND c3=1"); // clustering names
 
@@ -563,7 +563,7 @@ public void sstableFiltering()
         new Mutation(builder.build()).apply();
 
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2");
         assertQueryReturnsSingleRT("SELECT * FROM ks.legacy_mc_inaccurate_min_max WHERE k=100 AND c1=3 AND c2=2 AND c3=2"); // clustering names
 
@@ -701,7 +701,7 @@ private SSTableReader createSSTable(TableMetadata metadata, String keyspace, Str
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
         for (int i = 0; i < 10; i++)
             QueryProcessor.executeInternal(String.format(query, keyspace, table, i));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("k1"));
         ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, key));
         assertEquals(1, view.sstables.size());
diff --git a/test/unit/org/apache/cassandra/db/TimeSortTest.java b/test/unit/org/apache/cassandra/db/TimeSortTest.java
index 8ae05ea9578f..ae67d7ddd353 100644
--- a/test/unit/org/apache/cassandra/db/TimeSortTest.java
+++ b/test/unit/org/apache/cassandra/db/TimeSortTest.java
@@ -36,7 +36,7 @@ public void testMixedSources() throws Throwable
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(tableName);
 
         execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 100, 0, 100L);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", 0, 0, 1, 0L);
 
         assertRows(execute("SELECT * FROM %s WHERE a = ? AND b >= ? LIMIT 1000", 0, 10), row(0, 100, 0));
@@ -53,7 +53,7 @@ public void testTimeSort() throws Throwable
                 execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?) USING TIMESTAMP ?", i, j * 2, 0, (long)j * 2);
 
         validateTimeSort();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         validateTimeSort();
 
         // interleave some new data to test memtable + sstable
diff --git a/test/unit/org/apache/cassandra/db/VerifyTest.java b/test/unit/org/apache/cassandra/db/VerifyTest.java
index 4b73f270103c..9ca98b8e7504 100644
--- a/test/unit/org/apache/cassandra/db/VerifyTest.java
+++ b/test/unit/org/apache/cassandra/db/VerifyTest.java
@@ -696,7 +696,7 @@ public void testVerifyLocalPartitioner() throws UnknownHostException
         Batch bogus = Batch.createLocal(UUID.randomUUID(), 0, Collections.emptyList());
         BatchlogManager.store(bogus);
         ColumnFamilyStore cfs = Keyspace.open("system").getColumnFamilyStore("batches");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         for (SSTableReader sstable : cfs.getLiveSSTables())
         {
 
@@ -754,7 +754,7 @@ protected void fillCF(ColumnFamilyStore cfs, int partitionsPerSSTable)
                          .apply();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) throws WriteTimeoutException
@@ -766,7 +766,7 @@ protected void fillCounterCF(ColumnFamilyStore cfs, int partitionsPerSSTable) th
                          .apply();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     protected long simpleFullChecksum(String filename) throws IOException
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
index 4725bcf3998d..531ca87bee27 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
@@ -45,7 +45,7 @@ public void testTruncateSegmentDiscard() throws Throwable
         // Calling switchMemtable directly applies Flush even though memtable is empty. This can happen with some races
         // (flush with recycling by segment manager). It should still tell commitlog that the memtable's region is clean.
         // CASSANDRA-12436
-        cfs.switchMemtable();
+        cfs.switchMemtable(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         execute("INSERT INTO %s (idx, data) VALUES (?, ?)", 15, Integer.toString(17));
 
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogReaderTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogReaderTest.java
index 794f99f47b28..a4713b2533f4 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogReaderTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogReaderTest.java
@@ -27,6 +27,7 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.Config;
@@ -261,7 +262,9 @@ CommitLogPosition populateData(int entryCount) throws Throwable
         for (int i = midpoint; i < entryCount; i++)
             execute("INSERT INTO %s (idx, data) VALUES (?, ?)", i, Integer.toString(i));
 
-        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+        Keyspace.open(keyspace())
+                .getColumnFamilyStore(currentTable())
+                .forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         return result;
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java
index 4128b7122ee6..133755354477 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java
@@ -31,6 +31,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.commitlog.CommitLogSegment.CDCState;
@@ -95,7 +96,9 @@ public void testCDCWriteFailure() throws Throwable
             execute("INSERT INTO %s (idx, data) VALUES (1, '1');");
 
             // Confirm that, on flush+recyle, we see files show up in cdc_raw
-            Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
+            Keyspace.open(keyspace())
+                    .getColumnFamilyStore(currentTable())
+                    .forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             CommitLog.instance.forceRecycleAllSegments();
             cdcMgr.awaitManagementTasksCompletion();
             Assert.assertTrue("Expected files to be moved to overflow.", getCDCRawCount() > 0);
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
index 86a47febc74b..e433c39523af 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
@@ -29,6 +29,7 @@
 import java.util.zip.CRC32;
 import java.util.zip.Checksum;
 
+import com.google.common.base.Throwables;
 import com.google.common.collect.Iterables;
 import com.google.common.io.Files;
 
@@ -39,6 +40,8 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.db.memtable.AbstractAllocatorMemtable;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.io.compress.ZstdCompressor;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.schema.TableId;
@@ -899,7 +902,7 @@ public void testUnwriteableFlushRecovery() throws ExecutionException, Interrupte
                 {
                     try (Closeable c = Util.markDirectoriesUnwriteable(cfs))
                     {
-                        cfs.forceBlockingFlush();
+                        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
                     }
                     catch (Throwable t)
                     {
@@ -909,7 +912,7 @@ public void testUnwriteableFlushRecovery() throws ExecutionException, Interrupte
                     }
                 }
                 else
-                    cfs.forceBlockingFlush();
+                    cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             }
         }
         finally
@@ -941,7 +944,7 @@ public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable>
 
             Memtable current = cfs.getTracker().getView().getCurrentMemtable();
             if (i == 2)
-                current.makeUnflushable();
+                ((AbstractAllocatorMemtable) current).makeUnflushable();
 
             flushAction.accept(cfs, current);
         }
@@ -964,7 +967,7 @@ public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable>
     {
         try
         {
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         catch (Throwable t)
         {
@@ -972,7 +975,7 @@ public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable>
             while (!(t instanceof FSWriteError))
                 t = t.getCause();
             // Wait for started flushes to complete.
-            cfs.switchMemtableIfCurrent(current);
+            waitForStartedFlushes(cfs, current);
         }
     };
 
@@ -984,9 +987,21 @@ public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable>
         CommitLog.instance.forceRecycleAllSegments();
 
         // Wait for started flushes to complete.
-        cfs.switchMemtableIfCurrent(current);
+        waitForStartedFlushes(cfs, current);
     };
 
+    // Waits for started flushes to complete; restores the interrupt flag if the wait is interrupted.
+    private void waitForStartedFlushes(ColumnFamilyStore cfs, Memtable current)
+    {
+        try { cfs.switchMemtableIfCurrent(current, ColumnFamilyStore.FlushReason.UNIT_TESTS).get(); }
+        catch (InterruptedException|ExecutionException e)
+        {
+            if (e instanceof InterruptedException)
+                Thread.currentThread().interrupt(); // keep the interruption visible to callers
+            throw Throwables.propagate(e);
+        }
+    }
+
     @Test
     public void testOutOfOrderFlushRecovery() throws ExecutionException, InterruptedException, IOException
     {
diff --git a/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java b/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java
index b3dc07010146..00b0e409ca2a 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/SnapshotDeletingTest.java
@@ -76,7 +76,7 @@ public void testCompactionHook() throws Exception
 
         // Compact the cf and confirm that the executor's after hook calls rescheduleDeletion
         populate(20000);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         store.forceMajorCompaction();
 
         long start = System.currentTimeMillis();
diff --git a/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java
index 4092f541f4b1..82544f379340 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java
@@ -139,6 +139,6 @@ private static void insertKeyAndFlush(String table, int key)
         .add("val", "val")
         .build()
         .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
index de7ddfcb1f38..915abbed1597 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
@@ -86,7 +86,7 @@ SSTableReader makeSSTable(boolean orphan)
         int pk = nextSSTableKey++;
         Set<SSTableReader> pre = cfs.getLiveSSTables();
         QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES(?, ?)", ks, tbl), pk, pk);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Set<SSTableReader> post = cfs.getLiveSSTables();
         Set<SSTableReader> diff = new HashSet<>(post);
         diff.removeAll(pre);
diff --git a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
index 3baad0ae5ad0..4a859e7f8275 100644
--- a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.cache.AutoSavingCache;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.db.view.ViewBuilderTask;
@@ -67,7 +68,7 @@ public void testActiveCompactionTrackingRaceWithIndexBuilder() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (pk, ck, a, b) VALUES (" + i + ", 2, 3, 4)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         Index idx = getCurrentColumnFamilyStore().indexManager.getIndexByName(idxName);
@@ -111,7 +112,7 @@ public void testSecondaryIndexTracking() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (pk, ck, a, b) VALUES (" + i + ", 2, 3, 4)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
 
         Index idx = getCurrentColumnFamilyStore().indexManager.getIndexByName(idxName);
@@ -134,7 +135,7 @@ public void testIndexSummaryRedistributionTracking() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (pk, ck, a, b) VALUES (" + i + ", 2, 3, 4)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         try (LifecycleTransaction txn = getCurrentColumnFamilyStore().getTracker().tryModify(sstables, OperationType.INDEX_SUMMARY))
@@ -159,7 +160,7 @@ public void testViewBuildTracking() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (k1, c1, val) VALUES (" + i + ", 2, 3)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
         execute(String.format("CREATE MATERIALIZED VIEW %s.view1 AS SELECT k1, c1, val FROM %s.%s WHERE k1 IS NOT NULL AND c1 IS NOT NULL AND val IS NOT NULL PRIMARY KEY (val, k1, c1)", keyspace(), keyspace(), currentTable()));
         View view = Iterables.getOnlyElement(getCurrentColumnFamilyStore().viewManager);
@@ -183,7 +184,7 @@ public void testScrubOne() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (pk, ck, a, b) VALUES (" + i + ", 2, 3, 4)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
 
         SSTableReader sstable = Iterables.getFirst(getCurrentColumnFamilyStore().getLiveSSTables(), null);
@@ -208,7 +209,7 @@ public void testVerifyOne() throws Throwable
         for (int i = 0; i < 5; i++)
         {
             execute("INSERT INTO %s (pk, ck, a, b) VALUES (" + i + ", 2, 3, 4)");
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
 
         SSTableReader sstable = Iterables.getFirst(getCurrentColumnFamilyStore().getLiveSSTables(), null);
diff --git a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java
index 38d2607d2f7d..66ea7bb5b3fe 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionBytemanTest.java
@@ -67,7 +67,7 @@ public void testRedundantTransitions() throws Throwable
         execute("insert into %s (id, i) values (1, 1)");
         execute("insert into %s (id, i) values (2, 1)");
         execute("insert into %s (id, i) values (3, 1)");
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         UntypedResultSet res = execute("select token(id) as tok from %s");
         Iterator<UntypedResultSet.Row> it = res.iterator();
         List<Long> tokens = new ArrayList<>();
diff --git a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
index b2618e54f7ff..e37f7d7cd3f0 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AntiCompactionTest.java
@@ -299,7 +299,7 @@ public void generateSStable(ColumnFamilyStore store, String Suffix)
                     .build()
                     .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     @Test
@@ -442,7 +442,7 @@ private ColumnFamilyStore prepareColumnFamilyStore()
                 .build()
                 .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         return store;
     }
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
index beed019554b9..551fef16bb44 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
@@ -462,7 +462,7 @@ public void testStandardCompactionTaskCancellation() throws Throwable
         for (int i = 0; i < 10; i++)
         {
             execute("insert into %s (id, something) values (?,?)", i, i);
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
         }
         AbstractCompactionTask ct = null;
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
index 68936f55427b..d7d281aaaed1 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
@@ -195,7 +195,7 @@ private void populate(int count) throws Throwable
                 execute(String.format("INSERT INTO %s.%s(k, t, v) VALUES (?, ?, ?)", KEYSPACE, TABLE), i, j, b);
 
         ColumnFamilyStore cfs = getColumnFamilyStore();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         if (cfs.getLiveSSTables().size() > 1)
         {
             // we want just one big sstable to avoid doing actual compaction in compact() above
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java
index 500a88179f3e..fd76c6e711e5 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java
@@ -91,7 +91,7 @@ public void testMaxPurgeableTimestamp()
         {
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp1); //memtable only
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             assertTrue(controller.getPurgeEvaluator(key).test(Long.MAX_VALUE)); //no memtables and no sstables
         }
 
@@ -99,7 +99,7 @@ public void testMaxPurgeableTimestamp()
 
         // create another sstable
         applyMutation(cfs.metadata(), key, timestamp2);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // check max purgeable timestamp when compacting the first sstable with and without a memtable
         try (CompactionController controller = new CompactionController(cfs, compacting, 0))
@@ -112,7 +112,7 @@ public void testMaxPurgeableTimestamp()
         }
 
         // check max purgeable timestamp again without any sstables but with different insertion orders on the memtable
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         //newest to oldest
         try (CompactionController controller = new CompactionController(cfs, null, 0))
@@ -124,7 +124,7 @@ public void testMaxPurgeableTimestamp()
             assertPurgeBoundary(controller.getPurgeEvaluator(key), timestamp3); //memtable only
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         //oldest to newest
         try (CompactionController controller = new CompactionController(cfs, null, 0))
@@ -152,14 +152,14 @@ public void testGetFullyExpiredSSTables()
 
         // create sstable with tombstone that should be expired in no older timestamps
         applyDeleteMutation(cfs.metadata(), key, timestamp2);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // first sstable with tombstone is compacting
         Set<SSTableReader> compacting = Sets.newHashSet(cfs.getLiveSSTables());
 
         // create another sstable with more recent timestamp
         applyMutation(cfs.metadata(), key, timestamp1);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // second sstable is overlapping
         Set<SSTableReader> overlapping = Sets.difference(Sets.newHashSet(cfs.getLiveSSTables()), compacting);
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
index 0aab021d0054..ff3f210ec2ad 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
@@ -466,7 +466,7 @@ public void duplicateRowsTest() throws Throwable
         createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, v int, PRIMARY KEY (pk, ck1, ck2))");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (pk, ck1, ck2, v) values (?, ?, ?, ?)", "key", i, i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         DatabaseDescriptor.setSnapshotOnDuplicateRowDetection(true);
         TableMetadata metadata = getCurrentColumnFamilyStore().metadata();
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
index d29ab5294a18..7856500bfea9 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
@@ -511,7 +511,7 @@ private static SSTableReader createSSTableWithKey(String keyspace, String table,
         .build()
         .applyUnsafe();
         Set<SSTableReader> before = cfs.getLiveSSTables();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Set<SSTableReader> after = cfs.getLiveSSTables();
         return Iterables.getOnlyElement(Sets.difference(after, before));
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
index af74603fd205..049415c85968 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
@@ -71,10 +71,10 @@ public void compactionInterruption() throws Exception
         cfs.getCompactionStrategyManager().disable();
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (1, 1);");
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (2, 2);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (3, 3);");
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (4, 4);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Set<SSTableReader> sstables = cfs.getLiveSSTables();
 
         Assert.assertEquals(2, sstables.size());
@@ -111,13 +111,13 @@ public void mixedSSTableFailure() throws Exception
     {
         cfs.getCompactionStrategyManager().disable();
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (1, 1);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (2, 2);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (3, 3);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (4, 4);");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         Assert.assertEquals(4, sstables.size());
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
index 95069f17a770..e49923b0b387 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
@@ -128,7 +128,7 @@ public void testCompactingCFCounting() throws Throwable
 
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1);
         assertEquals(0, CompactionManager.instance.compactingCF.count(cfs));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
         assertEquals(0, CompactionManager.instance.compactingCF.count(cfs));
@@ -145,7 +145,7 @@ private void createPossiblyExpiredSSTable(final ColumnFamilyStore cfs, final boo
         {
             execute("INSERT INTO %s (id, val) values (2, 'immortal')");
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     private void createLowGCGraceTable(){
@@ -192,7 +192,7 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
             {
                 execute("insert into %s (k, c, v) values (?, ?, ?)", i, j, i*j);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         cfs.getCompactionStrategyManager().mutateRepaired(cfs.getLiveSSTables(), System.currentTimeMillis(), null, false);
         for (int i = 0; i < 5; i++)
@@ -201,7 +201,7 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
             {
                 execute("insert into %s (k, c, v) values (?, ?, ?)", i, j, i*j);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         assertTrue(cfs.getTracker().getCompacting().isEmpty());
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
index 48caef6eefe8..4bbc526989fd 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
@@ -295,7 +295,7 @@ public void testCompactionInvalidRTs() throws Throwable
         RangeTombstone rt = new RangeTombstone(Slice.ALL, new DeletionTime(System.currentTimeMillis(), -1));
         RowUpdateBuilder rub = new RowUpdateBuilder(getCurrentColumnFamilyStore().metadata(), System.currentTimeMillis() * 1000, 22).clustering(33).addRangeTombstone(rt);
         rub.build().apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         compactAndValidate();
         readAndValidate(true);
         readAndValidate(false);
@@ -309,7 +309,7 @@ public void testCompactionInvalidTombstone() throws Throwable
         // write a standard tombstone with negative local deletion time (LDTs are not set by user and should not be negative):
         RowUpdateBuilder rub = new RowUpdateBuilder(getCurrentColumnFamilyStore().metadata(), -1, System.currentTimeMillis() * 1000, 22).clustering(33).delete("b");
         rub.build().apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         compactAndValidate();
         readAndValidate(true);
         readAndValidate(false);
@@ -323,7 +323,7 @@ public void testCompactionInvalidPartitionDeletion() throws Throwable
         // write a partition deletion with negative local deletion time (LDTs are not set by user and should not be negative)::
         PartitionUpdate pu = PartitionUpdate.simpleBuilder(getCurrentColumnFamilyStore().metadata(), 22).nowInSec(-1).delete().build();
         new Mutation(pu).apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         compactAndValidate();
         readAndValidate(true);
         readAndValidate(false);
@@ -336,7 +336,7 @@ public void testCompactionInvalidRowDeletion() throws Throwable
         prepare();
         // write a row deletion with negative local deletion time (LDTs are not set by user and should not be negative):
         RowUpdateBuilder.deleteRowAt(getCurrentColumnFamilyStore().metadata(), System.currentTimeMillis() * 1000, -1, 22, 33).apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         compactAndValidate();
         readAndValidate(true);
         readAndValidate(false);
@@ -358,7 +358,7 @@ public void testIndexedReaderRowDeletion() throws Throwable
         DatabaseDescriptor.setColumnIndexSize(1024);
         prepareWide();
         RowUpdateBuilder.deleteRowAt(getCurrentColumnFamilyStore().metadata(), System.currentTimeMillis() * 1000, -1, 22, 33).apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         readAndValidate(true);
         readAndValidate(false);
         DatabaseDescriptor.setColumnIndexSize(maxSizePre);
@@ -374,7 +374,7 @@ public void testIndexedReaderTombstone() throws Throwable
         prepareWide();
         RowUpdateBuilder rub = new RowUpdateBuilder(getCurrentColumnFamilyStore().metadata(), -1, System.currentTimeMillis() * 1000, 22).clustering(33).delete("b");
         rub.build().apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         readAndValidate(true);
         readAndValidate(false);
         DatabaseDescriptor.setColumnIndexSize(maxSizePre);
@@ -391,7 +391,7 @@ public void testIndexedReaderRT() throws Throwable
         RangeTombstone rt = new RangeTombstone(Slice.ALL, new DeletionTime(System.currentTimeMillis(), -1));
         RowUpdateBuilder rub = new RowUpdateBuilder(getCurrentColumnFamilyStore().metadata(), System.currentTimeMillis() * 1000, 22).clustering(33).addRangeTombstone(rt);
         rub.build().apply();
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         readAndValidate(true);
         readAndValidate(false);
         DatabaseDescriptor.setColumnIndexSize(maxSizePreKB);
@@ -413,7 +413,7 @@ public void testLCSThresholdParams() throws Throwable
             {
                 execute("insert into %s (id, id2, t) values (?, ?, ?)", i, j, value);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         assertEquals(50, cfs.getLiveSSTables().size());
         LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
@@ -430,7 +430,7 @@ public void testSTCSinL0() throws Throwable
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
         cfs.disableAutoCompaction();
         execute("insert into %s (id, id2, t) values (?, ?, ?)", 1,1,"L1");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.forceMajorCompaction();
         SSTableReader l1sstable = cfs.getLiveSSTables().iterator().next();
         assertEquals(1, l1sstable.getSSTableLevel());
@@ -444,7 +444,7 @@ public void testSTCSinL0() throws Throwable
             {
                 execute("insert into %s (id, id2, t) values (?, ?, ?)", i, j, value);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         assertEquals(51, cfs.getLiveSSTables().size());
 
@@ -471,14 +471,14 @@ public void testAbortNotifications() throws Throwable
             r.nextBytes(b);
             execute("insert into %s (id, x) values (?, ?)", i, ByteBuffer.wrap(b));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         getCurrentColumnFamilyStore().disableAutoCompaction();
         for (int i = 0; i < 1000; i++)
         {
             r.nextBytes(b);
             execute("insert into %s (id, x) values (?, ?)", i, ByteBuffer.wrap(b));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) getCurrentColumnFamilyStore().getCompactionStrategyManager().getUnrepairedUnsafe().first();
         LeveledCompactionTask lcsTask;
@@ -505,7 +505,7 @@ public void testAbortNotifications() throws Throwable
             r.nextBytes(b);
             execute("insert into %s (id, x) values (?, ?)", i, ByteBuffer.wrap(b));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         // now we have a bunch of sstables in L2 and one in L0 - bump the L0 one to L1:
         for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
         {
@@ -647,7 +647,7 @@ public void testPerCFSNeverPurgeTombstonesHelper(boolean deletedCell) throws Thr
         {
             execute("INSERT INTO %s (id, b) VALUES (?, ?)", i, String.valueOf(i));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         assertTombstones(getCurrentColumnFamilyStore().getLiveSSTables().iterator().next(), false);
         if (deletedCell)
@@ -655,7 +655,7 @@ public void testPerCFSNeverPurgeTombstonesHelper(boolean deletedCell) throws Thr
         else
             execute("DELETE FROM %s WHERE id = ?", 50);
         getCurrentColumnFamilyStore().setNeverPurgeTombstones(false);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Thread.sleep(2000); // wait for gcgs to pass
         getCurrentColumnFamilyStore().forceMajorCompaction();
         assertTombstones(getCurrentColumnFamilyStore().getLiveSSTables().iterator().next(), false);
@@ -664,7 +664,7 @@ public void testPerCFSNeverPurgeTombstonesHelper(boolean deletedCell) throws Thr
         else
             execute("DELETE FROM %s WHERE id = ?", 44);
         getCurrentColumnFamilyStore().setNeverPurgeTombstones(true);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         Thread.sleep(1100);
         getCurrentColumnFamilyStore().forceMajorCompaction();
         assertTombstones(getCurrentColumnFamilyStore().getLiveSSTables().iterator().next(), true);
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
index a0d52aa6cccd..887ebddfee4f 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
@@ -102,14 +102,14 @@ public void testMajorCompactionPurge()
                    .build().applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // deletes
         for (int i = 0; i < 10; i++)
         {
             RowUpdateBuilder.deleteRow(cfs.metadata(), 1, key, String.valueOf(i)).applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // resurrect one column
         RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata(), 2, key);
@@ -117,7 +117,7 @@ public void testMajorCompactionPurge()
                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // major compact and test that all columns but the resurrected one is completely gone
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
@@ -146,14 +146,14 @@ public void testMajorCompactionPurgeTombstonesWithMaxTimestamp()
                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // deletes
         for (int i = 0; i < 10; i++)
         {
             RowUpdateBuilder.deleteRow(cfs.metadata(), Long.MAX_VALUE, key, String.valueOf(i)).applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // major compact - tombstones should be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
@@ -164,7 +164,7 @@ public void testMajorCompactionPurgeTombstonesWithMaxTimestamp()
                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         cfs.invalidateCachedPartition(dk(key));
 
@@ -191,13 +191,13 @@ public void testMajorCompactionPurgeTopLevelTombstoneWithMaxTimestamp()
                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new Mutation.PartitionUpdateCollector(KEYSPACE1, dk(key))
             .add(PartitionUpdate.fullPartitionDelete(cfs.metadata(), dk(key), Long.MAX_VALUE, FBUtilities.nowInSeconds()))
             .build()
             .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // major compact - tombstones should be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
@@ -208,7 +208,7 @@ public void testMajorCompactionPurgeTopLevelTombstoneWithMaxTimestamp()
                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         cfs.invalidateCachedPartition(dk(key));
 
@@ -235,11 +235,11 @@ public void testMajorCompactionPurgeRangeTombstoneWithMaxTimestamp()
                    .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                    .build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), Long.MAX_VALUE, dk(key))
             .addRangeTombstone(String.valueOf(0), String.valueOf(9)).build().applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // major compact - tombstones should be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, Integer.MAX_VALUE, false));
@@ -250,7 +250,7 @@ public void testMajorCompactionPurgeRangeTombstoneWithMaxTimestamp()
                .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         cfs.invalidateCachedPartition(dk(key));
 
@@ -278,7 +278,7 @@ public void testMinorCompactionPurge()
                         .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                         .build().applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             // deletes
             for (int i = 0; i < 10; i++)
@@ -286,7 +286,7 @@ public void testMinorCompactionPurge()
                 RowUpdateBuilder.deleteRow(cfs.metadata(), 1, key, String.valueOf(i)).applyUnsafe();
             }
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         DecoratedKey key1 = Util.dk("key1");
@@ -294,7 +294,7 @@ public void testMinorCompactionPurge()
 
         // flush, remember the current sstable and then resurrect one column
         // for first key. Then submit minor compaction on remembered sstables.
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Collection<SSTableReader> sstablesIncomplete = cfs.getLiveSSTables();
 
         RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata(), 2, "key1");
@@ -302,7 +302,7 @@ public void testMinorCompactionPurge()
                 .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                 .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
         {
             Iterables.getOnlyElement(tasks).execute(ActiveCompactionsTracker.NOOP);
@@ -343,16 +343,16 @@ public void testMinTimestampPurge()
         .add("val", ByteBufferUtil.EMPTY_BYTE_BUFFER)
         .build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         // delete c1
         RowUpdateBuilder.deleteRow(cfs.metadata(), 10, key3, "c1").applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Collection<SSTableReader> sstablesIncomplete = cfs.getLiveSSTables();
 
         // delete c2 so we have new delete in a diffrent SSTable
         RowUpdateBuilder.deleteRow(cfs.metadata(), 9, key3, "c2").applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // compact the sstables with the c1/c2 data and the c1 tombstone
         try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
@@ -393,7 +393,7 @@ public void testCompactionPurgeOneFile() throws ExecutionException, InterruptedE
         {
             RowUpdateBuilder.deleteRow(cfs.metadata(), 1, key, String.valueOf(i)).applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(String.valueOf(cfs.getLiveSSTables()), 1, cfs.getLiveSSTables().size()); // inserts & deletes were in the same memtable -> only deletes in sstable
 
         // compact and test that the row is completely gone
@@ -438,7 +438,7 @@ public void testCompactionPurgeCachedRow() throws ExecutionException, Interrupte
         assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).isEmpty());
 
         // flush and major compact
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
 
         // Since we've force purging (by passing MAX_VALUE for gc_before), the row should have been invalidated and we should have no deletion info anymore
@@ -474,7 +474,7 @@ public void testCompactionPurgeTombstonedRow() throws ExecutionException, Interr
         assertFalse(partition.partitionLevelDeletion().isLive());
 
         // flush and major compact (with tombstone purging)
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
         assertFalse(Util.getOnlyPartitionUnfiltered(Util.cmd(cfs, key).build()).isEmpty());
 
@@ -504,14 +504,14 @@ public void testRowTombstoneObservedBeforePurging()
         // write a row out to one sstable
         QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
                                                      keyspace, table, 1, "foo", 1));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         UntypedResultSet result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a second sstable
         QueryProcessor.executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // basic check that the row is considered deleted
         assertEquals(2, cfs.getLiveSSTables().size());
@@ -529,14 +529,14 @@ public void testRowTombstoneObservedBeforePurging()
         // write a row out to one sstable
         QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v1, v2) VALUES (%d, '%s', %d)",
                                                      keyspace, table, 1, "foo", 1));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(2, cfs.getLiveSSTables().size());
         result = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s WHERE k = %d", keyspace, table, 1));
         assertEquals(1, result.size());
 
         // write a row tombstone out to a different sstable
         QueryProcessor.executeInternal(String.format("DELETE FROM %s.%s WHERE k = %d", keyspace, table, 1));
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // compact the two sstables with a gcBefore that *does* allow the row tombstone to be purged
         FBUtilities.waitOnFutures(CompactionManager.instance.submitMaximal(cfs, (int) (System.currentTimeMillis() / 1000) + 10000, false));
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index f5a0f10668df..5dc3388038ee 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
@@ -141,7 +141,7 @@ public void testSingleSSTableCompaction() throws Exception
 
         long timestamp = populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); //ttl=3s
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, store.getLiveSSTables().size());
         long originalSize = store.getLiveSSTables().iterator().next().uncompressedLength();
 
@@ -183,11 +183,11 @@ public void testUncheckedTombstoneSizeTieredCompaction() throws Exception
 
         //Populate sstable1 with with keys [0..9]
         populate(KEYSPACE1, CF_STANDARD1, 0, 9, 3); //ttl=3s
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         //Populate sstable2 with with keys [10..19] (keys do not overlap with SSTable1)
         long timestamp2 = populate(KEYSPACE1, CF_STANDARD1, 10, 19, 3); //ttl=3s
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(2, store.getLiveSSTables().size());
 
@@ -267,7 +267,7 @@ public void testUserDefinedCompaction() throws Exception
             .add("val", "val1")
             .build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Collection<SSTableReader> sstables = cfs.getLiveSSTables();
 
         assertEquals(1, sstables.size());
@@ -302,7 +302,7 @@ public static void writeSSTableWithRangeTombstoneMaskingOneColumn(ColumnFamilySt
             notYetDeletedRowUpdateBuilder.clustering("02").add("val", "a"); //Range tombstone doesn't cover this (timestamp 3 > 2)
             notYetDeletedRowUpdateBuilder.build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     @Test
@@ -387,7 +387,7 @@ private void testDontPurgeAccidentally(String k, String cfname) throws Interrupt
         rowUpdateBuilder.clustering("c").add("val", "a");
         rowUpdateBuilder.build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Collection<SSTableReader> sstablesBefore = cfs.getLiveSSTables();
 
@@ -405,7 +405,7 @@ private void testDontPurgeAccidentally(String k, String cfname) throws Interrupt
         // Sleep one second so that the removal is indeed purgeable even with gcgrace == 0
         Thread.sleep(1000);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Collection<SSTableReader> sstablesAfter = cfs.getLiveSSTables();
         Collection<SSTableReader> toCompact = new ArrayList<SSTableReader>();
@@ -487,7 +487,7 @@ public void testNeedsCleanup()
             insertRowWithKey(i + 100);
             insertRowWithKey(i + 200);
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, store.getLiveSSTables().size());
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java
index 95542a128dae..01f81dca36af 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CorruptedSSTablesCompactionsTest.java
@@ -162,7 +162,7 @@ public void testCorruptedSSTables(String tableName) throws Exception
                 maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
                 inserted.add(key);
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
             assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
         }
diff --git a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
index f75842d05a85..2b2f6a0b118f 100644
--- a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
@@ -231,9 +231,9 @@ public void testPrepBucket()
                 .clustering("column")
                 .add("val", value).build().applyUnsafe();
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
 
@@ -267,9 +267,9 @@ public void testFilterOldSSTables()
                 .clustering("column")
                 .add("val", value).build().applyUnsafe();
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Iterable<SSTableReader> filtered;
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
@@ -304,7 +304,7 @@ public void testDropExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader expiredSSTable = cfs.getLiveSSTables().iterator().next();
         Thread.sleep(10);
 
@@ -313,7 +313,7 @@ public void testDropExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(cfs.getLiveSSTables().size(), 2);
 
         Map<String, String> options = new HashMap<>();
@@ -357,7 +357,7 @@ public void testSTCSBigWindow()
                     .clustering("column")
                     .add("val", bigValue).build().applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         // and small ones:
         for (int r = 0; r < numSSTables / 2; r++)
@@ -366,7 +366,7 @@ public void testSTCSBigWindow()
             new RowUpdateBuilder(cfs.metadata(), timestamp, key.getKey())
                 .clustering("column")
                 .add("val", value).build().applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         Map<String, String> options = new HashMap<>();
         options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, "1");
diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
index 2d6835bbe579..17bb5ee9040e 100644
--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
@@ -134,7 +134,7 @@ public void testGrouperLevels() throws Exception{
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         waitForLeveling(cfs);
@@ -189,7 +189,7 @@ public void testValidationMultipleSSTablePerLevel() throws Exception
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         waitForLeveling(cfs);
@@ -263,7 +263,7 @@ public void testCompactionProgress() throws Exception
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         waitForLeveling(cfs);
@@ -300,9 +300,9 @@ public void testMutateLevel() throws Exception
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getStrategies().get(1).get(0);
         cfs.forceMajorCompaction();
 
@@ -341,7 +341,7 @@ public void testNewRepairedSSTable() throws Exception
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         waitForLeveling(cfs);
         cfs.disableAutoCompaction();
@@ -416,7 +416,7 @@ public void testTokenRangeCompaction() throws Exception
                     update.newRow("column" + c).add("val", value);
                 update.applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         // create 20 more sstables with 10 containing data for key1 and other 10 containing data for key2
@@ -426,7 +426,7 @@ public void testTokenRangeCompaction() throws Exception
                 for (int c = 0; c < columns; c++)
                     update.newRow("column" + c).add("val", value);
                 update.applyUnsafe();
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             }
         }
 
@@ -471,7 +471,7 @@ public void testTokenRangeCompaction() throws Exception
                     update.newRow("column" + c).add("val", value);
                 update.applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         // create 20 more sstables with 10 containing data for key1 and other 10 containing data for key2
@@ -483,7 +483,7 @@ public void testTokenRangeCompaction() throws Exception
                 for (int c = 0; c < columns; c++)
                     update.newRow("column" + c).add("val", value);
                 update.applyUnsafe();
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             }
         }
 
@@ -529,7 +529,7 @@ public void testCompactionCandidateOrdering() throws Exception
             for (int c = 0; c < columns; c++)
                 update.newRow("column" + c).add("val", value);
             update.applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) (cfs.getCompactionStrategyManager()).getStrategies().get(1).get(0);
         // get readers for level 0 sstables
diff --git a/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java
index 241003ad55a0..3559d2b03923 100644
--- a/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/NeverPurgeTest.java
@@ -72,13 +72,13 @@ public void minorNeverPurgeTombstonesTest() throws Throwable
             {
                 execute("INSERT INTO %s (a, b, c) VALUES (" + j + ", 2, '3')");
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         execute("UPDATE %s SET c = null WHERE a=1 AND b=2");
         execute("DELETE FROM %s WHERE a=2 AND b=2");
         execute("DELETE FROM %s WHERE a=3");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.enableAutoCompaction();
         while (cfs.getLiveSSTables().size() > 1 || !cfs.getTracker().getCompacting().isEmpty())
             Thread.sleep(100);
@@ -92,7 +92,7 @@ private void testHelper(String deletionStatement) throws Throwable
         execute("INSERT INTO %s (a, b, c) VALUES (1, 2, '3')");
         execute(deletionStatement);
         Thread.sleep(1000);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.forceMajorCompaction();
         verifyContainsTombstones(cfs.getLiveSSTables(), 1);
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
index 0c469dc534b3..c28943c12950 100644
--- a/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/OneCompactionTest.java
@@ -71,7 +71,7 @@ private void testCompaction(String columnFamilyName, int insertsPerTable)
                 .applyUnsafe();
 
             inserted.add(key);
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             assertEquals(inserted.size(), Util.getAll(Util.cmd(store).build()).size());
         }
         CompactionManager.instance.performMaximal(store, false);
diff --git a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
index 61cf302d6c51..ca64e4a2a34d 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
@@ -42,7 +42,7 @@ public void basicTest() throws Throwable
         createTable("create table %s (id int primary key, t text) with compaction = {'class':'LeveledCompactionStrategy','single_sstable_uplevel':true}");
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
         execute("insert into %s (id, t) values (1, 'meep')");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.COMPACTION))
@@ -95,7 +95,7 @@ private void compactionTestHelper(boolean singleSSTUplevel) throws Throwable
                 execute("insert into %s (id, id2, t) values (?, ?, ?)", i, j, value);
             }
             if (i % 100 == 0)
-                cfs.forceBlockingFlush();
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         // now we have a bunch of data in L0, first compaction will be a normal one, containing all sstables:
         LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
@@ -123,7 +123,7 @@ public void corruptMetadataTest() throws Throwable
         createTable("create table %s (id int primary key, t text) with compaction = {'class':'LeveledCompactionStrategy','single_sstable_uplevel':true}");
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
         execute("insert into %s (id, t) values (1, 'meep')");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         String filenameToCorrupt = sstable.descriptor.filenameFor(Component.STATS);
diff --git a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
index 00c4a86e0dd0..bf761d8eb637 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
@@ -165,9 +165,9 @@ public void testPrepBucket() throws Exception
             new RowUpdateBuilder(cfs.metadata(), 0, key)
                 .clustering("column").add("val", value)
                 .build().applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
         Pair<List<SSTableReader>, Double> bucket;
diff --git a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
index a2352fcf02aa..5fa7ba4d5f01 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TTLExpiryTest.java
@@ -94,7 +94,7 @@ public void testAggressiveFullyExpired()
                     .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                     .build()
                     .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         new RowUpdateBuilder(cfs.metadata(), 2L, 1, key)
                     .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                     .build()
@@ -105,7 +105,7 @@ public void testAggressiveFullyExpired()
                     .build()
                     .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), 4L, 1, key)
                     .add("col1", ByteBufferUtil.EMPTY_BYTE_BUFFER)
@@ -117,7 +117,7 @@ public void testAggressiveFullyExpired()
                     .build()
                     .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
 
         new RowUpdateBuilder(cfs.metadata(), 6L, 3, key)
@@ -130,7 +130,7 @@ public void testAggressiveFullyExpired()
                     .build()
                     .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<SSTableReader> sstables = Sets.newHashSet(cfs.getLiveSSTables());
         int now = (int)(System.currentTimeMillis() / 1000);
@@ -173,7 +173,7 @@ public void testSimpleExpire(boolean force10944Bug) throws InterruptedException
                         .build()
                         .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         new RowUpdateBuilder(cfs.metadata(), timestamp, 1, key)
             .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
@@ -183,7 +183,7 @@ public void testSimpleExpire(boolean force10944Bug) throws InterruptedException
             .applyUnsafe();
 
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         // To reproduce #10944, we need to avoid the optimization that get rid of full sstable because everything
         // is known to be gcAble, so keep some data non-expiring in that case.
         new RowUpdateBuilder(cfs.metadata(), timestamp, force10944Bug ? 0 : 1, key)
@@ -192,14 +192,14 @@ public void testSimpleExpire(boolean force10944Bug) throws InterruptedException
                     .applyUnsafe();
 
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         new RowUpdateBuilder(cfs.metadata(), timestamp, 1, key)
                             .add("col311", ByteBufferUtil.EMPTY_BYTE_BUFFER)
                             .build()
                             .applyUnsafe();
 
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Thread.sleep(2000); // wait for ttl to expire
         assertEquals(4, cfs.getLiveSSTables().size());
         cfs.enableAutoCompaction(true);
@@ -221,24 +221,24 @@ public void testNoExpire() throws InterruptedException, IOException
             .build()
             .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         new RowUpdateBuilder(cfs.metadata(), timestamp, 1, key)
             .add("col2", ByteBufferUtil.EMPTY_BYTE_BUFFER)
             .build()
             .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         new RowUpdateBuilder(cfs.metadata(), timestamp, 1, key)
             .add("col3", ByteBufferUtil.EMPTY_BYTE_BUFFER)
             .build()
             .applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         String noTTLKey = "nottl";
         new RowUpdateBuilder(cfs.metadata(), timestamp, noTTLKey)
             .add("col311", ByteBufferUtil.EMPTY_BYTE_BUFFER)
             .build()
             .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Thread.sleep(2000); // wait for ttl to expire
         assertEquals(4, cfs.getLiveSSTables().size());
         cfs.enableAutoCompaction(true);
@@ -270,7 +270,7 @@ public void testCheckForExpiredSSTableBlockers() throws InterruptedException
                 .build()
                 .applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader blockingSSTable = cfs.getSSTables(SSTableSet.LIVE).iterator().next();
         for (int i = 0; i < 10; i++)
         {
@@ -279,7 +279,7 @@ public void testCheckForExpiredSSTableBlockers() throws InterruptedException
                             .delete("col1")
                             .build()
                             .applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         Multimap<SSTableReader, SSTableReader> blockers = SSTableExpiredBlockers.checkForExpiredSSTableBlockers(cfs.getSSTables(SSTableSet.LIVE), (int) (System.currentTimeMillis() / 1000) + 100);
         assertEquals(1, blockers.keySet().size());
diff --git a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
index 8fd25e700316..2ca490a6f13f 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
@@ -171,7 +171,7 @@ public void testPrepBucket()
                 .clustering("column")
                 .add("val", value).build().applyUnsafe();
 
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         // Decrement the timestamp to simulate a timestamp in the past hour
         for (int r = 3; r < 5; r++)
@@ -181,10 +181,10 @@ public void testPrepBucket()
             new RowUpdateBuilder(cfs.metadata(), r, key.getKey())
                 .clustering("column")
                 .add("val", value).build().applyUnsafe();
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         HashMultimap<Long, SSTableReader> buckets = HashMultimap.create();
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
@@ -227,7 +227,7 @@ public void testPrepBucket()
                     .clustering("column")
                     .add("val", value).build().applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         // Reset the buckets, overfill it now
@@ -262,7 +262,7 @@ public void testDropExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader expiredSSTable = cfs.getLiveSSTables().iterator().next();
         Thread.sleep(10);
 
@@ -272,7 +272,7 @@ public void testDropExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(cfs.getLiveSSTables().size(), 2);
 
         Map<String, String> options = new HashMap<>();
@@ -314,7 +314,7 @@ public void testDropOverlappingExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader expiredSSTable = cfs.getLiveSSTables().iterator().next();
         Thread.sleep(10);
 
@@ -327,7 +327,7 @@ public void testDropOverlappingExpiredSSTables() throws InterruptedException
             .clustering("column")
             .add("val", value).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(cfs.getLiveSSTables().size(), 2);
 
         Map<String, String> options = new HashMap<>();
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java
index 1e0d15786fc4..aebd95f46065 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LifecycleTransactionTest.java
@@ -30,11 +30,11 @@
 import org.junit.Assert;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction.ReaderState.Action;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction.ReaderState;
+import org.apache.cassandra.db.memtable.SkipListMemtable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.MockSchema;
 import org.apache.cassandra.utils.Pair;
@@ -77,7 +77,7 @@ public void restoreIncrementalBackup()
     public void testUpdates() // (including obsoletion)
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
-        Tracker tracker = new Tracker(null, false);
+        Tracker tracker = Tracker.newDummyTracker();
         SSTableReader[] readers = readersArray(0, 3, cfs);
         SSTableReader[] readers2 = readersArray(0, 4, cfs);
         SSTableReader[] readers3 = readersArray(0, 4, cfs);
@@ -141,7 +141,7 @@ public void testUpdates() // (including obsoletion)
     public void testCancellation()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
-        Tracker tracker = new Tracker(null, false);
+        Tracker tracker = Tracker.newDummyTracker();
         List<SSTableReader> readers = readers(0, 3, cfs);
         tracker.addInitialSSTables(readers);
         LifecycleTransaction txn = tracker.tryModify(readers, OperationType.UNKNOWN);
@@ -185,7 +185,7 @@ public void testCancellation()
     public void testSplit()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
-        Tracker tracker = new Tracker(null, false);
+        Tracker tracker = Tracker.newDummyTracker();
         List<SSTableReader> readers = readers(0, 4, cfs);
         tracker.addInitialSSTables(readers);
         LifecycleTransaction txn = tracker.tryModify(readers, OperationType.UNKNOWN);
@@ -271,7 +271,7 @@ private static final class TxnTest extends TestableTransaction
 
         private static Tracker tracker(ColumnFamilyStore cfs, List<SSTableReader> readers)
         {
-            Tracker tracker = new Tracker(new Memtable(new AtomicReference<>(CommitLogPosition.NONE), cfs), false);
+            Tracker tracker = new Tracker(cfs, cfs.createMemtable(new AtomicReference<>(CommitLogPosition.NONE)), false);
             tracker.addInitialSSTables(readers);
             return tracker;
         }
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
index 4390b2078195..e39f71f02512 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/TrackerTest.java
@@ -24,23 +24,22 @@
 import java.util.Optional;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
-
 import javax.annotation.Nullable;
 
 import com.google.common.base.Function;
 import com.google.common.base.Predicate;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
+import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.junit.Assert;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.notifications.*;
 import org.apache.cassandra.schema.CachingParams;
@@ -85,7 +84,7 @@ public static void setUp()
     public void testTryModify()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
-        Tracker tracker = new Tracker(null, false);
+        Tracker tracker = Tracker.newDummyTracker();
         List<SSTableReader> readers = ImmutableList.of(MockSchema.sstable(0, true, cfs), MockSchema.sstable(1, cfs), MockSchema.sstable(2, cfs));
         tracker.addInitialSSTables(copyOf(readers));
         Assert.assertNull(tracker.tryModify(ImmutableList.of(MockSchema.sstable(0, cfs)), OperationType.COMPACTION));
@@ -108,7 +107,7 @@ public void testTryModify()
     public void testApply()
     {
         final ColumnFamilyStore cfs = MockSchema.newCFS();
-        final Tracker tracker = new Tracker(null, false);
+        final Tracker tracker = Tracker.newDummyTracker();
         final View resultView = ViewTest.fakeView(0, 0, cfs);
         final AtomicInteger count = new AtomicInteger();
         tracker.apply(new Predicate<View>()
@@ -277,15 +276,15 @@ public void testMemtableReplacement()
         Tracker tracker = cfs.getTracker();
         tracker.subscribe(listener);
 
-        Memtable prev1 = tracker.switchMemtable(true, new Memtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition()), cfs));
+        Memtable prev1 = tracker.switchMemtable(true, cfs.createMemtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition())));
         OpOrder.Group write1 = cfs.keyspace.writeOrder.getCurrent();
         OpOrder.Barrier barrier1 = cfs.keyspace.writeOrder.newBarrier();
-        prev1.setDiscarding(barrier1, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
+        prev1.switchOut(barrier1, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
         barrier1.issue();
-        Memtable prev2 = tracker.switchMemtable(false, new Memtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition()), cfs));
+        Memtable prev2 = tracker.switchMemtable(false, cfs.createMemtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition())));
         OpOrder.Group write2 = cfs.keyspace.writeOrder.getCurrent();
         OpOrder.Barrier barrier2 = cfs.keyspace.writeOrder.newBarrier();
-        prev2.setDiscarding(barrier2, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
+        prev2.switchOut(barrier2, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
         barrier2.issue();
         Memtable cur = tracker.getView().getCurrentMemtable();
         OpOrder.Group writecur = cfs.keyspace.writeOrder.getCurrent();
@@ -325,7 +324,7 @@ public void testMemtableReplacement()
         tracker = cfs.getTracker();
         listener = new MockListener(false);
         tracker.subscribe(listener);
-        prev1 = tracker.switchMemtable(false, new Memtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition()), cfs));
+        prev1 = tracker.switchMemtable(false, cfs.createMemtable(new AtomicReference<>(CommitLog.instance.getCurrentPosition())));
         tracker.markFlushing(prev1);
         reader = MockSchema.sstable(0, 10, true, cfs);
         cfs.invalidate(false);
@@ -348,7 +347,7 @@ public void testNotifications()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
         SSTableReader r1 = MockSchema.sstable(0, cfs), r2 = MockSchema.sstable(1, cfs);
-        Tracker tracker = new Tracker(null, false);
+        Tracker tracker = Tracker.newDummyTracker();
         MockListener listener = new MockListener(false);
         tracker.subscribe(listener);
         tracker.notifyAdded(singleton(r1), false);
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
index fd320870237c..eb162d59b9f7 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
@@ -34,9 +34,9 @@
 import org.junit.Assert;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.MockSchema;
diff --git a/test/unit/org/apache/cassandra/db/MemtableTest.java b/test/unit/org/apache/cassandra/db/memtable/FlushingTest.java
similarity index 76%
rename from test/unit/org/apache/cassandra/db/MemtableTest.java
rename to test/unit/org/apache/cassandra/db/memtable/FlushingTest.java
index 63b27ed19932..4275f732b24c 100644
--- a/test/unit/org/apache/cassandra/db/MemtableTest.java
+++ b/test/unit/org/apache/cassandra/db/memtable/FlushingTest.java
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.db;
+package org.apache.cassandra.db.memtable;
 
 import java.util.ArrayList;
 import java.util.List;
@@ -32,6 +32,9 @@
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -48,7 +51,7 @@
 
 @RunWith(BMUnitRunner.class)
 @BMUnitConfig(debug = true)
-public class MemtableTest extends CQLTester
+public class FlushingTest extends CQLTester
 {
     List<PartitionPosition> ranges;
     List<Directories.DataDirectory> locations;
@@ -69,7 +72,8 @@ public void setup() throws Throwable
         memtable = cfs.getTracker().getView().getCurrentMemtable();
 
         OpOrder.Barrier barrier = cfs.keyspace.writeOrder.newBarrier();
-        memtable.setDiscarding(barrier, new AtomicReference<>(CommitLog.instance.getCurrentPosition()));
+        Memtable.LastCommitLogPosition position = new Memtable.LastCommitLogPosition(CommitLog.instance.getCurrentPosition());
+        memtable.switchOut(barrier, new AtomicReference<>(position));
         barrier.issue();
 
         ranges = new ArrayList<>();
@@ -94,17 +98,17 @@ public void testAbortingFlushRunnablesWithoutStarting() throws Throwable
         // abort without starting
         try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
         {
-            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+            List<Flushing.FlushRunnable> flushRunnables = Flushing.flushRunnables(cfs, memtable, ranges, locations, txn);
             assertNotNull(flushRunnables);
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
-                assertEquals(Memtable.FlushRunnableWriterState.IDLE, flushRunnable.state());
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Flushing.FlushRunnableWriterState.IDLE, flushRunnable.state());
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
                 assertNull(flushRunnable.abort(null));
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
-                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Flushing.FlushRunnableWriterState.ABORTED, flushRunnable.state());
         }
     }
 
@@ -122,16 +126,16 @@ public static void stopAndWait() throws InterruptedException
 
     @Test
     @BMRule(name = "Wait before loop",
-    targetClass = "Memtable$FlushRunnable",
+    targetClass = "Flushing$FlushRunnable",
     targetMethod = "writeSortedContents",
-    targetLocation = "AT INVOKE Logger.isTraceEnabled()",
-    action = "org.apache.cassandra.db.MemtableTest.stopAndWait()")
+    targetLocation = "AT ENTRY",
+    action = "org.apache.cassandra.db.memtable.FlushingTest.stopAndWait()")
     public void testAbortingFlushRunnablesAfterStarting() throws Throwable
     {
         // abort after starting
         try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
         {
-            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+            List<Flushing.FlushRunnable> flushRunnables = Flushing.flushRunnables(cfs, memtable, ranges, locations, txn);
 
             stopSignal = new Semaphore(0);
             continueSignal = new Semaphore(0);
@@ -139,14 +143,14 @@ public void testAbortingFlushRunnablesAfterStarting() throws Throwable
             List<Future<SSTableMultiWriter>> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList());
 
             stopSignal.acquire(nThreads);
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
                 assertNull(flushRunnable.abort(null));
             continueSignal.release(flushRunnables.size());  // release all, including the ones that have not started yet
 
             FBUtilities.waitOnFutures(futures);
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
-                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Flushing.FlushRunnableWriterState.ABORTED, flushRunnable.state());
         }
     }
 
@@ -156,17 +160,17 @@ public void testAbortingFlushRunnablesBeforeStarting() throws Throwable
         // abort before starting
         try (LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.FLUSH))
         {
-            List<Memtable.FlushRunnable> flushRunnables = memtable.createFlushRunnables(ranges, locations, txn);
+            List<Flushing.FlushRunnable> flushRunnables = Flushing.flushRunnables(cfs, memtable, ranges, locations, txn);
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
                 assertNull(flushRunnable.abort(null));
 
             List<Future<SSTableMultiWriter>> futures = flushRunnables.stream().map(executor::submit).collect(Collectors.toList());
 
             FBUtilities.waitOnFutures(futures);
 
-            for (Memtable.FlushRunnable flushRunnable : flushRunnables)
-                assertEquals(Memtable.FlushRunnableWriterState.ABORTED, flushRunnable.state());
+            for (Flushing.FlushRunnable flushRunnable : flushRunnables)
+                assertEquals(Flushing.FlushRunnableWriterState.ABORTED, flushRunnable.state());
         }
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/repair/AbstractPendingAntiCompactionTest.java b/test/unit/org/apache/cassandra/db/repair/AbstractPendingAntiCompactionTest.java
index 62b7db148465..cf09979571f4 100644
--- a/test/unit/org/apache/cassandra/db/repair/AbstractPendingAntiCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/repair/AbstractPendingAntiCompactionTest.java
@@ -109,7 +109,7 @@ void makeSSTables(int num, ColumnFamilyStore cfs, int rowsPerSSTable)
             int val = i * rowsPerSSTable;  // multiplied to prevent ranges from overlapping
             for (int j = 0; j < rowsPerSSTable; j++)
                 QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?)", ks, cfs.getTableName()), val + j, val + j);
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         Assert.assertEquals(num, cfs.getLiveSSTables().size());
     }
diff --git a/test/unit/org/apache/cassandra/db/repair/CompactionManagerGetSSTablesForValidationTest.java b/test/unit/org/apache/cassandra/db/repair/CompactionManagerGetSSTablesForValidationTest.java
index 3b29cc5b50d7..b47f2e49daf9 100644
--- a/test/unit/org/apache/cassandra/db/repair/CompactionManagerGetSSTablesForValidationTest.java
+++ b/test/unit/org/apache/cassandra/db/repair/CompactionManagerGetSSTablesForValidationTest.java
@@ -93,7 +93,7 @@ private void makeSSTables()
         for (int i=0; i<3; i++)
         {
             QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES(?, ?)", ks, tbl), i, i);
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         Assert.assertEquals(3, cfs.getLiveSSTables().size());
 
diff --git a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
index 1c5c245b57ee..dc46c27cb78e 100644
--- a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
@@ -122,12 +122,12 @@ public void successCase() throws Exception
         {
             QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?)", ks, tbl), i, i);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         for (int i = 8; i < 12; i++)
         {
             QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?)", ks, tbl), i, i);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(2, cfs.getLiveSSTables().size());
 
         Token left = ByteOrderedPartitioner.instance.getToken(ByteBufferUtil.bytes((int) 6));
diff --git a/test/unit/org/apache/cassandra/db/rows/ThrottledUnfilteredIteratorTest.java b/test/unit/org/apache/cassandra/db/rows/ThrottledUnfilteredIteratorTest.java
index d5e6348d3bf2..cabf72dca5d2 100644
--- a/test/unit/org/apache/cassandra/db/rows/ThrottledUnfilteredIteratorTest.java
+++ b/test/unit/org/apache/cassandra/db/rows/ThrottledUnfilteredIteratorTest.java
@@ -112,7 +112,7 @@ public void emptyPartitionDeletionTest() throws Throwable
 
         // flush and generate 1 sstable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.disableAutoCompaction();
         cfs.forceMajorCompaction();
 
@@ -147,7 +147,7 @@ public void emptyStaticTest() throws Throwable
 
         // flush and generate 1 sstable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.disableAutoCompaction();
         cfs.forceMajorCompaction();
 
@@ -205,7 +205,7 @@ else if (ck1 == ck2 - 1) // cell tombstone
 
         // flush and generate 1 sstable
         ColumnFamilyStore cfs = Keyspace.open(keyspace()).getColumnFamilyStore(currentTable());
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.disableAutoCompaction();
         cfs.forceMajorCompaction();
 
@@ -623,7 +623,7 @@ public void testThrottledIteratorWithRangeDeletions() throws Exception
 
         new RowUpdateBuilder(cfs.metadata(), 1, key).addRangeTombstone(10, 22).build().applyUnsafe();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         builder = UpdateBuilder.create(cfs.metadata(), key).withTimestamp(2);
         for (int i = 1; i < 40; i += 2)
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java
index 58d26c11adb9..11c9c00b04bf 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamWriterTest.java
@@ -102,7 +102,7 @@ public static void defineSchemaAndPrepareSSTable()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
index 5cf8c0bd0f6e..a4d4a7ddefee 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraOutgoingFileTest.java
@@ -79,7 +79,7 @@ public static void defineSchemaAndPrepareSSTable()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamHeaderTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamHeaderTest.java
index 999a44eeb83b..80803010af19 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamHeaderTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamHeaderTest.java
@@ -80,7 +80,7 @@ public static void defineSchemaAndPrepareSSTable()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
index 0b37d667e9ed..faa7b99f4fec 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
@@ -114,7 +114,7 @@ private SSTableReader createSSTable(Runnable queryable)
     {
         Set<SSTableReader> before = cfs.getLiveSSTables();
         queryable.run();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Set<SSTableReader> after = cfs.getLiveSSTables();
 
         Set<SSTableReader> diff = Sets.difference(after, before);
diff --git a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java
index 3cc89431e805..5a2f476bfaac 100644
--- a/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/EntireSSTableStreamConcurrentComponentMutationTest.java
@@ -128,7 +128,7 @@ public static void defineSchemaAndPrepareSSTable()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         Token start = ByteOrderedPartitioner.instance.getTokenFactory().fromString(Long.toHexString(0));
diff --git a/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java b/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java
index 2e2ee8ff1e56..e44cbcde8e2e 100644
--- a/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java
+++ b/test/unit/org/apache/cassandra/db/transform/DuplicateRowCheckerTest.java
@@ -65,7 +65,7 @@ public void setup() throws Throwable
         createTable("CREATE TABLE %s (pk text, ck1 int, ck2 int, v int, PRIMARY KEY (pk, ck1, ck2))");
         for (int i = 0; i < 10; i++)
             execute("insert into %s (pk, ck1, ck2, v) values (?, ?, ?, ?)", "key", i, i, i);
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
 
         metadata = getCurrentColumnFamilyStore().metadata();
         cfs = getCurrentColumnFamilyStore();
diff --git a/test/unit/org/apache/cassandra/db/view/ViewBuilderTaskTest.java b/test/unit/org/apache/cassandra/db/view/ViewBuilderTaskTest.java
index 2341c730a423..ebf0244eeee0 100644
--- a/test/unit/org/apache/cassandra/db/view/ViewBuilderTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/view/ViewBuilderTaskTest.java
@@ -84,8 +84,8 @@ private void test(int indexOfStartToken,
                               int expectedRowsInView) throws Throwable
             {
                 // Truncate the materialized view (not the base table)
-                cfs.viewManager.forceBlockingFlush();
-                cfs.viewManager.truncateBlocking(cfs.forceBlockingFlush(), System.currentTimeMillis());
+                cfs.viewManager.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+                cfs.viewManager.truncateBlocking(cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS), System.currentTimeMillis());
                 assertRowCount(execute("SELECT * FROM " + viewName), 0);
 
                 // Get the tokens from the referenced inserted rows
diff --git a/test/unit/org/apache/cassandra/index/CustomIndexTest.java b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
index 27a946ce7688..5a222545b9df 100644
--- a/test/unit/org/apache/cassandra/index/CustomIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
@@ -645,7 +646,7 @@ public void testFailing2iFlush() throws Throwable
 
         try
         {
-            getCurrentColumnFamilyStore().forceBlockingFlush();
+            flush();
             fail("Exception should have been propagated");
         }
         catch (Throwable t)
@@ -667,7 +668,7 @@ public void indexBuildingPagesLargePartitions() throws Throwable
         // Insert a single wide partition to be indexed
         for (int i = 0; i < totalRows; i++)
             execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Create the index, which won't automatically start building
         String indexName = "build_single_partition_idx";
@@ -724,7 +725,7 @@ public void partitionIndexTest() throws Throwable
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 5, 3, 3);
         execute("DELETE FROM %s WHERE k = ?", 5);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         String indexName = "partition_index_test_idx";
         createIndex(String.format("CREATE CUSTOM INDEX %s ON %%s(v) USING '%s'",
@@ -786,7 +787,7 @@ public void partitionIsNotOverIndexed() throws Throwable
         // Insert a single row partition to be indexed
         for (int i = 0; i < totalRows; i++)
             execute("INSERT INTO %s (k, c, v) VALUES (0, ?, ?)", i, i);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Create the index, which won't automatically start building
         String indexName = "partition_overindex_test_idx";
@@ -812,7 +813,7 @@ public void rangeTombstoneTest() throws Throwable
 
         // Insert a single range tombstone
         execute("DELETE FROM %s WHERE k=1 and c > 2");
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // Create the index, which won't automatically start building
         String indexName = "range_tombstone_idx";
@@ -1168,7 +1169,7 @@ public void testFlushObserver() throws Throwable
         assertEquals(0, index.flushedUnfiltereds.get());
         assertEquals(0, index.completeFlushCalls.get());
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, index.beginFlushCalls.get());
         assertEquals(2, index.flushedPartitions.get());
@@ -1180,7 +1181,7 @@ public void testFlushObserver() throws Throwable
         execute("DELETE FROM %s WHERE k=?", 0);
         execute("DELETE FROM %s WHERE k=? AND c>=?", 1, 1);
         index.reset();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, index.beginFlushCalls.get());
         assertEquals(2, index.flushedPartitions.get());
@@ -1338,7 +1339,7 @@ public void testGroupedWrites() throws Throwable
         assertEquals(10, index2.finishCalls);
 
         // flush the previous data to get rid of it, reset the group counters and flush a new memtable
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         group.reset();
         execute("INSERT INTO %s (k, s) VALUES (?, ?)", 1, 0);
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 1, 0, 0);
@@ -1348,7 +1349,7 @@ public void testGroupedWrites() throws Throwable
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 2, 2, 0);
         execute("DELETE FROM %s WHERE k=? AND c=?", 2, 3);
         execute("DELETE FROM %s WHERE k=?", 3);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // verify that the flush observer calls get only once to the group
         assertEquals(1, group.beginFlushCalls.get());
diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java
index d63fe46d33da..a5a59ab5c800 100644
--- a/test/unit/org/apache/cassandra/index/StubIndex.java
+++ b/test/unit/org/apache/cassandra/index/StubIndex.java
@@ -23,6 +23,7 @@
 import java.util.function.BiFunction;
 
 import org.apache.cassandra.Util;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.*;
diff --git a/test/unit/org/apache/cassandra/index/StubIndexGroup.java b/test/unit/org/apache/cassandra/index/StubIndexGroup.java
index 8d6177853ce3..7a1f4c9fa088 100644
--- a/test/unit/org/apache/cassandra/index/StubIndexGroup.java
+++ b/test/unit/org/apache/cassandra/index/StubIndexGroup.java
@@ -25,11 +25,11 @@
 import javax.annotation.Nullable;
 
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.Memtable;
 import org.apache.cassandra.db.RegularAndStaticColumns;
 import org.apache.cassandra.db.WriteContext;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.index.transactions.IndexTransaction;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
diff --git a/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
index 115a92ad43ca..44b26e899cb4 100644
--- a/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
+++ b/test/unit/org/apache/cassandra/index/internal/CustomCassandraIndex.java
@@ -30,6 +30,7 @@
 
 import com.google.common.collect.ImmutableSet;
 
+import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.index.TargetParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -135,7 +136,7 @@ public Optional<ColumnFamilyStore> getBackingTable()
     public Callable<Void> getBlockingFlushTask()
     {
         return () -> {
-            indexCfs.forceBlockingFlush();
+            indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             return null;
         };
     }
@@ -598,7 +599,7 @@ private void invalidate()
         CompactionManager.instance.interruptCompactionForCFs(cfss, (sstable) -> true, true);
         CompactionManager.instance.waitForCessation(cfss, (sstable) -> true);
         indexCfs.keyspace.writeOrder.awaitNewBarrier();
-        indexCfs.forceBlockingFlush();
+        indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         indexCfs.readOrdering.awaitNewBarrier();
         indexCfs.invalidate();
     }
@@ -623,7 +624,7 @@ private Callable<?> getBuildIndexTask()
 
     private void buildBlocking()
     {
-        baseCfs.forceBlockingFlush();
+        baseCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         try (ColumnFamilyStore.RefViewFragment viewFragment = baseCfs.selectAndReference(View.selectFunction(SSTableSet.CANONICAL));
              Refs<SSTableReader> sstables = viewFragment.refs)
@@ -647,7 +648,7 @@ private void buildBlocking()
                                                                          ImmutableSet.copyOf(sstables));
             Future<?> future = CompactionManager.instance.submitIndexBuild(builder);
             FBUtilities.waitOnFuture(future);
-            indexCfs.forceBlockingFlush();
+            indexCfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         logger.info("Index build of {} complete", metadata.name);
     }
diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
index 78e7b29905b7..752d77185692 100644
--- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
@@ -456,7 +456,7 @@ private void testPrefixSearchWithContainsMode(boolean forceFlush)
 
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         final UntypedResultSet results = executeCQL(FTS_CF_NAME, "SELECT * FROM %s.%s WHERE artist LIKE 'lady%%'");
         Assert.assertNotNull(results);
@@ -808,7 +808,7 @@ private void testColumnNamesWithSlashes(boolean forceFlush)
         rm3.build().apply();
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         final ByteBuffer dataOutputId = UTF8Type.instance.decompose("/data/output/id");
 
@@ -970,7 +970,7 @@ private void redistributeSummaries(int expected, ColumnFamilyStore store, ByteBu
     {
         setMinIndexInterval(minIndexInterval);
         IndexSummaryManager.instance.redistributeSummaries();
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<String> rows = getIndexed(store, 100, buildExpression(firstName, Operator.LIKE_CONTAINS, UTF8Type.instance.decompose("a")));
         Assert.assertEquals(rows.toString(), expected, rows.size());
@@ -1214,7 +1214,7 @@ private void testUnicodeSupport(boolean forceFlush)
         rm.build().apply();
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<String> rows;
 
@@ -1286,7 +1286,7 @@ private void testUnicodeSuffixModeNoSplits(boolean forceFlush)
         rm.build().apply();
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<String> rows;
 
@@ -1347,7 +1347,7 @@ public void testThatTooBigValueIsRejected()
             rows = getIndexed(store, 10, buildExpression(comment, Operator.LIKE_MATCHES, bigValue.duplicate()));
             Assert.assertEquals(0, rows.size());
 
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             rows = getIndexed(store, 10, buildExpression(comment, Operator.LIKE_MATCHES, bigValue.duplicate()));
             Assert.assertEquals(0, rows.size());
@@ -1450,7 +1450,7 @@ public void testChinesePrefixSearch()
         update(rm, fullName, UTF8Type.instance.decompose("利久 寺地"), 8000);
         rm.build().apply();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
 
         Set<String> rows;
@@ -1487,7 +1487,7 @@ public void testLowerCaseAnalyzer(boolean forceFlush)
         rm.build().apply();
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<String> rows;
 
@@ -1572,7 +1572,7 @@ public void testPrefixSSTableLookup()
         rm.build().apply();
 
         // first flush would make interval for name - 'johnny' -> 'pavel'
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         rm = new Mutation.PartitionUpdateCollector(KS_NAME, decoratedKey("key6"));
         update(rm, name, UTF8Type.instance.decompose("Jason"), 6000);
@@ -1587,7 +1587,7 @@ public void testPrefixSSTableLookup()
         rm.build().apply();
 
         // this flush is going to produce range - 'jason' -> 'vijay'
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // make sure that overlap of the prefixes is properly handled across sstables
         // since simple interval tree lookup is not going to cover it, prefix lookup actually required.
@@ -1751,7 +1751,7 @@ public void testClusteringIndexes(boolean forceFlush)
         executeCQL(CLUSTERING_CF_NAME_1 ,"INSERT INTO %s.%s (name, nickname, location, age, height, score) VALUES (?, ?, ?, ?, ?, ?)", "Jordan", "jrwest", "US", 27, 182, 1.0);
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         UntypedResultSet results;
 
@@ -1838,7 +1838,7 @@ public void testClusteringIndexes(boolean forceFlush)
         executeCQL(CLUSTERING_CF_NAME_2 ,"INSERT INTO %s.%s (name, nickname, location, age, height, score) VALUES (?, ?, ?, ?, ?, ?)", "Christopher", "chis", "US", 27, 180, 1.0);
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         results = executeCQL(CLUSTERING_CF_NAME_2 ,"SELECT * FROM %s.%s WHERE location LIKE 'US' AND age = 43 ALLOW FILTERING");
         Assert.assertNotNull(results);
@@ -1864,7 +1864,7 @@ public void testStaticIndex(boolean shouldFlush)
         executeCQL(STATIC_CF_NAME, "INSERT INTO %s.%s (sensor_id,date,value,variance) VALUES(?, ?, ?, ?)", 1, 20160403L, 24.96, 4);
 
         if (shouldFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         executeCQL(STATIC_CF_NAME, "INSERT INTO %s.%s (sensor_id,sensor_type) VALUES(?, ?)", 2, "PRESSURE");
         executeCQL(STATIC_CF_NAME, "INSERT INTO %s.%s (sensor_id,date,value,variance) VALUES(?, ?, ?, ?)", 2, 20160401L, 1.03, 9);
@@ -1872,7 +1872,7 @@ public void testStaticIndex(boolean shouldFlush)
         executeCQL(STATIC_CF_NAME, "INSERT INTO %s.%s (sensor_id,date,value,variance) VALUES(?, ?, ?, ?)", 2, 20160403L, 1.01, 4);
 
         if (shouldFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         UntypedResultSet results;
 
@@ -1953,7 +1953,7 @@ public void testTableRebuild() throws Exception
         executeCQL(CLUSTERING_CF_NAME_1, "INSERT INTO %s.%s (name, location, age, height, score) VALUES (?, ?, ?, ?, ?)", "Pavel", "BY", 28, 182, 2.0);
         executeCQL(CLUSTERING_CF_NAME_1, "INSERT INTO %s.%s (name, nickname, location, age, height, score) VALUES (?, ?, ?, ?, ?, ?)", "Jordan", "jrwest", "US", 27, 182, 1.0);
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTable ssTable = store.getSSTables(SSTableSet.LIVE).iterator().next();
         Path path = FileSystems.getDefault().getPath(ssTable.getFilename().replace("-Data", "-SI_" + CLUSTERING_CF_NAME_1 + "_age"));
@@ -1992,7 +1992,7 @@ public void testIndexRebuild()
 
         executeCQL(CLUSTERING_CF_NAME_1, "INSERT INTO %s.%s (name, nickname) VALUES (?, ?)", "Alex", "ifesdjeen");
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for (Index index : store.indexManager.listIndexes())
         {
@@ -2118,7 +2118,7 @@ private void testLIKEAndEQSemanticsWithDifferenceKindsOfIndexes(String containsT
         {
             Keyspace keyspace = Keyspace.open(KS_NAME);
             for (String table : Arrays.asList(containsTable, prefixTable, analyzedPrefixTable))
-                keyspace.getColumnFamilyStore(table).forceBlockingFlush();
+                keyspace.getColumnFamilyStore(table).forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         UntypedResultSet results;
@@ -2352,7 +2352,7 @@ public void testIndexMemtableSwitching() throws Exception
 
         Assert.assertTrue(rangesSize(beforeFlushMemtable, expression) > 0);
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         IndexMemtable afterFlushMemtable = index.getCurrentMemtable();
 
@@ -2499,7 +2499,7 @@ private ColumnFamilyStore loadData(Map<String, Pair<String, Integer>> data, bool
         ColumnFamilyStore store = Keyspace.open(KS_NAME).getColumnFamilyStore(CF_NAME);
 
         if (forceFlush)
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         return store;
     }
diff --git a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
index 8760f43b8089..e67a4c3de4cd 100644
--- a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
+++ b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
@@ -96,7 +96,7 @@ private void insertN(ColumnFamilyStore cfs, int n, long base) throws Throwable
             execute("INSERT INTO %s (pk) VALUES (?)", base + i);
 
         // flush to write the sstable
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
     }
 
     private void assertDiskSpaceEqual(ColumnFamilyStore cfs)
diff --git a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java
index 461c13cd09ee..721eabfdd5ba 100644
--- a/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java
+++ b/test/unit/org/apache/cassandra/io/compress/CompressedSequentialWriterReopenTest.java
@@ -78,12 +78,12 @@ public void compressionEnabled() throws Throwable
         {
             execute("insert into %s (id, t) values (?, ?)", i, ByteBuffer.wrap(blob));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         for (int i = 0; i < 10000; i++)
         {
             execute("insert into %s (id, t) values (?, ?)", i, ByteBuffer.wrap(blob));
         }
-        getCurrentColumnFamilyStore().forceBlockingFlush();
+        flush();
         DatabaseDescriptor.setSSTablePreemptiveOpenIntervalInMB(1);
         getCurrentColumnFamilyStore().forceMajorCompaction();
     }
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
index 9bae92386e2a..bff6a547edeb 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
@@ -196,7 +196,7 @@ private void createSSTables(String ksname, String cfname, int numSSTables, int n
                     .build()
                     .applyUnsafe();
             }
-            futures.add(cfs.forceFlush());
+            futures.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
         }
         for (Future future : futures)
         {
@@ -521,7 +521,7 @@ public void testRebuildAtSamplingLevel() throws IOException
             .applyUnsafe();
         }
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
@@ -586,7 +586,7 @@ public void testJMXFunctions() throws IOException
                 .build()
                 .applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         assertTrue(manager.getAverageIndexInterval() >= cfs.metadata().params.minIndexInterval);
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
index 07a2212e8f9d..548274fd5a86 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
@@ -125,7 +125,7 @@ private void createSSTables(String ksname, String cfname, int numSSTables, int n
                 .build()
                 .applyUnsafe();
             }
-            futures.add(cfs.forceFlush());
+            futures.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS));
         }
         for (Future future : futures)
         {
diff --git a/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java
index aaf1a2aa779b..8c1a343940ea 100644
--- a/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/ReducingKeyIteratorTest.java
@@ -87,7 +87,7 @@ public void testTotalAndReadBytes(int tableCount, int rowCount) throws IOExcepti
                 .build()
                 .applyUnsafe();
             }
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         Set<SSTableReader> sstables = store.getLiveSSTables();
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java
index 2510c5e8a487..abf9f554ffc2 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableCorruptionDetectionTest.java
@@ -117,7 +117,7 @@ public static void setUp()
                    .add("reg2", ByteBuffer.wrap(reg2));
             writer.append(builder.build().unfilteredIterator());
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         ssTableReader = writer.finish(true);
         txn.update(ssTableReader, false);
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
index ac0cda1cde80..c8aaebc19af6 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
@@ -140,7 +140,7 @@ public void testLoadingSSTable() throws Exception
         }
 
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
-        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); // wait for sstables to be on disk else we won't be able to stream them
 
         final CountDownLatch latch = new CountDownLatch(1);
         SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false));
@@ -181,7 +181,7 @@ public void testLoadingIncompleteSSTable() throws Exception
         }
 
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD2);
-        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); // wait for sstables to be on disk else we won't be able to stream them
 
         //make sure we have some tables...
         assertTrue(Objects.requireNonNull(dataDir.listFiles()).length > 0);
@@ -229,14 +229,14 @@ public void testLoadingSSTableToDifferentKeyspace() throws Exception
         }
 
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
-        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); // wait for sstables to be on disk else we won't be able to stream them
 
         final CountDownLatch latch = new CountDownLatch(1);
         SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false), 1, KEYSPACE2);
         loader.stream(Collections.emptySet(), completionStreamListener(latch)).get();
 
         cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         List<FilteredPartition> partitions = Util.getAll(Util.cmd(cfs).build());
 
@@ -268,7 +268,7 @@ public void testLoadingBackupsTable() throws Exception
         }
 
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_BACKUPS);
-        cfs.forceBlockingFlush(); // wait for sstables to be on disk else we won't be able to stream them
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); // wait for sstables to be on disk else we won't be able to stream them
 
         final CountDownLatch latch = new CountDownLatch(1);
         SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false));
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
index 33a5d7ca055c..cde8d089efc2 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
@@ -95,7 +95,7 @@ public void testTrackMaxDeletionTime()
             .build()
             .applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, store.getLiveSSTables().size());
         int ttltimestamp = (int) (System.currentTimeMillis() / 1000);
         int firstDelTime = 0;
@@ -113,7 +113,7 @@ public void testTrackMaxDeletionTime()
         .applyUnsafe();
 
         ttltimestamp = (int) (System.currentTimeMillis() / 1000);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(2, store.getLiveSSTables().size());
         List<SSTableReader> sstables = new ArrayList<>(store.getLiveSSTables());
         if (sstables.get(0).getSSTableMetadata().maxLocalDeletionTime < sstables.get(1).getSSTableMetadata().maxLocalDeletionTime)
@@ -163,7 +163,7 @@ public void testWithDeletes()
         .build()
         .applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, store.getLiveSSTables().size());
         int ttltimestamp = (int) (System.currentTimeMillis() / 1000);
         int firstMaxDelTime = 0;
@@ -175,7 +175,7 @@ public void testWithDeletes()
 
         RowUpdateBuilder.deleteRow(store.metadata(), timestamp + 1, "deletetest", "todelete").applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(2, store.getLiveSSTables().size());
         boolean foundDelete = false;
         for (SSTableReader sstable : store.getLiveSSTables())
@@ -212,7 +212,7 @@ public void trackMaxMinColNames() throws CharacterCodingException
                     .applyUnsafe();
             }
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, store.getLiveSSTables().size());
         for (SSTableReader sstable : store.getLiveSSTables())
         {
@@ -233,7 +233,7 @@ public void trackMaxMinColNames() throws CharacterCodingException
             .applyUnsafe();
         }
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         store.forceMajorCompaction();
         assertEquals(1, store.getLiveSSTables().size());
         for (SSTableReader sstable : store.getLiveSSTables())
@@ -260,7 +260,7 @@ public void testLegacyCounterShardTracking()
         ColumnFamily cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
@@ -271,7 +271,7 @@ public void testLegacyCounterShardTracking()
         cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
@@ -282,7 +282,7 @@ public void testLegacyCounterShardTracking()
         cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertTrue(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
 
@@ -292,7 +292,7 @@ public void testLegacyCounterShardTracking()
         cells = ArrayBackedSortedColumns.factory.create(cfs.metadata);
         cells.addColumn(new BufferCounterCell(cellname("col"), state.context, 1L, Long.MIN_VALUE));
         new Mutation(Util.dk("k").getKey(), cells).applyUnsafe();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertFalse(cfs.getLiveSSTables().iterator().next().getSSTableMetadata().hasLegacyCounterShards);
         cfs.truncateBlocking();
     } */
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 80828e1b64e3..4e7bae6f1e88 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -144,7 +144,7 @@ public void testGetPositionsForRanges()
                 .build()
                 .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         List<Range<Token>> ranges = new ArrayList<Range<Token>>();
@@ -193,7 +193,7 @@ public void testEstimatedKeysForRangesAndKeySamples()
             tokens.add(mutation.key().getToken());
         }
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, store.getLiveSSTables().size());
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
 
@@ -264,7 +264,7 @@ public void testSpannedIndexPositions() throws IOException
                 .build()
                 .applyUnsafe();
             }
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
             CompactionManager.instance.performMaximal(store, false);
 
             // check that all our keys are found correctly
@@ -306,7 +306,7 @@ public void testPersistentStatistics()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         clearAndLoad(store);
         assert store.metric.maxPartitionSize.getValue() != 0;
@@ -335,7 +335,7 @@ public void testReadRateTracking()
             .applyUnsafe();
         }
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
         assertEquals(0, sstable.getReadMeter().count());
@@ -368,7 +368,7 @@ public void testGetPositionsForRangesWithKeyCache()
             .applyUnsafe();
 
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
@@ -400,7 +400,7 @@ public void testPersistentStatisticsWithSecondaryIndex()
             .build()
             .applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // check if opening and querying works
         assertIndexQueryWorks(store);
@@ -422,7 +422,7 @@ public void testGetPositionsKeyCacheStats()
             .build()
             .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
@@ -470,7 +470,7 @@ public void testOpeningSSTable() throws Exception
                 .build()
                 .applyUnsafe();
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
         Descriptor desc = sstable.descriptor;
@@ -572,7 +572,7 @@ public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
         .build()
         .applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         for(ColumnFamilyStore indexCfs : store.indexManager.getAllIndexColumnFamilyStores())
         {
@@ -601,7 +601,7 @@ public void testGetScannerForNoIntersectingRanges() throws Exception
             .build()
             .applyUnsafe();
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         boolean foundScanner = false;
         for (SSTableReader s : store.getLiveSSTables())
         {
@@ -635,7 +635,7 @@ public void testGetPositionsForRangesFromTableOpenedForBulkLoading() throws IOEx
             .applyUnsafe();
 
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         // construct a range which is present in the sstable, but whose
@@ -674,7 +674,7 @@ public void testIndexSummaryReplacement() throws IOException, ExecutionException
             .applyUnsafe();
 
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         Collection<SSTableReader> sstables = store.getLiveSSTables();
@@ -753,7 +753,7 @@ private void testIndexSummaryUpsampleAndReload0() throws Exception
             .applyUnsafe();
 
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         Collection<SSTableReader> sstables = store.getLiveSSTables();
@@ -868,7 +868,7 @@ private SSTableReader getNewSSTable(ColumnFamilyStore cfs, int numKeys, int step
             .build()
             .applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         return Sets.difference(cfs.getLiveSSTables(), before).iterator().next();
     }
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
index 30b1d5ca3221..ccd41b739c67 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
@@ -83,7 +83,7 @@ public void basicTest()
                 .build()
                 .apply();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Set<SSTableReader> sstables = new HashSet<>(cfs.getLiveSSTables());
         assertEquals(1, sstables.size());
         assertEquals(sstables.iterator().next().bytesOnDisk(), cfs.metric.liveDiskSpaceUsed.getCount());
@@ -700,7 +700,7 @@ public void testAllKeysReadable() throws Exception
                     .build()
                     .apply();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         cfs.forceMajorCompaction();
         validateKeys(keyspace);
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
index eff95fccbb1c..6d94f778f460 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableScannerTest.java
@@ -215,7 +215,7 @@ public void testSingleDataRange() throws IOException
 
         for (int i = 2; i < 10; i++)
             insertRowWithKey(store.metadata(), i);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, store.getLiveSSTables().size());
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
@@ -321,7 +321,7 @@ public void testMultipleRanges() throws IOException
         for (int i = 0; i < 3; i++)
             for (int j = 2; j < 10; j++)
                 insertRowWithKey(store.metadata(), i * 100 + j);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, store.getLiveSSTables().size());
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
@@ -441,7 +441,7 @@ public void testSingleKeyMultipleRanges() throws IOException
         store.disableAutoCompaction();
 
         insertRowWithKey(store.metadata(), 205);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         assertEquals(1, store.getLiveSSTables().size());
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java
index 3cf96f2698f8..8092bb60c2b3 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java
@@ -116,7 +116,7 @@ public static void defineSchema() throws Exception
             .applyUnsafe();
             expectedRowCount++;
         }
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         sstable = store.getLiveSSTables().iterator().next();
     }
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
index 58185782f8ec..d2cafecc941c 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
@@ -81,7 +81,7 @@ public void emptyBlockTolerance()
         QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=? WHERE k=? AND c=?", KEYSPACE, table), bytes(0x20000), key, 2);
         QueryProcessor.executeInternal(String.format("UPDATE %s.%s SET v1=? WHERE k=? AND c=?", KEYSPACE, table), bytes(0x20000), key, 3);
 
-        tbl.forceBlockingFlush();
+        tbl.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = Iterables.getOnlyElement(tbl.getLiveSSTables());
         DecoratedKey dk = tbl.getPartitioner().decorateKey(Int32Type.instance.decompose(key));
         BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
diff --git a/test/unit/org/apache/cassandra/repair/ValidatorTest.java b/test/unit/org/apache/cassandra/repair/ValidatorTest.java
index cf3411a24fe4..f4e47ed6cc75 100644
--- a/test/unit/org/apache/cassandra/repair/ValidatorTest.java
+++ b/test/unit/org/apache/cassandra/repair/ValidatorTest.java
@@ -186,7 +186,7 @@ public void simpleValidationTest(int n) throws Exception
 
         CompactionsTest.populate(keyspace, columnFamily, 0, n, 0); //ttl=3s
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
 
         // wait enough to force single compaction
@@ -243,7 +243,7 @@ public void testSizeLimiting() throws Exception
         // 2 ** 14 rows would normally use 2^14 leaves, but with only 1 meg we should only use 2^12
         CompactionsTest.populate(keyspace, columnFamily, 0, 1 << 14, 0);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
 
         // wait enough to force single compaction
@@ -302,7 +302,7 @@ public void testRangeSplittingTreeSizeLimit() throws Exception
         // 2 ** 14 rows would normally use 2^14 leaves, but with only 1 meg we should only use 2^12
         CompactionsTest.populate(keyspace, columnFamily, 0, 1 << 14, 0);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertEquals(1, cfs.getLiveSSTables().size());
 
         // wait enough to force single compaction
diff --git a/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java b/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
index 6c427245c4e5..9172a7ce462b 100644
--- a/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
+++ b/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
@@ -114,7 +114,7 @@ private static SSTableReader createSSTable(int startKey, int keys)
             int key = startKey + i;
             QueryProcessor.executeInternal(String.format("INSERT INTO %s.%s (k, v) VALUES (?, ?)", cfm.keyspace, cfm.name), key, key);
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         return Iterables.getOnlyElement(Sets.difference(cfs.getLiveSSTables(), existing));
     }
 
diff --git a/test/unit/org/apache/cassandra/schema/MigrationManagerTest.java b/test/unit/org/apache/cassandra/schema/MigrationManagerTest.java
index ff58151f998f..ad05c36490e5 100644
--- a/test/unit/org/apache/cassandra/schema/MigrationManagerTest.java
+++ b/test/unit/org/apache/cassandra/schema/MigrationManagerTest.java
@@ -182,7 +182,7 @@ public void addNewTable() throws ConfigurationException
         // flush to exercise more than just hitting the memtable
         ColumnFamilyStore cfs = Keyspace.open(ksName).getColumnFamilyStore(tableName);
         assertNotNull(cfs);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // and make sure we get out what we put in
         UntypedResultSet rows = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s", ksName, tableName));
@@ -205,7 +205,7 @@ public void dropCf() throws ConfigurationException
                                            "dropCf", "col" + i, "anyvalue");
         ColumnFamilyStore store = Keyspace.open(cfm.keyspace).getColumnFamilyStore(cfm.name);
         assertNotNull(store);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertTrue(store.getDirectories().sstableLister(Directories.OnTxnErr.THROW).list().size() > 0);
 
         MigrationManager.announceTableDrop(ks.name, cfm.name, false);
@@ -254,7 +254,7 @@ public void addNewKS() throws ConfigurationException
                                        "key0", "col0", "val0");
         ColumnFamilyStore store = Keyspace.open(cfm.keyspace).getColumnFamilyStore(cfm.name);
         assertNotNull(store);
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         UntypedResultSet rows = QueryProcessor.executeInternal("SELECT * FROM newkeyspace1.newstandard1");
         assertRows(rows, row("key0", "col0", "val0"));
@@ -276,7 +276,7 @@ public void dropKS() throws ConfigurationException
                                            "dropKs", "col" + i, "anyvalue");
         ColumnFamilyStore cfs = Keyspace.open(cfm.keyspace).getColumnFamilyStore(cfm.name);
         assertNotNull(cfs);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         assertTrue(!cfs.getDirectories().sstableLister(Directories.OnTxnErr.THROW).list().isEmpty());
 
         MigrationManager.announceKeyspaceDrop(ks.name);
@@ -357,7 +357,7 @@ public void createEmptyKsAddNewCf() throws ConfigurationException
 
         ColumnFamilyStore cfs = Keyspace.open(newKs.name).getColumnFamilyStore(newCf.name);
         assertNotNull(cfs);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         UntypedResultSet rows = QueryProcessor.executeInternal(String.format("SELECT * FROM %s.%s", EMPTY_KEYSPACE, tableName));
         assertRows(rows, row("key0", "col0", "val0"));
@@ -512,7 +512,7 @@ public void testDropIndex() throws ConfigurationException
                                                     TABLE1i),
                                        "key0", "col0", 1L, 1L);
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         ColumnFamilyStore indexCfs = cfs.indexManager.getIndexByName(indexName)
                                                      .getBackingTable()
                                                      .orElseThrow(throwAssert("Cannot access index cfs"));
diff --git a/test/unit/org/apache/cassandra/schema/MockSchema.java b/test/unit/org/apache/cassandra/schema/MockSchema.java
index 901acbfcf2f2..0e7914ee1a79 100644
--- a/test/unit/org/apache/cassandra/schema/MockSchema.java
+++ b/test/unit/org/apache/cassandra/schema/MockSchema.java
@@ -30,6 +30,8 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.db.memtable.SkipListMemtable;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
@@ -65,7 +67,7 @@ public class MockSchema
 
     public static Memtable memtable(ColumnFamilyStore cfs)
     {
-        return new Memtable(cfs.metadata());
+        return new SkipListMemtable(cfs.metadata);
     }
 
     public static SSTableReader sstable(int generation, ColumnFamilyStore cfs)
diff --git a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java
index ad680f52d334..e84f0fb2dd05 100644
--- a/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java
+++ b/test/unit/org/apache/cassandra/service/ActiveRepairServiceTest.java
@@ -326,7 +326,7 @@ private void createSSTables(ColumnFamilyStore cfs, int count)
                 .build()
                 .applyUnsafe();
             }
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
index ee652e2b91c9..27ca4f67e8ac 100644
--- a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
+++ b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
@@ -122,7 +122,7 @@ public void testTombstoneWarning() throws Exception
                 client.execute(query);
             }
             ColumnFamilyStore store = Keyspace.open(KEYSPACE).getColumnFamilyStore(currentTable());
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             for (int i = 0; i < iterations; i++)
             {
@@ -132,7 +132,7 @@ public void testTombstoneWarning() throws Exception
                                                                     i), QueryOptions.DEFAULT);
                 client.execute(query);
             }
-            store.forceBlockingFlush();
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
             {
                 QueryMessage query = new QueryMessage(String.format("SELECT * FROM %s.%s WHERE pk = 1",
diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
index 348f9adca975..967d05d5566e 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
@@ -97,7 +97,7 @@ public void testRangeQueried()
             builder.add("val", String.valueOf(i));
             builder.build().applyUnsafe();
         }
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         PartitionRangeReadCommand command = (PartitionRangeReadCommand) Util.cmd(cfs).build();
         AbstractBounds<PartitionPosition> keyRange = command.dataRange().keyRange();
diff --git a/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java b/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java
index 085302fb96d2..60e51fcf4285 100644
--- a/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java
+++ b/test/unit/org/apache/cassandra/streaming/EntireSSTableStreamingCorrectFilesCountTest.java
@@ -102,7 +102,7 @@ public static void defineSchemaAndPrepareSSTable()
             .applyUnsafe();
         }
 
-        store.forceBlockingFlush();
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
         sstable = store.getLiveSSTables().iterator().next();
diff --git a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
index 0bf7f2047fac..e76123ee5907 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
@@ -84,7 +84,7 @@ public void testScheduleTimeout() throws Exception
         for (int i = 0; i < 2; i++)
         {
             SchemaLoader.insertData(KEYSPACE1, CF_STANDARD, i, 1);
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         // create streaming task that streams those two sstables
@@ -135,7 +135,7 @@ public void testFailSessionDuringTransferShouldNotReleaseReferences() throws Exc
         for (int i = 0; i < 2; i++)
         {
             SchemaLoader.insertData(KEYSPACE1, CF_STANDARD, i, 1);
-            cfs.forceBlockingFlush();
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
 
         // create streaming task that streams those two sstables
diff --git a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
index d99d35f90489..75d449a12821 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamingTransferTest.java
@@ -176,7 +176,7 @@ private List<String> createAndTransfer(ColumnFamilyStore cfs, Mutator mutator, b
         long timestamp = 1234;
         for (int i = 1; i <= 3; i++)
             mutator.mutate("key" + i, "col" + i, timestamp);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
         assertEquals(1, cfs.getLiveSSTables().size());
 
@@ -364,7 +364,7 @@ public void testTransferRangeTombstones() throws Exception
                 .build()
                 .apply();
 
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         cfs.clearUnsafe();
@@ -556,7 +556,7 @@ public void mutate(String key, String colName, long timestamp) throws Exception
         // write a lot more data so the data is spread in more than 1 chunk.
         for (int i = 1; i <= 6000; i++)
             mutator.mutate("key" + i, "col" + i, System.currentTimeMillis());
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         Util.compactAll(cfs, Integer.MAX_VALUE).get();
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
         cfs.clearUnsafe();
diff --git a/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java b/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
index 9785d840f807..0aad6b863ead 100644
--- a/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
+++ b/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
@@ -139,7 +139,7 @@ private void setupTestSstables() throws Throwable
             executeFormattedQuery(formatQuery("INSERT INTO %s (id, val) VALUES (?, ?)"), "mockData" + i, "mockData" + i);
 
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         Set<SSTableReader> sstables = cfs.getLiveSSTables();
         sstableFileName = sstables.iterator().next().getFilename();
diff --git a/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java b/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
index 570610b964d5..ddf3d28b4c22 100644
--- a/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
+++ b/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
@@ -139,7 +139,7 @@ public void testUpgrade() throws Throwable
     private List<String> getSStableFiles(String ks, String table) throws StartupException
     {
         ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore(table);
-        cfs.forceBlockingFlush();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         ColumnFamilyStore.scrubDataDirectories(cfs.metadata());
 
         Set<SSTableReader> sstables = cfs.getLiveSSTables();
diff --git a/update-history/STAR-801/53-f0c0b5f5db STAR-14: Memtable API b/update-history/STAR-801/53-f0c0b5f5db STAR-14: Memtable API
new file mode 100644
index 000000000000..9291b2bc36bf
--- /dev/null
+++ b/update-history/STAR-801/53-f0c0b5f5db STAR-14: Memtable API	
@@ -0,0 +1,1038 @@
+--- a/build.xml
++++ b/build.xml
+@@ -728,16 +728,9 @@
+         <dependency groupId="org.apache.ant" artifactId="ant-junit"/>
+         <!-- adding this dependency is necessary for assertj. When updating assertj, need to also update the version of
+              this that the new assertj's `assertj-parent-pom` depends on. -->
+-<<<<<<<
+-        <dependency groupId="org.junit" artifactId="junit-bom" type="pom"/>
++        <dependency groupId="org.junit" artifactId="junit-bom" version="5.6.0" type="pom"/>
+         <dependency groupId="org.awaitility" artifactId="awaitility"/>
+         <dependency groupId="org.hamcrest" artifactId="hamcrest"/>
+-=======
+-        <dependency groupId="org.junit" artifactId="junit-bom" version="5.6.0" type="pom" scope="test"/>
+-        <dependency groupId="org.assertj" artifactId="assertj-core" scope="test"/>
+-        <dependency groupId="org.awaitility" artifactId="awaitility" scope="test"/>
+-        <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
+->>>>>>>
+         <!-- coverage debs -->
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.agent"/>
+         <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
+--- a/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java
++++ b/test/unit/org/apache/cassandra/cql3/MemtableSizeTest.java
+@@ -18,13 +18,10 @@
+ 
+ package org.apache.cassandra.cql3;
+ 
+-<<<<<<<
+-import com.google.common.base.Throwables;
+-=======
+ import java.util.List;
+ 
++import com.google.common.base.Throwables;
+ import com.google.common.collect.ImmutableList;
+->>>>>>>
+ import org.junit.Assert;
+ import org.junit.BeforeClass;
+ import org.junit.Test;
+@@ -77,48 +74,24 @@
+     @Test
+     public void testTruncationReleasesLogSpace()
+     {
+-<<<<<<<
+         Util.flakyTest(this::testSize, 2, "Fails occasionally, see CASSANDRA-16684");
+     }
+-=======
+-        keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
+-        table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid))" +
+-                                      " with compression = {'enabled': false}" +
+-                                      " and memtable = { 'class': '" + memtableClass + "'}");
+-        execute("use " + keyspace + ';');
+-
+-        String writeStatement = "INSERT INTO "+table+"(userid,picid,commentid)VALUES(?,?,?)";
+-
+-        cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+-        cfs.disableAutoCompaction();
+-        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+-
+-        long deepSizeBefore = ObjectSizes.measureDeep(cfs.getTracker().getView().getCurrentMemtable());
+-        System.out.printf("Memtable deep size before %s\n%n",
+-                          FBUtilities.prettyPrintMemory(deepSizeBefore));
+-        long i;
+-        long limit = partitions;
+-        System.out.println("Writing " + partitions + " partitions of " + rowsPerPartition + " rows");
+-        for (i = 0; i < limit; ++i)
+-        {
+-            for (long j = 0; j < rowsPerPartition; ++j)
+-                execute(writeStatement, i, j, i + j);
+-        }
+->>>>>>>
+ 
+     private void testSize()
+     {
+         try
+         {
+             keyspace = createKeyspace("CREATE KEYSPACE %s with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 } and durable_writes = false");
+-            table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid)) with compression = {'enabled': false}");
++            table = createTable(keyspace, "CREATE TABLE %s ( userid bigint, picid bigint, commentid bigint, PRIMARY KEY(userid, picid))" +
++                                      " with compression = {'enabled': false}" +
++                                      " and memtable = { 'class': '" + memtableClass + "'}");
+             execute("use " + keyspace + ';');
+ 
+             String writeStatement = "INSERT INTO " + table + "(userid,picid,commentid)VALUES(?,?,?)";
+ 
+             cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
+             cfs.disableAutoCompaction();
+-            cfs.forceBlockingFlush();
++            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+ 
+             long deepSizeBefore = ObjectSizes.measureDeep(cfs.getTracker().getView().getCurrentMemtable());
+             System.out.printf("Memtable deep size before %s\n%n",
+@@ -153,15 +126,13 @@
+                 System.out.println("Warning: " + cfs.getLiveSSTables().size() + " sstables created.");
+ 
+             Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
+-            long actualHeap = memtable.getAllocator().onHeap().owns();
+-            System.out.printf("Memtable in %s mode: %d ops, %s serialized bytes, %s (%.0f%%) on heap, %s (%.0f%%) off-heap%n",
++            Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
++        long actualHeap = usage.ownsOnHeap;
++            System.out.printf("Memtable in %s mode: %d ops, %s serialized bytes, %s %n",
+                               DatabaseDescriptor.getMemtableAllocationType(),
+                               memtable.getOperations(),
+                               FBUtilities.prettyPrintMemory(memtable.getLiveDataSize()),
+-                              FBUtilities.prettyPrintMemory(actualHeap),
+-                              100 * memtable.getAllocator().onHeap().ownershipRatio(),
+-                              FBUtilities.prettyPrintMemory(memtable.getAllocator().offHeap().owns()),
+-                              100 * memtable.getAllocator().offHeap().ownershipRatio());
++                              usage);
+ 
+             long deepSizeAfter = ObjectSizes.measureDeep(memtable);
+             System.out.printf("Memtable deep size %s\n%n",
+@@ -178,32 +149,5 @@
+         {
+             Throwables.propagate(throwable);
+         }
+-<<<<<<<
+-=======
+-
+-
+-        if (!cfs.getLiveSSTables().isEmpty())
+-            System.out.println("Warning: " + cfs.getLiveSSTables().size() + " sstables created.");
+-
+-        Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
+-        Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
+-        long actualHeap = usage.ownsOnHeap;
+-        System.out.printf("Memtable in %s mode: %d ops, %s serialized bytes, %s%n",
+-                          DatabaseDescriptor.getMemtableAllocationType(),
+-                          memtable.getOperations(),
+-                          FBUtilities.prettyPrintMemory(memtable.getLiveDataSize()),
+-                          usage);
+-
+-        long deepSizeAfter = ObjectSizes.measureDeep(memtable);
+-        System.out.printf("Memtable deep size %s\n%n",
+-                          FBUtilities.prettyPrintMemory(deepSizeAfter));
+-
+-        long expectedHeap = deepSizeAfter - deepSizeBefore;
+-        String message = String.format("Expected heap usage close to %s, got %s.\n",
+-                                       FBUtilities.prettyPrintMemory(expectedHeap),
+-                                       FBUtilities.prettyPrintMemory(actualHeap));
+-        System.out.println(message);
+-        Assert.assertTrue(message, Math.abs(actualHeap - expectedHeap) <= MAX_DIFFERENCE);
+->>>>>>>
+     }
+ }
+\ No newline at end of file
+--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
++++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
+@@ -616,13 +616,9 @@
+             new CounterMutation(new Mutation(update), ConsistencyLevel.ONE).apply();
+         }
+ 
+-<<<<<<<
+-        cfs.forceBlockingFlush();
++        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+ 
+         return tokenSorted.toArray(ArrayUtils.EMPTY_STRING_ARRAY);
+-=======
+-        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+->>>>>>>
+     }
+ 
+     @Test
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
+index e130378d4f..e39acdb176 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewComplexDeletionsTest.java
+@@ -42,6 +42,7 @@ import org.apache.cassandra.utils.FBUtilities;
+ import static org.apache.cassandra.cql3.ViewComplexTest.createView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
+ import static org.junit.Assert.assertEquals;
+ 
+ /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+@@ -103,17 +104,17 @@ public class ViewComplexDeletionsTest extends CQLTester
+ 
+         updateView("UPDATE %s USING TIMESTAMP 10 SET b=1 WHERE k=1 AND c=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRows(execute("SELECT * from %s"), row(1, 1, null, 1));
+         assertRows(execute("SELECT * from mv"), row(1, 1));
+         updateView("DELETE b FROM %s USING TIMESTAMP 11 WHERE k=1 AND c=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertEmpty(execute("SELECT * from %s"));
+         assertEmpty(execute("SELECT * from mv"));
+         updateView("UPDATE %s USING TIMESTAMP 1 SET a=1 WHERE k=1 AND c=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRows(execute("SELECT * from %s"), row(1, 1, 1, null));
+         assertRows(execute("SELECT * from mv"), row(1, 1));
+ 
+@@ -255,27 +256,27 @@ public class ViewComplexDeletionsTest extends CQLTester
+ 
+         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 0", 1, 1, 1, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
+ 
+         // remove view row
+         updateView("UPDATE %s using timestamp 1 set b = null WHERE a=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+         // remove base row, no view updated generated.
+         updateView("DELETE FROM %s using timestamp 2 where a=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+ 
+         // restor view row with b,c column. d is still tombstone
+         updateView("UPDATE %s using timestamp 3 set b = 1,c = 1 where a=1", version, this); // upsert
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, null));
+     }
+@@ -314,35 +315,35 @@ public class ViewComplexDeletionsTest extends CQLTester
+         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
+         // sstable-2
+         updateView("Delete from %s using timestamp 2 where p = 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"));
+         // sstable-3
+         updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+         // sstable-4
+         updateView("UPdate %s using timestamp 4 set v1 = 2 where p = 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
+         // sstable-5
+         updateView("UPdate %s using timestamp 5 set v1 = 1 where p = 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+ 
+@@ -396,7 +397,7 @@ public class ViewComplexDeletionsTest extends CQLTester
+         updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                 row(3, 4, 1L));
+@@ -404,14 +405,14 @@ public class ViewComplexDeletionsTest extends CQLTester
+         updateView("Delete from %s using timestamp 2 where p1 = 1 and p2 = 2;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // view are empty
+         assertRowsIgnoringOrder(execute("SELECT * from mv2"));
+         // insert PK with TS=3
+         updateView("Insert into %s (p1, p2) values (1, 2) using timestamp 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // deleted column in MV remained dead
+         assertRowsIgnoringOrder(execute("SELECT * from mv2"), row(2, 1, null, null));
+ 
+@@ -421,21 +422,21 @@ public class ViewComplexDeletionsTest extends CQLTester
+         // reset values
+         updateView("Insert into %s (p1, p2, v1, v2) values (1, 2, 3, 4) using timestamp 10;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                 row(3, 4, 10L));
+ 
+         updateView("UPDATE %s using timestamp 20 SET v2 = 5 WHERE p1 = 1 and p2 = 2", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                 row(3, 5, 20L));
+ 
+         updateView("DELETE FROM %s using timestamp 10 WHERE p1 = 1 and p2 = 2", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, v2, WRITETIME(v2) from mv2 WHERE p1 = ? AND p2 = ?", 1, 2),
+                                 row(null, 5, 20L));
+@@ -460,21 +461,21 @@ public class ViewComplexDeletionsTest extends CQLTester
+         updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(5, 1L));
+         // remove row/mv TS=2
+         updateView("Delete from %s using timestamp 2 where p = 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // view are empty
+         assertRowsIgnoringOrder(execute("SELECT * from mv"));
+         // insert PK with TS=3
+         updateView("Insert into %s (p, v1) values (3, 1) using timestamp 3;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // deleted column in MV remained dead
+         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
+ 
+@@ -482,7 +483,7 @@ public class ViewComplexDeletionsTest extends CQLTester
+         updateView("Insert into %s (p, v1, v2) values (3, 1, 5) using timestamp 2;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // deleted column in MV remained dead
+         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 3, null));
+         assertRowsIgnoringOrder(execute("SELECT * from mv limit 1"), row(1, 3, null));
+@@ -491,7 +492,7 @@ public class ViewComplexDeletionsTest extends CQLTester
+         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET v2 = ? WHERE p = ?", 4, 3);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 4, 3L));
+ 
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
+index 6999bef2d6..db9c2051cc 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewComplexLivenessTest.java
+@@ -38,6 +38,7 @@ import org.apache.cassandra.utils.FBUtilities;
+ import static org.apache.cassandra.cql3.ViewComplexTest.createView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
+ import static org.junit.Assert.assertEquals;
+ 
+ /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+@@ -181,8 +182,8 @@ public class ViewComplexLivenessTest extends CQLTester
+         }
+         if (flush)
+         {
+-            ks.getColumnFamilyStore("mv1").forceBlockingFlush();
+-            ks.getColumnFamilyStore("mv2").forceBlockingFlush();
++            ks.getColumnFamilyStore("mv1").forceBlockingFlush(UNIT_TESTS);
++            ks.getColumnFamilyStore("mv2").forceBlockingFlush(UNIT_TESTS);
+         }
+ 
+         for (String view : Arrays.asList("mv1", "mv2"))
+@@ -224,7 +225,7 @@ public class ViewComplexLivenessTest extends CQLTester
+         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
+ 
+         updateView("Update %s set v1 = null WHERE p = 1", version, this);
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"));
+ 
+         cfs.forceMajorCompaction(); // before gc grace second, strict-liveness tombstoned dead row remains
+@@ -237,7 +238,7 @@ public class ViewComplexLivenessTest extends CQLTester
+         assertEquals(0, cfs.getLiveSSTables().size());
+ 
+         updateView("Update %s using ttl 5 set v1 = 1 WHERE p = 1", version, this);
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p, v1, v2 from mv"), row(1, 1, 1));
+ 
+         cfs.forceMajorCompaction(); // before ttl+gc_grace_second, strict-liveness ttled dead row remains
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
+index 76a8933fb7..6eb16ef29e 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewComplexTTLTest.java
+@@ -36,6 +36,7 @@ import org.apache.cassandra.utils.FBUtilities;
+ import static org.apache.cassandra.cql3.ViewComplexTest.createView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateViewWithFlush;
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
+ import static org.junit.Assert.assertTrue;
+ 
+ /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+@@ -111,7 +112,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("UPDATE %s SET a = 1 WHERE k = 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"), row(1, 1, null));
+         assertRows(execute("SELECT * from mv"), row(1, 1, null));
+@@ -119,7 +120,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("DELETE a FROM %s WHERE k = 1", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"));
+         assertEmpty(execute("SELECT * from mv"));
+@@ -127,7 +128,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("INSERT INTO %s (k) VALUES (1);", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"), row(1, null, null));
+         assertEmpty(execute("SELECT * from mv"));
+@@ -135,7 +136,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("UPDATE %s USING TTL 5 SET a = 10 WHERE k = 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"), row(1, 10, null));
+         assertRows(execute("SELECT * from mv"), row(10, 1, null));
+@@ -143,7 +144,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("UPDATE %s SET b = 100 WHERE k = 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"), row(1, 10, 100));
+         assertRows(execute("SELECT * from mv"), row(10, 1, 100));
+@@ -158,7 +159,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("DELETE b FROM %s WHERE k=1", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRows(execute("SELECT * from %s"), row(1, null, null));
+         assertEmpty(execute("SELECT * from mv"));
+@@ -166,7 +167,7 @@ public class ViewComplexTTLTest extends CQLTester
+         updateView("DELETE FROM %s WHERE k=1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertEmpty(execute("SELECT * from %s"));
+         assertEmpty(execute("SELECT * from mv"));
+@@ -250,11 +251,11 @@ public class ViewComplexTTLTest extends CQLTester
+ 
+         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) using timestamp 1;", version, this);
+ 
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;", version, this);
+ 
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         Thread.sleep(4000);
+ 
+@@ -265,11 +266,11 @@ public class ViewComplexTTLTest extends CQLTester
+ 
+         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING TTL 3 and timestamp 1;", version, this);
+ 
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         updateView("INSERT INTO %s (p, c, v) VALUES (0, 0, 0) USING timestamp 1;", version, this);
+ 
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         Thread.sleep(4000);
+ 
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
+index ea05eef50c..7ff7cd2465 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewComplexTest.java
+@@ -47,6 +47,7 @@ import org.apache.cassandra.db.compaction.CompactionManager;
+ import org.apache.cassandra.transport.ProtocolVersion;
+ import org.apache.cassandra.utils.FBUtilities;
+ 
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.*;
+ import static org.junit.Assert.fail;
+ 
+ /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+@@ -133,7 +134,7 @@ public class ViewComplexTest extends CQLTester
+             Thread.sleep(1);
+         }
+         if (flush)
+-            Keyspace.open(cqlTester.keyspace()).flush();
++            Keyspace.open(cqlTester.keyspace()).flush(UNIT_TESTS);
+     }
+ 
+     @Test
+@@ -167,37 +168,37 @@ public class ViewComplexTest extends CQLTester
+ 
+         updateView("UPDATE %s USING TIMESTAMP 1 set v1 =1 where p1 = 1 AND p2 = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, 1, null));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, 1, null));
+ 
+         updateView("UPDATE %s USING TIMESTAMP 2 set v1 = null, v2 = 1 where p1 = 1 AND p2 = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
+ 
+         updateView("UPDATE %s USING TIMESTAMP 2 set v2 = null where p1 = 1 AND p2 = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
+ 
+         updateView("INSERT INTO %s (p1,p2) VALUES(1,1) USING TIMESTAMP 3;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, null));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, null));
+ 
+         updateView("DELETE FROM %s USING TIMESTAMP 4 WHERE p1 =1 AND p2 = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"));
+ 
+         updateView("UPDATE %s USING TIMESTAMP 5 set v2 = 1 where p1 = 1 AND p2 = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from %s"), row(1, 1, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT p1, p2, v1, v2 from mv"), row(1, 1, null, 1));
+     }
+@@ -233,14 +234,14 @@ public class ViewComplexTest extends CQLTester
+         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 1;", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3), row(3, 1L));
+         // sstable 2
+         updateView("UPdate %s using timestamp 2 set v2 = null where p = 3", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v2, WRITETIME(v2) from mv WHERE v1 = ? AND p = ?", 1, 3),
+                                 row(null, null));
+@@ -248,14 +249,14 @@ public class ViewComplexTest extends CQLTester
+         updateView("UPdate %s using timestamp 3 set v1 = 2 where p = 3", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, null, null));
+         // sstable 4
+         updateView("UPdate %s using timestamp 4 set v1 = 1 where p = 3", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, null, null));
+ 
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java b/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
+index f2a627df56..5c973a100c 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewComplexUpdatesTest.java
+@@ -36,6 +36,7 @@ import org.apache.cassandra.utils.FBUtilities;
+ 
+ import static org.apache.cassandra.cql3.ViewComplexTest.createView;
+ import static org.apache.cassandra.cql3.ViewComplexTest.updateView;
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
+ 
+ /* ViewComplexTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+  * Any changes here check if they apply to the other classes:
+@@ -109,7 +110,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("UPDATE %s USING TIMESTAMP 0 SET v1 = 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, 1, null));
+         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+@@ -117,7 +118,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("DELETE v1 FROM %s USING TIMESTAMP 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+@@ -126,7 +127,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("UPDATE %s USING TIMESTAMP 1 SET v1 = 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+@@ -134,7 +135,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("UPDATE %s USING TIMESTAMP 2 SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+@@ -142,7 +143,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("DELETE v1 FROM %s USING TIMESTAMP 3 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+@@ -150,7 +151,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("DELETE v2 FROM %s USING TIMESTAMP 4 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertEmpty(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0));
+         assertEmpty(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0));
+@@ -158,7 +159,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("UPDATE %s USING TTL 3 SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+@@ -171,7 +172,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+         updateView("UPDATE %s SET v2 = 1 WHERE p = 0 AND c = 0", version, this);
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * from %s WHERE c = ? AND p = ?", 0, 0), row(0, 0, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT * from mv WHERE c = ? AND p = ?", 0, 0), row(0, 0));
+@@ -212,23 +213,23 @@ public class ViewComplexUpdatesTest extends CQLTester
+ 
+         updateView("UPDATE %s SET l=l+[1,2,3] WHERE k = 1 AND c = 1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+ 
+         updateView("UPDATE %s SET l=l-[1,2] WHERE k = 1 AND c = 1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRows(execute("SELECT * from mv"), row(1, 1, null, null));
+ 
+         updateView("UPDATE %s SET b=3 WHERE k=1 AND c=1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRows(execute("SELECT * from mv"), row(1, 1, null, 3));
+ 
+         updateView("UPDATE %s SET b=null, l=l-[3], s=s-{3} WHERE k = 1 AND c = 1", version, this);
+         if (flush)
+         {
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+             ks.getColumnFamilyStore("mv").forceMajorCompaction();
+         }
+         assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"));
+@@ -236,7 +237,7 @@ public class ViewComplexUpdatesTest extends CQLTester
+ 
+         updateView("UPDATE %s SET m=m+{3:3}, l=l-[1], s=s-{2} WHERE k = 1 AND c = 1", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,c,a,b from %s"), row(1, 1, null, null));
+         assertRowsIgnoringOrder(execute("SELECT * from mv"), row(1, 1, null, null));
+ 
+@@ -279,23 +280,23 @@ public class ViewComplexUpdatesTest extends CQLTester
+         // reset value
+         updateView("Insert into %s (p, v1, v2) values (3, 1, 3) using timestamp 6;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+         // increase pk's timestamp to 20
+         updateView("Insert into %s (p) values (3) using timestamp 20;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+         // change v1's to 2 and remove existing view row with ts7
+         updateView("UPdate %s using timestamp 7 set v1 = 2 where p = 3;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(2, 3, 3, 6L));
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv limit 1"), row(2, 3, 3, 6L));
+         // change v1's to 1 and remove existing view row with ts8
+         updateView("UPdate %s using timestamp 8 set v1 = 1 where p = 3;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT v1, p, v2, WRITETIME(v2) from mv"), row(1, 3, 3, 6L));
+     }
+ 
+@@ -330,41 +331,41 @@ public class ViewComplexUpdatesTest extends CQLTester
+         ks.getColumnFamilyStore("mv").disableAutoCompaction();
+         updateView("DELETE FROM %s USING TIMESTAMP 0 WHERE k = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         // sstable-1, Set initial values TS=1
+         updateView("INSERT INTO %s(k, a, b) VALUES (1, 1, 1) USING TIMESTAMP 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 1));
+         updateView("UPDATE %s USING TIMESTAMP 10 SET b = 2 WHERE k = 1;", version, this);
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+         updateView("UPDATE %s USING TIMESTAMP 2 SET a = 2 WHERE k = 1;", version, this);
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         ks.getColumnFamilyStore("mv").forceMajorCompaction();
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 2, 2));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv limit 1"), row(1, 2, 2));
+         updateView("UPDATE %s USING TIMESTAMP 11 SET a = 1 WHERE k = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
+ 
+         // set non-key base column as tombstone, view row is removed with shadowable
+         updateView("UPDATE %s USING TIMESTAMP 12 SET a = null WHERE k = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, null, 2));
+ 
+         // column b should be alive
+         updateView("UPDATE %s USING TIMESTAMP 13 SET a = 1 WHERE k = 1;", version, this);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from mv"), row(1, 1, 2));
+         assertRowsIgnoringOrder(execute("SELECT k,a,b from %s"), row(1, 1, 2));
+ 
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+index 374f79f27c..2d4cbb65ab 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+@@ -41,6 +41,8 @@ import org.apache.cassandra.db.SystemKeyspace;
+ import org.apache.cassandra.transport.ProtocolVersion;
+ import org.apache.cassandra.utils.FBUtilities;
+ 
++import static org.apache.cassandra.db.ColumnFamilyStore.FlushReason.UNIT_TESTS;
++
+ /* ViewFilteringTest class has been split into multiple ones because of timeout issues (CASSANDRA-16670)
+  * Any changes here check if they apply to the other classes
+  * - ViewFilteringPKTest
+@@ -203,7 +205,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 0", 1, 1, 1, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         // views should be updated.
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
+@@ -215,7 +217,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 1 set c = ? WHERE a=?", 0, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -226,7 +228,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 2 set c = ? WHERE a=?", 1, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         // row should be back in views.
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
+@@ -238,7 +240,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 3 set d = ? WHERE a=?", 0, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 0));
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -249,7 +251,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 4 set c = ? WHERE a=?", 0, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -260,7 +262,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 5 set d = ? WHERE a=?", 1, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         // should not update as c=0
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+@@ -283,7 +285,7 @@ public class ViewFilteringTest extends CQLTester
+         updateView("UPDATE %s using timestamp 7 set b = ? WHERE a=?", 2, 1);
+         if (flush)
+         {
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+             for (String view : views)
+                 ks.getColumnFamilyStore(view).forceMajorCompaction();
+         }
+@@ -297,7 +299,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("DELETE b, c FROM %s using timestamp 6 WHERE a=?", 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM %s"), row(1, 2, null, 1));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+@@ -309,7 +311,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("DELETE FROM %s using timestamp 8 where a=?", 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -320,7 +322,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("UPDATE %s using timestamp 9 set b = ?,c = ? where a=?", 1, 1, 1); // upsert
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, null));
+         assertRows(execute("SELECT * FROM mv_test2"));
+@@ -331,7 +333,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("DELETE FROM %s using timestamp 10 where a=?", 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -342,7 +344,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?) using timestamp 11", 1, 1, 1, 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         // row should be back in views.
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1, 1));
+@@ -354,7 +356,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         updateView("DELETE FROM %s using timestamp 12 where a=?", 1);
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowCount(execute("SELECT * FROM mv_test1"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test2"), 0);
+@@ -415,7 +417,7 @@ public class ViewFilteringTest extends CQLTester
+                 list(1, 1, 2),
+                 set(1, 2),
+                 map(1, 1, 2, 2));
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"), row(1, 1, 1));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test2"), row(1, 1));
+@@ -423,7 +425,7 @@ public class ViewFilteringTest extends CQLTester
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
+ 
+         execute("UPDATE %s SET l=l-[1] WHERE a = 1 AND b = 1" );
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test2"));
+@@ -431,7 +433,7 @@ public class ViewFilteringTest extends CQLTester
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
+ 
+         execute("UPDATE %s SET s=s-{2}, m=m-{2} WHERE a = 1 AND b = 1");
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+@@ -440,7 +442,7 @@ public class ViewFilteringTest extends CQLTester
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test4"), row(1, 1));
+ 
+         execute("UPDATE %s SET  m=m-{1} WHERE a = 1 AND b = 1");
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+@@ -450,7 +452,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         // filter conditions result not changed
+         execute("UPDATE %s SET  l=l+[2], s=s-{0}, m=m+{3:3} WHERE a = 1 AND b = 1");
+-        FBUtilities.waitOnFutures(ks.flush());
++        FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         assertRowsIgnoringOrder(execute("SELECT a,b,c FROM %s"), row(1, 1, 1));
+         assertRowsIgnoringOrder(execute("SELECT * FROM mv_test1"));
+@@ -892,14 +894,14 @@ public class ViewFilteringTest extends CQLTester
+         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         //update c's timestamp TS=2
+         executeNet(version, "UPDATE %s USING TIMESTAMP 2 SET c = ? WHERE a = ? and b = ? ", 1, 0, 0);
+         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0));
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         //change c's value and TS=3, tombstones c=1 and adds c=0 record
+         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET c = ? WHERE a = ? and b = ? ", 0, 0, 0);
+@@ -908,7 +910,7 @@ public class ViewFilteringTest extends CQLTester
+         if(flush)
+         {
+             ks.getColumnFamilyStore("mv").forceMajorCompaction();
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         }
+ 
+         //change c's value back to 1 with TS=4, check we can see d
+@@ -916,7 +918,7 @@ public class ViewFilteringTest extends CQLTester
+         if (flush)
+         {
+             ks.getColumnFamilyStore("mv").forceMajorCompaction();
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+         }
+ 
+         assertRows(execute("SELECT d, e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, null));
+@@ -926,14 +928,14 @@ public class ViewFilteringTest extends CQLTester
+         assertRows(execute("SELECT d, e from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(0, 1));
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         //Change d value @ TS=2
+         executeNet(version, "UPDATE %s USING TIMESTAMP 2 SET d = ? WHERE a = ? and b = ? ", 2, 0, 0);
+         assertRows(execute("SELECT d from mv WHERE c = ? and a = ? and b = ?", 1, 0, 0), row(2));
+ 
+         if (flush)
+-            FBUtilities.waitOnFutures(ks.flush());
++            FBUtilities.waitOnFutures(ks.flush(UNIT_TESTS));
+ 
+         //Change d value @ TS=3
+         executeNet(version, "UPDATE %s USING TIMESTAMP 3 SET d = ? WHERE a = ? and b = ? ", 1, 0, 0);
+@@ -1022,7 +1024,7 @@ public class ViewFilteringTest extends CQLTester
+         for (int i = 0; i < 100; i++)
+             updateView("INSERT into %s (k,c,val)VALUES(?,?,?)", 0, i % 2, "baz");
+ 
+-        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush();
++        Keyspace.open(keyspace()).getColumnFamilyStore(currentTable()).forceBlockingFlush(UNIT_TESTS);
+ 
+         Assert.assertEquals(2, execute("select * from %s").size());
+         Assert.assertEquals(2, execute("select * from mv_tstest").size());

From 55dbb8e71d537141b244e7ccc98fec36c2237469 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 16 Apr 2021 10:23:29 +0200
Subject: [PATCH 051/151] STAR-489: Add missing expected 'memtable' param of
 column family in cqlsh tests

(cherry picked from commit dd5d05cba1f8e38fdf9dad5df1396a8268028074)
(cherry picked from commit 96c1f63022dfd5da3405f973e0f7ed78eff8f98c)
---
 pylib/cqlshlib/test/test_cqlsh_output.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pylib/cqlshlib/test/test_cqlsh_output.py b/pylib/cqlshlib/test/test_cqlsh_output.py
index 5678def303c0..116a004ba9f7 100644
--- a/pylib/cqlshlib/test/test_cqlsh_output.py
+++ b/pylib/cqlshlib/test/test_cqlsh_output.py
@@ -665,6 +665,7 @@ def test_describe_columnfamily_output(self):
                 AND comment = ''
                 AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy', 'max_threshold': '32', 'min_threshold': '4'}
                 AND compression = {'chunk_length_in_kb': '16', 'class': 'org.apache.cassandra.io.compress.LZ4Compressor'}
+                AND memtable = {}
                 AND crc_check_chance = 1.0
                 AND default_time_to_live = 0
                 AND extensions = {}

From 70fc3e6cd3ee2accaf0d7c8d27013fb68f20c4e6 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Fri, 16 Apr 2021 13:29:21 +0200
Subject: [PATCH 052/151] STAR-258: Fix Scrubber

This fixes a problem introduced in one of the previous tickets (STAR-15 or thereabouts).

seekToNextRow is expected to position dataFile at the data position retrieved from the index entry. The buggy implementation stopped there, after a single seek attempt. The original implementation, however, keeps iterating over the index until it reaches a data position that the data file can actually seek to.

(cherry picked from commit 07611f6171a516d8d34630af407543fab4bd7d78)
(cherry picked from commit 3a518db0a2cfeb1c2f5ab04bec07c7847c471615)
---
 .../cassandra/db/compaction/Scrubber.java     | 36 ++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index b0d601937a47..ce93389d1917 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -384,19 +384,31 @@ private boolean indexAvailable()
 
     private void seekToNextRow()
     {
-        long nextRowPositionFromIndex = indexIterator.isExhausted()
-                                        ? dataFile.length()
-                                        : indexIterator.dataPosition();
-
-        try
-        {
-            dataFile.seek(nextRowPositionFromIndex);
-        }
-        catch (Throwable th)
+        while (!indexIterator.isExhausted())
         {
-            throwIfFatal(th);
-            outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
-            badRows++;
+            long nextRowPositionFromIndex = indexIterator.dataPosition();
+
+            try
+            {
+                dataFile.seek(nextRowPositionFromIndex);
+                return;
+            }
+            catch (Throwable th)
+            {
+                throwIfFatal(th);
+                outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
+                badRows++;
+            }
+
+            try
+            {
+                indexIterator.advance();
+            }
+            catch (Throwable th)
+            {
+                outputHandler.warn(String.format("Failed to go to the next entry in index, index position: %d", indexIterator.indexPosition()), th);
+                throw Throwables.cleaned(th);
+            }
         }
     }
 

From 1d642978df157294759d01d1a728233a59c402a1 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Mon, 19 Apr 2021 10:08:51 +0200
Subject: [PATCH 053/151] STAR-247: TrieIndex SSTable format implementation

[ddc51ba35cd6312eb2529e48dfd659cf1f39506f] STAR-247: Add bigtable unit test configuration

[afbedc62b3cb097901d0eaf11b44900a75721f34] STAR-247: Disable potentiallyUnclosedCloseable in eclipse_compiler.properties

[fcd91fb96e42b7010874386f852cebe7e5f20c8c] STAR-247: Minor methods and fields signature changes and non invasive refactorings

[a808a209628547049076a64d889010acc14d08df] STAR-247: Move and rename BigTableZeroCopyWriter to SSTableZeroCopyWriter

[ffe6182b3bd4dad5d1e4a6bd15ee294b4a97c426] STAR-247: Don't hardcode sets of components of an SSTable
Use sets predefined per format instead

[a4fb1f14133824cdf62e582775aa02dff0eacfa2] STAR-247: Refactor creation of SSTableReader, use factory where possible
Previously SSTableReader was created usually in the following scheme:
static openXXX method -> SSTableReaderBuilder -> factory method -> constructor

Instead of that, we will use the scheme:
factory method -> static method -> builder -> constructor

The intermediate static method is left to keep the amount of changes minimal

[c27c2b0a64a822aa01e7fe6a7f8335f29545acd7] STAR-247: Update SSTable version - introduce DSE-specific components
Those components will be ignored for now, but we don't want to fail loading
such SSTables.

[f8efff91bd3108230b0b39855a6861eddbf384e9] STAR-247: Change contract of SSTR.keyAt and SSTFO.startPartition
The previous contract states that those methods accept a key position
in the primary index file. We need to change the contract as
primary index is not an obvious component as BTI format does not
have it. Instead, those methods will use a key position in data file.

[d7a2268265ec939ad97d446f8223beafad88d6cc] STAR-247: Refactor usages of default file handle builder
In particular, index file handle builder will take a component as an argument

[ce68cb7fe3f90b15ce040ee776fabf25cc93e05f] STAR-247: Fix in Walker (add to STAR-75)

[17ad86e983f7f44054eec2c8b6f2d4d15c5ec92f] STAR-247: Use RowIndexEntry where possible

[13a6e62550eb0e22ebfe807f01022d7be6d36c37] STAR-247: Bring back ScrubIterator

[7e09ae0c948f45783c4ea37aa35faecbb2667d19] STAR-247: Add SSTR.hasIndex() method

[6ac6f1f7788627c867bcc850f6d558229d48097c] STAR-247: Refactor Scrubber

[b3383b16b07fbd669e974260db1e599bc1e8a0c9] STAR-247: Refactor IndexSummaryManager and IndexSummaryRedistribution
so that they are used only for BigTable SSTables

[7d18ef94652a91564af3888ed37530a730bc4373] STAR-247: Refactor instance tidier

[14429e6b5a3fb60fff483360628bd22f7113b4ae] STAR-247: Don't assume any format in CassandraStreamHeader

[9cef1d57193b69202c7655614580f7b326b7bd27] STAR-247: Pull up maybeLogLargePartitionWarning

[e80117d52146d6a0f62b13192d4050ec02da5e4e] STAR-247: Refactor SSTableWriter.openEarly

[092d615af68f4844a17e6de5edac5936efb62026] STAR-247: Add SequentialWriter.requestSyncOnNextFlush

[93ebc76e5a1439c8da99ab8e9188b4e5929a3d3e] STAR-247: Refactor FileHandle

[4194c58ec022741276271fa90a9e5b92ce43ea47] STAR-247: Use aligned rebufferer when it makes sense

[b1464106170b512b86ca8ca1439de7652d45f86d] STAR-247: Refactor stats collection

[dfa458278cfab40458cf7475c9e6b64285b01c88] STAR-247: Refactor SSTable verification

[36ef55edcf11edf7b1f16ca6b6cdfe14c86855b4] STAR-247: Various small changes and fixes

[4cfcb8d3611d655e67d2dd2c30b4cd9ecfe5347b] STAR-247: Assume big table format for big table format specific tests

[4090410205d150c69cccf6b94fbd83818d3771c7] STAR-247: Generify some tests to work regardless of sstable format being used

[d6dd8ea54c7baf106ce03e8ffee7171a2d2dec83] STAR-247: Exhaust row iterator before going to the next partition in tests

[87328ec17dfa907be63610e6310c874ec61d9ff1] STAR-247: Refactor ScrubTest

[a693753e5a37fe9de39aa370e9342fb81e3b097e] STAR-247: Comment out a part of one test case in SSTableRewriterTest
It is not clear what the contract is - see the comment in the code

[d1fb135b4359a802fc556252c59aaf30180f9acf] STAR-247: Other small fixes and changes in tests

[0c2095cd333c0638ddb92131f82d0048ad4972ed] STAR-247: Apply changes in OSS for use additional BTI implementation

[02df51813dae659cfe40d28b8ae55a37fd6723be] STAR-247: Copy BTI implementation from DSE

[10946deec4fcb8c43fbb6c3bfacdae50e99251a4] STAR-247: BTI specific tests

[5427eb03d56e4425ba14d9b1aea899bca651f69d] STAR-247: BTI - minor adjustments to OSS
- removed notions of reader constraint
- removed notions of zero copy metadata
- removed notions of file access type
- adjusted generic types
- renamed usage getSeekPosition to getFilePointer
- changed usages of getTempKey to getKey

[e80f503759e5e5942486980f3ab434506f95ae91] STAR-247: BTI - Add missing methods to TrieIndexFormat

[71b374ebac8dbce78931472209940472b073226f] STAR-247: BTI - Fix inheritance and visibility of some classes

[af3475a59b52c4b5b25efde749a6ce73a1ae969f] STAR-247: BTI - Fix TrieIndex version, add missing fields and remove unused

[371db79b7464cc741140195a229cd89ff3e0ced5] STAR-247: BTI - Refactor usages of update fileFileHandle

[9c7efbd9612081966c1ffee493174b53409a2b2e] STAR-247: BTI - remove index encryption from writer

[8d2b9985e479f870c7f36b9ba57ded7a43b5b90f] STAR-247: BTI - refactor usages of some methods which have different names in OSS
Also, when using default file handle builder for indexes, don't use compression

[7ca897a51bd5bca9a5af1521e5508fc1150a188d] STAR-247: BTI - remove maybeLogLargePartitionWarning as it is defined in parent class

[2efbf86d96cb1e77699472b479d1023f37d69832] STAR-247: BTI - update constructors of reader and writer

[de983f6d1c41682d24444e589a9a3b73fbbb6b93] STAR-247: BTI - Update clone and internalOpen method, add a new internalOpen method to reader

[efae04bca106455197d5874f80bf936dabafa374] STAR-247: BTI - Add bloom filter related methods

[c67bc891df77c576e3989dee29dd0c6524f583d1] STAR-247: BTI - rename / remove methods from ReaderFactory

[103bb1d5575962b5cdd64c8641b8db29812999cc] STAR-247: BTI - add a new open method for reader
The method loads indexes and bloom filter

Also add missing methods to the reader factory

[4134ab07a33784a454401a228152c90acea6b37f] STAR-247: BTI - add missing clone methods to the reader

[575eb7781021d41e92b2d3fd87ca632eb0d01b6c] STAR-247: BTI - add missing iterator and scanner creating methods

[6d062208a614225b57d1fbebb44cf953417a157b] STAR-247: BTI - add missing implementations of index summary and cache related methods and remove methods which are not needed from reader

[fade402a4f60c32792c3a339f653782541c4061d] STAR-247: BTI - refactor components verification - it is already implemented in the parent class

[98da04446b77fbaa143a82c05aa35d0513f152c5] STAR-247: BTI - refactor getExactPosition methods and add missing filterFirst and FilterLast

[3a7a1045678d3620cd5b950d0607daf72e6b580b] STAR-247: BTI - add one more implementation of coveredKeysIterator

[78d6d9e7b30cf5013f6727cd8d6a5c588f533a4e] STAR-247: BTI - add/remove remaining missing/unused methods in reader

[45b76a25aa3950ebb931752d933f63ef79fd0719] STAR-247: BTI - update writer factory

[162d580c1f5a619b8afa4f25d9ff7d0c0cdc5f1b] STAR-247: BTI - tidier usage refactoring

[0aa9cb1e438be5b7b34c094b2111fe6bbcadba8f] STAR-247: BTI - add missing methods to writer

[819ed6a5b363e88e18f352735f35f57120c753b3] STAR-247: BTI - refactor TrieIndexScanner

[9bdbcb3262c58dd0104605a2e5e8b8e38ac5587c] STAR-247: BTI - refactor PartitionIterator to conform the interface

[0150544fcdc1e66cc47edddfdb273b133a40ee27] STAR-247: BTI - add/remove the remaining missing/unused methods

[846414f267f94398c5a4f9fee338af66bfac1d0e] STAR-247: BTI - rename rowIndexCount to columnIndexCount in TrieIndexEntry

[2f2539dd2dd6032b0b275ba1c93d201dad249877] STAR-247: BTI - renamed TRIE_INDEX to BTI

[29caf8bf4637a924f39c929f3e44b45b4f1767c1] STAR-247: BTI - apply the remaining changes and import optimization

[26f797aa86b15dc2bd1064fbf756f488bff88aa7] STAR-247: Review: revert changes in Refs/RefCounted/SelfRefCounted

[213c5bdb7c98e97643d813645d944013f41fd006] STAR-247: Review: fix key iterators
It was actually a bug that I implemented KeyIterator so that it could return something quasi-meaningful other than what a 'next' call returns. I removed some unused methods, and wherever we need something more than a key, we now use PartitionIndexIterator.

Also, I've introduced the notion of a "preferred component" for keys. This means that SSTableFlushObserver.startPartition, PartitionIndexIterator.keyPosition and SSTableReader.keyAt now document their 'position' argument as the key position in the preferred component of the SSTable implementation. Big table uses the index file and Trie Index uses the data file. I took care never to say explicitly which component it is when operating at the abstract level.

[f9fedc21c2cfe3431ba5a971ab2c9c5c1b016c0d] STAR-247: Review: fix description in build.xml

[d3406a60afeb12abb0de0fd3ad893b9b9fe35e89] STAR-247: Review: remove me and nc formats

[fcc7bede4163a52a8f9dcc73b6396aec7e2661c7] STAR-247: Review: fixed releasing a ref
According to the suggestion in review comments, regardless of whether closing component readers is successful or not, we continue and try to release a ref

Errors are reported accordingly, stashed and eventually thrown after passing them to JVMStabilityInspector.

[391073c0a530e1e679ad8631c32cc9ddfea57ee9] STAR-247: Review: fixed indentation in SSTableWriterTest

[491b714acd0fd7ae23f7624733be5290ba531315] STAR-247: Review: Add a comment to ScrubTest

[90b29e89c8c4216bb9cb041f6814d4960a292235] STAR-247: Review: Removed MetadataCollector.addKeyHash and DecoratedKey.hash2_64

[166450cb534752db5e4ad83a26c5fe2dd5868cea] STAR-247: Review: Remove some code related to DB-4159

[a9eb3fec6f02e5e7a01ec4908fbab06951255856] STAR-247: Review: Revert some code related to DB-1220

[81f19103d9a94f13c7ff0d28ef72703b0d86f082] STAR-247: Review: Reformat TrieIndexSSTableReader

[e9f338b46c552172b1b21e50afd4aeff67d0042b] STAR-247: Review: Fix tidying SSTableReader

[46aa1fad1846b7f0f45a9aa32e6a35a86205e445] STAR-247: Review: Add assertion on permitMatchPastLast

[cb4f3d3fea130785ab6a6f1fc118c0ab530588d1] STAR-247: Review: Add a comment about moving validation to other place

[3669667a79b64659464aea43efa9b54619ccc75e] STAR-247: Review: Remove supplier from simpleIterator

[e3fbd93d94eb16f7c8d991065585984b75ff38c8] STAR-247: Review: Bring back PartitionIndex.dumpTrie and add some test to ensure coverage

[9d3d07e140d4c3aef1562ee79a8e1465f3a9280c] STAR-247: Review: Removed one stupid todo

[267447cf95e3c472c05a043f67e9965b094b5148] STAR-247: Review: moved keys.advance method to a separate line

[8101526f2da23985ac0445e2062d37d90ef456be] STAR-247: Review: Updated JavaDoc in PartitionIndexIterator

[ef9682b8710db624cdeff05533772370ada7e249] STAR-247: Review: Refactored keyAt and added SSTableReader.openKeyComponentReader

[8163cbb5a034a1a7835fe87d4379e0c6d2eb507d] STAR-247: Review: Removed notions of NotInCacheException

[1db85a55b5db307cef45d766377ba1eb31a37bf4] STAR-247: Review: Hopefully applied the rest of the comments

[67ebe91a97e4265fd8136bf3d16a372d855d76e5] STAR-247: fix JMXCompatabilityTest

[cf4a3d6ef0a99fd08ec013819df25c17843136b9] STAR-247: Post rebase changes: add missing method to TrieIndexFormat

[ef6e7f299a947487c294c08ee792a3aba47d0612] STAR-247: Post rebase changes: fix compilation problem in ScrubTest (does not fix the test though)

[c742b4264826494d1dc7ce4e8e7a57dace8771d1] STAR-247 fix ScrubTest

[dd264390237b38744249f407a8e799d60d24fe3b] Remove unneeded generic type argument of RowIndexEntry

[7b2b269b114e3da1c6c02c5fc7d7594f39402328] Remove the TPC-imposed Reader structure and replace it with iterators

[7457283eafede5b45fcc3dad0597092074947199] Combine AbstractBigTableIterator into AbstractSSTableIterator

[c73d9bf4e20df81dce2bf8d2bbf27be43d4f689f] Fix forgotten maybeValidateUnfiltered call

[bd8d36910d10a229f89d125c87ffadb9714357c8] Fix skipping over blocks with no content and reporting slice end tombstone

[9ea8af38209069fa71aa3402a3fa1155190610a4] Applied the remaining comments

[f566e336df4c16a9995cdd9aeff90e1098db70f1] Applied the remaining comments (2)

[d3aa9a46bd63757b27d926bcda21ca8fddbeb17d] Update LegacySSTableTest and add other formats to test

[109f509d786cb98c4ad91ecde54a3dba1e927b36] Suppress resource in default handle builder methods

Co-authored-by: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
Co-authored-by: Jacek Lewandowski <jacek.lewandowski@datastax.com>
(cherry picked from commit 48104fc268869dffd74eb8382a0daeb97469d97c)
(cherry picked from commit 568d8ed4e8aaab851f8c74f3d39d56b20fadd02e)
---
 build.xml                                     |   34 +-
 eclipse_compiler.properties                   |    5 +-
 .../apache/cassandra/cache/ChunkCache.java    |   28 +-
 .../cassandra/db/ColumnFamilyStore.java       |    2 +-
 .../org/apache/cassandra/db/DeletionTime.java |   10 +
 .../apache/cassandra/db/SSTableImporter.java  |    7 +-
 .../db/SinglePartitionReadCommand.java        |    3 +-
 .../cassandra/db/UnfilteredDeserializer.java  |   11 +-
 .../AbstractCompactionStrategy.java           |    4 +-
 .../db/compaction/CompactionController.java   |    2 +-
 .../db/compaction/CompactionInfo.java         |    4 +-
 .../db/compaction/LeveledManifest.java        |    3 +-
 .../cassandra/db/compaction/Scrubber.java     |   95 +-
 .../cassandra/db/compaction/Verifier.java     |   26 +-
 .../writers/MajorLeveledCompactionWriter.java |    6 +-
 .../writers/MaxSSTableSizeWriter.java         |    4 +-
 .../SplittingSizeTieredCompactionWriter.java  |    4 +-
 .../cassandra/db/lifecycle/Helpers.java       |    4 +-
 .../db/lifecycle/LifecycleTransaction.java    |    6 +-
 .../cassandra/db/lifecycle/Tracker.java       |    4 +-
 .../apache/cassandra/db/lifecycle/View.java   |   14 +-
 ...azilyInitializedUnfilteredRowIterator.java |    8 +
 .../org/apache/cassandra/db/rows/Rows.java    |    4 +-
 .../CassandraEntireSSTableStreamReader.java   |   10 +-
 .../db/streaming/CassandraOutgoingFile.java   |    6 +-
 .../db/streaming/CassandraStreamHeader.java   |    3 +-
 .../db/streaming/CassandraStreamManager.java  |    5 +-
 .../db/streaming/ComponentContext.java        |    3 +-
 .../db/streaming/ComponentManifest.java       |   19 +-
 .../cassandra/index/sai/SSTableContext.java   |    8 +-
 .../sai/StorageAttachedIndexBuilder.java      |   18 +-
 .../index/sasi/SASIIndexBuilder.java          |   37 +-
 .../cassandra/index/sasi/SSTableIndex.java    |    8 +-
 .../cassandra/io/sstable/Component.java       |    8 +
 .../io/sstable/CorruptSSTableException.java   |    6 +
 .../io/sstable/IndexSummaryManager.java       |    7 +-
 .../sstable/IndexSummaryRedistribution.java   |    5 +-
 .../cassandra/io/sstable/KeyIterator.java     |   55 +-
 .../io/sstable/ReducingKeyIterator.java       |   18 +-
 .../apache/cassandra/io/sstable/SSTable.java  |    5 -
 .../io/sstable/SSTableIdentityIterator.java   |    5 +-
 .../cassandra/io/sstable/SSTableLoader.java   |   36 +-
 .../cassandra/io/sstable/SSTableRewriter.java |   18 +-
 .../io/sstable/SimpleSSTableMultiWriter.java  |    4 +-
 .../sstable/UnsupportedSSTableException.java  |   29 +
 .../format/AbstractSSTableIterator.java       |  213 +++-
 .../format/PartitionIndexIterator.java        |   13 +-
 .../io/sstable/format/RowIndexEntry.java      |   24 +-
 .../sstable/format/SSTableFlushObserver.java  |    4 +-
 .../io/sstable/format/SSTableFormat.java      |   33 +-
 .../io/sstable/format/SSTableReader.java      |  377 ++++--
 .../sstable/format/SSTableReaderBuilder.java  |   57 +-
 .../sstable/format/SSTableReadsListener.java  |    4 +-
 .../io/sstable/format/SSTableWriter.java      |   43 +-
 ...Writer.java => SSTableZeroCopyWriter.java} |   33 +-
 .../format/ScrubPartitionIterator.java        |   54 +
 .../cassandra/io/sstable/format/Version.java  |    9 +
 .../format/big/AbstractBigTableIterator.java  |   87 --
 .../io/sstable/format/big/BigFormat.java      |   77 +-
 .../big/BigTablePartitionIndexIterator.java   |   19 +
 .../io/sstable/format/big/BigTableReader.java |   71 +-
 .../format/big/BigTableRowIndexEntry.java     |    2 +-
 .../io/sstable/format/big/BigTableWriter.java |   90 +-
 .../io/sstable/format/big/IndexState.java     |    2 +-
 .../sstable/format/big/SSTableIterator.java   |  137 +-
 .../format/big/SSTableReversedIterator.java   |    3 +-
 .../io/sstable/format/big/ScrubIterator.java  |   84 ++
 .../format/trieindex/PartitionIndex.java      |  434 +++++++
 .../trieindex/PartitionIndexBuilder.java      |  229 ++++
 .../format/trieindex/PartitionIndexEarly.java |   50 +
 .../format/trieindex/PartitionIterator.java   |  254 ++++
 .../format/trieindex/PartitionWriter.java     |  208 ++++
 .../format/trieindex/RowIndexReader.java      |  186 +++
 .../trieindex/RowIndexReverseIterator.java    |   70 ++
 .../format/trieindex/RowIndexWriter.java      |  122 ++
 .../format/trieindex/SSTableIterator.java     |  111 ++
 .../trieindex/SSTableReversedIterator.java    |  287 +++++
 .../format/trieindex/ScrubIterator.java       |   93 ++
 .../format/trieindex/TrieIndexEntry.java      |  105 ++
 .../format/trieindex/TrieIndexFormat.java     |  437 +++++++
 .../trieindex/TrieIndexSSTableReader.java     | 1099 +++++++++++++++++
 .../trieindex/TrieIndexSSTableWriter.java     |  580 +++++++++
 .../format/trieindex/TrieIndexScanner.java    |  432 +++++++
 .../sstable/metadata/MetadataCollector.java   |   11 +
 .../io/sstable/metadata/StatsMetadata.java    |   65 +
 .../io/tries/ReverseValueIterator.java        |  179 +++
 .../cassandra/io/tries/ValueIterator.java     |   13 +-
 .../org/apache/cassandra/io/tries/Walker.java |   12 +-
 .../cassandra/io/util/EmptyRebufferer.java    |   71 ++
 .../apache/cassandra/io/util/FileHandle.java  |   19 +-
 .../cassandra/io/util/LimitingRebufferer.java |   69 +-
 .../cassandra/io/util/RandomAccessReader.java |    1 +
 .../cassandra/io/util/RebuffererFactory.java  |    2 +
 .../cassandra/io/util/SimpleChunkReader.java  |    5 +-
 .../io/util/TailOverridingRebufferer.java     |   20 +-
 .../cassandra/io/util/WrappingRebufferer.java |  116 +-
 .../cassandra/metrics/TableMetrics.java       |    7 +-
 .../cassandra/repair/LocalSyncTask.java       |    2 +-
 .../cassandra/streaming/ProgressInfo.java     |    4 +-
 .../tools/SSTableExpiredBlockers.java         |    2 +-
 .../apache/cassandra/tools/SSTableExport.java |    2 +-
 .../tools/SSTableMetadataViewer.java          |    2 +-
 .../tools/SSTableOfflineRelevel.java          |    2 +-
 .../cassandra/tools/StandaloneScrubber.java   |    2 +-
 .../cassandra/tools/StandaloneSplitter.java   |   10 +-
 .../cassandra/tools/StandaloneUpgrader.java   |    2 +-
 .../cassandra/tools/StandaloneVerifier.java   |    9 +-
 src/java/org/apache/cassandra/tools/Util.java |    8 +-
 .../cassandra/utils/ByteBufferUtil.java       |   23 +
 .../org/apache/cassandra/utils/IFilter.java   |    6 +
 .../utils/JVMStabilityInspector.java          |    4 +
 .../apache/cassandra/utils/NativeLibrary.java |   13 +
 .../org/apache/cassandra/utils/SyncUtil.java  |    7 +
 .../apache/cassandra/utils/Throwables.java    |    3 +
 .../cassandra/utils/concurrent/Refs.java      |   11 +-
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_aa_clust/aa-1-bti-Data.db          |  Bin 0 -> 5328 bytes
 .../legacy_aa_clust/aa-1-bti-Digest.crc32     |    1 +
 .../legacy_aa_clust/aa-1-bti-Filter.db        |  Bin 0 -> 16 bytes
 .../legacy_aa_clust/aa-1-bti-Partitions.db    |  Bin 0 -> 62 bytes
 .../legacy_aa_clust/aa-1-bti-Rows.db          |  Bin 0 -> 563 bytes
 .../legacy_aa_clust/aa-1-bti-Statistics.db    |  Bin 0 -> 7095 bytes
 .../legacy_aa_clust/aa-1-bti-TOC.txt          |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_aa_clust_compact/aa-1-bti-Data.db  |  Bin 0 -> 5364 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_aa_clust_compact/aa-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 7095 bytes
 .../legacy_aa_clust_compact/aa-1-bti-TOC.txt  |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../legacy_aa_clust_counter/aa-1-bti-Data.db  |  Bin 0 -> 6003 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_aa_clust_counter/aa-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 7104 bytes
 .../legacy_aa_clust_counter/aa-1-bti-TOC.txt  |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../aa-1-bti-Data.db                          |  Bin 0 -> 5923 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../aa-1-bti-Rows.db                          |  Bin 0 -> 563 bytes
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 7104 bytes
 .../aa-1-bti-TOC.txt                          |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../aa-1-bti-Data.db                          |  Bin 0 -> 68 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 39 bytes
 .../aa-1-bti-Rows.db                          |    0
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 4814 bytes
 .../aa-1-bti-TOC.txt                          |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_aa_simple/aa-1-bti-Data.db         |  Bin 0 -> 89 bytes
 .../legacy_aa_simple/aa-1-bti-Digest.crc32    |    1 +
 .../legacy_aa_simple/aa-1-bti-Filter.db       |  Bin 0 -> 16 bytes
 .../legacy_aa_simple/aa-1-bti-Partitions.db   |  Bin 0 -> 59 bytes
 .../legacy_aa_simple/aa-1-bti-Rows.db         |    0
 .../legacy_aa_simple/aa-1-bti-Statistics.db   |  Bin 0 -> 4648 bytes
 .../legacy_aa_simple/aa-1-bti-TOC.txt         |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_aa_simple_compact/aa-1-bti-Data.db |  Bin 0 -> 91 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 59 bytes
 .../legacy_aa_simple_compact/aa-1-bti-Rows.db |    0
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 4737 bytes
 .../legacy_aa_simple_compact/aa-1-bti-TOC.txt |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_aa_simple_counter/aa-1-bti-Data.db |  Bin 0 -> 143 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../legacy_aa_simple_counter/aa-1-bti-Rows.db |    0
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 4657 bytes
 .../legacy_aa_simple_counter/aa-1-bti-TOC.txt |    8 +
 .../aa-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../aa-1-bti-Data.db                          |  Bin 0 -> 142 bytes
 .../aa-1-bti-Digest.crc32                     |    1 +
 .../aa-1-bti-Filter.db                        |  Bin 0 -> 16 bytes
 .../aa-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../aa-1-bti-Rows.db                          |    0
 .../aa-1-bti-Statistics.db                    |  Bin 0 -> 4754 bytes
 .../aa-1-bti-TOC.txt                          |    8 +
 .../ac-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_ac_clust/ac-1-bti-Data.db          |  Bin 0 -> 5236 bytes
 .../legacy_ac_clust/ac-1-bti-Digest.crc32     |    1 +
 .../legacy_ac_clust/ac-1-bti-Filter.db        |  Bin 0 -> 24 bytes
 .../legacy_ac_clust/ac-1-bti-Partitions.db    |  Bin 0 -> 62 bytes
 .../legacy_ac_clust/ac-1-bti-Rows.db          |  Bin 0 -> 563 bytes
 .../legacy_ac_clust/ac-1-bti-Statistics.db    |  Bin 0 -> 7095 bytes
 .../legacy_ac_clust/ac-1-bti-TOC.txt          |    8 +
 .../ac-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../legacy_ac_clust_counter/ac-1-bti-Data.db  |  Bin 0 -> 5805 bytes
 .../ac-1-bti-Digest.crc32                     |    1 +
 .../ac-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ac-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_ac_clust_counter/ac-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../ac-1-bti-Statistics.db                    |  Bin 0 -> 7104 bytes
 .../legacy_ac_clust_counter/ac-1-bti-TOC.txt  |    8 +
 .../ac-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ac_simple/ac-1-bti-Data.db         |  Bin 0 -> 88 bytes
 .../legacy_ac_simple/ac-1-bti-Digest.crc32    |    1 +
 .../legacy_ac_simple/ac-1-bti-Filter.db       |  Bin 0 -> 24 bytes
 .../legacy_ac_simple/ac-1-bti-Partitions.db   |  Bin 0 -> 59 bytes
 .../legacy_ac_simple/ac-1-bti-Rows.db         |    0
 .../legacy_ac_simple/ac-1-bti-Statistics.db   |  Bin 0 -> 4648 bytes
 .../legacy_ac_simple/ac-1-bti-TOC.txt         |    8 +
 .../ac-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ac_simple_counter/ac-1-bti-Data.db |  Bin 0 -> 140 bytes
 .../ac-1-bti-Digest.crc32                     |    1 +
 .../ac-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ac-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../legacy_ac_simple_counter/ac-1-bti-Rows.db |    0
 .../ac-1-bti-Statistics.db                    |  Bin 0 -> 4657 bytes
 .../legacy_ac_simple_counter/ac-1-bti-TOC.txt |    8 +
 .../ad-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_ad_clust/ad-1-bti-Data.db          |  Bin 0 -> 5235 bytes
 .../legacy_ad_clust/ad-1-bti-Digest.crc32     |    1 +
 .../legacy_ad_clust/ad-1-bti-Filter.db        |  Bin 0 -> 24 bytes
 .../legacy_ad_clust/ad-1-bti-Partitions.db    |  Bin 0 -> 62 bytes
 .../legacy_ad_clust/ad-1-bti-Rows.db          |  Bin 0 -> 563 bytes
 .../legacy_ad_clust/ad-1-bti-Statistics.db    |  Bin 0 -> 7112 bytes
 .../legacy_ad_clust/ad-1-bti-TOC.txt          |    8 +
 .../ad-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../legacy_ad_clust_counter/ad-1-bti-Data.db  |  Bin 0 -> 5845 bytes
 .../ad-1-bti-Digest.crc32                     |    1 +
 .../ad-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ad-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_ad_clust_counter/ad-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../ad-1-bti-Statistics.db                    |  Bin 0 -> 7121 bytes
 .../legacy_ad_clust_counter/ad-1-bti-TOC.txt  |    8 +
 .../ad-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ad_simple/ad-1-bti-Data.db         |  Bin 0 -> 89 bytes
 .../legacy_ad_simple/ad-1-bti-Digest.crc32    |    1 +
 .../legacy_ad_simple/ad-1-bti-Filter.db       |  Bin 0 -> 24 bytes
 .../legacy_ad_simple/ad-1-bti-Partitions.db   |  Bin 0 -> 59 bytes
 .../legacy_ad_simple/ad-1-bti-Rows.db         |    0
 .../legacy_ad_simple/ad-1-bti-Statistics.db   |  Bin 0 -> 4665 bytes
 .../legacy_ad_simple/ad-1-bti-TOC.txt         |    8 +
 .../ad-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ad_simple_counter/ad-1-bti-Data.db |  Bin 0 -> 141 bytes
 .../ad-1-bti-Digest.crc32                     |    1 +
 .../ad-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ad-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../legacy_ad_simple_counter/ad-1-bti-Rows.db |    0
 .../ad-1-bti-Statistics.db                    |  Bin 0 -> 4674 bytes
 .../legacy_ad_simple_counter/ad-1-bti-TOC.txt |    8 +
 .../ba-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_ba_clust/ba-1-bti-Data.db          |  Bin 0 -> 5257 bytes
 .../legacy_ba_clust/ba-1-bti-Digest.crc32     |    1 +
 .../legacy_ba_clust/ba-1-bti-Filter.db        |  Bin 0 -> 24 bytes
 .../legacy_ba_clust/ba-1-bti-Partitions.db    |  Bin 0 -> 62 bytes
 .../legacy_ba_clust/ba-1-bti-Rows.db          |  Bin 0 -> 563 bytes
 .../legacy_ba_clust/ba-1-bti-Statistics.db    |  Bin 0 -> 7159 bytes
 .../legacy_ba_clust/ba-1-bti-TOC.txt          |    8 +
 .../ba-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../legacy_ba_clust_counter/ba-1-bti-Data.db  |  Bin 0 -> 5836 bytes
 .../ba-1-bti-Digest.crc32                     |    1 +
 .../ba-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ba-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_ba_clust_counter/ba-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../ba-1-bti-Statistics.db                    |  Bin 0 -> 7168 bytes
 .../legacy_ba_clust_counter/ba-1-bti-TOC.txt  |    8 +
 .../ba-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ba_simple/ba-1-bti-Data.db         |  Bin 0 -> 89 bytes
 .../legacy_ba_simple/ba-1-bti-Digest.crc32    |    1 +
 .../legacy_ba_simple/ba-1-bti-Filter.db       |  Bin 0 -> 24 bytes
 .../legacy_ba_simple/ba-1-bti-Partitions.db   |  Bin 0 -> 59 bytes
 .../legacy_ba_simple/ba-1-bti-Rows.db         |    0
 .../legacy_ba_simple/ba-1-bti-Statistics.db   |  Bin 0 -> 4669 bytes
 .../legacy_ba_simple/ba-1-bti-TOC.txt         |    8 +
 .../ba-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_ba_simple_counter/ba-1-bti-Data.db |  Bin 0 -> 141 bytes
 .../ba-1-bti-Digest.crc32                     |    1 +
 .../ba-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../ba-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../legacy_ba_simple_counter/ba-1-bti-Rows.db |    0
 .../ba-1-bti-Statistics.db                    |  Bin 0 -> 4678 bytes
 .../legacy_ba_simple_counter/ba-1-bti-TOC.txt |    8 +
 .../bb-1-bti-CompressionInfo.db               |  Bin 0 -> 87 bytes
 .../legacy_bb_clust/bb-1-bti-Data.db          |  Bin 0 -> 5330 bytes
 .../legacy_bb_clust/bb-1-bti-Digest.crc32     |    1 +
 .../legacy_bb_clust/bb-1-bti-Filter.db        |  Bin 0 -> 24 bytes
 .../legacy_bb_clust/bb-1-bti-Partitions.db    |  Bin 0 -> 62 bytes
 .../legacy_bb_clust/bb-1-bti-Rows.db          |  Bin 0 -> 563 bytes
 .../legacy_bb_clust/bb-1-bti-Statistics.db    |  Bin 0 -> 7176 bytes
 .../legacy_bb_clust/bb-1-bti-TOC.txt          |    8 +
 .../bb-1-bti-CompressionInfo.db               |  Bin 0 -> 79 bytes
 .../legacy_bb_clust_counter/bb-1-bti-Data.db  |  Bin 0 -> 5946 bytes
 .../bb-1-bti-Digest.crc32                     |    1 +
 .../bb-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../bb-1-bti-Partitions.db                    |  Bin 0 -> 62 bytes
 .../legacy_bb_clust_counter/bb-1-bti-Rows.db  |  Bin 0 -> 563 bytes
 .../bb-1-bti-Statistics.db                    |  Bin 0 -> 7185 bytes
 .../legacy_bb_clust_counter/bb-1-bti-TOC.txt  |    8 +
 .../bb-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_bb_simple/bb-1-bti-Data.db         |  Bin 0 -> 92 bytes
 .../legacy_bb_simple/bb-1-bti-Digest.crc32    |    1 +
 .../legacy_bb_simple/bb-1-bti-Filter.db       |  Bin 0 -> 24 bytes
 .../legacy_bb_simple/bb-1-bti-Partitions.db   |  Bin 0 -> 59 bytes
 .../legacy_bb_simple/bb-1-bti-Rows.db         |    0
 .../legacy_bb_simple/bb-1-bti-Statistics.db   |  Bin 0 -> 4686 bytes
 .../legacy_bb_simple/bb-1-bti-TOC.txt         |    8 +
 .../bb-1-bti-CompressionInfo.db               |  Bin 0 -> 47 bytes
 .../legacy_bb_simple_counter/bb-1-bti-Data.db |  Bin 0 -> 145 bytes
 .../bb-1-bti-Digest.crc32                     |    1 +
 .../bb-1-bti-Filter.db                        |  Bin 0 -> 24 bytes
 .../bb-1-bti-Partitions.db                    |  Bin 0 -> 60 bytes
 .../legacy_bb_simple_counter/bb-1-bti-Rows.db |    0
 .../bb-1-bti-Statistics.db                    |  Bin 0 -> 4695 bytes
 .../legacy_bb_simple_counter/bb-1-bti-TOC.txt |    8 +
 .../distributed/test/FailingRepairTest.java   |   13 +
 .../format/ForwardingSSTableReader.java       |   36 +-
 test/unit/org/apache/cassandra/Util.java      |    2 +-
 .../cassandra/cache/AutoSavingCacheTest.java  |    9 +-
 .../cassandra/cql3/KeyCacheCqlTest.java       |   46 +-
 .../cql3/QueryWithIndexedSSTableTest.java     |    4 +-
 .../TombstonesWithIndexedSSTableTest.java     |   18 +-
 .../cassandra/db/ColumnFamilyStoreTest.java   |    9 +-
 .../org/apache/cassandra/db/KeyCacheTest.java |    7 +-
 .../org/apache/cassandra/db/KeyspaceTest.java |    4 +-
 .../apache/cassandra/db/ReadCommandTest.java  |    3 +-
 .../db/RepairedDataTombstonesTest.java        |    4 +
 .../org/apache/cassandra/db/ScrubTest.java    |   46 +-
 .../cassandra/db/SerializationHeaderTest.java |    4 +-
 .../org/apache/cassandra/db/VerifyTest.java   |   32 +-
 .../streaming/CassandraStreamManagerTest.java |    2 +-
 .../index/sai/view/IndexViewManagerTest.java  |    2 +-
 .../cassandra/io/DiskSpaceMetricsTest.java    |    6 +
 .../io/sstable/BigTableWriterTest.java        |    4 +
 .../io/sstable/IndexSummaryManagerTest.java   |   42 +-
 .../IndexSummaryRedistributionTest.java       |    6 +-
 .../io/sstable/LegacySSTableTest.java         |   40 +-
 .../io/sstable/SSTableReaderTest.java         |   88 +-
 .../io/sstable/SSTableRewriterTest.java       |   52 +-
 .../io/sstable/SSTableWriterTest.java         |   32 +-
 .../format/SSTableFlushObserverTest.java      |   19 +-
 ...st.java => SSTableZeroCopyWriterTest.java} |   25 +-
 .../SSTableReverseIteratorTest.java           |    6 +-
 .../format/trieindex/PartitionIndexTest.java  |  937 ++++++++++++++
 .../format/trieindex/RowIndexTest.java        |  521 ++++++++
 .../format/trieindex/TrieIndexFormatUtil.java |   79 ++
 .../metadata/MetadataSerializerTest.java      |    2 +-
 .../apache/cassandra/io/tries/WalkerTest.java |    8 +-
 .../io/util/TailOverridingRebuffererTest.java |    3 +
 .../io/util/WrappingRebuffererTest.java       |   41 +-
 .../streaming/StreamTransferTaskTest.java     |    8 +-
 .../cassandra/tools/JMXCompatabilityTest.java |    6 +
 .../StandaloneSplitterWithCQLTesterTest.java  |   10 +-
 .../StandaloneUpgraderOnSStablesTest.java     |    2 +-
 .../cassandra/tools/nodetool/RingTest.java    |    1 +
 .../cassandra/stress/CompactionStress.java    |    2 +-
 ...7: TrieIndex SSTable format implementation |  348 ++++++
 357 files changed, 9045 insertions(+), 1145 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/io/sstable/UnsupportedSSTableException.java
 rename src/java/org/apache/cassandra/io/sstable/format/{big/BigTableZeroCopyWriter.java => SSTableZeroCopyWriter.java} (84%)
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/ScrubPartitionIterator.java
 delete mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/big/ScrubIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndex.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexBuilder.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexEarly.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReader.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReverseIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexWriter.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableReversedIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/ScrubIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexEntry.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexScanner.java
 create mode 100644 src/java/org/apache/cassandra/io/tries/ReverseValueIterator.java
 create mode 100644 src/java/org/apache/cassandra/io/util/EmptyRebufferer.java
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-TOC.txt
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Rows.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Statistics.db
 create mode 100644 test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-TOC.txt
 rename test/unit/org/apache/cassandra/io/sstable/format/{big/BigTableZeroCopyWriterTest.java => SSTableZeroCopyWriterTest.java} (91%)
 create mode 100644 test/unit/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/sstable/format/trieindex/RowIndexTest.java
 create mode 100644 test/unit/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormatUtil.java
 create mode 100644 update-history/STAR-801/50-568d8ed4e8 STAR-247: TrieIndex SSTable format implementation

diff --git a/build.xml b/build.xml
index a1c8e0a5d4e6..6d4b593c424b 100644
--- a/build.xml
+++ b/build.xml
@@ -1482,6 +1482,23 @@
     </sequential>
   </macrodef>
 
+  <macrodef name="testlist-bigtable">
+    <attribute name="test.file.list"/>
+    <attribute name="testlist.offset"/>
+    <sequential>
+      <testmacrohelper inputdir="${test.dir}/${test.classlistprefix}" filelist="@{test.file.list}" poffset="@{testlist.offset}"
+                       exclude="**/*.java" timeout="${test.timeout}" testtag="bigtable">
+        <jvmarg value="-Dlegacy-sstable-root=${test.data}/legacy-sstables"/>
+        <jvmarg value="-Dinvalid-legacy-sstable-root=${test.data}/invalid-legacy-sstables"/>
+        <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
+        <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
+        <jvmarg value="-Dcassandra.config.loader=org.apache.cassandra.OffsetAwareConfigurationLoader"/>
+        <jvmarg value="-Dcassandra.skip_sync=true" />
+        <jvmarg value="-Dcassandra.sstable.format.default=big" />
+      </testmacrohelper>
+    </sequential>
+  </macrodef>
+
   <macrodef name="testlist-system-keyspace-directory">
     <attribute name="test.file.list" />
     <sequential>
@@ -1569,7 +1586,15 @@
     <testhelper testdelegate="testlist-cdc" />
   </target>
 
-  <target name="test-system-keyspace-directory" depends="build-test" description="Execute unit tests with a system keyspaces directory configured">
+  <target name="test-bigtable" depends="build-test" description="Execute unit tests with default sstable format set to Big Table (otherwise the tests are executed with the default format set to Trie Index)">
+      <path id="all-test-classes-path">
+          <fileset dir="${test.unit.src}" includes="**/${test.name}.java" />
+      </path>
+      <property name="all-test-classes" refid="all-test-classes-path"/>
+      <testhelper testdelegate="testlist-bigtable" />
+  </target>
+
+    <target name="test-system-keyspace-directory" depends="build-test" description="Execute unit tests with a system keyspaces directory configured">
     <path id="all-test-classes-path">
       <fileset dir="${test.unit.src}" includes="**/${test.name}.java" />
     </path>
@@ -1818,6 +1843,13 @@
       <property name="all-test-classes" refid="all-test-classes-path"/>
       <testhelper testdelegate="testlist-cdc"/>
   </target>
+  <target name="testclasslist-bigtable" depends="build-test" description="Parallel-run tests given in file -Dtest.classlistfile (one-class-per-line, e.g. org/apache/cassandra/db/SomeTest.java)">
+    <path id="all-test-classes-path">
+      <fileset dir="${test.dir}/${test.classlistprefix}" includesfile="${test.classlistfile}"/>
+    </path>
+    <property name="all-test-classes" refid="all-test-classes-path"/>
+    <testhelper testdelegate="testlist-bigtable"/>
+  </target>
   <target name="testclasslist-system-keyspace-directory" depends="build-test" description="Run tests given in file -Dtest.classlistfile (one-class-per-line, e.g. org/apache/cassandra/db/SomeTest.java)">
       <path id="all-test-classes-path">
           <fileset dir="${test.dir}/${test.classlistprefix}" includesfile="${test.classlistfile}"/>
diff --git a/eclipse_compiler.properties b/eclipse_compiler.properties
index e1f28021c5f2..c0d02fb6397b 100644
--- a/eclipse_compiler.properties
+++ b/eclipse_compiler.properties
@@ -5,7 +5,10 @@
 #
 # Autoclosables not in try-with-references
 org.eclipse.jdt.core.compiler.problem.explicitlyClosedAutoCloseable=error
-org.eclipse.jdt.core.compiler.problem.potentiallyUnclosedCloseable=error
+
+# the analysis is too shallow and forces us to add suppressions everywhere, which in turn may hide the real problems
+# which could be detected by much smarter analysers (like the one in IntelliJ)
+org.eclipse.jdt.core.compiler.problem.potentiallyUnclosedCloseable=ignore
 org.eclipse.jdt.core.compiler.problem.unclosedCloseable=ignore
 #Ignore and disable all other checks too keep the logs clean
 
diff --git a/src/java/org/apache/cassandra/cache/ChunkCache.java b/src/java/org/apache/cassandra/cache/ChunkCache.java
index c53810ac4806..2b53552f3594 100644
--- a/src/java/org/apache/cassandra/cache/ChunkCache.java
+++ b/src/java/org/apache/cassandra/cache/ChunkCache.java
@@ -28,6 +28,9 @@
 import com.google.common.collect.Iterables;
 import com.google.common.util.concurrent.MoreExecutors;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.github.benmanes.caffeine.cache.*;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
@@ -39,6 +42,8 @@
 public class ChunkCache
         implements CacheLoader<ChunkCache.Key, ChunkCache.Buffer>, RemovalListener<ChunkCache.Key, ChunkCache.Buffer>, CacheSize
 {
+    private final static Logger logger = LoggerFactory.getLogger(ChunkCache.class);
+
     public static final int RESERVED_POOL_SPACE_IN_MB = 32;
     public static final long cacheSize = 1024L * 1024L * Math.max(0, DatabaseDescriptor.getFileCacheSizeInMB() - RESERVED_POOL_SPACE_IN_MB);
     public static final boolean roundUp = DatabaseDescriptor.getFileCacheRoundUp();
@@ -226,15 +231,28 @@ public CachingRebufferer(ChunkReader file)
         @Override
         public Buffer rebuffer(long position)
         {
+            int spin = 0;
             try
             {
                 long pageAlignedPos = position & alignmentMask;
                 Buffer buf;
-                do
-                    buf = cache.get(new Key(source, pageAlignedPos)).reference();
-                while (buf == null);
-
-                return buf;
+                Key key = new Key(source, pageAlignedPos);
+                while (true)
+                {
+                    buf = cache.get(key).reference();
+                    if (buf != null)
+                        return buf;
+
+                    if (++spin == 1000)
+                    {
+                        // Note: this might also be caused by reference counting errors, especially double release of
+                        // chunks.
+                        String msg = String.format("Could not acquire a reference for %s after 1000 attempts. " +
+                                                   "This is likely due to the chunk cache being too small for the " +
+                                                   "number of concurrently running requests.", key);
+                        throw new RuntimeException(msg);
+                    }
+                }
             }
             catch (Throwable t)
             {
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 6f24b77c748b..49a05dccacde 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -1946,7 +1946,7 @@ public Refs<SSTableReader> getSnapshotSSTableReaders(String tag) throws IOExcept
                     if (logger.isTraceEnabled())
                         logger.trace("using snapshot sstable {}", entries.getKey());
                     // open offline so we don't modify components or track hotness.
-                    sstable = SSTableReader.open(entries.getKey(), entries.getValue(), metadata, true, true);
+                    sstable = entries.getKey().getFormat().getReaderFactory().open(entries.getKey(), entries.getValue(), metadata, true, true);
                     refs.tryRef(sstable);
                     // release the self ref as we never add the snapshot sstable to DataTracker where it is otherwise released
                     sstable.selfRef().release();
diff --git a/src/java/org/apache/cassandra/db/DeletionTime.java b/src/java/org/apache/cassandra/db/DeletionTime.java
index d8ac91db9834..718fcf7a144b 100644
--- a/src/java/org/apache/cassandra/db/DeletionTime.java
+++ b/src/java/org/apache/cassandra/db/DeletionTime.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.db;
 
 import java.io.IOException;
+import java.nio.ByteBuffer;
 
 import com.google.common.base.Objects;
 
@@ -180,6 +181,15 @@ public DeletionTime deserialize(DataInputPlus in) throws IOException
                  : new DeletionTime(mfda, ldt);
         }
 
+        public DeletionTime deserialize(ByteBuffer buf, int offset)
+        {
+            int ldt = buf.getInt(offset); // absolute read of the 4-byte int at offset; buf's position is not moved
+            long mfda = buf.getLong(offset + 4); // the 8-byte long follows immediately after the int
+            return mfda == Long.MIN_VALUE && ldt == Integer.MAX_VALUE
+                   ? LIVE // this exact sentinel pair maps to the LIVE constant instead of a new object
+                   : new DeletionTime(mfda, ldt);
+        }
+
         public void skip(DataInputPlus in) throws IOException
         {
             in.skipBytesFully(4 + 8);
diff --git a/src/java/org/apache/cassandra/db/SSTableImporter.java b/src/java/org/apache/cassandra/db/SSTableImporter.java
index 5bcbd4c528cd..65255f52c2a3 100644
--- a/src/java/org/apache/cassandra/db/SSTableImporter.java
+++ b/src/java/org/apache/cassandra/db/SSTableImporter.java
@@ -136,7 +136,8 @@ synchronized List<String> importNewSSTables(Options options)
                     Descriptor newDescriptor = cfs.getUniqueDescriptorFor(entry.getKey(), targetDir);
                     maybeMutateMetadata(entry.getKey(), options);
                     movedSSTables.add(new MovedSSTable(newDescriptor, entry.getKey(), entry.getValue()));
-                    SSTableReader sstable = SSTableReader.moveAndOpenSSTable(cfs, entry.getKey(), newDescriptor, entry.getValue(), options.copyData);
+                    SSTableReader sstable = newDescriptor.getFormat().getReaderFactory()
+                                                         .moveAndOpenSSTable(cfs, entry.getKey(), newDescriptor, entry.getValue(), options.copyData);
                     newSSTablesPerDirectory.add(sstable);
                 }
                 catch (Throwable t)
@@ -215,7 +216,7 @@ private File getTargetDirectory(String srcPath, Descriptor descriptor, Set<Compo
         SSTableReader sstable = null;
         try
         {
-            sstable = SSTableReader.open(descriptor, components, cfs.metadata);
+            sstable = descriptor.getFormat().getReaderFactory().open(descriptor, components, cfs.metadata);
             targetDirectory = cfs.getDirectories().getLocationForDisk(cfs.diskBoundaryManager.getDiskBoundaries(cfs).getCorrectDiskForSSTable(sstable));
         }
         finally
@@ -342,7 +343,7 @@ private void verifySSTableForImport(Descriptor descriptor, Set<Component> compon
         SSTableReader reader = null;
         try
         {
-            reader = SSTableReader.open(descriptor, components, cfs.metadata);
+            reader = descriptor.getFormat().getReaderFactory().open(descriptor, components, cfs.metadata);
             Verifier.Options verifierOptions = Verifier.options()
                                                        .extendedVerification(extendedVerify)
                                                        .checkOwnsTokens(verifyTokens)
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 5baa6d7e1ad8..e44c50104d8f 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -46,7 +46,6 @@
 import org.apache.cassandra.metrics.TableMetrics;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.tracing.Tracing;
@@ -1152,7 +1151,7 @@ private static final class SSTableReadMetricsCollector implements SSTableReadsLi
         private int mergedSSTables;
 
         @Override
-        public void onSSTableSelected(SSTableReader sstable, RowIndexEntry<?> indexEntry, SelectionReason reason)
+        public void onSSTableSelected(SSTableReader sstable, RowIndexEntry indexEntry, SelectionReason reason)
         {
             sstable.incrementReadCount();
             mergedSSTables++;
diff --git a/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java
index 843054195c67..c3bdb900a7e9 100644
--- a/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java
+++ b/src/java/org/apache/cassandra/db/UnfilteredDeserializer.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 
 import org.apache.cassandra.db.marshal.ByteArrayAccessor;
+import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -34,7 +35,7 @@
 public class UnfilteredDeserializer
 {
     protected final TableMetadata metadata;
-    protected final DataInputPlus in;
+    protected final FileDataInput in;
     protected final DeserializationHelper helper;
 
     private final ClusteringPrefix.Deserializer clusteringDeserializer;
@@ -44,11 +45,12 @@ public class UnfilteredDeserializer
     private int nextExtendedFlags;
     private boolean isReady;
     private boolean isDone;
+    private long preparePos;
 
     private final Row.Builder builder;
 
     private UnfilteredDeserializer(TableMetadata metadata,
-                                   DataInputPlus in,
+                                   FileDataInput in,
                                    SerializationHeader header,
                                    DeserializationHelper helper)
     {
@@ -58,10 +60,11 @@ private UnfilteredDeserializer(TableMetadata metadata,
         this.header = header;
         this.clusteringDeserializer = new ClusteringPrefix.Deserializer(metadata.comparator, in, header);
         this.builder = BTreeRow.sortedBuilder();
+        this.preparePos = -1;
     }
 
     public static UnfilteredDeserializer create(TableMetadata metadata,
-                                                DataInputPlus in,
+                                                FileDataInput in,
                                                 SerializationHeader header,
                                                 DeserializationHelper helper)
     {
@@ -85,6 +88,7 @@ private void prepareNext() throws IOException
         if (isDone)
             return;
 
+        preparePos = in.getFilePointer();
         nextFlags = in.readUnsignedByte();
         if (UnfilteredSerializer.isEndOfPartition(nextFlags))
         {
@@ -170,4 +174,5 @@ public void skipNext() throws IOException
             UnfilteredSerializer.serializer.skipRowBody(in);
         }
     }
+
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index 6b4df9471328..3a40f0d771ba 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -40,6 +41,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.schema.CompactionParams;
@@ -214,7 +216,7 @@ public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, final
      * @param originalCandidates The collection to check for excluded SSTables
      * @return list of the SSTables with excluded ones filtered out
      */
-    public static List<SSTableReader> filterSuspectSSTables(Iterable<SSTableReader> originalCandidates)
+    public static List<SSTableReader> filterSuspectSSTables(Iterable<? extends SSTableReader> originalCandidates)
     {
         List<SSTableReader> filtered = new ArrayList<>();
         for (SSTableReader sstable : originalCandidates)
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index a0ab3fdd6727..6d98c1b542f6 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
@@ -322,7 +322,7 @@ private UnfilteredRowIterator getShadowIterator(SSTableReader reader, DecoratedK
             reader.getMaxTimestamp() <= minTimestamp ||
             tombstoneOnly && !reader.mayHaveTombstones())
             return null;
-        return reader.simpleIterator(() -> openDataFiles.computeIfAbsent(reader, this::openDataFile), key, tombstoneOnly);
+        return reader.simpleIterator(openDataFiles.computeIfAbsent(reader, this::openDataFile), key, tombstoneOnly);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
index 148690b88e43..703687f5c4fc 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
@@ -51,12 +51,12 @@ public final class CompactionInfo
     private final UUID compactionId;
     private final ImmutableSet<SSTableReader> sstables;
 
-    public CompactionInfo(TableMetadata metadata, OperationType tasktype, long bytesComplete, long totalBytes, UUID compactionId, Collection<SSTableReader> sstables)
+    public CompactionInfo(TableMetadata metadata, OperationType tasktype, long bytesComplete, long totalBytes, UUID compactionId, Collection<? extends SSTableReader> sstables)
     {
         this(metadata, tasktype, bytesComplete, totalBytes, Unit.BYTES, compactionId, sstables);
     }
 
-    private CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, UUID compactionId, Collection<SSTableReader> sstables)
+    private CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, UUID compactionId, Collection<? extends SSTableReader> sstables)
     {
         this.tasktype = tasktype;
         this.completed = completed;
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index 7c865c7be1e7..e3cbdab57219 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -310,7 +311,7 @@ private CompactionCandidate getSTCSInL0CompactionCandidate()
 
     private List<SSTableReader> getSSTablesForSTCS(Collection<SSTableReader> sstables)
     {
-        Iterable<SSTableReader> candidates = cfs.getTracker().getUncompacting(sstables);
+        Iterable<? extends SSTableReader> candidates = cfs.getTracker().getUncompacting(sstables);
         List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(AbstractCompactionStrategy.filterSuspectSSTables(candidates));
         List<List<SSTableReader>> buckets = SizeTieredCompactionStrategy.getBuckets(pairs,
                                                                                     options.bucketHigh,
diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index ce93389d1917..664522067464 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -27,7 +27,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
 
-import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -60,7 +60,7 @@ public class Scrubber implements Closeable
 
     private final ReadWriteLock fileAccessLock;
     private final RandomAccessReader dataFile;
-    private final PartitionIndexIterator indexIterator;
+    private ScrubPartitionIterator indexIterator;
     private final ScrubInfo scrubInfo;
 
     private int goodRows;
@@ -111,12 +111,12 @@ public Scrubber(ColumnFamilyStore cfs,
         this.destination = cfs.getDirectories().getLocationForDisk(cfs.getDiskBoundaries().getCorrectDiskForSSTable(sstable));
         this.isCommutative = cfs.metadata().isCounter();
 
-        boolean hasIndexFile = (new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX))).exists();
+        boolean hasIndexFile = sstable.hasIndex();
         this.isIndex = cfs.isIndex();
         if (!hasIndexFile)
         {
             // if there's any corruption in the -Data.db then rows can't be skipped over. but it's worth a shot.
-            outputHandler.warn("Missing component: " + sstable.descriptor.filenameFor(Component.PRIMARY_INDEX));
+            outputHandler.warn("Missing index component");
         }
         this.checkData = checkData && !this.isIndex; //LocalByPartitionerType does not support validation
         this.expectedBloomFilterSize = Math.max(
@@ -142,11 +142,11 @@ public Scrubber(ColumnFamilyStore cfs,
             outputHandler.output("Starting scrub with reinsert overflowed TTL option");
     }
 
-    private PartitionIndexIterator openIndexIterator()
+    private ScrubPartitionIterator openIndexIterator()
     {
         try
         {
-            return sstable.allKeysIterator();
+            return sstable.scrubPartitionsIterator();
         }
         catch (IOException e)
         {
@@ -180,8 +180,9 @@ public void scrub()
                 if (scrubInfo.isStopRequested())
                     throw new CompactionInterruptedException(scrubInfo.getCompactionInfo());
 
-                long rowStart = dataFile.getFilePointer();
-                outputHandler.debug("Reading row at " + rowStart);
+                // position in a data file where the partition starts
+                long dataStart = dataFile.getFilePointer();
+                outputHandler.debug("Reading row at " + dataStart);
 
                 DecoratedKey key = null;
                 try
@@ -194,24 +195,43 @@ public void scrub()
                     // check for null key below
                 }
 
-                long dataStart = dataFile.getFilePointer();
-
+                // position of the partition in a data file, it points to the beginning of the partition key
                 long dataStartFromIndex = -1;
+                // size of the partition (including partition key)
                 long dataSizeFromIndex = -1;
-                ByteBuffer currentIndexKey = indexIterator != null ? indexIterator.key() : null;
-                if (currentIndexKey != null)
+                ByteBuffer currentIndexKey = null;
+                if (indexAvailable())
                 {
-                    dataStartFromIndex = indexIterator.dataPosition() + TypeSizes.SHORT_SIZE + currentIndexKey.remaining();
-                    if (advanceIndexNoThrow())
-                        dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex;
+                    currentIndexKey = indexIterator.key();
+                    dataStartFromIndex = indexIterator.dataPosition();
+                    if (!indexIterator.isExhausted())
+                    {
+                        try
+                        {
+                            indexIterator.advance();
+                            if (!indexIterator.isExhausted())
+                                dataSizeFromIndex = indexIterator.dataPosition() - dataStartFromIndex;
+                        }
+                        catch (Throwable th)
+                        {
+                            throwIfFatal(th);
+                            outputHandler.warn(String.format(
+                                "Failed to advance to the next index position. Index is corrupted. " +
+                                "Continuing without the index. " +
+                                "Last position read is %d.", indexIterator.dataPosition()), th);
+                            indexIterator.close();
+                            indexIterator = null;
+                            currentIndexKey = null;
+                            dataStartFromIndex = -1;
+                            dataSizeFromIndex = -1;
+                        }
+                    }
                 }
 
                 // avoid an NPE if key is null
                 String keyName = key == null ? "(unreadable key)" : ByteBufferUtil.bytesToHex(key.getKey());
                 outputHandler.debug(String.format("row %s is %s", keyName, FBUtilities.prettyPrintMemory(dataSizeFromIndex)));
 
-                assert currentIndexKey != null || !indexAvailable();
-
                 try
                 {
                     if (key == null)
@@ -241,12 +261,14 @@ public void scrub()
                     if (currentIndexKey != null
                         && (key == null || !key.getKey().equals(currentIndexKey) || dataStart != dataStartFromIndex))
                     {
+                        // position where the row should start in a data file (right after the partition key)
+                        long rowStartFromIndex = dataStartFromIndex + TypeSizes.SHORT_SIZE + currentIndexKey.remaining();
                         outputHandler.output(String.format("Retrying from row index; data is %s bytes starting at %s",
-                                                  dataSizeFromIndex, dataStartFromIndex));
+                                                  dataSizeFromIndex, rowStartFromIndex));
                         key = sstable.decorateKey(currentIndexKey);
                         try
                         {
-                            dataFile.seek(dataStartFromIndex);
+                            dataFile.seek(rowStartFromIndex);
 
                             if (tryAppend(prevKey, key, writer))
                                 prevKey = key;
@@ -265,10 +287,23 @@ public void scrub()
                     {
                         throwIfCannotContinue(key, th);
 
-                        outputHandler.warn("Row starting at position " + dataStart + " is unreadable; skipping to next");
                         badRows++;
-                        if (currentIndexKey != null)
+                        if (indexIterator != null)
+                        {
+                            outputHandler.warn("Row starting at position " + dataStart + " is unreadable; skipping to next");
                             seekToNextRow();
+                        }
+                        else
+                        {
+                            outputHandler.warn(String.format(
+                                "Unrecoverable error while scrubbing %s. " +
+                                "Scrubbing cannot continue. The sstable will be marked for deletion. " +
+                                "You can attempt manual recovery from the pre-scrub snapshot. " +
+                                "You can also run nodetool repair to transfer the data from a healthy replica, if any.",
+                            sstable));
+                            // There's no way to resync and continue. Give up.
+                            break;
+                        }
                     }
                 }
             }
@@ -362,21 +397,6 @@ private UnfilteredRowIterator getIterator(DecoratedKey key)
                                                                                     negativeLocalDeletionInfoMetrics) : rowMergingIterator;
     }
 
-    private boolean advanceIndexNoThrow()
-    {
-        try
-        {
-            return indexAvailable() && indexIterator.advance();
-        }
-        catch (Throwable th)
-        {
-            JVMStabilityInspector.inspectThrowable(th);
-            outputHandler.warn("Error reading index file", th);
-            indexIterator.close();
-            return false;
-        }
-    }
-
     private boolean indexAvailable()
     {
         return indexIterator != null && !indexIterator.isExhausted();
@@ -397,7 +417,6 @@ private void seekToNextRow()
             {
                 throwIfFatal(th);
                 outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
-                badRows++;
             }
 
             try
@@ -406,7 +425,7 @@ private void seekToNextRow()
             }
             catch (Throwable th)
             {
-                outputHandler.warn(String.format("Failed to go to the next entry in index, index position: %d", indexIterator.indexPosition()), th);
+                outputHandler.warn("Failed to go to the next entry in index", th);
                 throw Throwables.cleaned(th);
             }
         }
diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java
index fb4a17f810f3..a811a3d3cf9d 100644
--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
+++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java
@@ -146,7 +146,7 @@ public void verify()
 
         try
         {
-            outputHandler.debug("Deserializing index for "+sstable);
+            outputHandler.debug("Deserializing index for " + sstable);
             deserializeIndex(sstable);
         }
         catch (Throwable t)
@@ -155,16 +155,19 @@ public void verify()
             markAndThrow(t);
         }
 
-        try
-        {
-            outputHandler.debug("Deserializing index summary for "+sstable);
-            deserializeIndexSummary(sstable);
-        }
-        catch (Throwable t)
+        if (sstable.descriptor.getFormat().supportedComponents().contains(Component.SUMMARY))
         {
-            outputHandler.output("Index summary is corrupt - if it is removed it will get rebuilt on startup "+sstable.descriptor.filenameFor(Component.SUMMARY));
-            outputHandler.warn(t);
+            try
+            {
+                outputHandler.debug("Deserializing index summary for " + sstable);
+                deserializeIndexSummary(sstable);
+            }
+            catch (Throwable t)
+            {
+                outputHandler.output("Index summary is corrupt - if it is removed it will get rebuilt on startup " + sstable.descriptor.filenameFor(Component.SUMMARY));
+                outputHandler.warn(t);
             markAndThrow(t, false);
+            }
         }
 
         try
@@ -410,7 +413,10 @@ private void deserializeIndex(SSTableReader sstable) throws IOException
     {
         try (PartitionIndexIterator it = sstable.allKeysIterator()) {
             //noinspection StatementWithEmptyBody
-            while (it.advance()); // no-op, just check if index is readable
+            ByteBuffer last = it.key();
+            while (it.advance()) last = it.key(); // walk the whole index to check it is readable, remembering the last key seen
+            if (!Objects.equals(last, sstable.last.getKey())) // the last key read must match the sstable's recorded last key
+                throw new CorruptSSTableException(new IOException("Failed to read partition index"), it.toString());
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
index c5ec0eef7be6..ac4dd64f1f04 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
@@ -21,12 +21,12 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.LeveledManifest;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 
@@ -70,7 +70,7 @@ public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
     @SuppressWarnings("resource")
     public boolean realAppend(UnfilteredRowIterator partition)
     {
-        BigTableRowIndexEntry rie = sstableWriter.append(partition);
+        RowIndexEntry rie = sstableWriter.append(partition);
         partitionsWritten++;
         long totalWrittenInCurrentWriter = sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten();
         if (totalWrittenInCurrentWriter > maxSSTableSize)
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
index 8eaa8c18612c..9fd7531127db 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
@@ -21,7 +21,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -81,7 +81,7 @@ private static long getTotalWriteSize(Iterable<SSTableReader> nonExpiredSSTables
 
     protected boolean realAppend(UnfilteredRowIterator partition)
     {
-        BigTableRowIndexEntry rie = sstableWriter.append(partition);
+        RowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > maxSSTableSize)
         {
             switchCompactionLocation(sstableDirectory);
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
index c43d224fd92e..0199bc03b7f7 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
@@ -25,7 +25,7 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -86,7 +86,7 @@ public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories di
     @Override
     public boolean realAppend(UnfilteredRowIterator partition)
     {
-        BigTableRowIndexEntry rie = sstableWriter.append(partition);
+        RowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect
         {
             currentRatioIndex++;
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
index 8e0d5144b8af..70b8e1768f0d 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
@@ -40,7 +40,7 @@ class Helpers
      * really present, and that the items to add are not (unless we're also removing them)
      * @return a new set with the contents of the provided one modified
      */
-    static <T> Set<T> replace(Set<T> original, Set<T> remove, Iterable<T> add)
+    static <T> Set<T> replace(Set<T> original, Set<? extends T> remove, Iterable<? extends T> add)
     {
         return ImmutableSet.copyOf(replace(identityMap(original), remove, add).keySet());
     }
@@ -50,7 +50,7 @@ static <T> Set<T> replace(Set<T> original, Set<T> remove, Iterable<T> add)
      * really present, and that the items to add are not (unless we're also removing them)
      * @return a new identity map with the contents of the provided one modified
      */
-    static <T> Map<T, T> replace(Map<T, T> original, Set<T> remove, Iterable<T> add)
+    static <T> Map<T, T> replace(Map<T, T> original, Set<? extends T> remove, Iterable<? extends T> add)
     {
         // ensure the ones being removed are the exact same ones present
         for (T reader : remove)
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
index 8b7550d804df..e8f8000cddf2 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
@@ -28,13 +28,13 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import org.apache.cassandra.io.sstable.format.SSTableReader.UniqueIdentifier;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableReader.UniqueIdentifier;
 import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
@@ -159,12 +159,12 @@ public static LifecycleTransaction offline(OperationType operationType)
     }
 
     @SuppressWarnings("resource") // log closed during postCleanup
-    LifecycleTransaction(Tracker tracker, OperationType operationType, Iterable<SSTableReader> readers)
+    LifecycleTransaction(Tracker tracker, OperationType operationType, Iterable<? extends SSTableReader> readers)
     {
         this(tracker, new LogTransaction(operationType, tracker), readers);
     }
 
-    LifecycleTransaction(Tracker tracker, LogTransaction log, Iterable<SSTableReader> readers)
+    LifecycleTransaction(Tracker tracker, LogTransaction log, Iterable<? extends SSTableReader> readers)
     {
         this.tracker = tracker;
         this.log = log;
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index 949ca79db17d..d5b68b58d4d7 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
@@ -99,7 +99,7 @@ public LifecycleTransaction tryModify(SSTableReader sstable, OperationType opera
     /**
      * @return a Transaction over the provided sstables if we are able to mark the given @param sstables as compacted, before anyone else
      */
-    public LifecycleTransaction tryModify(Iterable<SSTableReader> sstables, OperationType operationType)
+    public LifecycleTransaction tryModify(Iterable<? extends SSTableReader> sstables, OperationType operationType)
     {
         if (Iterables.isEmpty(sstables))
             return new LifecycleTransaction(this, operationType, sstables);
@@ -407,7 +407,7 @@ public Iterable<SSTableReader> getUncompacting()
         return view.get().select(SSTableSet.NONCOMPACTING);
     }
 
-    public Iterable<SSTableReader> getUncompacting(Iterable<SSTableReader> candidates)
+    public Iterable<? extends SSTableReader> getUncompacting(Iterable<? extends SSTableReader> candidates)
     {
         return view.get().getUncompacting(candidates);
     }
diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java
index 4aad49ed832f..5fe39364c70e 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/View.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/View.java
@@ -170,15 +170,9 @@ public Iterable<SSTableReader> select(SSTableSet sstableSet)
         }
     }
 
-    public Iterable<SSTableReader> getUncompacting(Iterable<SSTableReader> candidates)
+    public Iterable<? extends SSTableReader> getUncompacting(Iterable<? extends SSTableReader> candidates)
     {
-        return filter(candidates, new Predicate<SSTableReader>()
-        {
-            public boolean apply(SSTableReader sstable)
-            {
-                return !compacting.contains(sstable);
-            }
-        });
+        return filter(candidates, (Predicate<SSTableReader>) sstable -> !compacting.contains(sstable));
     }
 
     public boolean isEmpty()
@@ -258,7 +252,7 @@ public static Function<View, Iterable<SSTableReader>> selectLive(AbstractBounds<
     // METHODS TO CONSTRUCT FUNCTIONS FOR MODIFYING A VIEW:
 
     // return a function to un/mark the provided readers compacting in a view
-    static Function<View, View> updateCompacting(final Set<SSTableReader> unmark, final Iterable<SSTableReader> mark)
+    static Function<View, View> updateCompacting(final Set<? extends SSTableReader> unmark, final Iterable<? extends SSTableReader> mark)
     {
         if (unmark.isEmpty() && Iterables.isEmpty(mark))
             return Functions.identity();
@@ -276,7 +270,7 @@ public View apply(View view)
 
     // construct a predicate to reject views that do not permit us to mark these readers compacting;
     // i.e. one of them is either already compacting, has been compacted, or has been replaced
-    static Predicate<View> permitCompacting(final Iterable<SSTableReader> readers)
+    static Predicate<View> permitCompacting(final Iterable<? extends SSTableReader> readers)
     {
         return new Predicate<View>()
         {
diff --git a/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java b/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java
index d8bd36f7e16c..8851b404faad 100644
--- a/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java
+++ b/src/java/org/apache/cassandra/db/rows/LazilyInitializedUnfilteredRowIterator.java
@@ -35,6 +35,8 @@ public abstract class LazilyInitializedUnfilteredRowIterator extends AbstractIte
 
     private UnfilteredRowIterator iterator;
 
+    private boolean closed = false;
+
     public LazilyInitializedUnfilteredRowIterator(DecoratedKey partitionKey)
     {
         this.partitionKey = partitionKey;
@@ -104,5 +106,11 @@ public void close()
     {
         if (iterator != null)
             iterator.close();
+        closed = true;
+    }
+
+    public boolean isClosed()
+    {
+        return closed;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/rows/Rows.java b/src/java/org/apache/cassandra/db/rows/Rows.java
index 873f4760416f..ce4d6dd48450 100644
--- a/src/java/org/apache/cassandra/db/rows/Rows.java
+++ b/src/java/org/apache/cassandra/db/rows/Rows.java
@@ -119,9 +119,8 @@ private static int unpackColumnCount(long v)
      *
      * @param row the row for which to collect stats.
      * @param collector the stats collector.
-     * @return the total number of cells in {@code row}.
      */
-    public static int collectStats(Row row, PartitionStatisticsCollector collector)
+    public static void collectStats(Row row, PartitionStatisticsCollector collector)
     {
         assert !row.isEmpty();
 
@@ -131,7 +130,6 @@ public static int collectStats(Row row, PartitionStatisticsCollector collector)
         long result = row.accumulate(StatsAccumulation::accumulateOnColumnData, collector, 0);
 
         collector.updateColumnSetPerRow(StatsAccumulation.unpackColumnCount(result));
-        return StatsAccumulation.unpackCellCount(result);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java b/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java
index 0bfe99311246..18e408ad48f9 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraEntireSSTableStreamReader.java
@@ -34,7 +34,7 @@
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableMultiWriter;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
-import org.apache.cassandra.io.sstable.format.big.BigTableZeroCopyWriter;
+import org.apache.cassandra.io.sstable.format.SSTableZeroCopyWriter;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.schema.TableId;
@@ -61,7 +61,7 @@ public class CassandraEntireSSTableStreamReader implements IStreamReader
 
     public CassandraEntireSSTableStreamReader(StreamMessageHeader messageHeader, CassandraStreamHeader streamHeader, StreamSession session)
     {
-        if (streamHeader.format != SSTableFormat.Type.BIG)
+        if (streamHeader.format != SSTableFormat.Type.BIG && streamHeader.format != SSTableFormat.Type.BTI)
             throw new AssertionError("Unsupported SSTable format " + streamHeader.format);
 
         if (session.getPendingRepair() != null)
@@ -104,7 +104,7 @@ public SSTableMultiWriter read(DataInputPlus in) throws IOException
                      prettyPrintMemory(totalSize),
                      cfs.metadata());
 
-        BigTableZeroCopyWriter writer = null;
+        SSTableZeroCopyWriter writer = null;
 
         try
         {
@@ -167,7 +167,7 @@ private File getDataDir(ColumnFamilyStore cfs, long totalSize) throws IOExceptio
     }
 
     @SuppressWarnings("resource")
-    protected BigTableZeroCopyWriter createWriter(ColumnFamilyStore cfs, long totalSize, Collection<Component> components) throws IOException
+    protected SSTableZeroCopyWriter createWriter(ColumnFamilyStore cfs, long totalSize, Collection<Component> components) throws IOException
     {
         File dataDir = getDataDir(cfs, totalSize);
 
@@ -180,6 +180,6 @@ protected BigTableZeroCopyWriter createWriter(ColumnFamilyStore cfs, long totalS
 
         logger.debug("[Table #{}] {} Components to write: {}", cfs.metadata(), desc.filenameFor(Component.DATA), components);
 
-        return new BigTableZeroCopyWriter(desc, cfs.metadata, lifecycleNewTracker, components);
+        return new SSTableZeroCopyWriter(desc, cfs.metadata, lifecycleNewTracker, components);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java b/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java
index 0904720a627b..0e089c9d0096 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraOutgoingFile.java
@@ -43,7 +43,7 @@
  */
 public class CassandraOutgoingFile implements OutgoingStream
 {
-    private final Ref<SSTableReader> ref;
+    private final Ref<? extends SSTableReader> ref;
     private final long estimatedKeys;
     private final List<SSTableReader.PartitionPositionBounds> sections;
     private final String filename;
@@ -51,7 +51,7 @@ public class CassandraOutgoingFile implements OutgoingStream
     private final StreamOperation operation;
     private final CassandraStreamHeader header;
 
-    public CassandraOutgoingFile(StreamOperation operation, Ref<SSTableReader> ref,
+    public CassandraOutgoingFile(StreamOperation operation, Ref<? extends SSTableReader> ref,
                                  List<SSTableReader.PartitionPositionBounds> sections, List<Range<Token>> normalizedRanges,
                                  long estimatedKeys)
     {
@@ -106,7 +106,7 @@ public static CassandraOutgoingFile fromStream(OutgoingStream stream)
     }
 
     @VisibleForTesting
-    public Ref<SSTableReader> getRef()
+    public Ref<? extends SSTableReader> getRef()
     {
         return ref;
     }
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamHeader.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamHeader.java
index c9e10cf6a4cb..252b6c75c997 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamHeader.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamHeader.java
@@ -201,8 +201,9 @@ public CassandraStreamHeader deserialize(DataInputPlus in, int version) throws I
         @VisibleForTesting
         public CassandraStreamHeader deserialize(DataInputPlus in, int version, Function<TableId, IPartitioner> partitionerMapper) throws IOException
         {
-            Version sstableVersion = SSTableFormat.Type.current().info.getVersion(in.readUTF());
+            String sstableVersionString = in.readUTF();
             SSTableFormat.Type format = SSTableFormat.Type.validate(in.readUTF());
+            Version sstableVersion = format.info.getVersion(sstableVersionString);
 
             long estimatedKeys = in.readLong();
             int count = in.readInt();
diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
index 2cdcd49bcde9..b6c71fa7f08d 100644
--- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
+++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.big.BigTableReader;
 import org.apache.cassandra.locator.RangesAtEndpoint;
 import org.apache.cassandra.locator.Replica;
 import org.apache.cassandra.service.ActiveRepairService;
@@ -137,9 +138,9 @@ else if (pendingRepair == ActiveRepairService.NO_PENDING_REPAIR)
             for (SSTableReader sstable : refs)
             {
                 List<Range<Token>> ranges = sstable.isRepaired() ? normalizedFullRanges : normalizedAllRanges;
-                List<SSTableReader.PartitionPositionBounds> sections = sstable.getPositionsForRanges(ranges);
+                List<BigTableReader.PartitionPositionBounds> sections = sstable.getPositionsForRanges(ranges);
 
-                Ref<SSTableReader> ref = refs.get(sstable);
+                Ref<? extends SSTableReader> ref = refs.get(sstable);
                 if (sections.isEmpty())
                 {
                     ref.release();
diff --git a/src/java/org/apache/cassandra/db/streaming/ComponentContext.java b/src/java/org/apache/cassandra/db/streaming/ComponentContext.java
index b9c60b9f795e..49a9cf304307 100644
--- a/src/java/org/apache/cassandra/db/streaming/ComponentContext.java
+++ b/src/java/org/apache/cassandra/db/streaming/ComponentContext.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.db.streaming;
 
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -57,7 +58,7 @@ public static ComponentContext create(Descriptor descriptor)
     {
         Map<Component, File> hardLinks = new HashMap<>(1);
 
-        for (Component component : MUTABLE_COMPONENTS)
+        for (Component component : Sets.intersection(MUTABLE_COMPONENTS, descriptor.getFormat().supportedComponents()))
         {
             File file = new File(descriptor.filenameFor(component));
             if (!file.exists())
diff --git a/src/java/org/apache/cassandra/db/streaming/ComponentManifest.java b/src/java/org/apache/cassandra/db/streaming/ComponentManifest.java
index bb896caffa06..f823fb93ace1 100644
--- a/src/java/org/apache/cassandra/db/streaming/ComponentManifest.java
+++ b/src/java/org/apache/cassandra/db/streaming/ComponentManifest.java
@@ -37,9 +37,9 @@
  */
 public final class ComponentManifest implements Iterable<Component>
 {
-    private static final List<Component> STREAM_COMPONENTS = ImmutableList.of(Component.DATA, Component.PRIMARY_INDEX, Component.STATS,
-                                                                             Component.COMPRESSION_INFO, Component.FILTER, Component.SUMMARY,
-                                                                             Component.DIGEST, Component.CRC);
+    private static final List<Component> STREAM_COMPONENTS = ImmutableList.of(Component.DATA, Component.PRIMARY_INDEX, Component.PARTITION_INDEX, Component.ROW_INDEX,
+                                                                              Component.STATS, Component.COMPRESSION_INFO, Component.FILTER, Component.SUMMARY,
+                                                                              Component.DIGEST, Component.CRC);
 
     private final LinkedHashMap<Component, Long> components;
 
@@ -51,15 +51,18 @@ public ComponentManifest(Map<Component, Long> components)
     @VisibleForTesting
     public static ComponentManifest create(Descriptor descriptor)
     {
-        LinkedHashMap<Component, Long> components = new LinkedHashMap<>(STREAM_COMPONENTS.size());
+        LinkedHashMap<Component, Long> components = new LinkedHashMap<>(descriptor.getFormat().supportedComponents().size());
 
         for (Component component : STREAM_COMPONENTS)
         {
-            File file = new File(descriptor.filenameFor(component));
-            if (!file.exists())
-                continue;
+            if (descriptor.getFormat().supportedComponents().contains(component))
+            {
+                File file = new File(descriptor.filenameFor(component));
+                if (!file.exists())
+                    continue;
 
-            components.put(component, file.length());
+                components.put(component, file.length());
+            }
         }
 
         return new ComponentManifest(components);
diff --git a/src/java/org/apache/cassandra/index/sai/SSTableContext.java b/src/java/org/apache/cassandra/index/sai/SSTableContext.java
index d43793f93918..5543b3edd735 100644
--- a/src/java/org/apache/cassandra/index/sai/SSTableContext.java
+++ b/src/java/org/apache/cassandra/index/sai/SSTableContext.java
@@ -85,7 +85,7 @@ public static SSTableContext create(SSTableReader sstable)
     {
         IndexComponents groupComponents = IndexComponents.perSSTable(sstable);
 
-        Ref<SSTableReader> sstableRef = null;
+        Ref<? extends SSTableReader> sstableRef = null;
         FileHandle token = null, offset = null;
         LongArray.Factory tokenReaderFactory, offsetReaderFactory;
         KeyFetcher keyFetcher;
@@ -140,9 +140,9 @@ public SSTableContext sharedCopy()
     private static class Cleanup implements RefCounted.Tidy
     {
         private final FileHandle token, offset;
-        private final Ref<SSTableReader> sstableRef;
+        private final Ref<? extends SSTableReader> sstableRef;
 
-        private Cleanup(FileHandle token, FileHandle offset, Ref<SSTableReader> sstableRef)
+        private Cleanup(FileHandle token, FileHandle offset, Ref<? extends SSTableReader> sstableRef)
         {
             this.token = token;
             this.offset = offset;
@@ -232,7 +232,7 @@ public static class DecoratedKeyFetcher implements KeyFetcher
         @Override
         public RandomAccessReader createReader()
         {
-            return sstable.openIndexReader();
+            return sstable.openKeyComponentReader();
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
index 87978c27b2bf..729afd3955f8 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
@@ -53,9 +53,9 @@
 import org.apache.cassandra.index.sai.disk.io.CryptoUtils;
 import org.apache.cassandra.index.sai.disk.io.IndexComponents;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.KeyIterator;
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.io.sstable.SSTableSimpleIterator;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -140,7 +140,7 @@ private boolean indexSSTable(SSTableReader sstable, Set<StorageAttachedIndex> in
         CountDownLatch perSSTableFileLock = null;
         StorageAttachedIndexWriter indexWriter = null;
 
-        Ref<SSTableReader> ref = sstable.tryRef();
+        Ref<? extends SSTableReader> ref = sstable.tryRef();
         if (ref == null)
         {
             logger.warn(logMessage("Couldn't acquire reference to the SSTable {}. It may have been removed."), sstable.descriptor);
@@ -162,17 +162,17 @@ private boolean indexSSTable(SSTableReader sstable, Set<StorageAttachedIndex> in
             long previousKeyPosition = 0;
             indexWriter.begin();
 
-            try (KeyIterator keys = KeyIterator.forSSTable(sstable))
+            try (PartitionIndexIterator keys = sstable.allKeysIterator())
             {
-                while (keys.hasNext())
+                while (!keys.isExhausted())
                 {
                     if (isStopRequested())
                     {
                         throw new CompactionInterruptedException(getCompactionInfo());
                     }
 
-                    final DecoratedKey key = keys.next();
-                    final long keyPosition = keys.getKeyPosition();
+                    final DecoratedKey key = sstable.decorateKey(keys.key());
+                    final long keyPosition = keys.keyPosition();
 
                     indexWriter.startPartition(key, keyPosition);
 
@@ -206,8 +206,10 @@ private boolean indexSSTable(SSTableReader sstable, Set<StorageAttachedIndex> in
                         }
                     }
 
-                    bytesProcessed += keyPosition - previousKeyPosition;
-                    previousKeyPosition = keyPosition;
+                    keys.advance();
+                    long dataPosition = keys.isExhausted() ? sstable.uncompressedLength() : keys.dataPosition();
+                    bytesProcessed += dataPosition - previousKeyPosition;
+                    previousKeyPosition = dataPosition;
                 }
 
                 completeSSTable(indexWriter, sstable, indexes, perSSTableFileLock);
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
index c2b0aa19c9d3..6152deb5e27b 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
@@ -22,10 +22,12 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.*;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.UUID;
 
-import org.apache.cassandra.io.sstable.format.RowIndexEntry;
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.compaction.CompactionInfo;
@@ -36,11 +38,12 @@
 import org.apache.cassandra.index.sasi.conf.ColumnIndex;
 import org.apache.cassandra.index.sasi.disk.PerSSTableIndexWriter;
 import org.apache.cassandra.io.FSReadError;
-import org.apache.cassandra.io.sstable.KeyIterator;
-import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDGen;
 
@@ -58,7 +61,7 @@ public SASIIndexBuilder(ColumnFamilyStore cfs, SortedMap<SSTableReader, Map<Colu
     {
         long totalIndexBytes = 0;
         for (SSTableReader sstable : sstables.keySet())
-            totalIndexBytes += getPrimaryIndexLength(sstable);
+            totalIndexBytes += sstable.uncompressedLength();
 
         this.cfs = cfs;
         this.sstables = sstables;
@@ -78,19 +81,19 @@ public void build()
                 PerSSTableIndexWriter indexWriter = SASIIndex.newWriter(keyValidator, sstable.descriptor, indexes, OperationType.COMPACTION);
 
                 long previousKeyPosition = 0;
-                try (KeyIterator keys = KeyIterator.forSSTable(sstable))
+                try (PartitionIndexIterator keys = sstable.allKeysIterator())
                 {
-                    while (keys.hasNext())
+                    while (!keys.isExhausted())
                     {
                         if (isStopRequested())
                             throw new CompactionInterruptedException(getCompactionInfo());
 
-                        final DecoratedKey key = keys.next();
-                        final long keyPosition = keys.getKeyPosition();
+                        final DecoratedKey key = sstable.decorateKey(keys.key());
+                        final long keyPosition = keys.keyPosition();
 
                         indexWriter.startPartition(key, keyPosition);
 
-                        RowIndexEntry<?> indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
+                        RowIndexEntry indexEntry = sstable.getPosition(key, SSTableReader.Operator.EQ);
                         dataFile.seek(indexEntry.position);
                         ByteBufferUtil.readWithShortLength(dataFile); // key
 
@@ -104,8 +107,10 @@ public void build()
                                 indexWriter.nextUnfilteredCluster(partition.next());
                         }
 
-                        bytesProcessed += keyPosition - previousKeyPosition;
-                        previousKeyPosition = keyPosition;
+                        keys.advance();
+                        long dataPosition = keys.isExhausted() ? sstable.uncompressedLength() : keys.dataPosition();
+                        bytesProcessed += dataPosition - previousKeyPosition;
+                        previousKeyPosition = dataPosition;
                     }
 
                     completeSSTable(indexWriter, sstable, indexes.values());
@@ -128,12 +133,6 @@ public CompactionInfo getCompactionInfo()
                                   sstables.keySet());
     }
 
-    private long getPrimaryIndexLength(SSTable sstable)
-    {
-        File primaryIndex = new File(sstable.getIndexFilename());
-        return primaryIndex.exists() ? primaryIndex.length() : 0;
-    }
-
     private void completeSSTable(PerSSTableIndexWriter indexWriter, SSTableReader sstable, Collection<ColumnIndex> indexes)
     {
         indexWriter.complete();
diff --git a/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java b/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java
index c67c39c645bc..f4bc0884a8ad 100644
--- a/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java
+++ b/src/java/org/apache/cassandra/index/sasi/SSTableIndex.java
@@ -33,7 +33,9 @@
 import org.apache.cassandra.index.sasi.utils.RangeIterator;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.utils.concurrent.Ref;
 
 import org.apache.commons.lang3.builder.HashCodeBuilder;
@@ -43,7 +45,7 @@
 public class SSTableIndex
 {
     private final ColumnIndex columnIndex;
-    private final Ref<SSTableReader> sstableRef;
+    private final Ref<? extends SSTableReader> sstableRef;
     private final SSTableReader sstable;
     private final OnDiskIndex index;
     private final AtomicInteger references = new AtomicInteger(1);
@@ -174,9 +176,9 @@ private static class DecoratedKeyFetcher implements Function<Long, DecoratedKey>
 
         public DecoratedKey apply(Long offset)
         {
-            try
+            try (RandomAccessReader in = sstable.openKeyComponentReader())
             {
-                return sstable.keyAt(offset);
+                return sstable.keyAt(in, offset);
             }
             catch (IOException e)
             {
diff --git a/src/java/org/apache/cassandra/io/sstable/Component.java b/src/java/org/apache/cassandra/io/sstable/Component.java
index a81db859f23d..aba9ea21a474 100644
--- a/src/java/org/apache/cassandra/io/sstable/Component.java
+++ b/src/java/org/apache/cassandra/io/sstable/Component.java
@@ -42,6 +42,10 @@ public enum Type
         // the base data for an sstable: the remaining components can be regenerated
         // based on the data component
         DATA("Data.db"),
+        // partition index trie (TrieIndexFormat)
+        PARTITION_INDEX("Partitions.db"),
+        // row indices (TrieIndexFormat)
+        ROW_INDEX("Rows.db"),
         // index of the row keys with pointers to their positions in the data file
         PRIMARY_INDEX("Index.db"),
         // serialized bloom filter for the row keys in the sstable
@@ -83,6 +87,8 @@ static Type fromRepresentation(String repr)
 
     // singleton components for types that don't need ids
     public final static Component DATA = new Component(Type.DATA);
+    public final static Component PARTITION_INDEX = new Component(Type.PARTITION_INDEX);
+    public final static Component ROW_INDEX = new Component(Type.ROW_INDEX);
     public final static Component PRIMARY_INDEX = new Component(Type.PRIMARY_INDEX);
     public final static Component FILTER = new Component(Type.FILTER);
     public final static Component COMPRESSION_INFO = new Component(Type.COMPRESSION_INFO);
@@ -133,6 +139,8 @@ public static Component parse(String name)
         switch (type)
         {
             case DATA:             return Component.DATA;
+            case PARTITION_INDEX:  return Component.PARTITION_INDEX;
+            case ROW_INDEX:        return Component.ROW_INDEX;
             case PRIMARY_INDEX:    return Component.PRIMARY_INDEX;
             case FILTER:           return Component.FILTER;
             case COMPRESSION_INFO: return Component.COMPRESSION_INFO;
diff --git a/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java b/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java
index 93be2eec6a7e..d4ed3def4259 100644
--- a/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java
+++ b/src/java/org/apache/cassandra/io/sstable/CorruptSSTableException.java
@@ -33,4 +33,10 @@ public CorruptSSTableException(Throwable cause, String path)
     {
         this(cause, new File(path));
     }
+
+    protected CorruptSSTableException(String msg, Throwable cause, File path)
+    {
+        super(msg, cause);
+        this.path = path;
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
index a85d855abdb9..c98529266411 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
@@ -186,7 +186,8 @@ private List<SSTableReader> getAllSSTables()
         for (Keyspace ks : Keyspace.all())
         {
             for (ColumnFamilyStore cfStore: ks.getColumnFamilyStores())
-                result.addAll(cfStore.getLiveSSTables());
+                for (SSTableReader tr : SSTableReader.selectOnlyBigTableReaders(cfStore.getLiveSSTables()))
+                    result.add(tr);
         }
 
         return result;
@@ -213,8 +214,8 @@ private Pair<Long, Map<TableId, LifecycleTransaction>> getRestributionTransactio
                 do
                 {
                     View view = cfStore.getTracker().getView();
-                    allSSTables = ImmutableSet.copyOf(view.select(SSTableSet.CANONICAL));
-                    nonCompacting = ImmutableSet.copyOf(view.getUncompacting(allSSTables));
+                    allSSTables = ImmutableSet.copyOf(SSTableReader.selectOnlyBigTableReaders(view.select(SSTableSet.CANONICAL)));
+                    nonCompacting = ImmutableSet.copyOf(SSTableReader.selectOnlyBigTableReaders(view.getUncompacting(allSSTables)));
                 }
                 while (null == (txn = cfStore.getTracker().tryModify(nonCompacting, OperationType.INDEX_SUMMARY)));
 
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
index 90a86215566d..453d27414290 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
@@ -26,6 +26,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.UUID;
+import java.util.stream.Collectors;
 
 import com.google.common.annotations.VisibleForTesting;
 
@@ -84,7 +85,7 @@ public List<SSTableReader> redistributeSummaries() throws IOException
         List<SSTableReader> redistribute = new ArrayList<>();
         for (LifecycleTransaction txn : transactions.values())
         {
-            redistribute.addAll(txn.originals());
+            redistribute.addAll(SSTableReader.selectOnlyBigTableReaders(txn.originals(), Collectors.toList()));
         }
 
         long total = nonRedistributingOffHeapSize;
@@ -119,7 +120,7 @@ public List<SSTableReader> redistributeSummaries() throws IOException
         logger.trace("Index summaries for compacting SSTables are using {} MB of space",
                      (memoryPoolBytes - remainingBytes) / 1024.0 / 1024.0);
         List<SSTableReader> newSSTables;
-        try (Refs<SSTableReader> refs = Refs.ref(sstablesByHotness))
+        try (Refs<? extends SSTableReader> refs = Refs.ref(sstablesByHotness))
         {
             newSSTables = adjustSamplingLevels(sstablesByHotness, transactions, totalReadsPerSec, remainingBytes);
 
diff --git a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
index c31af70d4c0d..475a6c9f03a3 100644
--- a/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/KeyIterator.java
@@ -25,40 +25,35 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.CloseableIterator;
 
+// TODO STAR-247: Implement a unit test
 public class KeyIterator extends AbstractIterator<DecoratedKey> implements CloseableIterator<DecoratedKey>
 {
     private final IPartitioner partitioner;
     private final PartitionIndexIterator it;
     private final ReadWriteLock fileAccessLock;
-    private final long indexLength;
+    private final long totalBytes;
 
-    private long keyPosition = -1;
+    private boolean initialized = false;
 
-    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner, ReadWriteLock fileAccessLock)
+    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner, long totalBytes, ReadWriteLock fileAccessLock)
     {
         this.it = it;
         this.partitioner = partitioner;
+        this.totalBytes = totalBytes;
         this.fileAccessLock = fileAccessLock;
-        this.indexLength = it.indexLength();
     }
 
-    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner)
+    public KeyIterator(PartitionIndexIterator it, IPartitioner partitioner, long totalBytes)
     {
-        this(it, partitioner, null);
+        this(it, partitioner, totalBytes, null);
     }
 
     public static KeyIterator forSSTable(SSTableReader ssTableReader) throws IOException
     {
-        return new KeyIterator(ssTableReader.allKeysIterator(), ssTableReader.getPartitioner(), new ReentrantReadWriteLock());
-    }
-
-    public static KeyIterator create(SSTableReader.Factory factory, Descriptor descriptor, TableMetadata metadata)
-    {
-        return new KeyIterator(factory.indexIterator(descriptor, metadata), metadata.partitioner, new ReentrantReadWriteLock());
+        return new KeyIterator(ssTableReader.allKeysIterator(), ssTableReader.getPartitioner(), ssTableReader.uncompressedLength(), new ReentrantReadWriteLock());
     }
 
     protected DecoratedKey computeNext()
@@ -67,16 +62,15 @@ protected DecoratedKey computeNext()
             fileAccessLock.readLock().lock();
         try
         {
-            if (keyPosition < 0)
+            if (!initialized)
             {
-                keyPosition = 0;
+                initialized = true;
                 return it.isExhausted()
                        ? endOfData()
                        : partitioner.decorateKey(it.key());
             }
             else
             {
-                keyPosition = it.indexPosition();
                 return it.advance()
                        ? partitioner.decorateKey(it.key())
                        : endOfData();
@@ -114,7 +108,7 @@ public long getBytesRead()
             fileAccessLock.readLock().lock();
         try
         {
-            return it.indexPosition();
+            return it.isExhausted() ? totalBytes : it.dataPosition();
         }
         finally
         {
@@ -125,31 +119,6 @@ public long getBytesRead()
 
     public long getTotalBytes()
     {
-        return indexLength;
-    }
-
-    public long getKeyPosition()
-    {
-        return keyPosition;
-    }
-
-    public void reset()
-    {
-        if (fileAccessLock != null)
-            fileAccessLock.readLock().lock();
-        try
-        {
-            it.reset();
-            keyPosition = -1;
-        }
-        catch (IOException ex)
-        {
-            throw new RuntimeException(ex);
-        }
-        finally
-        {
-            if (fileAccessLock != null)
-                fileAccessLock.readLock().unlock();
-        }
+        return totalBytes;
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
index 9a231c932355..8c240c8cef87 100644
--- a/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/ReducingKeyIterator.java
@@ -36,20 +36,27 @@ public class ReducingKeyIterator implements CloseableIterator<DecoratedKey>
 {
     private final ArrayList<KeyIterator> iters;
     private volatile IMergeIterator<DecoratedKey, DecoratedKey> mi;
+    private final long totalLength;
 
     public ReducingKeyIterator(Collection<SSTableReader> sstables)
     {
         iters = new ArrayList<>(sstables.size());
+        long len = 0;
         try
         {
             for (SSTableReader sstable : sstables)
-                iters.add(KeyIterator.forSSTable(sstable));
+            {
+                KeyIterator iter = KeyIterator.forSSTable(sstable);
+                iters.add(iter);
+                len += iter.getTotalBytes();
+            }
         }
         catch (IOException | RuntimeException ex)
         {
             iters.forEach(KeyIterator::close);
             throw Throwables.cleaned(ex);
         }
+        this.totalLength = len;
     }
 
     private void maybeInit()
@@ -93,14 +100,7 @@ public void close()
 
     public long getTotalBytes()
     {
-        maybeInit();
-
-        long m = 0;
-        for (Iterator<DecoratedKey> iter : mi.iterators())
-        {
-            m += ((KeyIterator) iter).getTotalBytes();
-        }
-        return m;
+        return totalLength;
     }
 
     public long getBytesRead()
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index f138c1b63669..a375781837fe 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -161,11 +161,6 @@ public String getFilename()
         return descriptor.filenameFor(Component.DATA);
     }
 
-    public String getIndexFilename()
-    {
-        return descriptor.filenameFor(Component.PRIMARY_INDEX);
-    }
-
     public String getColumnFamilyName()
     {
         return descriptor.cfname;
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
index cf19083c9f55..367d0c9fa1c0 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableIdentityIterator.java
@@ -70,13 +70,16 @@ public static SSTableIdentityIterator create(SSTableReader sstable, RandomAccess
     }
 
     @SuppressWarnings("resource")
-    public static SSTableIdentityIterator create(SSTableReader sstable, FileDataInput dfile, RowIndexEntry<?> indexEntry, DecoratedKey key, boolean tombstoneOnly)
+    public static SSTableIdentityIterator create(SSTableReader sstable, FileDataInput dfile, RowIndexEntry indexEntry, DecoratedKey key, boolean tombstoneOnly)
     {
         try
         {
             dfile.seek(indexEntry.position);
             ByteBufferUtil.skipShortLength(dfile); // Skip partition key
             DeletionTime partitionLevelDeletion = DeletionTime.serializer.deserialize(dfile);
+            if (!partitionLevelDeletion.validate())
+                UnfilteredValidation.handleInvalid(sstable.metadata(), key, sstable, "partitionLevelDeletion="+partitionLevelDeletion.toString());
+
             DeserializationHelper helper = new DeserializationHelper(sstable.metadata(), sstable.descriptor.version.correspondingMessagingVersion(), DeserializationHelper.Flag.LOCAL);
             SSTableSimpleIterator iterator = tombstoneOnly
                     ? SSTableSimpleIterator.createTombstoneOnly(sstable.metadata(), dfile, sstable.header, helper, partitionLevelDeletion)
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
index 47de00c6a21a..e6998ee22348 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java
@@ -21,7 +21,9 @@
 import java.util.*;
 
 import com.google.common.collect.HashMultimap;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
 
 import org.apache.cassandra.db.streaming.CassandraOutgoingFile;
 import org.apache.cassandra.locator.InetAddressAndPort;
@@ -90,12 +92,6 @@ protected Collection<SSTableReader> openSSTables(final Map<InetAddressAndPort, C
                                           if (p == null || !p.right.equals(Component.DATA))
                                               return false;
 
-                                          if (!new File(desc.filenameFor(Component.PRIMARY_INDEX)).exists())
-                                          {
-                                              outputHandler.output(String.format("Skipping file %s because index is missing", name));
-                                              return false;
-                                          }
-
                                           TableMetadataRef metadata = client.getTableMetadata(desc.cfname);
 
                                           if (metadata == null && // we did not find metadata
@@ -126,22 +122,14 @@ protected Collection<SSTableReader> openSSTables(final Map<InetAddressAndPort, C
                                               return false;
                                           }
 
-                                          Set<Component> components = new HashSet<>();
-                                          components.add(Component.DATA);
-                                          components.add(Component.PRIMARY_INDEX);
-                                          if (new File(desc.filenameFor(Component.SUMMARY)).exists())
-                                              components.add(Component.SUMMARY);
-                                          if (new File(desc.filenameFor(Component.COMPRESSION_INFO)).exists())
-                                              components.add(Component.COMPRESSION_INFO);
-                                          if (new File(desc.filenameFor(Component.STATS)).exists())
-                                              components.add(Component.STATS);
+                                          Set<Component> components = mainComponentsPresent(desc);
 
                                           try
                                           {
                                               // To conserve memory, open SSTableReaders without bloom filters and discard
                                               // the index summary after calculating the file sections to stream and the estimated
                                               // number of keys for each endpoint. See CASSANDRA-5555 for details.
-                                              SSTableReader sstable = SSTableReader.openForBatch(desc, components, metadata);
+                                              SSTableReader sstable = desc.getFormat().getReaderFactory().openForBatch(desc, components, metadata);
                                               sstables.add(sstable);
 
                                               // calculate the sstable sections to stream as well as the estimated number of
@@ -153,7 +141,7 @@ protected Collection<SSTableReader> openSSTables(final Map<InetAddressAndPort, C
 
                                                   List<SSTableReader.PartitionPositionBounds> sstableSections = sstable.getPositionsForRanges(tokenRanges);
                                                   long estimatedKeys = sstable.estimatedKeysForRanges(tokenRanges);
-                                                  Ref<SSTableReader> ref = sstable.ref();
+                                                  Ref<? extends SSTableReader> ref = sstable.ref();
                                                   OutgoingStream stream = new CassandraOutgoingFile(StreamOperation.BULK_LOAD, ref, sstableSections, tokenRanges, estimatedKeys);
                                                   streamingDetails.put(endpoint, stream);
                                               }
@@ -173,6 +161,20 @@ protected Collection<SSTableReader> openSSTables(final Map<InetAddressAndPort, C
         return sstables;
     }
 
+    public static Set<Component> mainComponentsPresent(Descriptor desc)
+    {
+        Set<Component> lookFor = Sets.union(desc.getFormat().requiredComponents(),
+                                            ImmutableSet.of(Component.COMPRESSION_INFO));
+
+        Set<Component> components = new HashSet<>();
+        for (Component component : lookFor)
+        {
+            if (new File(desc.filenameFor(component)).exists())
+                components.add(component);
+        }
+        return components;
+    }
+
     public StreamResultFuture stream()
     {
         return stream(Collections.<InetAddressAndPort>emptySet());
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
index 1b6336dfbef2..a390648bf1bc 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
@@ -118,21 +118,21 @@ public SSTableWriter currentWriter()
         return writer;
     }
 
-    public BigTableRowIndexEntry append(UnfilteredRowIterator partition)
+    public RowIndexEntry append(UnfilteredRowIterator partition)
     {
         // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
         DecoratedKey key = partition.partitionKey();
         maybeReopenEarly(key);
-        BigTableRowIndexEntry index = writer.append(partition);
+        RowIndexEntry index = writer.append(partition);
         if (DatabaseDescriptor.shouldMigrateKeycacheOnCompaction())
         {
-            if (!transaction.isOffline() && index != null)
+            if (!transaction.isOffline() && index instanceof BigTableRowIndexEntry)
             {
                 for (SSTableReader reader : transaction.originals())
                 {
                     if (reader.getCachedPosition(key, false) != null)
                     {
-                        cachedKeys.put(key, index);
+                        cachedKeys.put(key, (BigTableRowIndexEntry) index);
                         break;
                     }
                 }
@@ -142,7 +142,7 @@ public BigTableRowIndexEntry append(UnfilteredRowIterator partition)
     }
 
     // attempts to append the row, if fails resets the writer position
-    public BigTableRowIndexEntry tryAppend(UnfilteredRowIterator partition)
+    public RowIndexEntry tryAppend(UnfilteredRowIterator partition)
     {
         writer.mark();
         try
@@ -164,20 +164,18 @@ private void maybeReopenEarly(DecoratedKey key)
             {
                 for (SSTableReader reader : transaction.originals())
                 {
-                    RowIndexEntry<?> index = reader.getPosition(key, SSTableReader.Operator.GE);
+                    RowIndexEntry index = reader.getPosition(key, SSTableReader.Operator.GE);
                     NativeLibrary.trySkipCache(reader.getFilename(), 0, index == null ? 0 : index.position);
                 }
             }
             else
             {
-                SSTableReader reader = writer.setMaxDataAge(maxAge).openEarly();
-                if (reader != null)
-                {
+                writer.setMaxDataAge(maxAge).openEarly(reader -> {
                     transaction.update(reader, false);
                     currentlyOpenedEarlyAt = writer.getFilePointer();
                     moveStarts(reader, reader.last);
                     transaction.checkpoint();
-                }
+                });
             }
         }
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
index b0c773243420..93d9aa4695e2 100644
--- a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
@@ -21,7 +21,7 @@
 import java.util.Collections;
 import java.util.UUID;
 
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -45,7 +45,7 @@ protected SimpleSSTableMultiWriter(SSTableWriter writer, LifecycleNewTracker lif
 
     public boolean append(UnfilteredRowIterator partition)
     {
-        BigTableRowIndexEntry indexEntry = writer.append(partition);
+        RowIndexEntry indexEntry = writer.append(partition);
         return indexEntry != null;
     }
 
diff --git a/src/java/org/apache/cassandra/io/sstable/UnsupportedSSTableException.java b/src/java/org/apache/cassandra/io/sstable/UnsupportedSSTableException.java
new file mode 100644
index 000000000000..41e5ae04241e
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/UnsupportedSSTableException.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.io.File;
+
+public class UnsupportedSSTableException extends CorruptSSTableException
+{
+    public UnsupportedSSTableException(String msg, Throwable cause, File path)
+    {
+        super(msg, cause, path);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
index 2a1e67572426..5d8f02811660 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/AbstractSSTableIterator.java
@@ -15,6 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 package org.apache.cassandra.io.sstable.format;
 
 import java.io.Closeable;
@@ -22,16 +23,33 @@
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.filter.ColumnFilter;
-import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.BufferClusteringBound;
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.Columns;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.UnfilteredDeserializer;
+import org.apache.cassandra.db.UnfilteredValidation;
+import org.apache.cassandra.db.rows.DeserializationHelper;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredSerializer;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
-public abstract class AbstractSSTableIterator<E extends RowIndexEntry<?>> implements UnfilteredRowIterator
+public abstract class AbstractSSTableIterator<E extends RowIndexEntry> implements UnfilteredRowIterator
 {
     protected final SSTableReader sstable;
     // We could use sstable.metadata(), but that can change during execution so it's good hygiene to grab an immutable instance
@@ -154,10 +172,10 @@ private Slice nextSlice()
      */
     protected abstract boolean hasMoreSlices();
 
-    private static Row readStaticRow(SSTableReader sstable,
-                                     FileDataInput file,
-                                     DeserializationHelper helper,
-                                     Columns statics) throws IOException
+    public static Row readStaticRow(SSTableReader sstable,
+                                    FileDataInput file,
+                                    DeserializationHelper helper,
+                                    Columns statics) throws IOException
     {
         if (!sstable.header.hasStatic())
             return Rows.EMPTY_STATIC_ROW;
@@ -179,7 +197,7 @@ private Reader createReader(E indexEntry, FileDataInput file, boolean shouldClos
     {
         return slices.isEmpty() ? new NoRowsReader(file, shouldCloseFile)
                                 : createReaderInternal(indexEntry, file, shouldCloseFile);
-    };
+    }
 
     public TableMetadata metadata()
     {
@@ -287,6 +305,181 @@ public void close()
         }
     }
 
+    public abstract class RowReader extends Reader {
+        public UnfilteredDeserializer deserializer;
+
+        // Records the currently open range tombstone (if any)
+        public DeletionTime openMarker;
+
+        protected RowReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+
+            if (file != null)
+                createDeserializer();
+        }
+
+        private void createDeserializer()
+        {
+            assert file != null && deserializer == null;
+            deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper);
+        }
+
+        public void seekToPosition(long position) throws IOException
+        {
+            // This may be the first time we're actually looking into the file
+            if (file == null)
+            {
+                file = sstable.getFileDataInput(position);
+                createDeserializer();
+            }
+            else
+            {
+                file.seek(position);
+                deserializer.clearState();
+            }
+        }
+
+        protected void updateOpenMarker(RangeTombstoneMarker marker)
+        {
+            // Note that we always read index blocks in forward order so this method is always called in forward order
+            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
+        }
+    }
+
+    protected class ForwardReader extends RowReader
+    {
+        // The start of the current slice. This will be null as soon as we know we've passed that bound.
+        protected ClusteringBound<?> start;
+        // The end of the current slice. Will never be null.
+        protected ClusteringBound<?> end = BufferClusteringBound.TOP;
+
+        protected Unfiltered next; // the next element to return: this is computed by hasNextInternal().
+
+        protected boolean sliceDone; // set to true once we know we have no more result for the slice. This is in particular
+        // used by the indexed reader when we know we can't have results based on the index.
+
+        public ForwardReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+        }
+
+        public void setForSlice(Slice slice) throws IOException
+        {
+            start = slice.start().isBottom() ? null : slice.start();
+            end = slice.end();
+
+            sliceDone = false;
+            next = null;
+        }
+
+        // Skip all data that comes before the currently set slice.
+        // Return what should be returned at the end of this, or null if nothing should.
+        private Unfiltered handlePreSliceData() throws IOException
+        {
+            assert deserializer != null;
+
+            // Note that the following comparison is not strict. The reason is that the only case
+            // where it can be == is if the "next" is a RT start marker (either a '[' or a ')[' boundary),
+            // and if we had a strict inequality and an open RT marker before this, we would issue
+            // the open marker first, and then return the next later, which would send in the
+            // stream both '[' (or '(') and then ')[' for the same clustering value, which is wrong.
+            // By using a non-strict inequality, we avoid that problem (if we do get ')[' for the same
+            // clustering value as the slice, we'll simply record it in 'openMarker').
+            while (deserializer.hasNext() && deserializer.compareNextTo(start) <= 0)
+            {
+                if (deserializer.nextIsRow())
+                    deserializer.skipNext();
+                else
+                    updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
+            }
+
+            ClusteringBound<?> sliceStart = start;
+            start = null;
+
+            // We've reached the beginning of our queried slice. If we have an open marker
+            // we should return that first.
+            if (openMarker != null)
+                return new RangeTombstoneBoundMarker(sliceStart, openMarker);
+
+            return null;
+        }
+
+        // Compute the next element to return, assuming we're in the middle of the slice
+        // and the next element is either in the slice, or just after it. Returns null
+        // if we're done with the slice.
+        protected Unfiltered computeNext() throws IOException
+        {
+            assert deserializer != null;
+
+            while (true)
+            {
+                // We use the same reasoning as in handlePreSliceData regarding the strictness of the inequality below.
+                // We want to exclude deserialized unfiltered equal to end, because 1) we won't miss any rows since those
+                // wouldn't be equal to a slice bound and 2) an end bound can be equal to a start bound
+                // (EXCL_END(x) == INCL_START(x) for instance) and in that case we don't want to return the start bound because
+                // it's fundamentally excluded. And if the bound is an end (for a range tombstone), it means it's exactly
+                // our slice end, but in that case we will properly close the range tombstone anyway as part of our "close
+                // an open marker" code in hasNextInternal.
+                if (!deserializer.hasNext() || deserializer.compareNextTo(end) >= 0)
+                    return null;
+
+                Unfiltered next = deserializer.readNext();
+                UnfilteredValidation.maybeValidateUnfiltered(next, metadata(), key, sstable);
+                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                if (next.isEmpty())
+                    continue;
+
+                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+                    updateOpenMarker((RangeTombstoneMarker) next);
+                return next;
+            }
+        }
+
+        protected boolean hasNextInternal() throws IOException
+        {
+            if (next != null)
+                return true;
+
+            if (sliceDone)
+                return false;
+
+            if (start != null)
+            {
+                Unfiltered unfiltered = handlePreSliceData();
+                if (unfiltered != null)
+                {
+                    next = unfiltered;
+                    return true;
+                }
+            }
+
+            next = computeNext();
+            if (next != null)
+                return true;
+
+            // for current slice, no data read from deserialization
+            sliceDone = true;
+            // If we have an open marker, we should not close it, there could be more slices
+            if (openMarker != null)
+            {
+                next = new RangeTombstoneBoundMarker(end, openMarker);
+                return true;
+            }
+            return false;
+        }
+
+        protected Unfiltered nextInternal() throws IOException
+        {
+            if (!hasNextInternal())
+                throw new NoSuchElementException();
+
+            Unfiltered toReturn = next;
+            next = null;
+            return toReturn;
+        }
+    }
+
     protected abstract class Reader implements Iterator<Unfiltered>, Closeable
     {
         public FileDataInput file;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
index 616db43aec2f..ff8e7c31f7fa 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/PartitionIndexIterator.java
@@ -24,8 +24,9 @@
 /**
  * Iterator over the partitions of an sstable.
  * <p>
- * The index iterator starts with a key/position ready. advance() should be used to move to the next key; iteration
- * completes when key() == null.
+ * The index iterator starts with a key/position ready. {@link #advance()} should be used to move to the next key;
+ * iteration completes when {@link #advance()} returns {@code false}. The state of the iterator can be tested at any
+ * time with {@link #isExhausted()}.
  */
 public interface PartitionIndexIterator extends Closeable
 {
@@ -34,6 +35,11 @@ public interface PartitionIndexIterator extends Closeable
      */
     public ByteBuffer key();
 
+    /**
+     * Position in the component preferred for reading keys. This is specific to SSTable implementation
+     */
+    long keyPosition();
+
     /**
      * Position in the data file where the associated content resides
      */
@@ -55,8 +61,7 @@ public interface PartitionIndexIterator extends Closeable
     boolean isExhausted();
 
     /**
-     * Returns the current position in index file (which along with {@link #indexLength()}
-     * can be used to track iteration progress
+     * Returns the current position in index file
      */
     long indexPosition();
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
index c00a37ae39bf..80d03b2ff201 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/RowIndexEntry.java
@@ -20,9 +20,18 @@
 
 import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.utils.ObjectSizes;
 
-public abstract class RowIndexEntry<T> implements IMeasurableMemory
+/**
+ * The base RowIndexEntry is not stored on disk, only specifies a position in the data file
+ */
+public class RowIndexEntry implements IMeasurableMemory
 {
+    private static final long EMPTY_SIZE = ObjectSizes.measure(new RowIndexEntry(0));
+
+    /**
+     * Row position in a data file
+     */
     public final long position;
 
     public RowIndexEntry(long position)
@@ -31,7 +40,7 @@ public RowIndexEntry(long position)
     }
 
     /**
-     * @return true if this index entry contains the row-level tombstone and column summary.  Otherwise,
+     * @return true if this index entry contains the row-level tombstone and column summary. Otherwise,
      * caller should fetch these from the row header.
      */
     public boolean isIndexed()
@@ -39,10 +48,19 @@ public boolean isIndexed()
         return columnsIndexCount() > 1;
     }
 
-    public abstract DeletionTime deletionTime();
+    public DeletionTime deletionTime()
+    {
+        throw new UnsupportedOperationException();
+    }
 
     public int columnsIndexCount()
     {
         return 0;
     }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return EMPTY_SIZE;
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
index 1f9177bffcdb..0cbc57003b84 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFlushObserver.java
@@ -38,9 +38,9 @@ public interface SSTableFlushObserver
      * but before any cells are processed (see {@link #nextUnfilteredCluster(Unfiltered)}).
      *
      * @param key The key being appended to SSTable.
-     * @param indexPosition The position of the key in the SSTable PRIMARY_INDEX file.
+     * @param position The position of the key in the component preferred for reading keys
      */
-    void startPartition(DecoratedKey key, long indexPosition);
+    void startPartition(DecoratedKey key, long position);
 
     /**
      * Called when the deletion time of a partition is written to the sstable.
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
index 2ecef6025ac3..75b461aefd60 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
@@ -17,16 +17,22 @@
  */
 package org.apache.cassandra.io.sstable.format;
 
+import java.util.Set;
+
 import com.google.common.base.CharMatcher;
 
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
 
 /**
  * Provides the accessors to data on disk.
  */
 public interface SSTableFormat
 {
-    static boolean enableSSTableDevelopmentTestMode = Boolean.getBoolean("cassandra.test.sstableformatdevelopment");
+    public final static String FORMAT_DEFAULT_PROP = "cassandra.sstable.format.default";
 
     Type getType();
 
@@ -39,14 +45,17 @@ public interface SSTableFormat
     public enum Type
     {
         //The original sstable format
-        BIG("big", BigFormat.instance);
+        BIG("big", BigFormat.instance),
+
+        //Sstable format with trie indices
+        BTI("bti", TrieIndexFormat.instance);
 
         public final SSTableFormat info;
         public final String name;
 
         public static Type current()
         {
-            return BIG;
+            return Type.valueOf(System.getProperty(FORMAT_DEFAULT_PROP, BTI.name()).toUpperCase(java.util.Locale.ROOT)); // ROOT: keep the enum lookup independent of the default locale
         }
 
         Type(String name, SSTableFormat info)
@@ -70,4 +79,22 @@ public static Type validate(String name)
             throw new IllegalArgumentException("No Type constant " + name);
         }
     }
+
+    /**
+     * Returns components required by the particular implementation of SSTable reader so that it can operate on
+     * the SSTable in a regular way.
+     */
+    Set<Component> requiredComponents();
+
+    /**
+     * Returns all the components, both mandatory and optional, which are used by the particular implementation of
+     * SSTable format.
+     */
+    Set<Component> supportedComponents();
+
+    /**
+     * Returns all the components of the particular implementation of SSTable format which are suitable for streaming.
+     */
+    Set<Component> streamingComponents();
+
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index ae58fb1acb75..f7e34a25cce9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -17,18 +17,39 @@
  */
 package org.apache.cassandra.io.sstable.format;
 
-import java.io.*;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.lang.ref.WeakReference;
 import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.concurrent.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.ScheduledThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicLong;
-import java.util.function.Supplier;
+import java.util.stream.Collector;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
 import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.RateLimiter;
 import org.slf4j.Logger;
@@ -37,14 +58,24 @@
 import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
 import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
 import com.clearspring.analytics.stream.cardinality.ICardinality;
-
 import org.apache.cassandra.cache.InstrumentingCache;
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.concurrent.DebuggableThreadPoolExecutor;
 import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringBoundOrBoundary;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.db.compaction.Scrubber;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.BTreeRow;
 import org.apache.cassandra.db.rows.Cell;
@@ -64,10 +95,30 @@
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.CompressionMetadata;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.BloomFilterTracker;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.Downsampling;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.IndexSummary;
+import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
+import org.apache.cassandra.io.sstable.KeyIterator;
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
-import org.apache.cassandra.io.sstable.metadata.*;
-import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.io.sstable.metadata.CompactionMetadata;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.CheckedFunction;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.schema.Schema;
@@ -77,11 +128,20 @@
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.BloomFilter;
+import org.apache.cassandra.utils.BloomFilterSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.ExecutorUtils;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.NativeLibrary;
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.concurrent.Ref;
+import org.apache.cassandra.utils.concurrent.RefCounted;
 import org.apache.cassandra.utils.concurrent.SelfRefCounted;
-import org.apache.cassandra.utils.BloomFilterSerializer;
 
 import static org.apache.cassandra.db.Directories.SECONDARY_INDEX_NAME_SEPARATOR;
 
@@ -166,6 +226,8 @@ private static ScheduledThreadPoolExecutor initSyncExecutor()
     public static final Comparator<SSTableReader> maxTimestampDescending = (o1, o2) -> Long.compare(o2.getMaxTimestamp(), o1.getMaxTimestamp());
     public static final Comparator<SSTableReader> maxTimestampAscending = (o1, o2) -> Long.compare(o1.getMaxTimestamp(), o2.getMaxTimestamp());
 
+    public abstract boolean hasIndex();
+
     // it's just an object, which we use regular Object equality on; we introduce a special class just for easy recognition
     public static final class UniqueIdentifier {}
 
@@ -175,13 +237,7 @@ public static final class UniqueIdentifier {}
 
     public static final Ordering<SSTableReader> sstableOrdering = Ordering.from(sstableComparator);
 
-    public static final Comparator<SSTableReader> sizeComparator = new Comparator<SSTableReader>()
-    {
-        public int compare(SSTableReader o1, SSTableReader o2)
-        {
-            return Longs.compare(o1.onDiskLength(), o2.onDiskLength());
-        }
-    };
+    public static final Comparator<? super SSTableReader> sizeComparator = (o1, o2) -> Longs.compare(o1.onDiskLength(), o2.onDiskLength());
 
     /**
      * maxDataAge is a timestamp in local server time (e.g. System.currentTimeMilli) which represents an upper bound
@@ -230,13 +286,23 @@ public enum OpenReason
     protected final AtomicLong keyCacheHit = new AtomicLong(0);
     protected final AtomicLong keyCacheRequest = new AtomicLong(0);
 
-    private final InstanceTidier tidy;
+    protected final InstanceTidier tidy;
     private final Ref<SSTableReader> selfRef;
 
     private RestorableMeter readMeter;
 
     private volatile double crcCheckChance;
 
+    public static <T extends SSTableReader> Iterable<T> selectOnlyBigTableReaders(Iterable<T> readers)
+    {
+        return Iterables.filter(readers, tr -> tr.descriptor.formatType == SSTableFormat.Type.BIG);
+    }
+
+    public static <T> T selectOnlyBigTableReaders(Collection<? extends SSTableReader> readers, Collector<? super SSTableReader, ?, T> collector)
+    {
+        return readers.stream().filter(tr -> tr.descriptor.formatType == SSTableFormat.Type.BIG).collect(collector);
+    }
+
     /**
      * Calculate approximate key count.
      * If cardinality estimator is available on all given sstables, then this method use them to estimate
@@ -246,7 +312,7 @@ public enum OpenReason
      * @param sstables SSTables to calculate key count
      * @return estimated key count
      */
-    public static long getApproximateKeyCount(Iterable<SSTableReader> sstables)
+    public static long getApproximateKeyCount(Iterable<? extends SSTableReader> sstables)
     {
         long count = -1;
 
@@ -369,24 +435,24 @@ public static SSTableReader open(Descriptor descriptor)
         return open(descriptor, metadata);
     }
 
-    public static SSTableReader open(Descriptor desc, TableMetadataRef metadata)
+    private static SSTableReader open(Descriptor desc, TableMetadataRef metadata)
     {
         return open(desc, componentsFor(desc), metadata);
     }
 
-    public static SSTableReader open(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata)
+    private static SSTableReader open(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata)
     {
         return open(descriptor, components, metadata, true, false);
     }
 
     // use only for offline or "Standalone" operations
-    public static SSTableReader openNoValidation(Descriptor descriptor, Set<Component> components, ColumnFamilyStore cfs)
+    private static SSTableReader openNoValidation(Descriptor descriptor, Set<Component> components, ColumnFamilyStore cfs)
     {
         return open(descriptor, components, cfs.metadata, false, true);
     }
 
     // use only for offline or "Standalone" operations
-    public static SSTableReader openNoValidation(Descriptor descriptor, TableMetadataRef metadata)
+    private static SSTableReader openNoValidation(Descriptor descriptor, TableMetadataRef metadata)
     {
         return open(descriptor, componentsFor(descriptor), metadata, false, true);
     }
@@ -400,7 +466,7 @@ public static SSTableReader openNoValidation(Descriptor descriptor, TableMetadat
      * @return opened SSTableReader
      * @throws IOException
      */
-    public static SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata)
+    private static SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata)
     {
         // Minimum components without which we can't do anything
         assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
@@ -454,11 +520,12 @@ public static SSTableReader openForBatch(Descriptor descriptor, Set<Component> c
      * @return {@link SSTableReader}
      * @throws IOException
      */
+    @VisibleForTesting
     public static SSTableReader open(Descriptor descriptor,
-                                     Set<Component> components,
-                                     TableMetadataRef metadata,
-                                     boolean validate,
-                                     boolean isOffline)
+                                      Set<Component> components,
+                                      TableMetadataRef metadata,
+                                      boolean validate,
+                                      boolean isOffline)
     {
         // Minimum components without which we can't do anything
         assert components.contains(Component.DATA) : "Data component is missing for sstable " + descriptor;
@@ -544,7 +611,7 @@ public void run()
                     SSTableReader sstable;
                     try
                     {
-                        sstable = open(entry.getKey(), entry.getValue(), metadata);
+                        sstable = entry.getKey().getFormat().getReaderFactory().open(entry.getKey(), entry.getValue(), metadata);
                     }
                     catch (CorruptSSTableException ex)
                     {
@@ -689,6 +756,14 @@ public static long getTotalUncompressedBytes(Iterable<SSTableReader> sstables)
 
     public abstract PartitionIndexIterator allKeysIterator() throws IOException;
 
+    /**
+     * Partition iterator used only for scrubbing (see {@link Scrubber} and {@link ScrubPartitionIterator}).
+     *
+     * @return iterator for scrubbing or {@code null} if this {@link SSTableReader} doesn't have the iterator
+     * implementation (this may be the case if there is no index file for the iterator)
+     */
+    public abstract ScrubPartitionIterator scrubPartitionsIterator() throws IOException;
+
     public boolean equals(Object that)
     {
         return that instanceof SSTableReader && ((SSTableReader) that).descriptor.equals(this.descriptor);
@@ -767,7 +842,7 @@ public static void saveBloomFilter(Descriptor descriptor, IFilter filter)
      *
      * @param task to be guarded by sstable lock
      */
-    public <R> R runWithLock(CheckedFunction<Descriptor, R, IOException> task) throws IOException
+    public <R, E extends Exception> R runWithLock(CheckedFunction<Descriptor, R, E> task) throws E
     {
         synchronized (tidy.global)
         {
@@ -925,7 +1000,7 @@ public SSTableReader cloneWithNewStart(DecoratedKey newStart, final Runnable run
         }
     }
 
-    private static class DropPageCache implements Runnable
+    protected static class DropPageCache implements Runnable
     {
         final FileHandle dfile;
         final long dfilePosition;
@@ -933,7 +1008,7 @@ private static class DropPageCache implements Runnable
         final long ifilePosition;
         final Runnable andThen;
 
-        private DropPageCache(FileHandle dfile, long dfilePosition, FileHandle ifile, long ifilePosition, Runnable andThen)
+        public DropPageCache(FileHandle dfile, long dfilePosition, FileHandle ifile, long ifilePosition, Runnable andThen)
         {
             this.dfile = dfile;
             this.dfilePosition = dfilePosition;
@@ -1003,11 +1078,14 @@ else if (samplingLevel < indexSummary.getSamplingLevel())
     private IndexSummary buildSummaryAtLevel(int newSamplingLevel) throws IOException
     {
         // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
-        try (KeyIterator iterator = KeyIterator.forSSTable(this);
+        try (PartitionIndexIterator iterator = allKeysIterator();
              IndexSummaryBuilder summaryBuilder = new IndexSummaryBuilder(estimatedKeys(), metadata().params.minIndexInterval, newSamplingLevel))
         {
-            while (iterator.hasNext())
-                summaryBuilder.maybeAddEntry(iterator.next(), iterator.getKeyPosition());
+            while (!iterator.isExhausted())
+            {
+                summaryBuilder.maybeAddEntry(decorateKey(iterator.key()), iterator.keyPosition());
+                iterator.advance();
+            }
 
             return summaryBuilder.build(getPartitioner());
         }
@@ -1040,10 +1118,11 @@ public double getEffectiveIndexInterval()
 
     public void releaseSummary()
     {
-        tidy.releaseSummary();
+        indexSummary.close();
+        assert indexSummary.isCleanedUp();
     }
 
-    private void validate()
+    public void validate()
     {
         if (this.first.compareTo(this.last) > 0)
         {
@@ -1363,7 +1442,7 @@ public boolean isKeyCacheEnabled()
      * allow key selection by token bounds but only if op != * EQ
      * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
      */
-    public final RowIndexEntry<?> getPosition(PartitionPosition key, Operator op)
+    public final RowIndexEntry getPosition(PartitionPosition key, Operator op)
     {
         return getPosition(key, op, true, false, SSTableReadsListener.NOOP_LISTENER);
     }
@@ -1383,11 +1462,11 @@ public final boolean checkEntryExists(PartitionPosition key,
      * @param listener a listener used to handle internal events
      * @return The index entry corresponding to the key, or null if the key is not present
      */
-    protected abstract RowIndexEntry<?> getPosition(PartitionPosition key,
-                                                    Operator op,
-                                                    boolean updateCacheAndStats,
-                                                    boolean permitMatchPastLast,
-                                                    SSTableReadsListener listener);
+    protected abstract RowIndexEntry getPosition(PartitionPosition key,
+                                                 Operator op,
+                                                 boolean updateCacheAndStats,
+                                                 boolean permitMatchPastLast,
+                                                 SSTableReadsListener listener);
 
     public abstract UnfilteredRowIterator iterator(DecoratedKey key,
                                                    Slices slices,
@@ -1395,7 +1474,7 @@ public abstract UnfilteredRowIterator iterator(DecoratedKey key,
                                                    boolean reversed,
                                                    SSTableReadsListener listener);
 
-    public abstract UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly);
+    public abstract UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, boolean tombstoneOnly);
 
     /**
      * Finds and returns the first key beyond a given token in this SSTable or null if no such key exists.
@@ -1413,7 +1492,7 @@ public DecoratedKey firstKeyBeyond(PartitionPosition token)
         try (PartitionIndexIterator iterator = allKeysIterator())
         {
             iterator.indexPosition(sampledPosition);
-            KeyIterator keyIterator = new KeyIterator(iterator, getPartitioner());
+            KeyIterator keyIterator = new KeyIterator(iterator, getPartitioner(), uncompressedLength());
 
             while (keyIterator.hasNext())
             {
@@ -1598,14 +1677,12 @@ public boolean isRepaired()
         return sstableMetadata.repairedAt != ActiveRepairService.UNREPAIRED_SSTABLE;
     }
 
-    public DecoratedKey keyAt(RandomAccessReader indexFileReader, long indexPosition) throws IOException
+    public DecoratedKey keyAt(RandomAccessReader reader, long position) throws IOException
     {
-        indexFileReader.seek(indexPosition);
-        return keyAt(indexFileReader);
+        reader.seek(position);
+        return keyAt(reader);
     }
 
-    public abstract DecoratedKey keyAt(long indexPosition) throws IOException;
-
     public abstract DecoratedKey keyAt(FileDataInput reader) throws IOException;
 
     /**
@@ -1946,6 +2023,8 @@ public RandomAccessReader openIndexReader()
         return null;
     }
 
+    public abstract RandomAccessReader openKeyComponentReader();
+
     public ChannelProxy getDataChannel()
     {
         return dfile.channel;
@@ -2018,9 +2097,8 @@ public Ref<SSTableReader> ref()
         return selfRef.ref();
     }
 
-    protected void setup(boolean trackHotness)
+    public void setup(boolean trackHotness)
     {
-        tidy.setup(this, trackHotness);
         this.readMeter = tidy.global.readMeter;
     }
 
@@ -2050,15 +2128,12 @@ public void addTo(Ref.IdentityCollection identities)
      * When the InstanceTidier cleansup, it releases its reference to its GlobalTidy; when all InstanceTidiers
      * for that type have run, the GlobalTidy cleans up.
      */
-    private static final class InstanceTidier implements Tidy
+    protected static final class InstanceTidier implements RefCounted.Tidy
     {
         private final Descriptor descriptor;
         private final TableId tableId;
-        private IFilter bf;
-        private IndexSummary summary;
 
-        private FileHandle dfile;
-        private FileHandle ifile;
+        private List<AutoCloseable> closables;
         private Runnable runOnClose;
         private boolean isReplaced = false;
 
@@ -2069,18 +2144,15 @@ private static final class InstanceTidier implements Tidy
 
         private volatile boolean setup;
 
-        void setup(SSTableReader reader, boolean trackHotness)
+        public void setup(SSTableReader reader, boolean trackHotness, List<AutoCloseable> closables)
         {
             this.setup = true;
-            this.bf = reader.bf;
-            this.summary = reader.indexSummary;
-            this.dfile = reader.dfile;
-            this.ifile = reader.ifile;
             // get a new reference to the shared descriptor-type tidy
             this.globalRef = GlobalTidy.get(reader);
             this.global = globalRef.get();
             if (trackHotness)
                 global.ensureReadMeter();
+            this.closables = closables;
         }
 
         InstanceTidier(Descriptor descriptor, TableId tableId)
@@ -2108,34 +2180,49 @@ public void tidy()
             else
                 barrier = null;
 
-            ScheduledExecutors.nonPeriodicTasks.execute(new Runnable()
-            {
-                public void run()
+            ScheduledExecutors.nonPeriodicTasks.execute(() -> {
+                if (logger.isTraceEnabled())
+                    logger.trace("Async instance tidier for {}, before barrier", descriptor);
+
+                if (barrier != null)
+                    barrier.await();
+
+                if (logger.isTraceEnabled())
+                    logger.trace("Async instance tidier for {}, after barrier", descriptor);
+
+                Throwable exceptions = null;
+                if (runOnClose != null) try
                 {
-                    if (logger.isTraceEnabled())
-                        logger.trace("Async instance tidier for {}, before barrier", descriptor);
-
-                    if (barrier != null)
-                        barrier.await();
-
-                    if (logger.isTraceEnabled())
-                        logger.trace("Async instance tidier for {}, after barrier", descriptor);
-
-                    if (bf != null)
-                        bf.close();
-                    if (summary != null)
-                        summary.close();
-                    if (runOnClose != null)
-                        runOnClose.run();
-                    if (dfile != null)
-                        dfile.close();
-                    if (ifile != null)
-                        ifile.close();
-                    globalRef.release();
+                    runOnClose.run();
+                }
+                catch (RuntimeException | Error ex)
+                {
+                    logger.error("Failed to run on-close listeners for sstable " + descriptor.baseFilename(), ex);
+                    exceptions = ex;
+                }
 
-                    if (logger.isTraceEnabled())
-                        logger.trace("Async instance tidier for {}, completed", descriptor);
+                Throwable closeExceptions = Throwables.close(null, closables);
+                if (closeExceptions != null)
+                {
+                    logger.error("Failed to close some sstable components of " + descriptor.baseFilename(), closeExceptions);
+                    exceptions = Throwables.merge(exceptions, closeExceptions);
                 }
+
+                try
+                {
+                    globalRef.release();
+                }
+                catch (RuntimeException | Error ex)
+                {
+                    logger.error("Failed to release the global ref of " + descriptor.baseFilename(), ex);
+                    exceptions = Throwables.merge(exceptions, ex);
+                }
+
+                if (exceptions != null)
+                    JVMStabilityInspector.inspectThrowable(exceptions);
+
+                if (logger.isTraceEnabled())
+                    logger.trace("Async instance tidier for {}, completed", descriptor);
             });
         }
 
@@ -2144,12 +2231,6 @@ public String name()
             return descriptor.toString();
         }
 
-        void releaseSummary()
-        {
-            summary.close();
-            assert summary.isCleanedUp();
-            summary = null;
-        }
     }
 
     /**
@@ -2160,7 +2241,7 @@ void releaseSummary()
      * and stash a reference to it to be released when they are. Once all such references are
      * released, this shared tidy will be performed.
      */
-    static final class GlobalTidy implements Tidy
+    static final class GlobalTidy implements RefCounted.Tidy
     {
         static final WeakReference<ScheduledFuture<?>> NULL = new WeakReference<>(null);
         // keyed by descriptor, mapping to the shared GlobalTidy for that descriptor
@@ -2264,13 +2345,97 @@ public static void resetTidying()
         GlobalTidy.lookup.clear();
     }
 
+    /**
+     * Main entry point for opening (creating a new instance of) a Reader. The desired usage is obtaining the
+     * factory instance from SSTable descriptor and invoking one of the open methods. This usage makes
+     * static {@link SSTableReader} open* methods obsolete (all of them are private now).
+     * {@link SSTableReader} subclasses are expected to provide an implementation of this interface.
+     */
     public interface Factory
     {
-        SSTableReader open(SSTableReaderBuilder builder);
-
         PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata);
+
+        // TODO the implementations of these methods delegate to the current static methods of SSTableReader (see AbstractBigTableReaderFactory)
+        // TODO make those static openXXX methods private
+
+        SSTableReader openForBatch(Descriptor desc, Set<Component> components, TableMetadataRef metadata);
+
+        SSTableReader open(Descriptor desc);
+
+        SSTableReader open(Descriptor desc, TableMetadataRef metadata);
+
+        SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata);
+
+        SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata, boolean validate, boolean isOffline);
+
+        SSTableReader openNoValidation(Descriptor desc, TableMetadataRef tableMetadataRef);
+
+        SSTableReader openNoValidation(Descriptor desc, Set<Component> components, ColumnFamilyStore cfs);
+
+        SSTableReader moveAndOpenSSTable(ColumnFamilyStore cfs, Descriptor oldDescriptor, Descriptor newDescriptor, Set<Component> components, boolean copyData);
+
     }
 
+    /**
+     * Opens BigTable format readers. Proxies open calls to (private) static methods of {@link SSTableReader}.
+     * This class serves as a proxy to legacy private static methods that should be refactored out of
+     * SSTableReader (as they are specific to BigTable format) but are left in the Reader for painless
+     * upstream merges.
+     * Implementations of this abstract class are provided by the BigTable format reader.
+     */
+    public abstract static class AbstractBigTableReaderFactory implements SSTableReader.Factory
+    {
+        @Override
+        public SSTableReader openForBatch(Descriptor desc, Set<Component> components, TableMetadataRef metadata)
+        {
+            return SSTableReader.openForBatch(desc, components, metadata);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc)
+        {
+            return SSTableReader.open(desc);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, TableMetadataRef metadata)
+        {
+            return SSTableReader.open(desc, metadata);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata)
+        {
+            return SSTableReader.open(desc, components, metadata);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata, boolean validate, boolean isOffline)
+        {
+            return SSTableReader.open(desc, components, metadata, validate, isOffline);
+        }
+
+        @Override
+        public SSTableReader openNoValidation(Descriptor desc, TableMetadataRef tableMetadataRef)
+        {
+            return SSTableReader.openNoValidation(desc, tableMetadataRef);
+        }
+
+        @Override
+        public SSTableReader openNoValidation(Descriptor desc, Set<Component> components, ColumnFamilyStore cfs)
+        {
+            return SSTableReader.openNoValidation(desc, components, cfs);
+        }
+
+        @Override
+        public SSTableReader moveAndOpenSSTable(ColumnFamilyStore cfs, Descriptor oldDescriptor, Descriptor newDescriptor, Set<Component> components, boolean copyData)
+        {
+            return SSTableReader.moveAndOpenSSTable(cfs, oldDescriptor, newDescriptor, components, copyData);
+        }
+
+    }
+
+
     public static class PartitionPositionBounds
     {
         public final long lowerPosition;
@@ -2375,7 +2540,7 @@ public static SSTableReader moveAndOpenSSTable(ColumnFamilyStore cfs, Descriptor
         SSTableReader reader;
         try
         {
-            reader = SSTableReader.open(newDescriptor, components, cfs.metadata);
+            reader = newDescriptor.formatType.info.getReaderFactory().open(newDescriptor, components, cfs.metadata);
         }
         catch (Throwable t)
         {
@@ -2391,4 +2556,20 @@ public static void shutdownBlocking(long timeout, TimeUnit unit) throws Interrup
         ExecutorUtils.shutdownNowAndWait(timeout, unit, syncExecutor);
         resetTidying();
     }
+
+    public static void checkRequiredComponents(Descriptor descriptor, Set<Component> components, boolean validate)
+    {
+        if (validate)
+        {
+            Set<Component> requiredComponents = descriptor.formatType.info.requiredComponents();
+            // Minimum components without which we can't do anything
+            assert components.containsAll(requiredComponents) : String.format("Required components %s missing for sstable %s", Sets.difference(requiredComponents, components), descriptor);
+        }
+        else
+        {
+            // Scrub-only case, we just need data file.
+            assert components.contains(Component.DATA);
+        }
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
index 24edf70d1c24..4b2248612ba9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
@@ -18,36 +18,36 @@
 
 package org.apache.cassandra.io.sstable.format;
 
+import java.io.BufferedInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.ImmutableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.cache.ChunkCache;
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.format.big.BigTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
 import org.apache.cassandra.io.util.DiskOptimizationStrategy;
 import org.apache.cassandra.io.util.FileHandle;
 import org.apache.cassandra.io.util.FileUtils;
-import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.utils.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Set;
-import java.util.concurrent.TimeUnit;
-
-import com.google.common.collect.ImmutableMap;
 
 public abstract class SSTableReaderBuilder
 {
@@ -91,13 +91,15 @@ public SSTableReaderBuilder(Descriptor descriptor,
 
     public abstract SSTableReader build();
 
-    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor)
+    @SuppressWarnings("resource")
+    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor, Component component)
     {
-        return new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
+        return new FileHandle.Builder(descriptor.filenameFor(component))
                 .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
                 .withChunkCache(ChunkCache.instance);
     }
 
+    @SuppressWarnings("resource")
     public static FileHandle.Builder defaultDataHandleBuilder(Descriptor descriptor)
     {
         return new FileHandle.Builder(descriptor.filenameFor(Component.DATA))
@@ -178,12 +180,11 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter,
                 bf = FilterFactory.getFilter(estimatedKeys, metadata.params.bloomFilterFpChance);
 
             // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
-            try (KeyIterator keyIterator = new KeyIterator(indexIterator, metadata.partitioner);
-                 IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL))
+            try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL))
             {
-                while (keyIterator.hasNext())
+                while (!indexIterator.isExhausted())
                 {
-                    DecoratedKey decoratedKey = keyIterator.next();
+                    DecoratedKey decoratedKey = metadata.partitioner.decorateKey(indexIterator.key());
 
                     if (!summaryLoaded)
                     {
@@ -191,11 +192,13 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter,
                             first = decoratedKey;
                         last = decoratedKey;
 
-                        summaryBuilder.maybeAddEntry(decoratedKey, keyIterator.getKeyPosition());
+                        summaryBuilder.maybeAddEntry(decoratedKey, indexIterator.keyPosition());
                     }
 
                     if (recreateBloomFilter)
                         bf.add(decoratedKey);
+
+                    indexIterator.advance();
                 }
 
                 if (!summaryLoaded)
@@ -275,7 +278,7 @@ public SSTableReaderBuilder.ForWriter summary(IndexSummary summary)
         @Override
         public SSTableReader build()
         {
-            SSTableReader reader = readerFactory.open(this);
+            SSTableReader reader = new BigTableReader(this);
 
             reader.setup(true);
             return reader;
@@ -304,7 +307,7 @@ public SSTableReader build()
             initSummary(dataFilePath, components, statsMetadata);
 
             boolean compression = components.contains(Component.COMPRESSION_INFO);
-            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor);
+            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor, Component.PRIMARY_INDEX);
                  FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression))
             {
                 long indexFileLength = new File(descriptor.filenameFor(Component.PRIMARY_INDEX)).length();
@@ -315,7 +318,7 @@ public SSTableReader build()
                 dfile = dbuilder.bufferSize(dataBufferSize).complete();
                 bf = FilterFactory.AlwaysPresent;
 
-                SSTableReader sstable = readerFactory.open(this);
+                SSTableReader sstable = new BigTableReader(this);
 
                 sstable.first = first;
                 sstable.last = last;
@@ -380,7 +383,7 @@ public SSTableReader build()
                 throw new CorruptSSTableException(t, dataFilePath);
             }
 
-            SSTableReader sstable = readerFactory.open(this);
+            SSTableReader sstable = new BigTableReader(this);
 
             sstable.first = first;
             sstable.last = last;
@@ -444,7 +447,7 @@ void load(boolean recreateBloomFilter,
                   Set<Component> components) throws IOException
         {
             boolean compression = components.contains(Component.COMPRESSION_INFO);
-            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor);
+            try (FileHandle.Builder ibuilder = defaultIndexHandleBuilder(descriptor, Component.PRIMARY_INDEX);
                  FileHandle.Builder dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression))
             {
                 loadSummary();
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
index 0b34fa4b8d32..db5d0398d3c1 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReadsListener.java
@@ -17,8 +17,6 @@
  */
 package org.apache.cassandra.io.sstable.format;
 
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
-
 /**
  * Listener for receiving notifications associated with reading SSTables.
  */
@@ -66,7 +64,7 @@ default void onSSTableSkipped(SSTableReader sstable, SkippingReason reason)
      * @param indexEntry the index entry
      * @param reason the reason for which the SSTable has been selected
      */
-    default void onSSTableSelected(SSTableReader sstable, RowIndexEntry<?> indexEntry, SelectionReason reason)
+    default void onSSTableSelected(SSTableReader sstable, RowIndexEntry indexEntry, SelectionReason reason)
     {
     }
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 14d976b7b4bc..1e807dbb7ed1 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -19,14 +19,18 @@
 package org.apache.cassandra.io.sstable.format;
 
 import java.util.*;
+import java.util.function.Consumer;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Sets;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.index.Index;
@@ -34,7 +38,6 @@
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -43,6 +46,7 @@
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
 /**
@@ -53,6 +57,8 @@
  */
 public abstract class SSTableWriter extends SSTable implements Transactional
 {
+    private static final Logger logger = LoggerFactory.getLogger(SSTableWriter.class);
+
     protected long repairedAt;
     protected UUID pendingRepair;
     protected boolean isTransient;
@@ -74,6 +80,7 @@ protected abstract class TransactionalProxy extends AbstractTransactional
     }
 
     protected SSTableWriter(Descriptor descriptor,
+                            Set<Component> components,
                             long keyCount,
                             long repairedAt,
                             UUID pendingRepair,
@@ -81,10 +88,9 @@ protected SSTableWriter(Descriptor descriptor,
                             TableMetadataRef metadata,
                             MetadataCollector metadataCollector,
                             SerializationHeader header,
-                            Collection<SSTableFlushObserver> observers,
-                            Set<Component> indexComponents)
+                            Collection<SSTableFlushObserver> observers)
     {
-        super(descriptor, components(metadata.getLocal(), indexComponents), metadata, DatabaseDescriptor.getDiskOptimizationStrategy());
+        super(descriptor, components, metadata, DatabaseDescriptor.getDiskOptimizationStrategy());
         this.keyCount = keyCount;
         this.repairedAt = repairedAt;
         this.pendingRepair = pendingRepair;
@@ -166,7 +172,10 @@ public static SSTableWriter create(Descriptor descriptor,
         return create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, 0, header, indexGroups, lifecycleNewTracker);
     }
 
-    private static Set<Component> components(TableMetadata metadata, Collection<Component> indexComponents)
+    /**
+     * BigTable SSTable components. Should be moved to BigTableWriter but is left here for painless upstream merges.
+     */
+    public static Set<Component> bigTableComponents(TableMetadata metadata, Collection<Component> indexComponents)
     {
         Set<Component> components = new HashSet<Component>(Arrays.asList(Component.DATA,
                 Component.PRIMARY_INDEX,
@@ -227,7 +236,7 @@ private static Collection<SSTableFlushObserver> observers(Descriptor descriptor,
      *
      * @throws FSWriteError if a write to the dataFile fails
      */
-    public abstract BigTableRowIndexEntry append(UnfilteredRowIterator iterator);
+    public abstract RowIndexEntry append(UnfilteredRowIterator iterator);
 
     public abstract long getFilePointer();
 
@@ -262,7 +271,7 @@ public SSTableWriter setOpenResult(boolean openResult)
     /**
      * Open the resultant SSTableReader before it has been fully written
      */
-    public abstract SSTableReader openEarly();
+    public abstract boolean openEarly(Consumer<SSTableReader> callWhenReady);
 
     /**
      * Open the resultant SSTableReader once it has been fully written, but before the
@@ -373,7 +382,8 @@ public static void rename(Descriptor tmpdesc, Descriptor newdesc, Set<Component>
         FileUtils.renameWithConfirm(tmpdesc.filenameFor(Component.DATA), newdesc.filenameFor(Component.DATA));
 
         // rename it without confirmation because summary can be available for loadNewSSTables but not for closeAndOpenReader
-        FileUtils.renameWithOutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
+        if (components.contains(Component.SUMMARY))
+            FileUtils.renameWithOutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
     }
 
     public static void copy(Descriptor tmpdesc, Descriptor newdesc, Set<Component> components)
@@ -387,7 +397,8 @@ public static void copy(Descriptor tmpdesc, Descriptor newdesc, Set<Component> c
         FileUtils.copyWithConfirm(tmpdesc.filenameFor(Component.DATA), newdesc.filenameFor(Component.DATA));
 
         // copy it without confirmation because summary can be available for loadNewSSTables but not for closeAndOpenReader
-        FileUtils.copyWithOutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
+        if (components.contains(Component.SUMMARY))
+            FileUtils.copyWithOutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
     }
 
     public static void hardlink(Descriptor tmpdesc, Descriptor newdesc, Set<Component> components)
@@ -401,7 +412,8 @@ public static void hardlink(Descriptor tmpdesc, Descriptor newdesc, Set<Componen
         FileUtils.createHardLinkWithConfirm(tmpdesc.filenameFor(Component.DATA), newdesc.filenameFor(Component.DATA));
 
         // copy it without confirmation because summary can be available for loadNewSSTables but not for closeAndOpenReader
-        FileUtils.createHardLinkWithoutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
+        if (components.contains(Component.SUMMARY))
+            FileUtils.createHardLinkWithoutConfirm(tmpdesc.filenameFor(Component.SUMMARY), newdesc.filenameFor(Component.SUMMARY));
     }
 
     public interface SSTableSizeParameters
@@ -427,4 +439,13 @@ public abstract SSTableWriter open(Descriptor descriptor,
                                            LifecycleNewTracker lifecycleNewTracker,
                                            Set<Component> indexComponents);
     }
+
+    protected void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
+    {
+        if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
+        {
+            String keyString = metadata().partitionKeyType.getString(key.getKey());
+            logger.warn("Writing large partition {}/{}:{} ({}) to sstable {}", metadata.keyspace, metadata.name, keyString, FBUtilities.prettyPrintMemory(rowSize), getFilename());
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
similarity index 84%
rename from src/java/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriter.java
rename to src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
index f05ea94cb7ea..90f788db0646 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
@@ -15,7 +15,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.cassandra.io.sstable.format.big;
+package org.apache.cassandra.io.sstable.format;
 
 import java.io.EOFException;
 import java.io.File;
@@ -39,7 +39,6 @@
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableMultiWriter;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.SequentialWriter;
 import org.apache.cassandra.io.util.SequentialWriterOption;
@@ -50,11 +49,10 @@
 import static java.lang.String.format;
 import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory;
 
-public class BigTableZeroCopyWriter extends SSTable implements SSTableMultiWriter
+public class SSTableZeroCopyWriter extends SSTable implements SSTableMultiWriter
 {
-    private static final Logger logger = LoggerFactory.getLogger(BigTableZeroCopyWriter.class);
+    private static final Logger logger = LoggerFactory.getLogger(SSTableZeroCopyWriter.class);
 
-    private final TableMetadataRef metadata;
     private volatile SSTableReader finalReader;
     private final Map<Component.Type, SequentialWriter> componentWriters;
 
@@ -65,30 +63,19 @@ public class BigTableZeroCopyWriter extends SSTable implements SSTableMultiWrite
                               .bufferType(BufferType.OFF_HEAP)
                               .build();
 
-    private static final ImmutableSet<Component> SUPPORTED_COMPONENTS =
-        ImmutableSet.of(Component.DATA,
-                        Component.PRIMARY_INDEX,
-                        Component.SUMMARY,
-                        Component.STATS,
-                        Component.COMPRESSION_INFO,
-                        Component.FILTER,
-                        Component.DIGEST,
-                        Component.CRC);
-
-    public BigTableZeroCopyWriter(Descriptor descriptor,
-                                  TableMetadataRef metadata,
-                                  LifecycleNewTracker lifecycleNewTracker,
-                                  final Collection<Component> components)
+    public SSTableZeroCopyWriter(Descriptor descriptor,
+                                 TableMetadataRef metadata,
+                                 LifecycleNewTracker lifecycleNewTracker,
+                                 final Collection<Component> components)
     {
         super(descriptor, ImmutableSet.copyOf(components), metadata, DatabaseDescriptor.getDiskOptimizationStrategy());
 
         lifecycleNewTracker.trackNew(this);
-        this.metadata = metadata;
         this.componentWriters = new EnumMap<>(Component.Type.class);
 
-        if (!SUPPORTED_COMPONENTS.containsAll(components))
+        if (!descriptor.getFormat().streamingComponents().containsAll(components))
             throw new AssertionError(format("Unsupported streaming component detected %s",
-                                            Sets.difference(ImmutableSet.copyOf(components), SUPPORTED_COMPONENTS)));
+                                            Sets.difference(ImmutableSet.copyOf(components), descriptor.getFormat().streamingComponents())));
 
         for (Component c : components)
             componentWriters.put(c.type, makeWriter(descriptor, c));
@@ -145,7 +132,7 @@ public Collection<SSTableReader> finish(boolean openResult)
     public Collection<SSTableReader> finished()
     {
         if (finalReader == null)
-            finalReader = SSTableReader.open(descriptor, components, metadata);
+            finalReader = descriptor.getFormat().getReaderFactory().open(descriptor, components, metadata);
 
         return ImmutableList.of(finalReader);
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/ScrubPartitionIterator.java b/src/java/org/apache/cassandra/io/sstable/format/ScrubPartitionIterator.java
new file mode 100644
index 000000000000..f5f8800b161a
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/ScrubPartitionIterator.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+/**
+ * Iterator over the partitions of an sstable used for scrubbing.
+ *
+ * The difference between this and PartitionIndexIterator is that this only uses information present in the index file
+ * and does not try to read keys of the data file (for the trie index format), thus key() can be null.
+ *
+ * The iterator starts already advanced to a position; advance() moves to the next one, and iteration is complete when dataPosition() == -1.
+ */
+public interface ScrubPartitionIterator extends Closeable
+{
+    /**
+     * Serialized partition key or {@code null} if the iterator reached the end of the index or if the key may not
+     * be fully retrieved from the index file.
+     * @return the serialized partition key, or {@code null} in the cases described above
+     */
+    ByteBuffer key();
+
+    /**
+     * Key position in data file or -1 if the iterator reached the end of the index.
+     */
+    long dataPosition();
+
+    /**
+     * Move to the next position in the index file.
+     */
+    void advance() throws IOException;
+
+    boolean isExhausted();
+
+    void close();
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/sstable/format/Version.java b/src/java/org/apache/cassandra/io/sstable/format/Version.java
index 7ca5bc09b09c..4b115cae054f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/Version.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/Version.java
@@ -126,5 +126,14 @@ public int hashCode()
         return version != null ? version.hashCode() : 0;
     }
 
+    // the fields below are present only in DSE but we do not use them here; though in order to be able to read
+    // DSE sstables we need to at least skip that data
+    public abstract boolean hasZeroCopyMetadata();
+
+    public abstract boolean hasIncrementalNodeSyncMetadata();
+
+    // TODO TBD
+    public abstract boolean hasMaxColumnValueLengths();
+
     public abstract boolean hasOriginatingHostId();
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
deleted file mode 100644
index 4ff6a727c12b..000000000000
--- a/src/java/org/apache/cassandra/io/sstable/format/big/AbstractBigTableIterator.java
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.io.sstable.format.big;
-
-import java.io.IOException;
-
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.DeletionTime;
-import org.apache.cassandra.db.Slices;
-import org.apache.cassandra.db.UnfilteredDeserializer;
-import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
-import org.apache.cassandra.db.filter.ColumnFilter;
-import org.apache.cassandra.db.rows.RangeTombstoneMarker;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
-import org.apache.cassandra.io.util.FileHandle;
-
-public abstract class AbstractBigTableIterator extends AbstractSSTableIterator<BigTableRowIndexEntry>
-{
-    protected AbstractBigTableIterator(SSTableReader sstable,
-                                       FileDataInput file,
-                                       DecoratedKey key,
-                                       BigTableRowIndexEntry indexEntry,
-                                       Slices slices,
-                                       ColumnFilter columnFilter,
-                                       FileHandle ifile)
-    {
-        super(sstable, file, key, indexEntry, slices, columnFilter, ifile);
-    }
-
-    protected abstract class RowReader extends Reader {
-        protected UnfilteredDeserializer deserializer;
-
-        // Records the currently open range tombstone (if any)
-        protected DeletionTime openMarker;
-
-        protected RowReader(FileDataInput file, boolean shouldCloseFile)
-        {
-            super(file, shouldCloseFile);
-
-            if (file != null)
-                createDeserializer();
-        }
-
-        private void createDeserializer()
-        {
-            assert file != null && deserializer == null;
-            deserializer = UnfilteredDeserializer.create(metadata, file, sstable.header, helper);
-        }
-
-        public void seekToPosition(long position) throws IOException
-        {
-            // This may be the first time we're actually looking into the file
-            if (file == null)
-            {
-                file = sstable.getFileDataInput(position);
-                createDeserializer();
-            }
-            else
-            {
-                file.seek(position);
-            }
-        }
-
-        protected void updateOpenMarker(RangeTombstoneMarker marker)
-        {
-            // Note that we always read index blocks in forward order so this method is always called in forward order
-            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : null;
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 5e5888480a8f..1a6a236c3746 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -22,6 +22,9 @@
 import java.util.Set;
 import java.util.UUID;
 
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -35,6 +38,8 @@
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.net.MessagingService;
 
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultIndexHandleBuilder;
+
 /**
  * Legacy bigtable format
  */
@@ -45,6 +50,29 @@ public class BigFormat implements SSTableFormat
     private static final SSTableReader.Factory readerFactory = new ReaderFactory();
     private static final SSTableWriter.Factory writerFactory = new WriterFactory();
 
+    private final static Set<Component> REQUIRED_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                              Component.PRIMARY_INDEX,
+                                                                              Component.STATS);
+
+    private final static Set<Component> SUPPORTED_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                               Component.PRIMARY_INDEX,
+                                                                               Component.FILTER,
+                                                                               Component.COMPRESSION_INFO,
+                                                                               Component.STATS,
+                                                                               Component.DIGEST,
+                                                                               Component.CRC,
+                                                                               Component.SUMMARY,
+                                                                               Component.TOC);
+
+    private final static Set<Component> STREAMING_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                               Component.PRIMARY_INDEX,
+                                                                               Component.SUMMARY,
+                                                                               Component.STATS,
+                                                                               Component.COMPRESSION_INFO,
+                                                                               Component.FILTER,
+                                                                               Component.DIGEST,
+                                                                               Component.CRC);
+
     private BigFormat()
     {
 
@@ -80,6 +108,24 @@ public SSTableReader.Factory getReaderFactory()
         return readerFactory;
     }
 
+    @Override
+    public Set<Component> requiredComponents()
+    {
+        return REQUIRED_COMPONENTS;
+    }
+
+    @Override
+    public Set<Component> supportedComponents()
+    {
+        return SUPPORTED_COMPONENTS;
+    }
+
+    @Override
+    public Set<Component> streamingComponents()
+    {
+        return STREAMING_COMPONENTS;
+    }
+
     static class WriterFactory extends SSTableWriter.Factory
     {
         @Override
@@ -109,18 +155,12 @@ public SSTableWriter open(Descriptor descriptor,
         }
     }
 
-    static class ReaderFactory implements SSTableReader.Factory
+    static class ReaderFactory extends SSTableReader.AbstractBigTableReaderFactory
     {
-        @Override
-        public SSTableReader open(SSTableReaderBuilder builder)
-        {
-            return new BigTableReader(builder);
-        }
-
         @Override
         public PartitionIndexIterator indexIterator(Descriptor descriptor, TableMetadata metadata)
         {
-            try (FileHandle iFile = SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor).complete()) {
+            try (FileHandle iFile = defaultIndexHandleBuilder(descriptor, Component.PRIMARY_INDEX).complete()) {
                 SerializationHeader.Component headerComponent = (SerializationHeader.Component)
                                                                 descriptor.getMetadataSerializer()
                                                                           .deserialize(descriptor, MetadataType.HEADER);
@@ -215,6 +255,7 @@ public boolean hasCommitLogIntervals()
             return hasCommitLogIntervals;
         }
 
+        @Override
         public boolean hasPendingRepair()
         {
             return hasPendingRepair;
@@ -256,6 +297,7 @@ public boolean hasPartitionLevelDeletionsPresenceMarker()
             return hasPartitionLevelDeletionPresenceMarker;
         }
 
+        @Override
         public boolean isCompatible()
         {
             return version.compareTo(earliest_supported_version) >= 0 && version.charAt(0) <= current_version.charAt(0);
@@ -283,5 +325,24 @@ public boolean hasOldBfFormat()
         {
             return hasOldBfFormat;
         }
+
+        @Override
+        public boolean hasZeroCopyMetadata()
+        {
+            return false;
+        }
+
+        @Override
+        public boolean hasIncrementalNodeSyncMetadata()
+        {
+            return false;
+        }
+
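+        // TODO: to be decided (presumably whether this format version should report max column value lengths); returns false for now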
+        @Override
+        public boolean hasMaxColumnValueLengths()
+        {
+            return false;
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
index 846f00809cae..180f69073ef7 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTablePartitionIndexIterator.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 @NotThreadSafe
+// TODO STAR-247: implement unit test
 public class BigTablePartitionIndexIterator implements PartitionIndexIterator
 {
     private final FileHandle indexFile;
@@ -38,6 +39,7 @@ public class BigTablePartitionIndexIterator implements PartitionIndexIterator
 
     private ByteBuffer key;
     private long dataPosition;
+    private long keyPosition;
 
     private BigTablePartitionIndexIterator(FileHandle indexFile,
                                            RandomAccessReader reader,
@@ -100,6 +102,7 @@ public void close()
     {
         key = null;
         dataPosition = -1;
+        keyPosition = -1;
         FileUtils.closeQuietly(reader);
         FileUtils.closeQuietly(indexFile);
     }
@@ -109,12 +112,14 @@ public boolean advance() throws IOException
     {
         if (!reader.isEOF())
         {
+            keyPosition = reader.getFilePointer();
             key = ByteBufferUtil.readWithShortLength(reader);
             dataPosition = rowIndexEntrySerializer.deserializePositionAndSkip(reader);
             return true;
         }
         else
         {
+            keyPosition = -1;
             dataPosition = -1;
             key = null;
             return false;
@@ -133,6 +138,12 @@ public ByteBuffer key()
         return key;
     }
 
+    @Override
+    public long keyPosition()
+    {
+        return keyPosition;
+    }
+
     @Override
     public long dataPosition()
     {
@@ -152,6 +163,7 @@ public void indexPosition(long position) throws IOException
             throw new IndexOutOfBoundsException("The requested position exceeds the index length");
         reader.seek(position);
         key = null;
+        keyPosition = 0;
         dataPosition = 0;
         advance();
     }
@@ -167,7 +179,14 @@ public void reset() throws IOException
     {
         reader.seek(initialPosition);
         key = null;
+        keyPosition = 0;
         dataPosition = 0;
         advance();
     }
+
+    @Override
+    public String toString()
+    {
+        return String.format("BigTable-PartitionIndexIterator(%s)", indexFile.path());
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
index d91267830c8c..11c9b1a90424 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
@@ -17,17 +17,21 @@
  */
 package org.apache.cassandra.io.sstable.format.big;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
-import java.util.*;
-import java.util.function.Supplier;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
 
-import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
-import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.Slices;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.Rows;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
@@ -35,12 +39,19 @@
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SelectionReason;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SkippingReason;
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
 import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -54,12 +65,23 @@ public class BigTableReader extends SSTableReader
 
     protected final BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer;
 
-    BigTableReader(SSTableReaderBuilder builder)
+    @Override
+    public boolean hasIndex()
+    {
+        return new File(descriptor.filenameFor(Component.PRIMARY_INDEX)).exists();
+    }
+
+    public BigTableReader(SSTableReaderBuilder builder)
     {
         super(builder);
         this.rowIndexEntrySerializer = new BigTableRowIndexEntry.Serializer(descriptor.version, header);
     }
 
+    @Override
+    public void setup(boolean trackHotness) {
+        tidy.setup(this, trackHotness, Arrays.asList(bf, indexSummary, dfile, ifile));
+        super.setup(trackHotness);
+    }
     @Override
     public PartitionIndexIterator allKeysIterator() throws IOException
     {
@@ -130,12 +152,12 @@ public ISSTableScanner getScanner(Collection<Range<Token>> ranges)
 
     @SuppressWarnings("resource") // caller to close
     @Override
-    public UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly)
+    public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, boolean tombstoneOnly)
     {
         BigTableRowIndexEntry position = getPosition(key, SSTableReader.Operator.EQ, true, false, SSTableReadsListener.NOOP_LISTENER);
         if (position == null)
             return null;
-        return SSTableIdentityIterator.create(this, dfile.get(), position, key, tombstoneOnly);
+        return SSTableIdentityIterator.create(this, dfile, position, key, tombstoneOnly);
     }
 
     /**
@@ -144,6 +166,7 @@ public UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, Decor
      * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
      * @param updateCacheAndStats true if updating stats and cache
      * @return The index entry corresponding to the key, or null if the key is not present
+     * @param permitMatchPastLast TODO: this is always set to false, so the parameter should be removed
      */
     protected BigTableRowIndexEntry getPosition(PartitionPosition key,
                                                 Operator op,
@@ -299,29 +322,25 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
         return null;
     }
 
-
-    @Override
-    public DecoratedKey keyAt(long indexPosition) throws IOException
-    {
-        try (FileDataInput in = ifile.createReader(indexPosition))
-        {
-            return keyAt(in);
-        }
-    }
-
     @Override
     public DecoratedKey keyAt(FileDataInput reader) throws IOException
     {
         if (reader.isEOF()) return null;
 
-        DecoratedKey key = decorateKey(ByteBufferUtil.readWithShortLength(reader));
+        return decorateKey(ByteBufferUtil.readWithShortLength(reader));
+    }
 
-        // hint read path about key location if caching is enabled
-        // this saves index summary lookup and index file iteration which whould be pretty costly
-        // especially in presence of promoted column indexes
-        if (isKeyCacheEnabled())
-            cacheKey(key, rowIndexEntrySerializer.deserialize(reader));
+    @Override
+    public RandomAccessReader openKeyComponentReader()
+    {
+        return openIndexReader();
+    }
 
-        return key;
+    @Override
+    public ScrubPartitionIterator scrubPartitionsIterator() throws IOException
+    {
+        if (ifile == null)
+            return null;
+        return new ScrubIterator(ifile, rowIndexEntrySerializer);
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
index dc8e91ec93e3..66b0b4509b5f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableRowIndexEntry.java
@@ -127,7 +127,7 @@
  * </p>
  *
  */
-public class BigTableRowIndexEntry extends RowIndexEntry<IndexInfo> implements IMeasurableMemory
+public class BigTableRowIndexEntry extends RowIndexEntry implements IMeasurableMemory
 {
     public static final long EMPTY_SIZE = ObjectSizes.measure(new BigTableRowIndexEntry(0));
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
index 637dab9290b3..f66342f2d063 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java
@@ -17,26 +17,44 @@
  */
 package org.apache.cassandra.io.sstable.format.big;
 
-import java.io.*;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.nio.BufferOverflowException;
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
 
-import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.cache.ChunkCache;
-import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.PartitionSerializationException;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundaryMarker;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.compress.CompressedSequentialWriter;
 import org.apache.cassandra.io.compress.ICompressor;
-import org.apache.cassandra.io.sstable.*;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.Downsampling;
+import org.apache.cassandra.io.sstable.IndexSummary;
+import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
+import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
@@ -45,10 +63,23 @@
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
-import org.apache.cassandra.io.util.*;
+import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
+import org.apache.cassandra.io.util.ChecksummedSequentialWriter;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.io.util.SequentialWriterOption;
 import org.apache.cassandra.schema.CompressionParams;
 import org.apache.cassandra.schema.TableMetadataRef;
-import org.apache.cassandra.utils.*;
+import org.apache.cassandra.utils.BloomFilter;
+import org.apache.cassandra.utils.BloomFilterSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.SyncUtil;
 import org.apache.cassandra.utils.concurrent.Transactional;
 
 public class BigTableWriter extends SSTableWriter
@@ -81,12 +112,12 @@ public BigTableWriter(Descriptor descriptor,
                           LifecycleNewTracker lifecycleNewTracker,
                           Set<Component> indexComponents)
     {
-        super(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers, indexComponents);
+        super(descriptor, bigTableComponents(metadata.getLocal(), indexComponents), keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers);
         lifecycleNewTracker.trackNew(this); // must track before any files are created
 
         if (compression)
         {
-            final CompressionParams compressionParams = compressionFor(lifecycleNewTracker.opType());
+            final CompressionParams compressionParams = compressionFor(lifecycleNewTracker.opType(), metadata);
 
             dataFile = new CompressedSequentialWriter(new File(getFilename()),
                                              descriptor.filenameFor(Component.COMPRESSION_INFO),
@@ -114,7 +145,8 @@ public BigTableWriter(Descriptor descriptor,
      * @param opType
      * @return {@link org.apache.cassandra.schema.CompressionParams}
      */
-    private CompressionParams compressionFor(final OperationType opType)
+    // TODO STAR-247: this method should be pulled up to SSTable
+    public static CompressionParams compressionFor(final OperationType opType, TableMetadataRef metadata)
     {
         CompressionParams compressionParams = metadata.getLocal().params.compression;
         final ICompressor compressor = compressionParams.getSstableCompressor();
@@ -246,21 +278,12 @@ public BigTableRowIndexEntry append(UnfilteredRowIterator iterator)
         }
     }
 
-    private void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
-    {
-        if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
-        {
-            String keyString = metadata().partitionKeyType.getString(key.getKey());
-            logger.warn("Writing large partition {}/{}:{} ({}) to sstable {}", metadata.keyspace, metadata.name, keyString, FBUtilities.prettyPrintMemory(rowSize), getFilename());
-        }
-    }
-
-    private static class StatsCollector extends Transformation
+    // TODO: Move this class to the parent SSTableWriter or to a separate file, as it seems generic enough
+    public static class StatsCollector extends Transformation
     {
         private final MetadataCollector collector;
-        private int cellCount;
 
-        StatsCollector(MetadataCollector collector)
+        public StatsCollector(MetadataCollector collector)
         {
             this.collector = collector;
         }
@@ -269,7 +292,7 @@ private static class StatsCollector extends Transformation
         public Row applyToStatic(Row row)
         {
             if (!row.isEmpty())
-                cellCount += Rows.collectStats(row, collector);
+                Rows.collectStats(row, collector);
             return row;
         }
 
@@ -277,7 +300,7 @@ public Row applyToStatic(Row row)
         public Row applyToRow(Row row)
         {
             collector.updateClusteringValues(row.clustering());
-            cellCount += Rows.collectStats(row, collector);
+            Rows.collectStats(row, collector);
             return row;
         }
 
@@ -301,7 +324,7 @@ public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
         @Override
         public void onPartitionClose()
         {
-            collector.addCellPerPartitionCount(cellCount);
+            collector.addCellPerPartitionCount();
         }
 
         @Override
@@ -313,12 +336,12 @@ public DeletionTime applyToDeletion(DeletionTime deletionTime)
     }
 
     @SuppressWarnings("resource")
-    public SSTableReader openEarly()
+    public boolean openEarly(Consumer<SSTableReader> callWhenReady)
     {
         // find the max (exclusive) readable key
         IndexSummaryBuilder.ReadableBoundary boundary = iwriter.getMaxReadable();
         if (boundary == null)
-            return null;
+            return false;
 
         StatsMetadata stats = statsMetadata();
         assert boundary.indexLength > 0 && boundary.dataLength > 0;
@@ -332,7 +355,7 @@ public SSTableReader openEarly()
         int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
         FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete(boundary.dataLength);
         invalidateCacheAtBoundary(dfile);
-        SSTableReader sstable = SSTableReader.internalOpen(descriptor,
+        SSTableReader sstable = BigTableReader.internalOpen(descriptor,
                                                            components, metadata,
                                                            ifile, dfile,
                                                            indexSummary,
@@ -345,7 +368,8 @@ public SSTableReader openEarly()
         // now it's open, find the ACTUAL last readable key (i.e. for which the data file has also been flushed)
         sstable.first = getMinimalKey(first);
         sstable.last = getMinimalKey(boundary.lastKey);
-        return sstable;
+        callWhenReady.accept(sstable);
+        return true;
     }
 
     void invalidateCacheAtBoundary(FileHandle dfile)
@@ -485,7 +509,7 @@ class IndexWriter extends AbstractTransactional implements Transactional
         IndexWriter(long keyCount)
         {
             indexFile = new SequentialWriter(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)), writerOption);
-            builder = SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor);
+            builder = SSTableReaderBuilder.defaultIndexHandleBuilder(descriptor, Component.PRIMARY_INDEX);
             summary = new IndexSummaryBuilder(keyCount, metadata().params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL);
             bf = FilterFactory.getFilter(keyCount, metadata().params.bloomFilterFpChance);
             // register listeners to be alerted when the data files are flushed
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java b/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
index 857b251b7a0c..ec694e4ad501 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/IndexState.java
@@ -24,7 +24,7 @@
 import org.apache.cassandra.db.ClusteringBound;
 import org.apache.cassandra.db.ClusteringComparator;
 import org.apache.cassandra.db.ClusteringPrefix;
-import org.apache.cassandra.io.sstable.format.big.AbstractBigTableIterator.RowReader;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator.RowReader;
 import org.apache.cassandra.io.util.DataPosition;
 import org.apache.cassandra.io.util.FileHandle;
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
index 40d5d6fa9544..c2f50f6bdf7f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableIterator.java
@@ -18,11 +18,11 @@
 package org.apache.cassandra.io.sstable.format.big;
 
 import java.io.IOException;
-import java.util.NoSuchElementException;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileHandle;
@@ -30,7 +30,7 @@
 /**
  *  A Cell Iterator over SSTable
  */
-public class SSTableIterator extends AbstractBigTableIterator
+public class SSTableIterator extends AbstractSSTableIterator<BigTableRowIndexEntry>
 {
     /**
      * The index of the slice being processed.
@@ -73,139 +73,6 @@ public boolean isReverseOrder()
         return false;
     }
 
-    private class ForwardReader extends RowReader
-    {
-        // The start of the current slice. This will be null as soon as we know we've passed that bound.
-        protected ClusteringBound<?> start;
-        // The end of the current slice. Will never be null.
-        protected ClusteringBound<?> end = BufferClusteringBound.TOP;
-
-        protected Unfiltered next; // the next element to return: this is computed by hasNextInternal().
-
-        protected boolean sliceDone; // set to true once we know we have no more result for the slice. This is in particular
-                                     // used by the indexed reader when we know we can't have results based on the index.
-
-        private ForwardReader(FileDataInput file, boolean shouldCloseFile)
-        {
-            super(file, shouldCloseFile);
-        }
-
-        public void setForSlice(Slice slice) throws IOException
-        {
-            start = slice.start().isBottom() ? null : slice.start();
-            end = slice.end();
-
-            sliceDone = false;
-            next = null;
-        }
-
-        // Skip all data that comes before the currently set slice.
-        // Return what should be returned at the end of this, or null if nothing should.
-        private Unfiltered handlePreSliceData() throws IOException
-        {
-            assert deserializer != null;
-
-            // Note that the following comparison is not strict. The reason is that the only cases
-            // where it can be == is if the "next" is a RT start marker (either a '[' of a ')[' boundary),
-            // and if we had a strict inequality and an open RT marker before this, we would issue
-            // the open marker first, and then return then next later, which would send in the
-            // stream both '[' (or '(') and then ')[' for the same clustering value, which is wrong.
-            // By using a non-strict inequality, we avoid that problem (if we do get ')[' for the same
-            // clustering value than the slice, we'll simply record it in 'openMarker').
-            while (deserializer.hasNext() && deserializer.compareNextTo(start) <= 0)
-            {
-                if (deserializer.nextIsRow())
-                    deserializer.skipNext();
-                else
-                    updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
-            }
-
-            ClusteringBound<?> sliceStart = start;
-            start = null;
-
-            // We've reached the beginning of our queried slice. If we have an open marker
-            // we should return that first.
-            if (openMarker != null)
-                return new RangeTombstoneBoundMarker(sliceStart, openMarker);
-
-            return null;
-        }
-
-        // Compute the next element to return, assuming we're in the middle to the slice
-        // and the next element is either in the slice, or just after it. Returns null
-        // if we're done with the slice.
-        protected Unfiltered computeNext() throws IOException
-        {
-            assert deserializer != null;
-
-            while (true)
-            {
-                // We use a same reasoning as in handlePreSliceData regarding the strictness of the inequality below.
-                // We want to exclude deserialized unfiltered equal to end, because 1) we won't miss any rows since those
-                // woudn't be equal to a slice bound and 2) a end bound can be equal to a start bound
-                // (EXCL_END(x) == INCL_START(x) for instance) and in that case we don't want to return start bound because
-                // it's fundamentally excluded. And if the bound is a  end (for a range tombstone), it means it's exactly
-                // our slice end, but in that  case we will properly close the range tombstone anyway as part of our "close
-                // an open marker" code in hasNextInterna
-                if (!deserializer.hasNext() || deserializer.compareNextTo(end) >= 0)
-                    return null;
-
-                Unfiltered next = deserializer.readNext();
-                UnfilteredValidation.maybeValidateUnfiltered(next, metadata(), key, sstable);
-                // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
-                if (next.isEmpty())
-                    continue;
-
-                if (next.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
-                    updateOpenMarker((RangeTombstoneMarker) next);
-                return next;
-            }
-        }
-
-        protected boolean hasNextInternal() throws IOException
-        {
-            if (next != null)
-                return true;
-
-            if (sliceDone)
-                return false;
-
-            if (start != null)
-            {
-                Unfiltered unfiltered = handlePreSliceData();
-                if (unfiltered != null)
-                {
-                    next = unfiltered;
-                    return true;
-                }
-            }
-
-            next = computeNext();
-            if (next != null)
-                return true;
-
-            // for current slice, no data read from deserialization
-            sliceDone = true;
-            // If we have an open marker, we should not close it, there could be more slices
-            if (openMarker != null)
-            {
-                next = new RangeTombstoneBoundMarker(end, openMarker);
-                return true;
-            }
-            return false;
-        }
-
-        protected Unfiltered nextInternal() throws IOException
-        {
-            if (!hasNextInternal())
-                throw new NoSuchElementException();
-
-            Unfiltered toReturn = next;
-            next = null;
-            return toReturn;
-        }
-    }
-
     private class ForwardIndexedReader extends ForwardReader
     {
         private final IndexState indexState;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
index 64f4325f981e..9431e8c1d483 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/SSTableReversedIterator.java
@@ -24,6 +24,7 @@
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.db.rows.*;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileHandle;
@@ -34,7 +35,7 @@
 /**
  *  A Cell Iterator in reversed clustering order over SSTable
  */
-public class SSTableReversedIterator extends AbstractBigTableIterator
+public class SSTableReversedIterator extends AbstractSSTableIterator<BigTableRowIndexEntry>
 {
     /**
      * The index of the slice being processed.
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/ScrubIterator.java b/src/java/org/apache/cassandra/io/sstable/format/big/ScrubIterator.java
new file mode 100644
index 000000000000..cc5cd39d5fd7
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/ScrubIterator.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.big;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+public class ScrubIterator implements ScrubPartitionIterator
+{
+    private final FileHandle ifile;
+    private final RandomAccessReader reader;
+    private final BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer;
+
+    private ByteBuffer key;
+    private long dataPosition;
+
+    public ScrubIterator(FileHandle ifile, BigTableRowIndexEntry.IndexSerializer<IndexInfo> rowIndexEntrySerializer) throws IOException
+    {
+        this.ifile = ifile.sharedCopy();
+        this.reader = this.ifile.createReader();
+        this.rowIndexEntrySerializer = rowIndexEntrySerializer;
+        advance();
+    }
+
+    @Override
+    public void close()
+    {
+        reader.close();
+        ifile.close();
+    }
+
+    @Override
+    public void advance() throws IOException
+    {
+        if (!reader.isEOF())
+        {
+            key = ByteBufferUtil.readWithShortLength(reader);
+            dataPosition = rowIndexEntrySerializer.deserializePositionAndSkip(reader);
+        }
+        else
+        {
+            dataPosition = -1;
+            key = null;
+        }
+    }
+
+    @Override
+    public ByteBuffer key()
+    {
+        return key;
+    }
+
+    @Override
+    public long dataPosition()
+    {
+        return dataPosition;
+    }
+
+    @Override
+    public boolean isExhausted()
+    {
+        return dataPosition == -1;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndex.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndex.java
new file mode 100644
index 000000000000..d51c03b44d7f
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndex.java
@@ -0,0 +1,434 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.Closeable;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.ByteBuffer;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.io.tries.SerializationNode;
+import org.apache.cassandra.io.tries.TrieNode;
+import org.apache.cassandra.io.tries.TrieSerializer;
+import org.apache.cassandra.io.tries.ValueIterator;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.PageAware;
+import org.apache.cassandra.utils.SizedInts;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+import org.apache.cassandra.utils.concurrent.Ref;
+
+/**
+ * This class holds the partition index as an on-disk trie mapping unique prefixes of decorated keys to:
+ *     - data file position if the partition is small enough to not need an index
+ *     - row index file position if the partition has a row index
+ * plus
+ *     - the last 8 bits of the key's filter hash which is used to filter out mismatched keys without reading the key
+ *
+ * To avoid having to create an object to carry the result, the two are distinguished by sign. Direct-to-dfile entries
+ * are recorded as ~position (~ instead of - to differentiate 0 in ifile from 0 in dfile).
+ *
+ * In either case the contents of the file at this position start with a serialization of the key which can be used
+ * to verify the correct key is found.
+ *
+ * To read the index one must obtain a thread-unsafe Reader or IndexPosIterator.
+ *
+ * Not to be used outside of package. Public only so that it can be used by tools (IndexAnalyzer and IndexRewriter).
+ */
+public class PartitionIndex implements Closeable
+{
+    private static final Logger logger = LoggerFactory.getLogger(PartitionIndex.class);
+
+    private final FileHandle fh;
+    private final long keyCount;
+    private final DecoratedKey first;
+    private final DecoratedKey last;
+    private final long root;
+    /** Key to apply when a caller asks for a full index. Normally null, but set to first for zero-copied indexes. */
+    private final DecoratedKey filterFirst;
+    /** Key to apply when a caller asks for a full index. Normally null, but set to last for zero-copied indexes. */
+    private final DecoratedKey filterLast;
+
+    public static final long NOT_FOUND = Long.MIN_VALUE;
+    public static final int FOOTER_LENGTH = 3 * 8;
+
+    public PartitionIndex(FileHandle fh, long trieRoot, long keyCount, DecoratedKey first, DecoratedKey last, DecoratedKey filterFirst, DecoratedKey filterLast)
+    {
+        this.keyCount = keyCount;
+        this.fh = fh.sharedCopy();
+        this.first = first;
+        this.last = last;
+        this.root = trieRoot;
+        this.filterFirst = filterFirst;
+        this.filterLast = filterLast;
+    }
+
+    private PartitionIndex(PartitionIndex src)
+    {
+        this(src.fh, src.root, src.keyCount, src.first, src.last, src.filterFirst, src.filterLast);
+    }
+
+    static class Payload
+    {
+        final long position;
+        final short hashBits;
+
+        public Payload(long position, short hashBits)
+        {
+            this.position = position;
+            assert this.position != NOT_FOUND;
+            this.hashBits = hashBits;
+        }
+    }
+
+    static final PartitionIndexSerializer TRIE_SERIALIZER = new PartitionIndexSerializer();
+
+    private static class PartitionIndexSerializer implements TrieSerializer<Payload, DataOutput>
+    {
+        public int sizeofNode(SerializationNode<Payload> node, long nodePosition)
+        {
+            return TrieNode.typeFor(node, nodePosition).sizeofNode(node) +
+                   (node.payload() != null ? 1 + SizedInts.nonZeroSize(node.payload().position) : 0);
+        }
+
+        @Override
+        public void write(DataOutput dest, SerializationNode<Payload> node, long nodePosition) throws IOException
+        {
+            write(dest, TrieNode.typeFor(node, nodePosition), node, nodePosition);
+        }
+
+        public void write(DataOutput dest, TrieNode type, SerializationNode<Payload> node, long nodePosition) throws IOException
+        {
+            Payload payload = node.payload();
+            if (payload != null)
+            {
+                int payloadBits;
+                int size = SizedInts.nonZeroSize(payload.position);
+                payloadBits = 7 + size;
+                type.serialize(dest, node, payloadBits, nodePosition);
+                dest.writeByte(payload.hashBits);
+                SizedInts.write(dest, payload.position, size);
+            }
+            else
+                type.serialize(dest, node, 0, nodePosition);
+        }
+    }
+
+    public long size()
+    {
+        return keyCount;
+    }
+
+    public DecoratedKey firstKey()
+    {
+        return first;
+    }
+
+    public DecoratedKey lastKey()
+    {
+        return last;
+    }
+
+    public PartitionIndex sharedCopy()
+    {
+        return new PartitionIndex(this);
+    }
+
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        fh.addTo(identities);
+    }
+
+    public static PartitionIndex load(FileHandle.Builder fhBuilder,
+                                      IPartitioner partitioner,
+                                      boolean preload) throws IOException
+    {
+        try (FileHandle fh = fhBuilder.complete())
+        {
+            return load(fh, partitioner, preload);
+        }
+    }
+
+    public static PartitionIndex load(FileHandle fh, IPartitioner partitioner, boolean preload) throws IOException
+    {
+        try (FileDataInput rdr = fh.createReader(fh.dataLength() - FOOTER_LENGTH))
+        {
+            long firstPos = rdr.readLong();
+            long keyCount = rdr.readLong();
+            long root = rdr.readLong();
+            rdr.seek(firstPos);
+            DecoratedKey first = partitioner != null ? partitioner.decorateKey(ByteBufferUtil.readWithShortLength(rdr)) : null;
+            DecoratedKey last = partitioner != null ? partitioner.decorateKey(ByteBufferUtil.readWithShortLength(rdr)) : null;
+            if (preload)
+            {
+                int csum = 0;
+                // force a read of all the pages of the index
+                for (long pos = 0; pos < fh.dataLength(); pos += PageAware.PAGE_SIZE)
+                {
+                    rdr.seek(pos);
+                    csum += rdr.readByte();
+                }
+                logger.trace("Checksum {}", csum);      // Note: trace is required so that reads aren't optimized away.
+            }
+
+            return new PartitionIndex(fh, root, keyCount, first, last, null, null);
+        }
+    }
+
+    @Override
+    public void close()
+    {
+        fh.close();
+    }
+
+    public Reader openReader()
+    {
+        return new Reader(this);
+    }
+
+    protected IndexPosIterator allKeysIterator()
+    {
+        return new IndexPosIterator(this);
+    }
+
+    protected Rebufferer instantiateRebufferer()
+    {
+        return fh.instantiateRebufferer();
+    }
+
+
+    /**
+     * @return the file handle to the file on disk. This is needed for locking the index in RAM,
+     * see DB-342 and follow up ticket on how this should be reworked.
+     */
+    FileHandle getFileHandle()
+    {
+        return fh;
+    }
+
+    private static long getIndexPos(ByteBuffer contents, int payloadPos, int bytes)
+    {
+        if (bytes > 7)
+        {
+            ++payloadPos;
+            bytes -= 7;
+        }
+        if (bytes == 0)
+            return NOT_FOUND;
+        return SizedInts.read(contents, payloadPos, bytes);
+    }
+
+    public interface Acceptor<ArgType, ResultType>
+    {
+        ResultType accept(long position, boolean assumeNoMatch, ArgType v) throws IOException;
+    }
+
+    /**
+     * Provides methods to read the partition index trie.
+     * Thread-unsafe, uses class members to store lookup state.
+     */
+    public static class Reader extends Walker<Reader>
+    {
+        protected Reader(PartitionIndex index)
+        {
+            super(index.instantiateRebufferer(), index.root);
+        }
+
+        /**
+         * Finds a candidate for an exact key search. Returns an ifile (if positive) or dfile (if negative, using ~)
+         * position. The position returned has a low chance of being a different entry, but only if the sought key
+         * is not present in the file.
+         */
+        public long exactCandidate(DecoratedKey key)
+        {
+            // A hit must be a prefix of the byte-comparable representation of the key.
+            int b = follow(key);
+            // If the prefix ended in a node with children it is only acceptable if it is a full match.
+            if (b != ByteSource.END_OF_STREAM && hasChildren())
+                return NOT_FOUND;
+            if (!checkHashBits(key.filterHashLowerBits()))
+                return NOT_FOUND;
+            return getCurrentIndexPos();
+        }
+
+        final boolean checkHashBits(short hashBits)
+        {
+            int bytes = payloadFlags();
+            if (bytes <= 7)
+                return bytes > 0;
+            return (buf.get(payloadPosition()) == (byte) hashBits);
+        }
+
+        public <ResultType> ResultType ceiling(PartitionPosition key, Acceptor<PartitionPosition, ResultType> acceptor) throws IOException
+        {
+            // Look for a prefix of the key. If there is one, the key it stands for could be less, equal, or greater
+            // than the required value so try that first.
+            int b = followWithGreater(key);
+            // If the prefix ended in a node with children it is only acceptable if it is a full match.
+            if (!hasChildren() || b == ByteSource.END_OF_STREAM)
+            {
+                long indexPos = getCurrentIndexPos();
+                if (indexPos != NOT_FOUND)
+                {
+                    ResultType res = acceptor.accept(indexPos, false, key);
+                    if (res != null)
+                        return res;
+                }
+            }
+            // If that was not found, the closest greater value can be used instead, and we know that
+            // it stands for a key greater than the argument.
+            if (greaterBranch == -1)
+                return null;
+            goMin(greaterBranch);
+            long indexPos = getCurrentIndexPos();
+            return acceptor.accept(indexPos, true, key);
+        }
+
+
+        public <ResultType> ResultType floor(PartitionPosition key, Acceptor<PartitionPosition, ResultType> acceptor) throws IOException
+        {
+            // Check for a prefix match and, in the same walk, record the closest smaller branch (lesserBranch).
+            Long indexPos = prefixAndNeighbours(key, Reader::getSpecificIndexPos);
+
+            if (indexPos != null && indexPos != NOT_FOUND)
+            {
+                ResultType res = acceptor.accept(indexPos, false, key);
+                if (res != null)
+                    return res;
+            }
+
+            // Otherwise offer the acceptor the index position of the closest entry of the smaller branch (the max of lesserBranch).
+            // Note (see prefixAndNeighbours): since we accept prefix matches above, at this point there cannot be another
+            // prefix match that is closer than max(lesserBranch).
+            if (lesserBranch == -1)
+                return null;
+            goMax(lesserBranch);
+            indexPos = getCurrentIndexPos();
+
+            return acceptor.accept(indexPos, true, key);
+        }
+
+
+        public Long getSpecificIndexPos(int pos, int bits)
+        {
+            return getIndexPos(buf, pos, bits);
+        }
+
+        public long getCurrentIndexPos()
+        {
+            return getIndexPos(buf, payloadPosition(), payloadFlags());
+        }
+
+        public long getLastIndexPosition()
+        {
+            goMax(root);
+            return getCurrentIndexPos();
+        }
+
+        /**
+         * To be used only in analysis.
+         */
+        @SuppressWarnings("unused")
+        protected int payloadSize()
+        {
+            int bytes = payloadFlags();
+            return bytes > 7 ? bytes - 6 : bytes;
+        }
+    }
+
+    /**
+     * Iterator of index positions covered between two keys. Since we store prefixes only, the first and last returned
+     * values can be outside the span (and inclusiveness is not given as we cannot verify it).
+     */
+    public static class IndexPosIterator extends ValueIterator<IndexPosIterator>
+    {
+        static final long INVALID = -1;
+        long pos = INVALID;
+
+        /**
+         * @param index PartitionIndex to use for the iteration.
+         *
+         * Note: For performance reasons this class does not keep a reference of the index. Caller must ensure a
+         * reference is held for the lifetime of this object.
+         */
+        public IndexPosIterator(PartitionIndex index)
+        {
+            super(index.instantiateRebufferer(), index.root, index.filterFirst, index.filterLast, true);
+        }
+
+        IndexPosIterator(PartitionIndex index, PartitionPosition start, PartitionPosition end)
+        {
+            super(index.instantiateRebufferer(), index.root, start, end, true);
+        }
+
+        /**
+         * Returns the next position in the row index or data file, or NOT_FOUND once nextPayloadedNode() returns INVALID.
+         */
+        protected long nextIndexPos()
+        {
+            // If a node position is still pending in pos (fetched but not yet returned), reuse it instead of advancing.
+            if (pos == INVALID)
+            {
+                pos = nextPayloadedNode();
+                if (pos == INVALID)
+                    return NOT_FOUND;
+            }
+
+            go(pos);
+
+            pos = INVALID; // make sure next time we call nextPayloadedNode() again
+            return getIndexPos(buf, payloadPosition(), payloadFlags()); // this should not throw
+        }
+    }
+
+    /**
+     * Debug/test code: dumps the trie to the named file. Any failure is logged with its stack trace and not rethrown.
+     */
+    @VisibleForTesting
+    public void dumpTrie(String fileName)
+    {
+        try(PrintStream ps = new PrintStream(fileName))
+        {
+            dumpTrie(ps);
+        }
+        catch (Throwable t)
+        {
+            logger.warn("Failed to dump trie to {} due to exception", fileName, t);
+        }
+    }
+
+    private void dumpTrie(PrintStream out)
+    {
+        try (Reader rdr = openReader())
+        {
+            rdr.dumpTrie(out, (buf, ppos, pbits) -> Long.toString(getIndexPos(buf, ppos, pbits)));
+        }
+    }
+
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexBuilder.java
new file mode 100644
index 000000000000..b654764a1018
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexBuilder.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.util.function.Consumer;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.io.tries.IncrementalTrieWriter;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Partition index builder: stores index or data positions in an incrementally built, page aware on-disk trie.
+ *
+ * Not to be used outside of package. Public only for IndexRewriter tool.
+ */
+public class PartitionIndexBuilder implements AutoCloseable
+{
+    private final SequentialWriter writer;
+    private final IncrementalTrieWriter<PartitionIndex.Payload> trieWriter;
+    private final FileHandle.Builder fhBuilder;
+
+    // the last synced data file position
+    private long dataSyncPosition;
+    // the last synced row index file position
+    private long rowIndexSyncPosition;
+    // the last synced partition index file position
+    private long partitionIndexSyncPosition;
+
+    // Partial index can only be used after all three files have been synced to the required positions.
+    private long partialIndexDataEnd;
+    private long partialIndexRowEnd;
+    private long partialIndexPartitionEnd;
+    private IncrementalTrieWriter.PartialTail partialIndexTail;
+    private Consumer<PartitionIndex> partialIndexConsumer;
+    private DecoratedKey partialIndexLastKey;
+
+    private int lastDiffPoint;
+    private DecoratedKey firstKey;
+    private DecoratedKey lastKey;
+    private DecoratedKey lastWrittenKey;
+    private PartitionIndex.Payload lastPayload;
+
+    public PartitionIndexBuilder(SequentialWriter writer, FileHandle.Builder fhBuilder)
+    {
+        this.writer = writer;
+        this.trieWriter = IncrementalTrieWriter.open(PartitionIndex.TRIE_SERIALIZER, writer);
+        this.fhBuilder = fhBuilder;
+    }
+
+    /**
+     * Called when the partition index file has been flushed up to the given position.
+     * If a partial index is pending and all of its required positions are now flushed, partialIndexConsumer is called.
+     */
+    public void markPartitionIndexSynced(long upToPosition)
+    {
+        partitionIndexSyncPosition = upToPosition;
+        refreshReadableBoundary();
+    }
+
+    /**
+     * Called when the row index file has been flushed up to the given position.
+     * If a partial index is pending and all of its required positions are now flushed, partialIndexConsumer is called.
+     */
+    public void markRowIndexSynced(long upToPosition)
+    {
+        rowIndexSyncPosition = upToPosition;
+        refreshReadableBoundary();
+    }
+
+    /**
+     * Called when the data file has been flushed up to the given position.
+     * If a partial index is pending and all of its required positions are now flushed, partialIndexConsumer is called.
+     */
+    public void markDataSynced(long upToPosition)
+    {
+        dataSyncPosition = upToPosition;
+        refreshReadableBoundary();
+    }
+
+    private void refreshReadableBoundary()
+    {
+        if (partialIndexConsumer == null)
+            return;
+        if (dataSyncPosition < partialIndexDataEnd)
+            return;
+        if (rowIndexSyncPosition < partialIndexRowEnd)
+            return;
+        if (partitionIndexSyncPosition < partialIndexPartitionEnd)
+            return;
+
+        try (FileHandle fh = fhBuilder.complete(writer.getLastFlushOffset()))
+        {
+            PartitionIndex pi = new PartitionIndexEarly(fh, partialIndexTail.root(), partialIndexTail.count(), firstKey, partialIndexLastKey, partialIndexTail.cutoff(), partialIndexTail.tail());
+            partialIndexConsumer.accept(pi);
+            partialIndexConsumer = null;
+        }
+    }
+
+    /**
+     * @param decoratedKey the key for this record
+     * @param position the position to write with the record:
+     *    - zero or positive if position points to an index entry in the index file
+     *    - negative if ~position points directly to the key in the data file
+     */
+    public PartitionIndexBuilder addEntry(DecoratedKey decoratedKey, long position) throws IOException
+    {
+        if (lastKey == null)
+        {
+            firstKey = decoratedKey;
+            lastDiffPoint = 0;
+        }
+        else
+        {
+            int diffPoint = ByteComparable.diffPoint(lastKey, decoratedKey, Walker.BYTE_COMPARABLE_VERSION);
+            ByteComparable prevPrefix = ByteComparable.cut(lastKey, Math.max(diffPoint, lastDiffPoint));
+            trieWriter.add(prevPrefix, lastPayload);
+            lastWrittenKey = lastKey;
+            lastDiffPoint = diffPoint;
+        }
+        lastKey = decoratedKey;
+        lastPayload = new PartitionIndex.Payload(position, decoratedKey.filterHashLowerBits());
+        return this;
+    }
+
+    public long complete() throws IOException
+    {
+        // Do not trigger pending partial builds.
+        partialIndexConsumer = null;
+
+        if (lastKey != lastWrittenKey)
+        {
+            ByteComparable prevPrefix = ByteComparable.cut(lastKey, lastDiffPoint);
+            trieWriter.add(prevPrefix, lastPayload);
+        }
+
+        long root = trieWriter.complete();
+        long count = trieWriter.count();
+        long firstKeyPos = writer.position();
+        if (firstKey != null)
+        {
+            ByteBufferUtil.writeWithShortLength(firstKey.getKey(), writer);
+            ByteBufferUtil.writeWithShortLength(lastKey.getKey(), writer);
+        }
+        else
+        {
+            assert lastKey == null;
+            writer.writeShort(0);
+            writer.writeShort(0);
+        }
+
+        writer.writeLong(firstKeyPos);
+        writer.writeLong(count);
+        writer.writeLong(root);
+
+        writer.sync();
+        fhBuilder.withLength(writer.getLastFlushOffset());
+
+        return root;
+    }
+
+    /**
+     * Builds a PartitionIndex representing the records written until this point without interrupting writes. Because
+     * data in buffered writers does not get immediately flushed to the file system and we do not want to force flushing
+     * of the relevant files (which e.g. could cause a problem for compressed data files), this call cannot return
+     * immediately. Instead it will take an index snapshot but wait with making it active (by calling the provided
+     * callback) until it registers that all relevant files (data, row index and partition index) have been flushed at
+     * least as far as the required positions.
+     *
+     * @param callWhenReady callback that is given the prepared partial index when all relevant data has been flushed
+     * @param rowIndexEnd the position in the row index file we need to be able to read to (exclusive) to read all
+     *                    records written so far
+     * @param dataEnd the position in the data file we need to be able to read to (exclusive) to read all records
+     *                    written so far
+     * @return true if the request was accepted, false if there's no point to do this at this time (e.g. another
+     *         partial representation is prepared but still isn't usable).
+     */
+    public boolean buildPartial(Consumer<PartitionIndex> callWhenReady, long rowIndexEnd, long dataEnd)
+    {
+        // If we haven't advanced since the last time we prepared, there's nothing to do.
+        if (lastWrittenKey == partialIndexLastKey)
+            return false;
+
+        // Don't waste time if an index was already prepared but hasn't reached usability yet.
+        if (partialIndexConsumer != null)
+            return false;
+
+        try
+        {
+            partialIndexTail = trieWriter.makePartialRoot();
+            partialIndexDataEnd = dataEnd;
+            partialIndexRowEnd = rowIndexEnd;
+            partialIndexPartitionEnd = writer.position();
+            partialIndexLastKey = lastWrittenKey;
+            partialIndexConsumer = callWhenReady;
+            return true;
+        }
+        catch (IOException e)
+        {
+            // As writes happen on in-memory buffers, failure here is not expected.
+            throw new AssertionError(e);
+        }
+    }
+
+    // close the builder and release any associated memory
+    public void close()
+    {
+        trieWriter.close();
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexEarly.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexEarly.java
new file mode 100644
index 000000000000..e338c5423365
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexEarly.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.io.util.TailOverridingRebufferer;
+
+/**
+ * Early-opened partition index. Part of the data is already written to file, but some nodes, including the ones in the
+ * chain leading to the last entry in the index, are in the supplied byte buffer and are attached as a tail at the given
+ * position to form a view over the partially-written data.
+ */
+class PartitionIndexEarly extends PartitionIndex
+{
+    final long cutoff;
+    final ByteBuffer tail;
+
+    public PartitionIndexEarly(FileHandle fh, long trieRoot, long keyCount, DecoratedKey first, DecoratedKey last,
+                               long cutoff, ByteBuffer tail)
+    {
+        super(fh, trieRoot, keyCount, first, last, null, last);
+        this.cutoff = cutoff;
+        this.tail = tail;
+    }
+
+    @Override
+    protected Rebufferer instantiateRebufferer()
+    {
+        return new TailOverridingRebufferer(super.instantiateRebufferer(), cutoff, tail);
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIterator.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIterator.java
new file mode 100644
index 000000000000..1fb22736ada6
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionIterator.java
@@ -0,0 +1,254 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Throwables;
+
+// TODO STAR-247: implement unit test
+class PartitionIterator extends PartitionIndex.IndexPosIterator implements PartitionIndexIterator
+{
+    private final PartitionIndex partitionIndex;
+    private final IPartitioner partitioner;
+    private final PartitionPosition limit;
+    private final int exclusiveLimit;
+    private final FileHandle dataFile;
+    private final FileHandle rowIndexFile;
+
+    private FileDataInput dataInput;
+    private FileDataInput indexInput;
+
+    private DecoratedKey currentKey;
+    private RowIndexEntry currentEntry;
+    private DecoratedKey nextKey;
+    private RowIndexEntry nextEntry;
+    private boolean closeHandles = false;
+
+    /**
+     * Note: For performance reasons this class does not request a reference of the files it uses.
+     * If it is the only reference to the data, caller must request shared copies and apply closeHandles().
+     */
+    PartitionIterator(PartitionIndex partitionIndex, IPartitioner partitioner, FileHandle rowIndexFile, FileHandle dataFile,
+                      PartitionPosition left, int inclusiveLeft, PartitionPosition right, int exclusiveRight) throws IOException
+    {
+        super(partitionIndex, left, right);
+        this.partitionIndex = partitionIndex;
+        this.partitioner = partitioner;
+        this.limit = right;
+        this.exclusiveLimit = exclusiveRight;
+        this.rowIndexFile = rowIndexFile;
+        this.dataFile = dataFile;
+        // NOTE(review): if priming below throws, readers already opened via indexInput()/dataInput() are not closed here.
+        readNext(); // prime nextKey / nextEntry
+        // the first entry found may precede 'left': drop it unless compareTo(left) > inclusiveLeft (-1 keeps left itself)
+        if (nextKey != null && !(nextKey.compareTo(left) > inclusiveLeft))
+        {
+            readNext();
+        }
+        advance(); // publish the first in-range entry as current and pre-read its successor
+    }
+
+    /**
+     * Note: For performance reasons this class does not request a reference of the files it uses.
+     * If it is the only reference to the data, caller must request shared copies and apply closeHandles().
+     */
+    PartitionIterator(PartitionIndex partitionIndex, IPartitioner partitioner, FileHandle rowIndexFile, FileHandle dataFile) throws IOException
+    { // full range: inclusiveLeft = -1 keeps the first key, exclusiveRight = 0 keeps the last key
+        this(partitionIndex, partitioner, rowIndexFile, dataFile, partitionIndex.firstKey(), -1, partitionIndex.lastKey(), 0);
+    }
+
+    private PartitionIterator(PartitionIndex partitionIndex)
+    {
+        super(partitionIndex, partitionIndex.firstKey(), partitionIndex.firstKey());
+        this.partitionIndex = partitionIndex;
+        this.partitioner = null; // this constructor never calls readNext(), the only user of the partitioner
+        this.limit = partitionIndex.firstKey();
+        this.exclusiveLimit = -1;
+        this.rowIndexFile = null; // no row index reader is opened by this constructor
+        this.dataFile = null;     // no data reader is opened by this constructor
+
+        this.currentEntry = null; // current and next stay null, so isExhausted() is true from the start
+        this.currentKey = null;
+        this.nextEntry = null;
+        this.nextKey = null;
+    }
+
+    static PartitionIterator empty(PartitionIndex partitionIndex)
+    {
+        return new PartitionIterator(partitionIndex); // private constructor: reads nothing and starts out exhausted
+    }
+
+    public PartitionIterator closeHandles()
+    {
+        this.closeHandles = true; // close() will then also close partitionIndex, dataFile and rowIndexFile
+        return this;              // returned to allow chaining on a freshly constructed iterator
+    }
+
+    @Override
+    public void close()
+    {
+        Throwable accum = null; // threaded through every step below and handed to maybeFail() at the end
+        if (closeHandles)
+        {
+            accum = Throwables.close(accum, partitionIndex, dataFile, rowIndexFile); // only when closeHandles() was requested
+        }
+        accum = Throwables.close(accum, dataInput, indexInput); // either reader may still be null if it was never opened
+        accum = Throwables.perform(accum, super::close);
+        Throwables.maybeFail(accum); // presumably rethrows what was accumulated, after all closes were attempted
+    }
+
+    public DecoratedKey decoratedKey()
+    {
+        return currentKey;
+    }
+
+    public ByteBuffer key()
+    {
+        return currentKey.getKey(); // throws NullPointerException when exhausted (currentKey == null); check isExhausted() first
+    }
+
+    @Override
+    public long dataPosition()
+    {
+        return currentEntry != null ? currentEntry.position : -1; // -1 when there is no current entry
+    }
+
+    @Override
+    public long keyPosition()
+    {
+        return dataPosition(); // this implementation reports the data position as the key position
+    }
+
+    public RowIndexEntry entry()
+    {
+        return currentEntry;
+    }
+
+    @Override
+    public boolean advance() throws IOException
+    {
+        // Promote the look-ahead entry to current.
+        currentKey = nextKey;
+        currentEntry = nextEntry;
+        if (currentKey == null)
+            return false;
+
+        readNext();
+        // Only the last entry to be published (the one with no successor) is checked against the limit.
+        boolean hasSuccessor = nextKey != null;
+        if (hasSuccessor || limit == null || currentKey.compareTo(limit) <= exclusiveLimit)
+            return true;
+
+        // The last partition lies outside the requested range: suppress it.
+        currentKey = null;
+        currentEntry = null;
+        return false;
+    }
+
+    private void readNext() throws IOException
+    {
+        // Loads the entry at the next index position into nextKey / nextEntry.
+        long pos = nextIndexPos();
+        if (pos == PartitionIndex.NOT_FOUND)
+        {
+            // No further positions: clear the look-ahead so that advance() ends up exhausted.
+            nextKey = null;
+            nextEntry = null;
+            return;
+        }
+
+        // Non-negative positions refer to the row index file; negative ones are complemented data file positions.
+        boolean inRowIndex = pos >= 0;
+        long filePos = inRowIndex ? pos : ~pos;
+        FileDataInput in = inRowIndex ? indexInput(filePos) : dataInput(filePos);
+
+        // In either file the partition key is read first, with a short length prefix.
+        nextKey = partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in));
+        if (inRowIndex)
+            nextEntry = TrieIndexEntry.deserialize(in, in.getFilePointer());
+        else
+            nextEntry = new RowIndexEntry(filePos);
+    }
+
+
+    private FileDataInput indexInput(long pos) throws IOException
+    {
+        // The row index reader is created on first use at the requested position; afterwards it is only re-positioned.
+        if (indexInput == null)
+            indexInput = rowIndexFile.createReader(pos);
+        else
+            indexInput.seek(pos);
+        return indexInput;
+    }
+
+    private FileDataInput dataInput(long pos) throws IOException
+    {
+        // The data file reader is created on first use at the requested position; afterwards it is only re-positioned.
+        if (dataInput == null)
+            dataInput = dataFile.createReader(pos);
+        else
+            dataInput.seek(pos);
+        return dataInput;
+    }
+
+    @Override
+    public boolean isExhausted()
+    {
+        return currentKey == null;
+    }
+
+    @Override
+    public long indexPosition()
+    {
+        return 0; // constant: this iterator does not report a position (the setter below is unsupported)
+    }
+
+    @Override
+    public void indexPosition(long position)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long indexLength()
+    {
+        return 0; // constant: no index length is reported by this implementation
+    }
+
+    @Override
+    public void reset()
+    {
+        go(root); // NOTE(review): only moves the walker to the root; currentKey/nextKey and the bounds are not re-primed - confirm
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("TrieIndex-PartitionIndexIterator(%s)", partitionIndex.getFileHandle().path());
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
new file mode 100644
index 000000000000..263e73a24a90
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.SerializationHelper;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredSerializer;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Partition writer used by {@link TrieIndexSSTableWriter}.
+ *
+ * Writes all passed data to the given SequentialWriter and, if necessary, builds a row index by adding an entry
+ * each time the data written since the start of the current index block reaches
+ * {@link DatabaseDescriptor#getColumnIndexSize()} (the column index size, not the column index cache size).
+ */
+class PartitionWriter implements AutoCloseable
+{
+    public int rowIndexCount;
+
+    private final UnfilteredSerializer unfilteredSerializer;
+    private final SerializationHeader header;
+    private final SequentialWriter writer;
+    private final Collection<SSTableFlushObserver> observers;
+    private final RowIndexWriter rowTrie;
+    private final SerializationHelper helper;
+    private final Version version;
+
+    private long initialPosition;
+    private long startPosition;
+
+    private int written;
+    private long previousRowStart;
+
+    private ClusteringPrefix<?> firstClustering;
+    private ClusteringPrefix<?> lastClustering;
+
+    private DeletionTime openMarker = DeletionTime.LIVE;
+    private DeletionTime startOpenMarker = DeletionTime.LIVE;
+
+    PartitionWriter(SerializationHeader header,
+                    ClusteringComparator comparator,
+                    SequentialWriter writer,
+                    SequentialWriter indexWriter,
+                    Version version,
+                    Collection<SSTableFlushObserver> observers)
+    { // NOTE(review): reset() is not called here; presumably callers invoke it before each partition - confirm
+        this.header = header;
+        this.writer = writer; // destination of the partition data
+        this.observers = observers;
+        this.rowTrie = new RowIndexWriter(comparator, indexWriter); // comparator and indexWriter are only used for the row index
+        this.unfilteredSerializer = UnfilteredSerializer.serializer;
+        this.helper = new SerializationHelper(header);
+        this.version = version;
+    }
+
+    public void reset()
+    {
+        this.initialPosition = writer.position(); // currentPosition() is measured relative to this point
+        this.startPosition = -1;
+        this.previousRowStart = 0;
+        this.rowIndexCount = 0;
+        this.written = 0;
+        this.firstClustering = null; // null means no index block is in progress
+        this.lastClustering = null;
+        this.openMarker = DeletionTime.LIVE; // startOpenMarker needs no reset: addUnfiltered() reassigns it at each block start
+        rowTrie.reset();
+    }
+
+    @Override
+    public void close()
+    {
+        rowTrie.close();
+    }
+
+    public long writePartition(UnfilteredRowIterator partition) throws IOException
+    {
+        // Header first: key, partition-level deletion and, when the header declares static columns, the static row.
+        writePartitionHeader(partition.partitionKey(), partition.partitionLevelDeletion(), partition.staticRow());
+
+        // Then every unfiltered (row or range tombstone marker) in iteration order.
+        while (partition.hasNext())
+            addUnfiltered(partition.next());
+
+        // Ends the partition; the result is the row index trie root, or -1 when no row index was completed.
+        return finish();
+    }
+
+    void writePartitionHeader(DecoratedKey partitionKey, DeletionTime partitionLevelDeletion, Row staticRow) throws IOException
+    {
+        ByteBufferUtil.writeWithShortLength(partitionKey.getKey(), writer); // key with a short length prefix
+
+        long deletionTimePosition = writer.position(); // captured before serializing, reported to observers below
+        DeletionTime.serializer.serialize(partitionLevelDeletion, writer);
+        if (!observers.isEmpty())
+            observers.forEach(o -> o.partitionLevelDeletion(partitionLevelDeletion, deletionTimePosition));
+        if (header.hasStatic()) // staticRow is ignored unless the header declares static columns
+            doWriteStaticRow(staticRow);
+    }
+
+    private void doWriteStaticRow(Row staticRow) throws IOException
+    {
+        long staticRowPosition = writer.position(); // position at which the static row starts, reported to observers
+        unfilteredSerializer.serializeStaticRow(staticRow, helper, writer, version.correspondingMessagingVersion());
+        if (!observers.isEmpty())
+            observers.forEach(o -> o.staticRow(staticRow, staticRowPosition));
+    }
+
+    void addUnfiltered(Unfiltered unfiltered) throws IOException
+    {
+        long pos = currentPosition(); // offset of this unfiltered relative to initialPosition
+
+        if (firstClustering == null)
+        {
+            // Beginning of an index block. Remember its first clustering, the deletion open at this point and its offset.
+            firstClustering = unfiltered.clustering();
+            startOpenMarker = openMarker;
+            startPosition = pos;
+        }
+
+        long unfilteredPosition = writer.position(); // absolute writer position, reported to observers
+        unfilteredSerializer.serialize(unfiltered, helper, writer, pos - previousRowStart, version.correspondingMessagingVersion());
+
+        // notify observers about each new unfiltered (row or range tombstone marker)
+        if (!observers.isEmpty())
+            observers.forEach(o -> o.nextUnfilteredCluster(unfiltered, unfilteredPosition));
+
+        lastClustering = unfiltered.clustering();
+        previousRowStart = pos; // the next call passes its distance from this start to the serializer
+        ++written;
+
+        if (unfiltered.kind() == Unfiltered.Kind.RANGE_TOMBSTONE_MARKER)
+        {
+            RangeTombstoneMarker marker = (RangeTombstoneMarker) unfiltered;
+            openMarker = marker.isOpen(false) ? marker.openDeletionTime(false) : DeletionTime.LIVE; // LIVE when nothing stays open
+        }
+
+        // if the current block has reached the column index size, close it with an index entry.
+        if (currentPosition() - startPosition >= DatabaseDescriptor.getColumnIndexSize())
+            addIndexBlock();
+    }
+
+    long finish() throws IOException
+    {
+        long endPosition = currentPosition(); // taken before the end-of-partition marker is written
+        unfilteredSerializer.writeEndOfPartition(writer);
+
+        // It's possible we add no rows, just a top level deletion
+        if (written == 0)
+            return -1;
+
+        long trieRoot = -1; // -1 is returned whenever no row index trie is completed
+        // the last row may have fallen on an index boundary already. If not, index the open block, but only if an earlier block exists.
+        if (firstClustering != null && rowIndexCount > 0)
+            addIndexBlock();
+        if (rowIndexCount > 1)
+            trieRoot = rowTrie.complete(endPosition);
+        // Otherwise we don't complete the trie. Even if we did write something (which shouldn't be the case as the
+        // first entry has an empty key and root isn't filled), that's not a problem.
+
+        return trieRoot;
+    }
+
+    private long currentPosition()
+    {
+        return writer.position() - initialPosition; // offset relative to the writer position recorded by reset()
+    }
+
+    private void addIndexBlock() throws IOException
+    {
+        // Index the finished block by its clustering bounds, its start offset and the deletion open at its start.
+        rowTrie.add(firstClustering, lastClustering, new IndexInfo(startPosition, startOpenMarker));
+        firstClustering = null; // no block in progress: the next unfiltered begins a new one
+        rowIndexCount++;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReader.java
new file mode 100644
index 000000000000..3a3356333fa9
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReader.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.io.tries.SerializationNode;
+import org.apache.cassandra.io.tries.TrieNode;
+import org.apache.cassandra.io.tries.TrieSerializer;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.SizedInts;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Reader class for row index files.
+ *
+ * Row index tries do not need to store whole keys, as what we need from them is to be able to tell where in the data file
+ * to start looking for a given key. Instead, we store some prefix that is greater than the greatest key of the previous
+ * index section and smaller than or equal to the smallest key of the next. So for a given key the first index section
+ * that could potentially contain it is given by the trie's floor for that key.
+ *
+ * This builds upon the trie Walker class which provides basic trie walking functionality. The class is thread-unsafe
+ * and must be re-instantiated for every thread that needs access to the trie (its overhead is below that of a
+ * RandomAccessReader).
+ */
+class RowIndexReader extends Walker<RowIndexReader>
+{
+    private static final int FLAG_OPEN_MARKER = 8;
+
+    static class IndexInfo
+    {
+        final long offset;               // value stored with the size-prefixed int encoding in the node payload
+        final DeletionTime openDeletion; // null when read from a payload without FLAG_OPEN_MARKER; see readPayload
+
+        IndexInfo(long offset, DeletionTime openDeletion)
+        {
+            this.offset = offset;
+            this.openDeletion = openDeletion;
+        }
+    }
+
+    public RowIndexReader(FileHandle file, long root)
+    {
+        super(file.instantiateRebufferer(), root);
+    }
+
+    public RowIndexReader(FileHandle file, TrieIndexEntry entry)
+    {
+        this(file, entry.indexTrieRoot);
+    }
+
+    /**
+     * Computes the floor for a given key; returns null when there is neither a prefix match nor a smaller branch.
+     */
+    public IndexInfo separatorFloor(ByteComparable key)
+    {
+        // Check for a prefix and find closest smaller branch.
+        IndexInfo res = prefixAndNeighbours(key, RowIndexReader::readPayload);
+        // If there's a prefix, in a separator trie it could be less than, equal, or greater than sought value.
+        // Sought value is still greater than max of previous section.
+        // On match the prefix must be used as a starting point.
+        if (res != null)
+            return res;
+
+        // Otherwise return the IndexInfo for the closest entry of the smaller branch (which is the max of lesserBranch).
+        // Note (see prefixAndNeighbours): since we accept prefix matches above, at this point there cannot be another
+        // prefix match that is closer than max(lesserBranch).
+        if (lesserBranch == -1) // no smaller branch was recorded during the walk
+            return null;
+        goMax(lesserBranch);
+        return getCurrentIndexInfo();
+    }
+
+    public IndexInfo min()
+    {
+        goMin(root);
+        return getCurrentIndexInfo();
+    }
+
+    protected IndexInfo getCurrentIndexInfo()
+    {
+        return readPayload(payloadPosition(), payloadFlags());
+    }
+
+    protected IndexInfo readPayload(int ppos, int bits)
+    {
+        return readPayload(buf, ppos, bits);
+    }
+
+    static IndexInfo readPayload(ByteBuffer buf, int ppos, int bits)
+    {
+        // A node whose payload flags are all zero carries no index info.
+        if (bits == 0)
+            return null;
+
+        // The flags hold the byte width of the offset, plus FLAG_OPEN_MARKER when a deletion time follows it.
+        int offsetSize = bits & ~FLAG_OPEN_MARKER;
+        boolean hasOpenMarker = (bits & FLAG_OPEN_MARKER) != 0;
+        long offset = SizedInts.read(buf, ppos, offsetSize);
+        DeletionTime open = hasOpenMarker ? DeletionTime.serializer.deserialize(buf, ppos + offsetSize) : null;
+        return new IndexInfo(offset, open);
+    }
+
+    // The trie serializer describes how the payloads are written. Placed here (instead of writer) so that reading and
+    // writing the payload are close together should they need to be changed.
+    static final TrieSerializer<IndexInfo, DataOutputPlus> trieSerializer = new TrieSerializer<IndexInfo, DataOutputPlus>()
+    {
+        @Override
+        public int sizeofNode(SerializationNode<IndexInfo> node, long nodePosition)
+        {
+            return TrieNode.typeFor(node, nodePosition).sizeofNode(node) + sizeof(node.payload()); // node body plus payload bytes
+        }
+
+        @Override
+        public void write(DataOutputPlus dest, SerializationNode<IndexInfo> node, long nodePosition) throws IOException
+        {
+            write(dest, TrieNode.typeFor(node, nodePosition), node, nodePosition);
+        }
+
+        public int sizeof(IndexInfo payload) // must stay in step with the bytes emitted by write() below
+        {
+            int size = 0;
+            if (payload != null)
+            {
+                size += SizedInts.nonZeroSize(payload.offset);
+                if (!payload.openDeletion.isLive()) // openDeletion must be non-null here (LIVE when nothing is open)
+                    size += DeletionTime.serializer.serializedSize(payload.openDeletion);
+            }
+            return size;
+        }
+
+        public void write(DataOutputPlus dest, TrieNode type, SerializationNode<IndexInfo> node, long nodePosition) throws IOException
+        {
+            IndexInfo payload = node.payload();
+            int bytes = 0;         // byte width of the offset; left 0 when there is no payload
+            int hasOpenMarker = 0; // FLAG_OPEN_MARKER when a non-live deletion follows the offset
+            if (payload != null)
+            {
+                bytes = SizedInts.nonZeroSize(payload.offset);
+                if (!payload.openDeletion.isLive())
+                    hasOpenMarker = FLAG_OPEN_MARKER;
+            }
+            type.serialize(dest, node, bytes | hasOpenMarker, nodePosition); // flags are decoded again by readPayload()
+            if (payload != null)
+            {
+                SizedInts.write(dest, payload.offset, bytes);
+
+                if (hasOpenMarker != 0)
+                    DeletionTime.serializer.serialize(payload.openDeletion, dest);
+            }
+        }
+
+    };
+
+    @SuppressWarnings("unused")
+    public void dumpTrie(PrintStream out)
+    {
+        dumpTrie(out, (buf, ppos, bits) -> {
+            IndexInfo ii = readPayload(buf, ppos, bits); // null when the node has no payload (bits == 0)
+
+            return ii != null
+                   ? String.format("pos %x %s", ii.offset, ii.openDeletion == null ? "" : ii.openDeletion)
+                   : "pos null";
+        });
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReverseIterator.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReverseIterator.java
new file mode 100644
index 000000000000..9e4ed137093f
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexReverseIterator.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.PrintStream;
+
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.tries.ReverseValueIterator;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+/**
+ * Reverse iterator over the row index. Needed to get previous index blocks for reverse iteration.
+ */
+class RowIndexReverseIterator extends ReverseValueIterator<RowIndexReverseIterator>
+{
+    private long currentNode = -1; // node returned by nextPayloadedNode() whose payload is still to be read; -1 if none
+
+    public RowIndexReverseIterator(FileHandle file, long root, ByteComparable start, ByteComparable end)
+    {
+        super(file.instantiateRebufferer(), root, start, end, true);
+    }
+
+    public RowIndexReverseIterator(FileHandle file, TrieIndexEntry entry, ByteComparable end)
+    {
+        this(file, entry.indexTrieRoot, ByteComparable.EMPTY, end); // the start bound is the empty key
+    }
+
+    /**
+     * Returns the info of the next payloaded node, or null once nextPayloadedNode() reports -1. This method must be
+     * async-read-safe: currentNode is only cleared after the payload is read, so a failed call can be repeated.
+     */
+    public IndexInfo nextIndexInfo()
+    {
+        if (currentNode == -1)
+        {
+            currentNode = nextPayloadedNode();
+            if (currentNode == -1)
+                return null;
+        }
+        go(currentNode);
+        IndexInfo info = RowIndexReader.readPayload(buf, payloadPosition(), payloadFlags());
+        currentNode = -1;
+        return info;
+    }
+
+    // Debugging aid: readPayload() yields null for payload-less nodes, printed as "pos null" like RowIndexReader.dumpTrie.
+    public void dumpTrie(PrintStream out)
+    {
+        dumpTrie(out, (buf, ppos, bits) -> {
+            IndexInfo ii = RowIndexReader.readPayload(buf, ppos, bits);
+            return ii == null ? "pos null" : String.format("pos %x %s", ii.offset, ii.openDeletion == null ? "" : ii.openDeletion);
+        });
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexWriter.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexWriter.java
new file mode 100644
index 000000000000..86087b6a6c03
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/RowIndexWriter.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.tries.IncrementalTrieWriter;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Preparer / writer of row index tries.
+ *
+ * Uses IncrementalTrieWriter to build a trie of index section separators of shortest possible length such that
+ * prevMax < separator <= nextMin.
+ */
+class RowIndexWriter implements AutoCloseable
+{
+    private final ClusteringComparator comparator;
+    private final IncrementalTrieWriter<IndexInfo> trie;
+    private ByteComparable prevMax = null;
+    private ByteComparable prevSep = null;
+
+    RowIndexWriter(ClusteringComparator comparator, DataOutputPlus out)
+    {
+        this.comparator = comparator;
+        this.trie = IncrementalTrieWriter.open(RowIndexReader.trieSerializer, out);
+    }
+
+    void reset()
+    {
+        prevMax = null;
+        prevSep = null;
+        trie.reset();
+    }
+
+    @Override
+    public void close()
+    {
+        trie.close();
+    }
+
+    void add(ClusteringPrefix<?> firstName, ClusteringPrefix<?> lastName, IndexInfo info) throws IOException
+    {
+        assert info.openDeletion != null;
+        ByteComparable separator = ByteComparable.EMPTY; // the first section is keyed by the empty separator
+        if (prevMax != null)
+            separator = ByteComparable.separatorGt(prevMax, comparator.asByteComparable(firstName));
+        trie.add(separator, info);
+        prevSep = separator;
+        prevMax = comparator.asByteComparable(lastName); // the next separator must be greater than this maximum
+    }
+
+    public long complete(long endPos) throws IOException
+    {
+        // Add a separator after the last section, so that greater inputs can be quickly rejected.
+        // To maximize its efficiency we add it with the length of the last added separator.
+        int i = 0; // counts the leading bytes shared by prevMax and prevSep
+        ByteSource max = prevMax.asComparableBytes(Walker.BYTE_COMPARABLE_VERSION); // requires at least one prior add()
+        ByteSource sep = prevSep.asComparableBytes(Walker.BYTE_COMPARABLE_VERSION);
+        int c;
+        while ((c = max.next()) == sep.next() && c != ByteSource.END_OF_STREAM)
+            ++i;
+        assert c != ByteSource.END_OF_STREAM : "Corrupted row order, max=" + prevMax; // prevMax must not be a prefix of prevSep
+
+        trie.add(nudge(prevMax, i), new IndexInfo(endPos, DeletionTime.LIVE)); // terminal entry at the end position
+
+        return trie.complete();
+    }
+
+    /**
+     * Produces a source that is slightly greater than argument with length at least nudgeAt.
+     */
+    private ByteComparable nudge(ByteComparable value, int nudgeAt)
+    {
+        return version -> new ByteSource()
+        {
+            private final ByteSource v = value.asComparableBytes(version);
+            private int cur = 0; // number of bytes emitted so far, except that it stalls at nudgeAt while nudging fails
+
+            @Override
+            public int next()
+            {
+                int b = ByteSource.END_OF_STREAM; // returned unchanged once cur has moved past nudgeAt
+                if (cur <= nudgeAt)
+                {
+                    b = v.next();
+                    if (cur == nudgeAt)
+                    {
+                        if (b < 255)
+                            ++b;
+                        else
+                            return b;  // can't nudge here; cur is not advanced, so the next byte is nudged instead (eventually will be -1)
+                    }
+                }
+                ++cur;
+                return b;
+            }
+        };
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableIterator.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableIterator.java
new file mode 100644
index 000000000000..d24992c865f5
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableIterator.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+
+/**
+ * Iterates over the requested slices of a partition in a trie-indexed sstable, in forward order.
+ */
+class SSTableIterator extends AbstractSSTableIterator<RowIndexEntry>
+{
+    /**
+     * The index of the next slice to process; slices are handed out first to last.
+     */
+    private int slice;
+
+    public SSTableIterator(TrieIndexSSTableReader sstable,
+                           FileDataInput file,
+                           DecoratedKey key,
+                           RowIndexEntry indexEntry,
+                           Slices slices,
+                           ColumnFilter columns,
+                           FileHandle ifile)
+    {
+        super(sstable, file, key, indexEntry, slices, columns, ifile);
+    }
+
+    // Indexed partitions get a reader that consults the row index, all others the plain forward reader.
+    protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    {
+        if (indexEntry.isIndexed())
+            return new ForwardIndexedReader(indexEntry, file, shouldCloseFile);
+        return new ForwardReader(file, shouldCloseFile);
+    }
+
+    // Returns the current slice index and moves on to the following one.
+    protected int nextSliceIndex()
+    {
+        return slice++;
+    }
+
+    protected boolean hasMoreSlices()
+    {
+        return slice < slices.size();
+    }
+
+    public boolean isReverseOrder()
+    {
+        return false;
+    }
+
+    // Forward reader that seeks to the position the row index gives for the start of each slice.
+    private class ForwardIndexedReader extends ForwardReader
+    {
+        private final RowIndexReader indexReader;
+        long basePosition;
+
+        private ForwardIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+            basePosition = indexEntry.position;
+            indexReader = new RowIndexReader(ifile, (TrieIndexEntry) indexEntry);
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            indexReader.close();
+            super.close();
+        }
+
+        @Override
+        public void setForSlice(Slice slice) throws IOException
+        {
+            super.setForSlice(slice);
+            IndexInfo blockInfo = indexReader.separatorFloor(metadata.comparator.asByteComparable(slice.start()));
+            assert blockInfo != null;
+            long blockStart = basePosition + blockInfo.offset;
+            // If we are already inside the relevant index block there is no point in going back to its beginning.
+            if (file != null && blockStart <= file.getFilePointer())
+                return;
+            openMarker = blockInfo.openDeletion;
+            seekToPosition(blockStart);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableReversedIterator.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableReversedIterator.java
new file mode 100644
index 000000000000..ee613189e17b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/SSTableReversedIterator.java
@@ -0,0 +1,287 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.util.NoSuchElementException;
+
+import com.carrotsearch.hppc.LongStack;
+import org.apache.cassandra.db.*;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.RangeTombstoneBoundMarker;
+import org.apache.cassandra.db.rows.RangeTombstoneMarker;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.io.sstable.format.AbstractSSTableIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+
+/**
+ * Iterates over the requested slices of a partition in a trie-indexed sstable, in reversed clustering order.
+ */
+class SSTableReversedIterator extends AbstractSSTableIterator<RowIndexEntry>
+{
+    /**
+     * The number of slices handed out so far; nextSliceIndex() maps it to an index counted from the last slice.
+     */
+    private int slice;
+
+    public SSTableReversedIterator(TrieIndexSSTableReader sstable,
+                                   FileDataInput file,
+                                   DecoratedKey key,
+                                   RowIndexEntry indexEntry,
+                                   Slices slices,
+                                   ColumnFilter columns,
+                                   FileHandle ifile)
+    {
+        super(sstable, file, key, indexEntry, slices, columns, ifile);
+    }
+
+    protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+    {
+        return indexEntry.isIndexed()
+             ? new ReverseIndexedReader(indexEntry, file, shouldCloseFile)
+             : new ReverseReader(file, shouldCloseFile);
+    }
+
+    public boolean isReverseOrder()
+    {
+        return true;
+    }
+
+    protected int nextSliceIndex()
+    {
+        int next = slice;
+        slice++;
+        return slices.size() - (next + 1);
+    }
+
+    protected boolean hasMoreSlices()
+    {
+        return slice < slices.size();
+    }
+
+    /**
+     * Reverse iteration is performed by going through an index block (or the whole partition if not indexed) forwards
+     * and storing the positions of each entry that falls within the slice in a stack. Reverse iteration then pops out
+     * positions and reads the entries.
+     *
+     * Note: The earlier version of this was constructing an in-memory view of the block instead, which gives better
+     * performance on bigger queries and index blocks (due to not having to read disk again). With the lower
+     * granularity of the tries it makes better sense to store as little as possible as the beginning of the block
+     * should very rarely be in other page/chunk cache locations. This has the benefit of being able to answer small
+     * queries (esp. LIMIT 1) faster and with less GC churn.
+     */
+    private class ReverseReader extends RowReader
+    {
+        LongStack rowOffsets = new LongStack(); // file positions recorded by fillOffsets, popped by computeNext
+        RangeTombstoneMarker blockOpenMarker, blockCloseMarker; // synthetic markers returned last / first for a block
+        Unfiltered next = null; // item computed by hasNextInternal and not yet returned by nextInternal
+        boolean foundLessThan; // fillOffsets skipped an entry at or before the slice start; stops block advancing
+        long startPos = -1; // file position at the first setForSlice call, -1 until then
+
+        private ReverseReader(FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+        }
+
+        public void setForSlice(Slice slice) throws IOException
+        {
+            // read full row and filter; the first call remembers where reading starts, later calls rewind to it
+            if (startPos == -1)
+                startPos = file.getFilePointer();
+            else
+                seekToPosition(startPos);
+
+            fillOffsets(slice, true, true, Long.MAX_VALUE);
+        }
+
+        protected boolean hasNextInternal() throws IOException
+        {
+            if (next != null)
+                return true;
+            next = computeNext();
+            return next != null;
+        }
+
+        protected Unfiltered nextInternal() throws IOException
+        {
+            if (!hasNextInternal())
+                throw new NoSuchElementException();
+
+            Unfiltered toReturn = next;
+            next = null;
+            return toReturn;
+        }
+
+        private Unfiltered computeNext() throws IOException
+        {
+            Unfiltered toReturn;
+            do
+            {
+                if (blockCloseMarker != null)
+                {
+                    toReturn = blockCloseMarker;
+                    blockCloseMarker = null;
+                    return toReturn;
+                }
+                while (!rowOffsets.isEmpty())
+                {
+                    seekToPosition(rowOffsets.pop());
+                    boolean hasNext = deserializer.hasNext();
+                    assert hasNext;
+                    toReturn = deserializer.readNext();
+                    UnfilteredValidation.maybeValidateUnfiltered(toReturn, metadata(), key, sstable);
+                    // We may get empty row for the same reason expressed on UnfilteredSerializer.deserializeOne.
+                    if (!toReturn.isEmpty())
+                        return toReturn;
+                }
+            }
+            while (!foundLessThan && advanceIndexBlock());
+
+            // open marker to be output only as slice is finished
+            if (blockOpenMarker != null)
+            {
+                toReturn = blockOpenMarker;
+                blockOpenMarker = null;
+                return toReturn;
+            }
+            return null;
+        }
+
+        protected boolean advanceIndexBlock() throws IOException
+        {
+            return false; // nothing to advance to here; overridden by the indexed reader
+        }
+
+        void fillOffsets(Slice slice, boolean filterStart, boolean filterEnd, long stopPosition) throws IOException
+        {
+            filterStart &= !slice.start().equals(ClusteringBound.BOTTOM);
+            filterEnd &= !slice.end().equals(ClusteringBound.TOP);
+            long currentPosition = -1;
+
+            ClusteringBound start = slice.start();
+            currentPosition = file.getFilePointer();
+            foundLessThan = false;
+            // This is a copy of handlePreSliceData which also checks currentPosition < stopPosition.
+            // Not extracted to method as we need both marker and currentPosition.
+            if (filterStart)
+            {
+                while (currentPosition < stopPosition && deserializer.hasNext() && deserializer.compareNextTo(start) <= 0)
+                {
+                    if (deserializer.nextIsRow())
+                        deserializer.skipNext();
+                    else
+                        updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
+
+                    currentPosition = file.getFilePointer();
+                    foundLessThan = true;
+                }
+            }
+
+            // We've reached the beginning of our queried slice. If we have an open marker
+            // we should return that at the end of the slice to close the deletion.
+            if (openMarker != null)
+                blockOpenMarker = new RangeTombstoneBoundMarker(start, openMarker);
+
+            // Positions are pushed in file order, so that popping them in computeNext yields the entries in reverse.
+            // Now deserialize everything until we reach our requested end (if we have one)
+            // See SSTableIterator.ForwardRead.computeNext() for why this is a strict inequality below: this is the same
+            // reasoning here.
+            while (currentPosition < stopPosition && deserializer.hasNext()
+                   && (!filterEnd || deserializer.compareNextTo(slice.end()) < 0))
+            {
+                rowOffsets.push(currentPosition);
+                if (deserializer.nextIsRow())
+                    deserializer.skipNext();
+                else
+                    updateOpenMarker((RangeTombstoneMarker)deserializer.readNext());
+
+                currentPosition = file.getFilePointer();
+            }
+
+            // If we have an open marker, we should output that first, unless end is not being filtered
+            // (i.e. it's either top (where a marker can't be open) or we placed that marker during previous block).
+            if (openMarker != null && filterEnd)
+            {
+                // If we have no end and still an openMarker, this means we're indexed and the marker is closed in a following block.
+                blockCloseMarker = new RangeTombstoneBoundMarker(slice.end(), openMarker);
+                openMarker = null;
+            }
+        }
+    }
+
+    private class ReverseIndexedReader extends ReverseReader
+    {
+        private RowIndexReverseIterator indexReader; // recreated by setForSlice for every slice
+        final TrieIndexEntry indexEntry;
+        long basePosition; // indexEntry.position; index block offsets are added to it
+        Slice currentSlice; // slice passed to the latest setForSlice call
+        long currentBlockStart; // file position of the block being read; stop position for the block read after it
+
+        public ReverseIndexedReader(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile)
+        {
+            super(file, shouldCloseFile);
+            basePosition = indexEntry.position;
+            this.indexEntry = (TrieIndexEntry) indexEntry;
+        }
+
+        @Override
+        public void close() throws IOException
+        {
+            if (indexReader != null)
+                indexReader.close();
+            super.close();
+        }
+
+        @Override
+        public void setForSlice(Slice slice) throws IOException
+        {
+            currentSlice = slice;
+            ClusteringComparator comparator = metadata.comparator;
+            if (indexReader != null)
+                indexReader.close();
+            indexReader = new RowIndexReverseIterator(ifile,
+                                                      indexEntry,
+                                                      comparator.asByteComparable(slice.end()));
+            gotoBlock(indexReader.nextIndexInfo(), true, Long.MAX_VALUE);
+        }
+
+        boolean gotoBlock(IndexInfo indexInfo, boolean filterEnd, long blockEnd) throws IOException
+        {
+            blockOpenMarker = null;
+            blockCloseMarker = null;
+            rowOffsets.clear();
+            if (indexInfo == null)
+                return false;
+            currentBlockStart = basePosition + indexInfo.offset;
+            openMarker = indexInfo.openDeletion;
+
+            seekToPosition(currentBlockStart);
+            fillOffsets(currentSlice, true, filterEnd, blockEnd);
+            return !rowOffsets.isEmpty(); // true when the block contributed at least one position
+        }
+
+        @Override
+        protected boolean advanceIndexBlock() throws IOException
+        {
+            return gotoBlock(indexReader.nextIndexInfo(), false, currentBlockStart); // stop where the previous block began
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/ScrubIterator.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/ScrubIterator.java
new file mode 100644
index 000000000000..976a5c68c49b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/ScrubIterator.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+// TODO STAR-247: implement unit test
+public class ScrubIterator extends PartitionIndex.IndexPosIterator implements ScrubPartitionIterator
+{
+    ByteBuffer key; // key of the current entry; null when only a data position is known, or when exhausted
+    long dataPosition; // data file position of the current entry; -1 once the index is exhausted
+    final FileHandle rowIndexFile;
+
+    ScrubIterator(PartitionIndex partitionIndex, FileHandle rowIndexFile) throws IOException
+    {
+        super(partitionIndex);
+        this.rowIndexFile = rowIndexFile.sharedCopy();
+        try
+        {
+            advance();
+        }
+        catch (Throwable t)
+        {
+            // Nobody can close a half-constructed iterator, so release the shared copy (and the superclass) here.
+            close();
+            throw t;
+        }
+    }
+
+    @Override
+    public void close()
+    {
+        super.close();
+        rowIndexFile.close();
+    }
+
+    @Override
+    public ByteBuffer key()
+    {
+        return key;
+    }
+
+    @Override
+    public long dataPosition()
+    {
+        return dataPosition;
+    }
+
+    @Override
+    public void advance() throws IOException
+    {
+        long pos = nextIndexPos();
+        if (pos != PartitionIndex.NOT_FOUND && pos >= 0) // row index position
+        {
+            try (FileDataInput in = rowIndexFile.createReader(pos))
+            {
+                key = ByteBufferUtil.readWithShortLength(in);
+                dataPosition = TrieIndexEntry.deserialize(in, in.getFilePointer()).position;
+            }
+            return;
+        }
+        // No key to read: either the index is exhausted (-1) or pos is the complement of a data position.
+        key = null;
+        dataPosition = pos == PartitionIndex.NOT_FOUND ? -1 : ~pos;
+    }
+
+    @Override
+    public boolean isExhausted()
+    {
+        return dataPosition == -1;
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexEntry.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexEntry.java
new file mode 100644
index 000000000000..a73fddd9330f
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexEntry.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.utils.ObjectSizes;
+
+/**
+ * Row index entry for a partition whose rows are indexed in a trie.
+ *
+ * Not to be used outside of package. Public only for IndexRewriter tool.
+ */
+public final class TrieIndexEntry extends RowIndexEntry
+{
+    // Every instance reports the same heap size, measured once on a sample entry.
+    private static final long BASE_SIZE = ObjectSizes.measure(new TrieIndexEntry(0, 0, 10, DeletionTime.LIVE));
+
+    final long indexTrieRoot;
+    private final int rowIndexCount;
+    private final DeletionTime deletionTime;
+
+    TrieIndexEntry(long dataFilePosition, long indexTrieRoot, int rowIndexCount, DeletionTime deletionTime)
+    {
+        super(dataFilePosition);
+        assert rowIndexCount > 1;
+        this.indexTrieRoot = indexTrieRoot;
+        this.rowIndexCount = rowIndexCount;
+        this.deletionTime = deletionTime;
+    }
+
+    @Override
+    public int columnsIndexCount()
+    {
+        return rowIndexCount;
+    }
+
+    @Override
+    public DeletionTime deletionTime()
+    {
+        return deletionTime;
+    }
+
+    @Override
+    public long unsharedHeapSize()
+    {
+        return BASE_SIZE;
+    }
+
+    /**
+     * Writes the data position, the trie root relative to basePosition, the row index count and the deletion time.
+     */
+    public void serialize(DataOutputPlus indexFile, long basePosition) throws IOException
+    {
+        indexFile.writeUnsignedVInt(position);
+        long relativeRoot = indexTrieRoot - basePosition;
+        indexFile.writeVInt(relativeRoot);
+        indexFile.writeUnsignedVInt(rowIndexCount);
+        DeletionTime.serializer.serialize(deletionTime, indexFile);
+    }
+
+    /**
+     * Create an index entry. The row index trie must already have been written (by RowIndexWriter) to the row index
+     * file and its root position must be given in trieRoot; a trieRoot of -1 yields a plain RowIndexEntry instead.
+     */
+    public static RowIndexEntry create(long dataStartPosition,
+                                       long trieRoot,
+                                       DeletionTime partitionLevelDeletion,
+                                       int rowIndexCount)
+    {
+        if (trieRoot != -1)
+            return new TrieIndexEntry(dataStartPosition, trieRoot, rowIndexCount, partitionLevelDeletion);
+        return new RowIndexEntry(dataStartPosition) {};
+    }
+
+    /**
+     * Reads back an entry in the layout written by serialize, turning the stored root into basePosition + root.
+     */
+    public static RowIndexEntry deserialize(DataInputPlus in, long basePosition) throws IOException
+    {
+        long dataPosition = in.readUnsignedVInt();
+        long trieRoot = in.readVInt() + basePosition;
+        int indexCount = (int) in.readUnsignedVInt();
+        return new TrieIndexEntry(dataPosition, trieRoot, indexCount, DeletionTime.serializer.deserialize(in));
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
new file mode 100644
index 000000000000..080ced3113dc
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
@@ -0,0 +1,437 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Set;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.format.Version;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.db.Directories.SECONDARY_INDEX_NAME_SEPARATOR;
+import static org.apache.cassandra.io.sstable.SSTable.componentsFor;
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultDataHandleBuilder;
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultIndexHandleBuilder;
+
+/**
+ * Bigtable format with trie indices
+ */
+public class TrieIndexFormat implements SSTableFormat
+{
+    // Data, primary index and row index (which may be 0-length) are required.
+    // For the 3.0+ sstable format, the (misnamed) stats component holds the serialization header which we need to deserialize the sstable content
+    private static final Set<Component> REQUIRED_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                Component.PARTITION_INDEX,
+                                                                Component.ROW_INDEX,
+                                                                Component.STATS);
+
+    private final static Set<Component> SUPPORTED_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                               Component.PARTITION_INDEX,
+                                                                               Component.ROW_INDEX,
+                                                                               Component.FILTER,
+                                                                               Component.COMPRESSION_INFO,
+                                                                               Component.STATS,
+                                                                               Component.DIGEST,
+                                                                               Component.CRC,
+                                                                               Component.TOC);
+
+    private final static Set<Component> STREAMING_COMPONENTS = ImmutableSet.of(Component.DATA,
+                                                                               Component.PARTITION_INDEX,
+                                                                               Component.ROW_INDEX,
+                                                                               Component.STATS,
+                                                                               Component.COMPRESSION_INFO,
+                                                                               Component.FILTER,
+                                                                               Component.DIGEST,
+                                                                               Component.CRC);
+    public static final TrieIndexFormat instance = new TrieIndexFormat();
+    public static final Version latestVersion = new TrieIndexVersion(TrieIndexVersion.current_version);
+    static final ReaderFactory readerFactory = new ReaderFactory();
+    static final WriterFactory writerFactory = new WriterFactory();
+
+
+    private TrieIndexFormat()
+    {
+        // Private on purpose: the only instance is the one held by the static instance field.
+    }
+
+    @Override
+    public Type getType()
+    {
+        return Type.BTI;
+    }
+
+    @Override
+    public Version getLatestVersion()
+    {
+        return latestVersion;
+    }
+
+    @Override
+    public Version getVersion(String version)
+    {
+        return new TrieIndexVersion(version);
+    }
+
+
+    @Override
+    public SSTableWriter.Factory getWriterFactory()
+    {
+        return writerFactory;
+    }
+
+    @Override
+    public SSTableReader.Factory getReaderFactory()
+    {
+        return readerFactory;
+    }
+
+    @Override
+    public Set<Component> requiredComponents()
+    {
+        return REQUIRED_COMPONENTS;
+    }
+
+    @Override
+    public Set<Component> supportedComponents()
+    {
+        return SUPPORTED_COMPONENTS;
+    }
+
+    @Override
+    public Set<Component> streamingComponents()
+    {
+        return STREAMING_COMPONENTS;
+    }
+    static class WriterFactory extends SSTableWriter.Factory
+    {
+        @Override
+        public long estimateSize(SSTableWriter.SSTableSizeParameters parameters)
+        {
+            return (long) ((parameters.partitionCount() // index entries, counted as one unit per partition
+                            + parameters.partitionCount() // keys in data file, likewise one unit per partition
+                            + parameters.dataSize()) // data
+                           * 1.2); // plus 20% for bloom filter and row index overhead
+        }
+
+        @Override
+        public SSTableWriter open(Descriptor descriptor,
+                                  long keyCount,
+                                  long repairedAt,
+                                  UUID pendingRepair,
+                                  boolean isTransient,
+                                  TableMetadataRef metadata,
+                                  MetadataCollector metadataCollector,
+                                  SerializationHeader header,
+                                  Collection<SSTableFlushObserver> observers,
+                                  LifecycleNewTracker lifecycleNewTracker,
+                                  Set<Component> indexComponents)
+        {
+            SSTable.validateRepairedMetadata(repairedAt, pendingRepair, isTransient); // checked before any writer is created
+            return new TrieIndexSSTableWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers, lifecycleNewTracker, indexComponents);
+        }
+    }
+
+    static class ReaderFactory implements SSTableReader.Factory
+    {
+        @SuppressWarnings("IOResourceOpenedButNotSafelyClosed")
+        @Override
+        public PartitionIndexIterator indexIterator(Descriptor desc, TableMetadata metadata)
+        {
+            IPartitioner partitioner = metadata.partitioner;
+            boolean compressedData = desc.fileFor(Component.COMPRESSION_INFO).exists(); // data is treated as compressed iff the component file exists
+            try
+            {
+                @SuppressWarnings("unused") StatsMetadata stats = (StatsMetadata) desc.getMetadataSerializer().deserialize(desc, MetadataType.STATS); // value unused; NOTE(review): presumably read to fail early on unreadable stats - confirm
+
+                try (FileHandle.Builder piBuilder = defaultIndexHandleBuilder(desc, Component.PARTITION_INDEX);
+                     FileHandle.Builder riBuilder = defaultIndexHandleBuilder(desc, Component.ROW_INDEX);
+                     FileHandle.Builder dBuilder = defaultDataHandleBuilder(desc).compressed(compressedData);
+                     PartitionIndex index = PartitionIndex.load(piBuilder, partitioner, false);
+                     FileHandle dFile = dBuilder.complete();
+                     FileHandle riFile = riBuilder.complete())
+                {
+                    return new PartitionIterator(index.sharedCopy(), // shared copies are handed over: the originals are closed when this try block exits
+                                                 partitioner,
+                                                 riFile.sharedCopy(),
+                                                 dFile.sharedCopy())
+                            .closeHandles(); // NOTE(review): presumably makes the iterator close the copies it was given - confirm
+                }
+            }
+            catch (IOException e)
+            {
+                throw Throwables.cleaned(e);
+            }
+        }
+
+        @Override
+        public SSTableReader openForBatch(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata)
+        {
+            return TrieIndexSSTableReader.open(descriptor, Sets.difference(components, Collections.singleton(Component.FILTER)), metadata, true, true);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor descriptor)
+        {
+            TableMetadataRef metadata; // schema lookup: index table when cfname contains the separator, else the table itself
+            if (descriptor.cfname.contains(SECONDARY_INDEX_NAME_SEPARATOR))
+            {
+                int i = descriptor.cfname.indexOf(SECONDARY_INDEX_NAME_SEPARATOR);
+                String indexName = descriptor.cfname.substring(i + 1); // text after the first separator
+                metadata = Schema.instance.getIndexTableMetadataRef(descriptor.ksname, indexName);
+                if (metadata == null) // name the table that failed to resolve, not the separator offset
+                    throw new AssertionError("Could not find index metadata for index cf " + descriptor.ksname + '.' + descriptor.cfname);
+            }
+            else
+            {
+                metadata = Schema.instance.getTableMetadataRef(descriptor.ksname, descriptor.cfname);
+            }
+            return open(descriptor, metadata);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, TableMetadataRef metadata)
+        {
+            return open(desc, componentsFor(desc), metadata);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata)
+        {
+            return open(desc, components, metadata, true, false);
+        }
+
+        @Override
+        public SSTableReader open(Descriptor desc, Set<Component> components, TableMetadataRef metadata, boolean validate, boolean isOffline)
+        {
+            return TrieIndexSSTableReader.open(desc, components, metadata, validate, isOffline);
+        }
+
+        @Override
+        public SSTableReader openNoValidation(Descriptor desc, TableMetadataRef tableMetadataRef)
+        {
+            return TrieIndexSSTableReader.open(desc, componentsFor(desc), tableMetadataRef, false, true);
+        }
+
+        @Override
+        public SSTableReader openNoValidation(Descriptor desc, Set<Component> components, ColumnFamilyStore cfs)
+        {
+            return TrieIndexSSTableReader.open(desc, components, cfs.metadata, false, true);
+        }
+
+        @Override
+        public SSTableReader moveAndOpenSSTable(ColumnFamilyStore cfs, Descriptor oldDescriptor, Descriptor newDescriptor, Set<Component> components, boolean copyData)
+        {
+            return SSTableReader.moveAndOpenSSTable(cfs, oldDescriptor, newDescriptor, components, copyData);
+        }
+    }
+
+    // versions are denoted as [major][minor].  Minor versions must be forward-compatible:
+    // new fields are allowed in e.g. the metadata component, but fields can't be removed
+    // or have their size changed.
+    //
+    static class TrieIndexVersion extends Version
+    {
+        public static final String current_version = "ca";
+        public static final String earliest_supported_version = "aa";
+
+        // aa (DSE 6.0): trie index format
+        // ab (DSE pre-6.8): ILLEGAL - mapped to 'ba' by mapAb(). Pre-GA "LABS" releases of DSE 6.8 used this
+        //                   sstable version.
+        // ac (DSE 6.0.11, 6.7.6): corrected sstable min/max clustering (DB-3691/CASSANDRA-14861)
+        // ad (DSE 6.0.14, 6.7.11): added hostId of the node from which the sstable originated (DB-4629)
+        // b  (DSE early 6.8 "LABS") has some of 6.8 features but not all
+        // ba (DSE 6.8): encrypted indices and metadata
+        //               new BloomFilter serialization format
+        //               add incremental NodeSync information to metadata
+        //               improved min/max clustering representation
+        //               presence marker for partition level deletions
+        // bb (DSE 6.8.5): added hostId of the node from which the sstable originated (DB-4629)
+        // ca (DSE-DB aka Stargazer based on OSS 4.0): all bb fields  + all OSS fields
+        // NOTE: when adding a new version, please add that to LegacySSTableTest, too.
+
+        private final boolean isLatestVersion;
+
+        /**
+         * DB-2648/CASSANDRA-9067: DSE 6.8/OSS 4.0 bloom filter representation changed (bitset data is no longer stored
+         * as BIG_ENDIAN longs, which avoids some redundant bit twiddling).
+         */
+        private final boolean hasOldBfFormat;
+        private final boolean hasAccurateLegacyMinMax;
+        private final boolean hasOriginatingHostId;
+        private final boolean hasMaxColumnValueLengths;
+
+        private final int correspondingMessagingVersion;
+
+        TrieIndexVersion(String version)
+        {
+            super(instance, version = mapAb(version));
+
+            isLatestVersion = version.compareTo(current_version) == 0;
+            hasOldBfFormat = version.compareTo("b") < 0;
+            hasAccurateLegacyMinMax = version.compareTo("ac") >= 0;
+            hasOriginatingHostId = version.matches("(a[d-z])|(b[b-z])") || version.compareTo("ca") >= 0;
+            hasMaxColumnValueLengths = version.matches("b[a-z]"); // TODO TBD
+            correspondingMessagingVersion = version.compareTo("ca") >= 0 ? MessagingService.VERSION_40 : MessagingService.VERSION_3014;
+        }
+
+        // this is for the ab version which was used in the LABS, and then has been renamed to ba
+        private static String mapAb(String version)
+        {
+            return "ab".equals(version) ? "ba" : version;
+        }
+
+        @Override
+        public boolean isLatestVersion()
+        {
+            return isLatestVersion;
+        }
+
+        @Override
+        public boolean hasCommitLogLowerBound()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean hasCommitLogIntervals()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean hasMaxCompressedLength()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean hasPendingRepair()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean hasMetadataChecksum()
+        {
+            return true;
+        }
+
+        @Override
+        public boolean hasZeroCopyMetadata()
+        {
+            return version.compareTo("b") >= 0 && version.compareTo("c") < 0;
+        }
+
+        @Override
+        public boolean hasIncrementalNodeSyncMetadata()
+        {
+            return version.compareTo("b") >= 0 && version.compareTo("c") < 0;
+        }
+
+        @Override
+        public boolean hasAccurateMinMax()
+        {
+            return hasAccurateLegacyMinMax;
+        }
+
+        @Override
+        public boolean hasPartitionLevelDeletionsPresenceMarker()
+        {
+            return version.compareTo("ba") >= 0;
+        }
+
+        @Override
+        public boolean hasImprovedMinMax()
+        {
+            return version.compareTo("ba") >= 0;
+        }
+
+        // TODO TBD
+        @Override
+        public boolean hasMaxColumnValueLengths()
+        {
+            return hasMaxColumnValueLengths;
+        }
+
+        // TODO TBD
+        @Override
+        public boolean hasOriginatingHostId()
+        {
+            return hasOriginatingHostId;
+        }
+
+        @Override
+        public boolean isCompatible()
+        {
+            return version.compareTo(earliest_supported_version) >= 0 && version.charAt(0) <= current_version.charAt(0);
+        }
+
+        @Override
+        public boolean hasOldBfFormat()
+        {
+            return hasOldBfFormat;
+        }
+
+        // this field is not present in DSE
+        @Override
+        public int correspondingMessagingVersion()
+        {
+            return correspondingMessagingVersion;
+        }
+
+        // this field is not present in DSE
+        @Override
+        public boolean isCompatibleForStreaming()
+        {
+            return isCompatible() && version.charAt(0) == current_version.charAt(0);
+        }
+
+        // this field is not present in DSE
+        @Override
+        public boolean hasIsTransient()
+        {
+            return version.compareTo("ca") >= 0;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
new file mode 100644
index 000000000000..88fcb5e1e50c
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
@@ -0,0 +1,1099 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.BufferedInputStream;
+import java.io.Closeable;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Set;
+import javax.annotation.Nonnull;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cache.InstrumentingCache;
+import org.apache.cassandra.cache.KeyCacheKey;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterators;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.Downsampling;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SelectionReason;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SkippingReason;
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.trieindex.PartitionIndex.IndexPosIterator;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
+import org.apache.cassandra.io.util.ChannelProxy;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.BloomFilter;
+import org.apache.cassandra.utils.BloomFilterSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.SyncUtil;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.Ref;
+
+import static org.apache.cassandra.io.sstable.format.SSTableReader.Operator.EQ;
+import static org.apache.cassandra.io.sstable.format.SSTableReader.Operator.GE;
+import static org.apache.cassandra.io.sstable.format.SSTableReader.Operator.GT;
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultDataHandleBuilder;
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultIndexHandleBuilder;
+
+/**
+ * SSTableReaders are open()ed by Keyspace.onStart; after that they are created by SSTableWriter.renameAndOpen.
+ * Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
+ */
+public class TrieIndexSSTableReader extends SSTableReader
+{
+    private static final Logger logger = LoggerFactory.getLogger(TrieIndexSSTableReader.class);
+
+    protected FileHandle rowIndexFile;
+    protected PartitionIndex partitionIndex;
+
+    @VisibleForTesting
+    public static final double fpChanceTolerance = Double.parseDouble(System.getProperty(Config.PROPERTY_PREFIX + "bloom_filter_fp_chance_tolerance", "0.000001"));
+
+    TrieIndexSSTableReader(Descriptor desc, Set<Component> components, TableMetadataRef metadata, Long maxDataAge, StatsMetadata sstableMetadata, OpenReason openReason, SerializationHeader header, FileHandle dfile, FileHandle rowIndexFile, PartitionIndex partitionIndex, IFilter bf)
+    {
+        super(desc, components, metadata, maxDataAge, sstableMetadata, openReason, header, null, dfile, null, bf);
+        this.rowIndexFile = rowIndexFile;
+        this.partitionIndex = partitionIndex;
+    }
+
+    /**
+     * Clone this reader with the new start, open reason, bloom filter, and set the clone as replacement.
+     *
+     * @param first      new start for the replacement
+     * @param openReason the {@code OpenReason} for the replacement.
+     * @param bf         Bloom filter for the replacement
+     * @return the cloned reader. That reader is set as a replacement by the method.
+     */
+    private TrieIndexSSTableReader cloneInternal(DecoratedKey first, OpenReason openReason, IFilter bf)
+    {
+        TrieIndexSSTableReader clone = new TrieIndexSSTableReader(descriptor,
+                                                                  components,
+                                                                  metadata,
+                                                                  maxDataAge,
+                                                                  sstableMetadata,
+                                                                  openReason,
+                                                                  header,
+                                                                  dfile.sharedCopy(),
+                                                                  rowIndexFile.sharedCopy(),
+                                                                  partitionIndex.sharedCopy(),
+                                                                  bf);
+        clone.first = first;
+        clone.last = last;
+        clone.isSuspect.set(isSuspect.get());
+        clone.setup(true);
+
+        return clone;
+    }
+
+    /**
+     * Open a TrieIndexSSTableReader which already has its state initialized (by SSTableWriter).
+     */
+    static TrieIndexSSTableReader internalOpen(Descriptor desc,
+                                               Set<Component> components,
+                                               TableMetadataRef metadata,
+                                               FileHandle rowIndexFile,
+                                               FileHandle dfile,
+                                               PartitionIndex partitionIndex,
+                                               IFilter bf,
+                                               long maxDataAge,
+                                               StatsMetadata sstableMetadata,
+                                               OpenReason openReason,
+                                               SerializationHeader header)
+    {
+        assert desc != null && rowIndexFile != null && dfile != null && partitionIndex != null && bf != null && sstableMetadata != null;
+
+        // Make sure the SSTableReader internalOpen part does the same.
+        assert desc.getFormat() == TrieIndexFormat.instance;
+        TrieIndexSSTableReader reader = new TrieIndexSSTableReader(desc, components, metadata, maxDataAge, sstableMetadata, openReason, header, dfile, rowIndexFile, partitionIndex, bf);
+        reader.first = partitionIndex.firstKey();
+        reader.last = partitionIndex.lastKey();
+
+        return reader;
+    }
+
+    static TrieIndexSSTableReader internalOpen(Descriptor desc,
+                                               Set<Component> components,
+                                               TableMetadataRef metadata,
+                                               FileHandle dfile,
+                                               IFilter bf,
+                                               long maxDataAge,
+                                               StatsMetadata sstableMetadata,
+                                               OpenReason openReason,
+                                               SerializationHeader header)
+    {
+        assert desc != null && dfile != null && bf != null && sstableMetadata != null;
+
+        // Make sure the SSTableReader internalOpen part does the same.
+        assert desc.getFormat() == TrieIndexFormat.instance;
+        return new TrieIndexSSTableReader(desc, components, metadata, maxDataAge, sstableMetadata, openReason, header, dfile, null, null, bf);
+    }
+
+    @Override
+    public void setup(boolean trackHotness)
+    {
+        tidy.setup(this, trackHotness, Arrays.asList(bf, dfile, partitionIndex, rowIndexFile));
+        super.setup(trackHotness);
+    }
+
+    @Override
+    public void addTo(Ref.IdentityCollection identities)
+    {
+        super.addTo(identities);
+        if (rowIndexFile != null) rowIndexFile.addTo(identities);     // both index handles are null for readers
+        if (partitionIndex != null) partitionIndex.addTo(identities); // built by the index-less internalOpen overload
+    }
+
+    protected boolean filterFirst()
+    {
+        return openReason == OpenReason.MOVED_START;
+    }
+
+    protected boolean filterLast()
+    {
+        return false;
+    }
+
+    public long estimatedKeys()
+    {
+        return partitionIndex == null ? 0 : partitionIndex.size();
+    }
+
+    @Override
+    protected RowIndexEntry getPosition(PartitionPosition key, Operator op, boolean updateCacheAndStats, boolean permitMatchPastLast, SSTableReadsListener listener)
+    {
+        assert !permitMatchPastLast;
+
+        PartitionPosition searchKey;
+        Operator searchOp;
+
+        if (op == EQ)
+            return getExactPosition((DecoratedKey) key, listener, updateCacheAndStats);
+
+        if (op == GT || op == GE)
+        {
+            if (filterLast() && last.compareTo(key) < 0)
+                return null;
+            boolean filteredLeft = (filterFirst() && first.compareTo(key) > 0);
+            searchKey = filteredLeft ? first : key;
+            searchOp = filteredLeft ? GE : op;
+
+            try (PartitionIndex.Reader reader = partitionIndex.openReader())
+            {
+                return reader.ceiling(searchKey, (pos, assumeNoMatch, compareKey) -> retrieveEntryIfAcceptable(searchOp, compareKey, pos, assumeNoMatch));
+            }
+            catch (IOException e)
+            {
+                markSuspect();
+                throw new CorruptSSTableException(e, rowIndexFile.path());
+            }
+        }
+
+        throw new UnsupportedOperationException("Unsupported op: " + op);
+    }
+
+    /**
+     * Called by getPosition above (via Reader.ceiling/floor) to check if the position satisfies the full key constraint.
+     * This is called once if there is a prefix match (which can be in any relationship with the sought key, thus
+     * assumeNoMatch: false), and if it returns null it is called again for the closest greater position
+     * (with assumeNoMatch: true).
+     * Returns the index entry at this position, or null if the search op rejects it.
+     */
+    private RowIndexEntry retrieveEntryIfAcceptable(Operator searchOp, PartitionPosition searchKey, long pos, boolean assumeNoMatch) throws IOException
+    {
+        if (pos >= 0) // non-negative: pos is an offset into the row index file
+        {
+            try (FileDataInput in = rowIndexFile.createReader(pos))
+            {
+                if (assumeNoMatch)
+                    ByteBufferUtil.skipShortLength(in); // key is not compared; step over it to reach the entry
+                else
+                {
+                    ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
+                    DecoratedKey decorated = decorateKey(indexKey);
+                    if (searchOp.apply(decorated.compareTo(searchKey)) != 0)
+                        return null; // stored key rejected by the search operator
+                }
+                return TrieIndexEntry.deserialize(in, in.getFilePointer()); // entry is read from just after the key
+            }
+        }
+        else
+        {
+            pos = ~pos; // negative: the bitwise complement is an offset into the data file
+            if (!assumeNoMatch)
+            {
+                try (FileDataInput in = dfile.createReader(pos))
+                {
+                    ByteBuffer indexKey = ByteBufferUtil.readWithShortLength(in);
+                    DecoratedKey decorated = decorateKey(indexKey);
+                    if (searchOp.apply(decorated.compareTo(searchKey)) != 0)
+                        return null; // stored key rejected by the search operator
+                }
+            }
+            return new RowIndexEntry(pos); // built from the data file position alone
+        }
+    }
+
+    protected boolean inBloomFilter(DecoratedKey dk)
+    {
+        return first.compareTo(dk) <= 0 && last.compareTo(dk) >= 0 && bf.isPresent(dk);
+    }
+
+    @Override
+    public DecoratedKey keyAt(RandomAccessReader reader, long dataPosition) throws IOException
+    {
+        reader.seek(dataPosition);
+        if (reader.isEOF()) return null;
+        return decorateKey(ByteBufferUtil.readWithShortLength(reader));
+    }
+
+    @Override
+    public DecoratedKey keyAt(FileDataInput reader) throws IOException
+    {
+        if (reader.isEOF()) return null;
+
+        return decorateKey(ByteBufferUtil.readWithShortLength(reader));
+    }
+
+    public RowIndexEntry getExactPosition(DecoratedKey dk,
+                                          SSTableReadsListener listener,
+                                          boolean updateStats) // true to record bloom filter true/false positives
+    {
+        if (!inBloomFilter(dk)) // as implemented in this class, also false when dk is outside [first, last]
+        {
+            listener.onSSTableSkipped(this, SkippingReason.BLOOM_FILTER);
+            Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
+            return null;
+        }
+
+        if ((filterFirst() && first.compareTo(dk) > 0) || (filterLast() && last.compareTo(dk) < 0))
+        {
+            if (updateStats)
+                bloomFilterTracker.addFalsePositive(); // the filter accepted a key that is filtered out here
+            listener.onSSTableSkipped(this, SkippingReason.MIN_MAX_KEYS);
+            return null;
+        }
+
+        try (PartitionIndex.Reader reader = partitionIndex.openReader())
+        {
+            long indexPos = reader.exactCandidate(dk); // only a candidate: the stored key is verified below
+            if (indexPos == PartitionIndex.NOT_FOUND)
+            {
+                if (updateStats)
+                    bloomFilterTracker.addFalsePositive();
+                listener.onSSTableSkipped(this, SkippingReason.PARTITION_INDEX_LOOKUP);
+                return null;
+            }
+
+            FileHandle fh;
+            long seekPosition;
+            if (indexPos >= 0) // non-negative: offset into the row index file
+            {
+                fh = rowIndexFile;
+                seekPosition = indexPos;
+            }
+            else // negative: the bitwise complement is an offset into the data file
+            {
+                fh = dfile;
+                seekPosition = ~indexPos;
+            }
+
+            try (FileDataInput in = fh.createReader(seekPosition))
+            {
+                return ByteBufferUtil.equalsWithShortLength(in, dk.getKey()) // compare the key stored at that position
+                       ? handleKeyFound(updateStats, listener, in, indexPos)
+                       : handleKeyNotFound(updateStats, listener);
+            }
+        }
+        catch (IOException e)
+        {
+            markSuspect();
+            throw new CorruptSSTableException(e, rowIndexFile.path()); // NOTE(review): row index path is reported even when dfile was read
+        }
+    }
+
+    private RowIndexEntry handleKeyNotFound(boolean updateStats, SSTableReadsListener listener)
+    {
+        if (updateStats)
+            bloomFilterTracker.addFalsePositive();
+        listener.onSSTableSkipped(this, SkippingReason.INDEX_ENTRY_NOT_FOUND);
+        return null;
+    }
+
+    private RowIndexEntry handleKeyFound(boolean updateStats, SSTableReadsListener listener, FileDataInput in, long indexPos) throws IOException
+    {
+        if (updateStats)
+            bloomFilterTracker.addTruePositive();
+        RowIndexEntry entry = indexPos >= 0 ? TrieIndexEntry.deserialize(in, in.getFilePointer())
+                                            : new RowIndexEntry(~indexPos);
+
+        listener.onSSTableSelected(this, entry, SelectionReason.INDEX_ENTRY_FOUND);
+        return entry;
+    }
+
+    /**
+     * @param bounds the bounds to cover; must not wrap around
+     * @return a PartitionIterator over the keys within the given bounds
+     */
+    public PartitionIterator coveredKeysIterator(AbstractBounds<PartitionPosition> bounds) throws IOException
+    {
+        return new KeysRange(bounds).iterator();
+    }
+
+    private final class KeysRange
+    {
+        PartitionPosition left;
+        boolean inclusiveLeft;
+        PartitionPosition right;
+        boolean inclusiveRight;
+
+        KeysRange(AbstractBounds<PartitionPosition> bounds)
+        {
+            assert !AbstractBounds.strictlyWrapsAround(bounds.left, bounds.right) : String.format("[%s,%s]", bounds.left, bounds.right);
+
+            left = bounds.left;
+            inclusiveLeft = bounds.inclusiveLeft();
+            if (filterFirst() && first.compareTo(left) > 0)
+            {
+                left = first;
+                inclusiveLeft = true;
+            }
+
+            right = bounds.right;
+            inclusiveRight = bounds.inclusiveRight();
+            if (filterLast() && last.compareTo(right) < 0)
+            {
+                right = last;
+                inclusiveRight = true;
+            }
+        }
+
+        PartitionIterator iterator() throws IOException
+        {
+            return coveredKeysIterator(left, inclusiveLeft, right, inclusiveRight);
+        }
+    }
+
+    public PartitionIterator coveredKeysIterator(PartitionPosition left, boolean inclusiveLeft, PartitionPosition right, boolean inclusiveRight) throws IOException
+    {
+        AbstractBounds<PartitionPosition> cover = Bounds.bounds(left, inclusiveLeft, right, inclusiveRight);
+        boolean isLeftInSStableRange = !filterFirst() || first.compareTo(left) <= 0 && last.compareTo(left) >= 0;
+        boolean isRightInSStableRange = !filterLast() || first.compareTo(right) <= 0 && last.compareTo(right) >= 0;
+        if (isLeftInSStableRange || isRightInSStableRange || (cover.contains(first) && cover.contains(last)))
+        {
+            inclusiveLeft = isLeftInSStableRange ? inclusiveLeft : true;
+            inclusiveRight = isRightInSStableRange ? inclusiveRight : true;
+            return new PartitionIterator(partitionIndex,
+                                         metadata().partitioner,
+                                         rowIndexFile, dfile,
+                                         isLeftInSStableRange ? left : first, inclusiveLeft ? -1 : 0,
+                                         isRightInSStableRange ? right : last, inclusiveRight ? 0 : -1);
+        }
+        else
+            return PartitionIterator.empty(partitionIndex);
+    }
+
+    public PartitionIterator allKeysIterator() throws IOException
+    {
+        return new PartitionIterator(partitionIndex, metadata().partitioner, rowIndexFile, dfile);
+    }
+
+    public ScrubPartitionIterator scrubPartitionsIterator() throws IOException
+    {
+        if (partitionIndex == null)
+            return null;
+        return new ScrubIterator(partitionIndex, rowIndexFile);
+    }
+
+    @Override
+    public Iterable<DecoratedKey> getKeySamples(final Range<Token> range)
+    {
+        Iterator<IndexPosIterator> partitionKeyIterators = TrieIndexScanner.makeBounds(this,
+                                                                                       Collections.singleton(range))
+                                                                           .stream()
+                                                                           .map(this::indexPosIteratorForRange)
+                                                                           .iterator();
+
+        if (!partitionKeyIterators.hasNext())
+            return Collections.emptyList();
+
+        return () -> new AbstractIterator<DecoratedKey>() // NOTE(review): shares one source Iterator, so a single traversal is supported
+        {
+            IndexPosIterator currentItr = partitionKeyIterators.next();
+            long count = -1; // incremented before each use, so the first position is numbered 0
+
+            private long getNextPos() throws IOException
+            {
+                long pos;
+                while ((pos = currentItr.nextIndexPos()) == PartitionIndex.NOT_FOUND
+                       && partitionKeyIterators.hasNext())
+                {
+                    closeCurrentIt(); // current bound exhausted: move on to the next one
+                    currentItr = partitionKeyIterators.next();
+                }
+                return pos; // NOT_FOUND here means every bound is exhausted
+            }
+
+            private void closeCurrentIt()
+            {
+                if (currentItr != null)
+                    currentItr.close();
+                currentItr = null;
+            }
+
+            @Override
+            protected DecoratedKey computeNext()
+            {
+                try
+                {
+                    while (true)
+                    {
+                        long pos = getNextPos();
+                        count++;
+                        if (pos == PartitionIndex.NOT_FOUND)
+                            break;
+                        if (count % Downsampling.BASE_SAMPLING_LEVEL == 0) // every BASE_SAMPLING_LEVEL-th position is a candidate
+                        {
+                            // handle exclusive start and exclusive end
+                            DecoratedKey key = getKeyByPos(pos);
+                            if (range.contains(key.getToken()))
+                                return key;
+                            count--; // rejected: undo the increment so the following position becomes the candidate
+                        }
+                    }
+                    closeCurrentIt(); // NOTE(review): closed only on exhaustion or IOException, not if traversal is abandoned
+                    return endOfData();
+                }
+                catch (IOException e)
+                {
+                    closeCurrentIt();
+                    markSuspect();
+                    throw new CorruptSSTableException(e, dfile.path());
+                }
+            }
+        };
+    }
+
+    private DecoratedKey getKeyByPos(long pos) throws IOException
+    {
+        assert pos != PartitionIndex.NOT_FOUND;
+
+        if (pos >= 0)
+            try (FileDataInput in = rowIndexFile.createReader(pos))
+            {
+                return metadata().partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in));
+            }
+        else
+            try (FileDataInput in = dfile.createReader(~pos))
+            {
+                return metadata().partitioner.decorateKey(ByteBufferUtil.readWithShortLength(in));
+            }
+    }
+
+    private IndexPosIterator indexPosIteratorForRange(AbstractBounds<PartitionPosition> bound)
+    {
+        return new IndexPosIterator(partitionIndex, bound.left, bound.right); // both ends of the bound are handed to the iterator as its limits
+    }
+
+    @Override
+    public long estimatedKeysForRanges(Collection<Range<Token>> ranges)
+    {
+        // Estimate the number of partitions by calculating the bytes of the sstable that are covered by the specified
+        // ranges and using the mean partition size to obtain a number of partitions from that.
+        long selectedDataSize = 0;
+        for (Range<Token> range : Range.normalize(ranges))
+        {
+            PartitionPosition left = range.left.minKeyBound();
+            if (left.compareTo(first) <= 0)
+                left = null;    // range starts at or before the first key: no lower cut-off needed
+            else if (left.compareTo(last) > 0)
+                continue;   // no intersection
+
+            PartitionPosition right = range.right.minKeyBound();
+            if (range.right.isMinimum() || right.compareTo(last) >= 0)
+                right = null;   // range runs to or past the last key: no upper cut-off needed
+            else if (right.compareTo(first) < 0)
+                continue;   // no intersection
+
+            if (left == null && right == null)
+                return partitionIndex.size();   // sstable is fully covered, return full partition count to avoid rounding errors
+
+            if (left == null && filterFirst())
+                left = first;
+            if (right == null && filterLast())
+                right = last;
+
+            long startPos = left != null ? getPosition(left, GE).position : 0; // NOTE(review): relies on getPosition not returning null for a bound within [first, last] -- confirm
+            long endPos = right != null ? getPosition(right, GE).position : uncompressedLength();
+            selectedDataSize += endPos - startPos;
+        }
+        return (long) (selectedDataSize / sstableMetadata.estimatedPartitionSize.rawMean()); // covered bytes divided by the mean partition size
+    }
+
+    @Override
+    public UnfilteredRowIterator iterator(DecoratedKey key,
+                                          Slices slices,
+                                          ColumnFilter selectedColumns,
+                                          boolean reversed,
+                                          SSTableReadsListener listener)
+    {
+        return iterator(null, key, getExactPosition(key, listener, true), slices, selectedColumns, reversed); // looks the key up first; null = no data file input is supplied by this overload
+    }
+
+    public UnfilteredRowIterator iterator(FileDataInput dataFileInput,
+                                          DecoratedKey key,
+                                          RowIndexEntry indexEntry,
+                                          Slices slices,
+                                          ColumnFilter selectedColumns,
+                                          boolean reversed)
+    {
+        // Without an index entry there is nothing to read: hand back an iterator that has no rows.
+        if (indexEntry == null)
+            return UnfilteredRowIterators.noRowsIterator(metadata(), key, Rows.EMPTY_STATIC_ROW, DeletionTime.LIVE, reversed);
+        if (reversed)
+            return new SSTableReversedIterator(this, dataFileInput, key, indexEntry, slices, selectedColumns, rowIndexFile);
+        return new SSTableIterator(this, dataFileInput, key, indexEntry, slices, selectedColumns, rowIndexFile);
+    }
+
+    public interface PartitionReader extends Closeable
+    {
+        /**
+         * Returns the next {@link Unfiltered}, or {@code null} once the reader is exhausted.
+         */
+        Unfiltered next() throws IOException;
+    }
+
+    @Override
+    public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, boolean tombstoneOnly)
+    {
+        // Exact-match lookup of the key; when no entry comes back the caller gets null instead of an iterator.
+        RowIndexEntry entry = getPosition(key, SSTableReader.Operator.EQ, true, false, SSTableReadsListener.NOOP_LISTENER);
+        return entry == null
+               ? null : SSTableIdentityIterator.create(this, dfile, entry, key, tombstoneOnly);
+    }
+
+    public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, RowIndexEntry indexEntry, boolean tombstoneOnly)
+    {
+        return SSTableIdentityIterator.create(this, dfile, indexEntry, key, tombstoneOnly); // index entry is supplied by the caller, so no lookup happens here
+    }
+
+    @Override
+    public ISSTableScanner getScanner()
+    {
+        return TrieIndexScanner.getScanner(this); // no ranges, columns or listener are passed in this overload
+    }
+
+    @Override
+    public ISSTableScanner getScanner(Collection<Range<Token>> ranges)
+    {
+        if (ranges != null)
+            return TrieIndexScanner.getScanner(this, ranges);
+        else
+            return getScanner();
+    }
+
+    @Override
+    public ISSTableScanner getScanner(Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
+    {
+        return TrieIndexScanner.getScanner(this, rangeIterator); // bounds are passed through as given; unlike the Collection overload, null is not special-cased here
+    }
+
+    @Override
+    public ISSTableScanner getScanner(ColumnFilter columns, DataRange dataRange, SSTableReadsListener listener)
+    {
+        return TrieIndexScanner.getScanner(this, columns, dataRange, listener); // all three arguments are forwarded unchanged
+    }
+
+    @Override
+    public boolean hasIndex()
+    {
+        return descriptor.fileFor(Component.PARTITION_INDEX).exists(); // "index" here is the partition index component file on disk
+    }
+
+    @Override
+    public void setupOnline()
+    {
+        final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata().id);
+        if (cfs != null) // only when a store is found for this table id; otherwise the CRC check chance is left as is
+            setCrcCheckChance(cfs.getCrcCheckChance());
+    }
+
+    @Override
+    public SSTableReader cloneAndReplace(IFilter newBloomFilter)
+    {
+        return cloneInternal(first, openReason, newBloomFilter); // same first key and open reason; only the bloom filter differs
+    }
+
+    @Override
+    public SSTableReader cloneWithRestoredStart(DecoratedKey restoredStart)
+    {
+        return runWithLock(d -> cloneInternal(restoredStart, OpenReason.NORMAL, bf.sharedCopy())); // clone is opened as NORMAL with a shared copy of the current bloom filter
+    }
+
+    @Override
+    public SSTableReader cloneWithNewStart(DecoratedKey newStart, Runnable runOnClose)
+    {
+        return runWithLock(d -> {
+            assert openReason != OpenReason.EARLY;
+            // TODO: merge with caller's firstKeyBeyond() work, to save time
+            if (newStart.compareTo(first) > 0)
+            {
+                final long dataStart = getPosition(newStart, Operator.EQ).position; // NOTE(review): assumes newStart exists in this sstable; a null lookup result would NPE here -- confirm
+                runOnClose(new DropPageCache(dfile, dataStart, null, -1, runOnClose)); // registers a DropPageCache task built from the data file and the new start offset
+            }
+
+            return cloneInternal(newStart, OpenReason.MOVED_START, bf.sharedCopy());
+        });
+    }
+
+    @Override
+    public SSTableReader cloneWithNewSummarySamplingLevel(ColumnFamilyStore parent, int samplingLevel)
+    {
+        // Nothing to resample: the sampling level belongs to the index summary, which the trie index implementation lacks; both arguments are ignored.
+        return cloneInternal(first, openReason, bf.sharedCopy());
+    }
+
+    @Override
+    public int getIndexSummarySamplingLevel()
+    {
+        return 0; // tries do not have index summaries, so there is no sampling level to report
+    }
+
+    @Override
+    public long getIndexSummaryOffHeapSize()
+    {
+        return 0; // tries do not have index summaries, so no summary size is reported
+    }
+
+    @Override
+    public int getMinIndexInterval()
+    {
+        return 0; // tries do not have index summaries, so there is no index interval
+    }
+
+    @Override
+    public double getEffectiveIndexInterval()
+    {
+        return 0; // tries do not have index summaries, so there is no effective index interval
+    }
+
+    @Override
+    public void releaseSummary()
+    {
+        // no-op - tries do not have index summaries, so there is nothing to release
+    }
+
+    @Override
+    public long getIndexScanPosition(PartitionPosition key)
+    {
+        // TODO check this; presumably unsupported because there is no index summary to derive a scan position from
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int getIndexSummarySize()
+    {
+        return 0; // tries do not have index summaries, so the summary size is reported as zero
+    }
+
+    @Override
+    public int getMaxIndexSummarySize()
+    {
+        return 0; // tries do not have index summaries, so the maximum summary size is reported as zero
+    }
+
+    @Override
+    public byte[] getIndexSummaryKey(int index)
+    {
+        // TODO check this; presumably unsupported because there are no index summary entries to read a key from
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public void cacheKey(DecoratedKey key, BigTableRowIndexEntry info)
+    {
+        // no-op - tries do not have key cache
+    }
+
+    @Override
+    public BigTableRowIndexEntry getCachedPosition(DecoratedKey key, boolean updateStats)
+    {
+        return null; // tries do not have key cache, so nothing is ever cached
+    }
+
+    @Override
+    protected BigTableRowIndexEntry getCachedPosition(KeyCacheKey unifiedKey, boolean updateStats)
+    {
+        return null; // tries do not have key cache, so nothing is ever cached
+    }
+
+    @Override
+    public boolean isKeyCacheEnabled()
+    {
+        return false; // tries do not have key cache
+    }
+
+    @Override
+    public DecoratedKey firstKeyBeyond(PartitionPosition token)
+    {
+        try
+        {
+            // Look up the first entry strictly after the given position; no entry is reported to the caller as null.
+            RowIndexEntry following = getPosition(token, Operator.GT);
+            if (following == null)
+                return null;
+
+            try (FileDataInput keyInput = dfile.createReader(following.position))
+            {
+                return decorateKey(ByteBufferUtil.readWithShortLength(keyInput));
+            }
+        }
+        catch (IOException e)
+        {
+            markSuspect();
+            throw new CorruptSSTableException(e, dfile.path());
+        }
+    }
+
+    @Override
+    public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
+    {
+        return null; // tries do not have key cache, so there is no cache instance to return
+    }
+
+    @Override
+    public long getKeyCacheHit()
+    {
+        return 0L; // tries do not have key cache, so the hit count is always zero
+    }
+
+    @Override
+    public long getKeyCacheRequest()
+    {
+        return 0L;  // tries do not have key cache, so the request count is always zero
+    }
+
+    @Override
+    public ChannelProxy getIndexChannel()
+    {
+        throw new UnsupportedOperationException("tries do not have primary index"); // always throws: there is no primary index channel to return
+    }
+
+    @Override
+    public RandomAccessReader openIndexReader()
+    {
+        throw new UnsupportedOperationException("tries do not have primary index"); // always throws: there is no primary index to open a reader on
+    }
+
+    @Override
+    public RandomAccessReader openKeyComponentReader()
+    {
+        return openDataReader(); // the component used for reading keys is the data file itself
+    }
+
+    @Override
+    public FileHandle getIndexFile()
+    {
+        throw new UnsupportedOperationException("tries do not have primary index"); // always throws: there is no primary index file handle
+    }
+
+    private static IFilter deserializeBloomFilter(Descriptor descriptor, boolean oldBfFormat)
+    {
+        try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(descriptor.filenameFor(Component.FILTER))))))
+        {
+            return BloomFilter.serializer.deserialize(stream, oldBfFormat);
+        }
+        catch (Throwable t)
+        {
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.error("Failed to deserialize bloom filter: {}", t.getMessage());
+            return null;
+        }
+    }
+
+    private static IFilter recreateBloomFilter(Descriptor descriptor, TableMetadata metadata, long estimatedKeysCount, Map<MetadataType, MetadataComponent> sstableMetadata, double fpChance)
+    {
+        if (estimatedKeysCount <= 0)
+        {
+            logger.warn("Cannot recreate bloom filter, cannot estimate number of keys");
+            return null;
+        }
+
+        IFilter bf = null;
+        try
+        {
+            bf = FilterFactory.getFilter(estimatedKeysCount, fpChance);
+
+            Factory readerFactory = descriptor.getFormat().getReaderFactory();
+            try (PartitionIterator iter = (PartitionIterator) readerFactory.indexIterator(descriptor, metadata))
+            {
+                while (true)
+                {
+                    DecoratedKey key = iter.decoratedKey();
+                    if (key == null)
+                        break;
+                    bf.add(key);
+                    iter.advance();
+                }
+            }
+
+            File path = descriptor.fileFor(Component.FILTER);
+            try (SeekableByteChannel fos = Files.newByteChannel(path.toPath(), StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
+                 DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
+            {
+                BloomFilter.serializer.serialize((BloomFilter) bf, stream);
+                stream.flush();
+                SyncUtil.sync((FileChannel) fos);
+            }
+
+            // Update the sstable metadata to contain the current FP chance
+            ValidationMetadata validation = new ValidationMetadata(metadata.partitioner.getClass().getCanonicalName(), fpChance);
+            sstableMetadata.put(MetadataType.VALIDATION, validation);
+            descriptor.getMetadataSerializer().rewriteSSTableMetadata(descriptor, sstableMetadata);
+            return bf;
+        }
+        catch (Throwable t)
+        {
+            if (bf != null)
+            {
+                bf.close();
+            }
+
+            JVMStabilityInspector.inspectThrowable(t);
+            logger.error("Failed to recreate bloom filter: {}", t.getMessage());
+
+            return null;
+        }
+    }
+
+    /**
+     * Returns the bloom filter to use: a pass-through one when the FP chance is 1, else the one read from Filter.db (if asked to load and the file exists),
+     * else a recreated one (if asked to recreate). NOTE(review): a changed FP chance only alters the logged reason; an existing filter file is still loaded -- confirm this is intended.
+     *
+     * @param sstableMetadata the sstable metadata, for extracting and changing the FP chance
+     * @param fpChance        the current FP chance taken from the table metadata
+     */
+    @VisibleForTesting
+    static @Nonnull
+    IFilter getBloomFilter(Descriptor descriptor, boolean loadIfNeeded, boolean recreateIfNeeded, TableMetadata metadata, long estimatedKeysCount, Map<MetadataType, MetadataComponent> sstableMetadata, double fpChance)
+    {
+        if (Math.abs(1 - fpChance) <= fpChanceTolerance)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Returning pass-through bloom filter, FP chance is equal to 1: {}", fpChance);
+
+            return FilterFactory.AlwaysPresent;
+        }
+
+        ValidationMetadata validation = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
+        boolean fpChanged = validation != null && Math.abs(fpChance - validation.bloomFilterFPChance) > fpChanceTolerance; // open() tolerates missing validation metadata, so do the same here instead of throwing an NPE
+
+        if (loadIfNeeded && descriptor.fileFor(Component.FILTER).exists())
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Deserializing bloom filter");
+
+            IFilter bf = deserializeBloomFilter(descriptor, descriptor.version.hasOldBfFormat());
+            if (bf != null)
+                return bf;
+        }
+
+        String reason = fpChanged // validation is non-null whenever fpChanged is true, so the format arguments below are safe
+                        ? String.format("false positive chance changed from %f to %f", validation.bloomFilterFPChance, fpChance)
+                        : (!descriptor.fileFor(Component.FILTER).exists()
+                           ? "there is no bloom filter file"
+                           : "deserialization failed");
+
+        if (logger.isDebugEnabled())
+            logger.debug("Recreating bloom filter because {}", reason);
+
+        IFilter bf = recreateIfNeeded
+                     ? recreateBloomFilter(descriptor, metadata, estimatedKeysCount, sstableMetadata, fpChance)
+                     : FilterFactory.AlwaysPresent;
+        if (bf != null)
+            return bf;
+
+        logger.warn("Could not recreate or deserialize existing bloom filter, continuing with a pass-through " +
+                    "bloom filter but this will significantly impact reads performance");
+
+        return FilterFactory.AlwaysPresent;
+    }
+
+    public static boolean hasBloomFilter(double fpChance)
+    {
+        Preconditions.checkArgument(fpChance <= 1, "FP chance should be less or equal to 1: " + fpChance);
+        return Math.abs(1 - fpChance) > fpChanceTolerance; // true unless the FP chance is within fpChanceTolerance of 1
+    }
+
+    public static TrieIndexSSTableReader open(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata, boolean validate, boolean isOffline)
+    {
+        checkRequiredComponents(descriptor, components, validate);
+
+        EnumSet<MetadataType> types = EnumSet.of(MetadataType.VALIDATION, MetadataType.STATS, MetadataType.HEADER);
+        Map<MetadataType, MetadataComponent> sstableMetadata;
+        try
+        {
+            sstableMetadata = descriptor.getMetadataSerializer().deserialize(descriptor, types);
+        }
+        catch (IOException e)
+        {
+            throw new CorruptSSTableException(e, descriptor.filenameFor(Component.STATS));
+        }
+
+        ValidationMetadata validationMetadata = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
+        StatsMetadata statsMetadata = (StatsMetadata) sstableMetadata.get(MetadataType.STATS);
+        SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
+
+        // Check if sstable is created using same partitioner.
+        // Partitioner can be null, which indicates older version of sstable or no stats available.
+        // In that case, we skip the check.
+        String partitionerName = metadata.get().partitioner.getClass().getCanonicalName();
+        if (validationMetadata != null && !partitionerName.equals(validationMetadata.partitioner))
+        {
+            String msg = String.format("Cannot open %s; partitioner %s does not match system partitioner %s. " +
+                                       "Note that the default partitioner starting with Cassandra 1.2 is Murmur3Partitioner, " +
+                                       "so you will need to edit that to match your old partitioner if upgrading.",
+                                       descriptor, validationMetadata.partitioner, partitionerName);
+            logger.error("{}", msg);
+            System.exit(1);
+        }
+
+        long fileLength = descriptor.fileFor(Component.DATA).length(); // size of the data file; filenameFor(...).length() would be the length of the path string
+        logger.debug("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength));
+
+        double fpChance = metadata.get().params.bloomFilterFpChance;
+
+        FileHandle dataFH = null;
+        FileHandle rowIdxFH = null;
+        PartitionIndex partitionIndex = null;
+        IFilter bloomFilter = null;
+        boolean compressedData = descriptor.fileFor(Component.COMPRESSION_INFO).exists();
+
+        boolean loadBFIfNeeded = components.contains(Component.FILTER);
+        boolean recreatedBFIfNeeded = !isOffline; // offline callers never trigger a bloom filter rebuild
+
+        try (FileHandle.Builder dataFHBuilder = defaultDataHandleBuilder(descriptor).compressed(compressedData);
+             @Nonnull IFilter bf = getBloomFilter(descriptor, loadBFIfNeeded, recreatedBFIfNeeded, metadata.get(), statsMetadata.totalRows, sstableMetadata, fpChance))
+        {
+            TrieIndexSSTableReader sstable;
+            dataFH = dataFHBuilder.complete();
+            bloomFilter = bf.sharedCopy(); // bf itself is closed by the try-with-resources; the reader is given the shared copy
+
+            if (components.contains(Component.PARTITION_INDEX))
+            {
+                try (FileHandle.Builder partitionIdxFHBuilder = defaultIndexHandleBuilder(descriptor, Component.PARTITION_INDEX);
+                     FileHandle.Builder rowIdxFHBuilder = defaultIndexHandleBuilder(descriptor, Component.ROW_INDEX))
+                {
+                    rowIdxFH = rowIdxFHBuilder.complete();
+                    partitionIndex = PartitionIndex.load(partitionIdxFHBuilder, metadata.get().partitioner, loadBFIfNeeded && !hasBloomFilter(fpChance));
+                    sstable = TrieIndexSSTableReader.internalOpen(descriptor,
+                                                                  components,
+                                                                  metadata,
+                                                                  rowIdxFH,
+                                                                  dataFH,
+                                                                  partitionIndex,
+                                                                  bloomFilter,
+                                                                  System.currentTimeMillis(),
+                                                                  statsMetadata,
+                                                                  OpenReason.NORMAL,
+                                                                  header.toHeader(metadata.get()));
+                }
+            }
+            else
+            {
+                sstable = TrieIndexSSTableReader.internalOpen(descriptor,
+                                                              components,
+                                                              metadata,
+                                                              dataFH,
+                                                              bloomFilter,
+                                                              System.currentTimeMillis(),
+                                                              statsMetadata,
+                                                              OpenReason.NORMAL,
+                                                              header.toHeader(metadata.get()));
+            }
+            if (validate)
+                sstable.validate();
+            sstable.setup(!isOffline); // this should come last, right before returning sstable
+            return sstable;
+        }
+        catch (RuntimeException e)
+        {
+            throw Throwables.cleaned(Throwables.close(e, bloomFilter, rowIdxFH, partitionIndex, dataFH)); // close whatever was opened before the failure
+        }
+        catch (IOException e)
+        {
+            throw new CorruptSSTableException(Throwables.close(e, bloomFilter, rowIdxFH, partitionIndex, dataFH), descriptor.filenameFor(Component.DATA));
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
new file mode 100644
index 000000000000..3248001f4c9c
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
@@ -0,0 +1,580 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.Collection;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Sets;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cache.ChunkCache;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.transform.Transformation;
+import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.compress.CompressedSequentialWriter;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
+import org.apache.cassandra.io.util.ChecksummedSequentialWriter;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.DataPosition;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.BloomFilter;
+import org.apache.cassandra.utils.BloomFilterSerializer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.FilterFactory;
+import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.SyncUtil;
+import org.apache.cassandra.utils.Throwables;
+import org.apache.cassandra.utils.concurrent.Transactional;
+
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultDataHandleBuilder;
+import static org.apache.cassandra.io.sstable.format.SSTableReaderBuilder.defaultIndexHandleBuilder;
+import static org.apache.cassandra.io.sstable.format.big.BigTableWriter.compressionFor;
+
+@VisibleForTesting
+public class TrieIndexSSTableWriter extends SSTableWriter
+{
+    private static final Logger logger = LoggerFactory.getLogger(TrieIndexSSTableWriter.class);
+
+    private final PartitionWriter partitionWriter;
+    private final IndexWriter iwriter;
+    private final FileHandle.Builder dbuilder;
+    protected final SequentialWriter dataFile;
+    private DecoratedKey lastWrittenKey;
+    private DataPosition dataMark;
+    private long lastEarlyOpenLength = 0;
+    private final Optional<ChunkCache> chunkCache = Optional.ofNullable(ChunkCache.instance);
+
+    private static final SequentialWriterOption WRITER_OPTION = SequentialWriterOption.newBuilder()
+                                                                                      .trickleFsync(DatabaseDescriptor.getTrickleFsync())
+                                                                                      .trickleFsyncByteInterval(DatabaseDescriptor.getTrickleFsyncIntervalInKb() * 1024)
+                                                                                      .bufferType(BufferType.OFF_HEAP)
+                                                                                      .build();
+
+    public TrieIndexSSTableWriter(Descriptor descriptor,
+                                  long keyCount,
+                                  long repairedAt,
+                                  UUID pendingRepair,
+                                  boolean isTransient,
+                                  TableMetadataRef metadata,
+                                  MetadataCollector metadataCollector,
+                                  SerializationHeader header,
+                                  Collection<SSTableFlushObserver> observers,
+                                  LifecycleNewTracker lifecycleNewTracker,
+                                  Set<Component> indexComponents)
+    {
+        super(descriptor, components(metadata.getLocal(), indexComponents),keyCount, repairedAt, pendingRepair, isTransient, metadata, metadataCollector, header, observers);
+        lifecycleNewTracker.trackNew(this); // must track before any files are created
+
+        if (compression) // compressed data: the writer is also given the compression info file name
+        {
+            final CompressionParams compressionParams = compressionFor(lifecycleNewTracker.opType(), metadata);
+
+            dataFile = new CompressedSequentialWriter(getFile(),
+                                                      descriptor.filenameFor(Component.COMPRESSION_INFO),
+                                                      descriptor.fileFor(Component.DIGEST),
+                                                      WRITER_OPTION,
+                                                      compressionParams,
+                                                      metadataCollector);
+        }
+        else // uncompressed data: a CRC component file is passed instead
+        {
+            dataFile = new ChecksummedSequentialWriter(getFile(),
+                                                       descriptor.fileFor(Component.CRC),
+                                                       descriptor.fileFor(Component.DIGEST),
+                                                       WRITER_OPTION);
+        }
+
+        dbuilder = defaultDataHandleBuilder(descriptor).compressed(compression);
+        chunkCache.ifPresent(dbuilder::withChunkCache); // only wired in when ChunkCache.instance is non-null
+        iwriter = new IndexWriter(metadata.get());
+        partitionWriter = new PartitionWriter(this.header, metadata().comparator, dataFile, iwriter.rowIndexFile, descriptor.version, this.observers); // writes to both the data file and the row index file
+    }
+
+    private static Set<Component> components(TableMetadata metadata, Set<Component> indexComponents)
+    {
+        // Components that are part of the set whatever the table parameters are.
+        Set<Component> all = Sets.newHashSet(Component.DATA,
+                                             Component.PARTITION_INDEX,
+                                             Component.ROW_INDEX,
+                                             Component.STATS,
+                                             Component.TOC,
+                                             Component.DIGEST);
+
+        // The filter component is only included when the configured FP chance is below 1.
+        boolean withFilter = metadata.params.bloomFilterFpChance < 1.0;
+        if (withFilter)
+            all.add(Component.FILTER);
+
+        // With compression enabled the compression info component is included, otherwise
+        // the CRC component is. It would feel safer to actually add the CRC component later
+        // in maybeWriteDigest(), but the components are unmodifiable after construction.
+        boolean compressed = metadata.params.compression.isEnabled();
+        all.add(compressed ? Component.COMPRESSION_INFO : Component.CRC);
+
+        // Finally add the index components supplied by the caller.
+        all.addAll(indexComponents);
+
+        return all;
+    }
+
+    public void mark()
+    {
+        dataMark = dataFile.mark();
+        iwriter.mark();
+    }
+
+    public void resetAndTruncate()
+    {
+        dataFile.resetAndTruncate(dataMark);
+        iwriter.resetAndTruncate();
+    }
+
+    @SuppressWarnings("resource")
+    public boolean openEarly(Consumer<SSTableReader> callWhenReady)
+    {
+        long dataLength = dataFile.position();
+
+        return iwriter.buildPartial(dataLength, partitionIndex ->
+        {
+            StatsMetadata stats = statsMetadata();
+            FileHandle ifile = iwriter.rowIndexFHBuilder.complete(iwriter.rowIndexFile.getLastFlushOffset());
+            if (compression)
+                dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(dataFile.getLastFlushOffset()));
+            int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
+            FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete(dataFile.getLastFlushOffset());
+            invalidateCacheAtBoundary(dfile);
+            SSTableReader sstable = TrieIndexSSTableReader.internalOpen(descriptor,
+                                                               components, metadata,
+                                                               ifile, dfile, partitionIndex, iwriter.bf.sharedCopy(),
+                                                               maxDataAge, stats, SSTableReader.OpenReason.EARLY, header);
+
+            sstable.first = getMinimalKey(partitionIndex.firstKey());
+            sstable.last = getMinimalKey(partitionIndex.lastKey());
+            sstable.setup(true);
+            callWhenReady.accept(sstable);
+        });
+    }
+
+    void invalidateCacheAtBoundary(FileHandle dfile)
+    {
+        if (lastEarlyOpenLength != 0 && dfile.dataLength() > lastEarlyOpenLength)
+        {
+            dfile.invalidateIfCached(lastEarlyOpenLength);
+        }
+
+        lastEarlyOpenLength = dfile.dataLength();
+    }
+
+    public SSTableReader openFinalEarly()
+    {
+        // we must ensure the data is completely flushed to disk
+        iwriter.complete(); // This will be called by completedPartitionIndex() below too, but we want it done now to
+                            // ensure outstanding openEarly actions are not triggered.
+        dataFile.sync();
+        iwriter.rowIndexFile.sync();
+        // Note: Nothing must be written to any of the files after this point, as the chunk cache could pick up and
+        // retain a partially-written page (see DB-2446).
+
+        return openFinal(SSTableReader.OpenReason.EARLY);
+    }
+
+    @SuppressWarnings("resource")
+    private SSTableReader openFinal(SSTableReader.OpenReason openReason)
+    {
+        if (maxDataAge < 0)
+            maxDataAge = System.currentTimeMillis();
+
+        StatsMetadata stats = statsMetadata();
+        // finalize in-memory state for the reader
+        PartitionIndex partitionIndex = iwriter.completedPartitionIndex();
+        FileHandle rowIndexFile = iwriter.rowIndexFHBuilder.complete();
+        int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile()));
+        if (compression)
+            dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(dataFile.getLastFlushOffset()));
+        FileHandle dfile = dbuilder.bufferSize(dataBufferSize).complete();
+        invalidateCacheAtBoundary(dfile);
+        SSTableReader sstable = TrieIndexSSTableReader.internalOpen(descriptor,
+                                                            components,
+                                                            this.metadata,
+                                                            rowIndexFile,
+                                                            dfile,
+                                                            partitionIndex,
+                                                            iwriter.bf.sharedCopy(),
+                                                            maxDataAge,
+                                                            stats,
+                                                            openReason,
+                                                            header);
+        sstable.first = getMinimalKey(first);
+        sstable.last = getMinimalKey(last);
+        sstable.setup(true);
+        return sstable;
+    }
+
+    protected SSTableWriter.TransactionalProxy txnProxy()
+    {
+        return new TransactionalProxy();
+    }
+
+    class TransactionalProxy extends SSTableWriter.TransactionalProxy
+    {
+        // finalise our state on disk, including renaming
+        protected void doPrepare()
+        {
+            iwriter.prepareToCommit();
+
+            // write sstable statistics
+            dataFile.prepareToCommit();
+            writeMetadata(descriptor, finalizeMetadata());
+
+            // save the table of components
+            SSTable.appendTOC(descriptor, components);
+
+            if (openResult)
+                finalReader = openFinal(SSTableReader.OpenReason.NORMAL);
+        }
+
+        protected Throwable doCommit(Throwable accumulate)
+        {
+            accumulate = dataFile.commit(accumulate);
+            accumulate = iwriter.commit(accumulate);
+            return accumulate;
+        }
+
+        @Override
+        protected Throwable doPostCleanup(Throwable accumulate)
+        {
+            partitionWriter.close();
+            accumulate = dbuilder.close(accumulate);
+            return accumulate;
+        }
+
+        protected Throwable doAbort(Throwable accumulate)
+        {
+            accumulate = iwriter.abort(accumulate);
+            accumulate = dataFile.abort(accumulate);
+            return accumulate;
+        }
+    }
+
+    private void writeMetadata(Descriptor desc, Map<MetadataType, MetadataComponent> components)
+    {
+        File file = desc.fileFor(Component.STATS);
+        try (SequentialWriter out = new SequentialWriter(file, WRITER_OPTION))
+        {
+            desc.getMetadataSerializer().serialize(components, out, desc.version);
+            out.finish();
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, file);
+        }
+    }
+
+    public long getFilePointer()
+    {
+        return dataFile.position();
+    }
+
+    public long getOnDiskFilePointer()
+    {
+        return dataFile.getOnDiskFilePointer();
+    }
+
+    public long getEstimatedOnDiskBytesWritten()
+    {
+        return dataFile.getEstimatedOnDiskBytesWritten();
+    }
+
+    /**
+     * Performs sanity checks on {@code decoratedKey} and returns the position in the data file before any data is written.
+     */
+    protected long beforeAppend(DecoratedKey decoratedKey)
+    {
+        assert decoratedKey != null : "Keys must not be null"; // empty keys ARE allowed b/c of indexed row values
+        if (lastWrittenKey != null && lastWrittenKey.compareTo(decoratedKey) >= 0)
+            throw new RuntimeException("Last written key " + lastWrittenKey + " >= current key " + decoratedKey + " writing into " + getFile());
+        return (lastWrittenKey == null) ? 0 : dataFile.position();
+    }
+
+    private long afterAppend(DecoratedKey decoratedKey, RowIndexEntry index) throws IOException
+    {
+        metadataCollector.addKey(decoratedKey.getKey());
+        lastWrittenKey = decoratedKey;
+        last = lastWrittenKey;
+        if (first == null)
+            first = lastWrittenKey;
+
+        if (logger.isTraceEnabled())
+            logger.trace("wrote {} at {}", decoratedKey, index.position);
+        return iwriter.append(decoratedKey, index);
+    }
+
+    /**
+     * Appends partition data to this writer.
+     *
+     * @param partition the partition to write
+     * @return the created index entry if something was written, that is if {@code partition}
+     * wasn't empty and its key length fits in an unsigned short, {@code null} otherwise.
+     *
+     * @throws FSWriteError if a write to the dataFile fails
+     *
+     * WARNING: changes to method name or parameter name will need to be reflected in byteman tests. In particular
+     * OutOfSpaceTest.
+     */
+    public RowIndexEntry append(UnfilteredRowIterator partition)
+    {
+        DecoratedKey key = partition.partitionKey();
+
+        if (key.getKeyLength() > FBUtilities.MAX_UNSIGNED_SHORT)
+        {
+            logger.error("Key size {} exceeds maximum of {}, skipping row", key.getKeyLength(), FBUtilities.MAX_UNSIGNED_SHORT);
+            return null;
+        }
+
+        if (partition.isEmpty())
+            return null;
+
+        long startPosition = beforeAppend(key);
+        if (!observers.isEmpty())
+            observers.forEach(o -> o.startPartition(key, startPosition));
+
+        // Reuse the same partition writer for every partition
+        partitionWriter.reset();
+
+        try (UnfilteredRowIterator collecting = Transformation.apply(partition, new BigTableWriter.StatsCollector(metadataCollector)))
+        {
+            long trieRoot = partitionWriter.writePartition(collecting);
+
+            RowIndexEntry entry = TrieIndexEntry.create(startPosition, trieRoot,
+                                                        collecting.partitionLevelDeletion(),
+                                                        partitionWriter.rowIndexCount);
+
+            long endPosition = dataFile.position();
+            long rowSize = endPosition - startPosition;
+            maybeLogLargePartitionWarning(key, rowSize);
+            metadataCollector.addPartitionSizeInBytes(rowSize);
+            afterAppend(key, entry);
+            return entry;
+        }
+        catch (IOException e)
+        {
+            throw new FSWriteError(e, dataFile.getPath());
+        }
+    }
+
+    private File getFile()
+    {
+        return descriptor.fileFor(Component.DATA);
+    }
+
+    /**
+     * Encapsulates writing the index and filter for an SSTable. The state of this object is not valid until it has been closed.
+     */
+    class IndexWriter extends AbstractTransactional implements Transactional
+    {
+        private final SequentialWriter rowIndexFile;
+        public final FileHandle.Builder rowIndexFHBuilder;
+        private final SequentialWriter partitionIndexFile;
+        public final FileHandle.Builder partitionIndexFHBuilder;
+        public final PartitionIndexBuilder partitionIndex;
+        public final IFilter bf;
+        boolean partitionIndexCompleted = false;
+        private DataPosition riMark, piMark;
+
+        IndexWriter(TableMetadata table)
+        {
+            rowIndexFile = new SequentialWriter(descriptor.fileFor(Component.ROW_INDEX), WRITER_OPTION);
+            rowIndexFHBuilder = defaultIndexHandleBuilder(descriptor, Component.ROW_INDEX);
+            partitionIndexFile = new SequentialWriter(descriptor.fileFor(Component.PARTITION_INDEX), WRITER_OPTION);
+            partitionIndexFHBuilder = defaultIndexHandleBuilder(descriptor, Component.PARTITION_INDEX);
+            partitionIndex = new PartitionIndexBuilder(partitionIndexFile, partitionIndexFHBuilder);
+            bf = FilterFactory.getFilter(keyCount, table.params.bloomFilterFpChance);
+            // register listeners to be alerted when the data files are flushed
+            partitionIndexFile.setPostFlushListener(() -> partitionIndex.markPartitionIndexSynced(partitionIndexFile.getLastFlushOffset()));
+            rowIndexFile.setPostFlushListener(() -> partitionIndex.markRowIndexSynced(rowIndexFile.getLastFlushOffset()));
+            dataFile.setPostFlushListener(() -> partitionIndex.markDataSynced(dataFile.getLastFlushOffset()));
+        }
+
+        public long append(DecoratedKey key, RowIndexEntry indexEntry) throws IOException
+        {
+            bf.add(key);
+            long position;
+            if (indexEntry.isIndexed())
+            {
+                long indexStart = rowIndexFile.position();
+                try
+                {
+                    ByteBufferUtil.writeWithShortLength(key.getKey(), rowIndexFile);
+                    ((TrieIndexEntry) indexEntry).serialize(rowIndexFile, rowIndexFile.position());
+                }
+                catch (IOException e)
+                {
+                    throw new FSWriteError(e, rowIndexFile.getPath());
+                }
+
+                if (logger.isTraceEnabled())
+                    logger.trace("wrote index entry: {} at {}", indexEntry, indexStart);
+                position = indexStart;
+            }
+            else
+            {
+                // Write data position directly in trie.
+                position = ~indexEntry.position;
+            }
+            partitionIndex.addEntry(key, position);
+            return position;
+        }
+
+        public boolean buildPartial(long dataPosition, Consumer<PartitionIndex> callWhenReady)
+        {
+            return partitionIndex.buildPartial(callWhenReady, rowIndexFile.position(), dataPosition);
+        }
+
+        /**
+         * Serializes the bloom filter to the FILTER component file and syncs it, if this sstable has a FILTER component.
+         */
+        void flushBf()
+        {
+            if (components.contains(Component.FILTER))
+            {
+                File path = descriptor.fileFor(Component.FILTER);
+                try (SeekableByteChannel fos = Files.newByteChannel(path.toPath(), StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
+                     DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
+                {
+                    // bloom filter
+                    BloomFilter.serializer.serialize((BloomFilter) bf, stream);
+                    stream.flush();
+                    SyncUtil.sync((FileChannel) fos);
+                }
+                catch (IOException e)
+                {
+                    throw new FSWriteError(e, path);
+                }
+            }
+        }
+
+        public void mark()
+        {
+            riMark = rowIndexFile.mark();
+            piMark = partitionIndexFile.mark();
+        }
+
+        public void resetAndTruncate()
+        {
+            // we can't un-set the bloom filter addition, but extra keys in there are harmless.
+            // we can't reset dbuilder either, but that is the last thing called in afterAppend so
+            // we assume that if that worked then we won't be trying to reset.
+            rowIndexFile.resetAndTruncate(riMark);
+            partitionIndexFile.resetAndTruncate(piMark);
+        }
+
+        protected void doPrepare()
+        {
+            flushBf();
+
+            // truncate index file
+            rowIndexFile.prepareToCommit();
+            rowIndexFHBuilder.withLength(rowIndexFile.getLastFlushOffset());
+
+            complete();
+        }
+
+        void complete() throws FSWriteError
+        {
+            if (partitionIndexCompleted)
+                return;
+
+            try
+            {
+                partitionIndex.complete();
+                partitionIndexCompleted = true;
+            }
+            catch (IOException e)
+            {
+                throw new FSWriteError(e, partitionIndexFile.getPath());
+            }
+        }
+
+        PartitionIndex completedPartitionIndex()
+        {
+            complete();
+            try
+            {
+                return PartitionIndex.load(partitionIndexFHBuilder, getPartitioner(), false);
+            }
+            catch (IOException e)
+            {
+                throw new FSReadError(e, partitionIndexFile.getPath());
+            }
+        }
+
+        protected Throwable doCommit(Throwable accumulate)
+        {
+            return rowIndexFile.commit(accumulate);
+        }
+
+        protected Throwable doAbort(Throwable accumulate)
+        {
+            return rowIndexFile.abort(accumulate);
+        }
+
+        @Override
+        protected Throwable doPostCleanup(Throwable accumulate)
+        {
+            return Throwables.close(accumulate, bf, partitionIndex, rowIndexFile, rowIndexFHBuilder, partitionIndexFile, partitionIndexFHBuilder);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexScanner.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexScanner.java
new file mode 100644
index 000000000000..1245a586fd84
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexScanner.java
@@ -0,0 +1,432 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterators;
+
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.LazilyInitializedUnfilteredRowIterator;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.AbstractBounds.Boundary;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReader.PartitionPositionBounds;
+import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.AbstractIterator;
+import org.apache.cassandra.utils.CloseableIterator;
+
+import static org.apache.cassandra.dht.AbstractBounds.isEmpty;
+import static org.apache.cassandra.dht.AbstractBounds.maxLeft;
+import static org.apache.cassandra.dht.AbstractBounds.minRight;
+
+// TODO STAR-247: implement unit test
+public class TrieIndexScanner implements ISSTableScanner
+{
+    private final AtomicBoolean isClosed = new AtomicBoolean(false);
+    protected final RandomAccessReader dfile;
+    public final TrieIndexSSTableReader sstable;
+
+    private final Iterator<AbstractBounds<PartitionPosition>> rangeIterator;
+
+    private final ColumnFilter columns;
+    private final DataRange dataRange;
+    private final SSTableReadsListener listener;
+    private long startScan = -1;
+    private long bytesScanned = 0;
+
+    protected CloseableIterator<UnfilteredRowIterator> iterator;
+
+    // Full scan of the sstables
+    public static ISSTableScanner getScanner(TrieIndexSSTableReader sstable)
+    {
+        return getScanner(sstable, Iterators.singletonIterator(fullRange(sstable)));
+    }
+
+    public static ISSTableScanner getScanner(TrieIndexSSTableReader sstable,
+                                             ColumnFilter columns,
+                                             DataRange dataRange,
+                                             SSTableReadsListener listener)
+    {
+        return new TrieIndexScanner(sstable, columns, dataRange, makeBounds(sstable, dataRange).iterator(), listener);
+    }
+
+    public static ISSTableScanner getScanner(TrieIndexSSTableReader sstable, Collection<Range<Token>> tokenRanges)
+    {
+        // We want to avoid allocating an SSTableScanner if the ranges don't overlap the sstable (#5249)
+        List<PartitionPositionBounds> positions = sstable.getPositionsForRanges(tokenRanges);
+        if (positions.isEmpty())
+            return new EmptySSTableScanner(sstable);
+
+        return getScanner(sstable, makeBounds(sstable, tokenRanges).iterator());
+    }
+
+    public static ISSTableScanner getScanner(TrieIndexSSTableReader sstable, Iterator<AbstractBounds<PartitionPosition>> rangeIterator)
+    {
+        return new TrieIndexScanner(sstable, ColumnFilter.all(sstable.metadata()), null, rangeIterator, SSTableReadsListener.NOOP_LISTENER);
+    }
+
+    private TrieIndexScanner(TrieIndexSSTableReader sstable,
+                             ColumnFilter columns,
+                             DataRange dataRange,
+                             Iterator<AbstractBounds<PartitionPosition>> rangeIterator,
+                             SSTableReadsListener listener)
+    {
+        assert sstable != null;
+
+        this.dfile = sstable.openDataReader();
+        this.sstable = sstable;
+        this.columns = columns;
+        this.dataRange = dataRange;
+        this.rangeIterator = rangeIterator;
+        this.listener = listener;
+    }
+
+    public static List<AbstractBounds<PartitionPosition>> makeBounds(SSTableReader sstable, Collection<Range<Token>> tokenRanges)
+    {
+        List<AbstractBounds<PartitionPosition>> boundsList = new ArrayList<>(tokenRanges.size());
+        for (Range<Token> range : Range.normalize(tokenRanges))
+            addRange(sstable, Range.makeRowRange(range), boundsList);
+        return boundsList;
+    }
+
+    static List<AbstractBounds<PartitionPosition>> makeBounds(SSTableReader sstable, DataRange dataRange)
+    {
+        List<AbstractBounds<PartitionPosition>> boundsList = new ArrayList<>(2);
+        addRange(sstable, dataRange.keyRange(), boundsList);
+        return boundsList;
+    }
+
+    static AbstractBounds<PartitionPosition> fullRange(SSTableReader sstable)
+    {
+        return new Bounds<>(sstable.first, sstable.last);
+    }
+
+    private static void addRange(SSTableReader sstable, AbstractBounds<PartitionPosition> requested, List<AbstractBounds<PartitionPosition>> boundsList)
+    {
+        if (requested instanceof Range && ((Range<?>) requested).isWrapAround())
+        {
+            if (requested.right.compareTo(sstable.first) >= 0)
+            {
+                // since we wrap, we must contain the whole sstable prior to stopKey()
+                Boundary<PartitionPosition> left = new Boundary<>(sstable.first, true);
+                Boundary<PartitionPosition> right;
+                right = requested.rightBoundary();
+                right = minRight(right, sstable.last, true);
+                if (!isEmpty(left, right))
+                    boundsList.add(AbstractBounds.bounds(left, right));
+            }
+            if (requested.left.compareTo(sstable.last) <= 0)
+            {
+                // since we wrap, we must contain the whole sstable after dataRange.startKey()
+                Boundary<PartitionPosition> right = new Boundary<>(sstable.last, true);
+                Boundary<PartitionPosition> left;
+                left = requested.leftBoundary();
+                left = maxLeft(left, sstable.first, true);
+                if (!isEmpty(left, right))
+                    boundsList.add(AbstractBounds.bounds(left, right));
+            }
+        }
+        else
+        {
+            assert !AbstractBounds.strictlyWrapsAround(requested.left, requested.right);
+            Boundary<PartitionPosition> left, right;
+            left = requested.leftBoundary();
+            right = requested.rightBoundary();
+            left = maxLeft(left, sstable.first, true);
+            // apparently isWrapAround() doesn't count Bounds that extend to the limit (min) as wrapping
+            right = requested.right.isMinimum() ? new Boundary<>(sstable.last, true)
+                                                    : minRight(right, sstable.last, true);
+            if (!isEmpty(left, right))
+                boundsList.add(AbstractBounds.bounds(left, right));
+        }
+    }
+
+    public void close()
+    {
+        try
+        {
+            if (isClosed.compareAndSet(false, true))
+            {
+                FileUtils.close(dfile);
+                if (iterator != null)
+                    iterator.close();
+            }
+        }
+        catch (IOException e)
+        {
+            sstable.markSuspect();
+            throw new CorruptSSTableException(e, sstable.getFilename());
+        }
+    }
+
+    public long getBytesScanned()
+    {
+        return bytesScanned;
+    }
+
+    @Override
+    public long getLengthInBytes()
+    {
+        return sstable.uncompressedLength();
+    }
+
+
+    public long getCompressedLengthInBytes()
+    {
+        return sstable.onDiskLength();
+    }
+
+    @Override
+    public long getCurrentPosition()
+    {
+        return dfile.getFilePointer();
+    }
+
+    @Override
+    public Set<SSTableReader> getBackingSSTables()
+    {
+        return ImmutableSet.of(sstable);
+    }
+
+    public int level()
+    {
+        return sstable.getSSTableLevel();
+    }
+
+    public TableMetadata metadata()
+    {
+        return sstable.metadata();
+    }
+
+    public boolean hasNext()
+    {
+        if (iterator == null)
+            iterator = createIterator();
+        return iterator.hasNext();
+    }
+
+    public UnfilteredRowIterator next()
+    {
+        if (iterator == null)
+            iterator = createIterator();
+        return iterator.next();
+    }
+
+    public void remove()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    private CloseableIterator<UnfilteredRowIterator> createIterator()
+    {
+        this.listener.onScanningStarted(sstable);
+        return new KeyScanningIterator();
+    }
+
+    protected class KeyScanningIterator extends AbstractIterator<UnfilteredRowIterator> implements CloseableIterator<UnfilteredRowIterator>
+    {
+        private DecoratedKey currentKey;
+        private RowIndexEntry currentEntry;
+        private PartitionIterator iterator;
+        private LazilyInitializedUnfilteredRowIterator currentRowIterator;
+
+        protected UnfilteredRowIterator computeNext()
+        {
+            if (currentRowIterator != null && currentRowIterator.initialized() && !currentRowIterator.isClosed() && currentRowIterator.hasNext())
+                throw new IllegalStateException("The UnfilteredRowIterator returned by the last call to next() was initialized: " +
+                                                "it should be either exhausted or closed before calling hasNext() or next() again.");
+
+            try
+            {
+                while (true)
+                {
+                    if (startScan != -1)
+                        bytesScanned += getCurrentPosition() - startScan;
+                    startScan = -1; // already accounted for: must not be added again when looping on to the next range
+                    if (iterator != null)
+                    {
+                        currentEntry = iterator.entry();
+                        currentKey = iterator.decoratedKey();
+                        if (currentEntry != null)
+                        {
+                            iterator.advance();
+                            break;
+                        }
+                        iterator.close();
+                        iterator = null;
+                    }
+
+                    // try next range
+                    if (!rangeIterator.hasNext())
+                        return endOfData();
+                    iterator = sstable.coveredKeysIterator(rangeIterator.next());
+                }
+                startScan = -1;
+
+                /*
+                 * For a given partition key, we want to avoid hitting the data
+                 * file unless we're explicitly asked to. This is important
+                 * for PartitionRangeReadCommand#checkCacheFilter.
+                 */
+                currentRowIterator = new LazilyInitializedUnfilteredRowIterator(currentKey)
+                {
+                    // Store the currentEntry reference during object instantiation, as later (during initialize) the
+                    // field may point to a different entry.
+                    private final RowIndexEntry rowIndexEntry = currentEntry;
+
+                    protected UnfilteredRowIterator initializeIterator()
+                    {
+                        try
+                        {
+                            if (startScan != -1)
+                                bytesScanned += getCurrentPosition() - startScan;
+
+                            startScan = rowIndexEntry.position;
+                            if (dataRange == null)
+                            {
+                                return sstable.simpleIterator(dfile, partitionKey(), rowIndexEntry, false);
+                            }
+                            else
+                            {
+                                ClusteringIndexFilter filter = dataRange.clusteringIndexFilter(partitionKey());
+                                return sstable.iterator(dfile, partitionKey(), rowIndexEntry, filter.getSlices(TrieIndexScanner.this.metadata()), columns, filter.isReversed());
+                            }
+                        }
+                        catch (CorruptSSTableException e)
+                        {
+                            sstable.markSuspect();
+                            throw new CorruptSSTableException(e, sstable.getFilename());
+                        }
+                    }
+                };
+                return currentRowIterator;
+            }
+            catch (CorruptSSTableException | IOException e)
+            {
+                sstable.markSuspect();
+                throw new CorruptSSTableException(e, sstable.getFilename());
+            }
+        }
+
+        @Override
+        public void close()
+        {
+            if (iterator != null)
+                iterator.close();
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%s(dfile=%s sstable=%s)", getClass().getSimpleName(), dfile, sstable);
+    }
+
+    public static class EmptySSTableScanner extends AbstractUnfilteredPartitionIterator implements ISSTableScanner
+    {
+        private final SSTableReader sstable;
+
+        public EmptySSTableScanner(SSTableReader sstable)
+        {
+            this.sstable = sstable;
+        }
+
+        public long getFilePointer()
+        {
+            return 0;
+        }
+
+        public long getBytesScanned()
+        {
+            return 0;
+        }
+
+        @Override
+        public long getLengthInBytes()
+        {
+            return 0;
+        }
+
+        public long getCompressedLengthInBytes()
+        {
+            return 0;
+        }
+
+        @Override
+        public long getCurrentPosition()
+        {
+            return 0;
+        }
+
+        public int level()
+        {
+            return sstable.getSSTableLevel();
+        }
+
+        @Override
+        public Set<SSTableReader> getBackingSSTables()
+        {
+            return Collections.emptySet();
+        }
+
+        public TableMetadata metadata()
+        {
+            return sstable.metadata();
+        }
+
+        public void close()
+        {
+
+        }
+
+        public boolean hasNext()
+        {
+            return false;
+        }
+
+        public UnfilteredRowIterator next()
+        {
+            throw new NoSuchElementException();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index 733f594bc766..159b9061bf1b 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -44,6 +44,9 @@
 public class MetadataCollector implements PartitionStatisticsCollector
 {
     public static final double NO_COMPRESSION_RATIO = -1.0;
+    private static final ByteBuffer[] EMPTY_CLUSTERING = new ByteBuffer[0];
+
+    private long currentPartitionCells = 0;
 
     static EstimatedHistogram defaultCellPerPartitionCountHistogram()
     {
@@ -184,6 +187,13 @@ public MetadataCollector addCellPerPartitionCount(long cellCount)
         return this;
     }
 
+    public MetadataCollector addCellPerPartitionCount()
+    {
+        estimatedCellPerPartitionCount.add(currentPartitionCells); // cells counted by update(Cell) since the last reset
+        currentPartitionCells = 0; // start counting afresh for whatever is updated next
+        return this; // fluent, like addCellPerPartitionCount(long)
+    }
+
     /**
      * Ratio is compressed/uncompressed and it is
      * if you have 1.x then compression isn't helping
@@ -206,6 +216,7 @@ public void update(LivenessInfo newInfo)
 
     public void update(Cell<?> cell)
     {
+        ++currentPartitionCells;
         updateTimestamp(cell.timestamp());
         updateTTL(cell.ttl());
         updateLocalDeletionTime(cell.localDeletionTime());
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
index 589b13d331ba..c87695f011c8 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.io.sstable.metadata;
 
+import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.List;
@@ -37,9 +38,11 @@
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.io.ISerializer;
+import org.apache.cassandra.io.sstable.UnsupportedSSTableException;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.serializers.AbstractTypeSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
@@ -327,6 +330,24 @@ public int serializedSize(Version version, StatsMetadata component) throws IOExc
                     size += UUIDSerializer.serializer.serializedSize(component.pendingRepair, 0);
             }
 
+            // we do not have zero copy metadata
+            if (version.hasZeroCopyMetadata())
+            {
+                size += 1;
+            }
+
+            // we do not have node sync metadata
+            if (version.hasIncrementalNodeSyncMetadata())
+            {
+                size += Long.BYTES;
+            }
+
+            // TODO TBD: max column value lengths are not produced here; only the (zero) column count is sized
+            if (version.hasMaxColumnValueLengths())
+            {
+                size += 4; // num columns
+            }
+
             if (version.hasIsTransient())
             {
                 size += TypeSizes.sizeof(component.isTransient);
@@ -405,6 +426,24 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o
                 }
             }
 
+            // we do not have zero copy metadata
+            if (version.hasZeroCopyMetadata())
+            {
+                out.writeByte(0);
+            }
+
+            // we do not have node sync metadata
+            if (version.hasIncrementalNodeSyncMetadata())
+            {
+                out.writeLong(Long.MAX_VALUE);
+            }
+
+            // TODO TBD: max column value lengths are not produced here; an empty (zero-column) section is written
+            if (version.hasMaxColumnValueLengths())
+            {
+                out.writeInt(0);
+            }
+
             if (version.hasIsTransient())
             {
                 out.writeBoolean(component.isTransient);
@@ -510,6 +549,32 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
                 pendingRepair = UUIDSerializer.serializer.deserialize(in, 0);
             }
 
+            if (version.hasZeroCopyMetadata() && in.readByte() != 0)
+            {
+                throw new UnsupportedSSTableException(String.format("Found DSE zero copy metadata in %s. " +
+                                                                    "Files copied over using partial zero-copy " +
+                                                                    "streaming in DSE are not currently supported.", in),
+                                                      null,
+                                                      in instanceof FileDataInput ? new File(((FileDataInput) in).getPath()) : null);
+            }
+
+            if (version.hasIncrementalNodeSyncMetadata())
+            {
+                logger.warn("Ignoring incremental node sync metadata from {} as it is not supported", in);
+                in.readLong();
+            }
+
+            // TODO TBD: max column value lengths are not used yet; read the entries and discard them
+            if (version.hasMaxColumnValueLengths())
+            {
+                int colCount = in.readInt();
+                for (int i = 0; i < colCount; i++)
+                {
+                    ByteBufferUtil.readWithVIntLength(in);
+                    in.readInt();
+                }
+            }
+
             boolean isTransient = version.hasIsTransient() && in.readBoolean();
 
             // If not recorded, the only time we can guarantee there is no partition level deletion is if there is no
diff --git a/src/java/org/apache/cassandra/io/tries/ReverseValueIterator.java b/src/java/org/apache/cassandra/io/tries/ReverseValueIterator.java
new file mode 100644
index 000000000000..e67ecf7e29d0
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/tries/ReverseValueIterator.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.tries;
+
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
+
+/**
+ * Thread-unsafe reverse value iterator for on-disk tries. Uses the assumptions of Walker.
+ */
+public class ReverseValueIterator<Concrete extends ReverseValueIterator<Concrete>> extends Walker<Concrete>
+{
+    private final ByteSource limit; // comparable bytes of the `start` bound; null in the unbounded constructor
+    private IterationPosition stack; // current path, linked through prev; null once iteration is exhausted
+    private long next; // node position nextPayloadedNode() hands out next; -1 when there is none
+    private boolean reportingPrefixes; // advanceNode() returns a payloaded node only while this is true
+
+    static class IterationPosition
+    {
+        long node;
+        int childIndex; // index of the child last visited; advanceNode() tries childIndex - 1 next
+        int limit; // byte taken from the start bound while still on its path, otherwise -1
+        IterationPosition prev; // parent position, null for the bottom of the stack
+
+        public IterationPosition(long node, int childIndex, int limit, IterationPosition prev)
+        {
+            super();
+            this.node = node;
+            this.childIndex = childIndex;
+            this.limit = limit;
+            this.prev = prev;
+        }
+    }
+
+    protected ReverseValueIterator(Rebufferer source, long root)
+    {
+        super(source, root);
+        stack = new IterationPosition(root, -1, 256, null); // NOTE(review): childIndex -1 with reportingPrefixes left false makes advanceNode() below return -1 at once, so nothing is iterated - confirm intended
+        limit = null;
+        next = advanceNode();
+    }
+
+    protected ReverseValueIterator(Rebufferer source, long root, ByteComparable start, ByteComparable end, boolean admitPrefix)
+    {
+        super(source, root);
+        limit = start.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        ByteSource endStream = end.asComparableBytes(BYTE_COMPARABLE_VERSION);
+        IterationPosition prev = null;
+        boolean atLimit = true; // true while the end path has not yet diverged from the start bound
+        int childIndex;
+        int limitByte;
+        reportingPrefixes = admitPrefix;
+
+        // Follow end position while we still have a prefix, stacking path.
+        go(root);
+        while (true)
+        {
+            int s = endStream.next();
+            childIndex = search(s);
+
+            limitByte = -1;
+            if (atLimit)
+            {
+                limitByte = limit.next();
+                if (s > limitByte)
+                    atLimit = false; // end and start diverge here: deeper positions on the end path get limit -1
+            }
+            if (childIndex < 0)
+                break;
+
+            prev = new IterationPosition(position, childIndex, limitByte, prev);
+            go(transition(childIndex));
+        }
+
+        // Advancing now gives us first match.
+        childIndex = -1 - childIndex; // presumably search() encodes a miss as -1 - insertionPoint; TODO confirm
+        stack = new IterationPosition(position, childIndex, limitByte, prev);
+        next = advanceNode();
+    }
+
+    /**
+     * This method must be async-read-safe.
+     */
+    protected long nextPayloadedNode()     // returns payloaded node position
+    {
+        long toReturn = next;
+        if (next != -1)
+            next = advanceNode(); // pre-compute the following result; -1 is sticky once reached
+        return toReturn;
+    }
+
+    /**
+     * This method must be async-read-safe.
+     */
+    long advanceNode()
+    {
+        if (stack == null)
+            return -1;
+
+        long child;
+        int transitionByte;
+
+        go(stack.node);
+        while (true)
+        {
+            // advance position in node
+            int childIdx = stack.childIndex - 1;
+            boolean beyondLimit = true; // stays true when there is no child left (childIdx < 0)
+            if (childIdx >= 0)
+            {
+                transitionByte = transitionByte(childIdx);
+                beyondLimit = transitionByte < stack.limit;
+                if (beyondLimit)
+                {
+                    assert stack.limit >= 0;    // we are at a limit position (not in a node that's completely within the span)
+                    reportingPrefixes = false;  // there exists a smaller child than limit, no longer should report prefixes
+                }
+            }
+            else
+                transitionByte = Integer.MIN_VALUE;
+
+            if (beyondLimit)
+            {
+                // ascend to parent, remove from stack
+                IterationPosition stackTop = stack;
+                stack = stack.prev;
+
+                // Report payloads on the way up
+                // unless we are at limit and there has been a smaller child
+                if (reportingPrefixes && payloadFlags() != 0)
+                {
+                    if (stackTop.limit >= 0)    // if we are at limit position only report the closest prefix
+                        reportingPrefixes = false;
+                    return stackTop.node;
+                }
+
+                if (stack == null)        // exhausted whole trie
+                    return -1;
+                go(stack.node);
+                continue;
+            }
+
+            child = transition(childIdx);
+            if (child != -1)
+            {
+                go(child);
+
+                stack.childIndex = childIdx;
+
+                // descend, stack up position
+                int l = -1;
+                if (transitionByte == stack.limit)
+                    l = limit.next(); // still on the start bound's path: fetch its next byte for the child
+
+                stack = new IterationPosition(child, transitionRange(), l, stack); // next step will try index transitionRange() - 1
+            }
+            else
+            {
+                stack.childIndex = childIdx; // no child behind this transition: just step past it
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/tries/ValueIterator.java b/src/java/org/apache/cassandra/io/tries/ValueIterator.java
index 2c25cf38f8b6..37af465acee6 100644
--- a/src/java/org/apache/cassandra/io/tries/ValueIterator.java
+++ b/src/java/org/apache/cassandra/io/tries/ValueIterator.java
@@ -163,21 +163,14 @@ protected long nextPayloadedNode()
         return toReturn;
     }
 
-    /**
-     * This method must be async-read-safe. Every read from new buffering position (the go() calls) can
-     * trigger NotInCacheException, and iteration must be able to redo the work that was interrupted during the next
-     * call. Hence the mutable state must be fully ready before all go() calls (i.e. they must either be the
-     * last step in the loop or the state must be unchanged until that call has succeeded).
-     */
     private long advanceNode()
     {
         long child;
         int transitionByte;
 
-        go(stack.node); // can throw NotInCacheException, OK no state modified yet
+        go(stack.node);
         while (true)
         {
-            // advance position in node but don't change the stack just yet due to NotInCacheExceptions
             int childIndex = stack.childIndex + 1;
             transitionByte = transitionByte(childIndex);
 
@@ -187,7 +180,7 @@ private long advanceNode()
                 stack = stack.prev;
                 if (stack == null)        // exhausted whole trie
                     return -1;
-                go(stack.node); // can throw NotInCacheException, OK - stack ready to re-enter loop with parent
+                go(stack.node);
                 continue;
             }
 
@@ -198,7 +191,7 @@ private long advanceNode()
                 assert child >= 0 : String.format("Expected value >= 0 but got %d - %s", child, this);
 
                 // descend
-                go(child); // can throw NotInCacheException, OK - stack not yet changed, limit not yet incremented
+                go(child);
 
                 int l = 256;
                 if (transitionByte == stack.limit)
diff --git a/src/java/org/apache/cassandra/io/tries/Walker.java b/src/java/org/apache/cassandra/io/tries/Walker.java
index a7e4a40d3148..825d327de81d 100644
--- a/src/java/org/apache/cassandra/io/tries/Walker.java
+++ b/src/java/org/apache/cassandra/io/tries/Walker.java
@@ -32,6 +32,7 @@
  * <p>
  * Assumes data was written using page-aware builder and thus no node crosses a page and thus a buffer boundary.
  */
+// TODO STAR-247: unit tests are insufficient - they did not catch the problem fixed in STAR-247
 public class Walker<VALUE extends Walker<VALUE>> implements AutoCloseable
 {
     private final Rebufferer source;
@@ -82,9 +83,9 @@ protected final void go(long position)
         long curOffset = position - bh.offset();
         if (curOffset < 0 || curOffset >= buf.limit())
         {
-            BufferHolder currentBh = bh;
+            bh.release();
+            bh = Rebufferer.EMPTY; // prevents double release if the call below fails
             bh = source.rebuffer(position);
-            currentBh.release();
             buf = bh.buffer();
             curOffset = position - bh.offset();
             assert curOffset >= 0 && curOffset < buf.limit() : String.format("Invalid offset: %d, buf: %s, bh: %s", curOffset, buf, bh);
@@ -305,10 +306,8 @@ public <RESULT> RESULT prefixAndNeighbours(ByteComparable key, Extractor<RESULT,
         {
             int b = stream.next();
             int searchIndex = search(b);
-            payload = null;
 
             greaterBranch = greaterTransition(searchIndex, greaterBranch);
-            lesserBranch = lesserTransition(searchIndex, lesserBranch);
 
             if (searchIndex == -1 || searchIndex == 0)
             {
@@ -316,6 +315,11 @@ public <RESULT> RESULT prefixAndNeighbours(ByteComparable key, Extractor<RESULT,
                 if (payloadBits > 0)
                     payload = extractor.extract((VALUE) this, payloadPosition(), payloadBits);
             }
+            else
+            {
+                lesserBranch = lesserTransition(searchIndex, lesserBranch);
+                payload = null;
+            }
 
             if (searchIndex < 0)
                 return payload;
diff --git a/src/java/org/apache/cassandra/io/util/EmptyRebufferer.java b/src/java/org/apache/cassandra/io/util/EmptyRebufferer.java
new file mode 100644
index 000000000000..aa8e7e046f39
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/util/EmptyRebufferer.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.util;
+
+public class EmptyRebufferer implements Rebufferer, RebuffererFactory // rebufferer and factory for a zero-length file
+{
+    private final ChannelProxy channel; // only handed back by channel(); never read from or closed in this class
+
+    public EmptyRebufferer(ChannelProxy channel)
+    {
+        this.channel = channel;
+    }
+
+    @Override
+    public void close()
+    {
+        // intentionally empty; NOTE(review): the channel is not closed here - presumably its owner does that, confirm
+    }
+
+    @Override
+    public ChannelProxy channel()
+    {
+        return channel;
+    }
+
+    @Override
+    public long fileLength()
+    {
+        return 0; // always reports an empty file
+    }
+
+    @Override
+    public double getCrcCheckChance()
+    {
+        return 0;
+    }
+
+    @Override
+    public BufferHolder rebuffer(long position)
+    {
+        return EMPTY; // the same EMPTY holder regardless of the requested position
+    }
+
+    @Override
+    public void closeReader()
+    {
+        // intentionally empty: this class allocates nothing per reader
+    }
+
+    @Override
+    public Rebufferer instantiateRebufferer()
+    {
+        return this; // no per-reader state, so the factory hands out itself
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/util/FileHandle.java b/src/java/org/apache/cassandra/io/util/FileHandle.java
index 50da403c236e..8d7b1fd67b1d 100644
--- a/src/java/org/apache/cassandra/io/util/FileHandle.java
+++ b/src/java/org/apache/cassandra/io/util/FileHandle.java
@@ -182,6 +182,11 @@ private Rebufferer instantiateRebufferer(RateLimiter limiter)
         return rebufferer;
     }
 
+    public void invalidateIfCached(long position)
+    {
+        rebuffererFactory.invalidateIfCached(position); // pure delegation; RebuffererFactory's default implementation is a no-op
+    }
+
     /**
      * Perform clean up of all resources held by {@link FileHandle}.
      */
@@ -248,6 +253,7 @@ public static class Builder implements AutoCloseable
 
         private boolean mmapped = false;
         private boolean compressed = false;
+        private long length = -1;
 
         public Builder(String path)
         {
@@ -327,6 +333,11 @@ public Builder bufferType(BufferType bufferType)
             return this;
         }
 
+        public void withLength(long length) // NOTE(review): returns void, unlike the fluent bufferType(...) setter - confirm intended
+        {
+            this.length = length; // picked up by complete() in place of the previously hard-coded -1
+        }
+
         /**
          * Complete building {@link FileHandle} without overriding file length.
          *
@@ -334,7 +345,7 @@ public Builder bufferType(BufferType bufferType)
          */
         public FileHandle complete()
         {
-            return complete(-1L);
+            return complete(length);
         }
 
         /**
@@ -363,7 +374,11 @@ public FileHandle complete(long overrideLength)
                 long length = overrideLength > 0 ? overrideLength : compressed ? compressionMetadata.compressedFileLength : channelCopy.size();
 
                 RebuffererFactory rebuffererFactory;
-                if (mmapped)
+                if (length == 0)
+                {
+                    rebuffererFactory = new EmptyRebufferer(channelCopy);
+                }
+                else if (mmapped)
                 {
                     if (compressed)
                     {
diff --git a/src/java/org/apache/cassandra/io/util/LimitingRebufferer.java b/src/java/org/apache/cassandra/io/util/LimitingRebufferer.java
index b5c7f3498d76..f71774db6fc0 100644
--- a/src/java/org/apache/cassandra/io/util/LimitingRebufferer.java
+++ b/src/java/org/apache/cassandra/io/util/LimitingRebufferer.java
@@ -20,7 +20,7 @@
  */
 package org.apache.cassandra.io.util;
 
-import java.nio.ByteBuffer;
+import javax.annotation.concurrent.NotThreadSafe;
 
 import com.google.common.primitives.Ints;
 import com.google.common.util.concurrent.RateLimiter;
@@ -30,20 +30,18 @@
  *
  * Instantiated once per RandomAccessReader, thread-unsafe.
  * The instances reuse themselves as the BufferHolder to avoid having to return a new object for each rebuffer call.
+ * Only one BufferHolder can be active at a time. Calling {@link #rebuffer(long)} before the previously obtained
+ * buffer holder is released will throw {@link AssertionError}.
  */
-public class LimitingRebufferer implements Rebufferer, Rebufferer.BufferHolder
+@NotThreadSafe
+public class LimitingRebufferer extends WrappingRebufferer
 {
-    final private Rebufferer wrapped;
     final private RateLimiter limiter;
     final private int limitQuant;
 
-    private BufferHolder bufferHolder;
-    private ByteBuffer buffer;
-    private long offset;
-
     public LimitingRebufferer(Rebufferer wrapped, RateLimiter limiter, int limitQuant)
     {
-        this.wrapped = wrapped;
+        super(wrapped);
         this.limiter = limiter;
         this.limitQuant = limitQuant;
     }
@@ -51,9 +49,7 @@ public LimitingRebufferer(Rebufferer wrapped, RateLimiter limiter, int limitQuan
     @Override
     public BufferHolder rebuffer(long position)
     {
-        bufferHolder = wrapped.rebuffer(position);
-        buffer = bufferHolder.buffer();
-        offset = bufferHolder.offset();
+        super.rebuffer(position);
         int posInBuffer = Ints.checkedCast(position - offset);
         int remaining = buffer.limit() - posInBuffer;
         if (remaining == 0)
@@ -68,59 +64,10 @@ public BufferHolder rebuffer(long position)
         return this;
     }
 
-    @Override
-    public ChannelProxy channel()
-    {
-        return wrapped.channel();
-    }
-
-    @Override
-    public long fileLength()
-    {
-        return wrapped.fileLength();
-    }
-
-    @Override
-    public double getCrcCheckChance()
-    {
-        return wrapped.getCrcCheckChance();
-    }
-
-    @Override
-    public void close()
-    {
-        wrapped.close();
-    }
-
-    @Override
-    public void closeReader()
-    {
-        wrapped.closeReader();
-    }
-
     @Override
     public String toString()
     {
-        return "LimitingRebufferer[" + limiter + "]:" + wrapped;
-    }
-
-    // BufferHolder methods
-
-    @Override
-    public ByteBuffer buffer()
-    {
-        return buffer;
-    }
-
-    @Override
-    public long offset()
-    {
-        return offset;
+        return "LimitingRebufferer[" + limiter + "]:" + this.source;
     }
 
-    @Override
-    public void release()
-    {
-        bufferHolder.release();
-    }
 }
diff --git a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
index 4a164a7274dc..647622fd4de1 100644
--- a/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
+++ b/src/java/org/apache/cassandra/io/util/RandomAccessReader.java
@@ -65,6 +65,7 @@ public void reBuffer()
     private void reBufferAt(long position)
     {
         bufferHolder.release();
+        bufferHolder = Rebufferer.EMPTY; // prevents double release if the call below fails
         bufferHolder = rebufferer.rebuffer(position);
         buffer = bufferHolder.buffer();
         buffer.position(Ints.checkedCast(position - bufferHolder.offset()));
diff --git a/src/java/org/apache/cassandra/io/util/RebuffererFactory.java b/src/java/org/apache/cassandra/io/util/RebuffererFactory.java
index ec35f0ba530b..a4737dbfa77b 100644
--- a/src/java/org/apache/cassandra/io/util/RebuffererFactory.java
+++ b/src/java/org/apache/cassandra/io/util/RebuffererFactory.java
@@ -29,4 +29,6 @@
 public interface RebuffererFactory extends ReaderFileProxy
 {
     Rebufferer instantiateRebufferer();
+
+    default void invalidateIfCached(long position) {}
 }
diff --git a/src/java/org/apache/cassandra/io/util/SimpleChunkReader.java b/src/java/org/apache/cassandra/io/util/SimpleChunkReader.java
index bc1a5297a9bc..8d00ce5d4000 100644
--- a/src/java/org/apache/cassandra/io/util/SimpleChunkReader.java
+++ b/src/java/org/apache/cassandra/io/util/SimpleChunkReader.java
@@ -57,7 +57,10 @@ public BufferType preferredBufferType()
     @Override
     public Rebufferer instantiateRebufferer()
     {
-        return new BufferManagingRebufferer.Unaligned(this);
+        if (Integer.bitCount(bufferSize) == 1)
+            return new BufferManagingRebufferer.Aligned(this);
+        else
+            return new BufferManagingRebufferer.Unaligned(this);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java b/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
index 3578b2d9e859..86cd5359a621 100644
--- a/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
+++ b/src/java/org/apache/cassandra/io/util/TailOverridingRebufferer.java
@@ -18,10 +18,17 @@
 package org.apache.cassandra.io.util;
 
 import java.nio.ByteBuffer;
+import javax.annotation.concurrent.NotThreadSafe;
 
 /**
  * Special rebufferer that replaces the tail of the file (from the specified cutoff point) with the given buffer.
+ *
+ * Instantiated once per RandomAccessReader, thread-unsafe.
+ * The instances reuse themselves as the BufferHolder to avoid having to return a new object for each rebuffer call.
+ * Only one BufferHolder can be active at a time. Calling {@link #rebuffer(long)} before the previously obtained
+ * buffer holder is released will throw {@link AssertionError}.
  */
+@NotThreadSafe
 public class TailOverridingRebufferer extends WrappingRebufferer
 {
     private final long cutoff;
@@ -37,17 +44,19 @@ public TailOverridingRebufferer(Rebufferer source, long cutoff, ByteBuffer tail)
     @Override
     public Rebufferer.BufferHolder rebuffer(long position)
     {
+        assert buffer == null : "Buffer holder has been already acquired and has been not released yet";
         if (position < cutoff)
         {
-            WrappingBufferHolder ret = (WrappingBufferHolder) super.rebuffer(position);
-            if (ret.offset() + ret.limit() > cutoff)
-                ret.limit((int) (cutoff - ret.offset()));
-            return ret;
+            super.rebuffer(position);
+            if (offset + buffer.limit() > cutoff)
+                buffer.limit((int) (cutoff - offset));
         }
         else
         {
-            return newBufferHolder().initialize(null, tail.duplicate(), cutoff);
+            buffer = tail.duplicate();
+            offset = cutoff;
         }
+        return this;
     }
 
     @Override
@@ -61,5 +70,4 @@ public String toString()
     {
         return String.format("%s[+%d@%d]:%s", getClass().getSimpleName(), tail.limit(), cutoff, source.toString());
     }
-
 }
diff --git a/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java b/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
index 965f1157892e..1935530661e1 100644
--- a/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
+++ b/src/java/org/apache/cassandra/io/util/WrappingRebufferer.java
@@ -18,35 +18,42 @@
 package org.apache.cassandra.io.util;
 
 import java.nio.ByteBuffer;
-import java.util.Deque;
-import java.util.concurrent.ConcurrentLinkedDeque;
-import javax.annotation.Nullable;
-
-public class WrappingRebufferer implements Rebufferer
+import javax.annotation.concurrent.NotThreadSafe;
+
+/**
+ * Instantiated once per RandomAccessReader, thread-unsafe.
+ * The instances reuse themselves as the BufferHolder to avoid having to return a new object for each rebuffer call.
+ * Only one buffer holder can be active at a time. Calling {@link #rebuffer(long)} before the previously obtained
+ * buffer holder is released will throw {@link AssertionError}. The same error is thrown if the rebufferer is closed
+ * before the most recently obtained buffer holder has been released.
+ *
+ * Calling methods of {@link BufferHolder} will also produce {@link AssertionError} if buffer holder is not acquired.
+ *
+ * The overriding classes must conform to the aforementioned rules.
+ */
+@NotThreadSafe
+public abstract class WrappingRebufferer implements Rebufferer, Rebufferer.BufferHolder
 {
     protected final Rebufferer source;
-    private final Deque<WrappingBufferHolder> buffers;
+
+    protected BufferHolder bufferHolder;
+    protected ByteBuffer buffer;
+    protected long offset;
 
     public WrappingRebufferer(Rebufferer source)
     {
         this.source = source;
-        this.buffers = new ConcurrentLinkedDeque<>();
     }
 
     @Override
     public BufferHolder rebuffer(long position)
     {
-        BufferHolder bufferHolder = source.rebuffer(position);
-        return newBufferHolder().initialize(bufferHolder, bufferHolder.buffer(), bufferHolder.offset());
-    }
-
-    protected WrappingBufferHolder newBufferHolder()
-    {
-        WrappingBufferHolder ret = buffers.pollFirst();
-        if (ret == null)
-            ret = new WrappingBufferHolder();
+        assert buffer == null;
+        bufferHolder = source.rebuffer(position);
+        buffer = bufferHolder.buffer();
+        offset = bufferHolder.offset();
 
-        return ret;
+        return this;
     }
 
     @Override
@@ -70,6 +77,7 @@ public double getCrcCheckChance()
     @Override
     public void close()
     {
+        assert buffer == null : "Rebufferer is attempted to be closed but the buffer holder has not been released";
         source.close();
     }
 
@@ -79,70 +87,36 @@ public void closeReader()
         source.closeReader();
     }
 
-
     @Override
     public String toString()
     {
         return String.format("%s[]:%s", getClass().getSimpleName(), source.toString());
     }
 
-    protected final class WrappingBufferHolder implements BufferHolder
+    @Override
+    public ByteBuffer buffer()
     {
-        @Nullable
-        private BufferHolder bufferHolder;
-
-        private ByteBuffer buffer;
-        private long offset;
-
-        protected WrappingBufferHolder initialize(@Nullable BufferHolder bufferHolder, ByteBuffer buffer, long offset)
-        {
-            assert this.bufferHolder == null && this.buffer == null && this.offset == 0L : "initialized before release";
-
-            this.bufferHolder = bufferHolder;
-            this.buffer = buffer;
-            this.offset = offset;
-
-            return this;
-        }
-
-        @Override
-        public ByteBuffer buffer()
-        {
-            return buffer;
-        }
-
-        @Override
-        public long offset()
-        {
-            return offset;
-        }
-
-
-        public int limit()
-        {
-            return buffer.limit();
-        }
+        assert buffer != null : "Buffer holder has not been acquired";
+        return buffer;
+    }
 
-        public void limit(int limit)
-        {
-            this.buffer.limit(limit);
-        }
+    @Override
+    public long offset()
+    {
+        assert buffer != null : "Buffer holder has not been acquired";
+        return offset;
+    }
 
-        @Override
-        public void release()
+    @Override
+    public void release()
+    {
+        assert buffer != null;
+        if (bufferHolder != null)
         {
-            assert buffer != null : "released twice";
-
-            if (bufferHolder != null)
-            {
-                bufferHolder.release();
-                bufferHolder = null;
-            }
-
-            buffer = null;
-            offset = 0L;
-
-            buffers.offerFirst(this);
+            bufferHolder.release();
+            bufferHolder = null;
         }
+        buffer = null;
     }
+
 }
diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java
index 8605c71007e5..341b7dac6f72 100644
--- a/src/java/org/apache/cassandra/metrics/TableMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.metrics;
 
+import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
 
 import java.nio.ByteBuffer;
@@ -785,7 +786,7 @@ public Long getValue()
             public Long getValue()
             {
                 long total = 0;
-                for (SSTableReader sst : cfs.getSSTables(SSTableSet.LIVE))
+                for (SSTableReader sst : selectOnlyBigTableReaders(cfs.getSSTables(SSTableSet.LIVE)))
                     total += sst.getIndexSummaryOffHeapSize();
                 return total;
             }
@@ -819,7 +820,7 @@ public Ratio getRatio()
             protected double getNumerator()
             {
                 long hits = 0L;
-                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getSSTables(SSTableSet.LIVE)))
                     hits += sstable.getKeyCacheHit();
                 return hits;
             }
@@ -827,7 +828,7 @@ protected double getNumerator()
             protected double getDenominator()
             {
                 long requests = 0L;
-                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
+                for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getSSTables(SSTableSet.LIVE)))
                     requests += sstable.getKeyCacheRequest();
                 return Math.max(requests, 1); // to avoid NaN.
             }
diff --git a/src/java/org/apache/cassandra/repair/LocalSyncTask.java b/src/java/org/apache/cassandra/repair/LocalSyncTask.java
index 591640150804..fe628f5a6534 100644
--- a/src/java/org/apache/cassandra/repair/LocalSyncTask.java
+++ b/src/java/org/apache/cassandra/repair/LocalSyncTask.java
@@ -138,7 +138,7 @@ public void handleStreamEvent(StreamEvent event)
                 state.trace("{}/{} ({}%) {} idx:{}{}",
                             new Object[] { FBUtilities.prettyPrintMemory(pi.currentBytes),
                                            FBUtilities.prettyPrintMemory(pi.totalBytes),
-                                           pi.currentBytes * 100 / pi.totalBytes,
+                                           pi.totalBytes == 0 ? 0 : (pi.currentBytes * 100 / pi.totalBytes),
                                            pi.direction == ProgressInfo.Direction.OUT ? "sent to" : "received from",
                                            pi.sessionIndex,
                                            pi.peer });
diff --git a/src/java/org/apache/cassandra/streaming/ProgressInfo.java b/src/java/org/apache/cassandra/streaming/ProgressInfo.java
index 2b306f8c1beb..a1c695989832 100644
--- a/src/java/org/apache/cassandra/streaming/ProgressInfo.java
+++ b/src/java/org/apache/cassandra/streaming/ProgressInfo.java
@@ -58,8 +58,6 @@ public static Direction fromByte(byte direction)
 
     public ProgressInfo(InetAddressAndPort peer, int sessionIndex, String fileName, Direction direction, long currentBytes, long totalBytes)
     {
-        assert totalBytes > 0;
-
         this.peer = peer;
         this.sessionIndex = sessionIndex;
         this.fileName = fileName;
@@ -111,7 +109,7 @@ public String toString(boolean withPorts)
         StringBuilder sb = new StringBuilder(fileName);
         sb.append(" ").append(currentBytes);
         sb.append("/").append(totalBytes).append(" bytes ");
-        sb.append("(").append(currentBytes*100/totalBytes).append("%) ");
+        sb.append("(").append(totalBytes == 0 ? 0 : currentBytes*100/totalBytes).append("%) ");
         sb.append(direction == Direction.OUT ? "sent to " : "received from ");
         sb.append("idx:").append(sessionIndex);
         sb.append(peer.toString(withPorts));
diff --git a/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java b/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java
index 56c57d94e206..a8c8719363ff 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExpiredBlockers.java
@@ -72,7 +72,7 @@ public static void main(String[] args)
             {
                 try
                 {
-                    SSTableReader reader = SSTableReader.open(sstable.getKey());
+                    SSTableReader reader = sstable.getKey().getFormat().getReaderFactory().open(sstable.getKey());
                     sstables.add(reader);
                 }
                 catch (Throwable t)
diff --git a/src/java/org/apache/cassandra/tools/SSTableExport.java b/src/java/org/apache/cassandra/tools/SSTableExport.java
index 74af9fbbe49d..6222bc69ddd0 100644
--- a/src/java/org/apache/cassandra/tools/SSTableExport.java
+++ b/src/java/org/apache/cassandra/tools/SSTableExport.java
@@ -142,7 +142,7 @@ public static void main(String[] args) throws ConfigurationException
         try
         {
             TableMetadata metadata = Util.metadataFromSSTable(desc);
-            SSTableReader sstable = SSTableReader.openNoValidation(desc, TableMetadataRef.forOfflineTools(metadata));
+            SSTableReader sstable = desc.getFormat().getReaderFactory().openNoValidation(desc, TableMetadataRef.forOfflineTools(metadata));
             IPartitioner partitioner = sstable.getPartitioner();
             if (cmd.hasOption(ENUMERATE_KEYS_OPTION))
             {
diff --git a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
index b9f9ad0eb5dd..9795294fce74 100755
--- a/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
+++ b/src/java/org/apache/cassandra/tools/SSTableMetadataViewer.java
@@ -174,7 +174,7 @@ public String scannedOverviewOutput(String key, long value)
     private void printScannedOverview(Descriptor descriptor, StatsMetadata stats) throws IOException
     {
         TableMetadata cfm = Util.metadataFromSSTable(descriptor);
-        SSTableReader reader = SSTableReader.openNoValidation(descriptor, TableMetadataRef.forOfflineTools(cfm));
+        SSTableReader reader = descriptor.getFormat().getReaderFactory().openNoValidation(descriptor, TableMetadataRef.forOfflineTools(cfm));
         try (ISSTableScanner scanner = reader.getScanner())
         {
             long bytes = scanner.getLengthInBytes();
diff --git a/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java b/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java
index 79fec81f345b..c9788c3913eb 100644
--- a/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java
+++ b/src/java/org/apache/cassandra/tools/SSTableOfflineRelevel.java
@@ -116,7 +116,7 @@ public static void main(String[] args) throws IOException
             {
                 try
                 {
-                    SSTableReader reader = SSTableReader.open(sstable.getKey());
+                    SSTableReader reader = sstable.getKey().getFormat().getReaderFactory().open(sstable.getKey());
                     sstableMultimap.put(reader.descriptor.directory, reader);
                 }
                 catch (Throwable t)
diff --git a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
index bd71c64f5abe..1eda9af15198 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
@@ -201,7 +201,7 @@ public static void main(String args[])
 
                 try
                 {
-                    SSTableReader sstable = SSTableReader.openNoValidation(descriptor, components, cfs);
+                    SSTableReader sstable = descriptor.getFormat().getReaderFactory().openNoValidation(descriptor, components, cfs);
                     sstables.add(sstable);
                 }
                 catch (Exception e)
diff --git a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
index e15e5bccd585..8b2fbe4b56eb 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneSplitter.java
@@ -89,13 +89,7 @@ else if (!ksName.equals(desc.ksname))
                 else if (!cfName.equals(desc.cfname))
                     throw new IllegalArgumentException("All sstables must be part of the same table");
 
-                Set<Component> components = new HashSet<Component>(Arrays.asList(new Component[]{
-                    Component.DATA,
-                    Component.PRIMARY_INDEX,
-                    Component.FILTER,
-                    Component.COMPRESSION_INFO,
-                    Component.STATS
-                }));
+                Set<Component> components = new HashSet<>(desc.getFormat().supportedComponents());
 
                 Iterator<Component> iter = components.iterator();
                 while (iter.hasNext()) {
@@ -122,7 +116,7 @@ else if (!cfName.equals(desc.cfname))
             {
                 try
                 {
-                    SSTableReader sstable = SSTableReader.openNoValidation(fn.getKey(), fn.getValue(), cfs);
+                    SSTableReader sstable = fn.getKey().getFormat().getReaderFactory().openNoValidation(fn.getKey(), fn.getValue(), cfs);
                     if (!isSSTableLargerEnough(sstable, options.sizeInMB)) {
                         System.out.println(String.format("Skipping %s: it's size (%.3f MB) is less than the split size (%d MB)",
                                 sstable.getFilename(), ((sstable.onDiskLength() * 1.0d) / 1024L) / 1024L, options.sizeInMB));
diff --git a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
index 323dab142c29..0f5d4d176e63 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneUpgrader.java
@@ -85,7 +85,7 @@ public static void main(String args[])
 
                 try
                 {
-                    SSTableReader sstable = SSTableReader.openNoValidation(entry.getKey(), components, cfs);
+                    SSTableReader sstable = entry.getKey().getFormat().getReaderFactory().openNoValidation(entry.getKey(), components, cfs);
                     if (sstable.descriptor.version.equals(SSTableFormat.Type.current().info.getLatestVersion()))
                     {
                         sstable.selfRef().release();
diff --git a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java
index 4e4a80a960a3..e8d3a060e2d1 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneVerifier.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneVerifier.java
@@ -18,6 +18,7 @@
  */
 package org.apache.cassandra.tools;
 
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
@@ -84,18 +85,20 @@ public static void main(String args[])
             for (Map.Entry<Descriptor, Set<Component>> entry : lister.list().entrySet())
             {
                 Set<Component> components = entry.getValue();
-                if (!components.contains(Component.DATA) || !components.contains(Component.PRIMARY_INDEX))
+                Descriptor descriptor = entry.getKey();
+                if (!components.contains(Component.DATA) ||
+                    (SSTableFormat.Type.BIG == descriptor.getFormat().getType() && !components.contains(Component.PRIMARY_INDEX)))
                     continue;
 
                 try
                 {
-                    SSTableReader sstable = SSTableReader.openNoValidation(entry.getKey(), components, cfs);
+                    SSTableReader sstable = descriptor.getFormat().getReaderFactory().openNoValidation(descriptor, components, cfs);
                     sstables.add(sstable);
                 }
                 catch (Exception e)
                 {
                     JVMStabilityInspector.inspectThrowable(e);
-                    System.err.println(String.format("Error Loading %s: %s", entry.getKey(), e.getMessage()));
+                    System.err.println(String.format("Error Loading %s: %s", descriptor, e.getMessage()));
                     if (options.debug)
                         e.printStackTrace(System.err);
                 }
diff --git a/src/java/org/apache/cassandra/tools/Util.java b/src/java/org/apache/cassandra/tools/Util.java
index 3757754f3508..e9ae4a4f7220 100644
--- a/src/java/org/apache/cassandra/tools/Util.java
+++ b/src/java/org/apache/cassandra/tools/Util.java
@@ -41,6 +41,7 @@
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.schema.TableMetadata;
@@ -310,8 +311,11 @@ public static <T> Stream<T> iterToStream(Iterator<T> iter)
      */
     public static TableMetadata metadataFromSSTable(Descriptor desc) throws IOException
     {
-        if (desc.version.getVersion().compareTo("ma") < 0)
-            throw new IOException("pre-3.0 SSTable is not supported.");
+        if (desc.getFormat().getType() == SSTableFormat.Type.BIG)
+        {
+            if (desc.version.getVersion().compareTo("ma") < 0)
+                throw new IOException("pre-3.0 SSTable is not supported.");
+        }
 
         EnumSet<MetadataType> types = EnumSet.of(MetadataType.STATS, MetadataType.HEADER);
         Map<MetadataType, MetadataComponent> sstableMetadata = desc.getMetadataSerializer().deserialize(desc, types);
diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index 499c2297d7f1..4567ea128d87 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
@@ -37,6 +37,7 @@
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
@@ -445,6 +446,28 @@ public static void skipShortLength(DataInputPlus in) throws IOException
         in.skipBytesFully(skip);
     }
 
+    /**
+     * Reads a length via {@code readShortLength} and returns true if the bytes that follow in the input match the given buffer.
+     * If true, the input is positioned at the end of the consumed buffer.
+     * If false, the position of the input is undefined.
+     * <p>
+     * The matched buffer is unchanged: it is only read through absolute {@code get(int)} calls.
+     *
+     * @throws IOException propagated from the reads performed on {@code in}
+     */
+    public static boolean equalsWithShortLength(FileDataInput in, ByteBuffer toMatch) throws IOException
+    {
+        int length = readShortLength(in);
+        if (length != toMatch.remaining())
+            return false; // sizes differ: no payload byte is read
+        int limit = toMatch.limit();
+        for (int i = toMatch.position(); i < limit; ++i)
+            if (toMatch.get(i) != in.readByte())
+                return false; // first mismatching byte; the rest of the payload is left unread
+
+        return true;
+    }
+
     public static ByteBuffer read(DataInput in, int length) throws IOException
     {
         if (length == 0)
diff --git a/src/java/org/apache/cassandra/utils/IFilter.java b/src/java/org/apache/cassandra/utils/IFilter.java
index b5eb2c416021..67596fae7bcb 100644
--- a/src/java/org/apache/cassandra/utils/IFilter.java
+++ b/src/java/org/apache/cassandra/utils/IFilter.java
@@ -25,6 +25,12 @@ interface FilterKey
     {
         /** Places the murmur3 hash of the key in the given long array of size at least two. */
         void filterHash(long[] dest);
+        default short filterHashLowerBits() // convenience built on filterHash(long[])
+        {
+            long[] dest = new long[2]; // filterHash requires an array of size at least two
+            filterHash(dest);
+            return (short) dest[1]; // narrowing cast keeps only the low 16 bits of the second hash word
+        }
     }
 
     void add(FilterKey key);
diff --git a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
index 4c0f9723a757..6ef6310f06c6 100644
--- a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
+++ b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
@@ -128,6 +128,10 @@ else if (t instanceof UnrecoverableIllegalStateException)
         if (isUnstable)
             killer.killCurrentJVM(t);
 
+        if (t.getSuppressed() != null)
+            for (Throwable suppressed : t.getSuppressed())
+                inspectThrowable(suppressed, fn);
+
         if (t.getCause() != null)
             inspectThrowable(t.getCause(), fn);
     }
diff --git a/src/java/org/apache/cassandra/utils/NativeLibrary.java b/src/java/org/apache/cassandra/utils/NativeLibrary.java
index eb0eaafd2621..bbc37cadd034 100644
--- a/src/java/org/apache/cassandra/utils/NativeLibrary.java
+++ b/src/java/org/apache/cassandra/utils/NativeLibrary.java
@@ -418,4 +418,17 @@ public static long getProcessID()
 
         return -1;
     }
+
+    public static FileDescriptor getFileDescriptor(FileChannel channel)
+    {
+        try
+        {
+            return (FileDescriptor)FILE_CHANNEL_FD_FIELD.get(channel); // NOTE(review): presumably a reflective Field read; confirm where FILE_CHANNEL_FD_FIELD is initialised
+        }
+        catch (IllegalArgumentException | IllegalAccessException e)
+        {
+            throw new RuntimeException(e); // rethrow unchecked, keeping the original exception as the cause
+        }
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/utils/SyncUtil.java b/src/java/org/apache/cassandra/utils/SyncUtil.java
index 1917e8bc5844..aa74247d1053 100644
--- a/src/java/org/apache/cassandra/utils/SyncUtil.java
+++ b/src/java/org/apache/cassandra/utils/SyncUtil.java
@@ -25,6 +25,7 @@
 import java.nio.MappedByteBuffer;
 import java.nio.channels.ClosedChannelException;
 import java.nio.channels.FileChannel;
+import java.util.Objects;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.cassandra.config.Config;
@@ -185,6 +186,12 @@ public static void sync(FileOutputStream fos) throws IOException
         sync(fos.getFD());
     }
 
+    public static void sync(FileChannel fc) throws IOException
+    {
+        Objects.requireNonNull(fc); // reject a null channel before doing any work
+        sync(NativeLibrary.getFileDescriptor(fc)); // delegate to the overload taking the channel's FileDescriptor
+    }
+
     public static void trySync(int fd)
     {
         if (SKIP_SYNC)
diff --git a/src/java/org/apache/cassandra/utils/Throwables.java b/src/java/org/apache/cassandra/utils/Throwables.java
index a9ee2e46c55a..70dc84ff5d49 100644
--- a/src/java/org/apache/cassandra/utils/Throwables.java
+++ b/src/java/org/apache/cassandra/utils/Throwables.java
@@ -226,6 +226,8 @@ public static Throwable close(Throwable accumulate, Iterable<? extends AutoClose
         
         for (AutoCloseable closeable : closeables)
         {
+            if (closeable != null)
+            {
             try
             {
                 closeable.close();
@@ -235,6 +237,7 @@ public static Throwable close(Throwable accumulate, Iterable<? extends AutoClose
                 accumulate = merge(accumulate, t);
             }
         }
+        }
         return accumulate;
     }
 
diff --git a/src/java/org/apache/cassandra/utils/concurrent/Refs.java b/src/java/org/apache/cassandra/utils/concurrent/Refs.java
index e5d9c37a4c88..e34ea45b683d 100644
--- a/src/java/org/apache/cassandra/utils/concurrent/Refs.java
+++ b/src/java/org/apache/cassandra/utils/concurrent/Refs.java
@@ -89,7 +89,7 @@ public Ref<T> get(T referenced)
      */
     public void release(T referenced)
     {
-        Ref ref = references.remove(referenced);
+        Ref<T> ref = references.remove(referenced);
         if (ref == null)
             throw new IllegalStateException("This Refs collection does not hold a reference to " + referenced);
         ref.release();
@@ -102,7 +102,7 @@ public void release(T referenced)
      */
     public boolean releaseIfHolds(T referenced)
     {
-        Ref ref = references.remove(referenced);
+        Ref<T> ref = references.remove(referenced);
         if (ref != null)
             ref.release();
         return ref != null;
@@ -114,9 +114,9 @@ public void relaseAllExcept(Collection<T> keep)
         release.retainAll(keep);
         release(release);
     }
+
     /**
      * Release a retained Ref to all of the provided objects; if any is not held, an exception will be thrown
-     * @param release
      */
     public void release(Collection<T> release)
     {
@@ -217,7 +217,7 @@ public static <T extends RefCounted<T>> Refs<T> tryRef(Iterable<T> reference)
             }
             refs.put(rc, ref);
         }
-        return new Refs<T>(refs);
+        return new Refs<>(refs);
     }
 
     public static <T extends RefCounted<T>> Refs<T> ref(Iterable<T> reference)
@@ -232,9 +232,10 @@ public static void release(Iterable<? extends Ref<?>> refs)
     {
         maybeFail(release(refs, null));
     }
+
     public static Throwable release(Iterable<? extends Ref<?>> refs, Throwable accumulate)
     {
-        for (Ref ref : refs)
+        for (Ref<?> ref : refs)
         {
             try
             {
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..dbc18f6cc25614b7e39a6e473793b27f66a1c603
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{lk4fixSK4JVi{LzwJGAT;-S2+cnq
ILJQ9Y0Mxw=6951J

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..1f9357b1f068c1b1f97130a1d8f3f7d1dc1c293a
GIT binary patch
literal 5328
zcmd5=|8G;*6@M?mG&PxC*lHRwjhCWq1SWBv#5k@-e4SFXbSsdllxE!|f*n$mFT{!C
z;Gls`+6-@^U0O}8RmMn~k<rl(LRZ1mLY&0;{sku_!TBCRrKQp|p^h=oR^INp_IvNX
zXnz2|`E%~M_nvosKA&^$<v5;uhvSVtNs|2Jx5Q;|Z?qeYS=l+JZRT9dvwDt<NbgC<
z{EY&a<o6$}aM<0|hs!JLYs<af@=BN2S#zY!Q(x*ibih?!Qs=6!v^!j7_M`Ty`f{hM
z((ziox6EEuT~c4;sdd=xZqE^Ksq?72rtIM1@|se2g{QRKQRk@exNAzP4|>X~O5Bx|
zuN^FNRM@>`<?gByM@j9WibJK9N6X!1j#38+?{Ze24uWc}3N&u-JRY&>+^9XPMo$A>
z-L|UAf3nebx!+36m$R_qYVB-!VAFZlWl!sb_*eP-fks<rPdYJ+IcDnl9$A35>Ckck
z_?q75WM9sv!(D4!;JQugy5$nMnpIbrzbgu!xf)Ng+KTGdS+tH#Hox8@#QcKs%|CJ4
zbhTpk^=ILe&h4>vjh`nUDYjz-hBh2J<i+rH*&Q(tjJy7o_k1le_~|LM;ci0;EE^jq
z&|)Fs$PjhE!Wrshgu9P6Grp+1MY4Lt_|jx*=q@vgyT3?fR>I-LA_z4{-YC2=NF$Xt
z(@0V1lusmQ8y`mDeCtnHcmSsn7!qR6;WyY(9B%dFH1d`5Y#i<iF?*x%ZR=rmG2Ele
zWCBG!8SE(P*<n3Q4#*@2+GAf13FJsSGU9<=%=_3Q%EPFq0U}dj|05?0CBME-;0&>;
zy&F0*n8Dt>jtu2+QSZ+?GMIf)?=P66N4=*RgnO?y`AoF}PK$>$p7(mK*lwlp9LAyh
zUpooTlX2(z{{r3)_=`fy*#5nziaA2#i$L8NiS^uR3Gy6wZg)N&{B7UCe<1vuZxQ|i
z+G63c4*l#r*1105IX=rrOdv$u9-V0XTOv3J1&Pu=LTaJ+Vdid~Yd@6$hS&W&jWG!@
zKOyZCoIK2_0%rH6CL74vXW1yT&+WneB`%-djW1_qr#`8y%=z`W|0GFUl1mSGWd$H{
zTNJFOs2T=X2Z<CA5m@2-r46iR#upT0AjuEr+!fB7LSVHp&SYWnhmZ{4z$7y19JURV
zat4KmfEzgj`MYHe=-W{&Jw-%9rq;k8AY)|#Km%b4mC-vmO-;(d1e^wHA&(9|!=~Y2
zLBuaAVu1&pDI{1$Ee1~(YT%!|8UYv_xXKCu?z1;jsZvNhxcq>fKa`vfFfIXjC^uCz
zy03_VXvkxIU1dBP3W8F}E2E*S(n~D7rG@taDvTapKXj#<`|OGk7XaLMVu>(Salpft
zWp?Y~qU~Mm^dnz?PM|CS!|&1qRagI%IZ!k_sX_NxH1aJCGFyT2vI6wATz3$tC_rCm
z`^N->0-PzN<$1$5lT#_-H5L1|<Ojz0-3dG<jJLd*ApKo9u}Zov+~Y5BScm%)`mwH7
z*u{S&Tn8Dr4KlDXDJ}!qK4PLW@RnXQzP$<@gs&1WfC`Lw3jd!9j9zoaRN&O=3-Kjs
zc^~<06HP!>1xDvCk$+aq?g9;<%Yp=q{^u@Ob9VFcbl~reZyUjCYEVJ|qMCn<YKe+$
zQ+>hu@{81J*7|yOp=w^*EaqyRgYdT}V-f*y{l*oHK&ce+-<aaIu;jn-m4^HypG2h1
zD*16_Ewn2_y!xYv5hG=G8;+ExGnElXe(42gxe`V`Y}UXq%TE+pX7h7QV0!~JwpByG
zu@|J5NP5b(e`JbQcUfe{4q2f^QRq4rpp)g|Kh_E{%cFR#Yl$3*g3;J*b{LK8*s#Vk
zG*lnoK~Ga&3ynMAFcor+H_)?`i9d4R!V({iw`fZLct{JEKuBfYk4swK-`v24bu<6i
z8D`&`#mD+s=s)ge;o0MEmHK#eeX-9(`V+$aeTB4?n?HK{pOosc#vBMw+t@8JvX9za
zj|tgxN1$GA71lf<6$7jHUm{EwEqsDhEYv`C-U`;^cY*)Z6$#eFnCtJ2aRJEj5kD#b
zBZ6pLt4`fd6yWCkn|fdi(*sO?FXb)0i#x6tvyy?<Tj`Qi04Dg<6#_81vk*v~{TvA}
z9uX+Oq!V}{=8VR|r!kRofAT6=O$llY&;+yrOT=v}d<UVJiRKl)UltRqum+R3pqrmc
z4gijmpWq)j0mr#d@T=!(`yV3?OgI*yHB~PC6X`{4``?D9nx)O;d*tSSDr8~!j;8L=
zQJL?z5}KJ&eCxaYu)8w4Gf4woL<O1A-`F(p&wjS}qg!K=RTTg{frCEg5_0SAGCMh%
zk_k}ee-yFJKZ?Gpk(OvZ3vo2KM4BTxXcYZiJ33pyd?AWHD(E8VqVRwE8NZ1{Df55A
z9_kR(!jGn(x9QYYX}Sc$Qs<t8os_yi-A18&c+&$-wlp=Y>E*m2yYbALNAECe&3s=A
zcT@GmX+j~rd!I&61t{O;PTrgwyS7$S!rLYq9+UfcyZLF_VAl(0pES^`#>BZypnUEX
z&LCvc2ZVZ{eAr2aK=ux-GJy5Yc@Tmt+&bA6+Jni}k+>3=e1tVr36c$>aZ`LJG8cM*
znNv$Edx8CdM(gb4ugR@5>%LCeo*T%(GnbMe9hKX_>^(?kF}sM~nd=3zCXoR&_w-N(
zRQH^#O!QFnhpE|=s)hhU@G`ka5+Tq!qxuxcZ9OKj1Lfy_X9288z}#mbHL3*6&49?L
z5-|6W$cs|nNrN!IUQ_>PZWJ)gLh}~d?91hU{uK)2S&8QB*xr9W)~d?AKc4?<F_W`+
zKH3bC$Qiwm7}UUxQ4M@9mE}KPI8dZ%{%8IX&@}&Of!yfTB99i%QYcHnLI)e@!UC<W
zauvr54=*u+qQwl&U0`vC7CMryvCYO}gEh!dZ?T1K0nqGP@=AeBS%x6|U$IR1Q5Ra&
zmi|Rj`js_WeZOk7Eu|+G+DMmT;^|WW*z%XsH#X&y6YQi<Gp&5SPTZMKZY`!R#efc^
zG>}B>Rt%zl73l`U6ektZyZf=9yr4Ui&T$u&mpG%I^GYZDbc?f1k4LS)Z2As=RQQmu
z7fu1q0}1&$bUl&}4|3Zwp=AyT!9-~_h(i9X5h}iD{v$7~7E+-P2vD(@-An2-5sOW#
pKgfcc)Hu>LFy3H&*AHSrpFE^i?wWu;4PjkHA-oLo-kQCJ{{edfBH#c3

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..3d9631973846
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3036065180
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..2cf64b034c96546c4de0b3100349686aa0f7bf30
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9t;b6+-2x2leXJfYKWUAv3He_a)%+AcPf<1uQz{G-!gN2!4
zFB>z%B_Mvz7Qk$1YRLl<`@_u4$krIZW@uz=VqxWKsUpC^qUgZK&%kKFz{^|z|Ns96
z1_)ph<zdp|fmz4Rl*tXXj)$p^8*CjH&^j(8>o|ed0r7KAB<t9K*0o@>&hQ{x6J0Gd
lI>gIPR||~~3-?gNLT+G?@*@Q)FEB{Eu?4Bg5h+1b3jr>=VAcQt

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..3d552f20e788ad431475a86f1b45bc48811f1b63
GIT binary patch
literal 7095
zcmeI%Yfw~G902fh?t_=G!Ga(#QmH7CE-!gC=q9cKO%uc;v25hR?#P+~v%5n`Iy8n~
zm<Bf|BWHZ{m>35g8O0Ka?1KiBmddDDBMi-K)acZfqqHiq-P`Wf|Nqnnr!UoX&&-{B
ze*DkwoOA9u=l*9Qga`{4$BV>QmdH0MUsn0z<m+|$;t7dQs<K<nB&FWA!5}qAHMJH)
zjbyh=b!MAnFxNT^`CA;8Vw>4wvzSXHo5Sj`*4J5V#@6O<3yws+Qg!i6hx|qHUaxm1
zES8G9ZtS4WsfTa&baBT%nHDFe$HHGaJu@yhQJN`5=>sgRmA=6u_YoBVi4b*s>YoGD
zivJ$SIEZlw<50#r#u1F87|&rmm+^eY$&Aw(=P@p1{2b#AjJGo0&A5m0C^luD2hC#Q
z4#p?3xfsUVv3chE`Mo$URA7r4*a6G21DWv#-No^cA?#2$_Cq_c_3vOuKaU+Zg8fJe
z_QC_$hDz-0ec1T}*d?>Et75V3&tkVp*qzM!9Wvp#JHc<?KB5V~pg658$KHq6Cv{&&
zajp#SXYLDZzVa*7UJ&s57cQJdadEgEIpF6m<e)Mya`21=dDcuMvd)axV}$7k6p!kQ
zMxHYshdg(*6nXwR{JfJr_<3hIKSk}APke{`)JojHwQ(*K-}s{*`PJfX<UO}@kb6(#
z`5Eg)^GQR(Z^8D|SfIQI?iJcH@8CVi6?*1WDm1s~1Ug^2w{#a_oR4cl>x1umF&M^$
zrr;IO;<`~Z-hg^*6O0G;x~8CmO{;H0&r0Y=`zfqae;&re`JzGS$m~S4o?~>k--hw{
zLvLJzeso#kG3Z5C*5pB_oUc6som;hSKeXwa)HTqh2R3Iy*Cc-Q1a#w_uu16l&$<er
z50&ki2mNJ7>rUvY4@Xu5)1<KKKIocNtKj~pT_q>f{jW5m-Zyn{|8o}C_{ZS$+{ED(
z&A_~ax^@8z3(o0_pgo?g@cD?pI(NhON8sYf$(5jeP{+~c<G>-s?+)64L)W+e1jn!Y
zI^r4}f5e5t9zAG3TWV{7?}ONb#(o2cCwxAshtF%lje?>!5MNT^PK5bDX2$hF9f%u8
zzwdt;`02gfhhcwSaOY;i^;}nK>U4tkt<}9l6Tthgjtz7IpM3BAZrESfspOID`*Es&
z?d)_uaM8wFcemd+vHR=l8SFdb|D1zy;4@0~|B`d5^O4SsAMJ5DrsZ5{g0!3qO$Yhg
zIhXHiw46)Jxxj+ba;|?e=c4P^4h(-lqRShJxUR7+{5Q`Z_q2iG7Wp?0<x(G&(V5pm
z({f=xqK1?^6^^E+rln^r%goBo$<=Dw9BAu6+YA5h^T2m(X@`||U~4Ze?Zy3{NnX^n
z>yE!7-~DMEIetkw)38B`_ly{GeLC;4t6AGTge3j-|G=sZn<bmQR@!8!D1S1id`p9c
TK5$Dy`9^8e12<p#xZV0Y<am*k

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e57b49d47a09d46587a2451f4c2835221d92242
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{lk)fixSK4JVlIL742fAT;+;2+a?r
Hg%1D#+S3h@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..b038dc27c272dc4d1d1bf718ce88827aa6dcb701
GIT binary patch
literal 5364
zcmd6r{Zmxe8OQGhS8=tuCXs}}n9DS@qGP=aEN=$)EHzEE$;53;qSSOGfCPD6cUfJ?
z(A|tBZf$+bbP6@rTB+G&+9Y7A(n*9BQQmPCMBX8pc52dQw4F|3)5z`foV_pSGWi4i
zh2eS5_n!NlyXX5k&%GSSb8m3G#V5<MAO4rQ1ny_e7E5wUsx>V=BQwj)!H8Th*ZS)O
zj`RC>lojN=cU6>@S67vKy`|+&Z?S7niKn{Av-4$VY2j|?uJZf>XGwlder0uOv9rA3
zm1=KEe&w#hYL};~AV1&j+2buLu5r6cc2tzQiri(MqSAuh1!W$$t8mv2Pf2B=yS)6B
z9VG>2`QDOJcV%HgVb#vEokis}rS6h~q5>%Ibn-eA7S-qr)MaxQwjZ{eTvBtg%S<bs
z|JFR8e_x%QYfQ31{&X_Y#5V52{(#+7^-#*FNl+Gc$R7ahg;O(-7gN)zwjSn)vYVt!
z2`IK6<y0{yXS}FPGb-g4RHo}n-fkl8kKRFPhEclKfznJ}3X_VtHpva8MwAW8?Y&{Y
z*)68DX5qR{ZMSpn4NG9DVl$C2)5x`ENv(dmNJSKs+S_bWm6+nA&!UlU6{M*=zqv|?
zOcEsRcTd3WweVtMlOf#nwjr#zX$be*nNYg;4!6pq&(sydYEzXPNf&Q^<aLY9@=7c<
zD1~13iST^el(i79BYJC_P$fhXLDKQ<2An%2P&w&XcN;@(^d}Z3lsde%0c|q4w!RCO
zsEDPG4@j;)Ds>F7VyWYE46Um8(!yFdT$-p<>7U)q4kmqcfDPA^C)lv!r)~(j*vPe)
zI<gzs(OYd<OnBLx-Oh%qjH4T8-(W|J{USU&60cNsY~5Z6Z8SW!op7NK{dYIRK3EsY
zY=X?#<0#3-#NQm$yPxiRA;f{B-v<*bb~bZIWgqN!(0_GrSh;yUe^U68|Gn^M#7rhn
zOyt7qE@G-Y$#Z<N4{``4Zj(v0{5={K!iH%1Mew)4U2e>L)>OHj@_@B?=QQka<U#rZ
z`9-0a$EzYHzdT?eln8d}a(t4Vvoo-<#FV&&+T^_7OuAN%U^?{<uT6p0-x!Bk>xf<n
zW(%E|O`D_Li^dYu=V(vjwiYvtTCqKp=n*rGQt@Lh)+2y$i?RkJO82~mkN`-C?A}wG
z>|FO_+$!8Eln6m``O|=pu|oHHf;8|`8U#sq{#Mo~-LH{&k7^sz{inmsAnE?c5-3$0
z?Rg-<5T<QlLaFC@M1kt5o>%?4*Mm~eQP}H(3WCm<r}sL!yR|9{V7#^G0KiXSLy+`L
z+I_J7Qxz6Sy)oy(h1ziM3NKq{?~^S|C8T%99VV1|>+mV6h4ucy5MAVA8PoNC!o|Xq
zR8T-VlVV7`TL(p76p7c>r0+WvtLn^sPh_{l8Z;R6RoZ|-8fE$p+H!GNlfT!>f~D`O
z%wUJ~-S)!+RI4Wa4+4~t`VkzE&ZjmIlUq=50Os#3%0}}e4URkT%(ncoZhr;!zIKdT
z;JEu(_EYnB)^*(z=Xi7G-YEGKVb47H^B?|96fsGsVsL3&__IP60$~Jm9k5)=gA^a6
zC=U*rMa#Z<C=hN&pP@MDe=g}8aWL=}7s0`==bup~jnJUKHeoIe1}@oQ^N1-eNCP@G
z!eHR9e$1wB<<$w$idPokQ7KxzS{cZK@8W?ers+~V+@EX%d#4+PI~}lz#SDW`3LJ*3
zg~bH!PS5)*!7wJir6)SUl)C?+CAjigCesoGN&m4tx+(<`K@wi*#W_?2gjb)yP}zti
z{PYYHO5xJBXJP&7HVyB$Gk^%c8wfEcb~xO`TuQ>XZnNRl2jj3()wR;qbh#Mzs7itU
zHguCd`hu;34ZSo}0dTe6IC{<)PEl{^bKONVY2h_%NYDvz&E*%F=m>&JbLF)&t=O`t
z%|QQDpp!N2wUM?&R{f<R9E@-E!l=>>kio1OC`+GvFqaOiPj=924C`4i4W2bb!<0*E
zGtfWV^+gq$e~W{j>f07)=TNKDZk?uRXS-4{`@r%I_lDUs`tjIYBP?1V=b^2$17LMA
zlaG?~1Q(vtYcRX53jsi-l3<IDNM8>r1W5IPjuN0>5H0uXmr?`_Ej{s#2)O>Em<R$E
zqp#P{X(J36N}ZX@fa_&>u#d%*??ahNT>t^;`k~%LH#wc0IuKa}Ee*W=ULQ?cqx4$P
z@&j;ofI4lC`tzPFOs6wyWXbRnr#-$(SVW*aYW!gdy(MPerxzRrMLaix7SnMs?4l-9
z+X2AA@R6+s91LxaKu)zZ8E$I8ITRd7!&lIU>Zmk)i|q(*JYZ)He<Q6JFICZT<9P(2
z9=Z{IdRPynV#2cy76;_U^~1DIl?OvE*3;z1ofe-N-rY18k1U(51yid;7<t^ck@5J*
zk6FWvRIh_ZT)*^~Z7&<P7_{TY!LMv=4?nuxh6k$(grOt&4x{3NjAq|%%hD}QMqjcp
zOO{4ot^KVI1ZngrYzd|X82t-d$LOdby2D0e4;!Mjwg#w)#4mBs7j%o`^l|EcwaL%O
zjJ(-8DFE!;P}9u40f6*}Y1>^b{BB_^+K+HUVc4B+G2<K;D<skH4AsbhTX~cWsyr|+
z7T*i5-**DhaE^P3(Td;udZ`oN6|6q64kg3lSkbaVxf5JU$uRb-rUSU24BrCA`{DP(
z7lkiO+U@!7L^Afl?YU4G|ECjvS}|oSZKSIOED8KLu}EI(5mRADX(fU8SHXcJTI2K*
z5C}h1fG|h96MtGWf!by4<|hIeD;&T$h_T)fO!UTu5?KOFd<K3uGz60i!Qk3Af}k`u
zh{u%ba5A|%M4hF*6HGqCvAcG%=9ye4tW7iW<nb7`-JU$ZfsvERiGU&eUl$WfQ_I+$
zJGB=3ZOxZcx%6C92{86=?3ndYuc1MhdXtW-{-65L;Qy&H!+rj1V9E3&BR(tBiA-m+
z!_xFNV_0LWU~M}68}_Y08o!@KfHZv`FV=`KJz;N4WQZ^=7vtlI^aPEEehwd+W)dEK
zI|p(h`XK~!IgtB73+66Fa#1-Zu{Z~qWo_3JfnWR!%eqK2W(R$xcl9lK>CFU=dtdof
z2Zz6z+bzH5k4zfOyC-~q+Pa*t5#Hgeg<qn@1JOAq>GFvjbY5B#HqQZ}AzGe~M9AUB
zE#N)3m>%Ti`9d6a1%U`E#vSnP4yah6UxBEobW&6>a?BoVLQ>F=A#x*sUJ&2?dTE&h
MzaH|BR4&W-Kf?woIRF3v

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..4c817caeae51
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+112902994
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..50052c4fdc6ffdfe2b0ed173a40e140aa24ba4a2
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9tVQ0!@4`MPlXJfYKWU6BqHe_a)%*M>Hf-QjAz{G-!gN2!4
zFDo;{B_Mvz8o+F5YRLl<`@_V{$kq_RW@uz=VqxWKsUpC^qUgZK&%kKFz{^|z|Ns96
z1_)ph<zdp|fmz4Rl*tXXj)$p^8*CjH&^j(8>o|ed0r7KAB<t9K*0o@>&hQ{xBV8>t
mI>gIHR||~~3wKk)LSQgI=R*oo9$=7mVGB}|BT@pW76Jek%V5L+

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..e37caf6feba7493696af6cd0af3feafe5612a877
GIT binary patch
literal 7095
zcmeI%ZA?>F7y#gV?!5(y&_O{Ih-_0)L{ln=K!SE+9k46``H;G4<U$*%Qy^`_kh!op
z{6iMj869f;+7~fsB#Ivenfrl)n_ocO*cKpc>Dc13RgE%K(t5A->ieGUhuNQP**(e4
zJrD0`bI!TvocpFQ48zY`5UXOQ$|CVb#LFsPta!aAUL3>3CY0K(W+ti1woxZk3*{9S
zUAbVl3zcS@pfguEbXi*+mOPu;VzZd@1)Ia_uvS%CY=(xqAF_`}zEyhTa<lkFu|A*A
z8yZbiJMZiu&hh8(wzsk;Kc7^MP7a5?altzxHj$c1Md|}Aw57hmBK8p#9*JS(@yUM<
zkSqRsAmt#+!IVQNYbb|Pj-))3@+``8C?`@*rkqJRhw>WA8!2z2yqj`6<w0yhJrA;!
ziaRKu$7W|x-j2;t-_PyGalRN^m4Y3x3_Fk-Z_q;=5AMYd@nAo<16z9-JL)y;m;vk;
zlCb9;!q%Cv(+*%~bz$dE$1aV=wy(x+6tG*U_1kO2aZjAzQ~QX--9~X@D#AX1*C+9O
zgW_x<-p}k;*qrGaYR~g{{qy}-P+WDX2|3_aD{@ew4_WQCAW!o~AZyHcJ%$^9LGj3r
zDCC(VG03w93y|kr$Im-)0zdB*=NG8`veBQAUs-|sw>HL&;+uZcBEOT@hP?NFI<o5m
zo}XbCnoklOb`Q2EhHU9Q@T8FX*+-s0tdPsydT4h3S#-X#A8KyEI2Tik)(6+IUJc`X
zt$I1MYTY0jZ$Oo`7RCcz?r~_fan)VuX>pxsKZTmKeJ~!z<@P{Fq{XB4JVSGT4~)ki
z{op3_i%W7&LeKxUBojKRui`j#M(NuJp^e|`OP~u5ZApbLk3aJgbj^d%G3cg?t=Z5=
z3-`{3zS`XIKJ@tMfmOgHA+)Rmx_sqIxc^CO{#kkdOU;OHmj>>C*5V#H2R_e^9$Q`q
z%sGgACon(vx;7VjVqzP7KB_;QyW#sIa6!b_3eY~N`9$3*;NZNEdThWU8=8KH<JWu_
zeiM#Ayg#R13))W?8mr;^Ao_@*QwQR4Uyf<v^O}1nJGT+U7ZrQrVLp(W@@tO<#0`T#
zcfJYy>i)K4us^SRGE(7st}HONI6?b{GFR^?@WG+st`^|)AAiyY`|Ivb97uZ_C)#(f
zOlAS+Zo2ny`%^P^e_cL<r_T64=U^=Oj8y%<<XrN6Bva!@dt8huITxBBCFerZ!Tjx<
zYwBy1oJ+~Mz=Be8u75J;A{*9rUHXiPDym^r>uMSo`}B|R1sD73#osujOMX-ar)i&e
zNZlZtx)c6{UcWdwWl8GNwDb(6rp$q|4wSv{?>-O2BTPKDl*39nu$7mV^5Xu_BrkI2
z^}t^d@BTE7=|3!;Y1kme>;Ba3n#^1{R8lYg%$e}l{{t)2Z4qqt3SqOZxaj5dqOH{y
U^2{w6!qo_ypSgMEk6&9K0SYCRRsaA1

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_compact/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..9f719378e34d2c5c64c1b0f1eba1a2472cd679f2
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^sc0BKe*8&0r1gfKY|LukG^5L$R5
E0GII%E&u=k

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..f4c625fb8992377d74681e6c08c9449ca6f2e33a
GIT binary patch
literal 6003
zcmZ{oe^eCbxyNT=b!TJCAgg3{0}L+0?(l2&2goj(JJv%G8!JQ;HIXWw9*JO0NN$Xl
zV}*OAahvK5i8W|rTP0+*w@m~?3`yI2`6<7IE{KAlu#0W4$D10kZDP&2<$m8;W@q4>
z>|gMGp6C7fyq|gAm!dS(Ta?CDF*7q$iN9y4c&e(wX0tn-8JR1xT<%9GSY~SOM_Cz)
zT25&ysX&_PJMmQRtzSIzZ^cuCHw&o+g|EK8*jR8=TR~|MebGLlB{8)xrc{knE&C_6
zbXL^S-IH3O!G1|EwYJc!eKEOelDbsis;4dMS-)1eXb%LVDpIQaRLjq-lUkiS>O%FT
zHrf%pfSDZzjFVK$?+ef-KiD*><(wl@R1I3xpa=f}V)?rtQ5mZ!If|Opiml;iNQ(v|
zji}7IQAu4SYLyL9E0ieGUM{?8e#chCWX+8!fj7+`I*Az9+!#+8v7Bw5)H3HX$#hRH
zJftU${Tj}+p(UUVYs};vjgqZy$0XI-GXg_`fAwuG^>#MbFr7l8ykhgt^8u|cYjOF5
z0M^IxQ7|AsK(*f6FVfZ};rm6%?b4EZ#kOy*ZrAJhB}p~e+^jD|+io(LCw$%rCK&G(
z+VVXHI(f-%k3p8iwwHbFdYbpR+VzTd3adyPKb+ItC-J1Y!7{bq%{-z_*JHk+F23Y8
z(5|JCQ)Gu^vE2mGy42<|a1D$_nxfEL^<*r~FaFt+u`0i8Y7@;b%`ah9+Os7rw3n$(
zoYNZ-jBP&%Kljo%M@raG$L;UjKLo#q`-eiYV*964j77&U^?8azqR>9-DW;Q`ey%Kn
z(Ebms3m+Y&cI$|NTSBy>gbkY`uY_!ke#%oUT+EoApV+aDEoF7i=(5TVy{2>zlV+aX
z#&g&|Y-39v*+$u2=s3Yx(DVhQ!w~}~%VNisB5b2GI?#n|crQW;5Iep^8tV8*zmaC<
zGCYRhB3zD6Gt&u=rnc$BRa<;H6Ke9KPo|keD|7i7NY+P7)67bf!sS;y2EFOWsztbb
zoG8CiD;J?8T(M#}>1M8>gq_{FUZ$LOE%ti?T4uMT)gd#R@4dH^wnl|VlyeQwht!tE
zG1&R2q`IA4L$RV}6ti?f)VFIRH1|@thFw9!jsVrNrMV$fml(Bo7n`r4H1?#wkuERp
zNM@+Z(?$#JWEVCM>Zxa98LMRR=KHCx$9Lt>R^!H9IYj)tK;l>GZyxES0JRGQWI?16
zj=cHTr353Lkte^NXKWJUDuOt(kA{JO%=xgHFTkGw)m12=#JG>7w<8-eLlY0GT`zW3
zNGv&FBbl$nk>meMi6xC?uw*Df#L^0HnjZ!pxXigR0eI8=7vKZ8LW!Zkvu>)M@XQp=
z!j?5Rikl))nMfDTr9`18P9#U)^fb2jE)Os<|Gr@63!-l)sc-sMk3n$Q=0^MwyU!vi
z6fAYFtUhAUx!8gS$Xl5-LigzsR&TtBLmMJecT*lAl4*d5G6lNF;Afgro!Rb(INR38
zB1_m1<h(+Uk&N^)K}kfh=UbG=pmRUAdH4EQIjzN>qSa)s`_3!$5ql~siFVSb)@tEF
zTbm*=S(!*;Ps<&Mmf-@3GLpp3=X~V2vSluETovCxAr!qoJfO5H^hSaMd`u!?gN&tK
zp(dMl$K=$j7}>kQ<BibqG2e?U5A`ne?u{%*@DTu3?nUUWMlc`aRcT^xz0Dgz8#}lS
zD!)H~rS6zJ&7}(adhbC?h@}CP!WsoAR~C<;Z)dE9d|t>zXSpPFmS45eJp0B~o66bF
zw&`g{E-3}5Hobxh@k#;Gi8k)SUdpTzuDsZBfi}nfs8z4UWMw2>`48+cTSvB2@%(^b
zrA^sQ;YuGA<Dao|iqmJ<b6ebkJ3lfkm`;(mSm;{>*&df@^=q}N>Gla&K5JYKmQ;q4
z*m<hp{s0~SK~u9n{JjXSK><@|Q`}kovH|~uHmnYYSvCPSonuX?LE@+5$F<s4Ju{0Z
zVt7cfZ8t_1O=!bmQTWtlLTb?QPS3cOYSmW(QH}}#>Ypk3q;}#r5L(p|TJ`NcdyMAe
zUORh?ldBulM1HFOqvE^ta<2C;Td1|`xCW~fK`Md1qs<>8PvUAHrZBw3wsT_tkROR>
z=kDemH-xq6XZ3{zQi1kA#cOU$gbM*~#Us?g`_Y7N{eO%9dcMW$JqqBQUDRRbZK=>(
zgcK3~z#z$Kv$ZRr4M)agte0x6MT|*%nBXeC306KQ9;QrHBr$#G0Wp2A2&NVDqV~FA
zGQ9?pWX%hzhC6L~4;I(Fm<wk}a~(83ZA;>+nC+U`VZK6`)q@1G{xJ`|I&7E1vF5KZ
zRpB!xV!Dfru`UY-&Kexw)y$ZfR1L9H0j=2g8+=uD?zoPEHa%xo#bIXxlutjMLc5p6
z`EkhQ!pdj!0AucDPfRH%U-a$RO@?*=OCcw!=sQFh!`EO96z%5(G5p7vyrQ)LB{YBi
zfT9V$JES;T_J52;1;K&vbTiGHV^k%WFkprwZ(gM$gfQ?eP$SE{rASp;iG97ROITf2
z+#jpwaBSeapq<Wbeicg_8qvVPYK+Kuf;2kIF>&A>AJINi*F6Kj#q#iG-*k$CqJdFz
zwEwC)+5@DJN=}GUNrS8;In>fhv}H|rYC1)stT4EZ05zcu6A58qu>1$8b`#zplatxi
zcdxyWc5&f86}*JOQ_tlf_<|ZygH7bK)IUc;!KgR=1z#YUSc6}ZFVOOD%t6-GXbhQO
zZkyZ^8j5(;?cbK8<Co|E;H{7-Tz$554;)`8-J{YDXuQyl&CzP0S(m?0<{?*6{Yz2*
zTDf%Oq6$|1E+C2LHh?-x5!JtCT`Zl&?ae!ZD}qu*_3zIkEga2@4V6%+!}{M|8%rm0
zzpgSI2cDc2$o|T!`uTjidVN4oJC}c61}fJ2nb`}Ngs5wA<A-T$Qr7rkHJ`5jBk#X5
zb?(H7dVpwBqCy=|dF}B^<P@K@`LivY{gF-RY{n~GE2?j#-AS)T;?oe)DxmR6Cro`I
zKk?cxt>p98^xq!Nel~l?Q6ODByN**j32buVwjxWq0&WPh?OJQW*Rw5Y?2N*;6(<QA
zHZ#THGcH1?0LgUoLw^u$SqC*UF6P^W4x1^E@46NTTIaTy;z1GDyin&!gc{Z%5>xt@
zD8y_bi}1LxaRi2h@1sQ4cq2d~Rim1Az3?hprY%1KFo2^PWjF*!Lo5ntT~=b+6#lvB
zu&Y9)TJ!oTt2CewFzgVKt~a4AnjgSeIcGT?njeMUuaDtcvBom>080=?(q7dfng2uF
z^mM#+2|(k>R85e~emM34+BG|i+Vo+JB~-JC!%xtpzwKL9$Qa&jBiav7s<k6!L_0mA
z*1Yx549%!Ykoa<;BMx^=K~9(;MG3i9$QQz)m8d_m5}#t#&=Q9Kgd@xUvr^UW2)z{?
zU(UyR4AM18=9Vzx^zNj2K2HtEk&UFR;-8<Ry+rCX{)kIqk~s2aBo4jtKUIF>NTUlQ
zHFl@(Q${~BTtp`NS(kF>5=Xu`Lk>YQb8csd1L6&17aUpVkWFFN&~tBNFo|Vx8-xt|
z!Kan{E8Qd9SOxdw<c-Qig&T!r?8z_T>Kj7hjS5NoDaI+^1jQS*m4sp+-yzpS`A&MH
zrI`$LEK=QIpyn9uPJURRXyT~WO41_5c*zqA7Df{;d1yXm6$($eL18qPe5X>jK?^}_
zs2nM0O+4<!^-ig{hJGGF)Qb_8P`L##4gIdJkxob)%!}206NQ_?v7xbd8{z)<cN^6l
z8=6^Up%asq)HaebE29Fbe!?&xW22t6r!mf7)bE^IW{vf21{zZ*cjbf_mOn|2))~v;
z{rfkIIr}WaFs`xBa!=FNv;{6)k3$H%x#;j)5EI>V#Pn=jq|<V8PpdsQt~Aq$Y0s}T
zYu0|w0Au)h#hWj!&RwVD(_YVACzo@=&FcO_+L`t%-;WKNwWm4zyTo^Gf%Mq+)uDIR
z_}VGz!t7-g*H%E!9Yo&E56^!+^3u8^738^I)6A@4o+eORPmcWH9pr%dPJx6xX8Xb!
zHzU1etzTFePVD!sNL*LCu6k0N<gh7^Srm|PYw>ObTaue3cnyah+X%ZxI}Z?crQ8He
zSAbngq*P1YvXM8N^jttlz7h(z@Tvn1w+O-E{513!NWgXLMe<p$Dq+<xcNiWgT_?_!
zu)yQIce{paJh-^iFFz<{=Re%}d?fGh#v^{-!`)Xnkvg}q*rrPV@yBT5ZYxu3*?G{c
z`Sk>{@_8D6)<c%#g(-+Kv&7LqH<5KnrEv99Y8i3-6+Z!jfbpvu_$h`&8{a3ITxpGe
zSOm><MY$7;<AY})vHwdd61j6>e3}$dHe827h1*2T2+pnB=2)$|6HlyS5zIeTshZ8i
zwyi`fYgTLJC!uMnO=0WHloLL23iTX&L0xT)=~1P)Qn@_bsX`=}_yqekx3Zcbl@&jI
z9-l{f!6v=}m2_@<<Q;SzthZ|4NX6~MA8%^DP1YP=P3DsHv`GxNVaTRnFm8$}=Q2JH
z8p3XCg=g`ppqIRCuxPg9cH0y(a0iZ}{ycWnz<i=;HPHBmNQj>*;>QqK-%*rbFGz1q
z<bnD&HOBW8f)Z<b3b}^w2X$)AX<=-=V;|(5-H=OzvL>MA_SpB)S*cYcuc4J7>uQPo
jdXonWREMwD><yBr!uwMZ7F}~{+g2+0(`d@N_wN6H&2F~v

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..5720255cb5e0
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+647001919
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..6b74492bf38eb1a90aec71ed49200373d323391e
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r@@rCA&AM?oQ>IPAycV^upu);$9`spIqm_>1|}9<94yQX
z>(4PW91de<xOOjq+0fLI2PF0?nVI2>Y5<#|k+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
qqpO8R2VeNp)k5P#cPgl1VH7Y(pCJWl4lqddum!2f;R_2<Ed&6?<bW#x

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..02f7d47d3059348d0f51c171a0eaf46680fa7afc
GIT binary patch
literal 7104
zcmeI%drTZf7y$5{eef!r-0>=qqPC@_*c`9&GPGymDqu`N9+qlNS-4B@D%_F1D^kR?
zsq`OWkX8|!^x=~<(3G^O1xo85txzr6mNcTK5KK-qC3>}Kk+y_$SMJ*H8%;F+qsE<N
zv-7)eZf|CHX7&p(48yNj6)P|cWwm@F@?n(^Rz6;l4~}7C6Dl266O&|@w(7+?vD#wR
zSBnmZXfsKo-ehs=b9Xq+MUu%ZnN7u_<g_}icAHr;G&g>me=PjP$}1N-<O{`mJf8WG
zXd>*pv4^zHJb1IWi#>6APMDk<4}JdJ{Dj;@X{HpV53tZy`UZ>KM^tzuhVhNh_pzU^
z;ywFQ4xk)FIheABvX*i<<;9ejP+mqkk#Z{K9LfchpQgN(auelt%Dt4wunF}%NCp*m
zQa*#tMp534%~99q4&XRnjxD5N`>n<Hr^Xv_7srE!v4h>%5A4C#y@eh540g;Y_Crb7
zD-L7pE3h*UV&@KG7e`=MMq@jk!fp|<JE{3QY{YSQoYxEMh{XMj;-sPs`yig5#Qi0T
zv!!@Fv!7ve6<?tCJdfu;e|ZGOh4Zb*e%HH@14=!}f%9hMMe||E8WWz6TH_BW9^M~`
zym%r8dC6D_^0JHgc_+@|=bhH}F>1ed@;l@w*5m$digBU%wjXuKyNY^{_utM!?mLI?
z&v+lYpCl;s7Hm%p`O16XULiY|9=Qj(LN1(5hGtiuMEfiIhUN;4b1@BQesKNI1;RMr
z5V#In*gS^D>u0w%z_@>(YX&;dxZx)BqPPLHo<b^gmtZ`UD;$Cj%Zx|!IZAW;O&E{u
z-g^!D;WY&(pjUpiF$X&7lI0k5cI8WlppD-oZ-g#6T$>JE9ski|(Diphrl4Ct>B@)h
zF5SNr`ty$FSD|M<9Nhp+5<;r_p{w)q;QA+B#V391Uui}>uWR7?XU(pOQ{emT<k5AF
zz?_q~_5t(DFX{@RXJ?z>^AUb&YlrU-|5agA>p}Z~j^m9V00$MlJtP4KZ)yDrj$iY&
z_8J_&_Hse54z!ODTk7EZAo_@5Ko8<^pHAuE^ICo*zpw?wSC_lvVLp(aHa(;Pal_d6
z11|tSd7$Sg?9a3A>~uJvZ6(IeHqgGgs&9A__|Vny!A{^a@4VXs`|CQJIGTAsPINDi
z%;f?XZo74N_kA-CZ{4>CFYNKZ&%s#m9i{qz$+>*@Bb6FITH|s|&AHGGQgbeJJD9(n
zb1hs(&AHT^3rr|A=lUmeF0y6Q;Q9BN$g+Ay*j(SDjj;T7P8qT6l>f$|9KJ_oc&+=<
z>tVZ`K2vAHA5TtBNljamo{^cAt=80gpw0tzE&RLp19=ORw=H$EQa5b%rKP^O|1-&p
zFY~(Nt;lC@8poWPR`xV(km9}9bUI}&=aH)!?G}bf_~ZY9Rq1O*$zc&|^yOubXO-=!
kGn2pEk|A8ZSd;Q+7tXV9w>iyHp1o#!txfrbnrR{GS5^;}KL7v#

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..1cdf45e437d7c5f9dded2aeb22305f7aafb161c7
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^sc0BKe*8&0sCfiO9nAT(bJgcddf
E0FOWnMF0Q*

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..200f4d9e8256e6c0aaec2af1f69d3048c3e29e1a
GIT binary patch
literal 5923
zcmaKwdt6jy8pqGT=$uiTVI9+f4Tk{b9MsM*1I!#m539)iGG-|nrS5J9>aNt5BAMVy
zGFcY77Ia-i9c{aT7gn<pxr0C>7X{=_pU>LGm8>nYR`z)goH;LiJ}rL@zvp@0>+^ok
z^S%_Npbk+AWA^CiXb$`yrF^K|Qll}*WDX7qwOILS6j(+JZqH=26y;ATa;SktUWq<b
zRL?sP@7Xlm(w$08N!`BRq;YmDvnd5cp9<<zVoY)Ng4|x}(w3M(rOIN?(hn-R(x5W6
z*xXcya$8{TAa#`K2cs!yq+iLM3L0?5^;4Jj6+s+tem!MSIosqtMLRwaXmpUebi991
zNwa5429*YLhZt1=O)8*^Y9F+;4ntIMI3-1)r9Trbw_rGikcly)=di0_jU9qQ<sfQM
zDOAiI!Kg)b5M`Md6=p+AA`L-SIS6}Nn~#>X4TDPUi4dG`;ndw~G-Nu#z~mJ{W2Q5r
z<yi6{Rq=O^0GbK@&?CT5`t1VrcL?Rn(kj&|-n7zMsn(tycO9TAE`Ga!W`mo)U0^U(
ziA(7aDtb3mfot$hW0mCU;41#RQ=s{f85uB!v2ce_>0g45A?6%XOg@Q&eGC>ya^xzv
zRHj1kbFeIuCRFZ^s8ZAH>mH=`F0@2@ec}-yyVaHB6tCr1sSW1wj=9PkcAZ-PLwtsz
zR2v}9k1O98?@hC&jCgOCn^5JRa$KY0LuWF_HBy6I)m)e{9~zVf{W>&bi&M2^7{`(T
zT}ggI)uTq}ShsaR>2}6Ww3%rK*)#9<ZbB=l1I^D-SglrM9&+hi)oH~lnhm|ASVd^f
zw|9fqL&S4cH%09i(1NA4tyX*3bt;QxPQP7eC|%?`zKc0G29zq+^vc{Mmn&E8pO8UY
zEEb<iHFrvCfU8~(8^&55+$j*AS)%p=YIfb&!c{x-&eM#gFYi1VeCuoo1m8UyA_vz@
zD>y(iR?mV1gkw!0IP%u;js$OM<+z&IGCR#$Q_Ac_a2jZgbq8pr>H6_4drO-|Gp6^n
zS%%UyQ9G7AK$Tz6WYHE=gKZzo`w3a@x(A)Vya${&$?BaMSu|^=XJly~cZrv*M>X_3
zbM#S7vVu|s&Ho;|JGCuPOVzwbN5`sI>!+(Tw2UYpPwuB``z*dR!{5Fbg5oWkh^l4I
zm?tUV*GI^#F?t~|4t{qp@}j08=GW0>?SZ)LF>(cUxdH67C*4MW0MHCjb<+wF4Pl;;
zeg;z*$XXgbs4BHkH*Xlyh_#*slUa_#bG`y|kzI2GEWvURTK<@UmK_XOLMF!SEkVm6
zI8lbs2{DFZI6Z7BB)h>fA;eGt2HlkbrTYWE2J6I_0Wf)eZ`WzRVgWzezfKwbfr9Sq
zm4F&{vCboaGZ`mDaDE+|^(tnuiKI}?mv+csv-Ot<yH1g(Gnk}wuV1c*bzv8;BtcaF
z`xKah=acDrmm^odCta`h%1_t3(yhMSp;3D^J6uOxsK4FsfO8){rEs&$O=y@+<9hga
z$Y-2u2pL*K^Wl-ux2w5^)C6xjFnkM<&<)Rm#D?!rklDgDya9=C3I8D9p>aV-sHN-D
z^(vO1LsF>WGgv5<CA`sz=Tl;s(9jcs<N3ye;Lv5-irKMb4h&TqkZ?I56CJ}K1!OJi
z9BL2SN$v3w)G}e)B|q-+^SEt?VcR5hUOtAKe>#Rh`{hcIn5!9>TrRgk(DeTy*u&K_
zbCYP64<*sl=$5Xh`K7alWxxuJ^ENE<Rk2HhVJltjHQFdZ;nMZgDw#Zu+eB?QYEr+A
zFN4M`JwmM_Ak%ot06i})HxSTkyaF1(bZA80C!ujTC&63A3+}Lr(u9R3-AP!M0AE-{
z89lYph#r3RMD)~dntOz{^3SAHsI@YAYM+nGQ0ouJ!N+DCJnILj+9N)>Iu*m8-YytQ
zW%SgR|1FH3$G7I{asfT2YygjUQ!-XSY!Xtacs)?4>Cx!7Xhzt|B*`VpwgtKURI{VU
zNLwS$IG@pL8POlV=1tEX*3qo+B9o+LMP{AgnyWtpJV&(k>@@^UTZIiPZDM`ZlD@B8
zq?QX>q0^{cw`U`>n!m{Tb!J8UXqBO5uA}y9UlZ%J>5#j^=PFo2mFai0GXV{lB?)4M
zG+pbZ+=~pFe*+UQ1X9uvLWH6zia}FKKbS%%MWx%sboBv~Wm1$qA58@tz+{~ib;60J
zvLe{wS#xu>k157S`ZV@d>j6flvkrcmjAibyE>CN7-ABF9_7x-r9q<zkKTA!IDn`Au
z3l-=<4Jd`Cw>AKlEW$^6_~(=<hzJ#_?NmedsQ6{~8KGOLOmZ%SK>+(Qh-7od_oKG1
zgw(e9ptf_E)LxDO&FgJ~PtH<kh-S;7Dsttn(+LgHY+nu;`dtT(Wx3l)LcF5WgCjG~
z2`lCz%$3jx5Cc|tfat1|nl3>3V-cED2B1KeT)8U`+c(kd@-+J<S2%YiQ`GWM8!wc&
zD+QwFnBaE-kMZ&*IL2~&UEs$ozlq0K5``AE5x_7j^0hEGk1W}@B^pbJ$aQHC_(~J!
zTAo4(NA8giH`j6yAsl%uAw#A`%URHv$jY|8a*5Q`iww33Hw)qWB5_4%>45E5v625;
z<m(c-mLH*TVpmKfniN`RpM#o~UBRL#)kv+p=omvda;<Ac?THCmCR926iaaAR?$!eM
z#ssc#5DacDLn>F3N()x(E51P9j+aM5(_I>t%Mvl8113TgNX_17?6iNx!nsM*qB>WJ
z1WalUw^yiXeLO)bsHd8rh|K_~Y_}cNNHr4r?kfjh|5x(|lt?ObHXMax2PTuMZI&l|
z-J|?G0}Rq#N(5E=%GIzJ(d$=x6P#)@-+`+{)UrDZ3_*h;Ah4Wk&#*#q6`gJM1=x+T
z%E7h2!lU>O?>mmPlU)0UB^#m1PfIpRNCUTK_}1&Y;ad;j3$}?D$SXOo+C;)hRavM-
zyvQYeKMNt(e)InnW-&72LJN@&tF@clN39ef?y3i#K0iNZNa=<^+NBt^YU2<Ps}UZY
zcn+7PA&`hjG$jq=5#5NVap_F(U!#S|FX5gN4`j{_@Gu$4)~eLp!4wG57uj_{q5@(Z
z0H3y1K0qQqYU6OF+7Lv<i5$9GQixd`a}g^di9=UE?||k!d(-ut$(%*Y({gBptKY%o
zRD7($31vz!(HzeC+DzmF>puds3Yj#kpd2(sS_#HndkE%8vwz4WwZFgwX%?^6(%BWR
z6##H(7H`zDR%Qc@!Mn8U;0xE<^Ff?-<NhRSNvw-dLfcaaV?Jt+(REXG2-il=;Y8x5
zXArG+%z>fPY+wm!5}`U4KN3l^aj}m?x;o-I*4cG5ANMCcf!2<lMGg(^WtR`3&`}6O
zrrGxjNw-gE)EYX-Zm-9H=JgdBddCjKv&O4a;F>ByhwJqAfU8XW!W@{jbcN&E4&p^M
zst8I%$x-MGhXRTYh%YB^a6)IQ5&F?>8Bi+LMH1yu=d(Kx(dPL55n`@tB}jF?fv4qN
zNCWv2*m)jFQzws<>+A$YnR&>a)J9-^0uxk()HMerHX*R^fNPXoS6Crj025+ClOV;l
zm4wIAOw`4yR1#5bPPp$gQN5L*%7s+htqN4{PUKU&X1f3^xSxj<kPE5yMY%e9UUWox
zD&i?P8?w6JI2lh}hb_M9Sx5VniX&kY0d>6zDk=ZOzFs4RYcCg5U8jq}U|tnP5FkV9
z`ZrilCI0+bhzy_Z>3OgyNnUw6t%R-aKuDRuBp$9BE-4NQV85ibU$1eCiZuis34?)-
zI=Jq1T+yUVSkYop!a}Yal1=UTCi)#kG1?}+h~2!Kd#|&KqCOhGX2OLNh{lf<ZQUn^
zf4yi+`uL<wH0wUa=rZj?h^z$U(q%m)k8dZu7eL9TeQfF|uN9x05~ilQi}g?n*qNW0
zLFIgtQGRZM^%8p2g^<mbWPKX6Q);^AdJn@|sjJlp$I~xiuwUtRTr=MkB*%)3>S5rl
zsLaWW^M{n;6;I++PXe?JwUHAU@UiRJjsi^eWc8!z0CYR~k?kECE$>5{lONeWjX_JJ
ze&QP&POYN|EnlZV%)}=)!;iQvwrND7_x@ItSW*hZ+_%NT$JkZ7^sa@?Q(0_V;o>Tz
zs_S+JtOb*(Ad4xXcNYK&d|%8VFMM3@n@}h5wttipwCcS8wDPkxz#|h0tllnhP+yXR
z`Uxav<qWc-;`;8xi_yvjWI5IsjwRa4xCz=LqPBU0mL+O0z<eYQaec2thU(uxu~Ej=
z_1_?_5`9{Ygxh%_jpt=(O^Ouyx+4n0ROXfc9%(Zwh>}R?`#A>|gMUzCg=vqWs`!<K
z@TKnenV&fX+Y}Nrq|}y)50b^%3v-9S5W9a9%|Dou0fv8$8Qg~8rDLQ9-spCefO<|=
z?LOGZYr~+Pm(>#+P%Sfp`mswj)Vu_<Ykvtg@Xz97Q85%VL)3L+17ImpXaHDkb?F1t
zbz6NZ?U%Uzm_|_#rp;p}*lNSJIt<ud=VD$lsxR*n)cUt~!F_P-nkb4N<m%KVpTM0=
M{8K1;19wdSA7AaiL;wH)

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..672e8beb7691
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+400579342
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..6b74492bf38eb1a90aec71ed49200373d323391e
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r@@rCA&AM?oQ>IPAycV^upu);$9`spIqm_>1|}9<94yQX
z>(4PW91de<xOOjq+0fLI2PF0?nVI2>Y5<#|k+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
qqpO8R2VeNp)k5P#cPgl1VH7Y(pCJWl4lqddum!2f;R_2<Ed&6?<bW#x

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..a8c896b1bfe5b63c0236f68614f686a212e48e9a
GIT binary patch
literal 7104
zcmeI%drTBZ7y$5{-928y2?wG;q*g?c+5>qAB<Lxe0!<Srj{>cYEZoLB3wPx1q)2US
zEdE0pinbO_ee{tMgGPe16)5T-TF`0>5-r*ap~;mtHtn@WDJtQ*TW<CHrcK)PpSI~v
zvf26Z4a>~#%<PwiVHket@+b{6TUJOXA{`d#V5Q?#>EIY9Dz3s|F);}?`)0jRD^!_9
zeU;#F2v(C_(3{LoebzRom~S_UcF|NI*qs)q#by=lhQ@}kbB>3-SaIcIt8}3#kH<3;
z6hSn5ZtWs1laJi)YG+TJn%0a@j|M+~Ze~nsA~%zZ+y_`_%YB1I>LV&V62mCtQ-18D
zRJ><j%6^msC<jv3QVyXUMtK3{g_L6`$5T$CoJl#C^3#+zQ{GN_FXb-EBiMv`9%L02
zcTzr$&CaL16Pu&1&mF*Vz64v7jP0`;+m{-z-+dep7{Cs6W6#}%t$Q0g{2A=XVeCZ-
z*h>#%>&vjy4q|8ZVHeE9u86>PJcZpPV7F28cfg3_?r5)P*Aa>S8O2FiG4??`KZ*Mb
z6laU@dS*Yx=E^=t?Rg&0fBy0%6xW<@M)vus9oetQgX}*eBF~u#Mb?_|d<-#uhvH#9
z;m8ZdB9Rx46e7nA;^!Sdg`anF%Nf*u_4qf)PprfJE01)c_?91Z$S>!2An(7Mj@*3?
z-=EQLbU#Tz@EzEm7;@zIz=J||EI#r8QiWVRn+VM=JBjvJ_D$^-80RAE(fr_gp7V!s
zzTSTgv}WT78n2JdQV-+4-L6S!f8+Yw&~u`D(RvCh(+$CRFqhX49hw%4=JR~*-F+|~
zb#(Vl=tom>Pe3obR+<T&Fl0Usol)`9A!y^5iKWnmhig)yt71P|3te|FXac(VllC0w
zqec4{Lx0xV_!{)&hr{cENnB864|G*_HeCOtz2Ky>{^e%G^M)3#e^zvjeGGn|9Y40F
z0hn_V*B)Sg$)GL|dTMGrd_J0ATK2;Chwt*xiFKg8Uu$Q>2fzXO@ATV&12;AQ2*<Cz
z9&!_oKjd<5mkzX_Cp6W<_d&!FL$4mhqc2S8;PYB?D<`iB#8;HKV_`m!n*4pg7Q_uB
z-}b%${N#a-W3WHZx-(MYe6|)E+gd>T#>(!2ao|HYM*G@;Prv(K2kfuwZ2WNA!#L5s
za%nmXIB(0H`#T?+ad_*>9z46p|2_v}!SBe`|4YuL+>az`{Ai6!F*WBxH%QI7(CuLU
zcFr|>88zopb1pEU)ST;|%(=*>^1k!$GvUQ`jAmn9Q<8S*w{zju+8xq24*5_XmEm>s
zqPO;~?Nm&i3E!ERxH2g@C3RI=dWKq4?}0iG)V1*M-VdZLOxm{8%}U*{)t8p~;{MMh
zFG}Wh&s&ks-ZYN+O8#6Ueg1$3DPGFsF}c&3kKHJ3mOdNC{qg_6D)lvj-C-80^(Dm{
p(u=p%isUc1WC&L$RImKA3uoK5TAiXj+g81`#w!1X@(Wi7e+5D?mD~UT

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_clust_counter_compact/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8449a126036626584653a47dafbbe01eaaef367
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fOW5D^Fu02LVs*8l(j

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..801ff7c5dd8511ccf45310f02917a4f5badc42ef
GIT binary patch
literal 68
zcmZ=^U|=}Jzyc&w>i_@$-@w4g@PUDek%56t=s{3g9|IEu1EVN|Au}7d0E2)EF9U-i
V10xH=1a<)i4xk<uMn55yI{@GB3zYx>

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..a65a24d4b144
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+2180385804
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b58e3946e23084a51d78d96f37ff33ea78b7fbb4
GIT binary patch
literal 16
XcmZQzU|?lnU|?imP*6}{U}6FQ0zd$3

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..c0f56d107fcac771ab84207b642ed9ba26ccc363
GIT binary patch
literal 39
ccmd<M`=5b@fq@|fN&=Zczzm@n7@;%+0A3^lxc~qF

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..50bb4e27256828179287e8515ded21a8c181e295
GIT binary patch
literal 4814
zcmeI$Z%mU%90%~{Ux6ZYv>*xwvZ?4Gt{4krBxncLKo=7j=)g@f9@@@gMj>sZ$TTjF
zEs+q$<{X-!&MhGZjbvj}U`AiKf&Vs@sNKjkm~^nXxZ*M$PDA%p`o!P&wimwHizU6}
z>D}jk*N3}%?&)(p9LGtEGUsWynY2X7c;%>54naA#DTl~$`iX>dJCnKW2KNS|+$h_e
z4x>%>cx0E|EgS7lud!&0*HP-WJKPR?ne6t~dFvWn4!60b`DXFaq_=9WobOal6c-Ez
zr{hz2eox=+q@1()%}QOUK9oX#lqS}p&d{WE3X%lkICVg3b(C5N=h2K~7{@Y>W2|GW
zXPm@%F5@)D3m9iHUdFf}Y;~SWnD{G<H!$AHcsJud#y&L9To3+vChlc?5-rSOyaO#V
z=NG#$F0DmtOz5cP=xAoVF%K{vJB*GCpr3jdZTJM8{3<$i1pRb2deH&2u?n5H4_$Nu
zT{atClY;iVjBb_D-OTzOwqQJv4(8MEpsUuP_u=~F1J@ufRH*B8rdl|M7OTF8_L79_
zU%Gq&;+iw<;Hb$Sa7;xItetj%XH6%9b#`2jddn{mPZ~%D&%KcfPV<$67hJ^aoi&Bm
z+thIi+Akmb8T?Wq_HSLPAL1Kt8^CXu_Ja4`e*xV84bIQ?ewa@_HsLO9&zp<ez}x2^
z0-ry<lv-GP9DZMgk9AjQTug0(^&t*aYiV3+(ypM^toOlqqZ;a(Xgs>#e~(&gS@j3?
ztn@*+pW>?wmuNgev<^`x=4HToo};_JhsNg}{@@n%GdU$+P%r+WvVc1KlJh8ae$BT1
z)RxhumDJ@2>T{`W8OL6v-t<@eICcBCJ;l_AEB4N({;sp-ed>E(jjST(pN+2_pth}C
zN$-EYr|dYr|M}oYI(q*L4*!j>$@hh^BP&ABay5xw-v6NrC6^XnG+3#prnb`SqxrLA
zH+_CYXC{splJ+s3pErL=99w#D$W0vgdi!s5{JJ0Yx9IrwmrME#r2TBUwUIs#QVy91
zjU=9acHBU(*TUb6t*s=!WNjdW?gw&BzYgh0-0Yhe+)TW>tM>@)&ufAFT)LiJ<(BRa
z(!Qm(e|U^||4-LXbQ7Qa^s`>tU;pW>k-W!o-tf+ahegEJjdvgHcx=uSE~~%6Y_2vH
z)IvCq?7474B6}{FkI0@2rh|*@xnMpbd#=cyi!6x$*Pe^7ceP*5;iPM2oMzYNoA#Zf
zkMd)a&g!;1UE7rx8|u)^lWOVFI}N_jXDcB8eE;*ADk<$nF7zpdKUvSj!lxw<Yqe8X
zT}n);r&h~Khl^@z-+SnJ&eW<MooK7<;xhmJEl#bmUUqw&@|#Ah%WGP;W=o?(eDbFM
RMN5w7H$C}ztJB^{{2QhRxGDeu

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..582d8fbce369
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_inaccurate_min_max/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Filter.db
+Data.db
+TOC.txt
+Statistics.db
+CompressionInfo.db
+Rows.db
+Digest.crc32
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..fc38a25eea5d107acffd27df97523c4e5b4c1664
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fP8AtDeS02*!yKmY&$

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..9f7645e8e8e097cb7c0e0eab7cbca70b3adcde38
GIT binary patch
literal 89
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpFm@hBAY+
lkurnL*-rsLmNAfZCF%o^WunYrdYkisGD84ZL-gvvivU$)89)F4

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..36c915b373f1
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+4174191692
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..e20b4e2f2700a2d5278b3858fd96631b7f66fa97
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H87_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1ldcHk

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..5328b8858270e17ed2049a66b49958fd44f0647f
GIT binary patch
literal 4648
zcmeI$e@s(H902gU_xkuzq0)jV5ShR52a;k$h9+ngs(>+p@nfQMkq2!Ae*kHlis<5^
zn3!f^jHqbhZzD6+NG4MYLrv6y^T$+Th8iI?srW0dn^8n1)K~4(@AtAS{;j>_a_`gc
z_4V%FUGG{#h+_Pt7=gS?2~t6&Tqb2s%0Eb%Cq#YrcDijdNvtp}(~6a1NvTm=BAU%&
zxxplA4W$-s+8T>7+hj1BjD{T1WGS<hRg@b|x|-^{8Al`57TvhiAl)d&<M9lJMp2>u
z=_Xp&_r<ejJ9q5VfZ!bH4qI`4ut%!mt>#VMHo(N%+cub_Hln0JB1G<={BnSN;VT0f
z2QgMM4q>cftY#d+cm(5-KFjSK%f#auPi36SIFs?$jF&N9$9OB_X2xCElzATXOD1k%
zd>We@&Ugbh&)lEiiQ|d_Y#|9dU?z4T)8C-iIIir#4sl}--Gr^#iygTbJGv8lR3i5H
zJ=oep?37*DX>Hg!!?25@u+3j#+eGX}X8v~Qaoin?Ok@AWF3iW?h36-A|ApdQ9$wGf
zMQpzCPgGx_!1G^m{W6LR7iy6M{;?wm<#~{U2aU)>2E&n620S0t`UfZ;(GrO~q9+=8
zWLGZo*sJ(?$M@ssom6)Q)t~9Shdh4{Zr`G47m6=`qCx&Xy9s&6%jw9D^LTu^9cVnM
zGVD35PjwmIec*kf>&F~=59xwlIyVKHn{X0+zjE7DH(;EPu0r#JZ}~PD#uZh;v!I0~
zU8uhS6=hX09_VoOK?m#SK7$?-+ltmxXrbmBjEC`A?a<*VacDjdSH0X0<1vSSdIUXs
zTIMn62{#v}LML7;JqkUe=)2v}`rA_$Lg(&Tl?+`HcVafQ^;KvubnWl<4CupoJH|j?
zX{h-Ty6<@BTwppew73PjBt0Fje`?P;3D-aMY*E4W&lz1kzk&C2&Lgv`fq4sc)dMTW
zUDaek_xG=Z&qsJsw-xRmfs?{}=YaY_4Zl?X3arfD*KPt1Sz7xy?7!-k`Vs8E`g&%w
z2Gk!W+A86G5Oqk`ss-`bKYBIrd5wFTk!1t%gaUUQJP#x%J#JTlxUTDd>uTTyJDZNc
z_I%@>kqqZ^d9J>(4%DwHc62y_ci-u5YXm;M|3DLLuj^cVXUc~-)oi>xkOrK!{Q2t*
zA56@?Yxz4^$ylz+$rrxzQ#o_V-SVHg<dJw=`p;ZwIEequg~r2w=7M_&A^tPhXLRPG
zOBc0WI7lM%twdO2wS|d}x8>v&+t*9~ad@-5R&-9QXiqTgmHmIE;0sEn?*qus-XCt#
z#taQUJSqKQ^+4+6JIQw)giQSJe`kudt3;EzR9vYo$e%Yoe@&%Px*cy7SAKN;p3L0(
EHv*27XaE2J

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..1c738aa0288a04b77b2805690c4b1cf12edc033c
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fPuAR-VR02;3eMF0Q*

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..95ea5e12f87a7c3d8ef955195f51e9d05d262e18
GIT binary patch
literal 91
zcmdnQz`!txfzhD;|Ns9D42%rF*cUMJ1I4)0^79pv5{rPiijjfQP>CU1kb&XYas?Fz
mStAt&>$713K$bC(bw#2B$TCr3Fu65jh6+OfSjTLK7)t<Vco*mZ

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..314119cbea6e
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+230017823
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..657d5463fca87319f855d06b434464f4ccfb5064
GIT binary patch
literal 59
zcmd<!{Lf*2m*ehE4yLJ``u__T7_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1k<tj+

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..c265700c200e17ed28272f585dc034350f46ff15
GIT binary patch
literal 4737
zcmeI$eM}Q)7y$5l?|M)QR9X-PBJ&FsG>3`|P0&uL1B?mChv0^hgI)xuK-vzGxwtt@
zmS#aTIyCWn88K)i%BH}WCVqfl^Mjb721rdhG#b}wl;I@QyV=v{d9y_Sb?KX2-uvD2
zw0+;Z_j*r`VHnMn*eITP)n>>Yk+(8=W998TdE*$yaOO^`<69=a!n#TjE5(vhNhlF*
zHnH4d6$MMFT}WAPmolvu$tqd0M612bURF^qSxvPychioBeNlAta)bOuQ67(HBshZb
z+n#JD^@ATgZE0eU|2oV&hx<Z4J2%oVrzmMkQ^o)rZDnk*$zw!KgTyfE{M5sK>W;Vf
zr>v!{qZ~w8PuV~@jPh8@<GfbqImv5v9&uFuY|6=$(<v{byo&M$$~!2xQ0~Dd)b$`A
zQgJ)wQ`qbn%A2q`>ifCfIIhXV<`b~}=41O)^VPn<aa|X7kQ;l{W^Chr?C?*pBfGK3
z$74^~hb<IfC+)#b>A=n!ja?LhZCi@%5V0Gn_1k5}ad$K_iGGY-kc+(suTSFs9mUxk
zb)COD%wEFg3a+AjjRvoO&5bK4&R?iQ_WP>|S)1cQ4j7S;14lxU^%lGy4d(kO9@Z9)
zJhnd)d0bC6@}z6{ddCgn>zz=48s*P-{(-!BAs*lINEeE)eq==cDzh1R*Yml^t>^Ih
z>1##jljuU8!F*y$Q=S8_8`(JV&}+y$<nq~>(CpNc=>L`dUVjtDxyWj?KDf3O0Whwq
z4p;!q7xtj}`c;%w!?=H|YY;lXyyz)(V01gWpMndF*I_(_%jkp-O^QM5d5r$~P8g3m
zy!8R}ggNQQp{L%;Plk@aUV0RIUeT9(q0M(@=0j)iTbl@75_94s=&FB$2cYZDH>E)z
z&e=5)`nQJKEzpBMcP|1a(}IiJpi5Fy;r&mVvQEPLpLn+G;r-7_uKr)Z=UL~G1vS8&
zow&9EYbIYaW<U=OZGhK@|GRz%e17=Hh7K$Q`PznKH9rCCG7ofGfrD1o{R!u<zioH`
z=Wn=?-eLs#qeVw0d>%v`GPMgJ9({4Z2(Q=VCutcD5TB9fj)C<+V#32tJ&2on?zOK2
zUb4IS2prF+?s<uDJ(p*j8|y)SZE<Ut6L@cLUq>VGse?Z>!|}S##&svXi4)^DSB6u7
zGgd!)vFVK&n|ELR57tpuQ(1M#+y7r`E_GUJ&80d{-SPIbDWiK_j`?aXbb@>}7djn`
zujWGM!&h_pYA&!K-mjXAtX$r4;YTJsw~FBls~od>wg2`@?E5#$?>LlAotnv682ELi
zfa+m2M)$n;fZTh3fH|)eHu8fR8vMBa>iC_*$<urD3$+Y0?Z4m76boxbtF2UABjn|N
mJU4fJr9|F6ldBTfyn8k;XMZlmzHJ=J%OaaHkf^$Y%`X9*lCwGh

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_compact/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..e2860e1eb16acf51fdc0f6e8d44ad2c33bab9080
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j+Y5D^Fu029>+z5oCK

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..be45380232b1af79b596288cc2d52376d57cd236
GIT binary patch
literal 143
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13GAb)|e27~Jm83vEpMaOqAA18YoK
zm%yzH8Vs2XjEoG7hE@#uTKo*hiyUki*j2=zr!p|uFvu9$Fxa1cmjM(qmrGtz!60J{
l6u1)30TNI-;{jA*0u;E-`U5C%K)YUn0Rk8qU%aeR0svNZCW`<7

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..9d786a90ad7a
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3862701472
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..773d3c8891c3128145d30327bae64f5fc3b26757
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}Ia3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G`YV
A4FCWD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..036a76a00adecbda759158113a952341b9806888
GIT binary patch
literal 4657
zcmeI$drVVT7y$5l?(OAKp`!&+Aa+hg1x-;I0~54J>wskmjK{>yMK1J4aL|(5!Vq0t
z94<?<Aeu2|;$x4D7&M~d0~l&D6@1}?7*Qk0l8VN~bu+?n61rRM)$co5w!fG4Bq!(o
z?ss~7p1mh!7)CWaE{12Gr}=V0<Xk3aR?c6@nPV93Pq$3Y&zS_9w8kJ*3MHkYp+vCT
z1*=6843<)dA$^@g%$6*oBwBI=$x-Ggvsp#SSXcXV#{P))Mb|Dg$q$P0csvuKQH0<2
zXftUTef4o$3w!AJI6pK#5cc8ei9xxFx0*M3+W-@7Z`)v!+lZ11iD8ufDYpZZ3tt&X
zIf$~FatLJ&Wi90h%2O#%qdb#xJmp0`EB#8P;+d4+rM!mn2Fi_;+bH*A6KX!l8&upu
z`2;pQh4LnBj(R?~1IJYb*nA>(z!L00s=q-`aa`Sl9pc7*Wiz(!8|=vUu%r91rzc>~
z-i>W2#7^Fco!*6=Ga0)m3fuk;wo|}vrq*wd8OPmvWTO8ayD%Snr?Nhu-zM(sD9+~L
z{mh=n<_a&N`YIJ(|EjAOQJg<pj~wt@3vy7N2RV2`M4mJej;yiZ^{6%9NAZY`NaU%5
z(a6*KbCG9W#`7INhUYu6;UucRWauvPTg!0!Rz|x}eC<OW@+aA?$lIT!Ah(~!<1^5X
z#*?VSMqqtn%<#SkUL>+{#-10DFUW;c3!vFK$I$O9`<3PzjC0X7Xnk-U9|Xg=swQ|T
zG{34J^*6v)Rs-XK?XFSiVDs|F(3A9?Xg`G(>aM_e7?;%z9iAME*7Fq2ldoYsX7APq
z(67Coc?f#WjTNcT30F$@LoY7+co($!)`At#xw|Wppi5$pra@Q#89EGIf2Jh^dT-wL
z8PL5=bz7iEkMu1ECUZlJJD^KUCb<7eOU^O4|B2^I4cz~%=o<VHe4ZWJx3m_Ra}d`y
zVAZV4x-96iu?=v3_&*vN;rk;nE_`?ys2|jHu=X&pI{Vvh2{>eR{jae9nw#1Ou>abt
znQb~yf3o1Lgztl>J;qK0i0jV{>)?FNdX$mn1o8O=?pSypNJ{*rTLa?8{(GGt0l&SY
zbsucc`|ia_a6Mad&CLy<eqC{U&k*pg+XG$Az$d=@z7@9Dbt=9u`DL8wKD{`e4xF`i
z<mslDX6(Le<u_POS*gk@7ryfUdgfAw0qt@5rvJ>PjLfsrf968N!T8TyXgvI9F8`Se
zEQtTsnTxDm*>&~@CNjU8;a62V!>ZDrmGjo8Y?S}w@MfiJ#=)cdhf!x575`Jo`+{=m
z`vT^G_n;)dn4!U!=c-L{<Eis*Cyk^r%-p~Kcc$1-AxQR8q1;fA|7J@5x=K-g99J!r
bFZ{RqCR>%&Axb7&c~yng`@!Dc&+hyMd6<{y

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-CompressionInfo.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..1237cc7f0057f1dba767e8984dc43eda2d555beb
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j;O5D^Fu02CGo!vFvP

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Data.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..eccef889e67e702c31de3d2d5a7facfb2df27226
GIT binary patch
literal 142
zcmdOAWMG)Yz-Una|Ns9621bTYA`2Kb7#J8-7#JHE9_mW@DloX+ofq1e_Tkc<7zWmu
zvM$5-6EqpJ7#JBD7!9o%^0gQkjxDyZWneeAm~_IHLDtBY!TxNJ21vl(aR*Sq7$|T>
jWC2LPjl}>cU;-4lHDv=(;DAqy1JEuIU}RJ-=a>xuM5HB7

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Digest.crc32 b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Digest.crc32
new file mode 100644
index 000000000000..50f631b1a2c3
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3409102979
\ No newline at end of file
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Filter.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..b8cb5146f59dc86fd554422a2dd68cf4d6b4937c
GIT binary patch
literal 16
XcmZQzU|?lnU|{4p!pNeg{Hz5431I?@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Partitions.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..6c9a78cd043ccd50f357bc566c930e4d0b5e4c96
GIT binary patch
literal 60
zcmd<!{Lf*2gyU{IC)0l~PW}J93k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G#a!
A0{{R3

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Rows.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Statistics.db b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..6ede946fc24541a177ca0f0b5648ecbdf4c09ba3
GIT binary patch
literal 4754
zcmeI$e@qis902fpcRhYosI>f0ATky42Qx*-7@D9}S_dp7AU_0uEOO9`pwN<QVaQxu
z941S%u+B`GnLjp-#GsK(wh72w;tz1@rVukWBZMs-TU@qoP8m+t?g~BqeqWaT_;Xob
za(VZ;@AvNBd-tyQQifqv^B2YQ%v4z{S47^b<c*cL*W`_3814C6M#rm6idEX87ixuy
zN>N`S*ldEuED3sZrCp!7-7e-yW>FH&`GRDxvR7FxqGV`l{3`2I)Qe>|ueQh+iuZcG
zp72=0@4361ILDs2*U`qF{%C?9nHY-LcF{8|r})x*#n%Q{X#3g*i`+(3R7eb?^iMe)
zr0n?XV9Fts)s(|1Yba|eM^T<hc^2hF%E^?U@>>~~48N6rXH)s>C~u*>gYsU=9h3*L
z3H3b4GgRD8`5ZPogYqtHj=DZ~0LN9u*nBE>&<gBes=px*aa`Sp9p=WKz8hQj4tC52
z?6?8!*(uob4`b^~vC|J?XZB#{M`M@8V%yeYI|S@jYX0__aNM1MOcH*^E-k`7sLao)
z!^HhLinE2vyq!ACUcu%{KSTK{6`ud9{%a`CUv5SY`l$^$q|l2T>JgErc_NWDW;`FY
zrteTZsw)P0=5QSHtib~0#OwHZCy(Rjo$CA;<*yj|2Kl+wxP9y6TqwTv2OaXudF{yi
zA1p)eyokqVs1uDRQAgZ|`NWXr`wl!RWar%Dk09@ms~46)vkT6k-&b~@<|d4DaSdpG
za9z)b!nmp-bR{&uX%O`{$Xe9^<H4P-G3Zd!ntRaG61vg*6ke*k0pk%|PA_z1dJ>w?
zGc*t0g7NqhZ`^^N^K|xU=mlStWI(6fs5}L|yzHez(572UN}vl4*Q7yLBz?FFy8gHD
zQRwDNZCTJK3ir>2{<Nj(b?C7V2G#(Rh2iC0&=p1_eE&&X{u%iG6YrZE`2Mq^YxpcU
zpB*{5vJsfG6W1PK)x7Jv9O&`!9q{?^zdHBA??>>W$kEjxKcwZo#`l5MdGGd0z+s!4
ze}w(le5t(y`>*ZK?$Ck!Xu(knzX!3$4c&SWPxxe12cOrxyIDC75MNyEPJ-)!wAAl=
zH6U&n{I>fA;M@c4Ct-Uwx|gTH`D`gLwK_q5Q+a3K2=JlXLp`m)=Z+k0hwXJ;NFGRk
z94ES0u1#bD=WM<IaMxoqHvhiz8?2_Rq_WD6zy80jxs+i*@3?$0u;x-mX0i^fxzKPh
zfi)KzkHDHMu;v02;=i@#BAeIuTt3Fc6xB2Qrh3Oxw{x=2ZCbTc{*S}ADP1!RdGfa9
zer-|wS2gbs%C-Ln%$2w0WAcX?8r)op`sMeDj3;lGSmg!7!ax6ard(emNVZC$T3=lB
v?6RWmwIccFnOwb4z4Y&{YqZu`?4o3}R@c>7CU3yYr`3u7{=}B(#Qr}3_)oq_

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-TOC.txt b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-TOC.txt
new file mode 100644
index 000000000000..db06c09bbb50
--- /dev/null
+++ b/test/data/legacy-sstables/aa/legacy_tables/legacy_aa_simple_counter_compact/aa-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Rows.db
+Data.db
+Statistics.db
+TOC.txt
+Filter.db
+Digest.crc32
+CompressionInfo.db
+Partitions.db
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..c1a1a8ebd10df930b60f16f6459d22c8b5823178
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{lkafHWJJ4JVi{L742jAvAX{gyzqI
H(88_&%?}Jw

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Data.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..3d65c28a8e3926b79cfc93a4c9485ccde7764e6f
GIT binary patch
literal 5236
zcmd6r{cls(8ON^)CW(2u(2<U&s3%1cq7}+<-W?izUBp%>0m2w7-PRG}<(0%Qi4&ZN
zCN2YN+Jd5>En(F*vIt)~#K3fDp#{fw>?p8y3RShH6=Eq=TDGDsQ>j2hiT6C``rdmS
z@dxlrl<#xC=iKL<dp^%O*MeXWdIf{IJCR8E;Qxe>D;x@%&G`j|Ma3nhmNJt7Es4Iw
z>%POHkm2*Sy7$&pH@0|M8{JKv_S!~QORaN%gU4IfT3_qlTiw`H=h^43u4`$vcXoPf
zJkC11y{5_0((I~ht8+P;Y8;+=m;L7*?fcu?8rqzW`h8w+YjvyL<*IReoA)($)->;T
zI~wgBwH=<$cDuXQRc-gQxm?X1H4To&24@4LcQJpx5tABp`VW_fo2zWbRvOH2F|kA!
z1m-(^ZyvV2J!FOO#e7M#TElH-o3ZW5g3pa&a#r0w|6yCW<0%M}!V>1}g8>XSBlZ0g
zy+ywkR4)d5{OBxBb)H&-&XQE;`7v~sraC|Lq0^G;9I1q$7%ao=`xo5I#OHi6Z8LgE
zL5KxUZ?}bSoq<V`phVF%xCiFf=VJ3qdSv?VO*l+9TZBO&B$EGK+192G5s)H=ai6J8
zR7MQY$l_2L-<q|XYn38T*>JF=S&BTnl#?{F*1D1JTVC%oc}St=-e_UDCZ1!tZiie4
z%U^}{)eIFGT_$)ndSK@Sjz2824)9WKkv%>76J=8O!7-2Kmx=R$UmH#)`$I6#<XAH8
zoD#Cs%w=-$z;C#=Bd0LYA|(%*9Je|=BsiR_E$JU8yR;>eN$-K5wP|Mhs~7bJlId^R
zf+Uu+uEMO94m{xUrKi8-)wv6tBA3>k(}yLO%N?AGfD)x_ZQVj45i(rQ0;eTw_~q9j
zzw)G%!awM2h2|GfPgOElr_WFA5`g0KJimPj+~!jF)rrRg-(Z=cU%X@J6aRpvrNST?
z>E1Jv(e-<SV94)=5Y{1KyOEe5rlCRnX<A|fFfB%|Wo$FL_OJ@TVI567rBr}b>4^>E
z4-MGIiC{;@7iGb|$+t1EPj8o^>Hcn0t8_8HU~Q&0vfwjGQE@lCBU1R4K^p-v{4SDB
zitg%ZVDxEx`J{M`chw|DOXhf2D`jAgH?JSPmO0*%dWedPCCDrR%tzN%vLZ077J=x7
z9Igle!xQgpb_>>xq(HIAbn`S!k6tYBu__RdqC3JkWU`s;3XAQkps#sxL#sXaR&hq!
zVCI!u;FV{JR|keL$3k@&-vE0Pg{!`=t)tMjS&H23k+2wRKEI0}q{oIT%<vD;XulPh
z(5&x1;PNDcd>c8NqTc5f{m_DiK%M%?dZbQO*Jovod?<~R9mBwx*80h8J?H&`SM>I#
zu-GOFS@@1%egCzBn+N%C$gX#!-?D<MK60k~GuVpBX9VboVdjSB1(`W>12R(3hc`Pl
zoD0&bk{-Ex<rruz1Ot_2$hQ6Ltvd{K<&ee@9BH=_J|sPI?=d;3gO*>+OWYRs&xb$v
z4407f`%{w9BSzB`cf>7em&7q4aT~)A$CCx1pc__}6@cR=Vm^W0?*Y+gk^hmbrH|r&
zWM_}U|63Vg6t6%ZcCL@CoMgLB=YQlCFT5{O5KC8wHTX<)&n65ORsiKnZ-m^A2#}&f
zS^^lXV;6<u6en?1P!E)pk`yax*-G7p#B-!H#Z22T<NjGB!6cFKkuE=OAdUHvg8&8`
z{K~FD!;#$_^D@mDV^h+Ie~Y1`@GjHiYjCvWmZQDPI7Lnl<8?r78@RwxMc?yrPI|7M
zSC=fjx}1~3`_&AVopRL6bWMLM=HY3+Z|rka3g$8$i|}n9y~io?X%`;wYOWJiT(0D1
z=$PI{CV54_S`p?!p4uFR)nMnlOxDis)PR%8*X#Q^^&ei*Z-%67m2sKghmxkR>BFPE
zO1O0upt1DXUTsj23wbHU#HC>ouW$@%nHE(d2!QyHE7?u^&>KHG#E7;qz>#ZqNnNcl
zJz}vuCZ~7l$9Np(i3`&cRfw)F113T&hIxr9u>}v`JdD0Hfz#Wv1T@vGTf#}o3%eoC
zDF0X_=4E<U#5|Tg_9)C_yQ&n-7vb5RCtA}IRgCl4GZ*G?9(#?^Td)CARC5B*_hPjs
zg@0A@$NmU)jeyajd3q8Ut)IeZ@f>f{I7Un6cwIvf6&Fg77D65KoI<TZ9`mwIqd^|?
z0n<F>F+Xst-R@<&O2J5j^_`_So(lPiNmNNVNFQ!%)k)svAt%4HW$WktppQEbshq<2
z*-C!M^GE%7+g9;=KgVZ+<dTe&)c}XaCi!#b3Eum`^stEMd=hdJdbyu#CebbZZegZZ
z>rO|{U^!HubiId5gZ^z3r^pvp09MQCTR*$WtBWub@@nU~O!B%m>ljLLgKb!QMnH;Z
zvc*(-255X~$P4D=fllMgnM}4y@fATmKj^#iq@EJ=&FyN{A4KD8gy+@ehQ_sBcsZA{
zzf%bx(dPW1&2W|Bt$tUV^ObMVX2!QIR+96W@LjT)U6-BW+nI!YIK<X5!b>9p5ewL_
z0Zt_RdjK<Vue!VD6O3+!o1RC!&c2LXqGZgaJG@Egi@Jdi2K}NeV!ok|hS0}fn0OTQ
z@fN=VeFeMHIz@Pla4na{r7hX8v-G1q-tJ0X?dL7f(*)={0XEn<OOlhqGy(5}Kpecg
zsGbBykKo=Q#dEwzAK+UzNsv9_9Y6LZY7O`JFE60haF3ttN3G!=9{|>P+~cP!dECiM
zf7u38(HK5AW=EOCodmBjsWS4X>BMrp;;UQ#3Qs%@pW%<nn2)F%QYK>+4!1B{uhk0r
zQI)>8WjYSDaTobdVP$pb=_}^cgZ(DfOpV*WPoYu|_B-Qz+dqbQ_4NR!$T$4qP44rG
zK73KnE+CDj!JE;p=m9#qAcPm<<k?PtvZ7o2mI1SgtB@d<jOh;r`9u(<dW9c-l4n{P
z&3+@r{7gfNT(v`&%al*<+Lc?4y*WbaC%039dUSQ!;{o->q9+V4@gRej?f>VWWmpwo
zosYYkh<Wp!f@GB9d*H8!lq@qiRW~aE@oK!blTyx(u-A^W62Q4J+Bk$IVC&O}KaQSD
T9j7E?Qzia-XJ{Jy?C$>oWpLCb

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Digest.crc32 b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Digest.crc32
new file mode 100644
index 000000000000..703e9a110763
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1685416100
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Filter.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Partitions.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Rows.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..88f2a3b55db85ed1b7560ad87006e086dc43db42
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9tVQ0!@4`MPlXJfYKWU6BqHe_a)%*xELf;E8Iz{G-!gN2!4
zFEca4B_Mvz9KdX7YRLl<`~9Dpk*zj>&Ctl$#KOweQbmA+MbUwepMlYUftR=b|Ns9D
z3=qI1%EP3^1GA2sDU%y&9S>6-H`qEZpmkhG)^P%@1LEhL2<!f^0j+DnW}V?dwmQ06
ZXmp5|nXVQZ9~SPQtA!><q<B#+1OVE8VV3{^

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Statistics.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..bddde4d664538eb89650c3e392c9ff670e5651ba
GIT binary patch
literal 7095
zcmeI%YfMvD902fh?!Dzrr3D{AWb*+cA%%emBx)yAfn^EEV|MB?F0_$4d8BO^GB-EH
zeIO2zsc0rTT|x{P39^mHn10Z~Hgzh|q01mF>DcD7l^JD-gznza>i>VX4`yGs#e0&Q
zdw%>+n{)0xkADgwM2K1%t|BufR=QE?vPc&vU3;X9CnP+&&}K1_xN_@eome3jmzZ_M
zqRl3jnXIDDRASeqZL^!RttPY8Y|0U>c8lFoUS_uHYpTD?I34m@;q{A+(ierhT(0TB
zP^xOUvzykBKXtdGg*)@<lxk#ZDCm_7)5B5|xtUz#KET3S?i(ypA5jsI2=R>1^Jgzl
z#eMf??8DfPaR6gAV-4dF#`74@XS|4U4C4gGD;Q@oeu433#@iVmVBEoY5SudZgDz*{
zcE)G1xw(vYV)M-R^G9%8D8N=FVtcK`_GZTG^8m;F`mqC?*t2$HYv0G7vjID70Q>1U
z?5JbdIwN*+6LwlJcFt_<!cc76I_z2zyP284{RSL&Mz}q*j%dVhC{B&}*iCqTQs>tw
z&gJ3t%zcT?8^1#B1p&{0;o2n>SAAZG>^0Vc?33q0_MJ8(`%ecWt4(-5Y79T3cu4mg
z<axtk$nyttkr!RT?>lA!zwgBQbEy5wk?)b8TaEkoVweNPxBQ|-elxob`SAS|<gN?&
z`3!ZT=Sls7?!op{pCO+E4-4I~@Z>{C6?*Y}JT$kY6Ya0uLG^VQ=fkSd{NTG^@`Z7s
z%6ApCYSSPZuUEOH3dX&=9OKZwhBbGg{UdtNdI~gZufli`pVbE)oE(Yf^IY})LogoR
zy6-mhf@PU!pqJcSzXCe$YRPHn)WX+~LL0t`Uk{yotTYL_IP&9Xp(`f?N1^LJYsr9a
z%{#mh`f_8<Ug+_U2G#)6=)j_G=;HKrxc+HNPN!%6%gv~3zZ$N8&g>Ze1bm(wIkl=9
zn730$1F*38iZ%;+Vq!adKdRsB55V)oduj0KYS7-NvAz03V886+eOBOrm+O9o<5%C(
z+=k=VT+8gxg7&k;+6s6cgr3y*=s-N;i%~6nUyJW#WYvOrY=JWp<^xHIKliCYTtE0j
z&#S=CA89)U`?JxRngr)_S+1eE9<;A1>gpc>KKkuYZ!_@O6Cbp}{yNUb3?x5_Q|+!x
zQ)$3iTkbvB`N+iPuH!xUuQ?b8J|kD3l$>kk%p)X$89!R%QcTIY&;wF(F7!CaKhC*k
zzDCKpl$;ApC?)6mH*+ppT2}B@EAfAC4^h2a={h($_UFARQnN$)jYGaXuS&o7r_OiY
zexu!E=1zt7`1ofM5|<?{PfkfyYRWTE=7F*n{@eS3w1r9Ama<tX8@6(2DF^reCVBB>
zUX$*Mba$t5q&HpO)38B`clpK*<CHCSU2pXmA<=*TKd>TQsc5y8h+B09`D;`1w^f+w
TW49!fuN1dFcJuuW!%=?#hzyk;

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-TOC.txt b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-TOC.txt
new file mode 100644
index 000000000000..de43ad25cf42
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust/ac-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Statistics.db
+CompressionInfo.db
+TOC.txt
+Data.db
+Partitions.db
+Digest.crc32
+Rows.db
+Filter.db
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..f90cafcfb320835d753a24de5d6973ad7c45317a
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^tn18G(;8&0t7fiO99AvB*Ugcf=W
E0Exs5o&W#<

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Data.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..d0438a81a233382a537535745f4514fdd423c5b3
GIT binary patch
literal 5805
zcmZ{oeOOd=7RT?%DEFE%Bxy6IUIlIDYI=tOWMF$7G`9uQBrC<Fgmy)cN~|cbQc6=S
zDZrFLa2F6~us0ce39XD_W>9M@d}*U{6{y|p<uP>4eCcuad+yA=bK!aP4<DZIIp_B~
z=lAlt=Qj+aVJaDod5z!iFTvk_W*oE5Wj4oJ;^GsgC0dh)GO+loE(|y67-j;aDPg+5
zO4$G0!n=Px*bw=dVQp_aGkEi}{;EN3r?hJr4We6O&uA&8Va?=qr<u0A;$AIleB9cr
z6<x9WIcc;Z4(*Cb>w1~jTDxJ3DC*XVEwMdHR5#PMG7V9}Gi<MRq-D%tT`&ub*vqsP
z8_4#2SFe_j>ym=1FrWq#l$<5Y+9pKB-^IvLWO=iIESpmilh8k=&P0|5M2?)+KW6t4
zvb2;UCb56a2PI@Vn1L8;|Cqz5lNCB5d$l@e0@Z3CvYMj=j`zV}`mz~=S@uen#wDnw
zw;ZzZKk1NN=O<G@1G6`QuevjuGmbLTx&*zQ;<?mzjy0Tz)#8zsu|d-q!Cy^I;|!Nc
z42<fqxiy_$E<DMuEq6rd6p^c^#D--dF-5y&B{W0i)?`(%Aq860Xb(52JyRu*kAxf(
z)A^lN)0nN&%_xX9h($J)c2_Pm)1Okj93K~)OLWgFvau%n-Eeg2j7pt5uFvNT6R*!#
z9BuA7Y-0o~%<gVfCI0~SniUq<k^IhxK=9UN(Y+)&jZLyYis^ziImKmJUXIo0bgmU!
zf^$2?6=uru$-C0zS)BaSeX9|<vQJqdC%+1kU@y+7QoO{~#~cx?;ia+ng{1X$G}bE*
zSMl+I(wyR}8zJe-;N_YUEmtPK*WIlRw^mJQ&DDooD{*4N1$)Kp0!8Dub}!_FEqX7f
z)UZuJ4Tjo>XYuhHqND*esJ8wx&0L3Fk!R+2F4wb@?2XIyawz|9V>V}a2RkJDh^}g?
zVejaAMdA;v&*2QM>vI%E{OEqnX#CJLHJW$j>1j%b*f!I{v8JjS9*%eAN?PN*9;R(}
zY5^-$rKJ|=mW9Om%ktrxzAT@2<%HBlIizOP?d4rd1L_hCklnOYA0A(oRkBn+Ps3<p
zV=mHsElVcp828bZ2pp2bEfKnjlImE}&A9ysj&tEA`}NN<NxhIRacR=-vO^4bwL3{c
z6*1!V3nK{;?#`|oqssn8GXmEHP!;>lAka<MG{}DmaOq*%!*EZ+E!9J^dbDA5SCe7U
zvE;;fIa|99JID$VI8l6DD3mUE)C^dnMj{O7K0Xviw%euKV>tVrc90{2Pv{>w_6iL-
zz7<1G>mPQ91fYpHa(rU{n5lNeSf&)95E{|iKhE+JtkDT6$OUJkTfn;LiU#}r8p<U(
z9Y#}<34dD5vrw2uHyr<$k!|KU*o-MSX)=v~lauZqZFr*nguRDov`=D}Q@n@6Iwe)7
zB1N57!jZ$)KKE<Hk4nc`mfap|f+9RvqLLO>kd%ou6tC<olKzI-6)6u(%nG@zN_s-#
z{)#S@^vqfEdb&*|y?}*eP0yFp4#`=R)Irm)b!N-(L0PBDQC5=u+gwj5+tLE5s12#A
z@ir}@Dc;aSa!Bk!oo@=z3|mWcLQ~XW(O6*{&?}d~?<mDRYfw4yZ<F$tN)h+u()Dan
zOYh%LBOQz^jgm(aJy-_T<nBgI@xhw-Po~$hrs|{9Yh~=9rtTsS2fm!?T^RB^?f?>w
zIe-K?@*7<CN!4G6G8Io}J0ik`>K^hcm5_rXe?rNuaJKK>F-jiia7b7~pBD_0m-e?#
z^19fh8tuvflKv=C7SUnzXiBlNrfrNPTWkr&LLE#WAU!+R6T*)B`zg_2zz<V^q7Zi6
zzZ_o+_2=Vjc~^)#?!IHBp1awo=8pT~FeB7U!=MJa18}*o-DG4Z+OOVZTnFk{)_^(d
zZS$zv5$+=ZHLL)DJR>3>8p9f*78LOP+7ZvZU6knj)0o~Qxi`-1Wjy+izr~i{mRP70
zBwRT~&qC9EtaaPN)<X$=?9c~j;P7;*kE(p_-%bfw3a`*w6$fzTS)6@iTy1-FTnJZj
zhnQmBCaEPGK^OL~bW5TiV*{{?Um$fuBXkqA#bjM3?cmZr9Gipzf%&wP!V|bS{Img4
zPf0tvvI(Zd0a4GkQdD6%Ox6KWtLn*AOq)J*>?+-CO)$?=N@O*CB26!5;ZKYClQ67G
zsm>neA40a5=?oZOK_Q~clBV$w(djkhpaE`1_w>NV*Y#*^EtXhSpu}p=dg*FzM3q4V
zImOpa7%i%-M?2_*nO9&{83{G!P3m6up;MC2ZZ$3xzWFFhIpOuqB-P=xD$>qslA1D9
z(!04Ny_2GnT+*%Gi;0xLeM#^#y?+_2rI6^rcxc8CFtLmxr#QhyRT>M3uZm+^A5HP`
z5!HC!E|NURFi0Y&c%+S_!*-R|KbuJUv_vI!R*>`sl2&p%SxnN2^pM1Nom|TaUs*$n
zcq$E~FnqheG?agLEaD7jASnd+-T+D83n|$DIK?|lP?2FWS|ZttKU-DC8GfoNQxx&%
zA8GV+DvAFeIb9$nKq062dmTAlQaOp&N-2e_YRO*!$#gvxdoII8^oH#(Wlc4MiYryb
z@rKXva^W#GL($)cYH)~N9h~E8ZpIgyp+W*pkW)100jSn(ai$)vE=VKvFdnNF_34xE
zEL3PjY6`9)YnVQ{P#0{1NWA+zx!yyqQ3yt!dvo%+5wmmhiA8a1aNM`#s98kKkawhO
zFDulH5bFE0$oo*%o``T$&4bPOngwBmmxOoJQJQ_sPzR4K!iauq*Sw1=axh3nMUIRk
z)cS~|Wme=U*z#sWGSw`p$iX`-fGzkfZO%sQGRzZOpX^Uar{|!#V{yf&633JmqFH*J
z-5WreH}}UIqpT*sDTK0VuM-F6=rY6nGl>I_AHyC-ufSKC`mV(F8vjh)gG7QkWPPR%
zR^jO)BF})pB03Mkb6GgJ@x$RiSBLulBEGPa&cqrsf-PqCXkJcSHv_h)4Lw>-X%wha
zrpg4Da4g-&nDIDMnmoN|iB$zl-0^OTq#0j5ix~s0k5hc(3Q3i%bk_^mP+^V;adQuu
zwxR$&UNsa|Rdtk1+fV?X(7!6aX1Sgl^>?&yGOYeljhi4eWT-y0J6@u2M~dpoI`ew9
zw<$4i-wy}VTPAvm$>Z!MDjDBH2_XXAA>CJDLsZCZ&v<fkUG{J)D80|oga?jb!ay(N
z6c3&!>7!;<4u`%b$*qP`yPG6miOTDfa*~ebs-)xDBz-wfC4FOtWcX!jp^_Q@+nfT<
zaB&WvP{ES;D^QGon9&F;1e%(@6MfYbJU3+^*nAo+RCr9Sww=>VDGWhD^l@|Wrl}1N
z4e1bl!=T00-qh*iG?A-h9yx5jTaQq~7NzpF`ShrsHQd%u8;$Z=!|naFakO6sUZsh?
z3Cr^_cZ^SIsW7(*nyJ=!VX_P&Tt-V?KBtdIgC=)HfQvVoT^|J$q<AvbAoBjXjhHhF
zPeu7TM$T!wNKJJ^4L!%ab1R@uNKbbq5R2m1;OJh^4rGxpaX|qxO5Pkb;CC9|llAmC
zt6llaLIp<XjC{62J^ZH^>fs-x5mN9ywGnU135BU!qBQm0x&UaL;;IZRi@ta_TEifG
zoc!u*$JnIW7wK6nyS*$ui`RXgDd7ltVmCHkrWGyyJX1!Q%~#e<(xyL-@v%FS;Way*
zuoau|n7Q?^mtl6xKWv#}!L4ZlYWXUT-k4go<#i!ySzpljt>|tr@B$5P@D>+li?-w2
z4QlBg9pX<S`uP2e#qG4&raGKC@CVHf0(3cHeyaxj*_jTTvC#yZ%ksQxM9^nm7B<8<
ze_WvFQ3Xou@$I7PE4+=m752P4o}}HVn~y^VN_t?=-aazz!G-1HRYOtct<o+10LPS1
z7*N!QZi@Q21g2>NqS|(m$%9+VCk}}6R*>m1N>liAq?n{nttzQA6_Vi#oNa|jC(V$I
zCs<G__}h?Ed?1Yu>w<nQQI~=I{?*qHNvBW?E1Yh_6f%;|{76!7lWNu<T1onExk~!E
zj-;Q8RMM{-NxFaxl?r~(An6h^RHSPuBwf|3B)<SD>N>hk)!nvrgxvz*#BJA5R{?@4
za9S9Pr>9W6NY$>`I)+?<bK-WcmC93~)e&(7LmO`1r!-s+TgUKb%HWn#T#i7W1Z||9
z{6>VjB50$qgREc(eKMep%_YAv#gz&e#g6FWO4hV}e7{bq;|?o*Zze|KC$a!%5j<%e
zg7v0Vl_4<s7UO-%p%R%(=sJ8$c4l+NN4oIyr0TWidkjBL>9g^>lZr~doMY>Nu-c`s
ZKn`KaxBLM1zd5g*3H;M&MrPc)@&A}VyI24K

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Digest.crc32 b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Digest.crc32
new file mode 100644
index 000000000000..dc2697987fc2
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3561445797
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Filter.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Partitions.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Rows.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..1a324e57b52cbd3ad18ecee4bf8869a43a013254
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r_PkPK8VTKoQ>IPAycWjupu);$6jWJIj#ZB1|}9<94yQX
z>rXQ?91dn?xOO{$+0fLI2PF0?ftlfpQUIHwk+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
crmKZU2VeNm)k5P#cS`AMp~>M3^H41W0FDEH_W%F@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Statistics.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..838d351e6ce2eaa17555cf39bd8acd9490127e1d
GIT binary patch
literal 7104
zcmeI%drVVT7y$5d?rka04hlYi$Tk#3LJIN_NYqZK<B=sGkI7yc7kVRg(2}+yGB-De
z{}98NPSGg3M?wr3iLy;#%>5yQZ8{ZVR<;bnl8!AdTQ{Q&m9)EewEBH#OSbHvZP~ra
z%{{+<r?=<admdka5TaVSI#x{<j%4XVrPC~(oOJG!PM(n1#45YlL{hA_t$LwBsHqk8
zHG<tPSWGrSZ>n|Zb9Xw#Vw*{{iKY_4<}f?VR*Psev^0O6e>&><s%sZJq#MP0y<SgP
zG*$1vv6r^bK6taQn>+i#yn1?mGW@w=&y-X}t|lLH8{olOZW}zLHljm?M2NqC{*MFv
z1>aS}IFNBL<50$0#u1F87%yeKjPY{DNsQANuVY-m_;JQt8Si3zkZ~VlCpKl?2VKj=
z9gNRob4wWS!RDF!^T%*pRe`Ne#|~JJtzr5bco)ZmN3lcQ*bnT**1dtf_(|-TaqNdu
zuvebI)>mR@bz<iZVV5kzu8PLCZ^mvFu)CP?J8Hyncbv})^N7a%jN-Ji9J>>bPwM_0
z#kn#(pSe%5`O3?vzDkA1ziR9fimN|vLk{?<8#%DdiyY(;kwZL@$XXK~j}gZ2P&{g2
zG4j%>802NnQsm`V@cT}h!|yx2{X<lL{q#4;MH_MZo``Xw__iN($S)Q5A|JV(jod$s
zpU-4JdY&{m{1&WF4f*muaPOeI<4@j$RG=3xq(XD6&Y|^{JEXk^<9tjL8Xx??GeIz}
zY6{u_t$x~x`Ws+1H^I23-!%&zWZZNUIwWoo&8M(R-BlP5=L<)mBeN3FcwVBteHg}L
zdtSQ^{cuLXS?E<0Th>9RT&+C~om2I~acJY0sav2+Pt<2Z*Cf362z29}uo>vKkGk`r
zd&-W)Lx0xM@+$P~yW^XHX<}IQ0CY`W9-RNQyX2gI{>#;<_jN6t|D5QWdLMkBn?ALn
z8JKra*M4BtiYvN8=()LF@cpQNX+H?}56$YxnT?=+V8@x}cYuS7-yE?4hd$f(BkaHS
zi-_y6{}E#aeL7Hok<i)z_k-w@hCw}u$9*!RgYRp_jr_t^5KpdfC%}9lGyVG!Er=VO
z-wy5oe(YH9DcGK;+&P(WJX=bQUG1QLOLhO~H1P4SCWpF!&%gC{FKn;tLehBF{W#UV
za%nynxNzI8yL;}N*nMTZ2LC<><G^?1;{PS*TG;alNn`qt=C~A7axV0Ml$;Aa4)V8i
zu7%quIhT@ifdQrDT>oUwMe8jUFZYm;qx*>ZK%@7NXX3YOXX4y$>2DnJ>3>y*1L~q3
z&d@f$nJZPDNljgomY$KhHY+<vDJjoD83)Q-_;;@d(h?>uTgqakEZEAXrEJ{)ndHTv
zdEN09q^mEDBg5J9nuZNhy!fKA%k%c+%|kCv5R&-E{{ySm*9$g#tx%`0D1S7&d}o75
j|8h-2`9`5`&7a*k&$`{>5N&zZy6yEA`4<{Z8Jb@K?lG2%

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-TOC.txt b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-TOC.txt
new file mode 100644
index 000000000000..de43ad25cf42
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_clust_counter/ac-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Statistics.db
+CompressionInfo.db
+TOC.txt
+Data.db
+Partitions.db
+Digest.crc32
+Rows.db
+Filter.db
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..fc38a25eea5d107acffd27df97523c4e5b4c1664
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fP8AtDeS02*!yKmY&$

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Data.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..485ae9a782b5a776beead0c866b73735054edc55
GIT binary patch
literal 88
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpGaAf--}&
kkurnL;VuUt%NWQyGWh|JWunZWc64nfLjYL4e~@}H03*>CjsO4v

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Digest.crc32 b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Digest.crc32
new file mode 100644
index 000000000000..773778b02738
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3030696842
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Filter.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Partitions.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..e20b4e2f2700a2d5278b3858fd96631b7f66fa97
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H87_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1ldcHk

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Rows.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Statistics.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..17cd637bc236d4d565623011f665e322af4d3e3d
GIT binary patch
literal 4648
zcmeI$drVVT7y$5d?(L;KySCti64|EW14%J5L=&|Ws)c0>$YbIwa-mgl@<`huqDJR1
zS%^V&3Nu5Unh*m<HrXcNEAauoAH;|nAvNik(P+^q;zYVz*Q?)m;vf5a>q$<|{qcR=
zo^$UxJtrlED8@~U5Xe6zMrx>33#7_Pb+c4?LNurDBvpP#VvDWI^<s&bmv7PMi8h;9
zWVVWWbG}`lxY}+>wVEwfi#biS+6(Lj#YGmYp`!er=}_33oEsNwr5i=K-R_}~a4Kwj
zx`|fzfB3AqfjfMBP;d-(tG_xk)FY*M(mch}2Uu8p`UZ>CM^qF@gvjHQ|LiAUcv~f7
zf5w4~gBhzBYZ!+y9?5u=*K$9{F!5-{Qy9-+oXq%3#>*M6WxRuNGvh97%DfLcm5JLK
zpTOouFy4sGGxz7~a9ojvEyQ8_&BRtR<Mn@q<AEL6!7l6%HeqXj!XCa5Ti1#GQ7rbj
zJ=pqe?1Wv|iN~<hhGFM~W83CoSBlv6%>3;z;<zgknMOXr&d$W%h36-A-9&LN1FvW9
z0ydw01?4Lgc>XJ{UqW%=TotlkUjwp#h8sCx$buX+6pE}e<N2sD-beAU*5Sw_dvwU7
zy3&!yT*dD@dH}!gxayNAf2QLP<j-c~{w>fsQGCTCE%JucCghzjry;kT!Oy3=1wBt1
zsD2Lfslnvg2i_LCe)NI2kS^%O)03gO@kh}2D_5hs0pq-`49yR|bx8n>E6M_9K?{q!
z(0Kid3(8<z+2ZVn4lvGn1|1aHhSpO^w)Pr~tNE06=+J~HG@nPPUj6{%5sh0PK!5y6
z@?q%lx8~1)j=h$D2zq+Xy4}#mJCo-_r|&6@ht7-o{Zr`Dze0MUtIjr<pc^xGj)uNm
zTd@VY|F_OLz;r@LZYy+NQW9MMv?1*XT>sR)T?N-aXL0r%1^>@E4$dkE=IzwE4OlVu
zsx}3BU|=nLKf;UZ9dQ3pP7Ljx4f6eKe<}YJI52f@yA?QiY1Lyme${Qw12}%o_2gzP
z$R8$FmcacW{D7fN58{#Md$sU=jeTlLsRZ$uELRjf55&hkY*&G}q3h4KRluLuH64Wg
zS>&1?59f1Hy0N|*<X7ajbU1)_-|aqD4}4<Z{wCO8=jrIqgm-bO{r1veB5=xz=dU)t
zGqHKE<?mpDv7E}u7vA>2a^{kU<vVl9PvTAMJ9DAOL40Q}^gMiLF1Uvf;yZKwkIr1Q
zuqf-BMiTVnW+HrF>aJPW_ol6zq+T!m#^I^*S}{0G%4NEJ=zJ;1*jC<nrPlia<mbCm
zlC&{HgAad^T=T`CEoR=)*L{Rcc>k|6x%xuUYReZ_>a#NEPRm?fVv%mgmx?R@b^ZqR
H*028tF_V?o

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-TOC.txt b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-TOC.txt
new file mode 100644
index 000000000000..de43ad25cf42
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple/ac-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Statistics.db
+CompressionInfo.db
+TOC.txt
+Data.db
+Partitions.db
+Digest.crc32
+Rows.db
+Filter.db
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..e2860e1eb16acf51fdc0f6e8d44ad2c33bab9080
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j+Y5D^Fu029>+z5oCK

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Data.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..c95bc74083d11eadd5d745ad03d44bc83a5c9889
GIT binary patch
literal 140
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13GAuA6Qv4|jgQYmvZN$>v9g7+5El
z{h#(cLxUlcfsv7c(a?$^UyGmNkXfb;1H1W*!#`{oWQ=SW><>2;fCOgoNB{+lfdWT5
jG(iHh1up;vOl%nRk1pK8z;R%X(E|nuU}RL6|LXw&B0ecb

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Digest.crc32 b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Digest.crc32
new file mode 100644
index 000000000000..f8c73f51b6b7
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1495453984
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Filter.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Partitions.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..773d3c8891c3128145d30327bae64f5fc3b26757
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}Ia3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G`YV
A4FCWD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Rows.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Statistics.db b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..4bc9a659aa90f5750a303608f45efcce01fccefc
GIT binary patch
literal 4657
zcmeI$e@qis902fpcRir|=xD(oKx7*VC?kg(V`!pw(mKGH@Ov_Kk)yo`4q9@p44I3Y
z!(=0dF*ijc{@OA^fIud50%MvegKfI0#H^bcge4tYT()jT5tYzgwWr_j%d-8wtS`B|
z``q{4^}ToR-Fqp+Fq)(#aXfQBr79Uwj!NZVm1B=`a15hAd)3tVCX;5Dw+Ld5XsM6{
zi|BBOHmfWO)(WSPwcRP@$yQ00tofqstaMh|ZIWzksQ)p$JNlKfE9YC32gP~4-r0y4
z!tcGln>0^7dZV|4J@MHLKQS{F`O?trxKhMd%$Iz1fQh!RZZIizL`j3hFlzhM%R%ai
zKM$rHLOGOjIAtAWJ>_W1^C>T&ypVDV<){2s+qHs<=TLs0@)pWFDDR`(OWB1@sP{ph
zq2f-;r?J_2ly_ls)bqJk9M_a!^Xb?@E3t#A_J-WW@z4?Ma1Zt)yRi+&u;*^Tjvd8*
zEDby9D7H|Fop~5LYY;ns4t7}#wxa;MQN(Ve#_xz3$35}LB>qqA(qinx>iE2WnRvcM
zakdE0XZ9;>uJjvJUZcU|Uvv2ait}GIAqV}@fgDogMb^$r$YHZl$T}+?k9zYjC?4HE
z7kU18Eb;<ZA@ahD_<g5L<M*B3d<K<YIq?(n+SRyzFT}c0eCuxp<k#|gkPqHohTJ!V
z`)8~V^(P69ya~$_W43P{c#z1>g!TtePRRLlOQG51Q|SB3?$=#`aW1wFjSsH>MJ<eL
z>a?q%`Asg=-XMEr9gGL}xu>AD<~28<!{P_fe2OSFT!QgPE_WC@Dl-v{=Xtu@2Vgv|
z^POwZk3XGr0y_D-^(&y$E>(0xFE4xb2(<a?()G}VN2@cSEs38z3tf9BViLOP^Nwuj
z&Z2_}&|kMSybV2da&!$aSsYQ`4{b4-;QS{Y`KRFgC*JpTaQ?HBd;C*yKReO2svekg
z68BzU&7zBjT<GcP9q|3|x0?6C^&@ym)Z}VVKBVQN`j3G_^NtV8z~P&leuwSXeXqX;
z+poWz(`x|b=ZK9pa6O1=Hx39O9{=T}0lu$A*RyjQK|Hm@lL+?%8R@?c>p<M-`g!1G
z;B~D%U9dhIJ<BuTc(xUq+nPc7hVs6V3E(3?j19H{pML*?9#~)Zxs=h&hjC(f<HAf9
zaPHQdcXvHB<M5xW-(a4yT9j2!{Q3WN&!u()n&ZmFz@AI(nR|I)&xN{!3GBI0e+2ei
zfjt)(5dW<`7pb<Dyxz%#9oob2@78+vztw&(Z+$|)Q~8g>cc@J>j_U<WTYH)6f6CQ`
zSIPZ9z?|Hu+@LJX(BQ|j5|V~y9H|9^T4h7N`0xLnDHp0m*-;@@2_?nPEi2w$BPoyL
eYQ?H2|E;{qzRl*8WRtyWTeZ!1qibmMhkpSK@R#NQ

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-TOC.txt b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-TOC.txt
new file mode 100644
index 000000000000..de43ad25cf42
--- /dev/null
+++ b/test/data/legacy-sstables/ac/legacy_tables/legacy_ac_simple_counter/ac-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Statistics.db
+CompressionInfo.db
+TOC.txt
+Data.db
+Partitions.db
+Digest.crc32
+Rows.db
+Filter.db
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..a8a93a03c6e34cad07f0734b9898fff19b342090
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{lkafHWJJ4JVk-LYVAZAT)O;gyzqJ
H(8A6D%VP{Z

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Data.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..63bb37e15c9dc68a7875a0387a499cf8070b3697
GIT binary patch
literal 5235
zcmd6r{cjY<8OQg4!}jsAhG2vY>JVy01R>A)e12=;_ClMMh8MePP@ql0Hh#m{zVpky
z7~#7WZ*f~*QmAAT6ctMCC5b55Dk)9ie7-kaN|4JBm5FFd2~u2|wjiiP1LgWWv$wmm
zh(CZ|EPtQ*&g?U@^Lb|W1i>O47A*E~I-Qo`|Flpb>>0A#iyXzylG3vB3Y!2e>ArN2
zyjK*mWjWAX8*KLZo9pXaYlEJa&f3<N`anx}qc_;y+Sw9pY_4gn3Ha-p+uQ0I0`0!;
z9f7TNZH@Jwc3)@1j{3&>+L}PHrM}zaYuxVX-0t)F1Fa2ht^T%-x<FH1dyU8AZE0%V
z)>hlO&F8PHZw$0<Z|vv@dRrRWYdV@+Jk5~aaM4ODCUw%&yLUy@XLnmW=}=LJjU_sL
zaGFPcYp?szaTkP#izMCZiprC2t8c#Jx>d}~$|Lvgbw~HhHa{sYWyU_}Z*f~`|3=y7
zo9}!_(5x8zbuRQLC1&TDF&wni?EF|pXPMb~sT`f<X6LoL5EO?hF!}!JH*)YfpPh1B
zyNDygf&-QA=$&eqBN>t?x=wC^>HX5prkC+3<g1<E^ASgwtpTo!d(?U!W+19#3`(PO
zy!f1qq@yoJD!{L|I{QuL6-}I38vVfpC+X;qLQcOnk~F$CDBHS7u|BeAXQxk0zpQ8a
z{=<-w2OSOIgEMZ0hQ{%^BO-GPPh4c*g`m+)4nA=iioUZnDr!?u$O)NfSt#Va$32|!
z^jXeGhmPWEI@JUf^83j8E;2Myz#dr%OZj0$^y$s?Vim~D>=c?@Y|LCCSH3z7&h$Jz
z+wluqjjm+Eq;8tj?H%uS(P2G_FCRDY@<C)P<Rs<mIYr825Ln3q1ClHHS{dpU87)OO
zJ$@PVH&IVkF-~i$cjgKJ@EIQ6zH}INxD>59^?Z;!RAK2CKezOW?_*V2YmuyUTTrrk
ze`OIYMPUeG2@)!;#Qtb18pJoJrdI*Z;%NH(daHK}gP$Wh_ML1M{v}!IRpRRw?1M9Q
z$Fh1QhQFIpJy0pd*5M{5j^#P}z!up?HtZ)|(r{7nZHqnt5J-M}fV5kreB%7l$OFl&
zUky8pl$hPLbpZ}t`joe85~F2iFWD(kKPfj`$sS>uUz{aDW@*4Zwtr((1pY@f{$syb
z%JHudF>LdTYO6wzJq+8R&^ctrzCurq;bV)xzQ~wRm{8>`MD;#rusrGNW2{S>BJt2>
z*qWJ#D>QYN5z10g=+!7^rGI`K$tu<|`Lvs5sj{!o7u@U>XwBqS1YptC9j2m;e1-l?
zKw>ikk$X>?IOl&kHAv$Yv&;!^*1ga=3s+JdXHbgg<;n!m*Y%>CEJGE3g)B7I5xN%O
zz?zrFm-AAJJFC+3V40ml9=;_Qdj%^EnW1Ui?P}9|E9AR9mpD~vQg4{lmJtK}K^j{E
z<xzBLoX3`hoS8asqN)PP?zwYh9>ZKcoiPMw*=K1PIgdHy=Y;d}-BZ%H#K1K8^T2Wr
zIsZnLWbG1TS?SNkHB--tV?z2ChF2}l;O_{-vNHS++KBxKc7Fgoe}MGIw+qjrKYqGN
zrT?edU=%Mv9|nKCWBe)j<L7XT6GtLT8y4877qc-~ycQ5wdn4fqRwkT#^aL<?<rF$g
z%+7Jx{Xo6hIZ=(lGPCo>Y6yxmC72=d9lwqBLgQU>$0tYdFjlwH0O)_q_}10aUZFdV
zj~LZJq4&>YXN~qpy{Oc#3Qe;^Ufb@8EVt1(C?#erHOaXTO)`JXB$xDak|thyZ8ubf
z484OiQMhKecFHRxBvFpRgd@~xNGVbJqLdF+LPZ_YSb5hNSs|}P@W9t5@M@H+He?;&
zMmCt#CX=c`m0?$*O_T0?xGt%f1FHKrvi+d76E7yHtF9+-chJ>N)|R21Cia-Dy(V>t
zSM<ouXo2x2j-BNvc{0?AFO*>omF6HW$PiHe!zy;m7VP(j8Pw&BaYUP4P+wB9jvvl_
zPOvWf1dqc)aYk0U8tFZH6QCyLmMQ6Ku>()vLX4g`i>R-e4VvoKf&L8W#bJmu&_5B0
zeU8x;Nl%<9e-`P9f5N86Oa4qey$eOx)O0mNJ@LEjr%+GaI}R&P9IGHjEhk|8KR>}}
z@l`c{?2lakZzD#XQ;Z}8w^d475foBlHj|vo6?j!B<=ij^$^BymkWn;4g3OTa<h(IF
z%QCb}?&N}*5*jkcI|LTZlG=rl3Y~c9Q_vI@^$Wg2Kf23il_?2>G`T#4*LxLukWT5_
zdchc!l4bX})00Y3$X+iA-#W#sH9=lgR`co&JFm9DN^oa;!#H=g1H_T*EJF=k(uOQ(
zigxrO#v1WSzn335<mMFF#dA#e<?{R=;M>U0P3ou#zISCI&mf*AQm#z~({!MUU6FLg
z-!oN<(~I$$8u7{BhI+ZK=O_4tE|^M1G+LnDH54*tYP#~IhXMaW6?{V5i)!V*1Or_A
zs<s!YU!bjwaChSJ+s_H%rL)+D*(3fehp=yl#I<T<ylhk;qJ#Yw;6cVewKMWHcW4^H
z=q&-szsx?6yrN_+pnr#>m|gFkVZei7KQoWm|747Y*e53zJPZ5OT&SXm{aSXn^@#8o
z;SU8gd1GcCBBfvMQ}g@b4FjBxDbw{d1=%L0@@3d;q<D5_R+uMn|FtHJI%7r_7+vyx
zjFvp*UA7pvt4)IR5${xCHfkOBRLKNt{culJL{aOsr(Vf<9dgx9cNpMR_+Zvq2YUZH
z$-}PDb*pgzjrG)q$a<Z1g}fORjXhVre{;Lw+N2c{h1Q)H@(lD_dT}7_X!V*P?83^Y
zw*>)74t6=0hn*(B2onzO-RcL24*Od-ai|RJG<B$_7v`(Ak|VC&Mqkp$D|&1jpYF+W
zE+wMyPVxaNjbHj3sCNeKQt}_i*$Qgy<ij2w;g3%72uCg+fRQxXQ>jaTgKYJlmwEWf
z)$A|$#%mGms;=i8^{ac~E?4*A#e(^(+oM9Ax<2`QKt1f7Yw?P^7`$xz|7DzERdR6}
zZfYVHPV)e`Qu4kRp)Q?c@n~*V0+Ns1tOPt|Pfg8bHGs2Y4VeBAYrwjN$UhEWKxdCh
P)|M*#@y?Q!Y#9ANM>6O=

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Digest.crc32 b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Digest.crc32
new file mode 100644
index 000000000000..b00168f4ba45
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1723118615
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Filter.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Partitions.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Rows.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..88f2a3b55db85ed1b7560ad87006e086dc43db42
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9tVQ0!@4`MPlXJfYKWU6BqHe_a)%*xELf;E8Iz{G-!gN2!4
zFEca4B_Mvz9KdX7YRLl<`~9Dpk*zj>&Ctl$#KOweQbmA+MbUwepMlYUftR=b|Ns9D
z3=qI1%EP3^1GA2sDU%y&9S>6-H`qEZpmkhG)^P%@1LEhL2<!f^0j+DnW}V?dwmQ06
ZXmp5|nXVQZ9~SPQtA!><q<B#+1OVE8VV3{^

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Statistics.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..0662076e3897c997ca2256fa9700dec930cf49ec
GIT binary patch
literal 7112
zcmeI%drVVT7y$5l?!Dzvpo4-BV6aU^k(k1G2qbDJR-wx>7!SeCDVMerm;z}VSDA~8
z!+*$v=u|ZEwP^?e22nNzR9u3CZTLiE-7-j*b<AkCszHWI=<Xe@e&5;tF#Bg)c29D1
z&yVl4Ip^N<_)-{#;pfheR4`Lzfp{b0RVH4nczq~d9K%GWD|_{OnD|QbI*m{z7)nhV
zgJ7`;6-Kk5F_v028Jn!8Y_rj1HW_mSv$f1xR#{;(Yip{1$ZQLHxA@vXgZM>}Znt|f
zD4ZyE-rGj%$DSMQXl9R}nNT<;hJxQdH#saek(x<G>H{parM|%;_7N2xiDA6s^Zwb#
zTk+g|Df>}YQVyi7q8vgwjPgv%QIuy>j-#ANc@gC-$}1_aqr92&Zps~$?bw8RA7mjF
zw^HuGW@k{|ip^2q&+W%?z5rX1gzd8!+m{-z-y<AX_G1USu%Fq6tv-xB{dMe!LG0(_
zvF9Gb))Zl<9>C5xjh!<MyEq)%@)~xnfZa&V-+mpAyP`dwT1O=ME{c<)eCz{weiGMp
z6le4BdS<`D=87(&_B@a0KY!&siYvaVL-x7TjO>@^M)sdHAqPx`BCCveK8EOSqIg)>
zbmW=C5y(;YT;$o8@cWJ%$L~9-{w!+0*zptc%S&<pRz)~by!;n6^84BC$on3oA$Oj`
z&u6F;Jx`(xz7N|IZKiY%JT7F*tft2hD`cQI0h*n6678?-9@RA%=OS!qesEoH`olPH
z^Irn3SZzn+^{Fhg!MJaya}3&Fw|o>jAi5i^r=TMBWf%|U^nK8wsWE6i&rm(s3*(V3
zJ8nV0keqcKdftt~MbPn=OWUB+i{Cp4t@|#a5IXnJh7@Q+%!!wvH$Ds+fv)?qITN}i
zZ{IBF3k@~fp~pTSTn<cPgG#!f4a=6n^-r2}PI}kB)Qq@yso?r&P0ry{;Pb4bbxAca
zXC=;^!2Fy`YCZJ$_-6Qi6u;H)hUbUx{Lqo5puJzivFgu&mDwNnnSle>*8K{{ulhFR
z794-bm8=dmXg^J;t%B!4c$2nU1LDzNkEr4MnsYBxUkl<33S2QTA4o~M-KPR^t^Mcj
zcYt5n-`)!Q^M)%u1<vP+TwP;5XkSy(+3x^8`2Eo7M&O=Lj<m!6I(y>=Q=i0%`h)Wm
z8NmAT`;WFhF=O%6@gDr=9E=5@k*fcfoNMaLW0*v0{Ai7fF*)Z#4@l0r(BokKcFr~R
zHFC}+=UiYy$vM|QnR5}Vtx!421Y9`EDB8xtc2qU}ao_u@ZHxFDhje*gm3I5h*eJ(G
zKYC5w34bgh;l;$H<dlV}Y3XuJeg^V9kk`V$dp{7jFmc<GH!FF=mJcoY;Qr4fFW$`S
zp{F9=J!u@%ZIJdfY>?uStF6iQPusJP9{!@@NcrtE1JCy4kcqmMw@%y<e+P~I>wk)s
lXf_CDOR2D4Q;@$REq_y$i9B^nhHx8&^-tY=b=uuEzXKW<ooE06

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-TOC.txt b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-TOC.txt
new file mode 100644
index 000000000000..c1b10099fd70
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust/ad-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Rows.db
+Data.db
+TOC.txt
+CompressionInfo.db
+Digest.crc32
+Statistics.db
+Filter.db
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..aed58e0e5d50419ec10ec3ee783252cbde03768a
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^tn0cln+8&0qsgfKZvAT(bfgcjxm
E0E(px;s5{u

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Data.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..2fe9d5e2ed34e020151cbfc736ac4d4712e22ee0
GIT binary patch
literal 5845
zcmZ{odstL;8pqG*2xqjNAxWPZN)Mqm=ZJP@xDIw1x~Z5cW-4twDQ+2BYuHNRWsz1l
z$1PK?L)%k<;*42}Lu%Nj!Q2*Yb3v`OJA>PXSqiz^Vwau$zGpZWc%J&Bhv)lVe(&${
ze&#(gjE<>dbheGrXtV-<N10@1v(IL;r=@3PW@S5aMl-NP-QSJnO$;-c(N!>~-^lpK
zzUFyN4Q;D-t-NsbJ!aJY@2{!cu=G>?Mn;F|7W)@^irKj_b@S&;TdA!_&sx9T)1w!C
z_QM7#TbxJ##FWiFOtq^UHkY|uFSghtN>n$~R?&;7q$^Fv@o5uAnFfnM!XBpW`Gy{S
z{FJIuK?9e5R*G>WpAPx!3lJnEbb5^AGj3tzD6%vqkmWj8kDkjM8q?fCmdS`5pEWe*
zFBN35A(G1;8uM2NS#Cz;c*oF~Q+;HaiAa+-leStPS8kvJKHmcazc?Am(hf?NrX|?R
z#aWQ8%g#df^`}z6PUc{y_-byqK3=G-Hd<wGFZV;iU2JyMlVyH`*pe2I&T%KxWOTNV
zY7H9VbE?X5&ZbyukF)*i5+qLA<}s=g-%Yi>ToN>>T1Gu-ukqT@Stybf-Z`Mv7N9~_
z=qOe-vHihQHbc^xQ#Li*Z_0uOmj9%WZbqr(u{?xY&$ZDOO;%-#dk3T~?wXpS@%^0z
zSDRgRVFzt;Fojnv??eX6<=G`%`fj6?Kvy@Ndv<n-!SW;iko~w9>ZY?f)p5PbroG~y
zd5<wZr`p)66z=63=S*OQ>hV)cl}K@S1ch5BYNdVHKP#l<s>wLtEqMsM@s>O;BepxQ
zc<31fTBi+Q$G&MP^5(e1O-5Fze7(u2dLGU9!_(ePJCnn?Q1@bF%;{dN`W-h=DYv({
zxQs1{Qo<+b)wg&03*maVzfg5O(+*eRUWY4j@Cv-*r<GLoLRQ_b_|K(udIe2IN+9>m
zRqwNW_0pyyO(}iBLTJUoLe96s9NWk#8+UByL2Tr3OQz{jNo<;j-kf_WmE|k_mr^<3
z@|e0L!n7^yUv7@)D^Ff!<$R9}Izujce-oTP90T=XRsAiW!ro9B;Zx@67@a-k-?U)M
zk|`#pJ^J-t)=|A?)(@$s$&$Q#NjKx4!PX?kXIH<C=SuSBe2GoVM{GaE09wsMakcHc
z@pNSVk&VC1tLhDA1a6G+%HBC58r^1k1^?H9F%iapFTM8^e{n=Vk~;P-vY5t_mo8`!
z7w(}`KH-2VT@y-u{(y7=r*HvWhBgYOq9Tg=2${LeVNomWWQrgymo+SEHL|k8`7)TY
zheefjk*R+cOpc*ZTt$M_kaQ)%YU&oE(HBgsMxlNx&$Sw?aMUop*hQLcXpmULBcGLR
zMZQ58dATtG13RU9Q@~(4BvH#N)?W?Dn&1i$hk_D^yyB}7D#cuY>yiDuQUfI;ry@01
zke78pBfT}BB(Xyyy?Z@L+4UMJ+y}|x)GgrB<n$*26jr!1A2gGt4|hP~?@31e>9Y25
z88q{}bBks8c*Vrrh(4aLey(mh7$f`j;1uJ9nnig3vc~-jEs<06>$WY4CXd7&ulTiv
zT5cH@QdaHnr9xJs%M|HKD<sP*Pm!F<;|%Crq&=n4#K5R?X#bkuqqxB}J!0Tj305rn
zOedN;RvTi+C3uW1zqJ7y%TUDYx{=UYSwgoJLywoc7FUy9y}coV-&5IB3{3VE8H5*e
zl>(8Ge{nYm)tI2wGV*C7dyv-TlE?!;M-FKR4ru*yS<+f76wS-c+<F8XdB?R}uFuQO
zlceC*BdW9j5?}K-4`FBU+_?uO?Q}@Nj>qYrcx8n-UZ{2-%rq4%eDU9SJc65i6;SfY
zEJCBEJzWKCdAp#hEb?cq0Bp_sJE5v5^56D%8+$`_%VPItKqqY@kn_LPc{J__bBRAY
zp8$UXk1<^aJcKV6m2g8QlEA)Vyb!+j`Y|5P$4de93m$l~6R%0A`6|2ABuKpSIs=Cs
zaJ`{cs7=(3xndf<fU>jsG)>t;GbVIO$WjI@+i0dN2CTqQ_LqTGYp+*<m41v_YkFRq
z%*Ih$2hZY4QgQ}ym9a>gipH5H>(`RCqn$47b`-^B4h!tuL8iNtVagg7b!IJ@=Jdgo
zJuK?eS!Bw`qMTz`RJTO3f<1T~qbII2&DIUp>7qh<8N~60SQGF$F%qZZGS!ca#*yv9
zdDyHEFU6m}v^-rj&Sch+gAQ-`sUtY!?3zoqh~C+fX4gbYv<_Tu#d+t{7NabaOr-xd
zQjH~M^_7az5=e<_BKi@BGG&l>oWYR~K%TU|^PFDip03fy@RVUwgOUNhV+Tmv;?ip4
zBcyFF&}tKXq`8qn>Ehsx70~#X5^!pAyE}tZRzS4w?1nfv9bw*aNl)r>f2zhY&4E2;
z*LpgLyJ~z^GA*?C02Ycd3PnygK(p+hgLz5D+pYa19o9;Bw^8KV`516yue{gD%dhd`
z=FcfH2w_%3*@Z<wpyhpylURgPW-T8;lKWbG1b50>KP&SXWjcW-e^@Qu;}5+WK6%B*
zyRb8%_RE!zso`rHDC7B?N)?IQu<|iOV*hUA2=W^Qlo+glY8lXkaxN&7ergRFblQ5J
zzuJ`;FVyKZo?<mlofTLbh~*P_2?vQUh+O5~@YD7pB(oWTgM$$^@~7Le5x#cB3Nz<Z
z5f!LPAVkF-bSa3!U=*mmLaOmm_^OBsye#2tEap5JQRKV39=_v}e6DIBDt#jwprT-c
zmQge#1)G!Bu|nOXq;(3SFe7#|!SgfoSiWxA%sefjf^S~`rV(3DwZOecB16->2ovn*
z=OW2+elBN^5z5Q`XHykBsczLH<`Mi{g=WE@3-Ie>S$&{LcJzoL-Mg7sY3;k2_-<>Q
z9?i-pw%LDvdK)XjP5yYa*`+VH*)hI_3<<x?x4PdqsC>(Mf#5L5R5&tv2f^X0E4YJ{
zXV4ckj%7cqi{4?npYSk;tO-&BZbbufj$wh<9H;x-M)#R97XJ5|Z~`;M>*^u%m1s|Z
zNSh5~(d1&-tn~vDh_<P9Fn~|+?LOR(Rji1CNE0c6C^XeU%QbXhxfq8$&QJzR+8(UP
zrI)bEwiu653}t(ahNL&pBN>`hw56LrLX!yxG2L*P!?twi9?I8>m6TUIvy!AXETl;D
zmXZ|c(@2YLB)!+Bk(Mx!_|rS6Hn|0sorh-q1o1MFfRtlrY2WGifl=V&6;~digkQ|k
zBwW=<(pP3kGLAw|l|W0p0PJdpD{-Af-HW}Tt~wKeeYaDB?_1TJ+_ONC<p&LKxUF+a
z*c|r=ZR1vU;6qPv$7$TeS|4&*jM|oexdM`PEaoW{OF@4-@wV<}W`RN*v2g{U#Jk5$
zD^-UTarb;mI6=#meaV#Y`U6`M;U|wxA)kR9Z_waNJlIdtG;Xf42G@F1DSjjD8ZWL5
z(iC^*O)2t4azQ9L?wsV6W?2$D*18f|b1vc(A_Y(EAR^T{1NaafBoa0GNz6e)-3-4$
z`F?}=@uhNu^}aoSOjLkGu00pa4dz7`%M~7x_~p0dhH(#lTTT;G#zs8esf}71(Zm!w
zV_$=D%{{Ae+fW>ZjgAhMcW-fYXmJ!eYr&9e&ac6N*;O2c$2J!kEYCHoka32`zlbI<
zZ*RuDIEW~Z7@qiJ7Y?oJWT8SMXE+6Y!dmOPe@>0z2z7d#;Tv=4#9ujCD5Fda-}F-#
z4%1Ts;_sRcuY2+dTWN1qtiL84WSHi`fAfL_Ze_-(6}I>PGPT@$i&bhlHt3=bj66uZ
zh?+R34F#m^&zGoWdMGZML-dJ89~5tW4Ho_|-VXSK2?znE<s+jG`nbHi3^wagbS?&=
z^=d}qPv{H$bh6g_%VPY|L`wV#--}*kh4!=P9+^Mk`8Z`(I1XxX>8L?25d3k5A3&LG
zQWpV8lZ$FEBnk*0EF}O9+t$NqPJG8=DK2YRRB<DjB4(JfhebWU6{f^5L4ngYdC2mL
zC99!M>Lm)OGnrQ`%Z1E;(@8bUT-qQ&v-V+el}n*>U34A=ONVf&;%ayb11^;z;+Afb
zegcUUE`^^vPE%vO9&0JBwcSU4EU%H=6(n7ippjlGCdo9QkzSooQX-a7a_yq`CtP~~
zl6V`gcya8C*E+@5Sy*_k9$Qfcru}`;tdp>qN~X|P(%|b}Y8x^zwR}VaQ+;ZGxf*`B
zofZ=vnpx)94oH^t)0>oW$r<*qCAVztJQpq^x7)tcBs-BuUU#%<lAXexvX&WVkZkZm
zxDU^4Vukv-8cF=D6OCxT&s(IDDRj96WSZnIel}5OTBn6m=*Ow02J;j6M>ov}T_~Db
wYMAtNL%_gX0*cguDI7Pj%gCD7=andc!oU<B9l--Qu(5=R{ioCAcEwNmKL_^DHUIzs

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Digest.crc32 b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Digest.crc32
new file mode 100644
index 000000000000..3c2551889938
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Digest.crc32
@@ -0,0 +1 @@
+2961106595
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Filter.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Partitions.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Rows.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..2f8e2aefce5f3486734c3d173c1ed29af3fdc9f9
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r_PkPK8VTKoQ>IPAycWjupu);$6jWJIj#ZB1|}9<94yQX
z>rXQ?91dn?xOO{$+0fLI2PF0?o|)l`VgQ?=k+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
cqN{~Q2VZ#8)k5P#cS`7Lp~>M3b5Sh>0EbI|@c;k-

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Statistics.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..627ee8ecadb98a53134914218ebba802d27fcb24
GIT binary patch
literal 7121
zcmeI%drT8|902h9-L*UlbWrdCM7F7*5>t?eK%#a+9dt2)@fhqCIp|Gb(319a$lP2U
z{=*h5Zi*(pHVrYrAj+n|)GfooHgqa8>s}C+RJLfgZUz}Dp}QNce!ssh*|LAOWp~Lh
zcOSmL9>3rH9`{YlFpMg0S+ts&J(f!+B9%(1uu^$dDjdT^XK4COdzb{PZKIyA<;yDs
zeK~Kp^A@v>*PAOuedZQX$hDaTn_$l4ZDOTZX|)J8Lqq+y*_{#3mtGlak!}?2@pxuJ
zB8hs}jUB`>dH+Or8++o+w0dlMB=p&HGow-!xte^)ZGZ=Dxoz-}+K3Jn62tiV=lipt
z@4|cbryM|8LphkTmU0;72+H#*FQB}Lay;cE%Bv{nP=1W^M#@_$@21>M*@;c4@gOUy
zxJdadHam~<c5IHiKer#pRYln9WNg3H*#1<11Mc9sW)M5rjeXw^Y~5k(xldq64P!r$
zfE{-TTVH~mb^tr`G<Mz`?9xbV`=i*6Ja#KJe+P{??vC+#b{&zJpHZBY6k;F1^OLwg
zM{%|QuV?lXY_8-}R9~gS^IvuOJc_G7YC`t=sSP=xz=Iq(BOnLOgd=Oscs_<1zen+i
z-nq#0N28DzIP;MgUBu%ZKZVCT+3_B#zk2K{@*`_;`yP*Sp?K8~I^>sfyO8(YN=NQF
zho8?#4|<+N6M7TYCx&eKJ#hCR+ZG<X3+aLk^(R8JOHQHvmEEJg0^?j%9hx6p@6&-W
zuBr=M1Fe3_iTdkjt*nD_{~p&Qbf9tF1aweLA6ic#CAv#69?F>ppu^K*(R`k#y|ov{
zquZOWK|h$1a{_wF7sacf6E0PBLT8k|a1h$~Rbnx8{-K&w=<?W;4?%Cf9WoBx^kG{z
zbbG<Rh0qsT8g@cYzB9ZIm@Ew`>xC}Q%7W{kwB?=ht$(>1@w~2u>z@@|qwj*xvtu1=
z>VY|txOM@n7GKnvpr@v`!tqi6;@Az}AO6e2$Jc`T0WHVt-v-v?zA<0}4&Ko8BkaHS
z%dl&(|6!MNx^<xb9KNv@z7Hag8T#}f9`o_I4vyF28`-8t5MN&8j)nO^YV!2~Er=VO
z-}OBQyncUI2W-!i?u=A8pDp>uRtKoxP}Vaz27K_Fk<+cfXWu;11>5WDj~`CE7bm)x
z&rfFpo2qW!*?!NA-FuDq;NRz9EclFk`G3i|X5V=XlSK6&t#K))<Xq?hDLEH<9L(R&
zxn^&p<XlS51tye|bN!P!7ZK}<H4{wGg`<qRb26e?bo_SSn_s$3`i?^`zM(R_7Wehl
zS8AGlrp`olJTY-aQgTY_%Cz(h<w|)5$~;ik!oPbzkhU;s+fp_wWy4lpTFQ(2Ka;%p
zGOyd-3+e1l<Cp>Ys}HF_gA|W^){)|Tt1I{D;rA^^s;-|I`e1!=!E{smGbbmc?`f9)
z@sDC<`WoJ5ui&foMTHNi7jCH)$X~9>5N<PHz2eVqoMqi)5d~Y8wR%&HMg9-<+=dr_
E1vo#TqyPW_

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-TOC.txt b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-TOC.txt
new file mode 100644
index 000000000000..c1b10099fd70
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_clust_counter/ad-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Rows.db
+Data.db
+TOC.txt
+CompressionInfo.db
+Digest.crc32
+Statistics.db
+Filter.db
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..fc38a25eea5d107acffd27df97523c4e5b4c1664
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fP8AtDeS02*!yKmY&$

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Data.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..a2eff457f1c16628dcf54592f7f9e8620d5edc0f
GIT binary patch
literal 89
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpED{1!V?l
lBV`7g!%rlDEMp++sE!4YWunYrdMwaDnIQnIVQJ-|W&le(7+wGX

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Digest.crc32 b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Digest.crc32
new file mode 100644
index 000000000000..5dd842571884
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3089812609
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Filter.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Partitions.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..e20b4e2f2700a2d5278b3858fd96631b7f66fa97
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H87_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1ldcHk

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Rows.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Statistics.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..50687c4f16e9b557019e4b2c24ab60744c64c506
GIT binary patch
literal 4665
zcmeI$drVVT7y$5d?(OAKpwfa5AmV)BgOFlGh9+ti>HxC@#$zxy9T!>!ra;<;A-ZTB
z{zDd4%^aHeVABv&5L7lTOdW{|Zs1fj>oh`Y(xK68bq*O$LU-$W_4`g1v%j~V<mB8R
z-{JN=drnFSQOsQsBapvSf>cl`mr0qE@+VT}2~poT@w=^-BvzU>YQ-wCq|~S_5zS_?
z!eA1$hEj_*b&JKAWil8|Mnkq}vXoiMDl3d8U0v<n^tQ;YMK}8zr5D9`Jf4ZLC@SoE
zyo*{#XN+{%xKroG1?Tu+_y-p!hNLRqYTo2+15B*FZG%Z_BT5P+LgfC*e-4lzd}Scx
zAjV3@p^R0G)r=z<Ph&jYXStoTnRq<oMU0m*&S3mD<Bg2BG2Y9#gRujfGV?)SXW|yd
z=drn|jCW%5%=`HRIIhUY7Lu?7mSYDp{SA7C<H}y_P&f9ZUD%o?>?v!pqX)2GOT?ag
z7+YJ2opKO6^(=PwWbC3SZ1Y>#^&)mNvwnN^IPQ)`rm??a7v^Cf#Osr~e?)OE7w>28
zdu+b&2UK67!0TUey&uJe%MHi@k8H?6xgO-;2_tgIL<F+RfY+m1e;>soJEtH|8;VAr
z?#Mx&eHG7l{1~3^B<r`R{&MHf$ZJ;N_PrDBLh<rnG{`%$+L8A^U4m@Ch{tEpj>eNJ
z!=J$VRG02O2VNw)eb%uTkRE8?g@w@Eyff(gmHS+E6UO=I8niz6&UL{suBZuK0WG}e
zK>ZD<EUSU>K)Y)cI#|DY1Ue+P3+<<{Ld`W859c#`pd(V^(0ZP#db$tBV_J4UfS$QH
z;}rC~p9+>iCtfRUgI-$n(III4?S%!<IfpkVLzl#zUI|_OXV@@w!zEifbW863S<qJ+
z>pq1Z{d!<EFr6P(+zDNhmIn7fwPl}y`=5F~Q^Ebu8C^r)fdA*5tt)DQc?)&z0anbp
zs>y^N8`}owNBG0K7oH!13nGSBf%-v>Cu_d~R%U(CV*(C+zu{Ndf7LDZ1K5A{^^6V;
zs6Sb(uY%`6)G=L`7Q|z}8`i-2n)5h4vmV3~^4)RpK9HRButx>rI>)`P4}sr2(B2B$
z^R9boGF;CUIr?TRs9#rX?{xwnx-)pT8TkB{N7`X~T^Hg9QeMWX=HveHRN&0=C(m}i
zG%@?0<?mpDv0RmtAAIG1<<2E{%YWyRN8)+uzjLACApScS8V~=S3!Wi_`0rf*qdOP1
z)D$R3NXV6=L}(k0w7R>WmyIhQZI^!I@Md|h=$zG}Ey2(v`&FfU@Jglc1IS75#Yj4s
zp}~jK8?B2S$J?`xHg#7VDSvpbZ~fM^{_%#E4W}!+37P-TKi3p%H;X27sklj-pZCU+
Vye(Bm>HU1QxanWlU#nBw{{mM0pick*

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-TOC.txt b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-TOC.txt
new file mode 100644
index 000000000000..c1b10099fd70
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple/ad-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Rows.db
+Data.db
+TOC.txt
+CompressionInfo.db
+Digest.crc32
+Statistics.db
+Filter.db
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..e2860e1eb16acf51fdc0f6e8d44ad2c33bab9080
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j+Y5D^Fu029>+z5oCK

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Data.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..f2b7b5e0d29792fae20205175817a49109101788
GIT binary patch
literal 141
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13Hz*O**iyWW4vf&+PrvtI8#&A__u
zo5;jO4h@D(21Z5(Mnfxxd@X*4L%9+*4D9BUb1G~YWQ=SW><`~a00~UFegG(73=}vj
iGZQ2*P5uE;zyv68%oV7X{lIjW8w?P@$av-Xl*0g8k1H|&

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Digest.crc32 b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Digest.crc32
new file mode 100644
index 000000000000..a3dad7e92a66
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1039976897
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Filter.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..2e1d5d29ca0683d58365c1bda385a89ced35f1f9
GIT binary patch
literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Partitions.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..773d3c8891c3128145d30327bae64f5fc3b26757
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}Ia3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G`YV
A4FCWD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Rows.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Statistics.db b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..474cedbc0b2f9d63c120743a004f407a6a707948
GIT binary patch
literal 4674
zcmeI$e@qis902fpcRhX-=xD(oKx7U@g_xp4h9+ny)B$D*j30uV%X-k8z@R19>X5mx
zIQ)k!tP+)(_;VU!3Ii2?0RCE{gKg+kG^;Lyu%u&)X6xpV;Usid?dkXXvSfcR>q{>0
zKK*`seed0Sz1K1fqgs#@$1_jLV!0ynQ7Ip+e0(Gy9K&dD9{a;t&!kwTje4P0D6bIp
z<$}#7Sj>{3H&@v8nOp2)o@5p!(VQ<x_DXxD)gnrU#`+(!JEPw--5P9_FBIqXdZ!~|
z2;XvlJ8?`rKi=(RPn@6PN2i7(-?=<JBB%J$e8txWSZMp&28-NAR8&X|qx4UCIY>G2
z*TIxSD61)lQ`S({QjVrPoAMmW^C%}%e$j8GUl~+9hw?hg8!2z4yo+)-WfwM~?gx2^
zirXn)z-DJr-iFOl*XQ=)xT+YNPsI*ei5*P!H{=nHs|T>dJ=o7}$JQOhp7}a<>=5?c
z6zl~Du=S<b>HDxV&td1!z&6ET+g`(N60qB-`8#05aZfxliT@qDv<Q2jGC!ZZOg!JB
zI9rI<Gy63*SNaXgSE=y)SKYXR;{2D*$U(n3kwXf-$f46Ba@cefvc`<(qt^H{ibwa(
zM4mkoi#*3wfIROSzTe4{_<pB4E~5OEqdy|AU5(qfKGu!mRln$vKgjDs-t%xda?fQv
zKEpj|Jc&B;0n8_cY~OR>X(0^>N1sAIA%mBeK(h<apzl|9x8@d%bFp=3esH~ShQhe2
zE_4+%zrlt28)U7lgYn=V_XKpPam_e%SbQH^PZ6cM>o6Y4<@Q5Ir6;2KJWKQN6Bv(c
z-*FfEg=IM>pcmdQ$$(C|UeO7?!u0-rXybQFN}vl4)TBX|C!T&8dh=fqW6;fCIJ2SK
z3-=^IUu|vt7<%H=&>CQ}D59(vx;!fju7BdpKLgi4@$S^X_0Nj#k+b0a>}bcTdSK2@
z+%3SW`PX#0(36u};r-$NbnJr9kKm-JvDF|yr1g0HNnmx}p?(QC{H^9+VgEIEw0B|u
zwKsCQbs&F+&{PYb2QfzteR>d&|7uJJ@7MhM*||+1zPQ+v2=@bNsrULdAZ~E|)b}p%
zt9!dTV0+&1tVo0N*-~I^bAbHDvYvra;QikZpKAlY@af?$*k1Rg<e~IuaiaV1%2XzB
zZq<WF+n$-R`OlT_V4kv)$|@)R`v1D;QicJoarwo-o=X{-$8})Og@%I(?77f*1om8k
zJr|e||E)b2vDcNT$C<FJM;N|yBHH0KJ+51gw;SaDIDCiFHN!|wQbxiC>i47)#kKO_
zm23Y8FsEE{iu_=P1|Lptb}Vy!)|GeU;OCaZRrk&h+WIaQOf|Q^eY(NKFpK{A@0v1w
wjUd@7glc_p(JRY~w$zI9^|{SL_0oUK&$4c^*hMMJTD_^p;(O!Vma4|T0kWc`qyPW_

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-TOC.txt b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-TOC.txt
new file mode 100644
index 000000000000..c1b10099fd70
--- /dev/null
+++ b/test/data/legacy-sstables/ad/legacy_tables/legacy_ad_simple_counter/ad-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Rows.db
+Data.db
+TOC.txt
+CompressionInfo.db
+Digest.crc32
+Statistics.db
+Filter.db
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..7c74582095d62cb4ab91c9d057da547c75710eac
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{ljvfHWJJ4JVk-L7412AT;+Z2+iLJ
Hp@pLW&RGnD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Data.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..ae0e0497a3bb9284d18b75527686106756d48fda
GIT binary patch
literal 5257
zcmd6r{cls(8ON_n48eq5&~QL7bkLM`fp#1xaT2oyAFr!WDj;Ob%0OF7LVzSBBz6+#
z#TBt7NbwLL1eB(o3K<CBp#1;|Fz8mW^THY&+p&f;!Cu;oNsLVhu?e=UqPcsXW8Zs^
z(f$B_iSj+?InRC0z31~h&#@$#q+^oF=2ulU0RO8}p0t0^W-BNxvKN<>I@VYum{C7a
z_Xc`pDLW8wd%ex|t=qRZ*44E8YTMjiPxsDt_pbUTcbB`hq0Zghxu?n3zSHe-ckc0Y
z?ylMC@p*T)`s%uO?{YVG?WpT$_SO12n%qrIJ6hYCcDi@(?(}u}I@>+1p613KolU!1
z8-0yC8d_`ibTl@#*3`ILw|8}U+}rom)V0;sHZ(wZLj`Ni7}VL8U~gGOnRc1m=wLys
zg$Fu2l+_YA(Ca!Lbi(jZ0n<&LkxLg{W^ZocU9+6tl|K;dbwwt#VVD$^aLW(Cfyrg2
zkIK>3{*k2F!XWvwhjfr)!%%XW*|7Oqz~ap<$*`mg-RQH_Fh!rs^U<dx)AF@(7?cOs
zVDx>n|1=j3%j#X`c2anQ51wPL;SJKNb~4B)8lBw++fx?wSiEw|AjF2Z9YrU8{tC`Y
zH)Psk!-4cX9X{CerdRU^<|kbca(Y%F=Vw;xlM1~O5a#Hx!|<cf<Q-8>bzzi6sfQ@!
zA&*gA$z$9EG5)Ho8D6Uv!^pg{t)g7aWmrya3BJR~D^4mJlw{hkE8E&mhYWWcMQb+l
z)&OX2!)&Cc+K4nnM;dzUu-LB0;(2xq-e))uA0DES*TZ>UVh^WY_a-*dl~fnCYXb@m
z^rN%Xv_gXH4KS9hskI{~xJ*yd(^C=IFjW|AIqg>=7p`{+>NldIzkMuO+u#c?O-J;@
znJNg-$>ll5A~n}5K(E#Ho=;Jip}t?qQT@BL@2&&{UvR9^=(R{6s(8d%q<hJWf$re<
zO=sl)n0_H&#<b!vF*7~kW@h&plVmFJ!w}{mX_J}Q{x%B@<nmeS=52uk^2k39Ti^?d
zIFEj^;)um9^;k$j$sBdFTyMe^9O;p#Nyh1Sai-60Vx!{`_=co#$z0v76tJ+d+$kI`
znl|YUkiT#C;i&zN5d+d+!j`tPp@QP)jK6mWfFUKB#&jbRNlKryeH*bPj^}LM*I-nh
z&mc7Q+$PspSCHecok!PLU<>3pjsFlGHTf+a>GK$3V}};_Es*L8104Sl9XlRg16$S3
z>uFKZ3l8yse%|wrVEHsa47WFPrVYywySd~d2AOgSGLNUIqO_(rzpLVbP0<e#TK#r-
zt-5tpd059HL~rhz*$cA~&OS)rjS56t_jMSNGUR#HX-IbS@?+OhEf9r928^&2Iu0pG
zpCd_Wl@45pMpyO(Aw&)O(JE%foS;25S{#;*-ACVe0AE{U7Zz{$>KORasE=;3{LW*N
ztlIma(Z-p2@mcRW_ZgB!4^$d4hgkG@wh<=y{>a(vl{NH7>#iS$Fm?1tJ7!jbM)Xft
zLP53_yx;lckDPR^M%xOYo=vF%M1Az2>jgpGvF|1Ip4^!Qf1a4GAnG5F!z-6Z=BR&_
z*U!2lk4x%396ldNXMjRK0KhZA5eu;$$K_7|>R&=Zm!m`F6=wG~UjBueK2f?PRsIiV
zgQ5H>tZBib9{n}{x&?Xym{@lWC;@y6$L45$F#}9g0Bmr)HUR6X^_T#*FTg?jemw*n
zY~Z<~IMdRDSdfxT%e?>xz@KT^>wrNyhru3^@`*moNCx#2{YNvYpBShXsb|sQ2Rx~1
zVAG2>bdaW=O<a>urjD~h!%0zn)-S5tkWO?<@`WFJgY-J`2-(xszYn!}BxkceKxZl?
zE7bfnG%PlI1&>kVK6=+r1Xg1U@9@;5+0)qa8x=6Gkso_Gll<7XF&4wx)e{tDd8Vw4
z<mtfD*pF^q6_N;x{d58LBYN?zoe?TNi~c3w4;_%U>AIhNW~2gzw)9~Oq)m42{a7GB
z=I1g!Ne@0YlB7b8T|c8!uh5eNf*X1&BQ=Gd-3O>O=A#d9tntc#{FX}Ic;9~1{FD=p
z9eApm^Nuw?#d!3|#TSI}(m0mL)$+VKYPH17Y$B@?xFZhJOKP>;iUoN!5}vaH!C(J1
z(3IQa;H%YX?2G&`&ascn#J12_irmMBr2of#d@<01ZGR3H^wqL+mRikAf9y(%5{-Pu
z;$*(nV+zZp@O6kX9TlP<|CS9acTpm>2Xuj$zX}-zzT!(p99aAdB`dW4&zU;=dFjtI
zrR&^?*E|M+A?Wc67it~+_~r$u^>QD7dnyzBc+GeQ__6EPGQn@?LkAl8aSvu2?YPoz
zhp6^H7S+)KQT2%@lJ=Jib7VlRGSUFOlnhvO>J@S|V28u2F&`UtVq>on5e^!qULm8{
zVCcNZC%Dv1C-Vh(=<S7K0QZ7J2F?n(kDavk4Dv6Lb%lHxMfCMbo-nbk5MR2`vsiqK
zpXiwiH^#)CpM@$w98)mZ%6SBijp!VZnN=-YuW#;|sUA)gttJX3BEB}-IIORIR3
zzp#?utuJT)nR3jvhF!aEw{Xr~v8kySg!7Wcypisf-=AycH;UMUiwJt@Rp^28`A-Dh
z$n=wXj=yFd%(CL>GY8@yU4%fm<zlmScD{t!BA%6<P62j5Oz{+uW6ial*Du}J`X}D4
zFnclz|F&SSGw~qD>Lms3yywMByL2m$CEkm~WjA!kvR?NS4=dqo=NA4!zx0~|7Ba0a
z>UjRfuVNoB-uNZB-A{hK8kUQ9eoPU|Y<Pk!41z|&C&?1jI`NaQJV31zKe;N2S|@(8
zus@Ud$?v!Y;uU)2s0e(BO|HvF2O9B{8>dmGFoM~)#to^Et#3BNiPcIttqF)#$g@H)
zrCpB&MFQbkgLx8k;cZ+)^-xzthNG)7pE#KTa=#>&dAR9{Rh~_TT|x+CRH*R^iDszZ
z3!T40KF?71oqa|?<i9i6k87_WX&Q*+L3~<spE$+a_4G7N&c`c+W=)gUfN_7K$>sPM
zy|qu~t%c`kwk+v@>;bncZGa}%zxPiuBP6Q?-*W0M{?5`<Uz}JWW!}X-DX5b_3A`9k
z581zA^2vudyjDEbwUxZ27GKK3b8VILvbvGd#2}QPc2cs;)U8?b6p;8N$Wwq|j<64J
l;VD3{#?f2(m;!#h%qORs_oZ~IkC~e)@h3e~^5?lf_#Y0O0rLO=

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Digest.crc32 b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Digest.crc32
new file mode 100644
index 000000000000..dc10d1f6b92a
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Digest.crc32
@@ -0,0 +1 @@
+180495317
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Filter.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Partitions.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Rows.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..b500ebd8876d6912b85d6dec8740ad5f947d737d
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9tVQ0!@4`MPlXJfYKWU6BqHe_a)%*xELf;E8Iz{G-!gN2!4
zFA!e>;^!;@%!a0xJRq?@jLeK|^#N>#M#d%<R<4#R0vs%g4t)F!j0Oz6y!HS8|8HP`
z047l$CM_PAb=*vu+)(RynCiH})^P!?<3h5I6KEX}Kj%cUjtyvC3pVQv53)7T)k32~
WysUJ!(D<-$7hNqhIU>c6Y9Rmu#9*fY

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Statistics.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..922484440442232a4896c0ca8a2a40269b01334b
GIT binary patch
literal 7159
zcmeI$drVVT7y$5dZhH$9=%66-h|H-V5>pt6g+%DYDs))_<1ulMj0<fnP9A9+hRj9d
zFiXh7Ix~kxd@gAW8i}$^z{i&8fK4`qIoXy$Skke@W$R{;p%S`#ZL8mRwm-=J*_Pdt
z+}!i)ciNtF?>Xmwxr7j9@`?nW_{vK0M8%^<JUH=qLp&6OBv@m*Ql>~+y=}8jXb`Hc
z7G1Sqw+nS<o1imW9lHE&4oi{EY_VC)#e&UI<EW{xv)BwR&EFNAihiZ?`jvL^hZ4MA
z?_78s<@em$O`X$^PIY&3XD-b06SHFxFI}D+7n?}Uq$2eJ7S>YVU=jO>iV}&C!1w|m
zs{$4OIf!vEV>RP2#u~<vjH4OHGG4@ZDdSYe8I0F3E@b=+<IRk>Gv3R%oAD?%W$p)E
z&BPsyFJf~G8SlbYFu$+pz;R_cHlK;D%Eb<1#v6PO$JGPaVIJ&9c4KRgV8=X*9Y2D-
zI1M}bFt)A&TYnHc|2%f_0_@5-Z2QyLtpc`-nZE-@9QP#p?OR7Q@i!Ew6=m25@%*Ho
z8z|0|;`PjZimj;l9JN;}@%&c~51}|e*oLh7wG%nG)QcQ4XF(2~i$d0z@qCOl{)pny
zy)nqK<MGIgMoW;FUd8u2bq3$>Oy`HFeeT5f$WN}t{o54pM)55_Yms*rbs_J+la1VS
z8DF2V9&|maI^s5LPYngqbKpUtJC+=O0I@=^TuO)LmY+rYEBB7(I*cpgo6!7F^u7=R
z<I1LxJZS#;Q8ZpveN7XL2lcq8p+k)8r=UX<`_Os{uh3qD@d$;fA392(gy!=?&7FNP
zo^ax=o6wJ~Dm(+d{Hu*?pwq5dPeJEYzIq7S_)Ypo=#s;=S<ux<A3OowcsG0!y6xl6
z0_YQ^`<Fm}*52|a^z{29>w#%XcvUZS^}2O%{nO6kvw`(5HKX3QHE{iN7Weo$@O^IL
zWL`6{!a?18fR)RxYE95HGuz?);eU7Th0l+m6;YFGLHpqL)6MSztBa2I+knGfZ2JX{
zU-Nb3O*sC@;lge$Xum*cZGg{%xZ{RC9f&7>GO304YuT*=Q!9wCEcYbAd>||Hr+y8H
z8%BTVdl~qtj;@oiKhJq`vfzBKD>1s9pnXeK&%gxmp>M~|yMQkqd$$Yr*L^8<ME@{O
zwXY4$<^!9y+`hN#p^4pJ$9wRfb1)8kN2>l`axUL9kB|&z{Ai7fF*)Z#7f8;z(B&Y1
zJLmHKjGS}HITx5va?bTn=3KOHAoo}^2^}sW{JuU@qigW~`E_;d4)Gm_bOdgd!O80n
zn1((I7@ld$U%yYO($xw!yH(h#D=*uSUAC>kLd6#%EvM7dAJ52Km9<)*og>%eYaq`9
zc`f|A_XE}aZA;#)<PBSXXvq)m|4i}{$kFclE8^Lo@R2VX#LvVRCD0%ll!9Vv_@(r^
oiz*t0t>Sqn*;G{Pnr*5-(2+by$ozA@dAIca)x4YE$Q!Bs1GciKr2qf`

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-TOC.txt b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-TOC.txt
new file mode 100644
index 000000000000..2b093990c9a5
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust/ba-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Data.db
+CompressionInfo.db
+TOC.txt
+Digest.crc32
+Statistics.db
+Filter.db
+Rows.db
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..7b47ea84770214e8df121518cbbe9c388ad3a28f
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^tn18G(;8&0sSfG|1JAvB*AgcfE5
E0EVp#o&W#<

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Data.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..593264dd21c1bfef5a4c4d09e3a7f7ef336ab19a
GIT binary patch
literal 5836
zcmaKwdr(t%7RPTe%8k`rYNp);7cV8X$p!U>cS5z7wbTWnNVO`pZbhpu)Q-Y9g#=vz
z!^>&4ELz2xZVQSD?Y0#VU0P>pOmIaAA$D7<eb|6?mu_E??K)Vy&7PCwK4hlLAD#L=
z=lp)>{2rfsejx}M(M-q;Re?aD8vZ{(#1ge#218;}a!P7iy3sU~088M=5BKURf_RXS
zRTJs;qp!^A%$;$f`Rph8d3$P!=`HW&ly`qF-ywt8?!+(T7;&sBq4pZ#Xw{F&Ba<rD
zH1#QDV{*1D(V;+V+m86!G2&pOPac_Ge$nrfv)zfKLY$9qoE(5SZN>Icxy-5Hx)YOJ
zp*R?IjBpIaj>$>wzQXHrqFo8r<gs9^js1ri49u&(F*%*WYZC_4+B-o@<+cBgqt;UZ
zT3Sdu!JSjIqUaz&Oy?annW&9)fM(=1Jq%AWpB9bDDSIkb(=ww`K`e^n&Kax0qWXGN
z9ucLZlX`;2_6=jixhx1KwLi8(!eHsTnSf!(h@MpX_@NAvDgUTm#rxa1b88(=MP#~J
zsdp;a?r`PDWbI12A|r=1nHiXc3Jng<dwcs8Bx8ydHTK+LrH+g@=QXNCx3)=e(-bO&
zyzK1Tup}h2S#48>-R$gNVMR!0hgTTP&R&G|AepLCiF5^)gVetq1&fj#%EQk3)u5?u
zSRJJks(Ys}i!_zr$aX6DOjzf%pI4HM`MsZ4^1(LNsq!Z%G#C8|bn*#hDEM#&>zs$`
zN2rFYWt~Y7Y%-6Ji>0y7M}nHaSI7_sT!x`PNkTd2<5!frNRvgL*)Jr+Iaj{{X-Cic
zz`*CnIe%vXNu%m%q*H`xOL`hfy=5+FT?^xc>iQPM!QTC_AddI6vt@7~Nwv0|W4puA
zcGe8bN$OeS@)*I7u63Ry8B4mEqf;7G!2xAZ@k@ocmrET&hIaN4gX_9{Yo9nE{W`G<
zf>*A~62@cy+yarRO(&EQg-1m7R56xPvb<D~XxF+dsH6ra?kZ8^#5UHEpD>`%${TG;
z+1xg&IH<O-7$qD}!Es|MqG0oW9}+i!xTUF16=|%PI|=eu(bqDwk~CH%7mv!REuz}g
zl}gU9Shn-V3K=0w)c$}IcJGL%2<KRKw=y!_a%kX^f|?)HZ7Y0)>-5fTBvZMqtdG*?
z2W}Pw8MJ-Dwm%TSrjsTCOGu>y6ZazwoZtN)ELmC+YE>aHC4LeCB%?Ya0=EDoqlD{D
zBf_b&%Refg24^`b11uDYPDH<g>!;v=lgxQD9&<4iF&5VX7S&Y<>sQbzVi?*M6@X3q
zW$$%)giT4O-W`|`yiER57&7hdu*V!Y*-v_~q~9I0x(Ko~?TYk)$#{2A@i+#}Eryft
zV_-!n^YlPKhFO%tZ6NhJpaPp@$byn&)ExNlez4Kzi3C!=Ytcjk4<fiPw_OS{-qPt(
zP^W{`vT^;jnJ1Jaljb;~r22#KwXyv#olYg0T}w}=(n%pdJ9}gR&+JMyoLOFa&yQ)9
z6+&8Ji5{c#A+Uu4{P144OEK`|6(P{Zou3Z%lgwW_gh<e$pm378kS*0>tFgSx<4{d<
zXgqK<JOH9@A``?ggKVtp-Ubz!ZkcEtg-;Cc%AQ{_FP}8+o<1Os$)4X7)HJYtlr0D$
z8+*Q-(vi&W1(3EFTW{e&%R7=GXx#a>;@p@>)9$riV5yML+pzj%^s{*}{3@`$S3uEh
zdpe5`wX+VWmDKE9^a3y0*uncJ_K-}qdtwir5*~~ld;m0kH7piSAgC>O6c(W}0!!l^
zxxt0z22~_e{ZH5<QCy;iS@hrI`tu??m~{j`J=J%hkd($CgJ9}H)M}Kd2~%%|y))HB
zH;J{egFB-tL5nv?7I3i2wh>}eZ5xGHZtz88UQ8rYv&7!Ri+1`T93p*94kXAUh;wbo
zb}1tDl^RH}OC*qMJABCWl@B0yMFhF(3!Ms5oq5ShcZmdY9raqt^vb-V_&B;tM3C!*
z_p(B@-X}jr$bdmfRY0KXIm8NVbC`VO(XkD{qro?~WRb}=>nFdEM_jv1g?o`53AH|A
zh+Pu{b$zua2AKqZ?JLHJ!R}GHLbF+)PccDw+1a5-jPNz6`OmT(I&tPI^ouMAc2N7?
z+;0xzOQ07tRhAIEhL*Sfa!<8aMvCxC9w4mL;b1d%4_@$faLmcLS&k68Qu2Zg&^=VN
z+=|vsQD9}%X1EHeQvyqTXo`jFM5j%OD(ga%9?lA#J|)UhgC-U)oN-E2Q}DJwS_(&D
z=DbSkNm)40J(x&~1LE|#kSuV>F%4E_4|QN#APQ}*O<>clf>hbAq*by&6cr~!2Vzvt
z4W1hxh2F4cw-Nr)os=jE4YGTvqX?<EcO6v2<B1#UG+>sSI)zhdXL|{ho~xEf?|VUt
zE{7H(bi`rr8qul7FQCZIeyT_5k4=)evD+veg^7j0?_esDX){QqTQC)=dCS7_7uwGL
zTmX`;rx!18s5ye0b}1K6RV~~W0WI#?i*pnj$Ko7`bRS9sDDhBo$AF~vF3ynv%{~9=
zEQRJ{5iB;=ZG>x^zW4ir+!)c3b7%H0j3YJw7SpqF?yNS*FZ#}+IDuX6yi%;<=eq`c
z@=PPiSZADp#)xN}yO8OJduvrL$q~534PGsTy^F7TS@@dR*$tDhU5s^(rI`=5v(Jv=
z9MKYq-ezegwbqCH2@!OgKqbyyQQWT}wF|x1<uaK~9Kae212CzrlGSfe;FwG3e9`N`
z6_g&6N{2g8dR!{K26u(j6-+j&1S+wEH=gT*Jv!2pD(Q*_zbWYhd>x10Qo^Lc$)m@W
zk=mM*$CR?~!6I^LXx1!XSk2$=#?p{>>o#(J&DAaHTA)&L6`cD8ZBi&<ljOGLK$FB+
z*MIva%|DNVNE)W1D@nc8c?V_+GYMW9?swGi71tKEl&fWXIXW@)UcpXo_*bu}VCx1y
zGw^J(al>;hu&A0#clPiI<A$}_un5stOVm_|U^aG`UE?B6)*HoeTiP-LsRj5ZG^{o!
zse(uoo{5IwJ25vzwc#}HFH_C>Re?ygG+;p>kVmbb7Fdc%cm(#6jJp{dqFQ>rEU*-S
zf-Y!XhKG`w^dp3(1m<TV4r&PpoQ0G*5b!t!<^PsliS4tVY=Y<{gLJrl1^X17Nm3V8
z6y(o-F%jgCyIY3*u@*x^i~Ny<2Kh6*aUeLM+5i)V+aqqcqy<KlFNlj+7{?|hmy$wl
zgiXl}mlb3B35bCd$#_`VA7asz+z6GWPN}RBk1z9Zq2!P^b)fX9ULv)aQOfs8q}FVd
zRt`v{4h=~9XR2YEFm&wf$#Kxsn?5v%Y;wO#tWq$gM#=p>bqS7E^mF1^^x3hs<b$=_
z&!o!6)(UXqhTq?Zr|JdNc)DJI+~K`)MFcKA)W2S#sgpWgz728e{bOqdSlQ`oGDzbn
zzEAm~{#6ZLsz1R4TeeD^V4?sf(6%EU7QtJLcfMr;S@%{eR2zP^0TqMTwOXl4-p|H*
zW<$BU3zIH*ZS!PfJu&n06`GNG0!VC}XTBPmN#^4g_)PE(lZU~+toDm}`O;VecDuFL
z={7bc&Hub`hKamV)?-`(K>r$eDNeN*rHQ+wERdpqjM522_7p@xC#A2OQ`sy6$_-Bt
z;J}NJCEqo{6V2`O!S4rp>%kb1!)+5l!2Qj`uY&XIzXmmoCv^7LKr^MYjeFq$gxe;R
zC!c_!@9SH81o@Ow1t-~OmGTLjCeIE(Okc<7p?IE8a65d2=d-R46p>8b!9yy(X97NX
zb}3*?U1I4W75IyE@{D)(kfyrVyl}cg+<~7HjIhy~W6kgKz_PP<{CJGc;PX99CvGGX
zpYKdx6hs1@;Lpa5P{H^6x#q0`*G5zu!A*0qc`KcG9Nm@|AjJ|FUY~vSLO(&A<bUik
zBm&0LLJ%97oA}ERtGm=9Ld>{Z7BEuF5mRgMGHVH#E$*Z0AjGI=W&}*gqCntj_Mxx9
zqW>I@E6}I~iL-{<IxS$70gV>D3pU-2*dUE|i6bJ7JbQ7YbxZUi7)c_7FdE4ygILB7
zq!3s%vh+8oO?Kd2haC6>kwhcQ??rgk695f5MKT&@|AY_Y<V9dgol@B9i)fl<08`qO
zs4XREqI<!VJ|(IQN+KC;tH=)a1qDh8rJ_XdhDWKfJ~<Nx5WrE}0f#`h)CU^hH*uZ=
zO$g??tVzj>&Q7;3&LK_pYl_7YS-ZC%ZbAK~by<LvaP!2CoZ5#|mq=T^vq3(UIV7<z
zJ$hN1B$5ZxfM%dbB3<qTNwZJvrR?mrYV<lZF6ksENRagHjZlSvrrWSpq`DKHSv;9I
z&*35{jH!F_rsUb@osRFZ`m@san0Gd;imBc;(7-paHqLvW!KQ>Ay#N{UY~sAJ<Cx#y
zp^4*J!+95HqV}C+K+c;Lh1xBN#(HyMVpBszwg7D#>s^J(G|Z6rao)8@VPb8A(xt%G
zjDOIvo;RSq)z_`hjggYc)71-PO0L@spOy{yqYRHHAdcsB8^Y{)5`IAvyC=_Zd>o8p
b&3IKxC(nnY0Ikj!vx(53O!iL9m*4&mm3y;K

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Digest.crc32 b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Digest.crc32
new file mode 100644
index 000000000000..46f41cdb9804
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3249746475
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Filter.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Partitions.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Rows.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..1a324e57b52cbd3ad18ecee4bf8869a43a013254
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r_PkPK8VTKoQ>IPAycWjupu);$6jWJIj#ZB1|}9<94yQX
z>rXQ?91dn?xOO{$+0fLI2PF0?ftlfpQUIHwk+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
crmKZU2VeNm)k5P#cS`AMp~>M3^H41W0FDEH_W%F@

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Statistics.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..09913adde254313e1cfeec77d3e43053248a9a93
GIT binary patch
literal 7168
zcmeI$e@qis902fpcWr?J9TY@?h?|1^h$#pnkcd@Ug>EJ=eoWk4<e&}3K}*`okhw@4
zW(!%6`3ucZr%Q-IBN4U<%(*{wuuV6GxUpXlmUL`!*}55IsHEMUZT0(oTQakMwq^H{
z%e_y(ukCwx@7??67>1E8O^%f_-m*+M5#gv24pul`6%LMJV$D%KNz+WK)mE+H>-j3P
zNmIq!?YzZk<26RJLzB1NVJfs4O*WIUh_^Xv95q&p$);^-`ZoVe<O>zoFLww(6zlPL
z+@aA#zW3&C(mu0jy04q<eQ!=aIX52m+$Hyf&_rw|7O@Yo&=&g!i_k|@WJnC-AD{nY
zg}>rE2T%^AtfU-5Sw%UVawO#ilowK7LOF$UI^|WA3n)KDxtj70$_FU-Q69r4)cqhU
zsknpk1#EUc<z3hu_4`~Wj?2oh<r&zDT<id9yn%OdTse##;=+DpH@5l>?5L-)V@9zb
zO~qb%3|mu<opl5|?>u(VJnV{SZ2Obgtvq%YHGhZoIPQw`*}INN+^;B3%1g12;Q2{h
zpQAWig4Z+qF*aBJDQYj1;rTBcxq{;I58IFxzjPx9mUxhZ+$Q8;cLcJ^i05Ot{s$C~
z9Ed_*FcE{iaI6@4$yI#6Q)cn~&S-xhwa=aW4*Bu5xPO~soG8BaCpB_&VGr`*+u6wd
zm+<u&??=~@D8p{S_C%X6?gRG=*}3@SeFznD`C=M0n|KaAU)h7I>oCs6G@|*z4LlnJ
z<Fdw})zI?IV`#hzYfU4J2lP8<po8@5rlEu52GM#7EmvQI@i5LX1Rar;faddj)$Kzt
z9((Gw8_<ufDCmVw{9@xO=+tZGGtfB|FCK-~f0ecoy7*XKCUjN8J8PgD?u1T3w|&r^
z4}GfS@M7psI$HKX&%8al4wxi`Rt`W{>2z@YlkTE({`D_5BcA;#xc*s_bK+g_eRlHn
z>Ly^$L7aPmW${<l2I$$@9q|6he``Mg_m6<&h^e)pePGAgrni8Vg~x|%z#&`Oeum>$
zeHnfOjz4^)pid3j&*NL`;eHT(Qah*t@wkts)bM`A-^@3(g7~sBR|3ojGBbW0Qh~U3
z?EAszfj4yaoQC~*#+8!^=d-0)-_;J<w^a5IPXZtPX8e2?@P!j^_Q3u+FQ$xUJ%|(a
zOIPOdfDK!3-QD%TjNMnq&)`4iU@Z8KSpC1`T;4s8VbZDbqctwXq?`*~ASvfUmxKA+
zIhXfmq?}92xxj>ya;|?e=OUKj+!IYq@JKNuKQw4)=xx4tepwm4Q+UTA9{yXU-M8oo
z-I~p>_zlk_>CfM%RBGyYo88RUYRXF2XP0iTHxc25NXyx@wB_j;D>7GRW#>pW=^9A$
zKw1m`?&pEx-eXI8SV<3TY15K6?*B~k;?L3U_$tELm+&#;Zs9ZGMF})W2F0L|8h*A#
ztZ)r{t#FQKa=2t}uF?8>XL>WkJbcdg&@H`xwfwJ=Nu70@#bL7PthL+fEMiufSpD^;
F-vQ~`sY3t&

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-TOC.txt b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-TOC.txt
new file mode 100644
index 000000000000..2b093990c9a5
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_clust_counter/ba-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Data.db
+CompressionInfo.db
+TOC.txt
+Digest.crc32
+Statistics.db
+Filter.db
+Rows.db
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..fc38a25eea5d107acffd27df97523c4e5b4c1664
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fP8AtDeS02*!yKmY&$

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Data.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..0fac1ff244b6f128f335d1af3ca2dd903965b047
GIT binary patch
literal 89
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpD<SgffG)
lkurnL8IKMi%NWQy7a#y+nJ6=uUJ&$9W(WXlP?tLp4*)NR7PSBX

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Digest.crc32 b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Digest.crc32
new file mode 100644
index 000000000000..40185ea83742
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1237147338
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Filter.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Partitions.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..e20b4e2f2700a2d5278b3858fd96631b7f66fa97
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H87_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1ldcHk

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Rows.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Statistics.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..55d370a854f66f31b6c00c9aba7076d55e263871
GIT binary patch
literal 4669
zcmeI$YfMvD902fh?t?d#7DOJAZ7MzpxfmHk6R{KPpk@ikV=%WxE^TLVT1eY5WG)&9
zU&z9knM0?d(}b96B#Il4IsKrco4N^vtlKh3O**!?Y~73^PDXd@diDQ5*@u0eJ;}{E
zKmPw-&bjxTo|6(ngvHC!Me<L{Rw}Aob;`vl*E`C^6QcF>b=gcLr{2Crk{e``)gqZ>
zheNiR?XqOHI;FxLPD_d1Y_VI+rLx^w=d7!@S?v0j<{L$)lisWuyV|MTC_NAe%*Llu
zaqq2G+A;OS?Oqpm_Olt$J2Rf}+U3~^B_)^^EWtj&!aCSDSd>1ZA|Mf>j!*q_gxUzz
zk&L4l$1sj#tYNHWoWwYV@%)h0elBF<S&UaOUd_0e@e7Q%Fy6^{ALCxe9&F0I5Bdxf
zcQU?!&CO%H2b*W^&mY2Zp%Pon#f~Vzj%3CgbsxuL+}Lq`>^ZI2x}(_1o3PVHv7gMr
zUVH>ws>aSgj9qvhyL2vgO)9oyBX*mN-ObEjw*klf8OSu_5A5m+?8A6|QvWw7&Xwc!
z%w56etG`D1f`I40FmesW#V^{CBYt-wN0kSVqh~G1v9pQD8Z(}cTEovMo-~k*oHCJy
zJl|7>yzn}H-&xc6edl&uMEM2YACWh##r=CR&4=P!f7K!HF6lu&aBn4Y|7HAq#{1Fp
zq%jG1U_R9s1^0o6h3;B#;vtj<y?SXmG`FM=eZO)aXvSciPisQ+gCBS$8pegD=rz#d
z%N{h|i2AxF7?14tO+iN+*58JX%@{=MDZW}a4C4vBaR@pwKNHR8d769sVLbihdpDt<
zdb;>5^pbC@Rzv3uTTer;s(Gsm+VK7ID(JE!+w-7JnV&oh-S}txBy{`dt|I7@<p&l(
zf7RLYF7(vLqw9g`()ii|Xw$lNaQ#zPX&+qwH1NI#u7A$roA?xbp7Wks(+tczsc$c^
zu;{wZ2t7T$6TTnuZpS{je?%@zoLmd?qdLztp9795`Dn-v9JjgsH#mOHciNk9{MwP?
zULDAvE4MYk{UG&(eozAOj4vm3@O>@1Rb*@f@$5=}COi-1<^D3H0dc+Or@=RXpFh-d
z3ijtE|EfGVpKWD^?hcUOQrqwL0(bo|e!d&{!m;B$u)n@bS)=)n;#BwcwV6U-<JLR(
z_dGIjgxczNu*g_V<<v%~{=b~L)M16sT<Vi}P>0W4=y8znnF~FS@R<wlA%uj_T#wP2
zi`v`;$C^p(NEs3L4;o(=h9A_e$tk;(-#CJox>oewM%k5ZKB^kk%8m;T%IdzW)S>$m
z_XZ_j$$|#|I~Y`YGWJ)n!8gj=lscA3eXmx|G}RycuthoLF8%kPY-*+LvfW{ow@H;1
U&#kQ3(O?PP;Xmi!wN{z`2Fc}^=Kufz

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-TOC.txt b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-TOC.txt
new file mode 100644
index 000000000000..2b093990c9a5
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple/ba-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Data.db
+CompressionInfo.db
+TOC.txt
+Digest.crc32
+Statistics.db
+Filter.db
+Rows.db
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..e2860e1eb16acf51fdc0f6e8d44ad2c33bab9080
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j+Y5D^Fu029>+z5oCK

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Data.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..7b258def9d97ce108c70937af8dd51897651b469
GIT binary patch
literal 141
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13ItZ4ajkZSZDWI%)SUKcBi52G$w7
zJj<FFXfR|lFfuYQ8d@>rYw<H22~DtJU^lPq`C-E#V`Rf%e@1o!NT4xP11Mk&6ga17
h2oh)xc>okJ0ScUdlnE3#&?@_a0Rk8qm+$%=3;<q3C>H<#

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Digest.crc32 b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Digest.crc32
new file mode 100644
index 000000000000..27b4432f967b
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Digest.crc32
@@ -0,0 +1 @@
+178975066
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Filter.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Partitions.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..773d3c8891c3128145d30327bae64f5fc3b26757
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}Ia3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G`YV
A4FCWD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Rows.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Statistics.db b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..fc9a7da7a404171d6978c6279815dca225a49d9b
GIT binary patch
literal 4678
zcmeI$e@q)y902h5?t1*D(!$t)43)t)$Wo?=YzeFmI-$k{<cC9@k)yo~CbZ;QCc~MT
z3I7m9T%2sWF`Wrnx@F9v^20g*7z1_a1|e3>AedA%Gg@6XWR6hpw%+#rzL@y8u`jv2
z``q{2<Gpw9^&TaJ=vHT@2;_EIr&Lrq>Xn02j#rg~C&b_#=yBLcwo~425}QO@on*3!
zE|=)A%c99%=Qb7Ya!aMMU6LhxnJByK-Stj~B%9k>e<(hg@M6u>g>L0WDS<#>DK?1;
z`>wXruK9bf^?SI}pDqfsi_>w>k1Wk7DQcQp)HcAvT5TIFN*htpArYeWPrDqZodoM}
z#u1F87{@TyGd3_zV7!9yO2(@gXEA;-XsuuQOuU5glZ>}B-pzPF<9^0oY|1<j`Y;oB
zGd_#WEoZzJn`iFNAHs266}FIr9aexH&h$6pCXPpqVaNEf?`g+2zKfmsG<NbN_I=sd
ztB+uts<HD9V;7#mE?b6OlZ5Sh0=q-R?q%lhm<7lEsmL_-5A5nn?8DmpynUJazd>=X
z0<UN8OKiUSYm~3k;rXwd7)Np8^G@Wj-#y3?6#?YPB?&otDIQsG$Mex(`5DC%h7yri
z%p@bP^p+#9x`>~5)&hRsIbENj{DRpZkssTP+xJwm55;%<YD9jyv=8~<jSa|yBY1qK
z2hn)asJQDepPGx+_rRS(_oN-W1LcHXIJX{}TQh*ZU%9vRQ!vgax1jmK4?P<R<GPl}
zP0+$NFY0fYv%Uq!!v}ry(2<rc*Px?QhtYb9tu|hQ@i^W(3LT%9f#&ma{fz@Ko^t$+
zE712pRB{@6&9__gp|da5orK<4^HLAA<@@zpq05gn=0e*tK7Isx=U=gN(4FT!#n8tq
z4yHkW)!p_w^!!JYTY%}>*xDgzTTu~Q|I|}90M|bays3xlpObtupMcMEvnMvS0`qR_
z+Xt*mzi6~VFD&ea&qw&PYd?H{glERjZ3g)f-5<1m2pm=V-lz;5^IYd|u>bn+3|C<P
z4HG5(Mv%Wu>}Z1TgQR2TVH1d_elcf+&nx|Ev9$xl*H!s5;C>)C=a*4Eh?~7X4Zi^V
z_@TZNuszTCH|E0m>?pVNc7go1+QG3|;GWCVXL^Cp9(}(Lw%2zqYclU{oEl#lUn~T+
z?znz)?_CpD@Lc;278q-(oOTkd|F3&4Z5YrRS1yM3T-wOoszZA&G#n(f=R)HV+H-~W
zTwp@{xAt7rF;;N2l|)aJ6XC$HwK?b1t-3uiX^-+7hk9sTGtZP{=BI6A{+|p+l{YTf
zC~NOsr4HVojH~-3B?}t-Z8fO0WNJh`;dhD+N}WzFdr16Zi}Rhg`%e+F_Md;UsWml<
rva3#PFjZAPx}kDclN7we1OJv^<ZO1hCArAi(A?-yKX`s5<F&s5pCO!g

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-TOC.txt b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-TOC.txt
new file mode 100644
index 000000000000..2b093990c9a5
--- /dev/null
+++ b/test/data/legacy-sstables/ba/legacy_tables/legacy_ba_simple_counter/ba-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+Partitions.db
+Data.db
+CompressionInfo.db
+TOC.txt
+Digest.crc32
+Statistics.db
+Filter.db
+Rows.db
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-CompressionInfo.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..307774984c2eeecc0954fb9ff37553e319fba29f
GIT binary patch
literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{q5zK$;EAh7-&uAx!qY5SsfUgyug5
Ip@o+K0Jj+oz5oCK

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Data.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..1a295113288a8f7c73556bc4d86e75e2ab6b1ff7
GIT binary patch
literal 5330
zcmd5=eNa@_6@Lq^;Kt@zE7UOQJfz!*vT&DWSr)6@-KZT*t&m#3rV|CfSWuTmcGraw
zcEyma)*2F%NX^)o5ZG;<M4+7sPOU7T@*xDRA(*1<*iJPw$&_}IisQ%Yx%ch+SSSC-
z|Mr~wyXT$z-ueB`xyvvr<_$)r_wqdNgWo)(WnOO6>oW~m#%xoLIakBL3V)KX@zt|T
zyw6uvx_6ho<bbod*j?pt+FkCFN>7E$zW+e6r*vmYc}baL``*&+<@+it?C#1cw|!qp
zS!r>VtK@*as?<?d?sD!cu{-wd@OW%{i_7fpJypB+9;n`4wWrFy+r48)adok0_l^o%
zNky5*wX33fUzyXn-&NwQa932=T>C3*PDrmkbA_5D)l}}UU*5Gg-lDEVZJGNuG||~D
zv&wz1)>~dXlLzkhOe{h3x=M;HYFDD+x|)rg%IovjTe^-rz|CcuDDor(P+8QdIZUv|
zQ;diu&anxC%#K2GUtiR^6HPLZv8Zu3yV<94aXHb*6ehu)XqJ&Aq{#+Awk?+=-#B}6
z4)Lv-wy2$);UaZ5V@tQ6S?c84Fe0E99NN|3g*~0ZaslpcoCj&KKKcS$aQE3pVhYP(
zUJXW>-;62Ep6{ndnc7x`iF#II<Q5~Lo?m;nx;TRvxW|4c#5%dGh>d!xu^qDHm(VOx
zs3va<yj2N()Y8dHS)rch&RkdxvBHsh&L@-CAb|%$avA}2&7-imJ3da#DS+<#mHT?<
z&&bFY)SD90x;UelV6Q2pg#<gbk=N+&=!E>`<u#9NG8*@8$ybC%y$5UjPSkE>5`^RR
zo~SvkM4vN<6vylBw<L*4LLTF?72y8!$VLK*S@o%F{p8~&MP6Iyg^$6^qh>S4q4E1y
zz+NHP1vvCy{a>|}T-$Qu_-7&IscKL03<KmnM+hrWSHrx+lj_D@37ek`*0!xwz0clN
zy~qBUD2iEy)kxop)sFX7j4IO$E+Rinftu6bjwJv#KbGGJRA&293)iU~+bHoBZ0@^n
z5r}V!<2SN471=}i+_y4_DfKOs(9H$7KgkDW!5J3LkpeS54eno>Ox#&BDk%VA-)d5A
zoN+?Vf_S$u#G4(B<vd=_iQ6+x(O~W*d~Yr%8jC)fh6>@#(NHw1hq&zbFl5Fz7g#P;
z1DApOLUj<w)EPLYM1It_31}p)x8Q-4c&`R@&LFaXU_}uLC|1{i?KfMY+C<W#fy2kP
zRxo*66cI*%#`1>2ctT7GKlrT7wiYy)#jlqa@!-mZOoBoI1~<Z9HX;DHudG9sDcEbT
zT_fuO8mz%cApnCPgcMoe!H#^zDe&NMlxPjjwkcN8&?3-P>BK{;yo%@Gp`EEjr6jry
z{hpYj2%x^&hi}L<MnfZcs}$tKL(>EN0{P_(Bd-zAq6Q4N*81hmc=$7`@(h=^MTybn
zkCgI)M&fGP6?;djKtIT57GwbT{jL$Kovb7SeSfG0eIzPyx{&te-0{`Fq@0gx)wg&b
zA${M4&=W#>^KWDLFWE!0_*?APs<VXjmy5{3y5snNu-4eK>_vwElDKboM6|%*1tw4}
zIIiLJr^)t5K<@X56pZe%eVY_qtp>`H8Ne40K(-zBl#|XrddI7Avyu#4y$}Yj09_5E
zabi@W1AmJp9OSZ|2gVD#3E?wHW8{p-r38rc#X{oDjz;>aG?=21Uo{Y9j*J8lzE)rc
zFFO~*5qW6r*KYeDL5cjIw#@i6O7f2n#7B|;lb`WR6#Sc5+!#fEH1UB&Ei++aiqBJ^
zuRqwVC{Ofx5D172rjbnPU3iW7?;lPpyWV6hh9)kl!Q@NFDnUK$3eQ2>$0KrsCx<~1
zpo~Qg_+%U;9^^?J1JDig!x|u%XyY4WW8`^}IRM?viwbx1P32gCrcy8NP)Nv>ov={~
zHC5|Vo^R?cyh>h0QxinGA~Miaex%+KPELBE8y-!^f#w41*m4U|Gh>L(RRn^=$;)sf
z?CO$S)^PH>F-yVKEw~t~8ZD|;1;G<bCxa&N;FOg7SqUz-SHP&OYR*{)AG%MrN7W}_
zOTX;6_bRBC@sbo3|2XFii7_BU_WQ}8u!<@~6XQ2>uf&&v<M||hZ6R&@D>oNxpqe0z
z?nD{rplcLVVQj+pgs4K!XF_x3pmj4?jjueF52C=C?<zVmXo3eCxs%=SG>|2iC0HU9
zUra}YBFhVYDik9ur;m$~nm9}WY*;(Ls6d<=1O;gFk}taWQlph>g<=?Z&3B~U8rJTF
zu0pO1R$*?~W(lXJr&@u<hGp`Ip{moM)IG1$wx4xfo)Kc85U$79!O(BKr#J<8mn`mK
z-CTAwmUEpj(#b&qCLwNydY~CEz;HCaiMgc3OI3Fu1vXLeKq;Y36U4DlcAT<U$;dE7
zD+LNUoVM(!5eg%p-WLK`J2-O)I3}zIP}&Rm(0L7Y^tuqhX~iDMT3p3xE+}AydCU=I
zHie_izZ8PW<ut8;bcKmc^%O>Fx8|3^6Cw}9Y4^qyPe8f^_E$hmaaE@ssMCOC#4Uvw
zd;n=5JgrzpI{o%WA{b)!I&(kF^jSUh;t)W(Rci^$rPZG|z!gO{*Kay3n^;$(@7EGF
zlAc}Np*)W6=w{gra>w7Sgs&-JgXryfG|hf07XkE>VGSmeLS&gcO&%;-sJXS9g$Ic?
z_aVvBIEyMo0PD`1<rfm<y6B=9XdueE{JJviwFV*yA~L+ayQq*}kTVZXoF~NR=RS5-
zZp)4u58wr#eBhwf{@fD(cO(WDHD{9V#e{p7@N?Ipu{slTqVC3c@j4$FhRrfM272(W
zX>hXtTR_UCaV|J&VB)3M2%{HVR1pf367_$SuI^M2bpO8dt%8vLi=ZF~Zb}Gx{~yz0
z-QAAI)gXO?2g)PSFe&|i0AhgCU&<#O$jXuaJ|H~awU9uK??^ciXDOvt0){O408V#|
zk!ll|-Mq~jPWPN4jvvx>>Uf#fqd;})@LLGt85z+qPsx1}<RmZ@=oBG8(($M01wbt0
zcUuq9`+`XS^rl7<Qmm<SKPrW)j!-v<{-i8a^l`Ehda=l^gmzCTYkFvYRGiU<Iv8vs
zybE$ye?S_S!1y2+H?mG5X%LMUcY^|o>E6_nM+$5Ya{pRymyz5T&p;0qW1_FF5akrl
z@eSdwxW?ThED|mB&><rOC`0|JM`CUOXUtim;CT?SFb2v@hV^+f@=Ji-3pO5>WdQ7F
z6KiFrqYQ2Pq+%Ioq}pT#LonToieNZHpDUToh4d<aZ;fg3tVIlSF7kIA&{@NL&L5&2
z&M+?ejyU|7%(NR1scy0rF*n$0xZHP=TL0-ZWmR?&w8&0&EQTM2`5C7t!kx&;nH*rV
zVHoQ>1^%a}v=V=gj3!Js5K-XFGMG4*B@_Bv=oHipJvx2$ghjo#ko-5OI%2xq@PAY(
B6CnTq

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Digest.crc32 b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Digest.crc32
new file mode 100644
index 000000000000..a94bf9de2134
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3247259799
\ No newline at end of file
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Filter.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Partitions.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Rows.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..bf8566506058273b20b0f18fe6c1d26307107115
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9t;b6+-2x2leXJfYKWUAv3He_a)%+AcPf<1uQz{G-!gN2!4
zFB>z%B_Mvz7Qk$1YRLl<`@_u4$krIZW@uz=VqxWKsUpC^qUgZK&%kKFz{^|z|Ns96
z1_)ph<zdp|fmz4Rl*tXXj)$p^8*CjH&^j(8>o|ed0r7KAB<t9K*0o@>&hQ{x6Qvdc
z9dL;cXrU02g#the`><JPbcmN7-9iB-EfJUxrI|A2p+1ygs#6C0Pz~rqEhHc60xbjv
d^m9`r3(bHQZo_7w@nPX<bhXgrh?Fm?g#fU`WBmXC

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Statistics.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..8dcf1546d03218661b139dca280b10205d28cc84
GIT binary patch
literal 7176
zcmeI$drVVT7y$5l?rlq{P@y0QM3^W(m=?xEU_(2x4p<gJ9s~EtxX^~;l(M!uKsKG?
zvXF&!Mnw}Jdn9PkND!UC9Q#8Boo*^I>$VKSl8#N6t(!pxO6cyjt$yFx{vi8jTXt`9
zbI-5eX?xB+kNf2^3?o~xI7-fVjwQl{2xpaWvcmbcaB>V2Ro4)sR4@r<OQn{t;VVo=
zZ3S<&^3?_luQix#+N?U8G0$Q!T8xH#-eRk=Rhg@e7F}cgciAUGx0Kzu+#>u?l-uo|
z3XCA~UAK1<`&i6qXB&I+lS%pT<Y3Ut=ck5*Dq=P95ZeF`+G5+_A+!-4G9-ra_0RXQ
z!dLKK{U|Fbt0)IhR#Oh997=gM<vEn+QI4maL^+dk4&`SlS5j`EyoYipWd}B)#)B-Q
z;x@{?*lalE?bsak``iH>mlb2nld%=cvHht2D(~UAsvkSRg*|g8w&p$Tu;;KN2e2PW
zz+P||TU&~qb`U%3EO!1Z?6L@K>sst49=nyAzx{d~cSU>cSw|%LHxwtOMc4=N{3Na~
zQJgKr>zVxwn=AbS)tAZe{Fhz7g5vUv&B%&h+mMxoZe;%{Bl3)?5M;Fh&&Oc>k0>76
z9fmx6C=z*&qX2o{RXpDD<9NK2?VqCh%ZI;5etIQt-?~UAim(4!gS;)T9eMvmI&#-}
ze18VJ(ETK;pgXWW(PfLzfd>cK8hi8sgaWyIE)kktcn0mS>|XT^80R8u(fr`LU-XA@
zS*`yHX!#2c)L(_Ususrmx}0Os{`%FU&@-ZY(0U3i)m($|ATGBLIwUO)&F66S#6B30
zI`-}@=(#C5C!rU9RgwvvaLsfAI-~5hL(uwf5=)>94sT3_u88~SN$AaY14p2nFSKPt
zA1mA+3;lUZ<2%q}rw3L8lSP5$-Ov@QR>Ad8+Vao%*1uSdxOc1J`e%*Kp^w4$+2P|W
z>VY{Maqa?^&A+P2g&rSofa4?o-M$AtKl~PljI0Fpl`S3hr+`&?@Ap}N172$W1@>S4
zb?`0N|KRI6of=So7T;6@p9c{~bv;@TkG?dbf#Ws*c6M$Ph%YI2#ld_aHTkDLHHhmR
zKlHo`{LF#&<FGx?yE0PYe6B9gx7tDd#`3QIVc<jG4xViV?)~7ycGzC$x%h#!hjF6W
zab+?KICuS>d)ptHv3kpR5B_rw#)9vN#s5pr<$2~YOcK?9w8n*)lyjjQB;{P_b})ZC
z=kolFlygZr7no2|&h<~`Tx1g$cJp;6;EBDAydzW->)Ln!y7tkxwhHe!#M3uaI(z0t
zUCBC_KlnlnS@hTMQ_8g)d5hJ=Z_pMOJ(XTmS7Ri?3z5c-#Kgyvl2cNbrKM*`CFveW
z^FUe)|L*-jaev#AHY;hvmL6KtgZn>|y!aBdyWWCu^(K7G#8Tli;YA5FNCw5=CN4b2
zAr`pJ{08B=jUhL;JZ`Ssy{7w3l_{mR_mv|%F1VT|?UxKWX5qc!^pn78*Yy1Nw5yN4
HQMB|AUWBNc

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-TOC.txt b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-TOC.txt
new file mode 100644
index 000000000000..5db770495d06
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust/bb-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+TOC.txt
+Rows.db
+Data.db
+Partitions.db
+CompressionInfo.db
+Statistics.db
+Filter.db
+Digest.crc32
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-CompressionInfo.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..22deb5490bef42a6e1e96a93ee3d399db21541af
GIT binary patch
literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^sbfHW(Z4JTOkK$x605Sp(SLJMaD
E0CZ*x_5c6?

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Data.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..6045f6cda87249a712b1ca0ca659f1f3e006afbb
GIT binary patch
literal 5946
zcmZ{odsI}_9>>qX=<K6i4vEnWMF$6P&S7Ra0}RXzl?UG-3PUaxixO5><59`IS{Hi&
zDcO`(q$RYpbrEr9$})!3=w&=+2+R^K3(FJ+d)Q+I_M+FF``c$`&N;Z2{=ss6f4|?}
zzu(^b_u2alK`4kFgu+x82n6ck{{bR_Xz-X!Y3Ui}%q)x5HjMyFpy|>Kh9Zb0LQzj#
zd@y_cUC$Q0v2|Lxx#sA<T*MU5%o*B0?PryBgaV>F(#|V!OlzIK;T+MOo8VWfEJ^t}
zekJEgJ3<P>O?4{Qr!@G9mwq1xW8&PFVI|j*HX_Ci6Wvw05NAuOtoJKp)8nU5p+sO%
zKhgbEwO@(Y$GiMWI%7zPYJwpZc#39hdHEZNGT%gmqp;;{Bes0r4>6gOWBwe(mfs-J
z*eofA@GJQqsu*XHJk1SJmdR1GVI;(A)evQsqVP=FcCf@{M(Ze7CN8&mYAuQNWkaBA
z?mm=BrymjwZEO5QPlE$gUEPo~iuR1_vA>h+T5HrIOJeEp2&_x`;gItP(bF8=OCok>
zsb5KXg{5?IJqJEAAvW3hk%<m@I{BVcZZC;6rzdHDx8Q_-LCz>DIHHs9{YMV2Z}wF+
zFkndJdKpIyvdz}Psz}=6aXn~@qa}^+T~*_ZLhQ<RN#lB}Dyk4Wd)?toIzyb~{!U0n
zci)(oq&<`_RN=!=NJ?T&FDcEj<8UUj%zh@fm*hIa(OmC-qn6aP8MSnVGfIf%dk<S5
zi$Uz6YOgHVOX*0qixG2i(I11N`H0emGjVYj)Hvbu<zk$R{_VB}knbNKZwY%jdGCw~
zFR4n=MWt%R+`QUUEn;=?qpN6hc%8giH|#}>E-h6{dyLvpS<l|qgxJKVAB&Os<vGhB
zcJ2)YqQ-k~H&v*SrhL;5QRBQ1L3WI8!+0&%VRpm>2lO^=+5y2kCpo^BA5W?e=Es9y
zc#*w5U9f9R_XMnbSmeB)RjNtN>8hh)$wiMhWy2ibJIiFTzA20|O2sh5rCQokp$;x`
z*SqB-xHo$oP>ZKrP}>Seh_27d)rd(vQ?8~xcZq8E6#adm{(aI!d)7(nHF>>=O;R_+
zP-}zYnh~PgTHg?(N>0kitu9m$inQ53;|V*~q)>z}X(Noy=r*b;Mo^uF!$e=?U$r3q
z&vOWiKiUN}-FYzSYXW%nDrQuWRu~Y^T#Chf@GWk<GOCz)!w^^(L{-{n5kNP}9TE5y
z$TC9o#Wq7jq_bClGop-yo=?vP3x%)~;U)RLRD^qoEym!HLUD61gcd^V0%!)>ECpf9
zY9I+>i>+YEoE%dR?S$C6T(D$KilKMH#<9CP!C;vj(lG?R#2$omPFtlAn1O%q)<N9M
zDYrt3)VC~PhuS<P@GH*X(SDdm<A~OA=_V-%TYiN8N4o#1achR)!~1Si<DF+#!T}KJ
z<K+6az!{r<7DR~b>0gGiptmQ8ZRh0s%ZuSO+4Oh1C6ViYOmN%^DTaHB@Be|pkQ##%
zD}rkP)k6^}hDfnRo%{ej21&+Uw=_N+$_;S75@a)^_B&!EZT=-aVhwXH>7`a4SlI$S
zr7M!7hxb)>LVYZ=?$!mOCpU1fe-*MADoQ88XW&tQ59?@wtV8hO23o+;W@v$}3s21t
z94d9hsBDJUnkA7N_!@AQV)#l*kv}#S(4$H*PLXTLuue$B0IEgi9h<9Ij#7-daD#+l
zb8L$<iqvd|OG51KW33>m_s-cP;m5Z*KOV8nBl+>78-MJi15m>p9|96ev18-4Xr^)X
zf(V6EQs2K3!fmOmYhd+50OH?2Q4QOflG-P`HjcBouuCjwmIog%p$m|Wd%z(%a>w5p
zg~j5IH_F!K1HEG%VofWe9}p+zKiCfRQdvwcJ3feE1o@t8hmB&G=5gRsh(G`k-+udu
z5~<rJ6Uf)uRtA`Wzj6Zkj_<?j`};r*AqW`e`<m6FWM=fQTXq70((8af_4lg_@zaMb
zz%**bH2UzVy-y-G?ZqEnqQg&<_4<Zk;?(AMGf{HpbEin6hol%GpsmnPoS2U9=5%dK
zHpL3`a&f1;8XaOY&og`Jv}w!n$V$8LvU1LR_(y?XBEGVw5f26F_0f#ME7L1$29oHN
z(NAolb_p-E>A0Iy88}K>V`-3G=B2{RXgU?A+=8tufIx^Xf&Qg4CkN68yd=qXm@tPf
z3;NrHPpsuCM+|L|1rY$8kM9`51D3&UNn2%MVB=0tT^f&4tg|ZB)XW<is5=y)JU2oj
zbYG<!S*@p|y(IAHNF=PvMyeq)5QhUlEr)B&lD!q0g7c)`t_Z|Y1_kyY07fTV`=gL0
z{ov4ulJ7`QlZ6WK9=fHmLanmhcuxWxx6t#5A8NPbEDy9uN02`=3mR2ryUAQEiQH$|
zlnyaBZ>&&D(BnVfIt=yO<~-La0g3zk9Voyyr*l|}=FcW3;DzEdC5=0~I$DQpe34a(
z<<B<OW9aRp3&eKk&z^(iknUom{DE}h9DX3Bk4q0E{)^4+y`;*LvzM6!sdFkA(KdHR
zHB23*)VUS-g*3O?Std<$?!!(D=D8PhrF^*a`FM5at!&Dc9M4z6ma0<bRmwW~d@Zhd
z-ljph_WaFxptaY_YCiup=G3oHNr*MAbq2VRhZD4O@{nad+w6k|PZeHEZEHq}!NUGZ
zwJIfJC8Tsjc;;gN5%4d+06d;w=YT`fVSYVmdfL!13Z_qT`e08M=*vfyRA<nl7*+DK
z_LhzUN#C6bVA&=-pISlYv^Vd`Mz)1HU410=X3#rgR7vmdgJxZLaH5trza^=sV`0Y@
zo>7DE+v3PvqVjXN`WD5s!-VgM&fNUs^=cKnXbE(;`KTm6+J)12_N<AzS9tWa0yUu0
z{Y(L}WVDUB;GKVW&_FEO!wp{e?j^)ze7B*B_Ed&sIO-?0JtRsr=XH|`^#@`4wJN~k
z$c5)qhWTkY*2M28!?0_`{c(zX`4Et4N$G+H;95o<^v$))1*~NRv9?}M5yMGw<FMt6
zll>+-%^&>aQWWKJx^Hdp#Lb^_IEo5-<NX;<s>A1F@h*EFoi-{s<ByhM=BD3Qf%D=&
zfQ_*N*jyY_w=D*x<k!O5g(RLCtI5%-*rkziPSKBN7s19M^@=Y|RQAt-M7!{Wn|C6s
zKmeln<!KVW_|{yyR@ij*t>!C3acNJB1jVd<m?))Gbz~qFHk$LhVK(i>cE2(bw%xKp
z5lE$SFj7kKp-G9)8C9whmzjcQ>t(?kOc0z{n{YuH0c9<n0bykntb>^z1B1073I+#=
z$p`Htd3{zroKQ<a+8|gx>8(;in8QhDisF%F@!3vbC{Rom3+EfwR=))K;B4kWVpN~-
zk>}#|Wi>D#Thb3DWNbxZfr=NSc>R3@wr#Y6%`6)Y7vES6rnqfI@pR^8mAPOj#O}i<
zy-PT|Fv+_x1Y(a?!^EQIFU8tOvrIE6T42p=g~X6tyoLcsCb|Y^7>+h141+lH`pN~w
zR-X1;D}4O9xMYi2i)^{`a(&X|2F3$b$0Yfc5vjmNaRhTYjG*BlJmbYEs5{`|4L8|9
zOSCxNBZ+M>@U191FUGr~nO^}9yO72imcOtL*$N9LWQ7{4plrmhgQ3Mj4gbCfvF!aw
z)ZE<W$MJ9vcfrSOa5y*p#GmlUEPRxMZ@I9vV_!C@d%W>Q=xeh8HW#0K-tLZ4vAO52
zg*R8ok9&RsR?SlQOsNkeEgZe~(-Ii)g-=UFNBWiNb%-f+Pp<>F?E=VLoZ;|x0D58j
z?R!K&?wthC;ts7>hl+4>-6XEzZQzn9^}{A1O&2VQIOSl)0#ft7OyPz$!Fs4az4$8a
z2{OyY&)(1lccdk6f7dqn{<|mi`3A>R7rko-VwO<5c7!?BM!=<BFrf!^+3t0;rz+@-
z)wQpeA?uR0@O?=I;U@TQqMgXRq#o`K%qepn?CRBoz$u%IQ-<%GaLJ?IGLcizAaFMr
z7oI7JSF!fFN1`aJ5YV<BP-XaO5He;8zbk<c<1nn&L&nCUMp${)p8FLL@467Jj4Qr^
zpH1xc-A6@@H{PBH>!2y74#>2%YFNN)O6F-5H9u0XWCxF+u>#<M*nisLv>X&PAq8Xg
zNxa(C6I<xCSFs#n7k)(>e0u(aRv$sUI_ZbC+7R1}4?TWmw%PwPB(crfs|eImjhJ51
zxV;;SVLI)Duv5S`$~iSqhv5_m#B$q(ZrlQe1D=*+07X*?k%3yk7}t0;MCrCcV*{Le
zq!9s5yy4FlyzPZQp&*@Pp+Pzsn?`W++jp8^#xS0Y|80jMV|GLH-mB))Ga!?2nwU^V
z*CSv~Y#qFyR20Q?>1NrexQqi8U}l}2fR4<|Dtyg~O)mj(=&Z}4UY~_c9o1m6To!e7
z44aO?>WF>Na|TOq=E$VJZjhLsXo*;E?!!jVbiN7DLb!5sk=3vyY~dgIc_Ih6Q#H7m
z?E}NOnZtD8J}JYKe9#Y*1Yzao&cR7U?3sGmyk8Mm`a+_Qo4fc8mPYI{uOIMHV!wp?
z#Tv%n07>)1FRP?Hes^H6u`!8mT&m*EYEm<CXLY#aU3}^kcrqc~U+zA5eFY)q=2WZU
zYF9_VjTEL4IFc-`t4LWRgA~Zoupo%NDpyPfG<84DCVJ6*beK|n>h&JbV$?n|0?R?h
zfol?!j}ou@2KIlF&XnUJXck^<qOSoK0cjFt8kcHfA3$u8PSV`886-=Q-mx9%6k--<
zr2YUp+oSlsR0vo^$sR6l<cSy+V~>s1O#)U*O%th3E{vbVtE?*cRJI$d;KwZ)ujaNt
SNw(XHiQqqlVxRZHUH=DQnVj_i

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Digest.crc32 b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Digest.crc32
new file mode 100644
index 000000000000..50cd0ea6af2d
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3625765685
\ No newline at end of file
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Filter.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Partitions.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Rows.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..6dfe025aa605922b6a676eacce8b526e8b4cef8f
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r@@rCA&AM?oQ>IPAycV^upu);$9`spIqm_>1|}9<94yQX
z>(4PW91de<xOOjq+0fLI2PF0?nVI2>Y5<#|k+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
zqtwDnKnvr57QRBVumEVG5jG2r4!-b5w{QlNx(dvP)=Y`Vp+4NeR2mHS;XR-avygoF
j6=-22GsAUWBnzhjEsVluq4A+RO?0);<nV>Ns1^bMYLkIj

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Statistics.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..23e356f1700e5c5b4e9b2a12ba03b01e2c8b502f
GIT binary patch
literal 7185
zcmeI$e@q)y902fpy|$DNDip@g43xpxa4C!*19V#*bOJ_&@x#?$k%Kmr2`y{O1`M5<
zY{`hhI=A7HZTKVN!pI0i9pJ>oO>l0yjVz)@2qqQXj8>P8v5ipg)^_`TpC&H;qsDv5
z<=&^?*Y>@;_wIdj48uqlEs2&fp0bob5&o#=4+(#~#2+lfL^n3aD&$PE#agA|>bOdi
zQB%p;Y@FF(<unG9U6a#jHx^nAMyt_K#98gt_G*jSXw|kfeV2DQbaVOnQ|<f@MZ4YZ
zxxh#w+j(glaZJZv?(32q`EXV?IXfQo?1{MvzKPIGC_*1#p)K?c7QT<DNRb%EH$LCT
za$m)J_M@zztfU-3Sw%URawz2p$_pqjq?|-KmGTP8`IH}{Tt&H=@@~p~l*h0Mbw9{*
zDsHDdge?iDycL_JexKck<I*y0SsJ!{CAJ?mUd0U@SB_u@xUlcqhOK@bJM0PUs8Q^@
zld%_dU~BZ)nftMGj$s$g$1ab=wyni(<*+-c`8%S+aaWAjo^?cGenWAhFU8)E=O=M}
ziQ<wHyq+bWVYB)#P<yEq&wuH;(<m<cv<+GQYZtPj#EtAfXGESi7lN!Z;Q1J=`w_)M
z2f~meCZdoRj1?m<JcI9d(hR=eX^xLj`<0X5BR{kn_wVs2CyKBCS&jTcVGr`&Ygx$s
zC-C(d??=~@D1)xR_C%W}>;tz7*%Ei~7WfJ|bvy-Hl6Vw7UnP4~=V6?UYC!XY9eCOw
z#-$DZtDt31jiK?%E!7P$?$_^}hW6L3xeProW)Q8XK)w1bj0dp=!_Xm_31~ittFFBU
z<I#s+xd?qvdj1jU#IM$^fKEPZIt-m%{@ej*-8U)gpo=?dGoULI-hTjk<MqHP=(bO~
z@}Liu?2Uu|yuIaR=;`-H*8r2nffWPLmASca{gbYuqrUYoG$Za^D!Be7M(4x_;QNxv
z?o~~|terS_0!!o1s0*NHW}4yskzIA{hWm%#l8~v@puM8Kx9MGAW#JpcR^WhV+J1rK
zSA89P5sp9jTz;P#w4cwl*1`QC@}PE51L83!r_}I%#b3%RXa(`5Wv&F64`ihMG^_$~
z?br{4n}8qQ*V7IA^Q0>~1I}l2v98kr+P75nk4yp|_;&nQC-BglZ}q_bI*%ufX5Nky
z^^2!xbASuhU%9dMwi%nZj-SDQ%)un!J3{sUl5=_XJcdc7#*fxG9}{yfbb-X23tbN8
zZ|7W|pAmB|G3NpkO3b<b$()PSvtbvWX9Diu!^nC=)p1u%H_vPD-LZv##~~cPTcv%q
z@#D86!Z@GdnJoVE_bC;cTFz=SaW$H<(g(9j8|#dOe<9M+o076DH7z}3d1h9&SQD>-
zI1j|N@b7*e$ZtNj#D|smz!o<xapV5aBrm=k?Yg(ZpS=kmVR!JK@h?iCK{6-=>)G(w
zM}!Kyk*nd)@eH}J`94e4u15#9D^2MQLmS>{PVDZOb(}QhZ!$5=ohN~J+|u*k%l<06
Y%(c{;?M7>^rKY~tETowmI`lQa12+z-PXGV_

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-TOC.txt b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-TOC.txt
new file mode 100644
index 000000000000..5db770495d06
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_clust_counter/bb-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+TOC.txt
+Rows.db
+Data.db
+Partitions.db
+CompressionInfo.db
+Statistics.db
+Filter.db
+Digest.crc32
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-CompressionInfo.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..dde7f70c204043547344062dff0c93507b11db0b
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$fQpAtDeS02+u1LI3~&

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Data.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..bcb0684d4d0648e30203557b90e67f78802cfb3a
GIT binary patch
literal 92
zcmZ3_z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpI{@17!wj
sBV`7g^HV&O86=IB8FGa9AE@SWs4&Qws4$r7&7Gmb5CGQm=yT&{0E8hKZ2$lO

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Digest.crc32 b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Digest.crc32
new file mode 100644
index 000000000000..48ec9c53b42f
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Digest.crc32
@@ -0,0 +1 @@
+82145779
\ No newline at end of file
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Filter.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Partitions.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..14cfb572e16f58a2422dbc83c749bb6490e817f7
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H77_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1lY$81

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Rows.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Statistics.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..987b32a732d1abeccd10504071a9a1b815574e0e
GIT binary patch
literal 4686
zcmeI$YfKbJ902g0eegKo${onFVtwF)n8Sz^wdz$k1xzEzqtZ7Ecj52>^zO8Xwl)^~
zfi!qEDmC%dHljfzRoV*F(hpLo)m9~@SDQkv$(5>oTx%4uQm(u0j{g6Xe(1+GxlJ}R
zzy1GNW_D+GCdYA{IDJ;K#QlBDRtln=)yl~$=Nrl?aGc>98O?c~ORsfQndCaT(rz<V
z%1)<TV|B<TtKDVF-sG|sIIK2@%~~isT-C1X+8Ud~+}QA4?uppV<yX$PC^t$D1Onqx
ziA36Yb31X5OuE(4#-I9lO!AHmMz8H1A5v1(H1$y101wt`+u)(J5gj5D$I<@L%OSL)
zm7$Eo80#2EGS)LTFpg!Mz<8o&+K;K4X+P7L{JD%5G0tQBB;zW^TNv+V+`-s`O=vr{
z%j8ie?qb}7&Brs|hAlAn7xv+}Sc)xWV23Qm4rTfqb`QsO{n(Ly?1#2v8xLW}J&m0-
zfc;21_Vk0;rZViT{n*)OunQ+(mnUL7S7A5F*saX??YH2#KLwej{Dxgtg1sM)PvZXy
z#ra}9pZU+Rg|aVEz9{1HFJ8WY;?if$$RWSBA%_(Qki*Aq$Pwc)$a*Utj|R&%6p!tS
zLrxe<LZ0X;LY{gNzwfkB{Jt~XpP>B3-tUoDEXD16D#?f9YkoE&zgp0ay!Y;W<j!9F
zd<Hww^CY_H+c2M)bJca={z0}*IeH&Tg`7V-2b!O88hu~+J^Cv!E+o~X@ga0Q7Y^fM
zefSb+=~)lzZ%A!*J&cET`bMC`Ez52}N2GM4`4m-VyaeOXLVh1~Ojary&++=ZZ^3x-
zu{Uo(Po9@|3VO!Zs~171U$UQoUQqt>0cguNb5=ta9bBIYU77mf<Io#_j~a$<{<JL@
z`dIPaDbQcEH12>N`CwofFqs)u(FI+ZlLO~JX)8Pp=RXPT(!=@B+k8VGfxqXy$Coq!
z3ohc@2`o;#Xv~Km9o+)ok95bq8?GOrvtovqg8Z<SlMU|!>k8iPa{x!a(EJPRzy7M>
z2JFA#a$biK<WG>B>fm~ic+}i&0`Zh{!$$bNrrpfVZvyezrT$d7AIQx3u}=@;X3r1Z
zF9AQXul+b|&oll7nQ%PU6j@r`AiuGqv)>DR;M>77t-w9+yxR`j>pPn^ko6!=jIUi7
z%LdM0bNk-52j-kwoqmHQ#x#|u6|JOu6?$2u6|JQAp_kD&LAm%}*>lnM(LEP+8m(yM
zT*hdQE3x373q7FVo(nw=F1Y7H&m*|!3hueUfcSsyxyS|~?)ocS<YRj{>13>N^1_6_
z$^y@OTb189)RT72>>X(Bm9E~QMy0aif<=+8mrAMKpZiH&gOn_2@W0jI1|fcux_c(V
zMtPl5MsVc%=7qIYyH<9+uCvdp?|JcveaG<OG50y^^6mtVoB7Yb;Z&H`%MPbqUS}#T
USw6pHQ=Lt_<-gA7AO2(QpI~2_MgRZ+

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-TOC.txt b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-TOC.txt
new file mode 100644
index 000000000000..5db770495d06
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple/bb-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+TOC.txt
+Rows.db
+Data.db
+Partitions.db
+CompressionInfo.db
+Statistics.db
+Filter.db
+Digest.crc32
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-CompressionInfo.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..fe5bd9a9faf355fc9998dcd0b9853f669f512ee0
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*$j+25D^Fu02A*BzyJUM

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Data.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..25a9967b157abf0e26966f013c1b408077a40e07
GIT binary patch
literal 145
zcmdO4WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13GgtF7|Wr}*t`<H|6p*jKomfpw#Z
zkkb7h8Vs2XjEoG7hE@#uTKo*hLM3b%*v&NrfRv1p4TJr8vks7eA<qLF21#QZhI(y&
oh6kzw9=1S1w|f_C8Dvdt8SM2GW&i~a_}`wv00E4Q>tzF204Jd&I{*Lx

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Digest.crc32 b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Digest.crc32
new file mode 100644
index 000000000000..df0bd76bdd5d
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Digest.crc32
@@ -0,0 +1 @@
+3351188643
\ No newline at end of file
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Filter.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Partitions.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..6a1e5a50993e9a62fccd07076feadb26c714d7d7
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}G^3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G^-;
A3;+NC

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Rows.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Rows.db
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Statistics.db b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-Statistics.db
new file mode 100644
index 0000000000000000000000000000000000000000..25750b93928c939821123767afffeb58ad7cd15b
GIT binary patch
literal 4695
zcmeI$YfMvD902fp?(OAKp`!&sAmUW;K}>N&hR#PPEh{X`U_2&!bD<5zLCbE7BD%Ra
z+y}O>YE(4wu}8uT8i_I|FvmXVU`#ian6VdxB^^_j+07}#$>{E#J^KHjEZJw1p5*47
z-}#^3o^$UxJtyTjPBU*|vdG;|i<FEgSG98S%Jr6V2^^=tL?+BHa2YPS$|%)Il@7bH
zQgXW`r%jfOHiyTUv%zC8kZpF^ZYz{zPqnAo<+RJDrp6zwN8&b?Up&*Q+$cE^2uwvM
z5^?*rt;9P%>w0H9f9#V<addJx=B3k9BT5ssnVQr-z{FbZ8%#<cQPLoB933C64x@!&
z9?m#|v6gWZV;y5X<2c4M7|&!pmvK7d2ZN^LUJ*2XE_qD*rx;f;-o$t(<4(py*o1y>
zu$nx?#666AvH1kXTd)P@{=!}y*OX$5nb=_~vBR11M%==2Z9jIDAN#(o*oJqo<DbD!
z8pOUo1AE>9Y-1UA_CD;Klh}pRvC9*&-A`gSOW1A9{Oz~kxIYD%r2LLuR)W2c&d=M`
z#Q!yl^Tl{Q^Iu>KWnZE88V#QRnhR%9T>QKRIqbJ~<cQ(`a^#d9dD>Jgvd)I*quz1}
z#pAl;k!Oq~A<rBtLY{jLzwh)3{Jt~2pQ83FM}I<oY&GuR(@8!QU;C>8`L%)$<UKc+
zB6pp}&u6#`Jx`*IxdGb~lU3aZ?j*8#&cQoS3gpbG#nAlx6X^Gq-=(_<<3dscnjb><
z^N}#FX^30}Ej~Ad#vA6UZh-OdF5ftGq$T${^t6;7w4S2N4Ci4yM#%4jj?GR(^Ep9x
zb2p49AKG>WdiIjMW6<-zS+fE<<GkYt^s@3-_Cs5~Tf7Fk=s;~2bY<E{k3g^cD|!sN
z<+FAx^r7NCbD+O$ZF(De{P<ulFj){?(G6W`HpBH#+6zy>^-lsjba4IicHhXy;Pd?G
z;Z=>mf`|CF18Y*x8S<egCN{zMBmU{#3HOiig|TC+LHmf-qm3T|YYX1(lYygNY`F@@
zulrVi1&&{TA+OT_+E167>)?Knc+k{i1o4!CF#~*Gsn@Le%^<$0)Sm{=16i5B^yxs{
zH1u=Ndf><Rb{vNNdDg!y3(jX}k)_QG+Ba2n^^XGY|6%xK8*uM??{~od`c9<}X5Wny
z!|P`!bAa>L-ng~pt~qzGOn-w##<VF<3&EV8J!rLt7J@mw53NSO2}<R?dgh|zL2F#8
z44t{?lleCfow?BC;6i6E^gKdmuF#naOo)5y%th*j_{*<yQOkF6;?X$6Y~}c)<^|vH
z-mLt`p<Z-orje^^@>8Q+*>S<9NcT%658j{qLp|pyt)RhotHF99Vb+^!L0Bi%D0wPJ
zE^mC;Rkh=Z?l-iKB@MkVf3R&pyMEF;V6!e);tT%!Kb#6<tt7i0QjM{+<k6)i8|v)A
cTR!lA?ai)wr^havT{ZQ!PW6Mi$8-Mv2a&O%<NyEw

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-TOC.txt b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-TOC.txt
new file mode 100644
index 000000000000..5db770495d06
--- /dev/null
+++ b/test/data/legacy-sstables/bb/legacy_tables/legacy_bb_simple_counter/bb-1-bti-TOC.txt
@@ -0,0 +1,8 @@
+TOC.txt
+Rows.db
+Data.db
+Partitions.db
+CompressionInfo.db
+Statistics.db
+Filter.db
+Digest.crc32
diff --git a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
index e6d8f83ec394..45b3b33e8cd4 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
@@ -66,6 +66,7 @@
 import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.format.ScrubPartitionIterator;
 import org.apache.cassandra.io.util.ChannelProxy;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.repair.RepairParallelism;
@@ -305,6 +306,18 @@ public ChannelProxy getDataChannel()
             throw new RuntimeException();
         }
 
+        @Override
+        public boolean hasIndex()
+        {
+            return false;
+        }
+
+        @Override
+        public ScrubPartitionIterator scrubPartitionsIterator() throws IOException
+        {
+            return null;
+        }
+
         public String toString()
         {
             return "FailingSSTableReader[" + super.toString() + "]";
diff --git a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
index d1f0b3c9250f..a95cc3c94a6d 100644
--- a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
+++ b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
@@ -24,7 +24,6 @@
 import java.util.Iterator;
 import java.util.List;
 import java.util.UUID;
-import java.util.function.Supplier;
 
 import com.google.common.util.concurrent.RateLimiter;
 
@@ -296,7 +295,7 @@ public UnfilteredRowIterator iterator(DecoratedKey key, Slices slices, ColumnFil
     }
 
     @Override
-    public UnfilteredRowIterator simpleIterator(Supplier<FileDataInput> dfile, DecoratedKey key, boolean tombstoneOnly)
+    public UnfilteredRowIterator simpleIterator(FileDataInput dfile, DecoratedKey key, boolean tombstoneOnly)
     {
         return delegate.simpleIterator(dfile, key, tombstoneOnly);
     }
@@ -415,12 +414,6 @@ public boolean isRepaired()
         return delegate.isRepaired();
     }
 
-    @Override
-    public DecoratedKey keyAt(long indexPosition) throws IOException
-    {
-        return delegate.keyAt(indexPosition);
-    }
-
     @Override
     public DecoratedKey keyAt(FileDataInput reader) throws IOException
     {
@@ -613,6 +606,12 @@ public RandomAccessReader openIndexReader()
         return delegate.openIndexReader();
     }
 
+    @Override
+    public RandomAccessReader openKeyComponentReader()
+    {
+        return delegate.openKeyComponentReader();
+    }
+
     @Override
     public ChannelProxy getDataChannel()
     {
@@ -680,7 +679,7 @@ public Ref<SSTableReader> ref()
     }
 
     @Override
-    protected void setup(boolean trackHotness)
+    public void setup(boolean trackHotness)
     {
         delegate.setup(trackHotness);
     }
@@ -715,12 +714,6 @@ public DecoratedKey decorateKey(ByteBuffer key)
         return delegate.decorateKey(key);
     }
 
-    @Override
-    public String getIndexFilename()
-    {
-        return delegate.getIndexFilename();
-    }
-
     @Override
     public String getColumnFamilyName()
     {
@@ -762,4 +755,17 @@ public AbstractBounds<Token> getBounds()
     {
         return delegate.getBounds();
     }
+
+    @Override
+    public boolean hasIndex()
+    {
+        return delegate.hasIndex();
+    }
+
+    @Override
+    public ScrubPartitionIterator scrubPartitionsIterator() throws IOException
+    {
+        return delegate.scrubPartitionsIterator();
+    }
+
 }
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index c103f4b62e03..7d89c68b20e5 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -397,10 +397,10 @@ public static Row getOnlyRow(ReadCommand cmd)
             assert iterator.hasNext() : "Expecting one row in one partition but got nothing";
             try (RowIterator partition = iterator.next())
             {
-                assert !iterator.hasNext() : "Expecting a single partition but got more";
                 assert partition.hasNext() : "Expecting one row in one partition but got an empty partition";
                 Row row = partition.next();
                 assert !partition.hasNext() : "Expecting a single row but got more";
+                assert !iterator.hasNext() : "Expecting a single partition but got more";
                 return row;
             }
         }
diff --git a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
index d07837788f85..399c4cd24f91 100644
--- a/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
+++ b/test/unit/org/apache/cassandra/cache/AutoSavingCacheTest.java
@@ -17,14 +17,17 @@
  */
 package org.apache.cassandra.cache;
 
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.AsciiType;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+
 import org.junit.Assert;
+import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -35,6 +38,8 @@
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.hamcrest.Matchers.is;
+
 public class AutoSavingCacheTest
 {
     private static final String KEYSPACE1 = "AutoSavingCacheTest1";
@@ -43,6 +48,8 @@ public class AutoSavingCacheTest
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
diff --git a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
index 98f15b3db8df..16edfd4efd06 100644
--- a/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
+++ b/test/unit/org/apache/cassandra/cql3/KeyCacheCqlTest.java
@@ -31,6 +31,7 @@
 import org.apache.cassandra.cache.KeyCacheKey;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.index.Index;
@@ -84,6 +85,11 @@ public class KeyCacheCqlTest extends CQLTester
                                      "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
                                      "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789";
 
+    private int cacheInteractionsIfSupported(int cacheInteractionsAssumingCacheIsSupported)
+    {
+        return SSTableFormat.Type.current() == SSTableFormat.Type.BIG ? cacheInteractionsAssumingCacheIsSupported : 0;
+    }
+
     /**
      * Prevent system tables from populating the key cache to ensure that
      * the test can reliably check the size of the key cache size and its metrics.
@@ -261,8 +267,8 @@ private void test2iKeyCachePaths() throws Throwable
 
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(0, hits);
-        assertEquals(210, requests);
+        assertEquals(cacheInteractionsIfSupported(0), hits);
+        assertEquals(cacheInteractionsIfSupported(210), requests);
 
         for (int i = 0; i < 10; i++)
         {
@@ -276,8 +282,8 @@ private void test2iKeyCachePaths() throws Throwable
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(200, hits);
-        assertEquals(420, requests);
+        assertEquals(cacheInteractionsIfSupported(200), hits);
+        assertEquals(cacheInteractionsIfSupported(420), requests);
 
         CacheService.instance.keyCache.submitWrite(Integer.MAX_VALUE).get();
 
@@ -346,8 +352,8 @@ private void test2iKeyCachePathsSaveKeysForDroppedTable() throws Throwable
 
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(0, hits);
-        assertEquals(210, requests);
+        assertEquals(cacheInteractionsIfSupported(0), hits);
+        assertEquals(cacheInteractionsIfSupported(210), requests);
 
         //
 
@@ -363,8 +369,8 @@ private void test2iKeyCachePathsSaveKeysForDroppedTable() throws Throwable
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(200, hits);
-        assertEquals(420, requests);
+        assertEquals(cacheInteractionsIfSupported(200), hits);
+        assertEquals(cacheInteractionsIfSupported(420), requests);
 
         dropTable("DROP TABLE %s");
 
@@ -418,8 +424,8 @@ private void testKeyCacheNonClustered() throws Throwable
         CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(0, hits);
-        assertEquals(10, requests);
+        assertEquals(cacheInteractionsIfSupported(0), hits);
+        assertEquals(cacheInteractionsIfSupported(10), requests);
 
         for (int i = 0; i < 100; i++)
         {
@@ -429,8 +435,8 @@ private void testKeyCacheNonClustered() throws Throwable
 
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(10, hits);
-        assertEquals(120, requests);
+        assertEquals(cacheInteractionsIfSupported(10), hits);
+        assertEquals(cacheInteractionsIfSupported(120), requests);
     }
 
     @Test
@@ -466,8 +472,8 @@ private void testKeyCacheClustered() throws Throwable
         CacheMetrics metrics = CacheService.instance.keyCache.getMetrics();
         long hits = metrics.hits.getCount();
         long requests = metrics.requests.getCount();
-        assertEquals(0, hits);
-        assertEquals(10, requests);
+        assertEquals(cacheInteractionsIfSupported(0), hits);
+        assertEquals(cacheInteractionsIfSupported(10), requests);
 
         // 10 queries, each 50 result rows
         for (int i = 0; i < 10; i++)
@@ -478,8 +484,8 @@ private void testKeyCacheClustered() throws Throwable
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(10, hits);
-        assertEquals(10 + 10, requests);
+        assertEquals(cacheInteractionsIfSupported(10), hits);
+        assertEquals(cacheInteractionsIfSupported(10 + 10), requests);
 
         // 100 queries - must get a hit in key-cache
         for (int i = 0; i < 10; i++)
@@ -494,8 +500,8 @@ private void testKeyCacheClustered() throws Throwable
         metrics = CacheService.instance.keyCache.getMetrics();
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(10 + 100, hits);
-        assertEquals(20 + 100, requests);
+        assertEquals(cacheInteractionsIfSupported(10 + 100), hits);
+        assertEquals(cacheInteractionsIfSupported(20 + 100), requests);
 
         // 5000 queries - first 10 partitions already in key cache
         for (int i = 0; i < 100; i++)
@@ -509,8 +515,8 @@ private void testKeyCacheClustered() throws Throwable
 
         hits = metrics.hits.getCount();
         requests = metrics.requests.getCount();
-        assertEquals(110 + 4910, hits);
-        assertEquals(120 + 5500, requests);
+        assertEquals(cacheInteractionsIfSupported(110 + 4910), hits);
+        assertEquals(cacheInteractionsIfSupported(120 + 5500), requests);
     }
 
     // Inserts 100 partitions split over 10 sstables (flush after 10 partitions).
diff --git a/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
index ec34230d176f..731fdea86d6d 100644
--- a/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
+++ b/test/unit/org/apache/cassandra/cql3/QueryWithIndexedSSTableTest.java
@@ -23,7 +23,7 @@
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -59,7 +59,7 @@ public void queryIndexedSSTableTest() throws Throwable
         boolean hasIndexed = false;
         for (SSTableReader sstable : getCurrentColumnFamilyStore().getLiveSSTables())
         {
-            BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
+            RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
             hasIndexed |= indexEntry != null && indexEntry.isIndexed();
         }
         assert hasIndexed;
diff --git a/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
index 15b4cca35535..a106ba01d91f 100644
--- a/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
+++ b/test/unit/org/apache/cassandra/cql3/TombstonesWithIndexedSSTableTest.java
@@ -19,18 +19,29 @@
 
 import java.util.Random;
 
+import org.junit.Assume;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.Int32Type;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.FileDataInput;
+import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.hamcrest.Matchers.is;
+
 public class TombstonesWithIndexedSSTableTest extends CQLTester
 {
+    @BeforeClass
+    public static void beforeClass()
+    {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+    }
+
     @Test
     public void testTombstoneBoundariesInIndexCached() throws Throwable
     {
@@ -80,9 +91,8 @@ public void testTombstoneBoundariesInIndex(String cacheKeys) throws Throwable
                 BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
                 if (indexEntry != null && indexEntry.isIndexed())
                 {
-                    try (FileDataInput reader = sstable.openIndexReader())
+                    try (BigTableRowIndexEntry.IndexInfoRetriever infoRetriever = indexEntry.openWithIndex(sstable.getIndexFile()))
                     {
-                        BigTableRowIndexEntry.IndexInfoRetriever infoRetriever = indexEntry.openWithIndex(sstable.getIndexFile());
                         ClusteringPrefix<?> firstName = infoRetriever.columnsIndex(1).firstName;
                         if (firstName.kind().isBoundary())
                             break deletionLoop;
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
index 479c215db8a6..5e8ab729afef 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
@@ -264,12 +264,11 @@ public void testBackupAfterFlush() throws Throwable
         new RowUpdateBuilder(cfs.metadata(), 0, ByteBufferUtil.bytes("key2")).clustering("Column1").add("val", "asdf").build().applyUnsafe();
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        for (int version = 1; version <= 2; ++version)
+        for (SSTableReader liveSSTable : cfs.getLiveSSTables())
         {
-            Descriptor existing = new Descriptor(cfs.getDirectories().getDirectoryForNewSSTables(), KEYSPACE2, CF_STANDARD1, version,
-                                                 SSTableFormat.Type.BIG);
-            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), KEYSPACE2, CF_STANDARD1, version, SSTableFormat.Type.BIG);
-            for (Component c : new Component[]{ Component.DATA, Component.PRIMARY_INDEX, Component.FILTER, Component.STATS })
+            Descriptor existing = liveSSTable.descriptor;
+            Descriptor desc = new Descriptor(Directories.getBackupsDirectory(existing), KEYSPACE2, CF_STANDARD1, liveSSTable.descriptor.generation, liveSSTable.descriptor.formatType);
+            for (Component c : liveSSTable.components)
                 assertTrue("Cannot find backed-up file:" + desc.filenameFor(c), new File(desc.filenameFor(c)).exists());
         }
     }
diff --git a/test/unit/org/apache/cassandra/db/KeyCacheTest.java b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
index 1155cc446b45..f4fb1f0444c2 100644
--- a/test/unit/org/apache/cassandra/db/KeyCacheTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyCacheTest.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.CacheService;
@@ -327,6 +328,10 @@ private void assertKeyCacheSize(int expected, String keyspace, String columnFami
             if (k.desc.ksname.equals(keyspace) && k.desc.cfname.equals(columnFamily))
                 size++;
         }
-        assertEquals(expected, size);
+
+        if (SSTableFormat.Type.current() == SSTableFormat.Type.BIG)
+            assertEquals(expected, size);
+        else
+            assertEquals(0, size);
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/KeyspaceTest.java b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
index aaf28db817a6..bb75ab00652d 100644
--- a/test/unit/org/apache/cassandra/db/KeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/KeyspaceTest.java
@@ -21,7 +21,6 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.assertj.core.api.Assertions;
 import org.junit.Test;
 import org.mockito.Mockito;
@@ -36,6 +35,7 @@
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.filter.*;
 import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.metrics.ClearableHistogram;
 import org.apache.cassandra.schema.SchemaProvider;
@@ -409,7 +409,7 @@ public void testGetSliceFromLarge() throws Throwable
 
         // verify that we do indeed have multiple index entries
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
-        BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(Util.dk("0"), SSTableReader.Operator.EQ);
+        RowIndexEntry indexEntry = sstable.getPosition(Util.dk("0"), SSTableReader.Operator.EQ);
         assert indexEntry.columnsIndexCount() > 2;
 
         validateSliceLarge(cfs);
diff --git a/test/unit/org/apache/cassandra/db/ReadCommandTest.java b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
index fc069450767d..8aca305bdef5 100644
--- a/test/unit/org/apache/cassandra/db/ReadCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
@@ -58,6 +58,7 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.big.BigTableReader;
 import org.apache.cassandra.io.util.DataInputBuffer;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.WrappedDataOutputStreamPlus;
@@ -672,7 +673,7 @@ public void testSinglePartitionNamesSkipsOptimisationsIfTrackingRepairedData()
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
         assertEquals(2, sstables.size());
-        Collections.sort(sstables, SSTableReader.maxTimestampDescending);
+        Collections.sort(sstables, BigTableReader.maxTimestampDescending);
 
         ReadCommand readCommand = Util.cmd(cfs, Util.dk("key")).includeRow("dd").columns("a").build();
 
diff --git a/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java b/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java
index bb272afe9f9c..310aa52777bd 100644
--- a/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java
+++ b/test/unit/org/apache/cassandra/db/RepairedDataTombstonesTest.java
@@ -261,6 +261,10 @@ private void verify(int expectedRows, int minVal, int maxVal, boolean includePur
                             }
                         }
                     }
+                    else
+                    {
+                        while (rowIter.hasNext()) rowIter.next();
+                    }
                 }
             }
         }
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 11a20f7f8bb6..53bf6b4d55fe 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -42,6 +42,7 @@
 import com.google.common.collect.Sets;
 import org.apache.commons.lang3.ArrayUtils;
 import org.junit.AfterClass;
+import org.junit.Assume;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -81,6 +82,7 @@
 import org.apache.cassandra.io.sstable.SSTableRewriter;
 import org.apache.cassandra.io.sstable.SSTableTxnWriter;
 import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.format.big.BigTableWriter;
@@ -99,6 +101,7 @@
 import static org.apache.cassandra.SchemaLoader.getCompressionParameters;
 import static org.apache.cassandra.SchemaLoader.loadSchema;
 import static org.apache.cassandra.SchemaLoader.standardCFMD;
+import static org.hamcrest.Matchers.is;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
@@ -193,7 +196,9 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
         //make sure to override at most 1 chunk when compression is enabled
-        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
+        //use 0x00 instead of the usual 0x7A because if by any chance it's able to iterate over the corrupt
+        //section, then we get many out-of-order errors, which we don't want
+        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"), (byte)0x00);
 
         // with skipCorrupted == false, the scrub is expected to fail
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(Collections.singletonList(sstable), OperationType.SCRUB);
@@ -353,12 +358,14 @@ public void testScrubOneRowWithCorruptedKey() throws IOException, ExecutionExcep
         assertOrderedAll(cfs, 4);
 
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
-        overrideWithGarbage(sstable, 0, 2);
+        overrideWithGarbage(sstable, 0, 2, (byte)0x7A);
 
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
         // check data is still there
-        assertOrderedAll(cfs, 4);
+        // For Trie format we won't be able to recover the damaged partition key (partition index doesn't store the
+        // whole key)
+        assertOrderedAll(cfs, SSTableFormat.Type.current() == SSTableFormat.Type.BTI ? 3 : 4);
     }
 
     @Test
@@ -403,7 +410,19 @@ public void testScrubNoIndex() throws ExecutionException, InterruptedException,
         assertOrderedAll(cfs, 10);
 
         for (SSTableReader sstable : cfs.getLiveSSTables())
-            assertTrue(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete());
+        {
+            switch (sstable.descriptor.getFormat().getType()) {
+                case BIG:
+                    assertTrue(new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete());
+                    break;
+                case BTI:
+                    assertTrue(new File(sstable.descriptor.filenameFor(Component.PARTITION_INDEX)).delete());
+                    new File(sstable.descriptor.filenameFor(Component.ROW_INDEX)).delete(); // row index is optional
+                    break;
+                default:
+                    fail("Unknonw SSTable format");
+            }
+        }
 
         CompactionManager.instance.performScrub(cfs, false, true, 2);
 
@@ -414,6 +433,11 @@ public void testScrubNoIndex() throws ExecutionException, InterruptedException,
     @Test
     public void testScrubOutOfOrder() throws IOException
     {
+        // Run only for the Big Table format, because that format does not complain if partitions are given in an
+        // invalid order. Legacy SSTables with out-of-order partitions exist in production systems and must be corrected
+        // by scrubbing. The trie index format does not permit such partitions.
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+
         // This test assumes ByteOrderPartitioner to create out-of-order SSTable
         IPartitioner oldPartitioner = DatabaseDescriptor.getPartitioner();
         DatabaseDescriptor.setPartitionerUnsafe(new ByteOrderedPartitioner());
@@ -447,7 +471,7 @@ public void testScrubOutOfOrder() throws IOException
 
             try
             {
-                SSTableReader.open(desc, cfs.metadata);
+                desc.getFormat().getReaderFactory().open(desc, cfs.metadata);
                 fail("SSTR validation should have caught the out-of-order rows");
             }
             catch (CorruptSSTableException ise)
@@ -464,7 +488,7 @@ public void testScrubOutOfOrder() throws IOException
             components.add(Component.SUMMARY);
             components.add(Component.TOC);
 
-            SSTableReader sstable = SSTableReader.openNoValidation(desc, components, cfs);
+            SSTableReader sstable = desc.getFormat().getReaderFactory().openNoValidation(desc, components, cfs);
             if (sstable.last.compareTo(sstable.first) < 0)
                 sstable.last = sstable.first;
 
@@ -535,7 +559,7 @@ private void overrideWithGarbage(String path, long startPosition, long endPositi
         try (RandomAccessFile file = new RandomAccessFile(path, "rw"))
         {
             file.seek(startPosition);
-            int length = (int) (endPosition - startPosition);
+            int length = (int)(endPosition - startPosition);
             byte[] buff = new byte[length];
             Arrays.fill(buff, junk);
             file.write(buff, 0, length);
@@ -743,7 +767,7 @@ private void testScrubIndex(String cfName, String colName, boolean composite, bo
                 boolean failure = !scrubs[i];
                 if (failure)
                 { //make sure the next scrub fails
-                    overrideWithGarbage(indexCfs.getLiveSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L));
+                    overrideWithGarbage(indexCfs.getLiveSSTables().iterator().next(), ByteBufferUtil.bytes(1L), ByteBufferUtil.bytes(2L), (byte)0x7A);
                 }
                 CompactionManager.AllSSTableOpStatus result = indexCfs.scrub(false, false, false, true, false, 0);
                 assertEquals(failure ?
@@ -864,7 +888,7 @@ public void testSkipScrubCorruptedCounterRowWithTool() throws IOException, Write
         assertEquals(1, cfs.getLiveSSTables().size());
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
+        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"), (byte)0x7A);
 
         // with skipCorrupted == true, the corrupt rows will be skipped
         ToolResult tool = ToolRunner.invokeClass(StandaloneScrubber.class, "-s", ksName, COUNTER_CF);
@@ -886,7 +910,9 @@ public void testNoSkipScrubCorruptedCounterRowWithTool() throws IOException, Wri
         assertEquals(1, cfs.getLiveSSTables().size());
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
 
-        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"));
+        //use 0x00 instead of the usual 0x7A because if by any chance it's able to iterate over the corrupt
+        //section, then we get many out-of-order errors, which we don't want
+        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"), (byte) 0x0);
 
         // with skipCorrupted == false, the scrub is expected to fail
         try
diff --git a/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java b/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java
index c9649b15a720..6a41659899db 100644
--- a/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java
+++ b/test/unit/org/apache/cassandra/db/SerializationHeaderTest.java
@@ -111,8 +111,8 @@ public void testWrittenAsDifferentKind() throws Exception
 
             Descriptor sstableWithRegular = writer.apply(schemaWithRegular, BufferClustering::new).call();
             Descriptor sstableWithStatic = writer.apply(schemaWithStatic, value -> Clustering.STATIC_CLUSTERING).call();
-            SSTableReader readerWithStatic = SSTableReader.openNoValidation(sstableWithStatic, TableMetadataRef.forOfflineTools(schemaWithRegular));
-            SSTableReader readerWithRegular = SSTableReader.openNoValidation(sstableWithRegular, TableMetadataRef.forOfflineTools(schemaWithStatic));
+            SSTableReader readerWithStatic = sstableWithStatic.getFormat().getReaderFactory().openNoValidation(sstableWithStatic, TableMetadataRef.forOfflineTools(schemaWithRegular));
+            SSTableReader readerWithRegular = sstableWithStatic.getFormat().getReaderFactory().openNoValidation(sstableWithRegular, TableMetadataRef.forOfflineTools(schemaWithStatic));
 
             try (ISSTableScanner partitions = readerWithStatic.getScanner()) {
                 for (int i = 0 ; i < 5 ; ++i)
diff --git a/test/unit/org/apache/cassandra/db/VerifyTest.java b/test/unit/org/apache/cassandra/db/VerifyTest.java
index 9ca98b8e7504..b6bdab8cb7d7 100644
--- a/test/unit/org/apache/cassandra/db/VerifyTest.java
+++ b/test/unit/org/apache/cassandra/db/VerifyTest.java
@@ -26,7 +26,7 @@
 import org.apache.cassandra.batchlog.BatchlogManager;
 import org.apache.cassandra.cache.ChunkCache;
 import org.apache.cassandra.UpdateBuilder;
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.Verifier;
 import org.apache.cassandra.db.marshal.UUIDType;
@@ -37,9 +37,9 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.io.FSWriteError;
-import org.apache.cassandra.io.compress.CorruptBlockException;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.CorruptSSTableException;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.locator.InetAddressAndPort;
@@ -48,13 +48,15 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.UUIDGen;
+
 import org.apache.commons.lang3.StringUtils;
+import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
 import java.io.*;
-import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Files;
 import java.util.ArrayList;
@@ -69,6 +71,7 @@
 import static org.apache.cassandra.SchemaLoader.createKeyspace;
 import static org.apache.cassandra.SchemaLoader.loadSchema;
 import static org.apache.cassandra.SchemaLoader.standardCFMD;
+import static org.hamcrest.Matchers.is;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -98,6 +101,8 @@ public class VerifyTest
     public static void defineSchema() throws ConfigurationException
     {
         CompressionParams compressionParameters = CompressionParams.snappy(32768);
+        DatabaseDescriptor.daemonInitialization();
+        DatabaseDescriptor.setColumnIndexSize(0);
 
         loadSchema();
         createKeyspace(KEYSPACE,
@@ -518,10 +523,26 @@ public void testMutateRepair() throws IOException, ExecutionException, Interrupt
     }
 
     @Test
-    public void testVerifyIndex() throws IOException
+    public void testVerifyPrimaryIndex() throws IOException
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
         testBrokenComponentHelper(Component.PRIMARY_INDEX);
     }
+
+    @Test
+    public void testVerifyPartitionIndex() throws IOException
+    {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BTI));
+        testBrokenComponentHelper(Component.PARTITION_INDEX);
+    }
+
+    @Test
+    public void testVerifyRowIndex() throws IOException
+    {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BTI));
+        testBrokenComponentHelper(Component.ROW_INDEX);
+    }
+
     @Test
     public void testVerifyBf() throws IOException
     {
@@ -531,6 +552,7 @@ public void testVerifyBf() throws IOException
     @Test
     public void testVerifyIndexSummary() throws IOException
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
         testBrokenComponentHelper(Component.SUMMARY);
     }
 
@@ -693,7 +715,7 @@ public void testVerifyLocalPartitioner() throws UnknownHostException
         tmd.updateNormalToken(new ByteOrderedPartitioner.BytesToken(tk1), InetAddressAndPort.getByName("127.0.0.1"));
         tmd.updateNormalToken(new ByteOrderedPartitioner.BytesToken(tk2), InetAddressAndPort.getByName("127.0.0.2"));
         // write some bogus to a localpartitioner table
-        Batch bogus = Batch.createLocal(UUID.randomUUID(), 0, Collections.emptyList());
+        Batch bogus = Batch.createLocal(UUIDGen.getTimeUUID(), 0, Collections.emptyList());
         BatchlogManager.store(bogus);
         ColumnFamilyStore cfs = Keyspace.open("system").getColumnFamilyStore("batches");
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
diff --git a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
index faa7b99f4fec..2b5557ac16ad 100644
--- a/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/streaming/CassandraStreamManagerTest.java
@@ -134,7 +134,7 @@ private static Set<SSTableReader> sstablesFromStreams(Collection<OutgoingStream>
         Set<SSTableReader> sstables = new HashSet<>();
         for (OutgoingStream stream: streams)
         {
-            Ref<SSTableReader> ref = CassandraOutgoingFile.fromStream(stream).getRef();
+            Ref<? extends SSTableReader> ref = CassandraOutgoingFile.fromStream(stream).getRef();
             sstables.add(ref.get());
             ref.release();
         }
diff --git a/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java b/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
index 794738a1f794..6c2a61df7244 100644
--- a/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/view/IndexViewManagerTest.java
@@ -152,7 +152,7 @@ public void testConcurrentUpdate() throws Throwable
 
         List<SSTableReader> sstables = IntStream.rangeClosed(1, 4)
                                                 .mapToObj(i -> new Descriptor(tmpDir.toFile(), KEYSPACE, tableName, i))
-                                                .map(SSTableReader::open)
+                                                .map(desc -> desc.getFormat().getReaderFactory().open(desc))
                                                 .collect(Collectors.toList());
 
         List<SSTableReader> none = Collections.emptyList();
diff --git a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
index e67a4c3de4cd..5f354ed3cb3f 100644
--- a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
+++ b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java
@@ -26,6 +26,7 @@
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import org.junit.Assert;
+import org.junit.Assume;
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
@@ -36,10 +37,13 @@
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.io.sstable.IndexSummaryManager;
 import org.apache.cassandra.io.sstable.IndexSummaryRedistribution;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.hamcrest.Matchers.is;
+
 public class DiskSpaceMetricsTest extends CQLTester
 {
     /**
@@ -66,6 +70,8 @@ public void baseline() throws Throwable
     @Test
     public void summaryRedistribution() throws Throwable
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+
         createTable("CREATE TABLE %s (pk bigint, PRIMARY KEY (pk)) WITH min_index_interval=1");
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java
index 9e3594bc0fa5..eac04fa161e1 100644
--- a/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/BigTableWriterTest.java
@@ -21,6 +21,7 @@
 import java.io.File;
 import java.io.IOException;
 
+import org.junit.Assume;
 import org.junit.BeforeClass;
 
 import org.junit.Assert;
@@ -31,8 +32,10 @@
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.concurrent.AbstractTransactionalTest;
+import org.hamcrest.Matchers;
 
 public class BigTableWriterTest extends AbstractTransactionalTest
 {
@@ -44,6 +47,7 @@ public class BigTableWriterTest extends AbstractTransactionalTest
     @BeforeClass
     public static void defineSchema() throws Exception
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), Matchers.is(SSTableFormat.Type.BIG));
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
index bff6a547edeb..4585f0a8e10b 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
@@ -28,6 +28,7 @@
 import com.google.common.base.Joiner;
 import com.google.common.collect.Sets;
 import org.junit.After;
+import org.junit.Assume;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -42,7 +43,6 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.RowUpdateBuilder;
-import org.apache.cassandra.db.compaction.AntiCompactionTest;
 import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.CompactionManager;
@@ -50,14 +50,15 @@
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.metrics.CompactionMetrics;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.schema.MigrationManager;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.hamcrest.Matchers;
 
 import static com.google.common.collect.ImmutableMap.of;
 import static java.util.Arrays.asList;
@@ -65,6 +66,7 @@
 import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
 import static org.apache.cassandra.io.sstable.IndexSummaryRedistribution.DOWNSAMPLE_THESHOLD;
 import static org.apache.cassandra.io.sstable.IndexSummaryRedistribution.UPSAMPLE_THRESHOLD;
+import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
@@ -88,6 +90,8 @@ public class IndexSummaryManagerTest
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), Matchers.is(SSTableFormat.Type.BIG));
+
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
@@ -224,7 +228,7 @@ public void testChangeMinIndexInterval() throws IOException
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -234,7 +238,7 @@ public void testChangeMinIndexInterval() throws IOException
         // double the min_index_interval
         MigrationManager.announceTableUpdate(cfs.metadata().unbuild().minIndexInterval(originalMinIndexInterval * 2).build(), true);
         IndexSummaryManager.instance.redistributeSummaries();
-        for (SSTableReader sstable : cfs.getLiveSSTables())
+        for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getLiveSSTables()))
         {
             assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
             assertEquals(numRows / cfs.metadata().params.minIndexInterval, sstable.getIndexSummarySize());
@@ -243,7 +247,7 @@ public void testChangeMinIndexInterval() throws IOException
         // return min_index_interval to its original value
         MigrationManager.announceTableUpdate(cfs.metadata().unbuild().minIndexInterval(originalMinIndexInterval).build(), true);
         IndexSummaryManager.instance.redistributeSummaries();
-        for (SSTableReader sstable : cfs.getLiveSSTables())
+        for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getLiveSSTables()))
         {
             assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
             assertEquals(numRows / cfs.metadata().params.minIndexInterval, sstable.getIndexSummarySize());
@@ -252,14 +256,14 @@ public void testChangeMinIndexInterval() throws IOException
         // halve the min_index_interval, but constrain the available space to exactly what we have now; as a result,
         // the summary shouldn't change
         MigrationManager.announceTableUpdate(cfs.metadata().unbuild().minIndexInterval(originalMinIndexInterval / 2).build(), true);
-        SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        SSTableReader sstable = selectOnlyBigTableReaders(cfs.getLiveSSTables()).iterator().next();
         long summarySpace = sstable.getIndexSummaryOffHeapSize();
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(asList(sstable), OperationType.UNKNOWN))
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), summarySpace);
         }
 
-        sstable = cfs.getLiveSSTables().iterator().next();
+        sstable = selectOnlyBigTableReaders(cfs.getLiveSSTables()).iterator().next();
         assertEquals(originalMinIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
         assertEquals(numRows / originalMinIndexInterval, sstable.getIndexSummarySize());
 
@@ -270,7 +274,7 @@ public void testChangeMinIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), (long) Math.ceil(summarySpace * 1.5));
         }
-        sstable = cfs.getLiveSSTables().iterator().next();
+        sstable = selectOnlyBigTableReaders(cfs.getLiveSSTables()).iterator().next();
         assertEquals(previousSize * 1.5, (double) sstable.getIndexSummarySize(), 1);
         assertEquals(previousInterval * (1.0 / 1.5), sstable.getEffectiveIndexInterval(), 0.001);
 
@@ -281,7 +285,7 @@ public void testChangeMinIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), (long) Math.ceil(summarySpace / 2.0));
         }
-        sstable = cfs.getLiveSSTables().iterator().next();
+        sstable = selectOnlyBigTableReaders(cfs.getLiveSSTables()).iterator().next();
         assertEquals(originalMinIndexInterval * 2, sstable.getEffectiveIndexInterval(), 0.001);
         assertEquals(numRows / (originalMinIndexInterval * 2), sstable.getIndexSummarySize());
 
@@ -294,7 +298,7 @@ public void testChangeMinIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), 10);
         }
-        sstable = cfs.getLiveSSTables().iterator().next();
+        sstable = selectOnlyBigTableReaders(cfs.getLiveSSTables()).iterator().next();
         assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
     }
 
@@ -309,7 +313,7 @@ public void testChangeMaxIndexInterval() throws IOException
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -317,7 +321,7 @@ public void testChangeMaxIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), 10);
         }
-        sstables = new ArrayList<>(cfs.getLiveSSTables());
+        sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             assertEquals(cfs.metadata().params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
 
@@ -327,7 +331,7 @@ public void testChangeMaxIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), 1);
         }
-        sstables = new ArrayList<>(cfs.getLiveSSTables());
+        sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
         {
             assertEquals(cfs.metadata().params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
@@ -340,7 +344,7 @@ public void testChangeMaxIndexInterval() throws IOException
         {
             redistributeSummaries(Collections.EMPTY_LIST, of(cfs.metadata.id, txn), 1);
         }
-        for (SSTableReader sstable : cfs.getLiveSSTables())
+        for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getLiveSSTables()))
         {
             assertEquals(cfs.metadata().params.maxIndexInterval, sstable.getEffectiveIndexInterval(), 0.01);
             assertEquals(numRows / cfs.metadata().params.maxIndexInterval, sstable.getIndexSummarySize());
@@ -360,7 +364,7 @@ public void testRedistributeSummaries() throws IOException
 
         int minSamplingLevel = (BASE_SAMPLING_LEVEL * cfs.metadata().params.minIndexInterval) / cfs.metadata().params.maxIndexInterval;
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -523,7 +527,7 @@ public void testRebuildAtSamplingLevel() throws IOException
 
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         assertEquals(1, sstables.size());
         SSTableReader original = sstables.get(0);
 
@@ -629,7 +633,7 @@ public void testCancelIndexHelper(Consumer<ColumnFamilyStore> cancelFunction) th
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> allSSTables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> allSSTables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         List<SSTableReader> sstables = allSSTables.subList(0, 4);
         List<SSTableReader> compacting = allSSTables.subList(4, 8);
 
@@ -703,7 +707,7 @@ public void run()
         assertTrue("Expected no active compactions", CompactionManager.instance.active.getCompactions().isEmpty());
 
         Set<SSTableReader> beforeRedistributionSSTables = new HashSet<>(allSSTables);
-        Set<SSTableReader> afterCancelSSTables = new HashSet<>(cfs.getLiveSSTables());
+        Set<SSTableReader> afterCancelSSTables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toSet());
         Set<SSTableReader> disjoint = Sets.symmetricDifference(beforeRedistributionSSTables, afterCancelSSTables);
         assertTrue(String.format("Mismatched files before and after cancelling redistribution: %s",
                                  Joiner.on(",").join(disjoint)),
@@ -723,7 +727,7 @@ public void testPauseIndexSummaryManager() throws Exception
         int numRows = 256;
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
index 548274fd5a86..3fe5d082160b 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java
@@ -24,6 +24,7 @@
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.Future;
+import java.util.stream.Collectors;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -40,6 +41,7 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.schema.MigrationManager;
 
+import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
 import static org.junit.Assert.assertEquals;
 
 public class IndexSummaryRedistributionTest
@@ -72,7 +74,7 @@ public void testMetricsLoadAfterRedistribution() throws IOException
         StorageMetrics.load.dec(load); // reset the load metric
         createSSTables(ksname, cfname, numSSTables, numRows);
 
-        List<SSTableReader> sstables = new ArrayList<>(cfs.getLiveSSTables());
+        List<SSTableReader> sstables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toList());
         for (SSTableReader sstable : sstables)
             sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0));
 
@@ -92,7 +94,7 @@ public void testMetricsLoadAfterRedistribution() throws IOException
         IndexSummaryManager.instance.redistributeSummaries();
 
         long newSize = 0;
-        for (SSTableReader sstable : cfs.getLiveSSTables())
+        for (SSTableReader sstable : selectOnlyBigTableReaders(cfs.getLiveSSTables()))
         {
             assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001);
             assertEquals(numRows / cfs.metadata().params.minIndexInterval, sstable.getIndexSummarySize());
diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
index 579fc15abd7b..da61fb81fdf0 100644
--- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
@@ -21,45 +21,38 @@
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Random;
 import java.util.UUID;
 
-import com.google.common.collect.Lists;
 import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SinglePartitionSliceCommandTest;
 import org.apache.cassandra.db.compaction.AbstractCompactionTask;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.Verifier;
 import org.apache.cassandra.db.repair.PendingAntiCompaction;
-import org.apache.cassandra.db.streaming.CassandraOutgoingFile;
-import org.apache.cassandra.db.ReadExecutionController;
-import org.apache.cassandra.db.SinglePartitionReadCommand;
-import org.apache.cassandra.db.SinglePartitionSliceCommandTest;
-import org.apache.cassandra.db.compaction.Verifier;
-import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.rows.RangeTombstoneMarker;
 import org.apache.cassandra.db.rows.Unfiltered;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.streaming.CassandraOutgoingFile;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
@@ -68,14 +61,11 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
-import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
-import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.OutgoingStream;
-import org.apache.cassandra.streaming.StreamPlan;
-import org.apache.cassandra.streaming.StreamSession;
 import org.apache.cassandra.streaming.StreamOperation;
+import org.apache.cassandra.streaming.StreamPlan;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
@@ -102,7 +92,7 @@ public class LegacySSTableTest
      * See {@link #testGenerateSstables()} to generate sstables.
      * Take care on commit as you need to add the sstable files using {@code git add -f}
      */
-    public static final String[] legacyVersions = {"nb", "na", "me", "md", "mc", "mb", "ma"};
+    public static final String[] legacyVersions = {"nb", "na", "me", "md", "mc", "mb", "ma", "aa", "ac", "ad", "ba", "bb"};
 
     // 1200 chars
     static final String longString = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
@@ -150,14 +140,10 @@ public void tearDown()
     /**
      * Get a descriptor for the legacy sstable at the given version.
      */
-    protected Descriptor getDescriptor(String legacyVersion, String table)
+    protected Descriptor getDescriptor(String legacyVersion, String table) throws IOException
     {
-        return new Descriptor(SSTableFormat.Type.BIG.info.getVersion(legacyVersion),
-                              getTableDir(legacyVersion, table),
-                              "legacy_tables",
-                              table,
-                              1,
-                              SSTableFormat.Type.BIG);
+        Path file = Files.list(getTableDir(legacyVersion, table).toPath()).findFirst().orElseThrow(() -> new RuntimeException(String.format("No files for version=%s and table=%s", legacyVersion, table)));
+        return Descriptor.fromFilename(file.toFile());
     }
 
     @Test
@@ -298,7 +284,8 @@ private void doTestLegacyCqlTables() throws Exception
             CacheService.instance.invalidateKeyCache();
             long startCount = CacheService.instance.keyCache.size();
             verifyReads(legacyVersion);
-            verifyCache(legacyVersion, startCount);
+            if (Keyspace.open("legacy_tables").getColumnFamilyStore(String.format("legacy_%s_simple", legacyVersion)).getLiveSSTables().stream().anyMatch(sstr -> sstr.descriptor.formatType.info.getType() == SSTableFormat.Type.BIG))
+                verifyCache(legacyVersion, startCount);
             compactLegacyTables(legacyVersion);
         }
     }
@@ -432,7 +419,8 @@ private void streamLegacyTables(String legacyVersion) throws Exception
     private void streamLegacyTable(String tablePattern, String legacyVersion) throws Exception
     {
         String table = String.format(tablePattern, legacyVersion);
-        SSTableReader sstable = SSTableReader.open(getDescriptor(legacyVersion, table));
+        Descriptor descriptor = getDescriptor(legacyVersion, table);
+        SSTableReader sstable = descriptor.formatType.info.getReaderFactory().open(getDescriptor(legacyVersion, table));
         IPartitioner p = sstable.getPartitioner();
         List<Range<Token>> ranges = new ArrayList<>();
         ranges.add(new Range<>(p.getMinimumToken(), p.getToken(ByteBufferUtil.bytes("100"))));
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 4e7bae6f1e88..4eac572f204a 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -27,11 +27,14 @@
 import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.*;
+import java.util.stream.Collectors;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.After;
 import org.junit.Assert;
+import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -58,6 +61,7 @@
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.FSReadError;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
@@ -75,8 +79,11 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FilterFactory;
 import org.apache.cassandra.utils.IFilter;
+import org.apache.cassandra.utils.PageAware;
 
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
+import static org.hamcrest.Matchers.is;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
@@ -245,8 +252,9 @@ private boolean closeEstimation(long expected, long estimated)
     @Test
     public void testSpannedIndexPositions() throws IOException
     {
+        // expect to create many regions - that is, the size of index must exceed the page size multiple times
         int originalMaxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE;
-        MmappedRegions.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
+        MmappedRegions.MAX_SEGMENT_SIZE = PageAware.PAGE_SIZE;
 
         try
         {
@@ -256,7 +264,7 @@ public void testSpannedIndexPositions() throws IOException
 
             // insert a bunch of data and compact to a single sstable
             CompactionManager.instance.disableAutoCompaction();
-            for (int j = 0; j < 100; j += 2)
+            for (int j = 0; j < 10000; j += 2)
             {
                 new RowUpdateBuilder(store.metadata(), j, String.valueOf(j))
                 .clustering("0")
@@ -269,7 +277,7 @@ public void testSpannedIndexPositions() throws IOException
 
             // check that all our keys are found correctly
             SSTableReader sstable = store.getLiveSSTables().iterator().next();
-            for (int j = 0; j < 100; j += 2)
+            for (int j = 0; j < 10000; j += 2)
             {
                 DecoratedKey dk = Util.dk(String.valueOf(j));
                 FileDataInput file = sstable.getFileDataInput(sstable.getPosition(dk, SSTableReader.Operator.EQ).position);
@@ -278,7 +286,7 @@ public void testSpannedIndexPositions() throws IOException
             }
 
             // check no false positives
-            for (int j = 1; j < 110; j += 2)
+            for (int j = 1; j < 11000; j += 2)
             {
                 DecoratedKey dk = Util.dk(String.valueOf(j));
                 assert sstable.getPosition(dk, SSTableReader.Operator.EQ) == null;
@@ -438,7 +446,6 @@ public void testGetPositionsKeyCacheStats()
 
     }
 
-
     @Test
     public void testOpeningSSTable() throws Exception
     {
@@ -474,9 +481,10 @@ public void testOpeningSSTable() throws Exception
 
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
         Descriptor desc = sstable.descriptor;
+        boolean hasSummary = desc.getFormat().supportedComponents().contains(Component.SUMMARY);
 
         // test to see if sstable can be opened as expected
-        SSTableReader target = SSTableReader.open(desc);
+        SSTableReader target = desc.getFormat().getReaderFactory().open(desc);
         assert target.first.equals(firstKey);
         assert target.last.equals(lastKey);
 
@@ -487,36 +495,36 @@ public void testOpeningSSTable() throws Exception
         Path summaryPath = summaryFile.toPath();
 
         long bloomModified = Files.getLastModifiedTime(bloomPath).toMillis();
-        long summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
+        long summaryModified = hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0;
 
         TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
 
         // Offline tests
         // check that bloomfilter/summary ARE NOT regenerated
-        target = SSTableReader.openNoValidation(desc, store.metadata);
+        target = desc.getFormat().getReaderFactory().openNoValidation(desc, store.metadata);
 
         assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
-        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+        assertEquals(summaryModified, hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0);
 
         target.selfRef().release();
 
         // check that bloomfilter/summary ARE NOT regenerated and BF=AlwaysPresent when filter component is missing
         Set<Component> components = SSTable.discoverComponentsFor(desc);
         components.remove(Component.FILTER);
-        target = SSTableReader.openNoValidation(desc, components, store);
+        target = desc.getFormat().getReaderFactory().openNoValidation(desc, components, store);
 
         assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
-        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+        assertEquals(summaryModified, hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0);
         assertEquals(FilterFactory.AlwaysPresent, target.getBloomFilter());
 
         target.selfRef().release();
 
         // #### online tests ####
         // check that summary & bloomfilter are not regenerated when SSTable is opened and BFFP has been changed
-        target = SSTableReader.open(desc, store.metadata);
+        target = desc.getFormat().getReaderFactory().open(desc, store.metadata);
 
         assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
-        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+        assertEquals(summaryModified, hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0);
 
         target.selfRef().release();
 
@@ -524,25 +532,26 @@ public void testOpeningSSTable() throws Exception
         components = SSTable.discoverComponentsFor(desc);
         components.remove(Component.FILTER);
 
-        target = SSTableReader.open(desc, components, store.metadata);
+        target = desc.getFormat().getReaderFactory().open(desc, components, store.metadata);
 
         assertTrue("Bloomfilter was not recreated", bloomModified < Files.getLastModifiedTime(bloomPath).toMillis());
-        assertTrue("Summary was not recreated", summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
+        assertTrue("Summary was not recreated", !hasSummary || summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
 
         target.selfRef().release();
 
         // check that only the summary is regenerated when it is deleted
         components.add(Component.FILTER);
-        summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
-        summaryFile.delete();
+        summaryModified = hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0;
+        if (hasSummary)
+            summaryFile.delete();
 
         TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
         bloomModified = Files.getLastModifiedTime(bloomPath).toMillis();
 
-        target = SSTableReader.open(desc, components, store.metadata);
+        target = desc.getFormat().getReaderFactory().open(desc, components, store.metadata);
 
         assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
-        assertTrue("Summary was not recreated", summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
+        assertTrue("Summary was not recreated", !hasSummary || summaryModified < Files.getLastModifiedTime(summaryPath).toMillis());
 
         target.selfRef().release();
 
@@ -550,12 +559,12 @@ public void testOpeningSSTable() throws Exception
         components.add(Component.SUMMARY);
         components.remove(Component.PRIMARY_INDEX);
 
-        summaryModified = Files.getLastModifiedTime(summaryPath).toMillis();
-        target = SSTableReader.open(desc, components, store.metadata, false, false);
+        summaryModified = hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0;
+        target = desc.getFormat().getReaderFactory().open(desc, components, store.metadata, false, false);
 
         TimeUnit.MILLISECONDS.sleep(1000); // sleep to ensure modified time will be different
         assertEquals(bloomModified, Files.getLastModifiedTime(bloomPath).toMillis());
-        assertEquals(summaryModified, Files.getLastModifiedTime(summaryPath).toMillis());
+        assertEquals(summaryModified, hasSummary ? Files.getLastModifiedTime(summaryPath).toMillis() : 0);
 
         target.selfRef().release();
     }
@@ -563,6 +572,8 @@ public void testOpeningSSTable() throws Exception
     @Test
     public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore("Indexed1");
 
@@ -577,11 +588,11 @@ public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
         for(ColumnFamilyStore indexCfs : store.indexManager.getAllIndexColumnFamilyStores())
         {
             assert indexCfs.isIndex();
-            SSTableReader sstable = indexCfs.getLiveSSTables().iterator().next();
+            SSTableReader sstable = (SSTableReader) indexCfs.getLiveSSTables().iterator().next();
             assert sstable.first.getToken() instanceof LocalToken;
 
             SSTableReader.saveSummary(sstable.descriptor, sstable.first, sstable.last, sstable.indexSummary);
-            SSTableReader reopened = SSTableReader.open(sstable.descriptor);
+            SSTableReader reopened = sstable.descriptor.getFormat().getReaderFactory().open(sstable.descriptor);
             assert reopened.first.getToken() instanceof LocalToken;
             reopened.selfRef().release();
         }
@@ -605,10 +616,13 @@ public void testGetScannerForNoIntersectingRanges() throws Exception
         boolean foundScanner = false;
         for (SSTableReader s : store.getLiveSSTables())
         {
-            try (ISSTableScanner scanner = s.getScanner(new Range<Token>(t(0), t(1))))
+            try (ISSTableScanner scanner = s.getScanner(new Range<>(t(0), t(1))))
             {
-                scanner.next(); // throws exception pre 5407
-                foundScanner = true;
+                if (scanner.hasNext())
+                {
+                    scanner.next(); // throws exception pre 5407
+                    foundScanner = true;
+                }
             }
         }
         assertTrue(foundScanner);
@@ -648,10 +662,10 @@ public void testGetPositionsForRangesFromTableOpenedForBulkLoading() throws IOEx
         assert sections.size() == 1 : "Expected to find range in sstable" ;
 
         // re-open the same sstable as it would be during bulk loading
-        Set<Component> components = Sets.newHashSet(Component.DATA, Component.PRIMARY_INDEX);
+        Set<Component> components = Sets.newHashSet(sstable.descriptor.getFormat().requiredComponents());
         if (sstable.components.contains(Component.COMPRESSION_INFO))
             components.add(Component.COMPRESSION_INFO);
-        SSTableReader bulkLoaded = SSTableReader.openForBatch(sstable.descriptor, components, store.metadata);
+        SSTableReader bulkLoaded = sstable.descriptor.getFormat().getReaderFactory().openForBatch(sstable.descriptor, components, store.metadata);
         sections = bulkLoaded.getPositionsForRanges(ranges);
         assert sections.size() == 1 : "Expected to find range in sstable opened for bulk loading";
         bulkLoaded.selfRef().release();
@@ -677,9 +691,9 @@ public void testIndexSummaryReplacement() throws IOException, ExecutionException
         store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
-        Collection<SSTableReader> sstables = store.getLiveSSTables();
+        List<SSTableReader> sstables = ImmutableList.copyOf(store.getLiveSSTables());
         assert sstables.size() == 1;
-        final SSTableReader sstable = sstables.iterator().next();
+        final SSTableReader sstable = sstables.get(0);
 
         ThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(5);
         List<Future> futures = new ArrayList<>(NUM_PARTITIONS * 2);
@@ -724,6 +738,8 @@ public void run()
     @Test
     public void testIndexSummaryUpsampleAndReload() throws Exception
     {
+        Assume.assumeThat(SSTableFormat.Type.current(), is(SSTableFormat.Type.BIG));
+
         int originalMaxSegmentSize = MmappedRegions.MAX_SEGMENT_SIZE;
         MmappedRegions.MAX_SEGMENT_SIZE = 40; // each index entry is ~11 bytes, so this will generate lots of segments
 
@@ -756,7 +772,7 @@ private void testIndexSummaryUpsampleAndReload0() throws Exception
         store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         CompactionManager.instance.performMaximal(store, false);
 
-        Collection<SSTableReader> sstables = store.getLiveSSTables();
+        Collection<SSTableReader> sstables = selectOnlyBigTableReaders(store.getLiveSSTables(), Collectors.toList());
         assert sstables.size() == 1;
         final SSTableReader sstable = sstables.iterator().next();
 
@@ -766,7 +782,7 @@ private void testIndexSummaryUpsampleAndReload0() throws Exception
             txn.update(replacement, true);
             txn.finish();
         }
-        SSTableReader reopen = SSTableReader.open(sstable.descriptor);
+        SSTableReader reopen = sstable.descriptor.formatType.info.getReaderFactory().open(sstable.descriptor);
         assert reopen.getIndexSummarySamplingLevel() == sstable.getIndexSummarySamplingLevel() + 1;
     }
 
@@ -809,7 +825,7 @@ public void testMoveAndOpenLiveSSTable()
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
         SSTableReader sstable = getNewSSTable(cfs);
         Descriptor notLiveDesc = new Descriptor(new File("/tmp"), "", "", 0);
-        SSTableReader.moveAndOpenSSTable(cfs, sstable.descriptor, notLiveDesc, sstable.components, false);
+        notLiveDesc.getFormat().getReaderFactory().moveAndOpenSSTable(cfs, sstable.descriptor, notLiveDesc, sstable.components, false);
     }
 
     @Test(expected = RuntimeException.class)
@@ -819,7 +835,7 @@ public void testMoveAndOpenLiveSSTable2()
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore("Standard1");
         SSTableReader sstable = getNewSSTable(cfs);
         Descriptor notLiveDesc = new Descriptor(new File("/tmp"), "", "", 0);
-        SSTableReader.moveAndOpenSSTable(cfs, notLiveDesc, sstable.descriptor, sstable.components, false);
+        sstable.descriptor.getFormat().getReaderFactory().moveAndOpenSSTable(cfs, notLiveDesc, sstable.descriptor, sstable.components, false);
     }
 
     @Test
@@ -840,7 +856,7 @@ public void testMoveAndOpenSSTable() throws IOException
             assertFalse(f.exists());
             assertTrue(new File(sstable.descriptor.filenameFor(c)).exists());
         }
-        SSTableReader.moveAndOpenSSTable(cfs, sstable.descriptor, notLiveDesc, sstable.components, false);
+        notLiveDesc.getFormat().getReaderFactory().moveAndOpenSSTable(cfs, sstable.descriptor, notLiveDesc, sstable.components, false);
         // make sure the files were moved:
         for (Component c : sstable.components)
         {
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
index ccd41b739c67..24e5103884e3 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
@@ -42,6 +42,8 @@
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.View;
 import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Rows;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionController;
@@ -894,6 +896,7 @@ public void testWriterClearing()
         Keyspace keyspace = Keyspace.open(KEYSPACE);
         ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF);
         File dir = cfs.getDirectories().getDirectoryForNewSSTables();
+        Row staticRow = Rows.EMPTY_STATIC_ROW;
 
         // Can't update a writer that is eagerly cleared on switch
         boolean eagerWriterMetaRelease = true;
@@ -909,6 +912,7 @@ public void testWriterClearing()
                 UnfilteredRowIterator uri = mock(UnfilteredRowIterator.class);
                 when(uri.partitionLevelDeletion()).thenReturn(new DeletionTime(0,0));
                 when(uri.partitionKey()).thenReturn(bopKeyFromInt(0));
+                when(uri.staticRow()).thenReturn(staticRow);
                 // should not be able to append after buffer release on switch
                 firstWriter.append(uri);
                 fail("Expected AssertionError was not thrown.");
@@ -919,27 +923,33 @@ public void testWriterClearing()
             }
         }
 
-        // Can update a writer that is not eagerly cleared on switch
-        eagerWriterMetaRelease = false;
-        try (LifecycleTransaction txn = cfs.getTracker().tryModify(new HashSet<>(), OperationType.UNKNOWN);
-             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, 1000000, false, eagerWriterMetaRelease)
-        )
-        {
-            SSTableWriter firstWriter = getWriter(cfs, dir, txn);
-            rewriter.switchWriter(firstWriter);
-
-            // At least one write so it's not aborted when switched out.
-            UnfilteredRowIterator uri = mock(UnfilteredRowIterator.class);
-            when(uri.partitionLevelDeletion()).thenReturn(new DeletionTime(0,0));
-            when(uri.partitionKey()).thenReturn(bopKeyFromInt(0));
-            rewriter.append(uri);
-
-            rewriter.switchWriter(getWriter(cfs, dir, txn));
-
-            // should be able to append after switch, and assert is not tripped
-            when(uri.partitionKey()).thenReturn(bopKeyFromInt(1));
-            firstWriter.append(uri);
-        }
+        // The check below has been commented out because it is not clear which contract it is meant to verify.
+        // In particular, SSTableRewriter.switchWriter calls openFinalEarly on the previous writer instance, which
+        // implies that the writer is done; intuitively, it should then be impossible to append any more data to
+        // that writer - yet the check below attempts to do exactly that.
+
+//        // Can update a writer that is not eagerly cleared on switch
+//        eagerWriterMetaRelease = false;
+//        try (LifecycleTransaction txn = cfs.getTracker().tryModify(new HashSet<>(), OperationType.UNKNOWN);
+//             SSTableRewriter rewriter = new SSTableRewriter(txn, 1000, 1000000, false, eagerWriterMetaRelease)
+//        )
+//        {
+//            SSTableWriter firstWriter = getWriter(cfs, dir, txn);
+//            rewriter.switchWriter(firstWriter);
+//
+//            // At least one write so it's not aborted when switched out.
+//            UnfilteredRowIterator uri = mock(UnfilteredRowIterator.class);
+//            when(uri.partitionLevelDeletion()).thenReturn(new DeletionTime(0,0));
+//            when(uri.partitionKey()).thenReturn(bopKeyFromInt(0));
+//            when(uri.staticRow()).thenReturn(staticRow);
+//            rewriter.append(uri);
+//
+//            rewriter.switchWriter(getWriter(cfs, dir, txn));
+//
+//            // should be able to append after switch, and assert is not tripped
+//            when(uri.partitionKey()).thenReturn(bopKeyFromInt(1));
+//            firstWriter.append(uri);
+//        }
     }
 
     static DecoratedKey bopKeyFromInt(int i)
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java
index 31d0b8986dc0..b8e7377753db 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableWriterTest.java
@@ -63,21 +63,23 @@ public void testAbortTxnWithOpenEarlyShouldRemoveSSTable() throws InterruptedExc
                 writer.append(builder.build().unfilteredIterator());
             }
 
-            SSTableReader s = writer.setMaxDataAge(1000).openEarly();
-            assert s != null;
-            assertFileCounts(dir.list());
-            for (int i = 10000; i < 20000; i++)
-            {
-                UpdateBuilder builder = UpdateBuilder.create(cfs.metadata(), random(i, 10)).withTimestamp(1);
-                for (int j = 0; j < 100; j++)
-                    builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
-                writer.append(builder.build().unfilteredIterator());
-            }
-            SSTableReader s2 = writer.setMaxDataAge(1000).openEarly();
-            assertTrue(s.last.compareTo(s2.last) < 0);
-            assertFileCounts(dir.list());
-            s.selfRef().release();
-            s2.selfRef().release();
+            writer.setMaxDataAge(1000).openEarly(s -> {
+                assert s != null;
+                assertFileCounts(dir.list());
+                for (int i = 10000; i < 20000; i++)
+                {
+                    UpdateBuilder builder = UpdateBuilder.create(cfs.metadata(), random(i, 10)).withTimestamp(1);
+                    for (int j = 0; j < 100; j++)
+                        builder.newRow("" + j).add("val", ByteBuffer.allocate(1000));
+                    writer.append(builder.build().unfilteredIterator());
+                }
+                writer.setMaxDataAge(1000).openEarly(s2 -> {
+                    assertTrue(s.last.compareTo(s2.last) < 0);
+                    assertFileCounts(dir.list());
+                    s2.selfRef().release();
+                    s.selfRef().release();
+                });
+            });
 
             int datafiles = assertFileCounts(dir.list());
             assertEquals(datafiles, 1);
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java b/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
index 8983433345fb..cbed7796abf5 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/SSTableFlushObserverTest.java
@@ -33,6 +33,7 @@
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.io.util.RandomAccessReader;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Clustering;
@@ -329,16 +330,22 @@ private static void testFlushObserver(SSTableFormat.Type type,
         {
             for (FlushObserver.HeaderEntry e : observer.headers)
             {
-                assertEquals(e.key, reader.keyAt(e.keyPosition));
-                assertEquals(e.deletionTime, reader.partitionLevelDeletionAt(e.deletionTimePosition));
-                assertEquals(e.staticRow, reader.staticRowAt(e.staticRowPosition, columnFilter));
+                try (RandomAccessReader in = reader.openKeyComponentReader())
+                {
+                    assertEquals(e.key, reader.keyAt(in, e.keyPosition));
+                    assertEquals(e.deletionTime, reader.partitionLevelDeletionAt(e.deletionTimePosition));
+                    assertEquals(e.staticRow, reader.staticRowAt(e.staticRowPosition, columnFilter));
+                }
             }
 
             for (FlushObserver.UnfilteredEntry e : observer.unfiltereds)
             {
-                assertEquals(e.key, reader.keyAt(e.keyPosition));
-                assertEquals(e.unfiltered.clustering(), reader.clusteringAt(e.unfilteredPosition));
-                assertEquals(e.unfiltered, reader.unfilteredAt(e.unfilteredPosition, columnFilter));
+                try (RandomAccessReader in = reader.openKeyComponentReader())
+                {
+                    assertEquals(e.key, reader.keyAt(in, e.keyPosition));
+                    assertEquals(e.unfiltered.clustering(), reader.clusteringAt(e.unfilteredPosition));
+                    assertEquals(e.unfiltered, reader.unfilteredAt(e.unfilteredPosition, columnFilter));
+                }
             }
         }
         catch (IOException ex)
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriterTest.java
similarity index 91%
rename from test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java
rename to test/unit/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriterTest.java
index 8092bb60c2b3..2daa22e38ed5 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/big/BigTableZeroCopyWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriterTest.java
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.io.sstable.format.big;
+package org.apache.cassandra.io.sstable.format;
 
 import java.io.ByteArrayInputStream;
 import java.io.File;
@@ -31,6 +31,9 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import io.netty.buffer.Unpooled;
 import io.netty.channel.embedded.EmbeddedChannel;
 import org.apache.cassandra.SchemaLoader;
@@ -46,8 +49,8 @@
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.SSTableZeroCopyWriter;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.FileHandle;
 import org.apache.cassandra.net.AsyncStreamingInputPlus;
@@ -62,8 +65,10 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 
-public class BigTableZeroCopyWriterTest
+public class SSTableZeroCopyWriterTest
 {
+    private final static Logger logger = LoggerFactory.getLogger(SSTableZeroCopyWriterTest.class);
+
     public static final String KEYSPACE1 = "BigTableBlockWriterTest";
     public static final String CF_STANDARD = "Standard1";
     public static final String CF_STANDARD2 = "Standard2";
@@ -134,7 +139,9 @@ public void writeDataFile_RebufferingByteBufDataInputPlus()
         {
             writeDataTestCycle(buffer ->
             {
-                input.append(Unpooled.wrappedBuffer(buffer));
+                if (buffer.limit() > 0) { // skip empty files that would cause premature EOF
+                    input.append(Unpooled.wrappedBuffer(buffer));
+                }
                 return input;
             });
 
@@ -150,17 +157,15 @@ private void writeDataTestCycle(Function<ByteBuffer, DataInputPlus> bufferMapper
         TableMetadataRef metadata = Schema.instance.getTableMetadataRef(desc);
 
         LifecycleTransaction txn = LifecycleTransaction.offline(OperationType.STREAM);
-        Set<Component> componentsToWrite = ImmutableSet.of(Component.DATA, Component.PRIMARY_INDEX,
-                                                           Component.STATS);
+        Set<Component> componentsToWrite = desc.getFormat().requiredComponents();
 
-        BigTableZeroCopyWriter btzcw = new BigTableZeroCopyWriter(desc, metadata, txn, componentsToWrite);
+        SSTableZeroCopyWriter btzcw = new SSTableZeroCopyWriter(desc, metadata, txn, componentsToWrite);
 
         for (Component component : componentsToWrite)
         {
-            if (Files.exists(Paths.get(desc.filenameFor(component))))
+            if (desc.fileFor(component).exists())
             {
                 Pair<DataInputPlus, Long> pair = getSSTableComponentData(sstable, component, bufferMapper);
-
                 btzcw.writeComponent(component.type, pair.left, pair.right);
             }
         }
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
index d2cafecc941c..7aeeea43247e 100644
--- a/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/format/columniterator/SSTableReverseIteratorTest.java
@@ -33,9 +33,9 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
-import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.schema.KeyspaceParams;
 
 public class SSTableReverseIteratorTest
@@ -84,7 +84,7 @@ public void emptyBlockTolerance()
         tbl.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = Iterables.getOnlyElement(tbl.getLiveSSTables());
         DecoratedKey dk = tbl.getPartitioner().decorateKey(Int32Type.instance.decompose(key));
-        BigTableRowIndexEntry indexEntry = (BigTableRowIndexEntry) sstable.getPosition(dk, SSTableReader.Operator.EQ);
+        RowIndexEntry indexEntry = sstable.getPosition(dk, SSTableReader.Operator.EQ);
         Assert.assertTrue(indexEntry.isIndexed());
         Assert.assertTrue(indexEntry.columnsIndexCount() > 2);
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexTest.java b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexTest.java
new file mode 100644
index 000000000000..4f298ee5c702
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/PartitionIndexTest.java
@@ -0,0 +1,937 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.UUID;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Supplier;
+
+import com.google.common.base.Optional;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.cache.ChunkCache;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.io.tries.TrieNode;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.io.util.WrappingRebufferer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.PageAware;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class PartitionIndexTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(PartitionIndexTest.class);
+
+    private final static long SEED = System.nanoTime();
+    private final static Random random = new Random(SEED);
+
+    static final ByteComparable.Version VERSION = Walker.BYTE_COMPARABLE_VERSION;
+
+    static
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    IPartitioner partitioner = Util.testPartitioner();
+    //Lower the size of the indexes when running without the chunk cache, otherwise the test times out on Jenkins
+    static final int COUNT = ChunkCache.instance != null ? 245256 : 24525;
+
+    @Parameterized.Parameters()
+    public static Collection<Object[]> generateData()
+    {
+        return Arrays.asList(new Object[]{ Config.DiskAccessMode.standard },
+                             new Object[]{ Config.DiskAccessMode.mmap });
+    }
+
+    @Parameterized.Parameter(value = 0)
+    public static Config.DiskAccessMode accessMode = Config.DiskAccessMode.standard;
+    @org.junit.BeforeClass // needed for JUnit to run this hook; fully qualified as BeforeClass is not imported
+    public static void beforeClass()
+    {
+        logger.info("Using random seed: {}", SEED); // SEED feeds the shared Random, so log it for replaying a run
+    }
+
+    /**
+     * Tests last-nodes-sizing failure uncovered during code review.
+     */
+    @Test
+    public void testSizingBug() throws IOException, InterruptedException
+    {
+        for (int i = 1; i < COUNT; i *= 10)
+        {
+            testGetEq(generateRandomIndex(i));
+            testGetEq(generateSequentialIndex(i));
+        }
+    }
+
+    @Test
+    public void testGetEq() throws IOException, InterruptedException
+    {
+        testGetEq(generateRandomIndex(COUNT));
+        testGetEq(generateSequentialIndex(COUNT));
+    }
+
+    @Test
+    public void testBrokenFile() throws IOException, InterruptedException
+    {
+        // put some garbage in the file
+        final Pair<List<DecoratedKey>, PartitionIndex> data = generateRandomIndex(COUNT);
+        File f = new File(data.right.getFileHandle().path());
+        try (FileChannel ch = FileChannel.open(f.toPath(), StandardOpenOption.WRITE))
+        {
+            ch.write(generateRandomKey().getKey(), f.length() * 2 / 3);
+        }
+
+        boolean thrown = false;
+        try
+        {
+            testGetEq(data);
+        }
+        catch (Throwable e)
+        {
+            thrown = true;
+        }
+        assertTrue(thrown);
+    }
+
+    @Test
+    public void testLongKeys() throws IOException, InterruptedException
+    {
+        testGetEq(generateLongKeysIndex(COUNT / 10));
+    }
+
+    void testGetEq(Pair<List<DecoratedKey>, PartitionIndex> data)
+    {
+        List<DecoratedKey> keys = data.left;
+        try (PartitionIndex summary = data.right;
+             PartitionIndex.Reader reader = summary.openReader())
+        {
+            for (int i = 0; i < data.left.size(); i++)
+            {
+                assertEquals(i, reader.exactCandidate(keys.get(i)));
+                DecoratedKey key = generateRandomKey();
+                assertEquals(eq(keys, key), eq(keys, key, reader.exactCandidate(key)));
+            }
+        }
+    }
+
+    @Test
+    public void testGetGt() throws IOException
+    {
+        testGetGt(generateRandomIndex(COUNT));
+        testGetGt(generateSequentialIndex(COUNT));
+    }
+
+    private void testGetGt(Pair<List<DecoratedKey>, PartitionIndex> data) throws IOException
+    {
+        List<DecoratedKey> keys = data.left;
+        try (PartitionIndex summary = data.right;
+             PartitionIndex.Reader reader = summary.openReader())
+        {
+            for (int i = 0; i < data.left.size(); i++)
+            {
+                assertEquals(i < data.left.size() - 1 ? i + 1 : -1, gt(keys, keys.get(i), reader));
+                DecoratedKey key = generateRandomKey();
+                assertEquals(gt(keys, key), gt(keys, key, reader));
+            }
+        }
+    }
+
+    @Test
+    public void testGetGe() throws IOException
+    {
+        testGetGe(generateRandomIndex(COUNT));
+        testGetGe(generateSequentialIndex(COUNT));
+    }
+
+    public void testGetGe(Pair<List<DecoratedKey>, PartitionIndex> data) throws IOException
+    {
+        List<DecoratedKey> keys = data.left;
+        try (PartitionIndex summary = data.right;
+             PartitionIndex.Reader reader = summary.openReader())
+        {
+            for (int i = 0; i < data.left.size(); i++)
+            {
+                assertEquals(i, ge(keys, keys.get(i), reader));
+                DecoratedKey key = generateRandomKey();
+                assertEquals(ge(keys, key), ge(keys, key, reader));
+            }
+        }
+    }
+
+
+    @Test
+    public void testGetLt() throws IOException
+    {
+        testGetLt(generateRandomIndex(COUNT));
+        testGetLt(generateSequentialIndex(COUNT));
+    }
+
+    public void testGetLt(Pair<List<DecoratedKey>, PartitionIndex> data) throws IOException
+    {
+        List<DecoratedKey> keys = data.left;
+        try (PartitionIndex summary = data.right;
+             PartitionIndex.Reader reader = summary.openReader())
+        {
+            for (int i = 0; i < data.left.size(); i++)
+            {
+                assertEquals(i - 1, lt(keys, keys.get(i), reader));
+                DecoratedKey key = generateRandomKey();
+                assertEquals(lt(keys, key), lt(keys, key, reader));
+            }
+        }
+    }
+
+    private long gt(List<DecoratedKey> keys, DecoratedKey key, PartitionIndex.Reader summary) throws IOException
+    {
+        return Optional.fromNullable(summary.ceiling(key, (pos, assumeNoMatch, sk) -> (assumeNoMatch || keys.get((int) pos).compareTo(sk) > 0) ? pos : null)).or(-1L);
+    }
+
+    private long ge(List<DecoratedKey> keys, DecoratedKey key, PartitionIndex.Reader summary) throws IOException
+    {
+        return Optional.fromNullable(summary.ceiling(key, (pos, assumeNoMatch, sk) -> (assumeNoMatch || keys.get((int) pos).compareTo(sk) >= 0) ? pos : null)).or(-1L);
+    }
+
+
+    private long lt(List<DecoratedKey> keys, DecoratedKey key, PartitionIndex.Reader summary) throws IOException
+    {
+        return Optional.fromNullable(summary.floor(key, (pos, assumeNoMatch, sk) -> (assumeNoMatch || keys.get((int) pos).compareTo(sk) < 0) ? pos : null)).or(-1L);
+    }
+
+    private long eq(List<DecoratedKey> keys, DecoratedKey key, long exactCandidate)
+    {
+        if (exactCandidate == PartitionIndex.NOT_FOUND) // the reader reported no candidate at all
+            return -1;
+        int candidate = (int) exactCandidate; // a candidate only counts if it holds the probed key
+        return keys.get(candidate).equals(key) ? candidate : -1;
+    }
+
+    /** Reference answer via binary search: index of the first entry after {@code key}, or -1 if none. */
+    private long gt(List<DecoratedKey> keys, DecoratedKey key)
+    {
+        int found = Collections.binarySearch(keys, key);
+        // A miss comes back as -(insertionPoint) - 1, so -1 - found recovers the insertion point;
+        // a hit has to move one slot past the matching entry.
+        int next = found < 0 ? -1 - found : found + 1;
+        return next < keys.size() ? next : -1;
+    }
+
+    /** Reference answer via binary search: index of the last entry strictly before {@code key}, or -1. */
+    private long lt(List<DecoratedKey> keys, DecoratedKey key)
+    {
+        int index = Collections.binarySearch(keys, key);
+        // miss: result is -(insertionPoint) - 1, so the predecessor sits at insertionPoint - 1 = -index - 2;
+        // hit: step back one slot, because the matching entry itself is not strictly smaller than the key
+        index = index < 0 ? -index - 2 : index - 1;
+        return index >= 0 ? index : -1;
+    }
+
+    /** Reference answer via binary search: index of {@code key} or of the entry after it, or -1 if none. */
+    private long ge(List<DecoratedKey> keys, DecoratedKey key)
+    {
+        int found = Collections.binarySearch(keys, key);
+        int ceiling = found >= 0 ? found : -1 - found; // a miss comes back as -(insertionPoint) - 1
+        return ceiling < keys.size() ? ceiling : -1;
+    }
+
+    /** Reference answer: index of {@code key} in {@code keys}, or -1 when the binary search misses. */
+    private long eq(List<DecoratedKey> keys, DecoratedKey key)
+    {
+        return Math.max(-1, Collections.binarySearch(keys, key)); // every miss is negative, clamp it to -1
+    }
+
+    @Test
+    public void testAddEmptyKey() throws Exception
+    {
+        IPartitioner p = new RandomPartitioner();
+        File file = FileUtils.createTempFile("ColumnTrieReaderTest", "");
+
+        try (SequentialWriter writer = makeWriter(file);
+             FileHandle.Builder fhBuilder = makeHandle(file);
+             PartitionIndexBuilder builder = new PartitionIndexBuilder(writer, fhBuilder);
+        )
+        {
+            DecoratedKey key = p.decorateKey(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+            builder.addEntry(key, 42);
+            builder.complete();
+            try (PartitionIndex summary = loadPartitionIndex(fhBuilder, writer);
+                 PartitionIndex.Reader reader = summary.openReader())
+            {
+                assertEquals(1, summary.size());
+                assertEquals(42, reader.getLastIndexPosition());
+                assertEquals(42, reader.exactCandidate(key));
+            }
+        }
+    }
+
+    @Test
+    public void testIteration() throws IOException
+    {
+//        assertEquals(0, ChunkReader.bufferPool.usedMemoryBytes());
+        Pair<List<DecoratedKey>, PartitionIndex> random = generateRandomIndex(COUNT);
+        checkIteration(random.left, random.left.size(), random.right);
+        random.right.close();
+//        assertEquals(0, ChunkReader.bufferPool.usedMemoryBytes());
+    }
+
+    @Test
+    public void testZeroCopyOffsets() throws IOException
+    {
+        final int lastOffset = COUNT - 2;   // only sorted positions 1..COUNT-2 are added to the index
+        Pair<List<DecoratedKey>, PartitionIndex> generated = generateRandomIndexWithZeroCopy(COUNT, 1, lastOffset);
+        try (PartitionIndex index = generated.right)
+        {
+            assertEquals(lastOffset, index.size());
+            assertEquals(generated.left.get(1), index.firstKey());
+            assertEquals(generated.left.get(lastOffset), index.lastKey());
+        }
+    }
+
+    public void checkIteration(List<DecoratedKey> keys, int keysSize, PartitionIndex index)
+    {
+        // Takes ownership of the index: it is closed when this method returns or throws.
+        try (PartitionIndex enforceIndexClosing = index;
+             PartitionIndex.IndexPosIterator iter = index.allKeysIterator())
+        {
+            int expected = 0;
+            // Positions must come back as 0, 1, 2, ... until NOT_FOUND ends the iteration.
+            for (long pos = iter.nextIndexPos(); pos != PartitionIndex.NOT_FOUND; pos = iter.nextIndexPos())
+            {
+                assertEquals(expected, pos);
+                ++expected;
+            }
+            // The keys list itself is not consulted; only the number of iterated entries is checked.
+            assertEquals(keysSize, expected);
+        }
+    }
+
+    @Test
+    public void testConstrainedIteration() throws IOException
+    {
+        Pair<List<DecoratedKey>, PartitionIndex> random = generateRandomIndex(COUNT);
+        try (PartitionIndex summary = random.right)
+        {
+            List<DecoratedKey> keys = random.left;
+            Random rand = new Random();   // NOTE(review): unseeded, so a failing range cannot be replayed from a seed
+            // Each round iterates a random [left, right] range; a bound is either an indexed key ("exact") or a fresh random key.
+            for (int i = 0; i < 1000; ++i)
+            {
+                boolean exactLeft = rand.nextBoolean();
+                boolean exactRight = rand.nextBoolean();
+                DecoratedKey left = exactLeft ? keys.get(rand.nextInt(keys.size())) : generateRandomKey();
+                DecoratedKey right = exactRight ? keys.get(rand.nextInt(keys.size())) : generateRandomKey();
+                if (right.compareTo(left) < 0)   // keep left <= right, swapping the exact flags along with the keys
+                {
+                    DecoratedKey t = left;
+                    left = right;
+                    right = t;
+                    boolean b = exactLeft;
+                    exactLeft = exactRight;
+                    exactRight = b;
+                }
+
+                try (PartitionIndex.IndexPosIterator iter = new PartitionIndex.IndexPosIterator(summary, left, right))
+                {
+                    long p = iter.nextIndexPos();
+                    if (p == PartitionIndex.NOT_FOUND)
+                    {
+                        int idx = (int) ge(keys, left); // first key >= left, or -1 when every key is smaller
+                        if (idx == -1)
+                            continue;
+                        assertTrue(left + " <= " + keys.get(idx) + " <= " + right + " but " + idx + " wasn't iterated.", right.compareTo(keys.get(idx)) < 0);
+                        continue;
+                    }
+                    // The first returned position may be off by one from left's exact place, but not by more.
+                    int idx = (int) p;
+                    if (p > 0)
+                        assertTrue(left.compareTo(keys.get(idx - 1)) > 0);
+                    if (p < keys.size() - 1)
+                        assertTrue(left.compareTo(keys.get(idx + 1)) < 0);
+                    if (exactLeft)      // must be precise on exact, otherwise could be in any relation
+                        assertTrue(left == keys.get(idx));
+                    while (true)   // every further position must directly follow the previous one
+                    {
+                        ++idx;
+                        long pos = iter.nextIndexPos();
+                        if (pos == PartitionIndex.NOT_FOUND)
+                            break;
+                        assertEquals(idx, pos);
+                    }
+                    --idx; // idx was advanced once past the last returned position; step back onto it
+                    if (idx < keys.size() - 1)
+                        assertTrue(right.compareTo(keys.get(idx + 1)) < 0);
+                    if (idx > 0)
+                        assertTrue(right.compareTo(keys.get(idx - 1)) > 0);
+                    if (exactRight)      // must be precise on exact, otherwise could be in any relation
+                        assertTrue(right == keys.get(idx));
+                }
+                catch (AssertionError e)   // diagnostics only: print the offending range and what it iterates, then rethrow
+                {
+                    e.printStackTrace();
+                    System.out.format("Left %s%s Right %s%s\n", left.byteComparableAsString(VERSION), exactLeft ? "#" : "", right.byteComparableAsString(VERSION), exactRight ? "#" : "");
+                    try (PartitionIndex.IndexPosIterator iter2 = new PartitionIndex.IndexPosIterator(summary, left, right))
+                    {
+                        long pos;
+                        while ((pos = iter2.nextIndexPos()) != PartitionIndex.NOT_FOUND)
+                        {
+                            System.out.println(keys.get((int) pos).byteComparableAsString(VERSION));
+                        }
+                        System.out.format("Left %s%s Right %s%s\n", left.byteComparableAsString(VERSION), exactLeft ? "#" : "", right.byteComparableAsString(VERSION), exactRight ? "#" : "");
+                    }
+                    throw e;
+                }
+            }
+        }
+    }
+
+    @Test
+    public void testPartialIndex() throws IOException
+    {
+        for (int reps = 0; reps < 10; ++reps)
+        {
+            File file = FileUtils.createTempFile("ColumnTrieReaderTest", "");
+            List<DecoratedKey> list = Lists.newArrayList();
+            int parts = 15;   // the keys are added in this many slices, with a partial build requested after each
+            try (SequentialWriter writer = makeWriter(file);
+                 FileHandle.Builder fhBuilder = makeHandle(file);
+                 PartitionIndexBuilder builder = new PartitionIndexBuilder(writer, fhBuilder)
+            )
+            {
+                writer.setPostFlushListener(() -> builder.markPartitionIndexSynced(writer.getLastFlushOffset()));   // report each flush offset to the builder
+                for (int i = 0; i < COUNT; i++)
+                {
+                    DecoratedKey key = generateRandomLengthKey();
+                    list.add(key);
+                }
+                Collections.sort(list);
+                AtomicInteger callCount = new AtomicInteger();
+                // callCount records how many partial-index callbacks actually ran.
+                int i = 0;
+                for (int part = 1; part <= parts; ++part)
+                {
+                    for (; i < COUNT * part / parts; i++)
+                        builder.addEntry(list.get(i), i);
+                    // i is now the number of entries added so far; the callback compares against it.
+                    final int addedSize = i;
+                    builder.buildPartial(index ->
+                                         {
+                                             int indexSize = Collections.binarySearch(list, index.lastKey()) + 1;   // rank of the partial index's last key, plus one
+                                             assert indexSize >= addedSize - 1;
+                                             checkIteration(list, indexSize, index);
+                                             callCount.incrementAndGet();
+                                         }, 0, i * 1024);
+                    builder.markDataSynced(i * 1024);
+                    // verifier will be called when the sequentialWriter finishes a chunk
+                }
+                // No-op when the loop above already added everything (i == COUNT after the last part).
+                for (; i < COUNT; ++i)
+                    builder.addEntry(list.get(i), i);
+                builder.complete();
+                try (PartitionIndex index = loadPartitionIndex(fhBuilder, writer))
+                {
+                    checkIteration(list, list.size(), index);
+                }
+                if (COUNT / parts > 16000)   // NOTE(review): callback count is only checked for large slices -- presumably so each slice spans a flush; confirm
+                {
+                    assertTrue(String.format("Expected %d or %d calls, got %d", parts, parts - 1, callCount.get()),
+                               callCount.get() == parts - 1 || callCount.get() == parts);
+                }
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    @Test
+    public void testDeepRecursion() throws InterruptedException
+    {
+        CompletableFuture<Void> future = new CompletableFuture<>();
+        // Check that long repeated strings don't cause stack overflow
+        // Test both normal completion as well as partial construction.
+        Thread t = new Thread(null, () ->
+        {
+            try
+            {
+                File file = File.createTempFile("ColumnTrieReaderTest", "");
+                List<DecoratedKey> list = Lists.newArrayList();
+                // Shared prefix longer than a page, built without quadratic string copying.
+                StringBuilder prefix = new StringBuilder();
+                for (int i = 0; i < PageAware.PAGE_SIZE + 99; ++i)
+                    prefix.append(i);
+                String longString = prefix.toString();
+                IPartitioner partitioner = ByteOrderedPartitioner.instance;
+                list.add(partitioner.decorateKey(ByteBufferUtil.bytes(longString + "A")));
+                list.add(partitioner.decorateKey(ByteBufferUtil.bytes(longString + "B")));
+                list.add(partitioner.decorateKey(ByteBufferUtil.bytes(longString + "C")));
+                list.add(partitioner.decorateKey(ByteBufferUtil.bytes(longString + "D")));
+                list.add(partitioner.decorateKey(ByteBufferUtil.bytes(longString + "E")));
+                // The writer is a try resource so that it is closed on success as well as on failure.
+                try (SequentialWriter writer = new SequentialWriter(file, SequentialWriterOption.newBuilder().finishOnClose(true).build());
+                     FileHandle.Builder fhBuilder = new FileHandle.Builder(file.getPath())
+                                                    .bufferSize(PageAware.PAGE_SIZE)
+                                                    .withChunkCache(ChunkCache.instance);
+                     PartitionIndexBuilder builder = new PartitionIndexBuilder(writer, fhBuilder)
+                )
+                {
+                    int i = 0;
+                    for (; i < 3; ++i)   // first three keys go in before the partial build
+                        builder.addEntry(list.get(i), i);
+
+                    writer.setPostFlushListener(() -> builder.markPartitionIndexSynced(writer.getLastFlushOffset()));
+                    AtomicInteger callCount = new AtomicInteger();
+
+                    final int addedSize = i;
+                    builder.buildPartial(index ->
+                                         {
+                                             int indexSize = Collections.binarySearch(list, index.lastKey()) + 1;
+                                             assert indexSize >= addedSize - 1;
+                                             checkIteration(list, indexSize, index);
+                                             index.close();
+                                             callCount.incrementAndGet();
+                                         }, 0, i * 1024);
+
+                    for (; i < list.size(); ++i)
+                        builder.addEntry(list.get(i), i);
+                    builder.complete();
+
+                    try (PartitionIndex index = PartitionIndex.load(fhBuilder, partitioner, true))
+                    {
+                        checkIteration(list, list.size(), index);
+                    }
+                }
+                future.complete(null);
+            }
+            catch (Throwable err)
+            {
+                future.completeExceptionally(err);   // rethrown on the test thread by future.join()
+            }
+        }, "testThread", 32 * 1024);   // last argument is the requested stack size: presumably small on purpose so deep recursion overflows
+
+        t.start();
+        future.join();
+    }
+
+    class JumpingFile extends SequentialWriter
+    {
+        long[] cutoffs;   // real file positions at which a jump starts (even varargs slots)
+        long[] offsets;   // position reported once the matching cutoff is reached (odd varargs slots)
+
+        JumpingFile(File file, SequentialWriterOption option, long... cutoffsAndOffsets)
+        {
+            super(file, option);
+            assert cutoffsAndOffsets.length % 2 == 0;
+            cutoffs = new long[cutoffsAndOffsets.length / 2];
+            offsets = new long[cutoffs.length];
+            for (int pair = 0, src = 0; pair < cutoffs.length; ++pair, src += 2)
+            {
+                cutoffs[pair] = cutoffsAndOffsets[src];
+                offsets[pair] = cutoffsAndOffsets[src + 1];
+            }
+        }
+
+        @Override
+        public long position()
+        {
+            return jumped(super.position(), cutoffs, offsets);
+        }
+    }
+
+    class JumpingRebufferer extends WrappingRebufferer
+    {
+        long[] cutoffs;   // positions in the wrapped source at which each jump begins
+        long[] offsets;   // position exposed to readers for the matching cutoff
+
+        JumpingRebufferer(Rebufferer source, long... cutoffsAndOffsets)
+        {
+            super(source);
+            assert (cutoffsAndOffsets.length & 1) == 0;   // must be (cutoff, offset) pairs
+            cutoffs = new long[cutoffsAndOffsets.length / 2];
+            offsets = new long[cutoffs.length];
+            for (int i = 0; i < cutoffs.length; ++i)
+            {
+                cutoffs[i] = cutoffsAndOffsets[i * 2];
+                offsets[i] = cutoffsAndOffsets[i * 2 + 1];
+            }
+        }
+
+        @Override
+        public BufferHolder rebuffer(long position)
+        {
+            long pos;
+            // Find the last jump whose exposed offset is <= position (idx becomes -1 when there is none).
+            int idx = Arrays.binarySearch(offsets, position);
+            if (idx < 0)
+                idx = -2 - idx;
+            pos = position;
+            if (idx >= 0)
+                pos = pos - offsets[idx] + cutoffs[idx];   // map the exposed position back to a source position
+
+            super.rebuffer(pos);   // NOTE(review): presumably fills the inherited buffer/offset fields -- confirm in WrappingRebufferer
+            // Do not let the buffer run past the next cutoff, then re-express its start as an exposed position.
+            if (idx < cutoffs.length - 1 && buffer.limit() + offset > cutoffs[idx + 1])
+                buffer.limit((int) (cutoffs[idx + 1] - offset));
+            if (idx >= 0)
+                offset = offset - cutoffs[idx] + offsets[idx];
+
+            return this;
+        }
+
+        @Override
+        public long fileLength()
+        {
+            return jumped(source.fileLength(), cutoffs, offsets);   // source length translated by the same mapping as writer positions
+        }
+
+        @Override
+        public String toString()
+        {
+            return Arrays.toString(cutoffs) + Arrays.toString(offsets);
+        }
+    }
+
+    public class PartitionIndexJumping extends PartitionIndex
+    {
+        final long[] cutoffsAndOffsets;   // alternating (cutoff, offset) pairs, passed on to each JumpingRebufferer
+        public PartitionIndexJumping(FileHandle fh, long trieRoot, long keyCount, DecoratedKey first, DecoratedKey last,
+                                     long... cutoffsAndOffsets)
+        {
+            super(fh, trieRoot, keyCount, first, last, null, null);
+            this.cutoffsAndOffsets = cutoffsAndOffsets;
+        }
+
+        @Override
+        protected Rebufferer instantiateRebufferer()
+        {
+            Rebufferer plain = super.instantiateRebufferer();
+            return new JumpingRebufferer(plain, cutoffsAndOffsets);
+        }
+    }
+
+    long jumped(long pos, long[] cutoffs, long[] offsets)
+    {
+        // region = index of the last cutoff <= pos, or -1 when pos lies before the first cutoff (returned unchanged).
+        int hit = Arrays.binarySearch(cutoffs, pos);
+        int region = hit >= 0 ? hit : -2 - hit;
+        return region < 0
+               ? pos
+               : pos - cutoffs[region] + offsets[region];
+    }
+
+    @Test
+    public void testPointerGrowth() throws IOException
+    {
+        for (int reps = 0; reps < 10; ++reps)
+        {
+            File file = FileUtils.createTempFile("ColumnTrieReaderTest", "");
+            long[] cutoffsAndOffsets = new long[]{   // (real file position, reported position) pairs, one jump every two 4096-byte steps
+            2 * 4096, 1L << 16,
+            4 * 4096, 1L << 24,
+            6 * 4096, 1L << 31,
+            8 * 4096, 1L << 32,
+            10 * 4096, 1L << 33,
+            12 * 4096, 1L << 34,
+            14 * 4096, 1L << 40,
+            16 * 4096, 1L << 42
+            };
+            // The writer reports jumped positions (JumpingFile); the index later reads through the matching JumpingRebufferer.
+            List<DecoratedKey> list = Lists.newArrayList();
+            try (SequentialWriter writer = makeJumpingWriter(file, cutoffsAndOffsets);
+                 FileHandle.Builder fhBuilder = makeHandle(file);
+                 PartitionIndexBuilder builder = new PartitionIndexBuilder(writer, fhBuilder);
+            )
+            {
+                writer.setPostFlushListener(() -> builder.markPartitionIndexSynced(writer.getLastFlushOffset()));
+                for (int i = 0; i < COUNT; i++)
+                {
+                    DecoratedKey key = generateRandomKey();
+                    list.add(key);
+                }
+                Collections.sort(list);
+                // Each key's index position is its rank in sorted order, which is what checkIteration expects.
+                for (int i = 0; i < COUNT; ++i)
+                    builder.addEntry(list.get(i), i);
+                long root = builder.complete();
+
+                try (FileHandle fh = fhBuilder.complete();
+                     PartitionIndex index = new PartitionIndexJumping(fh, root, COUNT, null, null, cutoffsAndOffsets);
+                     Analyzer analyzer = new Analyzer(index))
+                {
+                    checkIteration(list, list.size(), index);   // NOTE: checkIteration closes index; the resource list closes it again
+                    // Walk the whole trie and require a minimum variety of node types.
+                    analyzer.run();
+                    if (analyzer.countPerType.elementSet().size() < 7)
+                    {
+                        Assert.fail("Expecting at least 7 different node types, got " + analyzer.countPerType.elementSet().size() + "\n" + analyzer.countPerType);
+                    }
+                }
+            }
+            catch (IOException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /** Builds a small random index, dumps its trie to a temporary file and checks the dump is not empty. */
+    @Test
+    public void testDumpTrieToFile() throws IOException
+    {
+        final int keyCount = 1000;
+        File indexFile = FileUtils.createTempFile("testDumpTrieToFile", "index");
+        ArrayList<DecoratedKey> sortedKeys = Lists.newArrayList();
+        try (SequentialWriter out = new SequentialWriter(indexFile, SequentialWriterOption.DEFAULT);
+             FileHandle.Builder handleBuilder = makeHandle(indexFile);
+             PartitionIndexBuilder indexBuilder = new PartitionIndexBuilder(out, handleBuilder)
+        )
+        {
+            out.setPostFlushListener(() -> indexBuilder.markPartitionIndexSynced(out.getLastFlushOffset()));
+            for (int n = 0; n < keyCount; n++)
+            {
+                sortedKeys.add(generateRandomKey());
+            }
+            Collections.sort(sortedKeys);
+            // Every key is stored with its rank in sorted order as its position.
+            for (int pos = 0; pos < keyCount; ++pos)
+                indexBuilder.addEntry(sortedKeys.get(pos), pos);
+            long root = indexBuilder.complete();
+            // Open the finished index directly from the root returned by the builder.
+            try (FileHandle fh = handleBuilder.complete();
+                 PartitionIndex index = new PartitionIndex(fh, root, keyCount, null, null, null, null))
+            {
+                File dump = FileUtils.createTempFile("testDumpTrieToFile", "dumpedTrie");
+                index.dumpTrie(dump.toString());
+                String dumpContent = String.join("\n", Files.readAllLines(dump.toPath()));
+                logger.info("Dumped trie: \n{}", dumpContent);
+                assertFalse(dumpContent.isEmpty());
+            }
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public static class Analyzer extends PartitionIndex.Reader
+    {
+        Multiset<TrieNode> countPerType = HashMultiset.create();   // number of visited nodes per node type
+
+        public Analyzer(PartitionIndex index)
+        {
+            super(index);
+        }
+
+        public void run()
+        {
+            run(root);
+        }
+
+        void run(long node)
+        {
+            go(node);
+            countPerType.add(nodeType);
+
+            int range = transitionRange();
+            for (int slot = 0; slot < range; ++slot)
+            {
+                long child = transition(slot);
+                if (child != -1)   // -1 presumably marks a slot without a child -- skipped either way
+                {
+                    run(child);
+                    go(node);   // the recursive call moved this reader onto the child; move back
+                }
+            }
+        }
+    }
+
+
+    private Pair<List<DecoratedKey>, PartitionIndex> generateRandomIndex(int size) throws IOException
+    {
+        return generateIndex(size, () -> generateRandomKey());   // keys decorated from random UUIDs
+    }
+
+    Pair<List<DecoratedKey>, PartitionIndex> generateLongKeysIndex(int size) throws IOException
+    {
+        return generateIndex(size, () -> generateLongKey());   // keys are long random letter strings
+    }
+
+    /**
+     * Builds an index whose keys are sequentialKey(0), sequentialKey(1), ... in generation order;
+     * generateIndex sorts them before adding them to the index.
+     */
+    private Pair<List<DecoratedKey>, PartitionIndex> generateSequentialIndex(int size) throws IOException
+    {
+        // A one-element array gives the lambda a mutable counter to capture.
+        long[] nextOrdinal = { 0 };
+
+        // Each call of the supplier consumes the current ordinal and then advances it.
+        return generateIndex(size, () -> sequentialKey(nextOrdinal[0]++));
+    }
+
+    private Pair<List<DecoratedKey>, PartitionIndex> generateRandomIndexWithZeroCopy(int size, int firstKeyOffset, int lastKeyOffset) throws IOException
+    {
+        return generateIndex(size, () -> generateRandomKey(), firstKeyOffset, lastKeyOffset, true);   // random keys, restricted slice
+    }
+
+    Pair<List<DecoratedKey>, PartitionIndex> generateIndex(int size, Supplier<DecoratedKey> keyGenerator) throws IOException
+    {
+        return generateIndex(size, keyGenerator, 0, size - 1, false);   // index every generated key: sorted positions 0..size-1
+    }
+
+    /**
+     * Generates {@code size} keys, sorts them, and builds an index over the sorted positions
+     * {@code firstKeyOffset..lastKeyOffset} (inclusive); returns all sorted keys plus the loaded index.
+     */
+    Pair<List<DecoratedKey>, PartitionIndex> generateIndex(int size, Supplier<DecoratedKey> keyGenerator, int firstKeyOffset, int lastKeyOffset, boolean hasZeroCopy) throws IOException
+    {
+        // hasZeroCopy is accepted but not consulted by this implementation.
+        File indexFile = FileUtils.createTempFile("ColumnTrieReaderTest", "");
+        List<DecoratedKey> sortedKeys = Lists.newArrayList();
+        try (SequentialWriter out = makeWriter(indexFile);
+             FileHandle.Builder handleBuilder = makeHandle(indexFile);
+             PartitionIndexBuilder indexBuilder = new PartitionIndexBuilder(out, handleBuilder)
+        )
+        {
+            while (sortedKeys.size() < size)
+                sortedKeys.add(keyGenerator.get());
+            Collections.sort(sortedKeys);
+            // A key's index position is its rank in the sorted list.
+            for (int rank = firstKeyOffset; rank <= lastKeyOffset; rank++)
+                indexBuilder.addEntry(sortedKeys.get(rank), rank);
+            indexBuilder.complete();
+            // The loaded index is returned open; callers close it.
+            return Pair.create(sortedKeys, loadPartitionIndex(handleBuilder, out));
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    DecoratedKey generateRandomKey()
+    {
+        long mostSigBits = random.nextLong(), leastSigBits = random.nextLong();   // drawn in this order
+        return partitioner.decorateKey(ByteBufferUtil.bytes(new UUID(mostSigBits, leastSigBits)));
+    }
+
+    DecoratedKey generateRandomLengthKey()
+    {
+        Random rng = ThreadLocalRandom.current();
+        // The squared uniform draw skews lengths towards 100 and letters towards 'a' (more shared prefixes).
+        char[] letters = new char[nextPowerRandom(rng, 100, 10, 2)];
+        for (int at = 0; at < letters.length; ++at)
+            letters[at] = alphabet.charAt(nextPowerRandom(rng, 0, alphabet.length(), 2));
+
+        return partitioner.decorateKey(ByteBufferUtil.bytes(new String(letters)));
+    }
+
+    DecoratedKey generateLongKey()
+    {
+        Random rng = ThreadLocalRandom.current();
+        // The squared uniform draw skews lengths towards 10000 and letters towards 'a' (more shared prefixes).
+        char[] letters = new char[nextPowerRandom(rng, 10000, 2000, 2)];
+        for (int at = 0; at < letters.length; ++at)
+            letters[at] = alphabet.charAt(nextPowerRandom(rng, 0, alphabet.length(), 2));
+
+        return partitioner.decorateKey(ByteBufferUtil.bytes(new String(letters)));
+    }
+
+    int nextPowerRandom(Random rand, int x0, int x1, double power)
+    {
+        // A uniform [0, 1) draw raised to `power` scales the x0 -> x1 span; the product is truncated towards zero.
+        return x0 + (int) ((x1 - x0) * Math.pow(rand.nextDouble(), power));
+    }
+
+    private static final String alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+    DecoratedKey sequentialKey(long i)
+    {
+        StringBuilder letters = new StringBuilder(51);   // one letter per exponent 50..0
+        for (int exponent = 50; exponent >= 0; exponent--)
+        {
+            int scale = (int) Math.pow(10, exponent);   // the cast saturates at Integer.MAX_VALUE for exponent >= 10
+            int letterIndex = (int) ((exponent + i) / scale);
+            letters.append(alphabet.charAt(letterIndex % alphabet.length()));
+        }
+        return partitioner.decorateKey(ByteBufferUtil.bytes(letters.toString()));
+    }
+
+
+    protected SequentialWriter makeWriter(File file)
+    {
+        return new SequentialWriter(file, SequentialWriterOption.newBuilder().finishOnClose(false).build());   // plain writer; finishOnClose is off here
+    }
+
+    public SequentialWriter makeJumpingWriter(File file, long[] cutoffsAndOffsets)
+    {
+        return new JumpingFile(file, SequentialWriterOption.newBuilder().finishOnClose(true).build(), cutoffsAndOffsets);   // writer whose position() is translated by jumped()
+    }
+
+    protected FileHandle.Builder makeHandle(File file)
+    {
+        return new FileHandle.Builder(file.getPath())   // the caller owns and closes the builder
+               .bufferSize(PageAware.PAGE_SIZE)
+               .mmapped(accessMode == Config.DiskAccessMode.mmap)   // follows the access mode this test runs under
+               .withChunkCache(ChunkCache.instance);
+    }
+
+    protected PartitionIndex loadPartitionIndex(FileHandle.Builder fhBuilder, SequentialWriter writer) throws IOException
+    {
+        return PartitionIndex.load(fhBuilder, partitioner, false);   // writer is not used here; NOTE(review): presumably kept for overriding subclasses
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/trieindex/RowIndexTest.java b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/RowIndexTest.java
new file mode 100644
index 000000000000..592d1e595804
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/RowIndexTest.java
@@ -0,0 +1,521 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringBound;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ClusteringPrefix;
+import org.apache.cassandra.db.DeletionTime;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
+import org.apache.cassandra.io.tries.Walker;
+import org.apache.cassandra.io.util.DataOutputStreamPlus;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.RandomAccessReader;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.io.util.SequentialWriterOption;
+import org.apache.cassandra.utils.Pair;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class RowIndexTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(RowIndexTest.class);
+
+    static final ByteComparable.Version VERSION = Walker.BYTE_COMPARABLE_VERSION;
+
+    static final Random RANDOM;
+
+    static
+    {
+        // The seed is logged, presumably so that a failing run can be reproduced with the same sequence.
+        final long seed = System.currentTimeMillis();
+        RANDOM = new Random(seed);
+        logger.info("seed = " + seed);
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    static final ClusteringComparator comparator = new ClusteringComparator(UUIDType.instance);
+    static final long END_MARKER = 1L << 40;
+    static final int COUNT = 8192;
+
+    @Parameterized.Parameters()
+    public static Collection<Object[]> generateData()
+    {
+        Object[][] runs = { { Config.DiskAccessMode.standard }, { Config.DiskAccessMode.mmap } };   // one row per disk access mode
+        return Arrays.asList(runs);
+    }
+
+    @Parameterized.Parameter(value = 0)
+    public static Config.DiskAccessMode accessMode = Config.DiskAccessMode.standard;
+
+    @Test
+    public void testSingletons() throws IOException
+    {
+        Pair<List<ClusteringPrefix<?>>, RowIndexReader> random = generateRandomIndexSingletons(COUNT);
+        List<ClusteringPrefix<?>> keys = random.left;
+        // try-with-resources so the reader is also released when an assertion fails
+        try (RowIndexReader summary = random.right)
+        {
+            for (int i = 0; i < COUNT; i++)   // key i is expected to map to the entry with offset i
+                assertEquals(i, summary.separatorFloor(comparator.asByteComparable(keys.get(i))).offset);
+        }
+    }
+
+    @Test
+    public void testSpans() throws IOException
+    {
+        Pair<List<ClusteringPrefix<?>>, RowIndexReader> random = generateRandomIndexQuads(COUNT);
+        List<ClusteringPrefix<?>> keys = random.left;
+        int missCount = 0;
+        // try-with-resources so the reader is also released when an assertion fails
+        try (RowIndexReader summary = random.right)
+        {
+            IndexInfo ii;
+            for (int i = 0; i < COUNT; i++)
+            {
+                // These need to all be within the span
+                assertEquals(i, (ii = summary.separatorFloor(comparator.asByteComparable(keys.get(4 * i + 1)))).offset);
+                assertEquals(i, summary.separatorFloor(comparator.asByteComparable(keys.get(4 * i + 2))).offset);
+                assertEquals(i, summary.separatorFloor(comparator.asByteComparable(keys.get(4 * i + 3))).offset);
+
+                // check other data
+                assertEquals(i + 2, ii.openDeletion.markedForDeleteAt());
+                assertEquals(i - 3, ii.openDeletion.localDeletionTime());
+                // before entry. hopefully here, but could end up in prev if matches prevMax too well
+                ii = summary.separatorFloor(comparator.asByteComparable(keys.get(4 * i)));
+                if (ii.offset != i)
+                {
+                    ++missCount;
+                    assertEquals(i - 1, ii.offset);
+                }
+            }
+            ii = summary.separatorFloor(comparator.asByteComparable(keys.get(4 * COUNT)));
+            if (ii.offset != END_MARKER)
+            {
+                ++missCount;
+                assertEquals(COUNT - 1, ii.offset);
+            }
+            ii = summary.separatorFloor(comparator.asByteComparable(ClusteringBound.BOTTOM));
+            assertEquals(0, ii.offset);
+
+            ii = summary.separatorFloor(comparator.asByteComparable(ClusteringBound.TOP));
+            assertEquals(END_MARKER, ii.offset);
+        }
+        if (missCount > COUNT / 5)   // many misses are only logged, not treated as a failure
+            logger.error("Unexpectedly high miss count: {}/{}", missCount, COUNT);
+    }
+
+    File file;
+    DataOutputStreamPlus dos;
+    RowIndexWriter writer;
+    FileHandle fh;
+    long root;
+
+    @After
+    public void cleanUp()
+    {
+        FileUtils.closeQuietly(dos);   // complete() sets dos to null after closing it, so this may receive null
+        FileUtils.closeQuietly(writer);
+        FileUtils.closeQuietly(fh);   // fh is only assigned by completeAndRead()
+    }
+
+    public RowIndexTest() throws IOException
+    {
+        this(FileUtils.createTempFile("ColumnTrieReaderTest", ""));   // every test instance writes to its own temporary file
+    }
+
+    RowIndexTest(File file) throws IOException
+    {
+        this(file, new SequentialWriter(file, SequentialWriterOption.newBuilder().finishOnClose(true).build()));   // the index is written through a SequentialWriter on that file
+    }
+
+    RowIndexTest(File file, DataOutputStreamPlus dos) throws IOException
+    {
+        this.file = file;
+        this.dos = dos;
+        // Two UTF "JUNK" records are written ahead of the index; completeAndRead() reads them back.
+        // Presumably this checks that the index works when it does not start at the beginning of the file.
+        for (int n = 0; n < 2; ++n)
+            dos.writeUTF("JUNK");
+
+        writer = new RowIndexWriter(comparator, dos);
+    }
+
+    public void complete() throws IOException
+    {
+        root = writer.complete(END_MARKER);
+        for (int n = 0; n < 2; ++n)   // two more junk records after the index
+            dos.writeUTF("JUNK");
+        dos.close();
+        dos = null;   // already closed here; cleanUp() will see null
+    }
+
+    public RowIndexReader completeAndRead() throws IOException
+    {
+        complete();
+        boolean useMmap = accessMode == Config.DiskAccessMode.mmap;
+        try (FileHandle.Builder handleBuilder = new FileHandle.Builder(file.getPath()).mmapped(useMmap))
+        {
+            fh = handleBuilder.complete();
+            // The two junk records written by the constructor must read back intact.
+            try (RandomAccessReader prefixReader = fh.createReader())
+            {
+                for (int n = 0; n < 2; ++n)
+                    assertEquals("JUNK", prefixReader.readUTF());
+            }
+            return new RowIndexReader(fh, root);
+        }
+    }
+
+    @Test
+    public void testAddEmptyKey() throws Exception
+    {
+        ClusteringPrefix<?> emptyKey = Clustering.EMPTY;
+        writer.add(emptyKey, emptyKey, new IndexInfo(42, DeletionTime.LIVE));
+        try (RowIndexReader reader = completeAndRead())
+        {
+            // The single entry is the minimum.
+            assertEquals(42, reader.min().offset);
+
+            // The bottom bound lands on the single entry.
+            assertEquals(42, reader.separatorFloor(comparator.asByteComparable(ClusteringBound.BOTTOM)).offset);
+
+            // The top bound resolves to the end marker passed to writer.complete().
+            assertEquals(END_MARKER, reader.separatorFloor(comparator.asByteComparable(ClusteringBound.TOP)).offset);
+
+            // The empty key itself lands on the single entry.
+            assertEquals(42, reader.separatorFloor(comparator.asByteComparable(emptyKey)).offset);
+        }
+    }
+
+    @Test
+    public void testAddDuplicateEmptyThrow() throws Exception
+    {
+        ClusteringPrefix<?> emptyKey = Clustering.EMPTY;
+        AssertionError caught = null;
+        writer.add(emptyKey, emptyKey, new IndexInfo(42, DeletionTime.LIVE));
+        try
+        {
+            // the same key a second time: this add or the completion below has to trip an assertion
+            writer.add(emptyKey, emptyKey, new IndexInfo(43, DeletionTime.LIVE));
+            try (RowIndexReader ignored = completeAndRead())
+            {
+                // getting here means the duplicate went unnoticed
+            }
+        }
+        catch (AssertionError expected)
+        {
+            caught = expected;
+            logger.info("Got " + expected.getMessage());
+        }
+        Assert.assertNotNull("Should throw an assertion error.", caught);
+    }
+
+    @Test
+    public void testAddDuplicateThrow() throws Exception
+    {
+        ClusteringPrefix<?> repeated = generateRandomKey();
+        AssertionError caught = null;
+        writer.add(repeated, repeated, new IndexInfo(42, DeletionTime.LIVE));
+        try
+        {
+            // the same random key a second time: this add or the completion below has to trip an assertion
+            writer.add(repeated, repeated, new IndexInfo(43, DeletionTime.LIVE));
+            try (RowIndexReader ignored = completeAndRead())
+            {
+                // getting here means the duplicate went unnoticed
+            }
+        }
+        catch (AssertionError expected)
+        {
+            caught = expected;
+            logger.info("Got " + expected.getMessage());
+        }
+        Assert.assertNotNull("Should throw an assertion error.", caught);
+    }
+
+    @Test
+    public void testAddOutOfOrderThrow() throws Exception
+    {
+        ClusteringPrefix<?> larger = generateRandomKey();
+        ClusteringPrefix<?> smaller;
+        do // redraw until the second key sorts strictly before the first
+            smaller = generateRandomKey();
+        while (comparator.compare(larger, smaller) <= 0);
+
+        AssertionError caught = null;
+        writer.add(larger, larger, new IndexInfo(42, DeletionTime.LIVE));
+        try
+        {
+            writer.add(smaller, smaller, new IndexInfo(43, DeletionTime.LIVE));
+            try (RowIndexReader ignored = completeAndRead())
+            {
+                // getting here means the descending key went unnoticed
+            }
+        }
+        catch (AssertionError expected)
+        {
+            caught = expected;
+            logger.info("Got " + expected.getMessage());
+        }
+        Assert.assertNotNull("Should throw an assertion error.", caught);
+    }
+
+    @Test
+    public void testConstrainedIteration() throws IOException
+    {
+        // This is not too relevant: due to the way we construct separators we can't be good enough on the left side.
+        Pair<List<ClusteringPrefix<?>>, RowIndexReader> random = generateRandomIndexSingletons(COUNT);
+        List<ClusteringPrefix<?>> keys = random.left;
+        // each key was indexed under its own position in keys, so the offsets read below double as list indexes
+        for (int i = 0; i < 500; ++i)
+        {
+            boolean exactLeft = RANDOM.nextBoolean();
+            boolean exactRight = RANDOM.nextBoolean();
+            ClusteringPrefix<?> left = exactLeft ? keys.get(RANDOM.nextInt(keys.size())) : generateRandomKey();
+            ClusteringPrefix<?> right = exactRight ? keys.get(RANDOM.nextInt(keys.size())) : generateRandomKey();
+            if (comparator.compare(right, left) < 0)
+            {
+                ClusteringPrefix<?> t = left;
+                left = right;
+                right = t;
+                boolean b = exactLeft;
+                exactLeft = exactRight;
+                exactRight = b;
+            }
+            // from here on left <= right, and each exact* flag still belongs to the bound it was drawn for
+            try (RowIndexReverseIterator iter = new RowIndexReverseIterator(fh, root, comparator.asByteComparable(left), comparator.asByteComparable(right)))
+            {
+                IndexInfo indexInfo = iter.nextIndexInfo();
+                if (indexInfo == null)
+                {
+                    int idx = Collections.binarySearch(keys, right, comparator);
+                    if (idx < 0)
+                        idx = -2 - idx; // less than or equal
+                    if (idx <= 0)
+                        continue;
+                    assertTrue(comparator.asByteComparable(left) + " <= "
+                               + comparator.asByteComparable(keys.get(idx)) + " <= "
+                               + comparator.asByteComparable(right) + " but " + idx + " wasn't iterated.",
+                               comparator.compare(left, keys.get(idx - 1)) > 0);
+                    continue;
+                }
+                // idx: position of the first entry returned; the end marker is treated as one past the last key
+                int idx = (int) indexInfo.offset;
+                if (indexInfo.offset == END_MARKER)
+                    idx = keys.size();
+                if (idx > 0)
+                    assertTrue(comparator.compare(right, keys.get(idx - 1)) > 0);
+                if (idx < keys.size() - 1)
+                    assertTrue(comparator.compare(right, keys.get(idx + 1)) < 0);
+                if (exactRight)      // must be precise on exact, otherwise could be in any relation
+                    assertEquals(right, keys.get(idx));
+                while (true)
+                {
+                    --idx;
+                    IndexInfo ii = iter.nextIndexInfo();
+                    if (ii == null)
+                        break;
+                    assertEquals(idx, (int) ii.offset);
+                }
+                ++idx; // seek at last returned
+                if (idx < keys.size() - 1)
+                    assertTrue(comparator.compare(left, keys.get(idx + 1)) < 0);
+                // Because of the way we build the index (using non-prefix separator) we are usually going to miss the last item.
+                if (idx >= 2)
+                    assertTrue(comparator.compare(left, keys.get(idx - 2)) > 0);
+            }
+            catch (AssertionError e)
+            {
+                logger.error(e.getMessage(), e);
+                ClusteringPrefix<?> ll = left;
+                ClusteringPrefix<?> rr = right;
+                logger.info(keys.stream()
+                                       .filter(x -> comparator.compare(ll, x) <= 0 && comparator.compare(x, rr) <= 0)
+                                       .map(clustering -> comparator.asByteComparable(clustering))
+                                       .map(bc -> bc.byteComparableAsString(VERSION))
+                                       .collect(Collectors.joining(", ")));
+                logger.info("Left {}{} Right {}{}", comparator.asByteComparable(left), exactLeft ? "#" : "", comparator.asByteComparable(right), exactRight ? "#" : "");
+                try (RowIndexReverseIterator iter2 = new RowIndexReverseIterator(fh, root, comparator.asByteComparable(left), comparator.asByteComparable(right)))
+                {
+                    IndexInfo ii;
+                    while ((ii = iter2.nextIndexInfo()) != null) // replay for the log; the end marker is not a position in keys
+                    {
+                        logger.info(ii.offset == END_MARKER ? "END_MARKER" : comparator.asByteComparable(keys.get((int) ii.offset)).toString());
+                    }
+                    logger.info("Left {}{} Right {}{}", comparator.asByteComparable(left), exactLeft ? "#" : "", comparator.asByteComparable(right), exactRight ? "#" : "");
+                }
+                throw e;
+            }
+        }
+    }
+
+    @Test
+    public void testReverseIteration() throws IOException
+    {
+        Pair<List<ClusteringPrefix<?>>, RowIndexReader> random = generateRandomIndexSingletons(COUNT);
+        List<ClusteringPrefix<?>> keys = random.left;
+        // each key was indexed under its own position in keys, so the offsets read below double as list indexes
+        for (int i = 0; i < 1000; ++i)
+        {
+            boolean exactRight = RANDOM.nextBoolean();
+            ClusteringPrefix<?> right = exactRight ? keys.get(RANDOM.nextInt(keys.size())) : generateRandomKey();
+            // the left bound is ByteComparable.EMPTY, and the final check expects the walk to end at position 0
+            try (RowIndexReverseIterator iter = new RowIndexReverseIterator(fh, root, ByteComparable.EMPTY, comparator.asByteComparable(right)))
+            {
+                IndexInfo indexInfo = iter.nextIndexInfo();
+                if (indexInfo == null)
+                {
+                    int idx = Collections.binarySearch(keys, right, comparator);
+                    if (idx < 0)
+                        idx = -2 - idx; // less than or equal
+                    if (idx >= 0) // build the message only on failure: in the passing case idx is -1 and keys.get(idx) would throw
+                        throw new AssertionError(comparator.asByteComparable(keys.get(idx)) + " <= "
+                                                 + comparator.asByteComparable(right) + " but " + idx + " wasn't iterated.");
+                    continue;
+                }
+                // idx: position of the first entry returned; the end marker is treated as one past the last key
+                int idx = (int) indexInfo.offset;
+                if (indexInfo.offset == END_MARKER)
+                    idx = keys.size();
+                if (idx > 0)
+                    assertTrue(comparator.compare(right, keys.get(idx - 1)) > 0);
+                if (idx < keys.size() - 1)
+                    assertTrue(comparator.compare(right, keys.get(idx + 1)) < 0);
+                if (exactRight)      // must be precise on exact, otherwise could be in any relation
+                    assertEquals(right, keys.get(idx));
+                while (true)
+                {
+                    --idx;
+                    IndexInfo ii = iter.nextIndexInfo();
+                    if (ii == null)
+                        break;
+                    assertEquals(idx, (int) ii.offset);
+                }
+                assertEquals(-1, idx);
+            }
+            catch (AssertionError e)
+            {
+                logger.error(e.getMessage(), e);
+                ClusteringPrefix<?> rr = right;
+                logger.info(keys.stream()
+                                       .filter(x -> comparator.compare(x, rr) <= 0)
+                                       .map(comparator::asByteComparable)
+                                       .map(bc -> bc.byteComparableAsString(VERSION))
+                                       .collect(Collectors.joining(", ")));
+                logger.info("Right {}{}", comparator.asByteComparable(right), exactRight ? "#" : "");
+                try (RowIndexReverseIterator iter2 = new RowIndexReverseIterator(fh, root, ByteComparable.EMPTY, comparator.asByteComparable(right)))
+                {
+                    IndexInfo ii;
+                    while ((ii = iter2.nextIndexInfo()) != null) // replay for the log; the end marker is not a position in keys
+                    {
+                        logger.info(ii.offset == END_MARKER ? "END_MARKER" : comparator.asByteComparable(keys.get((int) ii.offset)).toString());
+                    }
+                }
+                logger.info("Right {}{}", comparator.asByteComparable(right), exactRight ? "#" : "");
+                throw e;
+            }
+        }
+    }
+
+    private Pair<List<ClusteringPrefix<?>>, RowIndexReader> generateRandomIndexSingletons(int size) throws IOException
+    {
+        List<ClusteringPrefix<?>> sorted = generateList(size);
+        for (int pos = 0; pos < size; pos++)
+        {
+            ClusteringPrefix<?> current = sorted.get(pos);
+            // adjacent keys must be strictly ascending, both under the comparator and in byte-comparable form
+            assert pos == 0 || comparator.compare(sorted.get(pos - 1), current) < 0;
+            assert pos == 0 || ByteComparable.compare(comparator.asByteComparable(sorted.get(pos - 1)), comparator.asByteComparable(current), VERSION) < 0 :
+            String.format("%s bs %s versus %s bs %s", sorted.get(pos - 1).clustering().clusteringString(comparator.subtypes()), comparator.asByteComparable(sorted.get(pos - 1)), current.clustering().clusteringString(comparator.subtypes()), comparator.asByteComparable(current));
+            writer.add(current, current, new IndexInfo(pos, DeletionTime.LIVE)); // offset == position in the sorted list
+        }
+        return Pair.create(sorted, completeAndRead());
+    }
+
+    /** Collects {@code size} distinct keys from {@link #generateRandomKey()} and returns them sorted by {@code comparator}. */
+    List<ClusteringPrefix<?>> generateList(int size)
+    {
+        Set<ClusteringPrefix<?>> seen = Sets.newHashSet();
+        List<ClusteringPrefix<?>> keys = Lists.newArrayList();
+        while (keys.size() < size)
+        {
+            ClusteringPrefix<?> candidate = generateRandomKey();
+            // keys must be unique: a candidate that was drawn before is dropped and another one is generated
+            if (seen.add(candidate))
+                keys.add(candidate);
+        }
+        keys.sort(comparator);
+        return keys;
+    }
+
+    private Pair<List<ClusteringPrefix<?>>, RowIndexReader> generateRandomIndexQuads(int size) throws IOException
+    {
+        // entry n is added with the keys at 4n+1 and 4n+3; the keys at 4n and 4n+2 are never given to the writer
+        List<ClusteringPrefix<?>> sorted = generateList(4 * size + 1);
+        for (int entry = 0; entry < size; entry++)
+            writer.add(sorted.get(entry * 4 + 1), sorted.get(entry * 4 + 3), new IndexInfo(entry, new DeletionTime(entry + 2, entry - 3)));
+
+        return Pair.create(sorted, completeAndRead());
+    }
+
+    /** Returns a key made by {@code comparator} from a UUID supplied by {@link #randomSeededUUID()}. */
+    ClusteringPrefix<?> generateRandomKey()
+    {
+        // no intermediate locals: the UUID goes straight into comparator.make
+        return comparator.make(randomSeededUUID());
+    }
+
+    private static UUID randomSeededUUID()
+    {
+        byte[] randomBytes = new byte[16];
+        RANDOM.nextBytes(randomBytes); // all randomness comes from the shared RANDOM, not from UUID.randomUUID()
+        return UUID.nameUUIDFromBytes(randomBytes); // name-based (type 3) UUID computed from those 16 bytes
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormatUtil.java b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormatUtil.java
new file mode 100644
index 000000000000..2e4fd1586708
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormatUtil.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.io.sstable.format.trieindex;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReader.OpenReason;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.io.sstable.metadata.MetadataType;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.io.util.SequentialWriter;
+import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.FilterFactory;
+
+import static org.apache.cassandra.service.ActiveRepairService.UNREPAIRED_SSTABLE;
+
+public class TrieIndexFormatUtil
+{
+    /** Index built from a builder that never received a key; assigned once by the static initializer below. */
+    public static final PartitionIndex partitionIndex;
+
+    static
+    {
+        try
+        {
+            File indexFile = FileUtils.createTempFile("empty-index", "db");
+            try (SequentialWriter out = new SequentialWriter(indexFile);
+                 FileHandle.Builder handleBuilder = new FileHandle.Builder(indexFile.getPath());
+                 PartitionIndexBuilder indexBuilder = new PartitionIndexBuilder(out, handleBuilder))
+            {
+                // complete() is called without any preceding additions, then the result is loaded back
+                indexBuilder.complete();
+                partitionIndex = PartitionIndex.load(handleBuilder, Util.testPartitioner(), false);
+            }
+        }
+        catch (IOException e)
+        {
+            throw new AssertionError(e);
+        }
+    }
+
+    /** Opens a reader through internalOpen with a newly built header and stats, a shared copy of {@link #partitionIndex} and FilterFactory.AlwaysPresent. */
+    public static SSTableReader emptyReader(Descriptor desc, Set<Component> components, TableMetadataRef metadata,
+                                            FileHandle ifile, FileHandle dfile)
+    {
+        SerializationHeader serializationHeader = SerializationHeader.make(metadata.get(), Collections.emptyList());
+        StatsMetadata stats = (StatsMetadata) new MetadataCollector(metadata.get().comparator)
+                                              .finalizeMetadata(metadata.get().partitioner.getClass().getCanonicalName(), metadata.get().params.bloomFilterFpChance, UNREPAIRED_SSTABLE, null, false, serializationHeader)
+                                              .get(MetadataType.STATS);
+        return TrieIndexSSTableReader.internalOpen(desc, components, metadata, ifile, dfile,
+                                                   partitionIndex.sharedCopy(), FilterFactory.AlwaysPresent,
+                                                   1, stats, OpenReason.NORMAL, serializationHeader);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
index 79cf83162877..21c41e4e54fd 100644
--- a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
@@ -175,7 +175,7 @@ public void testOldReadsNew(String oldV, String newV) throws IOException
         File statsFileLb = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(newV));
         File statsFileLa = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(oldV));
         // Reading both as earlier version should yield identical results.
-        SSTableFormat.Type stype = SSTableFormat.Type.current();
+        SSTableFormat.Type stype = SSTableFormat.Type.BIG;
         Descriptor desc = new Descriptor(stype.info.getVersion(oldV), statsFileLb.getParentFile(), "", "", 0, stype);
         try (RandomAccessReader inLb = RandomAccessReader.open(statsFileLb);
              RandomAccessReader inLa = RandomAccessReader.open(statsFileLa))
diff --git a/test/unit/org/apache/cassandra/io/tries/WalkerTest.java b/test/unit/org/apache/cassandra/io/tries/WalkerTest.java
index cfd3f88c326f..015042aae0e0 100644
--- a/test/unit/org/apache/cassandra/io/tries/WalkerTest.java
+++ b/test/unit/org/apache/cassandra/io/tries/WalkerTest.java
@@ -169,9 +169,8 @@ public void testPartialTail() throws IOException
         long rootPos = builder.complete();
         Rebufferer source = new ByteBufRebufferer(buf.asNewBuffer());
         Rebufferer partialSource = new TailOverridingRebufferer(new ByteBufRebufferer(buf.asNewBuffer()), ptail.cutoff(), ptail.tail());
-
-        InternalIterator it = new InternalIterator(source, rootPos, source("151"), source("515"), true);
-        InternalIterator tailIt = new InternalIterator(partialSource, ptail.root(), source("151"), source("515"), true);
+        InternalIterator it = new InternalIterator(new ByteBufRebufferer(buf.asNewBuffer()), rootPos, source("151"), source("515"), true);
+        InternalIterator tailIt = new InternalIterator(new TailOverridingRebufferer(new ByteBufRebufferer(buf.asNewBuffer()), ptail.cutoff(), ptail.tail()), ptail.root(), source("151"), source("515"), true);
 
         while (true)
         {
@@ -186,6 +185,9 @@ public void testPartialTail() throws IOException
             int f1 = TrieNode.at(bh1.buffer(), (int) (i1 - bh1.offset())).payloadFlags(bh1.buffer(), (int) (i1 - bh1.offset()));
             int f2 = TrieNode.at(bh2.buffer(), (int) (i2 - bh2.offset())).payloadFlags(bh2.buffer(), (int) (i2 - bh2.offset()));
             assertEquals(f1, f2);
+
+            bh2.release();
+            bh1.release();
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java b/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
index 937588d6e22f..7513dc1f24b0 100644
--- a/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
+++ b/test/unit/org/apache/cassandra/io/util/TailOverridingRebuffererTest.java
@@ -59,6 +59,7 @@ public void testAccessLeftToTailFully()
         {
             Rebufferer.BufferHolder bh = tor.rebuffer(i);
             assertEquals(head, bh.buffer());
+            bh.release();
         }
 
         assertEquals(10, tor.fileLength());
@@ -76,6 +77,7 @@ public void testAccessLeftToTailPartial()
         {
             Rebufferer.BufferHolder bh = tor.rebuffer(i);
             assertEquals(head.limit(6), bh.buffer());
+            bh.release();
         }
 
         assertEquals(10, tor.fileLength());
@@ -93,6 +95,7 @@ public void testAccessRightToTail()
         {
             Rebufferer.BufferHolder bh = tor.rebuffer(i);
             assertEquals(tail, bh.buffer());
+            bh.release();
         }
 
         assertEquals(10, tor.fileLength());
diff --git a/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java b/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
index bc34f15dd626..f7657888801d 100644
--- a/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
+++ b/test/unit/org/apache/cassandra/io/util/WrappingRebuffererTest.java
@@ -22,6 +22,7 @@
 
 import org.junit.Test;
 
+import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertSame;
@@ -30,10 +31,10 @@
 public class WrappingRebuffererTest
 {
     @Test
-    public void testRecycleSameHolder()
+    public void testRebufferRelease()
     {
         TestRebufferer mock = new TestRebufferer();
-        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock))
+        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock) {})
         {
             Rebufferer.BufferHolder ret = rebufferer.rebuffer(0);
             assertNotNull(ret);
@@ -42,40 +43,44 @@ public void testRecycleSameHolder()
 
             ret.release();
             assertTrue(mock.released);
-
-            assertSame(ret, rebufferer.rebuffer(0)); // same buffer holder was recycled
         }
     }
 
     @Test
-    public void testRecycleTwoHolders()
+    public void testRebufferReleaseFailingContract()
     {
         TestRebufferer mock = new TestRebufferer();
-        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock))
+        try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock) {})
         {
-
             Rebufferer.BufferHolder ret1 = rebufferer.rebuffer(0);
             assertNotNull(ret1);
             assertEquals(mock.buffer(), ret1.buffer());
             assertEquals(mock.offset(), ret1.offset());
 
-            Rebufferer.BufferHolder ret2 = rebufferer.rebuffer(1);
-            assertNotNull(ret2);
-            assertEquals(mock.buffer(), ret2.buffer());
-            assertEquals(mock.offset(), ret2.offset());
-
+            assertThatExceptionOfType(AssertionError.class).isThrownBy(() -> rebufferer.rebuffer(1));
             ret1.release();
-            assertTrue(mock.released);
 
-            mock.released = false;
-            ret2.release();
             assertTrue(mock.released);
-
-            assertSame(ret2, rebufferer.rebuffer(0)); // first buffer holder was recycled
-            assertSame(ret1, rebufferer.rebuffer(1)); // second buffer holder was recycled
+            assertThatExceptionOfType(AssertionError.class).isThrownBy(ret1::buffer);
+            assertThatExceptionOfType(AssertionError.class).isThrownBy(ret1::offset);
         }
     }
 
+    @Test
+    public void testRebufferReleaseFailingContractWhenClosing()
+    {
+        assertThatExceptionOfType(AssertionError.class).isThrownBy(() -> {
+            TestRebufferer mock = new TestRebufferer();
+            try (WrappingRebufferer rebufferer = new WrappingRebufferer(mock) {})
+            {
+                Rebufferer.BufferHolder ret1 = rebufferer.rebuffer(0);
+                assertNotNull(ret1);
+                assertEquals(mock.buffer(), ret1.buffer());
+                assertEquals(mock.offset(), ret1.offset());
+            }
+        });
+    }
+
 
     private static class TestRebufferer implements Rebufferer, Rebufferer.BufferHolder
     {
diff --git a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
index e76123ee5907..58feb1d7adff 100644
--- a/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
+++ b/test/unit/org/apache/cassandra/streaming/StreamTransferTaskTest.java
@@ -140,12 +140,12 @@ public void testFailSessionDuringTransferShouldNotReleaseReferences() throws Exc
 
         // create streaming task that streams those two sstables
         StreamTransferTask task = new StreamTransferTask(session, cfs.metadata.id);
-        List<Ref<SSTableReader>> refs = new ArrayList<>(cfs.getLiveSSTables().size());
+        List<Ref<? extends SSTableReader>> refs = new ArrayList<>(cfs.getLiveSSTables().size());
         for (SSTableReader sstable : cfs.getLiveSSTables())
         {
             List<Range<Token>> ranges = new ArrayList<>();
             ranges.add(new Range<>(sstable.first.getToken(), sstable.last.getToken()));
-            Ref<SSTableReader> ref = sstable.selfRef();
+            Ref<? extends SSTableReader> ref = sstable.selfRef();
             refs.add(ref);
             task.addTransferStream(new CassandraOutgoingFile(StreamOperation.BOOTSTRAP, ref, sstable.getPositionsForRanges(ranges), ranges, 1));
         }
@@ -167,7 +167,7 @@ public void testFailSessionDuringTransferShouldNotReleaseReferences() throws Exc
         session.onError(new Exception("Fake exception")).get(5, TimeUnit.SECONDS);
 
         //make sure reference was not released
-        for (Ref<SSTableReader> ref : refs)
+        for (Ref<? extends SSTableReader> ref : refs)
         {
             assertEquals(1, ref.globalCount());
         }
@@ -189,7 +189,7 @@ public void testFailSessionDuringTransferShouldNotReleaseReferences() throws Exc
         }
 
         //now reference should be released
-        for (Ref<SSTableReader> ref : refs)
+        for (Ref<? extends SSTableReader> ref : refs)
         {
             assertEquals(0, ref.globalCount());
         }
diff --git a/test/unit/org/apache/cassandra/tools/JMXCompatabilityTest.java b/test/unit/org/apache/cassandra/tools/JMXCompatabilityTest.java
index cb9f5a5c7ba5..61eef67a714f 100644
--- a/test/unit/org/apache/cassandra/tools/JMXCompatabilityTest.java
+++ b/test/unit/org/apache/cassandra/tools/JMXCompatabilityTest.java
@@ -29,6 +29,7 @@
 
 import com.datastax.driver.core.SimpleStatement;
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.service.CassandraDaemon;
 import org.apache.cassandra.service.GCInspector;
 import org.apache.cassandra.tools.ToolRunner.ToolResult;
@@ -172,6 +173,11 @@ private void diff(List<String> excludeObjects, List<String> excludeAttributes, L
             args.add("--exclude-object");
             args.add(a);
         });
+        if (SSTableFormat.Type.current() == SSTableFormat.Type.BTI)
+        {
+            args.add("--exclude-object");
+            args.add("org.apache.cassandra.metrics:type=Index,scope=RowIndexEntry,name=.*");
+        }
         excludeAttributes.forEach(a -> {
             args.add("--exclude-attribute");
             args.add(a);
diff --git a/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java b/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
index 0aad6b863ead..2a45ffdc5d48 100644
--- a/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
+++ b/test/unit/org/apache/cassandra/tools/StandaloneSplitterWithCQLTesterTest.java
@@ -23,6 +23,7 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
+import java.util.Objects;
 import java.util.Set;
 import java.util.stream.Collectors;
 
@@ -38,6 +39,7 @@
 import org.apache.cassandra.tools.ToolRunner.ToolResult;
 import org.assertj.core.api.Assertions;
 
+import static org.assertj.core.api.AssertionsForClassTypes.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
@@ -119,7 +121,7 @@ public void testNoSnapshotOption() throws Throwable
     {
         restoreOrigSstables();
         ToolResult tool  = ToolRunner.invokeClass(StandaloneSplitter.class, "-s", "1", "--no-snapshot", sstableFileName);
-        assertTrue(origSstables.size() < Arrays.asList(sstablesDir.listFiles()).size());
+        assertThat(origSstables.size()).isLessThan(sstablesDir.listFiles().length);
         assertTrue(tool.getStdout(), tool.getStdout().isEmpty());
         assertTrue(tool.getCleanedStderr(), tool.getCleanedStderr().isEmpty());
         assertEquals(0, tool.getExitCode());
@@ -167,15 +169,15 @@ private void setupTestSstables() throws Throwable
 
     private void restoreOrigSstables()
     {
-        Arrays.asList(sstablesDir.listFiles()).stream().forEach(f -> {
+        Arrays.stream(Objects.requireNonNull(sstablesDir.listFiles())).forEach(f -> {
             if (f.isFile())
                 f.delete();
         });
-        Arrays.asList(sstablesBackupDir.listFiles()).stream().forEach(f -> {
+        Arrays.stream(Objects.requireNonNull(sstablesBackupDir.listFiles())).forEach(f -> {
             if (f.isFile())
                 try
                 {
-                    Files.copy(f, new File(sstablesDir.getAbsolutePath() + "/" + f.getName()));
+                    Files.copy(f, new File(sstablesDir.getAbsolutePath() + '/' + f.getName()));
                 }
                 catch(IOException e)
                 {
diff --git a/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java b/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
index ddf3d28b4c22..0a890bfd4fc3 100644
--- a/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
+++ b/test/unit/org/apache/cassandra/tools/StandaloneUpgraderOnSStablesTest.java
@@ -53,7 +53,7 @@
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class StandaloneUpgraderOnSStablesTest
 {
-    String legacyId = LegacySSTableTest.legacyVersions[LegacySSTableTest.legacyVersions.length - 1];
+    String legacyId = "ma";
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
diff --git a/test/unit/org/apache/cassandra/tools/nodetool/RingTest.java b/test/unit/org/apache/cassandra/tools/nodetool/RingTest.java
index 83771f631475..2042fce183dc 100644
--- a/test/unit/org/apache/cassandra/tools/nodetool/RingTest.java
+++ b/test/unit/org/apache/cassandra/tools/nodetool/RingTest.java
@@ -79,6 +79,7 @@ public void testRingOutput()
     private void validateRingOutput(String hostForm, String... args)
     {
         ToolRunner.ToolResult nodetool = ToolRunner.invokeNodetool(args);
+        logger.info(nodetool.getStdout());
         nodetool.assertOnCleanExit();
         /*
          Datacenter: datacenter1
diff --git a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
index 88aa6a4aeb04..6bfe620954d3 100644
--- a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
+++ b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
@@ -133,7 +133,7 @@ ColumnFamilyStore initCf(StressProfile stressProfile, boolean loadSSTables)
 
                 try
                 {
-                    SSTableReader sstable = SSTableReader.openNoValidation(entry.getKey(), components, cfs);
+                    SSTableReader sstable = entry.getKey().getFormat().getReaderFactory().openNoValidation(entry.getKey(), components, cfs);
                     sstables.add(sstable);
                 }
                 catch (Exception e)
diff --git a/update-history/STAR-801/50-568d8ed4e8 STAR-247: TrieIndex SSTable format implementation b/update-history/STAR-801/50-568d8ed4e8 STAR-247: TrieIndex SSTable format implementation
new file mode 100644
index 000000000000..87d7db71bf29
--- /dev/null
+++ b/update-history/STAR-801/50-568d8ed4e8 STAR-247: TrieIndex SSTable format implementation	
@@ -0,0 +1,348 @@
+--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
++++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java
+@@ -157,10 +157,6 @@
+ 
+         if (sstable.descriptor.getFormat().supportedComponents().contains(Component.SUMMARY))
+         {
+-<<<<<<<
+-            outputHandler.output("Index summary is corrupt - if it is removed it will get rebuilt on startup "+sstable.descriptor.filenameFor(Component.SUMMARY));
+-            outputHandler.warn(t);
+-=======
+             try
+             {
+                 outputHandler.debug("Deserializing index summary for " + sstable);
+@@ -169,8 +165,7 @@
+             catch (Throwable t)
+             {
+                 outputHandler.output("Index summary is corrupt - if it is removed it will get rebuilt on startup " + sstable.descriptor.filenameFor(Component.SUMMARY));
+-                outputHandler.warn(t.getMessage());
+->>>>>>>
++                outputHandler.warn(t);
+             markAndThrow(t, false);
+             }
+         }
+--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+@@ -23,10 +23,12 @@
+ import java.io.File;
+ import java.io.IOException;
+ import java.nio.file.Files;
++import java.nio.file.Path;
+ import java.nio.file.Paths;
+ import java.util.Set;
+ import java.util.concurrent.TimeUnit;
+ 
++import com.google.common.collect.ImmutableMap;
+ import org.slf4j.Logger;
+ import org.slf4j.LoggerFactory;
+ 
+@@ -35,19 +37,9 @@
+ import org.apache.cassandra.config.DatabaseDescriptor;
+ import org.apache.cassandra.db.DecoratedKey;
+ import org.apache.cassandra.db.SerializationHeader;
+-<<<<<<<
+ import org.apache.cassandra.io.sstable.*;
+-import org.apache.cassandra.io.sstable.metadata.MetadataType;
+-=======
+-import org.apache.cassandra.io.sstable.Component;
+-import org.apache.cassandra.io.sstable.CorruptSSTableException;
+-import org.apache.cassandra.io.sstable.Descriptor;
+-import org.apache.cassandra.io.sstable.Downsampling;
+-import org.apache.cassandra.io.sstable.IndexSummary;
+-import org.apache.cassandra.io.sstable.IndexSummaryBuilder;
+-import org.apache.cassandra.io.sstable.SSTable;
+ import org.apache.cassandra.io.sstable.format.big.BigTableReader;
+->>>>>>>
++import org.apache.cassandra.io.sstable.metadata.MetadataType;
+ import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+ import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+ import org.apache.cassandra.io.util.DiskOptimizationStrategy;
+@@ -55,29 +47,7 @@
+ import org.apache.cassandra.io.util.FileUtils;
+ import org.apache.cassandra.schema.TableMetadata;
+ import org.apache.cassandra.schema.TableMetadataRef;
+-<<<<<<<
+ import org.apache.cassandra.utils.*;
+-import org.slf4j.Logger;
+-import org.slf4j.LoggerFactory;
+-
+-import java.io.BufferedInputStream;
+-import java.io.DataInputStream;
+-import java.io.File;
+-import java.io.IOException;
+-import java.nio.file.Files;
+-import java.nio.file.Path;
+-import java.nio.file.Paths;
+-import java.util.Set;
+-import java.util.concurrent.TimeUnit;
+-=======
+-import org.apache.cassandra.utils.BloomFilterSerializer;
+-import org.apache.cassandra.utils.ByteBufferUtil;
+-import org.apache.cassandra.utils.FBUtilities;
+-import org.apache.cassandra.utils.FilterFactory;
+-import org.apache.cassandra.utils.IFilter;
+->>>>>>>
+-
+-import com.google.common.collect.ImmutableMap;
+ 
+ public abstract class SSTableReaderBuilder
+ {
+@@ -121,44 +91,12 @@
+ 
+     public abstract SSTableReader build();
+ 
+-<<<<<<<
+-    public SSTableReaderBuilder dfile(FileHandle dfile)
+-    {
+-        this.dfile = dfile;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder ifile(FileHandle ifile)
+-    {
+-        this.ifile = ifile;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder bf(IFilter bf)
+-    {
+-        this.bf = bf;
+-        return this;
+-    }
+-
+-    public SSTableReaderBuilder summary(IndexSummary summary)
+-    {
+-        this.summary = summary;
+-        return this;
+-    }
+-
+     @SuppressWarnings("resource")
+     public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor, Component component)
+     {
+         return new FileHandle.Builder(descriptor.filenameFor(component))
+-               .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
+-               .withChunkCache(ChunkCache.instance);
+-=======
+-    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor)
+-    {
+-        return new FileHandle.Builder(descriptor.filenameFor(Component.PRIMARY_INDEX))
+                 .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
+                 .withChunkCache(ChunkCache.instance);
+->>>>>>>
+     }
+ 
+     @SuppressWarnings("resource")
+--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
++++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
+@@ -38,13 +38,9 @@
+ import java.util.concurrent.ExecutionException;
+ import java.util.concurrent.atomic.AtomicInteger;
+ 
+-<<<<<<<
+-
+-=======
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Sets;
+ import org.apache.commons.lang3.ArrayUtils;
+->>>>>>>
+ import org.junit.AfterClass;
+ import org.junit.Assume;
+ import org.junit.Before;
+@@ -319,23 +315,11 @@
+         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+ 
+         // overwrite one row with garbage
+-<<<<<<<
+         corrupt.accept(sstable, keys);
+-=======
+-        overrideWithGarbage(sstable, ByteBufferUtil.bytes("0"), ByteBufferUtil.bytes("1"), (byte)0x7A);
+->>>>>>>
+ 
+         // with skipCorrupted == false, the scrub is expected to fail
+         if (!isFullyRecoverable)
+         {
+-<<<<<<<
+-            // with skipCorrupted == true, the corrupt row will be skipped
+-            scrubber.scrub();
+-            fail("Expected a CorruptSSTableException to be thrown");
+-        }
+-        catch (IOError err) {
+-            // no assertion on the cause since caused may be different for different SSTable formats
+-=======
+             try (LifecycleTransaction txn = cfs.getTracker().tryModify(Arrays.asList(sstable), OperationType.SCRUB);
+                  Scrubber scrubber = new Scrubber(cfs, txn, false, true))
+             {
+@@ -346,7 +330,6 @@
+             catch (IOError err)
+             {
+             }
+->>>>>>>
+         }
+ 
+         try (LifecycleTransaction txn = cfs.getTracker().tryModify(ImmutableList.of(sstable), OperationType.SCRUB);
+@@ -526,7 +509,7 @@
+         }
+     }
+ 
+-    private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2, byte junk) throws IOException
++    private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2) throws IOException
+     {
+         overrideWithGarbage(sstable, key1, key2, (byte) 'z');
+     }
+@@ -561,7 +544,7 @@
+         overrideWithGarbage(sstable, startPosition, endPosition, junk);
+     }
+ 
+-    private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition, byte junk) throws IOException
++    private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition) throws IOException
+     {
+         overrideWithGarbage(sstable, startPosition, endPosition, (byte) 'z');
+     }
+@@ -576,17 +559,10 @@
+         try (RandomAccessFile file = new RandomAccessFile(path, "rw"))
+         {
+             file.seek(startPosition);
+-<<<<<<<
+-            int length = (int) (endPosition - startPosition);
++            int length = (int)(endPosition - startPosition);
+             byte[] buff = new byte[length];
+             Arrays.fill(buff, junk);
+             file.write(buff, 0, length);
+-=======
+-            int length = (int)(endPosition - startPosition);
+-        byte[] buff = new byte[length];
+-        Arrays.fill(buff, junk);
+-        file.write(buff, 0, length);
+->>>>>>>
+         }
+         if (ChunkCache.instance != null)
+             ChunkCache.instance.invalidateFile(path);
+--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
++++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+@@ -31,13 +31,10 @@
+ 
+ import com.google.common.collect.ImmutableList;
+ import com.google.common.collect.Sets;
+-<<<<<<<
+ import com.google.common.util.concurrent.Uninterruptibles;
+-=======
+-import org.junit.Assume;
+->>>>>>>
+ import org.junit.After;
+ import org.junit.Assert;
++import org.junit.Assume;
+ import org.junit.BeforeClass;
+ import org.junit.Rule;
+ import org.junit.Test;
+@@ -64,14 +61,11 @@
+ import org.apache.cassandra.dht.Token;
+ import org.apache.cassandra.index.Index;
+ import org.apache.cassandra.io.FSReadError;
+-import org.apache.cassandra.io.sstable.format.SSTableReader;
+-<<<<<<<
+ import org.apache.cassandra.io.sstable.format.SSTableFormat;
+-=======
++import org.apache.cassandra.io.sstable.format.SSTableReader;
+ import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+ import org.apache.cassandra.io.sstable.metadata.MetadataType;
+ import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+->>>>>>>
+ import org.apache.cassandra.io.util.FileDataInput;
+ import org.apache.cassandra.io.util.MmappedRegions;
+ import org.apache.cassandra.schema.CachingParams;
+@@ -84,11 +78,8 @@
+ import org.apache.cassandra.utils.BloomFilter;
+ import org.apache.cassandra.utils.ByteBufferUtil;
+ import org.apache.cassandra.utils.FilterFactory;
+-<<<<<<<
+ import org.apache.cassandra.utils.IFilter;
+-=======
+ import org.apache.cassandra.utils.PageAware;
+->>>>>>>
+ 
+ import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
+ import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
+diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+index 470d2cd39f..f7e34a25cc 100644
+--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+@@ -520,7 +520,8 @@ public abstract class SSTableReader extends SSTable implements SelfRefCounted<SS
+      * @return {@link SSTableReader}
+      * @throws IOException
+      */
+-    private static SSTableReader open(Descriptor descriptor,
++    @VisibleForTesting
++    public static SSTableReader open(Descriptor descriptor,
+                                       Set<Component> components,
+                                       TableMetadataRef metadata,
+                                       boolean validate,
+diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+index 43146fe9cd..4b2248612b 100644
+--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+@@ -92,7 +92,7 @@ public abstract class SSTableReaderBuilder
+     public abstract SSTableReader build();
+ 
+     @SuppressWarnings("resource")
+-    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor)
++    public static FileHandle.Builder defaultIndexHandleBuilder(Descriptor descriptor, Component component)
+     {
+         return new FileHandle.Builder(descriptor.filenameFor(component))
+                 .mmapped(DatabaseDescriptor.getIndexAccessMode() == Config.DiskAccessMode.mmap)
+diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
+index a4eb0c91f9..88fcb5e1e5 100644
+--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
+@@ -872,7 +872,7 @@ public class TrieIndexSSTableReader extends SSTableReader
+     {
+         try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(descriptor.filenameFor(Component.FILTER))))))
+         {
+-            return BloomFilterSerializer.deserialize(stream, oldBfFormat);
++            return BloomFilter.serializer.deserialize(stream, oldBfFormat);
+         }
+         catch (Throwable t)
+         {
+@@ -912,7 +912,7 @@ public class TrieIndexSSTableReader extends SSTableReader
+             try (SeekableByteChannel fos = Files.newByteChannel(path.toPath(), StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE);
+                  DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
+             {
+-                BloomFilterSerializer.serialize((BloomFilter) bf, stream);
++                BloomFilter.serializer.serialize((BloomFilter) bf, stream);
+                 stream.flush();
+                 SyncUtil.sync((FileChannel) fos);
+             }
+diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
+index 96438ae7c5..3248001f4c 100644
+--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
++++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableWriter.java
+@@ -495,7 +495,7 @@ public class TrieIndexSSTableWriter extends SSTableWriter
+                      DataOutputStreamPlus stream = new BufferedDataOutputStreamPlus(fos))
+                 {
+                     // bloom filter
+-                    BloomFilterSerializer.serialize((BloomFilter) bf, stream);
++                    BloomFilter.serializer.serialize((BloomFilter) bf, stream);
+                     stream.flush();
+                     SyncUtil.sync((FileChannel) fos);
+                 }
+diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
+index aa28c246ef..53bf6b4d55 100644
+--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
++++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
+@@ -509,7 +509,7 @@ public class ScrubTest
+         }
+     }
+ 
+-    private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2, byte junk) throws IOException
++    private void overrideWithGarbage(SSTableReader sstable, ByteBuffer key1, ByteBuffer key2) throws IOException
+     {
+         overrideWithGarbage(sstable, key1, key2, (byte) 'z');
+     }
+@@ -544,7 +544,7 @@ public class ScrubTest
+         overrideWithGarbage(sstable, startPosition, endPosition, junk);
+     }
+ 
+-    private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition, byte junk) throws IOException
++    private void overrideWithGarbage(SSTableReader sstable, long startPosition, long endPosition) throws IOException
+     {
+         overrideWithGarbage(sstable, startPosition, endPosition, (byte) 'z');
+     }

From 98d8c278caecf1592b1a3876edfbcf1db3421253 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Tue, 20 Apr 2021 11:11:38 +0200
Subject: [PATCH 054/151] STAR-254 Port DSE custom CQL types (#107)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* STAR-254 copy DSE 6.8.11 additional types

* STAR-254 change DataStax copyrights to Apache

* STAR-254 add esri-geometry-api

2.2.4 is currently the latest release.

This is a different version than the one in DSE 6.8.11
(1.2.1). This one doesn't have the following dependencies:
org.codehaus.jackson:jackson-core-asl and org.json:json.
Instead it relies on com.fasterxml.jackson.core:jackson-core
that is already present on C* classpath.

* STAR-254 fix DateRangeUtil

by moving some Apache Lucene™ methods. Using the exact same code
as DSE ensures compatibility and at the same time avoids dragging in
the whole lucene jar to this project.

* STAR-254 adjust to OSS serializer interface

DateRangeSerializer was adjusted to the new interface.
The instances are created without slicing a new ByteBuffer,
unfortunately it implied manual offset counting (duh!).

Geometric TypeSerializer uses a sliced buffer as using the
ValueAccessor is not possible in this case (the buffer is
eventually passed to an external library).

* STAR-254 fix DateRangeSerializerTest

Adjust to Cassandra test parametrization.

* STAR-254 fix tests

* STAR-254 fix GeometryIntegrationTest

Allocation type parameters are removed; dedicated tests
for endianness are added instead.

Fake (!) endianness logic for OGCGeometry was simplified.

* STAR-254 fix geo tests by adjusting esri

Polygon assertion was changed for the equivalent polygon.
Esri is now more consistent in results of to* methods
(polygon points are always listed clockwise).

(cherry picked from commit af6e4e6325615afc57c1c416f55b462ab29e0401)
(cherry picked from commit baade55bb28ca02e83e15756b809702ab46bb946)
---
 bin/cqlsh.py                                  |   8 +-
 build.xml                                     |   2 +
 pylib/cqlshlib/daterangetype.py               | 116 +++++
 pylib/cqlshlib/geotypes.py                    | 119 ++++++
 .../db/marshal/AbstractGeometricType.java     | 155 +++++++
 .../cassandra/db/marshal/DateRangeType.java   |  95 +++++
 .../cassandra/db/marshal/GeometryCodec.java   |  71 +++
 .../cassandra/db/marshal/LineStringType.java  |  32 ++
 .../cassandra/db/marshal/PointType.java       |  32 ++
 .../cassandra/db/marshal/PolygonType.java     |  32 ++
 .../db/marshal/datetime/DateRange.java        | 403 ++++++++++++++++++
 .../db/marshal/datetime/DateRangeUtil.java    | 286 +++++++++++++
 .../db/marshal/geometry/GeometricType.java    |  45 ++
 .../db/marshal/geometry/LineString.java       | 128 ++++++
 .../db/marshal/geometry/OgcGeometry.java      | 222 ++++++++++
 .../cassandra/db/marshal/geometry/Point.java  | 146 +++++++
 .../db/marshal/geometry/Polygon.java          | 128 ++++++
 .../serializers/DateRangeSerializer.java      | 263 ++++++++++++
 .../db/marshal/DateRangeIntegrationTest.java  | 287 +++++++++++++
 .../db/marshal/DateRangeTypeTest.java         | 125 ++++++
 .../db/marshal/GeometricTypeTests.java        |  84 ++++
 .../db/marshal/GeometryCodecTest.java         |  64 +++
 .../db/marshal/GeometryIntegrationTest.java   | 139 ++++++
 .../db/marshal/LineStringTypeTest.java        | 150 +++++++
 .../cassandra/db/marshal/PointTypeTest.java   | 139 ++++++
 .../cassandra/db/marshal/PolygonTypeTest.java | 246 +++++++++++
 .../db/marshal/datetime/DateRangeCodec.java   |  85 ++++
 .../marshal/datetime/DateRangeCodecTest.java  |  56 +++
 .../db/marshal/datetime/DateRangeTest.java    |  57 +++
 .../marshal/datetime/DateRangeUtilTest.java   | 240 +++++++++++
 .../serializers/DateRangeSerializerTest.java  | 144 +++++++
 31 files changed, 4098 insertions(+), 1 deletion(-)
 create mode 100644 pylib/cqlshlib/daterangetype.py
 create mode 100644 pylib/cqlshlib/geotypes.py
 create mode 100644 src/java/org/apache/cassandra/db/marshal/AbstractGeometricType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/DateRangeType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/GeometryCodec.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/LineStringType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/PointType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/PolygonType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/datetime/DateRange.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/datetime/DateRangeUtil.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/geometry/GeometricType.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/geometry/LineString.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/geometry/OgcGeometry.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/geometry/Point.java
 create mode 100644 src/java/org/apache/cassandra/db/marshal/geometry/Polygon.java
 create mode 100644 src/java/org/apache/cassandra/serializers/DateRangeSerializer.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/DateRangeIntegrationTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/DateRangeTypeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/GeometricTypeTests.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/GeometryCodecTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/GeometryIntegrationTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/LineStringTypeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/PointTypeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/PolygonTypeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodec.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodecTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeUtilTest.java
 create mode 100644 test/unit/org/apache/cassandra/serializers/DateRangeSerializerTest.java

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 37f839d9d443..74c9c4dc40eb 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -157,7 +157,7 @@ def find_zip(libprefix):
     sys.path.insert(0, cqlshlibdir)
 
 from cqlshlib import cql3handling, cqlhandling, pylexotron, sslhandling, cqlshhandling
-from cqlshlib.copyutil import ExportTask, ImportTask
+from cqlshlib.copyutil import ExportTask, ImportTask, ImportConversion
 from cqlshlib.displaying import (ANSI_RESET, BLUE, COLUMN_NAME_COLORS, CYAN,
                                  RED, WHITE, FormattedValue, colorme)
 from cqlshlib.formatting import (DEFAULT_DATE_FORMAT, DEFAULT_NANOTIME_FORMAT,
@@ -166,6 +166,12 @@ def find_zip(libprefix):
 from cqlshlib.tracing import print_trace, print_trace_session
 from cqlshlib.util import get_file_encoding_bomsize, trim_if_present
 
+from cqlshlib.geotypes import patch_geotypes_import_conversion  # nopep8
+from cqlshlib.daterangetype import patch_daterange_import_conversion  # nopep8
+
+patch_geotypes_import_conversion(ImportConversion)
+patch_daterange_import_conversion(ImportConversion)
+
 DEFAULT_HOST = '127.0.0.1'
 DEFAULT_PORT = 9042
 DEFAULT_SSL = False
diff --git a/build.xml b/build.xml
index 6d4b593c424b..4390c9010b2a 100644
--- a/build.xml
+++ b/build.xml
@@ -600,6 +600,7 @@
           <dependency groupId="com.clearspring.analytics" artifactId="stream" version="2.5.2">
             <exclusion groupId="it.unimi.dsi" artifactId="fastutil" />
           </dependency>
+          <dependency groupId="com.esri.geometry" artifactId="esri-geometry-api" version="2.2.4"/>
           <dependency groupId="com.datastax.cassandra" artifactId="cassandra-driver-core" version="3.11.0" classifier="shaded">
             <exclusion groupId="io.netty" artifactId="netty-buffer"/>
             <exclusion groupId="io.netty" artifactId="netty-codec"/>
@@ -824,6 +825,7 @@
         <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
         <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
         <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
+        <dependency groupId="com.esri.geometry" artifactId="esri-geometry-api"/>
 
         <!-- sasi deps -->
         <dependency groupId="de.jflex" artifactId="jflex" />
diff --git a/pylib/cqlshlib/daterangetype.py b/pylib/cqlshlib/daterangetype.py
new file mode 100644
index 000000000000..e052001683b6
--- /dev/null
+++ b/pylib/cqlshlib/daterangetype.py
@@ -0,0 +1,116 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from datetime import datetime
+from cassandra.util import DateRange, DateRangeBound, DateRangePrecision, OPEN_BOUND, ms_timestamp_from_datetime
+
+
+def _raise_parse_error(msg):
+    """
+    We need to throw ParseError, which is defined in copyutil.
+    Since copyutil imports this file as part of its initialization, though,
+    importing it at the top of this file would create an import loop, so
+    we just import it locally when/if we need it.
+    """
+    from cqlshlib import copyutil
+    raise copyutil.ParseError(msg)
+
+
+def _new_date_range_bound(datetime, precision):
+    millis = ms_timestamp_from_datetime(datetime)
+    return DateRangeBound(value=millis, precision=precision)
+
+
+def _parse_date_range_bound(val):
+    if val == '*':
+        return OPEN_BOUND
+    else:
+        year = 1
+        month = 1
+        day = 1
+        hour = 0
+        minute = 0
+        second = 0
+        millisecond = 0
+        precision = DateRangePrecision.MILLISECOND
+
+        date_split = val.replace('Z', '').split('-')
+        if len(date_split) > 0:
+            year = int(date_split[0])
+            precision = DateRangePrecision.YEAR
+        if len(date_split) > 1:
+            month = int(date_split[1])
+            precision = DateRangePrecision.MONTH
+        if len(date_split) > 2:
+            day_split = date_split[2].split('T')
+            day = int(day_split[0])
+            precision = DateRangePrecision.DAY
+            if len(day_split) > 1:
+                time_split = day_split[1].split(':')
+                if len(time_split) > 0:
+                    hour = int(time_split[0])
+                    precision = DateRangePrecision.HOUR
+                if len(time_split) > 1:
+                    minute = int(time_split[1])
+                    precision = DateRangePrecision.MINUTE
+                if len(time_split) > 2:
+                    second_split = time_split[2].split('.')
+                    second = int(second_split[0])
+                    precision = DateRangePrecision.SECOND
+                    if len(second_split) > 1:
+                        millisecond = int(second_split[1])
+                        precision = DateRangePrecision.MILLISECOND
+
+    return _new_date_range_bound(
+        datetime=datetime(year=year, month=month, day=day, hour=hour, minute=minute, second=second,
+                          microsecond=millisecond * 1000),
+        precision=precision)
+
+
+def _convert_daterange(val):
+    if val.startswith('['):
+        if val.endswith(']'):
+            bounds = val[1:-1].split(' TO ')
+            if len(bounds) == 2:
+                lower_bound = _parse_date_range_bound(bounds[0].strip()).round_down()
+                upper_bound = _parse_date_range_bound(bounds[1].strip()).round_up()
+                return DateRange(lower_bound=lower_bound, upper_bound=upper_bound)
+            else:
+                _raise_parse_error("If date range starts with [ must contain ' TO '; got {}".format(val))
+        else:
+            _raise_parse_error("If date range starts with [ must end with ]; got {}".format(val))
+    else:
+        bound = _parse_date_range_bound(val).round_down()
+        return DateRange(value=bound)
+
+
+def _patch_get_converters(klass):
+    original_method = klass._get_converter
+
+    def new_method(self, cql_type):
+        if cql_type.typename == 'daterange':
+            return _convert_daterange
+        else:
+            return original_method(self, cql_type)
+
+    klass._get_converter = new_method
+
+
+def patch_daterange_import_conversion(klass):
+    """
+    monkey patches cqlshlib.copyutil.ImportConversion to support DateRangeType
+    """
+    _patch_get_converters(klass)
diff --git a/pylib/cqlshlib/geotypes.py b/pylib/cqlshlib/geotypes.py
new file mode 100644
index 000000000000..0ab2bb03483e
--- /dev/null
+++ b/pylib/cqlshlib/geotypes.py
@@ -0,0 +1,119 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from numbers import Number
+import re
+
+from cassandra.metadata import protect_value
+from cassandra.util import Point, LineString, Polygon
+from geomet import wkt
+
+
+def _raise_parse_error(msg):
+    """
+    We need to throw ParseError, which is defined in copyutil.
+    Since copyutil imports this file as part of its initialization, though,
+    importing it at the top of this file would create an import loop, so
+    we just import it locally when/if we need it.
+    """
+    from cqlshlib import copyutil
+    raise copyutil.ParseError(msg)
+
+
+def _validate_point(val, raw):
+    if not all([isinstance(v, Number) for v in val]):
+        _raise_parse_error("Got non-numeric value in {}".format(raw))
+
+    if len(val) != 2:
+        _raise_parse_error("Got point with {} coordinates in: {}".format(len(val), raw))
+
+
+def _get_coords(val, expected_type):
+    geojson = wkt.loads(val)
+    gjtype = geojson.get('type')
+    if gjtype != expected_type:
+        _raise_parse_error("Expected {} type, but got {} type for {}".format(expected_type, gjtype, val))
+
+    return geojson.get("coordinates", [])
+
+
+def _convert_point(val):
+    coords = _get_coords(val, 'Point')
+    _validate_point(coords, val)
+    point = Point(*coords)
+    return point
+
+
+def _convert_linestring(val):
+    points = _get_coords(val, 'LineString')
+    for xy in points:
+        _validate_point(xy, val)
+    linestring = LineString(points)
+    return linestring
+
+
+def _convert_polygon(val):
+    rings = _get_coords(val, 'Polygon')
+    if len(rings) == 0:
+        return Polygon([])
+
+    for ring in rings:
+        for point in ring:
+            _validate_point(point, val)
+
+    polygon = Polygon(exterior=rings[0], interiors=rings[1:])
+    return polygon
+
+
+def _patch_get_converters(klass):
+    """
+    patches the get converters method to convert WKT to cassandra.util.{Point, LineString, Polygon}
+    when using prepared statements to batch load
+    """
+    original_method = klass._get_converter
+    def new_method(self, cql_type):
+        if cql_type.typename == 'PointType':
+            return _convert_point
+        elif cql_type.typename == 'LineStringType':
+            return _convert_linestring
+        elif cql_type.typename == 'PolygonType':
+            return _convert_polygon
+        else:
+            return original_method(self, cql_type)
+    klass._get_converter = new_method
+
+
+def _patch_init(klass):
+    """
+    patches the constructor method to also protect (quote) geotype values
+    when making queries with string literal values
+    """
+    original_method = klass.__init__
+    def new_method(self, *args, **kwargs):
+        original_method(self, *args, **kwargs)
+        ptypes = zip(self.protectors, self.coltypes)
+        clean = lambda t: re.sub("[\W]", "", t.split('.')[-1])  # discard java package names and ' characters
+        gtypes = {'PointType', 'LineStringType', 'PolygonType'}
+        self.protectors = [protect_value if clean(t) in gtypes else p for p,t in ptypes]
+    klass.__init__ = new_method
+
+
+def patch_geotypes_import_conversion(klass):
+    """
+    monkey patches cqlshlib.copyutil.ImportConversion to support geotypes
+    """
+    _patch_get_converters(klass)
+    _patch_init(klass)
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractGeometricType.java b/src/java/org/apache/cassandra/db/marshal/AbstractGeometricType.java
new file mode 100644
index 000000000000..92ef146aa208
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractGeometricType.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.cql3.Json;
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.marshal.geometry.GeometricType;
+import org.apache.cassandra.db.marshal.geometry.OgcGeometry;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.transport.ProtocolVersion;
+
+/**
+ * Base class of the geometric CQL types. Values are serialized as well-known binary (WKB), parsed from
+ * well-known text (WKT) or GeoJSON, and the type is registered with {@code ComparisonType.BYTE_ORDER}.
+ *
+ * @param <T> the geometry class handled by the concrete type
+ */
+public abstract class AbstractGeometricType<T extends OgcGeometry> extends AbstractType<T>
+{
+    private final TypeSerializer<T> serializer = new TypeSerializer<T>()
+    {
+        @Override
+        public ByteBuffer serialize(T geometry)
+        {
+            return geoSerializer.toWellKnownBinary(geometry);
+        }
+
+        @Override
+        public <V> T deserialize(V value, ValueAccessor<V> accessor)
+        {
+            // OGCGeometry does not respect the current position of the buffer, so you need to use slice()
+            ByteBuffer byteBuffer = accessor.toBuffer(value);
+            return geoSerializer.fromWellKnownBinary(byteBuffer.slice());
+        }
+
+        @Override
+        public <V> void validate(V value, ValueAccessor<V> accessor) throws MarshalException
+        {
+            ByteBuffer byteBuffer = accessor.toBuffer(value);
+            int pos = byteBuffer.position();
+            // OGCGeometry does not respect the current position of the buffer, so you need to use slice()
+            geoSerializer.fromWellKnownBinary(byteBuffer.slice()).validate();
+            byteBuffer.position(pos);
+        }
+
+        @Override
+        public String toString(T geometry)
+        {
+            return geoSerializer.toWellKnownText(geometry);
+        }
+
+        @Override
+        public Class<T> getType()
+        {
+            return klass;
+        }
+    };
+
+    private final GeometricType type;
+    private final Class<T> klass;
+    private final OgcGeometry.Serializer<T> geoSerializer;
+
+    /**
+     * @param type descriptor supplying the geometry class and the WKB/WKT/GeoJSON serializer of this type
+     */
+    public AbstractGeometricType(GeometricType type)
+    {
+        super(ComparisonType.BYTE_ORDER);
+        this.type = type;
+        this.klass = (Class<T>) type.getGeoClass();
+        this.geoSerializer = type.getSerializer();
+    }
+
+    /** Returns the geometry descriptor this type was created with. */
+    public GeometricType getGeoType()
+    {
+        return type;
+    }
+
+    /**
+     * Parses well-known text, validates the geometry and returns its well-known binary form.
+     *
+     * @throws MarshalException wrapping any failure; the message names this type, the input and the cause's message
+     */
+    @Override
+    public ByteBuffer fromString(String s) throws MarshalException
+    {
+        try
+        {
+            T geometry = geoSerializer.fromWellKnownText(s);
+            geometry.validate();
+            return geoSerializer.toWellKnownBinary(geometry);
+        }
+        catch (Exception e)
+        {
+            String parentMsg = e.getMessage() != null ? " " + e.getMessage() : "";
+            String msg = String.format("Unable to make %s from '%s'", getClass().getSimpleName(), s) + parentMsg;
+            throw new MarshalException(msg, e);
+        }
+    }
+
+    /**
+     * Builds a term from a JSON value. Non-string input is first re-serialized to a JSON string; the string is
+     * then parsed as GeoJSON and, if that fails, as well-known text.
+     *
+     * @throws MarshalException if serialization or both parse attempts fail; the GeoJSON failure is reported
+     */
+    @Override
+    public Term fromJSONObject(Object parsed) throws MarshalException
+    {
+        if (!(parsed instanceof String))
+        {
+            try
+            {
+                parsed = Json.JSON_OBJECT_MAPPER.writeValueAsString(parsed);
+            }
+            catch (IOException e)
+            {
+                throw new MarshalException(e.getMessage(), e);
+            }
+        }
+
+        T geometry;
+        try
+        {
+            geometry = geoSerializer.fromGeoJson((String) parsed);
+        }
+        catch (MarshalException e)
+        {
+            try
+            {
+                geometry = geoSerializer.fromWellKnownText((String) parsed);
+            }
+            catch (MarshalException ignored)
+            {
+                // both formats failed: report the GeoJSON error (the first attempt) and keep it as the cause
+                throw new MarshalException(e.getMessage(), e);
+            }
+        }
+        geometry.validate();
+        return new Constants.Value(geoSerializer.toWellKnownBinary(geometry));
+    }
+
+    /** Renders the well-known binary value as GeoJSON. */
+    @Override
+    public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion)
+    {
+        // OGCGeometry does not respect the current position of the buffer, so you need to use slice()
+        return geoSerializer.toGeoJson(geoSerializer.fromWellKnownBinary(buffer.slice()));
+    }
+
+    @Override
+    public TypeSerializer<T> getSerializer()
+    {
+        return serializer;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/DateRangeType.java b/src/java/org/apache/cassandra/db/marshal/DateRangeType.java
new file mode 100644
index 000000000000..dc1136463f6d
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/DateRangeType.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+
+import java.nio.ByteBuffer;
+
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.cql3.Term;
+import org.apache.cassandra.db.marshal.datetime.DateRange;
+import org.apache.cassandra.db.marshal.datetime.DateRangeUtil;
+import org.apache.cassandra.serializers.DateRangeSerializer;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.serializers.TypeSerializer;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+
+/**
+ * CQL type for date ranges whose lower and upper bounds are timestamps with millisecond precision.
+ */
+public class DateRangeType extends AbstractType<DateRange>
+{
+    public static final DateRangeType instance = new DateRangeType();
+
+    private DateRangeType()
+    {
+        super(ComparisonType.BYTE_ORDER);
+    }
+
+    /**
+     * Parses the textual form of a date range; the empty string maps to the empty buffer.
+     *
+     * @throws MarshalException if the text cannot be parsed, with the underlying failure as cause
+     */
+    @Override
+    public ByteBuffer fromString(String source) throws MarshalException
+    {
+        if (source.isEmpty())
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+        try
+        {
+            return decompose(DateRangeUtil.parseDateRange(source));
+        }
+        catch (Exception e)
+        {
+            String reason = String.format("Could not parse date range: %s %s", source, e.getMessage());
+            throw new MarshalException(reason, e);
+        }
+    }
+
+    /** Renders the serialized range as a JSON string holding its Solr-style text form. */
+    @Override
+    public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion)
+    {
+        String text = getSerializer().deserialize(buffer).formatToSolrString();
+        return '"' + text + '"';
+    }
+
+    /**
+     * Accepts only JSON strings, which are parsed like {@link #fromString(String)} input.
+     *
+     * @throws MarshalException if the value is not a string or cannot be parsed
+     */
+    @Override
+    public Term fromJSONObject(Object parsed) throws MarshalException
+    {
+        if (!(parsed instanceof String))
+        {
+            throw new MarshalException(String.format(
+                    "Expected a string representation of a date range value, but got a %s: %s",
+                    parsed.getClass().getSimpleName(), parsed));
+        }
+        return new Constants.Value(fromString((String) parsed));
+    }
+
+    /** Empty values are declared meaningless for this type. */
+    @Override
+    public boolean isEmptyValueMeaningless()
+    {
+        return true;
+    }
+
+    /** Returns the shared {@link DateRangeSerializer} instance. */
+    @Override
+    public TypeSerializer<DateRange> getSerializer()
+    {
+        return DateRangeSerializer.instance;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/GeometryCodec.java b/src/java/org/apache/cassandra/db/marshal/GeometryCodec.java
new file mode 100644
index 000000000000..62c8804235e9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/GeometryCodec.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+
+import com.datastax.driver.core.DataType;
+import com.datastax.driver.core.ProtocolVersion;
+import com.datastax.driver.core.TypeCodec;
+import com.datastax.driver.core.exceptions.InvalidTypeException;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+import org.apache.cassandra.db.marshal.geometry.OgcGeometry;
+import org.apache.cassandra.db.marshal.geometry.Point;
+import org.apache.cassandra.db.marshal.geometry.Polygon;
+
+/**
+ * Driver {@link TypeCodec} for geometry values, declared as a custom type named after the given
+ * {@link AbstractGeometricType} class and delegating to that type's well-known binary/text serializer.
+ *
+ * @param <T> the geometry class handled by this codec
+ */
+public class GeometryCodec<T extends OgcGeometry> extends TypeCodec<T>
+{
+    public static final TypeCodec<Point> pointCodec = new GeometryCodec<>(PointType.instance);
+    public static final TypeCodec<LineString> lineStringCodec = new GeometryCodec<>(LineStringType.instance);
+    public static final TypeCodec<Polygon> polygonCodec = new GeometryCodec<>(PolygonType.instance);
+
+    private final OgcGeometry.Serializer<T> serializer;
+
+    /**
+     * @param type the geometric type whose class name, geometry class and serializer this codec uses
+     */
+    public GeometryCodec(AbstractGeometricType type)
+    {
+        super(DataType.custom(type.getClass().getName()), (Class<T>) type.getGeoType().getGeoClass());
+        this.serializer = (OgcGeometry.Serializer<T>) type.getGeoType().getSerializer();
+    }
+
+    /** Decodes well-known binary; a {@code null} or empty buffer yields {@code null}. */
+    @Override
+    public T deserialize(ByteBuffer bb, ProtocolVersion protocolVersion) throws InvalidTypeException
+    {
+        // OGCGeometry does not respect the current position of the buffer (see AbstractGeometricType), so use slice()
+        return bb == null || bb.remaining() == 0 ? null : serializer.fromWellKnownBinary(bb.slice());
+    }
+
+    /** Encodes the geometry as well-known binary; {@code null} stays {@code null}. */
+    @Override
+    public ByteBuffer serialize(T geometry, ProtocolVersion protocolVersion) throws InvalidTypeException
+    {
+        return geometry == null ? null : geometry.asWellKnownBinary();
+    }
+
+    /** Parses well-known text; {@code null}, the empty string and "NULL" (any case) yield {@code null}. */
+    @Override
+    public T parse(String s) throws InvalidTypeException
+    {
+        if (s == null || s.isEmpty() || s.equalsIgnoreCase("NULL"))
+            return null;
+        return serializer.fromWellKnownText(s);
+    }
+
+    /** Formats the geometry as well-known text, or "NULL" for {@code null}. */
+    @Override
+    public String format(T geometry) throws InvalidTypeException
+    {
+        return geometry == null ? "NULL" : geometry.asWellKnownText();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/LineStringType.java b/src/java/org/apache/cassandra/db/marshal/LineStringType.java
new file mode 100644
index 000000000000..07400eab1fc9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/LineStringType.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import org.apache.cassandra.db.marshal.geometry.GeometricType;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+
+/**
+ * Geometric CQL type for {@link LineString} values; all behaviour is inherited from {@link AbstractGeometricType}.
+ */
+public class LineStringType extends AbstractGeometricType<LineString>
+{
+    public static final LineStringType instance = new LineStringType();
+
+    public LineStringType()
+    {
+        super(GeometricType.LINESTRING);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/PointType.java b/src/java/org/apache/cassandra/db/marshal/PointType.java
new file mode 100644
index 000000000000..487f73553172
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/PointType.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import org.apache.cassandra.db.marshal.geometry.GeometricType;
+import org.apache.cassandra.db.marshal.geometry.Point;
+
+/**
+ * Geometric CQL type for {@link Point} values; all behaviour is inherited from {@link AbstractGeometricType}.
+ */
+public class PointType extends AbstractGeometricType<Point>
+{
+    public static final PointType instance = new PointType();
+
+    public PointType()
+    {
+        super(GeometricType.POINT);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/PolygonType.java b/src/java/org/apache/cassandra/db/marshal/PolygonType.java
new file mode 100644
index 000000000000..a0bb2b186d4c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/PolygonType.java
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import org.apache.cassandra.db.marshal.geometry.GeometricType;
+import org.apache.cassandra.db.marshal.geometry.Polygon;
+
+/**
+ * Geometric CQL type for {@link Polygon} values; all behaviour is inherited from {@link AbstractGeometricType}.
+ */
+public class PolygonType extends AbstractGeometricType<Polygon>
+{
+    public static final PolygonType instance = new PolygonType();
+
+    public PolygonType()
+    {
+        super(GeometricType.POLYGON);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/datetime/DateRange.java b/src/java/org/apache/cassandra/db/marshal/datetime/DateRange.java
new file mode 100644
index 000000000000..8fe5ba1df77c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/datetime/DateRange.java
@@ -0,0 +1,439 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeFormatterBuilder;
+import java.util.Locale;
+
+import com.google.common.base.MoreObjects;
+import com.google.common.base.Objects;
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.builder.EqualsBuilder;
+
+import org.apache.cassandra.db.marshal.DateRangeType;
+
+import static java.time.temporal.ChronoField.DAY_OF_MONTH;
+import static java.time.temporal.ChronoField.HOUR_OF_DAY;
+import static java.time.temporal.ChronoField.MILLI_OF_SECOND;
+import static java.time.temporal.ChronoField.MINUTE_OF_HOUR;
+import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
+import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
+
+/**
+ * Domain object of type {@link DateRangeType}. Lower and upper bounds are inclusive. Value type.
+ */
+public class DateRange
+{
+    private final DateRangeBound lowerBound;
+    private final DateRangeBound upperBound;
+
+    /** Creates a range with only a lower bound; the upper bound is left undefined ({@code null}). */
+    public DateRange(DateRangeBound lowerBound)
+    {
+        Preconditions.checkArgument(lowerBound != null);
+        this.lowerBound = lowerBound;
+        this.upperBound = null;
+    }
+
+    /**
+     * Creates a range with both bounds.
+     *
+     * @throws IllegalArgumentException if either bound is null or {@code upperBound.isAfter(lowerBound)} is false
+     */
+    public DateRange(DateRangeBound lowerBound, DateRangeBound upperBound)
+    {
+        Preconditions.checkArgument(lowerBound != null);
+        Preconditions.checkArgument(upperBound != null);
+        Preconditions.checkArgument(upperBound.isAfter(lowerBound), "Wrong order: " + lowerBound + " TO " + upperBound);
+        this.lowerBound = lowerBound;
+        this.upperBound = upperBound;
+    }
+
+    // Copies the builder's bounds as-is: unlike the public constructors, no null or ordering checks are applied.
+    private DateRange(DateRangeBuilder builder)
+    {
+        this.lowerBound = builder.lowerBound;
+        this.upperBound = builder.upperBound;
+    }
+
+    public DateRangeBound getLowerBound()
+    {
+        return lowerBound;
+    }
+
+    /** Returns the upper bound, or {@code null} when none is defined. */
+    public DateRangeBound getUpperBound()
+    {
+        return upperBound;
+    }
+
+    public boolean isUpperBoundDefined()
+    {
+        return upperBound != null;
+    }
+
+    /** Formats as {@code [lower TO upper]} when an upper bound is defined, otherwise as the lower bound alone. */
+    public String formatToSolrString()
+    {
+        if (isUpperBoundDefined())
+        {
+            return String.format("[%s TO %s]", lowerBound, upperBound);
+        }
+        else
+        {
+            return lowerBound.toString();
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return MoreObjects.toStringHelper(this)
+                .add("lowerBound", lowerBound)
+                .add("precision", lowerBound.getPrecision())
+                .add("upperBound", upperBound)
+                .add("precision", upperBound != null ? upperBound.getPrecision() : "null")
+                .toString();
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (obj == null || obj.getClass() != getClass())
+        {
+            return false;
+        }
+        if (obj == this)
+        {
+            return true;
+        }
+
+        DateRange rhs = (DateRange) obj;
+        return new EqualsBuilder()
+                .append(lowerBound, rhs.lowerBound)
+                .append(upperBound, rhs.upperBound)
+                .isEquals();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hashCode(lowerBound, upperBound);
+    }
+
+    /**
+     * One end of a {@link DateRange}: either {@link #UNBOUNDED} or a timestamp together with its precision.
+     */
+    public static class DateRangeBound
+    {
+        public static final DateRangeBound UNBOUNDED = new DateRangeBound();
+
+        private final ZonedDateTime timestamp;
+        private final Precision precision;
+
+        private DateRangeBound(ZonedDateTime timestamp, Precision precision)
+        {
+            Preconditions.checkArgument(timestamp != null);
+            Preconditions.checkArgument(precision != null);
+            this.timestamp = timestamp;
+            this.precision = precision;
+        }
+
+        // Used for UNBOUNDED: both fields stay null.
+        private DateRangeBound()
+        {
+            this.timestamp = null;
+            this.precision = null;
+        }
+
+        /** Same as the {@link ZonedDateTime} overload, interpreting the instant in UTC. */
+        public static DateRangeBound lowerBound(Instant timestamp, Precision precision)
+        {
+            return lowerBound(ZonedDateTime.ofInstant(timestamp, ZoneOffset.UTC), precision);
+        }
+
+        /** Creates a bound whose timestamp is rounded with {@code DateRangeUtil.roundLowerBoundTimestampToPrecision}. */
+        public static DateRangeBound lowerBound(ZonedDateTime timestamp, Precision precision)
+        {
+            ZonedDateTime roundedLowerBound = DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, precision);
+            return new DateRangeBound(roundedLowerBound, precision);
+        }
+
+        /** Same as the {@link ZonedDateTime} overload, interpreting the instant in UTC. */
+        public static DateRangeBound upperBound(Instant timestamp, Precision precision)
+        {
+            return upperBound(ZonedDateTime.ofInstant(timestamp, ZoneOffset.UTC), precision);
+        }
+
+        /** Creates a bound whose timestamp is rounded with {@code DateRangeUtil.roundUpperBoundTimestampToPrecision}. */
+        public static DateRangeBound upperBound(ZonedDateTime timestamp, Precision precision)
+        {
+            ZonedDateTime roundedUpperBound = DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, precision);
+            return new DateRangeBound(roundedUpperBound, precision);
+        }
+
+        public boolean isUnbounded()
+        {
+            return timestamp == null;
+        }
+
+        /**
+         * Returns true if either bound is unbounded, otherwise whether this timestamp is strictly after the other's.
+         */
+        public boolean isAfter(DateRangeBound other)
+        {
+            return isUnbounded() || other.isUnbounded() || timestamp.isAfter(other.timestamp);
+        }
+
+        /**
+         * Returns the timestamp as an instant. NOTE: throws NullPointerException on an unbounded bound (null timestamp).
+         */
+        public Instant getTimestamp()
+        {
+            return timestamp.toInstant();
+        }
+
+        public Precision getPrecision()
+        {
+            return precision;
+        }
+
+        @Override
+        public String toString()
+        {
+            if (isUnbounded())
+            {
+                return "*";
+            }
+
+            return precision.formatter.format(timestamp);
+        }
+
+        @Override
+        public boolean equals(Object obj)
+        {
+            if (obj == null || obj.getClass() != getClass())
+            {
+                return false;
+            }
+            if (obj == this)
+            {
+                return true;
+            }
+
+            DateRangeBound rhs = (DateRangeBound) obj;
+            return new EqualsBuilder()
+                    .append(isUnbounded(), rhs.isUnbounded())
+                    .append(timestamp, rhs.timestamp)
+                    .append(precision, rhs.precision)
+                    .isEquals();
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(timestamp, precision);
+        }
+
+        /**
+         * Granularity of a bound, carrying the integer code returned by {@link #toEncoded()} and the strict,
+         * case-sensitive UTC formatter that {@link DateRangeBound#toString()} uses to print the bound.
+         */
+        public enum Precision
+        {
+            YEAR(0x00,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu")
+                            .parseDefaulting(MONTH_OF_YEAR, 1)
+                            .parseDefaulting(DAY_OF_MONTH, 1)
+                            .parseDefaulting(HOUR_OF_DAY, 0)
+                            .parseDefaulting(MINUTE_OF_HOUR, 0)
+                            .parseDefaulting(SECOND_OF_MINUTE, 0)
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            MONTH(0x01,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM")
+                            .parseDefaulting(DAY_OF_MONTH, 1)
+                            .parseDefaulting(HOUR_OF_DAY, 0)
+                            .parseDefaulting(MINUTE_OF_HOUR, 0)
+                            .parseDefaulting(SECOND_OF_MINUTE, 0)
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            DAY(0x02,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM-dd")
+                            .parseDefaulting(HOUR_OF_DAY, 0)
+                            .parseDefaulting(MINUTE_OF_HOUR, 0)
+                            .parseDefaulting(SECOND_OF_MINUTE, 0)
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            HOUR(0x03,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM-dd'T'HH")
+                            .parseDefaulting(MINUTE_OF_HOUR, 0)
+                            .parseDefaulting(SECOND_OF_MINUTE, 0)
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            MINUTE(0x04,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM-dd'T'HH:mm")
+                            .parseDefaulting(SECOND_OF_MINUTE, 0)
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            SECOND(0x05,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM-dd'T'HH:mm:ss")
+                            .parseDefaulting(MILLI_OF_SECOND, 0)
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT)),
+
+            MILLISECOND(0x06,
+                    new DateTimeFormatterBuilder()
+                            .parseCaseSensitive()
+                            .parseStrict()
+                            .appendPattern("uuuu-MM-dd'T'HH:mm:ss.SSS")
+                            .optionalStart()
+                            .appendZoneId()
+                            .optionalEnd()
+                            .toFormatter()
+                            .withZone(ZoneOffset.UTC)
+                            .withLocale(Locale.ROOT));
+
+            private final int encoded;
+            private final DateTimeFormatter formatter;
+
+            Precision(int encoded, DateTimeFormatter formatter)
+            {
+                this.encoded = encoded;
+                this.formatter = formatter;
+            }
+
+            public int toEncoded()
+            {
+                return encoded;
+            }
+
+            /**
+             * @throws IllegalArgumentException if no precision has the given code
+             */
+            public static Precision fromEncoded(byte encoded)
+            {
+                for (Precision precision : values())
+                {
+                    if (precision.encoded == encoded)
+                    {
+                        return precision;
+                    }
+                }
+                throw new IllegalArgumentException("Invalid precision encoding: " + encoded);
+            }
+        }
+    }
+
+    /**
+     * Fluent builder for {@link DateRange}. NOTE: {@link #build()} applies none of the checks made by the public
+     * constructors, so the resulting range may have a missing lower bound or bounds in the wrong order.
+     */
+    public static class DateRangeBuilder
+    {
+        private DateRangeBound lowerBound = null;
+        private DateRangeBound upperBound = null;
+
+        private DateRangeBuilder() {}
+
+        public static DateRangeBuilder dateRange()
+        {
+            return new DateRangeBuilder();
+        }
+
+        /** Parses the bound with {@link Instant#parse(CharSequence)} before rounding it to the precision. */
+        public DateRangeBuilder withLowerBound(String lowerBound, DateRangeBound.Precision precision)
+        {
+            return withLowerBound(Instant.parse(lowerBound), precision);
+        }
+
+        public DateRangeBuilder withUnboundedLowerBound()
+        {
+            this.lowerBound = DateRangeBound.UNBOUNDED;
+            return this;
+        }
+
+        public DateRangeBuilder withUnboundedUpperBound()
+        {
+            this.upperBound = DateRangeBound.UNBOUNDED;
+            return this;
+        }
+
+        /** Parses the bound with {@link Instant#parse(CharSequence)} before rounding it to the precision. */
+        public DateRangeBuilder withUpperBound(String upperBound, DateRangeBound.Precision precision)
+        {
+            return withUpperBound(Instant.parse(upperBound), precision);
+        }
+
+        DateRangeBuilder withLowerBound(Instant lowerBound, DateRangeBound.Precision precision)
+        {
+            this.lowerBound = DateRangeBound.lowerBound(lowerBound, precision);
+            return this;
+        }
+
+        DateRangeBuilder withUpperBound(Instant upperBound, DateRangeBound.Precision precision)
+        {
+            this.upperBound = DateRangeBound.upperBound(upperBound, precision);
+            return this;
+        }
+
+        public DateRange build()
+        {
+            return new DateRange(this);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/datetime/DateRangeUtil.java b/src/java/org/apache/cassandra/db/marshal/datetime/DateRangeUtil.java
new file mode 100644
index 000000000000..8b73cfaa0899
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/datetime/DateRangeUtil.java
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import java.text.ParseException;
+import java.time.LocalDateTime;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.temporal.ChronoField;
+import java.util.Calendar;
+import java.util.Locale;
+import java.util.TimeZone;
+
+import org.apache.commons.lang3.StringUtils;
+
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound;
+
+import static java.time.temporal.TemporalAdjusters.firstDayOfMonth;
+import static java.time.temporal.TemporalAdjusters.firstDayOfYear;
+import static java.time.temporal.TemporalAdjusters.lastDayOfMonth;
+import static java.time.temporal.TemporalAdjusters.lastDayOfYear;
+
+public class DateRangeUtil
+{
+    private static final int YEAR_LEVEL = 3;
+    private static final int[] FIELD_BY_LEVEL =
+    {
+        -1/*unused*/, -1, -1, Calendar.YEAR, Calendar.MONTH, Calendar.DAY_OF_MONTH,
+        Calendar.HOUR_OF_DAY, Calendar.MINUTE, Calendar.SECOND, Calendar.MILLISECOND
+    };
+    private static final TimeZone UTC_TIME_ZONE = TimeZone.getTimeZone("UTC");
+
+    public static DateRange parseDateRange(String source) throws ParseException
+    {
+        if (StringUtils.isBlank(source))
+        {
+            throw new IllegalArgumentException("Date range is null or blank");
+        }
+        if (source.charAt(0) == '[') // bracketed form: "[<lower> TO <upper>]"
+        {
+            if (source.charAt(source.length() - 1) != ']')
+            {
+                throw new IllegalArgumentException("If date range starts with [ must end with ]; got " + source);
+            }
+            int middle = source.indexOf(" TO "); // the first occurrence separates the two bounds
+            if (middle < 0)
+            {
+                throw new IllegalArgumentException("If date range starts with [ must contain ' TO '; got " + source);
+            }
+            String lowerBoundString = source.substring(1, middle);
+            String upperBoundString = source.substring(middle + " TO ".length(), source.length() - 1);
+            return new DateRange(parseLowerBound(lowerBoundString), parseUpperBound(upperBoundString));
+        }
+        else
+        {
+            return new DateRange(parseLowerBound(source)); // no brackets: the whole string is parsed as one lower bound
+        }
+    }
+
+    public static ZonedDateTime roundUpperBoundTimestampToPrecision(ZonedDateTime timestamp, DateRangeBound.Precision precision)
+    {
+        switch (precision) // no breaks on purpose: a coarser precision also maxes out every finer field below it
+        {
+            case YEAR:
+                timestamp = timestamp.with(lastDayOfYear());
+            case MONTH:
+                timestamp = timestamp.with(lastDayOfMonth());
+            case DAY:
+                timestamp = timestamp.with(ChronoField.HOUR_OF_DAY, 23);
+            case HOUR:
+                timestamp = timestamp.with(ChronoField.MINUTE_OF_HOUR, 59);
+            case MINUTE:
+                timestamp = timestamp.with(ChronoField.SECOND_OF_MINUTE, 59);
+            case SECOND:
+                timestamp = timestamp.with(ChronoField.MILLI_OF_SECOND, 999);
+            case MILLISECOND:
+                // DateRangeField ignores any precision beyond milliseconds
+                return timestamp;
+            default:
+                throw new IllegalStateException("Unsupported date time precision for the upper bound: " + precision);
+        }
+    }
+
+    public static ZonedDateTime roundLowerBoundTimestampToPrecision(ZonedDateTime timestamp, DateRangeBound.Precision precision)
+    {
+        switch (precision) // no breaks on purpose: a coarser precision also resets every finer field below it
+        {
+            case YEAR:
+                timestamp = timestamp.with(firstDayOfYear());
+            case MONTH:
+                timestamp = timestamp.with(firstDayOfMonth());
+            case DAY:
+                timestamp = timestamp.with(ChronoField.HOUR_OF_DAY, 0);
+            case HOUR:
+                timestamp = timestamp.with(ChronoField.MINUTE_OF_HOUR, 0);
+            case MINUTE:
+                timestamp = timestamp.with(ChronoField.SECOND_OF_MINUTE, 0);
+            case SECOND:
+                timestamp = timestamp.with(ChronoField.MILLI_OF_SECOND, 0);
+            case MILLISECOND:
+                // DateRangeField ignores any precision beyond milliseconds
+                return timestamp;
+            default:
+                throw new IllegalStateException("Unsupported date time precision for the lower bound: " + precision);
+        }
+    }
+
+    private static DateRangeBound parseLowerBound(String source) throws ParseException
+    {
+        Calendar lowerBoundCalendar = parseCalendar(source);
+        int calPrecisionField = getCalPrecisionField(lowerBoundCalendar);
+        if (calPrecisionField < 0)
+        {
+            return DateRangeBound.UNBOUNDED;
+        }
+        return DateRangeBound.lowerBound(toZonedDateTime(lowerBoundCalendar), getCalendarPrecision(calPrecisionField));
+    }
+
+    private static DateRangeBound parseUpperBound(String source) throws ParseException
+    {
+        Calendar upperBoundCalendar = parseCalendar(source);
+        int calPrecisionField = getCalPrecisionField(upperBoundCalendar);
+        if (calPrecisionField < 0)
+        {
+            return DateRangeBound.UNBOUNDED;
+        }
+        ZonedDateTime upperBoundDateTime = toZonedDateTime(upperBoundCalendar);
+        DateRangeBound.Precision precision = getCalendarPrecision(calPrecisionField);
+        return DateRangeBound.upperBound(upperBoundDateTime, precision);
+    }
+
+    /**
+     * This method was extracted from org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree
+     * (Apache Lucene™) for compatibility with DSE.
+     * The class is distributed under Apache-2.0 License attached to this release.
+     *
+     * Calendar utility method:
+     * Gets the Calendar field code of the last field that is set prior to an unset field. It only
+     * examines fields relevant to the prefix tree. If no fields are set, it returns -1. */
+    private static int getCalPrecisionField(Calendar cal) {
+        int lastField = -1;
+        for (int level = YEAR_LEVEL; level < FIELD_BY_LEVEL.length; level++) {
+            int field = FIELD_BY_LEVEL[level];
+            if (!cal.isSet(field))
+                break;
+            lastField = field;
+        }
+        return lastField;
+    }
+
+    /**
+     * This method was extracted from org.apache.lucene.spatial.prefix.tree.DateRangePrefixTree
+     * (Apache Lucene™) for compatibility with DSE.
+     * The class is distributed under Apache-2.0 License attached to this release.
+     *
+     * Calendar utility method:
+     * It will only set the fields found, leaving
+     * the remainder in an un-set state. A leading '-' or '+' is optional (positive assumed), and a
+     * trailing 'Z' is also optional.
+     * @param str not null and not empty
+     * @return not null
+     */
+    private static Calendar parseCalendar(String str) throws ParseException {
+        // example: +2014-10-23T21:22:33.159Z
+        if (str == null || str.isEmpty())
+            throw new IllegalArgumentException("str is null or blank");
+        Calendar cal = Calendar.getInstance(UTC_TIME_ZONE, Locale.ROOT);
+        cal.clear();
+        if (str.equals("*"))
+            return cal;
+        int offset = 0;//a pointer
+        try {
+            //year & era:
+            int lastOffset = str.charAt(str.length()-1) == 'Z' ? str.length() - 1 : str.length();
+            int hyphenIdx = str.indexOf('-', 1);//look past possible leading hyphen
+            if (hyphenIdx < 0)
+                hyphenIdx = lastOffset;
+            int year = Integer.parseInt(str.substring(offset, hyphenIdx));
+            cal.set(Calendar.ERA, year <= 0 ? 0 : 1);
+            cal.set(Calendar.YEAR, year <= 0 ? -1*year + 1 : year);
+            offset = hyphenIdx + 1;
+            if (lastOffset < offset)
+                return cal;
+
+            //NOTE: We aren't validating separator chars, and we unintentionally accept leading +/-.
+            // The str.substring()'s hopefully get optimized to be stack-allocated.
+
+            //month:
+            cal.set(Calendar.MONTH, Integer.parseInt(str.substring(offset, offset+2)) - 1);//starts at 0
+            offset += 3;
+            if (lastOffset < offset)
+                return cal;
+            //day:
+            cal.set(Calendar.DAY_OF_MONTH, Integer.parseInt(str.substring(offset, offset+2)));
+            offset += 3;
+            if (lastOffset < offset)
+                return cal;
+            //hour:
+            cal.set(Calendar.HOUR_OF_DAY, Integer.parseInt(str.substring(offset, offset+2)));
+            offset += 3;
+            if (lastOffset < offset)
+                return cal;
+            //minute:
+            cal.set(Calendar.MINUTE, Integer.parseInt(str.substring(offset, offset+2)));
+            offset += 3;
+            if (lastOffset < offset)
+                return cal;
+            //second:
+            cal.set(Calendar.SECOND, Integer.parseInt(str.substring(offset, offset+2)));
+            offset += 3;
+            if (lastOffset < offset)
+                return cal;
+            //ms:
+            cal.set(Calendar.MILLISECOND, Integer.parseInt(str.substring(offset, offset+3)));
+            offset += 3;//last one, move to next char
+            if (lastOffset == offset)
+                return cal;
+        } catch (Exception e) {
+            ParseException pe = new ParseException("Improperly formatted date: "+str, offset);
+            pe.initCause(e);
+            throw pe;
+        }
+        throw new ParseException("Improperly formatted date: "+str, offset);
+    }
+
+    private static DateRangeBound.Precision getCalendarPrecision(int calendarPrecision)
+    {
+        switch (calendarPrecision)
+        {
+            case Calendar.YEAR:
+                return DateRangeBound.Precision.YEAR;
+            case Calendar.MONTH:
+                return DateRangeBound.Precision.MONTH;
+            case Calendar.DAY_OF_MONTH:
+                return DateRangeBound.Precision.DAY;
+            case Calendar.HOUR_OF_DAY:
+                return DateRangeBound.Precision.HOUR;
+            case Calendar.MINUTE:
+                return DateRangeBound.Precision.MINUTE;
+            case Calendar.SECOND:
+                return DateRangeBound.Precision.SECOND;
+            case Calendar.MILLISECOND:
+                return DateRangeBound.Precision.MILLISECOND;
+            default:
+                throw new IllegalStateException("Unsupported date time precision: " + calendarPrecision);
+        }
+    }
+
+    private static ZonedDateTime toZonedDateTime(Calendar calendar)
+    {
+        int year = calendar.get(Calendar.YEAR);
+        if (calendar.get(Calendar.ERA) == 0)
+        {
+            // BC era: Calendar counts BC years from 1, so 1 BC becomes proleptic year 0, 2 BC becomes -1, etc
+            year -= 1;
+            if (year > 0)
+            {
+                year = -year;
+            }
+        }
+        LocalDateTime localDateTime = LocalDateTime.of(year,
+                calendar.get(Calendar.MONTH) + 1, // Calendar months are 0-based, LocalDateTime months are 1-based
+                calendar.get(Calendar.DAY_OF_MONTH),
+                calendar.get(Calendar.HOUR_OF_DAY),
+                calendar.get(Calendar.MINUTE),
+                calendar.get(Calendar.SECOND));
+        localDateTime = localDateTime.with(ChronoField.MILLI_OF_SECOND, calendar.get(Calendar.MILLISECOND));
+        return ZonedDateTime.of(localDateTime, ZoneOffset.UTC);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/geometry/GeometricType.java b/src/java/org/apache/cassandra/db/marshal/geometry/GeometricType.java
new file mode 100644
index 000000000000..2516d7d23bda
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/geometry/GeometricType.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.geometry;
+
+public enum GeometricType
+{
+    POINT(Point.class, Point.serializer),
+    LINESTRING(LineString.class, LineString.serializer),
+    POLYGON(Polygon.class, Polygon.serializer);
+
+    private final Class<? extends OgcGeometry> geoClass;
+    private final OgcGeometry.Serializer serializer;
+
+    GeometricType(Class<? extends OgcGeometry> geoClass, OgcGeometry.Serializer serializer)
+    {
+        this.geoClass = geoClass;
+        this.serializer = serializer;
+    }
+
+    public Class<? extends OgcGeometry> getGeoClass()
+    {
+        return geoClass;
+    }
+
+    public OgcGeometry.Serializer getSerializer()
+    {
+        return serializer;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/geometry/LineString.java b/src/java/org/apache/cassandra/db/marshal/geometry/LineString.java
new file mode 100644
index 000000000000..a31854fa9100
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/geometry/LineString.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.geometry;
+
+import java.nio.ByteBuffer;
+
+import com.esri.core.geometry.GeoJsonExportFlags;
+import com.esri.core.geometry.Operator;
+import com.esri.core.geometry.OperatorExportToGeoJson;
+import com.esri.core.geometry.OperatorFactoryLocal;
+import com.esri.core.geometry.ogc.OGCGeometry;
+import com.esri.core.geometry.ogc.OGCLineString;
+import org.apache.cassandra.serializers.MarshalException;
+
+public class LineString extends OgcGeometry
+{
+    public static final Serializer<LineString> serializer = new Serializer<LineString>()
+    {
+        @Override
+        public String toWellKnownText(LineString geometry)
+        {
+            return geometry.lineString.asText();
+        }
+
+        @Override
+        public ByteBuffer toWellKnownBinaryNativeOrder(LineString geometry)
+        {
+            return geometry.lineString.asBinary();
+        }
+
+        @Override
+        public String toGeoJson(LineString geometry)
+        {
+            OperatorExportToGeoJson op = (OperatorExportToGeoJson) OperatorFactoryLocal.getInstance().getOperator(Operator.Type.ExportToGeoJson);
+            return op.execute(GeoJsonExportFlags.geoJsonExportSkipCRS, geometry.lineString.esriSR, geometry.lineString.getEsriGeometry());
+        }
+
+        @Override
+        public LineString fromWellKnownText(String source)
+        {
+            return new LineString(fromOgcWellKnownText(source, OGCLineString.class));
+        }
+
+        @Override
+        public LineString fromWellKnownBinary(ByteBuffer source)
+        {
+            return new LineString(fromOgcWellKnownBinary(source, OGCLineString.class));
+        }
+
+        @Override
+        public LineString fromGeoJson(String source)
+        {
+            return new LineString(fromOgcGeoJson(source, OGCLineString.class));
+        }
+    };
+
+    private final OGCLineString lineString;
+
+    public LineString(OGCLineString lineString)
+    {
+        this.lineString = lineString;
+        validate();
+    }
+
+    @Override
+    public GeometricType getType()
+    {
+        return GeometricType.LINESTRING;
+    }
+
+    @Override
+    public void validate() throws MarshalException
+    {
+        validateOgcGeometry(lineString);
+    }
+
+    @Override
+    public Serializer getSerializer()
+    {
+        return serializer;
+    }
+
+    @Override
+    protected OGCGeometry getOgcGeometry()
+    {
+        return lineString;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        LineString that = (LineString) o;
+
+        return !(lineString != null ? !lineString.equals(that.lineString) : that.lineString != null);
+
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return lineString != null ? lineString.hashCode() : 0;
+    }
+
+    @Override
+    public String toString()
+    {
+        return asWellKnownText();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/geometry/OgcGeometry.java b/src/java/org/apache/cassandra/db/marshal/geometry/OgcGeometry.java
new file mode 100644
index 000000000000..6f3be38e7415
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/geometry/OgcGeometry.java
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.geometry;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import com.esri.core.geometry.GeometryException;
+import com.esri.core.geometry.JsonGeometryException;
+import com.esri.core.geometry.SpatialReference;
+import com.esri.core.geometry.ogc.OGCGeometry;
+import org.apache.cassandra.serializers.MarshalException;
+
+public abstract class OgcGeometry
+{
+
+    // default spatial reference for wkt/wkb
+    public static final SpatialReference SPATIAL_REFERENCE_4326 = SpatialReference.create(4326);
+
+    public interface Serializer<T extends OgcGeometry>
+    {
+        String toWellKnownText(T geometry);
+
+        // We need to return a Big Endian ByteBuffer as that's required by org.apache.cassandra.db.NativeDecoratedKey
+        // when the memtable allocation type is "offheap_objects". See https://datastax.jira.com/browse/DSP-16302
+        // Note that the order set here may not match the actual endianess. OGC serialization encodes actual endianess
+        // and discards BB order set here.
+        default ByteBuffer toWellKnownBinary(T geometry)
+        {
+            return toWellKnownBinaryNativeOrder(geometry).order(ByteOrder.BIG_ENDIAN);
+        }
+
+        ByteBuffer toWellKnownBinaryNativeOrder(T geometry);
+
+        String toGeoJson(T geometry);
+
+        T fromWellKnownText(String source);
+
+        T fromWellKnownBinary(ByteBuffer source);
+
+        T fromGeoJson(String source);
+    }
+
+    public abstract GeometricType getType();
+
+    public abstract void validate() throws MarshalException;
+
+    public abstract Serializer getSerializer();
+
+    static void validateType(OGCGeometry geometry, Class<? extends OGCGeometry> klass)
+    {
+        if (!geometry.getClass().equals(klass))
+        {
+            throw new MarshalException(String.format("%s is not of type %s",
+                    geometry.getClass().getSimpleName(),
+                    klass.getSimpleName()));
+        }
+    }
+
+    static ByteBuffer getWkb(OGCGeometry geometry)
+    {
+        try
+        {
+            return geometry.asBinary();
+        }
+        catch (GeometryException | IllegalArgumentException e)
+        {
+            throw new MarshalException("Invalid Geometry", e);
+        }
+    }
+
+    static String getWkt(OGCGeometry geometry)
+    {
+        try
+        {
+            return geometry.asText();
+        }
+        catch (GeometryException | IllegalArgumentException e)
+        {
+            throw new MarshalException("Invalid Geometry", e);
+        }
+    }
+
+    static void validateNormalization(OGCGeometry geometry, ByteBuffer source)
+    {
+        ByteBuffer normalized = getWkb(geometry);
+        ByteBuffer inputCopy = source.slice();
+
+        // since the data we get is sometimes part of a longer string of bytes, we set the limit to the normalized
+        // buffer length. Normalization only ever adds and rearranges points though, so this should be ok
+        if (inputCopy.remaining() > normalized.remaining())
+        {
+            inputCopy.limit(normalized.remaining());
+        }
+
+        if (!normalized.equals(inputCopy))
+        {
+            String klass = geometry.getClass().getSimpleName();
+            String msg = String.format("%s is not normalized. %s should be defined/serialized as: %s", klass, klass, getWkt(geometry));
+            throw new MarshalException(msg);
+        }
+    }
+
+    static <T extends OGCGeometry> T fromOgcWellKnownText(String source, Class<T> klass)
+    {
+        OGCGeometry geometry;
+        try
+        {
+            geometry = OGCGeometry.fromText(source);
+        }
+        catch (IllegalArgumentException e)
+        {
+            throw new MarshalException(e.getMessage(), e); // keep the parser failure as the cause
+        }
+        validateType(geometry, klass); // throws MarshalException unless the parsed class is exactly klass
+        return (T) geometry;
+    }
+
+    static <T extends OGCGeometry> T fromOgcWellKnownBinary(ByteBuffer source, Class<T> klass)
+    {
+        OGCGeometry geometry;
+        try
+        {
+            geometry = OGCGeometry.fromBinary(source);
+        }
+        catch (IllegalArgumentException e)
+        {
+            throw new MarshalException(e.getMessage(), e); // keep the parser failure as the cause
+        }
+        validateType(geometry, klass); // throws MarshalException unless the parsed class is exactly klass
+        validateNormalization(geometry, source); // input bytes must match the geometry's own WKB serialization
+        return (T) geometry;
+    }
+
+    static <T extends OGCGeometry> T fromOgcGeoJson(String source, Class<T> klass)
+    {
+        OGCGeometry geometry;
+        try
+        {
+            geometry = OGCGeometry.fromGeoJson(source);
+        }
+        catch (IllegalArgumentException | JsonGeometryException e)
+        {
+            throw new MarshalException(e.getMessage(), e); // keep the parser failure as the cause
+        }
+        validateType(geometry, klass); // throws MarshalException unless the parsed class is exactly klass
+        return (T) geometry;
+    }
+
+    /**
+     * Tests whether this geometry contains {@code geometry}, delegating to the wrapped ESRI OGC geometries.
+     *
+     * @return the wrapped {@code contains} result, or false when either side has no wrapped geometry
+     * @throws UnsupportedOperationException if {@code geometry} is null
+     */
+    public boolean contains(OgcGeometry geometry)
+    {
+        // the instanceof test can only fail for null, so the message must not dereference the argument
+        if (!(geometry instanceof OgcGeometry))
+        {
+            throw new UnsupportedOperationException(String.format("%s is not compatible with %s.contains",
+                    geometry == null ? "null" : geometry.getClass().getSimpleName(), getClass().getSimpleName()));
+        }
+
+        OGCGeometry thisGeometry = getOgcGeometry();
+        OGCGeometry thatGeometry = geometry.getOgcGeometry();
+        return thisGeometry != null && thatGeometry != null && thisGeometry.contains(thatGeometry);
+    }
+
+    protected abstract OGCGeometry getOgcGeometry();
+
+    static void validateOgcGeometry(OGCGeometry geometry)
+    {
+        try
+        {
+            if (geometry.is3D())
+            {
+                throw new MarshalException(String.format("'%s' is not 2D", getWkt(geometry)));
+            }
+
+            if (!geometry.isSimple())
+            {
+                throw new MarshalException(String.format("'%s' is not simple. Points and edges cannot self-intersect.", getWkt(geometry)));
+            }
+        }
+        catch (GeometryException e)
+        {
+            throw new MarshalException("Invalid geometry", e);
+        }
+    }
+
+    public String asWellKnownText()
+    {
+        return getSerializer().toWellKnownText(this);
+    }
+
+    public ByteBuffer asWellKnownBinary()
+    {
+        return getSerializer().toWellKnownBinary(this);
+    }
+
+    public String asGeoJson()
+    {
+        return getSerializer().toGeoJson(this);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/geometry/Point.java b/src/java/org/apache/cassandra/db/marshal/geometry/Point.java
new file mode 100644
index 000000000000..0992bd768725
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/geometry/Point.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.geometry;
+
+import java.nio.ByteBuffer;
+
+import com.esri.core.geometry.GeoJsonExportFlags;
+import com.esri.core.geometry.Operator;
+import com.esri.core.geometry.OperatorExportToGeoJson;
+import com.esri.core.geometry.OperatorFactoryLocal;
+import com.esri.core.geometry.ogc.OGCGeometry;
+import com.esri.core.geometry.ogc.OGCPoint;
+import org.apache.cassandra.serializers.MarshalException;
+
+public class Point extends OgcGeometry
+{
+    public static final Serializer<Point> serializer = new Serializer<Point>()
+    {
+        @Override
+        public String toWellKnownText(Point geometry)
+        {
+            return geometry.point.asText();
+        }
+
+        @Override
+        public ByteBuffer toWellKnownBinaryNativeOrder(Point geometry)
+        {
+            return geometry.point.asBinary();
+        }
+
+        @Override
+        public String toGeoJson(Point geometry)
+        {
+            OperatorExportToGeoJson op = (OperatorExportToGeoJson) OperatorFactoryLocal.getInstance().getOperator(Operator.Type.ExportToGeoJson);
+            return op.execute(GeoJsonExportFlags.geoJsonExportSkipCRS, geometry.point.esriSR, geometry.point.getEsriGeometry());
+        }
+
+        @Override
+        public Point fromWellKnownText(String source)
+        {
+            return new Point(fromOgcWellKnownText(source, OGCPoint.class));
+        }
+
+        @Override
+        public Point fromWellKnownBinary(ByteBuffer source)
+        {
+            return new Point(fromOgcWellKnownBinary(source, OGCPoint.class));
+        }
+
+        @Override
+        public Point fromGeoJson(String source)
+        {
+            return new Point(fromOgcGeoJson(source, OGCPoint.class));
+        }
+    };
+
+    final OGCPoint point;
+
+    public Point(double x, double y)
+    {
+        this(new OGCPoint(new com.esri.core.geometry.Point(x, y), OgcGeometry.SPATIAL_REFERENCE_4326));
+    }
+
+    private Point(OGCPoint point)
+    {
+        this.point = point;
+        validate();
+    }
+
+    @Override
+    public boolean contains(OgcGeometry geometry)
+    {
+        return false;
+    }
+
+    @Override
+    public GeometricType getType()
+    {
+        return GeometricType.POINT;
+    }
+
+    @Override
+    public void validate() throws MarshalException
+    {
+        validateOgcGeometry(point);
+        if (point.isEmpty() || point.is3D())
+            throw new MarshalException(getClass().getSimpleName() + " requires exactly 2 coordinate values");
+    }
+
+    @Override
+    protected OGCGeometry getOgcGeometry()
+    {
+        return point;
+    }
+
+    @Override
+    public Serializer getSerializer()
+    {
+        return serializer;
+    }
+
+    public OGCPoint getOgcPoint()
+    {
+        return point;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Point point1 = (Point) o;
+
+        return !(point != null ? !point.equals(point1.point) : point1.point != null);
+
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return point != null ? point.hashCode() : 0;
+    }
+
+    @Override
+    public String toString()
+    {
+        return asWellKnownText();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/marshal/geometry/Polygon.java b/src/java/org/apache/cassandra/db/marshal/geometry/Polygon.java
new file mode 100644
index 000000000000..d51181566d04
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/marshal/geometry/Polygon.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.geometry;
+
+import java.nio.ByteBuffer;
+
+import com.esri.core.geometry.GeoJsonExportFlags;
+import com.esri.core.geometry.Operator;
+import com.esri.core.geometry.OperatorExportToGeoJson;
+import com.esri.core.geometry.OperatorFactoryLocal;
+import com.esri.core.geometry.ogc.OGCGeometry;
+import com.esri.core.geometry.ogc.OGCPolygon;
+import org.apache.cassandra.serializers.MarshalException;
+
+/** Geometry wrapper around an ESRI {@link OGCPolygon}; the wrapped value is validated when constructed. */
+public class Polygon extends OgcGeometry
+{
+    public static final Serializer<Polygon> serializer = new Serializer<Polygon>()
+    {
+        @Override
+        public String toWellKnownText(Polygon geometry)
+        {
+            return geometry.polygon.asText();
+        }
+
+        @Override
+        public ByteBuffer toWellKnownBinaryNativeOrder(Polygon geometry)
+        {
+            return geometry.polygon.asBinary();
+        }
+
+        @Override
+        public String toGeoJson(Polygon geometry)
+        {
+            // Export through the ESRI GeoJSON operator, passing the geoJsonExportSkipCRS flag.
+            OperatorExportToGeoJson exporter = (OperatorExportToGeoJson) OperatorFactoryLocal.getInstance().getOperator(Operator.Type.ExportToGeoJson);
+            OGCPolygon ogc = geometry.polygon;
+            return exporter.execute(GeoJsonExportFlags.geoJsonExportSkipCRS, ogc.esriSR, ogc.getEsriGeometry());
+        }
+
+        @Override
+        public Polygon fromWellKnownText(String source)
+        {
+            return new Polygon(fromOgcWellKnownText(source, OGCPolygon.class)); // constructor validates
+        }
+
+        @Override
+        public Polygon fromWellKnownBinary(ByteBuffer source)
+        {
+            return new Polygon(fromOgcWellKnownBinary(source, OGCPolygon.class)); // constructor validates
+        }
+
+        @Override
+        public Polygon fromGeoJson(String source)
+        {
+            return new Polygon(fromOgcGeoJson(source, OGCPolygon.class)); // constructor validates
+        }
+    };
+
+    OGCPolygon polygon;
+
+    public Polygon(OGCPolygon polygon)
+    {
+        this.polygon = polygon;
+        validate(); // runs validateOgcGeometry on the value just stored
+    }
+
+    @Override
+    protected OGCGeometry getOgcGeometry()
+    {
+        return polygon;
+    }
+
+    @Override
+    public GeometricType getType()
+    {
+        return GeometricType.POLYGON;
+    }
+
+    @Override
+    public void validate() throws MarshalException
+    {
+        validateOgcGeometry(polygon);
+    }
+
+    @Override
+    public Serializer getSerializer()
+    {
+        return serializer;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (o == this) return true;
+        if (o == null || o.getClass() != getClass()) return false;
+        OGCPolygon theirs = ((Polygon) o).polygon;
+        return polygon == null ? theirs == null : polygon.equals(theirs);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return polygon == null ? 0 : polygon.hashCode(); // derived from the wrapped polygon only, like equals
+    }
+
+    @Override
+    public String toString()
+    {
+        return asWellKnownText();
+    }
+}
diff --git a/src/java/org/apache/cassandra/serializers/DateRangeSerializer.java b/src/java/org/apache/cassandra/serializers/DateRangeSerializer.java
new file mode 100644
index 000000000000..ae3db350b62c
--- /dev/null
+++ b/src/java/org/apache/cassandra/serializers/DateRangeSerializer.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.serializers;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.time.Instant;
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.marshal.ValueAccessor;
+import org.apache.cassandra.db.marshal.datetime.DateRange;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Responsible for {@link DateRange} serialization/deserialization with respect to the following format:
+ * -------------------------
+ * <type>[<time0><precision0>[<time1><precision1>]]
+ *
+ * Where:
+ *
+ * <type> is a [byte] encoding of
+ * - 0x00 - single value as in "2001-01-01"
+ * - 0x01 - closed range as in "[2001-01-01 TO 2001-01-31]"
+ * - 0x02 - open range high as in "[2001-01-01 TO *]"
+ * - 0x03 - open range low as in "[* TO 2001-01-01]"
+ * - 0x04 - both ranges open as in "[* TO *]"
+ * - 0x05 - single value open as in "*"
+ *
+ * <time0> is an optional [long] millisecond offset from epoch. Absent for <type> in [4,5], present otherwise.
+ * Represents a single date value for <type> = 0, the range start for <type> in [1,2], or range end for <type> = 3.
+ *
+ * <precision0> is an optional [byte] and represents the precision of field <time0>. Absent for <type> in [4,5], present otherwise.
+ * Possible values are:
+ * - 0x00 - year
+ * - 0x01 - month
+ * - 0x02 - day
+ * - 0x03 - hour
+ * - 0x04 - minute
+ * - 0x05 - second
+ * - 0x06 - millisecond
+ *
+ * <time1> is an optional [long] millisecond offset from epoch. Represents the range end for <type> = 1. Not present
+ * otherwise.
+ *
+ * <precision1> is an optional [byte] and represents the precision of field <time1>. Only present if <type> = 1. Values
+ * are the same as for <precision0>.
+ */
+public final class DateRangeSerializer extends TypeSerializer<DateRange>
+{
+    public static final DateRangeSerializer instance = new DateRangeSerializer();
+
+    // e.g. 2001-01-01
+    private final static byte DATE_RANGE_TYPE_SINGLE_DATE = 0x00;
+    // e.g. [2001-01-01 TO 2001-01-31]
+    private final static byte DATE_RANGE_TYPE_CLOSED_RANGE = 0x01;
+    // e.g. [2001-01-01 TO *]
+    private final static byte DATE_RANGE_TYPE_OPEN_RANGE_HIGH = 0x02;
+    // e.g. [* TO 2001-01-01]
+    private final static byte DATE_RANGE_TYPE_OPEN_RANGE_LOW = 0x03;
+    // [* TO *]
+    private final static byte DATE_RANGE_TYPE_BOTH_OPEN_RANGE = 0x04;
+    // *
+    private final static byte DATE_RANGE_TYPE_SINGLE_DATE_OPEN = 0x05;
+
+    /**
+     * Size of the single serialized DateRange boundary. As specified in {@link DateRangeSerializer}.
+     *
+     * Tightly coupled with {@link #deserializeDateRangeLowerBound(int, Object, ValueAccessor)} and
+     * {@link #deserializeDateRangeUpperBound(int, Object, ValueAccessor)}.
+     */
+    private final static int SERIALIZED_DATE_RANGE_BOUND_SIZE = TypeSizes.LONG_SIZE + TypeSizes.BYTE_SIZE;
+
+    private static final List<Integer> VALID_SERIALIZED_LENGTHS = ImmutableList.of(
+            // types: 0x04, 0x05
+            Byte.BYTES,
+            // types: 0x00, 0x02, 0x03
+            Byte.BYTES + Long.BYTES + Byte.BYTES,
+            // types: 0x01
+            Byte.BYTES + Long.BYTES + Byte.BYTES + Long.BYTES + Byte.BYTES
+    );
+
+    /**
+     * Encodes the range as a type byte followed by one {@code <time><precision>} pair per bounded end;
+     * a {@code null} range is encoded as an empty buffer.
+     */
+    @Override
+    public ByteBuffer serialize(DateRange dateRange)
+    {
+        if (dateRange == null)
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+
+        byte rangeType = encodeType(dateRange);
+        DateRange.DateRangeBound lowerBound = dateRange.getLowerBound();
+        boolean hasLower = !lowerBound.isUnbounded();
+        // A range without a defined upper bound (single value) writes nothing for it, just like an unbounded end.
+        boolean hasUpper = dateRange.isUpperBoundDefined() && !dateRange.getUpperBound().isUnbounded();
+
+        // One type byte plus SERIALIZED_DATE_RANGE_BOUND_SIZE for every bound that is actually written.
+        int bufferSize = TypeSizes.BYTE_SIZE
+                         + (hasLower ? SERIALIZED_DATE_RANGE_BOUND_SIZE : 0)
+                         + (hasUpper ? SERIALIZED_DATE_RANGE_BOUND_SIZE : 0);
+
+        try (DataOutputBuffer output = new DataOutputBuffer(bufferSize))
+        {
+            output.writeByte(rangeType);
+            if (hasLower)
+            {
+                output.writeLong(lowerBound.getTimestamp().toEpochMilli());
+                output.writeByte(lowerBound.getPrecision().toEncoded());
+            }
+            if (hasUpper)
+            {
+                DateRange.DateRangeBound upperBound = dateRange.getUpperBound();
+                output.writeLong(upperBound.getTimestamp().toEpochMilli());
+                output.writeByte(upperBound.getPrecision().toEncoded());
+            }
+            return output.buffer();
+        }
+        catch (IOException e)
+        {
+            throw new AssertionError("Unexpected error", e);
+        }
+    }
+
+    /** Inverse of {@link #serialize(DateRange)}; an empty value decodes to {@code null}. */
+    @Override
+    public <V> DateRange deserialize(V value, ValueAccessor<V> accessor)
+    {
+        if (accessor.isEmpty(value))
+        {
+            return null;
+        }
+
+        try
+        {
+            byte type = accessor.toByte(value);
+            int offset = TypeSizes.BYTE_SIZE;
+            switch (type)
+            {
+                case DATE_RANGE_TYPE_SINGLE_DATE:
+                    return new DateRange(deserializeDateRangeLowerBound(offset, value, accessor));
+                case DATE_RANGE_TYPE_CLOSED_RANGE:
+                    DateRange.DateRangeBound lowerBound = deserializeDateRangeLowerBound(offset, value, accessor);
+                    offset += SERIALIZED_DATE_RANGE_BOUND_SIZE;
+                    DateRange.DateRangeBound upperBound = deserializeDateRangeUpperBound(offset, value, accessor);
+                    return new DateRange(lowerBound, upperBound);
+                case DATE_RANGE_TYPE_OPEN_RANGE_HIGH:
+                    return new DateRange(deserializeDateRangeLowerBound(offset, value, accessor), DateRange.DateRangeBound.UNBOUNDED);
+                case DATE_RANGE_TYPE_OPEN_RANGE_LOW:
+                    return new DateRange(DateRange.DateRangeBound.UNBOUNDED, deserializeDateRangeUpperBound(offset, value, accessor));
+                case DATE_RANGE_TYPE_BOTH_OPEN_RANGE:
+                    return new DateRange(DateRange.DateRangeBound.UNBOUNDED, DateRange.DateRangeBound.UNBOUNDED);
+                case DATE_RANGE_TYPE_SINGLE_DATE_OPEN:
+                    return new DateRange(DateRange.DateRangeBound.UNBOUNDED);
+                default:
+                    throw new IllegalArgumentException("Unknown date range type: " + type);
+            }
+        }
+        catch (IOException e)
+        {
+            throw new AssertionError("Unexpected error", e);
+        }
+    }
+
+    /** Checks length, decodability and bound order; malformed input is reported as a {@link MarshalException}. */
+    @Override
+    public <V> void validate(V value, ValueAccessor<V> accessor) throws MarshalException
+    {
+        if (accessor.isEmpty(value))
+            return;
+
+        int size = accessor.size(value);
+        if (!VALID_SERIALIZED_LENGTHS.contains(size))
+            throw new MarshalException(String.format("Date range should have %s bytes, got %d instead.", VALID_SERIALIZED_LENGTHS, size));
+
+        DateRange dateRange;
+        try
+        {
+            dateRange = deserialize(value, accessor);
+        }
+        catch (IllegalArgumentException | IndexOutOfBoundsException e)
+        {
+            // Unknown type byte, a precision byte the decoder rejects, or fewer bytes than the type requires:
+            // translate the raw decoding failure so callers only have to handle MarshalException.
+            throw new MarshalException("Invalid date range: " + e.getMessage(), e);
+        }
+        validateDateRange(dateRange);
+    }
+
+    @Override
+    public String toString(DateRange dateRange)
+    {
+        return dateRange == null ? "" : dateRange.formatToSolrString();
+    }
+
+    @Override
+    public Class<DateRange> getType()
+    {
+        return DateRange.class;
+    }
+
+    /** Picks the {@code <type>} byte from which bounds are defined and which of them are unbounded. */
+    private byte encodeType(DateRange dateRange)
+    {
+        boolean lowerOpen = dateRange.getLowerBound().isUnbounded();
+        if (!dateRange.isUpperBoundDefined())
+            return lowerOpen ? DATE_RANGE_TYPE_SINGLE_DATE_OPEN : DATE_RANGE_TYPE_SINGLE_DATE;
+
+        boolean upperOpen = dateRange.getUpperBound().isUnbounded();
+        if (lowerOpen)
+            return upperOpen ? DATE_RANGE_TYPE_BOTH_OPEN_RANGE : DATE_RANGE_TYPE_OPEN_RANGE_LOW;
+        return upperOpen ? DATE_RANGE_TYPE_OPEN_RANGE_HIGH : DATE_RANGE_TYPE_CLOSED_RANGE;
+    }
+
+    private <V> DateRange.DateRangeBound deserializeDateRangeLowerBound(int offset, V value, ValueAccessor<V> accessor) throws IOException
+    {
+        long epochMillis = accessor.getLong(value, offset);
+        offset += TypeSizes.LONG_SIZE;
+        Precision precision = Precision.fromEncoded(accessor.getByte(value, offset));
+        return DateRange.DateRangeBound.lowerBound(Instant.ofEpochMilli(epochMillis), precision);
+    }
+
+    private <V> DateRange.DateRangeBound deserializeDateRangeUpperBound(int offset, V value, ValueAccessor<V> accessor) throws IOException
+    {
+        long epochMillis = accessor.getLong(value, offset);
+        offset += TypeSizes.LONG_SIZE;
+        Precision precision = Precision.fromEncoded(accessor.getByte(value, offset));
+        return DateRange.DateRangeBound.upperBound(Instant.ofEpochMilli(epochMillis), precision);
+    }
+
+    private void validateDateRange(DateRange dateRange)
+    {
+        if (dateRange != null && !dateRange.getLowerBound().isUnbounded() && dateRange.isUpperBoundDefined() && !dateRange.getUpperBound().isUnbounded())
+        {
+            if (dateRange.getLowerBound().getTimestamp().isAfter(dateRange.getUpperBound().getTimestamp()))
+            {
+                throw new MarshalException(String.format("Lower bound of a date range should be before upper bound, got: %s",
+                        dateRange.formatToSolrString()));
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/DateRangeIntegrationTest.java b/test/unit/org/apache/cassandra/db/marshal/DateRangeIntegrationTest.java
new file mode 100644
index 000000000000..d444296dbc84
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/DateRangeIntegrationTest.java
@@ -0,0 +1,287 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Sets;
+import org.junit.BeforeClass;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.ExpectedException;
+
+import com.datastax.driver.core.CodecRegistry;
+import com.datastax.driver.core.PreparedStatement;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.TupleValue;
+import com.datastax.driver.core.UDTValue;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.marshal.datetime.DateRange;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBuilder;
+import org.apache.cassandra.db.marshal.datetime.DateRangeCodec;
+import org.hamcrest.Matchers;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertThat;
+
+public class DateRangeIntegrationTest extends CQLTester
+{
+    @BeforeClass
+    public static void setup()
+    {
+        CodecRegistry.DEFAULT_INSTANCE.register(DateRangeCodec.instance); // once per class, on the driver's default registry
+    }
+
+    @Rule
+    public ExpectedException expectedException = ExpectedException.none();
+
+    @Test
+    public void testDateRangeAsPrimaryKey() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k 'DateRangeType' PRIMARY KEY, v int)");
+        executeNet("INSERT INTO dr (k, v) VALUES ('[2010-12-03 TO 2010-12-04]', 1)");
+        executeNet("INSERT INTO dr (k, v) VALUES ('[2015-12-03T10:15:30.001Z TO 2016-01-01T00:05:11.967Z]', 2)");
+        // Read both rows back and compare the decoded keys against explicitly built ranges.
+        ResultSet results = executeNet(String.format("SELECT * FROM %s.dr", keyspace));
+        List<Row> rows = results.all();
+
+        assertEquals(2, rows.size());
+        DateRange expected = dateRange("2010-12-03T00:00:00.000Z", Precision.DAY, "2010-12-04T23:59:59.999Z", Precision.DAY);
+        assertEquals(expected, rows.get(0).get("k", DateRange.class));
+        expected = dateRange("2015-12-03T10:15:30.001Z", Precision.MILLISECOND, "2016-01-01T00:05:11.967Z", Precision.MILLISECOND);
+        assertEquals(expected, rows.get(1).get("k", DateRange.class));
+        // Look up one row by its date-range key (the upper bound literal here carries no trailing 'Z').
+        results = executeNet("SELECT * FROM dr WHERE k = '[2015-12-03T10:15:30.001Z TO 2016-01-01T00:05:11.967]'");
+        rows = results.all();
+
+        assertEquals(1, rows.size());
+        assertEquals(2, rows.get(0).getInt("v"));
+
+        // Repeat the full read once flush(keyspace, "dr") has run.
+        flush(keyspace, "dr");
+
+        results = executeNet("SELECT * FROM dr");
+        rows = results.all();
+
+        assertEquals(2, rows.size());
+        expected = dateRange("2015-12-03T10:15:30.001Z", Precision.MILLISECOND, "2016-01-01T00:05:11.967Z", Precision.MILLISECOND);
+        assertEquals(expected, rows.get(1).get("k", DateRange.class));
+    }
+
+    @Test
+    public void testCreateDateRange() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k int PRIMARY KEY, v 'DateRangeType')");
+        executeNet("INSERT INTO dr (k, v) VALUES (1, '[2000-01-01T10:15:30.301Z TO *]')");
+        executeNet("INSERT INTO dr (k, v) VALUES (2, '[2000-02 TO 2000-03]')");
+        executeNet("INSERT INTO dr (k, v) VALUES (3, '[* TO 2020]')");
+        executeNet("INSERT INTO dr (k, v) VALUES (4, null)");
+        executeNet("INSERT INTO dr (k) VALUES (5)");
+        // All five rows must come back, including the two without a range (explicit null and omitted column).
+        ResultSet results = executeNet("SELECT * FROM dr");
+        List<Row> rows = results.all();
+        assertEquals(5, rows.size());
+        DateRange dateRange = rows.get(4).get("v", DateRange.class); // asserted below to equal '[* TO 2020]' (inserted with k = 3)
+        assertNotNull(dateRange);
+        DateRange expected = DateRangeBuilder.dateRange()
+                .withUnboundedLowerBound()
+                .withUpperBound("2020-12-31T23:59:59.999Z", Precision.YEAR)
+                .build();
+        assertEquals(expected, dateRange);
+    }
+
+    @Test
+    public void testInvalidDateRangeOrder() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k int PRIMARY KEY, v 'DateRangeType')");
+        // The lower bound is later than the upper bound: the insert must fail with both messages below.
+        expectedException.expect(InvalidQueryException.class);
+        expectedException.expectMessage("Wrong order: 2020-01-01T10:15:30.009Z TO 2010-01-01T00:05:11.031Z");
+        expectedException.expectMessage("Could not parse date range: [2020-01-01T10:15:30.009Z TO 2010-01-01T00:05:11.031Z]");
+        executeNet("INSERT INTO dr (k, v) VALUES (1, '[2020-01-01T10:15:30.009Z TO 2010-01-01T00:05:11.031Z]')");
+    }
+
+    @Test
+    public void testDateRangeInTuples() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy' , 'replication_factor': '1'}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TYPE IF NOT EXISTS test_udt (i int, range 'DateRangeType')");
+        executeNet("CREATE TABLE dr (k int PRIMARY KEY, u test_udt, uf frozen<test_udt>, t tuple<'DateRangeType', int>, tf frozen<tuple<'DateRangeType', int>>)");
+        // Store the same range in a UDT, a frozen UDT, a tuple and a frozen tuple, each paired with a distinct int.
+        executeNet("INSERT INTO dr (k, u, uf, t, tf) VALUES (" +
+                "1, " +
+                "{i: 10, range: '[2000-01-01T10:15:30.003Z TO 2020-01-01T10:15:30.001Z]'}, " +
+                "{i: 20, range: '[2000-01-01T10:15:30.003Z TO 2020-01-01T10:15:30.001Z]'}, " +
+                "('[2000-01-01T10:15:30.003Z TO 2020-01-01T10:15:30.001Z]', 30), " +
+                "('[2000-01-01T10:15:30.003Z TO 2020-01-01T10:15:30.001Z]', 40))");
+
+        DateRange expected = dateRange("2000-01-01T10:15:30.003Z", Precision.MILLISECOND, "2020-01-01T10:15:30.001Z", Precision.MILLISECOND);
+        ResultSet results = executeNet("SELECT * FROM dr");
+        List<Row> rows = results.all();
+        assertEquals(1, rows.size());
+        // Each container must give back the same range together with its own int marker (10, 20, 30, 40).
+        UDTValue u = rows.get(0).get("u", UDTValue.class);
+        DateRange dateRange = u.get("range", DateRange.class);
+        assertEquals(expected, dateRange);
+        assertEquals(10, u.getInt("i"));
+
+        u = rows.get(0).get("uf", UDTValue.class);
+        dateRange = u.get("range", DateRange.class);
+        assertEquals(expected, dateRange);
+        assertEquals(20, u.getInt("i"));
+
+        TupleValue t = rows.get(0).get("t", TupleValue.class);
+        dateRange = t.get(0, DateRange.class);
+        assertEquals(expected, dateRange);
+        assertEquals(30, t.getInt(1));
+
+        t = rows.get(0).get("tf", TupleValue.class);
+        dateRange = t.get(0, DateRange.class);
+        assertEquals(expected, dateRange);
+        assertEquals(40, t.getInt(1));
+    }
+
+    @Test
+    public void testDateRangeInCollections() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy' , 'replication_factor': '1'}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k int PRIMARY KEY, l list<'DateRangeType'>, s set<'DateRangeType'>, dr2i map<'DateRangeType', int>, i2dr map<int, 'DateRangeType'>)");
+        // The set literal lists the first range twice; only two distinct elements are expected back.
+        executeNet("INSERT INTO dr (k, l, s, i2dr, dr2i) VALUES (" +
+                "1, " +
+                "['[2000-01-01T10:15:30.001Z TO 2020]', '[2010-01-01T10:15:30.001Z TO 2020]', '2001-01-02'], " +
+                "{'[2000-01-01T10:15:30.001Z TO 2020]', '[2000-01-01T10:15:30.001Z TO 2020]', '[2010-01-01T10:15:30.001Z TO 2020]'}, " +
+                "{1: '[2000-01-01T10:15:30.001Z TO 2020]', 2: '[2010-01-01T10:15:30.001Z TO 2020]'}, " +
+                "{'[2000-01-01T10:15:30.001Z TO 2020]': 1, '[2010-01-01T10:15:30.001Z TO 2020]': 2})");
+
+        ResultSet results = executeNet("SELECT * FROM dr");
+        List<Row> rows = results.all();
+        assertEquals(1, rows.size());
+        // The upper bound '2020' is expected to decode as the last millisecond of that year with YEAR precision.
+        List<DateRange> drList = rows.get(0).getList("l", DateRange.class);
+        assertEquals(3, drList.size());
+        assertEquals(dateRange("2000-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR), drList.get(0));
+        assertEquals(dateRange("2010-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR), drList.get(1));
+        assertEquals(DateRangeBuilder.dateRange().withLowerBound("2001-01-02T00:00:00.000Z", Precision.DAY).build(), drList.get(2));
+
+        Set<DateRange> drSet = rows.get(0).getSet("s", DateRange.class);
+        assertEquals(2, drSet.size());
+        assertEquals(
+                Sets.newHashSet(
+                        dateRange("2000-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR),
+                        dateRange("2010-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR)),
+                drSet);
+
+        Map<DateRange, Integer> dr2i = rows.get(0).getMap("dr2i", DateRange.class, Integer.class);
+        assertEquals(2, dr2i.size());
+        assertEquals(1, (int) dr2i.get(dateRange("2000-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR)));
+        assertEquals(2, (int) dr2i.get(dateRange("2010-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR)));
+
+        Map<Integer, DateRange> i2dr = rows.get(0).getMap("i2dr", Integer.class, DateRange.class);
+        assertEquals(2, i2dr.size());
+        assertEquals(dateRange("2000-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR), i2dr.get(1));
+        assertEquals(dateRange("2010-01-01T10:15:30.001Z", Precision.MILLISECOND, "2020-12-31T23:59:59.999Z", Precision.YEAR), i2dr.get(2));
+    }
+
+    @Test
+    public void testPreparedStatementsWithDateRange() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k int PRIMARY KEY, v 'DateRangeType')");
+        // Bind a DateRange value directly (DAY lower bound, MONTH upper bound) instead of using a string literal.
+        Session session = sessionNet();
+        PreparedStatement statement = session.prepare(String.format("INSERT INTO %s.dr (k,v) VALUES(?,?)", keyspace));
+
+        DateRange dateRange = dateRange("2007-12-03T00:00:00.000Z", Precision.DAY, "2007-12-17T00:00:00.000Z", Precision.MONTH);
+        session.execute(statement.bind(1, dateRange));
+
+        ResultSet results = executeNet("SELECT * FROM dr");
+        List<Row> rows = results.all();
+        assertEquals(1, rows.size());
+
+        DateRange actual = rows.get(0).get("v", DateRange.class);
+        assertEquals(Precision.DAY, actual.getLowerBound().getPrecision());
+        assertEquals(Precision.MONTH, actual.getUpperBound().getPrecision());
+        assertEquals("[2007-12-03 TO 2007-12]", actual.formatToSolrString());
+        // The JSON rendering of the row must contain the same textual form.
+        results = executeNet("SELECT JSON * FROM dr");
+        assertThat(results.all().get(0).toString(), Matchers.containsString("\"v\": \"[2007-12-03 TO 2007-12]\""));
+    }
+
+    @Test
+    public void testSemanticallyEquivalentDateRanges() throws Throwable
+    {
+        String keyspace = randomKeyspace();
+        executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}", keyspace));
+        executeNet("USE " + keyspace);
+        executeNet("CREATE TABLE dr (k int, c0 'DateRangeType', PRIMARY KEY (k, c0))");
+        // Five spellings of the single day 2016-01-01, all used as clustering values of the same partition.
+        executeNet("INSERT INTO dr (k, c0) VALUES (1, '2016-01-01')");
+        executeNet("INSERT INTO dr (k, c0) VALUES (1, '[2016-01-01 TO 2016-01-01]')");
+        executeNet("INSERT INTO dr (k, c0) VALUES (1, '[2016-01-01T00:00:00.000Z TO 2016-01-01]')");
+        executeNet("INSERT INTO dr (k, c0) VALUES (1, '[2016-01-01T00:00:00.000Z TO 2016-01-01:23:59:59.999Z]')");
+        executeNet("INSERT INTO dr (k, c0) VALUES (1, '[2016-01-01 TO 2016-01-01:23:59:59.999Z]')");
+        // Each spelling is expected to be stored as its own row, and an equality filter to match exactly one of them.
+        ResultSet results = executeNet("SELECT * FROM dr");
+        assertEquals(5, results.all().size());
+
+        results = executeNet("SELECT * FROM dr WHERE c0 = '2016-01-01' ALLOW FILTERING");
+        assertEquals(1, results.all().size());
+
+        results = executeNet("SELECT * FROM dr WHERE k = 1 AND c0 = '[2016-01-01T00:00:00.000Z TO 2016-01-01:23:59:59.999Z]'");
+        assertEquals(1, results.all().size());
+    }
+
+    private String randomKeyspace()
+    {
+        return "ks" + System.nanoTime(); // "ks" followed by the current nanoTime reading, so each call gets a fresh name
+    }
+
+    private DateRange dateRange(String lowerBound, Precision lowerBoundPrecision, String upperBound, Precision upperBoundPrecision)
+    {   // shorthand for building a range with both bounds set, each with its own precision
+        return DateRangeBuilder.dateRange()
+                .withLowerBound(lowerBound, lowerBoundPrecision)
+                .withUpperBound(upperBound, upperBoundPrecision)
+                .build();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/DateRangeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DateRangeTypeTest.java
new file mode 100644
index 000000000000..fd622e7bb745
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/DateRangeTypeTest.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.marshal.datetime.DateRange;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+import org.apache.cassandra.transport.ProtocolVersion;
+
+import static org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBuilder.dateRange;
+import static org.junit.Assert.assertEquals;
+
+@RunWith(Parameterized.class)
+public class DateRangeTypeTest
+{
+    private final DateRangeType dateRangeType = DateRangeType.instance;
+
+    /** Range value of the current parameter set. */
+    @Parameterized.Parameter(0)
+    public DateRange dataRange;
+
+    /** Textual form paired with {@link #dataRange} in the current parameter set. */
+    @Parameterized.Parameter(1)
+    public String source;
+
+    /**
+     * Serializes the range, renders it as JSON and expects the source text wrapped in double quotes.
+     */
+    @Test
+    public void shouldFormatDateRangeAsJson()
+    {
+        String json = dateRangeType.toJSONString(dateRangeType.decompose(dataRange), ProtocolVersion.CURRENT);
+        assertEquals('"' + source + '"', json);
+    }
+
+    /**
+     * Parses the source text and expects the deserialized result to equal the range.
+     */
+    @Test
+    public void shouldCreateProperDateRangeFromString()
+    {
+        ByteBuffer parsed = dateRangeType.fromString(source);
+        assertEquals(dataRange, dateRangeType.getSerializer().deserialize(parsed));
+    }
+
+    /**
+     * Supplies the (range, text) pairs the tests above run against. Each pair is exercised in both
+     * directions, so the text has to be exactly what the type prints for that range.
+     *
+     * @return one two-element array per parameter set
+     */
+    @SuppressWarnings("unused")
+    @Parameterized.Parameters(name = "dataRange = {0}, source = {1}")
+    public static Collection<Object[]> testData()
+    {
+        return Arrays.asList(
+                // unbounded upper end
+                pair(dateRange()
+                             .withLowerBound("1950-01-01T00:00:00.000Z", Precision.YEAR)
+                             .withUnboundedUpperBound()
+                             .build(),
+                     "[1950 TO *]"),
+                // bounds given at different precisions
+                pair(dateRange()
+                             .withLowerBound("1998-01-01T00:00:00.000Z", Precision.MILLISECOND)
+                             .withUpperBound("1999-02-01T00:00:00.000Z", Precision.DAY)
+                             .build(),
+                     "[1998-01-01T00:00:00.000Z TO 1999-02-01]"),
+                pair(dateRange()
+                             .withLowerBound("1930-12-03T01:01:01.003Z", Precision.DAY)
+                             .withUpperBound("1951-01-02T00:00:00.003Z", Precision.MILLISECOND)
+                             .build(),
+                     "[1930-12-03 TO 1951-01-02T00:00:00.003Z]"),
+                // unbounded lower end, then unbounded on both sides
+                pair(dateRange()
+                             .withUnboundedLowerBound()
+                             .withUpperBound("2014-01-02T00:00:00.003Z", Precision.YEAR)
+                             .build(),
+                     "[* TO 2014]"),
+                pair(dateRange()
+                             .withUnboundedLowerBound()
+                             .withUnboundedUpperBound()
+                             .build(),
+                     "[* TO *]"),
+                // lower bound only: the text is a bare value without brackets
+                pair(dateRange().withLowerBound("1966-03-03T03:30:30.030Z", Precision.YEAR).build(),
+                     "1966"),
+                pair(dateRange().withLowerBound("1700-01-01T00:00:00.000Z", Precision.MILLISECOND).build(),
+                     "1700-01-01T00:00:00.000Z"),
+                pair(dateRange().withLowerBound("-0009-01-01T00:00:00.000Z", Precision.YEAR).build(),
+                     "-0009")
+        );
+    }
+
+    /** Bundles one range with its text as a parameter row. */
+    private static Object[] pair(DateRange range, String text)
+    {
+        return new Object[]{ range, text };
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/GeometricTypeTests.java b/test/unit/org/apache/cassandra/db/marshal/GeometricTypeTests.java
new file mode 100644
index 000000000000..cbc0a4975a49
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/GeometricTypeTests.java
@@ -0,0 +1,84 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+
+import com.esri.core.geometry.Polyline;
+import com.esri.core.geometry.ogc.OGCLineString;
+import com.esri.core.geometry.ogc.OGCPolygon;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+import org.apache.cassandra.db.marshal.geometry.OgcGeometry;
+import org.apache.cassandra.db.marshal.geometry.Point;
+import org.apache.cassandra.db.marshal.geometry.Polygon;
+
+/** Shared static helpers for building geometry fixtures in the geometric type tests. */
+public class GeometricTypeTests
+{
+    public static Point p(double x, double y)
+    {
+        return new Point(x, y);
+    }
+
+    public static com.esri.core.geometry.Point ep(double x, double y)
+    {
+        return new com.esri.core.geometry.Point(x, y);
+    }
+
+    /** Converts a Cassandra {@link Point} to an ESRI point with the same coordinates. */
+    public static com.esri.core.geometry.Point ep(Point p)
+    {
+        return ep(p.getOgcPoint().X(), p.getOgcPoint().Y());
+    }
+
+    /** Builds a line string running through {@code p1}, {@code p2} and then each of {@code pn} in order. */
+    public static LineString lineString(Point p1, Point p2, Point... pn)
+    {
+        Polyline polyline = new Polyline(ep(p1), ep(p2));
+        for (Point p : pn)
+            polyline.lineTo(ep(p));
+        return new LineString(new OGCLineString(polyline, 0, OgcGeometry.SPATIAL_REFERENCE_4326));
+    }
+
+    /** Builds a single-path polygon visiting {@code p1}, {@code p2}, {@code p3} and then each of {@code pn}. */
+    public static Polygon polygon(Point p1, Point p2, Point p3, Point... pn)
+    {
+        com.esri.core.geometry.Polygon polygon = new com.esri.core.geometry.Polygon();
+        polygon.startPath(ep(p1));
+        polygon.lineTo(ep(p2));
+        polygon.lineTo(ep(p3));
+        for (Point p : pn)
+            polygon.lineTo(ep(p));
+        return new Polygon(new OGCPolygon(polygon, OgcGeometry.SPATIAL_REFERENCE_4326));
+    }
+
+    /**
+     * Copies the remaining bytes of {@code bb} between a leading and a trailing 4-byte marker and returns the copy
+     * positioned just past the leading marker, to aid testing proper deserialization from continuous buffers.
+     * The source is read through a duplicate, so the caller's buffer keeps its position.
+     */
+    public static ByteBuffer padBuffer(ByteBuffer bb)
+    {
+        ByteBuffer padded = ByteBuffer.allocate(8 + bb.remaining()).putInt(49).put(bb.duplicate()).putInt(50);
+        padded.position(4);
+        return padded;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/GeometryCodecTest.java b/test/unit/org/apache/cassandra/db/marshal/GeometryCodecTest.java
new file mode 100644
index 000000000000..e21806572727
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/GeometryCodecTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.datastax.driver.core.ProtocolVersion;
+import junit.framework.TestCase;
+import org.apache.cassandra.db.marshal.geometry.OgcGeometry;
+import org.apache.cassandra.db.marshal.geometry.Point;
+
+/** JUnit 4 tests for {@link GeometryCodec}, exercised through a codec built on the point type. */
+public class GeometryCodecTest
+{
+    private final GeometryCodec<Point> codec = new GeometryCodec<>(PointType.instance);
+
+    @Test
+    public void testFormat()
+    {
+        Assert.assertEquals("NULL", codec.format(null));
+        Assert.assertEquals("POINT (5.4 1)", codec.format(new Point(5.4, 1.0)));
+    }
+
+    @Test
+    public void testParse()
+    {
+        Assert.assertNull(codec.parse(null));
+        Assert.assertEquals(new Point(5.4, 1.0), codec.parse("POINT (5.4 1)"));
+    }
+
+    @Test
+    public void testSerializationRoundTrip()
+    {
+        Point point = new Point(5.4, 1.0);
+        ByteBuffer serialized = codec.serialize(point, ProtocolVersion.NEWEST_SUPPORTED);
+        Assert.assertEquals(point, codec.deserialize(serialized, ProtocolVersion.NEWEST_SUPPORTED));
+    }
+
+    @Test
+    public void testEmptyValuesSerialization()
+    {
+        Assert.assertNull(codec.serialize(null, ProtocolVersion.NEWEST_SUPPORTED));
+        Assert.assertNull(codec.deserialize(null, ProtocolVersion.NEWEST_SUPPORTED));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/marshal/GeometryIntegrationTest.java b/test/unit/org/apache/cassandra/db/marshal/GeometryIntegrationTest.java
new file mode 100644
index 000000000000..f25fd3f0430a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/GeometryIntegrationTest.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.CodecRegistry;
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+import org.apache.cassandra.db.marshal.geometry.OgcGeometry;
+import org.apache.cassandra.db.marshal.geometry.Point;
+import org.apache.cassandra.db.marshal.geometry.Polygon;
+import org.apache.cassandra.transport.ProtocolVersion;
+
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.lineString;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.p;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.polygon;
+
+public class GeometryIntegrationTest extends CQLTester
+{
+    /** Registers the three geometry codecs with the driver's default registry before any test runs. */
+    @BeforeClass
+    public static void setupCluster() throws IOException
+    {
+        CodecRegistry.DEFAULT_INSTANCE.register(GeometryCodec.pointCodec,
+                                                GeometryCodec.lineStringCodec,
+                                                GeometryCodec.polygonCodec);
+    }
+
+    @Before
+    public void setUpKeyspace() throws Throwable
+    {
+        executeNet("CREATE KEYSPACE ks WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor': 1}");
+    }
+
+    @After
+    public void teardown() throws Throwable
+    {
+        executeNet("DROP KEYSPACE ks;");
+    }
+
+    /**
+     * Round-trips one geometry through a single-column table: inserts the given text, reads the row back through
+     * the driver and compares it with {@code expected}, then checks that {@code toJson} matches the type's own JSON.
+     *
+     * @param tableName  table to create inside keyspace {@code ks}
+     * @param wkt        text inserted as the column value
+     * @param columnType quoted type name used in the column definition
+     */
+    private <T extends OgcGeometry> void testType(T expected, Class<T> klass, AbstractGeometricType<T> type, String tableName, String wkt, String columnType) throws Throwable
+    {
+        executeNet(String.format("CREATE TABLE ks.%s (k INT PRIMARY KEY, g '%s')", tableName, columnType));
+        executeNet(String.format("INSERT INTO ks.%s (k, g) VALUES (1, '%s')", tableName, wkt));
+
+        // the stored value must come back from the driver as the expected geometry
+        List<Row> selected = executeNet(String.format("SELECT * FROM ks.%s", tableName)).all();
+        Assert.assertEquals(1, selected.size());
+        Assert.assertEquals(expected, selected.get(0).get("g", klass));
+
+        // toJson(g) must agree with what the type itself renders for the expected geometry
+        List<Row> jsonRows = executeNet(String.format("SELECT toJson(g) FROM ks.%s", tableName)).all();
+        Assert.assertEquals(1, jsonRows.size());
+        String actualJson = jsonRows.get(0).getString("system.tojson(g)");
+        String expectedJson = type.toJSONString(type.getSerializer().serialize(expected), ProtocolVersion.CURRENT);
+        Assert.assertEquals(expectedJson, actualJson);
+    }
+
+    /** Inserts a point as text and reads it back as a {@link Point}. */
+    @Test
+    public void pointTest() throws Throwable
+    {
+        String wkt = "POINT(1.1 2.2)";
+        executeNet("CREATE TABLE ks.point (k INT PRIMARY KEY, g 'PointType')");
+        executeNet(String.format("INSERT INTO ks.point (k, g) VALUES (1, '%s')", wkt));
+
+        List<Row> selected = executeNet("SELECT * FROM ks.point").all();
+        Assert.assertEquals(1, selected.size());
+        Assert.assertEquals(new Point(1.1, 2.2), selected.get(0).get("g", Point.class));
+    }
+
+    /** Runs the shared round-trip check for a three-point line string. */
+    @Test
+    public void lineStringTest() throws Throwable
+    {
+        testType(lineString(p(30, 10), p(10, 30), p(40, 40)), LineString.class, LineStringType.instance,
+                 "linestring", "linestring(30 10, 10 30, 40 40)", "LineStringType");
+    }
+
+    /** Runs the shared round-trip check for a four-corner polygon. */
+    @Test
+    public void polygonTest() throws Throwable
+    {
+        testType(polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40)), Polygon.class, PolygonType.instance,
+                 "polygon", "polygon((30 10, 40 40, 20 40, 10 20, 30 10))", "PolygonType");
+    }
+
+    /** Uses geometry types as partition key and clustering column as well as for a regular column. */
+    @Test
+    public void primaryKeyTest() throws Throwable
+    {
+        executeNet("CREATE TABLE ks.geo (k 'PointType', c 'LineStringType', g 'PointType', PRIMARY KEY (k, c))");
+        executeNet("INSERT INTO ks.geo (k, c, g) VALUES ('POINT(1 2)', 'linestring(30 10, 10 30, 40 40)', 'POINT(1.1 2.2)')");
+        List<Row> selected = executeNet("SELECT * FROM ks.geo").all();
+        Assert.assertEquals(1, selected.size());
+        Row only = selected.get(0);
+
+        // partition key, clustering column and regular column, in that order
+        Assert.assertEquals(new Point(1, 2), only.get("k", Point.class));
+        Assert.assertEquals(lineString(p(30, 10), p(10, 30), p(40, 40)), only.get("c", LineString.class));
+        Assert.assertEquals(new Point(1.1, 2.2), only.get("g", Point.class));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/marshal/LineStringTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/LineStringTypeTest.java
new file mode 100644
index 000000000000..8c8667939aa6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/LineStringTypeTest.java
@@ -0,0 +1,150 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.lineString;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.p;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.padBuffer;
+
+public class LineStringTypeTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(LineStringTypeTest.class);
+
+    LineStringType type = LineStringType.instance;
+
+    /**
+     * Parses a three-point line string and checks both the raw bytes and the deserialized geometry.
+     */
+    @Test
+    public void successCase2d()
+    {
+        ByteBuffer actual = type.fromString("linestring(30 10, 10 30, 40 40)");
+
+        // native byte order throughout: order flag, geometry type, number of points, then the x/y pairs
+        ByteBuffer expected = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
+        expected.put((byte) (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? 1 : 0));
+        expected.putInt(2);
+        expected.putInt(3);
+        for (double coordinate : new double[]{ 30, 10, 10, 30, 40, 40 })
+        {
+            expected.putDouble(coordinate);
+        }
+        expected.flip();
+
+        String expectedHex = ByteBufferUtil.bytesToHex(expected);
+        String actualHex = ByteBufferUtil.bytesToHex(actual);
+        logger.debug("expected: {}", expectedHex);
+        logger.debug("actual:   {}", actualHex);
+        Assert.assertEquals(String.format("%s != %s", actualHex, expectedHex), expected, actual);
+
+        LineString expectedGeometry = lineString(p(30, 10), p(10, 30), p(40, 40));
+        LineString actualGeometry = type.getSerializer().deserialize(actual);
+        logger.debug("expected: {}", expectedGeometry);
+        logger.debug("actual:   {}", actualGeometry);
+        Assert.assertEquals(expectedGeometry, actualGeometry);
+    }
+
+    @Test(expected=MarshalException.class)
+    public void emptyFailure()
+    {
+        type.fromString("linestring()");
+    }
+
+    @Test(expected=MarshalException.class)
+    public void failure3d()
+    {
+        type.fromString("linestring(30 10 20, 10 30 20)");
+    }
+
+    /**
+     * A line string that crosses itself is expected to be rejected
+     */
+    @Test(expected=MarshalException.class)
+    public void simpleFailure()
+    {
+        type.fromString("linestring(0 0, 1 1, 0 1, 1 0)");
+    }
+
+    @Test(expected=MarshalException.class)
+    public void parseFailure()
+    {
+        type.fromString("superlinestring(30 10, 10 30, 40 40)");
+    }
+
+    @Test
+    public void jsonWktInput()
+    {
+        Constants.Value parsed = (Constants.Value) type.fromJSONObject("linestring(30 10, 10 30, 40 40)");
+        Assert.assertEquals(lineString(p(30, 10), p(10, 30), p(40, 40)), type.getSerializer().deserialize(parsed.bytes));
+    }
+
+    @Test
+    public void geoJsonInput()
+    {
+        Constants.Value parsed = (Constants.Value) type.fromJSONObject("{\"type\":\"LineString\",\"coordinates\":[[30.0,10.0],[10.0,30.0],[40.0,40.0]]}");
+        Assert.assertEquals(lineString(p(30, 10), p(10, 30), p(40, 40)), type.getSerializer().deserialize(parsed.bytes));
+    }
+
+    @Test
+    public void geoJsonOutput()
+    {
+        ByteBuffer serialized = type.getSerializer().serialize(lineString(p(30, 10), p(10, 30), p(40, 40)));
+        String json = type.toJSONString(serialized, ProtocolVersion.CURRENT);
+        Assert.assertEquals("{\"type\":\"LineString\",\"coordinates\":[[30,10],[10,30],[40,40]]}", json);
+        logger.debug(json);
+    }
+
+    /**
+     * Validating and deserializing from a buffer whose content starts past position zero must still work
+     */
+    @Test
+    public void bufferOffset()
+    {
+        LineString expected = lineString(p(30, 10), p(10, 30), p(40, 40));
+        ByteBuffer padded = padBuffer(type.getSerializer().serialize(expected));
+        type.getSerializer().validate(padded);
+        Assert.assertEquals(expected, type.getSerializer().deserialize(padded));
+    }
+
+    /** The buffer produced by {@code padBuffer} is expected to report big-endian order. */
+    @Test
+    public void bufferBigEndianess()
+    {
+        ByteBuffer padded = padBuffer(type.getSerializer().serialize(lineString(p(30, 10), p(10, 30), p(40, 40))));
+        Assert.assertEquals(ByteOrder.BIG_ENDIAN, padded.order());
+    }
+
+}
+
diff --git a/test/unit/org/apache/cassandra/db/marshal/PointTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/PointTypeTest.java
new file mode 100644
index 000000000000..6ff7c9c15521
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/PointTypeTest.java
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.db.marshal.geometry.LineString;
+import org.apache.cassandra.db.marshal.geometry.Point;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.*;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.p;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.padBuffer;
+
+public class PointTypeTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(PointTypeTest.class);
+
+    PointType type = PointType.instance;
+
+    /**
+     * Parses a 2d point and checks both the raw bytes and the deserialized geometry.
+     */
+    @Test
+    public void successCase2d()
+    {
+        ByteBuffer actual = type.fromString("point(1.1 2.2)");
+        ByteBuffer expected = getExpectedSerialization(p(1.1, 2.2), ByteOrder.nativeOrder());
+
+        logger.debug("expected: {}", ByteBufferUtil.bytesToHex(expected));
+        logger.debug("actual:   {}", ByteBufferUtil.bytesToHex(actual));
+        String failMsg = String.format("%s != %s", ByteBufferUtil.bytesToHex(actual), ByteBufferUtil.bytesToHex(expected));
+        Assert.assertEquals(failMsg, expected, actual);
+
+        Point point = type.getSerializer().deserialize(actual);
+        Assert.assertEquals(p(1.1, 2.2), point);
+    }
+
+    @Test(expected=MarshalException.class)
+    public void parseFailure()
+    {
+        type.fromString("superpoint(1.1 2.2 3.3)");
+    }
+
+    @Test
+    public void jsonWktInput()
+    {
+        Constants.Value value = (Constants.Value) type.fromJSONObject("point(1 2)");
+        Assert.assertEquals(p(1, 2), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonInput()
+    {
+        String json = "{\"type\":\"Point\",\"coordinates\":[1.0,2.0]}";
+        Constants.Value value = (Constants.Value) type.fromJSONObject(json);
+        Assert.assertEquals(p(1, 2), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonInputWithoutPrecision()
+    {
+        String json = "{\"type\":\"Point\",\"coordinates\":[1,2]}";
+        Constants.Value value = (Constants.Value) type.fromJSONObject(json);
+        Assert.assertEquals(p(1, 2), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonOutput()
+    {
+        String json = type.toJSONString(type.getSerializer().serialize(p(1, 2)), ProtocolVersion.CURRENT);
+        Assert.assertEquals("{\"type\":\"Point\",\"coordinates\":[1,2]}", json);
+        logger.debug(json);
+    }
+
+    /**
+     * Use of absolute indexing in deserializers shouldn't cause problems
+     */
+    @Test
+    public void bufferOffset()
+    {
+        Point expected = p(1, 2);
+        ByteBuffer padded = padBuffer(type.getSerializer().serialize(expected));
+        type.getSerializer().validate(padded);
+        Assert.assertEquals(expected, type.getSerializer().deserialize(padded));
+    }
+
+    /**
+     * Hand-builds the bytes a point is expected to serialize to in the given byte order:
+     * a one-byte order flag (1 for little-endian, 0 otherwise), the int type code 1, then x and y as doubles.
+     * @param point the point whose coordinates are written
+     * @param order byte order of the returned buffer, also reflected in the leading flag
+     * @return a buffer flipped for reading that holds exactly those bytes
+     */
+    private static ByteBuffer getExpectedSerialization(Point point, ByteOrder order)
+    {
+        ByteBuffer expected = ByteBuffer.allocate(1024).order(order);
+        expected.put((byte) (order == ByteOrder.LITTLE_ENDIAN ? 1 : 0)); // endianness
+        expected.putInt(1);  // type
+        expected.putDouble(point.getOgcPoint().X()); // x
+        expected.putDouble(point.getOgcPoint().Y()); // y
+        expected.flip();
+        return expected;
+    }
+
+    @Test
+    public void bufferBigEndianess()
+    {
+        ByteBuffer padded = padBuffer(type.getSerializer().serialize(p(1, 2)));
+        Assert.assertEquals(ByteOrder.BIG_ENDIAN, padded.order());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/PolygonTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/PolygonTypeTest.java
new file mode 100644
index 000000000000..cf414c4ec817
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/PolygonTypeTest.java
@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.esri.core.geometry.ogc.OGCGeometry;
+import com.esri.core.geometry.ogc.OGCPolygon;
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.db.marshal.geometry.Polygon;
+import org.apache.cassandra.serializers.MarshalException;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.p;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.padBuffer;
+import static org.apache.cassandra.db.marshal.GeometricTypeTests.polygon;
+
+public class PolygonTypeTest
+{
+    private static final Logger logger = LoggerFactory.getLogger(PolygonTypeTest.class);
+
+    private static final PolygonType type = PolygonType.instance;
+
+    @Test
+    public void successCase()
+    {
+        ByteBuffer actualBB = type.fromString("polygon((30 10, 40 40, 20 40, 10 20, 30 10))");
+
+        ByteBuffer expectedBB = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
+        expectedBB.position(0);
+
+        expectedBB.put((byte) (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? 1 : 0)); // endianness
+        expectedBB.putInt(3);  // type
+        expectedBB.putInt(1);  // num rings
+        expectedBB.putInt(5);  // num points (ring 1/1)
+        expectedBB.putDouble(30);  // x1
+        expectedBB.putDouble(10);  // y1
+        expectedBB.putDouble(40);  // x2
+        expectedBB.putDouble(40);  // y2
+        expectedBB.putDouble(20);  // x3
+        expectedBB.putDouble(40);  // y3
+        expectedBB.putDouble(10);  // x4
+        expectedBB.putDouble(20);  // y4
+        expectedBB.putDouble(30);  // x5
+        expectedBB.putDouble(10);  // y5
+        expectedBB.flip();
+
+        logger.debug("expected: {}", ByteBufferUtil.bytesToHex(expectedBB));
+        logger.debug("actual:   {}", ByteBufferUtil.bytesToHex(actualBB));
+        String failMsg = String.format("%s != %s", ByteBufferUtil.bytesToHex(actualBB), ByteBufferUtil.bytesToHex(expectedBB));
+        Assert.assertEquals(failMsg, expectedBB, actualBB);
+
+        Polygon expectedGeometry = polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40));
+        Polygon actualGeometry = type.getSerializer().deserialize(actualBB);
+        Assert.assertEquals(expectedGeometry, actualGeometry);
+    }
+
+    @Test(expected=MarshalException.class)
+    public void emptyFailure()
+    {
+        type.fromString("polygon(())");
+    }
+
+    @Test(expected=MarshalException.class)
+    public void failure3d()
+    {
+        type.fromString("polygon((30 10 1, 40 40 1, 20 40 1, 10 20 1, 30 10 1))");
+    }
+
+    /**
+     * Polygons whose ring crosses itself shouldn't validate
+     */
+    @Test(expected=MarshalException.class)
+    public void simpleFailure()
+    {
+        type.fromString("polygon((0 0, 1 1, 0 1, 1 0, 0 0))");
+    }
+
+    @Test(expected=MarshalException.class)
+    public void parseFailure()
+    {
+        type.fromString("polygon123((30 10, 40 40, 20 40, 10 20, 30 10))");
+    }
+
+    @Test
+    public void jsonWktInput()
+    {
+        Constants.Value value = (Constants.Value) type.fromJSONObject("polygon((30 10, 40 40, 20 40, 10 20, 30 10))");
+        Assert.assertEquals(polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40)), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonInput()
+    {
+        String json = "{\"type\":\"Polygon\",\"coordinates\":[[[30.0,10.0],[10.0,20.0],[20.0,40.0],[40.0,40.0],[30.0,10.0]]]}";
+        Constants.Value value = (Constants.Value) type.fromJSONObject(json);
+        Assert.assertEquals(polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40)), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonInputNoPrecision()
+    {
+        String json = "{\"type\":\"Polygon\",\"coordinates\":[[[30,10],[10,20],[20,40],[40,40],[30,10]]]}";
+        Constants.Value value = (Constants.Value) type.fromJSONObject(json);
+        Assert.assertEquals(polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40)), type.getSerializer().deserialize(value.bytes));
+    }
+
+    @Test
+    public void geoJsonOutputWithDoubles()
+    {
+        String json = type.toJSONString(type.getSerializer().serialize(polygon(p(30.1111, 10.2), p(10.3, 20.4), p(20.5, 40.6), p(40.7, 40.8))), ProtocolVersion.CURRENT);
+        logger.debug(json);
+        Assert.assertEquals("{\"type\":\"Polygon\",\"coordinates\":[[[30.1111,10.2],[40.7,40.8],[20.5,40.6],[10.3,20.4],[30.1111,10.2]]]}", json);
+    }
+
+    @Test
+    public void geoJsonOutput()
+    {
+        String json = type.toJSONString(type.getSerializer().serialize(polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40))), ProtocolVersion.CURRENT);
+        logger.debug(json);
+        Assert.assertEquals("{\"type\":\"Polygon\",\"coordinates\":[[[30,10],[40,40],[20,40],[10,20],[30,10]]]}", json);
+    }
+
+    /**
+     * Use of absolute indexing in deserializers shouldn't cause problems
+     */
+    @Test
+    public void bufferOffset()
+    {
+        Polygon expected = polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40));
+        ByteBuffer bb = padBuffer(type.getSerializer().serialize(expected));
+        type.getSerializer().validate(bb);
+        Polygon actual = type.getSerializer().deserialize(bb);
+        Assert.assertEquals(expected, actual);
+    }
+
+    /**
+     * Duplicates DSP-10070
+     * There are some cases where esri can parse wkt into an invalid geometry object, but
+     * can't then convert the invalid geometry to a wkt string. We should catch these cases
+     * and throw a MarshalException
+     */
+    @Test(expected=MarshalException.class)
+    public void invalidInnerRingWkt()
+    {
+        String wkt = "POLYGON ((0.0 0.0, 0.0 10.0, 10.0 10.0, 10.0 0.0, 0.0 0.0), (1.0 10.0, 9.0 0.0, 9.0 9.0, 0.0 9.0, 0.0 0.0))";
+        OGCGeometry geometry = OGCGeometry.fromText(wkt);
+        Assert.assertTrue(geometry instanceof OGCPolygon);
+        new Polygon((OGCPolygon) geometry);
+    }
+
+    /**
+     * Duplicates DSP-10070
+     * There are some cases where esri can parse wkb into an invalid geometry object, but
+     * can't then convert the invalid geometry back to wkb. We should catch these cases
+     * and throw a MarshalException
+     */
+    @Test(expected=MarshalException.class)
+    public void invalidInnerRingWkb()
+    {
+        ByteBuffer bb = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
+        bb.position(0);
+
+        bb.put((byte) (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? 1 : 0)); // endianness
+        bb.putInt(3);  // type
+        bb.putInt(2);  // num rings
+        bb.putInt(5);  // num points (ring 1/2)
+        bb.putDouble(0).putDouble(0);
+        bb.putDouble(0).putDouble(10);
+        bb.putDouble(10).putDouble(10);
+        bb.putDouble(10).putDouble(0);
+        bb.putDouble(0).putDouble(0);
+        bb.putInt(5);  // num points (ring 2/2)
+        bb.putDouble(1).putDouble(10);
+        bb.putDouble(9).putDouble(0);
+        bb.putDouble(9).putDouble(9);
+        bb.putDouble(0).putDouble(9);
+        bb.putDouble(0).putDouble(0);
+        bb.flip();
+
+        type.getSerializer().validate(bb);
+    }
+
+    /**
+     * DSP-10092
+     * Tests that a polygon serialized with a clockwise outer ring and no closing point fails validation, since
+     * its normalized form has the closing point with points defined counterclockwise.
+     *
+     * Esri 'helps' us by normalizing the geometries it deserializes. Since some values are reserialized from the
+     * deserialized objects, and some aren't, we need to make sure that binary data we get can be reserialized into
+     * equal bytes. Otherwise, users can run into issues with data appearing to disappear.
+     */
+    @Test(expected=MarshalException.class)
+    public void denormalizedPolygon()
+    {
+        ByteBuffer bb = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
+        bb.position(0);
+
+        bb.put((byte) (ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN ? 1 : 0)); // endianness
+        bb.putInt(3);  // type
+        bb.putInt(1);  // num rings
+        bb.putInt(3);  // num points (ring 1/1)
+        bb.putDouble(0);  // x1
+        bb.putDouble(0);  // y1
+        bb.putDouble(1);  // x2
+        bb.putDouble(1);  // y2
+        bb.putDouble(1);  // x3
+        bb.putDouble(0);  // y3
+        bb.flip();
+
+        type.getSerializer().validate(bb);
+    }
+
+    @Test
+    public void bufferBigEndianess()
+    {
+        Polygon expected = polygon(p(30, 10), p(10, 20), p(20, 40), p(40, 40));
+        ByteBuffer bb = padBuffer(type.getSerializer().serialize(expected));
+        Assert.assertEquals(ByteOrder.BIG_ENDIAN, bb.order());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodec.java b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodec.java
new file mode 100644
index 000000000000..146c6e47f926
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodec.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import java.nio.ByteBuffer;
+import java.text.ParseException;
+
+import com.datastax.driver.core.DataType;
+import com.datastax.driver.core.ProtocolVersion;
+import com.datastax.driver.core.TypeCodec;
+import com.datastax.driver.core.exceptions.InvalidTypeException;
+import org.apache.cassandra.db.marshal.DateRangeType;
+import org.apache.cassandra.serializers.DateRangeSerializer;
+
+/**
+ * {@link TypeCodec} that maps binary representation of {@link DateRangeType} to {@link DateRange}.
+ */
+public class DateRangeCodec extends TypeCodec<DateRange>
+{
+    public static final DateRangeCodec instance = new DateRangeCodec();
+
+    private DateRangeCodec()
+    {
+        super(DataType.custom(DateRangeType.class.getName()), DateRange.class);
+    }
+
+    @Override
+    public ByteBuffer serialize(DateRange dateRange, ProtocolVersion protocolVersion) throws InvalidTypeException
+    {
+        if (dateRange == null)
+        {
+            return null;
+        }
+        return DateRangeSerializer.instance.serialize(dateRange);
+    }
+
+    @Override
+    public DateRange deserialize(ByteBuffer bytes, ProtocolVersion protocolVersion) throws InvalidTypeException
+    {
+        if (bytes == null || bytes.remaining() == 0)
+        {
+            return null;
+        }
+        return DateRangeSerializer.instance.deserialize(bytes);
+    }
+
+    @Override
+    public DateRange parse(String value) throws InvalidTypeException
+    {
+        if (value == null || value.isEmpty() || value.equalsIgnoreCase("NULL"))
+        {
+            return null;
+        }
+        try
+        {
+            return DateRangeUtil.parseDateRange(value);
+        }
+        catch (ParseException e)
+        {
+            throw new IllegalArgumentException(String.format("Invalid date range literal: '%s'", value), e);
+        }
+    }
+
+    @Override
+    public String format(DateRange dateRange) throws InvalidTypeException
+    {
+        return dateRange == null ? "NULL" : dateRange.formatToSolrString();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodecTest.java b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodecTest.java
new file mode 100644
index 000000000000..7b4b80c83f79
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeCodecTest.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import java.nio.ByteBuffer;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.ProtocolVersion;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+
+import static org.junit.Assert.assertEquals;
+
+public class DateRangeCodecTest
+{
+    private final DateRangeCodec codec = DateRangeCodec.instance;
+
+    @Test
+    public void testSerializeRoundTrip()
+    {
+        DateRange expected = DateRange.DateRangeBuilder.dateRange()
+                .withLowerBound("2015-12-03T10:15:30.00Z", Precision.SECOND)
+                .withUpperBound("2016-01-01T00:00:01.00Z", Precision.MILLISECOND)
+                .build();
+
+        ByteBuffer serialized = codec.serialize(expected, ProtocolVersion.NEWEST_SUPPORTED);
+
+        // For UDT or tuple type buffer contains whole cell payload, and codec can't rely on absolute byte addressing
+        ByteBuffer payload = ByteBuffer.allocate(5 + serialized.capacity());
+        // put serialized date range in between other data
+        payload.putInt(44).put(serialized).put((byte) 1);
+        payload.position(4);
+
+        DateRange actual = codec.deserialize(payload, ProtocolVersion.NEWEST_SUPPORTED);
+
+        assertEquals(expected, actual);
+        //provided ByteBuffer should never be consumed by read operations that modify its current position
+        assertEquals(4, payload.position());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeTest.java b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeTest.java
new file mode 100644
index 000000000000..3f839ce1d4d2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeTest.java
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+public class DateRangeTest
+{
+    @Test
+    public void testDateRangeEquality()
+    {
+        DateRange first = DateRange.DateRangeBuilder.dateRange()
+                .withLowerBound("2015-12-03T10:15:30.00Z", DateRangeBound.Precision.SECOND)
+                .withUpperBound("2016-01-01T00:00:01.00Z", DateRangeBound.Precision.MILLISECOND)
+                .build();
+        DateRange second = DateRange.DateRangeBuilder.dateRange()
+                // lower bound differs only in millis; at SECOND precision that difference is ignored
+                .withLowerBound("2015-12-03T10:15:30.01Z", DateRangeBound.Precision.SECOND)
+                .withUpperBound("2016-01-01T00:00:01.00Z", DateRangeBound.Precision.MILLISECOND)
+                .build();
+        DateRange third = DateRange.DateRangeBuilder.dateRange()
+                .withLowerBound("2015-12-03T10:15:30.00Z", DateRangeBound.Precision.MILLISECOND)
+                .withUpperBound("2016-01-01T00:00:01.00Z", DateRangeBound.Precision.MILLISECOND)
+                .build();
+
+        assertEquals(first, second);
+        assertEquals(first, first);
+        assertEquals(first.hashCode(), second.hashCode());
+        assertEquals(first.hashCode(), first.hashCode());
+        assertEquals(first.formatToSolrString(), second.formatToSolrString());
+        assertNotEquals(first, third);
+        assertNotEquals(first.hashCode(), third.hashCode());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeUtilTest.java b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeUtilTest.java
new file mode 100644
index 000000000000..391f7d77d41f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/marshal/datetime/DateRangeUtilTest.java
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+
+package org.apache.cassandra.db.marshal.datetime;
+
+import java.text.ParseException;
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.runners.Enclosed;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+import org.apache.cassandra.serializers.DateRangeSerializer;
+
+import static org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBuilder.dateRange;
+import static org.junit.Assert.assertEquals;
+
+@RunWith(Enclosed.class)
+public class DateRangeUtilTest
+{
+    @RunWith(Parameterized.class)
+    public static class ParameterizedCases
+    {
+        @Parameterized.Parameter(0)
+        public String source;
+
+        @Parameterized.Parameter(1)
+        public DateRange expected;
+
+        @Test
+        public void shouldParseAndFormatSolrDateRangeFormat() throws ParseException
+        {
+            DateRange parsedSource = DateRangeUtil.parseDateRange(source);
+
+            assertEquals(expected, parsedSource);
+            assertEquals(source, parsedSource.formatToSolrString());
+        }
+
+        @Test
+        public void shouldSerializeAndDeserializeDateRange()
+        {
+            DateRange parsed = DateRangeSerializer.instance.deserialize(DateRangeSerializer.instance.serialize(expected));
+            assertEquals(expected, parsed);
+        }
+        
+        @SuppressWarnings("unused")
+        @Parameterized.Parameters(name = "source = {0}, expected = {1}")
+        public static Collection<Object[]> testData()
+        {
+            return Arrays.asList(
+                    new Object[]{
+                            "[2011-01 TO 2015]",
+                            dateRange()
+                                    .withLowerBound("2011-01-01T00:00:00.000Z", Precision.MONTH)
+                                    .withUpperBound("2015-12-31T23:59:59.999Z", Precision.YEAR)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[2010-01-02 TO 2015-05-05T13]",
+                            dateRange()
+                                    .withLowerBound("2010-01-02T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("2015-05-05T13:59:59.999Z", Precision.HOUR)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[1973-06-30T13:57:28.123Z TO 1999-05-05T14:14:59]",
+                            dateRange()
+                                    .withLowerBound("1973-06-30T13:57:28.123Z", Precision.MILLISECOND)
+                                    .withUpperBound("1999-05-05T14:14:59.999Z", Precision.SECOND)
+                                    .build()
+                    },
+                    // leap year
+                    new Object[]{
+                            "[2010-01-01T15 TO 2016-02]",
+                            dateRange()
+                                    .withLowerBound("2010-01-01T15:00:00.000Z", Precision.HOUR)
+                                    .withUpperBound("2016-02-29T23:59:59.999Z", Precision.MONTH)
+                                    .build()
+                    },
+                    // pre-epoch
+                    new Object[]{
+                            "[1500 TO 1501]",
+                            dateRange()
+                                    .withLowerBound("1500-01-01T00:00:00.000Z", Precision.YEAR)
+                                    .withUpperBound("1501-12-31T23:59:59.999Z", Precision.YEAR)
+                                    .build()
+                    },
+                    // AD/BC era boundary
+                    new Object[]{
+                            "[0001-01-01 TO 0001-01-01]",
+                            dateRange()
+                                    .withLowerBound("0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[0001-01-01 TO 0001-01-02]",
+                            dateRange()
+                                    .withLowerBound("0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("0001-01-02T23:59:59.999Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[0000-01-01 TO 0000-01-01]",
+                            dateRange()
+                                    .withLowerBound("0000-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("0000-01-01T00:00:00.000Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[0000-01-01 TO 0000-01-02]",
+                            dateRange()
+                                    .withLowerBound("0000-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("0000-01-02T23:59:59.999Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[-0001-01-01 TO -0001-01-01]",
+                            dateRange()
+                                    .withLowerBound("-0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("-0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[-0001-01-01 TO -0001-01-02]",
+                            dateRange()
+                                    .withLowerBound("-0001-01-01T00:00:00.000Z", Precision.DAY)
+                                    .withUpperBound("-0001-01-02T23:59:59.999Z", Precision.DAY)
+                                    .build()
+                    },
+                    // unbounded
+                    new Object[]{
+                            "[* TO 2014-12-01]",
+                            dateRange()
+                                    .withUnboundedLowerBound()
+                                    .withUpperBound("2014-12-01T23:59:59.999Z", Precision.DAY)
+                                    .build()
+                    },
+                    new Object[]{
+                            "[1999 TO *]",
+                            dateRange()
+                                    .withLowerBound("1999-01-01T00:00:00Z", Precision.YEAR)
+                                    .withUnboundedUpperBound()
+                                    .build()
+                    },
+                    new Object[]{
+                            "[* TO *]",
+                            dateRange()
+                                    .withUnboundedLowerBound()
+                                    .withUnboundedUpperBound()
+                                    .build()
+                    },
+                    new Object[]{
+                            "*",
+                            dateRange()
+                                    .withUnboundedLowerBound()
+                                    .build()
+                    },
+                    // unit shapes
+                    new Object[]{
+                            "-0009",
+                            dateRange()
+                                    .withLowerBound("-0009-01-01T00:00:00.000Z", Precision.YEAR)
+                                    .build()
+                    },
+                    new Object[]{
+                            "2000-11",
+                            dateRange()
+                                    .withLowerBound("2000-11-01T00:00:00.000Z", Precision.MONTH)
+                                    .build()
+                    }
+            );
+        }
+    }
+
+    public static class RoundingCases
+    {
+        @Rule
+        public ExpectedException expectedException = ExpectedException.none();
+
+        @Test
+        public void shouldNotParseDateRangeWithWrongDateOrder() throws ParseException
+        {
+            expectedException.expect(IllegalArgumentException.class);
+            expectedException.expectMessage("Wrong order: 2010 TO 2009");
+            DateRangeUtil.parseDateRange("[2010 TO 2009]");
+        }
+
+        @Test
+        public void shouldRoundUpperBoundToTheGivenPrecision()
+        {
+            ZonedDateTime timestamp = ZonedDateTime.ofInstant(Instant.parse("2011-02-03T04:05:16.789Z"), ZoneOffset.UTC);
+            assertEquals("2011-02-03T04:05:16.789Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.MILLISECOND).toInstant().toString());
+            assertEquals("2011-02-03T04:05:16.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.SECOND).toInstant().toString());
+            assertEquals("2011-02-03T04:05:59.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.MINUTE).toInstant().toString());
+            assertEquals("2011-02-03T04:59:59.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.HOUR).toInstant().toString());
+            assertEquals("2011-02-03T23:59:59.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.DAY).toInstant().toString());
+            assertEquals("2011-02-28T23:59:59.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.MONTH).toInstant().toString());
+            assertEquals("2011-12-31T23:59:59.999Z", DateRangeUtil.roundUpperBoundTimestampToPrecision(timestamp, Precision.YEAR).toInstant().toString());
+        }
+
+        @Test
+        public void shouldRoundLowerBoundToTheGivenPrecision()
+        {
+            ZonedDateTime timestamp = ZonedDateTime.ofInstant(Instant.parse("2011-02-03T04:05:16.789Z"), ZoneOffset.UTC);
+            assertEquals("2011-02-03T04:05:16.789Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.MILLISECOND).toInstant().toString());
+            assertEquals("2011-02-03T04:05:16Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.SECOND).toInstant().toString());
+            assertEquals("2011-02-03T04:05:00Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.MINUTE).toInstant().toString());
+            assertEquals("2011-02-03T04:00:00Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.HOUR).toInstant().toString());
+            assertEquals("2011-02-03T00:00:00Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.DAY).toInstant().toString());
+            assertEquals("2011-02-01T00:00:00Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.MONTH).toInstant().toString());
+            assertEquals("2011-01-01T00:00:00Z", DateRangeUtil.roundLowerBoundTimestampToPrecision(timestamp, Precision.YEAR).toInstant().toString());
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/serializers/DateRangeSerializerTest.java b/test/unit/org/apache/cassandra/serializers/DateRangeSerializerTest.java
new file mode 100644
index 000000000000..bca5f688cc51
--- /dev/null
+++ b/test/unit/org/apache/cassandra/serializers/DateRangeSerializerTest.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.serializers;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.experimental.runners.Enclosed;
+import org.junit.rules.ExpectedException;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.marshal.datetime.DateRange;
+import org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBound.Precision;
+
+import static org.apache.cassandra.db.marshal.datetime.DateRange.DateRangeBuilder.dateRange;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+@RunWith(Enclosed.class)
+public class DateRangeSerializerTest
+{
+    @RunWith(Parameterized.class)
+    public static class ValidCases
+    {
+        @Parameterized.Parameter
+        public DateRange dateRange;
+
+        @Test
+        public void testSerializeRoundTrip()
+        {
+            ByteBuffer serialized = DateRangeSerializer.instance.serialize(dateRange);
+
+            // For UDT or tuple types the buffer contains the whole cell payload, so the codec can't rely on absolute byte addressing
+            ByteBuffer payload = ByteBuffer.allocate(5 + serialized.capacity());
+            // put the serialized date range in between other data (a leading int and a trailing byte)
+            payload.putInt(44).put(serialized).put((byte) 1);
+            payload.position(4);
+
+            DateRange actual = DateRangeSerializer.instance.deserialize(payload);
+
+            assertEquals(dateRange, actual);
+            // the provided ByteBuffer must not be consumed: deserializing has to leave its current position unchanged
+            assertEquals(4, payload.position());
+        }
+
+        @SuppressWarnings("unused")
+        @Parameterized.Parameters(name = "dateRange = {0}")
+        public static Collection<Object[]> dateRanges()
+        {
+            return Arrays.asList(
+                new Object[]{
+                        // 2015-12-03T10:15:30 TO 2016-01-01T00:00:01.001Z
+                        dateRange()
+                                .withLowerBound("2015-12-03T10:15:30.000Z", Precision.SECOND)
+                                .withUpperBound("2016-01-01T00:00:01.001Z", Precision.MILLISECOND)
+                                .build()
+                },
+                new Object[]{
+                        // 1998-01-01 TO *
+                        dateRange()
+                                .withLowerBound("1998-01-01T00:00:00.000Z", Precision.DAY)
+                                .withUnboundedUpperBound()
+                                .build()
+                },
+                new Object[]{
+                        // * TO 1951-01-02T01
+                        dateRange()
+                                .withUnboundedLowerBound()
+                                .withUpperBound("1951-01-02T01:00:00.003Z", Precision.HOUR)
+                                .build()
+                },
+                new Object[]{
+                        // *
+                        dateRange()
+                                .withUnboundedLowerBound()
+                                .build()
+                },
+                new Object[]{
+                        // [* TO *]
+                        dateRange()
+                                .withUnboundedLowerBound()
+                                .withUnboundedUpperBound()
+                                .build()
+                },
+                new Object[]{
+                        // 1966
+                        dateRange()
+                                .withLowerBound("1966-03-03T03:30:30.030Z", Precision.YEAR)
+                                .build(),
+                }
+            );
+        }
+    }
+
+    public static class InvalidCases
+    {
+
+        @Rule
+        public ExpectedException expectedException = ExpectedException.none();
+
+        @Test
+        public void testNullValueSerializeRoundTrip()
+        {
+            ByteBuffer serialized = DateRangeSerializer.instance.serialize(null);
+            assertEquals(0, serialized.capacity());
+            assertNull(DateRangeSerializer.instance.deserialize(serialized));
+        }
+
+        @Test
+        public void testDeserializeInvalidLengthInput()
+        {
+            expectedException.expect(IndexOutOfBoundsException.class);
+            DateRangeSerializer.instance.deserialize(ByteBuffer.allocate(5));
+        }
+
+        @Test
+        public void testDeserializeUnsupportedHeader()
+        {
+            expectedException.expect(IllegalArgumentException.class);
+            expectedException.expectMessage("Unknown date range type");
+            DateRangeSerializer.instance.deserialize(ByteBuffer.allocate(1).put(0, (byte) 0x15));
+        }
+    }
+}

From 9e9b9e97900b0c5dab8194b985a0ce00ecb6229c Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Tue, 16 Mar 2021 14:55:05 +0000
Subject: [PATCH 055/151] STAR-196: Multi-node tests for SAI

(cherry picked from commit 27e5f761cadc06c489dc071c86e633ae3de78ca1)
(cherry picked from commit c2c1c18821ff404de6829291a536d4a6a3f609d6)
---
 .../org/apache/cassandra/db/ReadCommand.java  |  17 +
 .../org/apache/cassandra/index/Index.java     |  20 +-
 .../index/SecondaryIndexManager.java          |   3 +-
 .../internal/CassandraIndexSearcher.java      |   6 +
 .../index/sai/StorageAttachedIndexGroup.java  |   9 +-
 .../index/sai/disk/MemtableIndexWriter.java   |   5 +
 .../index/sai/disk/v1/NumericIndexWriter.java |   5 +
 .../plan/StorageAttachedIndexSearcher.java    | 150 +++-
 .../index/sai/utils/PrimaryKeys.java          |   2 +-
 .../index/sai/virtual/IndexesSystemView.java  |   4 +-
 .../index/sai/virtual/SSTablesSystemView.java |   5 +-
 .../index/sai/virtual/SegmentsSystemView.java |   5 +-
 .../index/sasi/plan/SASIIndexSearcher.java    |   8 +
 .../cassandra/service/reads/DataResolver.java |  13 +-
 .../test/sai/AbstractQueryTester.java         | 129 +++
 .../test/sai/IndexAvailabilityTest.java       | 413 +++++++++
 .../test/sai/IndexConsistencyTest.java        | 793 ++++++++++++++++++
 .../test/sai/MultiNodeExecutor.java           |  97 +++
 .../test/sai/NativeIndexDDLTest.java          | 330 ++++++++
 .../test/sai/QueryCellDeletionsTest.java      |  32 +
 .../test/sai/QueryRowDeletionsTest.java       |  32 +
 .../test/sai/QueryTimeToLiveTest.java         |  32 +
 .../test/sai/QueryWriteLifecycleTest.java     |  32 +
 .../distributed/test/sai/SAIUtil.java         |  57 ++
 .../distributed/test/sai/TraceTest.java       |  99 +++
 .../index/ExpressionFilteringIndex.java       |  58 +-
 .../org/apache/cassandra/index/StubIndex.java |  25 +-
 .../apache/cassandra/index/sai/SAITester.java |   4 +-
 .../index/sai/cql/AbstractQueryTester.java    |  79 ++
 .../cassandra/index/sai/cql/DataModel.java    | 181 ++--
 .../index/sai/cql/EmptyMemtableFlushTest.java |  71 ++
 .../sai/cql/IndexOperatorSupportTest.java     |  74 --
 .../index/sai/cql/IndexQuerySupport.java      | 255 +++---
 .../index/sai/cql/QueryCellDeletionsTest.java |   4 +-
 .../index/sai/cql/QueryRowDeletionsTest.java  |   4 +-
 .../index/sai/cql/QueryTimeToLiveTest.java    |   4 +-
 .../sai/cql/QueryWriteLifecycleTest.java      |   5 +-
 .../index/sai/cql/SingleNodeExecutor.java     |  96 +++
 .../TinySegmentQueryCellDeletionsTest.java    |   4 +-
 .../cql/TinySegmentQueryRowDeletionsTest.java |   4 +-
 .../cql/TinySegmentQueryTimeToLiveTest.java   |   4 +-
 .../TinySegmentQueryWriteLifecycleTest.java   |   4 +-
 42 files changed, 2762 insertions(+), 412 deletions(-)
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/AbstractQueryTester.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/IndexConsistencyTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/MultiNodeExecutor.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/NativeIndexDDLTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/QueryCellDeletionsTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/QueryRowDeletionsTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/QueryTimeToLiveTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/QueryWriteLifecycleTest.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/sai/TraceTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/EmptyMemtableFlushTest.java
 delete mode 100644 test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java

diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index b63a5ec9dd72..3c999c8d4044 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -509,6 +509,23 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
 
     protected abstract void recordLatency(TableMetrics metric, long latencyNanos);
 
+    /**
+     * Allows post-processing of the query result after it has been reconciled on the coordinator
+     * but before it is passed to the CQL layer to return the ResultSet. The result is returned
+     * unchanged when there is no index query plan; otherwise the plan's post-processor is applied.
+     * See CASSANDRA-8717 for why this exists.
+     */
+    public PartitionIterator postReconciliationProcessing(PartitionIterator result)
+    {
+        return indexQueryPlan == null ? result : indexQueryPlan.postProcessor().apply(result);
+    }
+
+    @Override
+    public PartitionIterator executeInternal(ReadExecutionController controller)
+    {
+        return postReconciliationProcessing(UnfilteredPartitionIterators.filter(executeLocally(controller), nowInSec()));
+    }
+
     public ReadExecutionController executionController()
     {
         return ReadExecutionController.forCommand(this);
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
index 223e68b68e6f..bd6985231dc0 100644
--- a/src/java/org/apache/cassandra/index/Index.java
+++ b/src/java/org/apache/cassandra/index/Index.java
@@ -608,7 +608,6 @@ default void validate(ReadCommand command) throws InvalidRequestException
      * @param rowFilter rowFilter of query to decide if it supports replica filtering protection or not
      * @return true if this index supports replica filtering protection, false otherwise
      */
-    //TODO Need to confirm whether SAI needs to implement this as false
     default boolean supportsReplicaFilteringProtection(RowFilter rowFilter)
     {
         return true;
@@ -629,11 +628,30 @@ default boolean supportsReplicaFilteringProtection(RowFilter rowFilter)
      */
     public interface Searcher
     {
+        /**
+         * Returns the {@link ReadCommand} for which this searcher has been created.
+         *
+         * @return the base read command
+         */
+        ReadCommand command();
+
         /**
          * @param executionController the collection of OpOrder.Groups which the ReadCommand is being performed under.
          * @return partitions from the base table matching the criteria of the search.
          */
         public UnfilteredPartitionIterator search(ReadExecutionController executionController);
+
+        /**
+         * Replica filtering protection may fetch data that doesn't match query conditions, so on the
+         * coordinator the replicas' responses need to be filtered again. By default this applies the
+         * row filter of {@link #command()}.
+         *
+         * @param fullResponse the response to filter again against the query conditions
+         * @return filtered response that satisfied query conditions
+         */
+        default PartitionIterator filterReplicaFilteringProtection(PartitionIterator fullResponse)
+        {
+            return command().rowFilter().filter(fullResponse, command().metadata(), command().nowInSec());
+        }
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
index b65827b000f4..fe527ecd5908 100644
--- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
@@ -671,7 +671,8 @@ private String getIndexNames(Set<Index> indexes)
      * @param isNewCF {@code true} if this method is invoked when initializing a new table/columnfamily (i.e. loading a CF at startup),
      * {@code false} for all other cases (i.e. newly added index)
      */
-    private synchronized void markIndexesBuilding(Set<Index> indexes, boolean isFullRebuild, boolean isNewCF)
+    @VisibleForTesting
+    public synchronized void markIndexesBuilding(Set<Index> indexes, boolean isFullRebuild, boolean isNewCF)
     {
         String keyspaceName = baseCfs.keyspace.getName();
 
diff --git a/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java b/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java
index e2e0600aaeb8..698264688bec 100644
--- a/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java
+++ b/src/java/org/apache/cassandra/index/internal/CassandraIndexSearcher.java
@@ -54,6 +54,12 @@ public CassandraIndexSearcher(ReadCommand command,
         this.index = index;
     }
 
+    @Override
+    public ReadCommand command()
+    {
+        return command;
+    }
+
     @SuppressWarnings("resource") // Both the OpOrder and 'indexIter' are closed on exception, or through the closing of the result
     // of this method.
     public UnfilteredPartitionIterator search(ReadExecutionController executionController)
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
index a5dd9c6909ad..e8b08c24ae3a 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
@@ -68,7 +68,7 @@
 /**
  * Orchestrates building of storage-attached indices, and manages lifecycle of resources shared between them.
  */
-public class StorageAttachedIndexGroup implements Index.Group, INotificationConsumer, Iterable<StorageAttachedIndex>
+public class StorageAttachedIndexGroup implements Index.Group, INotificationConsumer
 {
     private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
@@ -154,13 +154,6 @@ public boolean containsIndex(Index index)
         return index instanceof StorageAttachedIndex && indices.contains(index);
     }
 
-    @SuppressWarnings("NullableProblems")
-    @Override
-    public Iterator<StorageAttachedIndex> iterator()
-    {
-        return indices.iterator();
-    }
-
     @Override
     public Index.Indexer indexerFor(Predicate<Index> indexSelector,
                                     DecoratedKey key,
diff --git a/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
index cb276d4842fc..254a5cd89251 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/MemtableIndexWriter.java
@@ -164,8 +164,13 @@ private long flush(DecoratedKey minKey, DecoratedKey maxKey, AbstractType<?> ter
             }
         }
 
+        // If no rows were written we need to delete any created column index components
+        // so that the index is correctly identified as being empty (only having a completion marker)
         if (numRows == 0)
+        {
+            indexComponents.deleteColumnIndex();
             return 0;
+        }
 
         // During index memtable flush, the data is sorted based on terms.
         SegmentMetadata metadata = new SegmentMetadata(0, numRows, terms.getMinSSTableRowId(), terms.getMaxSSTableRowId(),
diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
index 2928e75a89c9..51793c860e7f 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/NumericIndexWriter.java
@@ -146,6 +146,11 @@ public SegmentMetadata.ComponentMetadataMap writeAll(MutableOneDimPointValues va
 
             bkdPosition = writer.writeField(bkdOutput, values, leafCallback);
 
+            // If the bkdPosition is less than 0 then we didn't write any values out
+            // and the index is empty
+            if (bkdPosition < 0)
+                return components;
+
             final long bkdLength = bkdOutput.getFilePointer() - bkdOffset;
 
             Map<String, String> attributes = new LinkedHashMap<>();
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
index 9f1c73a42932..3f55e3f648a6 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
@@ -21,6 +21,7 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+import java.util.NoSuchElementException;
 
 import com.google.common.collect.Iterators;
 
@@ -30,10 +31,13 @@
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.ReadCommand;
 import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.RegularAndStaticColumns;
 import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.rows.AbstractUnfilteredRowIterator;
 import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.AbstractBounds;
@@ -63,19 +67,31 @@ public StorageAttachedIndexSearcher(ColumnFamilyStore cfs,
         this.controller = new QueryController(cfs, command, expressions, queryContext, tableQueryMetrics);
     }
 
+    @Override
+    public ReadCommand command()
+    {
+        return command;
+    }
+
+    @Override
+    public PartitionIterator filterReplicaFilteringProtection(PartitionIterator fullResponse)
+    {
+        for (RowFilter.Expression expression : controller.getExpressions()) // one value-transforming analyzer is enough to switch to the index filter tree
+        {
+            if (controller.getContext(expression).getAnalyzer().transformValue())
+                return applyIndexFilter(fullResponse, analyzeFilter(), queryContext);
+        }
+
+        // no analyzer transforms values, so the default filtering defined by Index.Searcher is used
+        return Index.Searcher.super.filterReplicaFilteringProtection(fullResponse);
+    }
+
     @Override
     public UnfilteredPartitionIterator search(ReadExecutionController executionController) throws RequestTimeoutException
     {
         return  new ResultRetriever(analyze(), controller, executionController, queryContext);
     }
 
-//    @Override
-//    public Flow<FlowableUnfilteredPartition> search(ReadExecutionController executionController) throws RequestTimeoutException
-//    {
-//        return analyzeAsync().map(operation -> new ResultRetriever(operation, controller, executionController, queryContext))
-//                             .flatMap(FlowablePartitions::fromPartitions);
-//    }
-
     /**
      * Converts expressions into filter tree and reference {@link SSTableIndex}s used for query.
      *
@@ -95,7 +111,6 @@ private Operation analyze()
      *
      * @return root of the filter tree.
      */
-    //TODO How does this get applied in OS
     private FilterTree analyzeFilter()
     {
         return Operation.initTreeBuilder(controller).completeFilter();
@@ -281,31 +296,96 @@ public void close()
         }
     }
 
-//    /**
-//     * Used by {@link StorageAttachedIndexSearcher#filterReplicaFilteringProtection} which is not ported to OSS yet.
-//     */
-//    private static <U extends Unfiltered, F extends FlowablePartitionBase<U, F>> Flow<F>  applyIndexFilter(Flow<F> fp, FilterTree tree, QueryContext queryContext)
-//    {
-//        return fp.flatMap(partition ->
-//        {
-//            Row staticRow = partition.staticRow();
-//            /*
-//             * If {@code content} is empty, which means either all clustering row and static row pairs failed,
-//             *       or static row and static row pair failed. In both cases, we should not return any partition.
-//             * If {@code content} is not empty, which means either there are some clustering row and static row pairs match the filters,
-//             *       or static row and static row pair matches the filters. In both cases, we should return a partition with static row,
-//             *       and remove the static row marker from the {@code content} for the latter case.
-//             */
-//            Flow<U> content = partition.content()
-//                                       .filter(Unfiltered::isRow)
-//                                       .ifEmpty((U) staticRow)
-//                                       .filter(row ->
-//                                               {
-//                                                   queryContext.rowsFiltered++;
-//                                                   return tree.satisfiedBy(partition.partitionKey(), row, staticRow);
-//                                               });
-//
-//                              return content.skipMapEmpty(c -> partition.withContent(c.filter(unfiltered -> !((Row)unfiltered).isStatic())));
-//                          });
-//    }
+    /**
+     * Used by {@link StorageAttachedIndexSearcher#filterReplicaFilteringProtection} to filter rows for columns that
+     * have transformations so won't get handled correctly by the row filter; only rows satisfying {@code tree} are kept.
+     */
+    @SuppressWarnings("resource")
+    private static PartitionIterator applyIndexFilter(PartitionIterator response, FilterTree tree, QueryContext queryContext)
+    {
+        return new PartitionIterator()
+        {
+            @Override
+            public void close()
+            {
+                response.close();
+            }
+
+            @Override
+            public boolean hasNext()
+            {
+                return response.hasNext();
+            }
+
+            @Override
+            public RowIterator next()
+            {
+                RowIterator delegate = response.next();
+                Row staticRow = delegate.staticRow();
+                return new RowIterator()
+                {
+                    Row next; // look-ahead row that matched the filter but has not been returned yet; null when none is buffered
+
+                    @Override
+                    public TableMetadata metadata()
+                    {
+                        return delegate.metadata();
+                    }
+
+                    @Override
+                    public boolean isReverseOrder()
+                    {
+                        return delegate.isReverseOrder();
+                    }
+
+                    @Override
+                    public RegularAndStaticColumns columns()
+                    {
+                        return delegate.columns();
+                    }
+
+                    @Override
+                    public DecoratedKey partitionKey()
+                    {
+                        return delegate.partitionKey();
+                    }
+
+                    @Override
+                    public Row staticRow()
+                    {
+                        return staticRow;
+                    }
+
+                    @Override
+                    public void close()
+                    {
+                        delegate.close();
+                    }
+
+                    @Override
+                    public boolean hasNext()
+                    {
+                        while (next == null && delegate.hasNext()) // idempotent: never advances while a matching row is still buffered
+                        {
+                            Row candidate = delegate.next();
+                            queryContext.rowsFiltered++;
+                            if (tree.satisfiedBy(delegate.partitionKey(), candidate, staticRow))
+                                next = candidate;
+                        }
+                        return next != null;
+                    }
+
+                    @Override
+                    public Row next()
+                    {
+                        if (!hasNext()) // fills the look-ahead if the caller did not call hasNext() first
+                            throw new NoSuchElementException();
+                        Row result = next;
+                        next = null; // consume the buffered row so that the following hasNext() advances
+                        return result;
+                    }
+                };
+            }
+        };
+    }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
index 2828ac769c6c..8f4b9e2ce6f2 100644
--- a/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
+++ b/src/java/org/apache/cassandra/index/sai/utils/PrimaryKeys.java
@@ -121,7 +121,7 @@ private Skinny()
         @Override
         public long add(DecoratedKey key, Clustering clustering)
         {
-            assert clustering.isEmpty();
+            assert clustering.isEmpty() : "Expected Clustering.EMPTY but got " + clustering;
             return keys.add(key) ? SET_ENTRY_OVERHEAD : 0;
         }
 
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
index f001fdf63b80..ba5e0f717815 100644
--- a/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
+++ b/src/java/org/apache/cassandra/index/sai/virtual/IndexesSystemView.java
@@ -114,9 +114,9 @@ public DataSet data()
 
                 if (group != null)
                 {
-                    for (StorageAttachedIndex index : group)
+                    for (Index index : group.getIndexes())
                     {
-                        ColumnContext context = index.getContext();
+                        ColumnContext context = ((StorageAttachedIndex)index).getContext();
                         String indexName = context.getIndexName();
                         View view = context.getView();
 
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
index 50f7c055b5d4..dbbe2b83876c 100644
--- a/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
+++ b/src/java/org/apache/cassandra/index/sai/virtual/SSTablesSystemView.java
@@ -27,6 +27,7 @@
 import org.apache.cassandra.dht.AbstractBounds;
 import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.sai.ColumnContext;
 import org.apache.cassandra.index.sai.SSTableIndex;
 import org.apache.cassandra.index.sai.StorageAttachedIndex;
@@ -98,9 +99,9 @@ public DataSet data()
                 {
                     Token.TokenFactory tokenFactory = cfs.metadata().partitioner.getTokenFactory();
 
-                    for (StorageAttachedIndex index : group)
+                    for (Index index : group.getIndexes())
                     {
-                        ColumnContext columnContext = index.getContext();
+                        ColumnContext columnContext = ((StorageAttachedIndex)index).getContext();
 
                         for (SSTableIndex sstableIndex : columnContext.getView())
                         {
diff --git a/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java b/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
index 27315ce707ba..5e2b74f62802 100644
--- a/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
+++ b/src/java/org/apache/cassandra/index/sai/virtual/SegmentsSystemView.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.db.virtual.VirtualTable;
 import org.apache.cassandra.dht.LocalPartitioner;
 import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.sai.ColumnContext;
 import org.apache.cassandra.index.sai.SSTableIndex;
 import org.apache.cassandra.index.sai.StorageAttachedIndex;
@@ -135,9 +136,9 @@ private void forEachIndex(Consumer<ColumnContext> process)
 
                 if (group != null)
                 {
-                    for (StorageAttachedIndex index : group)
+                    for (Index index : group.getIndexes())
                     {
-                        process.accept(index.getContext());
+                        process.accept(((StorageAttachedIndex)index).getContext());
                     }
                 }
             }
diff --git a/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java b/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
index 4e78d3e6c5e5..a613aabcf735 100644
--- a/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
+++ b/src/java/org/apache/cassandra/index/sasi/plan/SASIIndexSearcher.java
@@ -32,10 +32,12 @@
 
 public class SASIIndexSearcher implements Index.Searcher
 {
+    private final ReadCommand command;
     private final QueryController controller;
 
     public SASIIndexSearcher(ColumnFamilyStore cfs, ReadCommand command, long executionQuotaMs)
     {
+        this.command = command;
         this.controller = new QueryController(cfs, (PartitionRangeReadCommand) command, executionQuotaMs);
     }
 
@@ -63,6 +65,12 @@ private Operation analyze()
         }
     }
 
+    @Override
+    public ReadCommand command()
+    {
+        return command;
+    }
+
     @Override
     public UnfilteredPartitionIterator search(ReadExecutionController executionController)
     {
diff --git a/src/java/org/apache/cassandra/service/reads/DataResolver.java b/src/java/org/apache/cassandra/service/reads/DataResolver.java
index f9741957801d..e3ec4e1b8c18 100644
--- a/src/java/org/apache/cassandra/service/reads/DataResolver.java
+++ b/src/java/org/apache/cassandra/service/reads/DataResolver.java
@@ -243,13 +243,24 @@ private PartitionIterator resolveWithReplicaFilteringProtection(E replicas, Repa
 
         PartitionIterator completedPartitions = resolveWithReadRepair(secondPhaseContext,
                                                                       i -> rfp.queryProtectedPartitions(firstPhasePartitions, i),
-                                                                      results -> command.rowFilter().filter(results, command.metadata(), command.nowInSec()),
+                                                                      preCountFilterForReplicaFilteringProtection(),
                                                                       repairedDataTracker);
 
         // Ensure that the RFP instance has a chance to record metrics when the iterator closes.
         return PartitionIterators.doOnClose(completedPartitions, firstPhasePartitions::close);
     }
 
+    private  UnaryOperator<PartitionIterator> preCountFilterForReplicaFilteringProtection()
+    {
+        return results -> {
+            Index.Searcher searcher = command.indexSearcher();
+            // no index searcher (e.g. "ALLOW FILTERING" without an index): apply the command's row filter directly
+            if (searcher == null)
+                return command.rowFilter().filter(results, command.metadata(), command.nowInSec());
+            return searcher.filterReplicaFilteringProtection(results);
+        };
+    }
+
     @SuppressWarnings("resource")
     private PartitionIterator resolveInternal(ResolveContext context,
                                               UnfilteredPartitionIterators.MergeListener mergeListener,
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/AbstractQueryTester.java b/test/distributed/org/apache/cassandra/distributed/test/sai/AbstractQueryTester.java
new file mode 100644
index 000000000000..33a2eb2372af
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/AbstractQueryTester.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.function.Supplier;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.shared.Byteman;
+import org.apache.cassandra.distributed.shared.Shared;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.index.sai.cql.DataModel;
+import org.apache.cassandra.index.sai.cql.IndexQuerySupport;
+
+@RunWith(Parameterized.class)
+public class AbstractQueryTester extends TestBaseImpl
+{
+    protected static final String INJECTION_SCRIPT = "RULE count searches\n" +
+                                                     "CLASS org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher\n" +
+                                                     "METHOD search\n" +
+                                                     "AT ENTRY\n" +
+                                                     "IF TRUE\n" +
+                                                     "DO\n" +
+                                                     "   org.apache.cassandra.distributed.test.sai.AbstractQueryTester$Counter.increment()\n" +
+                                                     "ENDRULE\n";
+
+    @Parameterized.Parameter(0)
+    public String name;
+    @Parameterized.Parameter(1)
+    public Supplier<DataModel> dataModel;
+    @Parameterized.Parameter(2)
+    public List<IndexQuerySupport.BaseQuerySet> sets;
+
+    protected static DataModel.Executor executor;
+
+    protected static Cluster cluster;
+
+    @BeforeClass
+    public static void setupCluster() throws Exception
+    {
+        cluster = Cluster.build(3)
+                         .withConfig(config -> config.set("hinted_handoff_enabled", false))
+                         .withInstanceInitializer((cl, nodeNumber) -> {
+                             Byteman.createFromText(INJECTION_SCRIPT).install(cl);
+                         })
+                         .start();
+
+        cluster.schemaChange("CREATE KEYSPACE " + DataModel.KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}");
+
+        executor = new MultiNodeExecutor(cluster);
+    }
+
+    @AfterClass
+    public static void closeCluster()
+    {
+        if (cluster != null)
+            cluster.close();
+    }
+
+    @SuppressWarnings("unused")
+    @Parameterized.Parameters(name = "{0}")
+    public static List<Object[]> params() throws Throwable
+    {
+        List<Object[]> scenarios = new LinkedList<>();
+
+        scenarios.add(new Object[]{ "BaseDataModel",
+                                    (Supplier<DataModel>) () -> new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA),
+                                    IndexQuerySupport.BASE_QUERY_SETS });
+
+        scenarios.add(new Object[]{ "CompoundKeyDataModel",
+                                    (Supplier<DataModel>) () -> new DataModel.CompoundKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA),
+                                    IndexQuerySupport.BASE_QUERY_SETS });
+
+        scenarios.add(new Object[]{ "CompoundKeyWithStaticsDataModel",
+                                    (Supplier<DataModel>) () -> new DataModel.CompoundKeyWithStaticsDataModel(DataModel.STATIC_COLUMNS, DataModel.STATIC_COLUMN_DATA),
+                                    IndexQuerySupport.STATIC_QUERY_SETS });
+
+        scenarios.add(new Object[]{ "CompositePartitionKeyDataModel",
+                                    (Supplier<DataModel>) () -> new DataModel.CompositePartitionKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA),
+                                    ImmutableList.builder().addAll(IndexQuerySupport.BASE_QUERY_SETS).addAll(IndexQuerySupport.COMPOSITE_PARTITION_QUERY_SETS).build() });
+
+        return scenarios;
+    }
+
+    @Shared
+    protected static final class Counter
+    {
+        protected static AtomicLong counter = new AtomicLong(0);
+
+        public static void increment()
+        {
+            counter.incrementAndGet();
+        }
+
+        public static void reset()
+        {
+            counter.set(0);
+        }
+
+        public static long get()
+        {
+            return counter.get();
+        }
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java
new file mode 100644
index 000000000000..5f5a4918b95f
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexAvailabilityTest.java
@@ -0,0 +1,413 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.net.InetAddress;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Function;
+import java.util.function.IntFunction;
+
+import com.google.common.base.Objects;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.IInvokableInstance;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.index.SecondaryIndexManager;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.KeyspaceMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
+import static org.apache.cassandra.distributed.api.Feature.NETWORK;
+import static org.apache.cassandra.distributed.test.sai.SAIUtil.waitForIndexQueryable;
+import static org.awaitility.Awaitility.await;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class IndexAvailabilityTest extends TestBaseImpl
+{
+    private static final String CREATE_KEYSPACE = "CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': %d}";
+    private static final String CREATE_TABLE = "CREATE TABLE %s.%s (pk text primary key, v1 int, v2 text) " +
+                                               "WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }";
+    private static final String CREATE_INDEX = "CREATE CUSTOM INDEX %s ON %s.%s(%s) USING 'StorageAttachedIndex'";
+    
+    private static Map<NodeIndex, Index.Status> expectedNodeIndexQueryability = new ConcurrentHashMap<>();
+    private List<String> keyspaces;
+    private List<String> indexesPerKs;
+
+    @Test
+    public void verifyIndexStatusPropagation() throws Exception
+    {
+        try (Cluster cluster = init(Cluster.build(2)
+                                           .withConfig(config -> config.with(GOSSIP)
+                                                                       .with(NETWORK))
+                                           .start()))
+        {
+            String ks1 = "ks1";
+            String ks2 = "ks2";
+            String ks3 = "ks3";
+            String cf1 = "cf1";
+            String index1 = "cf1_idx1";
+            String index2 = "cf1_idx2";
+
+            keyspaces = Arrays.asList(ks1, ks2, ks3);
+            indexesPerKs = Arrays.asList(index1, index2);
+
+            // create 1 table per keyspace and 2 indexes per table; all indexes are queryable
+            for (String ks : keyspaces)
+            {
+                cluster.schemaChange(String.format(CREATE_KEYSPACE, ks, 2));
+                cluster.schemaChange(String.format(CREATE_TABLE, ks, cf1));
+                cluster.schemaChange(String.format(CREATE_INDEX, index1, ks, cf1, "v1"));
+                cluster.schemaChange(String.format(CREATE_INDEX, index2, ks, cf1, "v2"));
+                waitForIndexQueryable(cluster, ks);
+                cluster.forEach(node -> {
+                    expectedNodeIndexQueryability.put(NodeIndex.create(ks, index1, node), Index.Status.BUILD_SUCCEEDED);
+                    expectedNodeIndexQueryability.put(NodeIndex.create(ks, index2, node), Index.Status.BUILD_SUCCEEDED);
+                });
+            }
+
+            // mark ks1 index1 as non-queryable on node1
+            markIndexNonQueryable(cluster.get(1), ks1, cf1, index1);
+            // node2 should observe that node1's ks1.index1 is not queryable
+            waitForIndexingStatus(cluster.get(2), ks1, index1, cluster.get(1), Index.Status.BUILD_FAILED);
+            // other indexes or keyspaces should not be affected
+            assertIndexingStatus(cluster);
+
+            // mark ks2 index2 as non-queryable on node2
+            markIndexNonQueryable(cluster.get(2), ks2, cf1, index2);
+            // node1 should observe that node2's ks2.index2 is not queryable
+            waitForIndexingStatus(cluster.get(1), ks2, index2, cluster.get(2), Index.Status.BUILD_FAILED);
+            // other indexes or keyspaces should not be affected
+            assertIndexingStatus(cluster);
+
+            // mark ks1 index1 as queryable on node1
+            markIndexQueryable(cluster.get(1), ks1, cf1, index1);
+            // node2 should observe that node1's ks1.index1 is queryable again
+            waitForIndexingStatus(cluster.get(2), ks1, index1, cluster.get(1), Index.Status.BUILD_SUCCEEDED);
+            // other indexes or keyspaces should not be affected
+            assertIndexingStatus(cluster);
+
+            // mark ks2 index2 as building (full rebuild started) on node1
+            markIndexBuilding(cluster.get(1), ks2, cf1, index2);
+            // node2 should observe that node1's ks2.index2 is in FULL_REBUILD_STARTED state
+            waitForIndexingStatus(cluster.get(2), ks2, index2, cluster.get(1), Index.Status.FULL_REBUILD_STARTED);
+            // other indexes or keyspaces should not be affected
+            assertIndexingStatus(cluster);
+
+            // drop ks1; the status of ks1 index1/index2 should be UNKNOWN on all nodes
+            cluster.schemaChange("DROP KEYSPACE " + ks1);
+            expectedNodeIndexQueryability.keySet().forEach(k -> {
+                if (k.keyspace.equals(ks1))
+                    expectedNodeIndexQueryability.put(k, Index.Status.UNKNOWN);
+            });
+            assertIndexingStatus(cluster);
+
+            // drop ks2 index2; the status of ks2 index2 should be UNKNOWN on all nodes
+            cluster.schemaChange("DROP INDEX " + ks2 + "." + index2);
+            expectedNodeIndexQueryability.keySet().forEach(k -> {
+                if (k.keyspace.equals(ks2) && k.index.equals(index2))
+                    expectedNodeIndexQueryability.put(k, Index.Status.UNKNOWN);
+            });
+            assertIndexingStatus(cluster);
+
+            // drop ks3 cf1; the status of ks3 index1/index2 should be UNKNOWN on all nodes
+            cluster.schemaChange("DROP TABLE " + ks3 + "." + cf1);
+            expectedNodeIndexQueryability.keySet().forEach(k -> {
+                if (k.keyspace.equals(ks3))
+                    expectedNodeIndexQueryability.put(k, Index.Status.UNKNOWN);
+            });
+            assertIndexingStatus(cluster);
+        }
+    }
+
+    @Test
+    public void testNonQueryableNodeN2Rf2() throws Exception
+    {
+        shouldSkipNonQueryableNode(2, Collections.singletonList(1), Arrays.asList(1, 2));
+    }
+
+    @Test
+    public void testSkipNonQueryableNodeN3Rf3() throws Exception
+    {
+        shouldSkipNonQueryableNode(3, Collections.singletonList(1), Arrays.asList(1, 2), Arrays.asList(1, 2, 3));
+    }
+
+    @Test
+    public void testSkipNonQueryableNodeN1Rf1() throws Exception
+    {
+        shouldSkipNonQueryableNode(1, Collections.singletonList(1));
+    }
+
+    private void shouldSkipNonQueryableNode(int nodes, List<Integer>... nonQueryableNodesList) throws Exception
+    {
+        try (Cluster cluster = init(Cluster.build(nodes)
+                                           .withConfig(config -> config.with(GOSSIP)
+                                                                       .with(NETWORK))
+                                           .start()))
+        {
+            String table = "non_queryable_node_test_" + System.currentTimeMillis();
+            cluster.schemaChange(String.format(CREATE_TABLE, KEYSPACE, table));
+            cluster.schemaChange(String.format(CREATE_INDEX, "", KEYSPACE, table, "v1"));
+            cluster.schemaChange(String.format(CREATE_INDEX, "", KEYSPACE, table, "v2"));
+            waitForIndexQueryable(cluster, KEYSPACE);
+
+            // insert 100 rows, then flush every node
+            int rows = 100;
+            for (int i = 0; i < rows; i++)
+                cluster.coordinator(1).execute(String.format("INSERT INTO %s.%s(pk, v1, v2) VALUES ('%d', 0, '0');", KEYSPACE, table, i), ConsistencyLevel.QUORUM);
+            cluster.forEach(node -> node.flush(KEYSPACE));
+
+            String numericQuery = String.format("SELECT pk FROM %s.%s WHERE v1=0", KEYSPACE, table);
+            String stringQuery = String.format("SELECT pk FROM %s.%s WHERE v2='0'", KEYSPACE, table);
+            String multiIndexQuery = String.format("SELECT pk FROM %s.%s WHERE v1=0 AND v2='0'", KEYSPACE, table);
+
+            // derive the index name from the node id so that different nodes get a different non-queryable index
+            Function<Integer, String> nodeIdToColumn = nodeId -> "v" + (nodeId % 2 + 1);
+            IntFunction<String> nodeIdToIndex = nodeId -> IndexMetadata.generateDefaultIndexName(table, ColumnIdentifier.getInterned(nodeIdToColumn.apply(nodeId), false));
+
+            for (List<Integer> nonQueryableNodes : nonQueryableNodesList)
+            {
+                int numericLiveReplicas = (int) (nodes - nonQueryableNodes.stream().map(nodeIdToColumn).filter(c -> c.equals("v1")).count());
+                int stringLiveReplicas = (int) (nodes - nonQueryableNodes.stream().map(nodeIdToColumn).filter(c -> c.equals("v2")).count());
+                int liveReplicas = nodes - nonQueryableNodes.size();
+
+                // mark all the indexes non-queryable first, then wait until every node has observed each change
+                for (int local : nonQueryableNodes)
+                    markIndexNonQueryable(cluster.get(local), KEYSPACE, table, nodeIdToIndex.apply(local));
+
+                for (int local : nonQueryableNodes)
+                    for (int remote = 1; remote <= cluster.size(); remote++)
+                        waitForIndexingStatus(cluster.get(remote), KEYSPACE, nodeIdToIndex.apply(local), cluster.get(local), Index.Status.BUILD_FAILED);
+
+                // test different query types
+                executeOnAllCoordinatorsAllConsistencies(cluster, numericQuery, numericLiveReplicas, rows);
+                executeOnAllCoordinatorsAllConsistencies(cluster, stringQuery, stringLiveReplicas, rows);
+                executeOnAllCoordinatorsAllConsistencies(cluster, multiIndexQuery, liveReplicas, rows);
+
+                // rebuild all the local indexes first, then wait until every node has observed each rebuild
+                for (int local : nonQueryableNodes)
+                {
+                    String index = nodeIdToIndex.apply(local);
+                    cluster.get(local).runOnInstance(() -> ColumnFamilyStore.rebuildSecondaryIndex(KEYSPACE, table, index));
+                }
+
+                for (int local : nonQueryableNodes)
+                    for (int remote = 1; remote <= cluster.size(); remote++)
+                        waitForIndexingStatus(cluster.get(remote), KEYSPACE, nodeIdToIndex.apply(local), cluster.get(local), Index.Status.BUILD_SUCCEEDED);
+
+                // after the rebuild, queries at CL=ALL should return all the rows again
+                executeOnAllCoordinators(cluster, numericQuery, ConsistencyLevel.ALL, rows);
+                executeOnAllCoordinators(cluster, stringQuery, ConsistencyLevel.ALL, rows);
+                executeOnAllCoordinators(cluster, multiIndexQuery, ConsistencyLevel.ALL, rows);
+            }
+        }
+    }
+
+    private void executeOnAllCoordinatorsAllConsistencies(Cluster cluster, String statement, int liveReplicas, int num) throws Exception
+    {
+        int allReplicas = cluster.size();
+
+        // test different consistency levels; -1 makes executeOnAllCoordinators take its failure-checking branch instead of comparing row counts
+        executeOnAllCoordinators(cluster, statement, ConsistencyLevel.ONE, liveReplicas >= 1 ? num : -1);
+        if (allReplicas >= 2)
+            executeOnAllCoordinators(cluster, statement, ConsistencyLevel.TWO, liveReplicas >= 2 ? num : -1);
+        executeOnAllCoordinators(cluster, statement, ConsistencyLevel.ALL, liveReplicas >= allReplicas ? num : -1);
+    }
+
+    private void executeOnAllCoordinators(Cluster cluster, String query, ConsistencyLevel level, int expected) throws Exception
+    {
+        // use every node as coordinator; expected >= 0 is the expected row count, a negative value means the query must fail
+        for (int nodeId = 1; nodeId <= cluster.size(); nodeId++)
+        {
+            if (expected >= 0)
+                assertEquals(expected, cluster.coordinator(nodeId).execute(query, level).length);
+            else
+            {
+                try
+                {
+                    cluster.coordinator(nodeId).execute(query, level);
+                    throw new AssertionError("Expected ReadFailureException for '" + query + "' at " + level + " on node" + nodeId);
+                }
+                catch (Exception e) // not Throwable: the AssertionError raised above must propagate and fail the test
+                {
+                    assertEquals("ReadFailureException", e.getClass().getSimpleName());
+                }
+            }
+        }
+    }
+
+    private void markIndexNonQueryable(IInvokableInstance node, String keyspace, String table, String indexName) throws Exception
+    {
+        expectedNodeIndexQueryability.put(NodeIndex.create(keyspace, indexName, node), Index.Status.BUILD_FAILED);
+
+        node.runOnInstance(() -> {
+            SecondaryIndexManager sim = Schema.instance.getKeyspaceInstance(keyspace).getColumnFamilyStore(table).indexManager;
+            Index index = sim.getIndexByName(indexName);
+            sim.makeIndexNonQueryable(index, Index.Status.BUILD_FAILED);
+        });
+    }
+
+    private void markIndexQueryable(IInvokableInstance node, String keyspace, String table, String indexName) throws Exception
+    {
+        expectedNodeIndexQueryability.put(NodeIndex.create(keyspace, indexName, node), Index.Status.BUILD_SUCCEEDED);
+
+        node.runOnInstance(() -> {
+            SecondaryIndexManager sim = Schema.instance.getKeyspaceInstance(keyspace).getColumnFamilyStore(table).indexManager;
+            Index index = sim.getIndexByName(indexName);
+            sim.makeIndexNonQueryable(index, Index.Status.BUILD_SUCCEEDED); // NOTE(review): "NonQueryable" call with BUILD_SUCCEEDED; presumably it only records the new status - confirm
+        });
+    }
+
+    private void markIndexBuilding(IInvokableInstance node, String keyspace, String table, String indexName) throws Exception
+    {
+        expectedNodeIndexQueryability.put(NodeIndex.create(keyspace, indexName, node), Index.Status.FULL_REBUILD_STARTED);
+
+        node.runOnInstance(() -> {
+            SecondaryIndexManager sim = Schema.instance.getKeyspaceInstance(keyspace).getColumnFamilyStore(table).indexManager;
+            Index index = sim.getIndexByName(indexName);
+            sim.markIndexesBuilding(Collections.singleton(index), true, false);
+        });
+    }
+
+    private void assertIndexingStatus(Cluster cluster)
+    {
+        for (String ks : keyspaces)
+        {
+            for (String indexName : indexesPerKs)
+            {
+                assertIndexingStatus(cluster, ks, indexName);
+            }
+        }
+    }
+
+    private static void assertIndexingStatus(Cluster cluster, String keyspace, String indexName)
+    {
+        for (int nodeId = 1; nodeId <= cluster.size(); nodeId++)
+        {
+            for (int replica = 1; replica <= cluster.size(); replica++)
+            {
+                NodeIndex nodeIndex = NodeIndex.create(keyspace, indexName, cluster.get(replica));
+                Index.Status expected = expectedNodeIndexQueryability.get(nodeIndex);
+
+                assertIndexingStatus(cluster.get(nodeId), keyspace, indexName, cluster.get(replica), expected);
+            }
+        }
+    }
+
+    private static void assertIndexingStatus(IInvokableInstance node, String keyspaceName, String indexName, IInvokableInstance replica, Index.Status expected)
+    {
+        InetAddressAndPort replicaAddressAndPort = getFullAddress(replica);
+        try
+        {
+            Index.Status actual = getNodeIndexStatus(node, keyspaceName, indexName, replicaAddressAndPort);
+            String errorMessage = String.format("Failed to verify %s.%s status for replica %s on node %s, expected %s, but got %s.",
+                                                keyspaceName, indexName, replica.broadcastAddress(), node.broadcastAddress(), expected, actual);
+            assertEquals(errorMessage, expected, actual);
+        }
+        catch (Exception e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private static void waitForIndexingStatus(IInvokableInstance node, String keyspace, String index, IInvokableInstance replica, Index.Status status)
+    {
+        InetAddressAndPort replicaAddressAndPort = getFullAddress(replica);
+        await().atMost(5, TimeUnit.SECONDS)
+               .until(() -> node.callOnInstance(() -> getIndexStatus(keyspace, index, replicaAddressAndPort) == status).booleanValue());
+    }
+
+    private static Index.Status getNodeIndexStatus(IInvokableInstance node, String keyspaceName, String indexName, InetAddressAndPort replica)
+    {
+        return Index.Status.values()[node.callsOnInstance(() -> getIndexStatus(keyspaceName, indexName, replica).ordinal()).call()];
+    }
+    
+    private static Index.Status getIndexStatus(String keyspaceName, String indexName, InetAddressAndPort replica)
+    {
+        KeyspaceMetadata keyspace = Schema.instance.getKeyspaceMetadata(keyspaceName);
+        if (keyspace == null)
+            return Index.Status.UNKNOWN;
+
+        TableMetadata table = keyspace.findIndexedTable(indexName).orElse(null);
+        if (table == null)
+            return Index.Status.UNKNOWN;
+
+        SecondaryIndexManager indexManager = Keyspace.openAndGetStore(table).indexManager;
+        
+        return indexManager.getIndexStatus(replica, keyspaceName, indexName);
+    }
+
+    private static InetAddressAndPort getFullAddress(IInvokableInstance node)
+    {
+        InetAddress address = node.broadcastAddress().getAddress();
+        int port = node.callOnInstance(() -> FBUtilities.getBroadcastAddressAndPort().port);
+        return InetAddressAndPort.getByAddressOverrideDefaults(address, port);
+    }
+    
+    private static class NodeIndex
+    {
+        private final String keyspace;
+        private final String index;
+        private final IInvokableInstance node;
+
+        NodeIndex(String keyspace, String index, IInvokableInstance node)
+        {
+            this.keyspace = keyspace;
+            this.index = index;
+            this.node = node;
+        }
+
+        public static NodeIndex create(String keyspace, String index, IInvokableInstance node)
+        {
+            return new NodeIndex(keyspace, index, node);
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            NodeIndex that = (NodeIndex) o;
+            return node.equals(that.node) &&
+                   Objects.equal(keyspace, that.keyspace) &&
+                   Objects.equal(index, that.index);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(keyspace, index, node);
+        }
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/IndexConsistencyTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexConsistencyTest.java
new file mode 100644
index 000000000000..243cbd0cad32
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/IndexConsistencyTest.java
@@ -0,0 +1,793 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.shared.AssertUtils;
+import org.apache.cassandra.distributed.shared.Byteman;
+import org.apache.cassandra.distributed.shared.Shared;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+
+import static org.apache.cassandra.distributed.shared.AssertUtils.fail;
+import static org.apache.cassandra.distributed.shared.AssertUtils.row;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Tests scenarios where two replicas have different versions of the same rows.
+ *
+ * If the coordinator detects rows that are present in only some of the replica responses, it should query the replicas
+ * that omitted them, by primary key, to check whether they hold a more recent version that was not sent because it
+ * did not satisfy the row filter.
+ *
+ * See CASSANDRA-8272, CASSANDRA-8273.
+ */
+public class IndexConsistencyTest extends TestBaseImpl
+{
+    private static final int NUM_REPLICAS = 2; // cluster size, also used as the keyspace replication factor
+
+    // Byteman rules installed on each node, both triggered on entry to StorageAttachedIndexGroup#indexerFor:
+    // "fail indexer" throws when FailureEnabled.isEnabled(node) returns true, "count indexer" always calls
+    // Counter.increment(node). Both %d placeholders receive the node number.
+    private static final String INJECTION_SCRIPT = "RULE fail indexer\n" +
+                                                   "CLASS org.apache.cassandra.index.sai.StorageAttachedIndexGroup\n" +
+                                                   "METHOD indexerFor\n" +
+                                                   "AT ENTRY\n" +
+                                                   "IF org.apache.cassandra.distributed.test.sai.IndexConsistencyTest$FailureEnabled.isEnabled(%d)\n" +
+                                                   "DO\n" +
+                                                   "   throw new java.lang.RuntimeException(\"Injected index failure\")\n" +
+                                                   "ENDRULE\n" +
+                                                   "RULE count indexer\n" +
+                                                   "CLASS org.apache.cassandra.index.sai.StorageAttachedIndexGroup\n" +
+                                                   "METHOD indexerFor\n" +
+                                                   "AT ENTRY\n" +
+                                                   "IF TRUE\n" +
+                                                   "DO\n" +
+                                                   "   org.apache.cassandra.distributed.test.sai.IndexConsistencyTest$Counter.increment(%d)\n" +
+                                                   "ENDRULE\n";
+
+    private static AtomicInteger seq = new AtomicInteger();
+    private static String table; // reassigned before every test to "t_" plus the next value of seq
+
+    private static Cluster cluster;
+
+    @BeforeClass
+    public static void setupCluster() throws Exception
+    {
+        cluster = Cluster.build(NUM_REPLICAS)
+                         .withConfig(config -> config.set("hinted_handoff_enabled", false))
+                         .withInstanceInitializer((cl, nodeNumber) -> { // installs the rules with the node's number
+                             Byteman.createFromText(String.format(INJECTION_SCRIPT, nodeNumber, nodeNumber)).install(cl);
+                         })
+                         .start();
+        // created once for all the tests, with a replication factor equal to the number of nodes
+        cluster.schemaChange(withKeyspace("CREATE KEYSPACE %s WITH replication = " +
+                                          "{'class': 'SimpleStrategy', 'replication_factor': " + NUM_REPLICAS + "};"));
+    }
+
+    @AfterClass
+    public static void closeCluster()
+    {
+        if (cluster == null) return; // the cluster was never assigned, so there is nothing to close
+        cluster.close();
+    }
+
+    @Before
+    public void before()
+    {
+        table = "t_" + seq.getAndIncrement(); // a different table name for every test
+    }
+
+    @After
+    public void after()
+    {
+        cluster.schemaChange(formatQuery("DROP TABLE IF EXISTS %s"));
+        FailureEnabled.clear(); // state read by the "fail indexer" Byteman rule
+        Counter.clear(); // state written by the "count indexer" Byteman rule
+    }
+
+    @Test
+    public void testUpdateOnSkinnyTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int PRIMARY KEY, v text)"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, v) VALUES (0, 'old')");
+        // only node 1 applies the update, so node 2 is left holding the 'old' value
+        executeIsolated(1, "UPDATE %s SET v = 'new' WHERE k = 0");
+        // the outdated 'old' value must not match; the row is only found by its newest value
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+        assertRows("SELECT * FROM %s WHERE v = 'new'", row(0, "new"));
+    }
+
+    @Test
+    public void testUpdateOnWideTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, s int STATIC, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, s) VALUES (0, 9)",
+                "INSERT INTO %s(k, c, v) VALUES (0, -1, 'old')",
+                "INSERT INTO %s(k, c, v) VALUES (0, 0, 'old')",
+                "INSERT INTO %s(k, c, v) VALUES (0, 1, 'old')");
+        // only node 1 updates the row with c = 0; node 2 keeps its 'old' value
+        executeIsolated(1, "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 0");
+        // expected rows are (k, c, s, v); the updated row must only match 'new'
+        assertRows("SELECT * FROM %s WHERE v = 'old'", row(0, -1, 9, "old"), row(0, 1, 9, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new'", row(0, 0, 9, "new"));
+    }
+
+    @Test
+    public void testUpdateOnWideTableCaseInsensitive() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k1 int, k2 int, v text, v1 text, primary key(k1, k2)) with read_repair='NONE'"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v", false, false))); // presumably (caseSensitive, normalize) - confirm
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k1, k2, v, v1) VALUES (0, 0, 'Old', '0')");
+        // only node 1 applies the update, so node 2 is left holding 'Old'
+        executeIsolated(1, "UPDATE %s SET v = 'New' WHERE k1=0 and k2=0");
+        // the outdated value must not match, whatever the case used in the query
+        assertEmpty("SELECT * FROM %s WHERE v = 'Old'");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+        // 'NEW' must find the row stored as 'New', also when combined with a filter on the non-indexed v1
+        assertRows("SELECT * FROM %s WHERE v = 'NEW'", row(0, 0, "New", "0"));
+        assertRows("SELECT * FROM %s WHERE v = 'NEW' and v1 ='0' ALLOW FILTERING", row(0, 0, "New", "0"));
+        assertEmpty("SELECT * FROM %s WHERE v = 'NEW' and v1 ='1' ALLOW FILTERING");
+    }
+
+    @Test
+    public void testUpdateOnWideTableNormalized() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k1 int, k2 int, v text, v1 text, primary key(k1, k2)) with read_repair='NONE'"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v", true, true))); // presumably (caseSensitive, normalize) - confirm
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k1, k2, v, v1) VALUES (0, 0, '\u00E1bc', '0')"); // precomposed a-acute (U+00E1)
+        // only node 1 rewrites v with the decomposed form of the same letter (U+0061 U+0301)
+        executeIsolated(1, "UPDATE %s SET v = '\u0061\u0301bc' WHERE k1=0 and k2=0");
+        // both Unicode forms must find the row, which carries the newest (decomposed) value
+        assertEmpty("SELECT * FROM %s WHERE v = '\u0061\u0301bc' and v1 ='1' ALLOW FILTERING");
+        assertRows("SELECT * FROM %s WHERE v = '\u0061\u0301bc' and v1 ='0' ALLOW FILTERING",
+                   row(0, 0, "\u0061\u0301bc", "0"));
+        assertRows("SELECT * FROM %s WHERE v = '\u00E1bc'", row(0, 0, "\u0061\u0301bc", "0"));
+        assertRows("SELECT * FROM %s WHERE v = '\u0061\u0301bc'", row(0, 0, "\u0061\u0301bc", "0"));
+    }
+
+    @Test
+    public void testUpdateOnWideTableCaseInsensitiveNormalized() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k1 int, k2 int, v text, v1 text, primary key(k1, k2)) with read_repair='NONE'"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v", false, true))); // presumably (caseSensitive, normalize) - confirm
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k1, k2, v, v1) VALUES (0, 0, '\u00E1Bc', '0')");
+        // only node 1 rewrites v, using the decomposed form (U+0061 U+0301) and a different letter case
+        executeIsolated(1, "UPDATE %s SET v = '\u0061\u0301bC' WHERE k1=0 and k2=0");
+        // any Unicode form and case must find the row, which carries the newest value
+        assertEmpty("SELECT * FROM %s WHERE v = '\u0061\u0301bc' and v1 ='1' ALLOW FILTERING");
+        assertRows("SELECT * FROM %s WHERE v = '\u0061\u0301bc' and v1 ='0' ALLOW FILTERING",
+                   row(0, 0, "\u0061\u0301bC", "0"));
+        assertRows("SELECT * FROM %s WHERE v = '\u00E1Bc'", row(0, 0, "\u0061\u0301bC", "0"));
+        assertRows("SELECT * FROM %s WHERE v = '\u0061\u0301Bc'", row(0, 0, "\u0061\u0301bC", "0"));
+    }
+
+    @Test
+    public void testUpdateOnStaticColumnWithEmptyPartition() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v int, s text STATIC, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, s) VALUES (0, 'old')",
+                "INSERT INTO %s(k, s) VALUES (1, 'old')");
+        // only node 1 updates the static column of partition 0
+        executeIsolated(1, "UPDATE %s SET s = 'new' WHERE k = 0");
+        // expected rows are (k, c, s, v); the partitions have no rows, so c and v are null
+        assertRows("SELECT * FROM %s WHERE s = 'old'", row(1, null, "old", null));
+        assertRows("SELECT * FROM %s WHERE s = 'new'", row(0, null, "new", null));
+    }
+
+    @Test
+    public void testUpdateOnStaticColumnWithNotEmptyPartition() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v int, s text STATIC, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, s) VALUES (0, 'old')",
+                "INSERT INTO %s(k, s) VALUES (1, 'old')",
+                "INSERT INTO %s(k, c, v) VALUES (0, 10, 100)",
+                "INSERT INTO %s(k, c, v) VALUES (0, 20, 200)",
+                "INSERT INTO %s(k, c, v) VALUES (1, 30, 300)",
+                "INSERT INTO %s(k, c, v) VALUES (1, 40, 400)");
+        // only node 1 updates the static column of partition 0
+        executeIsolated(1, "UPDATE %s SET s = 'new' WHERE k = 0");
+        // expected rows are (k, c, s, v); all the rows of partition 0 must only match 'new'
+        assertRows("SELECT * FROM %s WHERE s = 'old'", row(1, 30, "old", 300), row(1, 40, "old", 400));
+        assertRows("SELECT * FROM %s WHERE s = 'new'", row(0, 10, "new", 100), row(0, 20, "new", 200));
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnPartitionKeyColumnWithEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k1 int, k2 int, c int, s int STATIC, PRIMARY KEY((k1, k2), c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("k1")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k1, k2, s) VALUES (0, 1, 10)",
+                "INSERT INTO %s (k1, k2, s) VALUES (0, 2, 20)");
+        // each node deletes a different partition, so every partition is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k1 = 0 AND k2 = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k1 = 0 AND k2 = 2");
+        // LIMIT 1 must not bring back a partition that the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE k1 = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnPartitionKeyColumnWithNotEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k1 int, k2 int, c int, s int STATIC, PRIMARY KEY((k1, k2), c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("k1")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k1, k2, c, s) VALUES (0, 1, 10, 100)",
+                "INSERT INTO %s (k1, k2, c, s) VALUES (0, 2, 20, 200)");
+        // each node deletes a different partition, so every partition is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k1 = 0 AND k2 = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k1 = 0 AND k2 = 2");
+        // LIMIT 1 must not bring back a row whose partition the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE k1 = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnClusteringKeyColumn() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("c")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c) VALUES (1, 0)",
+                "INSERT INTO %s (k, c) VALUES (2, 0)");
+        // each node deletes a different partition, so every partition is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 2");
+        // LIMIT 1 must not bring back a row whose partition the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE c = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnStaticColumnWithEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s int STATIC, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, s) VALUES (1, 0)",
+                "INSERT INTO %s (k, s) VALUES (2, 0)");
+        // each node deletes a different partition, so every partition is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 2");
+        // LIMIT 1 must not bring back a partition that the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE s = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnStaticColumnWithEmptyPartitionsAndRowsAfter() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s int STATIC, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, s) VALUES (1, 0)",
+                "INSERT INTO %s (k, s) VALUES (2, 0)",
+                "INSERT INTO %s (k, s) VALUES (3, 0)",
+                "INSERT INTO %s (k, c) VALUES (3, 1)",
+                "INSERT INTO %s (k, c) VALUES (3, 2)");
+        // each node deletes a different partition; partition 3 is left untouched on both nodes
+        executeIsolated(1, "DELETE FROM %s WHERE k = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 2");
+        // partitions 1 and 2 must be skipped whatever the limit, leaving only the rows of partition 3
+        assertRows("SELECT * FROM %s WHERE s = 0 LIMIT 1", row(3, 1, 0));
+        assertRows("SELECT * FROM %s WHERE s = 0 LIMIT 10", row(3, 1, 0), row(3, 2, 0));
+        assertRows("SELECT * FROM %s WHERE s = 0", row(3, 1, 0), row(3, 2, 0));
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnStaticColumnWithNotEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s int STATIC, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v, s) VALUES (1, 10, 100, 0)",
+                "INSERT INTO %s (k, c, v, s) VALUES (2, 20, 200, 0)");
+        // each node deletes a different partition, so every partition is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 2");
+        // LIMIT 1 must not bring back a row whose partition the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE s = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnStaticColumnWithNotEmptyPartitionsAndRowsAfter() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s int STATIC, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v, s) VALUES (1, 10, 100, 0)",
+                "INSERT INTO %s (k, c, v, s) VALUES (2, 20, 200, 0)",
+                "INSERT INTO %s (k, s) VALUES (3, 0)",
+                "INSERT INTO %s (k, c) VALUES (3, 1)",
+                "INSERT INTO %s (k, c) VALUES (3, 2)");
+        // each node deletes a different partition; partition 3 is left untouched on both nodes
+        executeIsolated(1, "DELETE FROM %s WHERE k = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 2");
+        // partitions 1 and 2 must be skipped whatever the limit, leaving only the rows of partition 3
+        assertRows("SELECT * FROM %s WHERE s = 0 LIMIT 1", row(3, 1, 0, null));
+        assertRows("SELECT * FROM %s WHERE s = 0 LIMIT 10", row(3, 1, 0, null), row(3, 2, 0, null));
+        assertRows("SELECT * FROM %s WHERE s = 0", row(3, 1, 0, null), row(3, 2, 0, null));
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitOnRegularColumn() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY(k, c)) WITH speculative_retry = 'NONE'"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, 0)",
+                "INSERT INTO %s (k, c, v) VALUES (0, 2, 0)");
+        // each node deletes a different row, so every row is still live on exactly one node
+        executeIsolated(1, "DELETE FROM %s WHERE k = 0 AND c = 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 0 AND c = 2");
+        // LIMIT 1 must not bring back a row that the other node has deleted
+        assertEmpty("SELECT * FROM %s WHERE v = 0 LIMIT 1");
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitAndRowsAfter() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, 0)",
+                "INSERT INTO %s (k, c, v) VALUES (0, 2, 0)",
+                "INSERT INTO %s (k, c, v) VALUES (0, 3, 0)");
+        // node 1 alone deletes row 1 and adds row 4; node 2 alone adds row 5 and deletes row 2
+        executeIsolated(1,
+                        "DELETE FROM %s WHERE k = 0 AND c = 1",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 4, 0)");
+        executeIsolated(2,
+                        "INSERT INTO %s (k, c, v) VALUES (0, 5, 0)",
+                        "DELETE FROM %s WHERE k = 0 AND c = 2");
+        // deleted rows 1 and 2 must be skipped, while rows 4 and 5, each known to a single node, are returned
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 1", row(0, 3, 0));
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 2", row(0, 3, 0), row(0, 4, 0));
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 3", row(0, 3, 0), row(0, 4, 0), row(0, 5, 0));
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 4", row(0, 3, 0), row(0, 4, 0), row(0, 5, 0));
+    }
+
+    @Test
+    public void testComplementaryDeletionWithLimitAndRowsBetween() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, 0)",
+                "INSERT INTO %s (k, c, v) VALUES (0, 4, 0)");
+        // node 1 alone deletes row 1; node 2 alone adds rows 2 and 3 and deletes row 4
+        executeIsolated(1,
+                        "DELETE FROM %s WHERE k = 0 AND c = 1");
+        executeIsolated(2,
+                        "INSERT INTO %s (k, c, v) VALUES (0, 2, 0)",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 3, 0)",
+                        "DELETE FROM %s WHERE k = 0 AND c = 4");
+        // deleted rows 1 and 4 must be skipped, leaving rows 2 and 3, which only node 2 knows about
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 1", row(0, 2, 0));
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 2", row(0, 2, 0), row(0, 3, 0));
+        assertRows("SELECT * FROM %s WHERE v = 0 LIMIT 3", row(0, 2, 0), row(0, 3, 0));
+    }
+
+    @Test
+    public void testComplementaryUpdateWithLimitOnStaticColumnWithEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s text STATIC, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, s) VALUES (1, 'old')",
+                "INSERT INTO %s (k, s) VALUES (2, 'old')");
+        // each node updates the static column of a different partition and keeps 'old' in the other one
+        executeIsolated(1, "UPDATE %s SET s = 'new' WHERE k = 1");
+        executeIsolated(2, "UPDATE %s SET s = 'new' WHERE k = 2");
+        // no partition may match 'old' any more, and both must match 'new'
+        assertEmpty("SELECT * FROM %s WHERE s = 'old' LIMIT 1");
+        assertRows("SELECT k, s FROM %s WHERE s = 'new' LIMIT 1", row(1, "new"));
+        assertRows("SELECT k, s FROM %s WHERE s = 'new'", row(1, "new"), row(2, "new"));
+    }
+
+    @Test
+    public void testComplementaryUpdateWithLimitOnStaticColumnWithNotEmptyPartitions() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, s text STATIC, v int, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("s")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v, s) VALUES (1, 10, 100, 'old')",
+                "INSERT INTO %s (k, c, v, s) VALUES (2, 20, 200, 'old')");
+        // each node updates the static column of a different partition and keeps 'old' in the other one
+        executeIsolated(1, "UPDATE %s SET s = 'new' WHERE k = 1");
+        executeIsolated(2, "UPDATE %s SET s = 'new' WHERE k = 2");
+        // no row may match 'old' any more, and the rows of both partitions must match 'new'
+        assertEmpty("SELECT * FROM %s WHERE s = 'old' LIMIT 1");
+        assertRows("SELECT k, c, v, s FROM %s WHERE s = 'new' LIMIT 1", row(1, 10, 100, "new"));
+        assertRows("SELECT k, c, v, s FROM %s WHERE s = 'new'",
+                   row(1, 10, 100, "new"), row(2, 20, 200, "new"));
+    }
+
+    @Test
+    public void testComplementaryUpdateWithLimitOnRegularColumn() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, 'old')",
+                "INSERT INTO %s (k, c, v) VALUES (0, 2, 'old')");
+        // each node updates a different row and keeps 'old' in the other one
+        executeIsolated(1, "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 1");
+        executeIsolated(2, "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 2");
+        // no row may match 'old' any more, and both rows must match 'new'
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertRows("SELECT * FROM %s WHERE v = 'new' LIMIT 1", row(0, 1, "new"));
+        assertRows("SELECT * FROM %s WHERE v = 'new'", row(0, 1, "new"), row(0, 2, "new"));
+    }
+
+    @Test
+    public void testComplementaryUpdateWithLimitAndRowsBetween() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s (k, c, v) VALUES (0, 1, 'old')",
+                "INSERT INTO %s (k, c, v) VALUES (0, 4, 'old')");
+        // node 1 alone updates row 1; node 2 alone adds the 'old' rows 2 and 3 and updates row 4
+        executeIsolated(1,
+                        "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 1");
+        executeIsolated(2,
+                        "INSERT INTO %s (k, c, v) VALUES (0, 2, 'old')",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 3, 'old')",
+                        "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 4");
+        // updated rows 1 and 4 must only match 'new'; rows 2 and 3 must only match 'old'
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 1", row(0, 2, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 2", row(0, 2, "old"), row(0, 3, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 3", row(0, 2, "old"), row(0, 3, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new' LIMIT 1", row(0, 1, "new"));
+        assertRows("SELECT * FROM %s WHERE v = 'new' ", row(0, 1, "new"), row(0, 4, "new"));
+    }
+
+    @Test
+    public void testPartitionDeletionOnSkinnyTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int PRIMARY KEY, v text)"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // node 1 only gets the insertion, with timestamp 1; node 2 only gets the partition deletion
+        executeIsolated(1, "INSERT INTO %s (k, v) VALUES (0, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 0");
+        // the row that only node 1 holds must not be returned, with or without a limit
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+    }
+
+    @Test
+    public void testPartitionDeletionOnWideTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // node 1 only gets the insertion, with timestamp 1; node 2 only gets the partition deletion
+        executeIsolated(1, "INSERT INTO %s (k, c, v) VALUES (0, 1, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 0");
+        // the row that only node 1 holds must not be returned, with or without a limit
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+    }
+
+    @Test
+    public void testRowDeletionOnWideTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // node 1 only gets the insertion, with timestamp 1; node 2 only gets the deletion of that same row
+        executeIsolated(1, "INSERT INTO %s (k, c, v) VALUES (0, 1, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 0 AND c = 1");
+        // the row that only node 1 holds must not be returned, with or without a limit
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+    }
+
+    @Test
+    public void testRangeDeletionOnWideTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // node 1 only gets the four insertions; node 2 only gets the range deletion, which covers rows 2 and 3
+        executeIsolated(1,
+                        "INSERT INTO %s (k, c, v) VALUES (0, 1, 'old') USING TIMESTAMP 1",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 2, 'old') USING TIMESTAMP 1",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 3, 'old') USING TIMESTAMP 1",
+                        "INSERT INTO %s (k, c, v) VALUES (0, 4, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "DELETE FROM %s WHERE k = 0 AND c > 1 AND c < 4");
+        // only the rows outside the deleted range, 1 and 4, must be returned
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 1", row(0, 1, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'old'", row(0, 1, "old"), row(0, 4, "old"));
+    }
+
+    @Test
+    public void testMismatchingInsertionsOnSkinnyTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int PRIMARY KEY, v text)"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // each node gets a different version of the same row; the one in node 2 has the highest timestamp
+        executeIsolated(1, "INSERT INTO %s (k, v) VALUES (0, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "INSERT INTO %s (k, v) VALUES (0, 'new') USING TIMESTAMP 2");
+        // the row must only be found by the value with the highest timestamp
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+        assertRows("SELECT * FROM %s WHERE v = 'new' ", row(0, "new"));
+    }
+
+    @Test
+    public void testMismatchingInsertionsOnWideTable() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY(k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        // each node gets a different version of the same row; the one in node 2 has the highest timestamp
+        executeIsolated(1, "INSERT INTO %s (k, c, v) VALUES (0, 1, 'old') USING TIMESTAMP 1");
+        executeIsolated(2, "INSERT INTO %s (k, c, v) VALUES (0, 1, 'new') USING TIMESTAMP 2");
+        // the row must only be found by the value with the highest timestamp
+        assertEmpty("SELECT * FROM %s WHERE v = 'old' LIMIT 1");
+        assertEmpty("SELECT * FROM %s WHERE v = 'old'");
+        assertRows("SELECT * FROM %s WHERE v = 'new' ", row(0, 1, "new"));
+    }
+
+    @Test
+    public void testConsistentSkinnyTable()
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int PRIMARY KEY, v text)"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, v) VALUES (1, 'old')", // updated to 'new'
+                "INSERT INTO %s(k, v) VALUES (2, 'old')",
+                "INSERT INTO %s(k, v) VALUES (3, 'old')", // updated to 'new'
+                "INSERT INTO %s(k, v) VALUES (4, 'old')",
+                "INSERT INTO %s(k, v) VALUES (5, 'old')", // deleted partition
+                "UPDATE %s SET v = 'new' WHERE k = 1",
+                "UPDATE %s SET v = 'new' WHERE k = 3",
+                "DELETE FROM %s WHERE k = 5");
+        // every write above was run with CL=ALL, so no write is isolated to a single node in this test
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 1", row(2, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new' LIMIT 1", row(1, "new"));
+        assertRows("SELECT * FROM %s WHERE v = 'old'", row(2, "old"), row(4, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new'", row(1, "new"), row(3, "new"));
+    }
+
+    @Test
+    public void testConsistentWideTable()
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int, c int, v text, PRIMARY KEY (k, c))"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, c, v) VALUES (0, 1, 'old')", // updated to 'new'
+                "INSERT INTO %s(k, c, v) VALUES (0, 2, 'old')",
+                "INSERT INTO %s(k, c, v) VALUES (0, 3, 'old')", // updated to 'new'
+                "INSERT INTO %s(k, c, v) VALUES (0, 4, 'old')",
+                "INSERT INTO %s(k, c, v) VALUES (0, 5, 'old')", // deleted row
+                "INSERT INTO %s(k, c, v) VALUES (1, 1, 'old')", // deleted partition
+                "INSERT INTO %s(k, c, v) VALUES (1, 2, 'old')", // deleted partition
+                "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 1",
+                "UPDATE %s SET v = 'new' WHERE k = 0 AND c = 3",
+                "DELETE FROM %s WHERE k = 0 AND c = 5",
+                "DELETE FROM %s WHERE k = 1");
+        // every write above was run with CL=ALL, so no write is isolated to a single node in this test
+        assertRows("SELECT * FROM %s WHERE v = 'old' LIMIT 1", row(0, 2, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new' LIMIT 1", row(0, 1, "new"));
+        assertRows("SELECT * FROM %s WHERE v = 'old'", row(0, 2, "old"), row(0, 4, "old"));
+        assertRows("SELECT * FROM %s WHERE v = 'new'", row(0, 1, "new"), row(0, 3, "new"));
+    }
+
+    @Test
+    public void testCount() throws Exception
+    {
+        cluster.schemaChange(formatQuery("CREATE TABLE %s (k int PRIMARY KEY, v text)"));
+        cluster.schemaChange(formatQuery(createIndexQuery("v")));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+        execute("INSERT INTO %s(k, v) VALUES (1, 'old')",
+                "INSERT INTO %s(k, v) VALUES (2, 'old')",
+                "INSERT INTO %s(k, v) VALUES (3, 'old')",
+                "INSERT INTO %s(k, v) VALUES (4, 'old')",
+                "INSERT INTO %s(k, v) VALUES (5, 'old')");
+        // only node 1 applies the two updates; node 2 keeps 'old' in all five rows
+        executeIsolated(1,
+                        "UPDATE %s SET v = 'new' WHERE k = 2",
+                        "UPDATE %s SET v = 'new' WHERE k = 4");
+        // rows 2 and 4 must only be counted as 'new'; adding LIMIT 1 must not change the count
+        assertRows("SELECT COUNT(*) FROM %s WHERE v = 'old' LIMIT 1", row(3L));
+        assertRows("SELECT COUNT(*) FROM %s WHERE v = 'old'", row(3L));
+        assertRows("SELECT COUNT(*) FROM %s WHERE v = 'new'", row(2L));
+    }
+
+    /**
+     * Executes the specified CQL query with CL=ALL, so all replicas get it.
+     *
+     * @param query the CQL query to be executed in all replicas, formatted with {@code formatQuery} before running
+     *
+     * @return the query result
+     */
+    private static Object[][] execute(String query)
+    {
+        return cluster.coordinator(1).execute(formatQuery(query), ConsistencyLevel.ALL);
+    }
+
+    /**
+     * Runs each of the given CQL queries, in order, through {@link #execute(String)}, which uses CL=ALL.
+     *
+     * @param queries the CQL queries to be executed in all replicas
+     */
+    private static void execute(String... queries)
+    {
+        for (int i = 0; i < queries.length; i++)
+        {
+            execute(queries[i]);
+        }
+    }
+
+    /**
+     * Executes the specified CQL queries only in the specified replica, with CL=ONE and the other replicas temporarily
+     * rejecting mutations. Failure injection is always switched off again, even if a query or the verification fails.
+     *
+     * @param targetNode the index of the replica that is going to receive the queries in isolation
+     * @param queries the CQL queries to be executed in a single replica
+     */
+    private static void executeIsolated(int targetNode, String... queries) throws Exception
+    {
+        try
+        {
+            // enable mutation failure and reset its verification counter in every node other than the target node
+            for (int node = 1; node <= NUM_REPLICAS; node++)
+            {
+                if (node != targetNode)
+                {
+                    FailureEnabled.enable(node);
+                    Counter.reset(node);
+                }
+            }
+
+            // execute queries in the target node with CL=ONE
+            for (String query : queries)
+            {
+                cluster.coordinator(targetNode).execute(formatQuery(query), ConsistencyLevel.ONE);
+            }
+
+            // verify that the counter is still zero, i.e. no mutation has been run, in every node other than the target
+            for (int node = 1; node <= NUM_REPLICAS; node++)
+            {
+                if (node != targetNode)
+                {
+                    assertEquals(0, Counter.get(node));
+                }
+            }
+        }
+        finally
+        {
+            // disable mutation failure in every node other than the target node
+            for (int node = 1; node <= NUM_REPLICAS; node++)
+            {
+                if (node != targetNode)
+                {
+                    FailureEnabled.disable(node);
+                }
+            }
+        }
+    }
+
+    private static void assertEmpty(String query)
+    {
+        Object[][] result = execute(query);
+        if (result != null && result.length > 0)
+            fail(String.format("Expected empty result but got %d rows", result.length));
+    }
+
+    private static void assertRows(String query, Object[]... expected)
+    {
+        AssertUtils.assertRows(execute(query), expected);
+    }
+
+    private static String formatQuery(String query)
+    {
+        return String.format(query, KEYSPACE + "." + table);
+    }
+
+    private static String createIndexQuery(String column, boolean caseSensitive, boolean normalize)
+    {
+        String options = String.format("WITH OPTIONS = { 'case_sensitive' : %s, 'normalize' : %s };", caseSensitive, normalize);
+        return String.format("CREATE CUSTOM INDEX ON %%s(%s) USING '%s' %s", column, StorageAttachedIndex.class.getName(), options);
+    }
+
+    private static String createIndexQuery(String column)
+    {
+        return String.format("CREATE CUSTOM INDEX ON %%s(%s) USING '%s'", column, StorageAttachedIndex.class.getName());
+    }
+
+    @Shared
+    private static final class FailureEnabled
+    {
+        private static volatile Map<Integer, Boolean> enabled = new ConcurrentHashMap<>(); // per-node flag; thread-safe map
+
+        public static boolean isEnabled(int node)
+        {
+            return enabled.getOrDefault(node, false); // single lookup; nodes never registered count as disabled
+        }
+
+        public static void enable(int node)
+        {
+            enabled.put(node, true);
+        }
+
+        public static void disable(int node)
+        {
+            enabled.put(node, false);
+        }
+
+        public static void clear()
+        {
+            enabled.clear();
+        }
+    }
+
+    @Shared
+    private static final class Counter
+    {
+        private static volatile ConcurrentMap<Integer, Integer> counters = new ConcurrentHashMap<>(); // per-node counts
+
+        public static void increment(int node)
+        {
+            counters.merge(node, 1, Integer::sum); // atomic; put(getOrDefault(node, 0) + 1) could lose concurrent updates
+        }
+
+        public static int get(int node)
+        {
+            return counters.getOrDefault(node, 0); // nodes never incremented or reset read as zero
+        }
+
+        public static void reset(int node)
+        {
+            counters.put(node, 0);
+        }
+
+        public static void clear()
+        {
+            counters.clear();
+        }
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/MultiNodeExecutor.java b/test/distributed/org/apache/cassandra/distributed/test/sai/MultiNodeExecutor.java
new file mode 100644
index 000000000000..8b8ad1240e73
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/MultiNodeExecutor.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.index.sai.cql.DataModel;
+
+public class MultiNodeExecutor implements DataModel.Executor
+{
+    private final Cluster cluster; // cluster that every operation below is executed against
+
+    public MultiNodeExecutor(Cluster cluster)
+    {
+        this.cluster = cluster;
+    }
+
+    @Override
+    public void createTable(String statement)
+    {
+        cluster.schemaChange(statement);
+    }
+
+    @Override
+    public void flush(String keyspace, String table)
+    {
+        cluster.forEach(node -> node.flush(keyspace)); // NOTE: table is unused; the flush is requested per keyspace
+    }
+
+    @Override
+    public void compact(String keyspace, String table)
+    {
+        cluster.forEach(node -> node.forceCompact(keyspace, table));
+    }
+
+    @Override
+    public void disableCompaction(String keyspace, String table)
+    {
+        cluster.forEach((node) -> node.runOnInstance(() -> Keyspace.open(keyspace).getColumnFamilyStore(table).disableAutoCompaction()));
+    }
+
+    @Override
+    public void waitForIndexQueryable(String keyspace, String table)
+    {
+        SAIUtil.waitForIndexQueryable(cluster, keyspace); // NOTE: table is unused; the wait is per keyspace
+    }
+
+    @Override
+    public void executeLocal(String query, Object... values) throws Throwable
+    {
+        cluster.coordinator(1).execute(query, ConsistencyLevel.QUORUM, values); // bind values were previously dropped
+    }
+
+    @Override
+    public List<Object> executeRemote(String query, int fetchSize, Object... values) throws Throwable
+    {
+        Iterator<Object> iterator = cluster.coordinator(1).executeWithPagingWithResult(query, ConsistencyLevel.QUORUM, fetchSize, values).map(row -> row.get(0));
+
+        List<Object> result = new ArrayList<>(); // first column of every row, in iteration order
+        iterator.forEachRemaining(result::add);
+
+        return result;
+    }
+
+    @Override
+    public void counterReset()
+    {
+        AbstractQueryTester.Counter.reset();
+    }
+
+    @Override
+    public long getCounter()
+    {
+        return AbstractQueryTester.Counter.get();
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/NativeIndexDDLTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/NativeIndexDDLTest.java
new file mode 100644
index 000000000000..cdf7bc3dd0e3
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/NativeIndexDDLTest.java
@@ -0,0 +1,330 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.TokenSupplier;
+import org.apache.cassandra.distributed.shared.Byteman;
+import org.apache.cassandra.distributed.shared.NetworkTopology;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.Throwables;
+
+import static org.apache.cassandra.distributed.api.Feature.GOSSIP;
+import static org.apache.cassandra.distributed.api.Feature.NETWORK;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
+
+public class NativeIndexDDLTest extends TestBaseImpl
+{
+    private static final String FAILURE_SCRIPT = "RULE fail IndexGCTransaction\n" +
+                                                 "CLASS org.apache.cassandra.index.SecondaryIndexManager$IndexGCTransaction\n" +
+                                                 "METHOD <init>\n" +
+                                                 "AT ENTRY\n" +
+                                                 "IF TRUE\n" +
+                                                 "DO\n" +
+                                                 "   throw new java.lang.RuntimeException(\"Injected index failure\")\n" +
+                                                 "ENDRULE\n" +
+                                                 "RULE fail CleanupGCTransaction\n" +
+                                                 "CLASS org.apache.cassandra.index.SecondaryIndexManager$CleanupGCTransaction\n" +
+                                                 "METHOD <init>\n" +
+                                                 "AT ENTRY\n" +
+                                                 "IF TRUE\n" +
+                                                 "DO\n" +
+                                                 "   throw new java.lang.RuntimeException(\"Injected index failure\")\n" +
+                                                 "ENDRULE\n";
+
+    private static final String CREATE_TABLE_TEMPLATE = "CREATE TABLE %s.%s (id TEXT PRIMARY KEY, v1 INT, v2 TEXT) " +
+                                                        "WITH compaction = {'class' : 'SizeTieredCompactionStrategy', 'enabled' : false }";
+    protected static final String CREATE_INDEX_TEMPLATE = "CREATE CUSTOM INDEX ON %s.%s(%s) USING 'StorageAttachedIndex'";
+
+    private Cluster cluster;
+
+    @Before
+    public void setupClusterWithSingleNode() throws Throwable
+    {
+        cluster = builder().withNodes(1)
+                           .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(2))
+                           .withNodeIdTopology(NetworkTopology.singleDcNetworkTopology(2, "dc0", "rack0"))
+                           .withConfig(config -> config.with(GOSSIP, NETWORK))
+                           .withInstanceInitializer((cl, nodeNumber) -> {
+                               Byteman.createFromText(FAILURE_SCRIPT).install(cl);
+                           })
+                           .start();
+
+        cluster.schemaChange(withKeyspace("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}"));
+    }
+
+    @After
+    public void destroyCluster() throws Throwable
+    {
+        if (cluster != null)
+            cluster.close();
+    }
+
+    @Test
+    public void verifyIndexWithDecommission() throws Exception
+    {
+        // prepare schema ks rf=1 with 2 indexes
+        String table = "verify_ndi_during_decommission_test";
+        cluster.schemaChange(String.format(CREATE_TABLE_TEMPLATE, KEYSPACE, table));
+        cluster.schemaChange(String.format(CREATE_INDEX_TEMPLATE, KEYSPACE, table, "v1"));
+        cluster.schemaChange(String.format(CREATE_INDEX_TEMPLATE, KEYSPACE, table, "v2"));
+        SAIUtil.waitForIndexQueryable(cluster, KEYSPACE);
+
+        // create 100 rows in 1 sstable
+        int num = 100;
+        for (int i = 0; i < num; i++)
+        {
+            cluster.coordinator(1).execute(String.format("INSERT INTO %s.%s (id, v1, v2) VALUES ('%s', 0, '0')", KEYSPACE, table, i), ConsistencyLevel.ONE);
+        }
+
+        cluster.get(1).flush(KEYSPACE);
+
+        verifyIndexQuery(1, table, num, num);
+        assertEquals(num, getIndexedCellCount(1, table, "v1"));
+        assertEquals(num, getIndexedCellCount(1, table, "v2"));
+
+        // Start node2
+        bootstrapAndJoinNode(cluster);
+
+        // node1 still has all indexed data before cleanup
+        assertEquals(num, getIndexedCellCount(1, table, "v1"));
+        assertEquals(num, getIndexedCellCount(1, table, "v2"));
+
+        // compaction won't cleanup data
+        upgradeSSTables(1, KEYSPACE, table);
+        assertEquals(num, getIndexedCellCount(1, table, "v1"));
+        assertEquals(num, getIndexedCellCount(1, table, "v2"));
+
+        // repair streaming does not transfer entire storage-attached indexes
+        //TODO Is this assumption correct?
+        long indexRowsNode2 = getIndexedCellCount(2, table, "v1");
+        assertNotEquals(0, indexRowsNode2);
+        assertNotEquals(num, indexRowsNode2);
+        assertEquals(indexRowsNode2, getIndexedCellCount(2, table, "v2"));
+        verifyIndexQuery(2, table, num, num);
+
+        // rewrite storage-attached indexes on node2, SAI indexes should not contain rows belonging to node1
+        upgradeSSTables(2, KEYSPACE, table);
+        indexRowsNode2 = getIndexedCellCount(2, table, "v1");
+        assertNotEquals(0, indexRowsNode2);
+        assertNotEquals(num, indexRowsNode2);
+        assertEquals(indexRowsNode2, getIndexedCellCount(2, table, "v2"));
+
+        // verify data with concurrent nodetool cleanup
+        TestWithConcurrentVerification cleanupTest = new TestWithConcurrentVerification(() -> {
+            try
+            {
+                verifyIndexQuery(1, table, num, num);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }, () -> cluster.get(1).runOnInstance(() -> {
+            try
+            {
+                StorageService.instance.forceKeyspaceCleanup(KEYSPACE, table);
+            }
+            catch (IOException | ExecutionException | InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }));
+
+        cleanupTest.start();
+
+        // verify indexed rows on node1 and it should remove transferred data
+        long indexRowsNode1 = getIndexedCellCount(1, table, "v1");
+        assertNotEquals(0, indexRowsNode1);
+        assertEquals(indexRowsNode1, getIndexedCellCount(1, table, "v2"));
+        assertEquals(num, indexRowsNode1 + indexRowsNode2);
+
+        verifyIndexQuery(1, table, num, num);
+
+        // have to change system_distributed and system_traces to RF=1 for decommission to pass in 2-node setup
+        for (String ks : Arrays.asList("system_traces", "system_distributed"))
+        {
+            cluster.schemaChange("ALTER KEYSPACE " + ks + " WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}");
+        }
+
+        // verify data with concurrent decommission
+        TestWithConcurrentVerification decommissionTest = new TestWithConcurrentVerification(() -> {
+            try
+            {
+                verifyIndexQuery(1, table, num, num);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
+            }
+        }, () -> cluster.get(2).runOnInstance(() -> {
+            try
+            {
+                StorageService.instance.decommission(false);
+            }
+            catch (InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        }));
+
+        decommissionTest.start();
+        cluster.get(2).shutdown().get();
+
+        verifyIndexQuery(1, table, num, num);
+
+        // node1 has all indexed data after decommission
+        assertEquals(num, getIndexedCellCount(1, table, "v1"));
+        assertEquals(num, getIndexedCellCount(1, table, "v2"));
+    }
+
+    private void upgradeSSTables(int node, String keyspace, String table)
+    {
+        cluster.get(node).runOnInstance(() -> {
+            try
+            {
+                StorageService.instance.upgradeSSTables(keyspace, false, table);
+            }
+            catch (IOException | ExecutionException | InterruptedException e)
+            {
+                throw new RuntimeException(e);
+            }
+        });
+    }
+
+    private void verifyIndexQuery(int node, String table, int numericIndexRows, int stringIndexRows) throws Exception
+    {
+        verifyNumericIndexQuery(node, table, numericIndexRows);
+        verifyStringIndexQuery(node, table, stringIndexRows);
+    }
+
+    private void verifyNumericIndexQuery(int node, String table, int numericIndexRows) throws Exception
+    {
+        Object[][] result = cluster.coordinator(node).execute(String.format("SELECT id FROM %s.%s WHERE v1=0", KEYSPACE, table), ConsistencyLevel.ONE);
+        assertEquals(numericIndexRows, result.length);
+    }
+
+    private void verifyStringIndexQuery(int node, String table, int stringIndexRows) throws Exception
+    {
+        Object[][] result = cluster.coordinator(node).execute(String.format("SELECT id FROM %s.%s WHERE v2='0'", KEYSPACE, table), ConsistencyLevel.ONE);
+        assertEquals(stringIndexRows, result.length);
+    }
+
+    protected long getIndexedCellCount(int node, String table, String column) throws Exception
+    {
+        return cluster.get(node).callOnInstance(() -> {
+            try
+            {
+                ColumnIdentifier columnID = ColumnIdentifier.getInterned(column, true);
+                ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(table);
+                String indexName = IndexMetadata.generateDefaultIndexName(table, columnID);
+                StorageAttachedIndex index = (StorageAttachedIndex) cfs.indexManager.getIndexByName(indexName);
+                return index.getContext().getCellCount();
+            }
+            catch (Throwable e)
+            {
+                return -1L;
+            }
+        });
+    }
+
+    protected static class TestWithConcurrentVerification
+    {
+        private final Runnable verificationTask;
+        private final CountDownLatch verificationStarted = new CountDownLatch(1);
+
+        private final Runnable targetTask;
+        private final CountDownLatch taskCompleted = new CountDownLatch(1);
+
+        private final int verificationIntervalInMs;
+        private final int verificationMaxInMs = 30000; // 30s
+        private volatile Throwable verificationFailure; // set by the verification thread, rethrown by start()
+
+        public TestWithConcurrentVerification(Runnable verificationTask, Runnable targetTask)
+        {
+            this(verificationTask, targetTask, 10);
+        }
+
+        /**
+         * @param verificationTask to be run concurrently with target task; a failure of it makes {@link #start()} throw
+         * @param targetTask task to be performed once
+         * @param verificationIntervalInMs interval between each verification task, -1 to run verification task once
+         */
+        public TestWithConcurrentVerification(Runnable verificationTask, Runnable targetTask, int verificationIntervalInMs)
+        {
+            this.verificationTask = verificationTask;
+            this.targetTask = targetTask;
+            this.verificationIntervalInMs = verificationIntervalInMs;
+        }
+
+        public void start()
+        {
+            Thread verificationThread = new Thread(() -> {
+                verificationStarted.countDown();
+                try
+                {
+                    do // verify at least once, then keep verifying until the target task has completed
+                        verificationTask.run();
+                    while (verificationIntervalInMs >= 0 && !taskCompleted.await(verificationIntervalInMs, TimeUnit.MILLISECONDS));
+                }
+                catch (Throwable e)
+                {
+                    verificationFailure = e; // rethrowing here would only reach this thread's uncaught exception handler
+                }
+            });
+            try
+            {
+                verificationThread.start();
+                verificationStarted.await();
+                targetTask.run();
+                taskCompleted.countDown(); // stop the verification loop before waiting for the thread
+                verificationThread.join(verificationMaxInMs);
+            }
+            catch (InterruptedException e)
+            {
+                Thread.currentThread().interrupt(); // keep the interrupt visible to callers
+                throw Throwables.unchecked(e);
+            }
+            finally
+            {
+                taskCompleted.countDown(); // no-op on success; releases the verification thread if the target task threw
+            }
+            if (verificationFailure != null)
+                throw Throwables.unchecked(verificationFailure);
+        }
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/QueryCellDeletionsTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryCellDeletionsTest.java
new file mode 100644
index 000000000000..0727364763e5
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryCellDeletionsTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.cql.IndexQuerySupport;
+
+public class QueryCellDeletionsTest extends AbstractQueryTester
+{
+    @Test
+    public void testCellDeletions() throws Throwable
+    {
+        IndexQuerySupport.cellDeletions(executor, dataModel.get(), sets);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/QueryRowDeletionsTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryRowDeletionsTest.java
new file mode 100644
index 000000000000..5950bcfcbd2d
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryRowDeletionsTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.cql.IndexQuerySupport;
+
+public class QueryRowDeletionsTest extends AbstractQueryTester
+{
+    @Test
+    public void testRowDeletions() throws Throwable
+    {
+        IndexQuerySupport.rowDeletions(executor, dataModel.get(), sets);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/QueryTimeToLiveTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryTimeToLiveTest.java
new file mode 100644
index 000000000000..5f0a84fde39c
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryTimeToLiveTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.cql.IndexQuerySupport;
+
+public class QueryTimeToLiveTest extends AbstractQueryTester
+{
+    @Test
+    public void testTimeToLive() throws Throwable
+    {
+        IndexQuerySupport.timeToLive(executor, dataModel.get(), sets);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/QueryWriteLifecycleTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryWriteLifecycleTest.java
new file mode 100644
index 000000000000..cbda82ec2f99
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/QueryWriteLifecycleTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.cql.IndexQuerySupport;
+
+public class QueryWriteLifecycleTest extends AbstractQueryTester
+{
+    @Test
+    public void testWriteLifecycle() throws Throwable
+    {
+        IndexQuerySupport.writeLifecycle(executor, dataModel.get(), sets);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java b/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java
new file mode 100644
index 000000000000..13293afee02d
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/SAIUtil.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.IInvokableInstance;
+import org.apache.cassandra.distributed.api.SimpleQueryResult;
+
+import static org.awaitility.Awaitility.await;
+import static org.junit.Assert.assertTrue;
+
+public class SAIUtil
+{
+    public static void waitForIndexQueryable(Cluster cluster, String keyspace)
+    {
+        String query = String.format("SELECT is_queryable FROM system_views.indexes WHERE keyspace_name = '%s' ALLOW FILTERING", keyspace);
+        await().atMost(5, TimeUnit.SECONDS) // every row of every node must be queryable, not just the first row per node
+               .until(() -> cluster.stream()
+                                   .map(node -> node.executeInternalWithResult(query))
+                                   .map(SimpleQueryResult::toObjectArrays)
+                                   .flatMap(java.util.Arrays::stream)
+                                   .allMatch(row -> Boolean.TRUE.equals(row[0]))); // row[0] is is_queryable; null counts as false
+    }
+
+    public static void waitForIndexQueryable(IInvokableInstance node, String index)
+    {
+        await().atMost(5, TimeUnit.SECONDS)
+               .untilAsserted(() -> assertTrue(getIndexState(node, index, "is_queryable")));
+    }
+
+    // This will pull one of the boolean index states from a node; it fails the assertion if the index has no row yet
+    private static boolean getIndexState(IInvokableInstance node, String index, String state)
+    {
+        String query = String.format("SELECT %s FROM system_views.indexes WHERE index_name = '%s' ALLOW FILTERING", state, index);
+        SimpleQueryResult queryResult = node.executeInternalWithResult(query);
+        assertTrue("No rows returned", queryResult.hasNext());
+        return queryResult.next().get(state);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/test/sai/TraceTest.java b/test/distributed/org/apache/cassandra/distributed/test/sai/TraceTest.java
new file mode 100644
index 000000000000..cd3665193155
--- /dev/null
+++ b/test/distributed/org/apache/cassandra/distributed/test/sai/TraceTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.distributed.test.sai;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.junit.Test;
+
+import org.apache.cassandra.distributed.Cluster;
+import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.TokenSupplier;
+import org.apache.cassandra.distributed.impl.TracingUtil;
+import org.apache.cassandra.distributed.test.TestBaseImpl;
+
+import static org.awaitility.Awaitility.await;
+import static org.junit.Assert.assertEquals;
+
+public class TraceTest extends TestBaseImpl
+{
+    private static final int ROWS = 100;
+    private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+");
+
+    /**
+     * Traces a query over two SAI indexes on a three node cluster and checks that the partition counts
+     * reported by the "post-filtered" trace events add up to the 30 rows the query matches.
+     */
+    @Test
+    public void testMultiIndexTracing() throws Throwable
+    {
+        String originalTraceTimeout = TracingUtil.setWaitForTracingEventTimeoutSecs("1");
+
+        try (Cluster cluster = init(Cluster.build(3)
+                                           .withTokenSupplier(TokenSupplier.evenlyDistributedTokens(3))
+                                           .start()))
+        {
+            cluster.schemaChange("CREATE KEYSPACE trace_ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};");
+            cluster.schemaChange("CREATE TABLE trace_ks.tbl (pk int primary key, v1 int, v2 text)");
+            cluster.schemaChange("CREATE CUSTOM INDEX tbl_v1_idx ON trace_ks.tbl(v1) USING 'StorageAttachedIndex'");
+            cluster.schemaChange("CREATE CUSTOM INDEX tbl_v2_idx ON trace_ks.tbl(v2) USING 'StorageAttachedIndex'");
+
+            for (int row = 0; row < ROWS; row++)
+            {
+                cluster.coordinator(1).execute(String.format("INSERT INTO trace_ks.tbl (pk, v1, v2) VALUES (%s, %s, '0')", row, row), ConsistencyLevel.ONE);
+            }
+
+            // The rows were written to trace_ks, so that is the keyspace that has to be flushed
+            cluster.forEach(c -> c.flush("trace_ks"));
+
+            SAIUtil.waitForIndexQueryable(cluster, "trace_ks");
+
+            UUID sessionId = UUID.randomUUID();
+            cluster.coordinator(1).executeWithTracingWithResult(sessionId, "SELECT * from trace_ks.tbl WHERE v1 < 30 AND v2 = '0'", ConsistencyLevel.ONE);
+
+            //TODO We can improve the asserts for this when we have improved tracing and multi-node support
+            // Polling the assertion itself makes a timeout report the last observed count
+            await().atMost(5, TimeUnit.SECONDS).untilAsserted(() ->
+                assertEquals(30, TracingUtil.getTrace(cluster, sessionId, ConsistencyLevel.ONE)
+                                            .stream()
+                                            .map(traceEntry -> traceEntry.activity)
+                                            .filter(activity -> activity.contains("post-filtered"))
+                                            .mapToLong(activity -> fetchPartitionCount(activity)).sum()));
+        }
+        finally
+        {
+            TracingUtil.setWaitForTracingEventTimeoutSecs(originalTraceTimeout);
+        }
+    }
+
+    // Returns the fourth number in the trace message. NOTE(review): assumed to be the partition count - confirm
+    private long fetchPartitionCount(String activity)
+    {
+        List<Long> values = new ArrayList<>();
+        Matcher matcher = NUMBER_PATTERN.matcher(activity);
+        while (matcher.find())
+            values.add(Long.parseLong(matcher.group()));
+        return values.get(3);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java b/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
index e762126f0f6d..70478e3bd04e 100644
--- a/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
+++ b/test/unit/org/apache/cassandra/index/ExpressionFilteringIndex.java
@@ -34,6 +34,7 @@
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -85,33 +86,44 @@ public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey,
     @Override
     public Searcher searcherFor(ReadCommand command)
     {
-        return controller -> {
-            searches.incrementAndGet();
-
-            ReadCommand all;
-            if (command instanceof SinglePartitionReadCommand)
-            {
-                SinglePartitionReadCommand cmd = (SinglePartitionReadCommand) command;
-                all = SinglePartitionReadCommand.create(table,
-                                                        cmd.nowInSec(),
-                                                        cmd.partitionKey(),
-                                                        cmd.clusteringIndexFilter().getSlices(cmd.metadata()));
-            }
-            else if (command instanceof PartitionRangeReadCommand)
+        return new Searcher(command)
+        {
+            @Override
+            public ReadCommand command()
             {
-                PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
-                all = PartitionRangeReadCommand.create(table,
-                                                       cmd.nowInSec(),
-                                                       ColumnFilter.all(table),
-                                                       RowFilter.NONE,
-                                                       DataLimits.NONE,
-                                                       cmd.dataRange());
+                return command;
             }
-            else
+
+            @Override
+            public UnfilteredPartitionIterator search(ReadExecutionController executionController)
             {
-                throw new UnsupportedOperationException();
+                searches.incrementAndGet();
+
+                ReadCommand all;
+                if (command instanceof SinglePartitionReadCommand)
+                {
+                    SinglePartitionReadCommand cmd = (SinglePartitionReadCommand) command;
+                    all = SinglePartitionReadCommand.create(table,
+                                                            cmd.nowInSec(),
+                                                            cmd.partitionKey(),
+                                                            cmd.clusteringIndexFilter().getSlices(cmd.metadata()));
+                }
+                else if (command instanceof PartitionRangeReadCommand)
+                {
+                    PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
+                    all = PartitionRangeReadCommand.create(table,
+                                                           cmd.nowInSec(),
+                                                           ColumnFilter.all(table),
+                                                           RowFilter.NONE,
+                                                           DataLimits.NONE,
+                                                           cmd.dataRange());
+                }
+                else
+                {
+                    throw new UnsupportedOperationException();
+                }
+                return all.executeLocally(ReadExecutionController.empty());
             }
-            return all.executeLocally(ReadExecutionController.empty());
         };
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java
index a5a59ab5c800..5358a69feb46 100644
--- a/test/unit/org/apache/cassandra/index/StubIndex.java
+++ b/test/unit/org/apache/cassandra/index/StubIndex.java
@@ -24,6 +24,7 @@
 
 import org.apache.cassandra.Util;
 import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.*;
@@ -207,11 +208,33 @@ public void validate(PartitionUpdate update) throws InvalidRequestException
 
     public Searcher searcherFor(final ReadCommand command)
     {
-        return (controller) -> Util.executeLocally((PartitionRangeReadCommand)command, baseCfs, controller);
+        return new Searcher(command);
     }
 
     public BiFunction<PartitionIterator, ReadCommand, PartitionIterator> postProcessorFor(ReadCommand readCommand)
     {
         return (iter, command) -> iter;
     }
+
+    protected class Searcher implements Index.Searcher
+    {
+        private final ReadCommand command;
+
+        Searcher(ReadCommand command)
+        {
+            this.command = command;
+        }
+
+        @Override
+        public ReadCommand command()
+        {
+            return command;
+        }
+
+        @Override
+        public UnfilteredPartitionIterator search(ReadExecutionController executionController)
+        {
+            return Util.executeLocally((PartitionRangeReadCommand)command, baseCfs, executionController);
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/SAITester.java b/test/unit/org/apache/cassandra/index/sai/SAITester.java
index 945253b2502e..3fb0cedf7199 100644
--- a/test/unit/org/apache/cassandra/index/sai/SAITester.java
+++ b/test/unit/org/apache/cassandra/index/sai/SAITester.java
@@ -668,12 +668,12 @@ private void verifySSTableComponents(String table, boolean indexComponentsExist)
         }
     }
 
-    private Set<File> componentFiles(Collection<File> indexFiles, Component component)
+    protected Set<File> componentFiles(Collection<File> indexFiles, Component component)
     {
         return indexFiles.stream().filter(c -> c.getName().endsWith(component.name)).collect(Collectors.toSet());
     }
 
-    private Set<File> componentFiles(Collection<File> indexFiles, String shortName)
+    protected Set<File> componentFiles(Collection<File> indexFiles, String shortName)
     {
         String suffix = String.format("_%s.db", shortName);
         return indexFiles.stream().filter(c -> c.getName().endsWith(suffix)).collect(Collectors.toSet());
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java b/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
new file mode 100644
index 000000000000..8408e95f9f77
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher;
+import org.apache.cassandra.inject.Injections;
+
+import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
+
+/**
+ * Base class for parameterized SAI query tests; every run gets one {@link DataModel} and the query sets for it.
+ */
+@RunWith(Parameterized.class)
+public class AbstractQueryTester extends SAITester
+{
+    // Counter attached to StorageAttachedIndexSearcher.search; also handed to the executor built in setup()
+    protected static final Injections.Counter INDEX_QUERY_COUNTER = Injections.newCounter("IndexQueryCounter")
+                                                                              .add(newInvokePoint().onClass(StorageAttachedIndexSearcher.class).onMethod("search"))
+                                                                              .build();
+
+    @Parameterized.Parameter(0)
+    public DataModel dataModel;
+    @Parameterized.Parameter(1)
+    public List<IndexQuerySupport.BaseQuerySet> sets;
+
+    protected DataModel.Executor executor;
+
+    // Before each test: make sure the query keyspace exists, inject the counter and build a fresh executor
+    @Before
+    public void setup() throws Throwable
+    {
+        requireNetwork();
+        schemaChange(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'}", DataModel.KEYSPACE));
+        Injections.inject(INDEX_QUERY_COUNTER);
+        executor = new SingleNodeExecutor(this, INDEX_QUERY_COUNTER);
+    }
+
+    // Each entry is { data model, query sets to run against it }; the first element also names the run
+    @SuppressWarnings("unused")
+    @Parameterized.Parameters(name = "{0}")
+    public static List<Object[]> params() throws Throwable
+    {
+        List<Object[]> parameters = new LinkedList<>();
+
+        parameters.add(new Object[]{ new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA), IndexQuerySupport.BASE_QUERY_SETS });
+        parameters.add(new Object[]{ new DataModel.CompoundKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA), IndexQuerySupport.BASE_QUERY_SETS });
+        parameters.add(new Object[]{ new DataModel.CompoundKeyWithStaticsDataModel(DataModel.STATIC_COLUMNS, DataModel.STATIC_COLUMN_DATA), IndexQuerySupport.STATIC_QUERY_SETS });
+        parameters.add(new Object[]{ new DataModel.CompositePartitionKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA),
+                                     ImmutableList.builder().addAll(IndexQuerySupport.BASE_QUERY_SETS).addAll(IndexQuerySupport.COMPOSITE_PARTITION_QUERY_SETS).build()});
+
+        return parameters;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
index 5793f9d9d4e2..8ac6acb6484b 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
@@ -23,6 +23,7 @@
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
@@ -31,17 +32,13 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Sets;
 
-import com.datastax.driver.core.ResultSet;
-import com.datastax.driver.core.SimpleStatement;
 import org.apache.cassandra.cql3.CQL3Type;
-import org.apache.cassandra.cql3.UntypedResultSet;
-import org.apache.cassandra.index.sai.SAITester;
 import org.apache.cassandra.utils.Pair;
 
-import static org.apache.cassandra.cql3.CQLTester.KEYSPACE;
-
 public interface DataModel
 {
+    static final String KEYSPACE = "sai_query_keyspace";
+
     String SIMPLE_SELECT_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? LIMIT ?";
     String SIMPLE_SELECT_WITH_FILTERING_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? LIMIT ? ALLOW FILTERING";
     String TWO_CLAUSE_AND_QUERY_TEMPLATE = "SELECT %s FROM %%s WHERE %s %s ? AND %s %s ? LIMIT ?";
@@ -132,55 +129,53 @@ public interface DataModel
                                                        "1845, " + NORMAL_COLUMN_DATA.get(14),
                                                        "1845, " + NORMAL_COLUMN_DATA.get(15));
 
+    static AtomicInteger seq = new AtomicInteger();
+
     DataModel withTableOptions(String tableOptions) throws Throwable;
 
     List<Pair<String, String>> keyColumns();
 
-    void createTables(SAITester tester) throws Throwable;
-
-    void createIndexes(SAITester tester) throws Throwable;
+    void createTables(Executor tester) throws Throwable;
 
-    void flush(SAITester tester) throws Throwable;
+    void createIndexes(Executor tester) throws Throwable;
 
-    void disableCompaction(SAITester tester) throws Throwable;
+    void flush(Executor tester) throws Throwable;
 
-    void compact(SAITester tester) throws Throwable;
+    void disableCompaction(Executor tester) throws Throwable;
 
-    void truncateTables(SAITester tester) throws Throwable;
+    void compact(Executor tester) throws Throwable;
 
-    void insertRows(SAITester tester) throws Throwable;
+    void truncateTables(Executor tester) throws Throwable;
 
-    void insertRowsWithTTL(SAITester tester) throws Throwable;
+    void insertRows(Executor tester) throws Throwable;
 
-    void updateCells(SAITester tester) throws Throwable;
+    void insertRowsWithTTL(Executor tester) throws Throwable;
 
-    void deleteCells(SAITester tester) throws Throwable;
+    void updateCells(Executor tester) throws Throwable;
 
-    void deleteRows(SAITester tester) throws Throwable;
+    void deleteCells(Executor tester) throws Throwable;
 
-    ResultSet executeIndexed(SAITester tester, String query, Object... values) throws Throwable;
+    void deleteRows(Executor tester) throws Throwable;
 
-    ResultSet executeIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable;
+    List<Object> executeIndexed(Executor tester, String query, int fetchSize, Object... values) throws Throwable;
 
-    ResultSet executeNonIndexed(SAITester tester, String query, Object... values) throws Throwable;
+    List<Object> executeNonIndexed(Executor tester, String query, int fetchSize, Object... values) throws Throwable;
 
-    ResultSet executeNonIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable;
-
-    class BaseDataModel implements DataModel
+    public class BaseDataModel implements DataModel
     {
         final List<Pair<String, String>> columns;
         final String columnNames;
         final List<String> rows;
+        final String indexedTable = "table_" + seq.getAndIncrement();
+        final String nonIndexedTable = "table_" + seq.getAndIncrement();
 
         String tableOptions = "";
-        String indexedTable;
-        String nonIndexedTable;
 
         List<Pair<String, String>> keyColumns;
         String primaryKey;
         List<String> keys;
 
-        BaseDataModel(List<Pair<String, String>> columns, List<String> rows)
+        public BaseDataModel(List<Pair<String, String>> columns, List<String> rows)
         {
             this.keyColumns = ImmutableList.of(Pair.create("p", "int"));
             this.primaryKey = keyColumns.stream().map(pair -> pair.left).collect(Collectors.joining(", "));
@@ -203,23 +198,23 @@ public List<Pair<String, String>> keyColumns()
             return keyColumns;
         }
 
-        public void createTables(SAITester tester)
+        public void createTables(Executor tester)
         {
             String keyColumnDefs = keyColumns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
             String normalColumnDefs = columns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
 
-            String template = "CREATE TABLE %%s (%s, %s, PRIMARY KEY (%s))" + tableOptions;
-            indexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
-            nonIndexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+            String template = "CREATE TABLE %s (%s, %s, PRIMARY KEY (%s))" + tableOptions;
+            tester.createTable(String.format(template, KEYSPACE + "." + indexedTable, keyColumnDefs, normalColumnDefs, primaryKey));
+            tester.createTable(String.format(template, KEYSPACE + "." + nonIndexedTable, keyColumnDefs, normalColumnDefs, primaryKey));
         }
 
-        public void truncateTables(SAITester tester) throws Throwable
+        public void truncateTables(Executor tester) throws Throwable
         {
             executeLocal(tester, "TRUNCATE TABLE %s");
             executeLocal(tester, "TRUNCATE TABLE %s");
         }
 
-        public void createIndexes(SAITester tester) throws Throwable
+        public void createIndexes(Executor tester) throws Throwable
         {
             String template = "CREATE CUSTOM INDEX ndi_%s_index_%s ON %%s (%s) USING 'StorageAttachedIndex'";
 
@@ -228,30 +223,30 @@ public void createIndexes(SAITester tester) throws Throwable
                 if (!skipColumns.contains(column.left))
                 {
                     executeLocalIndexed(tester, String.format(template, column.left, indexedTable, column.left));
-                    tester.waitForIndexQueryable();
+                    tester.waitForIndexQueryable(KEYSPACE, indexedTable);
                 }
             }
         }
 
-        public void flush(SAITester tester) throws Throwable
+        public void flush(Executor tester) throws Throwable
         {
             tester.flush(KEYSPACE, indexedTable);
             tester.flush(KEYSPACE, nonIndexedTable);
         }
 
-        public void disableCompaction(SAITester tester) throws Throwable
+        public void disableCompaction(Executor tester) throws Throwable
         {
             tester.disableCompaction(KEYSPACE, indexedTable);
             tester.disableCompaction(KEYSPACE, nonIndexedTable);
         }
 
-        public void compact(SAITester tester) throws Throwable
+        public void compact(Executor tester) throws Throwable
         {
             tester.compact(KEYSPACE, indexedTable);
             tester.compact(KEYSPACE, nonIndexedTable);
         }
 
-        public void insertRows(SAITester tester) throws Throwable
+        public void insertRows(Executor tester) throws Throwable
         {
             String template = "INSERT INTO %%s (%s, %s) VALUES (%s, %s)";
 
@@ -261,7 +256,7 @@ public void insertRows(SAITester tester) throws Throwable
             }
         }
 
-        public void insertRowsWithTTL(SAITester tester) throws Throwable
+        public void insertRowsWithTTL(Executor tester) throws Throwable
         {
             String template = "INSERT INTO %%s (%s, %s) VALUES (%s, %s)%s";
 
@@ -272,7 +267,7 @@ public void insertRowsWithTTL(SAITester tester) throws Throwable
             }
         }
 
-        public void updateCells(SAITester tester) throws Throwable
+        public void updateCells(Executor tester) throws Throwable
         {
             executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0", BIGINT_COLUMN));
             executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 1", BOOLEAN_COLUMN));
@@ -290,7 +285,7 @@ public void updateCells(SAITester tester) throws Throwable
             executeLocal(tester, String.format("UPDATE %%s SET %s = 1fc81a4c-d17d-11e8-a8d5-f2801f1b9fd1 WHERE p = 13", TIMEUUID_COLUMN));
         }
 
-        public void deleteCells(SAITester tester) throws Throwable
+        public void deleteCells(Executor tester) throws Throwable
         {
             for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
             {
@@ -298,7 +293,7 @@ public void deleteCells(SAITester tester) throws Throwable
             }
         }
 
-        public void deleteRows(SAITester tester) throws Throwable
+        public void deleteRows(Executor tester) throws Throwable
         {
             String template = "DELETE FROM %%s WHERE p = %d";
 
@@ -308,44 +303,25 @@ public void deleteRows(SAITester tester) throws Throwable
             }
         }
 
-        public void executeLocal(SAITester tester, String query, Object... values) throws Throwable
+        public void executeLocal(Executor tester, String query, Object... values) throws Throwable
         {
-            tester.executeFormattedQuery(formatIndexedQuery(query), values);
-            tester.executeFormattedQuery(formatNonIndexedQuery(query), values);
+            tester.executeLocal(formatIndexedQuery(query), values);
+            tester.executeLocal(formatNonIndexedQuery(query), values);
         }
 
-        public UntypedResultSet executeLocalIndexed(SAITester tester, String query, Object... values) throws Throwable
+        public void executeLocalIndexed(Executor tester, String query, Object... values) throws Throwable
         {
-            return tester.executeFormattedQuery(formatIndexedQuery(query), values);
+            tester.executeLocal(formatIndexedQuery(query), values);
         }
 
-        public UntypedResultSet executeLocalNonIndexed(SAITester tester, String query, Object... values) throws Throwable
+        public List<Object> executeIndexed(Executor tester, String query, int fetchSize, Object... values) throws Throwable
         {
-            return tester.executeFormattedQuery(formatNonIndexedQuery(query), values);
+            return tester.executeRemote(formatIndexedQuery(query), fetchSize, values);
         }
 
-        public ResultSet executeIndexed(SAITester tester, String query, Object... values) throws Throwable
+        public List<Object> executeNonIndexed(Executor tester, String query, int fetchSize, Object... values) throws Throwable
         {
-            return tester.sessionNet().execute(new SimpleStatement(formatIndexedQuery(query), values));
-        }
-
-        public ResultSet executeIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable
-        {
-            SimpleStatement statement = new SimpleStatement(formatIndexedQuery(query), values);
-            statement.setFetchSize(fetchSize);
-            return tester.sessionNet().execute(statement);
-        }
-
-        public ResultSet executeNonIndexed(SAITester tester, String query, Object... values) throws Throwable
-        {
-            return tester.sessionNet().execute(new SimpleStatement(formatNonIndexedQuery(query), values));
-        }
-
-        public ResultSet executeNonIndexed(SAITester tester, String query, int fetchSize, Object... values) throws Throwable
-        {
-            SimpleStatement statement = new SimpleStatement(formatNonIndexedQuery(query), values);
-            statement.setFetchSize(fetchSize);
-            return tester.sessionNet().execute(statement);
+            return tester.executeRemote(formatNonIndexedQuery(query), fetchSize, values);
         }
 
         protected Set<Integer> deletable()
@@ -370,9 +346,9 @@ public String toString()
         }
     }
 
-    class CompoundKeyWithStaticsDataModel extends CompoundKeyDataModel
+    public class CompoundKeyWithStaticsDataModel extends CompoundKeyDataModel
     {
-        CompoundKeyWithStaticsDataModel(List<Pair<String, String>> columns, List<String> rows)
+        public CompoundKeyWithStaticsDataModel(List<Pair<String, String>> columns, List<String> rows)
         {
             super(columns, rows);
 
@@ -380,7 +356,7 @@ class CompoundKeyWithStaticsDataModel extends CompoundKeyDataModel
         }
 
         @Override
-        public void insertRows(SAITester tester) throws Throwable
+        public void insertRows(Executor tester) throws Throwable
         {
             super.insertRows(tester);
 
@@ -388,7 +364,7 @@ public void insertRows(SAITester tester) throws Throwable
         }
 
         @Override
-        public void updateCells(SAITester tester) throws Throwable
+        public void updateCells(Executor tester) throws Throwable
         {
             executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0 AND c = 0", BIGINT_COLUMN));
             executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 0 AND c = 1", BOOLEAN_COLUMN));
@@ -409,7 +385,7 @@ public void updateCells(SAITester tester) throws Throwable
         }
 
         @Override
-        public void deleteCells(SAITester tester) throws Throwable
+        public void deleteCells(Executor tester) throws Throwable
         {
             for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
             {
@@ -419,7 +395,7 @@ public void deleteCells(SAITester tester) throws Throwable
         }
 
         @Override
-        public void deleteRows(SAITester tester) throws Throwable
+        public void deleteRows(Executor tester) throws Throwable
         {
             executeLocal(tester, "DELETE FROM %s WHERE p = 2 AND c = 0");
             executeLocal(tester, "DELETE FROM %s WHERE p = 4 AND c = 0");
@@ -433,9 +409,9 @@ protected Set<Integer> deletable()
         }
     }
 
-    class CompositePartitionKeyDataModel extends BaseDataModel
+    public class CompositePartitionKeyDataModel extends BaseDataModel
     {
-        CompositePartitionKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
+        public CompositePartitionKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
         {
             super(columns, rows);
 
@@ -445,18 +421,18 @@ class CompositePartitionKeyDataModel extends BaseDataModel
         }
 
         @Override
-        public void createTables(SAITester tester)
+        public void createTables(Executor tester)
         {
             String keyColumnDefs = keyColumns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
             String normalColumnDefs = columns.stream().map(column -> column.left + ' ' + column.right).collect(Collectors.joining(", "));
 
-            String template = "CREATE TABLE %%s (%s, %s, PRIMARY KEY ((%s)))" + tableOptions;
-            indexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
-            nonIndexedTable = tester.createTable(String.format(template, keyColumnDefs, normalColumnDefs, primaryKey));
+            String template = "CREATE TABLE %s (%s, %s, PRIMARY KEY ((%s)))" + tableOptions;
+            tester.createTable(String.format(template, KEYSPACE + "." + indexedTable, keyColumnDefs, normalColumnDefs, primaryKey));
+            tester.createTable(String.format(template, KEYSPACE + "." + nonIndexedTable, keyColumnDefs, normalColumnDefs, primaryKey));
         }
 
         @Override
-        public void createIndexes(SAITester tester) throws Throwable
+        public void createIndexes(Executor tester) throws Throwable
         {
             super.createIndexes(tester);
             String template = "CREATE CUSTOM INDEX ndi_%s_index_%s ON %%s (%s) USING 'StorageAttachedIndex'";
@@ -466,19 +442,19 @@ public void createIndexes(SAITester tester) throws Throwable
                 if (!skipColumns.contains(column.left))
                 {
                     executeLocalIndexed(tester, String.format(template, column.left, indexedTable, column.left));
-                    tester.waitForIndexQueryable();
+                    tester.waitForIndexQueryable(KEYSPACE, indexedTable);
                 }
             }
         }
 
         @Override
-        public void insertRows(SAITester tester) throws Throwable
+        public void insertRows(Executor tester) throws Throwable
         {
             super.insertRows(tester);
         }
 
         @Override
-        public void updateCells(SAITester tester) throws Throwable
+        public void updateCells(Executor tester) throws Throwable
         {
             executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p1 = 0 AND p2 = 0", BIGINT_COLUMN));
             executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p1 = 0 AND p2 = 1", BOOLEAN_COLUMN));
@@ -497,7 +473,7 @@ public void updateCells(SAITester tester) throws Throwable
         }
 
         @Override
-        public void deleteCells(SAITester tester) throws Throwable
+        public void deleteCells(Executor tester) throws Throwable
         {
             for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
             {
@@ -508,7 +484,7 @@ public void deleteCells(SAITester tester) throws Throwable
         }
 
         @Override
-        public void deleteRows(SAITester tester) throws Throwable
+        public void deleteRows(Executor tester) throws Throwable
         {
             executeLocal(tester, "DELETE FROM %s WHERE p1 = 2 AND p2 = 0");
             executeLocal(tester, "DELETE FROM %s WHERE p1 = 4 AND p2 = 1");
@@ -524,9 +500,9 @@ protected Set<Integer> deletable()
         }
     }
 
-    class CompoundKeyDataModel extends BaseDataModel
+    public class CompoundKeyDataModel extends BaseDataModel
     {
-        CompoundKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
+        public CompoundKeyDataModel(List<Pair<String, String>> columns, List<String> rows)
         {
             super(columns, rows);
 
@@ -536,7 +512,7 @@ class CompoundKeyDataModel extends BaseDataModel
         }
 
         @Override
-        public void updateCells(SAITester tester) throws Throwable
+        public void updateCells(Executor tester) throws Throwable
         {
             executeLocal(tester, String.format("UPDATE %%s SET %s = 9700000000 WHERE p = 0 AND c = 0", BIGINT_COLUMN));
             executeLocal(tester, String.format("UPDATE %%s SET %s = false WHERE p = 1 AND c = 0", BOOLEAN_COLUMN));
@@ -555,7 +531,7 @@ public void updateCells(SAITester tester) throws Throwable
         }
 
         @Override
-        public void deleteCells(SAITester tester) throws Throwable
+        public void deleteCells(Executor tester) throws Throwable
         {
             for (int i = 0; i < NORMAL_COLUMNS.size(); i++)
             {
@@ -609,4 +585,25 @@ public String toString()
             return String.format("CompoundPrimaryKeyList[rows: %d, partition size: %d]", primaryKeys.size(), rowsPerPartition);
         }
     }
+
+    public static interface Executor // Test-harness operations the data models and query sets call instead of depending on SAITester directly.
+    {
+        void createTable(String statement); // callers in this file pass a fully formatted, keyspace-qualified CREATE TABLE statement
+
+        void flush(String keyspace, String table); // presumably flushes the table's memtables to disk -- confirm against implementations
+
+        void compact(String keyspace, String table); // presumably compacts the table's SSTables -- confirm against implementations
+
+        void disableCompaction(String keyspace, String table); // presumably stops automatic compaction for the table -- confirm against implementations
+
+        void waitForIndexQueryable(String keyspace, String table); // invoked right after each CREATE CUSTOM INDEX; presumably blocks until the index can serve queries
+
+        void executeLocal(String query, Object...values) throws Throwable; // runs a statement with the given bind values; no result is returned
+
+        List<Object> executeRemote(String query, int fetchSize, Object...values) throws Throwable; // paged query; NOTE(review): looks like one value per row (the first column, as the pre-refactor code extracted) -- confirm
+
+        void counterReset(); // called by query validation immediately before each indexed query
+
+        long getCounter(); // read after an indexed query; validation expects at least one StorageAttachedIndexSearcher call per page
+    }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/EmptyMemtableFlushTest.java b/test/unit/org/apache/cassandra/index/sai/cql/EmptyMemtableFlushTest.java
new file mode 100644
index 000000000000..a158cd925451
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/EmptyMemtableFlushTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+
+import static org.junit.Assert.assertEquals;
+
+public class EmptyMemtableFlushTest extends SAITester // Checks index files and query results when one indexed column has no surviving values at flush time.
+{
+    @Test
+    public void numericIndexTest() throws Throwable // int columns: asserts on the KD_TREE, KD_TREE_POSTING_LISTS and META components
+    {
+        requireNetwork();
+        createTable("CREATE TABLE %s (id int PRIMARY KEY, val1 int, val2 int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val1) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(val2) USING 'StorageAttachedIndex'");
+        execute("INSERT INTO %s (id, val1, val2) VALUES (0, 0, 0)");
+        execute("INSERT INTO %s (id, val2) VALUES (1, 1)"); // row 1 never gets a val1 value
+        execute("DELETE FROM %s WHERE id = 0"); // removes the only row that had a val1 value
+        flush();
+        // Row 0 (the only row with val1) was deleted before the flush, so we should have only 1 set of index files but 2 completion markers
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.KD_TREE.name).size());
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.KD_TREE_POSTING_LISTS.name).size());
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.META.name).size());
+        assertEquals(2, componentFiles(indexFiles(), IndexComponents.NDIType.COLUMN_COMPLETION_MARKER.name).size());
+
+        assertEquals(0, execute("SELECT * from %s WHERE val1 = 0").size()); // the deleted row must not match
+        assertEquals(1, execute("SELECT * from %s WHERE val2 = 1").size()); // the surviving row must match
+    }
+
+    @Test
+    public void literalIndexTest() throws Throwable // text columns: asserts on the TERMS_DATA, POSTING_LISTS and META components
+    {
+        requireNetwork();
+        createTable("CREATE TABLE %s (id int PRIMARY KEY, val1 text, val2 text)");
+        createIndex("CREATE CUSTOM INDEX ON %s(val1) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(val2) USING 'StorageAttachedIndex'");
+        execute("INSERT INTO %s (id, val1, val2) VALUES (0, '0', '0')");
+        execute("INSERT INTO %s (id, val2) VALUES (1, '1')"); // row 1 never gets a val1 value
+        execute("DELETE FROM %s WHERE id = 0"); // removes the only row that had a val1 value
+        flush();
+        // Row 0 (the only row with val1) was deleted before the flush, so we should have only 1 set of index files but 2 completion markers
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.TERMS_DATA.name).size());
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.POSTING_LISTS.name).size());
+        assertEquals(1, componentFiles(indexFiles(), IndexComponents.NDIType.META.name).size());
+        assertEquals(2, componentFiles(indexFiles(), IndexComponents.NDIType.COLUMN_COMPLETION_MARKER.name).size());
+
+        assertEquals(0, execute("SELECT * from %s WHERE val1 = '0'").size()); // the deleted row must not match
+        assertEquals(1, execute("SELECT * from %s WHERE val2 = '1'").size()); // the surviving row must match
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java b/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
deleted file mode 100644
index e41bcfce5a48..000000000000
--- a/test/unit/org/apache/cassandra/index/sai/cql/IndexOperatorSupportTest.java
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.index.sai.cql;
-
-import java.util.LinkedList;
-import java.util.List;
-
-import org.junit.Test;
-
-import com.datastax.driver.core.exceptions.InvalidQueryException;
-import org.apache.cassandra.index.sai.SAITester;
-import org.apache.cassandra.utils.Pair;
-
-import static org.assertj.core.api.Assertions.assertThatThrownBy;
-
-public class IndexOperatorSupportTest extends SAITester
-{
-    @Test
-    public void shouldRejectAllQueries() throws Throwable
-    {
-        requireNetwork();
-
-        DataModel model = new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA);
-        model.createTables(this);
-        model.createIndexes(this);
-
-        for (String[] scenario : scenarios())
-        {
-            assertThatThrownBy(() -> model.executeIndexed(this, String.format(scenario[2], scenario[1]))).isInstanceOf(InvalidQueryException.class).as(scenario[0]);
-        }
-    }
-
-    private List<String[]> scenarios()
-    {
-        List<String[]> scenarios = new LinkedList<>();
-
-        for (Pair<String, String> column : DataModel.NORMAL_COLUMNS)
-        {
-            scenarios.add(new String[]{ "Should reject LIKE query for " + column.left, column.left, "SELECT * FROM %%s WHERE %s LIKE 'foo%%%%'" });
-        }
-
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.ASCII_COLUMN, DataModel.ASCII_COLUMN, "SELECT * FROM %%s WHERE %s > 'foo'" });
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.ASCII_COLUMN, DataModel.ASCII_COLUMN, "SELECT * FROM %%s WHERE %s != 'foo'" });
-
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.TEXT_COLUMN, DataModel.TEXT_COLUMN, "SELECT * FROM %%s WHERE %s > 'foo'" });
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.TEXT_COLUMN, DataModel.TEXT_COLUMN, "SELECT * FROM %%s WHERE %s != 'foo'" });
-
-        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.TINYINT_COLUMN, DataModel.TINYINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
-        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.SMALLINT_COLUMN, DataModel.SMALLINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
-        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.INT_COLUMN, DataModel.INT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
-        scenarios.add(new String[] { "Should reject NEQ query for " + DataModel.BIGINT_COLUMN, DataModel.BIGINT_COLUMN, "SELECT * FROM %%s WHERE %s != 10" });
-
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.BOOLEAN_COLUMN, DataModel.BOOLEAN_COLUMN, "SELECT * FROM %%s WHERE %s > true" });
-
-        scenarios.add(new String[] { "Should reject range query for " + DataModel.UUID_COLUMN, DataModel.UUID_COLUMN, "SELECT * FROM %%s WHERE %s > e37394dc-d17b-11e8-a8d5-f2801f1b9fd1" });
-
-        return scenarios;
-    }
-}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
index 9ab0879bb119..add8c2103f1b 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
@@ -21,18 +21,14 @@
 package org.apache.cassandra.index.sai.cql;
 
 import java.util.Arrays;
-import java.util.LinkedList;
 import java.util.List;
-import java.util.UUID;
 import java.util.concurrent.TimeUnit;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 
 import com.google.common.base.MoreObjects;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
-import org.junit.Before;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
 
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.marshal.InetAddressType;
@@ -40,14 +36,11 @@
 import org.apache.cassandra.db.marshal.TimeType;
 import org.apache.cassandra.db.marshal.TimestampType;
 import org.apache.cassandra.db.marshal.UUIDType;
-import org.apache.cassandra.index.sai.SAITester;
 import org.apache.cassandra.index.sai.plan.StorageAttachedIndexSearcher;
-import org.apache.cassandra.inject.Injections;
 import org.apache.cassandra.utils.Pair;
 import org.hamcrest.Matchers;
 
 import static org.apache.cassandra.index.sai.cql.DataModel.INET_COLUMN;
-import static org.apache.cassandra.inject.InvokePointBuilder.newInvokePoint;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertThat;
 
@@ -63,201 +56,164 @@
  * 5.) ...queries across varying primary key and table structures.
  * 6.) ...queries across static, normal, and clustering column types.
  * 7.) ...queries across various paging and limit settings.
+ *
+ * IMPORTANT: This class is shared between the single-node SAITester based classes and the
+ * multi-node distributed classes. It must not reference SAITester or CQLTester directly
+ * to avoid static loading and initialisation.
  */
-@RunWith(Parameterized.class)
-public abstract class IndexQuerySupport extends SAITester
+public class IndexQuerySupport
 {
-    static List<BaseQuerySet> BASE_QUERY_SETS = ImmutableList.of(new BaseQuerySet(10, 5),
-                                                                 new BaseQuerySet(10, 9),
-                                                                 new BaseQuerySet(10, 10),
-                                                                 new BaseQuerySet(10, Integer.MAX_VALUE),
-                                                                 new BaseQuerySet(24, 10),
-                                                                 new BaseQuerySet(24, 100),
-                                                                 new BaseQuerySet(24, Integer.MAX_VALUE));
-
-    static List<BaseQuerySet> COMPOSITE_PARTITION_QUERY_SETS = ImmutableList.of(new CompositePartitionQuerySet(10, 5),
-                                                                                new CompositePartitionQuerySet(10, 10),
-                                                                                new CompositePartitionQuerySet(10, Integer.MAX_VALUE),
-                                                                                new CompositePartitionQuerySet(24, 10),
-                                                                                new CompositePartitionQuerySet(24, 100),
-                                                                                new CompositePartitionQuerySet(24, Integer.MAX_VALUE));
-
-    static List<BaseQuerySet> STATIC_QUERY_SETS = ImmutableList.of(new StaticColumnQuerySet(10, 5),
-                                                                   new StaticColumnQuerySet(10, 10),
-                                                                   new StaticColumnQuerySet(10, Integer.MAX_VALUE),
-                                                                   new StaticColumnQuerySet(24, 10),
-                                                                   new StaticColumnQuerySet(24, 100),
-                                                                   new StaticColumnQuerySet(24, Integer.MAX_VALUE));
-
-    static final Injections.Counter INDEX_QUERY_COUNTER = Injections.newCounter("IndexQueryCounter")
-                                                                    .add(newInvokePoint().onClass(StorageAttachedIndexSearcher.class).onMethod("search"))
-                                                                    .build();
-
-    @Parameterized.Parameter(0)
-    public DataModel dataModel;
-    @Parameterized.Parameter(1)
-    public List<BaseQuerySet> sets;
-
-    @Before
-    public void setup() throws Throwable
-    {
-        requireNetwork();
-
-        Injections.inject(INDEX_QUERY_COUNTER);
-    }
-
-    protected void writeLifecycle() throws Throwable
+    public static List<BaseQuerySet> BASE_QUERY_SETS = ImmutableList.of(new BaseQuerySet(10, 5),
+                                                                        new BaseQuerySet(10, 9),
+                                                                        new BaseQuerySet(10, 10),
+                                                                        new BaseQuerySet(10, Integer.MAX_VALUE),
+                                                                        new BaseQuerySet(24, 10),
+                                                                        new BaseQuerySet(24, 100),
+                                                                        new BaseQuerySet(24, Integer.MAX_VALUE));
+
+    public static List<BaseQuerySet> COMPOSITE_PARTITION_QUERY_SETS = ImmutableList.of(new CompositePartitionQuerySet(10, 5),
+                                                                                       new CompositePartitionQuerySet(10, 10),
+                                                                                       new CompositePartitionQuerySet(10, Integer.MAX_VALUE),
+                                                                                       new CompositePartitionQuerySet(24, 10),
+                                                                                       new CompositePartitionQuerySet(24, 100),
+                                                                                       new CompositePartitionQuerySet(24, Integer.MAX_VALUE));
+
+    public static List<BaseQuerySet> STATIC_QUERY_SETS = ImmutableList.of(new StaticColumnQuerySet(10, 5),
+                                                                          new StaticColumnQuerySet(10, 10),
+                                                                          new StaticColumnQuerySet(10, Integer.MAX_VALUE),
+                                                                          new StaticColumnQuerySet(24, 10),
+                                                                          new StaticColumnQuerySet(24, 100),
+                                                                          new StaticColumnQuerySet(24, Integer.MAX_VALUE));
+
+    public static void writeLifecycle(DataModel.Executor executor, DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
     {
-        dataModel.createTables(this);
+        dataModel.createTables(executor);
 
-        dataModel.disableCompaction(this);
+        dataModel.disableCompaction(executor);
 
-        dataModel.createIndexes(this);
+        dataModel.createIndexes(executor);
 
         // queries against Memtable adjacent in-memory indexes
-        dataModel.insertRows(this);
-        executeQueries(dataModel, sets);
+        dataModel.insertRows(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries with Memtable flushed to SSTable on disk
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries across memory and disk indexes
-        dataModel.insertRows(this);
-        executeQueries(dataModel, sets);
+        dataModel.insertRows(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries w/ multiple SSTable indexes
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries after compacting to a single SSTable index
-        dataModel.compact(this);
-        executeQueries(dataModel, sets);
+        dataModel.compact(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries against Memtable updates and the existing SSTable index
-        dataModel.updateCells(this);
-        executeQueries(dataModel, sets);
+        dataModel.updateCells(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries against the newly flushed SSTable index and the existing SSTable index
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries after compacting updates into to a single SSTable index
-        dataModel.compact(this);
-        executeQueries(dataModel, sets);
+        dataModel.compact(executor);
+        executeQueries(dataModel, executor, sets);
     }
 
-    public void rowDeletions() throws Throwable
+    public static void rowDeletions(DataModel.Executor executor, DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
     {
-        dataModel.createTables(this);
+        dataModel.createTables(executor);
 
-        dataModel.disableCompaction(this);
+        dataModel.disableCompaction(executor);
 
-        dataModel.createIndexes(this);
-        dataModel.insertRows(this);
-        dataModel.flush(this);
-        dataModel.compact(this);
+        dataModel.createIndexes(executor);
+        dataModel.insertRows(executor);
+        dataModel.flush(executor);
+        dataModel.compact(executor);
 
         // baseline queries
-        executeQueries(dataModel, sets);
+        executeQueries(dataModel, executor, sets);
 
         // queries against Memtable deletes and the existing SSTable index
-        dataModel.deleteRows(this);
-        executeQueries(dataModel, sets);
+        dataModel.deleteRows(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries against the newly flushed SSTable index and the existing SSTable index
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries after compacting deletes into to a single SSTable index
-        dataModel.compact(this);
-        executeQueries(dataModel, sets);
+        dataModel.compact(executor);
+        executeQueries(dataModel, executor, sets);
 
         // truncate, reload, and verify that the load is clean
-        dataModel.truncateTables(this);
-        dataModel.insertRows(this);
-        executeQueries(dataModel, sets);
+        dataModel.truncateTables(executor);
+        dataModel.insertRows(executor);
+        executeQueries(dataModel, executor, sets);
     }
 
-    public void cellDeletions() throws Throwable
+    public static void cellDeletions(DataModel.Executor executor, DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
     {
-        dataModel.createTables(this);
+        dataModel.createTables(executor);
 
-        dataModel.disableCompaction(this);
+        dataModel.disableCompaction(executor);
 
-        dataModel.createIndexes(this);
-        dataModel.insertRows(this);
-        dataModel.flush(this);
-        dataModel.compact(this);
+        dataModel.createIndexes(executor);
+        dataModel.insertRows(executor);
+        dataModel.flush(executor);
+        dataModel.compact(executor);
 
         // baseline queries
-        executeQueries(dataModel, sets);
+        executeQueries(dataModel, executor, sets);
 
         // queries against Memtable deletes and the existing SSTable index
-        dataModel.deleteCells(this);
-        executeQueries(dataModel, sets);
+        dataModel.deleteCells(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries against the newly flushed SSTable index and the existing SSTable index
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // queries after compacting deletes into to a single SSTable index
-        dataModel.compact(this);
-        executeQueries(dataModel, sets);
+        dataModel.compact(executor);
+        executeQueries(dataModel, executor, sets);
     }
 
-    public void timeToLive() throws Throwable
+    public static void timeToLive(DataModel.Executor executor, DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
     {
-        dataModel.createTables(this);
+        dataModel.createTables(executor);
 
-        dataModel.disableCompaction(this);
+        dataModel.disableCompaction(executor);
 
-        dataModel.createIndexes(this);
-        dataModel.insertRowsWithTTL(this);
+        dataModel.createIndexes(executor);
+        dataModel.insertRowsWithTTL(executor);
 
         // Wait for the TTL to become effective:
         TimeUnit.SECONDS.sleep(DataModel.DEFAULT_TTL_SECONDS);
 
         // Make sure TTLs are reflected in our query results from the Memtable:
-        executeQueries(dataModel, sets);
+        executeQueries(dataModel, executor, sets);
 
         // Make sure TTLs are reflected in our query results from SSTables:
-        dataModel.flush(this);
-        executeQueries(dataModel, sets);
+        dataModel.flush(executor);
+        executeQueries(dataModel, executor, sets);
 
         // Make sure fresh overwrites invalidate TTLs:
-        dataModel.insertRows(this);
-        executeQueries(dataModel, sets);
-    }
-
-    @SuppressWarnings("unused")
-    @Parameterized.Parameters(name = "{0}")
-    public static List<Object[]> params() throws Throwable
-    {
-        List<Object[]> scenarios = new LinkedList<>();
-
-        scenarios.add(new Object[]{ new DataModel.BaseDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA), BASE_QUERY_SETS });
-
-        scenarios.add(new Object[]{ new DataModel.CompoundKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA), BASE_QUERY_SETS });
-
-        scenarios.add(new Object[]{ new DataModel.CompoundKeyWithStaticsDataModel(DataModel.STATIC_COLUMNS, DataModel.STATIC_COLUMN_DATA), STATIC_QUERY_SETS });
-
-        scenarios.add(new Object[]{ new DataModel.CompositePartitionKeyDataModel(DataModel.NORMAL_COLUMNS, DataModel.NORMAL_COLUMN_DATA),
-                                    ImmutableList.builder().addAll(BASE_QUERY_SETS).addAll(COMPOSITE_PARTITION_QUERY_SETS).build()});
-
-        return scenarios;
-    }
-
-    static String randomPostfix()
-    {
-        return UUID.randomUUID().toString().replace("-", "_");
+        dataModel.insertRows(executor);
+        executeQueries(dataModel, executor, sets);
     }
 
-    private void executeQueries(DataModel dataModel, List<BaseQuerySet> sets) throws Throwable
+    private static void executeQueries(DataModel dataModel, DataModel.Executor executor, List<BaseQuerySet> sets) throws Throwable
     {
         for (BaseQuerySet set : sets)
         {
-            set.execute(this, dataModel);
+            set.execute(executor, dataModel);
         }
     }
 
@@ -268,7 +224,7 @@ static class StaticColumnQuerySet extends BaseQuerySet
             super(limit, fetchSize);
         }
 
-        public void execute(SAITester tester, DataModel model) throws Throwable
+        public void execute(DataModel.Executor tester, DataModel model) throws Throwable
         {
             super.execute(tester, model);
 
@@ -292,7 +248,7 @@ static class CompositePartitionQuerySet extends BaseQuerySet
             super(limit, fetchSize);
         }
 
-        public void execute(SAITester tester, DataModel model) throws Throwable
+        public void execute(DataModel.Executor tester, DataModel model) throws Throwable
         {
             super.execute(tester, model);
 
@@ -325,7 +281,7 @@ public void execute(SAITester tester, DataModel model) throws Throwable
         }
     }
 
-    private static class BaseQuerySet
+    public static class BaseQuerySet
     {
         final int limit;
         final int fetchSize;
@@ -336,7 +292,7 @@ private static class BaseQuerySet
             this.fetchSize = fetchSize;
         }
 
-        void execute(SAITester tester, DataModel model) throws Throwable
+        void execute(DataModel.Executor tester, DataModel model) throws Throwable
         {
             query(tester, model, DataModel.ASCII_COLUMN, Operator.EQ, "MA");
             query(tester, model, DataModel.ASCII_COLUMN, Operator.EQ, "LA");
@@ -550,14 +506,14 @@ void execute(SAITester tester, DataModel model) throws Throwable
             }
         }
 
-        void query(SAITester tester, DataModel model, String column, Operator operator, Object value) throws Throwable
+        void query(DataModel.Executor tester, DataModel model, String column, Operator operator, Object value) throws Throwable
         {
             String query = String.format(DataModel.SIMPLE_SELECT_TEMPLATE, DataModel.ASCII_COLUMN, column, operator);
             String queryValidator = String.format(DataModel.SIMPLE_SELECT_WITH_FILTERING_TEMPLATE, DataModel.ASCII_COLUMN, column, operator);
             validate(tester, model, query, queryValidator, value, limit);
         }
 
-        void andQuery(SAITester tester, DataModel model,
+        void andQuery(DataModel.Executor tester, DataModel model,
                       String column1, Operator operator1, Object value1,
                       String column2, Operator operator2, Object value2,
                       boolean filtering) throws Throwable
@@ -571,7 +527,7 @@ void andQuery(SAITester tester, DataModel model,
             validate(tester, model,query, queryValidator, value1, value2, limit);
         }
 
-        void andQuery(SAITester tester, DataModel model,
+        void andQuery(DataModel.Executor tester, DataModel model,
                       String column1, Operator operator1, Object value1,
                       String column2, Operator operator2, Object value2,
                       String column3, Operator operator3, Object value3) throws Throwable
@@ -586,7 +542,7 @@ void andQuery(SAITester tester, DataModel model,
             validate(tester, model, query, queryValidator, value1, value2, value3, limit);
         }
 
-        void rangeQuery(SAITester tester, DataModel model, String column, Object value1, Object value2) throws Throwable
+        void rangeQuery(DataModel.Executor tester, DataModel model, String column, Object value1, Object value2) throws Throwable
         {
             String template = "SELECT %s FROM %%s WHERE %s > ? AND %s < ? LIMIT ?";
             String templateWithFiltering = "SELECT %s FROM %%s WHERE %s > ? AND %s < ? LIMIT ? ALLOW FILTERING";
@@ -596,19 +552,19 @@ void rangeQuery(SAITester tester, DataModel model, String column, Object value1,
             validate(tester, model, query, queryValidator, value1, value2, limit);
         }
 
-        private List<Object> validate(SAITester tester, DataModel model, String query, String validator, Object... values) throws Throwable
+        private List<Object> validate(DataModel.Executor tester, DataModel model, String query, String validator, Object... values) throws Throwable
         {
             try
             {
-                INDEX_QUERY_COUNTER.reset();
+                tester.counterReset();
 
-                List<Object> actual = model.executeIndexed(tester, query, fetchSize, values).all().stream().map(r -> r.getObject(0)).collect(Collectors.toList());
+                List<Object> actual = model.executeIndexed(tester, query, fetchSize, values);
 
                 // This could be more strict, but it serves as a reasonable paging-aware lower bound:
                 int pageCount = (int) Math.ceil(actual.size() / (double) Math.min(actual.size(), fetchSize));
-                assertThat("Expected more calls to " + StorageAttachedIndexSearcher.class, INDEX_QUERY_COUNTER.get(), Matchers.greaterThanOrEqualTo((long) Math.max(1, pageCount)));
+                assertThat("Expected more calls to " + StorageAttachedIndexSearcher.class, tester.getCounter(), Matchers.greaterThanOrEqualTo((long) Math.max(1, pageCount)));
 
-                List<Object> expected = model.executeNonIndexed(tester, validator, fetchSize, values).all().stream().map(r -> r.getObject(0)).collect(Collectors.toList());
+                List<Object> expected = model.executeNonIndexed(tester, validator, fetchSize, values);
 
                 assertEquals(expected, actual);
 
@@ -616,8 +572,7 @@ private List<Object> validate(SAITester tester, DataModel model, String query, S
             }
             catch (Throwable ex)
             {
-                // When thrown here, AssertionError does not seem to produce a stack trace, so it's logged explicitly:
-                logger.error("Validation failed while executing query: " + query + ", exception message: " + ex.getMessage(), ex);
+                ex.printStackTrace();
                 throw ex;
             }
         }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
index 41a35a228e8a..6b423580e3bd 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryCellDeletionsTest.java
@@ -19,11 +19,11 @@
 
 import org.junit.Test;
 
-public class QueryCellDeletionsTest extends IndexQuerySupport
+public class QueryCellDeletionsTest extends AbstractQueryTester
 {
     @Test
     public void testCellDeletions() throws Throwable
     {
-        cellDeletions();
+        IndexQuerySupport.cellDeletions(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
index 3099c2c7fafd..a5a843d99e4d 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryRowDeletionsTest.java
@@ -19,11 +19,11 @@
 
 import org.junit.Test;
 
-public class QueryRowDeletionsTest extends IndexQuerySupport
+public class QueryRowDeletionsTest extends AbstractQueryTester
 {
     @Test
     public void testRowDeletions() throws Throwable
     {
-        rowDeletions();
+        IndexQuerySupport.rowDeletions(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
index f37db42b0038..39a77b496c48 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryTimeToLiveTest.java
@@ -19,11 +19,11 @@
 
 import org.junit.Test;
 
-public class QueryTimeToLiveTest extends IndexQuerySupport
+public class QueryTimeToLiveTest extends AbstractQueryTester
 {
     @Test
     public void testTimeToLive() throws Throwable
     {
-        timeToLive();
+        IndexQuerySupport.timeToLive(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
index bc58a8370209..a41ff47f5f86 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/QueryWriteLifecycleTest.java
@@ -19,12 +19,11 @@
 
 import org.junit.Test;
 
-
-public class QueryWriteLifecycleTest extends IndexQuerySupport
+public class QueryWriteLifecycleTest extends AbstractQueryTester
 {
     @Test
     public void testWriteLifecycle() throws Throwable
     {
-        writeLifecycle();
+        IndexQuerySupport.writeLifecycle(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java b/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java
new file mode 100644
index 000000000000..111b0794f748
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.SimpleStatement;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.inject.Injections;
+
+public class SingleNodeExecutor implements DataModel.Executor
+{
+    private final SAITester tester;
+    private final Injections.Counter counter;
+
+    public SingleNodeExecutor(SAITester tester, Injections.Counter counter)
+    {
+        this.tester = tester;
+        this.counter = counter;
+    }
+
+    @Override
+    public void createTable(String statement)
+    {
+        tester.createTable(statement);
+    }
+
+    @Override
+    public void flush(String keyspace, String table)
+    {
+        tester.flush(keyspace, table);
+    }
+
+    @Override
+    public void compact(String keyspace, String table)
+    {
+        tester.compact(keyspace, table);
+    }
+
+    @Override
+    public void disableCompaction(String keyspace, String table)
+    {
+        tester.disableCompaction(keyspace, table);
+    }
+
+    @Override
+    public void waitForIndexQueryable(String keyspace, String table)
+    {
+        tester.waitForIndexQueryable(keyspace, table);
+    }
+
+    @Override
+    public void executeLocal(String query, Object... values) throws Throwable
+    {
+        tester.executeFormattedQuery(query, values);
+    }
+
+    @Override
+    public List<Object> executeRemote(String query, int fetchSize, Object... values)
+    {
+        SimpleStatement statement = new SimpleStatement(query, values);
+        statement.setFetchSize(fetchSize);
+        return tester.sessionNet().execute(statement).all().stream().map(r -> r.getObject(0)).collect(Collectors.toList());
+    }
+
+    @Override
+    public void counterReset()
+    {
+        counter.reset();
+    }
+
+    @Override
+    public long getCounter()
+    {
+        return counter.get();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
index 908c8617ae3b..d9c364d63b3e 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryCellDeletionsTest.java
@@ -25,7 +25,7 @@
 /**
  * Force generates segments due to a small RAM size on compaction, to test segment splitting
  */
-public class TinySegmentQueryCellDeletionsTest extends IndexQuerySupport
+public class TinySegmentQueryCellDeletionsTest extends AbstractQueryTester
 {
     @Before
     public void setSegmentWriteBufferSpace() throws Throwable
@@ -36,6 +36,6 @@ public void setSegmentWriteBufferSpace() throws Throwable
     @Test
     public void testCellDeletions() throws Throwable
     {
-        cellDeletions();
+        IndexQuerySupport.cellDeletions(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
index 0ac830494c14..303f299bc168 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryRowDeletionsTest.java
@@ -25,7 +25,7 @@
 /**
  * Force generates segments due to a small RAM size on compaction, to test segment splitting
  */
-public class TinySegmentQueryRowDeletionsTest extends IndexQuerySupport
+public class TinySegmentQueryRowDeletionsTest extends AbstractQueryTester
 {
     @Before
     public void setSegmentWriteBufferSpace() throws Throwable
@@ -36,6 +36,6 @@ public void setSegmentWriteBufferSpace() throws Throwable
     @Test
     public void testRowDeletions() throws Throwable
     {
-        rowDeletions();
+        IndexQuerySupport.rowDeletions(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
index 0b1388de9158..dfa17f7aa788 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryTimeToLiveTest.java
@@ -25,7 +25,7 @@
 /**
  * Force generates segments due to a small RAM size on compaction, to test segment splitting
  */
-public class TinySegmentQueryTimeToLiveTest extends IndexQuerySupport
+public class TinySegmentQueryTimeToLiveTest extends AbstractQueryTester
 {
     @Before
     public void setSegmentWriteBufferSpace() throws Throwable
@@ -36,6 +36,6 @@ public void setSegmentWriteBufferSpace() throws Throwable
     @Test
     public void testTimeToLive() throws Throwable
     {
-        timeToLive();
+        IndexQuerySupport.timeToLive(executor, dataModel, sets);
     }
 }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
index 4999fafa51ef..778b70de8509 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/TinySegmentQueryWriteLifecycleTest.java
@@ -25,7 +25,7 @@
 /**
  * Force generates segments due to a small RAM size on compaction, to test segment splitting
  */
-public class TinySegmentQueryWriteLifecycleTest extends IndexQuerySupport
+public class TinySegmentQueryWriteLifecycleTest extends AbstractQueryTester
 {
     @Before
     public void setSegmentWriteBufferSpace() throws Throwable
@@ -36,6 +36,6 @@ public void setSegmentWriteBufferSpace() throws Throwable
     @Test
     public void testWriteLifecycle() throws Throwable
     {
-        writeLifecycle();
+        IndexQuerySupport.writeLifecycle(executor, dataModel, sets);
     }
 }

From d713b7e659feffb53c0f6e6420f76c29458b6b04 Mon Sep 17 00:00:00 2001
From: Aleksandr Sorokoumov <918393+Gerrrr@users.noreply.github.com>
Date: Tue, 20 Apr 2021 18:10:05 +0200
Subject: [PATCH 056/151] STAR-392: Reuse compaction task ID in compaction
 history (#93)

Co-authored-by: Brandon Williams <brandon@datastax.com>
(cherry picked from commit 10cc26d4a39d5e7f73d1bda644d1399f8d8e1edd)
(cherry picked from commit 3c272015bd1b0ddb9c8bd822fa500d8f99529527)
---
 src/java/org/apache/cassandra/db/SystemKeyspace.java        | 5 +++--
 .../org/apache/cassandra/db/compaction/CompactionTask.java  | 6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java
index 3b89ae35dfac..abc9361862ef 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspace.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java
@@ -501,7 +501,8 @@ public static void persistLocalMetadata()
                             DatabaseDescriptor.getStoragePort());
     }
 
-    public static void updateCompactionHistory(String ksname,
+    public static void updateCompactionHistory(UUID id,
+                                               String ksname,
                                                String cfname,
                                                long compactedAt,
                                                long bytesIn,
@@ -513,7 +514,7 @@ public static void updateCompactionHistory(String ksname,
             return;
         String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)";
         executeInternal(format(req, COMPACTION_HISTORY),
-                        UUIDGen.getTimeUUID(),
+                        id,
                         ksname,
                         cfname,
                         ByteBufferUtil.bytes(compactedAt),
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index d0ffc64b504b..ea128fe4b676 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -239,7 +239,7 @@ public boolean apply(SSTableReader sstable)
                 for (int i = 0; i < mergedRowCounts.length; i++)
                     totalSourceRows += mergedRowCounts[i] * (i + 1);
 
-                String mergeSummary = updateCompactionHistory(cfs.keyspace.getName(), cfs.getTableName(), mergedRowCounts, startsize, endsize);
+                String mergeSummary = updateCompactionHistory(taskId, cfs.keyspace.getName(), cfs.getTableName(), mergedRowCounts, startsize, endsize);
 
                 logger.info(String.format("Compacted (%s) %d sstables to [%s] to level=%d.  %s to %s (~%d%% of original) in %,dms.  Read Throughput = %s, Write Throughput = %s, Row Throughput = ~%,d/s.  %,d total partitions merged to %,d.  Partition merge counts were {%s}",
                                            taskId,
@@ -278,7 +278,7 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
         return new DefaultCompactionWriter(cfs, directories, transaction, nonExpiredSSTables, keepOriginals, getLevel());
     }
 
-    public static String updateCompactionHistory(String keyspaceName, String columnFamilyName, long[] mergedRowCounts, long startSize, long endSize)
+    public static String updateCompactionHistory(UUID id, String keyspaceName, String columnFamilyName, long[] mergedRowCounts, long startSize, long endSize)
     {
         StringBuilder mergeSummary = new StringBuilder(mergedRowCounts.length * 10);
         Map<Integer, Long> mergedRows = new HashMap<>();
@@ -292,7 +292,7 @@ public static String updateCompactionHistory(String keyspaceName, String columnF
             mergeSummary.append(String.format("%d:%d, ", rows, count));
             mergedRows.put(rows, count);
         }
-        SystemKeyspace.updateCompactionHistory(keyspaceName, columnFamilyName, System.currentTimeMillis(), startSize, endSize, mergedRows);
+        SystemKeyspace.updateCompactionHistory(id, keyspaceName, columnFamilyName, System.currentTimeMillis(), startSize, endSize, mergedRows);
         return mergeSummary.toString();
     }
 

From 28bd750b5d6084689f5edad48b2a767a1398809a Mon Sep 17 00:00:00 2001
From: Aleksandr Sorokoumov <918393+Gerrrr@users.noreply.github.com>
Date: Wed, 21 Apr 2021 11:24:08 +0200
Subject: [PATCH 057/151] STAR-391: Reduce garbage when debug/trace is off in
 CompactionTask (#94)

Co-authored-by: nitsanw <nitsanw@yahoo.com>
(cherry picked from commit 03179457fd826cfdb35787dae3ef374be3700d13)
(cherry picked from commit 9face890cf4c33545ace09bee1c7f7620ab6c6f3)
---
 .../db/compaction/CompactionIterator.java     |  18 +-
 .../db/compaction/CompactionTask.java         | 232 ++++++++++++------
 2 files changed, 164 insertions(+), 86 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
index 873fca0547e9..7cd2a1b17ecd 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
@@ -71,11 +71,11 @@ public class CompactionIterator extends CompactionInfo.Holder implements Unfilte
     private long totalSourceCQLRows;
 
     /*
-     * counters for merged rows.
+     * counters for merged rows frequency(AKA histogram).
      * array index represents (number of merged rows - 1), so index 0 is counter for no merge (1 row),
      * index 1 is counter for 2 rows merged, and so on.
      */
-    private final long[] mergeCounters;
+    private final long[] mergedRowsHistogram;
 
     private final UnfilteredPartitionIterator compacted;
     private final ActiveCompactionsTracker activeCompactions;
@@ -99,7 +99,7 @@ public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, Ab
         for (ISSTableScanner scanner : scanners)
             bytes += scanner.getLengthInBytes();
         this.totalBytes = bytes;
-        this.mergeCounters = new long[scanners.size()];
+        this.mergedRowsHistogram = new long[scanners.size()];
         // note that we leak `this` from the constructor when calling beginCompaction below, this means we have to get the sstables before
         // calling that to avoid a NPE.
         sstables = scanners.stream().map(ISSTableScanner::getBackingSSTables).flatMap(Collection::stream).collect(ImmutableSet.toImmutableSet());
@@ -135,15 +135,15 @@ public boolean isGlobal()
         return false;
     }
 
-    private void updateCounterFor(int rows)
+    private void incMergedRowsHistogram(int rows)
     {
-        assert rows > 0 && rows - 1 < mergeCounters.length;
-        mergeCounters[rows - 1] += 1;
+        assert rows > 0 && rows - 1 < mergedRowsHistogram.length;
+        mergedRowsHistogram[rows - 1] += 1;
     }
 
-    public long[] getMergedRowCounts()
+    public long[] getMergedRowsHistogram()
     {
-        return mergeCounters;
+        return mergedRowsHistogram;
     }
 
     public long getTotalSourceCQLRows()
@@ -168,7 +168,7 @@ public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey par
 
                 assert merged > 0;
 
-                CompactionIterator.this.updateCounterFor(merged);
+                CompactionIterator.this.incMergedRowsHistogram(merged);
 
                 if (type != OperationType.COMPACTION || !controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION))
                     return null;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index ea128fe4b676..b1c0869e190f 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -24,8 +24,8 @@
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicLong;
 
-import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.RateLimiter;
@@ -40,18 +40,24 @@
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.db.compaction.writers.DefaultCompactionWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Refs;
 
+import static org.apache.cassandra.db.compaction.CompactionManager.compactionRateLimiterAcquire;
+import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory;
+import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemoryPerSecond;
+
 public class CompactionTask extends AbstractCompactionTask
 {
     protected static final Logger logger = LoggerFactory.getLogger(CompactionTask.class);
     protected final int gcBefore;
     protected final boolean keepOriginals;
-    protected static long totalBytesCompacted = 0;
+    /** for trace logging purposes only */
+    private static final AtomicLong totalBytesCompacted = new AtomicLong();
     private ActiveCompactionsTracker activeCompactions;
 
     public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
@@ -66,11 +72,12 @@ public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBef
         this.keepOriginals = keepOriginals;
     }
 
-    public static synchronized long addToTotalBytesCompacted(long bytesCompacted)
+    private static long addToTotalBytesCompacted(long bytesCompacted)
     {
-        return totalBytesCompacted += bytesCompacted;
+        return totalBytesCompacted.addAndGet(bytesCompacted);
     }
 
+    @Override
     protected int executeInternal(ActiveCompactionsTracker activeCompactions)
     {
         this.activeCompactions = activeCompactions == null ? ActiveCompactionsTracker.NOOP : activeCompactions;
@@ -78,7 +85,7 @@ protected int executeInternal(ActiveCompactionsTracker activeCompactions)
         return transaction.originals().size();
     }
 
-    public boolean reduceScopeForLimitedSpace(Set<SSTableReader> nonExpiredSSTables, long expectedSize)
+    private boolean reduceScopeForLimitedSpace(Set<SSTableReader> nonExpiredSSTables, long expectedSize)
     {
         if (partialCompactionsAcceptable() && transaction.originals().size() > 1)
         {
@@ -101,6 +108,7 @@ public boolean reduceScopeForLimitedSpace(Set<SSTableReader> nonExpiredSSTables,
      * which are properly serialized.
      * Caller is in charge of marking/unmarking the sstables as compacting.
      */
+    @Override
     protected void runMayThrow() throws Exception
     {
         // The collection of sstables passed may be empty (but not null); even if
@@ -126,28 +134,17 @@ protected void runMayThrow() throws Exception
             buildCompactionCandidatesForAvailableDiskSpace(fullyExpiredSSTables);
 
             // sanity check: all sstables must belong to the same cfs
-            assert !Iterables.any(transaction.originals(), new Predicate<SSTableReader>()
-            {
-                @Override
-                public boolean apply(SSTableReader sstable)
-                {
-                    return !sstable.descriptor.cfname.equals(cfs.name);
-                }
-            });
+            assert !Iterables.any(transaction.originals(), sstable -> !sstable.descriptor.cfname.equals(cfs.name));
 
             UUID taskId = transaction.opId();
 
             // new sstables from flush can be added during a compaction, but only the compaction can remove them,
             // so in our single-threaded compaction world this is a valid way of determining if we're compacting
             // all the sstables (that existed when we started)
-            StringBuilder ssTableLoggerMsg = new StringBuilder("[");
-            for (SSTableReader sstr : transaction.originals())
+            if (logger.isDebugEnabled())
             {
-                ssTableLoggerMsg.append(String.format("%s:level=%d, ", sstr.getFilename(), sstr.getSSTableLevel()));
+                debugLogCompactingMessage(taskId);
             }
-            ssTableLoggerMsg.append("]");
-
-            logger.info("Compacting ({}) {}", taskId, ssTableLoggerMsg);
 
             RateLimiter limiter = CompactionManager.instance.getRateLimiter();
             long start = System.nanoTime();
@@ -159,14 +156,16 @@ public boolean apply(SSTableReader sstable)
             Set<SSTableReader> actuallyCompact = Sets.difference(transaction.originals(), fullyExpiredSSTables);
             Collection<SSTableReader> newSStables;
 
-            long[] mergedRowCounts;
+            long[] mergedRowsHistogram;
             long totalSourceCQLRows;
 
             // SSTableScanners need to be closed before markCompactedSSTablesReplaced call as scanners contain references
-            // to both ifile and dfile and SSTR will throw deletion errors on Windows if it tries to delete before scanner is closed.
+            // to both ifile and dfile and SSTR will throw deletion errors on Windows if it tries to delete before
+            // scanner is closed.
             // See CASSANDRA-8019 and CASSANDRA-8399
             int nowInSec = FBUtilities.nowInSeconds();
-            try (Refs<SSTableReader> refs = Refs.ref(actuallyCompact);
+
+            try (Refs<SSTableReader> ignored = Refs.ref(actuallyCompact);
                  AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(actuallyCompact);
                  CompactionIterator ci = new CompactionIterator(compactionType, scanners.scanners, controller, nowInSec, taskId))
             {
@@ -179,7 +178,8 @@ public boolean apply(SSTableReader sstable)
                 long lastBytesScanned = 0;
 
                 activeCompactions.beginCompaction(ci);
-                try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, getDirectories(), transaction, actuallyCompact))
+                Directories dirs = getDirectories();
+                try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, dirs, transaction, actuallyCompact))
                 {
                     // Note that we need to re-check this flag after calling beginCompaction above to avoid a window
                     // where the compaction does not exist in activeCompactions but the CSM gets paused.
@@ -190,14 +190,15 @@ public boolean apply(SSTableReader sstable)
                     estimatedKeys = writer.estimatedKeys();
                     while (ci.hasNext())
                     {
-                        if (writer.append(ci.next()))
+                        UnfilteredRowIterator partition = ci.next();
+                        if (writer.append(partition))
                             totalKeysWritten++;
 
 
                         long bytesScanned = scanners.getTotalBytesScanned();
 
                         // Rate limit the scanners, and account for compression
-                        if (CompactionManager.compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                        if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
                             lastBytesScanned = bytesScanned;
 
                         if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
@@ -213,7 +214,7 @@ public boolean apply(SSTableReader sstable)
                 finally
                 {
                     activeCompactions.finishCompaction(ci);
-                    mergedRowCounts = ci.getMergedRowCounts();
+                    mergedRowsHistogram = ci.getMergedRowsHistogram();
                     totalSourceCQLRows = ci.getTotalSourceCQLRows();
                 }
             }
@@ -226,42 +227,32 @@ public boolean apply(SSTableReader sstable)
             {
                 // log a bunch of statistics about the result and save to system table compaction_history
 
-                long durationInNano = System.nanoTime() - start;
-                long dTime = TimeUnit.NANOSECONDS.toMillis(durationInNano);
-                long startsize = inputSizeBytes;
                 long endsize = SSTableReader.getTotalBytes(newSStables);
-                double ratio = (double) endsize / (double) startsize;
-
-                StringBuilder newSSTableNames = new StringBuilder();
-                for (SSTableReader reader : newSStables)
-                    newSSTableNames.append(reader.descriptor.baseFilename()).append(",");
-                long totalSourceRows = 0;
-                for (int i = 0; i < mergedRowCounts.length; i++)
-                    totalSourceRows += mergedRowCounts[i] * (i + 1);
-
-                String mergeSummary = updateCompactionHistory(taskId, cfs.keyspace.getName(), cfs.getTableName(), mergedRowCounts, startsize, endsize);
-
-                logger.info(String.format("Compacted (%s) %d sstables to [%s] to level=%d.  %s to %s (~%d%% of original) in %,dms.  Read Throughput = %s, Write Throughput = %s, Row Throughput = ~%,d/s.  %,d total partitions merged to %,d.  Partition merge counts were {%s}",
-                                           taskId,
-                                           transaction.originals().size(),
-                                           newSSTableNames.toString(),
-                                           getLevel(),
-                                           FBUtilities.prettyPrintMemory(startsize),
-                                           FBUtilities.prettyPrintMemory(endsize),
-                                           (int) (ratio * 100),
-                                           dTime,
-                                           FBUtilities.prettyPrintMemoryPerSecond(startsize, durationInNano),
-                                           FBUtilities.prettyPrintMemoryPerSecond(endsize, durationInNano),
-                                           (int) totalSourceCQLRows / (TimeUnit.NANOSECONDS.toSeconds(durationInNano) + 1),
-                                           totalSourceRows,
-                                           totalKeysWritten,
-                                           mergeSummary));
+
+                updateCompactionHistory(taskId,
+                                        cfs.keyspace.getName(),
+                                        cfs.getTableName(),
+                                        mergedRowsHistogram,
+                                        inputSizeBytes,
+                                        endsize);
+
+                if (logger.isDebugEnabled())
+                {
+                    debugLogCompactionSummaryInfo(taskId,
+                                                  start,
+                                                  totalKeysWritten,
+                                                  inputSizeBytes,
+                                                  newSStables,
+                                                  mergedRowsHistogram,
+                                                  (int) totalSourceCQLRows,
+                                                  endsize);
+                }
                 if (logger.isTraceEnabled())
                 {
-                    logger.trace("CF Total Bytes Compacted: {}", FBUtilities.prettyPrintMemory(CompactionTask.addToTotalBytesCompacted(endsize)));
-                    logger.trace("Actual #keys: {}, Estimated #keys:{}, Err%: {}", totalKeysWritten, estimatedKeys, ((double)(totalKeysWritten - estimatedKeys)/totalKeysWritten));
+                    traceLogCompactionSummaryInfo(totalKeysWritten, estimatedKeys, endsize);
                 }
-                cfs.getCompactionStrategyManager().compactionLogger.compaction(startTime, transaction.originals(), System.currentTimeMillis(), newSStables);
+                cfs.getCompactionStrategyManager().compactionLogger.
+                        compaction(startTime, transaction.originals(), System.currentTimeMillis(), newSStables);
 
                 // update the metrics
                 cfs.metric.compactionBytesWritten.inc(endsize);
@@ -278,24 +269,6 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
         return new DefaultCompactionWriter(cfs, directories, transaction, nonExpiredSSTables, keepOriginals, getLevel());
     }
 
-    public static String updateCompactionHistory(UUID id, String keyspaceName, String columnFamilyName, long[] mergedRowCounts, long startSize, long endSize)
-    {
-        StringBuilder mergeSummary = new StringBuilder(mergedRowCounts.length * 10);
-        Map<Integer, Long> mergedRows = new HashMap<>();
-        for (int i = 0; i < mergedRowCounts.length; i++)
-        {
-            long count = mergedRowCounts[i];
-            if (count == 0)
-                continue;
-
-            int rows = i + 1;
-            mergeSummary.append(String.format("%d:%d, ", rows, count));
-            mergedRows.put(rows, count);
-        }
-        SystemKeyspace.updateCompactionHistory(id, keyspaceName, columnFamilyName, System.currentTimeMillis(), startSize, endSize, mergedRows);
-        return mergeSummary.toString();
-    }
-
     protected Directories getDirectories()
     {
         return cfs.getDirectories();
@@ -345,7 +318,7 @@ public static boolean getIsTransient(Set<SSTableReader> sstables)
     }
 
 
-    /*
+    /**
      * Checks if we have enough disk space to execute the compaction.  Drops the largest sstable out of the Task until
      * there's enough space (in theory) to handle the compaction.  Does not take into account space that will be taken by
      * other compactions.
@@ -428,4 +401,109 @@ public static long getMaxDataAge(Collection<SSTableReader> sstables)
         }
         return max;
     }
+
+    private void debugLogCompactionSummaryInfo(UUID taskId,
+                                               long start,
+                                               long totalKeysWritten,
+                                               long inputSizeBytes,
+                                               Collection<SSTableReader> newSStables,
+                                               long[] mergedRowsHistogram,
+                                               int totalSourceCQLRows,
+                                               long outputSizeBytes)
+    {
+        // log a bunch of statistics about the result; this method only logs, it does not write to any table
+        long durationInNano = System.nanoTime() - start; // start is a System.nanoTime() reading
+        long dTime = TimeUnit.NANOSECONDS.toMillis(durationInNano);
+        double ratio = (double) outputSizeBytes / (double) inputSizeBytes;
+
+        long totalSourceRows = 0;
+        StringBuilder mergeSummary = new StringBuilder(mergedRowsHistogram.length * 10);
+        mergeSummary.append('{');
+        for (int i = 0; i < mergedRowsHistogram.length; i++)
+        {
+            long mergedRowCount = mergedRowsHistogram[i];
+            if (mergedRowCount != 0) // skip empty buckets
+            {
+                totalSourceRows += mergedRowCount * (i + 1); // bucket i is weighted as (i + 1) source rows
+                mergeSummary.append(i + 1).append(':').append(mergedRowCount).append(", "); // same 1-based key as above
+            }
+        }
+        mergeSummary.append('}');
+
+        StringBuilder newSSTableNames = new StringBuilder(newSStables.size() * 100);
+        for (SSTableReader reader : newSStables)
+            newSSTableNames.append(reader.descriptor.baseFilename()).append(",");
+        logger.debug("Compacted ({}) {} sstables to [{}] to level={}." +
+                     " {} to {} (~{}% of original) in {}ms." +
+                     " Read Throughput = {}, Write Throughput = {}, Row Throughput = ~{}/s." +
+                     " {} total partitions merged to {}." +
+                     " Partition merge counts were {}",
+                     taskId,
+                     transaction.originals().size(),
+                     newSSTableNames.toString(),
+                     getLevel(),
+                     prettyPrintMemory(inputSizeBytes),
+                     prettyPrintMemory(outputSizeBytes),
+                     (int) (ratio * 100),
+                     dTime,
+                     prettyPrintMemoryPerSecond(inputSizeBytes, durationInNano),
+                     prettyPrintMemoryPerSecond(outputSizeBytes, durationInNano),
+                     totalSourceCQLRows / (TimeUnit.NANOSECONDS.toSeconds(durationInNano) + 1), // +1 avoids division by zero
+                     totalSourceRows,
+                     totalKeysWritten,
+                     mergeSummary.toString());
+    }
+
+    private void debugLogCompactingMessage(UUID taskId)
+    {
+        Set<SSTableReader> originals = transaction.originals();
+        StringBuilder ssTableLoggerMsg = new StringBuilder(originals.size() * 100);
+        ssTableLoggerMsg.append("Compacting (").append(taskId).append(')').append(" [");
+        for (SSTableReader sstr : originals)
+        {
+            ssTableLoggerMsg.append(sstr.getFilename())
+                            .append(":level=")
+                            .append(sstr.getSSTableLevel())
+                            .append(", ");
+        }
+        ssTableLoggerMsg.append("]");
+
+        logger.debug(ssTableLoggerMsg.toString());
+    }
+
+
+    private static void updateCompactionHistory(UUID id,
+                                                String keyspaceName,
+                                                String columnFamilyName,
+                                                long[] mergedRowsHistogram,
+                                                long startSize,
+                                                long endSize)
+    {
+        Map<Integer, Long> mergedRows = new HashMap<>(mergedRowsHistogram.length);
+        for (int i = 0; i < mergedRowsHistogram.length; i++)
+        {
+            long count = mergedRowsHistogram[i];
+            if (count == 0)
+                continue;
+
+            int rows = i + 1;
+            mergedRows.put(rows, count);
+        }
+        SystemKeyspace.updateCompactionHistory(id,
+                                               keyspaceName,
+                                               columnFamilyName,
+                                               System.currentTimeMillis(),
+                                               startSize,
+                                               endSize,
+                                               mergedRows);
+    }
+
+    private void traceLogCompactionSummaryInfo(long totalKeysWritten, long estimatedKeys, long endsize)
+    {
+        logger.trace("CF Total Bytes Compacted: {}", prettyPrintMemory(addToTotalBytesCompacted(endsize)));
+        logger.trace("Actual #keys: {}, Estimated #keys:{}, Err%: {}",
+                     totalKeysWritten,
+                     estimatedKeys,
+                     ((double) (totalKeysWritten - estimatedKeys) / totalKeysWritten));
+    }
 }

From 5bd625c7691efd9b4e80b96597c80499659e9711 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Wed, 21 Apr 2021 14:46:36 +0200
Subject: [PATCH 058/151] STAR-452: Add EverywhereStrategy (#111)

* add EverywhereStrategy classes
Files copied from DSE 6.8.11.
* adjust ES to OSS Replication Strategy interface
* port DB-589 (APOLLO-589)
Only perform drop below RF check on decommission for
non-partitioned keyspaces

(cherry picked from commit 25609fc5dbbf9e2666611b06b389c2705110d6ad)
(cherry picked from commit 9fd515678a2d4a40353db928655bf6b00ba30b25)
---
 .../apache/cassandra/dht/RangeStreamer.java   |   8 +-
 .../locator/AbstractReplicationStrategy.java  |  16 ++
 .../cassandra/locator/EverywhereStrategy.java | 118 ++++++++++
 .../cassandra/locator/LocalStrategy.java      |   6 +
 .../cassandra/locator/TokenMetadata.java      |  13 ++
 .../org/apache/cassandra/schema/Schema.java   |  11 +
 .../cassandra/service/StorageService.java     |   2 +-
 .../locator/EverywhereStrategyTest.java       | 207 ++++++++++++++++++
 .../cassandra/locator/SimpleStrategyTest.java |  10 +
 9 files changed, 387 insertions(+), 4 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/locator/EverywhereStrategy.java
 create mode 100644 test/unit/org/apache/cassandra/locator/EverywhereStrategyTest.java

diff --git a/src/java/org/apache/cassandra/dht/RangeStreamer.java b/src/java/org/apache/cassandra/dht/RangeStreamer.java
index ebf0f0335012..26060769eb14 100644
--- a/src/java/org/apache/cassandra/dht/RangeStreamer.java
+++ b/src/java/org/apache/cassandra/dht/RangeStreamer.java
@@ -467,12 +467,14 @@ else if (useStrictConsistency)
                  if (useStrictConsistency)
                  {
                      EndpointsForRange strictEndpoints;
+
+                     //Start with two sets of who replicates the range before and who replicates it after
+                     EndpointsForRange newEndpoints = strat.calculateNaturalReplicas(toFetch.range().right, tmdAfter);
+
                      //Due to CASSANDRA-5953 we can have a higher RF than we have endpoints.
                      //So we need to be careful to only be strict when endpoints == RF
-                     if (oldEndpoints.size() == strat.getReplicationFactor().allReplicas)
+                     if (!oldEndpoints.stream().allMatch(newEndpoints::contains))
                      {
-                         //Start with two sets of who replicates the range before and who replicates it after
-                         EndpointsForRange newEndpoints = strat.calculateNaturalReplicas(toFetch.range().right, tmdAfter);
                          logger.debug("Old endpoints {}", oldEndpoints);
                          logger.debug("New endpoints {}", newEndpoints);
 
diff --git a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java
index 789189558ccc..45f558332d87 100644
--- a/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java
+++ b/src/java/org/apache/cassandra/locator/AbstractReplicationStrategy.java
@@ -284,6 +284,14 @@ public RangesAtEndpoint getAddressReplicas(InetAddressAndPort endpoint)
         return getAddressReplicas(tokenMetadata.cloneOnlyTokenMap(), endpoint);
     }
 
+    /**
+     * Returns the number of token-owning nodes.
+     */
+    protected int getSizeOfRingMemebers()
+    {
+        return tokenMetadata.getAllRingMembers().size();
+    }
+
     public RangesAtEndpoint getPendingAddressRanges(TokenMetadata metadata, Token pendingToken, InetAddressAndPort pendingAddress)
     {
         return getPendingAddressRanges(metadata, Collections.singleton(pendingToken), pendingAddress);
@@ -359,6 +367,14 @@ public static AbstractReplicationStrategy createReplicationStrategy(String keysp
         return strategy;
     }
 
+    /**
+     * Whether this strategy partitions data across the ring
+     */
+    public boolean isPartitioned()
+    {
+        return true;
+    }
+
     /**
      * Before constructing the ARS we first give it a chance to prepare the options map in any way it
      * would like to. For example datacenter auto-expansion or other templating to make the user interface
diff --git a/src/java/org/apache/cassandra/locator/EverywhereStrategy.java b/src/java/org/apache/cassandra/locator/EverywhereStrategy.java
new file mode 100644
index 000000000000..b6ac8b9d35bc
--- /dev/null
+++ b/src/java/org/apache/cassandra/locator/EverywhereStrategy.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.locator;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+/**
+ * Strategy that replicates data on every {@code live} node.
+ *
+ * <p>This strategy is a {@code MultiDatacentersStrategy}. Consequently, it properly handles local consistency levels.
+ * Nevertheless, as the data is replicated on every node, consistency levels such as QUORUM should not be used
+ * on clusters having more than 5 nodes.</p>
+ *
+ * <p>During bootstrap the time at which the data will be available is unknown and if the bootstrap is performed with
+ * autobootstrap=false on a seed node, there will be no data locally until rebuild is run.</p>
+ *
+ */
+public class EverywhereStrategy extends AbstractReplicationStrategy
+{
+    public EverywhereStrategy(String keyspaceName,
+                              TokenMetadata tokenMetadata,
+                              IEndpointSnitch snitch,
+                              Map<String, String> configOptions) throws ConfigurationException
+    {
+        super(keyspaceName, tokenMetadata, snitch, configOptions);
+    }
+
+    @Override
+    public EndpointsForRange calculateNaturalReplicas(Token searchToken, TokenMetadata tokenMetadata)
+    {
+        // Even if primary range repairs do not make a lot of sense for this strategy, we want the behavior to be
+        // correct if somebody uses them.
+        // Primary range repair expects the first endpoint of the list to be the primary range owner.
+        Set<Replica> replicas = new LinkedHashSet<>();
+        Iterator<Token> iter = TokenMetadata.ringIterator(tokenMetadata.sortedTokens(), searchToken, false);
+
+        if (iter.hasNext())
+        {
+            Token end = iter.next();
+            Token start = tokenMetadata.getPredecessor(end);
+            Range<Token> range = new Range<>(start, end);
+
+            InetAddressAndPort endpoint = tokenMetadata.getEndpoint(end);
+            replicas.add(Replica.fullReplica(endpoint, range));
+
+            while (iter.hasNext())
+            {
+                endpoint = tokenMetadata.getEndpoint(iter.next());
+                replicas.add(Replica.fullReplica(endpoint, range));
+            }
+        }
+
+        return EndpointsForRange.copyOf(replicas);
+    }
+
+    @Override
+    public ReplicationFactor getReplicationFactor()
+    {
+        return ReplicationFactor.fullOnly(getSizeOfRingMemebers());
+    }
+
+    @Override
+    public void validateOptions() throws ConfigurationException
+    {
+        // noop
+    }
+
+    @Override
+    public void maybeWarnOnOptions()
+    {
+        // noop
+    }
+
+    @Override
+    public Collection<String> recognizedOptions()
+    {
+        return Collections.emptyList();
+    }
+
+    /**
+     * CASSANDRA-12510 added a check that forbids decommission when the number of
+     * nodes will drop below the RF for a given keyspace. This check is breaking on
+     * EverywhereStrategy because all nodes replicate the keyspace, so this check does
+     * not make sense for non-partitioned keyspaces such as LocalStrategy and EverywhereStrategy.
+     *
+     * @return <code>false</code> because the data is not partitioned across the ring.
+     */
+    @Override
+    public boolean isPartitioned()
+    {
+        return false;
+    }
+}
diff --git a/src/java/org/apache/cassandra/locator/LocalStrategy.java b/src/java/org/apache/cassandra/locator/LocalStrategy.java
index 0e3a9185feda..64ab89c272e1 100644
--- a/src/java/org/apache/cassandra/locator/LocalStrategy.java
+++ b/src/java/org/apache/cassandra/locator/LocalStrategy.java
@@ -80,4 +80,10 @@ public Collection<String> recognizedOptions()
         // LocalStrategy doesn't expect any options.
         return Collections.emptySet();
     }
+
+    @Override
+    public boolean isPartitioned()
+    {
+        return false;
+    }
 }
diff --git a/src/java/org/apache/cassandra/locator/TokenMetadata.java b/src/java/org/apache/cassandra/locator/TokenMetadata.java
index ab210457f0dd..17525f62d9e6 100644
--- a/src/java/org/apache/cassandra/locator/TokenMetadata.java
+++ b/src/java/org/apache/cassandra/locator/TokenMetadata.java
@@ -1126,6 +1126,19 @@ public int getSizeOfMovingEndpoints()
         }
     }
 
+    public Set<InetAddressAndPort> getAllRingMembers()
+    {
+        lock.readLock().lock();
+        try
+        {
+            return ImmutableSet.copyOf(tokenToEndpointMap.valueSet());
+        }
+        finally
+        {
+            lock.readLock().unlock();
+        }
+    }
+
     public static int firstTokenIndex(final ArrayList<Token> ring, Token start, boolean insertMin)
     {
         assert ring.size() > 0;
diff --git a/src/java/org/apache/cassandra/schema/Schema.java b/src/java/org/apache/cassandra/schema/Schema.java
index c5c1f36d5dbe..2a7a451b558a 100644
--- a/src/java/org/apache/cassandra/schema/Schema.java
+++ b/src/java/org/apache/cassandra/schema/Schema.java
@@ -316,6 +316,17 @@ public List<String> getNonLocalStrategyKeyspaces()
                         .collect(Collectors.toList());
     }
 
+    /**
+     * @return a collection of keyspaces that partition data across the ring
+     */
+    public List<String> getPartitionedKeyspaces()
+    {
+        return keyspaces.stream()
+                        .filter(keyspace -> Keyspace.open(keyspace.name).getReplicationStrategy().isPartitioned())
+                        .map(keyspace -> keyspace.name)
+                        .collect(Collectors.toList());
+    }
+
     /**
      * @return collection of the user defined keyspaces
      */
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 0d517b15e66d..8d07f655860f 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -4398,7 +4398,7 @@ public void decommission(boolean force) throws InterruptedException
             if (operationMode != Mode.LEAVING) // If we're already decommissioning there is no point checking RF/pending ranges
             {
                 int rf, numNodes;
-                for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces())
+                for (String keyspaceName : Schema.instance.getPartitionedKeyspaces())
                 {
                     if (!force)
                     {
diff --git a/test/unit/org/apache/cassandra/locator/EverywhereStrategyTest.java b/test/unit/org/apache/cassandra/locator/EverywhereStrategyTest.java
new file mode 100644
index 000000000000..8ef3528420a2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/locator/EverywhereStrategyTest.java
@@ -0,0 +1,207 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.locator;
+
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.UUID;
+
+import com.google.common.collect.Sets;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class EverywhereStrategyTest
+{
+    private final Random random = new Random();
+
+    @BeforeClass
+    public static void setup()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @Test
+    public void allRingMembersAreReplicas() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+
+        populateTokenMetadata(3, 1, metadata);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        assertAllNodesCoverFullRing(strategy, metadata);
+    }
+
+    @Test
+    public void allRingMembersAreReplicasWithvnodes() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+        populateTokenMetadata(5, 8, metadata);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        assertAllNodesCoverFullRing(strategy, metadata);
+    }
+
+    @Test
+    public void bootstrappingNodesAreNotIncludedAsReplicas() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+
+        populateTokenMetadata(3, 1, metadata);
+
+        metadata.addBootstrapTokens(Arrays.asList(getRandomToken()),
+                                    InetAddressAndPort.getByName("127.0.0.4"));
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        assertAllNodesCoverFullRing(strategy, metadata);
+    }
+
+    @Test
+    public void leavingNodesDoNotAddPendingRanges() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+
+        populateTokenMetadata(3, 1, metadata);
+        InetAddressAndPort leavingEndpoint = metadata.getAllRingMembers().iterator().next();
+        metadata.addLeavingEndpoint(leavingEndpoint);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        metadata.calculatePendingRanges(strategy, strategy.keyspaceName);
+        PendingRangeMaps pendingRanges = metadata.getPendingRanges(strategy.keyspaceName);
+
+        assertFalse("pending ranges must be empty",
+                    pendingRanges.iterator().hasNext());
+    }
+
+    @Test
+    public void bootstrapNodesNeedFullRingOnPendingRangesCalculation() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+
+        populateTokenMetadata(3, 1, metadata);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        InetAddressAndPort bootstrapNode = InetAddressAndPort.getByName("127.0.0.4");
+        metadata.addBootstrapTokens(Arrays.asList(getRandomToken()), bootstrapNode);
+
+        metadata.calculatePendingRanges(strategy, strategy.keyspaceName);
+        PendingRangeMaps pendingRangeMaps = metadata.getPendingRanges(strategy.keyspaceName);
+
+        List<Range<Token>> pendingRanges = new ArrayList<>();
+        for (Map.Entry<Range<Token>, EndpointsForRange.Builder> pendingRangeEntry : pendingRangeMaps)
+        {
+            EndpointsForRange.Builder pendingNodes = pendingRangeEntry.getValue();
+            // only the bootstrap node has pending ranges
+            assertEquals(1, pendingNodes.size());
+            assertTrue(pendingNodes.endpoints().contains(bootstrapNode));
+            pendingRanges.add(pendingRangeEntry.getKey());
+        }
+
+        List<Range<Token>> normalizedRanges = Range.normalize(pendingRanges);
+        assertEquals(1, normalizedRanges.size());
+        Range<Token> tokenRange = normalizedRanges.get(0);
+        // it must cover all ranges
+        assertEquals(tokenRange.left, tokenRange.right);
+    }
+
+    @Test
+    public void allRingMembersContributeToReplicationFactor() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+        populateTokenMetadata(10, 5, metadata);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        assertEquals(10, strategy.getReplicationFactor().fullReplicas);
+        assertEquals(10, strategy.getReplicationFactor().allReplicas);
+    }
+
+    @Test
+    public void noRecognizedOptions() throws Throwable
+    {
+        TokenMetadata metadata = new TokenMetadata();
+        populateTokenMetadata(10, 5, metadata);
+
+        EverywhereStrategy strategy = createStrategy(metadata);
+
+        assertTrue("EverywhereStrategy should have no options", strategy.recognizedOptions().isEmpty());
+    }
+
+    private EverywhereStrategy createStrategy(TokenMetadata tokenMetadata)
+    {
+        IEndpointSnitch snitch = new PropertyFileSnitch();
+        DatabaseDescriptor.setEndpointSnitch(snitch);
+
+        return new EverywhereStrategy("keyspace", tokenMetadata, snitch, Collections.emptyMap());
+    }
+
+    private void populateTokenMetadata(int nodeCount, int tokens, TokenMetadata metadata) throws UnknownHostException
+    {
+        List<InetAddressAndPort> nodes = new ArrayList<>();
+        for (int i = 1; i <= nodeCount; i++)
+        {
+            InetAddress byName = InetAddress.getByName(String.format("127.0.0.%d", i));
+            InetAddressAndPort inetAddressAndPort = InetAddressAndPort.getByAddress(byName);
+            nodes.add(inetAddressAndPort);
+        }
+
+        for (int i = 0; i < tokens; i++)
+        {
+            for (InetAddressAndPort node : nodes)
+            {
+                Token randomToken = getRandomToken();
+                metadata.updateNormalToken(randomToken, node);
+            }
+        }
+    }
+
+    private void assertAllNodesCoverFullRing(AbstractReplicationStrategy strategy, TokenMetadata metadata)
+    {
+        for (Token ringToken : metadata.sortedTokens())
+        {
+            EndpointsForRange endpointsForRange = strategy.calculateNaturalReplicas(ringToken, metadata);
+            assertEquals(metadata.getAllRingMembers().size(), endpointsForRange.size());
+            assertEquals(Sets.newHashSet(metadata.getAllRingMembers()), Sets.newHashSet(endpointsForRange.endpoints()));
+        }
+    }
+
+    private Token getRandomToken()
+    {
+        return Murmur3Partitioner.instance.getRandomToken(random);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
index 4c1ff2639041..ff85ca33363d 100644
--- a/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
+++ b/test/unit/org/apache/cassandra/locator/SimpleStrategyTest.java
@@ -28,6 +28,7 @@
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Multimap;
+import com.google.common.collect.Sets;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Rule;
@@ -183,6 +184,15 @@ private void verifyGetNaturalEndpoints(Token[] endpointTokens, Token[] keyTokens
         }
     }
 
+    @Test
+    public void testSimpleStrategyKeyspacesArePartitioned()
+    {
+        //local strategy keyspaces should not be returned here since they are not partitioned
+        List<String> partitionedKeyspaces = Schema.instance.getPartitionedKeyspaces();
+        assertEquals(2, partitionedKeyspaces.size());
+        assertEquals(Sets.newHashSet(KEYSPACE1, MULTIDC), Sets.newHashSet(partitionedKeyspaces));
+    }
+
     @Test
     public void testGetEndpointsDuringBootstrap() throws UnknownHostException
     {

From 1b5bf825486dac89433e8fe8d08917325dec9cda Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Wed, 21 Apr 2021 22:18:53 +0200
Subject: [PATCH 059/151] STAR-453 ignore DSE index creation commands (#117)

The commands end without an error with applied = true
(in the same way as "IF EXIST" commands when the entity exists).
A warning is returned to the caller. The index is not created.

(cherry picked from commit b89547e26beb4109a2d1e6b6dbac9c70e8e224f1)
(cherry picked from commit ef28215e04e378907c1ce6695ee32f80ebbf5c74)
---
 .../schema/CreateIndexStatement.java          | 29 +++++++
 .../statements/CreateIndexStatementTest.java  | 84 +++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 test/unit/org/apache/cassandra/cql3/statements/CreateIndexStatementTest.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
index a0ac6e9433c4..52ea9f9bb00f 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
@@ -19,6 +19,7 @@
 
 import java.util.*;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 
@@ -52,6 +53,19 @@ public final class CreateIndexStatement extends AlterSchemaStatement
     private final IndexAttributes attrs;
     private final boolean ifNotExists;
 
+    private static final String DSE_INDEX_WARNING = "Index %s was not created. DSE custom index (%s) is not " +
+                                                    "supported. Consult the docs on alternatives (SAI indexes, " +
+                                                    "Secondary Indexes).";
+
+    @VisibleForTesting
+    public static final Set<String> DSE_INDEXES = ImmutableSet.of(
+        "com.datastax.bdp.cassandra.index.solr.SolrSecondaryIndex",
+        "com.datastax.bdp.cassandra.index.solr.ThriftSolrSecondaryIndex",
+        "com.datastax.bdp.cassandra.index.solr.Cql3SolrSecondaryIndex",
+        "com.datastax.bdp.search.solr.ThriftSolrSecondaryIndex",
+        "com.datastax.bdp.search.solr.Cql3SolrSecondaryIndex"
+    );
+
     public CreateIndexStatement(String keyspaceName,
                                 String tableName,
                                 String indexName,
@@ -69,6 +83,13 @@ public CreateIndexStatement(String keyspaceName,
 
     public Keyspaces apply(Keyspaces schema)
     {
+        if (isDseIndexCreateStatement())
+        {
+            // DSE indexes are not supported. The index is not created, the attempt is ignored (doesn't cause error),
+            // a meaningful warning is returned instead.
+            return schema;
+        }
+
         attrs.validate();
 
         if (attrs.isCustom && attrs.customClass.equals(SASIIndex.class.getName()) && !DatabaseDescriptor.getEnableSASIIndexes())
@@ -147,9 +168,17 @@ Set<String> clientWarnings(KeyspacesDiff diff)
         if (attrs.isCustom && attrs.customClass.equals(SASIIndex.class.getName()))
             return ImmutableSet.of(SASIIndex.USAGE_WARNING);
 
+        if (isDseIndexCreateStatement())
+            return ImmutableSet.of(String.format(DSE_INDEX_WARNING, indexName, attrs.customClass));
+
         return ImmutableSet.of();
     }
 
+    private boolean isDseIndexCreateStatement()
+    {
+        return DSE_INDEXES.contains(attrs.customClass);
+    }
+
     private void validateIndexTarget(TableMetadata table, IndexMetadata.Kind kind, IndexTarget target)
     {
         ColumnMetadata column = table.getColumn(target.column);
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateIndexStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateIndexStatementTest.java
new file mode 100644
index 000000000000..ebb347eac4dd
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateIndexStatementTest.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.util.Set;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.cql3.statements.schema.CreateIndexStatement;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+@RunWith(Parameterized.class)
+public class CreateIndexStatementTest extends CQLTester
+{
+    @Parameterized.Parameters(name = "index = {0}")
+    public static Set<String> dseIndexes()
+    {
+        return CreateIndexStatement.DSE_INDEXES;
+    }
+
+    @Parameterized.Parameter()
+    public String indexClass;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1));
+        QueryProcessor.executeOnceInternal("CREATE TABLE ks.tbl (k int, c int, v int, primary key (k, c))");
+    }
+
+    private void assertNoIndex(String indexName) throws Throwable
+    {
+        try
+        {
+            executeNet("DESCRIBE INDEX ks." + indexName);
+            fail("Expected InvalidQueryException caused by a missing index");
+        }
+        catch (InvalidQueryException e)
+        {
+            assertTrue(e.getMessage().contains(indexName + "' not found"));
+        }
+    }
+
+    @Test
+    public void dseIndexCreationShouldBeIgonerWithWarning() throws Throwable
+    {
+        // should not throw
+        ResultSet rows = executeNet(String.format("CREATE CUSTOM INDEX index_name ON ks.tbl (v) USING '%s'", indexClass));
+
+        assertTrue(rows.wasApplied()); // the command is ignored
+
+        String warning = rows.getAllExecutionInfo().get(0).getWarnings().get(0);
+        assertTrue("Custom DSE index creation should cause a warning", warning.contains("DSE custom index"));
+
+        assertNoIndex("index_name");
+    }
+}

From cf205511d4722bea92fab7391dc6809e86565fae Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Thu, 22 Apr 2021 10:02:16 +0200
Subject: [PATCH 060/151] STAR-497 fix pycodestyle (#123)

(cherry picked from commit d36c1104478b3afb1fb9403d004c7eb9a3425ab6)
(cherry picked from commit f1eb25c7214e2d4803041cf29c22fd3b0d2ad263)
---
 pylib/cqlshlib/geotypes.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pylib/cqlshlib/geotypes.py b/pylib/cqlshlib/geotypes.py
index 0ab2bb03483e..bad2f2b85b1b 100644
--- a/pylib/cqlshlib/geotypes.py
+++ b/pylib/cqlshlib/geotypes.py
@@ -84,6 +84,7 @@ def _patch_get_converters(klass):
     when using prepared statements to batch load
     """
     original_method = klass._get_converter
+
     def new_method(self, cql_type):
         if cql_type.typename == 'PointType':
             return _convert_point
@@ -102,12 +103,13 @@ def _patch_init(klass):
     when making queries with string literal values
     """
     original_method = klass.__init__
+
     def new_method(self, *args, **kwargs):
         original_method(self, *args, **kwargs)
         ptypes = zip(self.protectors, self.coltypes)
-        clean = lambda t: re.sub("[\W]", "", t.split('.')[-1])  # discard java package names and ' characters
+        clean = lambda t: re.sub("[\\W]", "", t.split('.')[-1])  # discard java package names and ' characters
         gtypes = {'PointType', 'LineStringType', 'PolygonType'}
-        self.protectors = [protect_value if clean(t) in gtypes else p for p,t in ptypes]
+        self.protectors = [protect_value if clean(t) in gtypes else p for p, t in ptypes]
     klass.__init__ = new_method
 
 

From 5c12f8726dc1e72adf3714eed1b3b8af1c71465a Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Tue, 20 Apr 2021 17:23:15 +0100
Subject: [PATCH 061/151] STAR-435 Make MmappedRegions.MAX_SEGMENT_SIZE
 configurable with system property

(cherry picked from commit fe40734503feafe3123632f711bfff75c74f1c36)
(cherry picked from commit 5e06b7f4728e481f1c68425b4c06fbd99bdc32d6)
---
 src/java/org/apache/cassandra/io/util/MmappedRegions.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/io/util/MmappedRegions.java b/src/java/org/apache/cassandra/io/util/MmappedRegions.java
index 0b7dd39353e3..8336ffb7c1e5 100644
--- a/src/java/org/apache/cassandra/io/util/MmappedRegions.java
+++ b/src/java/org/apache/cassandra/io/util/MmappedRegions.java
@@ -37,7 +37,7 @@
 public class MmappedRegions extends SharedCloseableImpl
 {
     /** In a perfect world, MAX_SEGMENT_SIZE would be final, but we need to test with a smaller size */
-    public static int MAX_SEGMENT_SIZE = Integer.MAX_VALUE;
+    public static int MAX_SEGMENT_SIZE = Integer.getInteger("cassandra.mmapped_max_segment_size", Integer.MAX_VALUE);
 
     /** When we need to grow the arrays, we add this number of region slots */
     static final int REGION_ALLOC_SIZE = 15;

From a757e2f26318dd63cc3d264f6c082714d2d193a5 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Tue, 20 Apr 2021 16:40:08 +0100
Subject: [PATCH 062/151] STAR-297 change dse.trie_size_limit_mb to
 cassandra.trie_size_limit_mb

(cherry picked from commit e198beb23337227209ebc93da03025c4c731a844)
(cherry picked from commit 31b6eb79e480746643f1a5b28e5045ad12a0f8a2)
---
 src/java/org/apache/cassandra/db/tries/MemtableTrie.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index b492805690b1..8e9bc1ca933d 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -55,7 +55,7 @@ public class MemtableTrie<T> extends MemtableReadTrie<T>
     private static final int ALLOCATED_SIZE_THRESHOLD;
     static
     {
-        String propertyName = "dse.trie_size_limit_mb";
+        String propertyName = "cassandra.trie_size_limit_mb";
         // Default threshold + 10% == 1 GB. Adjusted slightly up to avoid a tiny final allocation for the 2G max.
         int limitInMB = Integer.parseInt(System.getProperty(propertyName,
                                                             Integer.toString(1024 * 10 / 11 + 1)));

From 59c12a978f2cf23fe57d85fa91bb3be416ae5f4c Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Mon, 19 Apr 2021 18:19:58 +0100
Subject: [PATCH 063/151] STAR-476 Fix TrieTermsDictionaryTest for duplicate
 random values

(cherry picked from commit 85477999f65ea601a9ba5c69c4955019c219f92c)
(cherry picked from commit a8fd86c2f8635056e5fa4eddadd1c7318de1833d)
---
 .../sai/disk/v1/TrieTermsDictionaryTest.java  | 30 +++++++++++--------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java b/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
index fa743a7735e4..6f934833bed7 100644
--- a/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/disk/v1/TrieTermsDictionaryTest.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.index.sai.disk.v1;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Collectors;
@@ -75,12 +76,7 @@ private void doTestExactMatch() throws Exception
     public void testTermEnum() throws IOException
     {
         final IndexComponents components = newIndexComponents();
-        final int numKeys = randomIntBetween(16, 512);
-        final List<ByteComparable> byteComparables = Stream.generate(() -> randomSimpleString(4, 48))
-                                                           .limit(numKeys)
-                                                           .sorted()
-                                                           .map(this::asByteComparable)
-                                                           .collect(Collectors.toList());
+        final List<ByteComparable> byteComparables = generateSortedByteComparables();
 
         long fp;
         try (TrieTermsDictionaryWriter writer = new TrieTermsDictionaryWriter(components, false))
@@ -114,12 +110,7 @@ public void testTermEnum() throws IOException
     public void testMinMaxTerm() throws IOException
     {
         final IndexComponents components = newIndexComponents();
-        final int numKeys = randomIntBetween(16, 512);
-        final List<ByteComparable> byteComparables = Stream.generate(() -> randomSimpleString(4, 48))
-                                                           .limit(numKeys)
-                                                           .sorted()
-                                                           .map(this::asByteComparable)
-                                                           .collect(Collectors.toList());
+        final List<ByteComparable> byteComparables = generateSortedByteComparables();
 
         long fp;
         try (TrieTermsDictionaryWriter writer = new TrieTermsDictionaryWriter(components, false))
@@ -144,6 +135,21 @@ public void testMinMaxTerm() throws IOException
         }
     }
 
+    private List<ByteComparable> generateSortedByteComparables()
+    {
+        final int numKeys = randomIntBetween(16, 512);
+
+        // Sort the random strings and drop repeated values in one pass: the tests fail on
+        // duplicate terms, and distinct() keeps a single copy of each value instead of
+        // rescanning the whole list for every element.
+        return Stream.generate(() -> randomSimpleString(4, 48))
+                     .limit(numKeys)
+                     .sorted()
+                     .distinct()
+                     .map(this::asByteComparable)
+                     .collect(Collectors.toList());
+    }
+
     private ByteComparable asByteComparable(String s)
     {
         return ByteComparable.fixedLength(ByteBufferUtil.bytes(s));

From cb00ffb26198a32c282ce1cff273c6ca2b123cc2 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Tue, 20 Apr 2021 12:57:18 +0100
Subject: [PATCH 064/151] STAR-343 Frozen Tuple, Frozen UDT & Boolean support

(cherry picked from commit 064d9eeb27c327b7b9a3ae55ac10137e3e30eca6)
(cherry picked from commit bb40cd0ad2a173c260328e25b1941d8c7adbd705)
---
 .../cassandra/db/marshal/TupleType.java       |  2 +-
 .../cassandra/index/sai/ColumnContext.java    |  4 +-
 .../index/sai/StorageAttachedIndex.java       |  6 +-
 .../index/sai/disk/SSTableIndexWriter.java    |  4 +-
 .../cassandra/index/sai/utils/TypeUtil.java   | 28 +++++--
 .../org/apache/cassandra/cql3/CQLTester.java  | 15 ++++
 .../index/sai/cql/BooleanTypeTest.java        | 44 ++++++++++
 .../index/sai/cql/NativeIndexDDLTest.java     | 22 +++--
 .../index/sai/cql/types/BooleanTest.java      | 46 ++++++++++
 .../index/sai/cql/types/DataSet.java          | 28 +++++++
 .../sai/cql/types/IndexingTypeSupport.java    |  2 +
 .../index/sai/cql/types/QuerySet.java         | 37 +++++++++
 .../multicell/FrozenTupleCollectionTest.java  | 54 ++++++++++++
 .../types/multicell/FrozenTupleDataSet.java   | 74 +++++++++++++++++
 .../cql/types/multicell/FrozenTupleTest.java  | 48 +++++++++++
 .../types/multicell/FrozenTupleTupleTest.java | 48 +++++++++++
 .../multicell/FrozenUDTCollectionTest.java    | 51 ++++++++++++
 .../cql/types/multicell/FrozenUDTDataSet.java | 83 +++++++++++++++++++
 .../cql/types/multicell/FrozenUDTTest.java    | 48 +++++++++++
 .../sai/cql/types/multicell/TupleDataSet.java | 44 ++++++++++
 .../sai/cql/types/multicell/TupleTest.java    | 48 +++++++++++
 .../index/sai/disk/TypeUtilTest.java          | 50 ++++++++++-
 22 files changed, 760 insertions(+), 26 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/BooleanTypeTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/BooleanTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleDataSet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTupleTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTDataSet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleDataSet.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleTest.java

diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java
index c3b5ddbaac35..c8a8dbda7b8f 100644
--- a/src/java/org/apache/cassandra/db/marshal/TupleType.java
+++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java
@@ -64,7 +64,7 @@ public TupleType(List<AbstractType<?>> types)
         this(types, true);
     }
 
-    protected TupleType(List<AbstractType<?>> types, boolean freezeInner)
+    public TupleType(List<AbstractType<?>> types, boolean freezeInner)
     {
         super(ComparisonType.CUSTOM);
 
diff --git a/src/java/org/apache/cassandra/index/sai/ColumnContext.java b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
index 7b93acbe1aff..9c86d5ecb917 100644
--- a/src/java/org/apache/cassandra/index/sai/ColumnContext.java
+++ b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
@@ -304,9 +304,9 @@ public boolean isNonFrozenCollection()
         return TypeUtil.isNonFrozenCollection(target.left.type);
     }
 
-    public boolean isFrozenCollection()
+    public boolean isFrozen()
     {
-        return TypeUtil.isFrozenCollection(target.left.type);
+        return TypeUtil.isFrozen(target.left.type);
     }
 
     public String getColumnName()
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
index 1f4847531219..729006600208 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndex.java
@@ -207,7 +207,7 @@ public SecondaryIndexBuilder getIndexBuildTask(ColumnFamilyStore cfs,
                                                                         CQL3Type.Native.SMALLINT, CQL3Type.Native.TEXT, CQL3Type.Native.TIME,
                                                                         CQL3Type.Native.TIMESTAMP, CQL3Type.Native.TIMEUUID, CQL3Type.Native.TINYINT,
                                                                         CQL3Type.Native.UUID, CQL3Type.Native.VARCHAR, CQL3Type.Native.INET,
-                                                                        CQL3Type.Native.VARINT, CQL3Type.Native.DECIMAL);
+                                                                        CQL3Type.Native.VARINT, CQL3Type.Native.DECIMAL, CQL3Type.Native.BOOLEAN);
 
     private static final Set<Class<? extends IPartitioner>> ILLEGAL_PARTITIONERS =
             ImmutableSet.of(OrderPreservingPartitioner.class, LocalPartitioner.class, ByteOrderedPartitioner.class, RandomPartitioner.class);
@@ -293,11 +293,11 @@ public static Map<String, String> validateOptions(Map<String, String> options, T
         {
             for (AbstractType<?> subType : type.subTypes())
             {
-                if (!SUPPORTED_TYPES.contains(subType.asCQL3Type()) && !TypeUtil.isFrozenCollection(subType))
+                if (!SUPPORTED_TYPES.contains(subType.asCQL3Type()) && !TypeUtil.isFrozen(subType))
                     throw new InvalidRequestException("Unsupported type: " + subType.asCQL3Type());
             }
         }
-        else if (!SUPPORTED_TYPES.contains(type.asCQL3Type()) && !TypeUtil.isFrozenCollection(type))
+        else if (!SUPPORTED_TYPES.contains(type.asCQL3Type()) && !TypeUtil.isFrozen(type))
         {
             throw new InvalidRequestException("Unsupported type: " + type.asCQL3Type());
         }
diff --git a/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java b/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
index a80ec39ccc81..49b918e6dae0 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/SSTableIndexWriter.java
@@ -54,7 +54,7 @@ public class SSTableIndexWriter implements ColumnIndexWriter
     private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
 
     public static final int MAX_STRING_TERM_SIZE = Integer.getInteger("cassandra.sai.max_string_term_size_kb", 1) * 1024;
-    public static final int MAX_FROZEN_COLLECTION_TERM_SIZE =Integer.getInteger("cassandra.sai.max_frozen_term_size_kb", 5) * 1024;
+    public static final int MAX_FROZEN_TERM_SIZE = Integer.getInteger("cassandra.sai.max_frozen_term_size_kb", 5) * 1024;
     public static final String TERM_OVERSIZE_MESSAGE =
             "Can't add term of column {} to index for key: {}, term size {} " +
                     "max allowed size {}, use analyzed = true (if not yet set) for that column.";
@@ -84,7 +84,7 @@ public SSTableIndexWriter(Descriptor descriptor, ColumnContext columnContext, Na
         this.analyzer = columnContext.getAnalyzer();
         this.limiter = limiter;
         this.isIndexValid = isIndexValid;
-        this.maxTermSize = columnContext.isFrozenCollection() ? MAX_FROZEN_COLLECTION_TERM_SIZE : MAX_STRING_TERM_SIZE;
+        this.maxTermSize = columnContext.isFrozen() ? MAX_FROZEN_TERM_SIZE : MAX_STRING_TERM_SIZE;
 
     }
 
diff --git a/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java b/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
index 70b54cde69be..b6c0471affc2 100644
--- a/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
+++ b/src/java/org/apache/cassandra/index/sai/utils/TypeUtil.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.marshal.BooleanType;
 import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.db.marshal.CollectionType;
 import org.apache.cassandra.db.marshal.CompositeType;
@@ -268,7 +269,7 @@ public static int compare(ByteBuffer b1, ByteBuffer b2, AbstractType<?> type)
             return compareInet(b1, b2);
         // BigInteger values, frozen types and composite types (map entries) use compareUnsigned to maintain
         // a consistent order between the in-memory index and the on-disk index.
-        else if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozenCollection(type))
+        else if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozen(type))
             return FastByteOperations.compareUnsigned(b1, b2);
 
         return type.compare(b1, b2 );
@@ -286,7 +287,7 @@ public static int comparePostFilter(Expression.Value requestedValue, Expression.
         if (isInetAddress(type))
             return compareInet(requestedValue.encoded, columnValue.encoded);
         // Override comparisons for frozen collections and composite types (map entries)
-        else if (isCompositeOrFrozenCollection(type))
+        else if (isCompositeOrFrozen(type))
             return FastByteOperations.compareUnsigned(requestedValue.raw, columnValue.raw);
 
         return type.compare(requestedValue.raw, columnValue.raw);
@@ -312,7 +313,7 @@ public static Iterator<ByteBuffer> collectionIterator(AbstractType<?> validator,
     public static Comparator<ByteBuffer> comparator(AbstractType<?> type)
     {
         // Override the comparator for BigInteger, frozen collections and composite types
-        if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozenCollection(type))
+        if (isBigInteger(type) || isBigDecimal(type) || isCompositeOrFrozen(type))
             return FastByteOperations::compareUnsigned;
 
         return type;
@@ -432,7 +433,7 @@ public static ByteBuffer encodeBigInteger(ByteBuffer value)
      */
     public static boolean isLiteral(AbstractType<?> type)
     {
-        return isUTF8OrAscii(type) || isCompositeOrFrozenCollection(type);
+        return isUTF8OrAscii(type) || isCompositeOrFrozen(type) || baseType(type) instanceof BooleanType;
     }
 
     /**
@@ -445,16 +446,25 @@ public static boolean isUTF8OrAscii(AbstractType<?> type)
     }
 
     /**
-     * Returns <code>true</code> if given {@link AbstractType} is Composite(map entry) or frozen-collection.
+     * Returns <code>true</code> if given {@link AbstractType} is a Composite(map entry) or frozen.
      */
-    public static boolean isCompositeOrFrozenCollection(AbstractType<?> type)
+    public static boolean isCompositeOrFrozen(AbstractType<?> type)
     {
         type = baseType(type);
-        return type instanceof CompositeType || (type.isCollection() && !type.isMultiCell());
+        return type instanceof CompositeType || isFrozen(type);
     }
 
     /**
-     * Returns <code>true</code> if given {@link AbstractType} is frozen-collection.
+     * Returns <code>true</code> if given {@link AbstractType} is frozen.
+     */
+    public static boolean isFrozen(AbstractType<?> type)
+    {
+        type = baseType(type);
+        return !type.subTypes().isEmpty() && !type.isMultiCell();
+    }
+
+    /**
+     * Returns <code>true</code> if given {@link AbstractType} is a frozen collection.
      */
     public static boolean isFrozenCollection(AbstractType<?> type)
     {
@@ -463,7 +473,7 @@ public static boolean isFrozenCollection(AbstractType<?> type)
     }
 
     /**
-     * Returns <code>true</code> if given {@link AbstractType} is non-frozen-collection.
+     * Returns <code>true</code> if given {@link AbstractType} is a non-frozen collection.
      */
     public static boolean isNonFrozenCollection(AbstractType<?> type)
     {
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 27df5bd298bd..9bd9be5feb83 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -1965,6 +1965,21 @@ public String toString()
         {
             return "TupleValue" + toCQLString();
         }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o) return true;
+            if (o == null || getClass() != o.getClass()) return false;
+            TupleValue that = (TupleValue) o;
+            return Arrays.equals(values, that.values);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Arrays.hashCode(values); // element-wise, mirroring Arrays.equals(values, ...) in equals()
+        }
     }
 
     private static class UserTypeValue extends TupleValue
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/BooleanTypeTest.java b/test/unit/org/apache/cassandra/index/sai/cql/BooleanTypeTest.java
new file mode 100644
index 000000000000..313141441874
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/BooleanTypeTest.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.junit.Assert.assertEquals;
+
+public class BooleanTypeTest extends SAITester
+{
+    @Test
+    public void test() throws Throwable
+    {
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val boolean)");
+
+        createIndex("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+
+        execute("INSERT INTO %s (id, val) VALUES ('0', false)");
+        execute("INSERT INTO %s (id, val) VALUES ('1', true)");
+        execute("INSERT INTO %s (id, val) VALUES ('2', true)");
+
+        assertEquals(2, execute("SELECT id FROM %s WHERE val = true").size());
+        assertEquals(1, execute("SELECT id FROM %s WHERE val = false").size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java b/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
index 687841cc0015..0b3bb998e42f 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/NativeIndexDDLTest.java
@@ -38,12 +38,14 @@
 import com.datastax.driver.core.exceptions.InvalidQueryException;
 import com.datastax.driver.core.exceptions.ReadFailureException;
 import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.restrictions.IndexRestrictions;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.Int32Type;
 import org.apache.cassandra.db.marshal.ReversedType;
 import org.apache.cassandra.db.marshal.UTF8Type;
@@ -65,6 +67,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.Throwables;
 import org.mockito.Mockito;
 
@@ -215,22 +218,27 @@ public void shouldFailOnNormalizeWithNonText()
     }
 
     @Test
-    public void shouldFailCreateWithTupleType()
+    public void shouldFailCreateWithUserType()
     {
-        createTable("CREATE TABLE %s (id text PRIMARY KEY, val tuple<text, int, double>)");
+        String typeName = createType("CREATE TYPE %s (a text, b int, c double)");
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val " + typeName + ")");
 
         assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) " +
                                             "USING 'StorageAttachedIndex'")).isInstanceOf(InvalidQueryException.class);
     }
 
     @Test
-    public void shouldFailCreateWithUserType()
+    public void shouldNotFailCreateWithTupleType() throws Throwable
     {
-        String typeName = createType("CREATE TYPE %s (a text, b int, c double)");
-        createTable("CREATE TABLE %s (id text PRIMARY KEY, val frozen<" + typeName + ">)");
+        createTable("CREATE TABLE %s (id text PRIMARY KEY, val tuple<text, int, double>)");
 
-        assertThatThrownBy(() -> executeNet("CREATE CUSTOM INDEX ON %s(val) " +
-                                            "USING 'StorageAttachedIndex'")).isInstanceOf(InvalidQueryException.class);
+        executeNet("CREATE CUSTOM INDEX ON %s(val) USING 'StorageAttachedIndex'");
+
+        TableMetadata metadata = currentTableMetadata();
+        AbstractType<?> tuple = metadata.getColumn(ColumnIdentifier.getInterned("val", false)).type;
+        assertFalse(tuple.isMultiCell());
+        assertFalse(tuple.isCollection());
+        assertTrue(tuple.isTuple());
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/BooleanTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/BooleanTest.java
new file mode 100644
index 000000000000..2db91b5ebf6b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/BooleanTest.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+@RunWith(Parameterized.class)
+public class BooleanTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new DataSet.BooleanDataSet());
+    }
+
+    public BooleanTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
index 99232c2118ee..1e3a728e9120 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/DataSet.java
@@ -44,6 +44,11 @@ public abstract class DataSet<T> extends CQLTester
 {
     public T[] values;
 
+    public void init()
+    {
+        // used to create UDT
+    }
+
     public abstract QuerySet querySet();
 
     public Collection<String> decorateIndexColumn(String column)
@@ -426,6 +431,29 @@ public String toString()
         }
     }
 
+    public static class BooleanDataSet extends DataSet<Boolean>
+    {
+        public BooleanDataSet()
+        {
+            values = new Boolean[NUMBER_OF_VALUES];
+            for (int index = 0; index < values.length; index++)
+            {
+                values[index] = getRandom().nextBoolean();
+            }
+        }
+
+        @Override
+        public QuerySet querySet()
+        {
+            return new QuerySet.BooleanQuerySet(this);
+        }
+
+        public String toString()
+        {
+            return "boolean";
+        }
+    }
+
     public static class TextDataSet extends DataSet<String>
     {
         public TextDataSet()
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java b/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
index c4677294715a..6488d3930d87 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/IndexingTypeSupport.java
@@ -71,6 +71,8 @@ public IndexingTypeSupport(DataSet<?> dataset, boolean widePartitions, Scenario
     @Before
     public void createTable()
     {
+        dataset.init();
+
         createTable(String.format("CREATE TABLE %%s (pk int, ck int, value %s, PRIMARY KEY(pk, ck))", dataset));
 
         disableCompaction();
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
index cd2f29c31634..aab812ad8b98 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/QuerySet.java
@@ -97,6 +97,35 @@ public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
         }
     }
 
+    public static class BooleanQuerySet extends QuerySet
+    {
+        BooleanQuerySet(DataSet<?> dataSet)
+        {
+            super(dataSet);
+        }
+
+        @Override
+        public void runQueries(SAITester tester, Object[][] allRows) throws Throwable
+        {
+            // Query each value for EQ operator
+            for (int index = 0; index < allRows.length; index++)
+            {
+                Object value = allRows[index][2];
+                assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE value = ?", value), getExpectedRows(value, allRows));
+            }
+        }
+        protected Object[][] getExpectedRows(Object value, Object[][] allRows)
+        {
+            List<Object[]> expected = new ArrayList<>();
+            for (Object[] row : allRows)
+            {
+                if (row[2].equals(value))
+                    expected.add(row);
+            }
+            return expected.toArray(new Object[][]{});
+        }
+    }
+
     public static class LiteralQuerySet extends QuerySet
     {
         LiteralQuerySet(DataSet<?> dataSet)
@@ -197,6 +226,14 @@ protected Object[][] getExpectedRows(Object value, Object[][] allRows)
         }
     }
 
+    public static class FrozenTuple extends FrozenCollectionQuerySet
+    {
+        public FrozenTuple(DataSet<?> dataset)
+        {
+            super(dataset);
+        }
+    }
+
     public static class MapValuesQuerySet extends CollectionQuerySet
     {
         public MapValuesQuerySet(DataSet<?> dataSet, DataSet<?> elementDataSet)
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleCollectionTest.java
new file mode 100644
index 000000000000..ef05aa84efa0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleCollectionTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import java.util.Collection;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+
+@RunWith(Parameterized.class)
+public class FrozenTupleCollectionTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new FrozenTupleDataSet(
+        new CollectionDataSet.ListDataSet<>(new DataSet.AsciiDataSet()),
+        new CollectionDataSet.SetDataSet<>(new DataSet.InetDataSet()),
+        new CollectionDataSet.MapDataSet<>(new DataSet.BigintDataSet())
+        ));
+    }
+
+    public FrozenTupleCollectionTest(DataSet<?> dataset, boolean widePartitions, IndexingTypeSupport.Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleDataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleDataSet.java
new file mode 100644
index 000000000000..13abd512dcc7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleDataSet.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.QuerySet;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.stream.Collectors;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public class FrozenTupleDataSet extends DataSet<Object>
+{
+    private final DataSet<?>[] elementDataSets;
+
+    public FrozenTupleDataSet(DataSet<?>... elementDataSets)
+    {
+        this.elementDataSets = elementDataSets;
+
+        values = new Object[NUMBER_OF_VALUES];
+        for (int index = 0; index < NUMBER_OF_VALUES; index++)
+        {
+            Object[] fields = new Object[elementDataSets.length];
+            for (int i = 0; i < elementDataSets.length; i++)
+                fields[i] = elementDataSets[i].values[getRandom().nextIntBetween(0, elementDataSets[i].values.length - 1)];
+
+            values[index] = tuple(fields);
+        }
+    }
+
+    @Override
+    public QuerySet querySet()
+    {
+        return new QuerySet.FrozenTuple(this);
+    }
+
+    @Override
+    public Collection<String> decorateIndexColumn(String column)
+    {
+        return Collections.singletonList(column);
+    }
+
+    @Override
+    public String toString()
+    {
+        String fields = Arrays.stream(elementDataSets).map(Object::toString).collect(Collectors.joining(","));
+        return String.format(type(), fields);
+    }
+
+    String type()
+    {
+        // by default it's considered frozen
+        return "frozen<tuple<%s>>";
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTest.java
new file mode 100644
index 000000000000..49f5926f8fe6
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class FrozenTupleTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new FrozenTupleDataSet(new DataSet.BigintDataSet(), new DataSet.AsciiDataSet()));
+    }
+
+    public FrozenTupleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTupleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTupleTest.java
new file mode 100644
index 000000000000..af7380f91c39
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenTupleTupleTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class FrozenTupleTupleTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new FrozenTupleDataSet(new FrozenTupleDataSet(new DataSet.AsciiDataSet(), new DataSet.UuidDataSet()), new DataSet.AsciiDataSet()));
+    }
+
+    public FrozenTupleTupleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTCollectionTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTCollectionTest.java
new file mode 100644
index 000000000000..3dd6e9453bac
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTCollectionTest.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.apache.cassandra.index.sai.cql.types.collections.CollectionDataSet;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class FrozenUDTCollectionTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new FrozenUDTDataSet(
+        new CollectionDataSet.ListDataSet<>(new DataSet.AsciiDataSet()),
+        new CollectionDataSet.MapDataSet<>(new DataSet.BigintDataSet())));
+    }
+
+    public FrozenUDTCollectionTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTDataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTDataSet.java
new file mode 100644
index 000000000000..31786c868415
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTDataSet.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import java.util.Collection;
+import java.util.Collections;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.QuerySet;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public class FrozenUDTDataSet extends DataSet<Object>
+{
+    private final DataSet<?>[] elementDataSets;
+
+    private volatile String udt;
+
+    public FrozenUDTDataSet(DataSet<?>... elementDataSets)
+    {
+        this.elementDataSets = elementDataSets;
+
+        values = new Object[NUMBER_OF_VALUES];
+        for (int index = 0; index < NUMBER_OF_VALUES; index++)
+        {
+            Object[] fields = new Object[elementDataSets.length * 2]; // field name and field value
+            for (int i = 0; i < elementDataSets.length; i++)
+            {
+                fields[i * 2] = elementDataSets[i].toString();
+                fields[i * 2 + 1] = elementDataSets[i].values[getRandom().nextIntBetween(0, elementDataSets[i].values.length - 1)];
+            }
+
+            values[index] = userType(fields);
+        }
+    }
+
+    @Override
+    public void init()
+    {
+        StringBuilder fields = new StringBuilder();
+        for (int i = 0; i < elementDataSets.length; i++)
+        {
+            if (i != 0)
+                fields.append(", ");
+
+            fields.append("v_").append(i).append(" ").append(elementDataSets[i]);
+        }
+        udt = createType(String.format("CREATE TYPE %%s(%s)", fields.toString()));
+    }
+
+    @Override
+    public QuerySet querySet()
+    {
+        return new QuerySet.FrozenTuple(this);
+    }
+
+    @Override
+    public Collection<String> decorateIndexColumn(String column)
+    {
+        return Collections.singletonList(column);
+    }
+
+    public String toString()
+    {
+        return String.format("frozen<%s>", udt);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTTest.java
new file mode 100644
index 000000000000..6fe52a49d026
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/FrozenUDTTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class FrozenUDTTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new FrozenUDTDataSet(new DataSet.BigintDataSet(), new DataSet.AsciiDataSet()));
+    }
+
+    public FrozenUDTTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleDataSet.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleDataSet.java
new file mode 100644
index 000000000000..35e858d486c1
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleDataSet.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.QuerySet;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.stream.Collectors;
+
+import static org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport.NUMBER_OF_VALUES;
+
+public class TupleDataSet extends FrozenTupleDataSet
+{
+    public TupleDataSet(DataSet<?>... elementDataSets)
+    {
+        super(elementDataSets);
+    }
+
+    @Override
+    String type()
+    {
+        // by default it's considered frozen
+        return "tuple<%s>";
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleTest.java b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleTest.java
new file mode 100644
index 000000000000..2b1aaa038697
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/types/multicell/TupleTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql.types.multicell;
+
+import org.apache.cassandra.index.sai.cql.types.DataSet;
+import org.apache.cassandra.index.sai.cql.types.IndexingTypeSupport;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Collection;
+
+@RunWith(Parameterized.class)
+public class TupleTest extends IndexingTypeSupport
+{
+    @Parameterized.Parameters(name = "dataset={0},wide={1},scenario={2}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return generateParameters(new TupleDataSet(new DataSet.BigintDataSet(), new DataSet.AsciiDataSet()));
+    }
+
+    public TupleTest(DataSet<?> dataset, boolean widePartitions, Scenario scenario)
+    {
+        super(dataset, widePartitions, scenario);
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        runIndexQueryScenarios();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java b/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
index 8dce481a1ea9..e444674bdbe7 100644
--- a/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/disk/TypeUtilTest.java
@@ -27,6 +27,7 @@
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.FieldIdentifier;
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.CompositeType;
@@ -36,12 +37,15 @@
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.db.marshal.ReversedType;
 import org.apache.cassandra.db.marshal.SetType;
+import org.apache.cassandra.db.marshal.TupleType;
 import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UserType;
 import org.apache.cassandra.index.sai.StorageAttachedIndex;
 import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
 import org.apache.cassandra.index.sai.utils.NdiRandomizedTest;
 import org.apache.cassandra.index.sai.utils.TypeUtil;
 import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.Pair;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
@@ -55,10 +59,11 @@ public void testSimpleType()
             AbstractType<?> type = cql3Type.getType();
             AbstractType<?> reversedType = ReversedType.getInstance(type);
 
-            boolean isLiteral = cql3Type == CQL3Type.Native.ASCII || cql3Type == CQL3Type.Native.TEXT || cql3Type == CQL3Type.Native.VARCHAR;
+            boolean isUTF8OrAscii = cql3Type == CQL3Type.Native.ASCII || cql3Type == CQL3Type.Native.TEXT || cql3Type == CQL3Type.Native.VARCHAR;
+            boolean isLiteral = cql3Type == CQL3Type.Native.ASCII || cql3Type == CQL3Type.Native.TEXT || cql3Type == CQL3Type.Native.VARCHAR || cql3Type == CQL3Type.Native.BOOLEAN;
             assertEquals(isLiteral, TypeUtil.isLiteral(type));
             assertEquals(TypeUtil.isLiteral(type), TypeUtil.isLiteral(reversedType));
-            assertEquals(isLiteral, TypeUtil.isUTF8OrAscii(type));
+            assertEquals(isUTF8OrAscii, TypeUtil.isUTF8OrAscii(type));
             assertEquals(TypeUtil.isUTF8OrAscii(type), TypeUtil.isUTF8OrAscii(reversedType));
             assertEquals(TypeUtil.isIn(type, AbstractAnalyzer.ANALYZABLE_TYPES),
                          TypeUtil.isIn(reversedType, AbstractAnalyzer.ANALYZABLE_TYPES));
@@ -95,6 +100,47 @@ public void testListType()
         testCollectionType(ListType::getInstance, (a, b) -> {});
     }
 
+    @Test
+    public void testTuple()
+    {
+        for(CQL3Type elementType : StorageAttachedIndex.SUPPORTED_TYPES)
+        {
+            TupleType type = new TupleType(Arrays.asList(elementType.getType(), elementType.getType()), true);
+            assertFalse(TypeUtil.isFrozenCollection(type));
+            assertTrue(TypeUtil.isFrozen(type));
+            assertTrue(TypeUtil.isLiteral(type));
+
+            type = new TupleType(Arrays.asList(elementType.getType(), elementType.getType()), false);
+            assertFalse(TypeUtil.isFrozenCollection(type));
+            assertTrue(TypeUtil.isFrozen(type));
+            assertTrue(TypeUtil.isLiteral(type));
+        }
+    }
+
+    @Test
+    public void testUDT()
+    {
+        for(CQL3Type elementType : StorageAttachedIndex.SUPPORTED_TYPES)
+        {
+            UserType type = new UserType("ks", ByteBufferUtil.bytes("myType"),
+                                         Arrays.asList(FieldIdentifier.forQuoted("f1"), FieldIdentifier.forQuoted("f2")),
+                                         Arrays.asList(elementType.getType(), elementType.getType()),
+                                         true);
+
+            assertFalse(TypeUtil.isFrozenCollection(type));
+            assertFalse(TypeUtil.isFrozen(type));
+            assertFalse(TypeUtil.isLiteral(type));
+
+            type = new UserType("ks", ByteBufferUtil.bytes("myType"),
+                                Arrays.asList(FieldIdentifier.forQuoted("f1"), FieldIdentifier.forQuoted("f2")),
+                                Arrays.asList(elementType.getType(), elementType.getType()),
+                                false);
+            assertFalse(TypeUtil.isFrozenCollection(type));
+            assertTrue(TypeUtil.isFrozen(type));
+            assertTrue(TypeUtil.isLiteral(type));
+        }
+    }
+
     private static void testCollectionType(BiFunction<AbstractType<?>, Boolean, AbstractType<?>> init,
                                            BiConsumer<AbstractType<?>, AbstractType<?>> nonFrozenCollectionTester)
     {

From 79e4a70fec0b5170e7b16a44f66057b0a642a1c5 Mon Sep 17 00:00:00 2001
From: jacek-lewandowski <jacek.lewandowski@datastax.com>
Date: Thu, 22 Apr 2021 10:19:39 +0200
Subject: [PATCH 065/151] STAR-501: Fixed number of CPUs = 2 for testing

(cherry picked from commit 1c4867c35703063dc4162b2a7c8c213763c91283)
(cherry picked from commit 1b4b2252cbf46fa86ae03aeee45b60401a07590d)
---
 build.xml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/build.xml b/build.xml
index 4390c9010b2a..abe4b10c03e1 100644
--- a/build.xml
+++ b/build.xml
@@ -1368,6 +1368,7 @@
              more aggressively rather than waiting. See CASSANDRA-14922 for more details.
         -->
         <jvmarg value="-XX:SoftRefLRUPolicyMSPerMB=0" />
+        <jvmarg value="-XX:ActiveProcessorCount=2" />
         <jvmarg value="-Dcassandra.test.driver.connection_timeout_ms=${test.driver.connection_timeout_ms}"/>
         <jvmarg value="-Dcassandra.test.driver.read_timeout_ms=${test.driver.read_timeout_ms}"/>
         <jvmarg value="-Dcassandra.memtable_row_overhead_computation_step=100"/>

From 3ad7794f486985a216030f3643edabe6b1e142ad Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Thu, 22 Apr 2021 17:32:17 +0200
Subject: [PATCH 066/151] STAR-451 override unsupported DSE compaction
 strategies (#122)

* STAR-451 override unsupported DSE compaction strategies

Create table command overrides a recognized unsupported DSE
compaction strategy (MemoryOnlyStrategy and TieredCompactionStrategy)
with default Compaction Strategy (SizeTiered atm).
The table is created successfully, applied == true.
Additional warning is returned to the caller.

(cherry picked from commit ae2eded1eb1f2db40d775bd0245f215907a539d5)
(cherry picked from commit af68a13f7494db1a4908b8a12bbd2649befba315)
---
 .../cql3/statements/PropertyDefinitions.java  |  2 +-
 .../schema/CreateTableStatement.java          | 17 +++-
 .../statements/schema/TableAttributes.java    | 32 ++++++-
 .../statements/CreateTableStatementTest.java  | 86 +++++++++++++++++++
 4 files changed, 133 insertions(+), 4 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
index 590910f47376..c474949d7a19 100644
--- a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
+++ b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
@@ -69,7 +69,7 @@ protected String getSimple(String name) throws SyntaxException
         return (String)val;
     }
 
-    protected Map<String, String> getMap(String name) throws SyntaxException
+    public Map<String, String> getMap(String name) throws SyntaxException
     {
         Object val = properties.get(name);
         if (val == null)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index 1339ba39f7b1..fd65d3c42eb7 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -372,6 +372,8 @@ else if (!builder.hasRegularColumns())
     @Override
     public Set<String> clientWarnings(KeyspacesDiff diff)
     {
+        Set<String> warnings = new HashSet<>();
+
         int tableCount = Schema.instance.getNumberOfTables();
         if (tableCount > DatabaseDescriptor.tableCountWarnThreshold())
         {
@@ -379,9 +381,20 @@ public Set<String> clientWarnings(KeyspacesDiff diff)
                                        tableCount,
                                        Schema.instance.getKeyspaces().size());
             logger.warn(msg);
-            return ImmutableSet.of(msg);
+            warnings.add(msg);
         }
-        return ImmutableSet.of();
+
+        if (attrs.hasUnsupportedDseCompaction())
+        {
+            Map<String, String> compactionOptions = attrs.getMap(TableParams.Option.COMPACTION.toString());
+            String strategy = compactionOptions.get(CompactionParams.Option.CLASS.toString());
+            warnings.add(String.format("The given compaction strategy (%s) is not supported. ", strategy) +
+                         "The compaction strategy parameter was overridden with the default " +
+                         String.format("(%s). ", CompactionParams.DEFAULT.klass().getCanonicalName()) +
+                         "Inspect your schema and adjust other table properties if needed.");
+        }
+
+        return warnings;
     }
 
     private static class DefaultNames
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 686729548583..85fe0fcd499b 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -43,6 +43,13 @@ public final class TableAttributes extends PropertyDefinitions
     private static final Set<String> validKeywords;
     private static final Set<String> obsoleteKeywords;
 
+    private static final Set<String> UNSUPPORTED_DSE_COMPACTION_STRATEGIES = ImmutableSet.of(
+        "org.apache.cassandra.db.compaction.TieredCompactionStrategy",
+        "TieredCompactionStrategy",
+        "org.apache.cassandra.db.compaction.MemoryOnlyStrategy",
+        "MemoryOnlyStrategy"
+    );
+
     static
     {
         ImmutableSet.Builder<String> validBuilder = ImmutableSet.builder();
@@ -84,6 +91,24 @@ public TableId getId() throws ConfigurationException
         }
     }
 
+    /**
+     * Returns `true` if this attributes instance has a COMPACTION option with a recognized unsupported compaction
+     * strategy class (coming from DSE). `false` otherwise.
+     */
+    boolean hasUnsupportedDseCompaction()
+    {
+        if (hasOption(Option.COMPACTION))
+        {
+            Map<String, String> compactionOptions = getMap(Option.COMPACTION);
+            String strategy = compactionOptions.get(CompactionParams.Option.CLASS.toString());
+            return UNSUPPORTED_DSE_COMPACTION_STRATEGIES.contains(strategy);
+        }
+        else
+        {
+            return false;
+        }
+    }
+
     private TableParams build(TableParams.Builder builder)
     {
         if (hasOption(Option.BLOOM_FILTER_FP_CHANCE))
@@ -96,7 +121,12 @@ private TableParams build(TableParams.Builder builder)
             builder.comment(getString(Option.COMMENT));
 
         if (hasOption(Option.COMPACTION))
-            builder.compaction(CompactionParams.fromMap(getMap(Option.COMPACTION)));
+        {
+            if (hasUnsupportedDseCompaction())
+                builder.compaction(CompactionParams.DEFAULT);
+            else
+                builder.compaction(CompactionParams.fromMap(getMap(Option.COMPACTION)));
+        }
 
         if (hasOption(Option.COMPRESSION))
         {
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java
new file mode 100644
index 000000000000..1ef2fde5af7f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class CreateTableStatementTest extends CQLTester
+{
+    @Parameterized.Parameters(name = "compactionStrategy = {0}")
+    public static Set<String> strategies()
+    {
+        return ImmutableSet.of(
+            "{'class': 'org.apache.cassandra.db.compaction.MemoryOnlyStrategy', 'max_threshold': '32', 'min_threshold': '4'}",
+            "{'class': 'MemoryOnlyStrategy', 'max_threshold': '32', 'min_threshold': '4'}",
+            "{'class': 'org.apache.cassandra.db.compaction.TieredCompactionStrategy', 'tiering_strategy': 'TimeWindowStorageStrategy', 'config': 'strategy1', 'max_tier_ages': '3600,7200'}",
+            "{'class': 'TieredCompactionStrategy', 'tiering_strategy': 'TimeWindowStorageStrategy', 'config': 'strategy1', 'max_tier_ages': '3600,7200'}"
+        );
+    }
+
+    @Parameterized.Parameter()
+    public String compactionStrategy;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1));
+    }
+
+    @Test
+    public void dseCompactionStrategyShouldBeIgnoredWithWarning() throws Throwable
+    {
+        String tableName = createTableName();
+
+        // should not throw
+        ResultSet rows = executeNet(String.format("CREATE TABLE ks.%s (k int PRIMARY KEY, v int) WITH " +
+                                                  "compaction = %s;", tableName, compactionStrategy));
+
+        assertTrue(rows.wasApplied());
+
+        String warning = rows.getAllExecutionInfo().get(0).getWarnings().get(0);
+        assertThat(warning, containsString("The compaction strategy parameter was overridden with the default"));
+
+        assertDefaultCompactionStrategy(tableName);
+    }
+
+    private void assertDefaultCompactionStrategy(String tableName) throws Throwable
+    {
+        ResultSet result = executeNet("DESCRIBE TABLE ks." + tableName);
+
+        String createStatement = result.one().getString("create_statement");
+        assertThat(createStatement, containsString(CompactionParams.DEFAULT.klass().getCanonicalName()));
+    }
+}

From 9f5f99726a0cccadb8db369f951ddc0f9c98b879 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 15 Apr 2021 09:56:37 +0100
Subject: [PATCH 067/151] STAR-184 Endpoint grouping for distributed range
 reads

(cherry picked from commit f108eab68edb1ddd20eb7723194cf69cd84ab2d0)
(cherry picked from commit ba54245e6c992fdae362c513fb3f4dc78cfa24b2)
---
 .../cassandra/db/MultiRangeReadCommand.java   | 424 ++++++++++++++++++
 .../cassandra/db/MultiRangeReadResponse.java  | 411 +++++++++++++++++
 .../db/PartitionRangeReadCommand.java         |  20 +
 .../org/apache/cassandra/db/ReadCommand.java  |  27 +-
 .../cassandra/db/ReadCommandVerbHandler.java  |   4 +-
 .../org/apache/cassandra/index/Index.java     |   8 +
 .../index/sai/plan/QueryController.java       |   6 +
 .../plan/StorageAttachedIndexQueryPlan.java   |  11 +-
 .../org/apache/cassandra/net/Message.java     |  12 +
 src/java/org/apache/cassandra/net/Verb.java   |   6 +-
 .../cassandra/service/reads/DataResolver.java |  36 +-
 .../cassandra/service/reads/ReadCallback.java |   5 +
 .../reads/ShortReadPartitionsProtection.java  |  11 +-
 .../service/reads/ShortReadProtection.java    |  26 +-
 .../range/EndpointGroupingCoordinator.java    | 340 ++++++++++++++
 .../EndpointGroupingRangeCommandIterator.java |  71 +++
 .../NonGroupingRangeCommandIterator.java      | 136 ++++++
 .../reads/range/RangeCommandIterator.java     | 148 ++----
 .../service/reads/range/RangeCommands.java    |  14 +-
 .../db/MultiRangeReadCommandTest.java         | 413 +++++++++++++++++
 .../org/apache/cassandra/index/StubIndex.java |   4 +
 .../index/sai/cql/AbstractQueryTester.java    |   1 -
 .../cassandra/index/sai/cql/DataModel.java    |  14 +
 .../index/sai/cql/IndexQuerySupport.java      |   1 -
 .../index/sai/cql/SingleNodeExecutor.java     |   2 -
 ...pointGroupingRangeCommandIteratorTest.java | 166 +++++++
 .../reads/range/RangeCommandIteratorTest.java |  10 +-
 27 files changed, 2156 insertions(+), 171 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
 create mode 100644 src/java/org/apache/cassandra/db/MultiRangeReadResponse.java
 create mode 100644 src/java/org/apache/cassandra/service/reads/range/EndpointGroupingCoordinator.java
 create mode 100644 src/java/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIterator.java
 create mode 100644 src/java/org/apache/cassandra/service/reads/range/NonGroupingRangeCommandIterator.java
 create mode 100644 test/unit/org/apache/cassandra/db/MultiRangeReadCommandTest.java
 create mode 100644 test/unit/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIteratorTest.java

diff --git a/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
new file mode 100644
index 000000000000..271f8e29d6c6
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.net.Verb;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.pager.PagingState;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.service.reads.ReadCallback;
+import org.apache.cassandra.transport.ProtocolVersion;
+
+/**
+ * Used by {@code EndpointGroupingCoordinator} to query all involved ranges on a given replica at once.
+ *
+ * Note: digests are not supported because each replica is responsible for different token ranges, so there is no point in
+ * sending a digest.
+ */
+public class MultiRangeReadCommand extends ReadCommand
+{
+    protected static final SelectionDeserializer selectionDeserializer = new Deserializer();
+
+    private final List<DataRange> dataRanges;
+
+    private MultiRangeReadCommand(boolean isDigest,
+                                  int digestVersion,
+                                  boolean acceptsTransient,
+                                  TableMetadata metadata,
+                                  int nowInSec,
+                                  ColumnFilter columnFilter,
+                                  RowFilter rowFilter,
+                                  DataLimits limits,
+                                  List<DataRange> dataRanges,
+                                  Index.QueryPlan indexQueryPlan)
+    {
+        super(Kind.MULTI_RANGE, isDigest, digestVersion, acceptsTransient, metadata, nowInSec, columnFilter, rowFilter, limits, indexQueryPlan);
+
+        assert dataRanges.size() > 0;
+        this.dataRanges = dataRanges;
+    }
+
+    /**
+     *
+     * @param command current partition range command
+     * @param ranges token ranges to be queried on specific endpoint
+     * @param isRangeContinuation if true, the command's limits are used as-is; otherwise they are used without their state
+     * @return multi-range read command for specific endpoint
+     */
+    @VisibleForTesting
+    public static MultiRangeReadCommand create(PartitionRangeReadCommand command, List<AbstractBounds<PartitionPosition>> ranges, boolean isRangeContinuation)
+    {
+        List<DataRange> dataRanges = new ArrayList<>(ranges.size());
+        for (AbstractBounds<PartitionPosition> range : ranges)
+            dataRanges.add(command.dataRange().forSubRange(range));
+
+        return new MultiRangeReadCommand(command.isDigestQuery(),
+                                         command.digestVersion(),
+                                         command.acceptsTransient(),
+                                         command.metadata(),
+                                         command.nowInSec(),
+                                         command.columnFilter(),
+                                         command.rowFilter(),
+                                         isRangeContinuation ? command.limits() : command.limits().withoutState(),
+                                         dataRanges,
+                                         command.indexQueryPlan());
+    }
+
+    /**
+     * @param subrangeHandlers handlers for all vnode ranges replicated in current endpoint.
+     * @return multi-range read command for specific endpoint
+     */
+    public static MultiRangeReadCommand create(List<ReadCallback<?, ?>> subrangeHandlers)
+    {
+        assert !subrangeHandlers.isEmpty();
+
+        PartitionRangeReadCommand command = (PartitionRangeReadCommand) subrangeHandlers.get(0).command();
+        List<DataRange> dataRanges = new ArrayList<>(subrangeHandlers.size());
+        for(ReadCallback<?, ?> handler : subrangeHandlers)
+        {
+            dataRanges.add(((PartitionRangeReadCommand) handler.command()).dataRange());
+        }
+
+
+        return new MultiRangeReadCommand(command.isDigestQuery(),
+                                         command.digestVersion(),
+                                         command.acceptsTransient(),
+                                         command.metadata(),
+                                         command.nowInSec(),
+                                         command.columnFilter(),
+                                         command.rowFilter(),
+                                         command.limits(),
+                                         dataRanges,
+                                         command.indexQueryPlan());
+    }
+
+    /**
+     * @return all token ranges to be queried
+     */
+    public List<DataRange> ranges()
+    {
+        return dataRanges;
+    }
+
+    @Override
+    protected void serializeSelection(DataOutputPlus out, int version) throws IOException
+    {
+        int rangeCount = dataRanges.size();
+        out.writeInt(rangeCount);
+
+        for (DataRange range : dataRanges)
+            DataRange.serializer.serialize(range, out, version, metadata());
+    }
+
+    @Override
+    protected long selectionSerializedSize(int version)
+    {
+        int rangeCount = dataRanges.size();
+        long size = TypeSizes.sizeof(rangeCount);
+
+        for (DataRange range : dataRanges)
+            size += DataRange.serializer.serializedSize(range, version, metadata());
+
+        return size;
+    }
+
+    @Override
+    public boolean isLimitedToOnePartition()
+    {
+        if (dataRanges.size() != 1)
+            return false;
+
+        DataRange dataRange = dataRanges.get(0);
+        return dataRange.keyRange() instanceof Bounds
+               && dataRange.startKey().kind() == PartitionPosition.Kind.ROW_KEY
+               && dataRange.startKey().equals(dataRange.stopKey());
+    }
+
+    @Override
+    public boolean isRangeRequest()
+    {
+        return false;
+    }
+
+    @Override
+    public ReadCommand withUpdatedLimit(DataLimits newLimits)
+    {
+        return new MultiRangeReadCommand(isDigestQuery(),
+                                         digestVersion(),
+                                         acceptsTransient(),
+                                         metadata(),
+                                         nowInSec(),
+                                         columnFilter(),
+                                         rowFilter(),
+                                         newLimits,
+                                         dataRanges,
+                                         indexQueryPlan());
+    }
+
+    @Override
+    public long getTimeout(TimeUnit unit)
+    {
+        return DatabaseDescriptor.getRangeRpcTimeout(unit);
+    }
+
+    @Override
+    public ReadResponse createResponse(UnfilteredPartitionIterator iterator)
+    {
+        assert !isDigestQuery();
+        return MultiRangeReadResponse.createDataResponse(iterator, this);
+    }
+
+    @Override
+    public ClusteringIndexFilter clusteringIndexFilter(DecoratedKey key)
+    {
+        for (DataRange dataRange : ranges())
+        {
+            if (dataRange.keyRange().contains(key))
+                return dataRange.clusteringIndexFilter(key);
+        }
+
+        throw new IllegalArgumentException(key + " is not in data ranges " + dataRanges.stream().map(r -> r.toString(metadata())).collect(Collectors.toList()));
+    }
+
+    @Override
+    public ReadCommand copy()
+    {
+        return new MultiRangeReadCommand(isDigestQuery(),
+                                         digestVersion(),
+                                         acceptsTransient(),
+                                         metadata(),
+                                         nowInSec(),
+                                         columnFilter(),
+                                         rowFilter(),
+                                         limits(),
+                                         dataRanges,
+                                         indexQueryPlan());
+    }
+
+    @Override
+    protected ReadCommand copyAsTransientQuery()
+    {
+        return new MultiRangeReadCommand(false,
+                                          0,
+                                          true,
+                                          metadata(),
+                                          nowInSec(),
+                                          columnFilter(),
+                                          rowFilter(),
+                                          limits(),
+                                          dataRanges,
+                                          indexQueryPlan());
+    }
+
+    @Override
+    protected ReadCommand copyAsDigestQuery()
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public UnfilteredPartitionIterator queryStorage(ColumnFamilyStore cfs, ReadExecutionController executionController)
+    {
+        return UnfilteredPartitionIterators.concat(dataRanges.stream()
+                                                             .map(this::toPartitionRangeReadCommand)
+                                                             .map(command -> command.queryStorage(cfs, executionController))
+                                                             .collect(Collectors.toList()));
+    }
+
+    @Override
+    public UnfilteredPartitionIterator searchStorage(Index.Searcher searcher, ReadExecutionController controller)
+    {
+        if (indexQueryPlan.supportsMultiRangeReadCommand())
+        {
+            // SAI supports fetching multiple ranges at once
+            return super.searchStorage(searcher, controller);
+        }
+        else
+        {
+            // search each subrange separately as they don't support MultiRangeReadCommand
+            return UnfilteredPartitionIterators.concat(dataRanges.stream()
+                                                                 .map(this::toPartitionRangeReadCommand)
+                                                                 .map(command -> command.searchStorage(searcher, controller))
+                                                                 .collect(Collectors.toList()));
+        }
+    }
+
+    private PartitionRangeReadCommand toPartitionRangeReadCommand(DataRange dataRange)
+    {
+        return PartitionRangeReadCommand.create(metadata(), nowInSec(), columnFilter(), rowFilter(), limits(), dataRange, indexQueryPlan());
+    }
+
+    @Override
+    public boolean isReversed()
+    {
+        return ranges().get(0).isReversed();
+    }
+
+    @Override
+    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    {
+        metric.rangeLatency.addNano(latencyNanos);
+    }
+
+    @Override
+    public Verb verb()
+    {
+        return Verb.MULTI_RANGE_REQ;
+    }
+
+    @Override
+    protected void appendCQLWhereClause(StringBuilder sb)
+    {
+        if (ranges().size() == 1 && ranges().get(0).isUnrestricted() && rowFilter().isEmpty())
+            return;
+
+        sb.append(" WHERE ");
+        // We put the row filter first because the data range can end by "ORDER BY"
+        if (!rowFilter().isEmpty())
+        {
+            sb.append(rowFilter());
+            sb.append(" AND ");
+        }
+
+        boolean isFirst = true;
+        for (int i = 0; i < ranges().size(); i++)
+        {
+            DataRange dataRange = ranges().get(i);
+            if (!dataRange.isUnrestricted())
+            {
+                if (!isFirst)
+                    sb.append(" AND ");
+                isFirst = false;
+                sb.append(dataRange.toCQLString(metadata()));
+            }
+        }
+    }
+
+    @Override
+    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+    {
+        // MultiRangeReadCommand should only be executed on the replica side
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public QueryPager getPager(PagingState pagingState, ProtocolVersion protocolVersion)
+    {
+        // MultiRangeReadCommand should only be executed on the replica side
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean selectsKey(DecoratedKey key)
+    {
+        for (DataRange dataRange : ranges())
+        {
+            if (!dataRange.contains(key))
+                continue;
+
+            return rowFilter().partitionKeyRestrictionsAreSatisfiedBy(key, metadata().partitionKeyType);
+        }
+
+        return false;
+    }
+
+    @Override
+    public boolean selectsClustering(DecoratedKey key, Clustering clustering)
+    {
+        if (clustering == Clustering.STATIC_CLUSTERING)
+            return !columnFilter().fetchedColumns().statics.isEmpty();
+
+        for (DataRange dataRange : ranges())
+        {
+            if (!dataRange.keyRange().contains(key) || !dataRange.clusteringIndexFilter(key).selects(clustering))
+                continue;
+
+            if (rowFilter().clusteringKeyRestrictionsAreSatisfiedBy(clustering))
+                return true;
+        }
+
+        return false;
+    }
+
+    @Override
+    public boolean selectsFullPartition()
+    {
+        return metadata().isStaticCompactTable() ||
+               (ranges().stream().allMatch(DataRange::selectsAllPartition) && !rowFilter().hasExpressionOnClusteringOrRegularColumns());
+    }
+
+    private static class Deserializer extends SelectionDeserializer
+    {
+        @Override
+        public ReadCommand deserialize(DataInputPlus in,
+                                       int version,
+                                       boolean isDigest,
+                                       int digestVersion,
+                                       boolean acceptsTransient,
+                                       TableMetadata metadata,
+                                       int nowInSec,
+                                       ColumnFilter columnFilter,
+                                       RowFilter rowFilter,
+                                       DataLimits limits,
+                                       Index.QueryPlan indexQueryPlan)
+        throws IOException
+        {
+            int rangeCount = in.readInt();
+
+            List<DataRange> ranges = new ArrayList<>(rangeCount);
+            for (int i = 0; i < rangeCount; i++)
+                ranges.add(DataRange.serializer.deserialize(in, version, metadata));
+
+            return new MultiRangeReadCommand(isDigest,
+                                             digestVersion,
+                                             acceptsTransient,
+                                             metadata,
+                                             nowInSec,
+                                             columnFilter,
+                                             rowFilter,
+                                             limits,
+                                             ranges,
+                                             indexQueryPlan);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/MultiRangeReadResponse.java b/src/java/org/apache/cassandra/db/MultiRangeReadResponse.java
new file mode 100644
index 000000000000..29a7d1ef10b5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/MultiRangeReadResponse.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.NoSuchElementException;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.partitions.AbstractUnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
+import org.apache.cassandra.db.rows.DeserializationHelper;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.io.IVersionedSerializer;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+/**
+ * Stores the response to a multi-range read request from a given endpoint. The {@link ReadResponse} of a
+ * single subrange can be extracted via {@link #subrangeResponse(MultiRangeReadCommand, AbstractBounds)}.
+ */
+public abstract class MultiRangeReadResponse extends ReadResponse
+{
+    protected static final Logger logger = LoggerFactory.getLogger(MultiRangeReadResponse.class);
+
+    public static final IVersionedSerializer<ReadResponse> serializer = new Serializer();
+
+    private MultiRangeReadResponse()
+    {
+    }
+
+    /**
+     * @param data results of multiple ranges
+     * @param command current multi-range read command; only its column filter is used to serialize {@code data}
+     * @return multi-range read response holding {@code data} serialized in the current messaging version
+     */
+    static ReadResponse createDataResponse(UnfilteredPartitionIterator data, MultiRangeReadCommand command)
+    {
+        return new LocalDataResponse(data, command);
+    }
+
+    /**
+     * @param command current multi-range read command
+     * @param range target subrange
+     * @return response corresponding to the given range
+     */
+    public abstract ReadResponse subrangeResponse(MultiRangeReadCommand command, AbstractBounds<PartitionPosition> range);
+
+    @Override
+    public ByteBuffer digest(ReadCommand command)
+    {
+        throw new UnsupportedOperationException(); // never a digest response, see isDigestResponse()
+    }
+
+    @Override
+    public boolean isDigestResponse()
+    {
+        return false;
+    }
+
+    @Override
+    public ByteBuffer repairedDataDigest()
+    {
+        throw new UnsupportedOperationException(); // unsupported by default; DataResponse overrides this
+    }
+
+    @Override
+    public boolean isRepairedDigestConclusive()
+    {
+        throw new UnsupportedOperationException(); // unsupported by default; DataResponse overrides this
+    }
+
+    @Override
+    public boolean mayIncludeRepairedDigest()
+    {
+        throw new UnsupportedOperationException(); // unsupported by default; DataResponse overrides this
+    }
+
+    @Override
+    public String toDebugString(ReadCommand command, DecoratedKey key)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    /**
+     * A response that is not meant to be serialized; it holds the deserialized content of a multi-range response.
+     */
+    private static class LocalResponse extends MultiRangeReadResponse
+    {
+        private final RangeBoundPartitionIterator iterator;
+
+        LocalResponse(UnfilteredPartitionIterator response)
+        {
+            this.iterator = new RangeBoundPartitionIterator(response);
+        }
+
+        @Override
+        public UnfilteredPartitionIterator makeIterator(ReadCommand command)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ReadResponse subrangeResponse(MultiRangeReadCommand command, AbstractBounds<PartitionPosition> range)
+        {
+            // deliver already cached content without deserialization.
+            return new LocalSubrangeResponse(iterator, range);
+        }
+
+        /**
+         * Single-pass cursor over the wrapped iterator that buffers at most one partition; a partition outside
+         * of the asked range stays buffered. Static because it never uses the enclosing instance.
+         */
+        static class RangeBoundPartitionIterator
+        {
+            private final UnfilteredPartitionIterator iterator;
+            private UnfilteredRowIterator next = null;
+
+            RangeBoundPartitionIterator(UnfilteredPartitionIterator iterator)
+            {
+                this.iterator = iterator;
+            }
+
+            /** @return true if the buffered (or newly pulled) partition has a key contained in {@code range} */
+            public boolean hasNext(AbstractBounds<PartitionPosition> range)
+            {
+                // Pull lazily: the wrapped iterator is only advanced when nothing is buffered
+                if (next == null && iterator.hasNext())
+                    next = iterator.next();
+
+                return next != null && range.contains(next.partitionKey());
+            }
+
+            /** @throws NoSuchElementException if no partition is currently buffered */
+            public UnfilteredRowIterator next()
+            {
+                if (next == null)
+                    throw new NoSuchElementException();
+
+                UnfilteredRowIterator result = next;
+                next = null;
+                return result;
+            }
+        }
+    }
+
+    private static class LocalSubrangeResponse extends ReadResponse // one range's view over a partition cursor
+    {
+        private final LocalResponse.RangeBoundPartitionIterator iterator; // cursor handed over by LocalResponse
+        private final AbstractBounds<PartitionPosition> range; // only partitions whose key is in this range are returned
+
+        LocalSubrangeResponse(LocalResponse.RangeBoundPartitionIterator iterator, AbstractBounds<PartitionPosition> range)
+        {
+            this.iterator = iterator;
+            this.range = range;
+        }
+
+        @Override
+        public UnfilteredPartitionIterator makeIterator(ReadCommand command)
+        {
+            return new AbstractUnfilteredPartitionIterator() // reads straight from the cursor field, nothing is copied
+            {
+                @Override
+                public TableMetadata metadata()
+                {
+                    return command.metadata();
+                }
+
+                @Override
+                public boolean hasNext()
+                {
+                    return iterator.hasNext(range); // false when the cursor is exhausted or its next partition is outside of the range
+                }
+
+                @Override
+                public UnfilteredRowIterator next()
+                {
+                    return iterator.next(); // throws NoSuchElementException if the cursor has nothing buffered
+                }
+            };
+        }
+
+        @Override
+        public ByteBuffer digest(ReadCommand command)
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public ByteBuffer repairedDataDigest()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public boolean isRepairedDigestConclusive()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public boolean mayIncludeRepairedDigest()
+        {
+            throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public boolean isDigestResponse()
+        {
+            return false;
+        }
+    }
+
+    /**
+     * A local response that needs to be serialized, i.e. sent to another node. The iterator is serialized by the build
+     * method and can be closed once this response exists. NOTE(review): built locally yet flagged FROM_REMOTE — confirm.
+     */
+    private static class LocalDataResponse extends DataResponse
+    {
+        private LocalDataResponse(UnfilteredPartitionIterator iterator, MultiRangeReadCommand command)
+        {
+            super(build(iterator, command.columnFilter()), MessagingService.current_version, DeserializationHelper.Flag.FROM_REMOTE);
+        }
+
+        private static ByteBuffer build(UnfilteredPartitionIterator iterator, ColumnFilter selection)
+        {
+            try (DataOutputBuffer buffer = new DataOutputBuffer())
+            {
+                UnfilteredPartitionIterators.serializerForIntraNode().serialize(iterator, selection, buffer, MessagingService.current_version);
+                return buffer.buffer();
+            }
+            catch (IOException e)
+            {
+                // We're serializing in memory so this shouldn't happen
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /**
+     * A response received from a remote node. We keep the response serialized in the byte buffer.
+     */
+    private static class RemoteDataResponse extends DataResponse
+    {
+        RemoteDataResponse(ByteBuffer data,
+                           int dataSerializationVersion)
+        {
+            super(data, dataSerializationVersion, DeserializationHelper.Flag.FROM_REMOTE);
+        }
+    }
+
+    /**
+     * The base class for local or remote responses that keep their payload serialized in a byte buffer
+     * (the {@code data} field) and deserialize it on demand.
+     */
+    static abstract class DataResponse extends MultiRangeReadResponse
+    {
+        // The response, serialized in the messaging version given by dataSerializationVersion
+        private final ByteBuffer data;
+        private final int dataSerializationVersion;
+        private final DeserializationHelper.Flag flag;
+
+        // Deserialized form of 'data', created lazily by the first subrangeResponse() call (no synchronization)
+        private MultiRangeReadResponse.LocalResponse cached;
+
+        DataResponse(ByteBuffer data,
+                     int dataSerializationVersion,
+                     DeserializationHelper.Flag flag)
+        {
+            this.data = data;
+            this.dataSerializationVersion = dataSerializationVersion;
+            this.flag = flag;
+        }
+
+        /**
+         * Deserializes the whole buffered response. The metadata and the column filter are taken from the
+         * given command because the serializer that creates remote responses has no command at hand.
+         */
+        @Override
+        public UnfilteredPartitionIterator makeIterator(ReadCommand command)
+        {
+            try (DataInputBuffer in = new DataInputBuffer(data, true))
+            {
+                return UnfilteredPartitionIterators.serializerForIntraNode().deserialize(in,
+                                                                                         dataSerializationVersion,
+                                                                                         command.metadata(),
+                                                                                         command.columnFilter(),
+                                                                                         flag);
+            }
+            catch (IOException e)
+            {
+                // We're deserializing in memory so this shouldn't happen
+                throw new RuntimeException(e);
+            }
+        }
+
+        /** Always the empty buffer. */
+        @Override
+        public ByteBuffer repairedDataDigest()
+        {
+            return ByteBufferUtil.EMPTY_BYTE_BUFFER;
+        }
+
+        @Override
+        public boolean isRepairedDigestConclusive()
+        {
+            return true;
+        }
+
+        /** @return true if the data was serialized with messaging version {@code VERSION_40} or later */
+        @Override
+        public boolean mayIncludeRepairedDigest()
+        {
+            return dataSerializationVersion >= MessagingService.VERSION_40;
+        }
+
+        /**
+         * Deserializes the buffered response on the first call only and answers every call from that single
+         * cached iterator. The command of the first call therefore decides the metadata and column filter used.
+         * NOTE(review): all ranges share one forward-only cursor, so callers presumably request the subranges
+         * in the order the partitions were serialized — confirm.
+         */
+        @Override
+        public ReadResponse subrangeResponse(MultiRangeReadCommand command, AbstractBounds<PartitionPosition> range)
+        {
+            if (cached == null)
+            {
+                @SuppressWarnings("resource") // The close operation is a noop for a deserialized UPI
+                UnfilteredPartitionIterator iterator = makeIterator(command);
+                cached = new LocalResponse(iterator);
+            }
+
+            return cached.subrangeResponse(command, range);
+        }
+    }
+
+    /**
+     * A digest-less copy of {@code ReadResponse.Serializer}; only {@code DataResponse} instances can be written.
+     */
+    private static class Serializer implements IVersionedSerializer<ReadResponse>
+    {
+        @Override
+        public void serialize(ReadResponse response, DataOutputPlus out, int version) throws IOException
+        {
+            // The digest slot is always written empty: digest responses are not supported
+            ByteBufferUtil.writeWithVIntLength(ByteBufferUtil.EMPTY_BYTE_BUFFER, out);
+            if (version >= MessagingService.VERSION_40)
+            {
+                ByteBufferUtil.writeWithVIntLength(response.repairedDataDigest(), out);
+                out.writeBoolean(response.isRepairedDigestConclusive());
+            }
+            ByteBufferUtil.writeWithVIntLength(((DataResponse) response).data, out);
+        }
+
+        @Override
+        public ReadResponse deserialize(DataInputPlus in, int version) throws IOException
+        {
+            ByteBuffer emptyDigest = ByteBufferUtil.readWithVIntLength(in);
+            assert !emptyDigest.hasRemaining();
+
+            if (version >= MessagingService.VERSION_40)
+            {
+                ByteBufferUtil.readWithVIntLength(in); // repaired data digest, read and discarded
+                in.readBoolean();                      // repaired digest conclusive flag, read and discarded
+            }
+            return new RemoteDataResponse(ByteBufferUtil.readWithVIntLength(in), version);
+        }
+
+        @Override
+        public long serializedSize(ReadResponse response, int version)
+        {
+            // Mirrors serialize(): empty digest, optional repaired data info, then the data
+            long total = ByteBufferUtil.serializedSizeWithVIntLength(ByteBufferUtil.EMPTY_BYTE_BUFFER);
+
+            if (version >= MessagingService.VERSION_40)
+            {
+                total += ByteBufferUtil.serializedSizeWithVIntLength(response.repairedDataDigest());
+                total += 1; // the boolean written for isRepairedDigestConclusive()
+            }
+            assert version >= MessagingService.VERSION_30;
+            total += ByteBufferUtil.serializedSizeWithVIntLength(((DataResponse) response).data);
+            return total;
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
index 434cb692c555..dc95e787cb11 100644
--- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
@@ -69,6 +69,26 @@ private PartitionRangeReadCommand(boolean isDigest,
         this.dataRange = dataRange;
     }
 
+    public static PartitionRangeReadCommand create(TableMetadata metadata,
+                                                   int nowInSec,
+                                                   ColumnFilter columnFilter,
+                                                   RowFilter rowFilter,
+                                                   DataLimits limits,
+                                                   DataRange dataRange,
+                                                   Index.QueryPlan indexQueryPlan)
+    {
+        return new PartitionRangeReadCommand(false, // isDigest: this factory always builds a non-digest command
+                                             0, // presumably digestVersion — TODO confirm against the constructor
+                                             false, // presumably acceptsTransient — TODO confirm against the constructor
+                                             metadata,
+                                             nowInSec,
+                                             columnFilter,
+                                             rowFilter,
+                                             limits,
+                                             dataRange,
+                                             indexQueryPlan);
+    }
+
     public static PartitionRangeReadCommand create(TableMetadata metadata,
                                                    int nowInSec,
                                                    ColumnFilter columnFilter,
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index 3c999c8d4044..1de5f4c68289 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -97,7 +97,7 @@ public abstract class ReadCommand extends AbstractReadQuery
     int oldestUnrepairedTombstone = Integer.MAX_VALUE;
 
     @Nullable
-    private final Index.QueryPlan indexQueryPlan;
+    protected final Index.QueryPlan indexQueryPlan;
 
     protected static abstract class SelectionDeserializer
     {
@@ -117,7 +117,8 @@ public abstract ReadCommand deserialize(DataInputPlus in,
     protected enum Kind
     {
         SINGLE_PARTITION (SinglePartitionReadCommand.selectionDeserializer),
-        PARTITION_RANGE  (PartitionRangeReadCommand.selectionDeserializer);
+        PARTITION_RANGE  (PartitionRangeReadCommand.selectionDeserializer),
+        MULTI_RANGE      (MultiRangeReadCommand.selectionDeserializer);
 
         private final SelectionDeserializer selectionDeserializer;
 
@@ -386,6 +387,16 @@ public ReadResponse createResponse(UnfilteredPartitionIterator iterator)
              : ReadResponse.createDataResponse(iterator, this);
     }
 
+    public DataLimits.Counter createLimitedCounter(boolean assumeLiveData)
+    {
+        return limits().newCounter(nowInSec(), assumeLiveData, selectsFullPartition(), metadata().enforceStrictLiveness()).onlyCount();
+    }
+
+    public DataLimits.Counter createUnlimitedCounter(boolean assumeLiveData)
+    {
+        return DataLimits.NONE.newCounter(nowInSec(), assumeLiveData, selectsFullPartition(), metadata().enforceStrictLiveness());
+    }
+
     long indexSerializedSize(int version)
     {
         return null != indexQueryPlan
@@ -436,15 +447,13 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
         long startTimeNanos = System.nanoTime();
 
         ColumnFamilyStore cfs = Keyspace.openAndGetStore(metadata());
-        Index.QueryPlan indexQueryPlan = indexQueryPlan();
 
         Index.Searcher searcher = null;
         if (indexQueryPlan != null)
         {
             cfs.indexManager.checkQueryability(indexQueryPlan);
-
+            searcher = indexSearcher();
             Index index = indexQueryPlan.getFirst();
-            searcher = indexQueryPlan.searcherFor(this);
             Tracing.trace("Executing read on {}.{} using index {}", cfs.metadata.keyspace, cfs.metadata.name, index.getIndexMetadata().name);
         }
 
@@ -457,7 +466,8 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
             repairedDataInfo = new RepairedDataInfo(repairedReadCount);
         }
 
-        UnfilteredPartitionIterator iterator = (null == searcher) ? queryStorage(cfs, executionController) : searcher.search(executionController);
+        UnfilteredPartitionIterator iterator = (null == searcher) ? queryStorage(cfs, executionController)
+                                                                  : searchStorage(searcher, executionController);
         iterator = RTBoundValidator.validate(iterator, Stage.MERGED, false);
 
         try
@@ -507,6 +517,11 @@ public UnfilteredPartitionIterator executeLocally(ReadExecutionController execut
         }
     }
 
+    public UnfilteredPartitionIterator searchStorage(Index.Searcher searcher, ReadExecutionController executionController)
+    {
+        return searcher.search(executionController);
+    }
+
     protected abstract void recordLatency(TableMetrics metric, long latencyNanos);
 
     /**
diff --git a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java
index 2c28ed9d4b8f..293ced084b32 100644
--- a/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java
+++ b/src/java/org/apache/cassandra/db/ReadCommandVerbHandler.java
@@ -81,8 +81,10 @@ private void validateTransientStatus(Message<ReadCommand> message)
 
         if (command instanceof SinglePartitionReadCommand)
             token = ((SinglePartitionReadCommand) command).partitionKey().getToken();
-        else
+        else if (command instanceof PartitionRangeReadCommand)
             token = ((PartitionRangeReadCommand) command).dataRange().keyRange().right.getToken();
+        else
+            return;
 
         Replica replica = Keyspace.open(command.metadata().keyspace)
                                   .getReplicationStrategy()
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
index bd6985231dc0..7dd0f53aefe0 100644
--- a/src/java/org/apache/cassandra/index/Index.java
+++ b/src/java/org/apache/cassandra/index/Index.java
@@ -897,6 +897,14 @@ default Function<PartitionIterator, PartitionIterator> postProcessor()
          *         the index was used to narrow the initial result set
          */
         RowFilter postIndexQueryFilter();
+
+        /**
+         * @return true if the indexes in this plan support querying multiple vnode ranges at once.
+         */
+        default boolean supportsMultiRangeReadCommand()
+        {
+            return false;
+        }
     }
 
     /*
diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
index 26ee729714b1..72e7592fe833 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DataRange;
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.MultiRangeReadCommand;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.PartitionRangeReadCommand;
 import org.apache.cassandra.db.ReadCommand;
@@ -385,6 +386,11 @@ else if (command instanceof PartitionRangeReadCommand)
             PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
             return Lists.newArrayList(cmd.dataRange());
         }
+        else if (command instanceof MultiRangeReadCommand)
+        {
+            MultiRangeReadCommand cmd = (MultiRangeReadCommand) command;
+            return cmd.ranges();
+        }
         else
         {
             throw new AssertionError("Unsupported read command type: " + command.getClass().getName());
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
index cbb21784fd32..5c11aee35cd1 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
@@ -133,10 +133,9 @@ public RowFilter postIndexQueryFilter()
         return postIndexFilter;
     }
 
-    //TODO Do we need to support this
-//    @Override
-//    public boolean supportsMultiRangeReadCommand()
-//    {
-//        return true;
-//    }
+    @Override
+    public boolean supportsMultiRangeReadCommand()
+    {
+        return true;
+    }
 }
diff --git a/src/java/org/apache/cassandra/net/Message.java b/src/java/org/apache/cassandra/net/Message.java
index 214c5c0ff6e0..17f157dd967a 100644
--- a/src/java/org/apache/cassandra/net/Message.java
+++ b/src/java/org/apache/cassandra/net/Message.java
@@ -234,6 +234,18 @@ public static <T> Message<T> internalResponse(Verb verb, T payload)
         return outWithParam(0, verb, payload, null, null);
     }
 
+    /**
+     * Used by the {@code MultiRangeReadCommand} to split multi-range responses from a replica
+     * into single-range responses.
+     */
+    public static <T> Message<T> remoteResponse(InetAddressAndPort from, Verb verb, T payload)
+    {
+        assert verb.isResponse();
+        long createdAtNanos = approxTime.now();
+        long expiresAtNanos = verb.expiresAtNanos(createdAtNanos);
+        return new Message<>(new Header(0, verb, from, createdAtNanos, expiresAtNanos, 0, NO_PARAMS), payload);
+    }
+
     /** Builds a response Message with provided payload, and all the right fields inferred from request Message */
     public <T> Message<T> responseWith(T payload)
     {
diff --git a/src/java/org/apache/cassandra/net/Verb.java b/src/java/org/apache/cassandra/net/Verb.java
index fad2fbf6a9ec..5df2f7ed33ee 100644
--- a/src/java/org/apache/cassandra/net/Verb.java
+++ b/src/java/org/apache/cassandra/net/Verb.java
@@ -34,6 +34,8 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.CounterMutation;
 import org.apache.cassandra.db.CounterMutationVerbHandler;
+import org.apache.cassandra.db.MultiRangeReadCommand;
+import org.apache.cassandra.db.MultiRangeReadResponse;
 import org.apache.cassandra.db.Mutation;
 import org.apache.cassandra.db.MutationVerbHandler;
 import org.apache.cassandra.db.ReadCommand;
@@ -127,6 +129,8 @@ public enum Verb
     READ_REQ               (3,   P3, readTimeout,     READ,              () -> ReadCommand.serializer,               () -> ReadCommandVerbHandler.instance,     READ_RSP            ),
     RANGE_RSP              (69,  P2, rangeTimeout,    REQUEST_RESPONSE,  () -> ReadResponse.serializer,              () -> ResponseVerbHandler.instance                             ),
     RANGE_REQ              (9,   P3, rangeTimeout,    READ,              () -> ReadCommand.serializer,               () -> ReadCommandVerbHandler.instance,     RANGE_RSP           ),
+    MULTI_RANGE_RSP        (67,  P2, rangeTimeout,    REQUEST_RESPONSE,  () -> MultiRangeReadResponse.serializer,    () -> ResponseVerbHandler.instance                             ),
+    MULTI_RANGE_REQ        (7,   P3, rangeTimeout,    READ,              () -> MultiRangeReadCommand.serializer,     () -> ReadCommandVerbHandler.instance,     MULTI_RANGE_RSP     ),
 
     GOSSIP_DIGEST_SYN      (14,  P0, longTimeout,     GOSSIP,            () -> GossipDigestSyn.serializer,           () -> GossipDigestSynVerbHandler.instance                      ),
     GOSSIP_DIGEST_ACK      (15,  P0, longTimeout,     GOSSIP,            () -> GossipDigestAck.serializer,           () -> GossipDigestAckVerbHandler.instance                      ),
@@ -450,4 +454,4 @@ class VerbTimeouts
     static final ToLongFunction<TimeUnit> pingTimeout     = DatabaseDescriptor::getPingTimeout;
     static final ToLongFunction<TimeUnit> longTimeout     = units -> Math.max(DatabaseDescriptor.getRpcTimeout(units), units.convert(5L, TimeUnit.MINUTES));
     static final ToLongFunction<TimeUnit> noTimeout       = units -> { throw new IllegalStateException(); };
-}
\ No newline at end of file
+}
diff --git a/src/java/org/apache/cassandra/service/reads/DataResolver.java b/src/java/org/apache/cassandra/service/reads/DataResolver.java
index e3ec4e1b8c18..2017a8097639 100644
--- a/src/java/org/apache/cassandra/service/reads/DataResolver.java
+++ b/src/java/org/apache/cassandra/service/reads/DataResolver.java
@@ -140,9 +140,9 @@ private boolean needsReplicaFilteringProtection()
         return index.supportsReplicaFilteringProtection(command.rowFilter());
     }
 
-    private class ResolveContext
+    protected class ResolveContext
     {
-        private final E replicas;
+        public final E replicas;
         private final DataLimits.Counter mergedResultCounter;
 
         private ResolveContext(E replicas)
@@ -159,7 +159,12 @@ private boolean needsReadRepair()
             return replicas.size() > 1;
         }
 
-        private boolean needShortReadProtection()
+        public DataLimits.Counter mergedResultCounter()
+        {
+            return mergedResultCounter;
+        }
+
+        public boolean needShortReadProtection()
         {
             // If we have only one result, there is no read repair to do and we can't get short reads
             // Also, so-called "short reads" stems from nodes returning only a subset of the results they have for a
@@ -175,19 +180,24 @@ private interface ResponseProvider
         UnfilteredPartitionIterator getResponse(int i);
     }
 
-    private UnfilteredPartitionIterator shortReadProtectedResponse(int i, ResolveContext context)
+    protected UnfilteredPartitionIterator shortReadProtectedResponse(int i, ResolveContext context)
     {
         UnfilteredPartitionIterator originalResponse = responses.get(i).payload.makeIterator(command);
 
-        return context.needShortReadProtection()
-               ? ShortReadProtection.extend(context.replicas.get(i),
-                                            () -> responses.clearUnsafe(i),
-                                            originalResponse,
-                                            command,
-                                            context.mergedResultCounter,
-                                            queryStartNanoTime,
-                                            enforceStrictLiveness)
-               : originalResponse;
+        if (context.needShortReadProtection())
+        {
+            DataLimits.Counter singleResultCounter = command.createLimitedCounter(false);
+            return ShortReadProtection.extend(originalResponse,
+                                              command,
+                                              new ShortReadPartitionsProtection(command,
+                                                                                context.replicas.get(i),
+                                                                                () -> responses.clearUnsafe(i),
+                                                                                singleResultCounter,
+                                                                                context.mergedResultCounter(),
+                                                                                queryStartNanoTime),
+                                              singleResultCounter);
+        }
+        return originalResponse;
     }
 
     private PartitionIterator resolveWithReadRepair(ResolveContext context,
diff --git a/src/java/org/apache/cassandra/service/reads/ReadCallback.java b/src/java/org/apache/cassandra/service/reads/ReadCallback.java
index b7ee18cabc8d..76add52a737d 100644
--- a/src/java/org/apache/cassandra/service/reads/ReadCallback.java
+++ b/src/java/org/apache/cassandra/service/reads/ReadCallback.java
@@ -83,6 +83,11 @@ protected P replicaPlan()
         return replicaPlan.get();
     }
 
+    public ReadCommand command()
+    {
+        return command;
+    }
+
     public boolean await(long timePastStart, TimeUnit unit)
     {
         long time = unit.toNanos(timePastStart) - (System.nanoTime() - queryStartNanoTime);
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
index 51043c352f6e..10a964d6fc31 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
@@ -55,12 +55,13 @@ public class ShortReadPartitionsProtection extends Transformation<UnfilteredRowI
 
     private final Runnable preFetchCallback; // called immediately before fetching more contents
 
-    private final DataLimits.Counter singleResultCounter; // unmerged per-source counter
+    protected final DataLimits.Counter singleResultCounter; // unmerged per-source counter
     private final DataLimits.Counter mergedResultCounter; // merged end-result counter
 
     private DecoratedKey lastPartitionKey; // key of the last observed partition
 
     private boolean partitionsFetched; // whether we've seen any new partitions since iteration start or last moreContents() call
+    protected boolean rangeFetched = false; // fetched by original read request or SRP request
 
     private final long queryStartNanoTime;
 
@@ -134,9 +135,10 @@ public UnfilteredPartitionIterator moreContents()
          * Either we had an empty iterator as the initial response, or our moreContents() call got us an empty iterator.
          * There is no point to ask the replica for more rows - it has no more in the requested range.
          */
-        if (!partitionsFetched)
+        if (rangeExhausted())
             return null;
         partitionsFetched = false;
+        rangeFetched = true;
 
         /*
          * We are going to fetch one partition at a time for thrift and potentially more for CQL.
@@ -158,6 +160,11 @@ public UnfilteredPartitionIterator moreContents()
         return makeAndExecuteFetchAdditionalPartitionReadCommand(toQuery);
     }
 
+    public boolean rangeExhausted()
+    {
+        return !partitionsFetched; // true when no new partition was seen since iteration start or the last moreContents() call
+    }
+
     private UnfilteredPartitionIterator makeAndExecuteFetchAdditionalPartitionReadCommand(int toQuery)
     {
         PartitionRangeReadCommand cmd = (PartitionRangeReadCommand) command;
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java
index a1bdc0e8b3e5..50f82bbdde0f 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadProtection.java
@@ -18,16 +18,11 @@
 
 package org.apache.cassandra.service.reads;
 
-import java.net.InetAddress;
-
-
 import org.apache.cassandra.db.ReadCommand;
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.transform.MorePartitions;
 import org.apache.cassandra.db.transform.Transformation;
-import org.apache.cassandra.locator.InetAddressAndPort;
-import org.apache.cassandra.locator.Replica;
 
 /**
  * We have a potential short read if the result from a given node contains the requested number of rows
@@ -42,26 +37,11 @@
 public class ShortReadProtection
 {
     @SuppressWarnings("resource")
-    public static UnfilteredPartitionIterator extend(Replica source,
-                                                     Runnable preFetchCallback,
-                                                     UnfilteredPartitionIterator partitions,
+    public static UnfilteredPartitionIterator extend(UnfilteredPartitionIterator partitions,
                                                      ReadCommand command,
-                                                     DataLimits.Counter mergedResultCounter,
-                                                     long queryStartNanoTime,
-                                                     boolean enforceStrictLiveness)
+                                                     ShortReadPartitionsProtection protection,
+                                                     DataLimits.Counter singleResultCounter)
     {
-        DataLimits.Counter singleResultCounter = command.limits().newCounter(command.nowInSec(),
-                                                                             false,
-                                                                             command.selectsFullPartition(),
-                                                                             enforceStrictLiveness).onlyCount();
-
-        ShortReadPartitionsProtection protection = new ShortReadPartitionsProtection(command,
-                                                                                     source,
-                                                                                     preFetchCallback,
-                                                                                     singleResultCounter,
-                                                                                     mergedResultCounter,
-                                                                                     queryStartNanoTime);
-
         /*
          * The order of extention and transformations is important here. Extending with more partitions has to happen
          * first due to the way BaseIterator.hasMoreContents() works: only transformations applied after extension will
diff --git a/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingCoordinator.java b/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingCoordinator.java
new file mode 100644
index 000000000000..876982cd09a5
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingCoordinator.java
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.db.MultiRangeReadCommand;
+import org.apache.cassandra.db.MultiRangeReadResponse;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.ReadResponse;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.PartitionIterators;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.exceptions.RequestFailureReason;
+import org.apache.cassandra.locator.Endpoints;
+import org.apache.cassandra.locator.EndpointsForRange;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.net.Message;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.net.RequestCallback;
+import org.apache.cassandra.net.Verb;
+import org.apache.cassandra.service.reads.DataResolver;
+import org.apache.cassandra.service.reads.ReadCallback;
+import org.apache.cassandra.service.reads.ShortReadPartitionsProtection;
+import org.apache.cassandra.service.reads.ShortReadProtection;
+import org.apache.cassandra.service.reads.repair.NoopReadRepair;
+import org.apache.cassandra.service.reads.repair.ReadRepair;
+
+/**
+ * Coordinates endpoint-grouped range queries for a batch of vnode ranges sized by the concurrency factor:
+ * <ol>
+ *     <li> Collect, in token order, the token ranges required by the concurrency factor and group them by endpoint.
+ *     <li> Create a single-range read callback for each vnode range; note that:
+ *        <ul>
+ *           <li> To keep single-result counting correct for short-read protection, a single-range read callback
+ *                cannot start resolving before the previous one has finished resolving.
+ *        </ul>
+ *     <li> Execute a {@link MultiRangeReadCommand} on each selected endpoint with all its replicated ranges at once.
+ *     <li> Upon receiving an individual {@link MultiRangeReadResponse}:
+ *        <ol>
+ *           <li> Split the multi-range response into single-range responses by queried vnode range.
+ *           <li> Pass each single-range response to its corresponding single-range read callback to allow progressive data merging.
+ *        </ol>
+ *     <li> Return the single-range handlers' results in token order.
+ * </ol>
+ */
+public class EndpointGroupingCoordinator
+{
+    private final PartitionRangeReadCommand command; // full range command; narrowed per replica plan via forSubRange()
+    private final DataLimits.Counter counter; // applied to the concatenated per-range results in execute()
+    private final Map<InetAddressAndPort, EndpointQueryContext> endpointContexts; // one context per contacted endpoint
+    private final List<ReadCallback<EndpointsForRange, ReplicaPlan.ForRangeRead>> perRangeHandlers; // NOTE(review): filled in createResponse() but never read in this class
+    private final List<PartitionIterator> concurrentQueries; // per-range responses, in replica plan iteration order
+
+    private final long queryStartNanoTime;
+    private final int vnodeRanges; // sum of vnodeCount() over the replica plans consumed by the constructor
+
+    /**
+     * @param command current range read command
+     * @param counter the unlimited counter for the command
+     * @param replicaPlans replica plans to query; consumed until {@code concurrencyFactor} vnode ranges are covered or none remain
+     * @param concurrencyFactor target number of vnode ranges to query at once
+     * @param queryStartNanoTime the start time of the query, in nanoseconds
+     */
+    public EndpointGroupingCoordinator(PartitionRangeReadCommand command,
+                                       DataLimits.Counter counter,
+                                       Iterator<ReplicaPlan.ForRangeRead> replicaPlans,
+                                       int concurrencyFactor,
+                                       long queryStartNanoTime)
+    {
+        this.command = command;
+        this.counter = counter;
+        this.queryStartNanoTime = queryStartNanoTime;
+        this.endpointContexts = new HashMap<>();
+
+        // Read callbacks, in the (token) order the replica plans are consumed
+        perRangeHandlers = new ArrayList<>(concurrencyFactor);
+        // Range responses, in the (token) order the replica plans are consumed
+        concurrentQueries = new ArrayList<>(concurrencyFactor);
+        int vnodeRanges = 0; // local tally; hides the final field, which is assigned after the loop
+
+        while (replicaPlans.hasNext() && vnodeRanges < concurrencyFactor)
+        {
+            ReplicaPlan.ForRangeRead replicaPlan = replicaPlans.next();
+
+            boolean isFirst = vnodeRanges == 0; // only the first sub-range command of the batch is flagged as first
+            vnodeRanges += replicaPlan.vnodeCount(); // limit is checked before adding, so the total may exceed concurrencyFactor
+            concurrentQueries.add(createResponse(replicaPlan, isFirst));
+        }
+        this.vnodeRanges = vnodeRanges;
+    }
+
+    public int vnodeRanges()
+    {
+        return vnodeRanges;
+    }
+
+    public PartitionIterator execute()
+    {
+        for (EndpointQueryContext replica : replicas())
+            replica.queryReplica(); // one multi-range request per endpoint
+
+        return counter.applyTo(PartitionIterators.concat(concurrentQueries)); // per-range results concatenated in list order
+    }
+
+    @VisibleForTesting
+    Collection<EndpointQueryContext> endpointRanges()
+    {
+        return endpointContexts.values(); // NOTE(review): same collection that replicas() returns
+    }
+
+    /**
+     * @return number of endpoints to be queried
+     */
+    int endpoints()
+    {
+        return endpointContexts.size();
+    }
+
+    private Collection<EndpointQueryContext> replicas()
+    {
+        return endpointContexts.values();
+    }
+
+    /**
+     * Create a {@link SingleRangeResponse} for the range of the given replica plan and register its read callback with
+     * every contacted endpoint. The responses are collected by the constructor and concatenated by {@code execute}.
+     */
+    private SingleRangeResponse createResponse(ReplicaPlan.ForRangeRead replicaPlan, boolean isFirst)
+    {
+        PartitionRangeReadCommand subrangeCommand = command.forSubRange(replicaPlan.range(), isFirst);
+
+        ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(replicaPlan);
+
+        DataResolver<EndpointsForRange, ReplicaPlan.ForRangeRead> resolver =
+                new EndpointDataResolver(subrangeCommand, sharedReplicaPlan, NoopReadRepair.instance, queryStartNanoTime);
+
+        // Create a handler for the range and add it to the context of each contacted replica's endpoint (created on first use).
+        ReadCallback<EndpointsForRange, ReplicaPlan.ForRangeRead> handler =
+                new ReadCallback<>(resolver, subrangeCommand, sharedReplicaPlan, queryStartNanoTime);
+        
+        perRangeHandlers.add(handler);
+        for (Replica replica : replicaPlan.contacts())
+        {
+            endpointContexts.computeIfAbsent(replica.endpoint(),
+                                             k -> new EndpointQueryContext(replica.endpoint(),
+                                                                           command.createLimitedCounter(false))).add(handler);
+        }
+        return new SingleRangeResponse(resolver, handler, NoopReadRepair.instance);
+    }
+
+    /**
+     * Collects the read callbacks of all ranges involving a given endpoint and queries that endpoint for all of them at once.
+     */
+    public static class EndpointQueryContext
+    {
+        private final InetAddressAndPort endpoint;
+        private final List<ReadCallback<?, ?>> handlers;
+        // used by SRP to track the data fetched from this endpoint and so determine whether the endpoint is exhausted,
+        // i.e. no more data can be fetched.
+        private final DataLimits.Counter singleResultCounter;
+
+        private MultiRangeReadCommand multiRangeCommand; // null until queryReplica() is called
+
+        public EndpointQueryContext(InetAddressAndPort endpoint, DataLimits.Counter singleResultCounter)
+        {
+            this.endpoint = endpoint;
+            this.handlers = new ArrayList<>();
+            this.singleResultCounter = singleResultCounter;
+        }
+
+        /**
+         * @param handler read callback for a given vnode range on the current endpoint; must be added before the endpoint is queried
+         */
+        public void add(ReadCallback<?, ?> handler)
+        {
+            assert multiRangeCommand == null : "Cannot add range to already queried context";
+            handlers.add(handler);
+        }
+
+        /**
+         * Query a single endpoint with multiple vnode ranges asynchronously. May only be called once per context.
+         */
+        public void queryReplica()
+        {
+            assert multiRangeCommand == null : "Can only query given endpoint once";
+            this.multiRangeCommand = MultiRangeReadCommand.create(handlers);
+
+            SingleEndpointCallback proxy = new SingleEndpointCallback();
+            Message<ReadCommand> message = multiRangeCommand.createMessage(false);
+            MessagingService.instance().sendWithCallback(message, endpoint, proxy);
+        }
+
+        @VisibleForTesting
+        public int rangesCount()
+        {
+            return handlers.size();
+        }
+
+        /**
+         * A proxy responsible for:
+         * 0. propagating failure/timeout to single-range handlers
+         * 1. receiving multi-range responses from a given endpoint
+         * 2. splitting the multi-range responses by vnode ranges
+         * 3. passing each split single-range response to its corresponding read callback, which will
+         *    start resolving responses once it has enough responses for the consistency level requirement.
+         */
+        private class SingleEndpointCallback implements RequestCallback<ReadResponse>
+        {
+            @Override
+            public void onResponse(Message<ReadResponse> response)
+            {
+                // split the single-endpoint multi-range response and hand each part to its per-range handler.
+                MultiRangeReadResponse multiRangeResponse = (MultiRangeReadResponse) response.payload;
+                for (ReadCallback<?, ?> handler : handlers)
+                {
+                    AbstractBounds<PartitionPosition> range = ((PartitionRangeReadCommand) handler.command()).dataRange().keyRange();
+
+                    // extract the response for this handler's key range (handlers are visited in the order they were added)
+                    ReadResponse subrangeResponse = multiRangeResponse.subrangeResponse(multiRangeCommand, range);
+                    handler.onResponse(Message.remoteResponse(response.header.from, Verb.RANGE_RSP, subrangeResponse));
+                }
+            }
+
+            @Override
+            public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason)
+            {
+                for (ReadCallback<?, ?> handler : handlers)
+                    handler.onFailure(from, failureReason); // every range queried on this endpoint sees the failure
+            }
+
+            @Override
+            public boolean invokeOnFailure()
+            {
+                return true;
+            }
+
+            @Override
+            public boolean trackLatencyForSnitch()
+            {
+                return true;
+            }
+        }
+    }
+
+    /**
+     * Short-read-protection (SRP) needs to know whether an endpoint has more data or has already reached the limit:
+     * If the endpoint has no more data, i.e. the counter hasn't reached the limit, there is no point in doing SRP.
+     * If the endpoint might have more data, i.e. the counter has reached the limit, SRP might be needed.
+     *
+     * With a token-ordered range query or a single partition query, {@link DataResolver} uses a new single result counter
+     * per replica for a given range, as all replicas are queried with the same range.
+     *
+     * But with endpoint grouping, each source is queried with different token ranges. So we need a shared
+     * cross-range counter for each replica to know whether the given endpoint has more data.
+     */
+    private class EndpointDataResolver<E extends Endpoints<E>, P extends ReplicaPlan.ForRead<E>> extends DataResolver<E, P>
+    {
+        public EndpointDataResolver(ReadCommand command, ReplicaPlan.Shared replicaPlan, ReadRepair readRepair, long queryStartNanoTime)
+        {
+            super(command, replicaPlan, readRepair, queryStartNanoTime);
+        }
+
+        @Override
+        protected UnfilteredPartitionIterator shortReadProtectedResponse(int i, DataResolver.ResolveContext context)
+        {
+            UnfilteredPartitionIterator originalResponse = responses.get(i).payload.makeIterator(command);
+
+            if (context.needShortReadProtection())
+            {
+                DataLimits.Counter singleResultCounter = endpointContexts.get(context.replicas.get(i).endpoint()).singleResultCounter;
+                return ShortReadProtection.extend(originalResponse,
+                                                  command,
+                                                  new EndpointShortReadResponseProtection(command,
+                                                                                          context.replicas.get(i),
+                                                                                          () -> responses.clearUnsafe(i),
+                                                                                          singleResultCounter,
+                                                                                          context.mergedResultCounter(),
+                                                                                          queryStartNanoTime),
+                                                  singleResultCounter); // the endpoint's cross-range counter from its EndpointQueryContext
+            }
+            else
+                return originalResponse;
+        }
+
+        /**
+         * On the replica, {@link MultiRangeReadCommand} stops fetching the remaining ranges when it reaches the limit.
+         *
+         * We should do short-read-protection if the current range was not fetched because of that limit.
+         */
+        public class EndpointShortReadResponseProtection extends ShortReadPartitionsProtection
+        {
+            public EndpointShortReadResponseProtection(ReadCommand command,
+                                                       Replica source,
+                                                       Runnable preFetchCallback,
+                                                       DataLimits.Counter singleResultCounter,
+                                                       DataLimits.Counter mergedResultCounter,
+                                                       long queryStartNanoTime)
+            {
+                super(command, source, preFetchCallback, singleResultCounter, mergedResultCounter, queryStartNanoTime);
+            }
+
+            @Override
+            public boolean rangeExhausted()
+            {
+                // if the range has not been fetched yet (rangeFetched is false) and the endpoint counter is done, SRP is still needed.
+                return super.rangeExhausted() && (rangeFetched || !singleResultCounter.isDone());
+            }
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIterator.java
new file mode 100644
index 000000000000..0e3579c297c2
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIterator.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.CloseableIterator;
+
+/**
+ * A range command iterator that executes requests grouped by endpoint and then merges responses in token order. It's designed to
+ * reduce the number of range requests when scanning the whole token ring (e.g. when rows per range is low) for all range
+ * reads that don't use digests and also to reduce the amount of disk-access for storage-attached indexes, as they will
+ * be able to read index content for all required ranges at once.
+ *
+ * <ul>
+ *     <li> With the non-grouping range command iterator, scanning the entire ring requires "num_of_nodes * num_of_tokens * consistency"
+ *     range requests (assuming no ranges are merged by {@link ReplicaPlanMerger}) to their respective replicas.
+ *
+ *     <li> With the endpoint grouping range command iterator, scanning the entire ring only requires at most "num_of_nodes" multi-range
+ *     requests to their respective replicas. So the coordinator will cache up to "num_of_nodes" responses.
+ * </ul>
+ */
+public class EndpointGroupingRangeCommandIterator extends RangeCommandIterator
+{
+    EndpointGroupingRangeCommandIterator(CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans,
+                                         PartitionRangeReadCommand command,
+                                         int concurrencyFactor,
+                                         int maxConcurrencyFactor,
+                                         int totalRangeCount,
+                                         long queryStartNanoTime)
+    {
+        super(replicaPlans, command, concurrencyFactor, maxConcurrencyFactor, totalRangeCount, queryStartNanoTime);
+    }
+
+    @Override
+    protected PartitionIterator sendNextRequests()
+    {
+        counter = command.createUnlimitedCounter(true); // fresh counter for this batch; applied to the results in coordinator.execute()
+
+        EndpointGroupingCoordinator coordinator = new EndpointGroupingCoordinator(command,
+                                                                                  counter,
+                                                                                  replicaPlans,
+                                                                                  concurrencyFactor(),
+                                                                                  queryStartNanoTime);
+        PartitionIterator partitions = coordinator.execute(); // sends one multi-range request per endpoint
+
+        rangesQueried += coordinator.vnodeRanges(); // vnode ranges covered by the replica plans this batch consumed
+        batchesRequested++;
+        Tracing.trace("Submitted concurrent grouped range read requests to {} endpoints", coordinator.endpoints());
+        return partitions;
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/reads/range/NonGroupingRangeCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/NonGroupingRangeCommandIterator.java
new file mode 100644
index 000000000000..9cbfb9c08f27
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/reads/range/NonGroupingRangeCommandIterator.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.locator.EndpointsForRange;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.net.Message;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.reads.DataResolver;
+import org.apache.cassandra.service.reads.ReadCallback;
+import org.apache.cassandra.service.reads.repair.ReadRepair;
+import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.CloseableIterator;
+
+public class NonGroupingRangeCommandIterator extends RangeCommandIterator // issues one range read per replica plan, to each contacted replica
+{
+    NonGroupingRangeCommandIterator(CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans,
+                                    PartitionRangeReadCommand command,
+                                    int concurrencyFactor,
+                                    int maxConcurrencyFactor,
+                                    int totalRangeCount,
+                                    long queryStartNanoTime)
+    {
+        super(replicaPlans, command, concurrencyFactor, maxConcurrencyFactor, totalRangeCount, queryStartNanoTime);
+    }
+
+    protected PartitionIterator sendNextRequests()
+    {
+        List<PartitionIterator> concurrentQueries = new ArrayList<>(concurrencyFactor);
+        List<ReadRepair<?, ?>> readRepairs = new ArrayList<>(concurrencyFactor);
+
+        try
+        {
+            for (int i = 0; i < concurrencyFactor() && replicaPlans.hasNext(); ) // i advances by vnode count inside the body
+            {
+                ReplicaPlan.ForRangeRead replicaPlan = replicaPlans.next();
+
+                @SuppressWarnings("resource") // response will be closed by concatAndBlockOnRepair, or in the catch block below
+                SingleRangeResponse response = query(replicaPlan, i == 0);
+                concurrentQueries.add(response);
+                readRepairs.add(response.getReadRepair());
+                // due to RangeMerger, the coordinator may fetch more ranges than required by the concurrency factor.
+                rangesQueried += replicaPlan.vnodeCount();
+                i += replicaPlan.vnodeCount();
+            }
+            batchesRequested++;
+        }
+        catch (Throwable t)
+        {
+            for (PartitionIterator response : concurrentQueries) // close the responses already created before rethrowing
+                response.close();
+            throw t;
+        }
+
+        Tracing.trace("Submitted {} concurrent range requests", concurrentQueries.size());
+        // We want to count the results for the sake of updating the concurrency factor (see updateConcurrencyFactor)
+        // but we don't want to enforce any particular limit at this point (this could break code that relies on
+        // postReconciliationProcessing), hence the unlimited counter that uses DataLimits.NONE.
+        counter = command.createUnlimitedCounter(true);
+        return counter.applyTo(StorageProxy.concatAndBlockOnRepair(concurrentQueries, readRepairs));
+    }
+
+    /**
+     * Queries the provided sub-range.
+     *
+     * @param replicaPlan the replica plan whose range is queried.
+     * @param isFirst in the case where multiple queries are sent in parallel, whether that's the first query on
+     * that batch or not. The reason it matters is that when paging queries, the command (more specifically the
+     * {@code DataLimits}) may have "state" information and that state may only be valid for the first query (in
+     * that it's the query that "continues" whatever we've previously queried).
+     */
+    private SingleRangeResponse query(ReplicaPlan.ForRangeRead replicaPlan, boolean isFirst)
+    {
+        PartitionRangeReadCommand rangeCommand = command.forSubRange(replicaPlan.range(), isFirst);
+        // If enabled, request repaired data tracking info from full replicas but
+        // only if there are multiple full replicas to compare results from
+        if (DatabaseDescriptor.getRepairedDataTrackingForRangeReadsEnabled()
+            && replicaPlan.contacts().filter(Replica::isFull).size() > 1)
+        {
+            command.trackRepairedStatus();
+            rangeCommand.trackRepairedStatus();
+        }
+
+        ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(replicaPlan);
+        ReadRepair<EndpointsForRange, ReplicaPlan.ForRangeRead> readRepair =
+        ReadRepair.create(command, sharedReplicaPlan, queryStartNanoTime);
+        DataResolver<EndpointsForRange, ReplicaPlan.ForRangeRead> resolver =
+        new DataResolver<>(rangeCommand, sharedReplicaPlan, readRepair, queryStartNanoTime);
+        ReadCallback<EndpointsForRange, ReplicaPlan.ForRangeRead> handler =
+        new ReadCallback<>(resolver, rangeCommand, sharedReplicaPlan, queryStartNanoTime);
+
+        if (replicaPlan.contacts().size() == 1 && replicaPlan.contacts().get(0).isSelf())
+        {
+            Stage.READ.execute(new StorageProxy.LocalReadRunnable(rangeCommand, handler)); // sole contact is this node: run on the READ stage, no message sent
+        }
+        else
+        {
+            for (Replica replica : replicaPlan.contacts())
+            {
+                Tracing.trace("Enqueuing request to {}", replica);
+                ReadCommand command = replica.isFull() ? rangeCommand : rangeCommand.copyAsTransientQuery(replica); // NOTE: this local hides the inherited 'command'
+                Message<ReadCommand> message = command.createMessage(command.isTrackingRepairedStatus() && replica.isFull());
+                MessagingService.instance().sendWithCallback(message, replica.endpoint(), handler);
+            }
+        }
+
+        return new SingleRangeResponse(resolver, handler, readRepair);
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
index a345c6091e01..a96dff61e623 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommandIterator.java
@@ -18,16 +18,12 @@
 
 package org.apache.cassandra.service.reads.range;
 
-import java.util.ArrayList;
-import java.util.List;
 import java.util.concurrent.TimeUnit;
 
 import com.google.common.annotations.VisibleForTesting;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.concurrent.Stage;
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.PartitionRangeReadCommand;
 import org.apache.cassandra.db.ReadCommand;
@@ -37,46 +33,60 @@
 import org.apache.cassandra.exceptions.ReadFailureException;
 import org.apache.cassandra.exceptions.ReadTimeoutException;
 import org.apache.cassandra.exceptions.UnavailableException;
-import org.apache.cassandra.locator.EndpointsForRange;
-import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.index.Index;
 import org.apache.cassandra.locator.ReplicaPlan;
 import org.apache.cassandra.metrics.ClientRangeRequestMetrics;
-import org.apache.cassandra.metrics.ClientRequestMetrics;
-import org.apache.cassandra.net.Message;
-import org.apache.cassandra.net.MessagingService;
-import org.apache.cassandra.service.StorageProxy;
-import org.apache.cassandra.service.reads.DataResolver;
-import org.apache.cassandra.service.reads.ReadCallback;
-import org.apache.cassandra.service.reads.repair.ReadRepair;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.CloseableIterator;
 
 @VisibleForTesting
-public class RangeCommandIterator extends AbstractIterator<RowIterator> implements PartitionIterator
+public abstract class RangeCommandIterator extends AbstractIterator<RowIterator> implements PartitionIterator
 {
     private static final Logger logger = LoggerFactory.getLogger(RangeCommandIterator.class);
 
     @VisibleForTesting
     public static final ClientRangeRequestMetrics rangeMetrics = new ClientRangeRequestMetrics("RangeSlice");
 
-    private final CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans;
+    protected final CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans;
     private final int totalRangeCount;
-    private final PartitionRangeReadCommand command;
-    private final boolean enforceStrictLiveness;
+    protected final PartitionRangeReadCommand command;
+    protected final boolean enforceStrictLiveness;
 
     private final long startTime;
-    private final long queryStartNanoTime;
-    private DataLimits.Counter counter;
+    protected final long queryStartNanoTime;
+    protected DataLimits.Counter counter;
     private PartitionIterator sentQueryIterator;
 
     private final int maxConcurrencyFactor;
-    private int concurrencyFactor;
+    protected int concurrencyFactor;
     // The two following "metric" are maintained to improve the concurrencyFactor
     // when it was not good enough initially.
     private int liveReturned;
-    private int rangesQueried;
-    private int batchesRequested = 0;
+    protected int rangesQueried;
+    protected int batchesRequested = 0;
+
+    @SuppressWarnings("resource")
+    public static RangeCommandIterator create(CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans,
+                                              PartitionRangeReadCommand command,
+                                              int concurrencyFactor,
+                                              int maxConcurrencyFactor,
+                                              int totalRangeCount,
+                                              long queryStartNanoTime)
+    {
+        return supportsEndpointGrouping(command) ? new EndpointGroupingRangeCommandIterator(replicaPlans,
+                                                                                            command,
+                                                                                            concurrencyFactor,
+                                                                                            maxConcurrencyFactor,
+                                                                                            totalRangeCount,
+                                                                                            queryStartNanoTime)
+                                                 : new NonGroupingRangeCommandIterator(replicaPlans,
+                                                                                       command,
+                                                                                       concurrencyFactor,
+                                                                                       maxConcurrencyFactor,
+                                                                                       totalRangeCount,
+                                                                                       queryStartNanoTime);
+    }
 
     RangeCommandIterator(CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans,
                          PartitionRangeReadCommand command,
@@ -146,6 +156,17 @@ private void updateConcurrencyFactor()
         concurrencyFactor = computeConcurrencyFactor(totalRangeCount, rangesQueried, maxConcurrencyFactor, command.limits().count(), liveReturned);
     }
 
+    private static boolean supportsEndpointGrouping(ReadCommand command)
+    {
+        // With endpoint grouping, ranges executed on each endpoint are different, digest is unlikely to match.
+        if (command.isDigestQuery())
+            return false;
+
+        // Endpoint grouping is currently only supported by SAI
+        Index.QueryPlan queryPlan = command.indexQueryPlan();
+        return queryPlan != null && queryPlan.supportsMultiRangeReadCommand();
+    }
+
     @VisibleForTesting
     static int computeConcurrencyFactor(int totalRangeCount, int rangesQueried, int maxConcurrencyFactor, int limit, int liveReturned)
     {
@@ -167,88 +188,7 @@ static int computeConcurrencyFactor(int totalRangeCount, int rangesQueried, int
         return concurrencyFactor;
     }
 
-    /**
-     * Queries the provided sub-range.
-     *
-     * @param replicaPlan the subRange to query.
-     * @param isFirst in the case where multiple queries are sent in parallel, whether that's the first query on
-     * that batch or not. The reason it matters is that whe paging queries, the command (more specifically the
-     * {@code DataLimits}) may have "state" information and that state may only be valid for the first query (in
-     * that it's the query that "continues" whatever we're previously queried).
-     */
-    private SingleRangeResponse query(ReplicaPlan.ForRangeRead replicaPlan, boolean isFirst)
-    {
-        PartitionRangeReadCommand rangeCommand = command.forSubRange(replicaPlan.range(), isFirst);
-        // If enabled, request repaired data tracking info from full replicas but
-        // only if there are multiple full replicas to compare results from
-        if (DatabaseDescriptor.getRepairedDataTrackingForRangeReadsEnabled()
-            && replicaPlan.contacts().filter(Replica::isFull).size() > 1)
-        {
-            command.trackRepairedStatus();
-            rangeCommand.trackRepairedStatus();
-        }
-
-        ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(replicaPlan);
-        ReadRepair<EndpointsForRange, ReplicaPlan.ForRangeRead> readRepair =
-                ReadRepair.create(command, sharedReplicaPlan, queryStartNanoTime);
-        DataResolver<EndpointsForRange, ReplicaPlan.ForRangeRead> resolver =
-                new DataResolver<>(rangeCommand, sharedReplicaPlan, readRepair, queryStartNanoTime);
-        ReadCallback<EndpointsForRange, ReplicaPlan.ForRangeRead> handler =
-                new ReadCallback<>(resolver, rangeCommand, sharedReplicaPlan, queryStartNanoTime);
-
-        if (replicaPlan.contacts().size() == 1 && replicaPlan.contacts().get(0).isSelf())
-        {
-            Stage.READ.execute(new StorageProxy.LocalReadRunnable(rangeCommand, handler));
-        }
-        else
-        {
-            for (Replica replica : replicaPlan.contacts())
-            {
-                Tracing.trace("Enqueuing request to {}", replica);
-                ReadCommand command = replica.isFull() ? rangeCommand : rangeCommand.copyAsTransientQuery(replica);
-                Message<ReadCommand> message = command.createMessage(command.isTrackingRepairedStatus() && replica.isFull());
-                MessagingService.instance().sendWithCallback(message, replica.endpoint(), handler);
-            }
-        }
-
-        return new SingleRangeResponse(resolver, handler, readRepair);
-    }
-
-    private PartitionIterator sendNextRequests()
-    {
-        List<PartitionIterator> concurrentQueries = new ArrayList<>(concurrencyFactor);
-        List<ReadRepair<?, ?>> readRepairs = new ArrayList<>(concurrencyFactor);
-
-        try
-        {
-            for (int i = 0; i < concurrencyFactor && replicaPlans.hasNext(); )
-            {
-                ReplicaPlan.ForRangeRead replicaPlan = replicaPlans.next();
-
-                @SuppressWarnings("resource") // response will be closed by concatAndBlockOnRepair, or in the catch block below
-                SingleRangeResponse response = query(replicaPlan, i == 0);
-                concurrentQueries.add(response);
-                readRepairs.add(response.getReadRepair());
-                // due to RangeMerger, coordinator may fetch more ranges than required by concurrency factor.
-                rangesQueried += replicaPlan.vnodeCount();
-                i += replicaPlan.vnodeCount();
-            }
-            batchesRequested++;
-        }
-        catch (Throwable t)
-        {
-            for (PartitionIterator response : concurrentQueries)
-                response.close();
-            throw t;
-        }
-
-        Tracing.trace("Submitted {} concurrent range requests", concurrentQueries.size());
-        // We want to count the results for the sake of updating the concurrency factor (see updateConcurrencyFactor)
-        // but we don't want to enforce any particular limit at this point (this could break code than rely on
-        // postReconciliationProcessing), hence the DataLimits.NONE.
-        counter = DataLimits.NONE.newCounter(command.nowInSec(), true, command.selectsFullPartition(), enforceStrictLiveness);
-        return counter.applyTo(StorageProxy.concatAndBlockOnRepair(concurrentQueries, readRepairs));
-    }
+    protected abstract PartitionIterator sendNextRequests();
 
     @Override
     public void close()
diff --git a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
index 433af022e198..b74bb259d6e0 100644
--- a/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
+++ b/src/java/org/apache/cassandra/service/reads/range/RangeCommands.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.ReadCommand;
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.tracing.Tracing;
@@ -98,12 +99,13 @@ static RangeCommandIterator rangeCommandIterator(PartitionRangeReadCommand comma
         }
 
         ReplicaPlanMerger mergedReplicaPlans = new ReplicaPlanMerger(replicaPlans, keyspace, consistencyLevel);
-        return new RangeCommandIterator(mergedReplicaPlans,
-                                        command,
-                                        concurrencyFactor,
-                                        maxConcurrencyFactor,
-                                        replicaPlans.size(),
-                                        queryStartNanoTime);
+
+        return RangeCommandIterator.create(mergedReplicaPlans,
+                                           command,
+                                           concurrencyFactor,
+                                           maxConcurrencyFactor,
+                                           replicaPlans.size(),
+                                           queryStartNanoTime);
     }
 
     /**
diff --git a/test/unit/org/apache/cassandra/db/MultiRangeReadCommandTest.java b/test/unit/org/apache/cassandra/db/MultiRangeReadCommandTest.java
new file mode 100644
index 000000000000..6812fce43f44
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/MultiRangeReadCommandTest.java
@@ -0,0 +1,413 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.Iterables;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataInputPlus;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.reads.ReadCallback;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class MultiRangeReadCommandTest
+{
+    public static final String KEYSPACE1 = "MultiRangeReadCommandTest";
+    public static final String CF_STANDARD1 = "Standard1";
+
+    private static IPartitioner partitioner;
+    private static ColumnFamilyStore cfs;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        StorageService.instance.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        partitioner = DatabaseDescriptor.getPartitioner();
+        SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+        cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+    }
+
+    @Before
+    public void setup()
+    {
+        cfs.clearUnsafe();
+    }
+
+    @Test
+    public void verifyNotFetchingRemainingRangesOverLimit() throws InterruptedException
+    {
+        int rowCount = 1000;
+        for (int i = 0; i < rowCount; ++i)
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata(), 10, String.valueOf(i));
+            builder.clustering("c");
+            builder.add("val", String.valueOf(i));
+            builder.build().applyUnsafe();
+        }
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        int tokens = 100;
+        DataLimits limits = DataLimits.cqlLimits(100);
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), 10).withUpdatedLimit(limits);
+        MultiRangeReadCommand command = MultiRangeReadCommand.create(partitionRangeCommand, ranges(tokens), true);
+
+        assert cfs.metric != null;
+        SSTableReader sstable = Iterables.getOnlyElement(cfs.getLiveSSTables());
+        long beforeMetricsRecorded = cfs.metric.liveScannedHistogram.cf.getCount();
+        long beforeSSTableRead = sstable.getReadMeter().count();
+        assertEquals(limits.count(), rows(command.executeLocally(command.executionController())).size());
+
+        long metricsRecorded = cfs.metric.liveScannedHistogram.cf.getCount() - beforeMetricsRecorded;
+        assertEquals(1, metricsRecorded);
+
+        long subrangeScanned =  sstable.getReadMeter().count() - beforeSSTableRead;
+        String errorMessage = String.format("Should only query enough ranges to satisfy limit, but queried %d ranges", subrangeScanned);
+        assertTrue( errorMessage, subrangeScanned > 1 && subrangeScanned < tokens);
+    }
+
+    @Test
+    public void testMultiRangeReadResponse()
+    {
+        int rowCount = 1000;
+        for (int i = 0; i < rowCount; ++i)
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata(), 10, String.valueOf(i));
+            builder.clustering("c");
+            builder.add("val", String.valueOf(i));
+            builder.build().applyUnsafe();
+        }
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
+        List<AbstractBounds<PartitionPosition>> ranges = ranges(100);
+        MultiRangeReadCommand command = MultiRangeReadCommand.create(partitionRangeCommand, ranges, true);
+
+        UnfilteredPartitionIterator data = command.executeLocally(command.executionController());
+        MultiRangeReadResponse response = (MultiRangeReadResponse) command.createResponse(data);
+
+        // verify subrange response from multi-range read responses contains all data in the subrange
+        for (AbstractBounds<PartitionPosition> range : ranges)
+        {
+            PartitionRangeReadCommand subrange = partitionRangeCommand.forSubRange(range, false);
+            ReadResponse subrangeResponse = response.subrangeResponse(command, range);
+
+            UnfilteredPartitionIterator actual = subrangeResponse.makeIterator(subrange);
+            UnfilteredPartitionIterator expected = subrange.executeLocally(subrange.executionController());
+            assertData(expected, actual);
+        }
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testEmptyRangesAssertionInCreation()
+    {
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
+        List<AbstractBounds<PartitionPosition>> ranges = Collections.emptyList(); // no sub-ranges: create() is expected to trip an assertion
+        MultiRangeReadCommand.create(partitionRangeCommand, ranges, true);
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testEmptyHandlersAssertionInCreation()
+    {
+        List<ReadCallback<?, ?>> subrangeHandlers = Collections.emptyList(); // no handlers: create() is expected to trip an assertion
+        MultiRangeReadCommand.create(subrangeHandlers);
+    }
+
+    @Test
+    public void testIsLimitedToOnePartition()
+    {
+        // Multiple ranges isn't limited
+        assertFalse(command(ranges(2), true).isLimitedToOnePartition());
+
+        // Single row bounds with different keys isn't limited
+        List<AbstractBounds<PartitionPosition>> ranges = new ArrayList<>();
+        ranges.add(new Bounds<>(partitioner.decorateKey(UTF8Type.instance.decompose("B")),
+                                partitioner.decorateKey(UTF8Type.instance.decompose("A"))));
+        assertFalse(command(ranges, true).isLimitedToOnePartition());
+
+        // Single row bounds whose start and end are the same key is limited
+        ranges = new ArrayList<>();
+        ranges.add(new Bounds<>(partitioner.decorateKey(UTF8Type.instance.decompose("A")),
+                                partitioner.decorateKey(UTF8Type.instance.decompose("A"))));
+        assertTrue(command(ranges, true).isLimitedToOnePartition());
+    }
+
+    @Test
+    public void testUpdatingLimitsIsReflectedInCommand()
+    {
+        ReadCommand command = command(ranges(2), true);
+        assertTrue(command.limits() == DataLimits.NONE);
+        command = command.withUpdatedLimit(DataLimits.cqlLimits(10));
+        assertEquals(10, command.limits().count());
+    }
+
+    @Test
+    public void testIsRangeRequestReturnsFalse()
+    {
+        assertFalse(command(ranges(2), true).isRangeRequest());
+    }
+
+    @Test
+    public void testTimeoutReturnsRangeTimeout()
+    {
+        assertEquals(DatabaseDescriptor.getRangeRpcTimeout(TimeUnit.SECONDS), command(ranges(2), true).getTimeout(TimeUnit.SECONDS));
+    }
+
+    @Test
+    public void testSelectsKey()
+    {
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
+        List<AbstractBounds<PartitionPosition>> ranges = new ArrayList<>();
+        ranges.add(new Bounds<>(partitioner.decorateKey(UTF8Type.instance.decompose("A")),
+                                partitioner.decorateKey(UTF8Type.instance.decompose("A"))));
+        MultiRangeReadCommand command = MultiRangeReadCommand.create(partitionRangeCommand, ranges, true);
+
+        assertTrue(command.selectsKey(partitioner.decorateKey(UTF8Type.instance.decompose("A"))));
+        assertFalse(command.selectsKey(partitioner.decorateKey(UTF8Type.instance.decompose("B"))));
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testCannotCreateResponseWithDigestQuery()
+    {
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
+        partitionRangeCommand = partitionRangeCommand.copyAsDigestQuery();
+        MultiRangeReadCommand command = MultiRangeReadCommand.create(partitionRangeCommand, ranges(2), true);
+        UnfilteredPartitionIterator data = command.executeLocally(command.executionController());
+        command.createResponse(data);
+    }
+
+    @Test
+    public void testResponseIsNotDigestResponse()
+    {
+        MultiRangeReadCommand command = command(ranges(2), true);
+        MultiRangeReadResponse response = (MultiRangeReadResponse)command.createResponse(command.executeLocally(command.executionController()));
+        assertFalse(response.isDigestResponse());
+    }
+
+    @Test
+    public void testResponseIsRepairedDigestConclusiveForLocalResponse()
+    {
+        MultiRangeReadCommand command = command(ranges(2), true);
+        MultiRangeReadResponse response = (MultiRangeReadResponse)command.createResponse(command.executeLocally(command.executionController()));
+        assertTrue(response.isRepairedDigestConclusive());
+    }
+
+    @Test
+    public void testRepairedDataDigestIsEmptyForLocalResponse()
+    {
+        MultiRangeReadCommand command = command(ranges(2), true);
+        MultiRangeReadResponse response = (MultiRangeReadResponse)command.createResponse(command.executeLocally(command.executionController()));
+        assertFalse(response.repairedDataDigest().hasRemaining());
+    }
+
+    @Test
+    public void testMaybeIncludeRepairedDigestForLocalResponse()
+    {
+        MultiRangeReadCommand command = command(ranges(2), true);
+        MultiRangeReadResponse response = (MultiRangeReadResponse)command.createResponse(command.executeLocally(command.executionController()));
+        assertTrue(response.mayIncludeRepairedDigest());
+    }
+
+    @Test(expected = AssertionError.class)
+    public void testCreateWithEmptyRanges()
+    {
+        command(Collections.<AbstractBounds<PartitionPosition>>emptyList(), true); // no sub-ranges: building the command is expected to trip an assertion
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testMultiRangeReadResponseDigest()
+    {
+        MultiRangeReadCommand command = command(ranges(10), true);
+        UnfilteredPartitionIterator data = command.executeLocally(command.executionController());
+        MultiRangeReadResponse response = (MultiRangeReadResponse) command.createResponse(data);
+        response.digest(null);
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testMultiRangeReadResponseToDebugString()
+    {
+        MultiRangeReadCommand command = command(ranges(10), true);
+        UnfilteredPartitionIterator data = command.executeLocally(command.executionController());
+        MultiRangeReadResponse response = (MultiRangeReadResponse) command.createResponse(data);
+        response.toDebugString(null, null);
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testCreateDigestCommand()
+    {
+        MultiRangeReadCommand command = command(ranges(10), true);
+        command.copyAsDigestQuery();
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testGetPager()
+    {
+        MultiRangeReadCommand command = command(ranges(10), true);
+        command.getPager(null, ProtocolVersion.CURRENT);
+    }
+
+    @Test(expected = UnsupportedOperationException.class)
+    public void testExecute()
+    {
+        MultiRangeReadCommand command = command(ranges(10), true);
+        command.execute(null, null, -1L);
+    }
+
+    @Test
+    public void testSerializationRoundTrip() throws Exception
+    {
+        for (int tokens : Arrays.asList(2, 3, 5, 10, 63, 128))
+        {
+            List<AbstractBounds<PartitionPosition>> ranges = ranges(tokens);
+
+            for (int i = 0; i < ranges.size() - 1; i++)
+            {
+                for (int j = i + 1; j < ranges.size(); j++)
+                {
+                    testSerializationRoundtrip(ranges.subList(i, j), true);
+                    testSerializationRoundtrip(ranges.subList(i, j), false);
+                }
+            }
+        }
+    }
+
+    private static MultiRangeReadCommand command(List<AbstractBounds<PartitionPosition>> subRanges, boolean isRangeContinuation)
+    {
+        PartitionRangeReadCommand partitionRangeCommand = PartitionRangeReadCommand.allDataRead(cfs.metadata(), FBUtilities.nowInSeconds());
+        return MultiRangeReadCommand.create(partitionRangeCommand, subRanges, isRangeContinuation);
+    }
+
+    private static void testSerializationRoundtrip(List<AbstractBounds<PartitionPosition>> subRanges, boolean isRangeContinuation) throws Exception
+    {
+        MultiRangeReadCommand command = command(subRanges, isRangeContinuation);
+        testSerializationRoundtrip(command, command);
+    }
+
+    private static void testSerializationRoundtrip(MultiRangeReadCommand command, MultiRangeReadCommand expected) throws Exception
+    {
+        DataOutputBuffer output = new DataOutputBuffer();
+        ReadCommand.serializer.serialize(command, output, MessagingService.current_version);
+        assertEquals(ReadCommand.serializer.serializedSize(command, MessagingService.current_version), output.position()); // reported size must match the bytes actually written
+
+        DataInputPlus input = new DataInputBuffer(output.buffer(), false);
+        MultiRangeReadCommand deserialized = (MultiRangeReadCommand)ReadCommand.serializer.deserialize(input, MessagingService.current_version);
+
+        assertEquals(expected.metadata().id, deserialized.metadata().id);
+        assertEquals(expected.nowInSec(), deserialized.nowInSec());
+        assertEquals(expected.limits(), deserialized.limits());
+        assertEquals(expected.indexQueryPlan == null ? null : expected.indexQueryPlan.getFirst().getIndexMetadata(),
+                     deserialized.indexQueryPlan == null ? null : deserialized.indexQueryPlan.getFirst().getIndexMetadata());
+        assertEquals(expected.digestVersion(), deserialized.digestVersion());
+        assertEquals(expected.ranges().size(), deserialized.ranges().size());
+        Iterator<DataRange> expectedRangeIterator = expected.ranges().iterator();
+        Iterator<DataRange> deserializedRangeIterator = deserialized.ranges().iterator(); // walk the deserialized ranges, not the expected ones a second time
+        while (expectedRangeIterator.hasNext()) // sizes were asserted equal above, so both iterators advance in lockstep
+        {
+            DataRange expectedRange = expectedRangeIterator.next();
+            DataRange deserializedRange = deserializedRangeIterator.next();
+            assertEquals(expectedRange.keyRange, deserializedRange.keyRange);
+            assertEquals(expectedRange.clusteringIndexFilter, deserializedRange.clusteringIndexFilter); // NOTE(review): assumes value-based equals() on the filter -- confirm
+        }
+    }
+
+    private List<AbstractBounds<PartitionPosition>> ranges(int numTokens)
+    {
+        assert numTokens >= 2;
+
+        List<Token> tokens = new ArrayList<>(numTokens);
+        tokens.add(partitioner.getMinimumToken());
+        tokens.add(partitioner.getMaximumToken());
+
+        while (tokens.size() < numTokens)
+        {
+            Token next = partitioner.getRandomToken();
+            if (!tokens.contains(next))
+                tokens.add(next);
+        }
+        Collections.sort(tokens);
+
+        List<AbstractBounds<PartitionPosition>> ranges = new ArrayList<>();
+        for (int i = 0; i < tokens.size() - 1; i++)
+        {
+            Token.KeyBound left = tokens.get(i).maxKeyBound(); // exclusive
+            Token.KeyBound right = tokens.get(i + 1).maxKeyBound(); // inclusive
+            ranges.add(new Range<>(left, right));
+        }
+
+        return ranges;
+    }
+
+    private void assertData(UnfilteredPartitionIterator expectedResult, UnfilteredPartitionIterator actualResult)
+    {
+        List<Unfiltered> expected = rows(expectedResult);
+        List<Unfiltered> actual = rows(actualResult);
+        assertEquals(expected, actual);
+    }
+
+    private List<Unfiltered> rows(UnfilteredPartitionIterator iterator)
+    {
+        List<Unfiltered> unfiltered = new ArrayList<>();
+        try (UnfilteredPartitionIterator partitions = iterator) // closes the partition iterator even if draining it throws
+        {
+            while (partitions.hasNext())
+            {
+                try (UnfilteredRowIterator rowIterator = partitions.next())
+                {
+                    while (rowIterator.hasNext())
+                        unfiltered.add(rowIterator.next());
+                }
+            }
+        }
+        return unfiltered;
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/StubIndex.java b/test/unit/org/apache/cassandra/index/StubIndex.java
index 5358a69feb46..4accb056eccd 100644
--- a/test/unit/org/apache/cassandra/index/StubIndex.java
+++ b/test/unit/org/apache/cassandra/index/StubIndex.java
@@ -93,6 +93,10 @@ public AbstractType<?> customExpressionValueType()
         return UTF8Type.instance;
     }
 
+    public RowFilter postIndexQueryFilter(RowFilter filter)
+    {
+        return filter;
+    }
     public RowFilter getPostIndexQueryFilter(RowFilter filter)
     {
         return filter;
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java b/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
index 8408e95f9f77..35d585243e48 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/AbstractQueryTester.java
@@ -23,7 +23,6 @@
 
 import com.google.common.collect.ImmutableList;
 import org.junit.Before;
-import org.junit.BeforeClass;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
index 8ac6acb6484b..a387b108b85b 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/DataModel.java
@@ -133,6 +133,10 @@ public interface DataModel
 
     DataModel withTableOptions(String tableOptions) throws Throwable;
 
+    String indexedTable();
+
+    String nonIndexedTable();
+
     List<Pair<String, String>> keyColumns();
 
     void createTables(Executor tester) throws Throwable;
@@ -187,6 +191,16 @@ public BaseDataModel(List<Pair<String, String>> columns, List<String> rows)
             this.keys = new SimplePrimaryKeyList(rows.size());
         }
 
+        public String indexedTable()
+        {
+            return indexedTable;
+        }
+
+        public String nonIndexedTable()
+        {
+            return nonIndexedTable;
+        }
+
         public DataModel withTableOptions(String tableOptions)
         {
             this.tableOptions = tableOptions;
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
index add8c2103f1b..730363d21529 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/IndexQuerySupport.java
@@ -23,7 +23,6 @@
 import java.util.Arrays;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
-import java.util.function.Supplier;
 import java.util.stream.Collectors;
 
 import com.google.common.base.MoreObjects;
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java b/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java
index 111b0794f748..3a0d9fc6ace2 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/SingleNodeExecutor.java
@@ -21,9 +21,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 
-import com.datastax.driver.core.ResultSet;
 import com.datastax.driver.core.SimpleStatement;
-import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.index.sai.SAITester;
 import org.apache.cassandra.inject.Injections;
 
diff --git a/test/unit/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIteratorTest.java
new file mode 100644
index 000000000000..ae52117c6a17
--- /dev/null
+++ b/test/unit/org/apache/cassandra/service/reads/range/EndpointGroupingRangeCommandIteratorTest.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.service.reads.range;
+
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import com.google.common.collect.Iterables;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.PartitionRangeReadCommand;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.locator.ReplicaPlan;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.CloseableIterator;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+
+public class EndpointGroupingRangeCommandIteratorTest extends CQLTester
+{
+    public static final String KEYSPACE1 = "EndpointGroupingRangeReadTest";
+    public static final String CF_STANDARD1 = "Standard1";
+
+    private static final int MAX_CONCURRENCY_FACTOR = 1;
+
+    private static Keyspace keyspace;
+    private static ColumnFamilyStore cfs;
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        System.setProperty("cassandra.max_concurrent_range_requests", String.valueOf(MAX_CONCURRENCY_FACTOR));
+
+        requireNetwork();
+
+        SchemaLoader.createKeyspace(KEYSPACE1, KeyspaceParams.simple(1), SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1));
+
+        keyspace = Keyspace.open(KEYSPACE1);
+        cfs = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        cfs.clearUnsafe();
+    }
+
+    @AfterClass
+    public static void cleanup()
+    {
+        System.clearProperty("cassandra.max_concurrent_range_requests");
+    }
+
+    @Test
+    public void testEndpointGrouping() throws Throwable
+    {
+        // n tokens divide token ring into n+1 ranges
+        int vnodeCount = setTokens(100, 200, 300, 400).size() + 1;
+
+        int rowCount = 1000;
+        for (int i = 0; i < rowCount; ++i)
+        {
+            RowUpdateBuilder builder = new RowUpdateBuilder(cfs.metadata(), 10, String.valueOf(i));
+            builder.clustering("c");
+            builder.add("val", String.valueOf(i));
+            builder.build().applyUnsafe();
+        }
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        PartitionRangeReadCommand command = (PartitionRangeReadCommand) Util.cmd(cfs).build();
+
+        for (int initialConcurrencyFactor : Arrays.asList(1, 2, 5, 10, 50, 100, 250, 300, 500))
+        {
+            verifyEndpointGrouping(command, vnodeCount, initialConcurrencyFactor);
+        }
+    }
+
+    private static List<Token> setTokens(int... values)
+    {
+        return new TokenUpdater().withKeys(values).update().getTokens();
+    }
+
+    private void verifyEndpointGrouping(PartitionRangeReadCommand command, int vnodeCount, int concurrencyFactor) throws Exception
+    {
+        EndpointGroupingCoordinator coordinator = endpointGroupingCoordinator(command, concurrencyFactor);
+
+        // verify that the queried vnode ranges respect the concurrency factor.
+        assertEquals(vnodeCount, coordinator.vnodeRanges());
+
+        // verify that the number of replica ranges is the same as the number of grouped ranges.
+        int rangesForQuery = 1;
+        EndpointGroupingCoordinator.EndpointQueryContext endpointContext = Iterables.getOnlyElement(coordinator.endpointRanges());
+        assertEquals(rangesForQuery, endpointContext.rangesCount());
+
+        // verify that endpoint grouping coordinator fetches given ranges according to concurrency factor
+        RangeCommandIterator tokenOrderedIterator = tokenOrderIterator(command, vnodeCount, concurrencyFactor);
+        int expected = Util.size(tokenOrderedIterator.sendNextRequests());
+        int actual = Util.size(coordinator.execute());
+        assertEquals(expected, actual);
+
+        // verify that endpoint grouping executor fetches all data
+        RangeCommandIterator endpointGroupingIterator = endpointGroupingIterator(command, vnodeCount, concurrencyFactor);
+        tokenOrderedIterator = tokenOrderIterator(command, vnodeCount, concurrencyFactor);
+        expected = Util.size(tokenOrderedIterator.sendNextRequests());
+        actual = Util.size(endpointGroupingIterator.sendNextRequests());
+        assertEquals(1, endpointGroupingIterator.batchesRequested());
+        assertEquals(expected, actual);
+    }
+
+    private static EndpointGroupingCoordinator endpointGroupingCoordinator(PartitionRangeReadCommand command, int concurrencyFactor)
+    {
+        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = replicaPlanIterator(command);
+        DataLimits.Counter counter = DataLimits.NONE.newCounter(command.nowInSec(), true, command.selectsFullPartition(), true);
+        return new EndpointGroupingCoordinator(command, counter, replicaPlans, concurrencyFactor, System.nanoTime());
+    }
+
+    private static EndpointGroupingRangeCommandIterator endpointGroupingIterator(PartitionRangeReadCommand command, int vnodeCount, int concurrencyFactor)
+    {
+        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = replicaPlanIterator(command);
+        return new EndpointGroupingRangeCommandIterator(replicaPlans, command, concurrencyFactor, concurrencyFactor, vnodeCount, System.nanoTime());
+    }
+
+    private static NonGroupingRangeCommandIterator tokenOrderIterator(PartitionRangeReadCommand command, int vnodeCount, int concurrencyFactor)
+    {
+        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = replicaPlanIterator(command);
+        return new NonGroupingRangeCommandIterator(replicaPlans, command, concurrencyFactor, 10000, vnodeCount, System.nanoTime());
+    }
+
+    private static CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlanIterator(PartitionRangeReadCommand command)
+    {
+        AbstractBounds<PartitionPosition> keyRange = command.dataRange().keyRange();
+        CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = new ReplicaPlanIterator(keyRange, null, keyspace, ConsistencyLevel.ONE);
+        return new ReplicaPlanMerger(replicaPlans, keyspace, ConsistencyLevel.ONE);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
index 967d05d5566e..9641619499d1 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandIteratorTest.java
@@ -104,27 +104,27 @@ public void testRangeQueried()
 
         // without range merger, there will be 2 batches requested: 1st batch with 1 range and 2nd batch with remaining ranges
         CloseableIterator<ReplicaPlan.ForRangeRead> replicaPlans = replicaPlanIterator(keyRange, keyspace, false);
-        RangeCommandIterator data = new RangeCommandIterator(replicaPlans, command, 1, 1000, vnodeCount, System.nanoTime());
+        RangeCommandIterator data = RangeCommandIterator.create(replicaPlans, command, 1, 1000, vnodeCount, System.nanoTime());
         verifyRangeCommandIterator(data, rows, 2, vnodeCount);
 
         // without range merger and initial cf=5, there will be 1 batches requested: 5 vnode ranges for 1st batch
         replicaPlans = replicaPlanIterator(keyRange, keyspace, false);
-        data = new RangeCommandIterator(replicaPlans, command, vnodeCount, 1000, vnodeCount, System.nanoTime());
+        data = RangeCommandIterator.create(replicaPlans, command, vnodeCount, 1000, vnodeCount, System.nanoTime());
         verifyRangeCommandIterator(data, rows, 1, vnodeCount);
 
         // without range merger and max cf=1, there will be 5 batches requested: 1 vnode range per batch
         replicaPlans = replicaPlanIterator(keyRange, keyspace, false);
-        data = new RangeCommandIterator(replicaPlans, command, 1, 1, vnodeCount, System.nanoTime());
+        data = RangeCommandIterator.create(replicaPlans, command, 1, 1, vnodeCount, System.nanoTime());
         verifyRangeCommandIterator(data, rows, vnodeCount, vnodeCount);
 
         // with range merger, there will be only 1 batch requested, as all ranges share the same replica - localhost
         replicaPlans = replicaPlanIterator(keyRange, keyspace, true);
-        data = new RangeCommandIterator(replicaPlans, command, 1, 1000, vnodeCount, System.nanoTime());
+        data = RangeCommandIterator.create(replicaPlans, command, 1, 1000, vnodeCount, System.nanoTime());
         verifyRangeCommandIterator(data, rows, 1, vnodeCount);
 
         // with range merger and max cf=1, there will be only 1 batch requested, as all ranges share the same replica - localhost
         replicaPlans = replicaPlanIterator(keyRange, keyspace, true);
-        data = new RangeCommandIterator(replicaPlans, command, 1, 1, vnodeCount, System.nanoTime());
+        data = RangeCommandIterator.create(replicaPlans, command, 1, 1, vnodeCount, System.nanoTime());
         verifyRangeCommandIterator(data, rows, 1, vnodeCount);
     }
 

From e0fa04822a8d5d9abee11565aad1146c49029504 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Fri, 23 Apr 2021 11:23:40 +0200
Subject: [PATCH 068/151] STAR-404: Add maxColumnValueLengthsMap support in
 StatsMetadata (ds-trunk) (#116)

* STAR-404: Fix sstable versioning

- maxColumnValueLengths is not present any longer in ca
- removed nb and nc versions (for OSS compatibility)
- update legacy sstables data files for testing

(cherry picked from commit 5150d26ba2b06c4ae297b2102c9f882e4589ebaf)
(cherry picked from commit 295e5d0356c4434ce76298ea7b53584f65b153f8)
---
 .../io/sstable/format/big/BigFormat.java      |   5 +-
 .../format/trieindex/TrieIndexFormat.java     |   5 +-
 .../sstable/metadata/MetadataCollector.java   |   6 ++-
 .../io/sstable/metadata/StatsMetadata.java    |  49 +++++++++++++-----
 .../ca-1-bti-CompressionInfo.db               | Bin 0 -> 207 bytes
 .../legacy_ca_clust/ca-1-bti-Data.db          | Bin 0 -> 8780 bytes
 .../legacy_ca_clust/ca-1-bti-Digest.crc32     |   1 +
 .../legacy_ca_clust/ca-1-bti-Filter.db        | Bin 0 -> 24 bytes
 .../legacy_ca_clust/ca-1-bti-Partitions.db    | Bin 0 -> 62 bytes
 .../legacy_ca_clust/ca-1-bti-Rows.db          | Bin 0 -> 563 bytes
 .../legacy_ca_clust/ca-1-bti-Statistics.db}   | Bin 7086 -> 7220 bytes
 .../legacy_ca_clust/ca-1-bti-TOC.txt}         |   6 +--
 .../ca-1-bti-CompressionInfo.db               | Bin 0 -> 199 bytes
 .../legacy_ca_clust_counter/ca-1-bti-Data.db  | Bin 0 -> 7877 bytes
 .../ca-1-bti-Digest.crc32                     |   1 +
 .../ca-1-bti-Filter.db                        | Bin 0 -> 24 bytes
 .../ca-1-bti-Partitions.db                    | Bin 0 -> 62 bytes
 .../legacy_ca_clust_counter/ca-1-bti-Rows.db  | Bin 0 -> 563 bytes
 .../ca-1-bti-Statistics.db}                   | Bin 7086 -> 7229 bytes
 .../legacy_ca_clust_counter/ca-1-bti-TOC.txt} |   6 +--
 .../ca-1-bti-CompressionInfo.db               | Bin 0 -> 47 bytes
 .../legacy_ca_simple/ca-1-bti-Data.db         | Bin 0 -> 88 bytes
 .../legacy_ca_simple/ca-1-bti-Digest.crc32    |   1 +
 .../legacy_ca_simple/ca-1-bti-Filter.db       | Bin 0 -> 24 bytes
 .../legacy_ca_simple/ca-1-bti-Partitions.db   | Bin 0 -> 59 bytes
 .../legacy_ca_simple/ca-1-bti-Rows.db}        |   0
 .../legacy_ca_simple/ca-1-bti-Statistics.db}  | Bin 4639 -> 4730 bytes
 .../legacy_ca_simple/ca-1-bti-TOC.txt}        |   6 +--
 .../ca-1-bti-CompressionInfo.db               | Bin 0 -> 47 bytes
 .../legacy_ca_simple_counter/ca-1-bti-Data.db | Bin 0 -> 143 bytes
 .../ca-1-bti-Digest.crc32                     |   1 +
 .../ca-1-bti-Filter.db                        | Bin 0 -> 24 bytes
 .../ca-1-bti-Partitions.db                    | Bin 0 -> 60 bytes
 .../ca-1-bti-Rows.db}                         |   0
 .../ca-1-bti-Statistics.db}                   | Bin 4648 -> 4739 bytes
 .../ca-1-bti-TOC.txt}                         |   6 +--
 .../na-1-big-CompressionInfo.db               | Bin 87 -> 207 bytes
 .../legacy_na_clust/na-1-big-Data.db          | Bin 5214 -> 8601 bytes
 .../legacy_na_clust/na-1-big-Digest.crc32     |   2 +-
 .../legacy_na_clust/na-1-big-Index.db         | Bin 157553 -> 157553 bytes
 .../legacy_na_clust/na-1-big-Statistics.db    | Bin 7096 -> 7160 bytes
 .../legacy_na_clust/na-1-big-TOC.txt          |  12 ++---
 .../na-1-bti-CompressionInfo.db               | Bin 83 -> 0 bytes
 .../legacy_na_clust/na-1-bti-Data.db          | Bin 5315 -> 0 bytes
 .../legacy_na_clust/na-1-bti-Digest.crc32     |   1 -
 .../legacy_na_clust/na-1-bti-Filter.db        | Bin 24 -> 0 bytes
 .../legacy_na_clust/na-1-bti-Partitions.db    | Bin 63 -> 0 bytes
 .../legacy_na_clust/na-1-bti-Rows.db          | Bin 738 -> 0 bytes
 .../na-1-bti-CompressionInfo.db               | Bin 83 -> 0 bytes
 .../legacy_na_clust_compact/na-1-bti-Data.db  | Bin 5398 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 63 -> 0 bytes
 .../legacy_na_clust_compact/na-1-bti-Rows.db  | Bin 738 -> 0 bytes
 .../na-1-big-CompressionInfo.db               | Bin 79 -> 199 bytes
 .../legacy_na_clust_counter/na-1-big-Data.db  | Bin 5759 -> 7551 bytes
 .../na-1-big-Digest.crc32                     |   2 +-
 .../legacy_na_clust_counter/na-1-big-Index.db | Bin 157553 -> 157553 bytes
 .../na-1-big-Statistics.db                    | Bin 7105 -> 7169 bytes
 .../legacy_na_clust_counter/na-1-big-TOC.txt  |  12 ++---
 .../na-1-bti-CompressionInfo.db               | Bin 75 -> 0 bytes
 .../legacy_na_clust_counter/na-1-bti-Data.db  | Bin 5950 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 63 -> 0 bytes
 .../legacy_na_clust_counter/na-1-bti-Rows.db  | Bin 738 -> 0 bytes
 .../na-1-bti-Statistics.db                    | Bin 7095 -> 0 bytes
 .../na-1-bti-CompressionInfo.db               | Bin 75 -> 0 bytes
 .../na-1-bti-Data.db                          | Bin 5947 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 63 -> 0 bytes
 .../na-1-bti-Rows.db                          | Bin 738 -> 0 bytes
 .../na-1-bti-Statistics.db                    | Bin 7095 -> 0 bytes
 .../na-1-big-CompressionInfo.db               | Bin 47 -> 47 bytes
 .../legacy_na_simple/na-1-big-Data.db         | Bin 88 -> 88 bytes
 .../legacy_na_simple/na-1-big-Digest.crc32    |   2 +-
 .../legacy_na_simple/na-1-big-Statistics.db   | Bin 4649 -> 4713 bytes
 .../legacy_na_simple/na-1-big-TOC.txt         |  12 ++---
 .../na-1-bti-CompressionInfo.db               | Bin 43 -> 0 bytes
 .../legacy_na_simple/na-1-bti-Data.db         | Bin 89 -> 0 bytes
 .../legacy_na_simple/na-1-bti-Digest.crc32    |   1 -
 .../legacy_na_simple/na-1-bti-Filter.db       | Bin 24 -> 0 bytes
 .../legacy_na_simple/na-1-bti-Partitions.db   | Bin 60 -> 0 bytes
 .../legacy_na_simple/na-1-bti-TOC.txt         |   8 ---
 .../na-1-bti-CompressionInfo.db               | Bin 43 -> 0 bytes
 .../legacy_na_simple_compact/na-1-bti-Data.db | Bin 91 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 60 -> 0 bytes
 .../na-1-bti-Statistics.db                    | Bin 4680 -> 0 bytes
 .../legacy_na_simple_compact/na-1-bti-TOC.txt |   8 ---
 .../na-1-big-CompressionInfo.db               | Bin 47 -> 47 bytes
 .../legacy_na_simple_counter/na-1-big-Data.db | Bin 138 -> 148 bytes
 .../na-1-big-Digest.crc32                     |   2 +-
 .../na-1-big-Statistics.db                    | Bin 4658 -> 4722 bytes
 .../legacy_na_simple_counter/na-1-big-TOC.txt |  12 ++---
 .../na-1-bti-CompressionInfo.db               | Bin 43 -> 0 bytes
 .../legacy_na_simple_counter/na-1-bti-Data.db | Bin 145 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 61 -> 0 bytes
 .../legacy_na_simple_counter/na-1-bti-Rows.db |   0
 .../legacy_na_simple_counter/na-1-bti-TOC.txt |   8 ---
 .../na-1-bti-CompressionInfo.db               | Bin 43 -> 0 bytes
 .../na-1-bti-Data.db                          | Bin 140 -> 0 bytes
 .../na-1-bti-Digest.crc32                     |   1 -
 .../na-1-bti-Filter.db                        | Bin 24 -> 0 bytes
 .../na-1-bti-Partitions.db                    | Bin 61 -> 0 bytes
 .../na-1-bti-Rows.db                          |   0
 .../na-1-bti-Statistics.db                    | Bin 4689 -> 0 bytes
 .../na-1-bti-TOC.txt                          |   8 ---
 .../io/sstable/LegacySSTableTest.java         |  23 ++++----
 .../metadata/MetadataSerializerTest.java      |   5 +-
 114 files changed, 103 insertions(+), 114 deletions(-)
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Rows.db
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust/na-1-bti-Statistics.db => ca/legacy_tables/legacy_ca_clust/ca-1-bti-Statistics.db} (79%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust/na-1-bti-TOC.txt => ca/legacy_tables/legacy_ca_clust/ca-1-bti-TOC.txt} (100%)
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Partitions.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Rows.db
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust_compact/na-1-bti-Statistics.db => ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Statistics.db} (79%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust_compact/na-1-bti-TOC.txt => ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-TOC.txt} (100%)
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Partitions.db
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_simple/na-1-bti-Rows.db => ca/legacy_tables/legacy_ca_simple/ca-1-bti-Rows.db} (100%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_simple/na-1-bti-Statistics.db => ca/legacy_tables/legacy_ca_simple/ca-1-bti-Statistics.db} (83%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust_counter/na-1-bti-TOC.txt => ca/legacy_tables/legacy_ca_simple/ca-1-bti-TOC.txt} (100%)
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-CompressionInfo.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Data.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Digest.crc32
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Filter.db
 create mode 100644 test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Partitions.db
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_simple_compact/na-1-bti-Rows.db => ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Rows.db} (100%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_simple_counter/na-1-bti-Statistics.db => ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Statistics.db} (83%)
 rename test/data/legacy-sstables/{na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-TOC.txt => ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-TOC.txt} (100%)
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Statistics.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Statistics.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-TOC.txt
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Statistics.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-TOC.txt
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-TOC.txt
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-CompressionInfo.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Data.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Digest.crc32
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Filter.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Partitions.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Rows.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Statistics.db
 delete mode 100644 test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-TOC.txt

diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 1a6a236c3746..20288fb189d0 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -228,8 +228,8 @@ static class BigVersion extends Version
             hasCommitLogIntervals = version.compareTo("mc") >= 0;
             hasAccurateMinMax = version.compareTo("md") >= 0;
             hasOriginatingHostId = version.matches("(m[e-z])|(n[b-z])");
-            hasImprovedMinMax = version.compareTo("nb") >= 0;
-            hasPartitionLevelDeletionPresenceMarker = version.compareTo("nb") >= 0;
+            hasImprovedMinMax = false;
+            hasPartitionLevelDeletionPresenceMarker = false;
             hasMaxCompressedLength = version.compareTo("na") >= 0;
             hasPendingRepair = version.compareTo("na") >= 0;
             hasIsTransient = version.compareTo("na") >= 0;
@@ -338,7 +338,6 @@ public boolean hasIncrementalNodeSyncMetadata()
             return false;
         }
 
-        // TODO TBD
         @Override
         public boolean hasMaxColumnValueLengths()
         {
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
index 080ced3113dc..433d59e57ed9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
@@ -287,7 +287,7 @@ static class TrieIndexVersion extends Version
         //               improved min/max clustering representation
         //               presence marker for partition level deletions
         // bb (DSE 6.8.5): added hostId of the node from which the sstable originated (DB-4629)
-        // ca (DSE-DB aka Stargazer based on OSS 4.0): all bb fields  + all OSS fields
+        // ca (DSE-DB aka Stargazer based on OSS 4.0): bb fields without maxColumnValueLengths + all OSS fields
         // NOTE: when adding a new version, please add that to LegacySSTableTest, too.
 
         private final boolean isLatestVersion;
@@ -311,7 +311,7 @@ static class TrieIndexVersion extends Version
             hasOldBfFormat = version.compareTo("b") < 0;
             hasAccurateLegacyMinMax = version.compareTo("ac") >= 0;
             hasOriginatingHostId = version.matches("(a[d-z])|(b[b-z])") || version.compareTo("ca") >= 0;
-            hasMaxColumnValueLengths = version.matches("b[a-z]"); // TODO TBD
+            hasMaxColumnValueLengths = version.matches("b[a-z]"); // DSE only field
             correspondingMessagingVersion = version.compareTo("ca") >= 0 ? MessagingService.VERSION_40 : MessagingService.VERSION_3014;
         }
 
@@ -387,7 +387,6 @@ public boolean hasImprovedMinMax()
             return version.compareTo("ba") >= 0;
         }
 
-        // TODO TBD
         @Override
         public boolean hasMaxColumnValueLengths()
         {
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index 159b9061bf1b..2ece9a6ceaa1 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -88,7 +88,8 @@ public static StatsMetadata defaultStatsMetadata()
                                  -1,
                                  null,
                                  null,
-                                 false);
+                                 false,
+                                 Collections.emptyMap());
     }
 
     protected EstimatedHistogram estimatedPartitionSize = defaultPartitionSizeHistogram();
@@ -351,7 +352,8 @@ public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner,
                                                              totalRows,
                                                              originatingHostId,
                                                              pendingRepair,
-                                                             isTransient));
+                                                             isTransient,
+                                                             Collections.emptyMap()));
         components.put(MetadataType.COMPACTION, new CompactionMetadata(cardinality));
         components.put(MetadataType.HEADER, header.toComponent());
         return components;
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
index c87695f011c8..e6bd14cb25ea 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
@@ -20,9 +20,12 @@
 import java.io.File;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.UUID;
 
+import com.google.common.collect.ImmutableMap;
 import org.apache.commons.lang3.builder.EqualsBuilder;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 import org.slf4j.Logger;
@@ -94,6 +97,8 @@ public class StatsMetadata extends MetadataComponent
     // Used to serialize min/max clustering. Can be null if the metadata was deserialized from a legacy version
     private final List<AbstractType<?>> clusteringTypes;
 
+    private final ImmutableMap<ByteBuffer, Integer> maxColumnValueLengths;
+
     public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
                          EstimatedHistogram estimatedCellPerPartitionCount,
                          IntervalSet<CommitLogPosition> commitLogIntervals,
@@ -115,7 +120,8 @@ public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
                          long totalRows,
                          UUID originatingHostId,
                          UUID pendingRepair,
-                         boolean isTransient)
+                         boolean isTransient,
+                         Map<ByteBuffer, Integer> maxColumnValueLengths)
     {
         this.estimatedPartitionSize = estimatedPartitionSize;
         this.estimatedCellPerPartitionCount = estimatedCellPerPartitionCount;
@@ -140,6 +146,7 @@ public StatsMetadata(EstimatedHistogram estimatedPartitionSize,
         this.pendingRepair = pendingRepair;
         this.isTransient = isTransient;
         this.encodingStats = new EncodingStats(minTimestamp, minLocalDeletionTime, minTTL);
+        this.maxColumnValueLengths = ImmutableMap.copyOf(maxColumnValueLengths);
     }
 
     public MetadataType getType()
@@ -194,7 +201,8 @@ public StatsMetadata mutateLevel(int newLevel)
                                  totalRows,
                                  originatingHostId,
                                  pendingRepair,
-                                 isTransient);
+                                 isTransient,
+                                 maxColumnValueLengths);
     }
 
     public StatsMetadata mutateRepairedMetadata(long newRepairedAt, UUID newPendingRepair, boolean newIsTransient)
@@ -220,7 +228,8 @@ public StatsMetadata mutateRepairedMetadata(long newRepairedAt, UUID newPendingR
                                  totalRows,
                                  originatingHostId,
                                  newPendingRepair,
-                                 newIsTransient);
+                                 newIsTransient,
+                                 maxColumnValueLengths);
     }
 
     @Override
@@ -251,6 +260,7 @@ public boolean equals(Object o)
                        .append(totalRows, that.totalRows)
                        .append(originatingHostId, that.originatingHostId)
                        .append(pendingRepair, that.pendingRepair)
+                       .append(maxColumnValueLengths, that.maxColumnValueLengths)
                        .build();
     }
 
@@ -278,6 +288,7 @@ public int hashCode()
                        .append(totalRows)
                        .append(originatingHostId)
                        .append(pendingRepair)
+                       .append(maxColumnValueLengths)
                        .build();
     }
 
@@ -342,10 +353,11 @@ public int serializedSize(Version version, StatsMetadata component) throws IOExc
                 size += Long.BYTES;
             }
 
-            // TODO TBD
             if (version.hasMaxColumnValueLengths())
             {
                 size += 4; // num columns
+                for (Map.Entry<ByteBuffer, Integer> entry : component.maxColumnValueLengths.entrySet())
+                    size += ByteBufferUtil.serializedSizeWithVIntLength(entry.getKey()) + 4; // column name, max value length
             }
 
             if (version.hasIsTransient())
@@ -438,10 +450,15 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o
                 out.writeLong(Long.MAX_VALUE);
             }
 
-            // TODO TBD
+            // left for being able to import DSE sstables, not used
             if (version.hasMaxColumnValueLengths())
             {
-                out.writeInt(0);
+                out.writeInt(component.maxColumnValueLengths.size());
+                for (Map.Entry<ByteBuffer, Integer> entry : component.maxColumnValueLengths.entrySet())
+                {
+                    ByteBufferUtil.writeWithVIntLength(entry.getKey(), out);
+                    out.writeInt(entry.getValue());
+                }
             }
 
             if (version.hasIsTransient())
@@ -541,7 +558,7 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
             if (version.hasCommitLogIntervals())
                 commitLogIntervals = commitLogPositionSetSerializer.deserialize(in);
             else
-                commitLogIntervals = new IntervalSet<CommitLogPosition>(commitLogLowerBound, commitLogUpperBound);
+                commitLogIntervals = new IntervalSet<>(commitLogLowerBound, commitLogUpperBound);
 
             UUID pendingRepair = null;
             if (version.hasPendingRepair() && in.readByte() != 0)
@@ -564,15 +581,20 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
                 in.readLong();
             }
 
-            // TODO TBD
+            // left for being able to import DSE sstables, not used
+            final Map<ByteBuffer, Integer> maxColumnValueLengths;
             if (version.hasMaxColumnValueLengths())
             {
                 int colCount = in.readInt();
+                ImmutableMap.Builder<ByteBuffer, Integer> builder = ImmutableMap.builderWithExpectedSize(colCount);
+
                 for (int i = 0; i < colCount; i++)
-                {
-                    ByteBufferUtil.readWithVIntLength(in);
-                    in.readInt();
-                }
+                    builder.put(ByteBufferUtil.readWithVIntLength(in), in.readInt());
+                maxColumnValueLengths = builder.build();
+            }
+            else
+            {
+                maxColumnValueLengths = Collections.emptyMap();
             }
 
             boolean isTransient = version.hasIsTransient() && in.readBoolean();
@@ -608,7 +630,8 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
                                      totalRows,
                                      originatingHostId,
                                      pendingRepair,
-                                     isTransient);
+                                     isTransient,
+                                     maxColumnValueLengths);
         }
     }
 }
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..023476374d42e2b0a76234e03734e97f717a7ada
GIT binary patch
literal 207
zcmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|Nn!TtO}EWv?xRzMl!C3u$Vm|G|Mvx%~lSf
zIcy*_=N$;mT?V0fIUzLPeh4j)1fhkvA++!+2rZfdp~ZP2w8UlzEtL<UWmF-w>}d!s
J4>ey=763_z6T$!h

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Data.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..dc64ae36adb750d96c703e0a36e668bac841064c
GIT binary patch
literal 8780
zcmd5?ZE#f88NQojldz&Ef}troy(yDsA_aE$-n$<WWLNyk6qK!10jVW?nS_sI6E>ft
z$*MsjD5a=Pv{H$b50xrO1w+TeviX3b(8y4VmSRL5G99ci)fz>)eV=pp<KBzE9DnSe
z%<Qwzx#yhsdEfKA=WYbQAZ!#Yjuur_!-7Rng#w|q%i(ZJvf^@kyu~(wzERuNhr+9w
zkR1*$t6NmFu&ieJlB%+XhKjoK<z<V5HD$|IRxeq+Xi-Jein3)Dl?#_tFIiC@YN}qo
ztiEzlQ%y}pb^YSn%3yu4ye3!|TKGU&FjyB{60BcU-neq*(#ll}tD8dgl@;ZS8<*CE
z>T4@2tLqy=Ws5694=i7?yrzCp1wFewSXKGJ;!0XxxAR6T7PY0SZS}NB>!9Dd%HHLy
zvGIjwZp&B@UbEW2X1X<G?{@kU%;}M*gUz;8?rx{_u9d~M)e>%7?T>6O@>^>~+0D7_
z^aP9FYJX*jz;ViRLYxcmjSetfM&|x<Fx^JxTlrvmjLc)_!SouLr*{y@x{9&<j?7b|
zFwcKV{ML{t?JIsS01+Hr?pBvTQoWYY#ld&5RSokUd^4OH(sW6;X3#dav5Q$}s~|&r
zaJqGsuUm3HZaK@oLN0AX%%>k1wHdo^;i5rE|AE}Oh?xA?DE@#w>1TyY*Z2+;i~v>E
zCWa$ClMVrN-KNb&RDj+vK(~>3ix;5B$eaxsM6Z!KCqf`=m1*|v?CUK4$i)pLgI#tQ
znN<$TII&B;UOgS)HF9y^k)2!}W7%|)I23~IIjNydx9psmuBxxFZ>;JWO@yXl1VU;}
zQ_rx^Ec6|(hXap68&RIc*>BR92=7KNkCNm9+|6U|GBW$~G2Crr4xR+)F)~kW0O&O`
z&rx`5*-evG^+k5cs-D9<p9v5`ajZ{M&#_Ci(oUck2e}aY3rI6{%L0n^t3#A-_HJb#
zRnwUOKfx-SdlW8CaY7v`olWY)NW*CQ<`fY}kxJ62ZhB~MXSoW|D}o4)+(s&#Wh3+@
zwmjAkP)TwDUYi5ZWn@km0_Zj}C+-61F*2nvK(CQ`Lx4bbsZia-aE6`D97)4eGxB-J
zedBq^XH&@c?PDRa>qZW2^-Yi5`H-!aP18Qk_C@ZGyvagrDn1Wu)zZGRz4@|gX;-mn
zU!<ZTHT38zwsf!$E$!@OoLRFhKC8Vk&u0z3Xb~*V7J^`w!fdPPxJde_v)C`w)fsf+
zvFOhucUprBxJ~KIC%T;?51X2hrOse?lgjEWP3CzUd`MZRpS#c2*}mv&Tn}jy4npn0
zmaxtK1_Tl1k1UDt&|^R8LHTelxd@=W*+5+;?qObnZWH&|4sbn2u6QC+P$zm#)YGM5
z+m8d>(087mJO(1FKl`4o-B6_zfDY9Ohl~C@?>i@k-#XPUIgCzxflj_vg<aCKNqZ-W
zmzIzgaH}Mlr)Bi>K?)KJbGO=V|7J6N8YjP(Xd{FO^YvPT2&9r63MH=egX%Imer^t!
zZX+{q1Wb>SnSTgOuaQ~QL131VIEiv5B+C1Xp9}EL?W3Widw7wdd(fH&-Y|TuH4ME<
z;5&!&dlkG(UI!CD$JS(b3O=hZx{|7lB#@oASUTD3D86%Cg5GI7)1@ag`q$~5LfGc(
zz4<)Ff<+kd)NQ`rnGwWGY0Zp}Mdi$C0O(4NgfI(A0J@X25Ga;~#X8YrBvuZGZ6tw#
zXs_$Rv~QtV_=L4(b2n;k(>%(^e1x)sng#NxO#v>1=h%sUt_*J-I0R+Ra6|G(Ln+e9
zbt}%KOK1wLMJ|gvpW+IvF9K?^sf||yG}+X%LjX-S)wv6x$)>g<!_7AJ$8vxso7(Lq
zkhM{>FwDZ4nn&)XOrel#9<|LEeXz%84YBEQONzF(;WHeoGr)^u+J%K{xFp(@2bz#1
zr+KOPqEF0Cky7zKiaxDAgEwotVM$LHVJFcn^+Pr@eK$KKP^(7p3qvudl3FN1xl^sT
ziH@fdZ61uOx9{Y)8C72vsSKl9n~h!(-5>X<-nVy{e5!BE2n3P(vf|w%@}X}6*Ur>D
zPUPB|nnwxRiKgc9hHQYw=25&!y`JiFR;(ZWht}AQe*d47xoYtP&f~QS!&({j-5N;^
z(Yl@9r!5y)1`PjsoI7`HK*Ao*9OiA6`h4cjbintN(&14kIgF?Jyb`X3<VxI3>5R1Z
z4(?~Qg_1ML<@MQjzoW%QorI*>)HYw=QUS4)>k>C(_*CDDK|;+J+t6lEO&8mSHZV;W
z+wfd4O&8l%N*evec71?*&O5CS&wxbd*WnfmAMa}`N)chCSlT3`2pygp!XYl3x`p4-
zAw}JSoa`3hRlUa>VqTT>XHAJMVT40UKAKm?D*-|iUe))fd<0TSsspducM>B_{ldTw
zFpd3!c$B=#+%6m|<zAIs4z~23gGQU$g);)$1!aHny;?2JwU+$$vRa~)RVQlwLf^+J
zVoP1hJ7@q@nT;JapNnux>)90NC1F0RM8lxUnhKxvAZ*gH*nk-s^SKOP1_8@DIt9bm
zZ}Ah%gj_seo3DRV4p^=+ymn+pgCS4O5Ky;nEFzq@3#dmo5zzcFP_Kzw6d;skDVSy=
zvyEB`sw3avSiK)n6T}_sz5ss>w`K7C16-a&Gw0#mT%l%By~8Q;X0FR~1Q(Gh?20V4
z6i)Q%AZ<kSTGG@~R)aHe3eYb$V|z2NqR;jORSm{WOKu^Yo7NE~I<8I*g=_Us{Z`lN
zZ=zEPt~DB$rYUTCx?0L@tN&)RZS_A=1YM*Vw8Z$DNZH4D$08rq){pTJ=1H!%l9pTq
z(BB-w6I>?lvmHR)G43g?waBTt>j+oQ&|0W=&8GS{g;V4-bb5}bp{j17zw@!jt;kc+
z`!znZ75EP5OI||R-B+rr!t%(JFd!FSbY<h*lpH-mE-T_i*a-Ob3z~ubfE!pNYs<(*
z0^E)}FYZ}7I<MLOHzSxJe$k4&ghj}6c@b)(Bg}rN0jQ!Ii>cl&1?s}8R8`{=dfWlj
zZQ`EH1M1=2yR<i~ldgFDWiFKA4eR8XPVcA4@z*VR2Q8G}e$*1r&<_V`D=a6ADv7^A
zSZY7pqgj}MbFY*I)2Zkv!9Q)|QpkVPjVCE)m&QCyN7tnIZxBI}^z+eUF8VQ_cH!D?
zS9vWdepH<{6^__}%n=NA>4%aQSMjoO>n4J3<2u0gn7HG^K)ng>kaiH({$%nqmmP#Z
z)jVooLW<mJyX6yt!fxYLz-y_{3q=&iSKNu_vt=2)n5Iz8(3z$DU6A8!I9mT42bV-~
z>iAre0oO@HMLMAyfN&Qd04(e12<%N82u&E;z^x4+xyEHCCPP--Q3BL$8hTG2P>*gN
zJaC=>)NA6F@&tV528CT`i9ebj2}^otd$@*;4e*x`(;8O%o@*EXY02T_CZ?{`^y^=c
zDlX=}k{=>fY~3hPtA$0w@G;K;L1YrwwF<}BGF&W0Z=BtnAp))B?soVZDhV~neAWVc
zUEJD~7Q(o-jm{Mv?<Ai_KUuMX;GNd$%U&D~yn24UEhseGMCUAi3|29kNBC=jZ=k_X
zd7^)FIIt%Ig`~91<T6R)eMkgRzACmUER)oX@_bOD$wdG?%=5uTT$|Pzp4r<tiL0i+
zWzjBbwUcq+s3TPz%>5&jS2J6muI6AX$FwBwwx~I50?x~3pzjXzBsOtBpj@qG$H!u0
zxR~W+POyOe@n)*r%dTag(8)-JtGkv#_Xn>hIoxDw$+lms+Ry!Xc-+&zEaGY;+kRZi
z4hkcLnlEJ;AAoAQl#Sj6X8cl?rd_m7_RdS=D)C<nxrL#&R%s#~%;`uGktEM}YOosF
zT52fxA(2;(EqeFhGb^OOyluEHKqz~T+KeXl2TEH`Ey&k=&BQIr)_m=U$p_;U9e8vk
zW^6h>*W;l5B<7TKD}iYp^z6W-@mGllr}C;%{+E?p8P$@n5o2Atu}G8KOMtp{GXb6H
z0GinG!vWsC-(9$OG?yy&13SHKBB`olO8p{5&cJTjJ1$KPvByw2<FT82hY_B8v#3CL
zqaoVw8`Ny=&*+)Qvx(_y2$u4zy_&7%CdYF@)*Xu3nvSmtvZxb*l5`VtL6!vp%Q`yN
zQNyb#eos5tlmn8B)|FT+t>Gb*ljt@M6<71uySi!cyvKL(J9C11M*EfOU%^@7jrsJ8
zJKhGy{}rN2bE<<+rb<02V=O`T2zMlYs6SKAWAFM4?=(a<C8LVR{@1(URPL)SRdKd<
wG_{Ahuj%MG$zBL>U(>OHZt)Vz?zk47MuMva_U-vT>xuyWE5kC!_r**92S0!DQUCw|

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Digest.crc32 b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Digest.crc32
new file mode 100644
index 000000000000..2b2298994977
--- /dev/null
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Digest.crc32
@@ -0,0 +1 @@
+2438029640
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Filter.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Partitions.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Rows.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..88f2a3b55db85ed1b7560ad87006e086dc43db42
GIT binary patch
literal 563
zcmZP)<z&+03}7-cV`F9tVQ0!@4`MPlXJfYKWU6BqHe_a)%*xELf;E8Iz{G-!gN2!4
zFEca4B_Mvz9KdX7YRLl<`~9Dpk*zj>&Ctl$#KOweQbmA+MbUwepMlYUftR=b|Ns9D
z3=qI1%EP3^1GA2sDU%y&9S>6-H`qEZpmkhG)^P%@1LEhL2<!f^0j+DnW}V?dwmQ06
ZXmp5|nXVQZ9~SPQtA!><q<B#+1OVE8VV3{^

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Statistics.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Statistics.db
similarity index 79%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Statistics.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-Statistics.db
index 9e8a4b1b43e6e882ed2752521e44df0b52839877..a8ef980b891b76082601adb65e86ee5389d307fd 100644
GIT binary patch
delta 275
zcmZ2yzQsa^fq{WVO;ef`NP_@~&j`esK+FWhJwVLNz#zT!bJet;6ICo(60V${J~39F
zCHu&~dmC%YIT_10HpnwFvTk0$SkJV1GVf&O$(z_6+!-r-Ul#pk;QrCgz<TIXM7j9{
zFkN@%NB#f*|3MgNFc{d+YS3|GV15qcBh!qNb)>aefDUTe+$gw-g^dlw+_*VWtdnJO
zgP6wTS5hJ_5Q`X>J_2e4x*TdTn2+QLpv3F*k{j1eaeFc4?osg%?XQ>KD{*}HXHomA
RIol^p){<^v*>3A!3;@$QTMz&M

delta 244
zcmdmDvCdqLfq{Vqh(SOFh#7%67KoXExCV%s85pFM8T2Qwmlm7oqBt>Aeq&ELCu8x(
z3G$4LT$?vA)-!Eh%sZJmmNDXS=ZPE!(M9tZSe-2{D^7m_rd>T2)c^ngAB2I1fr0%5
z`Iv^{5Bfk(3J^0o-Umt^084?mUtlaK&B(IRU^Ao89G1xr(&EfO-sXd1lUXJ|5R;o6
gA}yi=v6kUwG)ONHA@LDT`lIwt)m{9><TcVQ00_-QLjV8(

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-TOC.txt b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-TOC.txt
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-TOC.txt
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-TOC.txt
index c20f4a8ad02b..8edd6bb68a27 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-TOC.txt
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust/ca-1-bti-TOC.txt
@@ -1,8 +1,8 @@
 Data.db
-Filter.db
 Statistics.db
-CompressionInfo.db
-Partitions.db
 TOC.txt
+Partitions.db
+CompressionInfo.db
 Rows.db
+Filter.db
 Digest.crc32
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..8e305d7a16285095b99e6a63ab51f1bd50ee42ea
GIT binary patch
literal 199
zcmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|Nn!TEJuz3X%UDxjAWbzVKE0lXqI~rnymmr
zb1*?@&e;%}+XzDQ+=tM7#SmJ+5JC%r>;MAcEC?;~4MK}GKxheJ2rYRKLQ6yKmAL`o
H%hdn?Mcx%d

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Data.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..666e28a7c0400e3e4f29057bbf891e1237ae9043
GIT binary patch
literal 7877
zcmaJ`dwf*YwLU`*$qZ;tND2fioMFVy3|yHr?+L<9ey9kFI`M(=q7YORs*-3`CLu2#
zg;0TpKx=$9B7_K6Zzep9wn8$QnFMMv#Y$6)FqzwWQ-m70wvDfzerun3oEb~_gCFeg
zd~2`0zP;Akdk<H{agCfts+DEAj?-{5r{^~KCCO^DJDe`Jr(g^RM1KBKu1VmyX`H5x
z`|N{B{<n6{8{GLxXyVYO&-QSm11C#XU++83*K!)L+pTAL;_RxOvEdBYS>F-mwZ@<n
z<t={eVI8xU*}^ZKz9Gsz7vXjJH%E9&yLBMJiEy1wB>|n*_+ei_=W4gwCXW_kUch3M
z>)f3Ki{`WSQC_t7GurbosKEsLBZPcb6y+y4W^-{CA#XPk^8a9p&Q#9P6@>h=7aUhA
z=hylupH<+?1XXmWl1}v#Dc1{<CzTX!AY_U*$_sm)wAiN18Xb9XNdjQHt`){?2N|+^
zDHikXAcl=|=404@amF$)3;Lg?oZrGhng2=GLA*4c;`UO>2udvKgO0UK37d5(i;pPG
z<jWAe2-l@+#-a?q1reTSo^WNtIwrBmg4;C-!PDyR?l{Yj25v7+@@Wy%hJj8kEmzo%
zgGn}_QX6p}H-Q_}iOw|c>=nda9RU}pD~)@jhq&vT6s|jsd&dxQpIf1DJ!#x~JBj<e
zo^ge;85DNCuuWS=PCeu!>aP9>pApnq>{1%HjAYU8YduS_=ogGc-o+hsTK48d_$=${
z2kX!9d_X5Cn(ehzLa;2!ceoUcfX>oxKj=K?W!ZA}!j5rFHKa{UIJR$pc4-3b+pdkx
zmsiaf$WN$dx1Y&?6E6$@opF&?&{zT&4Sz8W@EMTA;nG|bcny6(;5!sm^A`~y&Qw}R
zPusVGYXam-CH3_aDaQlEok}|0K*;wIZqbv<IZuHy8(Xolr%%P~uLwWMV(xvx0Wev7
zaK<5}a=z{ZWOUDmHUD)6kmOxuw13-s5nA`^RYeiLu-#!jqNUZ)a?5Cs1~N`cT60C=
zNy*}{B@J@BvkL=b%Y!;BYTjTlu1HH^Ag_pAyS)RhZOW5><YnQ%A#~q7rbeugK@q0L
zm5LgH@8?`zjfzF}2{jzfJ4uZkL7ySlP{a+0wbrntM3KW4(8x7H1L=`N==3)7<q;Ew
zR!wZlw%F<jdSaMjo5q^uCFq%6tl;AbFf~qiDpQ$lt{#YSEvTY@Wj-Kz7b~4dc4uon
ztE-Vs!lP_>_fizjJGEW9Ec5EET{_{B*ysS)`9S9Y6naIEUA#Ow`a~&>w)PM3!o$hY
zihPVt>+lWm8#J88T8jw}W-rxKQ*d6DSn1N~0v9N5&(=CBwt9wm?ub*2S8(o`d`n>O
zX-Sv=zo#XXe4AFjs8qG`Z6&|>H&tzGOpyB;Mi8WFC?SZ|+BZs;w+gS~Zw-pp02kQ*
z0)k{XH>^lvyO9tfm%h><=MSu;;9f#-MZ1bhYW=c+ps$($In*pb-CBbFNr88!v5xms
zW>scFG|`ntJJCh7Y^Wx>Q)%L<nFeh^WiITBo-|f;eo&`1Pws^N5wk3B6IPAJ63D|(
zt*~dhwgLaNNolOV;sdSmx>6WK@1VW0*RpKc0H0;9Xx@`8+77Wcv)A&sKKSddbV}W*
zSGFK}aHW?OD89l_#S~#(hYeAVWX2Tvc7%deS&%GPHv8>}^;qF*{Joxw@`c@_%i)=!
zXfRf?!UPCA=_6W`?mmjqeB`U>R51w}XC}y756G3q%17d8&3D7H=uTtlp{>?@e-R*0
z8f%)Lpoeo1*%w7wzC!pRIvTmUoTmGaI-sUUVHuraAGciav0i8<%;5!QEQ9HITR^E<
ziEW<cu4ru-K;^PqZ)N8hv*6*Evz^#IThF$}8rT(WYJ+>SEjJ92E9+raXvrqES_->w
z?WD2iz!I2Rqm)$H@wm0%yQxy6=+b@H_RD8VWn(f^?9Q|@vv?V1GrWj$K%K;;u1{F-
zqxs}Ziw}@Hjks!%pnau)JZY?FI|yo*u!390%e4YvRu5UNtb7fi=`akVH>BBIR}Reh
z9xPk@j^bpE<q+DylNz>-+cxD{999)kiSkBlAZ+MRN4F0B!>00x3+f8Hx0%SLW6ki^
zuHurYjX4DUrXJqb=fyO5AwpvxmBckb7QJ$ehW=0#AKI+s%e2W0lyjLnU;Z9bWlz%x
zstn{L>X*Y^r1@INrjF4_F1@(|a~TFBiYT7nQ6w6LT^U~!=r1Nm&iBF*e(X2G?V}Xt
zdcxk(12cZnM4vfB$rNJ&tm;4jJry0&0EE7cGEYfQrC@HTN0ra(jPRO`d7?kwKu}4y
zlmNJ<r!UHLkLHX1WD^PO#Q8&O`Vd4_6A8TZCAEt`g0%zf!d464>&9_PzwnJqWPKU>
zMR%}&h|jXG;k;*gu01>L%pc4`u(if{&9oZp%&dgNoZC>?-P?s}jJ#)nt_o3hCf&Vs
zfD2BDP<R%BD%x7XVMV6gOBWaE-2E-NEQb<z8EFjXQ^X2R@n2r7HA|9WS?x!-3SkSY
z3gap0KUj+y4a<gOm2DC8WxOzruOQUsA%C7|gFntR;w&ve&(r~OrLpE*B&ga8$eqTz
zsgt0!Fe`e}ShqD0v>9fFF;gmqTQo60>&cPg3V^09L*$cmoVHZG(3Bx(5_ZorV8)%W
zZ1LL`Xyi;GpEhGX?VIlR-$0}%U4e3Cv#NM(+XNqwh}qPd=p$6ag3CE_q}dlw7Ua+4
zeKgcMKQZ)EFH_+{^B>+NO`4fPY-kJ@lcfBe5hQ7&Zu6{isd1*>xP0}PEBBysC2^(R
z)2f~Xrhj{o-DW1ZQ;BudhNd3vP?(-nX1$4+J@xEPkrm9#oEN`OZlxIRh!FRKlK8FS
z1ltE&g_n8OwO?u@_U~~MqDzP0eiNq=zQ;DvRu<*o`4Hu)!q8cs@79S{6<Z1P>vhCG
zGYmeJIJ<40$^c4SaKD%4{}ktFT&5E3BzQniyi0Y;Ey~fx+LPiH{5aB2q_4aRDSkQ#
zx1yLzRik3pr}*&k?fCdA-NMOeOp=mO`#;y+^PP;s&XK(_2~U<@Uc#y?`x7P5?rCK4
zDi%ulITX%Q3ow~sd@IW;;c4dfokEnKYdJFsh-or57dpr~ZjRt(VJxPD-Jb`68K;)g
zMs{pZT4W*ar-P7t&BRXX)DEkPs)YDL2X*SI>++FWHkCWnL4tqV49jK*?2C34lf&62
zq$wh7Ht2~K64K?+*p2o0-l=%6EDcWPL<m|~0?4H@MXS!IU1Gl51jw!O0RBa}(A6&x
zxOg~`LM??M6RB=~sF#hst)x`ou^%g<s8b4>^eUQ!B>f1QJk|tF$|$Y;PJse=LO7Fg
zl*KPXCmq`wu_3~bq8^f=IVzl_*q>nW>mI~cTGKVy$f7$*MccM~-bcuc`9M5L79sy=
zCd8hP9<pO1Ono3+$T~`ZTn=E`tU;xX*lU5Ijo>OFN_mNSsfmQHXHBGPH8qi^^V6D0
zs1jk;8V~i+j<62VwNhCSdPu0S3v(E2^=UmMv<~stn%`@N3bs#2#}`TaYa>&&i&VAe
zB%(vgZT^xtp>iM&KH(rC+aV4P7}z7iYqMmRm%RmS`ErS!B~JWRO|r7P=q)jgel|ik
zs`^OPja=KPlrI$S&yeTRY_HT|HbZ}`8@V@X<hjD#RLmC=`r1nJB?6B`Pa5$J4?%-X
zaMO{C?{7o0$v0d4S`3@c!z$W_+8Uy9a0r-jXaz98ZL7);1oUQ{hqcCyLzHVatBT0>
zO6XUeG}|!d6zKLr_ms|78u~_0u`MT|VvE8(If%-WHN85cC0lf;Owo$oFC|u1KRqv{
zt*DPmh*^MH<s^+c`txZhuR$4lPkE)e{iD81OpDZ4nm|m7)0)etOZ7xwqJ-HzS0Dt(
zgpWtbLGtYEUoe+J#HPp5TKFr@BF8DD$1MNX0l;*Pgm3JU7GDdeAkZ<|U!T1OC~w99
zPyg}T$0iLTq<54e|Lj#i>5uXnbYr_U$yI{-Icii^LG{c*gwUpP2Dbh|oz3jR%(2$6
z*CLx}XjU`r*mIe2FR0jsGlvP9OPkrLvIB>+dkMLv2#6~=JuNraPsp7)K-{UEtLg~3
z2R36mv=oM?u#NZttg*41l&tvl+TQ5D$%^Y0CBi@OjG)A_wuBN6sSd~6IXD~3axwWf
zQwt9aPpm*(@jt&~2LZx^^bu-SVd!3oIzv@stI`=%Y!=wC6Cp=npY&pJHoCON$rP-D
zPFmhG9$<DAlV+Op20_)BJ<%A#H~DC+ZU~<{pT(w#nIgd#EF-8M_C!~jrD7{V4Gn<Y
zDhD(dXISpz2-*p=q9=`YKW#zt%kwe+U3OXiPuhR-BFn=a7&bObV8mXK&`B0gLJUk*
zKi`Do{|dbRoba;e{=6+Wf50S(KVZ5&C{%t@WPl4jICyzQs$MNkv=aBX8u?w}eQfg&
zsklG#TNDRFi=;FX&v<O}4~0LClHVn86>ACms3ja4EBl3`845Lrz^68YnfF$NKfq~l
zy>Qf`PSoXc+<|OSdPsveIRn+Y8fpyT@^ZReRPV=G@4L5)@LNzoYdp7tHiWgA>P?M+
z0+Iv8qg>cmMCI#12c}eCFT#hJ*q2Cbn=3OhAF1~)6Z`LaK=w{5hKfDf2S@1SWd4>k
zcc2)@y(B0M&9e$>H2uJwX|&@Dh^A`=<VvHR%qQqd><8iXS(xiq<%#pKd%A(CmGZ>t
zjOEX%5h_{Dcv4IGk-+My(#Rg%A7FS9Ir+(NF`HpBvV=<6l1#aP?Id}z<<HGPOjF?v
z+5@f=APA3S$Q}yFg>qm<(;zUvZH>wzf?8#wjH`YdT2p^eF_nzFh<b&vNFA6j)*!?$
zgF5Z>8V4%99WKQE<65!{d$?g>=Gk}(79A>E<U1dsNEMY(eAAYtJx0)c6U8@;mD5Df
z!W=;EG}ial5_A{LCdygQ)m|FAw>Lg^(#r%``pbLHGdU8^|L!r|n~EAe`hv^nPR&v}
zhAC>;rLVX{LN6V(`e{KXE#@*jR2M5|_DOUO=@$kw4si%(cs*J_B73x)B5SuMxh$|f
zGcK~gJ_d(H+f_*_Dp`-ci&pU??BK-xq38Mr8e5GyMaSvU2|pljp^uQtAt27AA@cBs
zWrVEj1ma50PsptjAsbtPxKlZIOd@0(Y>J*#&RrmC&5fn->NnQI%EtS|@?U)zHT`@(
zzIVhgC`&ppW_%HI#D82^>}6%k`*GXTOb6u`OnTNF!(AK8icJHT*P*(*B%y=Dx{5Ok
zv)N@sdE4uGuo=O$)x3&YMbBrp7K*35TzHx{Qy=a5VL7?JXAqcOMW#qRRYcI6@K|)H
zXaqfLCdd!lqBD(EJxtI$Fe|!JS>oD-laU?oEu1X6RU}r;dVYM2(j#O-dU#T4maR=>
z@mLO`TVtFq>`N?>y?Ab43)I*b!F-8M5&mf#b&Bd=<3X8q3R$L?m++1kP%=#?hw+<2
z^7$zI(n^d@I`w+C>gR{>FN*2YGi~_v!bDCfUEi#qbRx{-7tWN@g>cplO`%>B-aEpz
Rr8?TgDStE?$M3J6`Y+|p?q~o2

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Digest.crc32 b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Digest.crc32
new file mode 100644
index 000000000000..88e7d42325b1
--- /dev/null
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Digest.crc32
@@ -0,0 +1 @@
+4022876624
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Filter.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Partitions.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..daf1b01ec12e7451c1daad819682b16e15a86e4a
GIT binary patch
literal 62
zcmd<!jO8?EIKz3D(Uy~maUG{Vll%e$7DfXBh65Z74$=+`j0OyhCJYdu0Hs->v>X5~
C?grBU

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Rows.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Rows.db
new file mode 100644
index 0000000000000000000000000000000000000000..2f8e2aefce5f3486734c3d173c1ed29af3fdc9f9
GIT binary patch
literal 563
zcmZP)p2(yw8Ng&@#>UL#r_PkPK8VTKoQ>IPAycWjupu);$6jWJIj#ZB1|}9<94yQX
z>rXQ?91dn?xOO{$+0fLI2PF0?o|)l`VgQ?=k+F$|m8+$S00)br10O#FqX7diZ~g!O
z{~H(}fJt}?le!GdI&G%J%~0!>FqK+^tvdv?&I`%9i$Lolfz~}lvMvp1od!1R3=e!!
cqN{~Q2VZ#8)k5P#cS`7Lp~>M3b5Sh>0EbI|@c;k-

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Statistics.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Statistics.db
similarity index 79%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Statistics.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-Statistics.db
index 05a82511a7e54137bc80584db1c7031fdf5983f6..21366868f89be5d960319a38fa8fcc42f575ab60 100644
GIT binary patch
delta 298
zcmZ2yzSlyBfq{WVO;ef`NP_@~&j`esK+FWhJwVLNz#zT!bJet;6ICo(60V${J~39F
zCHu&~dmC%YIT_10HpnwFvTk0$SkJV1GVf&O$(z_6+!-r-Ul#pk;QrCgz<TIXM7ebZ
zn65kXv;P18{~!!B7!2%Z_E_{!IUE7wBh!qNb)>aefDUTe+$gw-g^dlw+_*VWtdnJO
zgP6wTS5hJ_5Q`Ys)&MmET@JMv%tvwrP~!D@$&Kr#xV@Ni_o(=X_SZ{|TR(^YS=7F2
o&hCoITGG*shLd%r6-1r$OY=%ni=6XwN^|o<DhpDj>=j-E0IHN?j{pDw

delta 248
zcmdmMvCdqLfq{Vqh(SOFh#7%67KoXExCV%s85pFM8T2Qwmlm7oqBt>Aeq&ELCu8x(
z3G$4LT$?vA)-!Eh%sZJmmNDXS=ZPE!(M9tZSe-2{yS6KU>5R=B_5c6>2VtOLU|>HX
zLA&67g*lLu0>q4t_kofJz)~RM7n}uQFtThk*vu$2hh?&Zv^X=6xA~yhWR}Sf#N;N2
nNQ>w|tYu{01yl`mJJcCqKEg?Vl-{XkvnxzqBON_?owOzZhsQ%J

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-TOC.txt b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-TOC.txt
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-TOC.txt
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-TOC.txt
index c20f4a8ad02b..8edd6bb68a27 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-TOC.txt
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_clust_counter/ca-1-bti-TOC.txt
@@ -1,8 +1,8 @@
 Data.db
-Filter.db
 Statistics.db
-CompressionInfo.db
-Partitions.db
 TOC.txt
+Partitions.db
+CompressionInfo.db
 Rows.db
+Filter.db
 Digest.crc32
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..ef683177e8f648a5e67570980ba0d9e79709ac39
GIT binary patch
literal 47
ocmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|NleS3~PZjBb3bm03g!|e*gdg

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Data.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..15a18fcb1b4bbd2e1644a66bd4ef33fc74e12bc0
GIT binary patch
literal 88
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpIy724x0m
nBV`8NBWgf~u`+|r(U~*d7^F><8BC8|l2B#{0IL^XfA1&&G;0{W

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Digest.crc32 b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Digest.crc32
new file mode 100644
index 000000000000..43ff6368ec3a
--- /dev/null
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Digest.crc32
@@ -0,0 +1 @@
+786170984
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Filter.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Partitions.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..e20b4e2f2700a2d5278b3858fd96631b7f66fa97
GIT binary patch
literal 59
zcmd<!{Lf*2kK^uc4yGBL`u|H87_cxh@G>l5XK;{qU|=+0U^HQX09h!_3Z<n1ldcHk

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Rows.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Rows.db
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Rows.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Rows.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Statistics.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Statistics.db
similarity index 83%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Statistics.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-Statistics.db
index 62b5d3f7c409f09a71dd6f74e2d424e761f80a22..b46a624817f9eb9ad49d3ceab892919d14601264 100644
GIT binary patch
delta 258
zcmbQQ@=HaBfq{WVO;ef`NP_@~&j`esK+FWhJwVLNz#t?t@7}sq6ICo(60V${J~39F
zCHu&~dmC%YIR(psDnJ0F3j|mvHi&PQ=Hh0byouexh_SNwWzk;-?jPL@tcNZ|lwATU
z0MfNfB<lbF{|~|tT?`EN_l0d%FE=|dxlm9v4rmw~OaiC_$^_C-^BAWC9R;!ynU7=_
tP~!D@$&Kr#xV@Ni_o(=X_SZ|7bH<$bv#5R5oFInDM+Bo;=4MV41pvs$Od9|I

delta 203
zcmeyRGG9f7fq{Vqh(Q3vW(49`AZ7yM8X#t7U=Z9pQO9y(ru@d9a!$cwpcn{%)PVrk
z#0lb?wYj+29T+1Xcb>>$5M4Bnfz{dKvSO+Nn6_(jsQ>@}KL|r~FfiEPFZ^hHY8S|;
y6d-1FybqK-Fj<aK5@;L4<b#4@IuNrMZkzzg0}&D*Vb33>cdBkr6(&myMFRjN3^=j?

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-TOC.txt b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-TOC.txt
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-TOC.txt
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-TOC.txt
index c20f4a8ad02b..8edd6bb68a27 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-TOC.txt
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple/ca-1-bti-TOC.txt
@@ -1,8 +1,8 @@
 Data.db
-Filter.db
 Statistics.db
-CompressionInfo.db
-Partitions.db
 TOC.txt
+Partitions.db
+CompressionInfo.db
 Rows.db
+Filter.db
 Digest.crc32
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-CompressionInfo.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-CompressionInfo.db
new file mode 100644
index 0000000000000000000000000000000000000000..1db9aa06b311ba8dae1bc2eaaeca080ba080a480
GIT binary patch
literal 47
pcmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|NleSj9NgN5yEC*000^~2>k#6

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Data.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Data.db
new file mode 100644
index 0000000000000000000000000000000000000000..80aefbf1d126dbe37864d5dcd5ab5566b5c05bc6
GIT binary patch
literal 143
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13I0&r5DxH^uG6l)Fd8KeWGI%D{T)
zQbf55hXz9?10y2?qoEZ;z7{{jp~WX`7}(7#I|FPOWQ=SW?2m{&1PN6A$p8u%0|k!u
lJ^%`+)a2h~VA#SSV*(U9cBm34bf9(x0~9baZrQ!r902Z`E9(FN

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Digest.crc32 b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Digest.crc32
new file mode 100644
index 000000000000..08abf520f495
--- /dev/null
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Digest.crc32
@@ -0,0 +1 @@
+1825820643
\ No newline at end of file
diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Filter.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Filter.db
new file mode 100644
index 0000000000000000000000000000000000000000..8868e5c18008783c106cc95dcb2a5942cf662a48
GIT binary patch
literal 24
fcmZQzU|?lnU|?!-Qe<ObWMN=f#-zZ)aDW2<72X1l

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Partitions.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Partitions.db
new file mode 100644
index 0000000000000000000000000000000000000000..773d3c8891c3128145d30327bae64f5fc3b26757
GIT binary patch
literal 60
zcmd<!{Lf*2l;dtEC)0mlPW}Ia3k+Bo1^5^?a4<MXJ1{UBFff`hK!6;SW`)u+0G`YV
A4FCWD

literal 0
HcmV?d00001

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Rows.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Rows.db
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Rows.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Rows.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Statistics.db b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Statistics.db
similarity index 83%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Statistics.db
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-Statistics.db
index 171655ce4000b521a9caa573bd3e128e31019a5d..23c2bd70eeb99c1f629b8a6227de60831847abfe 100644
GIT binary patch
delta 258
zcmZ3X(yXe(z`($wrYX$|q(K10X9VI*AZ7yM9w26BU=R|ScW>RQi7J*X30F=}pBO98
zl6~agy^S^HoPy;*6(9i81p=%S8^kwDb8)jz-o)-;#8}z;vgj`Z_m6G{)<c&f${sEN
z)3r+^>;M1%55f>#3=H;nZy9%Mef~MQP*5}uXc!wz0%#<Z38bOsG0qVMiUCz2^O5WV
sO1wTVxpCbTw-;0H9u@!4{(7mEf}`D^MeVERBrceIL@<SA;+Kn4062$EmjD0&

delta 202
zcmZoxU7@1Fz`(!)#2^4-GXile5HkUB4G=RkFbM9QsAD-XQ+{JlIj3MTPz(e>>Og>N
z;so){+Fac14vZ0xJ5S^=h%TDP!0K#qS+V2*n6~TvQUCw{e-MV~U|_Jn=b>q%6muNN
xPXS^^$NNCZ0~7B_0xe^hd{9tK2VxY%qiT>e5Fzmq*8EX=r|S0Qz+`Em6ad~XIST*)

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-TOC.txt b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-TOC.txt
similarity index 100%
rename from test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-TOC.txt
rename to test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-TOC.txt
index c20f4a8ad02b..8edd6bb68a27 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-TOC.txt
+++ b/test/data/legacy-sstables/ca/legacy_tables/legacy_ca_simple_counter/ca-1-bti-TOC.txt
@@ -1,8 +1,8 @@
 Data.db
-Filter.db
 Statistics.db
-CompressionInfo.db
-Partitions.db
 TOC.txt
+Partitions.db
+CompressionInfo.db
 Rows.db
+Filter.db
 Digest.crc32
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-CompressionInfo.db
index 8fad34fe9e11d06e2af4ba6060fefbc095229905..ab804238abcb5ce87dbfa1c3dbc115180c8c7c2d 100644
GIT binary patch
literal 207
zcmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|Nn!TtO~tAS`;D<BN<mhSj?^vn&lCMX3K}r
z9I6nSa{`3sR)^3$+aWYxE`%0fhtPr*5L#FmLW`V%&|*;cNQgoBlII|_bR&e8Rff=V
JCn2;#E&vW#6RrRN

literal 87
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*{llvK$;EAh7-(}Ax!qo5SqIdLi0yL
HXkkMD$@&Zs

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Data.db
index ae35335fbf9aa78f860a9433538853f387fa6b93..4cddfdb98d08b3dd287392059b2781edeec03ef6 100644
GIT binary patch
literal 8601
zcmd5?X>?TO8NQQbvW&||KrD+okrQYVEi-fH&Jv25!L3LYM${sQ+AzXE$TopY7NaHu
zWeI30MJrKUSQCX)O=7W#N-~pH0k_EUSZLKESV*mhiVF%Vx9{_Pv)nuSPk+pxoO7T1
zeB1kc@B4h;ok(6uS}z%_ZK|q<C4;0&h0^jKtJNmk6^GO1E;UK?M%|#^8(zVr+;F(U
zAFOWiFAP-&nyVHB8tOxp1O(>?8>(vkfq4rWYHOOC<~LWjgyz*&)l@Yt2v*)z5e)g~
z1)3K$%%8s~)U<fvyc++aV3ogiNlV?Gjny@Ejnx&w+U90|!~CX-#f?q1cP^-@oY%C_
zPoo4DRLl<r0#*L{+NS!7K&Y{vrq^q`%7{sAEnl@_@}7r1#(GPSZLx{ZGrKb@7+$r)
zvud!6;9i?YFoW$Y%KERa@ERK~mJb_QY*B6D_7&d#X>Nkeb|+_UpbiXPqh)#lxXL3^
zoC|PfE|?BIbM_%HoqFaS{b0KE%)mM@-FjxtTmo5lDQ4f1{caw{X|Q^YA+!8a=|6p>
zfYr8rFYA{`sAuA$SZ#a-+te^$!Cj-$Lzo`<x-42oC;K|*1XS?;eiNzS>6LA(4JX)H
z?4(Y_ysSSwgjKh3&9+K=nWcR<0%l*C$J;-bdBSk+BFlqy0F^a~#t2Tw?EoFui;=)2
za8nULr=HpM6+oAsx$_7>x1RZIH-T)qod)k>-{kNvUkH;37Q0o??4vzYZ|+fx)qnYr
z9#;RY|NAr7CXq2#XtRl=p#VJ4o*v5d+HF%ZRdqA_%&2}WBzVP*K*(z|)Q{OY1HBX3
z@Fw5m_NzI&mtG>g2R{K<Npb-mZwBblGe7bJbn2NWZvg1hGe5NfbnBU?M-#|E+`{66
zpQXV<7R9P(e)}ITL|-9k#w&q$<Lx}<mA#Zp$9%kajlb}Pm$^8)diJQg6q_(&cq>iB
zQRK1=^-=aTsc?#`5HUWF;MmtwAWpH2O!ZL&Ja02VCCLSt&vV3)WM+7c<|QCH^~900
z0lJbzv>rVLpj*$pw3tA4Zi4z4^HNbfZMi(3$J#R{5$h+sWyTP@N&quftti|2A`6*&
zex6{I?fB{*Q;3xU_^%H|Og0$~{gjIla~43-gK8_~N!imMq=yteZbJv#fkJg!Dl=N1
z>n=0aZ81m&TN^>}$8v6u+faN?v+S%5<>6H<Qgpv5r1zT0j8kh(byBOzY-`Tm5bIG$
zd*yx@Rxj;->O<W=!xOpXFhr2wLG9z68p0;atO!8+BtyJ4<eKvoK;@yt@BqvI0MMal
zE`+U^oqA@~RDdo$GiU|qPBO89WUq08&kxf+KK1pnP))t=$Fpzj2C)f*5qGFg_*kTV
z^bgwwUZc|@({d2lPD>L7hfW5vI0AH9tlboNn2{iCqsX^V5ousyZbMy`)<HzVE*`hd
z@J5#H2B5?{T)7F4j`-u6c04-ck861%y5f)PdAd=J6_a^r3X$2an9U#0GxU{O9-%x7
zneBb0PY9dev#)mqPmmOwKO$^C@_7Ca3(t=wV2^z%HsDcKpS#oGF<O)k9$33=x?v~V
zi{v?qt?r3zK7$C!%LMz=%$?ZS+dn`g`WfQVWqE2pD9YOGcuS&~d!&(#W{0L3?ebmW
zu!*9b90FVad2(ag(0b-eK7ejLGXl3`5jgxY7NU7m>%SEUW6?AB=f#a>i!hN2+tSCs
zYa;4<+(o<*cQ`#%=~Zm4WEbzTlNoBai2rM+5D9rE0W3=9@*d(pcnF|9$wesM?gyx7
zm;@er5}-rRe0Lo{r=EGF8K6tg{BSNnx1KqK62w}6rmF9=>TI<e#b%nuUAiojD?DT^
zW!y<n9lbt0!)iKx{7SOsL-sM(Lsv!)^Ca;^t}RUuIeOf`&7c%%%A!A?8+IX2rWRl&
zyK~gTY)a-XwojtN6~Q;=d4$QxZL~XKGSXU;+4@-GU`9u27WriKbYmUdM|Y=*j_~p#
zXA9@zbk|F5gv)#`rMOJ=Cq<A$ej_J--jl*6Pju`spqk?z1Q`x2<+OJh&*hZU-Z<oP
z>S=GnK7fhS-f<t-={LiFy@#tNu9a{AlXOhvP_(2wJw)rZX)XqcL@zCc9mpQy2O)K*
zW#Ypq8M0M#r=XOQ!@_Bjw;YPTC-UV`%y5!yo|jR18M+ld)9Hyi7a?Hw1KIIAlGM*U
z(Mi0FDap2&&rC2K`YTCUDL`kEh=jgA7oaQ2L~9c;cY=6~-(bS6uin-RWz^N=!O<H|
zrHK@>F1;x;I)+i0EjJy80(NWMSa0s<v9}AC-|mTgJs5K%!62KD-u5IGPG=hGMI~d8
z&PEZ7c@PP32QObrOc2W*EubfS-tWg_!si21@tE-WMKlu%YJ561qv$ck=WE8m=as(F
zIU*JAy>$4n@1z2zHeMKgbg>~lH0V)Yfpx3JOwA4OE&cUD9$dwiYY4DrSiM>-_H<|n
zagg`L8o09pyuLIK0=8@Dv=J%tjZ7Me))f7Fo`<tLa``E&m{CjERCp5_I`s|BTL8K=
zgJ7>OUk%i)p#p7ROeo7yFiHuN@&W+wzg^=sotB8&W~OTknN7L4qpY~%vJx(h_Krdw
zTrS)tnz=bmlyMD^*Q<r>%WSn0CX%z7yv6K3FGH<l_mI`Ag+R|+49A&rAw7>zP}%I5
z-`Lv-=YI1#Gh4?do5FCS4;Fo|;XEU1oh<{ml|rUus+HVvq8-Ch9p{;4&_%v6Cpjk)
z_E}Nw{wn5=_7LVTK5&)ulam1Y0_7EL4GkA%Y!X0cjQV$RDEQ;KyZg9s`l?Y9Gnl@D
z^w6!>=6X9rRqgCr`a1`U8HdHVzriD`#w)CQfX<)|Y}y6bxffE^0^f`mFMlHZCn}kl
z%+oK3PCga>a{(`KLDrgOhWZ=7iusR*Et2&B716i>by?n^5=uEbohP7Z-Ons<M1d-r
zwwS;{UP>L9l&W$r#@_*|GllvNzn8i=b(W9c_%4oYtl}#1PiGvW8(>jPq<0=j6Om;1
zph2C;yvU=uSoJ^dLcE6Ms1|V$2w2{qD{M$Yy_?JGGS!KwTJPT}+~`eBdnj=>=1Cg5
zP~s?e6InYRb#s)C#BqP>vK;3Yqj-^V$)S#upHXgTThdHFg8!JEnsJ~H84#7t6zWM@
z8Ew4;_1~fdj=sM5i|>@c2{C6nHIyb2*<X`N*kqmoys&>kK7)^hS$ty#SSYI)3qQ!^
zP+paa6J(O(i4S5freO;?&g@1^<kvKOlH<%Cpmq%#o8sHegeGikz|ano<NWNzP%yv&
z!y3TOl(xn?pf1fgnC%jP?i6a?Y(m+6lS){wA>K7#+nkt{jq>sD1*VgWR&w3=PX+?w
z@u`Suw{>57XxHP6*QxvR%p&Yt55T=#kz`dv{pz9*Hson>cC4!Hq^}JjcHkmP2DOfP
zjN{FZ#QaQYBZ&Li`6JENx00Q~&jv2I=KK6?ps<H;5!;Q)>L=|l^bF(|r`Xv*;j@rO
z&d5%bUSxdH!$9pR1#u(5)Y^IFQlO}vOdJHw2RC$c$+UMOEK3x`0c%BiDAsFpUW7Uq
zV#_ntG1#e<Sc&UJgF1#?i7FXFLMT3<?iA`w0@>Rmh}3}MX|}jag@<WdQy?c7a3SOo
z$&s)L@~?=Qn1;?b&A{Yw=sEuJ&YO$D?YE*xig_By#Jnb;il!|ldJX?B;z*h3XP`P`
zRB<=57VN!5;tIt-SeEi%cWPq-{o-(%2qe3P(RnJ!!gPY*2Zdj9Pj~m@nOW&CKPN;H
zv2}E;sAKD>P;%xBa&%m%Qm8w)8g{1Xmed%TA2TuyTc~l%I)qQwPJ=|fWj#iWf9#tD
z8#OBueXEE~3K=x(ZlDfrOVVhLAE;9^4wNhBc9~f0Iv+o$pB<;3<0|pr88}48it3n1
z<<rtcB)j|K4BDG3kuYPhnt4`4=T|u*GgP$zcT~989vqIxu#!wlEQ-)VGIjcrt&G$(
zhZrMbHm0EqF&1*em$k!DH&rwyrHF=&Nz@Ku65=-2WvTrL90$!QF;Qx{WDuw`rDY>G
zOP6*O>aS%Jxb6h^gpYsie?03?MZYVc->~@)a~v{BmBN|^TDGK#M9OeO9b3k2lP5=O
zJCD5MC4Q|nvx!+j6s1vT;7rbZLoiMjalhmU_c0AS!(%ib<u;}vql<!lu(6xQ!oo;w
Ub-bB8u-b=zaWGsn=jPY{4-46<4FCWD

literal 5214
zcmd6r{Zmxe8OQGe3$kjirosBs_JG}HsFU$7%L1!a?zOS0OfZshGBINlvn&Lb7k8I;
zsIse#2_dbe=0!}#MkR2yj@2b&6O#b@LR&Ctiyg<I*2xg#bcpS=+Sp=D$J^&QckjLD
zGX4SlqJE$AJm)^=?D;&;IdYtZJHS~SAw^Nb@Nb2)bI(T{jyz|+tDvyRy~f7Dh;mTb
z9p1-t^TOdKUtMcMMPtxg-r;L6_j};)R!^|pTh~zOd#<U&R~2Y%X!ZvE6)hc&m9;Ic
zb)Kf$s+|>{U`Khix6Rj4+g@H<>)jcwXb;rX*VTJ`HNpChs$f;1A<)+BX??D|yrHS3
zs@msoZm6uPEN^J8sI6}H?ktCis#+^N9!PI6Z@m?h>hpK)TNiySTe3C^k-P>QO*DNV
z%^!YopY-BHF_;JQMBTMG`os;%+PuX17b~A!RVduGPl`Sr1vAMnq@D+10*ho7Jjc=3
z^#-T;qS4oeo&{#l;3o7eG<&u$Mb9F$XLuSt-Dc0O_rS<U)?oI%=~puGJ3U>JwTU=~
zsj+{H6n!-SizE?IK(D?nu)bG1Z7sa&APYxwJ8iI7XNc}WA2-YkZ!f}WHHVW^ta|4R
zt3C*`YNVP~zjw3hPxDxHX%dsnPkw@Y()mmii74Nt8Sa1#-`v(|GcvgTEF1f82dWXC
zI&*i6$({Fq20wv1*MJzyJQCJpK`wmB2p$k(Sz)5B9R`<4jIR*;X6G};OhPONrAwdK
zemly;MvmC^jqDrdS%=(D=~fyM4~;S37GsVGO636>4+l_+xegf)vf$pss&zg_k&U|=
z^-sydrW_-0St!j^f8phNV)FlrVMEvJhT>nh9R|bq48w0)(69y#*UNxaN8aB$Zpj>A
zd|JcTk!RyxQ#yf&;^FYzz&GNy^zy&LvhrW#uQ>WhnP?3hvv8KY5E!uv?Vv@dKMj5O
z^t6LOG#}e~`7vt%M@&%1Iww^A+tLp*vr4fiV<Ig862-cIix}IBq!H((Ol?|A*e1Mi
z1daJ?Em}14*A4u^FZH^f02=oP&{$yhJg^Oog=Wv!B{UY9J>SShqucEH%SB-1e<1bp
z*R%vm@moi4(*iJ}6@Xa(6jK0XVf%5qIH&3$3n%Cy4O9GO;nW$NRU3=+GK!r2$lC;!
zLK{3+jfXI)$j|~`kQ??AeK_W3hK1keFf$3`aAJXfZKjWJ(hSuCAPYNsXomXm#V8X?
zCdG}g=~2qROs+)v7A*&1sx8Xv841^(Wx|B%O|1HdgH<<{vg#Ix$+@)TW~OO!D)zGA
zZKug;!{n<E--S)lrLubvetk7|K!~SO$z{Q@Q9x#xN9UU*Z<b#(%dfLijIUTM30T%C
z(oo8(JfgLUFQyVtY;6f@ECFwqQKB!{KbxU^Q`)HsTG?NfFn}b+8kf!$!he#YT;*HS
z;E!9Dvlh`Rv~swTc!u&B|7hx2evDJDqPb%_iM}%gn~Ugk`)$PW3WndZD9TABKfdzQ
zdyyZ11WP7#{_c6;$bSeiw4cWx9G^vgeDfrn4dNWl(56N5AIB3(@=H^-Xaw(x<7jj}
zuP1=Ur+Uy>V0flTR&oGsg=R+&o>o$1IAU}qJ+uo%SfB#EJno~%cA>88m$9siQoL#N
z#vr$TBd>OQSy&Yb8H@^T6YvdQLw(mdRN65QA7xcnfK@-;#;P8PRj=l<s&^5q-n?zX
z{H-Yy=Bp2xG4C5=W92ibl9&r8d)cwmVOG6UZBjL?diQZweXxdABRMSRB(eS!>;`S_
zGE_0Uj<GEK?#75wSIBt=M49}NDwS}V{Mj^iiB-boHfg!B-<LbsTt4xz>S_tyD3xuQ
z6yDtmy&BMdx(u(?j<UD*+@vJyH*E*Klzv-fIli`Rj<8&m06-KwKa;N1AhN}7Aep%5
zrzkc2E<jYVqxJjQ$oGc(K!1n2rifOuG6{En2+S1jKk&q{!ic1FUtfAJx)WJ&3X$#<
zY}a<)m8#UJc>fc=hc|n@u_u<o5e2+UGxU_WuuV*S1CA;HKC0%A6CjS+<7jm)FtR|a
z9j|qyV3uz#UhPQXEZ>LrfR&#wLS6`T!r?=m1YDF7uI>`(CV5H1!Qg%w4aug85$?70
z7ODb1AjWTgPM4&&-E}!QqssVzn2?~*8HeuS8?5@_B&!~cv1-c+R&D(StDbB#DLBYX
zFW>HAAcy?^bHPmcjuJN5>p-PaJ|HBjhVUw{m87cW><yxpV*__io78nikp^y+aVKaw
z7q&6oyyFN>Pvsms@|Pia3uzzh><ZyqNNeUoXE!sgus4V**~`Vm3s13w-5Xf7-&7R$
zuVTjzEN0bfUnWssQbr%ncYbuIk+Mx+(A1<pIBO`}Qo_<TN8m18NN>k}{^d-9bcqQ|
zH{x9s;|TAhPk%q+ow<!duX%T-5wtFp5c>1<EfnBIt6f-}n?&9f0yrr03$lpgf-xH+
zpEz{ShMpLjry^fU@3DR!KQnF@4v)<Se#i#fPn;>~Mv~CbqX-7czc0BYj0Unt5fI{v
z8!2G-zKq6#S*~N09-9cMQ?Ng#{1~85^gxsbI$(T}GNgC%#6?OG!}+Ajk=DqMo-t#7
zWfRV)wel&7k#>xyoH(YbYs*+QZK7xz)?;}7{c~3RYl2l@yl0~Pi}xAIWud><4EfAa
zHukHRP-(4Pp%|&z4G5#D<Hw-DC7B1Eao()we{?L!?LMqkg#mGtU&Li;NrSxqw(%{L
zcbKjY^DWsu2+cFeyO=l;HZDZqy?oph{H+Khh0&Fv68QPG+xzH)T$WKQR#mfajn!pF
zpe#7uhI*h3AJ<O%!y9IPp3QNmlm9mcCL33$GCf+DJvW#hblq<W@Ovq{1dsYk3RS@n
zK4ulduTJyBT^u%XFCtyI*y7im)dFzJM+*Ssj;KfAjSgL(@kZ-eybJJ~S0Ma2yPcf-
QF{v-Z|EIHD**<;u|JS*@9RL6T

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Digest.crc32
index 8a92f3c58325..83138ff5a6b7 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Digest.crc32
@@ -1 +1 @@
-2977407251
\ No newline at end of file
+3857770523
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Index.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Index.db
index d50fdeb4e209041c5469b14cf84433c739f95a91..aeeff93045595e277a579bf1fc5330360ccbfc84 100644
GIT binary patch
delta 159
zcmex(j`QO=&JE3+jJ(?)d}YkzWaQYc$jMj;q7QN~6>_q09K6Q7`Qw6oPDZxPiK~i0
z)LsjqJlke{Cm@ey`vpg!auyblGQA5xWlWpr-2_T7Z5DhCRK~P9r?7yN<<G&_tdk#9
jSus{_Pk6&9Sk1=3Xmp6@ADm-+ShyL^F*zc|0pS1uS#CD)

delta 159
zcmex(j`QO=&JE3+jNIEFd}YkzWaQkg$jMj;q7QN~6>_q09=yi9`Qw6oPDb|4iK~i0
z)LsjqJo{#SCm@e?`vpg!au!yQGQA5xWz3uB-2_T7Zx(zERK~nHr?7yN<?q4Q%#$Bf
iSus{`Pk6&9Sk1=3Xmp5|0nRZ#EZhp`m>iMff^YylUp4^%

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-Statistics.db
index 734186497e79a96bedaf7886b935bc46979e17b5..c9506ee78c2ceafe6fc50aa0da10a74d827844f7 100644
GIT binary patch
delta 154
zcmdmC{=<BNI%{U)3W=nRI@dWF%QsGtXJlmEyn(TvY4c*<$;^`vu{#(sR{Sh1vu5DF
z^NfM@Kwokz{|+$S(dSVA|NnmwW`F|wnO_$0+Op_w&Jdi&X#~;6V9NuPWdtH5K9Z3P
Z4Myzwe-=1zUDdy1vVn99%eB*N7XdSfH~;_u

delta 150
zcmexizQcTiI;*C5N#v)EI@dWFi#JY?XJq8syn(TvY4c*<$;?KKsaCoLfec(PHZZUj
zXFDDj{s5*=dkNM5|NkF^8KA(ve-ZQlFFq!lR|rnyG=gYja5xK;WdtH5K9Z3P3m1NE
W{u8V)`OV}HlMAF<SQfu2n*acwP&L;8

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-TOC.txt
index b03b28372b5d..8a6a30b6db77 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-TOC.txt
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-big-TOC.txt
@@ -1,8 +1,8 @@
-Filter.db
-Digest.crc32
-Index.db
-TOC.txt
 Summary.db
-Statistics.db
-CompressionInfo.db
 Data.db
+TOC.txt
+CompressionInfo.db
+Statistics.db
+Digest.crc32
+Index.db
+Filter.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-CompressionInfo.db
deleted file mode 100644
index c96fb7dfd3ba3396018734ae50449374288d92fb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 83
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMli|1z^bqr$YO)?pcM0Y2!s7Ngyvokq50=RXyHBp
D>x2oL

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Data.db
deleted file mode 100644
index 5ecab703ba617cf8159f31f7a5ea84db94d43315..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5315
zcmd5=`%_fc9lr~%;QF{aLW)eM7XdSB8t*PFu*S&QjZW*M6{eFCJDo-zY8F^w7ujW@
z4ZH2ggp8v@noML`ZA~bMaT<j_w4Ff~f#r?7U7ljiR6C6sO{Z3;>h1S?cJIAso&Ev-
zva{ds`J8jU_nh<je81;#oQC@)r_p;PN%F#fiA&~=wdnPRlvHC{dWLDMj)NBIl=QB*
zhUcQa-U_G9ZY{An4!=|2&aZMh4;DBa#g4;{%0uoVr>&@<)Oo~VE45n7N-A8=(n@<#
zNq$9XWl2T8HP3aVyfVMoT~%0BP@eB7EPKaZa@gUp*{pfaLvEY1*i~}GR&ms2vzHtx
zDt6{MY*l%-qpm~6g${Rx-C1_1s=``W4B@pJwreq{hSJ)a*E<>)&Dx`4i=j+M16|s*
z#N(~4G5g-I67aVg%oN+0MLI?TX07w3lo>4_`Y6t;BOPDr!7HSuQ`jl!&Y893+FCPW
zjlbe#tj4SrGffChQ=#bmVKXAr!;#*zh|CB_UZ_W;DIDp)4^F;yD+XV`bZ!|8DSXww
z)hVR3Q(t42xijvOS9ersAtHjBi0?=){j?CGiF6vXaS$(EfhMu@$6lR_4|a#fvJtP&
zshVec5gTe1JG<KVDRTjMZWjX<yQ0`uv1>aMSd-ZGBOebdCr{K>yOE29Fr`g2Owo5t
z+N3s%T^F#kj2656R|~-O@j-Cc18T})b;qz^(ydSM2(Xz-@9wv%Ynj2it68|%eV%P4
z-GMkRwp=hOgG+$s+ml0KUTjZxs*V1a+D@n@@muPo7<IY)R-DMl#nQgS?_JF-7x7<+
zX4lp~CT&vp!T*n17TXi!WwAZme97go#qpt_7Oob6nMqANyKC%Md05-+K?flnrl&Cg
zcpdHA*_5v0Ux$9e0fjG8-cpy2J1rp_NJoMD#o%ts4$Y_he>9))e?(m|X^2*=9wA!$
zry5RU@PG?Nh|AIn`a6*b;CDqzIi=qB_?{yBKCS%#r9V;VpV>QK8>D}Fl$66;HQ0yp
zzvlz*bNugHg8UOw;+81`qX6`M6E{sreWXzaKw5tnWB_(Lphkh-lGW%<dk%XVIV_}y
zV})7ReIesH=+}#I2m})Yg?YsObw1w3uOtwd1cdM3dDp8eXB416*9>2&DnQSbx^M;P
z|BVkqkQIRR-`a?#eBk?^cy==+9}t?^lQ^*FEDIjUCp%g2`w3y-OZqTy4;QQQkPLiv
zi7J&W0X^fHFqXpF!F8-6kU>LSJCCfEMPP8hl&QkWpvS~aEe?Ln#j11~Y`elVlEE)}
z*|-Luu-Ks_P&HKm$k3ksbU9@S=v{^4qv*p>9*JkoW2_>GLl<!|WeE_62HH-Pi>-2J
zGW2yel`6$Cye^lGY*^oa%fbC@PMvP}t#@0Q>4vMrl%O~5={Z#i2<iR)wi{rh5S-4Y
zDf&U<hIGpNtxDF0{B~RNBJq1G9=#yMH=T%(zT}TDfuCoZPmuV(P9WNH{#ulDo6m@(
zitrHKe|CrTAjJc8pn7mpC+N>bNDukw2&hF+4u*d}^8e&u#Apx7fh!t-d^7Z+YA`&p
z7^Vh<jO;aQT|!DAN*)!pV8j)GLnWk|fc3%NNcuB(VFLwYy%GRA{{%+}lpqW#{0nF-
zq^m#<N{P|UsQW@jIMTA5z^O5;An=Ju`BB?_JkE*)jD7@tRS6*GQB0qFs00v)|Jx3!
zS1ABS$Gl)G^dEg(#~!({H6&go&X|eKS>o7%<%JN5Jd`-*Vwp2`<}BoRSxCrO*QKz5
z=fc=O9;s*9Pp+>G)+q}A$6pRh^5YidnaqANZuK#9iQ~twx>Yd{#~awV#>Y$ymW)5%
z#l|(U;^{f2>%@+BhTdf2aI+6{sm%WCueG^Z22WJSd2}F43WF!kuMQji4poa#4IoZP
z7^e&tCtpS>mz8&NJ5vIPlLwh^5GUyesvVo!E(5XU3gEgMcrUi3f7mX)nQcO9S2gGT
zl=3`OnS+bgv9AP~e!VXD1z~!|5GKtn{PHL%7YTjY4m=l3ng}VEFT)JI1--+-$Z~$~
zDzHTg<-j?zvW<6OEs*K~Kjr=qFX-2(U6K5gx9U#d4jGt2;Kug?LFTW(^uC49jFfUI
z^{1lGhEYF@+&GA*MM!xA;tK_Y<iBy8QaW`w7(EVvME~52U}J<D0(!geqc`n2>^&eY
z6aowu6(A5p&(CrKfk}<T9C!tfL5TWb{=j2Yl0TT0W>KLwC_~_*N3^V#sXvu~tdaRj
z0#_2~X-$!<|G+~jMmd0F+7OmQr}wj!fH>`8O)~v)FT*Nv`Y&v&IQ@h@*)wYas;2gt
z*Q$>?c{%S-y<H7CQszCG$-C<j$`xCjsSG3j%vmdw%Eg(1z4Z=S=&1wzZG=V4YOCJ{
z#wyLTuSPS?WOk<;wn63~nJunGDOHFz`wP4cDD<DId1ht!JNp@%{K@QS99#d--bE>u
z%l|pPo}G6t#mCM&w?`jlI?RkI&RuwBWmHF;>%4k`(tlMpy?Q^LyWdaQ4tMZSUJu?{
z%P8H=Q)gDcAaqav1}(=c`QI*!rI(GVt^_1}#$_&+-$<ViDv|L2)KeLd(X~rrf$`)<
z1kkgbpNK(nRPjU`FK*}!6#zyL_^1f1h!yluYRbbM01=qi?9&F12`d8&g&R-m>|C`@
zFyzqda3zlu^(B+}je+NMV8L4tJ0hg$HsjpC%Q7!qs6$6;@X4S_9KM34F=PsVC-LQ^
z)sPl?Q6M*0Qf}!8b_b^3)aT+3D?GosH;m^u%NU*m`n=o6s~H{~KBzO)B43=}se6;j
z%;e^sM=aADO5d+BFD5MSFF=rwJ{F+QvY3#lA@ErpL6^p&6?P9(&^6U#i`)#5PSpXc
z<l?(j`=USD#dj;VIA!ihcJhAJaqp4^HaWE?=;16{as9VD;e%SD=yEn3sI9TgS058#
zJ$r)~2#zeS-?0>CPVJ+v#w^R?`l1M!`5Ju(Xn<hF4}3e~oqP~*PF(NngnX)4cUUWN
z6=RXa#9d2bm5wBF_kIiPMyMxAv|BM=qdZdLduy5@SU-IQXyE#`3XO0MA?quE&(~Ck
z*^}d#=auGrJ3t8{=|eyRrz2ugOw5OBJ4xDB+YJjQPoI>vnu$hCa;bwo2$u01p1fN7
zhXjthDAPT|q~lIXKlkF6CY{m07;JAf8Z;;QS<NZ_Gh}v71Z>!)D;9i#OM8fkGlxGM
z8N7m9=G%55<I|@#aF5@Ei9d(x!~7Tf(Vy{e6^}}=5amU;iGibReCs}VdHT>H;fo(x
R_zs|L+4vZ!xo&>k_dhK<GP?i(

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Digest.crc32
deleted file mode 100644
index e94e369305be..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-2666613329
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Partitions.db
deleted file mode 100644
index aded0e1f0b6b4f667c86ce3a9b6df8f0fbeaa91a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 63
zcmWe&b}%qtVR#_O7$C`bfrGQ2)0|O|^Dg63P9`QxPJO1c42%X0j3x{apa`W|p)>;k
DJhulc

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust/na-1-bti-Rows.db
deleted file mode 100644
index 44803b414f9bd8a4440554bea0b31db1c3e6b457..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 738
zcmaLUze@sP7{KxO_3Zq0=4JjUNj#;*bm&g)7Y(IzL4{+3jnYz7Pz`0)(hwAmE=QBJ
zwbdYKHnarcKTt&B7AmwE*Yg}i8D2Mh;eFt}`y5YVju;6cHc>KBIie>k3OtQ%rLiaI
zq7ynkNC|P8Az`y$7n@|#Glb0IIc0)mOaddur}Xl2&pN*#!DJuDc_XA~6c}lInoX}6
z2pmfWNDWiRwDpuwHb<yP4jJnKd^8ITd=ft*bGc*{-ci&vC`}GU7V#QxJ2^<0LlKvK
z01=*37D&dTimc_=-ZTD572!CK<f2&>*|ML!0LKziYnVEw&7$a-P^rQmX`38$$+enO
zO$DVXqsg%`5BGnWoR!rsCO6%2KlW{QH4RkG2h;EpEmQQ?YScD)6K}o?7n)jQN-K^g
z@6~gs$wxoBn1m$f=k12BrhslFApc{ERr^o1O~Kr5=;cy1Rh8B~nu4vZPE#m(@@>Ke
yW_WdXqd7UeXfLQH2aNMBNYSJ2E|R>7;E$DtAT<F}ABMC8xI3B9_?^f8$MgliOQG=q

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-CompressionInfo.db
deleted file mode 100644
index 9c013f70453b1c3b3ad27a1e205d14ba1f0a0f6a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 83
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMli|1z^bqg$YO)?pcM062!s7PgyudCq4|$NXyJzd
D@p%eN

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Data.db
deleted file mode 100644
index 1f5403379ec611ea1efc2db69f9faf4947cdf1f7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5398
zcmd6r{Zmxe8OQGetFoG0+W||l%>~M2FloKZ0t=49S*mSuuu4ltrISR2po<_ZFAEE;
z>>70nPJJ0|tJS6@Qp8L;iI`ZOgqH=D_ZNAU<wZf;PRxw9rgf^VBe&0UcJIAs8UFx(
zVK~qIo^zjb_k2I+Ifvsk+$m0@w@Z@bfd3^fj62+@*GC&-V&jbQ3F~wmJdw^w2OLLu
zj&nH5t@##nL4MJmz1g;`N~`sq?4qK)qCG_wIktV){C(Ml*8N5Kg=TYcL3x?Au)?yh
zAgjEvqM$s>yt{0FX+>6^tunVVyELmPw|K9mU{6s|e!h9PHOH23%_}R|pI=^5mTxK8
zzc0_ayC}bMcYaA(PF`-2t=wWQ&Z#Un=jK6rSE3WOm{gs;`p5>Zt$yC5EfyQ2i*+>9
zB^Lwij$a-zaSbbzAbus9DB>iptvSu4EnQ`p((>Mk*&WrOZF{s3WEEnJRC)%6(3rGh
z`)Vg|T@`zdleK;jG&r1msSu|sF_^2zQH{RJlO_lX@xIE2%cxB7RSr2J$TzOT{Le0#
z4a6b8x@Xds2!<9KtlwqgIs$jXJcULgqO6W<Stqo#M3%~u2GXH7SLtBF20Q%_b($7l
zOh1Vudj@F`JMtEi)DB|DDc|M?rckS@9p-|S7U_6gYYJ5da`Eg#modztD-mMk*SK<=
zktIT-v<4Gx5S`Y<jM7(Qq9-@W-x0q#;arZyO(o)$m{vN)2GR9lv#nH!aq_eqaotQI
z@JjDf;(Bl$ry@5ygL-_L>)lLK?A+muw8~nsv(l$k?}V=wE5Eg~>Dd8Dwl&h5?C+b7
zu}|kV9AcVXue3mlO5d(HX$>^FU{Bw!H(|ZVZzWxap4ekc81Z@hW^(>r?KX!_ole)6
z)olLV92X2pkx*|^pFC}XW)f}IZzG&@`dLybpQyCkzWFQ(1RF5$_cUr<?$V0iabW0=
z!NXoR>$r20=hL_8FTW5@ZrrH3z(3Z!&wq-PN$?|Dv1*uT3omFmO|%_Ch$L>CR?vUp
zj{-i$U)m1t=DF_mC%0+~cTpO!2JgA>&_e^`QfWJH)?lv_)Z*P)wS@A(L``15i*P-k
zWx`h%3?YHa;2MXH^xVQ9Lx_2zQHB5<klui^7>;$Tsi0B6wG5+ij2NRELojOe)e1j@
zB_PE6N`-?MOjxA+vpv?zzeFG}3C-W@qASASSw1xKMHGXB2GU)*l1|SnX&`+sCqN3k
zgTers;S@6RvhPdTl40Q%(gjstIM1%gzC&j0uIN9K(&0<V%eQaB!idGb`^?%F`&Xh?
z@pXT^ln#qi#zA+zU8nd!?BAM{p=wEgO_*H(l*kb1@3_up(Lb}0&LU&LuXQWadtglp
zz*)(8z{F%l2KM$aDU*TYZUn62<$;UTaxw?HCtI=^rw=@;dRuLh!PPb?NO=(%G~B~1
zy&NEe8Bf@x2M^-sP+m0n>*fn=^YdpTZ1e8s`!GXQ{XhHLDhWvUKTUQ4HUSj{J)wyi
zAV3!UAq_Wd{gzH=KXtychwEXlyCXR$b$gOiUJUCe9P*Ry^5p^W=aJ?rTK^5OehJ^c
zRQfBQ>VK8*;iS75{`tI@2nIX2ffC`kPSBsm?vKFWA5tI;#a@3d5N_DKKsdS#q<j<f
z^6+42*X6}LxN*$_Kb&B2EtQ8x9NhR9b$HAc@V#dMq``2Y4Yjd{l{C=y->t=9oGOK&
zkPiE*VDuH{!IlZdfsar;rN?!%g1|JX>xZxI#DEtA+f3XrY|4rYkpja{FFABdQP4n!
zN)mi1AdW;-ZH68)1xAb})&wJOrQr)?SC1S<9?DH|<oxnnSc{73N4i)Ci6dXOWHXP&
z(GX_)6GzE_&B9fImt};E?s4)A2Zqk|9%H>cTDf3Wy)2G4NkLZL%K>q89yeZfsK?B!
z$k;2@xYNo{$=DChveOvL@aa3|V4GyD&x~owh9hHl+?1;_2!`4c*$QAh%%oN6HJ+MO
zpqiJAXE~)HsKAtpVEjEMPjTGU%2XxeGp&(~)5(P2J@|2zk3SL0z)PG+_9+O4+y|=R
zQvwBcq|wdY_4$L>spo0UPD8Z5UE_8AkmP<bTpvG(yXRVd`BEuUL$u_^tM|b4LW0Im
z%H)f2yIzZ)PuhtnNPQXFBGqCr$Ls|`j2+?>1cSVw|C-vBl3?N&+;d5Av)@aC6}a86
z<&*rSOb-esPFs9X02S8Ff4A~h!SH=Z)0-BeU~*Ly+BRm$GYK34UH?5?Z$hk}ng~YU
zxP;NTMcVRVj2ahdPndAUE+>!}b^hdEs4-O(Ox~i_R8cVbXb}qBt6~3UP%!bC!v_V#
zO^{<sLEzqQ!=~(gSW!$<bpJ`hqk{7EI`T>t&*>R4nOZ{Am&H@xzRWxor%W}~tkI|T
z9<46rz2yLzxO1`<{3n}NoU-@%U}5TF|1k@9pjxR3#Hn7^R8x17z@Kt=GQIo>9#mxJ
zOmFpp%Jjh&sP1L4IQ>cU1vazkZacHfi_`z0d#_A_iT}1T2M02f-~<D!;F#H4%g%b{
zI2JYv31&KpsFFgQ`5Q9_GP|^gwf^i{Oj;q*><?3zZ!gXkE#QHLgsx1ZC-kOs@BEZn
zUoP|G6Ff&-3-!K<yBW6f#qhrI8?^GS;QtsHjK%|(r7lDF$6w=u`6znPP=Q9zvQZ|;
z^6G$KjMh{m6Z(SCBNjeR8@wcn!}zDL&gqFE)(%f77FGnW(tn^F+bI=hyY<hd!fk&r
zFJ2#%(n6V@{d*(H?DTaXB*1Cm_G@V9RfcKh?f{S=-u{V0=W}y_*3U&uEWSDT6<T`n
z&B3)4Tr@fYnN#E6?i<E{N`l*8&0|0%!Cc5g45(*=IX$2Ot_nPB9;<@69TuOn!7WXX
z4+_M&s)4PrId4`9199#*aM+f|XdrXV99v1xJyn-$oxKJwOy<MB-Uf$VCq28@ov1mI
z3?XAkJCxsTo|s><R@j|-2>fMeS0kl<w4~@zBj;UEXvetJKznC4tZ(gWo==j@-HLK=
z3<W}TdL;0A0`HpE=0dkFZ?`sHc(w!s?ezAbPLr%9;X%$0NT){~dHph!^wL26F331M
zD^QJGBH?9b?BNky62ss7?G?m>w@t#GweJE1mB;SXO<+^y>C=RFk-uH^CZSH#yG}5@
zbXpaN5fRByI>@~v)*ampgUj8;h&O+Pfbgex33Zz4STVvebH_(<hANZ+TIPHoKhP2R
zZU0H5FgR(+EdMBs<39FW<zZQ%<7%XX4*GL_C*Z}X39iIOYmW0Hnqzz|A|TU0Su4JN
zFBwf4#|2=7xA;qIkr=QTvSF((5sWu9u!EbhUZ8t8;cguT3;;?b+?qW-5DxD@q$pwJ
f80`!}l+f!EsZP_(i}$05HZKkD7d3r#Z~gfHLD)tQ

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Digest.crc32
deleted file mode 100644
index e57848f5c9e5..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-874495544
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Partitions.db
deleted file mode 100644
index aded0e1f0b6b4f667c86ce3a9b6df8f0fbeaa91a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 63
zcmWe&b}%qtVR#_O7$C`bfrGQ2)0|O|^Dg63P9`QxPJO1c42%X0j3x{apa`W|p)>;k
DJhulc

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_compact/na-1-bti-Rows.db
deleted file mode 100644
index 6992a68f88e7f7a10dd53ff8f1d64ee800cfe445..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 738
zcmaLUy-NaN9KiACIrT2}F3<N2#8XI2hkWaehSIs9!m+_dX(=kGhB~DsS_(%AjwWeq
zt3l9gXbHl9poqdPRA@7<-|v<(v}gFj?}6v;^YEk=$PgjK5{((D648@26;^9kZ5>EV
ze2Ni+ln|#85;ppo#I!CuN60+R84i-;BwaK)qgPf9o8nRob`Ej7$U@4jq>D|?ve^xe
zz*sRr8ki=g%TPjDnb0W7>x3itsSFHUsUMN4S`nlV6ty|E&7;U1y@AtC9unqJ#LJH$
z!Z~Ax<jk5#sJ;!KiO-q{w~Ho7*`$dq@|P&VSix&FFilJs(3CwUG*)BtgG~;);#x0g
zrjpuL(B#-!g!8{l&f0n(lbi0jpRC(`O(V5)PZmBh$P~X5*uf@m>fLwgLQ`i#?Ih9U
zy?*I6`RHdKla%KDqK)fo3NT{<<sVa`Za*Dt3KsT5uUDF>u67>K6zuHunnLN59+RNG
v-FQI>FWX9*$qEOGc1YQ+y<H@I8^McJJdlPL()2^RK76|iLX-ES;vdsDcqpN^

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-CompressionInfo.db
index f0a1cfb59e84a0e2e737bd0d14f1554ff7674839..d9592e6c848b4a1b17f0378ee909c873fca1a394 100644
GIT binary patch
literal 199
zcmZSJ^@%cZ&d)6<N-ZwVFJb@#2Zs9p|Nn!TEJqFkX%UDxjAWb(VKEy(XqLkenk@!G
zv%iPXoOKYIn+rnotcK8hh7g*67lamcg3v;DA+$&&gcdylp~Y<>w8Rw%E#(5CrPo4e
GSu+4p_!Rm8

literal 79
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMh1rZ|NsAk*(^s618G(;8&0qsg)lkmAT*yfgciCE
E0E;pVqyPW_

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Data.db
index b487fe88edf6a5289f2ebea0d7a7ae6873599409..e21851ee0823fe5f5c9c2995766c6ac53ad43eae 100644
GIT binary patch
literal 7551
zcmaJ`eRLGn6`xHe$!si{#5Ay?IAKZM9Yl6^clHBBNh#IvVckF#thGtXNr74nAk_qw
zL>j_sEse-2DYX(lOrl`P@=-vlBw46HTN*x28>z+3>fuD&<5^O*h^5oteX}34YmI;8
zEbqSGyZ8O>yTAM1$dz;4M$V)(YMR!>nK+Fr<(fl^;&i!HkJsm)lFz}SZ63bfE^=HM
zXKLcCD=Tk{HdLJI9&B&;+ZO8<E~jV1qIJdX@9>SB3Fy7fcX`5WubbF>k$b&ukQZ#<
z@(=Qkkn@0<EtcEC!#2pR?~lNC<MIga*y|kBL5;UIN4TCDOQO6W-!DY@(XMehVmt`g
zqFm2im1KJ$7Ud=P00V8tf~F|%m_3CoH9bJ7)3^kREc39EAlG&Q<4MQdcbP29>Vfg5
zV;;gfg1n*{7+*T3wv{Xm3NZe3%reXtik}|jv(G4Ehlh6Cmb=DGA*`4M(}ZCEd%S6%
zM{@0B#_eQju)tz_?hqV>F5mk70MV|5Z2N!2Avt@JnNV~VZXPa~2pWERc!Xq3C`T$A
zSu$MCD#no{PUh=$a+`+W&$=myDB;qzpQ{tsvMftF!9heK$S*F0(OsPaQ>d-akg2N>
zCe?r=Qyb<B@}3Axo(xpGolN^qz~s$9?JOeGkv5oo8K_-YN|28>!Q{_C?dvAfTY)Gq
z*4;`Zmx>#NI*Q`pFJZC&xfDO$N(Soa64-3#1S}X5Z3NrS?e{od2+}FE+36Iz_p`Kg
zI^Ob=hYJJnu;25K04s*kD~kThQiAlvv%*rcX?Qd{n12{QLQ0}t$&|R=7qj8~MR9lb
zWs3Q148E+x6?#5Yuj#VLn`i<VFV#~ZS1kzIlMef^8kXY!0(HzRdDBrJRYdtbcZ+H0
zJwB_WK=P#{hZR~gr=OTj$9)Pvf^0s7ZBHDJ5PmH_o5y0j%E4eC)sBc%B^`5Z2qs%m
zkZ8Tl3J|OOD7Se<gwOM>yR|UFPv5IL-xO#|v}bk-drfekH>IUyIdHDjirmhE>Ag7!
zK#*@($k?Y9JEr%V;aiZ$m1Asuv9>C}a**}Oqn#`zx>!+KSW3R4r=)k(Ghd#Pmhtvv
zN>tB0N=czunysxN+KUjDVEtO5o)d4MNn0bfQc?=ZT7t*~*?t9~`;5Sj0>=#qP>?4@
zh}#U*I6Ik?6WGCp(O9ZTT$!tT8(#>EP3<NMms()bwpUS+Z{1lS*w?vs7D%DR$>9jM
zV>=Cdw_~_gALej)U%k>^kmp}_XIp!LxF|LJn+T4#eMVD+7i&_ZzwO3o+3Y}sZ#Hoz
zXCqu*=Bt?YJI5yXS~XGRdhPvYp{!-vyS!;Yk<w(k_ukl9bDr9=sP(Lw>oZGEm!N$b
zV6|~1`wbsU3~M56h!PWw=fxf!Aur`il6i4D`?ECdXW~ovYr?r1<a*~1fGg|b`nY<{
zB#U@G%`zP`I~Md1#}8M7Nw)z?Nqex1Om!77sYVn`Rjp)tRDsEpfm+l+rpGNXc{5Op
zYhWsV0)hO{BKb0K4>P%6Lj^c@oaE1d*1?q^ukFWCh{&3@L3|_!+n^{Pg(tz@atJ?N
zN(Sn24mR7f?T7`9f{sb2W70|V*Zc$MC<o%w^>`M|+7Jwz9eZ8J&~Y0BED1x`P8YM3
zB*aT^?xuK3R(~);N;WP}ro`pWh-%H5C_hG0GxopsFru(-DG!@To;(?tmTj=gFQFhL
zZw7F~ax!f%gvpnI+PIKRFJFPlpMm;CIhkIGfY4S+)3U_-(YWYbG!Us<;7G8)b^<@$
z%9~t+_#EqZC1iUEo&;N{n@H8WLZ?Fs(WPZ(U=YgXcKX<@r9>>_8}uW%4G{?Pfi^%~
z#?@XfaJ-BN1o?0i;^-*QGxQ4F3i2CEFvL=nr&mbZy*n0vu|gW+*?289LN<B>$!xei
z86p#+253Ew`xZ5KuTh<_*#l2gEPoPU@?`)IFC|ld3?_dD>J4_|IzNCN%r4OyMVOfy
zGNn9T3Y>kQ2dk?}#+oPLNU*)r1zSk1G$`yKYH-dtvu$oEQLH)*IGy5?`C+plM_P$u
zV>Br7KjOwVMsgb<l*azIDg5{AkD0|_u}=K?UK$#zP7sf38F+<;{#lV2+Mt()H1iK#
zUnq^s@l?F?{0OOtDall*zEl^Zh$5sM){iZECj99w`289fi?1-7cr|}moSLO~{mld&
zLV&`A&mkThHXQQ24SCxvc4cXwnsATy<9;dF0|H3&PQjQckXh%g9A{Rr&yNlAqTVrf
za+o%c1{Vxq@CU2~?5u*2R)BlK{$LMA^)A9j=VDo*tg35}e*ulb)d(t^ia5P9JXe$m
zqmCNGKV30gGS+7wdY9+!8<%l~?-L)u9=>WFYvU;U&arukKutcYBX$G4To*QrQFgy1
z&wa&T2SC0h@^XRX`dwb!Q%!)~w0?f6wt*#x-Hq8$KKH1epuREkmnUfb#1qK`Io-x$
zMN~IZ>02jv>lso>GBl0VIVIs{R^=|La*qKBb(OACq~0d{@@9angLF83hWWJAEhh;u
zbqSQvp8=Q%;SuCJ12~cSH?J0Fo8saWBtLs6TZMe!UCJpco}k#4t>w~!lc~yWGyHTq
zT(eACu4tjlNf61N-up2QiD1iXr_ypc4M=uG`do-xknbvmN9qoAx$1MRrLoy4S6!n}
zDqiTpnE1ao>2H#(-IrdZEDWR5$DfCE@$H)$7$E`o#E^h>`dh@Kq(7JL^uw*S5(krs
zRr)Jy#(0Y9z8<`?eCe>e5GP)MU0}$cj+qrB%R{ZWTBabx<S@qVHT9s*t)xTlfpC=E
z5^~?3d@wQ+|NQ|-l&?b<m7FPn&R}IPd8w^NWUiDOl<M@p`<TEiMg7M0ALKspJWXGw
zAPn_&#yEShB6klVAUrWG2HGE5oua+dQB#8n#Xk<4vo=qa+y^`!y&R6eZ>{)3IZ!I+
z#7m-f)(9D?A4-;l+rN<=`bx2yzk?3eQYRc*pX}F#X%&lh9;$SPV;Q0p>`jL-NmbTz
zcZdPjLo4alwQMYGw&r%&LhAFW6&aJCV==)dF4vXbfYixc0k?uVY(^V!8G>PKuKEBr
zIT%FUxD7xcr238m^VsJ)3gT)Hn(NOVL(6DgNY@QLaD7*Z@5#edmEQCv98HK<kT;bB
z;4^qO9q>4sgJf#68xqs}EDgO_$cBEW7sT-FGy7Q{lJ%fJ(!=pI1XqlZhOJkShB_*Y
zkSB*YJ0_mbKFKK44uLXj@xln7MIt}SbW(hW*gmFHmW|RD>|@Gd^QDl~8uEf<xzP@b
zKZPR8b>LsHONCJTXU1Tu<Ko+SCu#MS7%#EbT1f8Qjj%Kg^W2Xuk}@vomDIkk_3JV8
zq#Y56{VGeQ*zaO`bkt7=z<G&YdirL3*6DqlWO%T3o<I<Lak^mBnBMPcB0|HZNTw^6
zlh`30r#(!K-V$JO=O&q<<GGJSNm2`vq0!*MN?Pbt46T3LRPFWv)22Vax8Nz38v2-`
zr-!`^zZp->!TBSk<`ij`{{6)1-9XYiSNv|aHiNt!nF4Rt({au6wV1RS;<rhB=aTg(
zwj;=Adl0(c03->rlI1U5I6vz3n0^oSzPlY^GgJfuZ(Iiy{GNo&cA*tNL$040A^?kR
zMMUVXZe8CEK>c+5p0o}H6~Fz{BjFS@PLMC+awNHzQ+DFp!F%=8-^diLMu4heDcZ~F
zULxCL*anTB#SR4dQU%yfyg1Ij8rQ780Au@b^`CY#?h{=?aT51$#Hs&F>j>0;n7UB8
z{8`xeQ4v@D(?sA#^71}z20cXGImd+tL}LBO0(AsUZUv9P(_#dvI7)};!V}PSN62kT
zu0dt+t1IA6-qS|1o|>-{cd-WR6%RxD=n9hZO{-m8M9vde`sggoobbueadXyEi==*>
zYjhl)Lj--+kBB{nML^vp9Dm4A$}jQxA?d1o?GUL@YC7!~qMA1)Xlgfz&__V%O?kF{
z%r8}ZXXF1l=H58fmq&Qi1#^PQQ?4paxI^9{ehYsq$@f1PdFl)FZ}IzNW9(fW<nvtX
z_q6qyvxz|iNT=|VGc0EFaxCH6WB`N0&(JyqdDKFf+y)?dd!2TbxVB&0M)QhDmOYOl
z+X??z;@S>IpaIQA`RQTPGlVFy6NLt+2r0^ia}W!@B^APr-vA`HE6MGAx2E06?xFWP
z%m_}f-E;*D#lORJI<^jC&;IrMnxLFZaC<(de|h@P(WhD%@o8V4b`$To%Wi#dgd=`%
zNmBTI|1xRaBAznZ--+KHc+8KM{1T4r-^Guzv~C*gq{pcBNTAo8)rXq9P78?#vUSv0
z67YR6mmT%%7k0G~)r|`~x354_mYKA%;v~|hRV3lzQRQ@D50;XKpMgH4eas1$p2FsD
zP$B)!Uyw$u5}UKNu{w#PX241MeLY;(jrkV;W0vWAov!lSq$?_yvb~fj_istK^mXW{
z+uMLBnfAaqOp(;@rJ|!`Y$1dv12O4oAmsAW+pLl|WByObG7~*Q+%Xki0{S=F?OW&T
z(vwtuAEZao%OR{R{qnQ~X9~%3aQdG#X?|SBvGDyAxf*~s>)kQ^TJ*hKZ3;Uv+IOXS
zx_skej;Z1gvNb>L-%m}#HfI1oL#`(bYayRS=+1(z4E>O<=LR4vp|JmU+StA5Ao1&Y
zr(@c9#wOe*V}f#*WAB1Izl*putU_Gk%>XVl_VA4u&0m0)B6&0B57T^K#{9ck(|8a~
zBPsNEqglk?1b+=MhCe)i?g}e|_<I1YHlOQ?(><qm1iGuLlXSb47`ID&lWsUs3O}qX
z_XKMFQE@nX7nMQLL@I-_6LINDA#~dAeT8<nr~+_TSqg{*%x@*jVg-aI^n3nB+OT>y
zC*hTHwxFKoJ!}C#p418y&uW_OO?jjf@n8*R7uVtQ7C!I!(&tmJko@WMt60leNiCz$
zCWf86^#e2{QMa%^)(<yoqMt58pD0OuOs@_wg*iVh{~#keahvh^^SRPE(Itf=5N+>H
z60JHP;bd_dySUftye+oCzjZY(oY_QO2fLruimP~;I9ZGyggQ2$Xzvf>k07sUr;}#@
z5LH+N-!+5y<;eg&@&RMfFrSuMiH0G_Ez7Z#&j^@obv$;MOwU4+CBFehUXGAuLpd_K
z%2q12C->y0C0B1_?CIapI~^t0<J_3fDN=~kTfcjTEuT>hPHIbiS!dr#KF6qCA*D}!
zCkfw#1~3|*NOE+AF<YO??d9=5=F$`f3qKLNnY2G#dSZlRs7pBgKa=UEwFjI32PvT?
A5dZ)H

literal 5759
zcmaKw0aR3F7RTR{QQqjvLub_)3?G6uGm^ex7+@d^bv+7Zh--<g6|x=`;*^k@Vy&XN
zM6S9-uA#XmMl)brGN=Wolfn$H1LlV2w%GC5fS!)EuA%E1W$o<$zL|OR!8tAG=;8eC
zz2A55H{bow{opvBtKoRd7Qf$Lf#3aH9Jk$Nu_RfOZ7Hc~_Vke)EdDock2Q%LH-+OX
zxHmui*9Oahc|ZMrU*?pu#f77};cH7~ote;cS+j-X5#5~hrG{b-Z%N#Kg=@=+>(dCv
zl>>bm*_HHxR#|O?L$hY;_CD?pEk4*1s(l){IjL8T@^NiX^drjrthY}y-Woex3}%59
z`?$9JOJrML+ozF|dz7Fzupo~O3LD5$UV$jv3@#K!mg0?M*_w-(l)*7&S!8(?S&mN~
z98<0*OEof)(gw#=TqjFy8)EE(Vx+2Qqc-MHv{7`XP^GmaHfw47opl{r!PKz4Ln~R2
zE0+2KRI;ZO>lm9X*uixxkpd2L$5Z6jt8gqaiEgbJ@O8=uX2DU{(ykr0bdF^8D5rRs
zuaA=JqhUz+==xO+pOGC}+bO^61OBDww02mZMqT7s6QKEQRY!Q;5BCxJY$@Y)s)*1(
zW-{oEozR!qN?q$yfzVgjMn#|Z0GV!L6Dc{Ew{)d1PpiM%mnXSIqgvjZOWj&QpS&kS
zZVoxhUzPW1BGbLY?DbLVg10TS$4KlXz7wTH^LsPoy9T0$26bmgX1n$fvB?|5Ol}`N
zDMv7S@1K+t$_9R8%J4Cm%~|RRfQ`#XY|OOj@xgr^a=W1~Pe}JB=~=f}GeQk^lwu1M
zhtMel2sO^=#;59<79XlldaFK4NI3J*mzrUft61+JP_|)aeIdgh^kRjCIfqVbdB#_6
zw^Xh|@>#@Q(FGRXn~~XWtEK!Gl&Xn2WqSoyFyt1MgjRJ*Gm1(O{5Xr|DCkKRW&PqD
ziYyqY(uEFOS}?y<FnL$ZFO^)y`oOLiERyqu8m(b=bBee+z|grq?tNcYY^3ST+-{5H
zS{d~9X`~NsBnqZ8IX4m|*NQMbDHuVQ2u8_OsL=WXwD{cIx+r1F)23g=F622rNxDNj
zHWy44x%S1aRf5f1($udJCo8xS0qvW<zMzdXd24j%h~CY$r$eUl+<!dGLAZ_v^$Li@
zV#n?Aq!sOpzxa90@~){x;Ff@9C3O$;`*(>4@jnll>E+rtpU26@?``PSMBqwW=a5Al
zOJ013R^HYMt9gGl-F7CFn!U1)Ob2lrB^w(B(~eikba**TDMO-mmyyYl1ykyfs2aMW
z<|AkaDQ!s9UJ?fL`+At{L!$PZVbV1tecWxI+3(*iJ~tdSP>Kh~z?N`=WFy)rTP;IE
z>%NEC*xHR%gSzCD^V)mC)>iaFtF0##4Rgxx^wRbpx1jMl6j5@@jh%$r(UD@v>pd5t
z(^U*|?IYA#%AgZl37uQYpw?nSe>X9x-A?FIG=omZ5xP9UpiTkE<ny6`<WLr$^n)3%
zd8w#?8ad^HYAR|&5f!C6c3-0Pt~Z6(lP;ELY7IB0ua}Y?iV`{Ho<`Dx{$+E*8(e-3
zaWQw$P@ye-<+Rd3E`zQW68aHUs9x8zN&7lEl%cX4N#quZlvUJk05Xm0LH@ST26ymb
zjthy8Q(m#LR}*RWzW8Wt*yX#gmDR<zccMZ|N%wdM4qwMRR5!`5pb3Pk6vc7K?g)iV
z47-6vDoh&0Mky>|5EB)aDAk3waAdkCn5M@II(b_BFv5xZ=}IaM$31)?(>)QNsi3#Q
z6jUE5?CRwx$mHEJBU^G61*HS6S(1g1P5<&kpaD^;F92PG5kg3KMX1zq?xT`xRlt8C
zENj!VA#6J)=7hwmp6CX9ws609cTCQ9=#O;^)MrD<LK=U}IV44g*FcKg@xJD$NONb#
zr$?pGgt8_P0_dJww@WZ}K2^6%a+wr=`bKv@;Pv2IJkoSrG%Ld8%yehpqPhyV)dbP#
zT+*%N6^o*eD8dz(vj~mk)(2$E{n(|Sm#xl~O<~zeKFO7fJCzx29PWa=s2!Wd7uN<v
z%eF)rkH(2pH05M{q7hc}vlX~PDMJGDtI1SW1XJpesHc=$ygnDEv>{Q2g%nke+br3K
zL={`ev=xmgj-0efoX-dItfEAo)x&7o-j83cmaYh#c9mEg;U7V^GW;hPcedeImo=OB
zj}Y}dIq+!GsZG?cv#WYFna$QDCQy-V_r`A8v$_F$hUDm_7D9WfnMSHOM5uNngSNj)
z=)iIYRh1Dsl*OQ(`Go3GSg4ca>RE&uP>zcBn1OVE>D3IAawJtLYsY{l93{o&7l$?D
z&F_JY$B=hel%_(n1>@Y(L8A0lFKzS@Qdc*6yR%O-K@16!(**~^DOQl)-3JuoVbtg7
zR$ArEQU)EvM~<m*CbRH|xr9D7G3X;Zq0gcjbSe(W^!Wf%SM}&rJectdAC(iBh@8^&
zY4KXad{&Y?S~Q9A3R*>ORzv@aHklBrP3qzuT0>u0jhxcoh<p{+7^ifB$Ttj;KYy)C
zCm3$B=nH(zg5frct}G{XmqmT}m<8hxariQHrFdQs-?@hmNY+U?T7Ax71#+hrW0)J+
zs|gN8q(APAMWjl<z7HxB)E=Ubq!V2-SbN;FV{)`p@&{JxNqdsF;C}x%k<}g=gWTV&
zJ1LmE_;n`-OOI#F%Qb?j>%k$?bCcVni?nqO?>QF5ulz>rQ(B)A^Ngdrr?+gY!7FRz
zV(jI2WP0wyha{M7Etn-$7U!jtvA<j+h5TgClpoI{i5b`|sV>8xo=ZYy8=rN_sEC*H
zMpTgUM4K1TFORAA=k=>=W@%WzQg>7HFA$%M@Xtoy%MVr{gZMd11NxO#$@^!Ek5l_E
zz)MEjMk!~_#BcT?ah77bJ+`s%e2hw;C)*am*|flfS<Px;)}i6ePPf^Zi`xu|*%i)!
zgRZ!5U8nZbwgohXiBvSkGb4_YSd42Q)EJ!Q47&E_Je;VMTq>vqagL}=&rB`Zmf>V2
z8?zV{JFkaK1x+xe45{bQHZrZKW%_Ml1EJLwEILA{q=-R_Ut`)x&f5qF)0$koP}K2e
z*>W)BT03?P&O!KU{!cd*J-T6#<}CXyJkhJnY*dg<J8p=>tqBMZs$Y>r>_vgeVns5c
z&HW5oZ9rmD>O*PjC(6H(A(!AB1oO5YtQVYVr01^YX$_TE^VE!`=jl<s1CQ#^*z^1a
zN0cy<PBcVbXvI3l-PKjPkno_W?^~$##!yYF<EA=ttl2n7dp1{57JG3p>K5Bd3GL4U
z3eH2Bp30@f4q6zu%HOcZy<-+R9Hxh3D6!opAe{r>2CUxHM1v+Y(zn6YBjGA)#wz1n
z;PHiF>!(|ZTK%|NcufU|AB-&?z8}!g4;Y?(Jj2W0juP~---3p<$D_@s1DJdhH1J5H
zxyLhllu@fQ&yB6;c!-Z}3+`)W4J$r0_;~*G!+EWc@Lu;V4WAaT3J`k0$9Y;OBYl0h
z>o`06c=q*TH?yn7qh-aXH37bci#4qwO>s&e*R*O!;SdA5<CNQObzo@fA)N~%sbk$q
zt>H2ehz@=nvd2rvx9((cjB1d*ZrlR%wTcC?;@jkuldJsIDu18#rl*7Bj?$m66lqF<
zHd%wp^hx9YQKoD3C$Qjy(Ebwdw_|L<!b@;T<!&^V5P!W$Q6}-}5&rZ*mCNLdN|W8l
zg_G5SDFy-JstCWGl;+|lL>c?hY*3mPnH6y<|J+GEcj8b$Xqdo&(DW1{ROvozXqdUI
zbl)9xpOlOY)bt!tvJbZsPYpDoWMh^h)wELMr3@+O8ro4X-|NRxsf<5WdYvPazXzdd
zOcX7Jf05~DBifh__~+Mb2Yh=Tv8KodR&f5^0%jaljd=CpRp-KBpVrliNsdvM?+z*J
zi#SlVf$k~2siWQujXL*suv_}eDBTJ4N1Z^^IWjvU@`+8vnS@#7EMk(LWuwesx&(A*
z)vA!*AY2>+obUj(=QAY?A3FeUn%a#@f`S6)@jgJ~bQBqq9=wb9Ddl~v%y41X7%%EG
zpS3P1W#VJH!qqtBj!DbrU>r-u9H&0SI^r?bOg)>yOl=QrrMo|S<}-Trpr#gw*Kx|z
z7a=S2Tzvhh4%T!+^P|-RaLTE1l>1{SEBxs4G*%#adsjQYgvIcq%hR}x#N_n+SXFpP
rOy0|m6-L1^{Mz!kvSKlwbY{VkQJoX>VuiM@bJ1Mjhv%&ae>diT1fhU1

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Digest.crc32
index ca286e0954b9..a85dfe8c5274 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Digest.crc32
@@ -1 +1 @@
-2759187708
\ No newline at end of file
+2266872816
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Index.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Index.db
index c981a226039e4a75a5fcc064c73c9928daf7b7e5..0e8dc66e9f2dca2d6973b17f22f39b5a102d7283 100644
GIT binary patch
delta 159
zcmex(j`QO=&JE3+jDNR3_{x~a$v9!VA}3=ZC!@sngB(nSoGc3t-uS%v<AQunMzzg}
ztBN=o*KOWw0hBk}tnUQm?b&|85vZKS1*A;x0#Mnh&GT*oC4x2!J_ahgwK=D-fRiQW
p;G5dX52~yfWw$51VHB)pV_-Bo_`(g&F+OxBAI>p3d|?KJ0|1VJKy3g3

delta 159
zcmex(j`QO=&JE3+jK8-(_{x~a$vAPlA}3=ZC!^%{gB(nSoGc3u-uS%v<AQunM)l2!
ztBN=o*Kgiy0hBl2tnUQm?cILC5vZKS6{Jk>0#MoM&GT*oC4x5#J_ahgy*a0_fRiQm
p;G62n52~yf<+mriVHB)pV_-Bo_`(CuF+Oys5Y90<d|?)Z0|1gxKz#rJ

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-Statistics.db
index 33fccc9c84bd3494eb0e6921db5e278b899ab053..a14bd80fa4a19e511a0033ec243e52410de59af3 100644
GIT binary patch
delta 154
zcmX?T-e@sFoi#IYg+$Uuo$H*8<r^o+GcvMn-oRMTw0SY_Wai0-*d2@*D}EN1Su=3o
zdB(tcpf9;qcmbI1=yR<9|NlP-GeCj;jH-LyrfTOlX9!N?G=gYj@OT51WdtH5K9Z3P
aR(}|7|5@O?b=BkrlMSSMS@v8ymH+^H)jKQz

delta 150
zcmZp)IA}gWomEr3B=XZno$H*8#TzHcGcs~*-oRMTw0SY_WM(7AR4d(rKnAWC8yHxN
zvmK9%-vHC6eSg&d|NkF^8KA&^(%HY;M9VuiuMnKZX#~;6;H3kUWdtH5K9Z3PhmM$r
W{0UZ={AT)%$pzBAEE8wF^#TBljWnkK

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-TOC.txt
index b03b28372b5d..8a6a30b6db77 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-TOC.txt
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-big-TOC.txt
@@ -1,8 +1,8 @@
-Filter.db
-Digest.crc32
-Index.db
-TOC.txt
 Summary.db
-Statistics.db
-CompressionInfo.db
 Data.db
+TOC.txt
+CompressionInfo.db
+Statistics.db
+Digest.crc32
+Index.db
+Filter.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-CompressionInfo.db
deleted file mode 100644
index a8ab57274e02b642a488ea95d82b866877bde74b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 75
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMli|1z;fg%ki`n+!33{C7@Tt;G+zyb77hmh#SsZR

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Data.db
deleted file mode 100644
index 3632c25b39034397042efbc80f59ae4bb54cc00d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5950
zcmZ{oe^gY}7RT@4=$s*)i;meq!N7pdy@1^L0n7}F7Zp<!l_jM<BgHa>%H)|A(py47
zIoU5CJ`!KP6_O07m4M=+vKPt#lL8_jARr8&Wvw*q^^Ek@ynXJ>+<S2?{R{Wm-+g}E
zvp=)<AqWXkNJw<KE|)70{&o?;L_w8Kr#Bd*V`5Eai!T8qt^>bJqey}XBP4l*?S}l%
z{yj^dwEFgzPCZfZPh#Rib4N&PalbT|kbt^cKPbhT(%guG0pe`VjbW)QEHvpGIWa6{
zs`RJjT);^->9+8KVd6lRQ!2BBe231-$su0nB+iyOL1ziwkv%M(VF;W^dKLmfhl#Vz
zqr*}}f1EWeRU3Oa)d2`7feCIb!6T<ZJR^E8;Z?!Z>^Wru9vbQaQOtNzTne5mY;qf@
zVg(hB_hbbYQ51+w<3)KTcqBC#H0JS|<HdL+M>i}b%VY3vizn@tLojt?lm?l@imdRb
z;S@J=WaTh%?)Iq|WD2Xx19R2w2rRI%bqg}e6tYmwRHxMERaR}Nal3CABI;FRuv%l-
z_xnbq#H%b*ZLo7kYGdkkb~y?Sa}7x+*yW^KVPor`O1H~p#+mfkh}1`H-wxYF^!yB1
zrpK45KNz2c=$RSuN!}SjZ|{My{`*BXroNx^J0`Z-r@`*RXAVVa_;B|5HU*+*`Pmd|
zqjx#>r*&4P%rq;)ksPS5BDJ3FW@ZmbWhRZ;Z%E2idz~43fjtcZtg@%^0T}ynuvup9
zx&eE!DL6S}Z{%!6@P)mTv-X>{Fp*`}8Th!ov0=IcZ+awMY>PC5&Hrw4mM}>}YE2BH
zwIMYzyp3%rs)U2IXg4jfDTIKZ!399nYqjw4qP@r1(8;MwYhuXR8utq83WnDpT9Z1w
zM&x!6+$`Fbx-vc<(>S9eA11!ukuUf)#^Z3*WLQkJRq2k`xQVmDN%4McW1in^jm$!=
zsIV%%3uGG0Ni7Pec9L4b)CwkwIz(!jYVVra#viQkvHWW#Ha;NJ<OiRWq6VWh>Z(kY
zXCoK(C=sPOxJRk3%HV8A;8NDzQ5{jh_p?_ng|E8BZC^P=oSpI}_U?ZZyf^dSXU%!%
z<T7)pOa)p0oZ$TI<Qz0J)G#?`rG${^XZ?z2tlkh#5=|@jLl`=6zmlXlt8Igma8&O*
zsgP0OumA57w*H*LLF>%y?-&9|b^_DO%`uFQzyC4=lcVYRtOrAkwjdDZx*?|T^#SgY
z6+W&zz?>n%ais*(&JbQ<hV;S-90wyL!cK%;&VDQ9lD-3yUTqYl7$J`#eC2)wxjI@9
z;gN+Z%$<m-l`s@DUi8;7EK1l1BeCN}mDzYC*AGTa<2Bs?7DU_NLaNQ<H0oP6g`7SM
zLh?r%a74Z_(RGIaCZ6{nh1{lo%7VohbmKHk+FDqgDKdN%!g;a8G`)8_0-5M^6;4wR
zlx*zzmmT$TL_fX+Ch)+BJ%3t<S5pA_=Dm3A`Kt-ocDzSy`?V5m0rjGsx}dJWZnqLd
zw7RgM4WHHQhf_q_3+pND7+EK_y=lg_RB-3#z3?fl3{g2w*ov1e?1lPTFscfza5X&`
zVwxPVJq!IIY{x5RF5H4_p=W<hY6Wfc)SfaR+qJbK1~t=}DtW*?VINHM69?iD9i=%C
zCsMwaVpGU0QAsxhN4B{Tc7Osliqf|E3(#7ktQn#+Xm&a=iBCNL0w0fQ9<wUt{*AB+
zK9gJ20DPpqIR@CR^75x?<o0ev)77_k^Nvi*;}RR9nTN$G-|`pCA0{$WCGsl(n~6!a
z?ok3TJOpA}J|!|$G7FPaESxmc(o|zb6jM}V6``f&7VH8kzpj`2y#0j`wiPCZsji8p
zXImoy>nQYIUV~h+yk6v+ib<r?l%Env<nWC@Bv)ODa_}Gm;5hWBU;*L8DXSPmj^$H;
zc;Syu(TXwT*x|QXB{PLDsMxGhS9v%DFmrqqJS0V7nYBaG0sxUA7cjIoa3!vc=oOer
zrI<;rh5m3abQ}G(YQ3j&*%ndYB--TouIb^9tE(u^9skI;9=#iZ!gSfC3d!9FlI9>*
z&a_4QrXt!f!8cW{_f5s2r72wHwXJvmZyd#Vh{gI6e{(aeZJG1I3^N2|3k)+lh<x%x
zu7Z0A^Je=fEKM`t>LyrpGFLf0L<UIn@#tq(7^U+NEIj5up?wVB&jWCwV($~V_F+*g
zWtXc>A`t=cE`?+2{2mPP%;QyRTo+i3exN0%2NjU(C7xyO!0X6FkRfKHsRaHu=uS_7
zQ`tclOmGF@!GAzxq<seN98+a@TH*>IBPG~@03fxt;iS}kT&7?QdXWx3%Mfv?JX3{e
z<128ny#*6{skIms+JEUlA<*p7e=G2w><PlXX4|Kzu&u9BY>TEalA?X;#Kx!0*pY4%
zY)t!hs}hAr&ll30Z9iBG_ngj15v6x~odcxQ2k9b8G3`B&-zu9JQmtdzENrteo8>XU
zw%<#~dv9?Hr3BkCYb{>NucNROZ=uu?52p|wGdWYm^Q|L2ISbJ-jHvu!J3eT`l!{rO
zA)-`AIh+WhclHQdV>`}gfK7RDNb2*;Zeckcf9SyMe-Y|Kp0}OTOR(+xhBQ8{+0Hp|
zpfY2u?~-x_QT!f{EA}GE)q}bLt{(KFjp^JRv;xtwF+nT%7)<AG+eWY@i;2~F2Jb2M
z9oUnXS4`&!T*rWFVM9zOz*-*gt;ljY2v=3HqgdxH<}O?C*E1pE>oJ!%jT}R?$vARM
z<hBQ<q)e+p%GXL4m&YFh6g(DxNQ|h9J2PJZ>%z<z#E821Zc!XqpDK#u5d}EAc=FFO
zu)bkQ;sJ%N!vkwjsJ?1-;z>dRgfiwr3gk^A#E9Akh`M|}xLP4Inu==&r3AHtsU8Qa
z?vR4(h%%23bcsQA`RW^RyG)gLBh>m~4iHCR#_lElIf%AA;-BLMm5u36vjCGV5f+Vy
zXBA9$ejGNfiW8ZRZAt{?u1$$*{oy#5A%$x!dw$!#o#UIgMvFCZ4c-bj;=0eF2P4k1
zuS7sxY%%7;YI5@gml^Zn2b`yxuZrD>GyP5CGLuU&AyyMm@AUXV$|8E84m0UKrM4Ik
z-GPfs&YK2;H6(05o>{dIZm2mf1cX2%Q9OxV&g`rOjW#ibqtO9jC74ECk3peKn|_2J
zO@m&f<7m|L9tcfw8!GDMLIcP4d{00Y``66YFa~1|;x|Uv6zr9Uop^oMvVpYGqHuWA
zvta><zIOwY`@UU0k&YQPM>8f$+AFVGvHwyew!IG?<4ce$mvDM~ETji}Wh@J*<=-Ug
z=a^m+|1!@Rz{Q5QjOsG3OGLEUXI-L@{=KoI(9oLaPP8ry<gHBaI@}h`jl_MHhf!?r
zE-0%Iy%<VUFQnMsk_=d$avN~QwoxRn=j4UZRe3qX^!7n-8b0@^kgsf?5451jzfF?L
zyRm(AFhmUft@drDBx)ybXZjK#-RXEzTv7FHDS^(<6d!Ab!}pd{Y}dL9+_ReL1Y8XB
zyy(ltKQ!^n#XRb>akd>I8{5~S!;Fpp$N?wpZR`53K@q7snW+*2vi&j#K94S_>iF~M
zkLtmn@Q*CfC+x@dx6CqcI$RX5Bh#M;bfo63v~3h3GX3QZX<&Q>P|UNc>-C_+@Xb<!
z4wIg?4${8s;9LNhc}vh?9#v2ub=8@-K;2}1zikWDO`f(6dyll`gLhS1J`XGGeYfqL
z9EE+GaZbJ;SQVWM*E(<NI9B0us=sR#e*ESg-2q!4hpB;i`7ELmCg-zaO!fb9H$v{;
zETR-lHSk<Qky1t_n9Ldo;x4Fc%s^g#A+%B}^9y-)u>+sf!I_3`U%Mnw@?;ioX9mtM
z3q&&$4$jd?7XM5Fuizg<7=|BVy0Qm8Kq+U!72V)OE^N2iq1Jzbt&3B*C$rs!4ewvK
z6T}Hm_X&L{pv~e2+n_G}!C*`Hb4&o6`7McS0r><*S2E7=#cl9$4fAJYaAYIbPjXpE
z?m;SfQrZOUusaWK1hrf>ES+W0jen3T+XqA19TbPDZvMRtQ|jtmpp=PKIESRM)dszw
z<C@;VFI2d(d#4y~l((@PbV3|Ne_n^nlrQ<$ybmIwy><r6%p)*(xorgA-{SfAHy9j&
z!JrB}cr#%HICX_zh7bIA{xmJJESR1GvRFX|Bg~7AMtF2tFee?nOyflb@4^rBf>rM-
z$uGIru~O`Rd`Y7m1?-xkCBG7k?Q3CTs$}#iJVc#wn<d~IA~~v%t1g1I>JjcCsubWn
zLR?Kv2}Crjc?R(Hwc9&45=uyC2(=fQtvh>_YU95>!OIm4Tf0{&qZcY>K!aGJz&D0l
zNDgcUsgn0X&t&|D^DbvX%8n{f_`;WuDoEZ`yA3L`g?qNCye2ke&<Sqz=h?6cj~lC*
zG|C;Ubwuj5F`A&u@H)2er!;uD;{6t7!Fibzr=v#ro$yYgd2u}yCy8^`Lva$#y-1Zx
zzKK!iYz0!WNn4@U<301SEpe5|#%hiedzHw4S2|Qh{5<V`cs(T*TUF#c+?!OXq%@Az
zTrbAEuHA|O=8YNZ*9<*b;$Smw3VEY(SrVcb`7BG~gEFDZZbI|9$p0ohe(`=UMXf-V
zMRRWm2gHOHl|rqw=-E=Jm0seWk4h!|I`!e-BM`-A{T`vNdeLp=F7efxY`sQCvwPsN
z>3NSg-28rMgOK)k@LT*tRw+)1feHVKKxVdGHzXz9Xe*U~`B8YFMwLI55bl2x$#Xjg
GEB+5`(!!$v

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Digest.crc32
deleted file mode 100644
index a199ec317a7e..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-2912620103
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Partitions.db
deleted file mode 100644
index aded0e1f0b6b4f667c86ce3a9b6df8f0fbeaa91a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 63
zcmWe&b}%qtVR#_O7$C`bfrGQ2)0|O|^Dg63P9`QxPJO1c42%X0j3x{apa`W|p)>;k
DJhulc

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Rows.db
deleted file mode 100644
index b0aea85181ff3c240419d4285128a29c8b283dd5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 738
zcmZ>CVBlb2VBj|}<YZi+&B74#9|%5_{{K*=z+_~`B%sE~z#u#kNT^FHG8vmQCs;Eu
zSS<wd{WO3?;sym~0}~6D1wpI@0=|Yokx~mF(Xk&$%yCy>HZ-+lOUPzt0CLx#0}_YB
zfW)<XK;l(0kocm?z-YkG$jn<0w6p=}3=nWcxQOe(US5TFAQydP|NoH(<RWf^9YBr3
zQ-FlJ49G=14QGKaS_0(zX#<JG&0rVt9(ckxLEt{vMb<#!jzd6Vju+TP{0)Bv6oB0I
z7lFj#NFZ_TA&_{L1|+^{FfbY(_@ai}O(KkeVu?*)H_iV4aRJCpB85Ictw?SXJ(!5a
zP2!9d;ui#qvARj(;6%v;pdq;2WOVR_KW;b4IzE+OBnEQRC;k7QOh9gu-S`02P4bLi
zvA9XWiAPaDh#jk&l$cbN7T|G{@u52vxZR}IRH^Z371&LO|9?6Ga+BJp5>z*7ICY}9
zX*)RXG@BM^1qjUsyU7`-q2nx&m=g((J8h?(+8~due+(oJ=L3mrzk$T7Mj-LUnSs&d
K@P&nFZUO-E8Nn|A

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter/na-1-bti-Statistics.db
deleted file mode 100644
index 3b51c1b07725ae12cad34d40181500b47a702e9a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7095
zcmeI%e@q)y902fp*A_|#9TeCGWa#+ekeI??3`n+BXj#D+L4LUUV{u1&87$C}Yd0Cr
z%uM!}kawq#>E6S8#6$Uq(7oF-G4I`;#O=pqwiMKdGSWka?R>N{!W`+b^d{KIIx
zOJ44M`hC6LyL<25H&>41)ZE;bMt6jk4YcsIY^O!VaY?CMmP^`X6q*EkooKWRZnxlc
zNP^K(=P?#<^N8h=LzF~Eg&=wAJ@qc9D4AMYzOo*Vdy*a?DG&(EMkNr<o@={E$J8Cy
z`@DSbTQi#RnXzaZ?xaQD<lluGYx#WOM$Zdvp+1B@)PyR*eHi0###+WY#*vKmjAIzb
zF<!!WDdSYenT&H7uV!4z_#wud7|Zt-ZhIkUy^rxIHev1uS;fRXj89|piy804Rx!s{
z9l&vQ4Ynp1J8TVhIMZM4Z5-DPV@LY27w*C~yo|l*5$wbf>^rlt?>dBSv|;BT#4bLC
zT@jDHF#+5CAa<L8-OY^OVGEA?lY^d{M<n?t6eqT7?1OlG691<t&R602%zuEbvYkim
z)oMKc)tAnpxaMp-a@dbvWNlRdIbv2sUN9Srtasq?7-RVk#p4DRAuqX-h`e;P5;^q(
zzTcVC_<rYhyp7tg8UF_Pz9M{n4=4IieDe<m<fqGfkoVs#K<<ARU!SplbUleK`UY%I
zOxEX+ccdIa{_spTG{5{LT3`8>^q;`EDzO=j57oei2pCs4N34a`JT{8@8|JEShVk%z
z-xPF&rQ|yFg5*IopQ3Dri!dIoDjR~1%}+<;d9nWHJ{V6ry89~hvX!O1(91ttzZyF0
zV%>4*!i`ULL0c|ouZONY)R+ftPk-ZH=&iS+CZOBj^ID;gR_#xL{;0F{dFZLvM@oQ6
zMpW$pwB2lm^PhMtPKM^c+>8WX)WiADi@qywg75R=$JVw0t31TF2UwkU!B7T0J-r>?
zAI;Ald*SmV{O;I^BG6vjd7|YtU|snuLlSW0<L#5M|N75kuEPGuTq^A|fcEi1TN8X9
zBpfjf8bLhy{RsoSUuoB@Wo;mSPmMnv<^y@T-w)|Q+%)>_;1j_2ALuy-&*xEpVICaM
z&Pq#n2Wa0~+dn)G-1WuSsczuYhhOc1=j%I@Ig&pgCx&Ov%@hNdZN71P=e#+0u#VT@
z-{)XF_>NrtzvNuHV83&i{-ZfgV@l42E|8LQq07Pj?VM}wFiOs)<Xm7t5hds91($>S
zXLBxM`}*0l4P5-;E>2@E_;}g*eZTHY?9)5w`wh8-&du~fqc=l)*+lK2g%o{BgYSf*
z+^yega}}aKk)6FFCwFDus{DdNrKVg1WgIAT;orR;(3OiWTgqakEZE9JOL=hrXOfpt
z<`qi8=suX%ag*}vbox33dOqcwNtsNyK0x0HX8iHrV710ZL2}m#4aS=4bp_Sinnd!K
dTXICTRcKi8X9qUBwm3baWOg-dX>`iJ@Cy#<k*NRx

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-CompressionInfo.db
deleted file mode 100644
index b8a1dac50701f55e807402a797fe9bbf0f167ff4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 75
zcmZSJ^@%cZ&d)6<N-ZwVFJb@zMli|1z;fg<ki`n+!30l07@VCDnlB$h3p)b<!Ho#_

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Data.db
deleted file mode 100644
index f1c5c62240186dc9898d29d81d86784264e3d45c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 5947
zcmZ{odt6l27RS%v=$ygZ91NoYgMsnE8Bh=JnIXN0TRspGQ4mm25ETZyrTNj~V<r?R
zW4DZJ8K%|;Q3ujWM>8?IiZQ^5iaZqokw+GJ?S^KBn$2D7%*;6lKOg6>`TX`;d#|-0
z-?R262nn%+kObw~Y_@#(ZzFt(LUT}1a7buac!XA`cO{_3R&w`gnj(k^ge0FZ4aooM
zDvDWTbnPe`bEI%LG4e%S%gi0^UD7;40^Y}hZ%EOnEHALIn>cr{r%x)=&em1-Nf~qS
z*K)4AQ6}k{fWkha+^q*1XPz49kut}EdwIVe;@o)>{PeR^!QCUodnDxuWKAHrox8Wy
zoeY@0b*sB7w4HM*fgTAM;k$7r1*xBHZnVff!ln^UF9|d)sDdWV_q+O}s&K&xTXKT>
zq@?Cg7|<hPxKD8u1`$182~OJKPAB8hDTW3o-EgP+c5GRCv`<Qvhhw_CMs1ekIClo1
zfi_k^XCYs4EeF#3h-&2!$jZZ0)hhFyK(se8RZFL+NzKf`DQZ>735R7bQ5|HcmXn(5
zsDld1!cC-!txhP$(S9(%Xm(2zQ~gp&B&mshR1)d*25kpwb0$om!)t8yDPNO9rk^u+
z$fOXQ8;$Oyeoi)|=hWEhKT0hy@H@F8&Ef+Ik{Yv7&=^aE(OvFoHNvtuW3d=uT<x5P
zRTjC*O!+&~+dw3cnpq@~ATs=5@&=NQnKXF=b4>6)jD|6PM?;f>b7n19qUU<cv@top
zMec$rYk8D{@e~IM@vt>xt*Dik3K~=MylFXf*(?!_U6V=q!`Q!3{z8XL%|@7rfVoc@
z-JSEm)Et}&%Q9C7OBLowkgc)iu2#zExv4O-<|%4Nf_qyE$nmp6W4v8@5An84&5ax-
z_^-=R3R<mC>{`&wv1>)z%p!NWvK{jaGrHTmtW7g1<X*psR;sq(gps6UJx&<G%sJn+
z<(4#36FYs|DN%Q4V2FU&MM5TQ?N7!#fwIupO&MeSC3l(x+q%Djtu}W5TG*uK9Hk?j
z>W?BxdREd$kt(x+(+<q<C8|1n_JR7ccbv+c&8z1;cSg#UpBv7o%vp9Zy_Yzbosk8T
zH0!S|St|1?LEhZhE0s;0J*M!~d<h{5o^%h#JC+_m5p~ad`pZ2Vem+FeoOm$3hp?6n
zHp^wYxNxghP0?|-|HX5d@(V70L;$;vAg}D?LRkF!t_QNBF0=H>*owWM0gicgYz23?
z0O_a-7u)ZEu3o}gtAx`Z5|`HAD;<GnHWFGW5{(FpoV8wutxtNbg^e95cmjmj^U__o
z?MkpIOmIR31Xf_nnSAIjTyTOG)r)X+G;M?MFJidMY8uX-`y+^9wNj-W?(`OJ(fMQH
z!0CoNeO_#l%R=X~h8nrcR5f)=0~Dg(9%=g>gScD?LC~4S@HZrAxL-d2kMw^H66ayW
z)@?fqtE8RxoUs=u?ckA#t#^$_IL_ba2B*u3qx#?sOwgheP8?ew-H2Kzr0zUPZ2e0^
z=p63?p$Os!BpPb0g2vQ;i7CeaL$r76+hAfzjUn11ce*n5Lx355{QD^)*9&eirF2;j
z<iZ2xf>sqr((&h2uxA}KV(UJKlP1&8V~<t~9bR}9dAUFe40SrQ7q<Drtm+qx8-kG?
zIAShT`^14^l24q-@Rk)Ld=seYjF7$P0&_xo?gUKm9bVK#W5VA;w}M?f<UnfrgG8BJ
z%!W+79td;5T-<&)kkl;nyBjDR-HTn41X8<jO`}O6kf`n|@?hR-OB!#;Uc6_>QOdLn
zi=wLK0y@}B<8pE2FJW1o(J+@@keFbcHi=2)BoTWl$qlm`7!aanF1?vLgVZFr_azI%
zTsjbh)0Lnl#fGQnRg?OJI2cM8q_002oHz7~oEzl$x%S_?3%?sq4q07tS(2W&GnW-}
z$m%g|FQ{MG7K%A!oqinXIIs0DP#quwW?5GrL-mhiMlpx1Z}hBE$|lAs^m_||Lm_#H
zp{Ds*3Tfi-e;FdFAuI_<oluvwRyC|txm-#hmKH@+*|7xn2j=$?4e1X8<(_vR1X3D~
zVCBr^ldmaBdQr@4N{2Kmo-4CzetnhZ0LJqx!j;%$$Jgb%<dgBWsIW)CSE!XJpg!Wt
zwb2AnsL@ji%c(v*Xs1_LELSLvro5yD2!Y9^xY%c59aP~WPi#s!icLQlkwuSrZLGp$
zoAlH!XwyFCwga{-shJ38SfzW+?L-PT#X>;TSf4^_nZyz4dK~V19W>IfyKy-BdIU`0
zda7`Q%?sNeRzREo_I&tj4p}O(c~O%kXh8Hz*@m|P%rb4_Gk_=dSV*wQmE(7lx{?d_
zL@^BO1XR-;D~8eq**o_i+q5(uG_QL(N<k{jK;PtNb;90dn+~r<-xq=eU#6)R`INj^
zIGb$KPlI?qHUR^i{V~lne7!ZNqXkd4ne72Hx*3AxiNG|ks7WK~<lk!21U@zFj;@At
zlsLs##HZ$Wpp4O{__hl@GR@x>K`bfri`;oSHTQr-r>ql~N=%Ckg7N$qDh{4REj~u5
zV)QBI3{fI2Gk0OW?T|H}NQ(i>m6V4fXQm||wZ-ltjcutnKy9g8yfg}E<g8BER&oG(
z^xp%!NI+D>Icg)RNlb(yPcWPSY$82R-r3A+OzT`@7O7p_O&?SU)M8rKfvQ<D(wM~)
zifR1>7iCG1kPFjlfr>g{N%9o6GZnV=zgW5Hmu!v~*|oV~DY#@OmV$gVZ5kjxaJo^n
zqHXX=k}F%`)63(jVW1bLEAfq9DJ4`%4K|;Ba^()6ePTFWo|FwFNIaFT<lzKW(&ZR8
zsFJjaT?1L53uV&f<@wO@gnKEyQp8he9-M&VUy1Rw2JqCji5Qg1G^q-&jl=nLrL!3J
za#-3chGseOog6DC_#vX&_J0VwBQ^X(C^UonI6h!ucjecE3Nj#V3tTf?6VTSgUiE-k
zC*40gppbxS2U-Plb^7%yq$aI#{TWrT!;-n02wG^`kNHq}jJ{9pi?oHrb9c!sQ|5Gz
za?T<xLd)Ih;_RTi`g-G&LATVkLI7RFE@VU!l{LbqLq>es1A9sxTxsW=c9X=WqhgQ}
zN!a>zJnrE+G*am?$Lny$NzGavZ12B64fe?t%)vCaT?!kXq)XtiLVYCaj<Cp?_HnrE
zP?P|VN`yF&M{VEa<2g;8mr^ZP1&jPR8nv?|5@l%<M9b5tJxhrp3=JX!Hj;c=gh|1+
zziC8=Wo{6qDooTJW>tC={X)S)75-RObye83EMGXZZ2OHM)Yd@+{O-RtDhjoaApobw
zT$_SL&$24f#gV-h)sC2IIf`|z^U`xIdjOpMAHYzIIrOF{KjXw)Z?>(s7`dFf-hlO@
z;In-Q^GWw>6h>sPnG6_(9%A8{b?rQS`2*77z9R&756ipsq72ai>==zj(Wk=MWjk^z
z;oNKfEEZes#o8i+f!czPq~kaMTBgsKB_?#oRn9O+T!eJoDuvvYZ^e2Gw7NpVS=6QO
z$vr4=jp@|&C6oG$3%i8Wn9kSn?z-Gngf*sfJHl$Yp9ofL=lN);#`Me6;4<z^nC%=)
z!Fefwb>p=znH9C&rGmzEO~u>n@^hx;0w=o`*Bk-ucFhry_A1<5=}aj;GB~O!_KL3?
z)V<o&V*^D1F@acD=3+gSdVShnF|w|#OPN8^Y0{J#JhBiK+xElwY0o9kpd7qH@vH7+
za$?%-?qtMOSRQO^Lx~tySWR_RpA9E9nM+oc45wB1v%X2*GTn;LhLXHRsOq{^3R^St
z&C*DflT<*=?tlvTOpN_cp}Wi3PA9CG?giQnq-MqU+6@Ahy4OE!BI%W~hfRPwC#$;O
zy?uz(tW@1TqzVR5!Oz5WA1}B`2CSS_aFZ(IBm`x52Z)-LFBjZY1@A|7c0BhywmanF
z{(cKV9C6$`;RXs!)7y#GeW&NiL|a){DG*J!RbqRAN<?PG5>UF`P!7UMs}EZ_q*2R8
z+4L0m`V>1VZ3G_wD6oRkp}kK!DI{q4>r?VtX!5@mjVCkYs_-}lfVwV$4rSWR4@-Nc
z5l}D*jvS$GY|h66x@rt;J)TfE4i@94)4?%Sg?_`o-T|PP8?9JdtjaEOht7sSwy7TC
zW5ss(^oyR_-k%KD@g85FigI{|Gv}8pJR+v^FLcn@4~-GZei+R<*bn99H9mNkekrru
z4O@O4=$BHJ+>2Nls)f3K2s!InXrxZ@vTDz}z=5o(@48P44I1I<d<8^@?<#BMr2nGg
zTDVp`!#zhGNbe`E(_AT;c^XPdK-Ot{PkEE!6`;rW=0v#QW#vwU3*NVmA<7lZqxf22
zNLDdjE2k>Br>QbYzm)Zxkb!ZYgt(|Pd_c_eji@$MfO?iw53GT7{gw@(d<PmGayjd_
zmBf^?l|5qdlUy_miWh5DrKlx9i{Sj@QPAj(Mwq`a`#|N4-@`3%Ys%if1#V4Y+yhma
z<ZQSqb~+rA%<kzMcuQ8Tj>fe);aje7cFCw`?gnIKx9^7f%Gskj4SLjM{{lUR{fm2_
zDwA{tG3xyLa49uBeIG8RylMF!a5w1p2=1)<jByPR&#)k4jnX*|tNz)5>EuTV=`iER
zmcgMnl$S*c9ge3T29jFCnJQetSGcF7GI$h*HC*8Vm0RZT9{wuxKkYiPXfJd9og?p&
z^a}4I@2SkI?Xsj-s`|ojBdAe+8=d4UL$$^Yx>%9y23^?ijHoiV=|HutYDFI5_(>$h
I`wHp*0hW^Lj{pDw

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Digest.crc32
deleted file mode 100644
index 7f68c2c8f47f..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-1995270006
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Partitions.db
deleted file mode 100644
index aded0e1f0b6b4f667c86ce3a9b6df8f0fbeaa91a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 63
zcmWe&b}%qtVR#_O7$C`bfrGQ2)0|O|^Dg63P9`QxPJO1c42%X0j3x{apa`W|p)>;k
DJhulc

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Rows.db
deleted file mode 100644
index e63ee20c95cdb072e6ed23a2eac94b918caa328a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 738
zcmZ>CVBlb2VBj|}<YZi+&B74#9|%5_{{K*=z+_~`B%sE~z#u#kNT^FHG8vmQCs;Eu
zSS<wd{WO3?;sym~0}~6D1wpI@0=|Yokx~mF(Xk&$%yCy>HZ-+lOUPzt0CLx#0}_YB
zfW)<XK;l(0kocm?z-YkG$jn<0w6p=}3=nWcxQOe(US5TFAQydP|NoH(<RWf^9YBr3
zQ-FlJ49G=14QGKaS_0(zX#<JG&0rVt9(ckxLEt{vMb<#!jzd6Vju+TP{0)Bv6oB0I
z7lFj#NFZ_TA&_{L1|+^{FfbY(_@ai}O(KkeVu?*)H_iV4aRJCpB85Ictw?SXJ(!5a
zP2!9d;ui#qvARj(;6%v;pdq;2WOVR_KW;b4IzE+OBnEQRC;k7QOh9gu-S`02P4bLi
zvA9XWiAPaDh#jk&l$cbN7T|G{@u52vxZR}IRH^Z371&LO|9?6Ga+BJp5>z*7ICY}9
zX)8GHG@BM^1qjUsyU7u#q2n}=m=g|;J8h?(+8~du2gTUoTp)4nCy;nm4<x=gGBBDP
JzOWF@O#tq@!5shq

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_clust_counter_compact/na-1-bti-Statistics.db
deleted file mode 100644
index e4fcc94ef2e208fd5ae17b7493d98c05e4ed4d02..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7095
zcmeI%e@q)y902fpcP&3UTA{!;AY+psGKo1Di~-5E3M~T01mwq7e=P2xmz51#a_uI=
znVHG{7+Dyi6PIjEXF?XXj11HP&S^4*sr!jW(8$ECqM32kWka?R>YcRm{XR`J{$VuU
zB`^0rd|$8c-M#nj3os0$VP=mc`9$QhPA;rmw#kKKn7BkH%_(i*g?gddF7nla%OyB$
zlEB;SZa#mjTP%@mq9oc%1<76Gu5mg<$<);Nb>Xq7XXFdS`Fy?^eGJjI-PlQ5Cl}xB
z?P7c0nbwX^kA=(OcDX2r^1bk&t@H;Ta=*|K7(?KP+CasB4x$`PIfSx~au{U;<p|1A
zlowK7L^+XiD&-8yD=8OIUPE~UW#!qzV=shU@1;C~O{nKVR#0&_<x|+~0?IqEIqLe{
zejL|SVrw(8gH~Y&Q{xS}i{rW>>@Y9(Jv*_DuVT-C96NRxdr2DheFw35D|Yq)?EI71
zrP0{yW3XM1VmAxeoz(mtvf#Kk-tXCUMB;x!abm5&K7i*Z@qUKlY&l-f>_^y~^*m~?
z(ct;7xpWrAwP#w8gMREn4k`B`ht7z|^JXHE4K_R<BP`#ccvSy<<b{`Gkr$1WAtzqI
z&pUMrKkv-ecTxLQ<KG}ZoQM1OSgZ%dH~wHmey*e&dEf0E<i7Xu{Tb^+_mk+tZ^8D&
zRJa>?d%|Jlk58vVvneOg{>r{=_!P#u*akE|xc+saFs^9`T@9^$as-Vx$XU|><H3EN
zN$5~Z!A<CS@dIc*>8-|#Fdoho4?;&~C!_hiz;OE&7>_&h!Zql7mlyRwr;M&$37vM)
zehfNy{j(j=mM_xRLYEz^%Yv>>e(OQ#EqC-2&@CTy6+#~=-<JS=uDxjw^yHhv1;Avf
zzN#O(+H8jFpLCU;2&{jl8S%Yjfa{+XJ(u4GpJ&I9u5JY8+{DuctVz0HEQX$%+6JGG
z_UG2U@cj{df8<0SXdlvkyzvcSUCC>M5^&g4E!W}r4WCC`gX535RMcw(?W2X}diXww
zIcyr>K|KD$2_t-7NjD0Mn?d}6N^dgE2eLB1A2fitY2@31r-2{Y-+dJJ=Lv6a7M#zH
zGD~MGXx~)TH#831@#WabPT*6AUhjte^_)%}&Yp`C<MU^y^MQ*u-nzSE&Wy`n$9wSa
zb1)WsMydW^axR^J+!@sP(HfUyYR-jjkeYL$+rj+poNM+nYR;wRTwp>GHRtL9w}bg-
zb1q`Ny6a3W6Md+I(VBBUDQFx0^=uy)-7f!qLn(p2nO@w)r(9X1{61!r3(5OR4!#$N
zGIxHf%~pu!czXJ>jLhX(E3$KP)tY(_)Onz;g@5;cAn#oAwxw=X>V~bpwA2^(e<pbe
zWL|+3Og{V5I-*~q>}l9@Dc^L$_2f;)QHELi$NvVa;_C#-WfyAs%8G|_Dz?^(<S(~m
b2)9M3UG`@eHaj;v+@fT5)^4tIDBti4D5#KH

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-CompressionInfo.db
index fc38a25eea5d107acffd27df97523c4e5b4c1664..ef683177e8f648a5e67570980ba0d9e79709ac39 100644
GIT binary patch
delta 32
ccmdPbpCBU5;J{G-|Nnmm2v`fG7@=$i0Di#*hyVZp

delta 32
ccmdPbpCBU5$iPtl|Nnmm2v`fGfFck80C-#lNdN!<

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Data.db
index 11219d037a8da6ad934880d1c2b6d003f1aece36..d0ce6c3af9bdb53a0fb7d7ad1728b30a24dcb13b 100644
GIT binary patch
delta 48
zcma!un4qhj?WxQlZKTX#bI7PsnL)-_nL+2;1fY<KGK1;i;EBo%0SZZpRg9HY_dEb5
Cqzw}Q

delta 48
zcma!un4qh@@`N&jw2?A{&7qQ)K$bC(b=XExnL)-xnL+jSYh{K2g`~tP##^GEhXGa3
B4n+U}

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Digest.crc32
index 985d6dcf36d6..2e84cd9a71a0 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Digest.crc32
@@ -1 +1 @@
-462858821
\ No newline at end of file
+739757235
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-Statistics.db
index 3c68ac568f566f72c9a93b6dda378bcc0f309fe9..daf1c16c63ee4d7c7c879ccdc0381f633b023890 100644
GIT binary patch
delta 144
zcmZ3f@=|4j8k-QaEU$L%MxE=Ng5?YlzzC&TCr%LGtj)#EKKT&4gA`-M&%!cm2JSo0
z7+4SVCAXZ+0Mi{ZKPLwWdYD3F8H{cNH3JQRss{6sbTTZPddlq20_Ux(JTn*=G$!ha
OO@1us#1fR_<qZH(rYsu(

delta 140
zcmaE<vQlM&8k^wZwJy8lHtJmG6f9<d07fXyHF1LYW^FERb}7bGE8T)X2Cf$y7+8z5
z9gpo}0Mn;dzMecm(8ClW%U~!5)CklJRSo7N>13E2!07!aSYh%TJD@s^iF#s_m4%#G
J>R&H<3;@@*B@h4r

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-TOC.txt
index b03b28372b5d..8a6a30b6db77 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-TOC.txt
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-big-TOC.txt
@@ -1,8 +1,8 @@
-Filter.db
-Digest.crc32
-Index.db
-TOC.txt
 Summary.db
-Statistics.db
-CompressionInfo.db
 Data.db
+TOC.txt
+CompressionInfo.db
+Statistics.db
+Digest.crc32
+Index.db
+Filter.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-CompressionInfo.db
deleted file mode 100644
index 0b7faea4a8a443b914a30cadb6eaba6c821efcb7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 43
fcmZSJ^@%cZ&d)6<N-ZwVFJb@zMlcB?)*|x&xM2mv

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Data.db
deleted file mode 100644
index 277996bd4c442e53c582ce639c21469d8d695cae..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 89
zcmZ3>z`!txfzhD;|Ns9D42%pv*;M%X892Dp^79pv5{rPiijjfQP>~@<kpFn=31tRp
lBV`7gb0;-`EMp++>Yo=tmWeWh>0Ma`WrhH-hJJx*HUMo~7})>-

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Digest.crc32
deleted file mode 100644
index 654f52bb12ce..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-4102718625
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-Partitions.db
deleted file mode 100644
index f297888fd9cb81f9bc4bb651e98b3e75fd1c832d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 60
zcmWe&b}%qtVR#_O=pfFxfP?cthxt8@ySq7<W^n5NFJ)jfU|=+1fB-ot%?hO%0DW=^
AMgRZ+

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-TOC.txt
deleted file mode 100644
index c20f4a8ad02b..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple/na-1-bti-TOC.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Data.db
-Filter.db
-Statistics.db
-CompressionInfo.db
-Partitions.db
-TOC.txt
-Rows.db
-Digest.crc32
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-CompressionInfo.db
deleted file mode 100644
index adb7fc457930e947bac28ae21637d92b336e2d84..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 43
fcmZSJ^@%cZ&d)6<N-ZwVFJb@zMlcB?HX-u>xS|Eg

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Data.db
deleted file mode 100644
index 68f29ba3d1a411f895cf6cf0a16734d5ad51d3a9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 91
zcmdnQz`!txfzhD;|Ns9D42%rF*cUMJ1I4)0^79pv5{rPiijjfQP>CU1kb&X2eT52x
mtdR<X^*NIrK$bC(b+!HlkY%F6U~*^u0TqS-unt!RwMGDl!5IYr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Digest.crc32
deleted file mode 100644
index bf775523a2d9..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-380992464
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Partitions.db
deleted file mode 100644
index e4f9ea96058eacea1e515d01d738bfc43c19ecc7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 60
zcmWe&b}%qtVR#_O=pfFxfP?cthxuKOyE{3UrgG~4FJxdeU|=+1fB-ot%?hO%0DKq;
AJOBUy

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-Statistics.db
deleted file mode 100644
index b853cfb74c961a473336dcb9da63e75a7507405d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4680
zcmeI$e@xV690&04^ZnkJA17Sg0V+755Y$*ZoJ8QJUck{oYlHZaXXWw>cM})_ySo{Z
z&1U#VtPwel+!TLp%%KyLQXqe{Szu+U)S_A_t&5y3ubd(#aeXiE8=v<x|M1`1J=?SA
zdEMu8chB>Ep6`91ogfH0;a*8q8>ZGZY8BPGNv)C~7*7cq4X$-&rBSJ>cABdcw_B-q
zxD>Oa+GEb!=y4Xi98Q<hQKGm!HJ+M=dZ)|M()^?CNZc#x2h8*P{nOD2Ox|{5D{G%f
zzS-L?9y>K9k4=rnsNoK^21?+2Y2h6h4=w7rplEZ@UX-;?unprJ&N+g!o^upu17{=W
zIL>o9&ktIg=YpWMd8BjsOF1v+T*&!3&g(b_)|QqX6l%Yh^9Y)8>%pGj;vUYY(c&D=
zThJ2s`O+?o>nhN4COT{dI-Hwt#9fT*htN?z^g~<GroHId&!ZEE(GO>!7w$ou?da^?
z=)9BYl6dsm1ho5EbgP2y;`Z;57301nFiZLaZ7)af#{J2B-#}a})Asqjm&MP}lKo4_
z*XeNo>n@##xO}b+9QJ!RIHJrCj+}OaXHCa~4G!FoM(b6G$Mw$!&%K-oo<C9wUT^`|
zJAD$@JG1=^<gXa}8T@oEj&F6M7vi<QnZU0V_kefa$pQC$jMrzh53VQE$K0a%%wl^J
zygB6{`17+%sKrGm;P+L0$M7|cONmXeKcxOOku<Jrid;!8zc>Q(4Qr@rqVe!P?*w(E
zwcsZ8tfT?BpQ7!ii!>f16%A6yW~ae^o@2Q4E{)GS^!7FCM;<RcM!o3URm-U}E><6*
zUbgntPHOA-OIA^r?pdEjU6pqHN$L%MMvqgsebQ~CK2)|dh5Cz*mba)UJ{T?_W{aaM
z`>Ct)^XdK1x=T*b`=9x@8|eKnI=z=aB<~l;4zFw`mORY6jaZj@!Bj*&Ik}0hk9@m*
z2Yr5oKN>rpOY$Q+-fMoJSYNzv&_x{eQrmSpf5UgiYjpm`ONG5Ak{_?MHqz%o!a>V`
znZ%Pm9XHYSO1)t#Y9;Z<Dtu}5Jdl<7>!5+eEhE1SyiEMeuAakmJTLf`WzqdyUux}Y
zC;2UveM4i!oj;78>>@tB|3D8NulH>FaP~}`nO;9Xl}B7ud+Y9&84K=UU;7Q#bJkKt
ztrKkjFJ~@oTH2XQbGp_Ewo5s~J+8(=XD+xvp)(g;4k2{rg6k1FbA`@aWJCP7&Rop?
z(;MgNg!uiPf}Ef8RYAkif7;6A#?9*g8-c1#&2q(d^{?auf%B+Ctt|J78vIC$3i7|b
z`*gIR+6KQ+xV>G?Qy<jSGlQ8*xt><*J1Pi^@Bar)rFp&La#t&L=8E#Ca>_S0I@yC~
MN*k2A2hJA$2Jj-E?*IS*

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-TOC.txt
deleted file mode 100644
index c20f4a8ad02b..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_compact/na-1-bti-TOC.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Data.db
-Filter.db
-Statistics.db
-CompressionInfo.db
-Partitions.db
-TOC.txt
-Rows.db
-Digest.crc32
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-CompressionInfo.db
index e2860e1eb16acf51fdc0f6e8d44ad2c33bab9080..1db9aa06b311ba8dae1bc2eaaeca080ba080a480 100644
GIT binary patch
delta 32
ecmdPbpCBU5;J{G-|NnmmFksXIQj8Eb0|NkfC<O)p

delta 32
dcmdPbpCBU5$iPtl|NnmmFksXIQa}+10042r1jhgX

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Data.db
index 620cdf260e5aa91c847989cb8ad33ba3d9380dae..11c0a684f10ef84ac823281ffcf60378ff20f21a 100644
GIT binary patch
literal 148
zcmdO5WMG)Yz-Una|Ns9621bTYLJAuE3=Ap^j13G*MS;4HDm_kZc(|bQ_AI4Y46FzG
zl3Q8S87dhV7#SHD4XqgRwfGqh#!s+eU^i&9m|@AlY{MX9WW!*8NbQ6T1Bc@O3swwF
tHVm@HHVpd5K7v%W&;J2bRly)*0#tR_`4dp=K!?l^1_)qee6zTFHvnq3DnS4M

literal 138
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13I4x4$hnnB)Gpb*8;aVQ?ZZ18Z@%
z<8hV%4TekxMn(okLo0@SEq;cBvoviO*v(I*pRi$&F|uK>Ka^<z5;$pg0VrS$6gaHY
jWW&Ir{*S|kLDs~ELH<Dn1IK|=YcDWB03&0ZhsO;7qNOG2

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Digest.crc32
index bc5f671e0191..b905530e1d8c 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Digest.crc32
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Digest.crc32
@@ -1 +1 @@
-3987542254
\ No newline at end of file
+3918697890
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-Statistics.db
index 689bec8f1a850b0d03e73337f9420ad8ee44691d..c803970afe4c4f45fff7e7c797e6de68240a6a93 100644
GIT binary patch
delta 176
zcmdm_@=0Za8k-QaEU$L%MxE=Ng5?YlzzC&TCr%LGtj)#EKKT&4gArrJ&%!cm2JSo0
z7+4SVCAZw<0Mi{Zzv}=0{|~}YZ4CDJ-bx;wTKRW!p`i5S20_lrJ%T)@5Zw&sZ-J_S
o7C=n{^N~zuh|KF;`DcOi)>V-l3=A3*^~5GW7W8I0%6>Z(0MrFKi~s-t

delta 172
zcmeyQvPorv8k^wZwJy8lHtJmG6f9<d07fXyHF1LYW^FERb|c1AE8T)X2Cf$y7+8z5
z9gm$|0H#l^y-@%E|9=pMYGJUybKE!JK<>B68wI5&9}wi6d`6JR6r!8K#tWzlXgJg~
kFdxZe2C<qWp?`uECcp7rz`&p}QBQ2LvXD1Rr%&cP0K|wjE&u=k

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-TOC.txt
index b03b28372b5d..8a6a30b6db77 100644
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-TOC.txt
+++ b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-big-TOC.txt
@@ -1,8 +1,8 @@
-Filter.db
-Digest.crc32
-Index.db
-TOC.txt
 Summary.db
-Statistics.db
-CompressionInfo.db
 Data.db
+TOC.txt
+CompressionInfo.db
+Statistics.db
+Digest.crc32
+Index.db
+Filter.db
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-CompressionInfo.db
deleted file mode 100644
index 0d9c077a99e8f195c534640b97d5b67a498cf4e5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 43
fcmZSJ^@%cZ&d)6<N-ZwVFJb@zMlcCtXd&|evB(8N

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Data.db
deleted file mode 100644
index 1489bab6d9d3b281b39561a2f5bfa34573a0e54b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 145
zcmdO5WMG)Yz-Una|Ns9621bTY!U`Jv3=Ap^j13ISUU2@q)9w}8!F6kq;QE$X46M!;
zmlZoaG#D}&7#SHD4XqgRwfGs1H$JdoU^mzJw!nr##>j@j{@hUqpn$T0l{Eun1%r&S
t4TJvGpIaC>jLmZo*fKC|VURHas=6y_QNh4|z^?ZP0|YQKK9N*^2mmg+DCGbE

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Digest.crc32
deleted file mode 100644
index a804901b41f5..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-163579974
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Partitions.db
deleted file mode 100644
index 1eed5ad23221734116a11e360a732b27765853d6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 61
zcmWe&b}%qtVR#_O=pfFxfrIluhxt*CyPcd&|9v_2{|hoO8Za=LFhGDjlxBs}3;>Af
B30(jH

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-Rows.db
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-TOC.txt
deleted file mode 100644
index c20f4a8ad02b..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter/na-1-bti-TOC.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Data.db
-Filter.db
-Statistics.db
-CompressionInfo.db
-Partitions.db
-TOC.txt
-Rows.db
-Digest.crc32
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-CompressionInfo.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-CompressionInfo.db
deleted file mode 100644
index 56c95a8a36330ee3340d0eea70a0c0f94f7b75e6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 43
fcmZSJ^@%cZ&d)6<N-ZwVFJb@zMlcCt=p*w0vIzx8

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Data.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Data.db
deleted file mode 100644
index 2977f1177934f9ee617890648b4d242818d5dbb5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 140
zcmdOAWMG)Yz-Una|Ns9621bTYA`2Kb7#J8-7#JHEmc8Kocc<Mew1ex`BEj`7vlv*N
zEiSuOXJ|5HF)%VRFdAAj<ZCf79Jiie%fN2#v+IE^gRGG)gZ(*^9Uy_=EgV1rV_OFO
nt2NshI5eWZZ)cD-0m|PwZcxR*ejv$d2GAxDU}RML@huSmn3^dH

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Digest.crc32 b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Digest.crc32
deleted file mode 100644
index 02bf60043831..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Digest.crc32
+++ /dev/null
@@ -1 +0,0 @@
-1528982319
\ No newline at end of file
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Filter.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Filter.db
deleted file mode 100644
index 2e1d5d29ca0683d58365c1bda385a89ced35f1f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
fcmZQzU|?lnU|?ckVPs%abZX%^z`&xww2T1&5k3Nr

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Partitions.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Partitions.db
deleted file mode 100644
index 05e27b4add1aa1016435ed48bd2d259a2fbe0c88..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 61
zcmWe&b}%qtVR#_O=pfFxfrIluhxrkXyX~Az|GhZ%|MN018Za=LFhGDjlxBs}3;>6W
B2~z+7

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Rows.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Rows.db
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Statistics.db b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-Statistics.db
deleted file mode 100644
index 08d8f3ee7df05aba897349c7bc659389d3dd4fb0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4689
zcmeI$drTBZ7y$5{-CG_fT)6{OFwz#GDrpW|q+p_6;f?~03CKgQR#O*l<3Zq#?41_T
z#zygvl7>>FqKV?82{C9SidMj9Vhh?@AG9fLBTCaN8k>67D8)j!&dF{0zKKcuZ{tle
znfdKEw>vXCdplW<<22kvN}>g!c@@n(%^PVJI8J|-OLZ!1j8ctMZkLValFKDIY>H&G
z+1<wM^=`R9vB`>TE0h#>g}cJ(kQGyX-Oss4V>Zwe%<%bq<B@Sh-1cxQX&Qd<QAZnp
z{Op)GI5rSP!_74NlmEIfvG(@|CfYBQ)G??BMYR$rLl}oL4r8ok9Kl$}SkE|y@pQ&B
z8P8&z!ua)o)p=PMusZHMru{O;YZ&|QElfKgXuX4RKQ>|R2YG{uyBVLu=BF{<j4d$d
z7xv(|rW9LD!wy-59m<S1Y!t_}z1R_6>=(9T8}?&QU4b3nhy7A2_AC3ajaKZ8z1Z1j
zunS|cSI1$ymSQ(Z*saX^?X}>zHvyR>{DEyP!QQK`&xymt`#p;D#p=49ILv>AEm*Ij
z_8JXd|C$?DP+Yv!h#c~u4LPjXha5gGBTpHRM%LNzdemF)qIgW#ROIQm;*n?e7a`BO
zhVOUE2)^HGP3KYjMT5U0znz8q_fEVA#Vda|Ab(iUj=cLxCUWN&czy;t(R>nZ)MMD5
zm~uZx-jsL<`Rj}Gp!wOS(ElsHQ}-Q=3-PsReF$Bv!eLxf8@?D?T-lGt8{({}h4Ii%
z&oFejCFc?Jl!R{dJVja!*I_(L$nSxU&PYb<d7AFYE*PJ2_|tpPFVD|A4n6ya<qM%x
zuiKA8FIfFS3$*2@dCQ@T_Eo1tmnVPzCiJ?$B8Q+Gzii8eK3u#z5&E0v`cI&TPxj>i
zlR1%PUC`xbGkpF@Tj44A{1e}H9en<I*>meO_&q;(WN{s^;3l4Jz?!6MhJ5Iek&W>F
zh<`TifbWmcxzR&epnX{LvAPq$+JXZ;3UI`GjrZaBbwBFw!SU;F<aHQ8`&g-=2EGsC
z4w<@*Af9k}$N=wG(!<>R1`vO>)SC?V1L<kM_2@v{)PJY@ec&Z~+K<5gyz5<%4%f4z
z$kN&b+SiwL_6`EK+#ER53ViP1XYH`Ro{K4c8Iy5h*m7ko8#ur6@#yABbFM&L{SVeM
zR-5u_B~U)EdoFbv&@)aC2KQX*%sefFdoDB`TyW2Y<|DZ03hueUg7|;!xrp_bk1kbl
zu?Jf?(VY2h&IbF_vMhVoCi;EDpVgt6Zg$=F44=44ZKIiF-KN0{YLpZIsXe2j25A|%
zAm{UcSfDRz=*iBECEic29JO=YoM(TaDKl0}ipwrl8B0sv$}Cx5Ba{E$QdlQdz4mYC
UH9OZj+_GYJR;{ge_%Fcy4Fs^H3jhEB

diff --git a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-TOC.txt b/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-TOC.txt
deleted file mode 100644
index c20f4a8ad02b..000000000000
--- a/test/data/legacy-sstables/na/legacy_tables/legacy_na_simple_counter_compact/na-1-bti-TOC.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-Data.db
-Filter.db
-Statistics.db
-CompressionInfo.db
-Partitions.db
-TOC.txt
-Rows.db
-Digest.crc32
diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
index da61fb81fdf0..6f02d1a05448 100644
--- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
@@ -60,7 +60,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.Version;
-import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.OutgoingStream;
@@ -92,7 +92,7 @@ public class LegacySSTableTest
      * See {@link #testGenerateSstables()} to generate sstables.
      * Take care on commit as you need to add the sstable files using {@code git add -f}
      */
-    public static final String[] legacyVersions = {"nb", "na", "me", "md", "mc", "mb", "ma", "aa", "ac", "ad", "ba", "bb"};
+    public static final String[] legacyVersions = {"nb", "na", "me", "md", "mc", "mb", "ma", "aa", "ac", "ad", "ba", "bb", "ca"};
 
     // 1200 chars
     static final String longString = "0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789" +
@@ -604,6 +604,7 @@ private static void loadLegacyTable(String tablePattern, String legacyVersion) t
     @Test
     public void testGenerateSstables() throws Throwable
     {
+        Version version = TrieIndexFormat.latestVersion;
         Random rand = new Random();
         StringBuilder sb = new StringBuilder();
         for (int i = 0; i < 128; i++)
@@ -616,31 +617,31 @@ public void testGenerateSstables() throws Throwable
         {
             String valPk = Integer.toString(pk);
             QueryProcessor.executeInternal(String.format("INSERT INTO legacy_tables.legacy_%s_simple (pk, val) VALUES ('%s', '%s')",
-                                                         BigFormat.latestVersion, valPk, "foo bar baz"));
+                                                         version, valPk, "foo bar baz"));
 
             QueryProcessor.executeInternal(String.format("UPDATE legacy_tables.legacy_%s_simple_counter SET val = val + 1 WHERE pk = '%s'",
-                                                         BigFormat.latestVersion, valPk));
+                                                         version, valPk));
 
             for (int ck = 0; ck < 50; ck++)
             {
                 String valCk = Integer.toString(ck);
 
                 QueryProcessor.executeInternal(String.format("INSERT INTO legacy_tables.legacy_%s_clust (pk, ck, val) VALUES ('%s', '%s', '%s')",
-                                                             BigFormat.latestVersion, valPk, valCk + longString, randomString));
+                                                             version, valPk, valCk + longString, randomString));
 
                 QueryProcessor.executeInternal(String.format("UPDATE legacy_tables.legacy_%s_clust_counter SET val = val + 1 WHERE pk = '%s' AND ck='%s'",
-                                                             BigFormat.latestVersion, valPk, valCk + longString));
+                                                             version, valPk, valCk + longString));
             }
         }
 
         StorageService.instance.forceKeyspaceFlush("legacy_tables");
 
-        File ksDir = new File(LEGACY_SSTABLE_ROOT, String.format("%s/legacy_tables", BigFormat.latestVersion));
+        File ksDir = new File(LEGACY_SSTABLE_ROOT, String.format("%s/legacy_tables", version));
         ksDir.mkdirs();
-        copySstablesFromTestData(String.format("legacy_%s_simple", BigFormat.latestVersion), ksDir);
-        copySstablesFromTestData(String.format("legacy_%s_simple_counter", BigFormat.latestVersion), ksDir);
-        copySstablesFromTestData(String.format("legacy_%s_clust", BigFormat.latestVersion), ksDir);
-        copySstablesFromTestData(String.format("legacy_%s_clust_counter", BigFormat.latestVersion), ksDir);
+        copySstablesFromTestData(String.format("legacy_%s_simple", version), ksDir);
+        copySstablesFromTestData(String.format("legacy_%s_simple_counter", version), ksDir);
+        copySstablesFromTestData(String.format("legacy_%s_clust", version), ksDir);
+        copySstablesFromTestData(String.format("legacy_%s_clust_counter", version), ksDir);
     }
 
     public static void copySstablesFromTestData(String table, File ksDir) throws IOException
diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
index 21c41e4e54fd..5c0cd5410b18 100644
--- a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.FileUtils;
@@ -64,9 +65,9 @@ public void testSerialization() throws IOException
         Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
 
         MetadataSerializer serializer = new MetadataSerializer();
-        File statsFile = serialize(originalMetadata, serializer, BigFormat.latestVersion);
+        File statsFile = serialize(originalMetadata, serializer, TrieIndexFormat.latestVersion);
 
-        Descriptor desc = new Descriptor(statsFile.getParentFile(), "", "", 0, SSTableFormat.Type.BIG);
+        Descriptor desc = new Descriptor(statsFile.getParentFile(), "", "", 0, TrieIndexFormat.instance.getType());
         try (RandomAccessReader in = RandomAccessReader.open(statsFile))
         {
             Map<MetadataType, MetadataComponent> deserialized = serializer.deserialize(desc, in, EnumSet.allOf(MetadataType.class));

From 60d2fbe4dd005009468654c4a276ca41a121d83e Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Fri, 23 Apr 2021 14:10:05 +0200
Subject: [PATCH 069/151] STAR-493: Fix Scrubber (missing bad rows
 incrementation) (#127)

(cherry picked from commit 18b2a69f1ab968007784e569b69d9898256b738d)
(cherry picked from commit cf264ff0c378fbb77526b3b5a3c9e44684f5e765)
---
 .../cassandra/db/compaction/Scrubber.java     |  5 ++-
 .../org/apache/cassandra/db/ScrubTest.java    |  3 +-
 ...r (missing bad rows incrementation) (#127) | 37 +++++++++++++++++++
 3 files changed, 43 insertions(+), 2 deletions(-)
 create mode 100644 update-history/STAR-801/34-cf264ff0c3 STAR-493: Fix Scrubber (missing bad rows incrementation) (#127)

diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index 664522067464..47bc1e30a15a 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -185,12 +185,14 @@ public void scrub()
                 outputHandler.debug("Reading row at " + dataStart);
 
                 DecoratedKey key = null;
+                Throwable keyReadError = null;
                 try
                 {
                     key = sstable.decorateKey(ByteBufferUtil.readWithShortLength(dataFile));
                 }
                 catch (Throwable th)
                 {
+                    keyReadError = th;
                     throwIfFatal(th);
                     // check for null key below
                 }
@@ -235,7 +237,7 @@ public void scrub()
                 try
                 {
                     if (key == null)
-                        throw new IOError(new IOException("Unable to read row key from data file"));
+                        throw new IOError(new IOException("Unable to read row key from data file", keyReadError));
 
                     if (currentIndexKey != null && !key.getKey().equals(currentIndexKey))
                     {
@@ -417,6 +419,7 @@ private void seekToNextRow()
             {
                 throwIfFatal(th);
                 outputHandler.warn(String.format("Failed to seek to next row position %d", nextRowPositionFromIndex), th);
+                badRows++;
             }
 
             try
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index 53bf6b4d55fe..c93d3f4116ad 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -94,6 +94,7 @@
 import org.apache.cassandra.tools.ToolRunner;
 import org.apache.cassandra.tools.ToolRunner.ToolResult;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.Throwables;
 import org.assertj.core.api.Assertions;
 
 import static org.apache.cassandra.SchemaLoader.counterCFMD;
@@ -209,7 +210,7 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
         }
         catch (IOError err)
         {
-            assertTrue(err.getCause() instanceof CorruptSSTableException);
+            assertTrue(Throwables.isCausedBy(err, CorruptSSTableException.class));
         }
 
         // with skipCorrupted == true, the corrupt rows will be skipped
diff --git a/update-history/STAR-801/34-cf264ff0c3 STAR-493: Fix Scrubber (missing bad rows incrementation) (#127) b/update-history/STAR-801/34-cf264ff0c3 STAR-493: Fix Scrubber (missing bad rows incrementation) (#127)
new file mode 100644
index 000000000000..a4cea6d00113
--- /dev/null
+++ b/update-history/STAR-801/34-cf264ff0c3 STAR-493: Fix Scrubber (missing bad rows incrementation) (#127)	
@@ -0,0 +1,37 @@
+--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
++++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
+@@ -94,7 +94,6 @@
+ import org.apache.cassandra.tools.ToolRunner;
+ import org.apache.cassandra.tools.ToolRunner.ToolResult;
+ import org.apache.cassandra.utils.ByteBufferUtil;
+-import org.apache.cassandra.utils.JVMStabilityInspector;
+ import org.apache.cassandra.utils.Throwables;
+ import org.assertj.core.api.Assertions;
+ 
+@@ -209,14 +208,9 @@
+             scrubber.scrub();
+             fail("Expected a CorruptSSTableException to be thrown");
+         }
+-<<<<<<<
+         catch (IOError err)
+         {
+-            assertTrue(err.getCause() instanceof CorruptSSTableException);
+-=======
+-        catch (IOError err) {
+             assertTrue(Throwables.isCausedBy(err, CorruptSSTableException.class));
+->>>>>>>
+         }
+ 
+         // with skipCorrupted == true, the corrupt rows will be skipped
+diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
+index 7de5f6945d..c93d3f4116 100644
+--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
++++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
+@@ -94,7 +94,6 @@ import org.apache.cassandra.tools.StandaloneScrubber;
+ import org.apache.cassandra.tools.ToolRunner;
+ import org.apache.cassandra.tools.ToolRunner.ToolResult;
+ import org.apache.cassandra.utils.ByteBufferUtil;
+-import org.apache.cassandra.utils.JVMStabilityInspector;
+ import org.apache.cassandra.utils.Throwables;
+ import org.assertj.core.api.Assertions;
+ 

From 1f456e2b4313a4beaf3e091f8319bde047e890ce Mon Sep 17 00:00:00 2001
From: Aleksandr Sorokoumov <918393+Gerrrr@users.noreply.github.com>
Date: Mon, 26 Apr 2021 17:04:21 +0200
Subject: [PATCH 070/151] STAR-410 (#101)

* STAR-410 Refactor compaction metrics

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 Rename CompactionInfo.Holder to AbstractTableOperation and CompactionInfo to AbstractTableOperation.Progress

CompactionInfo.Holder was implemented by all table operations, not just compactions. For example index and view building
were reported as compactions. The new naming reflects more correctly the role of this class, which is to report progress or
interrupt these table operations.

CompactionInfo was a basic class capable of reporting the progress of an operation, for example how many bytes (or keys, or ranges)
have been currently processed out of a total number of bytes (or keys, or ranges).

The new names should remove the previous confusion, and allow adding further compaction progress statistics.

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 More refactoring of compaction-related code:

- Added CompactionProgress and refactored CompactionTask into CompactionOperation
- Extracted interfaces from AbstractTableOperation and its progress
  inner-class, so that they can be extended by CompactionProgress
- Added compactions in progress to the abstract strategy for a basic
  aggregation of compactions in progress, to be extended further

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 New aggregated statistics for compaction strategies added to nodetool compactionstats and compaction logger

 Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
 Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 Rework compaction strategy code to remove repetition and clarify

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>

* STAR-410 Minor refactoring of compaction strategies and fixing of CI failures:

- Refactor getEstimatedRemainingTasks() and getMaximalTask for STCS, nits
- Fix a possible deadlock in CompactionStrategyManager
- Fix strong self-ref loop
  A strong self-ref loop of SSTableReader was caused by background compactions because the
  compaction strategy manager is an observer of the tracker. So long as the txn sstable tidier
  references the tracker, then the tracker references the csm and the sstables via the background
  compaction aggregates or picks.

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 fix compaction interruptions, refactor compaction iterator to return a separate table operation

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

* STAR-410 Undo some changes in ActiveOperations

* Store all active operations in the same set instead of a by-table map
* ActiveOperations does not extend SchemaChangeListener

* fixup! STAR-410 Minor refactoring of compaction strategies and fixing of CI failures:

* Review: Use throw Throwables.propagate instead of Throwables.maybeFail when catching exceptions

* Review: fix wording in BackgroundCompactions and CompactionTask

* Review: Remove unnecessary maybeReloadDiskBoundaries() call

* STAR-410 Compaction strategies filter out SSTables that are no longer in the live set

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
(cherry picked from commit a8be76dfbf72f46f24d84cc1b62188280deec893)
(cherry picked from commit 35e12f4887c615c3e0423d86dbe40324a42a9fb4)
---
 .../cassandra/cache/AutoSavingCache.java      |  21 +-
 .../cassandra/db/ColumnFamilyStore.java       |  34 +-
 .../apache/cassandra/db/SystemKeyspace.java   |   7 +-
 .../AbstractCompactionStrategy.java           | 291 +++++-
 .../db/compaction/AbstractCompactionTask.java |  62 +-
 .../db/compaction/AbstractTableOperation.java | 242 +++++
 .../db/compaction/ActiveCompactions.java      |  83 --
 .../db/compaction/ActiveOperations.java       |  93 ++
 .../db/compaction/BackgroundCompactions.java  | 268 ++++++
 .../db/compaction/CompactionAggregate.java    | 853 ++++++++++++++++++
 .../CompactionAggregateStatistics.java        | 166 ++++
 .../db/compaction/CompactionInfo.java         | 250 -----
 .../CompactionInterruptedException.java       |   2 +-
 .../db/compaction/CompactionIterator.java     | 240 +++--
 .../db/compaction/CompactionLogger.java       | 226 ++++-
 .../db/compaction/CompactionManager.java      | 296 +++---
 .../db/compaction/CompactionObserver.java     |  65 ++
 .../db/compaction/CompactionPick.java         | 206 +++++
 .../db/compaction/CompactionProgress.java     | 130 +++
 .../db/compaction/CompactionStatistics.java   | 101 +++
 .../compaction/CompactionStrategyManager.java |  49 +-
 .../CompactionStrategyStatistics.java         | 137 +++
 .../db/compaction/CompactionTask.java         | 619 ++++++++++---
 .../DateTieredCompactionStrategy.java         | 127 +--
 .../LeveledCompactionStatistics.java          | 193 ++++
 .../compaction/LeveledCompactionStrategy.java | 184 ++--
 .../db/compaction/LeveledCompactionTask.java  |   9 +-
 .../db/compaction/LeveledGenerations.java     |  12 +-
 .../db/compaction/LeveledManifest.java        | 232 +++--
 .../db/compaction/OperationType.java          |   8 +
 .../db/compaction/PendingRepairManager.java   |   2 +-
 .../db/compaction/SSTableSplitter.java        |  16 +-
 .../cassandra/db/compaction/Scrubber.java     |  20 +-
 .../db/compaction/SingleSSTableLCSTask.java   |   9 +-
 .../SizeTieredCompactionStatistics.java       |  63 ++
 .../SizeTieredCompactionStrategy.java         | 480 +++++-----
 .../SizeTieredCompactionStrategyOptions.java  |  23 +-
 .../db/compaction/TableOperation.java         | 194 ++++
 .../db/compaction/TableOperationObserver.java |  41 +
 .../TieredCompactionStatistics.java           | 115 +++
 .../TimeTieredCompactionStatistics.java       |  69 ++
 .../TimeWindowCompactionStrategy.java         | 392 ++++----
 .../compaction/TimeWindowCompactionTask.java  |   4 +-
 .../cassandra/db/compaction/Verifier.java     |  23 +-
 .../writers/CompactionAwareWriter.java        |   5 +
 .../cassandra/db/lifecycle/Helpers.java       |   8 +-
 .../db/lifecycle/LifecycleTransaction.java    |  15 +-
 .../db/lifecycle/LogTransaction.java          |  32 +-
 .../cassandra/db/lifecycle/Tracker.java       |  10 +-
 .../apache/cassandra/db/lifecycle/View.java   |   3 +-
 .../db/partitions/PurgeFunction.java          |   1 -
 .../repair/CassandraValidationIterator.java   |   8 +-
 .../db/repair/PendingAntiCompaction.java      |  12 +-
 .../cassandra/db/view/ViewBuilderTask.java    |  16 +-
 .../db/virtual/SSTableTasksTable.java         |  18 +-
 .../index/SecondaryIndexBuilder.java          |   4 +-
 .../internal/CollatedViewIndexBuilder.java    |  17 +-
 .../sai/StorageAttachedIndexBuilder.java      |  19 +-
 .../index/sasi/SASIIndexBuilder.java          |  18 +-
 .../cassandra/io/sstable/ISSTableScanner.java |  14 +-
 .../io/sstable/IndexSummaryManager.java       |   2 +-
 .../sstable/IndexSummaryRedistribution.java   |  20 +-
 .../apache/cassandra/io/sstable/SSTable.java  |  15 +
 .../cassandra/io/sstable/SSTableRewriter.java |   7 +
 .../io/sstable/format/SSTableReader.java      |  19 +-
 .../sstable/format/big/BigTableScanner.java   |  10 +
 .../cassandra/metrics/CompactionMetrics.java  | 132 +--
 .../org/apache/cassandra/tools/NodeProbe.java |   1 +
 .../tools/nodetool/CompactionStats.java       |  41 +-
 .../NonThrowingCloseable.java}                |  23 +-
 .../distributed/test/FailingRepairTest.java   |   5 +
 .../db/compaction/LongCompactionsTest.java    |   2 +-
 .../LongLeveledCompactionStrategyTest.java    |   4 +-
 .../compaction/CompactionAllocationTest.java  |   3 +-
 test/unit/org/apache/cassandra/Util.java      |   3 +-
 ...t.java => AbstractTableOperationTest.java} |  17 +-
 ...onsTest.java => ActiveOperationsTest.java} |  58 +-
 .../compaction/BackgroundCompactionsTest.java | 426 +++++++++
 .../db/compaction/CancelCompactionsTest.java  |  82 +-
 .../db/compaction/CompactionIteratorTest.java |  12 +-
 ...ctionStrategyManagerPendingRepairTest.java |   8 +-
 .../CompactionStrategyManagerTest.java        |   2 +
 .../CompactionStrategyStatisticsTest.java     | 807 +++++++++++++++++
 .../db/compaction/CompactionTaskTest.java     |  89 +-
 .../db/compaction/CompactionsBytemanTest.java |  16 +-
 .../db/compaction/CompactionsCQLTest.java     |  10 +-
 .../db/compaction/CompactionsPurgeTest.java   |   4 +-
 .../db/compaction/CompactionsTest.java        |  77 ++
 .../LeveledCompactionStrategyTest.java        |  10 +-
 .../compaction/SingleSSTableLCSTaskTest.java  |  16 +-
 .../SizeTieredCompactionStrategyTest.java     | 385 +++++++-
 .../TimeWindowCompactionStrategyTest.java     |  59 +-
 .../db/compaction/ZombieSSTablesTest.java     | 213 +++++
 .../cassandra/db/lifecycle/HelpersTest.java   |   4 +-
 .../cassandra/db/lifecycle/ViewTest.java      |   8 +-
 .../db/repair/PendingAntiCompactionTest.java  | 171 ++--
 .../index/SecondaryIndexManagerTest.java      |   5 +-
 .../apache/cassandra/index/sai/SAITester.java |   2 +-
 .../io/sstable/IndexSummaryManagerTest.java   |  25 +-
 .../STAR-801/33-35e12f4887 STAR-410 (#101)    | 137 +++
 100 files changed, 7770 insertions(+), 2017 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/db/compaction/AbstractTableOperation.java
 delete mode 100644 src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/ActiveOperations.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
 delete mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionPick.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/TableOperation.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/TableOperationObserver.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
 rename src/java/org/apache/cassandra/{db/compaction/ActiveCompactionsTracker.java => utils/NonThrowingCloseable.java} (64%)
 rename test/unit/org/apache/cassandra/db/compaction/{CompactionInfoTest.java => AbstractTableOperationTest.java} (70%)
 rename test/unit/org/apache/cassandra/db/compaction/{ActiveCompactionsTest.java => ActiveOperationsTest.java} (80%)
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
 create mode 100644 update-history/STAR-801/33-35e12f4887 STAR-410 (#101)

diff --git a/src/java/org/apache/cassandra/cache/AutoSavingCache.java b/src/java/org/apache/cassandra/cache/AutoSavingCache.java
index 34f056a3c5eb..4c0930fcad8d 100644
--- a/src/java/org/apache/cassandra/cache/AutoSavingCache.java
+++ b/src/java/org/apache/cassandra/cache/AutoSavingCache.java
@@ -34,16 +34,15 @@
 import com.google.common.util.concurrent.MoreExecutors;
 
 import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.*;
 import org.apache.cassandra.io.util.CorruptFileException;
@@ -285,10 +284,10 @@ public Future<?> submitWrite(int keysToSave)
         return CompactionManager.instance.submitCacheWrite(getWriter(keysToSave));
     }
 
-    public class Writer extends CompactionInfo.Holder
+    public class Writer extends AbstractTableOperation
     {
         private final Iterator<K> keyIterator;
-        private final CompactionInfo info;
+        private final OperationProgress info;
         private long keysWritten;
         private final long keysEstimate;
 
@@ -316,12 +315,12 @@ else if (cacheType == CacheService.CacheType.COUNTER_CACHE)
             else
                 type = OperationType.UNKNOWN;
 
-            info = CompactionInfo.withoutSSTables(TableMetadata.minimal(SchemaConstants.SYSTEM_KEYSPACE_NAME, cacheType.toString()),
-                                                  type,
-                                                  0,
-                                                  keysEstimate,
-                                                  Unit.KEYS,
-                                                  UUIDGen.getTimeUUID());
+            info = OperationProgress.withoutSSTables(TableMetadata.minimal(SchemaConstants.SYSTEM_KEYSPACE_NAME, cacheType.toString()),
+                                                     type,
+                                                     0,
+                                                     keysEstimate,
+                                                     Unit.KEYS,
+                                                     UUIDGen.getTimeUUID());
         }
 
         public CacheService.CacheType cacheType()
@@ -329,7 +328,7 @@ public CacheService.CacheType cacheType()
             return cacheType;
         }
 
-        public CompactionInfo getCompactionInfo()
+        public OperationProgress getProgress()
         {
             // keyset can change in size, thus total can too
             // TODO need to check for this one... was: info.forProgress(keysWritten, Math.max(keysWritten, keys.size()));
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 49a05dccacde..4e41dace4d59 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -404,6 +404,7 @@ public ColumnFamilyStore(Keyspace keyspace,
 
         // compaction strategy should be created after the CFS has been prepared
         compactionStrategyManager = new CompactionStrategyManager(this);
+        compactionStrategyManager.reload(metadata().params.compaction);
 
         if (maxCompactionThreshold.value() <= 0 || minCompactionThreshold.value() <=0)
         {
@@ -818,6 +819,11 @@ public String getTableName()
         return name;
     }
 
+    public String getKeyspaceName()
+    {
+        return keyspace.getName();
+    }
+
     public Descriptor newSSTableDescriptor(File directory)
     {
         return newSSTableDescriptor(directory, SSTableFormat.Type.current().info.getLatestVersion(), SSTableFormat.Type.current());
@@ -1601,9 +1607,19 @@ public Iterable<SSTableReader> getSSTables(SSTableSet sstableSet)
         return data.getView().select(sstableSet);
     }
 
-    public Iterable<SSTableReader> getUncompactingSSTables()
+    public Iterable<SSTableReader> getNoncompactingSSTables()
+    {
+        return data.getNoncompacting();
+    }
+
+    public Iterable<? extends SSTableReader> getNoncompactingSSTables(Iterable<? extends SSTableReader> candidates)
     {
-        return data.getUncompacting();
+        return data.getNoncompacting(candidates);
+    }
+
+    public Set<SSTableReader> getCompactingSSTables()
+    {
+        return data.getCompacting();
     }
 
     public Map<UUID, PendingStat> getPendingRepairStats()
@@ -2310,6 +2326,7 @@ public void clearUnsafe()
         {
             cfs.runWithCompactionsDisabled((Callable<Void>) () -> {
                 cfs.data.reset(memtableFactory.create(new AtomicReference<>(CommitLogPosition.NONE), cfs.metadata, cfs));
+                cfs.compactionStrategyManager.forceReload();
                 return null;
             }, true, false);
         }
@@ -2396,7 +2413,7 @@ public void run()
             }
         };
 
-        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true, true, CompactionInfo.StopTrigger.TRUNCATE);
+        runWithCompactionsDisabled(Executors.callable(truncateRunnable), true, true, AbstractTableOperation.StopTrigger.TRUNCATE);
 
         viewManager.build();
         logger.info("Truncate of {}.{} is complete", keyspace.getName(), name);
@@ -2429,14 +2446,14 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptV
         return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true);
     }
 
-    public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation, boolean interruptViews, CompactionInfo.StopTrigger trigger)
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, boolean interruptValidation, boolean interruptViews, AbstractTableOperation.StopTrigger trigger)
     {
         return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true, trigger);
     }
 
     public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes)
     {
-        return runWithCompactionsDisabled(callable, sstablesPredicate, interruptValidation, interruptViews, interruptIndexes, CompactionInfo.StopTrigger.NONE);
+        return runWithCompactionsDisabled(callable, sstablesPredicate, interruptValidation, interruptViews, interruptIndexes, AbstractTableOperation.StopTrigger.NONE);
     }
 
     /**
@@ -2449,7 +2466,7 @@ public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableR
      * @param interruptIndexes if we should interrupt compactions on indexes. NOTE: if you set this to true your sstablePredicate
      *                         must be able to handle LocalPartitioner sstables!
      */
-    public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes, CompactionInfo.StopTrigger trigger)
+    public <V> V runWithCompactionsDisabled(Callable<V> callable, Predicate<SSTableReader> sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes, AbstractTableOperation.StopTrigger trigger)
     {
         // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly,
         // and so we only run one major compaction at a time
@@ -2599,6 +2616,11 @@ public CompactionStrategyManager getCompactionStrategyManager()
         return compactionStrategyManager;
     }
 
+    public CompactionLogger getCompactionLogger()
+    {
+        return compactionStrategyManager == null ? null : compactionStrategyManager.compactionLogger;
+    }
+
     public void setCrcCheckChance(double crcCheckChance)
     {
         try
diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java
index abc9361862ef..9f6620ec567b 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspace.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java
@@ -234,7 +234,7 @@ private SystemKeyspace()
                 + "columnfamily_name text,"
                 + "compacted_at timestamp,"
                 + "keyspace_name text,"
-                + "rows_merged map<int, bigint>,"
+                + "rows_merged map<int, bigint>," // Note that we currently store partitions, not rows!
                 + "PRIMARY KEY ((id)))")
                 .defaultTimeToLive((int) TimeUnit.DAYS.toSeconds(7))
                 .build();
@@ -507,11 +507,12 @@ public static void updateCompactionHistory(UUID id,
                                                long compactedAt,
                                                long bytesIn,
                                                long bytesOut,
-                                               Map<Integer, Long> rowsMerged)
+                                               Map<Integer, Long> partitionsMerged)
     {
         // don't write anything when the history table itself is compacted, since that would in turn cause new compactions
         if (ksname.equals("system") && cfname.equals(COMPACTION_HISTORY))
             return;
+        // For historical reasons (pre 3.0 refactor) we call the final field rows_merged but we actually store partitions!
         String req = "INSERT INTO system.%s (id, keyspace_name, columnfamily_name, compacted_at, bytes_in, bytes_out, rows_merged) VALUES (?, ?, ?, ?, ?, ?, ?)";
         executeInternal(format(req, COMPACTION_HISTORY),
                         id,
@@ -520,7 +521,7 @@ public static void updateCompactionHistory(UUID id,
                         ByteBufferUtil.bytes(compactedAt),
                         bytesIn,
                         bytesOut,
-                        rowsMerged);
+                        partitionsMerged);
     }
 
     public static TabularData getCompactionHistory() throws OpenDataException
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index 3a40f0d771ba..6230237ca604 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
@@ -18,9 +18,13 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.*;
+import java.util.function.Function;
+
+import javax.annotation.Nullable;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.SerializationHeader;
@@ -45,6 +49,7 @@
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.TableMetadata;
 
 /**
  * Pluggable compaction strategy determines how SSTables get merged.
@@ -95,11 +100,17 @@ public abstract class AbstractCompactionStrategy
      */
     protected boolean isActive = false;
 
+    /**
+     * This class groups all the compaction tasks that are pending, submitted, in progress and completed.
+     */
+    protected final BackgroundCompactions backgroundCompactions;
+
     protected AbstractCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
     {
         assert cfs != null;
         this.cfs = cfs;
         this.options = ImmutableMap.copyOf(options);
+        this.backgroundCompactions = new BackgroundCompactions(this, cfs);
 
         /* checks must be repeated here, as user supplied strategies might not call validateOptions directly */
 
@@ -126,6 +137,11 @@ protected AbstractCompactionStrategy(ColumnFamilyStore cfs, Map<String, String>
         directories = cfs.getDirectories();
     }
 
+    public BackgroundCompactions getBackgroundCompactions()
+    {
+        return backgroundCompactions;
+    }
+
     public Directories getDirectories()
     {
         return directories;
@@ -172,7 +188,124 @@ public void shutdown()
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
-    public abstract AbstractCompactionTask getNextBackgroundTask(final int gcBefore);
+    public abstract AbstractCompactionTask getNextBackgroundTask(int gcBefore);
+
+    /**
+     * Helper base class for strategies that provide CompactionAggregates, implementing the typical
+     * getNextBackgroundTask logic based on a getNextBackgroundAggregate method.
+     */
+    protected static abstract class WithAggregates extends AbstractCompactionStrategy
+    {
+        protected WithAggregates(ColumnFamilyStore cfs, Map<String, String> options)
+        {
+            super(cfs, options);
+        }
+
+        @Override
+        @SuppressWarnings("resource")
+        public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+        {
+            CompactionPick previous = null;
+            while (true)
+            {
+                CompactionAggregate compaction = getNextBackgroundAggregate(gcBefore);
+                if (compaction == null || compaction.isEmpty())
+                    return null;
+
+                // Already tried acquiring references without success. It means there is a race with
+                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
+                if (compaction.getSelected().equals(previous))
+                {
+                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
+                                "unless it happens frequently, in which case it must be reported. Will retry later.",
+                                compaction.getSelected());
+                    return null;
+                }
+
+                LifecycleTransaction transaction = cfs.getTracker().tryModify(compaction.getSelected().sstables, OperationType.COMPACTION);
+                if (transaction != null)
+                {
+                    backgroundCompactions.setSubmitted(transaction.opId(), compaction);
+                    return createCompactionTask(gcBefore, transaction, compaction);
+                }
+
+                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
+                // no longer present (due to races in getting the notification), or because we still haven't
+                // received any replace notifications. Remove any non-live sstables we track and try again.
+                removeDeadSSTables();
+
+                previous = compaction.getSelected();
+            }
+        }
+
+        /**
+         * Select the next compaction to perform. This method is typically synchronized.
+         */
+        protected abstract CompactionAggregate getNextBackgroundAggregate(int gcBefore);
+
+        protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, CompactionAggregate compaction)
+        {
+            return CompactionTask.forCompaction(this, txn, gcBefore);
+        }
+
+        @Override
+        public int getEstimatedRemainingTasks()
+        {
+            return backgroundCompactions.getEstimatedRemainingTasks();
+        }
+    }
+
+    /**
+     * Helper base class for (older, deprecated) strategies that provide a list of tables to compact, implementing the
+     * typical getNextBackgroundTask logic based on a getNextBackgroundSSTables method.
+     */
+    protected static abstract class WithSSTableList extends AbstractCompactionStrategy
+    {
+        protected WithSSTableList(ColumnFamilyStore cfs, Map<String, String> options)
+        {
+            super(cfs, options);
+        }
+
+        @Override
+        @SuppressWarnings("resource")
+        public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+        {
+            List<SSTableReader> previousCandidate = null;
+            while (true)
+            {
+                List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
+
+                if (latestBucket.isEmpty())
+                    return null;
+
+                // Already tried acquiring references without success. It means there is a race with
+                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
+                if (latestBucket.equals(previousCandidate))
+                {
+                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
+                                "unless it happens frequently, in which case it must be reported. Will retry later.",
+                                latestBucket);
+                    return null;
+                }
+
+                LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
+                if (modifier != null)
+                    return createCompactionTask(gcBefore, modifier, false, false);
+
+                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
+                // no longer present (due to races in getting the notification), or because we still haven't
+                // received any replace notifications. Remove any non-live sstables we track and try again.
+                removeDeadSSTables();
+
+                previousCandidate = latestBucket;
+            }
+        }
+
+        /**
+         * Select the next tables to compact. This method is typically synchronized.
+         */
+        protected abstract List<SSTableReader> getNextBackgroundSSTables(final int gcBefore);
+    }
 
     /**
      * @param gcBefore throw away tombstones older than this
@@ -182,7 +315,19 @@ public void shutdown()
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
-    public abstract Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore, boolean splitOutput);
+    @SuppressWarnings("resource")
+    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
+    {
+        // Drop tracked sstables that are no longer in the live set before picking the candidates.
+        removeDeadSSTables();
+        Iterable<SSTableReader> candidates = filterSuspectSSTables(getSSTables());
+        if (Iterables.isEmpty(candidates))
+            return null; // no non-suspect sstable to compact
+        LifecycleTransaction maximalTxn = cfs.getTracker().tryModify(candidates, OperationType.COMPACTION);
+        if (maximalTxn == null)
+            return null; // the tracker did not hand out a transaction for the candidates
+        return Collections.singleton(createCompactionTask(gcBefore, maximalTxn, true, splitOutput));
+    }
 
     /**
      * @param sstables SSTables to compact. Must be marked as compacting.
@@ -193,18 +338,81 @@ public void shutdown()
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
-    public abstract AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, final int gcBefore);
+    @SuppressWarnings("resource")
+    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
+    {
+        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
+
+        LifecycleTransaction userTxn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
+        if (userTxn != null)
+            return createCompactionTask(gcBefore, userTxn, false, false).setUserDefined(true);
+
+        // tryModify() gave us no transaction, so the sstables could not be marked compacting:
+        // trace it and hand a null result back to the caller.
+        logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
+        return null;
+    }
 
-    public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, final int gcBefore, long maxSSTableBytes)
+    /**
+     * Create a compaction task for a maximal, user defined or background compaction without aggregates (legacy strategies).
+     * Background compactions for strategies that extend {@link WithAggregates} will use
+     * {@link WithAggregates#createCompactionTask(int, LifecycleTransaction, boolean, boolean)} instead.
+     *
+     * @param gcBefore tombstone threshold, older tombstones can be discarded
+     * @param txn the transaction containing the files to be compacted
+     * @param isMaximal set to true only when it's a maximal compaction
+     * @param splitOutput false except for maximal compactions, where it is passed in by the user to tell SizeTieredCompactionStrategy to split the output;
+     *                    ignored otherwise
+     *
+     * @return a compaction task, see {@link AbstractCompactionTask} and sub-classes
+     */
+    protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
-        return new CompactionTask(cfs, txn, gcBefore);
+        return CompactionTask.forCompaction(this, txn, gcBefore);
     }
 
     /**
-     * @return the number of background tasks estimated to still be needed for this columnfamilystore
+     * Create a compaction task for operations that are not driven by the strategies.
+     *
+     * @param txn the transaction containing the files to be compacted
+     * @param gcBefore tombstone threshold, older tombstones can be discarded
+     * @param maxSSTableBytes the maximum size in bytes for an output sstable
+     *
+     * @return a compaction task, see {@link AbstractCompactionTask} and sub-classes
+     */
+    public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, final int gcBefore, long maxSSTableBytes)
+    {
+        return CompactionTask.forCompaction(this, txn, gcBefore); // maxSSTableBytes is not used by this implementation
+    }
+
+    /**
+     * Get the estimated remaining compactions. Strategies that implement {@link WithAggregates} can delegate this
+     * to {@link BackgroundCompactions} because they set the pending aggregates as background compactions but legacy
+     * strategies that do not support aggregates must implement this method.
+     * <p/>
+     * @return the number of background tasks estimated to still be needed for this strategy
      */
     public abstract int getEstimatedRemainingTasks();
 
+    /** @return the total number of background compactions: the estimated remaining ones plus those in progress */
+    public int getTotalCompactions()
+    {
+        final int pending = getEstimatedRemainingTasks();
+        final int inProgress = backgroundCompactions.getCompactionsInProgress();
+        return pending + inProgress;
+    }
+
+    /**
+     * Return the statistics. Only strategies that implement {@link WithAggregates} will provide non-empty statistics,
+     * the legacy strategies will always have empty statistics.
+     * <p/>
+     * @return statistics about the compactions picked by this strategy, as reported by {@code backgroundCompactions}.
+     */
+    public CompactionStrategyStatistics getStatistics()
+    {
+        return backgroundCompactions.getStatistics();
+    }
+
     /**
      * @return size in bytes of the largest sstables for this strategy
      */
@@ -227,11 +435,16 @@ public static List<SSTableReader> filterSuspectSSTables(Iterable<? extends SSTab
         return filtered;
     }
 
+    public static Iterable<SSTableReader> nonSuspectAndNotIn(Iterable<SSTableReader> tables, Set<SSTableReader> compacting)
+    {
+        return Iterables.filter(tables, sstable -> !(sstable.isMarkedSuspect() || compacting.contains(sstable)));
+    }
 
     public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
     {
         return range == null ? getScanners(sstables, (Collection<Range<Token>>)null) : getScanners(sstables, Collections.singleton(range));
     }
+
     /**
      * Returns a list of KeyScanners given sstables and a range on which to scan.
      * The default implementation simply grab one SSTableScanner per-sstable, but overriding this method
@@ -259,18 +472,18 @@ public String getName()
         return getClass().getSimpleName();
     }
 
+    public TableMetadata getMetadata()
+    {
+        return cfs.metadata(); // the metadata reported by this strategy's ColumnFamilyStore
+    }
+
     /**
      * Replaces sstables in the compaction strategy
      *
      * Note that implementations must be able to handle duplicate notifications here (that removed are already gone and
      * added have already been added)
      * */
-    public synchronized void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added)
-    {
-        for (SSTableReader remove : removed)
-            removeSSTable(remove);
-        addSSTables(added);
-    }
+    public abstract void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added);
 
     /**
      * Adds sstable, note that implementations must handle duplicate notifications here (added already being in the compaction strategy)
@@ -286,6 +499,37 @@ public synchronized void addSSTables(Iterable<SSTableReader> added)
             addSSTable(sstable);
     }
 
+    /**
+     * Remove any tracked sstable that is no longer in the live set. Note that because we get notifications after the
+     * tracker is modified, anything we know of must be already in the live set -- if it is not, it has been removed
+     * from there, and we either haven't received the removal notification yet, or we did and we messed it up (i.e.
+     * we got it before the addition). The former is transient, but the latter can cause persistent problems, including
+     * fully stopping compaction. In any case, we should remove any such sstables.
+     * There are two special-case implementations of this in MemoryOnlyStrategy and LeveledManifest.
+     */
+    abstract void removeDeadSSTables();
+
+    void removeDeadSSTables(Iterable<SSTableReader> sstables)
+    {
+        // The whole scan runs while holding the monitor of the passed-in collection.
+        synchronized (sstables)
+        {
+            final Set<SSTableReader> live = cfs.getLiveSSTables();
+            int dropped = 0;
+            Iterator<SSTableReader> cursor = sstables.iterator();
+            while (cursor.hasNext())
+            {
+                if (live.contains(cursor.next()))
+                    continue;
+                cursor.remove();
+                dropped++;
+            }
+
+            if (dropped > 0)
+                logger.debug("Removed {} dead sstables from the compactions tracked list.", dropped);
+        }
+    }
+
     /**
      * Removes sstable from the strategy, implementations must be able to handle the sstable having already been removed.
      */
@@ -375,6 +619,27 @@ public ScannerList getScanners(Collection<SSTableReader> toCompact)
         return getScanners(toCompact, (Collection<Range<Token>>)null);
     }
 
+    /**
+     * Pick one sstable for a tombstone-removing compaction out of {@code candidates}: only the candidates
+     * accepted by {@code worthDroppingTombstones} are offered to {@code selector}, which chooses among them.
+     * Returns null if no candidate is suitable.
+     */
+    @Nullable
+    CompactionAggregate makeTombstoneCompaction(int gcBefore,
+                                                Iterable<SSTableReader> candidates,
+                                                Function<Collection<SSTableReader>, SSTableReader> selector)
+    {
+        List<SSTableReader> droppable = new ArrayList<>();
+        for (SSTableReader candidate : candidates)
+            if (worthDroppingTombstones(candidate, gcBefore))
+                droppable.add(candidate);
+
+        // The selector is only consulted when at least one candidate qualified.
+        return droppable.isEmpty()
+               ? null
+               : CompactionAggregate.createForTombstones(selector.apply(droppable));
+    }
+
     /**
      * Check if given sstable is worth dropping tombstones at gcBefore.
      * Check is skipped if tombstone_compaction_interval time does not elapse since sstable creation and returns false.
@@ -516,7 +781,7 @@ public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Coll
     {
         int groupSize = 2;
         List<SSTableReader> sortedSSTablesToGroup = new ArrayList<>(sstablesToGroup);
-        Collections.sort(sortedSSTablesToGroup, SSTableReader.sstableComparator);
+        Collections.sort(sortedSSTablesToGroup, SSTableReader.firstKeyComparator);
 
         Collection<Collection<SSTableReader>> groupedSSTables = new ArrayList<>();
         Collection<SSTableReader> currGroup = new ArrayList<>(groupSize);
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
index 989c21c27c50..4d14b22fb87f 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
@@ -28,15 +28,21 @@
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.io.FSDiskFullWriteError;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 
+import static com.google.common.base.Throwables.propagate;
+
+
 public abstract class AbstractCompactionTask extends WrappedRunnable
 {
     protected final ColumnFamilyStore cfs;
     protected LifecycleTransaction transaction;
     protected boolean isUserDefined;
     protected OperationType compactionType;
+    protected TableOperationObserver opObserver;
+    protected CompactionObserver compObserver;
 
     /**
      * @param cfs
@@ -48,12 +54,22 @@ public AbstractCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transa
         this.transaction = transaction;
         this.isUserDefined = false;
         this.compactionType = OperationType.COMPACTION;
-        // enforce contract that caller should mark sstables compacting
-        Set<SSTableReader> compacting = transaction.tracker.getCompacting();
-        for (SSTableReader sstable : transaction.originals())
-            assert compacting.contains(sstable) : sstable.getFilename() + " is not correctly marked compacting";
+        this.opObserver = TableOperationObserver.NOOP;
+        this.compObserver = CompactionObserver.NO_OP;
+
+        try
+        {
+            // enforce contract that caller should mark sstables compacting
+            Set<SSTableReader> compacting = transaction.getCompacting();
+            for (SSTableReader sstable : transaction.originals())
+                assert compacting.contains(sstable) : sstable.getFilename() + " is not correctly marked compacting";
 
-        validateSSTables(transaction.originals());
+            validateSSTables(transaction.originals());
+        }
+        catch (Throwable err)
+        {
+            propagate(cleanup(err));
+        }
     }
 
     /**
@@ -91,13 +107,20 @@ private void validateSSTables(Set<SSTableReader> sstables)
     }
 
     /**
-     * executes the task and unmarks sstables compacting
+     * Executes the task after setting a new observer, normally the observer is the
+     * compaction manager metrics.
      */
-    public int execute(ActiveCompactionsTracker activeCompactions)
+    public int execute(TableOperationObserver observer)
+    {
+        return setOpObserver(observer).execute();
+    }
+
+    /** Executes the task */
+    public int execute()
     {
         try
         {
-            return executeInternal(activeCompactions);
+            return executeInternal();
         }
         catch(FSDiskFullWriteError e)
         {
@@ -107,12 +130,22 @@ public int execute(ActiveCompactionsTracker activeCompactions)
         }
         finally
         {
-            transaction.close();
+            Throwables.maybeFail(cleanup(null));
         }
     }
+
+    private Throwable cleanup(Throwable err) // err: a failure already in flight, or null when there is none
+    {
+        // NOTE(review): presumably perform() runs both actions and folds any failure into err - confirm in Throwables
+        return Throwables.perform(err,
+                                  () -> compObserver.setCompleted(transaction.opId()),
+                                  () -> transaction.close());
+    }
+
     public abstract CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables);
 
-    protected abstract int executeInternal(ActiveCompactionsTracker activeCompactions);
+    protected abstract int executeInternal();
+
+    // TODO Eventually these three setters should be passed in to the constructor.
 
     public AbstractCompactionTask setUserDefined(boolean isUserDefined)
     {
@@ -126,6 +159,15 @@ public AbstractCompactionTask setCompactionType(OperationType compactionType)
         return this;
     }
 
+    /**
+     * Replace the operation observer, which the constructor sets to a no-op; normally the compaction metrics. Returns this task.
+     */
+    AbstractCompactionTask setOpObserver(TableOperationObserver opObserver)
+    {
+        this.opObserver = opObserver;
+        return this;
+    }
+
     public String toString()
     {
         return "CompactionTask(" + transaction + ")";
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractTableOperation.java b/src/java/org/apache/cassandra/db/compaction/AbstractTableOperation.java
new file mode 100644
index 000000000000..82ff5e4c49f9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractTableOperation.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.Serializable;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Predicate;
+
+import com.google.common.collect.ImmutableSet;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * This is an abstract base class implementing some default methods of {@link TableOperation}.
+ * <p/>
+ * In previous versions it used to be called CompactionInfo and CompactionInfo.Holder.
+ * <p/>
+ * The nested {@link OperationProgress} implements {@link Serializable} to allow structured info to be returned via JMX.
+ **/
+public abstract class AbstractTableOperation implements TableOperation
+{
+    private volatile boolean stopRequested = false;
+    private volatile StopTrigger trigger = StopTrigger.NONE;
+
+    /**
+     * Interrupt the current operation if possible.
+     */
+    public void stop()
+    {
+        stopRequested = true;
+    }
+
+    /**
+     * Interrupt the current operation if possible, recording what triggered the interruption.
+     * The recorded trigger is only replaced while the current one is not final, see {@code StopTrigger.isFinal()}.
+     * @param trigger cause of compaction interruption
+     */
+    public void stop(StopTrigger trigger)
+    {
+        this.stopRequested = true;
+        if (!this.trigger.isFinal())
+            this.trigger = trigger;
+    }
+
+    /**
+     * @return true if the operation has received a request to be interrupted.
+     */
+    public boolean isStopRequested()
+    {
+        return stopRequested || (isGlobal() && CompactionManager.instance.isGlobalCompactionPaused());
+    }
+
+    /**
+     * Return true if the predicate holds for at least one of the sstables of this operation, or if the
+     * operation does not consider any sstables, in which case it will always return true (the
+     * default behaviour).
+     */
+    public boolean shouldStop(Predicate<SSTableReader> predicate)
+    {
+        OperationProgress progress = getProgress();
+        if (progress.sstables.isEmpty())
+        {
+            return true;
+        }
+        return progress.sstables.stream().anyMatch(predicate);
+    }
+
+    /**
+     * @return cause of compaction interruption.
+     */
+    public StopTrigger trigger()
+    {
+        return trigger;
+    }
+
+    /**
+     * The progress information for an operation, refer to the description of the class properties.
+     */
+    public static final class OperationProgress implements Serializable, Progress
+    {
+        private static final long serialVersionUID = 3695381572726744816L;
+
+        /**
+         * The table metadata
+         */
+        private final TableMetadata metadata;
+        /**
+         * The type of operation
+         */
+        private final OperationType operationType;
+        /**
+         * Normally the bytes processed so far by this operation, but depending on the unit it could mean something else, e.g. ranges or keys.
+         */
+        private final long completed;
+        /**
+         * The total bytes that need to be processed, for example the size of the input files. Depending on the unit it could mean something else, e.g. ranges or keys.
+         */
+        private final long total;
+        /**
+         * The unit for {@link #completed} and for {@link #total}.
+         */
+        private final Unit unit;
+        /**
+         * A unique ID for this operation
+         */
+        private final UUID operationId;
+        /**
+         * A set of SSTables participating in this operation
+         */
+        private final ImmutableSet<SSTableReader> sstables;
+
+        public OperationProgress(TableMetadata metadata, OperationType operationType, long bytesComplete, long totalBytes, UUID operationId, Collection<SSTableReader> sstables)
+        {
+            this(metadata, operationType, bytesComplete, totalBytes, Unit.BYTES, operationId, sstables);
+        }
+
+        public OperationProgress(TableMetadata metadata, OperationType operationType, long completed, long total, Unit unit, UUID operationId, Collection<? extends SSTableReader> sstables)
+        {
+            this.operationType = operationType;
+            this.completed = completed;
+            this.total = total;
+            this.metadata = metadata;
+            this.unit = unit;
+            this.operationId = operationId;
+            this.sstables = ImmutableSet.copyOf(sstables);
+        }
+
+        /**
+         * @return A copy of this OperationProgress with updated progress.
+         */
+        public OperationProgress forProgress(long complete, long total)
+        {
+            return new OperationProgress(metadata, operationType, complete, total, unit, operationId, sstables);
+        }
+
+        public static OperationProgress withoutSSTables(TableMetadata metadata, OperationType tasktype, long completed, long total, AbstractTableOperation.Unit unit, UUID compactionId)
+        {
+            return new OperationProgress(metadata, tasktype, completed, total, unit, compactionId, ImmutableSet.of());
+        }
+
+        @Override
+        public Optional<String> keyspace()
+        {
+            return metadata != null ? Optional.of(metadata.keyspace) : Optional.empty();
+        }
+
+        @Override
+        public Optional<String> table()
+        {
+            return metadata != null ? Optional.of(metadata.name) : Optional.empty();
+        }
+
+        @Override
+        public TableMetadata metadata()
+        {
+            return metadata;
+        }
+
+        @Override
+        public long completed()
+        {
+            return completed;
+        }
+
+        @Override
+        public long total()
+        {
+            return total;
+        }
+
+        @Override
+        public OperationType operationType()
+        {
+            return operationType;
+        }
+
+        @Override
+        public UUID operationId()
+        {
+            return operationId;
+        }
+
+        @Override
+        public Unit unit()
+        {
+            return unit;
+        }
+
+        @Override
+        public Set<SSTableReader> sstables()
+        {
+            return sstables;
+        }
+
+        public String toString()
+        {
+            StringBuilder buff = new StringBuilder();
+            buff.append(String.format("%s(%s, %s / %s %s)", operationType, operationId, completed, total, unit));
+            if (metadata != null)
+            {
+                buff.append(String.format("@%s(%s, %s)", metadata.id, metadata.keyspace, metadata.name));
+            }
+            return buff.toString();
+        }
+
+        public Map<String, String> asMap()
+        {
+            Map<String, String> ret = new HashMap<>(8);
+            ret.put(ID, metadata != null ? metadata.id.toString() : "");
+            ret.put(KEYSPACE, keyspace().orElse(null)); // null value when there is no metadata
+            ret.put(COLUMNFAMILY, table().orElse(null)); // null value when there is no metadata
+            ret.put(COMPLETED, Long.toString(completed));
+            ret.put(TOTAL, Long.toString(total));
+            ret.put(OPERATION_TYPE, operationType.toString());
+            ret.put(UNIT, unit.toString());
+            ret.put(OPERATION_ID, operationId == null ? "" : operationId.toString());
+            return ret;
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java b/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
deleted file mode 100644
index abaad6349036..000000000000
--- a/src/java/org/apache/cassandra/db/compaction/ActiveCompactions.java
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db.compaction;
-
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.IdentityHashMap;
-import java.util.List;
-import java.util.Set;
-
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-
-public class ActiveCompactions implements ActiveCompactionsTracker
-{
-    // a synchronized identity set of running tasks to their compaction info
-    private final Set<CompactionInfo.Holder> compactions = Collections.synchronizedSet(Collections.newSetFromMap(new IdentityHashMap<>()));
-
-    public List<CompactionInfo.Holder> getCompactions()
-    {
-        return new ArrayList<>(compactions);
-    }
-
-    public void beginCompaction(CompactionInfo.Holder ci)
-    {
-        compactions.add(ci);
-    }
-
-    public void finishCompaction(CompactionInfo.Holder ci)
-    {
-        compactions.remove(ci);
-        CompactionManager.instance.getMetrics().bytesCompacted.inc(ci.getCompactionInfo().getTotal());
-        CompactionManager.instance.getMetrics().totalCompactionsCompleted.mark();
-    }
-
-    /**
-     * Iterates over the active compactions and tries to find CompactionInfos with the given compactionType for the given sstable
-     *
-     * Number of entries in compactions should be small (< 10) but avoid calling in any time-sensitive context
-     */
-    public Collection<CompactionInfo> getCompactionsForSSTable(SSTableReader sstable, OperationType compactionType)
-    {
-        List<CompactionInfo> toReturn = null;
-        synchronized (compactions)
-        {
-            for (CompactionInfo.Holder holder : compactions)
-            {
-                CompactionInfo compactionInfo = holder.getCompactionInfo();
-                if (compactionInfo.getSSTables().contains(sstable) && compactionInfo.getTaskType() == compactionType)
-                {
-                    if (toReturn == null)
-                        toReturn = new ArrayList<>();
-                    toReturn.add(compactionInfo);
-                }
-            }
-        }
-        return toReturn;
-    }
-
-    /**
-     * @return true if given compaction is still active
-     */
-    public boolean isActive(CompactionInfo.Holder ci)
-    {
-        return compactions.contains(ci);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/compaction/ActiveOperations.java b/src/java/org/apache/cassandra/db/compaction/ActiveOperations.java
new file mode 100644
index 000000000000..d3da2caa5db9
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/ActiveOperations.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.IdentityHashMap;
+import java.util.List;
+import java.util.Set;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.NonThrowingCloseable;
+
+/** Keeps track of the table operations in progress; closing the handle returned on start updates the compaction metrics. */
+public class ActiveOperations implements TableOperationObserver
+{
+    // A synchronized identity set of the operations currently in progress (static, hence shared by all instances).
+    // Iterating or copying it must be done while holding its monitor, see Collections.synchronizedSet.
+    private static final Set<TableOperation> operations = Collections.synchronizedSet(Collections.newSetFromMap(new IdentityHashMap<>()));
+
+    /**
+     * @return all the table operations currently in progress. This is mostly compactions but it can include other
+     *         operations too, basically any operation that calls {@link #onOperationStart(TableOperation)}.
+     */
+    public List<TableOperation> getTableOperations()
+    {
+        synchronized (operations)
+        {
+            return ImmutableList.copyOf(operations);
+        }
+    }
+
+    @Override
+    public NonThrowingCloseable onOperationStart(TableOperation op)
+    {
+        operations.add(op);
+        return () -> { // runs when the returned handle is closed
+            operations.remove(op);
+            TableOperation.Progress progress = op.getProgress();
+            CompactionManager.instance.getMetrics().bytesCompacted.inc(progress.total());
+            CompactionManager.instance.getMetrics().totalCompactionsCompleted.mark();
+        };
+    }
+
+    /**
+     * Iterates over the active operations and tries to find OperationProgresses with the given operation type for the given sstable
+     * Returns null (not an empty collection) when no active operation matches.
+     * Number of entries in operations should be small (< 10) but avoid calling in any time-sensitive context
+     */
+    public Collection<AbstractTableOperation.OperationProgress> getOperationsForSSTable(SSTableReader sstable, OperationType operationType)
+    {
+        List<AbstractTableOperation.OperationProgress> toReturn = null;
+        synchronized (operations)
+        {
+            for (TableOperation op : operations)
+            {
+                AbstractTableOperation.OperationProgress progress = op.getProgress();
+                if (progress.sstables().contains(sstable) && progress.operationType() == operationType)
+                {
+                    if (toReturn == null)
+                        toReturn = new ArrayList<>();
+                    toReturn.add(progress);
+                }
+            }
+        }
+        return toReturn;
+    }
+
+    /** @return true if given table operation is still active */
+    public boolean isActive(TableOperation op)
+    {
+        return getTableOperations().contains(op);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java b/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
new file mode 100644
index 000000000000..d63229e1e594
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
@@ -0,0 +1,268 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+
+import com.google.common.collect.ImmutableList;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * A class for grouping the background compactions picked by a strategy, either pending or in progress.
+ * <p>
+ * The aggregates are kept in a map that is never modified once it has been published: every update builds
+ * a new map while holding the monitor of this instance and then replaces the volatile reference, so that
+ * the methods reading the aggregates without taking any lock always work on a stable snapshot.
+ */
+class BackgroundCompactions implements CompactionObserver
+{
+    private static final Logger logger = LoggerFactory.getLogger(BackgroundCompactions.class);
+
+    /** The parent strategy */
+    private final AbstractCompactionStrategy strategy;
+
+    /** The table metadata */
+    private final TableMetadata metadata;
+
+    /** The compaction logger */
+    private final CompactionLogger compactionLogger;
+
+    /**
+     * The compaction aggregates with either pending or ongoing compactions, or both. The map is replaced and
+     * never updated in place because it is read without synchronization.
+     */
+    private volatile TreeMap<Long, CompactionAggregate> aggregates = new TreeMap<>();
+
+    /**  The ongoing compactions grouped by unique operation ID. */
+    private final ConcurrentHashMap<UUID, CompactionPick> compactions = new ConcurrentHashMap<>();
+
+    /**
+     * @param strategy the strategy that picks the compactions tracked by this instance
+     * @param cfs the column family store, its compaction strategy manager must already be available
+     *
+     * @throws IllegalStateException if the compaction strategy manager of the store is still null
+     */
+    BackgroundCompactions(AbstractCompactionStrategy strategy, ColumnFamilyStore cfs)
+    {
+        if (cfs.getCompactionStrategyManager() == null)
+            throw new IllegalStateException("Compaction strategy manager should be set in the CFS first");
+
+        this.strategy = strategy;
+        this.metadata = cfs.metadata();
+        this.compactionLogger = cfs.getCompactionStrategyManager().compactionLogger();
+    }
+
+    /**
+     * Updates the list of pending compactions, while preserving the set of running ones. This is done
+     * by creating new aggregates with the pending aggregates but adding any existing aggregates with
+     * compactions in progress. If there is a matching pending aggregate then the existing compactions
+     * are transferred to it, otherwise the old aggregate is stripped of its pending compactions and then
+     * it is kept with the compactions in progress only.
+     *
+     * @param pending compaction aggregates with pending compactions
+     *
+     * @throws IllegalArgumentException if the argument is null or if two pending aggregates have the same key
+     */
+    synchronized void setPending(List<CompactionAggregate> pending)
+    {
+        if (pending == null)
+            throw new IllegalArgumentException("argument cannot be null");
+
+        if (logger.isTraceEnabled())
+            logger.trace("Resetting pending aggregates for strategy {}/{}, received {} new aggregates",
+                         strategy.getName(), strategy.hashCode(), pending.size());
+
+        // First create a new map with all the pending aggregates
+        TreeMap<Long, CompactionAggregate> updated = new TreeMap<>();
+        for (CompactionAggregate aggregate : pending)
+        {
+            CompactionAggregate prev = updated.put(aggregate.getKey(), aggregate);
+            if (logger.isTraceEnabled())
+                logger.trace("Adding new pending aggregate: {}", aggregate);
+
+            if (prev != null)
+                throw new IllegalArgumentException("Received pending aggregates with non unique keys: " + prev.getKey());
+        }
+
+        // Then add the current aggregates with ongoing compactions
+        for (CompactionAggregate oldAggregate : this.aggregates.values())
+        {
+            Collection<CompactionPick> compacting = oldAggregate.getInProgress();
+            if (compacting.isEmpty())
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace("Existing aggregate {} has no in progress compactions, removing it", oldAggregate);
+
+                continue;
+            }
+
+            // See if we have a matching aggregate in the pending aggregates, if so add all the existing compactions to it
+            // otherwise strip the pending and selected compactions from the old one and keep it only with the compactions in progress
+            CompactionAggregate newAggregate;
+            CompactionAggregate matchingAggregate = oldAggregate.getMatching(updated);
+            if (matchingAggregate != null)
+            {
+                // add the old compactions to the new aggregate
+                // the key will change slightly for STCS so remove it before adding it again
+                updated.remove(matchingAggregate.getKey());
+                newAggregate = matchingAggregate.withAdditionalCompactions(compacting);
+
+                if (logger.isTraceEnabled())
+                    logger.trace("Removed matching aggregate {}", matchingAggregate);
+            }
+            else
+            {
+                // keep the old aggregate but only with the compactions already in progress and not yet completed
+                newAggregate = oldAggregate.withOnlyTheseCompactions(compacting);
+
+                if (logger.isTraceEnabled())
+                    logger.trace("Keeping old aggregate but only with compactions {}", oldAggregate);
+            }
+
+            if (logger.isTraceEnabled())
+                logger.trace("Adding new aggregate with previous compactions {}", newAggregate);
+
+            updated.put(newAggregate.getKey(), newAggregate);
+        }
+
+        // Finally publish the new aggregates
+        this.aggregates = updated;
+
+        if (compactionLogger != null && compactionLogger.enabled())
+        {
+            compactionLogger.pending(strategy, getEstimatedRemainingTasks());
+            compactionLogger.statistics(strategy, "pending", getStatistics());
+        }
+    }
+
+    /**
+     * Records that the selected compaction of the aggregate has been submitted with the given id and makes
+     * sure that the compaction is part of the current aggregates.
+     *
+     * @param id the unique id of the compaction operation
+     * @param aggregate the aggregate whose selected compaction was submitted
+     *
+     * @throws IllegalArgumentException if an argument is null or if a compaction with the same id is
+     *                                  already tracked, in which case the tracked compaction is left untouched
+     */
+    @Override
+    public void setSubmitted(UUID id, CompactionAggregate aggregate)
+    {
+        if (id == null || aggregate == null)
+            throw new IllegalArgumentException("arguments cannot be null");
+
+        logger.debug("Submitting background compaction {}", id);
+        CompactionPick compaction = aggregate.getSelected();
+
+        // putIfAbsent so that a duplicated id is rejected without replacing the compaction already tracked
+        CompactionPick prev = compactions.putIfAbsent(id, compaction);
+        if (prev != null)
+            throw new IllegalArgumentException("Found existing compaction with same id: " + id);
+
+        compaction.setSubmitted(id);
+
+        synchronized (this)
+        {
+            // Work on a private copy and publish it at the end: the aggregates are read without holding
+            // the lock and a TreeMap must not be modified while another thread is reading it
+            TreeMap<Long, CompactionAggregate> updated = new TreeMap<>(this.aggregates);
+            CompactionAggregate existingAggregate = aggregate.getMatching(updated);
+
+            if (existingAggregate == null)
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace("Could not find aggregate for compaction using the one passed in: {}", aggregate);
+
+                updated.put(aggregate.getKey(), aggregate);
+            }
+            else
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace("Found aggregate for compaction: {}", existingAggregate);
+
+                if (!existingAggregate.getActive().contains(compaction))
+                {
+                    // add the compaction just submitted to the aggregate that was found but because for STCS its
+                    // key may change slightly, first remove it
+                    updated.remove(existingAggregate.getKey());
+                    CompactionAggregate newAggregate = existingAggregate.withAdditionalCompactions(ImmutableList.of(compaction));
+                    updated.put(newAggregate.getKey(), newAggregate);
+
+                    if (logger.isTraceEnabled())
+                        logger.trace("Added compaction to existing aggregate: {} -> {}", existingAggregate, newAggregate);
+                }
+                else
+                {
+                    if (logger.isTraceEnabled())
+                        logger.trace("Existing aggregate {} already had compaction", existingAggregate);
+                }
+            }
+
+            this.aggregates = updated;
+        }
+
+        if (compactionLogger != null && compactionLogger.enabled())
+            compactionLogger.statistics(strategy, "submitted", getStatistics());
+    }
+
+    /**
+     * Attaches the progress to the compaction tracked with the same operation id. If no compaction is
+     * tracked with that id, a new one is created from the input sstables of the progress and tracked.
+     *
+     * @param progress the progress of the compaction that is running
+     *
+     * @throws IllegalArgumentException if the argument is null
+     */
+    @Override
+    public void setInProgress(CompactionProgress progress)
+    {
+        if (progress == null)
+            throw new IllegalArgumentException("argument cannot be null");
+
+        CompactionPick compaction = compactions.computeIfAbsent(progress.operationId(),
+                                                                uuid -> CompactionPick.create(-1, progress.inSSTables()));
+
+        logger.debug("Setting background compaction {} as in progress", progress.operationId());
+        compaction.setProgress(progress);
+    }
+
+    /**
+     * Stops tracking the compaction with the given id and marks it as completed, if it was tracked.
+     *
+     * @param id the unique id of the compaction operation that has completed
+     *
+     * @throws IllegalArgumentException if the argument is null
+     */
+    @Override
+    public void setCompleted(UUID id)
+    {
+        if (id == null)
+            throw new IllegalArgumentException("argument cannot be null");
+
+        logger.debug("Removing compaction {}", id);
+
+        // log the statistics before completing the compaction so that we see the stats for the
+        // compaction that just completed
+        if (compactionLogger != null && compactionLogger.enabled())
+            compactionLogger.statistics(strategy, "completed", getStatistics());
+
+        CompactionPick completed = compactions.remove(id);
+        if (completed != null)
+            completed.setCompleted();
+
+        // We rely on setPending() to refresh the aggregates again even though in some cases it may not be
+        // called immediately (e.g. compactions disabled)
+    }
+
+    /**
+     * @return the number of background compactions estimated to still be needed
+     */
+    public int getEstimatedRemainingTasks()
+    {
+        return CompactionAggregate.numEstimatedCompactions(aggregates);
+    }
+
+    /**
+     * @return the number of compactions currently in progress
+     */
+    public int getCompactionsInProgress()
+    {
+        return compactions.size();
+    }
+
+    /**
+     * @return the total number of background compactions, pending or in progress
+     */
+    public int getTotalCompactions()
+    {
+        return getCompactionsInProgress() + getEstimatedRemainingTasks();
+    }
+
+    /**
+     * Return the compaction statistics for this strategy.
+     *
+     * @return statistics about this compaction strategy.
+     */
+    public CompactionStrategyStatistics getStatistics()
+    {
+        return CompactionAggregate.getStatistics(metadata, strategy, aggregates);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java b/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
new file mode 100644
index 000000000000..125bf68b26fc
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
@@ -0,0 +1,853 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * A compaction aggregate is either a level in {@link LeveledCompactionStrategy} or a tier (bucket) in other
+ * compaction strategies.
+ * <p>
+ * It contains a list of {@link CompactionPick}, which are the compactions either in progress or pending.
+ * It also contains a selected {@link CompactionPick}, which is a compaction about to be submitted. The submitted
+ * compaction is also part of the compactions. Lastly, it contains a set of all the sstables in this aggregate,
+ * regardless of whether they need compaction.
+ */
+public abstract class CompactionAggregate
+{
+    private static final Logger logger = LoggerFactory.getLogger(CompactionAggregate.class);
+
+    /** The sstables in this aggregate, whether they are compaction candidates or not */
+    final Set<SSTableReader> sstables;
+
+    /** The compaction that was selected for this aggregate when it was created. It is also part of {@link #compactions}. */
+    final CompactionPick selected;
+
+    /** The compactions that are part of this aggregate, they could be pending or in progress. */
+    final LinkedHashSet<CompactionPick> compactions;
+
+    /**
+     * Creates an aggregate, copying the sstables and the compactions passed in into collections owned by this instance.
+     *
+     * @param sstables all the sstables of this aggregate
+     * @param selected the selected compaction, it becomes part of the compactions only when it is not empty
+     * @param pending the other compactions of this aggregate, none of them can be null or empty
+     *
+     * @throws IllegalArgumentException if an argument is null or if one of the pending compactions is null or empty
+     */
+    CompactionAggregate(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
+    {
+        if (sstables == null || selected == null || pending == null)
+            throw new IllegalArgumentException("Arguments cannot be null");
+
+        Set<SSTableReader> allSSTables = new HashSet<>();
+        for (SSTableReader sstable : sstables)
+            allSSTables.add(sstable);
+
+        // A linked set keeps the iteration order: pending compactions are normally ordered by the strategy
+        // and the selected compaction should come first
+        LinkedHashSet<CompactionPick> allCompactions = new LinkedHashSet<>();
+        if (!selected.isEmpty())
+            allCompactions.add(selected);
+
+        for (CompactionPick pick : pending)
+        {
+            if (pick == null || pick.isEmpty())
+                throw new IllegalArgumentException("Pending compactions should be valid compactions");
+
+            allCompactions.add(pick);
+        }
+
+        this.sstables = allSSTables;
+        this.selected = selected;
+        this.compactions = allCompactions;
+    }
+
+    /**
+     * @return the compaction that was passed in as selected when this aggregate was created, never null
+     */
+    public CompactionPick getSelected()
+    {
+        return this.selected;
+    }
+
+    /**
+     * @return a new list, in iteration order, with the compactions that have no compaction id yet, that is
+     *         the compactions that have not yet been submitted.
+     */
+    public List<CompactionPick> getPending()
+    {
+        return compactions.stream()
+                          .filter(pick -> pick.id == null)
+                          .collect(Collectors.toCollection(ArrayList::new));
+    }
+
+    /**
+     * @return a new list, in iteration order, with the compactions that have a compaction id, that is the
+     *         compactions already submitted, and that are not flagged as completed
+     */
+    public List<CompactionPick> getInProgress()
+    {
+        return compactions.stream()
+                          .filter(pick -> pick.id != null && !pick.completed)
+                          .collect(Collectors.toCollection(ArrayList::new));
+    }
+
+    /**
+     * @return a new list with all the compactions we have, in iteration order
+     */
+    public List<CompactionPick> getActive()
+    {
+        List<CompactionPick> snapshot = new ArrayList<>(compactions.size());
+        snapshot.addAll(compactions);
+        return snapshot;
+    }
+
+    /**
+     * @return true when there is not a single compaction in this aggregate
+     */
+    public boolean isEmpty()
+    {
+        return compactions.size() == 0;
+    }
+
+    /**
+     * Merge the pending compactions and the compactions in progress to create some aggregated statistics.
+     * The implementations in this file ignore the compactions that are already flagged as completed.
+     *
+     * @return the statistics for this compaction aggregate, see {@link CompactionAggregateStatistics}.
+     */
+    public abstract CompactionAggregateStatistics getStatistics();
+
+    /**
+     * @return the number of estimated compactions that are still pending, by default this is the number of
+     *         compactions returned by {@link #getPending()}.
+     */
+    public int numEstimatedCompactions()
+    {
+        List<CompactionPick> notYetSubmitted = getPending();
+        return notYetSubmitted.size();
+    }
+
+    /**
+     * @return a key that is specific to the concrete implementation, used for grouping compacting aggregates:
+     *         {@link Leveled} returns its level and {@link SizeTiered} returns its average size in bytes.
+     */
+    abstract long getKey();
+
+    /**
+     * Return a matching aggregate from the map passed in or null. Normally this is just a matter of finding
+     * the key in the map but for STCS we need to look at the possible min and maximum average sizes and so
+     * {@link SizeTiered} overrides this method.
+     *
+     * @param others a map of other aggregates
+     *
+     * @return an aggregate with the same key or null
+     */
+    @Nullable CompactionAggregate getMatching(TreeMap<Long, CompactionAggregate> others)
+    {
+        return others.get(getKey());
+    }
+
+    /**
+     * Create a new aggregate of the same concrete type as this one using the new parameters. The constructor
+     * copies the sstables and the compactions into new collections, the elements themselves are shared.
+     *
+     * @param sstables the sstables of the new aggregate
+     * @param selected the selected compaction of the new aggregate
+     * @param compactions the compactions of the new aggregate
+     *
+     * @return the new aggregate
+     */
+    protected abstract CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions);
+
+
+    /**
+     * Add expired sstables to the selected compaction pick and to the sstables of the aggregate.
+     *
+     * @param expired the expired sstables to add
+     *
+     * @return a new compaction aggregate, this aggregate is not changed
+     */
+    CompactionAggregate withExpired(Collection<SSTableReader> expired)
+    {
+        Iterable<SSTableReader> allSSTables = Iterables.concat(sstables, expired);
+        CompactionPick extendedPick = selected.withAddedSSTables(expired);
+        return clone(allSSTables, extendedPick, compactions);
+    }
+
+    /**
+     * Add existing compactions, and their sstables, to our own compactions and sstables.
+     *
+     * @param comps the compactions to add
+     *
+     * @return a new compaction aggregate, this aggregate is not changed
+     */
+    public CompactionAggregate withAdditionalCompactions(Collection<CompactionPick> comps)
+    {
+        List<SSTableReader> added = new ArrayList<>();
+        for (CompactionPick comp : comps)
+            added.addAll(comp.sstables);
+
+        Iterable<SSTableReader> allSSTables = Iterables.concat(this.sstables, added);
+        Iterable<CompactionPick> allCompactions = Iterables.concat(compactions, comps);
+        return clone(allSSTables, selected, allCompactions);
+    }
+
+    /**
+     * Only keep the compactions passed in and their sstables, strip everything else: the new aggregate has
+     * an empty selected compaction.
+     *
+     * @param comps the compactions to keep
+     *
+     * @return a new compaction aggregate, this aggregate is not changed
+     */
+    public CompactionAggregate withOnlyTheseCompactions(Collection<CompactionPick> comps)
+    {
+        List<SSTableReader> kept = new ArrayList<>();
+        for (CompactionPick comp : comps)
+            kept.addAll(comp.sstables);
+
+        return clone(kept, CompactionPick.EMPTY, comps);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(sstables, selected, compactions);
+    }
+
+    /**
+     * Two aggregates are equal when they have equal sstables, selected compaction and compactions.
+     */
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (this == obj)
+            return true;
+
+        if (obj instanceof CompactionAggregate)
+        {
+            CompactionAggregate other = (CompactionAggregate) obj;
+            return sstables.equals(other.sstables)
+                   && selected.equals(other.selected)
+                   && compactions.equals(other.compactions);
+        }
+
+        return false;
+    }
+
+    /**
+     * Contains information about a levelled compaction aggregate, this is equivalent to a level in {@link LeveledCompactionStrategy}.
+     */
+    public static final class Leveled extends CompactionAggregate
+    {
+        /** The current level number */
+        final int level;
+
+        /** The next level number */
+        final int nextLevel;
+
+        /** The score of this level as defined in {@link LeveledCompactionStrategy}. */
+        final double score;
+
+        /** The maximum size of each output sstable that will be produced by compaction, Long.MAX_VALUE if no maximum exists */
+        final long maxSSTableBytes;
+
+        /** How many more compactions this level is expected to perform. This is required because for LCS we cannot easily identify candidate
+         * sstables to put into the pending picks.
+         */
+        final int pendingCompactions;
+
+        Leveled(Iterable<SSTableReader> sstables,
+                CompactionPick selected,
+                Iterable<CompactionPick> compactions,
+                int level,
+                int nextLevel,
+                double score,
+                long maxSSTableBytes,
+                int pendingCompactions)
+        {
+            super(sstables, selected, compactions);
+
+            this.level = level;
+            this.nextLevel = nextLevel;
+            this.score = score;
+            this.maxSSTableBytes = maxSSTableBytes;
+            this.pendingCompactions = pendingCompactions;
+        }
+
+        /**
+         * Creates another level with the collections passed in and with the same level numbers, score,
+         * maximum sstable size and pending compactions as this one.
+         */
+        @Override
+        protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
+        {
+            return new Leveled(sstables,
+                               selected,
+                               compactions,
+                               this.level,
+                               this.nextLevel,
+                               this.score,
+                               this.maxSSTableBytes,
+                               this.pendingCompactions);
+        }
+
+        /**
+         * Sums up the compactions of this level that are not flagged as completed. The number of compactions
+         * starts from the pending compactions of the level, the throughputs are in bytes per second and
+         * they are zero when no duration was reported.
+         */
+        @Override
+        public CompactionAggregateStatistics getStatistics()
+        {
+            int totalCompactions = pendingCompactions;
+            int compactionsInProgress = 0;
+            int candidateSSTables = 0;
+            int compactingSSTables = 0;
+            long totalBytes = 0;
+            long bytesRead = 0;
+            long bytesReadFromLevel = 0;
+            long bytesWritten = 0;
+            long elapsedNanos = 0;
+
+            for (CompactionPick pick : compactions)
+            {
+                if (pick.completed)
+                    continue;
+
+                totalCompactions++;
+                candidateSSTables += pick.sstables.size();
+                totalBytes += pick.sstables.stream().mapToLong(SSTableReader::uncompressedLength).sum();
+
+                // a compaction with an id has been submitted
+                if (pick.id != null)
+                {
+                    compactionsInProgress++;
+                    compactingSSTables += pick.sstables.size();
+                }
+
+                if (pick.progress != null)
+                {
+                    bytesRead += pick.progress.uncompressedBytesRead();
+                    bytesReadFromLevel += pick.progress.uncompressedBytesRead(level);
+                    bytesWritten += pick.progress.uncompressedBytesWritten();
+                    elapsedNanos += pick.progress.durationInNanos();
+                }
+            }
+
+            double readThroughput = 0;
+            double writeThroughput = 0;
+            if (elapsedNanos != 0)
+            {
+                long nanosPerSecond = TimeUnit.SECONDS.toNanos(1);
+                readThroughput = ((double) bytesRead / elapsedNanos) * nanosPerSecond;
+                writeThroughput = ((double) bytesWritten / elapsedNanos) * nanosPerSecond;
+            }
+
+            return new LeveledCompactionStatistics(level,
+                                                   score,
+                                                   totalCompactions,
+                                                   compactionsInProgress,
+                                                   sstables.size(),
+                                                   candidateSSTables,
+                                                   compactingSSTables,
+                                                   getTotSizeBytes(sstables),
+                                                   readThroughput,
+                                                   writeThroughput,
+                                                   totalBytes,
+                                                   bytesRead,
+                                                   bytesReadFromLevel,
+                                                   bytesWritten);
+        }
+
+        /**
+         * @return the pending compactions this level was created with
+         */
+        @Override
+        public int numEstimatedCompactions()
+        {
+            return this.pendingCompactions;
+        }
+
+        /**
+         * @return true if this level has neither compactions nor pending compactions
+         */
+        @Override
+        public boolean isEmpty()
+        {
+            if (!super.isEmpty())
+                return false;
+
+            return pendingCompactions == 0;
+        }
+
+        /**
+         * @return the level number
+         */
+        @Override
+        long getKey()
+        {
+            return this.level;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("Level %d with %d sstables, %d compactions and %d pending",
+                                 level,
+                                 sstables.size(),
+                                 compactions.size(),
+                                 pendingCompactions);
+        }
+    }
+
+    /**
+     * Create a level where we have a compaction candidate: the candidates become the selected compaction
+     * of the level and no other compaction is added.
+     */
+    static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
+                                                     Collection<SSTableReader> candidates,
+                                                     int pendingCompactions,
+                                                     long maxSSTableBytes,
+                                                     int level,
+                                                     int nextLevel,
+                                                     double score)
+    {
+        CompactionPick selected = CompactionPick.create(level, candidates);
+        List<CompactionPick> noOtherCompactions = ImmutableList.of();
+
+        return new Leveled(all, selected, noOtherCompactions, level, nextLevel, score, maxSSTableBytes, pendingCompactions);
+    }
+
+    /**
+     * Create a level when we only have estimated tasks: the selected compaction is empty and the next
+     * level is the level that follows the one passed in.
+     */
+    static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
+                                                     int pendingCompactions,
+                                                     long maxSSTableBytes,
+                                                     int level,
+                                                     double score)
+    {
+        int nextLevel = level + 1;
+        List<CompactionPick> noOtherCompactions = ImmutableList.of();
+
+        return new Leveled(all, CompactionPick.EMPTY, noOtherCompactions, level, nextLevel, score, maxSSTableBytes, pendingCompactions);
+    }
+
+    /**
+     * Create a leveled aggregate when LCS is doing STCS on level 0
+     */
+    static CompactionAggregate.Leveled createLeveledForSTCS(Collection<SSTableReader> all,
+                                                            CompactionPick pick,
+                                                            int pendingCompactions,
+                                                            double score)
+    {
+        return new Leveled(all,
+                           pick,
+                           ImmutableList.of(),
+                           0,
+                           0,
+                           score,
+                           Long.MAX_VALUE,
+                           pendingCompactions);
+    }
+
+    /**
+     * Contains information about a size-tiered compaction aggregate, this is equivalent to a bucket in {@link SizeTieredCompactionStrategy}.
+     */
+    public static final class SizeTiered extends CompactionAggregate
+    {
+        /** The total read hotness of the sstables in this tier, as defined by {@link SSTableReader#hotness()} */
+        final double hotness;
+
+        /** The average on disk size in bytes of the sstables in this tier */
+        final long avgSizeBytes;
+
+        /** The minimum on disk size in bytes for this tier, this is normally the avg size times the STCS bucket low and it is
+         * used to find compacting aggregates that are on the same tier. */
+        final long minSizeBytes;
+
+        /** The maximum on disk size in bytes for this tier, this is normally the avg size times the STCS bucket high and it is
+         * used to find compacting aggregates that are on the same tier. */
+        final long maxSizeBytes;
+
+        SizeTiered(Iterable<SSTableReader> sstables,
+                   CompactionPick selected,
+                   Iterable<CompactionPick> pending,
+                   double hotness,
+                   long avgSizeBytes,
+                   long minSizeBytes,
+                   long maxSizeBytes)
+        {
+            super(sstables, selected, pending);
+
+            this.hotness = hotness;
+            this.avgSizeBytes = avgSizeBytes;
+            this.minSizeBytes = minSizeBytes;
+            this.maxSizeBytes = maxSizeBytes;
+        }
+
+        /**
+         * Creates another tier with the collections passed in: the hotness and the average size are
+         * calculated again from the new sstables whilst the minimum and maximum sizes of this tier are kept.
+         */
+        @Override
+        protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
+        {
+            double newHotness = getTotHotness(sstables);
+            long newAvgSizeBytes = getAvgSizeBytes(sstables);
+
+            return new SizeTiered(sstables, selected, compactions, newHotness, newAvgSizeBytes, minSizeBytes, maxSizeBytes);
+        }
+
+        /**
+         * Sums up the compactions of this tier that are not flagged as completed. The hotness reported is
+         * the sum of the hotness of these compactions, the throughputs are in bytes per second and they
+         * are zero when no duration was reported.
+         */
+        @Override
+        public CompactionAggregateStatistics getStatistics()
+        {
+            int totalCompactions = 0;
+            int compactionsInProgress = 0;
+            int candidateSSTables = 0;
+            int compactingSSTables = 0;
+            long totalBytes = 0;
+            long bytesRead = 0;
+            long bytesWritten = 0;
+            double compactionsHotness = 0;
+            long elapsedNanos = 0;
+
+            for (CompactionPick pick : compactions)
+            {
+                if (pick.completed)
+                    continue;
+
+                totalCompactions++;
+                candidateSSTables += pick.sstables.size();
+                totalBytes += pick.sstables.stream().mapToLong(SSTableReader::uncompressedLength).sum();
+                compactionsHotness += pick.hotness;
+
+                // a compaction with an id has been submitted
+                if (pick.id != null)
+                {
+                    compactionsInProgress++;
+                    compactingSSTables += pick.sstables.size();
+                }
+
+                if (pick.progress != null)
+                {
+                    bytesRead += pick.progress.uncompressedBytesRead();
+                    bytesWritten += pick.progress.uncompressedBytesWritten();
+                    elapsedNanos += pick.progress.durationInNanos();
+                }
+            }
+
+            double readThroughput = 0;
+            double writeThroughput = 0;
+            if (elapsedNanos != 0)
+            {
+                long nanosPerSecond = TimeUnit.SECONDS.toNanos(1);
+                readThroughput = ((double) bytesRead / elapsedNanos) * nanosPerSecond;
+                writeThroughput = ((double) bytesWritten / elapsedNanos) * nanosPerSecond;
+            }
+
+            return new SizeTieredCompactionStatistics(avgSizeBytes,
+                                                      compactionsHotness,
+                                                      totalCompactions,
+                                                      compactionsInProgress,
+                                                      sstables.size(),
+                                                      candidateSSTables,
+                                                      compactingSSTables,
+                                                      getTotSizeBytes(sstables),
+                                                      readThroughput,
+                                                      writeThroughput,
+                                                      totalBytes,
+                                                      bytesRead,
+                                                      bytesWritten);
+        }
+
+        /**
+         * @return the average on disk size in bytes of the sstables in this tier
+         */
+        @Override
+        long getKey()
+        {
+            return this.avgSizeBytes;
+        }
+
+        /**
+         * Among the aggregates whose key is at least the minimum size of this tier and less than its
+         * maximum size, return the one whose key is the closest to the average size of this tier. When
+         * two keys are equally close the smallest one wins.
+         *
+         * @param others a map of other aggregates
+         *
+         * @return the closest aggregate in the size range of this tier, or null if there is none
+         */
+        @Override
+        @Nullable CompactionAggregate getMatching(TreeMap<Long, CompactionAggregate> others)
+        {
+            SortedMap<Long, CompactionAggregate> inRange = others.subMap(minSizeBytes, maxSizeBytes);
+            if (inRange.isEmpty())
+            {
+                if (logger.isTraceEnabled())
+                    logger.trace("Found no matching aggregate for {}",
+                                 FBUtilities.prettyPrintMemory(avgSizeBytes));
+
+                return null;
+            }
+
+            if (logger.isTraceEnabled())
+                logger.trace("Found {} matching aggregates for {}",
+                             inRange.size(),
+                             FBUtilities.prettyPrintMemory(avgSizeBytes));
+
+            // the keys are visited in ascending order and only a strictly smaller distance replaces the best one
+            Map.Entry<Long, CompactionAggregate> best = null;
+            long bestDistance = 0;
+            for (Map.Entry<Long, CompactionAggregate> candidate : inRange.entrySet())
+            {
+                long distance = Math.abs(candidate.getKey() - avgSizeBytes);
+                if (best == null || distance < bestDistance)
+                {
+                    best = candidate;
+                    bestDistance = distance;
+                }
+            }
+
+            if (logger.isTraceEnabled())
+                logger.trace("Using closest matching aggregate for {}: {}",
+                             FBUtilities.prettyPrintMemory(avgSizeBytes),
+                             FBUtilities.prettyPrintMemory(best.getKey()));
+
+            return best.getValue();
+        }
+
+        @Override
+        public String toString()
+        {
+            String min = FBUtilities.prettyPrintMemory(minSizeBytes);
+            String avg = FBUtilities.prettyPrintMemory(avgSizeBytes);
+            String max = FBUtilities.prettyPrintMemory(maxSizeBytes);
+
+            return String.format("Size tiered %s/%s/%s with %d sstables, %d compactions",
+                                 min,
+                                 avg,
+                                 max,
+                                 sstables.size(),
+                                 compactions.size());
+        }
+    }
+
+    static CompactionAggregate createSizeTiered(Collection<SSTableReader> all,
+                                                CompactionPick selected,
+                                                List<CompactionPick> pending,
+                                                double hotness,
+                                                long avgSizeBytes,
+                                                long minSizeBytes,
+                                                long maxSizeBytes)
+    {
+        return new SizeTiered(all, selected, pending, hotness, avgSizeBytes, minSizeBytes, maxSizeBytes);
+    }
+
+    /**
+     * Contains information about a time-tiered compaction aggregate, which is identified by its timestamp.
+     */
+    public static final class TimeTiered extends CompactionAggregate
+    {
+        /** The timestamp of this aggregate */
+        final long timestamp;
+
+        TimeTiered(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending, long timestamp)
+        {
+            super(sstables, selected, pending);
+            this.timestamp = timestamp;
+        }
+
+        @Override
+        protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
+        {
+            return new TimeTiered(sstables, selected, compactions, timestamp);
+        }
+
+        @Override
+        public CompactionAggregateStatistics getStatistics()
+        {
+            int numCompactions = 0;
+            int numCompactionsInProgress = 0;
+            int numCandidateSSTables = 0;
+            int numCompactingSSTables = 0;
+            long tot = 0;
+            long read = 0;
+            long written = 0;
+            double hotness = 0;
+            long durationNanos = 0;
+
+            for (CompactionPick compaction : compactions)
+            {
+                if (compaction.completed)
+                    continue;
+
+                numCompactions++;
+                numCandidateSSTables += compaction.sstables.size();
+                tot += compaction.sstables.stream().mapToLong(SSTableReader::uncompressedLength).reduce(0L, Long::sum);
+                hotness += compaction.hotness;
+
+                if (compaction.id != null)
+                {
+                    numCompactionsInProgress++;
+                    numCompactingSSTables += compaction.sstables.size();
+                }
+
+                if (compaction.progress != null)
+                {
+                    read += compaction.progress.uncompressedBytesRead();
+                    written += compaction.progress.uncompressedBytesWritten();
+                    durationNanos += compaction.progress.durationInNanos();
+                }
+            }
+
+            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+
+            return new TimeTieredCompactionStatistics(timestamp,
+                                                      hotness,
+                                                      numCompactions,
+                                                      numCompactionsInProgress,
+                                                      sstables.size(),
+                                                      numCandidateSSTables,
+                                                      numCompactingSSTables,
+                                                      getTotSizeBytes(sstables),
+                                                      readThroughput,
+                                                      writeThroughput,
+                                                      tot,
+                                                      read,
+                                                      written);
+        }
+
+        @Override
+        long getKey()
+        {
+            return timestamp;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("Time tiered %d with %d sstables, %d compactions", timestamp, sstables.size(), compactions.size());
+        }
+    }
+
+    static CompactionAggregate createTimeTiered(Collection<SSTableReader> sstables, long timestamp)
+    {
+        return new TimeTiered(sstables, CompactionPick.create(timestamp, sstables), ImmutableList.of(), timestamp);
+    }
+
+    static CompactionAggregate createTimeTiered(Collection<SSTableReader> sstables, CompactionPick selected, List<CompactionPick> pending, long timestamp)
+    {
+        return new TimeTiered(sstables, selected, pending, timestamp);
+    }
+
+    /** An aggregate that is created for a compaction issued only to drop tombstones */
+    public static final class TombstoneAggregate extends CompactionAggregate
+    {
+        TombstoneAggregate(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
+        {
+            super(sstables, selected, pending);
+        }
+
+        @Override
+        protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
+        {
+            return new TombstoneAggregate(sstables, selected, compactions);
+        }
+
+        @Override
+        public CompactionAggregateStatistics getStatistics()
+        {
+            int numCompactions = 0;
+            int numCompactionsInProgress = 0;
+            int numCandidateSSTables = 0;
+            int numCompactingSSTables = 0;
+            long read = 0;
+            long written = 0;
+            long durationNanos = 0;
+
+            for (CompactionPick compaction : compactions)
+            {
+                if (compaction.completed)
+                    continue;
+
+                numCompactions++;
+                numCandidateSSTables += compaction.sstables.size();
+
+                if (compaction.id  != null)
+                {
+                    numCompactionsInProgress++;
+                    numCompactingSSTables += compaction.sstables.size();
+                }
+
+                if (compaction.progress != null)
+                {
+                    read += compaction.progress.uncompressedBytesRead();
+                    written += compaction.progress.uncompressedBytesWritten();
+                    durationNanos += compaction.progress.durationInNanos();
+                }
+            }
+
+            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+
+            return new CompactionAggregateStatistics(numCompactions,
+                                                     numCompactionsInProgress,
+                                                     sstables.size(),
+                                                     numCandidateSSTables,
+                                                     numCompactingSSTables,
+                                                     getTotSizeBytes(sstables),
+                                                     readThroughput,
+                                                     writeThroughput);
+        }
+
+        @Override
+        long getKey()
+        {
+            return -1; // Tombstone compactions are the only ones with negative keys so they will be matched by a unique aggregate
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("Tombstones with %d sstables, %d compactions", sstables.size(), compactions.size());
+        }
+    }
+
+    static CompactionAggregate createForTombstones(SSTableReader sstable)
+    {
+        List<SSTableReader> sstables = ImmutableList.of(sstable);
+        CompactionPick comp = CompactionPick.create(-1, sstables);
+        return new TombstoneAggregate(sstables, comp, ImmutableList.of());
+    }
+
+    /**
+     * Return the compaction statistics for this strategy and list of compactions that are either pending or in progress.
+     *
+     * @param aggregates the compaction aggregates
+     *
+     * @return the statistics about these compactions
+     */
+    static CompactionStrategyStatistics getStatistics(TableMetadata metadata,
+                                                      AbstractCompactionStrategy strategy,
+                                                      Map<Long, CompactionAggregate> aggregates)
+    {
+        List<Pair<Long, CompactionAggregateStatistics>> statistcs = new ArrayList<>(aggregates.size());
+
+        for (CompactionAggregate comp : aggregates.values())
+            statistcs.add(Pair.create(comp.getKey(), comp.getStatistics()));
+
+        return new CompactionStrategyStatistics(metadata,
+                                                strategy.getClass().getSimpleName(),
+                                                statistcs.stream().map(p -> p.right).collect(Collectors.toList()));
+    }
+
+    /**
+     * Return the number of compactions that are still pending.
+     * @param aggregates the compaction aggregates
+     *
+     * @return the number of compactions that are still pending (not yet submitted)
+     */
+    static int numEstimatedCompactions(Map<Long, CompactionAggregate> aggregates)
+    {
+        int ret = 0;
+        for (CompactionAggregate aggregate : aggregates.values())
+            ret += aggregate.numEstimatedCompactions();
+
+        return ret;
+    }
+
+    /**
+     * Given a sorted list of compaction aggregates, return the selected pick of the first one.
+     *
+     * @param aggregates a sorted list of compaction aggregates from most interesting to least interesting, some may be empty
+     *
+     * @return the selected compaction pick of the first aggregate, or {@link CompactionPick#EMPTY} if the list is empty
+     */
+    static CompactionPick getSelected(List<CompactionAggregate> aggregates)
+    {
+        return aggregates.isEmpty() ? CompactionPick.EMPTY : aggregates.get(0).getSelected();
+    }
+
+    /**
+     * Given a list of sstables, return their average size on disk.
+     *
+     * @param sstables the sstables
+     * @return average sstable size on disk or zero.
+     */
+    static long getAvgSizeBytes(Iterable<SSTableReader> sstables)
+    {
+        long ret = 0;
+        long num = 0;
+        for (SSTableReader sstable : sstables)
+        {
+            ret += sstable.onDiskLength();
+            num++;
+        }
+
+        return num > 0 ? ret / num : 0;
+    }
+
+    /**
+     * Given a list of sstables, return their total size on disk.
+     *
+     * @param sstables the sstables
+     * @return total sstable size on disk or zero.
+     */
+    static long getTotSizeBytes(Iterable<SSTableReader> sstables)
+    {
+        long ret = 0;
+        for (SSTableReader sstable : sstables)
+            ret += sstable.onDiskLength();
+
+        return ret;
+    }
+
+    /**
+     * Given a list of sstables, return their total read hotness.
+     *
+     * @param sstables the sstables
+     * @return total read hotness or zero.
+     */
+    static double getTotHotness(Iterable<SSTableReader> sstables)
+    {
+        double ret = 0;
+        for (SSTableReader sstable : sstables)
+            ret += sstable.hotness();
+
+        return ret;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
new file mode 100644
index 000000000000..60b3439e99c8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.Serializable;
+import java.util.Collection;
+
+import com.google.common.collect.ImmutableList;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemory;
+import static org.apache.cassandra.utils.FBUtilities.prettyPrintMemoryPerSecond;
+
+/**
+ * The statistics for a {@link CompactionAggregate}.
+ * <p>
+ * It must be serializable for JMX and convertible to JSON for insights. The JSON
+ * properties are published to insights so changing them has a downstream impact.
+ */
+public class CompactionAggregateStatistics implements Serializable
+{
+    protected static final Collection<String> HEADER = ImmutableList.of("Tot. sstables", "Size (bytes)", "Compactions", "Comp. Sstables", "Read (bytes/sec)", "Write (bytes/sec)");
+
+    /** The number of compactions that are either pending or in progress */
+    private final int numCompactions;
+
+    /** The number of compactions that are in progress */
+    private final int numCompactionsInProgress;
+
+    /** The total number of sstables, whether they need compacting or not */
+    private final int numSSTables;
+
+    /** The number of sstables that are compaction candidates */
+    private final int numCandidateSSTables;
+
+    /** The number of sstables that are currently compacting */
+    private final int numCompactingSSTables;
+
+    /** The size in bytes (on disk) of the total sstables */
+    private final long sizeInBytes;
+
+    /** The read throughput in bytes per second */
+    private final double readThroughput;
+
+    /** The write throughput in bytes per second */
+    private final double writeThroughput;
+
+    CompactionAggregateStatistics(int numCompactions,
+                                  int numCompactionsInProgress,
+                                  int numSSTables,
+                                  int numCandidateSSTables,
+                                  int numCompactingSSTables,
+                                  long sizeInBytes,
+                                  double readThroughput,
+                                  double writeThroughput)
+    {
+        this.numCompactions = numCompactions;
+        this.numCompactionsInProgress = numCompactionsInProgress;
+        this.numCandidateSSTables = numCandidateSSTables;
+        this.numCompactingSSTables = numCompactingSSTables;
+        this.numSSTables = numSSTables;
+        this.sizeInBytes = sizeInBytes;
+        this.readThroughput = readThroughput;
+        this.writeThroughput = writeThroughput;
+    }
+
+    /** The number of compactions that are either pending or in progress */
+    @JsonProperty
+    public int numCompactions()
+    {
+        return numCompactions;
+    }
+
+    /** The number of compactions that are in progress */
+    @JsonProperty
+    public int numCompactionsInProgress()
+    {
+        return numCompactionsInProgress;
+    }
+
+    /** The total number of sstables, whether they need compacting or not */
+    @JsonProperty
+    public int numSSTables()
+    {
+        return numSSTables;
+    }
+
+    /** The number of sstables that are compaction candidates */
+    @JsonProperty
+    public int numCandidateSSTables()
+    {
+        return numCandidateSSTables;
+    }
+
+    /** The number of sstables that are currently part of a compaction operation */
+    @JsonProperty
+    public int numCompactingSSTables()
+    {
+        return numCompactingSSTables;
+    }
+
+    /** The size in bytes (on disk) of the total sstables */
+    public long sizeInBytes()
+    {
+        return sizeInBytes;
+    }
+
+    /** The read throughput in bytes per second */
+    @JsonProperty
+    public double readThroughput()
+    {
+        return readThroughput;
+    }
+
+    /** The write throughput in bytes per second */
+    @JsonProperty
+    public double writeThroughput()
+    {
+        return writeThroughput;
+    }
+
+    @Override
+    public String toString()
+    {
+        return data().toString();
+    }
+
+    protected Collection<String> header()
+    {
+        return HEADER;
+    }
+
+    protected Collection<String> data()
+    {
+        return ImmutableList.of(Integer.toString(numSSTables),
+                                prettyPrintMemory(sizeInBytes),
+                                        Integer.toString(numCompactions()) + '/' + numCompactionsInProgress(),
+                                        Integer.toString(numCandidateSSTables()) + '/' + numCompactingSSTables(),
+                                prettyPrintMemoryPerSecond((long) readThroughput()),
+                                prettyPrintMemoryPerSecond((long) writeThroughput()));
+    }
+
+    protected String toString(long value)
+    {
+        return FBUtilities.prettyPrintMemory(value);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
deleted file mode 100644
index 703687f5c4fc..000000000000
--- a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.compaction;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Optional;
-import java.util.Set;
-import java.util.UUID;
-import java.util.function.Predicate;
-
-import com.google.common.base.Joiner;
-import com.google.common.collect.ImmutableSet;
-
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.schema.TableMetadata;
-
-public final class CompactionInfo
-{
-    public static final String ID = "id";
-    public static final String KEYSPACE = "keyspace";
-    public static final String COLUMNFAMILY = "columnfamily";
-    public static final String COMPLETED = "completed";
-    public static final String TOTAL = "total";
-    public static final String TASK_TYPE = "taskType";
-    public static final String UNIT = "unit";
-    public static final String COMPACTION_ID = "compactionId";
-    public static final String SSTABLES = "sstables";
-
-    private final TableMetadata metadata;
-    private final OperationType tasktype;
-    private final long completed;
-    private final long total;
-    private final Unit unit;
-    private final UUID compactionId;
-    private final ImmutableSet<SSTableReader> sstables;
-
-    public CompactionInfo(TableMetadata metadata, OperationType tasktype, long bytesComplete, long totalBytes, UUID compactionId, Collection<? extends SSTableReader> sstables)
-    {
-        this(metadata, tasktype, bytesComplete, totalBytes, Unit.BYTES, compactionId, sstables);
-    }
-
-    private CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, UUID compactionId, Collection<? extends SSTableReader> sstables)
-    {
-        this.tasktype = tasktype;
-        this.completed = completed;
-        this.total = total;
-        this.metadata = metadata;
-        this.unit = unit;
-        this.compactionId = compactionId;
-        this.sstables = ImmutableSet.copyOf(sstables);
-    }
-
-    /**
-     * Special compaction info where we always need to cancel the compaction - for example ViewBuilderTask and AutoSavingCache where we don't know
-     * the sstables at construction
-     */
-    public static CompactionInfo withoutSSTables(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, UUID compactionId)
-    {
-        return new CompactionInfo(metadata, tasktype, completed, total, unit, compactionId, ImmutableSet.of());
-    }
-
-    /** @return A copy of this CompactionInfo with updated progress. */
-    public CompactionInfo forProgress(long complete, long total)
-    {
-        return new CompactionInfo(metadata, tasktype, complete, total, unit, compactionId, sstables);
-    }
-
-    public Optional<String> getKeyspace()
-    {
-        return Optional.ofNullable(metadata != null ? metadata.keyspace : null);
-    }
-
-    public Optional<String> getTable()
-    {
-        return Optional.ofNullable(metadata != null ? metadata.name : null);
-    }
-
-    public TableMetadata getTableMetadata()
-    {
-        return metadata;
-    }
-
-    public long getCompleted()
-    {
-        return completed;
-    }
-
-    public long getTotal()
-    {
-        return total;
-    }
-
-    public OperationType getTaskType()
-    {
-        return tasktype;
-    }
-
-    public UUID getTaskId()
-    {
-        return compactionId;
-    }
-
-    public Unit getUnit()
-    {
-        return unit;
-    }
-
-    public Set<SSTableReader> getSSTables()
-    {
-        return sstables;
-    }
-
-    @Override
-    public String toString()
-    {
-        if (metadata != null)
-        {
-            return String.format("%s(%s, %s / %s %s)@%s(%s, %s)",
-                                 tasktype, compactionId, completed, total, unit,
-                                 metadata.id, metadata.keyspace, metadata.name);
-        }
-        else
-        {
-            return String.format("%s(%s, %s / %s %s)",
-                                 tasktype, compactionId, completed, total, unit);
-        }
-    }
-
-    public Map<String, String> asMap()
-    {
-        Map<String, String> ret = new HashMap<String, String>();
-        ret.put(ID, metadata != null ? metadata.id.toString() : "");
-        ret.put(KEYSPACE, getKeyspace().orElse(null));
-        ret.put(COLUMNFAMILY, getTable().orElse(null));
-        ret.put(COMPLETED, Long.toString(completed));
-        ret.put(TOTAL, Long.toString(total));
-        ret.put(TASK_TYPE, tasktype.toString());
-        ret.put(UNIT, unit.toString());
-        ret.put(COMPACTION_ID, compactionId == null ? "" : compactionId.toString());
-        ret.put(SSTABLES, Joiner.on(',').join(sstables));
-        return ret;
-    }
-
-    boolean shouldStop(Predicate<SSTableReader> sstablePredicate)
-    {
-        if (sstables.isEmpty())
-        {
-            return true;
-        }
-        return sstables.stream().anyMatch(sstablePredicate);
-    }
-
-    public enum StopTrigger
-    {
-        NONE(false),
-        TRUNCATE(true);
-
-        private final boolean isFinal;
-
-        StopTrigger(boolean isFinal)
-        {
-            this.isFinal = isFinal;
-        }
-
-        // A stop trigger marked as final should not be overwritten. So a table operation that is
-        // marked with a final stop trigger cannot have it's stop trigger changed to another value.
-        public boolean isFinal()
-        {
-            return isFinal;
-        }
-    }
-
-    public static abstract class Holder
-    {
-        private volatile boolean stopRequested = false;
-        private volatile StopTrigger trigger = StopTrigger.NONE;
-        public abstract CompactionInfo getCompactionInfo();
-
-        public void stop()
-        {
-            stopRequested = true;
-        }
-
-        public void stop(StopTrigger trigger)
-        {
-            this.stopRequested = true;
-            if (!this.trigger.isFinal())
-                this.trigger = trigger;
-        }
-
-        /**
-         * if this compaction involves several/all tables we can safely check globalCompactionsPaused
-         * in isStopRequested() below
-         */
-        public abstract boolean isGlobal();
-
-        public boolean isStopRequested()
-        {
-            return stopRequested || (isGlobal() && CompactionManager.instance.isGlobalCompactionPaused());
-        }
-
-        /**
-         * @return cause of compaction interruption.
-         */
-        public StopTrigger trigger()
-        {
-            return trigger;
-        }
-    }
-
-    public enum Unit
-    {
-        BYTES("bytes"), RANGES("token range parts"), KEYS("keys");
-
-        private final String name;
-
-        Unit(String name)
-        {
-            this.name = name;
-        }
-
-        @Override
-        public String toString()
-        {
-            return this.name;
-        }
-
-        public static boolean isFileSize(String unit)
-        {
-            return BYTES.toString().equals(unit);
-        }
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInterruptedException.java b/src/java/org/apache/cassandra/db/compaction/CompactionInterruptedException.java
index 129d9fc0ff5c..a28c39526a14 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionInterruptedException.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionInterruptedException.java
@@ -21,7 +21,7 @@ public class CompactionInterruptedException extends RuntimeException
 {
     private static final long serialVersionUID = -8651427062512310398L;
 
-    public CompactionInterruptedException(CompactionInfo info)
+    public CompactionInterruptedException(AbstractTableOperation.OperationProgress info)
     {
         super("Compaction interrupted: " + info);
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
index 7cd2a1b17ecd..377f770c6fc1 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
@@ -38,6 +38,7 @@
 import org.apache.cassandra.index.transactions.CompactionTransaction;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.schema.CompactionParams.TombstoneOption;
+import org.apache.cassandra.utils.Throwables;
 
 /**
  * Merge multiple iterators over the content of sstable into a "compacted" iterator.
@@ -55,7 +56,7 @@
  *   <li>keep tracks of the compaction progress.</li>
  * </ul>
  */
-public class CompactionIterator extends CompactionInfo.Holder implements UnfilteredPartitionIterator
+public class CompactionIterator implements UnfilteredPartitionIterator
 {
     private static final long UNFILTERED_TO_UPDATE_PROGRESS = 100;
 
@@ -67,44 +68,40 @@ public class CompactionIterator extends CompactionInfo.Holder implements Unfilte
     private final UUID compactionId;
 
     private final long totalBytes;
-    private long bytesRead;
-    private long totalSourceCQLRows;
+    private volatile long[] bytesReadByLevel;
 
-    /*
-     * counters for merged rows frequency(AKA histogram).
-     * array index represents (number of merged rows - 1), so index 0 is counter for no merge (1 row),
-     * index 1 is counter for 2 rows merged, and so on.
+    /**
+     * Merged frequency counters for partitions and rows (AKA histograms).
+     * The array index represents the number of sstables containing the row or partition minus one. So index 0 contains
+     * the number of rows or partitions coming from a single sstable (therefore copied rather than merged), index 1 contains
+     * the number of rows or partitions coming from two sstables and so forth.
      */
+    private final long[] mergedPartitionsHistogram;
     private final long[] mergedRowsHistogram;
 
     private final UnfilteredPartitionIterator compacted;
-    private final ActiveCompactionsTracker activeCompactions;
-
-    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, AbstractCompactionController controller, int nowInSec, UUID compactionId)
-    {
-        this(type, scanners, controller, nowInSec, compactionId, ActiveCompactionsTracker.NOOP);
-    }
+    private final TableOperation op;
 
     @SuppressWarnings("resource") // We make sure to close mergedIterator in close() and CompactionIterator is itself an AutoCloseable
-    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, AbstractCompactionController controller, int nowInSec, UUID compactionId, ActiveCompactionsTracker activeCompactions)
+    public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, AbstractCompactionController controller, int nowInSec, UUID compactionId)
     {
         this.controller = controller;
         this.type = type;
         this.scanners = scanners;
         this.nowInSec = nowInSec;
         this.compactionId = compactionId;
-        this.bytesRead = 0;
+        this.bytesReadByLevel = new long[LeveledGenerations.MAX_LEVEL_COUNT];
 
         long bytes = 0;
         for (ISSTableScanner scanner : scanners)
             bytes += scanner.getLengthInBytes();
         this.totalBytes = bytes;
+        this.mergedPartitionsHistogram = new long[scanners.size()];
         this.mergedRowsHistogram = new long[scanners.size()];
         // note that we leak `this` from the constructor when calling beginCompaction below, this means we have to get the sstables before
         // calling that to avoid a NPE.
         sstables = scanners.stream().map(ISSTableScanner::getBackingSSTables).flatMap(Collection::stream).collect(ImmutableSet.toImmutableSet());
-        this.activeCompactions = activeCompactions == null ? ActiveCompactionsTracker.NOOP : activeCompactions;
-        this.activeCompactions.beginCompaction(this); // note that CompactionTask also calls this, but CT only creates CompactionIterator with a NOOP ActiveCompactions
+        op = createOperation();
 
         UnfilteredPartitionIterator merged = scanners.isEmpty()
                                            ? EmptyIterators.unfilteredPartition(controller.cfs.metadata())
@@ -112,7 +109,36 @@ public CompactionIterator(OperationType type, List<ISSTableScanner> scanners, Ab
         merged = Transformation.apply(merged, new GarbageSkipper(controller));
         merged = Transformation.apply(merged, new Purger(controller, nowInSec));
         merged = DuplicateRowChecker.duringCompaction(merged, type);
-        compacted = Transformation.apply(merged, new AbortableUnfilteredPartitionTransformation(this));
+        compacted = Transformation.apply(merged, new AbortableUnfilteredPartitionTransformation(op));
+    }
+
+    protected TableOperation createOperation()
+    {
+        return new AbstractTableOperation() {
+
+            @Override
+            public OperationProgress getProgress()
+            {
+                return new AbstractTableOperation.OperationProgress(controller.cfs.metadata(), type, bytesRead(), totalBytes, compactionId, sstables);
+            }
+
+            @Override
+            public boolean isGlobal()
+            {
+                return false;
+            }
+        };
+    }
+
+    /**
+     * @return A {@link TableOperation} backed by this iterator. This operation can be observed for progress
+     * and for interrupting provided that it is registered with a {@link TableOperationObserver}, normally the
+     * metrics in the compaction manager. The caller is responsible for registering the operation and checking
+     * {@link TableOperation#isStopRequested()}.
+     */
+    public TableOperation getOperation()
+    {
+        return op;
     }
 
     public TableMetadata metadata()
@@ -120,35 +146,45 @@ public TableMetadata metadata()
         return controller.cfs.metadata();
     }
 
-    public CompactionInfo getCompactionInfo()
+    long bytesRead()
     {
-        return new CompactionInfo(controller.cfs.metadata(),
-                                  type,
-                                  bytesRead,
-                                  totalBytes,
-                                  compactionId,
-                                  sstables);
+        // read the field once: updateBytesRead() publishes a fresh array instead of mutating this one
+        return Arrays.stream(bytesReadByLevel).sum();
     }
 
-    public boolean isGlobal()
+    long bytesRead(int level)
     {
-        return false;
+        return level >= 0 && level < bytesReadByLevel.length ? bytesReadByLevel[level] : 0;
+    }
+
+    long totalBytes()
+    {
+        return totalBytes;
+    }
+
+    long totalSourcePartitions()
+    {
+        return Arrays.stream(mergedPartitionsHistogram).reduce(0L, Long::sum);
+    }
+
+    long totalSourceRows()
+    {
+        return Arrays.stream(mergedRowsHistogram).reduce(0L, Long::sum);
     }
 
-    private void incMergedRowsHistogram(int rows)
+    long[] mergedPartitionsHistogram()
     {
-        assert rows > 0 && rows - 1 < mergedRowsHistogram.length;
-        mergedRowsHistogram[rows - 1] += 1;
+        return mergedPartitionsHistogram;
     }
 
-    public long[] getMergedRowsHistogram()
+    long[] mergedRowsHistogram()
     {
         return mergedRowsHistogram;
     }
 
-    public long getTotalSourceCQLRows()
+    public boolean isGlobal()
     {
-        return totalSourceCQLRows;
+        return false;
     }
 
     private UnfilteredPartitionIterators.MergeListener listener()
@@ -157,51 +193,18 @@ private UnfilteredPartitionIterators.MergeListener listener()
         {
             public UnfilteredRowIterators.MergeListener getRowMergeListener(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
             {
-                int merged = 0;
+                int numVersions = 0;
                 for (int i=0, isize=versions.size(); i<isize; i++)
                 {
                     @SuppressWarnings("resource")
                     UnfilteredRowIterator iter = versions.get(i);
                     if (iter != null)
-                        merged++;
+                        numVersions++;
                 }
 
-                assert merged > 0;
-
-                CompactionIterator.this.incMergedRowsHistogram(merged);
+                mergedPartitionsHistogram[numVersions - 1] += 1;
 
-                if (type != OperationType.COMPACTION || !controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION))
-                    return null;
-
-                Columns statics = Columns.NONE;
-                Columns regulars = Columns.NONE;
-                for (int i=0, isize=versions.size(); i<isize; i++)
-                {
-                    @SuppressWarnings("resource")
-                    UnfilteredRowIterator iter = versions.get(i);
-                    if (iter != null)
-                    {
-                        statics = statics.mergeTo(iter.columns().statics);
-                        regulars = regulars.mergeTo(iter.columns().regulars);
-                    }
-                }
-                final RegularAndStaticColumns regularAndStaticColumns = new RegularAndStaticColumns(statics, regulars);
-
-                // If we have a 2ndary index, we must update it with deleted/shadowed cells.
-                // we can reuse a single CleanupTransaction for the duration of a partition.
-                // Currently, it doesn't do any batching of row updates, so every merge event
-                // for a single partition results in a fresh cycle of:
-                // * Get new Indexer instances
-                // * Indexer::start
-                // * Indexer::onRowMerge (for every row being merged by the compaction)
-                // * Indexer::commit
-                // A new OpOrder.Group is opened in an ARM block wrapping the commits
-                // TODO: this should probably be done asynchronously and batched.
-                final CompactionTransaction indexTransaction =
-                    controller.cfs.indexManager.newCompactionTransaction(partitionKey,
-                                                                         regularAndStaticColumns,
-                                                                         versions.size(),
-                                                                         nowInSec);
+                final CompactionTransaction indexTransaction = getIndexTransaction(partitionKey,versions);
 
                 return new UnfilteredRowIterators.MergeListener()
                 {
@@ -211,9 +214,23 @@ public void onMergedPartitionLevelDeletion(DeletionTime mergedDeletion, Deletion
 
                     public Row onMergedRows(Row merged, Row[] versions)
                     {
-                        indexTransaction.start();
-                        indexTransaction.onRowMerge(merged, versions);
-                        indexTransaction.commit();
+                        int numVersions = 0;
+                        for (Row v : versions)
+                        {
+                            if (v != null)
+                                numVersions++;
+                        }
+
+                        assert numVersions > 0 && numVersions - 1 < mergedRowsHistogram.length;
+                        mergedRowsHistogram[numVersions - 1] += 1;
+
+                        if (indexTransaction != null)
+                        {
+                            indexTransaction.start();
+                            indexTransaction.onRowMerge(merged, versions);
+                            indexTransaction.commit();
+                        }
+
                         return merged;
                     }
 
@@ -233,12 +250,49 @@ public void close()
         };
     }
 
+    private CompactionTransaction getIndexTransaction(DecoratedKey partitionKey, List<UnfilteredRowIterator> versions)
+    {
+        if (type != OperationType.COMPACTION || !controller.cfs.indexManager.handles(IndexTransaction.Type.COMPACTION))
+            return null;
+
+        Columns statics = Columns.NONE;
+        Columns regulars = Columns.NONE;
+        for (int i=0, isize=versions.size(); i<isize; i++)
+        {
+            @SuppressWarnings("resource")
+            UnfilteredRowIterator iter = versions.get(i);
+            if (iter != null)
+            {
+                statics = statics.mergeTo(iter.columns().statics);
+                regulars = regulars.mergeTo(iter.columns().regulars);
+            }
+        }
+        final RegularAndStaticColumns regularAndStaticColumns = new RegularAndStaticColumns(statics, regulars);
+        // If we have a 2ndary index, we must update it with deleted/shadowed cells.
+        // we can reuse a single CleanupTransaction for the duration of a partition.
+        // Currently, it doesn't do any batching of row updates, so every merge event
+        // for a single partition results in a fresh cycle of:
+        // * Get new Indexer instances
+        // * Indexer::start
+        // * Indexer::onRowMerge (for every row being merged by the compaction)
+        // * Indexer::commit
+        // A new OpOrder.Group is opened in an ARM block wrapping the commits
+        // TODO: this should probably be done asynchronously and batched.
+        return controller.cfs.indexManager.newCompactionTransaction(partitionKey, regularAndStaticColumns, versions.size(), nowInSec);
+    }
+
     private void updateBytesRead()
     {
-        long n = 0;
+        long[] bytesReadByLevel = new long[this.bytesReadByLevel.length];
         for (ISSTableScanner scanner : scanners)
-            n += scanner.getCurrentPosition();
-        bytesRead = n;
+        {
+            int level = scanner.level();
+            long n = scanner.getCurrentPosition();
+
+            if (level >= 0 && level < bytesReadByLevel.length)
+                bytesReadByLevel[level] += n;
+        }
+        this.bytesReadByLevel = bytesReadByLevel;
     }
 
     public boolean hasNext()
@@ -258,19 +312,14 @@ public void remove()
 
     public void close()
     {
-        try
-        {
-            compacted.close();
-        }
-        finally
-        {
-            activeCompactions.finishCompaction(this);
-        }
+        updateBytesRead();
+
+        Throwables.maybeFail(Throwables.close(null, compacted));
     }
 
     public String toString()
     {
-        return this.getCompactionInfo().toString();
+        return String.format("%s: %s, (%d/%d)", type, metadata(), bytesRead(), totalBytes());
     }
 
     private class Purger extends PurgeFunction
@@ -307,7 +356,6 @@ protected void onNewPartition(DecoratedKey key)
         @Override
         protected void updateProgress()
         {
-            totalSourceCQLRows++;
             if ((++compactedUnfiltered) % UNFILTERED_TO_UPDATE_PROGRESS == 0)
                 updateBytesRead();
         }
@@ -559,34 +607,36 @@ protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition
     private static class AbortableUnfilteredPartitionTransformation extends Transformation<UnfilteredRowIterator>
     {
         private final AbortableUnfilteredRowTransformation abortableIter;
+        private final TableOperation op;
 
-        private AbortableUnfilteredPartitionTransformation(CompactionIterator iter)
+        private AbortableUnfilteredPartitionTransformation(TableOperation op)
         {
-            this.abortableIter = new AbortableUnfilteredRowTransformation(iter);
+            this.op = op;
+            this.abortableIter = new AbortableUnfilteredRowTransformation(op);
         }
 
         @Override
         protected UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
         {
-            if (abortableIter.iter.isStopRequested())
-                throw new CompactionInterruptedException(abortableIter.iter.getCompactionInfo());
+            if (op.isStopRequested())
+                throw new CompactionInterruptedException(op.getProgress());
             return Transformation.apply(partition, abortableIter);
         }
     }
 
     private static class AbortableUnfilteredRowTransformation extends Transformation
     {
-        private final CompactionIterator iter;
+        private final TableOperation op;
 
-        private AbortableUnfilteredRowTransformation(CompactionIterator iter)
+        private AbortableUnfilteredRowTransformation(TableOperation op)
         {
-            this.iter = iter;
+            this.op = op;
         }
 
         public Row applyToRow(Row row)
         {
-            if (iter.isStopRequested())
-                throw new CompactionInterruptedException(iter.getCompactionInfo());
+            if (op.isStopRequested())
+                throw new CompactionInterruptedException(op.getProgress());
             return row;
         }
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java b/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
index f473be705918..1455d8b3b93e 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
@@ -18,11 +18,15 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.io.Closeable;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.lang.ref.WeakReference;
 import java.nio.file.*;
+import java.text.DateFormat;
+import java.text.SimpleDateFormat;
 import java.util.Collection;
+import java.util.Date;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -33,6 +37,8 @@
 import java.util.function.Consumer;
 import java.util.function.Function;
 
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
 import com.google.common.collect.MapMaker;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -43,10 +49,18 @@
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.NoSpamLogger;
+import org.apache.cassandra.utils.Throwables;
 
+/**
+ * This is a compaction logger that logs compaction events in a file called compaction.log.
+ * It was added by CASSANDRA-10805.
+ */
 public class CompactionLogger
 {
+    private static final DateFormat dateFormatter = new SimpleDateFormat("HH:mm:ss.SSS");
+
     public interface Strategy
     {
         JsonNode sstable(SSTableReader sstable);
@@ -78,8 +92,13 @@ public interface StrategySummary
     /**
      * This is an interface to allow writing to a different interface.
      */
-    public interface Writer
+    public interface Writer extends Closeable
     {
+        /**
+         * @param toWrite This should be written out to the medium capturing the logs
+         */
+        void write(String toWrite);
+
         /**
          * This is used when we are already trying to write out the start of a
          * @param statement This should be written out to the medium capturing the logs
@@ -94,6 +113,12 @@ public interface Writer
          * @param tag       This is an identifier for a strategy; each strategy should have a distinct Object
          */
         void write(JsonNode statement, StrategySummary summary, Object tag);
+
+        /**
+         * Closes the writer
+         */
+        @Override
+        void close();
     }
 
     private interface CompactionStrategyAndTableFunction
@@ -103,20 +128,29 @@ private interface CompactionStrategyAndTableFunction
 
     private static final JsonNodeFactory json = JsonNodeFactory.instance;
     private static final Logger logger = LoggerFactory.getLogger(CompactionLogger.class);
-    private static final Writer serializer = new CompactionLogSerializer();
+
+    private static final ExecutorService loggerService = Executors.newFixedThreadPool(1);
+    private static final Writer jsonWriter = new CompactionLogSerializer("compaction", "log", loggerService);
+
     private final WeakReference<ColumnFamilyStore> cfsRef;
     private final WeakReference<CompactionStrategyManager> csmRef;
     private final AtomicInteger identifier = new AtomicInteger(0);
     private final Map<AbstractCompactionStrategy, String> compactionStrategyMapping = new MapMaker().weakKeys().makeMap();
+    private final Map<AbstractCompactionStrategy, Writer> csvWriters = new MapMaker().makeMap();
     private final AtomicBoolean enabled = new AtomicBoolean(false);
 
-    public CompactionLogger(ColumnFamilyStore cfs, CompactionStrategyManager csm)
+    CompactionLogger(ColumnFamilyStore cfs, CompactionStrategyManager csm)
     {
         csmRef = new WeakReference<>(csm);
         cfsRef = new WeakReference<>(cfs);
     }
 
-    private void forEach(Consumer<AbstractCompactionStrategy> consumer)
+    /**
+     * Visit all the strategies in the {@link CompactionStrategyManager} reference, if available.
+     *
+     * @param consumer a consumer function that receives all the strategies one by one
+     */
+    private void visitStrategies(Consumer<AbstractCompactionStrategy> consumer)
     {
         CompactionStrategyManager csm = csmRef.get();
         if (csm == null)
@@ -125,10 +159,18 @@ private void forEach(Consumer<AbstractCompactionStrategy> consumer)
            .forEach(l -> l.forEach(consumer));
     }
 
-    private ArrayNode compactionStrategyMap(Function<AbstractCompactionStrategy, JsonNode> select)
+    /**
+     * Rely on {@link #visitStrategies(Consumer)} to visit all the strategies in the {@link CompactionStrategyManager}
+     * reference and add the properties extracted by the function passed in to a json node that is returned.
+     *
+     * @param select a function that given a strategy returns a json node
+     *
+     * @return a json node containing information on all the strategies returned by the strategy manager and the function passed in.
+     */
+    private ArrayNode getStrategiesJsonNode(Function<AbstractCompactionStrategy, JsonNode> select)
     {
         ArrayNode node = json.arrayNode();
-        forEach(acs -> node.add(select.apply(acs)));
+        visitStrategies(acs -> node.add(select.apply(acs)));
         return node;
     }
 
@@ -174,7 +216,7 @@ private JsonNode formatSSTable(AbstractCompactionStrategy strategy, SSTableReade
         return node;
     }
 
-    private JsonNode startStrategy(AbstractCompactionStrategy strategy)
+    private JsonNode getStrategyDetails(AbstractCompactionStrategy strategy)
     {
         ObjectNode node = json.objectNode();
         CompactionStrategyManager csm = csmRef.get();
@@ -198,7 +240,7 @@ private JsonNode startStrategy(AbstractCompactionStrategy strategy)
         return node;
     }
 
-    private JsonNode shutdownStrategy(AbstractCompactionStrategy strategy)
+    private JsonNode getStrategyId(AbstractCompactionStrategy strategy)
     {
         ObjectNode node = json.objectNode();
         node.put("strategyId", getId(strategy));
@@ -213,22 +255,22 @@ private JsonNode describeSSTable(AbstractCompactionStrategy strategy, SSTableRea
         return node;
     }
 
-    private void describeStrategy(ObjectNode node)
+    private void maybeAddSchemaAndTimeInfo(ObjectNode node)
     {
         ColumnFamilyStore cfs = cfsRef.get();
         if (cfs == null)
             return;
-        node.put("keyspace", cfs.keyspace.getName());
+        node.put("keyspace", cfs.getKeyspaceName());
         node.put("table", cfs.getTableName());
         node.put("time", System.currentTimeMillis());
     }
 
-    private JsonNode startStrategies()
+    private JsonNode getEventJsonNode()
     {
         ObjectNode node = json.objectNode();
         node.put("type", "enable");
-        describeStrategy(node);
-        node.set("strategies", compactionStrategyMap(this::startStrategy));
+        maybeAddSchemaAndTimeInfo(node);
+        node.set("strategies", getStrategiesJsonNode(this::getStrategyDetails));
         return node;
     }
 
@@ -236,7 +278,7 @@ public void enable()
     {
         if (enabled.compareAndSet(false, true))
         {
-            serializer.writeStart(startStrategies(), this);
+            jsonWriter.writeStart(getEventJsonNode(), this);
         }
     }
 
@@ -246,21 +288,28 @@ public void disable()
         {
             ObjectNode node = json.objectNode();
             node.put("type", "disable");
-            describeStrategy(node);
-            node.set("strategies", compactionStrategyMap(this::shutdownStrategy));
-            serializer.write(node, this::startStrategies, this);
+            maybeAddSchemaAndTimeInfo(node);
+            node.set("strategies", getStrategiesJsonNode(this::getStrategyId));
+            jsonWriter.write(node, this::getEventJsonNode, this);
+
+            visitStrategies(strategy -> csvWriters.computeIfPresent(strategy, (s, w) -> { w.close(); return null; }));
         }
     }
 
+    public boolean enabled()
+    {
+        return enabled.get();
+    }
+
     public void flush(Collection<SSTableReader> sstables)
     {
         if (enabled.get())
         {
             ObjectNode node = json.objectNode();
             node.put("type", "flush");
-            describeStrategy(node);
+            maybeAddSchemaAndTimeInfo(node);
             node.set("tables", sstableMap(sstables, this::describeSSTable));
-            serializer.write(node, this::startStrategies, this);
+            jsonWriter.write(node, this::getEventJsonNode, this);
         }
     }
 
@@ -270,12 +319,12 @@ public void compaction(long startTime, Collection<SSTableReader> input, long end
         {
             ObjectNode node = json.objectNode();
             node.put("type", "compaction");
-            describeStrategy(node);
+            maybeAddSchemaAndTimeInfo(node);
             node.put("start", String.valueOf(startTime));
             node.put("end", String.valueOf(endTime));
             node.set("input", sstableMap(input, this::describeSSTable));
             node.set("output", sstableMap(output, this::describeSSTable));
-            serializer.write(node, this::startStrategies, this);
+            jsonWriter.write(node, this::getEventJsonNode, this);
         }
     }
 
@@ -285,31 +334,89 @@ public void pending(AbstractCompactionStrategy strategy, int remaining)
         {
             ObjectNode node = json.objectNode();
             node.put("type", "pending");
-            describeStrategy(node);
+            maybeAddSchemaAndTimeInfo(node);
             node.put("strategyId", getId(strategy));
             node.put("pending", remaining);
-            serializer.write(node, this::startStrategies, this);
+            jsonWriter.write(node, this::getEventJsonNode, this);
+        }
+    }
+
+    /**
+     * Write the strategy statistics formatted as CSV.
+     **/
+    public void statistics(AbstractCompactionStrategy strategy, String event, CompactionStrategyStatistics statistics)
+    {
+        if (logger.isTraceEnabled())
+            logger.trace("Compaction statistics for strategy {} and event {}: {}", strategy, event, statistics);
+
+        if (!enabled.get())
+            return;
+
+        Writer writer = getCsvWriter(strategy, statistics.getHeader());
+        for (Collection<String> data : statistics.getData())
+            writer.write(String.join(",", Iterables.concat(ImmutableList.of(currentTime(), event), data)) + System.lineSeparator());
+    }
+
+    private Writer getCsvWriter(AbstractCompactionStrategy strategy, Collection<String> header)
+    {
+        Writer writer = csvWriters.get(strategy);
+        if (writer != null)
+            return writer;
+
+        // TODO - should we add the repair status?
+        String fileName = String.format("compaction-%s-%s-%s-%s",
+                                        strategy.getName(),
+                                        strategy.getMetadata().keyspace,
+                                        strategy.getMetadata().name,
+                                        getId(strategy));
+
+        writer = new CompactionLogSerializer(fileName, "csv", loggerService);
+        if (csvWriters.putIfAbsent(strategy, writer) == null)
+        {
+            writer.write(String.join(",", Iterables.concat(ImmutableList.of("Timestamp", "Event"), header)) + System.lineSeparator());
+            return writer;
+        }
+        else
+        {
+            writer.close();
+            return csvWriters.get(strategy);
         }
     }
 
+    private static synchronized String currentTime() // serialized: the shared SimpleDateFormat is not thread-safe
+    {
+        return dateFormatter.format(new Date(System.currentTimeMillis()));
+    }
+
     private static class CompactionLogSerializer implements Writer
     {
         private static final String logDirectory = System.getProperty("cassandra.logdir", ".");
-        private final ExecutorService loggerService = Executors.newFixedThreadPool(1);
+
         // This is only accessed on the logger service thread, so it does not need to be thread safe
-        private final Set<Object> rolled = new HashSet<>();
+        private final String fileName;
+        private final String fileExt;
+        private final ExecutorService loggerService;
+        private final Set<Object> rolled;
         private OutputStreamWriter stream;
 
-        private static OutputStreamWriter createStream() throws IOException
+        CompactionLogSerializer(String fileName, String fileExt, ExecutorService loggerService)
+        {
+            this.fileName = fileName;
+            this.fileExt = fileExt;
+            this.loggerService = loggerService;
+            this.rolled = new HashSet<>();
+        }
+
+        private OutputStreamWriter createStream() throws IOException
         {
             int count = 0;
-            Path compactionLog = Paths.get(logDirectory, "compaction.log");
+            Path compactionLog = Paths.get(logDirectory,  String.format("%s.%s", fileName, fileExt));
             if (Files.exists(compactionLog))
             {
                 Path tryPath = compactionLog;
                 while (Files.exists(tryPath))
                 {
-                    tryPath = Paths.get(logDirectory, String.format("compaction-%d.log", count++));
+                    tryPath = Paths.get(logDirectory, String.format("%s-%d.%s", fileName, count++, fileExt));
                 }
                 Files.move(compactionLog, tryPath);
             }
@@ -317,44 +424,71 @@ private static OutputStreamWriter createStream() throws IOException
             return new OutputStreamWriter(Files.newOutputStream(compactionLog, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE));
         }
 
-        private void writeLocal(String toWrite)
+        private interface ThrowingConsumer<T>
         {
-            try
-            {
-                if (stream == null)
-                    stream = createStream();
-                stream.write(toWrite);
-                stream.flush();
-            }
-            catch (IOException ioe)
+            void accept(T stream) throws IOException;
+        }
+
+        private void performWrite(ThrowingConsumer<OutputStreamWriter> writeTask)
+        {
+            loggerService.execute(() ->
             {
-                // We'll drop the change and log the error to the logger.
-                NoSpamLogger.log(logger, NoSpamLogger.Level.ERROR, 1, TimeUnit.MINUTES,
-                                 "Could not write to the log file: {}", ioe);
-            }
+              try
+              {
+                  if (stream == null)
+                      stream = createStream();
+
+                  writeTask.accept(stream);
+                  stream.flush();
+              }
+              catch (IOException ioe)
+              {
+                  // We'll drop the change and log the error to the logger.
+                  NoSpamLogger.log(logger, NoSpamLogger.Level.ERROR, 1, TimeUnit.MINUTES,
+                                   "Could not write to the log file: {}", ioe);
+              }
+            });
+        }
 
+        public void write(String toWrite)
+        {
+            performWrite(s -> s.write(toWrite));
         }
 
         public void writeStart(JsonNode statement, Object tag)
         {
             final String toWrite = statement.toString() + System.lineSeparator();
-            loggerService.execute(() -> {
+            performWrite(s -> {
                 rolled.add(tag);
-                writeLocal(toWrite);
+                s.write(toWrite);
             });
         }
 
         public void write(JsonNode statement, StrategySummary summary, Object tag)
         {
             final String toWrite = statement.toString() + System.lineSeparator();
-            loggerService.execute(() -> {
+            performWrite(s -> {
+                // a tag not yet recorded in rolled gets its summary written ahead of the statement
                 if (!rolled.contains(tag))
-                {
-                    writeLocal(summary.getSummary().toString() + System.lineSeparator());
-                    rolled.add(tag);
-                }
-                writeLocal(toWrite);
+                    s.write(summary.getSummary().toString() + System.lineSeparator());
+                rolled.add(tag);
+                s.write(toWrite);
             });
         }
+
+        public void close()
+        {
+            // stream is confined to the logger service thread: close it there, after any queued writes
+            loggerService.execute(() -> {
+                if (stream == null)
+                    return;
+                Throwable err = Throwables.close(null, stream);
+                stream = null;
+                if (err == null)
+                    return;
+                JVMStabilityInspector.inspectThrowable(err);
+                logger.error("Failed to close {}: {}", String.format("%s.%s", fileName, fileExt), err);
+            });
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index 2349689b330d..e590c894ffdb 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.db.compaction;
 
+import java.io.Closeable;
 import java.io.File;
 import java.io.IOException;
 import java.util.*;
@@ -30,6 +31,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Predicates;
 import com.google.common.collect.*;
 import com.google.common.util.concurrent.*;
 
@@ -49,7 +51,6 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.*;
-import org.apache.cassandra.db.compaction.CompactionInfo.Holder;
 import org.apache.cassandra.db.lifecycle.ILifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.SSTableIntervalTree;
@@ -135,7 +136,7 @@ protected Boolean initialValue()
     @VisibleForTesting
     final Multiset<ColumnFamilyStore> compactingCF = ConcurrentHashMultiset.create();
 
-    public final ActiveCompactions active = new ActiveCompactions();
+    public final ActiveOperations active = new ActiveOperations();
 
     // used to temporarily pause non-strategy managed compactions (like index summary redistribution)
     private final AtomicInteger globalCompactionPauseCount = new AtomicInteger(0);
@@ -236,9 +237,9 @@ public void forceShutdown()
         cacheCleanupExecutor.shutdown();
 
         // interrupt compactions and validations
-        for (Holder compactionHolder : active.getCompactions())
+        for (TableOperation operationSource : active.getTableOperations())
         {
-            compactionHolder.stop();
+            operationSource.stop();
         }
 
         // wait for tasks to terminate
@@ -594,25 +595,13 @@ public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
             }
 
             @Override
-            public void execute(LifecycleTransaction txn) throws IOException
+            public void execute(LifecycleTransaction txn)
             {
                 logger.debug("Garbage collecting {}", txn.originals());
-                CompactionTask task = new CompactionTask(cfStore, txn, getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()))
-                {
-                    @Override
-                    protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
-                    {
-                        return new CompactionController(cfStore, toCompact, gcBefore, null, tombstoneOption);
-                    }
-
-                    @Override
-                    protected int getLevel()
-                    {
-                        return txn.onlyOne().getSSTableLevel();
-                    }
-                };
-                task.setUserDefined(true);
-                task.setCompactionType(OperationType.GARBAGE_COLLECT);
+                AbstractCompactionTask task = CompactionTask.forGarbageCollection(cfStore,
+                                                                                  txn,
+                                                                                  getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()),
+                                                                                  tombstoneOption);
                 task.execute(active);
             }
         }, jobs, OperationType.GARBAGE_COLLECT);
@@ -855,8 +844,14 @@ public void performMaximal(final ColumnFamilyStore cfStore, boolean splitOutput)
         FBUtilities.waitOnFutures(submitMaximal(cfStore, getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()), splitOutput));
     }
 
-    @SuppressWarnings("resource") // the tasks are executed in parallel on the executor, making sure that they get closed
     public List<Future<?>> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore, boolean splitOutput)
+    {
+        return submitMaximal(cfStore, gcBefore, splitOutput, active);
+    }
+
+    @VisibleForTesting
+    @SuppressWarnings("resource") // the tasks are executed in parallel on the executor, making sure that they get closed
+    public List<Future<?>> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore, boolean splitOutput, TableOperationObserver obs)
     {
         // here we compute the task off the compaction executor, so having that present doesn't
         // confuse runWithCompactionsDisabled -- i.e., we don't want to deadlock ourselves, waiting
@@ -878,7 +873,7 @@ public List<Future<?>> submitMaximal(final ColumnFamilyStore cfStore, final int
             {
                 protected void runMayThrow()
                 {
-                    task.execute(active);
+                    task.execute(obs);
                 }
             };
 
@@ -1110,39 +1105,23 @@ public void disableAutoCompaction()
     }
 
     @VisibleForTesting
-    void scrubOne(ColumnFamilyStore cfs, LifecycleTransaction modifier, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, ActiveCompactionsTracker activeCompactions)
+    void scrubOne(ColumnFamilyStore cfs, LifecycleTransaction modifier, boolean skipCorrupted, boolean checkData, boolean reinsertOverflowedTTL, TableOperationObserver activeCompactions)
     {
-        CompactionInfo.Holder scrubInfo = null;
-
-        try (Scrubber scrubber = new Scrubber(cfs, modifier, skipCorrupted, checkData, reinsertOverflowedTTL))
+        try (Scrubber scrubber = new Scrubber(cfs, modifier, skipCorrupted, checkData, reinsertOverflowedTTL);
+             NonThrowingCloseable c = activeCompactions.onOperationStart(scrubber.getScrubInfo()))
         {
-            scrubInfo = scrubber.getScrubInfo();
-            activeCompactions.beginCompaction(scrubInfo);
             scrubber.scrub();
         }
-        finally
-        {
-            if (scrubInfo != null)
-                activeCompactions.finishCompaction(scrubInfo);
-        }
     }
 
     @VisibleForTesting
-    void verifyOne(ColumnFamilyStore cfs, SSTableReader sstable, Verifier.Options options, ActiveCompactionsTracker activeCompactions)
+    void verifyOne(ColumnFamilyStore cfs, SSTableReader sstable, Verifier.Options options, TableOperationObserver activeCompactions)
     {
-        CompactionInfo.Holder verifyInfo = null;
-
-        try (Verifier verifier = new Verifier(cfs, sstable, false, options))
+        try (Verifier verifier = new Verifier(cfs, sstable, false, options);
+             NonThrowingCloseable c = activeCompactions.onOperationStart(verifier.getVerifyInfo()))
         {
-            verifyInfo = verifier.getVerifyInfo();
-            activeCompactions.beginCompaction(verifyInfo);
             verifier.verify();
         }
-        finally
-        {
-            if (verifyInfo != null)
-                activeCompactions.finishCompaction(verifyInfo);
-        }
     }
 
     /**
@@ -1253,7 +1232,7 @@ private void doCleanupOne(final ColumnFamilyStore cfs,
              ISSTableScanner scanner = cleanupStrategy.getScanner(sstable);
              CompactionController controller = new CompactionController(cfs, txn.originals(), getDefaultGcBefore(cfs, nowInSec));
              Refs<SSTableReader> refs = Refs.ref(Collections.singleton(sstable));
-             CompactionIterator ci = new CompactionIterator(OperationType.CLEANUP, Collections.singletonList(scanner), controller, nowInSec, UUIDGen.getTimeUUID(), active))
+             CompactionIterator ci = new CompactionIterator(OperationType.CLEANUP, Collections.singletonList(scanner), controller, nowInSec, UUIDGen.getTimeUUID()))
         {
             StatsMetadata metadata = sstable.getSSTableMetadata();
             writer.switchWriter(createWriter(cfs, compactionFileLocation, expectedBloomFilterSize, metadata.repairedAt, metadata.pendingRepair, metadata.isTransient, sstable, txn));
@@ -1602,44 +1581,47 @@ public void close() {}
 
              AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(txn.originals());
              CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs, nowInSec));
-             CompactionIterator ci = getAntiCompactionIterator(scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), active, isCancelled))
+             CompactionIterator ci = getAntiCompactionIterator(scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), isCancelled))
         {
-            int expectedBloomFilterSize = Math.max(cfs.metadata().params.minIndexInterval, (int)(SSTableReader.getApproximateKeyCount(sstableAsSet)));
-
-            fullWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, false, sstableAsSet, txn));
-            transWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, true, sstableAsSet, txn));
-            unrepairedWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, NO_PENDING_REPAIR, false, sstableAsSet, txn));
+            TableOperation op = ci.getOperation();
+            try (NonThrowingCloseable cls = active.onOperationStart(op))
+            {
+                int expectedBloomFilterSize = Math.max(cfs.metadata().params.minIndexInterval, (int)(SSTableReader.getApproximateKeyCount(sstableAsSet)));
 
-            Predicate<Token> fullChecker = !ranges.onlyFull().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyFull().ranges()) : t -> false;
-            Predicate<Token> transChecker = !ranges.onlyTransient().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyTransient().ranges()) : t -> false;
-            double compressionRatio = scanners.getCompressionRatio();
-            if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
-                compressionRatio = 1.0;
+                fullWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, false, sstableAsSet, txn));
+                transWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, pendingRepair, true, sstableAsSet, txn));
+                unrepairedWriter.switchWriter(CompactionManager.createWriterForAntiCompaction(cfs, destination, expectedBloomFilterSize, UNREPAIRED_SSTABLE, NO_PENDING_REPAIR, false, sstableAsSet, txn));
 
-            long lastBytesScanned = 0;
+                Predicate<Token> fullChecker = !ranges.onlyFull().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyFull().ranges()) : t -> false;
+                Predicate<Token> transChecker = !ranges.onlyTransient().isEmpty() ? new Range.OrderedRangeContainmentChecker(ranges.onlyTransient().ranges()) : t -> false;
+                double compressionRatio = scanners.getCompressionRatio();
+                if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
+                    compressionRatio = 1.0;
 
-            while (ci.hasNext())
-            {
-                try (UnfilteredRowIterator partition = ci.next())
+                long lastBytesScanned = 0;
+                while (ci.hasNext())
                 {
-                    Token token = partition.partitionKey().getToken();
-                    // if this row is contained in the full or transient ranges, append it to the appropriate sstable
-                    if (fullChecker.test(token))
+                    try (UnfilteredRowIterator partition = ci.next())
                     {
-                        fullWriter.append(partition);
-                    }
-                    else if (transChecker.test(token))
-                    {
-                        transWriter.append(partition);
-                    }
-                    else
-                    {
-                        // otherwise, append it to the unrepaired sstable
-                        unrepairedWriter.append(partition);
+                        Token token = partition.partitionKey().getToken();
+                        // if this row is contained in the full or transient ranges, append it to the appropriate sstable
+                        if (fullChecker.test(token))
+                        {
+                            fullWriter.append(partition);
+                        }
+                        else if (transChecker.test(token))
+                        {
+                            transWriter.append(partition);
+                        }
+                        else
+                        {
+                            // otherwise, append it to the unrepaired sstable
+                            unrepairedWriter.append(partition);
+                        }
+                        long bytesScanned = scanners.getTotalBytesScanned();
+                        if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                            lastBytesScanned = bytesScanned;
                     }
-                    long bytesScanned = scanners.getTotalBytesScanned();
-                    if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
-                        lastBytesScanned = bytesScanned;
                 }
             }
 
@@ -1685,33 +1667,53 @@ else if (transChecker.test(token))
     }
 
     @VisibleForTesting
-    public static CompactionIterator getAntiCompactionIterator(List<ISSTableScanner> scanners, CompactionController controller, int nowInSec, UUID timeUUID, ActiveCompactionsTracker activeCompactions, BooleanSupplier isCancelled)
+    public static CompactionIterator getAntiCompactionIterator(List<ISSTableScanner> scanners, CompactionController controller, int nowInSec, UUID timeUUID, BooleanSupplier isCancelled)
     {
-        return new CompactionIterator(OperationType.ANTICOMPACTION, scanners, controller, nowInSec, timeUUID, activeCompactions) {
+        return new CompactionIterator(OperationType.ANTICOMPACTION, scanners, controller, nowInSec, timeUUID) {
+            @Override
+            public TableOperation createOperation()
+            {
+                return getAntiCompactionOperation(super.createOperation(), isCancelled);
+            }
+        };
+    }
 
+    @VisibleForTesting
+    public static TableOperation getAntiCompactionOperation(TableOperation compaction, BooleanSupplier isCancelled)
+    {
+        return new AbstractTableOperation()
+        {
+            @Override
+            public boolean isGlobal()
+            {
+                return false;
+            }
+
+            @Override
+            public OperationProgress getProgress()
+            {
+                return compaction.getProgress();
+            }
+
+            @Override
             public boolean isStopRequested()
             {
-                return super.isStopRequested() || isCancelled.getAsBoolean();
+                return compaction.isStopRequested() || isCancelled.getAsBoolean();
             }
         };
     }
 
     @VisibleForTesting
-    ListenableFuture<?> submitIndexBuild(final SecondaryIndexBuilder builder, ActiveCompactionsTracker activeCompactions)
+    ListenableFuture<?> submitIndexBuild(final SecondaryIndexBuilder builder, TableOperationObserver activeCompactions)
     {
         Runnable runnable = new Runnable()
         {
             public void run()
             {
-                activeCompactions.beginCompaction(builder);
-                try
+                try (NonThrowingCloseable c = activeCompactions.onOperationStart(builder))
                 {
                     builder.build();
                 }
-                finally
-                {
-                    activeCompactions.finishCompaction(builder);
-                }
             }
         };
 
@@ -1731,7 +1733,7 @@ public Future<?> submitCacheWrite(final AutoSavingCache.Writer writer)
         return submitCacheWrite(writer, active);
     }
 
-    Future<?> submitCacheWrite(final AutoSavingCache.Writer writer, ActiveCompactionsTracker activeCompactions)
+    Future<?> submitCacheWrite(final AutoSavingCache.Writer writer, TableOperationObserver activeCompactions)
     {
         Runnable runnable = new Runnable()
         {
@@ -1739,20 +1741,15 @@ public void run()
             {
                 if (!AutoSavingCache.flushInProgress.add(writer.cacheType()))
                 {
-                    logger.trace("Cache flushing was already in progress: skipping {}", writer.getCompactionInfo());
+                    logger.trace("Cache flushing was already in progress: skipping {}", writer.getProgress());
                     return;
                 }
                 try
                 {
-                    activeCompactions.beginCompaction(writer);
-                    try
+                    try (NonThrowingCloseable c = activeCompactions.onOperationStart(writer))
                     {
                         writer.saveCache();
                     }
-                    finally
-                    {
-                        activeCompactions.finishCompaction(writer);
-                    }
                 }
                 finally
                 {
@@ -1770,17 +1767,12 @@ public List<SSTableReader> runIndexSummaryRedistribution(IndexSummaryRedistribut
     }
 
     @VisibleForTesting
-    List<SSTableReader> runIndexSummaryRedistribution(IndexSummaryRedistribution redistribution, ActiveCompactionsTracker activeCompactions) throws IOException
+    List<SSTableReader> runIndexSummaryRedistribution(IndexSummaryRedistribution redistribution, TableOperationObserver activeCompactions) throws IOException
     {
-        activeCompactions.beginCompaction(redistribution);
-        try
+        try(Closeable c = activeCompactions.onOperationStart(redistribution))
         {
             return redistribution.redistributeSummaries();
         }
-        finally
-        {
-            activeCompactions.finishCompaction(redistribution);
-        }
     }
 
     public static int getDefaultGcBefore(ColumnFamilyStore cfs, int nowInSec)
@@ -1796,24 +1788,19 @@ public ListenableFuture<Long> submitViewBuilder(final ViewBuilderTask task)
     }
 
     @VisibleForTesting
-    ListenableFuture<Long> submitViewBuilder(final ViewBuilderTask task, ActiveCompactionsTracker activeCompactions)
+    ListenableFuture<Long> submitViewBuilder(final ViewBuilderTask task, TableOperationObserver activeCompactions)
     {
         return viewBuildExecutor.submitIfRunning(() -> {
-            activeCompactions.beginCompaction(task);
-            try
+            try(Closeable c = activeCompactions.onOperationStart(task))
             {
                 return task.call();
             }
-            finally
-            {
-                activeCompactions.finishCompaction(task);
-            }
         }, "view build");
     }
 
     public int getActiveCompactions()
     {
-        return active.getCompactions().size();
+        return active.getTableOperations().size();
     }
 
     static class CompactionExecutor extends JMXEnabledThreadPoolExecutor
@@ -1977,19 +1964,19 @@ public void incrementSstablesDropppedFromCompactions(long num)
 
     public List<Map<String, String>> getCompactions()
     {
-        List<Holder> compactionHolders = active.getCompactions();
-        List<Map<String, String>> out = new ArrayList<Map<String, String>>(compactionHolders.size());
-        for (CompactionInfo.Holder ci : compactionHolders)
-            out.add(ci.getCompactionInfo().asMap());
+        List<TableOperation> operationSources = active.getTableOperations();
+        List<Map<String, String>> out = new ArrayList<Map<String, String>>(operationSources.size());
+        for (TableOperation op : operationSources)
+            out.add(op.getProgress().asMap());
         return out;
     }
 
     public List<String> getCompactionSummary()
     {
-        List<Holder> compactionHolders = active.getCompactions();
-        List<String> out = new ArrayList<String>(compactionHolders.size());
-        for (CompactionInfo.Holder ci : compactionHolders)
-            out.add(ci.getCompactionInfo().toString());
+        List<TableOperation> operationSources = active.getTableOperations();
+        List<String> out = new ArrayList<String>(operationSources.size());
+        for (TableOperation ci : operationSources)
+            out.add(ci.getProgress().toString());
         return out;
     }
 
@@ -2028,20 +2015,20 @@ public long getCompletedTasks()
     public void stopCompaction(String type)
     {
         OperationType operation = OperationType.valueOf(type);
-        for (Holder holder : active.getCompactions())
+        for (TableOperation operationSource : active.getTableOperations())
         {
-            if (holder.getCompactionInfo().getTaskType() == operation)
-                holder.stop();
+            if (operationSource.getProgress().operationType() == operation)
+                operationSource.stop();
         }
     }
 
     public void stopCompactionById(String compactionId)
     {
-        for (Holder holder : active.getCompactions())
+        for (TableOperation operationSource : active.getTableOperations())
         {
-            UUID holderId = holder.getCompactionInfo().getTaskId();
+            UUID holderId = operationSource.getProgress().operationId();
             if (holderId != null && holderId.equals(UUID.fromString(compactionId)))
-                holder.stop();
+                operationSource.stop();
         }
     }
 
@@ -2200,15 +2187,15 @@ public boolean interruptCompactionFor(Iterable<TableMetadata> tables, Predicate<
         assert tables != null;
 
         // interrupt in-progress compactions
-        Set<Holder> interrupted = new HashSet<>();
-        for (Holder compactionHolder : active.getCompactions())
+        Set<TableOperation> interrupted = new HashSet<>();
+        for (TableOperation operationSource : active.getTableOperations())
         {
-            CompactionInfo info = compactionHolder.getCompactionInfo();
+            AbstractTableOperation.OperationProgress info = operationSource.getProgress();
 
-            if (Iterables.contains(tables, info.getTableMetadata()) && opPredicate.test(info.getTaskType()))
+            if (Iterables.contains(tables, info.metadata()) && opPredicate.test(info.operationType()))
             {
-                compactionHolder.stop();
-                interrupted.add(compactionHolder);
+                operationSource.stop();
+                interrupted.add(operationSource);
             }
         }
 
@@ -2218,22 +2205,22 @@ public boolean interruptCompactionFor(Iterable<TableMetadata> tables, Predicate<
             long start = System.nanoTime();
             long wait = TimeUnit.MINUTES.toNanos(2);
 
-            for (Holder operation : interrupted)
+            for (TableOperation operation : interrupted)
             {
                 while (active.isActive(operation) && System.nanoTime() - start < wait)
                     Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
 
                 if (active.isActive(operation))
-                    throw new RuntimeException(String.format("Compaction task (%s) didn't finish within 2 minutes", operation.getCompactionInfo()));
+                    throw new RuntimeException(String.format("Compaction task (%s) didn't finish within 2 minutes", operation.getProgress()));
             }
         }
 
         return !interrupted.isEmpty();
     }
 
-    public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
+    public boolean interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
     {
-        interruptCompactionFor(columnFamilies, sstablePredicate, interruptValidation, CompactionInfo.StopTrigger.NONE);
+        return interruptCompactionFor(columnFamilies, sstablePredicate, interruptValidation, AbstractTableOperation.StopTrigger.NONE);
     }
     /**
      * Try to stop all of the compactions for given ColumnFamilies.
@@ -2244,32 +2231,43 @@ public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predi
      * @param columnFamilies The ColumnFamilies to try to stop compaction upon.
      * @param sstablePredicate the sstable predicate to match on
      * @param interruptValidation true if validation operations for repair should also be interrupted
+     * @return true if any compaction has been interrupted, false otherwise.
      */
-    public void interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, CompactionInfo.StopTrigger trigger)
+    public boolean interruptCompactionFor(Iterable<TableMetadata> columnFamilies, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, AbstractTableOperation.StopTrigger trigger)
     {
         assert columnFamilies != null;
 
         // interrupt in-progress compactions
-        for (Holder compactionHolder : active.getCompactions())
+        boolean interrupted = false;
+        for (TableOperation operationSource : active.getTableOperations())
         {
-            CompactionInfo info = compactionHolder.getCompactionInfo();
-            if ((info.getTaskType() == OperationType.VALIDATION) && !interruptValidation)
+            AbstractTableOperation.OperationProgress info = operationSource.getProgress();
+            if ((info.operationType() == OperationType.VALIDATION) && !interruptValidation)
                 continue;
 
-            if (info.getTableMetadata() == null || Iterables.contains(columnFamilies, info.getTableMetadata()))
+            if (info.metadata() == null || Iterables.contains(columnFamilies, info.metadata()))
             {
-                if (info.shouldStop(sstablePredicate))
-                    compactionHolder.stop(trigger);
+                if (operationSource.shouldStop(sstablePredicate))
+                {
+                    operationSource.stop(trigger);
+                    interrupted = true;
+                }
             }
         }
+        return interrupted;
+    }
+
+    public boolean interruptCompactionFor(Iterable<TableMetadata> tables)
+    {
+        return interruptCompactionFor(tables, Predicates.alwaysTrue(), true);
     }
 
     public void interruptCompactionForCFs(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation)
     {
-        interruptCompactionForCFs(cfss, sstablePredicate, interruptValidation, CompactionInfo.StopTrigger.NONE);
+        interruptCompactionForCFs(cfss, sstablePredicate, interruptValidation, AbstractTableOperation.StopTrigger.NONE);
     }
 
-    public void interruptCompactionForCFs(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, CompactionInfo.StopTrigger trigger)
+    public void interruptCompactionForCFs(Iterable<ColumnFamilyStore> cfss, Predicate<SSTableReader> sstablePredicate, boolean interruptValidation, AbstractTableOperation.StopTrigger trigger)
     {
         List<TableMetadata> metadata = new ArrayList<>();
         for (ColumnFamilyStore cfs : cfss)
@@ -2293,14 +2291,14 @@ public void waitForCessation(Iterable<ColumnFamilyStore> cfss, Predicate<SSTable
     }
 
 
-    public List<CompactionInfo> getSSTableTasks()
+    public List<AbstractTableOperation.OperationProgress> getSSTableTasks()
     {
-        return active.getCompactions()
+        return active.getTableOperations()
                      .stream()
-                     .map(CompactionInfo.Holder::getCompactionInfo)
-                     .filter(task -> task.getTaskType() != OperationType.COUNTER_CACHE_SAVE
-                                     && task.getTaskType() != OperationType.KEY_CACHE_SAVE
-                                     && task.getTaskType() != OperationType.ROW_CACHE_SAVE)
+                     .map(TableOperation::getProgress)
+                     .filter(progress -> progress.operationType() != OperationType.COUNTER_CACHE_SAVE
+                                     && progress.operationType() != OperationType.KEY_CACHE_SAVE
+                                     && progress.operationType() != OperationType.ROW_CACHE_SAVE)
                      .collect(Collectors.toList());
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java b/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
new file mode 100644
index 000000000000..0b0fec5e887b
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.UUID;
+
+/**
+ * An observer of a compaction operation. It is notified when a compaction is submitted
+ * ({@link #setSubmitted}), when it starts ({@link #setInProgress}) and when it finishes
+ * ({@link #setCompleted}); none of these callbacks returns a value.
+ * <p>
+ * The progress can be queried at any time to obtain real-time updates of the compaction operation.
+ */
+public interface CompactionObserver
+{
+    CompactionObserver NO_OP = new CompactionObserver()
+    {
+        @Override
+        public void setSubmitted(UUID id, CompactionAggregate compaction) { }
+
+        @Override
+        public void setInProgress(CompactionProgress progress) { }
+
+        @Override
+        public void setCompleted(UUID id) { }
+    };
+
+    /**
+     * Indicates that a compaction with the given id has been submitted for the given aggregate.
+     *
+     * @param id the id of the compaction
+     * @param compaction the compaction aggregate the compaction is part of
+     */
+    void setSubmitted(UUID id, CompactionAggregate compaction);
+
+    /**
+     * Indicates that a compaction has started.
+     *
+     * @param progress the compaction progress; it contains the unique id and real-time progress information
+     */
+    void setInProgress(CompactionProgress progress);
+
+    /**
+     * Indicates that a compaction with the given id has completed.
+     *
+     * @param id the id of the compaction
+     */
+    void setCompleted(UUID id);
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionPick.java b/src/java/org/apache/cassandra/db/compaction/CompactionPick.java
new file mode 100644
index 000000000000..3200ccc0b4b8
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionPick.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.Objects;
+import java.util.UUID;
+import java.util.concurrent.CopyOnWriteArraySet;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+/**
+ * A set of sstables that were picked for compaction along with some other relevant properties.
+ * <p/>
+ * This is a list of sstables that should be compacted together after having been picked by a compaction strategy,
+ * for example from a bucket in {@link SizeTieredCompactionStrategy} or from a level in {@link LeveledCompactionStrategy}.
+ * Also, it contains other useful parameters such as a score that was assigned to this candidate (the read hotness or level
+ * score depending on the strategy) and the level, if applicable.
+ **/
+class CompactionPick
+{
+    final static CompactionPick EMPTY = create(-1, new CopyOnWriteArraySet<>(), 0);
+
+    /** The key to the parent compaction aggregate, e.g. a level number or tier avg size, -1 if no parent */
+    final long parent;
+
+    /** The sstables to be compacted */
+    final CopyOnWriteArraySet<SSTableReader> sstables;
+
+    /** The sum of all the sstable hotness scores */
+    final double hotness;
+
+    /** The average size in bytes for the sstables in this compaction */
+    final long avgSizeInBytes;
+
+    /** The unique compaction id, this is only available when a compaction is submitted */
+    @Nullable
+    volatile UUID id;
+
+    /** The compaction progress, this is only available when compaction actually starts and will be null as long as
+     * the candidate is still pending execution, also some tasks cannot report a progress at all, e.g. {@link SingleSSTableLCSTask}.
+     * */
+    @Nullable volatile CompactionProgress progress;
+
+    /** Set to true when the compaction has completed */
+    volatile boolean completed;
+
+    private CompactionPick(long parent, Collection<SSTableReader> sstables, double hotness, long avgSizeInBytes)
+    {
+        this.parent = parent;
+        this.sstables = new CopyOnWriteArraySet<>(sstables);
+        this.hotness = hotness;
+        this.avgSizeInBytes = avgSizeInBytes;
+    }
+
+    /**
+     * Create a pending compaction candidate calculating hotness and avg size.
+     */
+    static CompactionPick create(long parent, Collection<SSTableReader> sstables)
+    {
+        return create(parent, sstables, CompactionAggregate.getTotHotness(sstables), CompactionAggregate.getAvgSizeBytes(sstables));
+    }
+
+    /**
+     * Create a pending compaction candidate calculating avg size.
+     */
+    static CompactionPick create(long parent, Collection<SSTableReader> sstables, double hotness)
+    {
+        return create(parent, sstables, hotness, CompactionAggregate.getAvgSizeBytes(sstables));
+    }
+
+    /**
+     * Create a pending compaction candidate with the given parameters.
+     */
+    static CompactionPick create(long parent, Collection<SSTableReader> sstables, double hotness, long avgSizeInBytes)
+    {
+        return new CompactionPick(parent, sstables, hotness, avgSizeInBytes);
+    }
+
+    /**
+     * Create new compaction pick similar to the one provided but with a new parent.
+     */
+    static CompactionPick create(long parent, CompactionPick pick)
+    {
+        return new CompactionPick(parent, pick.sstables, pick.hotness, pick.avgSizeInBytes);
+    }
+
+    public double hotness()
+    {
+        return hotness;
+    }
+
+    public long avgSizeInBytes()
+    {
+        return avgSizeInBytes;
+    }
+
+    void setSubmitted(UUID id)
+    {
+        if (id == null)
+            throw new IllegalArgumentException("Id cannot be null");
+
+        if (this.id != null)
+            throw new IllegalStateException("Already submitted");
+
+        this.id = id;
+    }
+    /**
+     * Set the compaction progress, this means the compaction pick has started executing.
+     */
+    void setProgress(CompactionProgress progress)
+    {
+        if (progress == null)
+            throw new IllegalArgumentException("Progress cannot be null");
+
+        if (this.progress != null)
+            throw new IllegalStateException("Already compacting");
+
+        if (this.id == null)
+            setSubmitted(progress.operationId()); // never submitted explicitly: adopt the id carried by the progress
+        else if (!this.id.equals(progress.operationId())) // compare UUIDs by value, equal ids may be distinct objects
+            throw new IllegalStateException("Submitted with a different id");
+
+        this.progress = progress;
+    }
+
+    void setCompleted()
+    {
+        if (this.completed)
+            throw new IllegalStateException("Already completed");
+
+        this.completed = true;
+    }
+
+    /**
+     * Add more sstables to the collection of sstables initially picked.
+     * <p/>
+     * This is currently used by {@link TimeWindowCompactionStrategy} to add expired sstables.
+     *
+     * @param sstables the sstables to add
+     */
+    CompactionPick withAddedSSTables(Collection<SSTableReader> sstables)
+    {
+        ImmutableList.Builder<SSTableReader> builder = ImmutableList.builder();
+        builder.addAll(this.sstables);
+        builder.addAll(sstables);
+        // hotness and average size must describe the whole resulting pick, not only the sstables being added
+        return create(parent, builder.build());
+    }
+
+    /**
+     * @return true if this compaction candidate is empty, that is it has no sstables to compact.
+     */
+    boolean isEmpty()
+    {
+        return sstables.isEmpty();
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(parent, sstables);
+    }
+
+    @Override
+    public boolean equals(Object obj)
+    {
+        if (obj == this)
+            return true;
+
+        if (!(obj instanceof CompactionPick))
+            return false;
+
+        CompactionPick that = (CompactionPick) obj;
+
+        // a pick is the same if the sstables are the same given that the other properties are derived from sstables and two
+        // picks are the same whether compaction has started or not so the progress and completed properties should not determine equality
+        return parent == that.parent && sstables.equals(that.sstables);
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Parent: %d, Hotness: %f, Avg size in bytes: %d, id: %s, sstables: %s", parent, hotness, avgSizeInBytes, id, sstables);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java b/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
new file mode 100644
index 000000000000..f6b38511bb3f
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+/**
+ * The progress information for a compaction operation. This adds compaction
+ * specific information to {@link TableOperation.Progress}.
+ */
+public interface CompactionProgress extends TableOperation.Progress
+{
+    /**
+     * The compaction strategy if available, otherwise null.
+     * <p/>
+     * The compaction strategy may not be available for some operations that use compaction task such
+     * as GC or sstable splitting.
+     *
+     * @return the compaction strategy when available or null.
+     */
+    @Nullable AbstractCompactionStrategy strategy();
+
+    /**
+     * @return true if the compaction was requested to interrupt
+     */
+    boolean isStopRequested();
+
+    /**
+     * @return input sstables
+     */
+    Collection<SSTableReader> inSSTables();
+
+    /**
+     * @return output sstables
+     */
+    Collection<SSTableReader> outSSTables();
+
+    /**
+     * @return Size on disk (compressed) of the input sstables.
+     */
+    long inputDiskSize();
+
+    /**
+     * @return The uncompressed size of the input sstables.
+     */
+    long inputUncompressedSize();
+
+    /** Same as {@link #inputDiskSize()} except for LCS where it estimates
+     * the compressed size for number of keys that will be read from the input sstables,
+     * see {@link org.apache.cassandra.db.compaction.LeveledCompactionStrategy}. */
+    long adjustedInputDiskSize();
+
+    /**
+     * @return Size on disk (compressed) of the output sstables.
+     */
+    long outputDiskSize();
+
+    /**
+     * @return the number of bytes processed by the compaction iterator. For compressed or encrypted sstables,
+     *         this is the number of bytes processed by the iterator after decompression, so this is the current
+     *         position in the uncompressed sstable files.
+     */
+    long uncompressedBytesRead();
+
+    /**
+     * @return the number of bytes processed by the compaction iterator for sstables on the specified level.
+     *         For compressed or encrypted sstables, this is the number of bytes processed by the iterator after decompression,
+     *         so this is the current position in the uncompressed sstable files.
+     */
+    long uncompressedBytesRead(int level);
+
+    /**
+     * @return the number of bytes that were written before compression is applied (uncompressed size).
+     */
+    long uncompressedBytesWritten();
+
+    /**
+     * @return the duration so far in nanoseconds.
+     */
+    long durationInNanos();
+
+    /**
+     * @return total number of partitions read
+     */
+    long partitionsRead();
+
+    /**
+     * @return total number of rows read
+     */
+    long rowsRead();
+
+    /**
+     * The partitions histogram maps the number of sstables to the number of partitions that were merged with that number of input sstables.
+     *
+     * @return the partitions histogram
+     */
+    long[] partitionsHistogram();
+
+    /**
+     * The rows histogram maps the number of sstables to the number of rows that were merged with that number of input sstables.
+     *
+     * @return the rows histogram
+     */
+    long[] rowsHistogram();
+
+    /**
+     * @return the ratio of bytes before and after compaction, using the adjusted input and output disk sizes (uncompressed values).
+     */
+    double sizeRatio();
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
new file mode 100644
index 000000000000..fa6883096530
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.UUID;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * A container for the statistics of a single compaction event:
+ * input and output size, duration, before and after partition and row counters, etc.
+ */
+public class CompactionStatistics
+{
+    private final CompactionStrategyManager strategyManager;
+
+    public final TableMetadata metadata;
+    public final OperationType tasktype;
+    public final UUID compactionId;
+
+    public final Collection<SSTableReader> startSstables;
+    public final long startSizeBytes;
+
+    public long estInputSizeBytes;
+    public long totalSourceRows;
+    public long totalSourcePartitions;
+
+    public long[] mergedPartitionCounts;
+    public long[] mergedRowsCounts;
+
+    public Collection<SSTableReader> endSstables;
+    public long endSizeBytes;
+
+    public long bytesRead;
+    public long bytesWritten;
+
+    public long durationInNanos;
+    public boolean stopRequested;
+
+    CompactionStatistics(ColumnFamilyStore cfs, OperationType tasktype, UUID compactionId, Collection<SSTableReader> startSstables)
+    {
+        this.strategyManager = cfs.getCompactionStrategyManager();
+
+        this.metadata = cfs.metadata();
+        this.tasktype = tasktype;
+        this.compactionId = compactionId;
+        this.startSstables = startSstables;
+        this.startSizeBytes = SSTableReader.getTotalBytes(startSstables);
+        this.estInputSizeBytes = this.startSizeBytes;
+
+        this.totalSourceRows = 0;
+        this.totalSourcePartitions = 0;
+        this.mergedPartitionCounts = new long[0];
+        this.mergedRowsCounts = new long[0];
+        this.endSizeBytes = 0;
+        this.bytesRead = 0;
+        this.bytesWritten = 0;
+        this.durationInNanos = 0;
+        this.stopRequested = false;
+    }
+
+    double sizeRatio()
+    {
+        if (estInputSizeBytes > 0)
+            return endSizeBytes / (double) estInputSizeBytes;
+
+        // this is a valid case, when there are no sstables to actually compact
+        // the previous code would return a NaN that would be logged as zero
+        return 0;
+    }
+
+    void setEndSstables(Collection<SSTableReader> endSstables)
+    {
+        this.endSstables = endSstables;
+        this.endSizeBytes = SSTableReader.getTotalBytes(endSstables);
+    }
+
+    public AbstractCompactionStrategy getStrategyFor(SSTableReader ssTableReader)
+    {
+        return strategyManager.getCompactionStrategyFor(ssTableReader);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
index 963dafee22c8..b9e382fc631f 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
@@ -176,7 +176,11 @@ public int getIndexForSSTableDirectory(Descriptor descriptor)
         this.partitionSSTablesByTokenRange = partitionSSTablesByTokenRange;
         params = cfs.metadata().params.compaction;
         enabled = params.isEnabled();
-        reload(cfs.metadata().params.compaction);
+    }
+
+    CompactionLogger compactionLogger()
+    {
+        return compactionLogger;
     }
 
     /**
@@ -253,7 +257,7 @@ AbstractCompactionTask findUpgradeSSTableTask()
             if (txn != null)
             {
                 logger.debug("Running automatic sstable upgrade for {}", sstable);
-                return getCompactionStrategyFor(sstable).getCompactionTask(txn, Integer.MIN_VALUE, Long.MAX_VALUE);
+                return getCompactionStrategyFor(sstable).createCompactionTask(txn, Integer.MIN_VALUE, Long.MAX_VALUE);
             }
         }
         return null;
@@ -469,6 +473,22 @@ public void maybeReload(TableMetadata metadata)
         }
     }
 
+    /**
+     * Version of the above forcing the strategy to always be reloaded. Used by tests that need to clear the state.
+     */
+    public void forceReload()
+    {
+        writeLock.lock();
+        try
+        {
+            reload(schemaCompactionParams);
+        }
+        finally
+        {
+            writeLock.unlock();
+        }
+    }
+
     /**
      * Checks if the disk boundaries changed and reloads the compaction strategies
      * to reflect the most up-to-date disk boundaries.
@@ -506,7 +526,7 @@ protected void maybeReloadDiskBoundaries()
      * Called after changing configuration and at startup.
      * @param newCompactionParams
      */
-    private void reload(CompactionParams newCompactionParams)
+    public void reload(CompactionParams newCompactionParams)
     {
         boolean enabledWithJMX = enabled && !shouldBeEnabled();
         boolean disabledWithJMX = !enabled && shouldBeEnabled();
@@ -884,7 +904,7 @@ public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gc
         try
         {
             validateForCompaction(txn.originals());
-            return compactionStrategyFor(txn.originals().iterator().next()).getCompactionTask(txn, gcBefore, maxSSTableBytes);
+            return compactionStrategyFor(txn.originals().iterator().next()).createCompactionTask(txn, gcBefore, maxSSTableBytes);
         }
         finally
         {
@@ -1002,7 +1022,14 @@ public String getName()
 
     public List<List<AbstractCompactionStrategy>> getStrategies()
     {
-        maybeReloadDiskBoundaries();
+        return getStrategies(true);
+    }
+
+    private List<List<AbstractCompactionStrategy>> getStrategies(boolean checkBoundaries)
+    {
+        if (checkBoundaries)
+            maybeReloadDiskBoundaries();
+
         readLock.lock();
         try
         {
@@ -1016,6 +1043,18 @@ public List<List<AbstractCompactionStrategy>> getStrategies()
         }
     }
 
+    /**
+     * @return the statistics for the compaction strategies that have compactions in progress or pending
+     */
+    public List<CompactionStrategyStatistics> getStrategyStatistics()
+    {
+        return getStrategies(false).stream()
+                                   .flatMap(list -> list.stream())
+                                   .filter(strategy -> strategy.getTotalCompactions() > 0)
+                                   .map(AbstractCompactionStrategy::getStatistics)
+                                   .collect(Collectors.toList());
+    }
+
     public void setNewLocalCompactionStrategy(CompactionParams params)
     {
         logger.info("Switching local compaction strategy from {} to {}}", this.params, params);
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
new file mode 100644
index 000000000000..e1d24d044f11
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * The statistics for a compaction strategy, to be published over JMX and insights.
+ * <p/>
+ * Implements serializable to allow structured info to be returned via JMX.  The JSON
+ * properties are published to insights so changing them has a downstream impact.
+ */
+public class CompactionStrategyStatistics implements Serializable
+{
+    private static final long serialVersionUID = 3695927592357744816L;
+
+    private final String keyspace;
+    private final String table;
+    private final String strategy;
+    private final List<CompactionAggregateStatistics> aggregates;
+
+    CompactionStrategyStatistics(TableMetadata metadata,
+                                 String strategy,
+                                 List<CompactionAggregateStatistics> aggregates)
+    {
+        this.keyspace = metadata.keyspace;
+        this.table = metadata.name;
+        this.strategy = strategy;
+        this.aggregates = new ArrayList<>(aggregates);
+    }
+
+    public String keyspace()
+    {
+        return keyspace;
+    }
+
+    public String table()
+    {
+        return table;
+    }
+
+    @JsonProperty
+    public String strategy()
+    {
+        return strategy;
+    }
+
+    @JsonProperty
+    public List<CompactionAggregateStatistics> aggregates()
+    {
+        return aggregates;
+    }
+
+    @Override
+    public String toString()
+    {
+        StringBuilder ret = new StringBuilder(1024);
+        ret.append(keyspace)
+           .append('.')
+           .append(table)
+           .append('/')
+           .append(strategy)
+           .append('\n');
+
+        if (!aggregates.isEmpty())
+        {
+            Collection<String> header = aggregates.get(0).header(); // all headers are identical
+            String[][] rows = new String[1 + aggregates.size()][header.size()]; // rows including the header
+            int[] lengths = new int[header.size()]; // the max lengths of each column
+
+            Iterator<String> it = header.iterator();
+            for (int i = 0; i < lengths.length; i++)
+            {
+                rows[0][i] = it.next();
+                lengths[i] = rows[0][i].length();
+            }
+
+            for (int idx = 1; idx <= aggregates.size(); idx++)
+            {
+                it = aggregates.get(idx-1).data().iterator();
+                for (int i = 0; i < lengths.length; i++)
+                {
+                    rows[idx][i] = it.next();
+                    if (rows[idx][i].length() > lengths[i])
+                        lengths[i] = rows[idx][i].length();
+                }
+            }
+
+            for (String[] row : rows)
+            {
+                for (int i = 0; i < row.length; i++)
+                    ret.append(String.format("%-" + lengths[i] + "s\t", row[i]));
+
+                ret.append('\n');
+            }
+        }
+
+        return ret.toString();
+    }
+
+    Collection<String> getHeader()
+    {
+        return aggregates.isEmpty() ? ImmutableList.of() : aggregates.get(0).header();
+    }
+
+    Collection<Collection<String>> getData()
+    {
+        return aggregates.stream().map(CompactionAggregateStatistics::data).collect(Collectors.toList());
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index b1c0869e190f..b93fb301dfaf 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -17,15 +17,20 @@
  */
 package org.apache.cassandra.db.compaction;
 
+import java.io.Closeable;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
+import javax.annotation.Nullable;
+
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
 import com.google.common.util.concurrent.RateLimiter;
@@ -43,8 +48,11 @@
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Throwables;
 import org.apache.cassandra.utils.concurrent.Refs;
 
 import static org.apache.cassandra.db.compaction.CompactionManager.compactionRateLimiterAcquire;
@@ -54,22 +62,103 @@
 public class CompactionTask extends AbstractCompactionTask
 {
     protected static final Logger logger = LoggerFactory.getLogger(CompactionTask.class);
+
     protected final int gcBefore;
     protected final boolean keepOriginals;
     /** for trace logging purposes only */
     private static final AtomicLong totalBytesCompacted = new AtomicLong();
-    private ActiveCompactionsTracker activeCompactions;
 
-    public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
+    // The compaction strategy is not necessarily available for all compaction tasks (e.g. GC or sstable splitting)
+    @Nullable
+    private final AbstractCompactionStrategy strategy;
+
+    /**
+     * This constructs a compaction task for operations that do not normally have a compaction strategy, such as
+     * tombstone collection or table splitting, and for tests.
+     */
+    protected CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean keepOriginals)
     {
-        this(cfs, txn, gcBefore, false);
+        this(cfs, txn, gcBefore, keepOriginals, CompactionObserver.NO_OP, null);
     }
 
-    public CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean keepOriginals)
+    /**
+     * This constructs a compaction task that has been created by a compaction strategy, which must not be null.
+     */
+    protected CompactionTask(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore, boolean keepOriginals)
+    {
+        this(strategy.cfs, txn, gcBefore, keepOriginals, strategy.getBackgroundCompactions(), strategy);
+    }
+
+    private CompactionTask(ColumnFamilyStore cfs,
+                           LifecycleTransaction txn,
+                           int gcBefore,
+                           boolean keepOriginals,
+                           CompactionObserver compObserver,
+                           @Nullable AbstractCompactionStrategy strategy)
     {
         super(cfs, txn);
         this.gcBefore = gcBefore;
         this.keepOriginals = keepOriginals;
+        this.compObserver = compObserver;
+        this.strategy = strategy;
+
+        logger.debug("Created compaction task with id {} and strategy {}", txn.opId(), strategy);
+    }
+
+    /**
+     * Create a compaction task for a generic compaction strategy.
+     */
+    public static AbstractCompactionTask forCompaction(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
+    {
+        return new CompactionTask(strategy, txn, gcBefore, false);
+    }
+
+    /**
+     * Create a compaction task for {@link TimeWindowCompactionStrategy}.
+     */
+    static AbstractCompactionTask forTimeWindowCompaction(TimeWindowCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
+    {
+        return new TimeWindowCompactionTask(strategy, txn, gcBefore, strategy.ignoreOverlaps());
+    }
+
+    /**
+     * Create a compaction task without a compaction strategy, currently only called by tests.
+     */
+    static AbstractCompactionTask forTesting(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
+    {
+        return new CompactionTask(cfs, txn, gcBefore, false);
+    }
+
+    /**
+     * Create a compaction task without a compaction strategy, currently only called by tests.
+     */
+    static AbstractCompactionTask forTesting(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, CompactionObserver compObserver)
+    {
+        return new CompactionTask(cfs, txn, gcBefore, false, compObserver, null);
+    }
+
+    /**
+     * Create a compaction task for deleted data collection.
+     */
+    public static AbstractCompactionTask forGarbageCollection(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, CompactionParams.TombstoneOption tombstoneOption)
+    {
+        AbstractCompactionTask task = new CompactionTask(cfs, txn, gcBefore, false)
+        {
+            @Override
+            protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
+            {
+                return new CompactionController(cfs, toCompact, gcBefore, null, tombstoneOption);
+            }
+
+            @Override
+            protected int getLevel()
+            {
+                return txn.onlyOne().getSSTableLevel();
+            }
+        };
+        task.setUserDefined(true);
+        task.setCompactionType(OperationType.GARBAGE_COLLECT);
+        return task;
     }
 
     private static long addToTotalBytesCompacted(long bytesCompacted)
@@ -78,9 +167,8 @@ private static long addToTotalBytesCompacted(long bytesCompacted)
     }
 
     @Override
-    protected int executeInternal(ActiveCompactionsTracker activeCompactions)
+    protected int executeInternal()
     {
-        this.activeCompactions = activeCompactions == null ? ActiveCompactionsTracker.NOOP : activeCompactions;
         run();
         return transaction.originals().size();
     }
@@ -118,17 +206,64 @@ protected void runMayThrow() throws Exception
         if (transaction.originals().isEmpty())
             return;
 
-        // Note that the current compaction strategy, is not necessarily the one this task was created under.
-        // This should be harmless; see comments to CFS.maybeReloadCompactionStrategy.
-        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
-
         if (DatabaseDescriptor.isSnapshotBeforeCompaction())
             cfs.snapshotWithoutMemtable(System.currentTimeMillis() + "-compact-" + cfs.name);
 
-        try (CompactionController controller = getCompactionController(transaction.originals()))
+        try (CompactionController controller = getCompactionController(transaction.originals());
+             CompactionOperation operation = new CompactionOperation(controller))
         {
+            operation.execute();
+        }
+    }
 
-            final Set<SSTableReader> fullyExpiredSSTables = controller.getFullyExpiredSSTables();
+    /**
+     *  The compaction operation takes care of executing the actual compaction and, being {@link AutoCloseable},
+     *  of releasing any resources when the compaction is finished.
+     *  <p/>
+     *  Compaction-specific progress information is reported through an inner {@link CompactionProgress} implementation.
+     */
+    public final class CompactionOperation implements AutoCloseable
+    {
+        private final CompactionController controller;
+        private final CompactionStrategyManager strategyManager;
+        private final Set<SSTableReader> fullyExpiredSSTables;
+        private final UUID taskId;
+        private final RateLimiter limiter;
+        private final long start;
+        private final long startTime;
+        private final Set<SSTableReader> actuallyCompact;
+        private final CompactionProgress progress;
+
+        // resources that are updated and may be read by another thread
+        private volatile Collection<SSTableReader> newSStables;
+        private volatile long totalKeysWritten;
+        private volatile long estimatedKeys;
+
+        // resources that are updated but only read by this thread
+        private boolean completed;
+
+        // resources that need closing
+        private Refs<SSTableReader> sstableRefs;
+        private AbstractCompactionStrategy.ScannerList scanners;
+        private CompactionIterator compactionIterator;
+        private TableOperation op;
+        private Closeable obsCloseable;
+        private CompactionAwareWriter writer;
+
+        /**
+         * Create a new compaction operation.
+         * <p/>
+         * @param controller the compaction controller is needed by the scanners and compaction iterator to manage options
+         */
+        private CompactionOperation(CompactionController controller)
+        {
+            this.controller = controller;
+
+            // Note that the current compaction strategy is not necessarily the one this task was created under.
+            // This should be harmless; see comments to CFS.maybeReloadCompactionStrategy.
+            this.strategyManager = cfs.getCompactionStrategyManager();
+            this.fullyExpiredSSTables = controller.getFullyExpiredSSTables();
+            this.taskId = transaction.opId();
 
             // select SSTables to compact based on available disk space.
             buildCompactionCandidatesForAvailableDiskSpace(fullyExpiredSSTables);
@@ -136,8 +271,52 @@ protected void runMayThrow() throws Exception
             // sanity check: all sstables must belong to the same cfs
             assert !Iterables.any(transaction.originals(), sstable -> !sstable.descriptor.cfname.equals(cfs.name));
 
-            UUID taskId = transaction.opId();
+            this.limiter = CompactionManager.instance.getRateLimiter();
+            this.start = System.nanoTime();
+            this.startTime = System.currentTimeMillis();
+            this.actuallyCompact = Sets.difference(transaction.originals(), fullyExpiredSSTables);
+            this.progress = new Progress();
+            this.newSStables = Collections.emptyList();
+            this.totalKeysWritten = 0;
+            this.estimatedKeys = 0;
+            this.completed = false;
+
+            Directories dirs = getDirectories();
+
+            try
+            {
+                // resources that need closing must be created last, so that the catch block below can release them if the constructor throws
+                this.sstableRefs = Refs.ref(actuallyCompact);
+                this.scanners = strategyManager.getScanners(actuallyCompact);
+                this.compactionIterator = new CompactionIterator(compactionType, scanners.scanners, controller, FBUtilities.nowInSeconds(), taskId);
+                this.op = compactionIterator.getOperation();
+                this.writer = getCompactionAwareWriter(cfs, dirs, transaction, actuallyCompact);
+                this.obsCloseable = opObserver.onOperationStart(op);
+
+                compObserver.setInProgress(progress);
+            }
+            catch (Throwable t)
+            {
+                t = Throwables.close(t, obsCloseable, writer, compactionIterator, scanners, sstableRefs); // ok to close even if null
+
+                Throwables.maybeFail(t);
+            }
+        }
 
+        private void execute()
+        {
+            try
+            {
+                execute0();
+            }
+            catch (Throwable t)
+            {
+                Throwables.maybeFail(onError(t));
+            }
+        }
+
+        private void execute0()
+        {
             // new sstables from flush can be added during a compaction, but only the compaction can remove them,
             // so in our single-threaded compaction world this is a valid way of determining if we're compacting
             // all the sstables (that existed when we started)
@@ -146,116 +325,287 @@ protected void runMayThrow() throws Exception
                 debugLogCompactingMessage(taskId);
             }
 
-            RateLimiter limiter = CompactionManager.instance.getRateLimiter();
-            long start = System.nanoTime();
-            long startTime = System.currentTimeMillis();
-            long totalKeysWritten = 0;
-            long estimatedKeys = 0;
-            long inputSizeBytes;
-
-            Set<SSTableReader> actuallyCompact = Sets.difference(transaction.originals(), fullyExpiredSSTables);
-            Collection<SSTableReader> newSStables;
+            long lastCheckObsoletion = start;
+            double compressionRatio = scanners.getCompressionRatio();
+            if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
+                compressionRatio = 1.0;
 
-            long[] mergedRowsHistogram;
-            long totalSourceCQLRows;
+            long lastBytesScanned = 0;
 
-            // SSTableScanners need to be closed before markCompactedSSTablesReplaced call as scanners contain references
-            // to both ifile and dfile and SSTR will throw deletion errors on Windows if it tries to delete before
-            // scanner is closed.
-            // See CASSANDRA-8019 and CASSANDRA-8399
-            int nowInSec = FBUtilities.nowInSeconds();
+            if (!controller.cfs.getCompactionStrategyManager().isActive())
+                throw new CompactionInterruptedException(op.getProgress());
 
-            try (Refs<SSTableReader> ignored = Refs.ref(actuallyCompact);
-                 AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(actuallyCompact);
-                 CompactionIterator ci = new CompactionIterator(compactionType, scanners.scanners, controller, nowInSec, taskId))
+            estimatedKeys = writer.estimatedKeys();
+            while (compactionIterator.hasNext())
             {
-                long lastCheckObsoletion = start;
-                inputSizeBytes = scanners.getTotalCompressedSize();
-                double compressionRatio = scanners.getCompressionRatio();
-                if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
-                    compressionRatio = 1.0;
+                if (op.isStopRequested())
+                    throw new CompactionInterruptedException(op.getProgress());
 
-                long lastBytesScanned = 0;
+                UnfilteredRowIterator partition = compactionIterator.next();
+                if (writer.append(partition))
+                    totalKeysWritten++;
 
-                activeCompactions.beginCompaction(ci);
-                Directories dirs = getDirectories();
-                try (CompactionAwareWriter writer = getCompactionAwareWriter(cfs, dirs, transaction, actuallyCompact))
-                {
-                    // Note that we need to re-check this flag after calling beginCompaction above to avoid a window
-                    // where the compaction does not exist in activeCompactions but the CSM gets paused.
-                    // We already have the sstables marked compacting here so CompactionManager#waitForCessation will
-                    // block until the below exception is thrown and the transaction is cancelled.
-                    if (!controller.cfs.getCompactionStrategyManager().isActive())
-                        throw new CompactionInterruptedException(ci.getCompactionInfo());
-                    estimatedKeys = writer.estimatedKeys();
-                    while (ci.hasNext())
-                    {
-                        UnfilteredRowIterator partition = ci.next();
-                        if (writer.append(partition))
-                            totalKeysWritten++;
-
-
-                        long bytesScanned = scanners.getTotalBytesScanned();
-
-                        // Rate limit the scanners, and account for compression
-                        if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
-                            lastBytesScanned = bytesScanned;
-
-                        if (System.nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
-                        {
-                            controller.maybeRefreshOverlaps();
-                            lastCheckObsoletion = System.nanoTime();
-                        }
-                    }
-
-                    // point of no return
-                    newSStables = writer.finish();
-                }
-                finally
+                long bytesScanned = scanners.getTotalBytesScanned();
+
+                // Rate limit the scanners, and account for compression
+                if (compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio))
+                    lastBytesScanned = bytesScanned;
+
+                long now = System.nanoTime();
+                if (now - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L))
                 {
-                    activeCompactions.finishCompaction(ci);
-                    mergedRowsHistogram = ci.getMergedRowsHistogram();
-                    totalSourceCQLRows = ci.getTotalSourceCQLRows();
+                    controller.maybeRefreshOverlaps();
+                    lastCheckObsoletion = now;
                 }
             }
 
-            if (transaction.isOffline())
+            // point of no return
+            newSStables = writer.finish();
+
+
+            completed = true;
+        }
+
+        private Throwable onError(Throwable e)
+        {
+            if (e instanceof AssertionError)
             {
-                Refs.release(Refs.selfRefs(newSStables));
+                // Add additional information to help operators.
+                AssertionError error = new AssertionError(
+                String.format("Illegal input has been generated, most probably due to corruption in the input sstables\n" +
+                              "\t%s\n" +
+                              "Try scrubbing the sstables by running\n" +
+                              "\tnodetool scrub %s %s\n",
+                              transaction.originals(),
+                              cfs.keyspace.getName(),
+                              cfs.getTableName()));
+                error.addSuppressed(e);
+                return error;
             }
-            else
-            {
-                // log a bunch of statistics about the result and save to system table compaction_history
 
-                long endsize = SSTableReader.getTotalBytes(newSStables);
+            return e;
+        }
+
+        //
+        // AutoCloseable
+        //
 
-                updateCompactionHistory(taskId,
-                                        cfs.keyspace.getName(),
-                                        cfs.getTableName(),
-                                        mergedRowsHistogram,
-                                        inputSizeBytes,
-                                        endsize);
+        @Override
+        public void close()
+        {
+            Throwable err = Throwables.close((Throwable) null, obsCloseable, writer, compactionIterator, scanners, sstableRefs);
+
+            if (transaction.isOffline())
+            {
+                Refs.release(Refs.selfRefs(newSStables)); // this is harmless in case of exception, newSStables will be empty
+            }
+            else if (completed)
+            {
+                // This code used to execute only if the compaction was successful so we preserve the existing behavior
+                updateCompactionHistory(taskId, cfs.keyspace.getName(), cfs.getTableName(), progress);
 
                 if (logger.isDebugEnabled())
                 {
-                    debugLogCompactionSummaryInfo(taskId,
-                                                  start,
-                                                  totalKeysWritten,
-                                                  inputSizeBytes,
-                                                  newSStables,
-                                                  mergedRowsHistogram,
-                                                  (int) totalSourceCQLRows,
-                                                  endsize);
+                    debugLogCompactionSummaryInfo(taskId, start, totalKeysWritten, newSStables, progress);
                 }
                 if (logger.isTraceEnabled())
                 {
-                    traceLogCompactionSummaryInfo(totalKeysWritten, estimatedKeys, endsize);
+                    traceLogCompactionSummaryInfo(totalKeysWritten, estimatedKeys, progress);
                 }
-                cfs.getCompactionStrategyManager().compactionLogger.
-                        compaction(startTime, transaction.originals(), System.currentTimeMillis(), newSStables);
+                cfs.getCompactionLogger().compaction(startTime, transaction.originals(),  System.currentTimeMillis(), newSStables);
 
                 // update the metrics
-                cfs.metric.compactionBytesWritten.inc(endsize);
+                cfs.metric.compactionBytesWritten.inc(progress.outputDiskSize());
+            }
+
+            Throwables.maybeFail(err);
+        }
+
+        //
+        // CompactionProgress
+        //
+
+        private final class Progress implements CompactionProgress
+        {
+            //
+            // TableOperation.Progress methods
+            //
+
+            @Override
+            public Optional<String> keyspace()
+            {
+                return Optional.of(metadata().keyspace);
+            }
+
+            @Override
+            public Optional<String> table()
+            {
+                return Optional.of(metadata().name);
+            }
+
+            @Override
+            public TableMetadata metadata()
+            {
+                return cfs.metadata();
+            }
+
+            /**
+             * @return the number of bytes read by the compaction iterator. For compressed or encrypted sstables,
+             *         this is the number of bytes processed by the iterator after decompression, so this is the current
+             *         position in the uncompressed sstable files.
+             */
+            @Override
+            public long completed()
+            {
+                return compactionIterator.bytesRead();
+            }
+
+            /**
+             * @return the initial number of bytes for input sstables. For compressed or encrypted sstables,
+             *         this is the number of bytes after decompression, so this is the uncompressed length of sstable files.
+             */
+            public long total()
+            {
+                return compactionIterator.totalBytes();
+            }
+
+            @Override
+            public OperationType operationType()
+            {
+                return compactionType;
+            }
+
+            @Override
+            public UUID operationId()
+            {
+                return taskId;
+            }
+
+            @Override
+            public TableOperation.Unit unit()
+            {
+                return TableOperation.Unit.BYTES;
+            }
+
+            @Override
+            public Set<SSTableReader> sstables()
+            {
+                return transaction.originals();
+            }
+
+            //
+            // CompactionProgress
+            //
+
+            @Override
+            @Nullable
+            public AbstractCompactionStrategy strategy()
+            {
+                return strategy;
+            }
+
+            @Override
+            public boolean isStopRequested()
+            {
+                return op.isStopRequested();
+            }
+
+            @Override
+            public Collection<SSTableReader> inSSTables()
+            {
+                // TODO should we use transaction.originals() and include the expired sstables?
+                // This would be more correct but all the metrics we get from CompactionIterator will not be compatible
+                return actuallyCompact;
+            }
+
+            @Override
+            public Collection<SSTableReader> outSSTables()
+            {
+                return newSStables;
+            }
+
+            @Override
+            public long inputDiskSize()
+            {
+                return SSTableReader.getTotalBytes(actuallyCompact);
+            }
+
+            @Override
+            public long inputUncompressedSize()
+            {
+                return compactionIterator.totalBytes();
+            }
+
+            @Override
+            public long adjustedInputDiskSize()
+            {
+                return scanners.getTotalCompressedSize();
+            }
+
+            @Override
+            public long outputDiskSize()
+            {
+                return SSTableReader.getTotalBytes(newSStables);
+            }
+
+            @Override
+            public long uncompressedBytesRead()
+            {
+                return compactionIterator.bytesRead();
+            }
+
+            @Override
+            public long uncompressedBytesRead(int level)
+            {
+                return compactionIterator.bytesRead(level);
+            }
+
+            @Override
+            public long uncompressedBytesWritten()
+            {
+                return writer.bytesWritten();
+            }
+
+            @Override
+            public long durationInNanos()
+            {
+                return System.nanoTime() - start;
+            }
+
+            @Override
+            public long partitionsRead()
+            {
+                return compactionIterator.totalSourcePartitions();
+            }
+
+            @Override
+            public long rowsRead()
+            {
+                return compactionIterator.totalSourceRows();
+            }
+
+            @Override
+            public long[] partitionsHistogram()
+            {
+                return compactionIterator.mergedPartitionsHistogram();
+            }
+
+            @Override
+            public long[] rowsHistogram()
+            {
+                return compactionIterator.mergedRowsHistogram();
+            }
+
+            @Override
+            public double sizeRatio()
+            {
+                long estInputSizeBytes = adjustedInputDiskSize();
+                if (estInputSizeBytes > 0)
+                    return outputDiskSize() / (double) estInputSizeBytes;
+
+                // this is a valid case, when there are no sstables to actually compact
+                // the previous code would return a NaN that would be logged as zero
+                return 0;
             }
         }
     }
@@ -405,27 +755,24 @@ public static long getMaxDataAge(Collection<SSTableReader> sstables)
     private void debugLogCompactionSummaryInfo(UUID taskId,
                                                long start,
                                                long totalKeysWritten,
-                                               long inputSizeBytes,
                                                Collection<SSTableReader> newSStables,
-                                               long[] mergedRowsHistogram,
-                                               int totalSourceCQLRows,
-                                               long outputSizeBytes)
+                                               CompactionProgress progress)
     {
         // log a bunch of statistics about the result and save to system table compaction_history
         long durationInNano = System.nanoTime() - start;
         long dTime = TimeUnit.NANOSECONDS.toMillis(durationInNano);
-        double ratio = (double) outputSizeBytes / (double) inputSizeBytes;
 
-        long totalSourceRows = 0;
-        StringBuilder mergeSummary = new StringBuilder(mergedRowsHistogram.length * 10);
+        long totalMergedPartitions = 0;
+        long[] mergedPartitionCounts = progress.partitionsHistogram();
+        StringBuilder mergeSummary = new StringBuilder(mergedPartitionCounts.length * 10);
         mergeSummary.append('{');
-        for (int i = 0; i < mergedRowsHistogram.length; i++)
+        for (int i = 0; i < mergedPartitionCounts.length; i++)
         {
-            long mergedRowCount = mergedRowsHistogram[i];
-            if (mergedRowCount != 0)
+            long mergedPartitionCount = mergedPartitionCounts[i];
+            if (mergedPartitionCount != 0)
             {
-                totalSourceRows += mergedRowCount * (i + 1);
-                mergeSummary.append(i).append(':').append(mergedRowCount).append(", ");
+                totalMergedPartitions += mergedPartitionCount * (i + 1);
+                mergeSummary.append(i).append(':').append(mergedPartitionCount).append(", ");
             }
         }
         mergeSummary.append('}');
@@ -433,23 +780,22 @@ private void debugLogCompactionSummaryInfo(UUID taskId,
         StringBuilder newSSTableNames = new StringBuilder(newSStables.size() * 100);
         for (SSTableReader reader : newSStables)
             newSSTableNames.append(reader.descriptor.baseFilename()).append(",");
-        logger.debug("Compacted ({}) {} sstables to [{}] to level={}." +
-                     " {} to {} (~{}% of original) in {}ms." +
-                     " Read Throughput = {}, Write Throughput = {}, Row Throughput = ~{}/s." +
-                     " {} total partitions merged to {}." +
-                     " Partition merge counts were {}",
+        logger.debug("Compacted ({}) {} sstables to [{}] to level={}. {} to {} (~{}% of original) in {}ms. " +
+                     "Read Throughput = {}, Write Throughput = {}, Row Throughput = ~{}/s, Partition Throughput = ~{}/s." +
+                     " {} total partitions merged to {}. Partition merge counts were {}.",
                      taskId,
                      transaction.originals().size(),
                      newSSTableNames.toString(),
                      getLevel(),
-                     prettyPrintMemory(inputSizeBytes),
-                     prettyPrintMemory(outputSizeBytes),
-                     (int) (ratio * 100),
+                     prettyPrintMemory(progress.adjustedInputDiskSize()),
+                     prettyPrintMemory(progress.outputDiskSize()),
+                     (int) (progress.sizeRatio() * 100),
                      dTime,
-                     prettyPrintMemoryPerSecond(inputSizeBytes, durationInNano),
-                     prettyPrintMemoryPerSecond(outputSizeBytes, durationInNano),
-                     totalSourceCQLRows / (TimeUnit.NANOSECONDS.toSeconds(durationInNano) + 1),
-                     totalSourceRows,
+                     prettyPrintMemoryPerSecond(progress.adjustedInputDiskSize(), durationInNano),
+                     prettyPrintMemoryPerSecond(progress.outputDiskSize(), durationInNano),
+                     progress.rowsRead() / (TimeUnit.NANOSECONDS.toSeconds(durationInNano) + 1),
+                     progress.partitionsRead() / (TimeUnit.NANOSECONDS.toSeconds(durationInNano) + 1),
+                     totalMergedPartitions,
                      totalKeysWritten,
                      mergeSummary.toString());
     }
@@ -475,32 +821,33 @@ private void debugLogCompactingMessage(UUID taskId)
     private static void updateCompactionHistory(UUID id,
                                                 String keyspaceName,
                                                 String columnFamilyName,
-                                                long[] mergedRowsHistogram,
-                                                long startSize,
-                                                long endSize)
+                                                CompactionProgress progress)
     {
-        Map<Integer, Long> mergedRows = new HashMap<>(mergedRowsHistogram.length);
-        for (int i = 0; i < mergedRowsHistogram.length; i++)
+        long[] mergedPartitionsHistogram = progress.partitionsHistogram();
+        Map<Integer, Long> mergedPartitions = new HashMap<>(mergedPartitionsHistogram.length);
+        for (int i = 0; i < mergedPartitionsHistogram.length; i++)
         {
-            long count = mergedRowsHistogram[i];
+            long count = mergedPartitionsHistogram[i];
             if (count == 0)
                 continue;
 
             int rows = i + 1;
-            mergedRows.put(rows, count);
+            mergedPartitions.put(rows, count);
         }
         SystemKeyspace.updateCompactionHistory(id,
                                                keyspaceName,
                                                columnFamilyName,
                                                System.currentTimeMillis(),
-                                               startSize,
-                                               endSize,
-                                               mergedRows);
+                                               progress.adjustedInputDiskSize(),
+                                               progress.outputDiskSize(),
+                                               mergedPartitions);
     }
 
-    private void traceLogCompactionSummaryInfo(long totalKeysWritten, long estimatedKeys, long endsize)
+    private void traceLogCompactionSummaryInfo(long totalKeysWritten,
+                                               long estimatedKeys,
+                                               CompactionProgress progress)
     {
-        logger.trace("CF Total Bytes Compacted: {}", prettyPrintMemory(addToTotalBytesCompacted(endsize)));
+        logger.trace("CF Total Bytes Compacted: {}", prettyPrintMemory(addToTotalBytesCompacted(progress.outputDiskSize())));
         logger.trace("Actual #keys: {}, Estimated #keys:{}, Err%: {}",
                      totalKeysWritten,
                      estimatedKeys,
diff --git a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
index ab2b6ae32852..5dc4b54c02ce 100644
--- a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
@@ -43,7 +43,7 @@
  * @deprecated in favour of {@link TimeWindowCompactionStrategy}
  */
 @Deprecated
-public class DateTieredCompactionStrategy extends AbstractCompactionStrategy
+public class DateTieredCompactionStrategy extends AbstractCompactionStrategy.WithSSTableList
 {
     private static final Logger logger = LoggerFactory.getLogger(DateTieredCompactionStrategy.class);
 
@@ -69,49 +69,20 @@ public DateTieredCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> o
         this.stcsOptions = new SizeTieredCompactionStrategyOptions(options);
     }
 
-    @Override
-    @SuppressWarnings("resource")
-    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
-    {
-        List<SSTableReader> previousCandidate = null;
-        while (true)
-        {
-            List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
-
-            if (latestBucket.isEmpty())
-                return null;
-
-            // Already tried acquiring references without success. It means there is a race with
-            // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-            if (latestBucket.equals(previousCandidate))
-            {
-                logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                            "unless it happens frequently, in which case it must be reported. Will retry later.",
-                            latestBucket);
-                return null;
-            }
-
-            LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
-            if (modifier != null)
-                return new CompactionTask(cfs, modifier, gcBefore);
-            previousCandidate = latestBucket;
-        }
-    }
-
     /**
      *
      * @param gcBefore
      * @return
      */
-    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    protected synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
     {
         Set<SSTableReader> uncompacting;
         synchronized (sstables)
         {
             if (sstables.isEmpty())
-                return Collections.emptyList();
+                return ImmutableList.of();
 
-            uncompacting = ImmutableSet.copyOf(filter(cfs.getUncompactingSSTables(), sstables::contains));
+            uncompacting = ImmutableSet.copyOf(cfs.getNoncompactingSSTables(sstables));
         }
 
         Set<SSTableReader> expired = Collections.emptySet();
@@ -223,22 +194,47 @@ public static List<Pair<SSTableReader, Long>> createSSTableAndMinTimestampPairs(
         return sstableMinTimestampPairs;
     }
 
+    public void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added)
+    {
+        synchronized (sstables)
+        {
+            for (SSTableReader remove : removed)
+                removeSSTable(remove);
+            addSSTables(added);
+        }
+    }
+
     @Override
-    public synchronized void addSSTable(SSTableReader sstable)
+    public void addSSTable(SSTableReader sstable)
     {
-        sstables.add(sstable);
+        synchronized (sstables)
+        {
+            sstables.add(sstable);
+        }
     }
 
     @Override
-    public synchronized void removeSSTable(SSTableReader sstable)
+    public void removeSSTable(SSTableReader sstable)
     {
-        sstables.remove(sstable);
+        synchronized (sstables)
+        {
+            sstables.remove(sstable);
+        }
+    }
+
+    @Override
+    void removeDeadSSTables()
+    {
+        removeDeadSSTables(sstables);
     }
 
     @Override
     protected Set<SSTableReader> getSSTables()
     {
-        return ImmutableSet.copyOf(sstables);
+        synchronized (sstables)
+        {
+            return ImmutableSet.copyOf(sstables);
+        }
     }
 
     /**
@@ -365,7 +361,7 @@ private void updateEstimatedCompactionsByTasks(List<List<SSTableReader>> tasks)
         int n = 0;
         for (List<SSTableReader> bucket : tasks)
         {
-            for (List<SSTableReader> stcsBucket : getSTCSBuckets(bucket, stcsOptions))
+            for (List<SSTableReader> stcsBucket : getSTCSBuckets(bucket, stcsOptions, cfs.getMinimumCompactionThreshold(), cfs.getMaximumCompactionThreshold()))
                 if (stcsBucket.size() >= cfs.getMinimumCompactionThreshold())
                     n += Math.ceil((double)stcsBucket.size() / cfs.getMaximumCompactionThreshold());
         }
@@ -392,7 +388,7 @@ static List<SSTableReader> newestBucket(List<List<SSTableReader>> buckets, int m
             boolean inFirstWindow = incomingWindow.onTarget(bucket.get(0).getMinTimestamp());
             if (bucket.size() >= minThreshold || (bucket.size() >= 2 && !inFirstWindow))
             {
-                List<SSTableReader> stcsSSTables = getSSTablesForSTCS(bucket, inFirstWindow ? minThreshold : 2, maxThreshold, stcsOptions);
+                List<SSTableReader> stcsSSTables = getSSTablesForSTCS(bucket, stcsOptions, inFirstWindow ? minThreshold : 2, maxThreshold);
                 if (!stcsSSTables.isEmpty())
                     return stcsSSTables;
             }
@@ -400,49 +396,26 @@ static List<SSTableReader> newestBucket(List<List<SSTableReader>> buckets, int m
         return Collections.emptyList();
     }
 
-    private static List<SSTableReader> getSSTablesForSTCS(Collection<SSTableReader> sstables, int minThreshold, int maxThreshold, SizeTieredCompactionStrategyOptions stcsOptions)
+    private static List<SSTableReader> getSSTablesForSTCS(Collection<SSTableReader> sstables, SizeTieredCompactionStrategyOptions stcsOptions, int minThreshold, int maxThreshold)
     {
-        List<SSTableReader> s = SizeTieredCompactionStrategy.mostInterestingBucket(getSTCSBuckets(sstables, stcsOptions), minThreshold, maxThreshold);
+        SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables,
+                                                                                                                              stcsOptions,
+                                                                                                                              minThreshold,
+                                                                                                                              maxThreshold);
+
+        sizeTieredBuckets.aggregate();
+        List<SSTableReader> s = new ArrayList<>(CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates()).sstables);
         logger.debug("Got sstables {} for STCS from {}", s, sstables);
         return s;
     }
 
-    private static List<List<SSTableReader>> getSTCSBuckets(Collection<SSTableReader> sstables, SizeTieredCompactionStrategyOptions stcsOptions)
+    private static List<List<SSTableReader>> getSTCSBuckets(Collection<SSTableReader> sstables, SizeTieredCompactionStrategyOptions stcsOptions, int minThreshold, int maxThreshold)
     {
-        List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(AbstractCompactionStrategy.filterSuspectSSTables(sstables));
-        return SizeTieredCompactionStrategy.getBuckets(pairs,
-                                                       stcsOptions.bucketHigh,
-                                                       stcsOptions.bucketLow,
-                                                       stcsOptions.minSSTableSize);
-    }
-
-    @Override
-    @SuppressWarnings("resource")
-    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
-    {
-        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
-        if (Iterables.isEmpty(filteredSSTables))
-            return null;
-        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
-        if (txn == null)
-            return null;
-        return Collections.<AbstractCompactionTask>singleton(new CompactionTask(cfs, txn, gcBefore));
-    }
-
-    @Override
-    @SuppressWarnings("resource")
-    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
-    {
-        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
-
-        LifecycleTransaction modifier = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
-        if (modifier == null)
-        {
-            logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
-            return null;
-        }
-
-        return new CompactionTask(cfs, modifier, gcBefore).setUserDefined(true);
+        SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables,
+                                                                                                                              stcsOptions,
+                                                                                                                              minThreshold,
+                                                                                                                              maxThreshold);
+        return sizeTieredBuckets.buckets();
     }
 
     public int getEstimatedRemainingTasks()
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
new file mode 100644
index 000000000000..6f6cdad0e3eb
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+/**
+ * The statistics for levelled compaction.
+ * <p>
+ * Implements serializable to allow structured info to be returned via JMX.
+ */
+public class LeveledCompactionStatistics extends CompactionAggregateStatistics
+{
+    private static final Collection<String> HEADER = ImmutableList.copyOf(Iterables.concat(ImmutableList.of("Level", "Score"),
+                                                                                           CompactionAggregateStatistics.HEADER,
+                                                                                           ImmutableList.of("Tot/Read/Written",
+                                                                                                                            "Read: Tot/Prev/Next",
+                                                                                                                            "Written: Tot/New",
+                                                                                                                            "WA (tot_written/read_prev)")));
+
+    private static final long serialVersionUID = 3695927592357744816L;
+
+    /** The current level */
+    private final int level;
+
+    /** The score of this level */
+    private final double score;
+
+    /** Total bytes of the sstables selected for compaction */
+    private final long tot;
+
+    /** Total bytes read during compaction between levels N and N+1. This includes bytes read from this level (N) and from the next level (N+1) */
+    private final long totRead;
+
+    /** Bytes read from the current level (N) during compaction between levels N and N+1 */
+    private final long readLevel;
+
+    /** Total bytes written during compaction between levels N and N+1 */
+    private final long totWritten;
+
+    /*
+     * Additional RocksDB metrics we may want to consider:
+     * Moved(GB): Bytes moved to level N+1 during compaction. In this case there is no IO other than updating the manifest to indicate that a file which used to be in level X is now in level Y
+     * Rd(MB/s): The rate at which data is read during compaction between levels N and N+1. This is (Read(GB) * 1024) / duration where duration is the time for which compactions are in progress from level N to N+1.
+     * Wr(MB/s): The rate at which data is written during compaction. See Rd(MB/s).
+     * Rn(cnt): Total files read from level N during compaction between levels N and N+1
+     * Rnp1(cnt): Total files read from level N+1 during compaction between levels N and N+1
+     * Wnp1(cnt): Total files written to level N+1 during compaction between levels N and N+1
+     * Wnew(cnt): (Wnp1(cnt) - Rnp1(cnt)) -- Increase in file count as result of compaction between levels N and N+1
+     * Comp(sec): Total time spent doing compactions between levels N and N+1
+     * Comp(cnt): Total number of compactions between levels N and N+1
+     * Avg(sec): Average time per compaction between levels N and N+1
+     * Stall(sec): Total time writes were stalled because level N+1 was uncompacted (compaction score was high)
+     * Stall(cnt): Total number of writes stalled because level N+1 was uncompacted
+     * Avg(ms): Average time in milliseconds a write was stalled because level N+1 was uncompacted
+     * KeyIn: number of records compared during compaction
+     * KeyDrop: number of records dropped (not written out) during compaction
+     */
+
+    public LeveledCompactionStatistics(int level,
+                                       double score,
+                                       int numCompactions,
+                                       int numCompactionsInProgress,
+                                       int numSSTables,
+                                       int numCandidateSSTables,
+                                       int numCompactingSSTables,
+                                       long sizeInBytes,
+                                       double readThroughput,
+                                       double writeThroughput,
+                                       long tot,
+                                       long totRead,
+                                       long readLevel,
+                                       long totWritten)
+    {
+        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput);
+
+        this.level = level;
+        this.score = score;
+        this.tot = tot;
+        this.totRead = totRead;
+        this.readLevel = readLevel;
+        this.totWritten = totWritten;
+    }
+
+    /** The current level */
+    @JsonProperty
+    public int level()
+    {
+        return level;
+    }
+
+    /** The score of a level is the level size in bytes of all its files divided by the ideal
+     * level size if applicable, or zero for tiered strategies */
+    @JsonProperty
+    public double score()
+    {
+        return score;
+    }
+
+    /** Total bytes of the sstables selected for compaction */
+    @JsonProperty
+    public long tot()
+    {
+        return tot;
+    }
+
+    /** Total uncompressed bytes read during compaction between this level and the next. This includes bytes read from this level (N) and from the next level (N+1) */
+    @JsonProperty
+    public long read()
+    {
+        return totRead;
+    }
+
+    /** Uncompressed bytes read from the current level (N) during compaction between levels N and N+1 */
+    @JsonProperty
+    public long readLevel()
+    {
+        return readLevel;
+    }
+
+    /** Uncompressed bytes read from the next level (N+1) during compaction between levels N and N+1 */
+    @JsonProperty
+    public long readNext()
+    {
+        return totRead - readLevel;
+    }
+
+    /** Uncompressed bytes written during compaction between levels N and N+1 */
+    @JsonProperty
+    public long written()
+    {
+        return totWritten;
+    }
+
+    /** Uncompressed bytes written to level N+1, calculated as total bytes written - bytes read from N+1 */
+    @JsonProperty
+    public long writtenNew()
+    {
+        return totWritten - readNext();
+    }
+
+    /** W-Amp: total bytes written divided by the bytes read from level N. */
+    @JsonProperty
+    public double writeAmpl()
+    {
+        return readLevel() > 0 ? (double)totWritten / readLevel() : Double.NaN;
+    }
+
+    @Override
+    protected Collection<String> header()
+    {
+        return HEADER;
+    }
+
+    @Override
+    protected Collection<String> data()
+    {
+        List<String> data = new ArrayList<>(HEADER.size());
+        data.add(Integer.toString(level()));
+        data.add(String.format("%.3f", score()));
+
+        data.addAll(super.data());
+
+        data.add(toString(tot()) + '/' + toString(read()) + '/' + toString(written()));
+        data.add(toString(read()) + '/' + toString(readLevel()) + '/' + toString(readNext()));
+        data.add(toString(written()) + '/' + toString(writtenNew()));
+        data.add(String.format("%.3f", writeAmpl()));
+        return data;
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
index dd7c9dfcff79..13765fc7f200 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.*;
+import java.util.function.Function;
 
 
 import com.google.common.annotations.VisibleForTesting;
@@ -43,12 +44,12 @@
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
-public class LeveledCompactionStrategy extends AbstractCompactionStrategy
+public class LeveledCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(LeveledCompactionStrategy.class);
-    private static final String SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
+    static final String SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
     private static final boolean tolerateSstableSize = Boolean.getBoolean(Config.PROPERTY_PREFIX + "tolerate_sstable_size");
-    private static final String LEVEL_FANOUT_SIZE_OPTION = "fanout_size";
+    static final String LEVEL_FANOUT_SIZE_OPTION = "fanout_size";
     private static final String SINGLE_SSTABLE_UPLEVEL_OPTION = "single_sstable_uplevel";
     public static final int DEFAULT_LEVEL_FANOUT_SIZE = 10;
 
@@ -116,98 +117,62 @@ public void startup()
         super.startup();
     }
 
-    /**
-     * the only difference between background and maximal in LCS is that maximal is still allowed
-     * (by explicit user request) even when compaction is disabled.
-     */
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+    @Override
+    protected CompactionAggregate getNextBackgroundAggregate(int gcBefore)
     {
-        Collection<SSTableReader> previousCandidate = null;
-        while (true)
-        {
-            OperationType op;
-            LeveledManifest.CompactionCandidate candidate = manifest.getCompactionCandidates();
-            if (candidate == null)
-            {
-                // if there is no sstable to compact in standard way, try compacting based on droppable tombstone ratio
-                SSTableReader sstable = findDroppableSSTable(gcBefore);
-                if (sstable == null)
-                {
-                    logger.trace("No compaction necessary for {}", this);
-                    return null;
-                }
-                candidate = new LeveledManifest.CompactionCandidate(Collections.singleton(sstable),
-                                                                    sstable.getSSTableLevel(),
-                                                                    getMaxSSTableBytes());
-                op = OperationType.TOMBSTONE_COMPACTION;
-            }
-            else
-            {
-                op = OperationType.COMPACTION;
-            }
-
-            // Already tried acquiring references without success. It means there is a race with
-            // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-            if (candidate.sstables.equals(previousCandidate))
-            {
-                logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                            "unless it happens frequently, in which case it must be reported. Will retry later.",
-                            candidate.sstables);
-                return null;
-            }
+        CompactionAggregate.Leveled candidate = manifest.getCompactionCandidate();
+        backgroundCompactions.setPending(manifest.getEstimatedTasks(candidate));
 
-            LifecycleTransaction txn = cfs.getTracker().tryModify(candidate.sstables, OperationType.COMPACTION);
-            if (txn != null)
-            {
-                AbstractCompactionTask newTask;
-                if (!singleSSTableUplevel || op == OperationType.TOMBSTONE_COMPACTION || txn.originals().size() > 1)
-                    newTask = new LeveledCompactionTask(cfs, txn, candidate.level, gcBefore, candidate.maxSSTableBytes, false);
-                else
-                    newTask = new SingleSSTableLCSTask(cfs, txn, candidate.level);
+        if (candidate != null)
+            return candidate;
 
-                newTask.setCompactionType(op);
-                return newTask;
-            }
-            previousCandidate = candidate.sstables;
-        }
+        return findDroppableSSTable(gcBefore);
     }
 
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
+    @Override
+    protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, CompactionAggregate compaction)
     {
-        Iterable<SSTableReader> sstables = manifest.getSSTables();
+        long maxxSSTableBytes;
+        int nextLevel;
+        OperationType op;
+
+        if (compaction instanceof CompactionAggregate.TombstoneAggregate)
+        {
+            op = OperationType.TOMBSTONE_COMPACTION;
+            nextLevel = Iterables.getOnlyElement(compaction.selected.sstables).getSSTableLevel();
+            maxxSSTableBytes = getMaxSSTableBytes();    // TODO: verify this is expected as it can split L0 tables
+        }
+        else
+        {
+            CompactionAggregate.Leveled candidate = (CompactionAggregate.Leveled) compaction;
+            op = OperationType.COMPACTION;
+            nextLevel = candidate.nextLevel;
+            maxxSSTableBytes = candidate.maxSSTableBytes;
+        }
+
 
-        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
-        if (Iterables.isEmpty(sstables))
-            return null;
-        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
-        if (txn == null)
-            return null;
-        return Arrays.<AbstractCompactionTask>asList(new LeveledCompactionTask(cfs, txn, 0, gcBefore, getMaxSSTableBytes(), true));
+        AbstractCompactionTask newTask;
+        if (!singleSSTableUplevel || op == OperationType.TOMBSTONE_COMPACTION || txn.originals().size() > 1)
+            newTask = LeveledCompactionTask.forCompaction(this, txn, nextLevel, gcBefore, maxxSSTableBytes, false);
+        else
+            newTask = SingleSSTableLCSTask.forCompaction(this, txn, nextLevel);
 
+        newTask.setCompactionType(op);
+        return newTask;
     }
 
+
     @Override
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
+    protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
-
-        if (sstables.isEmpty())
-            return null;
-
-        LifecycleTransaction transaction = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
-        if (transaction == null)
-        {
-            logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
-            return null;
-        }
+        Collection<SSTableReader> sstables = txn.originals();
         int level = sstables.size() > 1 ? 0 : sstables.iterator().next().getSSTableLevel();
-        return new LeveledCompactionTask(cfs, transaction, level, gcBefore, level == 0 ? Long.MAX_VALUE : getMaxSSTableBytes(), false);
+        long maxSSTableBytes = (level == 0 && !isMaximal) ? Long.MAX_VALUE : getMaxSSTableBytes();
+        return LeveledCompactionTask.forCompaction(this, txn, level, gcBefore, maxSSTableBytes, isMaximal);
     }
 
     @Override
-    public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
+    public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
     {
         assert txn.originals().size() > 0;
         int level = -1;
@@ -219,7 +184,7 @@ public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gc
             if (level != sstable.getSSTableLevel())
                 level = 0;
         }
-        return new LeveledCompactionTask(cfs, txn, level, gcBefore, maxSSTableBytes, false);
+        return LeveledCompactionTask.forCompaction(this, txn, level, gcBefore, maxSSTableBytes, false);
     }
 
     /**
@@ -268,13 +233,6 @@ public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Coll
 
     }
 
-    public int getEstimatedRemainingTasks()
-    {
-        int n = manifest.getEstimatedTasks();
-        cfs.getCompactionStrategyManager().compactionLogger.pending(this, n);
-        return n;
-    }
-
     public long getMaxSSTableBytes()
     {
         return maxSSTableSizeInMB * 1024L * 1024L;
@@ -326,7 +284,7 @@ public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Ra
                     if (!intersecting.isEmpty())
                     {
                         @SuppressWarnings("resource") // The ScannerList will be in charge of closing (and we close properly on errors)
-                        ISSTableScanner scanner = new LeveledScanner(cfs.metadata(), intersecting, ranges);
+                        ISSTableScanner scanner = new LeveledScanner(cfs.metadata(), intersecting, ranges, level);
                         scanners.add(scanner);
                     }
                 }
@@ -359,6 +317,12 @@ public void addSSTables(Iterable<SSTableReader> sstables)
         manifest.addSSTables(sstables);
     }
 
+    @Override
+    void removeDeadSSTables()
+    {
+        manifest.removeDeadSSTables();
+    }
+
     @Override
     public void addSSTable(SSTableReader added)
     {
@@ -384,6 +348,7 @@ private static class LeveledScanner extends AbstractIterator<UnfilteredRowIterat
         private final TableMetadata metadata;
         private final Collection<Range<Token>> ranges;
         private final List<SSTableReader> sstables;
+        private final int level;
         private final Iterator<SSTableReader> sstableIterator;
         private final long totalLength;
         private final long compressedLength;
@@ -392,13 +357,14 @@ private static class LeveledScanner extends AbstractIterator<UnfilteredRowIterat
         private long positionOffset;
         private long totalBytesScanned = 0;
 
-        public LeveledScanner(TableMetadata metadata, Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
+        public LeveledScanner(TableMetadata metadata, Collection<SSTableReader> sstables, Collection<Range<Token>> ranges, int level)
         {
             this.metadata = metadata;
             this.ranges = ranges;
 
             // add only sstables that intersect our range, and estimate how much data that involves
             this.sstables = new ArrayList<>(sstables.size());
+            this.level = level;
             long length = 0;
             long cLength = 0;
             for (SSTableReader sstable : sstables)
@@ -416,7 +382,7 @@ public LeveledScanner(TableMetadata metadata, Collection<SSTableReader> sstables
 
             totalLength = length;
             compressedLength = cLength;
-            Collections.sort(this.sstables, SSTableReader.sstableComparator);
+            Collections.sort(this.sstables, SSTableReader.firstKeyComparator);
             sstableIterator = this.sstables.iterator();
             assert sstableIterator.hasNext(); // caller should check intersecting first
             SSTableReader currentSSTable = sstableIterator.next();
@@ -502,6 +468,11 @@ public Set<SSTableReader> getBackingSSTables()
         {
             return ImmutableSet.copyOf(sstables);
         }
+
+        public int level()
+        {
+            return level;
+        }
     }
 
     @Override
@@ -510,28 +481,23 @@ public String toString()
         return String.format("LCS@%d(%s)", hashCode(), cfs.name);
     }
 
-    private SSTableReader findDroppableSSTable(final int gcBefore)
+    private CompactionAggregate findDroppableSSTable(final int gcBefore)
     {
-        level:
+        Comparator<SSTableReader> comparator = (o1, o2) -> {
+            double r1 = o1.getEstimatedDroppableTombstoneRatio(gcBefore);
+            double r2 = o2.getEstimatedDroppableTombstoneRatio(gcBefore);
+            return -1 * Doubles.compare(r1, r2);
+        };
+        Function<Collection<SSTableReader>, SSTableReader> selector = list -> Collections.max(list, comparator);
+        Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+
         for (int i = manifest.getLevelCount(); i >= 0; i--)
         {
-            if (manifest.getLevelSize(i) == 0)
-                continue;
-            // sort sstables by droppable ratio in descending order
-            List<SSTableReader> tombstoneSortedSSTables = manifest.getLevelSorted(i, (o1, o2) -> {
-                double r1 = o1.getEstimatedDroppableTombstoneRatio(gcBefore);
-                double r2 = o2.getEstimatedDroppableTombstoneRatio(gcBefore);
-                return -1 * Doubles.compare(r1, r2);
-            });
-
-            Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
-            for (SSTableReader sstable : tombstoneSortedSSTables)
-            {
-                if (sstable.getEstimatedDroppableTombstoneRatio(gcBefore) <= tombstoneThreshold)
-                    continue level;
-                else if (!compacting.contains(sstable) && !sstable.isMarkedSuspect() && worthDroppingTombstones(sstable, gcBefore))
-                    return sstable;
-            }
+            CompactionAggregate tombstoneAggregate = makeTombstoneCompaction(gcBefore,
+                                                                             nonSuspectAndNotIn(manifest.getLevel(i), compacting),
+                                                                             selector);
+            if (tombstoneAggregate != null)
+                return tombstoneAggregate;
         }
         return null;
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
index c6339374b453..c4ee57fa3212 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
@@ -33,14 +33,19 @@ public class LeveledCompactionTask extends CompactionTask
     private final long maxSSTableBytes;
     private final boolean majorCompaction;
 
-    public LeveledCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
+    public LeveledCompactionTask(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
     {
-        super(cfs, txn, gcBefore);
+        super(strategy, txn, gcBefore, false);
         this.level = level;
         this.maxSSTableBytes = maxSSTableBytes;
         this.majorCompaction = majorCompaction;
     }
 
+    static AbstractCompactionTask forCompaction(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
+    {
+        return new LeveledCompactionTask(strategy, txn, level, gcBefore, maxSSTableBytes, majorCompaction);
+    }
+
     @Override
     public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
                                                           Directories directories,
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledGenerations.java b/src/java/org/apache/cassandra/db/compaction/LeveledGenerations.java
index 64027f2aa777..b33b244a35a6 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledGenerations.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledGenerations.java
@@ -73,9 +73,9 @@ class LeveledGenerations
     private final TreeSet<SSTableReader> [] levels = new TreeSet[MAX_LEVEL_COUNT - 1];
 
     private static final Comparator<SSTableReader> nonL0Comparator = (o1, o2) -> {
-        int cmp = SSTableReader.sstableComparator.compare(o1, o2);
+        int cmp = SSTableReader.firstKeyComparator.compare(o1, o2);
         if (cmp == 0)
-            cmp = Ints.compare(o1.descriptor.generation, o2.descriptor.generation);
+            cmp = Ints.compare(o1.getGeneration(), o2.getGeneration());
         return cmp;
     };
 
@@ -152,8 +152,8 @@ void addAll(Iterable<SSTableReader> readers)
             SSTableReader after = level.ceiling(sstable);
             SSTableReader before = level.floor(sstable);
 
-            if (before != null && before.last.compareTo(sstable.first) >= 0 ||
-                after != null && after.first.compareTo(sstable.last) <= 0)
+            if (before != null && before.getLast().compareTo(sstable.getFirst()) >= 0 ||
+                after != null && after.getFirst().compareTo(sstable.getLast()) <= 0)
             {
                 sendToL0(sstable);
             }
@@ -254,7 +254,7 @@ Iterator<SSTableReader> wrappingIterator(int lvl, SSTableReader lastCompactedSST
         while (tail.hasNext())
         {
             SSTableReader potentialPivot = tail.peek();
-            if (potentialPivot.first.compareTo(lastCompactedSSTable.last) > 0)
+            if (potentialPivot.getFirst().compareTo(lastCompactedSSTable.getLast()) > 0)
             {
                 pivot = potentialPivot;
                 break;
@@ -312,7 +312,7 @@ private void maybeVerifyLevels()
             for (SSTableReader sstable : get(i))
             {
                 // no overlap:
-                assert prev == null || prev.last.compareTo(sstable.first) < 0;
+                assert prev == null || prev.getLast().compareTo(sstable.getFirst()) < 0;
                 prev = sstable;
                 // make sure it does not exist in any other level:
                 for (int j = 0; j < levelCount(); j++)
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index e3cbdab57219..7a9f69d41896 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
@@ -25,7 +25,6 @@
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Sets;
-import com.google.common.primitives.Ints;
 
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.io.sstable.Component;
@@ -40,7 +39,6 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.db.compaction.LeveledGenerations.MAX_LEVEL_COUNT;
 
@@ -52,7 +50,13 @@ public class LeveledManifest
      * if we have more than MAX_COMPACTING_L0 sstables in L0, we will run a round of STCS with at most
      * cfs.getMaxCompactionThreshold() sstables.
      */
-    private static final int MAX_COMPACTING_L0 = 32;
+    @VisibleForTesting
+    static final int MAX_COMPACTING_L0 = 32;
+
+    /**
+     * The maximum number of sstables in L0 for calculating the maximum number of bytes in L0.
+     */
+    static final int MAX_SSTABLES_L0 = 4;
 
     /**
      * If we go this many rounds without compacting
@@ -152,7 +156,33 @@ public synchronized void replace(Collection<SSTableReader> removed, Collection<S
         if (logger.isTraceEnabled())
             logger.trace("Adding [{}]", toString(added));
         generations.addAll(added);
-        lastCompactedSSTables[minLevel] = SSTableReader.sstableOrdering.max(added);
+        lastCompactedSSTables[minLevel] = SSTableReader.firstKeyOrdering.max(added);
+    }
+
+    /**
+     * See {@link AbstractCompactionStrategy#removeDeadSSTables}
+     */
+    public synchronized void removeDeadSSTables()
+    {
+        int removed = 0;
+        Set<SSTableReader> liveSet = cfs.getLiveSSTables();
+
+        for (int i = 0; i < generations.levelCount(); i++)
+        {
+            Iterator<SSTableReader> it = generations.get(i).iterator();
+            while (it.hasNext())
+            {
+                SSTableReader sstable = it.next();
+                if (!liveSet.contains(sstable))
+                {
+                    it.remove();
+                    ++removed;
+                }
+            }
+        }
+
+        if (removed > 0)
+            logger.debug("Removed {} dead sstables from the compactions tracked list.", removed);
     }
 
     private String toString(Collection<SSTableReader> sstables)
@@ -178,7 +208,7 @@ public long maxBytesForLevel(int level, long maxSSTableSizeInBytes)
     public static long maxBytesForLevel(int level, int levelFanoutSize, long maxSSTableSizeInBytes)
     {
         if (level == 0)
-            return 4L * maxSSTableSizeInBytes;
+            return MAX_SSTABLES_L0 * maxSSTableSizeInBytes;
         double bytes = Math.pow(levelFanoutSize, level) * maxSSTableSizeInBytes;
         if (bytes > Long.MAX_VALUE)
             throw new RuntimeException("At most " + Long.MAX_VALUE + " bytes may be in a compaction level; your maxSSTableSize must be absurdly high to compute " + bytes);
@@ -189,17 +219,17 @@ public static long maxBytesForLevel(int level, int levelFanoutSize, long maxSSTa
      * @return highest-priority sstables to compact, and level to compact them to
      * If no compactions are necessary, will return null
      */
-    public synchronized CompactionCandidate getCompactionCandidates()
+    synchronized CompactionAggregate.Leveled getCompactionCandidate()
     {
         // during bootstrap we only do size tiering in L0 to make sure
         // the streamed files can be placed in their original levels
         if (StorageService.instance.isBootstrapMode())
         {
-            List<SSTableReader> mostInteresting = getSSTablesForSTCS(generations.get(0));
+            CompactionPick mostInteresting = getSSTablesForSTCS(generations.get(0));
             if (!mostInteresting.isEmpty())
             {
                 logger.info("Bootstrapping - doing STCS in L0");
-                return new CompactionCandidate(mostInteresting, 0, Long.MAX_VALUE);
+                return getSTCSAggregate(mostInteresting);
             }
             return null;
         }
@@ -233,7 +263,7 @@ public synchronized CompactionCandidate getCompactionCandidates()
 
         // Let's check that L0 is far enough behind to warrant STCS.
         // If it is, it will be used before proceeding any of higher level
-        CompactionCandidate l0Compaction = getSTCSInL0CompactionCandidate();
+        CompactionAggregate.Leveled l0Compactions = getSTCSInL0CompactionCandidate();
 
         for (int i = generations.levelCount() - 1; i > 0; i--)
         {
@@ -242,7 +272,7 @@ public synchronized CompactionCandidate getCompactionCandidates()
                 continue; // mostly this just avoids polluting the debug log with zero scores
             // we want to calculate score excluding compacting ones
             Set<SSTableReader> sstablesInLevel = Sets.newHashSet(sstables);
-            Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getTracker().getCompacting());
+            Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getCompactingSSTables());
             long remainingBytesForLevel = SSTableReader.getTotalBytes(remaining);
             long maxBytesForLevel = maxBytesForLevel(i, maxSSTableSizeInBytes);
             double score = (double) remainingBytesForLevel / (double) maxBytesForLevel;
@@ -260,18 +290,20 @@ public synchronized CompactionCandidate getCompactionCandidates()
                 }
 
                 // before proceeding with a higher level, let's see if L0 is far enough behind to warrant STCS
-                if (l0Compaction != null)
-                    return l0Compaction;
+                if (l0Compactions != null)
+                    return l0Compactions;
 
                 // L0 is fine, proceed with this level
                 Collection<SSTableReader> candidates = getCandidatesFor(i);
+                int pendingCompactions = Math.max(0, getEstimatedPendingTasks(i) - 1);
+
                 if (!candidates.isEmpty())
                 {
                     int nextLevel = getNextLevel(candidates);
                     candidates = getOverlappingStarvedSSTables(nextLevel, candidates);
                     if (logger.isTraceEnabled())
                         logger.trace("Compaction candidates for L{} are {}", i, toString(candidates));
-                    return new CompactionCandidate(candidates, nextLevel, maxSSTableSizeInBytes);
+                    return CompactionAggregate.createLeveled(sstablesInLevel, candidates, pendingCompactions, maxSSTableSizeInBytes, i, nextLevel, score);
                 }
                 else
                 {
@@ -281,7 +313,9 @@ public synchronized CompactionCandidate getCompactionCandidates()
         }
 
         // Higher levels are happy, time for a standard, non-STCS L0 compaction
-        if (generations.get(0).isEmpty())
+        Set<SSTableReader> sstables = getLevel(0);
+
+        if (sstables.isEmpty())
             return null;
         Collection<SSTableReader> candidates = getCandidatesFor(0);
         if (candidates.isEmpty())
@@ -289,36 +323,51 @@ public synchronized CompactionCandidate getCompactionCandidates()
             // Since we don't have any other compactions to do, see if there is a STCS compaction to perform in L0; if
             // there is a long running compaction, we want to make sure that we continue to keep the number of SSTables
             // small in L0.
-            return l0Compaction;
+            return l0Compactions;
         }
-        return new CompactionCandidate(candidates, getNextLevel(candidates), maxSSTableSizeInBytes);
+        double l0Score = (double) SSTableReader.getTotalBytes(sstables) / (double) maxBytesForLevel(0, maxSSTableSizeInBytes);
+        int l0PendingCompactions = Math.max(0, getEstimatedPendingTasks(0) - 1);
+        return CompactionAggregate.createLeveled(sstables, candidates, l0PendingCompactions, maxSSTableSizeInBytes, 0, getNextLevel(candidates), l0Score);
     }
 
-    private CompactionCandidate getSTCSInL0CompactionCandidate()
+    private CompactionAggregate.Leveled getSTCSInL0CompactionCandidate()
     {
         if (!DatabaseDescriptor.getDisableSTCSInL0() && generations.get(0).size() > MAX_COMPACTING_L0)
         {
-            List<SSTableReader> mostInteresting = getSSTablesForSTCS(generations.get(0));
+            CompactionPick mostInteresting = getSSTablesForSTCS(getLevel(0));
             if (!mostInteresting.isEmpty())
             {
                 logger.debug("L0 is too far behind, performing size-tiering there first");
-                return new CompactionCandidate(mostInteresting, 0, Long.MAX_VALUE);
+                return getSTCSAggregate(mostInteresting);
             }
         }
 
         return null;
     }
 
-    private List<SSTableReader> getSSTablesForSTCS(Collection<SSTableReader> sstables)
+    private CompactionAggregate.Leveled getSTCSAggregate(CompactionPick compaction)
     {
-        Iterable<? extends SSTableReader> candidates = cfs.getTracker().getUncompacting(sstables);
-        List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(AbstractCompactionStrategy.filterSuspectSSTables(candidates));
-        List<List<SSTableReader>> buckets = SizeTieredCompactionStrategy.getBuckets(pairs,
-                                                                                    options.bucketHigh,
-                                                                                    options.bucketLow,
-                                                                                    options.minSSTableSize);
-        return SizeTieredCompactionStrategy.mostInterestingBucket(buckets,
-                cfs.getMinimumCompactionThreshold(), cfs.getMaximumCompactionThreshold());
+        Set<SSTableReader> sstables = getLevel(0);
+        double score = (double) SSTableReader.getTotalBytes(sstables) / (double) maxBytesForLevel(0, maxSSTableSizeInBytes);
+        int remainingSSTables = sstables.size() - compaction.sstables.size();
+        int pendingTasks = remainingSSTables > cfs.getMinimumCompactionThreshold()
+                           ? (int) Math.ceil((double) remainingSSTables / cfs.getMaximumCompactionThreshold()) // divide as double so ceil() rounds up
+                           : 0;
+        return CompactionAggregate.createLeveledForSTCS(sstables, compaction, pendingTasks, score);
+    }
+
+    private CompactionPick getSSTablesForSTCS(Collection<SSTableReader> sstables)
+    {
+        Iterable<? extends SSTableReader> candidates = cfs.getNoncompactingSSTables(sstables);
+
+        SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets;
+        sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(candidates,
+                                                                               options,
+                                                                               cfs.getMinimumCompactionThreshold(),
+                                                                               cfs.getMaximumCompactionThreshold());
+        sizeTieredBuckets.aggregate();
+
+        return CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates());
     }
 
     /**
@@ -359,10 +408,10 @@ private Collection<SSTableReader> getOverlappingStarvedSSTables(int targetLevel,
                     PartitionPosition min = null;
                     for (SSTableReader candidate : candidates)
                     {
-                        if (min == null || candidate.first.compareTo(min) < 0)
-                            min = candidate.first;
-                        if (max == null || candidate.last.compareTo(max) > 0)
-                            max = candidate.last;
+                        if (min == null || candidate.getFirst().compareTo(min) < 0)
+                            min = candidate.getFirst();
+                        if (max == null || candidate.getLast().compareTo(max) > 0)
+                            max = candidate.getLast();
                     }
                     if (min == null || max == null || min.equals(max)) // single partition sstables - we cannot include a high level sstable.
                         return candidates;
@@ -370,7 +419,7 @@ private Collection<SSTableReader> getOverlappingStarvedSSTables(int targetLevel,
                     Range<PartitionPosition> boundaries = new Range<>(min, max);
                     for (SSTableReader sstable : generations.get(i))
                     {
-                        Range<PartitionPosition> r = new Range<>(sstable.first, sstable.last);
+                        Range<PartitionPosition> r = new Range<>(sstable.getFirst(), sstable.getLast());
                         if (boundaries.contains(r) && !compacting.contains(sstable))
                         {
                             logger.info("Adding high-level (L{}) {} to candidates", sstable.getSSTableLevel(), sstable);
@@ -426,20 +475,20 @@ private static Set<SSTableReader> overlapping(Collection<SSTableReader> candidat
          */
         Iterator<SSTableReader> iter = candidates.iterator();
         SSTableReader sstable = iter.next();
-        Token first = sstable.first.getToken();
-        Token last = sstable.last.getToken();
+        Token first = sstable.getFirst().getToken();
+        Token last = sstable.getLast().getToken();
         while (iter.hasNext())
         {
             sstable = iter.next();
-            first = first.compareTo(sstable.first.getToken()) <= 0 ? first : sstable.first.getToken();
-            last = last.compareTo(sstable.last.getToken()) >= 0 ? last : sstable.last.getToken();
+            first = first.compareTo(sstable.getFirst().getToken()) <= 0 ? first : sstable.getFirst().getToken();
+            last = last.compareTo(sstable.getLast().getToken()) >= 0 ? last : sstable.getLast().getToken();
         }
         return overlapping(first, last, others);
     }
 
-    private static Set<SSTableReader> overlappingWithBounds(SSTableReader sstable, Map<SSTableReader, Bounds<Token>> others)
+    static Set<SSTableReader> overlappingWithBounds(SSTableReader sstable, Map<SSTableReader, Bounds<Token>> others)
     {
-        return overlappingWithBounds(sstable.first.getToken(), sstable.last.getToken(), others);
+        return overlappingWithBounds(sstable.getFirst().getToken(), sstable.getLast().getToken(), others);
     }
 
     /**
@@ -465,27 +514,34 @@ private static Set<SSTableReader> overlappingWithBounds(Token start, Token end,
         return overlapped;
     }
 
-    private static Map<SSTableReader, Bounds<Token>> genBounds(Iterable<SSTableReader> ssTableReaders)
+    @VisibleForTesting
+    static Map<SSTableReader, Bounds<Token>> genBounds(Iterable<SSTableReader> ssTableReaders)
     {
         Map<SSTableReader, Bounds<Token>> boundsMap = new HashMap<>();
         for (SSTableReader sstable : ssTableReaders)
         {
-            boundsMap.put(sstable, new Bounds<>(sstable.first.getToken(), sstable.last.getToken()));
+            boundsMap.put(sstable, new Bounds<>(sstable.getFirst().getToken(), sstable.getLast().getToken()));
         }
         return boundsMap;
     }
 
     /**
+     * Determine the highest-priority sstables to compact for the given level and add any overlapping sstables
+     * from the next level.
+     * <p/>
      * @return highest-priority sstables to compact for the given level.
      * If no compactions are possible (because of concurrent compactions or because some sstables are excluded
      * for prior failure), will return an empty list.  Never returns null.
+     *
+     * @param level the level number; the level must contain at least one sstable (asserted on entry)
+     *              and level 0 is handled separately from the higher levels
      */
     private Collection<SSTableReader> getCandidatesFor(int level)
     {
         assert !generations.get(level).isEmpty();
         logger.trace("Choosing candidates for L{}", level);
 
-        final Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
+        final Set<SSTableReader> compacting = cfs.getCompactingSSTables();
 
         if (level == 0)
         {
@@ -495,10 +551,10 @@ private Collection<SSTableReader> getCandidatesFor(int level)
             PartitionPosition firstCompactingKey = null;
             for (SSTableReader candidate : compactingL0)
             {
-                if (firstCompactingKey == null || candidate.first.compareTo(firstCompactingKey) < 0)
-                    firstCompactingKey = candidate.first;
-                if (lastCompactingKey == null || candidate.last.compareTo(lastCompactingKey) > 0)
-                    lastCompactingKey = candidate.last;
+                if (firstCompactingKey == null || candidate.getFirst().compareTo(firstCompactingKey) < 0)
+                    firstCompactingKey = candidate.getFirst();
+                if (lastCompactingKey == null || candidate.getLast().compareTo(lastCompactingKey) > 0)
+                    lastCompactingKey = candidate.getLast();
             }
 
             // L0 is the dumping ground for new sstables which thus may overlap each other.
@@ -583,7 +639,7 @@ private Set<SSTableReader> getCompactingL0()
     {
         Set<SSTableReader> sstables = new HashSet<>();
         Set<SSTableReader> levelSSTables = new HashSet<>(generations.get(0));
-        for (SSTableReader sstable : cfs.getTracker().getCompacting())
+        for (SSTableReader sstable : cfs.getCompactingSSTables())
         {
             if (levelSSTables.contains(sstable))
                 sstables.add(sstable);
@@ -618,32 +674,68 @@ public synchronized int getLevelCount()
         return 0;
     }
 
-    public synchronized int getEstimatedTasks()
+    public synchronized List<CompactionAggregate> getEstimatedTasks(CompactionAggregate.Leveled selected)
     {
-        long tasks = 0;
-        long[] estimated = new long[generations.levelCount()];
+        List<CompactionAggregate> ret = new ArrayList<>(generations.levelCount());
 
         for (int i = generations.levelCount() - 1; i >= 0; i--)
         {
             Set<SSTableReader> sstables = generations.get(i);
-            // If there is 1 byte over TBL - (MBL * 1.001), there is still a task left, so we need to round up.
-            estimated[i] = (long)Math.ceil((double)Math.max(0L, SSTableReader.getTotalBytes(sstables) - (long)(maxBytesForLevel(i, maxSSTableSizeInBytes) * 1.001)) / (double)maxSSTableSizeInBytes);
-            tasks += estimated[i];
-        }
 
-        if (!DatabaseDescriptor.getDisableSTCSInL0() && generations.get(0).size() > cfs.getMaximumCompactionThreshold())
-        {
-            int l0compactions = generations.get(0).size() / cfs.getMaximumCompactionThreshold();
-            tasks += l0compactions;
-            estimated[0] += l0compactions;
+            // do not log high levels that are empty, only log after we've found a non-empty level
+            if (sstables.isEmpty() && ret.isEmpty())
+                continue;
+
+            if (selected != null && selected.level == i)
+            {
+                ret.add(selected);
+                continue; // pending tasks already calculated by getCompactionCandidate()
+            }
+
+            if (i == 0)
+            { // for L0 if it is too far behind then pick the STCS choice
+                CompactionAggregate l0Compactions = getSTCSInL0CompactionCandidate();
+                if (l0Compactions != null)
+                {
+                    ret.add(l0Compactions);
+                    continue;
+                }
+            }
+
+            int pendingTasks = getEstimatedPendingTasks(i);
+            double score = (double) SSTableReader.getTotalBytes(sstables) / (double) maxBytesForLevel(i, maxSSTableSizeInBytes);
+            ret.add(CompactionAggregate.createLeveled(sstables, pendingTasks, maxSSTableSizeInBytes, i, score));
         }
 
-        logger.trace("Estimating {} compactions to do for {}.{}",
-                     Arrays.toString(estimated), cfs.keyspace.getName(), cfs.name);
-        return Ints.checkedCast(tasks);
+        logger.trace("Estimating {} compactions to do for {}", ret.size(), cfs.metadata());
+        return ret;
     }
 
-    public int getNextLevel(Collection<SSTableReader> sstables)
+    /**
+     * @return the estimated number of LCS compactions for a given level with the given sstables. Because it compacts one sstable at
+     *         a time, this number is determined as the number of bytes above the maximum divided by the maximum sstable size in bytes.
+     *
+     *         This is however incorrect for L0. If the STCS threshold has been exceeded, we simply divide by the max threshold,
+     *         otherwise we currently use a very pessimistic estimate (no overlapping sstables).
+     */
+    private int getEstimatedPendingTasks(int level)
+    {
+        final Set<SSTableReader> sstables = getLevel(level);
+        if (sstables.isEmpty())
+            return 0;
+
+        final Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        final Set<SSTableReader> remaining = Sets.difference(Sets.newHashSet(sstables), compacting);
+
+        if (level == 0 && !DatabaseDescriptor.getDisableSTCSInL0() && remaining.size() > MAX_COMPACTING_L0)
+            return remaining.size() / cfs.getMaximumCompactionThreshold();
+
+        // If there is 1 byte over TBL - (MBL * 1.001), there is still a task left, so we need to round up.
+        return Math.toIntExact((long) Math.ceil((Math.max(0L, SSTableReader.getTotalBytes(remaining) -
+                                                              (maxBytesForLevel(level, maxSSTableSizeInBytes) * 1.001)) / (double) maxSSTableSizeInBytes)));
+    }
+
+    int getNextLevel(Collection<SSTableReader> sstables)
     {
         int maximumLevel = Integer.MIN_VALUE;
         int minimumLevel = Integer.MAX_VALUE;
@@ -681,18 +773,4 @@ synchronized void newLevel(SSTableReader sstable, int oldLevel)
         generations.newLevel(sstable, oldLevel);
         lastCompactedSSTables[oldLevel] = sstable;
     }
-
-    public static class CompactionCandidate
-    {
-        public final Collection<SSTableReader> sstables;
-        public final int level;
-        public final long maxSSTableBytes;
-
-        public CompactionCandidate(Collection<SSTableReader> sstables, int level, long maxSSTableBytes)
-        {
-            this.sstables = sstables;
-            this.level = level;
-            this.maxSSTableBytes = maxSSTableBytes;
-        }
-    }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/OperationType.java b/src/java/org/apache/cassandra/db/compaction/OperationType.java
index 7c602aa8a18f..9429d2eaddec 100644
--- a/src/java/org/apache/cassandra/db/compaction/OperationType.java
+++ b/src/java/org/apache/cassandra/db/compaction/OperationType.java
@@ -19,6 +19,14 @@
 
 import com.google.common.base.Predicate;
 
+/**
+ * The types of operations that can be observed with {@link AbstractTableOperation} and tracked by
+ * {@link org.apache.cassandra.db.lifecycle.LifecycleTransaction}.
+ * <p/>
+ * Historically these operations have been broadly described as "compactions", even though they have
+ * nothing to do with actual compactions. Any operation that can report progress and that normally
+ * involves files, either for reading or writing, is a valid operation.
+ */
 public enum OperationType
 {
     /** Each modification here should be also applied to {@link org.apache.cassandra.tools.nodetool.Stop#compactionType} */
diff --git a/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java b/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
index aefa40be807e..5c40be1fc1ec 100644
--- a/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
@@ -547,7 +547,7 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Dir
             throw new UnsupportedOperationException();
         }
 
-        protected int executeInternal(ActiveCompactionsTracker activeCompactions)
+        protected int executeInternal()
         {
             run();
             return transaction.originals().size();
diff --git a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
index 1746d7c2abde..6b7568571c7b 100644
--- a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
+++ b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
@@ -19,7 +19,6 @@
 
 import java.util.*;
 import java.util.function.LongPredicate;
-import java.util.function.Predicate;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
@@ -29,23 +28,23 @@
 
 public class SSTableSplitter 
 {
-    private final SplittingCompactionTask task;
+    private final AbstractCompactionTask task;
 
     public SSTableSplitter(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
     {
-        this.task = new SplittingCompactionTask(cfs, transaction, sstableSizeInMB);
+        this.task = SplittingCompactionTask.forSSTableSplitting(cfs, transaction, sstableSizeInMB);
     }
 
     public void split()
     {
-        task.execute(ActiveCompactionsTracker.NOOP);
+        task.execute();
     }
 
-    public static class SplittingCompactionTask extends CompactionTask
+    private static class SplittingCompactionTask extends CompactionTask
     {
         private final int sstableSizeInMB;
 
-        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
+        private SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
         {
             super(cfs, transaction, CompactionManager.NO_GC, false);
             this.sstableSizeInMB = sstableSizeInMB;
@@ -54,6 +53,11 @@ public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction trans
                 throw new IllegalArgumentException("Invalid target size for SSTables, must be > 0 (got: " + sstableSizeInMB + ")");
         }
 
+        static AbstractCompactionTask forSSTableSplitting(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
+        {
+            return new SplittingCompactionTask(cfs, transaction, sstableSizeInMB);
+        }
+
         @Override
         protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
         {
diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index 47bc1e30a15a..4cf322ac0dd1 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -178,7 +178,7 @@ public void scrub()
             while (!dataFile.isEOF())
             {
                 if (scrubInfo.isStopRequested())
-                    throw new CompactionInterruptedException(scrubInfo.getCompactionInfo());
+                    throw new CompactionInterruptedException(scrubInfo.getProgress());
 
                 // position in a data file where the partition starts
                 long dataStart = dataFile.getFilePointer();
@@ -481,12 +481,12 @@ public void close()
         }
     }
 
-    public CompactionInfo.Holder getScrubInfo()
+    public TableOperation getScrubInfo()
     {
         return scrubInfo;
     }
 
-    private static class ScrubInfo extends CompactionInfo.Holder
+    private static class ScrubInfo extends AbstractTableOperation
     {
         private final RandomAccessReader dataFile;
         private final SSTableReader sstable;
@@ -501,17 +501,17 @@ public ScrubInfo(RandomAccessReader dataFile, SSTableReader sstable, Lock fileRe
             scrubCompactionId = UUIDGen.getTimeUUID();
         }
 
-        public CompactionInfo getCompactionInfo()
+        public OperationProgress getProgress()
         {
             fileReadLock.lock();
             try
             {
-                return new CompactionInfo(sstable.metadata(),
-                                          OperationType.SCRUB,
-                                          dataFile.getFilePointer(),
-                                          dataFile.length(),
-                                          scrubCompactionId,
-                                          ImmutableSet.of(sstable));
+                return new OperationProgress(sstable.metadata(),
+                                             OperationType.SCRUB,
+                                             dataFile.getFilePointer(),
+                                             dataFile.length(),
+                                             scrubCompactionId,
+                                             ImmutableSet.of(sstable));
             }
             catch (Exception e)
             {
diff --git a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
index 2e1dffc5221d..166b2ae1af6a 100644
--- a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
@@ -50,6 +50,13 @@ public SingleSSTableLCSTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int
         this.level = level;
     }
 
+    public static AbstractCompactionTask forCompaction(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level)
+    {
+        SingleSSTableLCSTask ret = new SingleSSTableLCSTask(strategy.cfs, txn, level);
+        ret.compObserver = strategy.getBackgroundCompactions();
+        return ret;
+    }
+
     @Override
     public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
     {
@@ -57,7 +64,7 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Dir
     }
 
     @Override
-    protected int executeInternal(ActiveCompactionsTracker activeCompactions)
+    protected int executeInternal()
     {
         run();
         return 1;
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
new file mode 100644
index 000000000000..642aa55af7c5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * The statistics for size tiered compaction.
+ * <p/>
+ * Implements serializable to allow structured info to be returned via JMX.
+ */
+public class SizeTieredCompactionStatistics extends TieredCompactionStatistics
+{
+    /** The average sstable size in this tier */
+    private final long avgSSTableSize;
+
+    SizeTieredCompactionStatistics(long avgSSTableSize,
+                                   double hotness,
+                                   int numCompactions,
+                                   int numCompactionsInProgress,
+                                   int numSSTables,
+                                   int numCandidateSSTables,
+                                   int numCompactingSSTables,
+                                   long sizeInBytes,
+                                   double readThroughput,
+                                   double writeThroughput,
+                                   long tot,
+                                   long read,
+                                   long written)
+    {
+        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput, hotness, tot, read, written);
+        this.avgSSTableSize = avgSSTableSize;
+    }
+
+    /** The average sstable size in this tier */
+    public long avgSSTableSize()
+    {
+        return avgSSTableSize;
+    }
+
+    @Override
+    @JsonProperty("Bucket")
+    protected String tierValue()
+    {
+        return toString(avgSSTableSize);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
index 8d1d8dac2eb3..5e8c2f8c6eca 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
@@ -18,9 +18,14 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.*;
-import java.util.Map.Entry;
+import java.util.stream.Collectors;
+
+import javax.annotation.Nullable;
+import javax.annotation.concurrent.NotThreadSafe;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 import org.slf4j.Logger;
@@ -38,273 +43,301 @@
 
 import static com.google.common.collect.Iterables.filter;
 
-public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy
+public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(SizeTieredCompactionStrategy.class);
 
-    private static final Comparator<Pair<List<SSTableReader>,Double>> bucketsByHotnessComparator = new Comparator<Pair<List<SSTableReader>, Double>>()
-    {
-        public int compare(Pair<List<SSTableReader>, Double> o1, Pair<List<SSTableReader>, Double> o2)
-        {
-            int comparison = Double.compare(o1.right, o2.right);
-            if (comparison != 0)
-                return comparison;
-
-            // break ties by compacting the smallest sstables first (this will probably only happen for
-            // system tables and new/unread sstables)
-            return Long.compare(avgSize(o1.left), avgSize(o2.left));
-        }
-
-        private long avgSize(List<SSTableReader> sstables)
-        {
-            long n = 0;
-            for (SSTableReader sstable : sstables)
-                n += sstable.bytesOnDisk();
-            return n / sstables.size();
-        }
-    };
+    /**
+     * Compare {@link CompactionPick} instances by hotness first and in case of a tie by sstable size by
+     * selecting the largest first (a tie would happen for system tables and new/unread sstables).
+     * <p/>
+     * Note that in the previous version there was a comment saying "break ties by compacting the smallest sstables first"
+     * but the code was doing the opposite. I preserved the behavior and fixed the comment.
+     */
+    private static final Comparator<CompactionPick> comparePicksByHotness = Comparator.comparing(CompactionPick::hotness)
+                                                                                      .thenComparing(CompactionPick::avgSizeInBytes);
 
     protected SizeTieredCompactionStrategyOptions sizeTieredOptions;
-    protected volatile int estimatedRemainingTasks;
     @VisibleForTesting
     protected final Set<SSTableReader> sstables = new HashSet<>();
 
     public SizeTieredCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
     {
         super(cfs, options);
-        this.estimatedRemainingTasks = 0;
         this.sizeTieredOptions = new SizeTieredCompactionStrategyOptions(options);
     }
 
-    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    @Override
+    protected synchronized CompactionAggregate getNextBackgroundAggregate(final int gcBefore)
     {
         // make local copies so they can't be changed out from under us mid-method
         int minThreshold = cfs.getMinimumCompactionThreshold();
         int maxThreshold = cfs.getMaximumCompactionThreshold();
 
-        Iterable<SSTableReader> candidates = filterSuspectSSTables(filter(cfs.getUncompactingSSTables(), sstables::contains));
+        List<SSTableReader> candidates = new ArrayList<>();
+        synchronized (sstables)
+        {
+            Iterables.addAll(candidates, nonSuspectAndNotIn(sstables, cfs.getCompactingSSTables()));
+        }
+
+        SizeTieredBuckets sizeTieredBuckets = new SizeTieredBuckets(candidates, sizeTieredOptions, minThreshold, maxThreshold);
+        sizeTieredBuckets.aggregate();
 
-        List<List<SSTableReader>> buckets = getBuckets(createSSTableAndLengthPairs(candidates), sizeTieredOptions.bucketHigh, sizeTieredOptions.bucketLow, sizeTieredOptions.minSSTableSize);
-        logger.trace("Compaction buckets are {}", buckets);
-        estimatedRemainingTasks = getEstimatedCompactionsByTasks(cfs, buckets);
-        cfs.getCompactionStrategyManager().compactionLogger.pending(this, estimatedRemainingTasks);
-        List<SSTableReader> mostInteresting = mostInterestingBucket(buckets, minThreshold, maxThreshold);
-        if (!mostInteresting.isEmpty())
-            return mostInteresting;
+        backgroundCompactions.setPending(sizeTieredBuckets.getAggregates());
+
+        CompactionAggregate ret = sizeTieredBuckets.getAggregates().isEmpty() ? null : sizeTieredBuckets.getAggregates().get(0);
 
         // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
         // ratio is greater than threshold.
-        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
-        for (SSTableReader sstable : candidates)
-        {
-            if (worthDroppingTombstones(sstable, gcBefore))
-                sstablesWithTombstones.add(sstable);
-        }
-        if (sstablesWithTombstones.isEmpty())
-            return Collections.emptyList();
+        if (ret == null || ret.isEmpty())
+            ret = makeTombstoneCompaction(gcBefore, candidates, list -> Collections.max(list, SSTableReader.sizeComparator));
 
-        return Collections.singletonList(Collections.max(sstablesWithTombstones, SSTableReader.sizeComparator));
+        return ret;
     }
 
-
     /**
-     * @param buckets list of buckets from which to return the most interesting, where "interesting" is the total hotness for reads
-     * @param minThreshold minimum number of sstables in a bucket to qualify as interesting
-     * @param maxThreshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this)
-     * @return a bucket (list) of sstables to compact
+     * This class contains the logic for {@link SizeTieredCompactionStrategy}:
+     *
+     * - sorts the sstables by length on disk
+     * - it sorts the candidates into buckets
+     * - takes a snapshot of the sstable hotness
+     * - it organizes the buckets into a list of {@link CompactionAggregate}, an aggregate per bucket.
+     *   An aggregate will have a list of compaction picks, each pick is a list of sstables below the max threshold,
+     *   sorted by hotness.
+     * - the aggregates are sorted by comparing the total hotness of the first pick of each aggregate
+     * - the aggregate with the hottest first pick will have its first pick submitted for compaction.
      */
-    public static List<SSTableReader> mostInterestingBucket(List<List<SSTableReader>> buckets, int minThreshold, int maxThreshold)
+    @NotThreadSafe
+    final static class SizeTieredBuckets
     {
-        // skip buckets containing less than minThreshold sstables, and limit other buckets to maxThreshold sstables
-        final List<Pair<List<SSTableReader>, Double>> prunedBucketsAndHotness = new ArrayList<>(buckets.size());
-        for (List<SSTableReader> bucket : buckets)
+        private final SizeTieredCompactionStrategyOptions options;
+        private final List<SSTableReader> tablesBySize;
+        private final Map<Long, List<SSTableReader>> buckets;
+        private final Map<SSTableReader, Double> hotnessSnapshot;
+        private final int minThreshold;
+        private final int maxThreshold;
+
+        /**
+         * This is the list of compactions ordered by most interesting first
+         */
+        private List<CompactionAggregate> aggregates;
+
+        /**
+         * @param candidates   list of sstables that are not yet compacting
+         * @param options      the options for size tiered compaction strategy
+         * @param minThreshold minimum number of sstables in a bucket to qualify as interesting
+         * @param maxThreshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this)
+         */
+        SizeTieredBuckets(Iterable<? extends SSTableReader> candidates,
+                          SizeTieredCompactionStrategyOptions options,
+                          int minThreshold,
+                          int maxThreshold)
         {
-            Pair<List<SSTableReader>, Double> bucketAndHotness = trimToThresholdWithHotness(bucket, maxThreshold);
-            if (bucketAndHotness != null && bucketAndHotness.left.size() >= minThreshold)
-                prunedBucketsAndHotness.add(bucketAndHotness);
+            this.options = options;
+            this.tablesBySize = new ArrayList<>();
+            Iterables.addAll(this.tablesBySize, candidates);
+            this.tablesBySize.sort(SSTableReader.sizeComparator);
+            this.buckets = getBuckets(tablesBySize, options);
+            this.hotnessSnapshot = getHotnessSnapshot(buckets.values());
+            this.minThreshold = minThreshold;
+            this.maxThreshold = maxThreshold;
+
+            this.aggregates = new ArrayList<>(buckets.size());
+
+            if (logger.isTraceEnabled())
+                logger.trace("Compaction buckets are {}", buckets);
         }
-        if (prunedBucketsAndHotness.isEmpty())
-            return Collections.emptyList();
 
-        Pair<List<SSTableReader>, Double> hottest = Collections.max(prunedBucketsAndHotness, bucketsByHotnessComparator);
-        return hottest.left;
-    }
-
-    /**
-     * Returns a (bucket, hotness) pair or null if there were not enough sstables in the bucket to meet minThreshold.
-     * If there are more than maxThreshold sstables, the coldest sstables will be trimmed to meet the threshold.
-     **/
-    @VisibleForTesting
-    static Pair<List<SSTableReader>, Double> trimToThresholdWithHotness(List<SSTableReader> bucket, int maxThreshold)
-    {
-        // Sort by sstable hotness (descending). We first build a map because the hotness may change during the sort.
-        final Map<SSTableReader, Double> hotnessSnapshot = getHotnessMap(bucket);
-        Collections.sort(bucket, new Comparator<SSTableReader>()
+        /**
+         * Group sstables of similar on-disk size into buckets, keyed by each bucket's average size.
+         * The given list must be sorted using SSTableReader.sizeComparator
+         */
+        private static Map<Long, List<SSTableReader>> getBuckets(List<SSTableReader> sstables, SizeTieredCompactionStrategyOptions options)
         {
-            public int compare(SSTableReader o1, SSTableReader o2)
+            if (sstables.isEmpty())
+                return Collections.emptyMap();
+
+            Map<Long, List<SSTableReader>> buckets = new HashMap<>();
+
+            long currentAverageSize = 0;
+            List<SSTableReader> currentBucket = new ArrayList<>();
+
+            for (SSTableReader sstable: sstables)
             {
-                return -1 * Double.compare(hotnessSnapshot.get(o1), hotnessSnapshot.get(o2));
+                long size = sstable.onDiskLength();
+                assert size >= currentAverageSize;
+
+                if (size >= currentAverageSize * options.bucketHigh
+                    && size >= options.minSSTableSize
+                    && currentAverageSize > 0)   // false for first table only
+                {
+                    // Switch to new bucket
+                    buckets.put(currentAverageSize, currentBucket);
+                    currentBucket = new ArrayList<>();
+                }
+                // TODO: Is it okay that the bucket max can grow unboundedly?
+
+                currentAverageSize = (currentAverageSize * currentBucket.size() + size) / (currentBucket.size() + 1);
+                currentBucket.add(sstable);
             }
-        });
 
-        // and then trim the coldest sstables off the end to meet the maxThreshold
-        List<SSTableReader> prunedBucket = bucket.subList(0, Math.min(bucket.size(), maxThreshold));
+            buckets.put(currentAverageSize, currentBucket);
+            return buckets;
+        }
 
-        // bucket hotness is the sum of the hotness of all sstable members
-        double bucketHotness = 0.0;
-        for (SSTableReader sstr : prunedBucket)
-            bucketHotness += hotness(sstr);
+        /**
+         * For each bucket with at least minThreshold sstables:
+         * <p>
+         * - sort the sstables by hotness
+         * - divide the bucket into max threshold sstables and add it to a temporary list of candidates along with the total hotness of the bucket section
+         * <p>
+         * Then select the candidate with the max hotness and the most interesting bucket and put the remaining candidates in the pending list.
+         *
+         * @return the parent object {@link SizeTieredBuckets}
+         */
+        SizeTieredBuckets aggregate()
+        {
+            if (!aggregates.isEmpty())
+                return this; // already called
 
-        return Pair.create(prunedBucket, bucketHotness);
-    }
+            List<CompactionAggregate> aggregatesWithoutCompactions = new ArrayList<>(buckets.size());
+            List<CompactionAggregate> aggregatesWithCompactions = new ArrayList<>(buckets.size());
 
-    private static Map<SSTableReader, Double> getHotnessMap(Collection<SSTableReader> sstables)
-    {
-        Map<SSTableReader, Double> hotness = new HashMap<>(sstables.size());
-        for (SSTableReader sstable : sstables)
-            hotness.put(sstable, hotness(sstable));
-        return hotness;
-    }
+            for (Map.Entry<Long, List<SSTableReader>> entry : buckets.entrySet())
+            {
+                long avgSizeBytes = entry.getKey();
+                long minSizeBytes = (long) (avgSizeBytes * options.bucketLow);
+                long maxSizeBytes = (long) (avgSizeBytes * options.bucketHigh);
 
-    /**
-     * Returns the reads per second per key for this sstable, or 0.0 if the sstable has no read meter
-     */
-    private static double hotness(SSTableReader sstr)
-    {
-        // system tables don't have read meters, just use 0.0 for the hotness
-        return sstr.getReadMeter() == null ? 0.0 : sstr.getReadMeter().twoHourRate() / sstr.estimatedKeys();
-    }
+                List<SSTableReader> bucket = entry.getValue();
+                double hotness = totHotness(bucket, hotnessSnapshot);
 
-    @SuppressWarnings("resource")
-    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
-    {
-        List<SSTableReader> previousCandidate = null;
-        while (true)
-        {
-            List<SSTableReader> hottestBucket = getNextBackgroundSSTables(gcBefore);
+                if (bucket.size() < minThreshold)
+                {
+                    if (logger.isTraceEnabled())
+                        logger.trace("Aggregate with {} avg bytes for {} files not considered for compaction: {}", avgSizeBytes, bucket.size(), bucket);
+
+                    aggregatesWithoutCompactions.add(CompactionAggregate.createSizeTiered(bucket,
+                                                                                          CompactionPick.EMPTY,
+                                                                                          ImmutableList.of(),
+                                                                                          hotness,
+                                                                                          avgSizeBytes,
+                                                                                          minSizeBytes,
+                                                                                          maxSizeBytes));
+
+                    continue;
+                }
+
+                // sort the bucket by hotness
+                Collections.sort(bucket, (o1, o2) -> -1 * Double.compare(hotnessSnapshot.get(o1), hotnessSnapshot.get(o2)));
+
+                // now divide the candidates into a list of picks, each pick with at most max threshold sstables
+                int i = 0;
+                CompactionPick selected = null;
+                List<CompactionPick> pending = new ArrayList<>();
+
+
+                while ((bucket.size() - i) >= minThreshold)
+                {
+                    List<SSTableReader> sstables = bucket.subList(i, i + Math.min(bucket.size() - i, maxThreshold));
+                    if (selected == null)
+                        selected = CompactionPick.create(avgSizeBytes, sstables, totHotness(sstables, hotnessSnapshot));
+                    else
+                        pending.add(CompactionPick.create(avgSizeBytes, sstables, totHotness(sstables, hotnessSnapshot)));
+
+                    i += sstables.size();
+                }
+
+                if (logger.isTraceEnabled())
+                    logger.trace("Aggregate with {} avg bytes for {} files considered for compaction: {}", avgSizeBytes, bucket.size(), bucket);
+
+                // Finally create the new aggregate with the new pending compactions and those already compacting and not yet completed
+                aggregatesWithCompactions.add(CompactionAggregate.createSizeTiered(bucket, selected, pending, hotness, avgSizeBytes, minSizeBytes, maxSizeBytes));
+            }
 
-            if (hottestBucket.isEmpty())
-                return null;
+            // This sorts the aggregates based on the hotness of their selected pick so that the aggregate with the hottest selected pick
+            // is first in the list and gets submitted
+            if (!aggregatesWithCompactions.isEmpty())
+            {
+                Collections.sort(aggregatesWithCompactions, (a1, a2) -> comparePicksByHotness.compare(a2.getSelected(), a1.getSelected()));
 
-            // Already tried acquiring references without success. It means there is a race with
-            // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-            if (hottestBucket.equals(previousCandidate))
+                if (logger.isTraceEnabled())
+                    logger.trace("Found compaction for aggregate {}", aggregatesWithCompactions.get(0));
+            }
+            else
             {
-                logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                            "unless it happens frequently, in which case it must be reported. Will retry later.",
-                            hottestBucket);
-                return null;
+                if (logger.isTraceEnabled())
+                    logger.trace("No compactions found");
             }
 
-            LifecycleTransaction transaction = cfs.getTracker().tryModify(hottestBucket, OperationType.COMPACTION);
-            if (transaction != null)
-                return new CompactionTask(cfs, transaction, gcBefore);
-            previousCandidate = hottestBucket;
+            // publish the results
+            this.aggregates.addAll(aggregatesWithCompactions); // those with compactions first, because the first one will be the one submitted
+            this.aggregates.addAll(aggregatesWithoutCompactions); // then add those empty
+            return this;
         }
-    }
 
-    @SuppressWarnings("resource")
-    public synchronized Collection<AbstractCompactionTask> getMaximalTask(final int gcBefore, boolean splitOutput)
-    {
-        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
-        if (Iterables.isEmpty(filteredSSTables))
-            return null;
-        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
-        if (txn == null)
-            return null;
-        if (splitOutput)
-            return Arrays.<AbstractCompactionTask>asList(new SplittingCompactionTask(cfs, txn, gcBefore));
-        return Arrays.<AbstractCompactionTask>asList(new CompactionTask(cfs, txn, gcBefore));
-    }
+        /**
+         * For diagnostics only. Returns the sorted tables paired with their on-disk length.
+         */
+        public Collection<Pair<SSTableReader, Long>> pairs()
+        {
+            return Collections2.transform(tablesBySize, (SSTableReader table) -> Pair.create(table, table.onDiskLength()));
+        }
 
-    @SuppressWarnings("resource")
-    public AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, final int gcBefore)
-    {
-        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
+        public List<List<SSTableReader>> buckets()
+        {
+            return new ArrayList<>(buckets.values());
+        }
 
-        LifecycleTransaction transaction = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
-        if (transaction == null)
+        public List<CompactionAggregate> getAggregates()
         {
-            logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
-            return null;
+            return aggregates;
         }
 
-        return new CompactionTask(cfs, transaction, gcBefore).setUserDefined(true);
+        public List<CompactionPick> getCompactions()
+        {
+            return aggregates.stream().flatMap(aggr -> aggr.getActive().stream()).collect(Collectors.toList());
+        }
     }
 
-    public int getEstimatedRemainingTasks()
+    /**
+     * @return a snapshot mapping sstables to their current read hotness.
+     */
+    @VisibleForTesting
+    static Map<SSTableReader, Double> getHotnessSnapshot(Collection<List<SSTableReader>> buckets)
     {
-        return estimatedRemainingTasks;
-    }
+        Map<SSTableReader, Double> ret = new HashMap<>();
 
-    public static List<Pair<SSTableReader, Long>> createSSTableAndLengthPairs(Iterable<SSTableReader> sstables)
-    {
-        List<Pair<SSTableReader, Long>> sstableLengthPairs = new ArrayList<>(Iterables.size(sstables));
-        for(SSTableReader sstable : sstables)
-            sstableLengthPairs.add(Pair.create(sstable, sstable.onDiskLength()));
-        return sstableLengthPairs;
+        for (List<SSTableReader> sstables: buckets)
+        {
+            for (SSTableReader sstable : sstables)
+                ret.put(sstable, sstable.hotness());
+        }
+
+        return ret;
     }
 
-    /*
-     * Group files of similar size into buckets.
+    /**
+     * @return the sum of the hotness of all the sstables
      */
-    public static <T> List<List<T>> getBuckets(Collection<Pair<T, Long>> files, double bucketHigh, double bucketLow, long minSSTableSize)
+    private static double totHotness(Iterable<SSTableReader> sstables, @Nullable final Map<SSTableReader, Double> hotnessSnapshot)
     {
-        // Sort the list in order to get deterministic results during the grouping below
-        List<Pair<T, Long>> sortedFiles = new ArrayList<Pair<T, Long>>(files);
-        Collections.sort(sortedFiles, new Comparator<Pair<T, Long>>()
-        {
-            public int compare(Pair<T, Long> p1, Pair<T, Long> p2)
-            {
-                return p1.right.compareTo(p2.right);
-            }
-        });
-
-        Map<Long, List<T>> buckets = new HashMap<Long, List<T>>();
-
-        outer:
-        for (Pair<T, Long> pair: sortedFiles)
+        double hotness = 0.0;
+        for (SSTableReader sstable : sstables)
         {
-            long size = pair.right;
-
-            // look for a bucket containing similar-sized files:
-            // group in the same bucket if it's w/in 50% of the average for this bucket,
-            // or this file and the bucket are all considered "small" (less than `minSSTableSize`)
-            for (Entry<Long, List<T>> entry : buckets.entrySet())
-            {
-                List<T> bucket = entry.getValue();
-                long oldAverageSize = entry.getKey();
-                if ((size > (oldAverageSize * bucketLow) && size < (oldAverageSize * bucketHigh))
-                    || (size < minSSTableSize && oldAverageSize < minSSTableSize))
-                {
-                    // remove and re-add under new new average size
-                    buckets.remove(oldAverageSize);
-                    long totalSize = bucket.size() * oldAverageSize;
-                    long newAverageSize = (totalSize + size) / (bucket.size() + 1);
-                    bucket.add(pair.left);
-                    buckets.put(newAverageSize, bucket);
-                    continue outer;
-                }
-            }
-
-            // no similar bucket found; put it in a new one
-            ArrayList<T> bucket = new ArrayList<T>();
-            bucket.add(pair.left);
-            buckets.put(size, bucket);
+            double h = hotnessSnapshot == null ? 0.0 : hotnessSnapshot.getOrDefault(sstable, 0.0);
+            hotness += h == 0.0  ? sstable.hotness() : h;
         }
 
-        return new ArrayList<List<T>>(buckets.values());
+        return hotness;
     }
 
-    public static int getEstimatedCompactionsByTasks(ColumnFamilyStore cfs, List<List<SSTableReader>> tasks)
+    @Override
+    protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
-        int n = 0;
-        for (List<SSTableReader> bucket : tasks)
-        {
-            if (bucket.size() >= cfs.getMinimumCompactionThreshold())
-                n += Math.ceil((double)bucket.size() / cfs.getMaximumCompactionThreshold());
-        }
-        return n;
+        return isMaximal && splitOutput
+               ? SplittingCompactionTask.forSplitting(this, txn, gcBefore)
+               : CompactionTask.forCompaction(this, txn, gcBefore);
     }
 
     public long getMaxSSTableBytes()
@@ -324,21 +357,47 @@ public static Map<String, String> validateOptions(Map<String, String> options) t
     }
 
     @Override
-    public synchronized void addSSTable(SSTableReader added)
+    public void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added)
+    {
+        synchronized (sstables)
+        {
+            for (SSTableReader remove : removed)
+                sstables.remove(remove);
+            sstables.addAll(added);
+        }
+    }
+
+    @Override
+    public void addSSTable(SSTableReader added)
+    {
+        synchronized (sstables)
+        {
+            sstables.add(added);
+        }
+    }
+
+    @Override
+    void removeDeadSSTables()
     {
-        sstables.add(added);
+        removeDeadSSTables(sstables);
     }
 
     @Override
-    public synchronized void removeSSTable(SSTableReader sstable)
+    public void removeSSTable(SSTableReader sstable)
     {
-        sstables.remove(sstable);
+        synchronized (sstables)
+        {
+            sstables.remove(sstable);
+        }
     }
 
     @Override
     protected Set<SSTableReader> getSSTables()
     {
-        return ImmutableSet.copyOf(sstables);
+        synchronized (sstables)
+        {
+            return ImmutableSet.copyOf(sstables);
+        }
     }
 
     public String toString()
@@ -350,9 +409,14 @@ public String toString()
 
     private static class SplittingCompactionTask extends CompactionTask
     {
-        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
+        public SplittingCompactionTask(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
+        {
+            super(strategy, txn, gcBefore, false);
+        }
+
+        static AbstractCompactionTask forSplitting(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
         {
-            super(cfs, txn, gcBefore);
+            return new SplittingCompactionTask(strategy, txn, gcBefore);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
index 288af2bebb22..3b612eaedc31 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyOptions.java
@@ -23,12 +23,12 @@
 
 public final class SizeTieredCompactionStrategyOptions
 {
-    protected static final long DEFAULT_MIN_SSTABLE_SIZE = 50L * 1024L * 1024L;
-    protected static final double DEFAULT_BUCKET_LOW = 0.5;
-    protected static final double DEFAULT_BUCKET_HIGH = 1.5;
-    protected static final String MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
-    protected static final String BUCKET_LOW_KEY = "bucket_low";
-    protected static final String BUCKET_HIGH_KEY = "bucket_high";
+    static final long DEFAULT_MIN_SSTABLE_SIZE = 50L * 1024L * 1024L;
+    static final double DEFAULT_BUCKET_LOW = 0.5;
+    static final double DEFAULT_BUCKET_HIGH = 1.5;
+    static final String MIN_SSTABLE_SIZE_KEY = "min_sstable_size";
+    static final String BUCKET_LOW_KEY = "bucket_low";
+    static final String BUCKET_HIGH_KEY = "bucket_high";
 
     protected long minSSTableSize;
     protected double bucketLow;
@@ -46,9 +46,14 @@ public SizeTieredCompactionStrategyOptions(Map<String, String> options)
 
     public SizeTieredCompactionStrategyOptions()
     {
-        minSSTableSize = DEFAULT_MIN_SSTABLE_SIZE;
-        bucketLow = DEFAULT_BUCKET_LOW;
-        bucketHigh = DEFAULT_BUCKET_HIGH;
+        this(DEFAULT_MIN_SSTABLE_SIZE, DEFAULT_BUCKET_LOW, DEFAULT_BUCKET_HIGH);
+    }
+
+    SizeTieredCompactionStrategyOptions(long minSSTableSize, double bucketLow, double bucketHigh)
+    {
+        this.minSSTableSize = minSSTableSize;
+        this.bucketLow = bucketLow;
+        this.bucketHigh = bucketHigh;
     }
 
     private static double parseDouble(Map<String, String> options, String key, double defaultValue) throws ConfigurationException
diff --git a/src/java/org/apache/cassandra/db/compaction/TableOperation.java b/src/java/org/apache/cassandra/db/compaction/TableOperation.java
new file mode 100644
index 000000000000..1908eeaa2c31
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TableOperation.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Optional;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Predicate;
+
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+
+/**
+ * This is a table operation that must be able to report the operation progress and to
+ * interrupt the operation when requested.
+ * <p/>
+ * Any operation defined by {@link OperationType} is normally implementing this interface,
+ * for example index building, view building, cache saving, anti-compaction, compaction,
+ * scrubbing, verifying, tombstone collection and others.
+ * <p/>
+ * These operations have in common that they run on the compaction executor and used to be
+ * known as "compaction".
+ * */
+public interface TableOperation
+{
+    /**
+     * @return the progress of the operation, see {@link Progress}.
+     */
+    AbstractTableOperation.OperationProgress getProgress();
+
+    /**
+     * Interrupt the operation.
+     */
+    void stop();
+
+    /**
+     * Interrupt the current operation if possible, specifying the cause of the interruption.
+     *
+     * @param trigger cause of compaction interruption
+     */
+    void stop(StopTrigger trigger);
+
+    /**
+     * @return true if the operation has been requested to be interrupted.
+     */
+    boolean isStopRequested();
+
+    /**
+     * Return true if the predicate for the given sstables holds, or if the operation
+     * does not consider any sstables, in which case it will always return true (the
+     * default behaviour).
+     * <p/>
+     *
+     * @param predicate the predicate to be applied to the operation sstables
+     *
+     * @return true by default, see overrides for different behaviors
+     */
+    boolean shouldStop(Predicate<SSTableReader> predicate);
+
+    /**
+     * @return cause of compaction interruption.
+     */
+    public StopTrigger trigger();
+
+    /**
+     * If this operation involves several/all tables we can safely check globalCompactionsPaused
+     * in {@link #isStopRequested()}.
+     */
+    public abstract boolean isGlobal();
+
+    /**
+     * The unit for the {@link Progress} report.
+     */
+    enum Unit
+    {
+        BYTES("bytes"), RANGES("token range parts"), KEYS("keys");
+
+        private final String name;
+
+        Unit(String name)
+        {
+            this.name = name;
+        }
+
+        @Override
+        public String toString()
+        {
+            return this.name;
+        }
+
+        public static boolean isFileSize(String unit)
+        {
+            return BYTES.toString().equals(unit);
+        }
+    }
+
+    public enum StopTrigger
+    {
+        NONE(false),
+        TRUNCATE(true);
+
+        private final boolean isFinal;
+
+        StopTrigger(boolean isFinal)
+        {
+            this.isFinal = isFinal;
+        }
+
+        // A stop trigger marked as final should not be overwritten. So a table operation that is
+        // marked with a final stop trigger cannot have its stop trigger changed to another value.
+        public boolean isFinal()
+        {
+            return isFinal;
+        }
+    }
+
+    /**
+     * The progress of a table operation.
+     */
+    interface Progress
+    {
+        String ID = "id";
+        String KEYSPACE = "keyspace";
+        String COLUMNFAMILY = "columnfamily";
+        String COMPLETED = "completed";
+        String TOTAL = "total";
+        String OPERATION_TYPE = "operationType";
+        String UNIT = "unit";
+        String OPERATION_ID = "operationId";
+
+        /**
+         * @return the keyspace name, if the metadata is not null.
+         */
+        Optional<String> keyspace();
+
+        /**
+         * @return the table name, if the metadata is not null.
+         */
+        Optional<String> table();
+
+        /**
+         * @return the table metadata, this may be null if the operation has no metadata.
+         */
+        @Nullable TableMetadata metadata();
+
+        /**
+         * @return the number of units completed, see {@link #unit()}.
+         */
+        long completed();
+
+        /**
+         * @return the total number of units that must be processed by the operation, see {@link #unit()}.
+         */
+        long total();
+
+        /**
+         * @return the type of operation, see {@link OperationType}.
+         */
+        OperationType operationType();
+
+        /**
+         * @return a unique identifier for this operation.
+         */
+        UUID operationId();
+
+        /**
+         * @return the unit to be used for {@link #completed()} and {@link #total()}, see {@link Unit}.
+         */
+        Unit unit();
+
+        /**
+         * @return a set of SSTables participating in this operation
+         */
+        Set<SSTableReader> sstables();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/TableOperationObserver.java b/src/java/org/apache/cassandra/db/compaction/TableOperationObserver.java
new file mode 100644
index 000000000000..93dc643a5685
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TableOperationObserver.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import org.apache.cassandra.utils.NonThrowingCloseable;
+
+/**
+ * An observer of {@link AbstractTableOperation}.
+ * <p/>
+ * The observer is notified when an operation is started. It returns a closeable that will be closed
+ * when the operation is finished. The operation can be queried at any time to get the progress information.
+ */
+public interface TableOperationObserver
+{
+    TableOperationObserver NOOP = operation -> () -> {};
+
+    /**
+     * Signal to the observer that an operation is starting.
+     *
+     * @param operation the operation starting
+     *
+     * @return a closeable that the caller should close when the operation completes
+     */
+    NonThrowingCloseable onOperationStart(TableOperation operation);
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
new file mode 100644
index 000000000000..9d6803c0a682
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+public abstract class TieredCompactionStatistics extends CompactionAggregateStatistics
+{
+    private static final Collection<String> HEADER = ImmutableList.copyOf(Iterables.concat(ImmutableList.of("Bucket", "Hotness"),
+                                                                                           CompactionAggregateStatistics.HEADER,
+                                                                                           ImmutableList.of("Tot/Read/Written")));
+
+    private static final long serialVersionUID = 3695927592357987916L;
+    /** The total read hotness of the sstables */
+    protected final double hotness;
+    /** Total uncompressed bytes of the sstables */
+    protected final long tot;
+    /** Total bytes read by ongoing compactions */
+    protected final long read;
+    /** Total bytes written by ongoing compactions */
+    protected final long written;
+
+    public TieredCompactionStatistics(int numCompactions,
+                                      int numCompactionsInProgress,
+                                      int numSSTables,
+                                      int numCandidateSSTables,
+                                      int numCompactingSSTables,
+                                      long sizeInBytes,
+                                      double readThroughput,
+                                      double writeThroughput,
+                                      double hotness,
+                                      long tot,
+                                      long read,
+                                      long written)
+    {
+        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput);
+
+        this.hotness = hotness;
+        this.tot = tot;
+        this.read = read;
+        this.written = written;
+    }
+
+    /** The total read hotness of the sstables */
+    @JsonProperty
+    public double hotness()
+    {
+        return hotness;
+    }
+
+    /** Total uncompressed bytes of the sstables */
+    @JsonProperty
+    public long tot()
+    {
+        return tot;
+    }
+
+    /** Uncompressed bytes read by compactions so far. */
+    @JsonProperty
+    public long read()
+    {
+        return read;
+    }
+
+    /** Uncompressed bytes written by compactions so far. */
+    @JsonProperty
+    public long written()
+    {
+        return written;
+    }
+
+    @Override
+    protected Collection<String> header()
+    {
+        return HEADER;
+    }
+
+    @Override
+    protected Collection<String> data()
+    {
+        List<String> data = new ArrayList<>(HEADER.size());
+        data.add(tierValue());
+        data.add(String.format("%.4f", hotness));
+
+        data.addAll(super.data());
+
+        data.add(toString(tot()) + '/' + toString(read()) + '/' + toString(written()));
+        return data;
+    }
+
+    protected abstract String tierValue();
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
new file mode 100644
index 000000000000..c106f4073ca2
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.text.DateFormat;
+import java.util.Date;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+/**
+ * The statistics for time tiered compaction.
+ * <p/>
+ * Implements serializable to allow structured info to be returned via JMX.
+ */
+public class TimeTieredCompactionStatistics extends TieredCompactionStatistics
+{
+    protected static final DateFormat bucketFormatter = DateFormat.getDateTimeInstance(DateFormat.SHORT, DateFormat.SHORT);
+
+    /** The timestamp in this tier */
+    private final long timestamp;
+
+    TimeTieredCompactionStatistics(long timestamp,
+                                   double hotness,
+                                   int numCompactions,
+                                   int numCompactionsInProgress,
+                                   int numSSTables,
+                                   int numCandidateSSTables,
+                                   int numCompactingSSTables,
+                                   long sizeInBytes,
+                                   double readThroughput,
+                                   double writeThroughput,
+                                   long tot,
+                                   long read,
+                                   long written)
+    {
+        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput, hotness, tot, read, written);
+
+        this.timestamp = timestamp;
+    }
+
+    /** The timestamp in this tier */
+    public long timestamp()
+    {
+        return timestamp;
+    }
+
+    @Override
+    @JsonProperty("Bucket")
+    protected String tierValue()
+    {
+        return ((DateFormat) bucketFormatter.clone()).format(new Date(timestamp)); // shared DateFormat is not thread-safe: format on a private copy
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
index 41668053ebe6..7f3765ed464a 100644
--- a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
@@ -21,8 +21,9 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
-import java.util.TreeSet;
+import java.util.Comparator;
+import java.util.NavigableMap;
+import java.util.TreeMap;
 import java.util.concurrent.TimeUnit;
 import java.util.HashSet;
 import java.util.List;
@@ -40,16 +41,14 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.CompactionParams;
-import org.apache.cassandra.utils.Pair;
 
 import static com.google.common.collect.Iterables.filter;
 
-public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy
+public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(TimeWindowCompactionStrategy.class);
 
     private final TimeWindowCompactionStrategyOptions options;
-    protected volatile int estimatedRemainingTasks;
     private final Set<SSTableReader> sstables = new HashSet<>();
     private long lastExpiredCheck;
     private long highestWindowSeen;
@@ -57,7 +56,6 @@ public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy
     public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
     {
         super(cfs, options);
-        this.estimatedRemainingTasks = 0;
         this.options = new TimeWindowCompactionStrategyOptions(options);
         if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
         {
@@ -69,32 +67,9 @@ public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> o
     }
 
     @Override
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+    public AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
-        List<SSTableReader> previousCandidate = null;
-        while (true)
-        {
-            List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
-
-            if (latestBucket.isEmpty())
-                return null;
-
-            // Already tried acquiring references without success. It means there is a race with
-            // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-            if (latestBucket.equals(previousCandidate))
-            {
-                logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                            "unless it happens frequently, in which case it must be reported. Will retry later.",
-                            latestBucket);
-                return null;
-            }
-
-            LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
-            if (modifier != null)
-                return new TimeWindowCompactionTask(cfs, modifier, gcBefore, options.ignoreOverlaps);
-            previousCandidate = latestBucket;
-        }
+        return CompactionTask.forTimeWindowCompaction(this, txn, gcBefore);
     }
 
     /**
@@ -102,12 +77,18 @@ public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
      * @param gcBefore
      * @return
      */
-    private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcBefore)
+    @Override
+    protected synchronized CompactionAggregate getNextBackgroundAggregate(final int gcBefore)
     {
         if (Iterables.isEmpty(cfs.getSSTables(SSTableSet.LIVE)))
-            return Collections.emptyList();
+            return null;
 
-        Set<SSTableReader> uncompacting = ImmutableSet.copyOf(filter(cfs.getUncompactingSSTables(), sstables::contains));
+        Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        Set<SSTableReader> uncompacting;
+        synchronized (sstables)
+        {
+            uncompacting = ImmutableSet.copyOf(filter(sstables, sstable -> !compacting.contains(sstable)));
+        }
 
         // Find fully expired SSTables. Those will be included no matter what.
         Set<SSTableReader> expired = Collections.emptySet();
@@ -126,279 +107,270 @@ private synchronized List<SSTableReader> getNextBackgroundSSTables(final int gcB
 
         Set<SSTableReader> candidates = Sets.newHashSet(filterSuspectSSTables(uncompacting));
 
-        List<SSTableReader> compactionCandidates = new ArrayList<>(getNextNonExpiredSSTables(Sets.difference(candidates, expired), gcBefore));
-        if (!expired.isEmpty())
+        CompactionAggregate compactionCandidate = getNextNonExpiredSSTables(Sets.difference(candidates, expired), gcBefore);
+        if (expired.isEmpty())
+            return compactionCandidate;
+
+        logger.debug("Including expired sstables: {}", expired);
+        if (compactionCandidate == null)
         {
-            logger.debug("Including expired sstables: {}", expired);
-            compactionCandidates.addAll(expired);
+            long timestamp = getWindowBoundsInMillis(options.sstableWindowUnit, options.sstableWindowSize,
+                                                     Collections.max(expired, Comparator.comparing(SSTableReader::getMaxTimestamp)).getMaxTimestamp());
+            return CompactionAggregate.createTimeTiered(expired, timestamp);
         }
 
-        return compactionCandidates;
+        return compactionCandidate.withExpired(expired);
     }
 
-    private List<SSTableReader> getNextNonExpiredSSTables(Iterable<SSTableReader> nonExpiringSSTables, final int gcBefore)
+    private CompactionAggregate getNextNonExpiredSSTables(Iterable<SSTableReader> nonExpiringSSTables, final int gcBefore)
     {
-        List<SSTableReader> mostInteresting = getCompactionCandidates(nonExpiringSSTables);
+        List<CompactionAggregate> candidates = getCompactionCandidates(nonExpiringSSTables);
+        backgroundCompactions.setPending(candidates);
 
-        if (mostInteresting != null)
-        {
-            return mostInteresting;
-        }
+        CompactionAggregate ret = candidates.isEmpty() ? null : candidates.get(0);
 
         // if there is no sstable to compact in standard way, try compacting single sstable whose droppable tombstone
         // ratio is greater than threshold.
-        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
-        for (SSTableReader sstable : nonExpiringSSTables)
-        {
-            if (worthDroppingTombstones(sstable, gcBefore))
-                sstablesWithTombstones.add(sstable);
-        }
-        if (sstablesWithTombstones.isEmpty())
-            return Collections.emptyList();
+        if (ret == null || ret.isEmpty())
+            ret = makeTombstoneCompaction(gcBefore, nonExpiringSSTables, list -> Collections.min(list, SSTableReader.sizeComparator));
 
-        return Collections.singletonList(Collections.min(sstablesWithTombstones, SSTableReader.sizeComparator));
+        return ret;
     }
 
-    private List<SSTableReader> getCompactionCandidates(Iterable<SSTableReader> candidateSSTables)
+    private List<CompactionAggregate> getCompactionCandidates(Iterable<SSTableReader> candidateSSTables)
     {
-        Pair<HashMultimap<Long, SSTableReader>, Long> buckets = getBuckets(candidateSSTables, options.sstableWindowUnit, options.sstableWindowSize, options.timestampResolution);
+        NavigableMap<Long, List<SSTableReader>> buckets = getBuckets(candidateSSTables, options.sstableWindowUnit, options.sstableWindowSize, options.timestampResolution);
         // Update the highest window seen, if necessary
-        if(buckets.right > this.highestWindowSeen)
-            this.highestWindowSeen = buckets.right;
+        if (!buckets.isEmpty())
+        {
+            long maxKey = buckets.lastKey();
+            if (maxKey > this.highestWindowSeen)
+                this.highestWindowSeen = maxKey;
+        }
 
-        NewestBucket mostInteresting = newestBucket(buckets.left,
-                cfs.getMinimumCompactionThreshold(),
-                cfs.getMaximumCompactionThreshold(),
-                options.stcsOptions,
-                this.highestWindowSeen);
-
-        this.estimatedRemainingTasks = mostInteresting.estimatedRemainingTasks;
-        if (!mostInteresting.sstables.isEmpty())
-            return mostInteresting.sstables;
-        return null;
+        return getBucketAggregates(buckets,
+                                   cfs.getMinimumCompactionThreshold(),
+                                   cfs.getMaximumCompactionThreshold(),
+                                   options.stcsOptions,
+                                   this.highestWindowSeen);
     }
 
+
     @Override
-    public synchronized void addSSTable(SSTableReader sstable)
+    public void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added)
     {
-        sstables.add(sstable);
+        synchronized (sstables)
+        {
+            for (SSTableReader remove : removed)
+                sstables.remove(remove);
+            sstables.addAll(added);
+        }
     }
 
     @Override
-    public synchronized void removeSSTable(SSTableReader sstable)
+    public void addSSTable(SSTableReader sstable)
     {
-        sstables.remove(sstable);
+        synchronized (sstables)
+        {
+            sstables.add(sstable);
+        }
     }
 
     @Override
-    protected Set<SSTableReader> getSSTables()
+    void removeDeadSSTables()
     {
-        return ImmutableSet.copyOf(sstables);
+        removeDeadSSTables(sstables);
     }
 
-    /**
-     * Find the lowest and highest timestamps in a given timestamp/unit pair
-     * Returns milliseconds, caller should adjust accordingly
-     */
-    public static Pair<Long,Long> getWindowBoundsInMillis(TimeUnit windowTimeUnit, int windowTimeSize, long timestampInMillis)
+    @Override
+    public void removeSSTable(SSTableReader sstable)
     {
-        long lowerTimestamp;
-        long upperTimestamp;
-        long timestampInSeconds = TimeUnit.SECONDS.convert(timestampInMillis, TimeUnit.MILLISECONDS);
+        synchronized (sstables)
+        {
+            sstables.remove(sstable);
+        }
+    }
 
-        switch(windowTimeUnit)
+    @Override
+    protected Set<SSTableReader> getSSTables()
+    {
+        synchronized (sstables)
         {
-            case MINUTES:
-                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (60L * windowTimeSize));
-                upperTimestamp = (lowerTimestamp + (60L * (windowTimeSize - 1L))) + 59L;
-                break;
-            case HOURS:
-                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (3600L * windowTimeSize));
-                upperTimestamp = (lowerTimestamp + (3600L * (windowTimeSize - 1L))) + 3599L;
-                break;
-            case DAYS:
-            default:
-                lowerTimestamp = timestampInSeconds - ((timestampInSeconds) % (86400L * windowTimeSize));
-                upperTimestamp = (lowerTimestamp + (86400L * (windowTimeSize - 1L))) + 86399L;
-                break;
+            return ImmutableSet.copyOf(sstables);
         }
+    }
 
-        return Pair.create(TimeUnit.MILLISECONDS.convert(lowerTimestamp, TimeUnit.SECONDS),
-                           TimeUnit.MILLISECONDS.convert(upperTimestamp, TimeUnit.SECONDS));
+    /**
+     * Find the lowest timestamp in a given window/unit pair and
+     * return it expressed as milliseconds, the caller should adjust accordingly
+     */
+    static long getWindowBoundsInMillis(TimeUnit windowTimeUnit, int windowTimeSize, long timestampInMillis)
+    {
 
+        long sizeInMillis = TimeUnit.MILLISECONDS.convert(windowTimeSize, windowTimeUnit);
+        return (timestampInMillis / sizeInMillis) * sizeInMillis;
     }
 
     /**
      * Group files with similar max timestamp into buckets.
+     * <p/>
+     * The max timestamp of each sstable is converted into the timestamp resolution and then the window bounds are
+     * calculated by calling {@link #getWindowBoundsInMillis(TimeUnit, int, long)}. The sstable is added to the bucket
+     * with the same lower timestamp bound. The buckets are kept sorted by lower timestamp bound, so the highest bound
+     * seen is the last key of the returned map.
      *
-     * @param files pairs consisting of a file and its min timestamp
-     * @param sstableWindowUnit
-     * @param sstableWindowSize
-     * @param timestampResolution
-     * @return A pair, where the left element is the bucket representation (map of timestamp to sstablereader), and the right is the highest timestamp seen
+     * @param files the candidate sstables
+     * @param sstableWindowUnit the time unit for {@code sstableWindowSize}
+     * @param sstableWindowSize the size of the time window by which sstables are grouped
+     * @param timestampResolution the time unit for converting the sstable timestamp
+     * @return a navigable map from lower bound timestamp to the list of sstables in that bucket, sorted by timestamp;
+     *         the last key of the map, if any, is the highest lower bound timestamp seen
      */
     @VisibleForTesting
-    static Pair<HashMultimap<Long, SSTableReader>, Long> getBuckets(Iterable<SSTableReader> files, TimeUnit sstableWindowUnit, int sstableWindowSize, TimeUnit timestampResolution)
+    static NavigableMap<Long, List<SSTableReader>> getBuckets(Iterable<SSTableReader> files, TimeUnit sstableWindowUnit, int sstableWindowSize, TimeUnit timestampResolution)
     {
-        HashMultimap<Long, SSTableReader> buckets = HashMultimap.create();
+        NavigableMap<Long, List<SSTableReader>> buckets = new TreeMap<>(Long::compare);
 
-        long maxTimestamp = 0;
-        // Create hash map to represent buckets
         // For each sstable, add sstable to the time bucket
         // Where the bucket is the file's max timestamp rounded to the nearest window bucket
         for (SSTableReader f : files)
         {
             assert TimeWindowCompactionStrategyOptions.validTimestampTimeUnits.contains(timestampResolution);
             long tStamp = TimeUnit.MILLISECONDS.convert(f.getMaxTimestamp(), timestampResolution);
-            Pair<Long,Long> bounds = getWindowBoundsInMillis(sstableWindowUnit, sstableWindowSize, tStamp);
-            buckets.put(bounds.left, f);
-            if (bounds.left > maxTimestamp)
-                maxTimestamp = bounds.left;
+            addToBuckets(buckets, f, tStamp, sstableWindowUnit, sstableWindowSize);
         }
 
-        logger.trace("buckets {}, max timestamp {}", buckets, maxTimestamp);
-        return Pair.create(buckets, maxTimestamp);
+        logger.trace("buckets {}, max timestamp {}", buckets, buckets.isEmpty() ? "none" : buckets.lastKey().toString());
+        return buckets;
     }
 
-    static final class NewestBucket
+    @VisibleForTesting
+    static void addToBuckets(NavigableMap<Long, List<SSTableReader>> buckets, SSTableReader f, long tStamp, TimeUnit sstableWindowUnit, int sstableWindowSize)
     {
-        /** The sstables that should be compacted next */
-        final List<SSTableReader> sstables;
-
-        /** The number of tasks estimated */
-        final int estimatedRemainingTasks;
-
-        NewestBucket(List<SSTableReader> sstables, int estimatedRemainingTasks)
-        {
-            this.sstables = sstables;
-            this.estimatedRemainingTasks = estimatedRemainingTasks;
-        }
-
-        @Override
-        public String toString()
-        {
-            return String.format("sstables: %s, estimated remaining tasks: %d", sstables, estimatedRemainingTasks);
-        }
+        long bound = getWindowBoundsInMillis(sstableWindowUnit, sstableWindowSize, tStamp);
+        buckets.computeIfAbsent(bound,
+                                key -> new ArrayList<>())
+               .add(f);
     }
 
-
     /**
-     * @param buckets list of buckets, sorted from newest to oldest, from which to return the newest bucket within thresholds.
+     * If the current bucket has at least minThreshold SSTables, choose that one. For any other bucket, at least 2 SSTables is enough.
+     * In any case, limit to maxThreshold SSTables.
+     *
+     * @param buckets A map from a bucket id to a list of sstables, sorted by id and then by table size
      * @param minThreshold minimum number of sstables in a bucket to qualify.
      * @param maxThreshold maximum number of sstables to compact at once (the returned bucket will be trimmed down to this).
-     * @return a bucket (list) of sstables to compact.
+     * @param stcsOptions the options for {@link SizeTieredCompactionStrategy} to be used in the newest bucket
+     * @param now the latest timestamp in milliseconds
+     *
+     * @return a list of compaction aggregates, one per time bucket
      */
     @VisibleForTesting
-    static NewestBucket newestBucket(HashMultimap<Long, SSTableReader> buckets, int minThreshold, int maxThreshold, SizeTieredCompactionStrategyOptions stcsOptions, long now)
+    static List<CompactionAggregate> getBucketAggregates(NavigableMap<Long, List<SSTableReader>> buckets,
+                                                         int minThreshold,
+                                                         int maxThreshold,
+                                                         SizeTieredCompactionStrategyOptions stcsOptions,
+                                                         long now)
     {
-        // If the current bucket has at least minThreshold SSTables, choose that one.
-        // For any other bucket, at least 2 SSTables is enough.
-        // In any case, limit to maxThreshold SSTables.
+        List<CompactionAggregate> ret = new ArrayList<>(buckets.size());
+        boolean nextCompactionFound = false; // set to true once the first bucket with a compaction is found
 
-        List<SSTableReader> sstables = Collections.emptyList();
-        int estimatedRemainingTasks = 0;
-
-        TreeSet<Long> allKeys = new TreeSet<>(buckets.keySet());
-
-        Iterator<Long> it = allKeys.descendingIterator();
-        while(it.hasNext())
+        for (Map.Entry<Long, List<SSTableReader>> entry : buckets.descendingMap().entrySet())
         {
-            Long key = it.next();
-            Set<SSTableReader> bucket = buckets.get(key);
+            Long key = entry.getKey();
+            List<SSTableReader> bucket = entry.getValue();
             logger.trace("Key {}, now {}", key, now);
+
+            CompactionPick selected = CompactionPick.EMPTY;
+            List<CompactionPick> pending = new ArrayList<>(1);
+
             if (bucket.size() >= minThreshold && key >= now)
             {
                 // If we're in the newest bucket, we'll use STCS to prioritize sstables
-                List<Pair<SSTableReader,Long>> pairs = SizeTieredCompactionStrategy.createSSTableAndLengthPairs(bucket);
-                List<List<SSTableReader>> stcsBuckets = SizeTieredCompactionStrategy.getBuckets(pairs, stcsOptions.bucketHigh, stcsOptions.bucketLow, stcsOptions.minSSTableSize);
-                List<SSTableReader> stcsInterestingBucket = SizeTieredCompactionStrategy.mostInterestingBucket(stcsBuckets, minThreshold, maxThreshold);
+                SizeTieredCompactionStrategy.SizeTieredBuckets stcsBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(bucket,
+                                                                                                                                stcsOptions,
+                                                                                                                                minThreshold,
+                                                                                                                                maxThreshold);
+                stcsBuckets.aggregate();
 
-                // If the tables in the current bucket aren't eligible in the STCS strategy, we'll skip it and look for other buckets
-                if (!stcsInterestingBucket.isEmpty())
+                for (CompactionAggregate stcsAggregate : stcsBuckets.getAggregates())
                 {
-                    double remaining = bucket.size() - maxThreshold;
-                    estimatedRemainingTasks +=  1 + (remaining > minThreshold ? Math.ceil(remaining / maxThreshold) : 0);
-                    if (sstables.isEmpty())
+                    if (selected.isEmpty())
                     {
-                        logger.debug("Using STCS compaction for first window of bucket: data files {} , options {}", pairs, stcsOptions);
-                        sstables = stcsInterestingBucket;
+                        selected = CompactionPick.create(key, stcsAggregate.getSelected());
+                        for (CompactionPick comp : stcsAggregate.getActive())
+                        {
+                            if (comp != stcsAggregate.getSelected())
+                                pending.add(comp);
+                        }
                     }
                     else
                     {
-                        logger.trace("First window of bucket is eligible but not selected: data files {} , options {}", pairs, stcsOptions);
+                        pending.addAll(stcsAggregate.getActive());
                     }
                 }
+
+                if (!selected.isEmpty())
+                    logger.debug("Newest window has STCS compaction candidates, {}, data files {} , options {}",
+                                 nextCompactionFound ? "eligible but not selected due to prior candidate" : "will be selected for compaction",
+                                 stcsBuckets.pairs(),
+                                 stcsOptions);
+                else
+                    logger.debug("No STCS compactions found for first window, data files {}, options {}", stcsBuckets.pairs(), stcsOptions);
+
+                if (!nextCompactionFound && !selected.isEmpty())
+                {
+                    nextCompactionFound = true;
+                    ret.add(0, CompactionAggregate.createTimeTiered(bucket, selected, pending, key)); // the first one will be submitted for compaction
+                }
+                else
+                {
+                    ret.add(CompactionAggregate.createTimeTiered(bucket, selected, pending, key));
+                }
             }
             else if (bucket.size() >= 2 && key < now)
             {
-                double remaining = bucket.size() - maxThreshold;
-                estimatedRemainingTasks +=  1 + (remaining > minThreshold ? Math.ceil(remaining / maxThreshold) : 0);
-                if (sstables.isEmpty())
+                List<SSTableReader> sstables = bucket;
+
+                // Sort by size so the largest sstables are at the end, then split into picks of at most maxThreshold
+                Collections.sort(sstables, SSTableReader.sizeComparator);
+
+                int i = 0;
+                while ((bucket.size() - i) >= 2)
+                {
+                    List<SSTableReader> pick = sstables.subList(i, i + Math.min(bucket.size() - i, maxThreshold));
+                    if (selected.isEmpty())
+                        selected = CompactionPick.create(key, pick);
+                    else
+                        pending.add(CompactionPick.create(key, pick));
+
+                    i += pick.size();
+                }
+
+                if (!nextCompactionFound)
                 {
                     logger.debug("bucket size {} >= 2 and not in current bucket, compacting what's here: {}", bucket.size(), bucket);
-                    sstables = trimToThreshold(bucket, maxThreshold);
+                    nextCompactionFound = true;
+                    ret.add(0, CompactionAggregate.createTimeTiered(bucket, selected, pending, key)); // the first one will be submitted for compaction
                 }
                 else
                 {
                     logger.trace("bucket size {} >= 2 and not in current bucket, eligible but not selected: {}", bucket.size(), bucket);
+                    ret.add(CompactionAggregate.createTimeTiered(bucket, selected, pending, key));
                 }
             }
             else
             {
                 logger.trace("No compaction necessary for bucket size {} , key {}, now {}", bucket.size(), key, now);
+                ret.add(CompactionAggregate.createTimeTiered(bucket, selected, pending, key)); // add an empty aggregate anyway so we get a full view
             }
         }
-        return new NewestBucket(sstables, estimatedRemainingTasks);
-    }
-
-    /**
-     * @param bucket set of sstables
-     * @param maxThreshold maximum number of sstables in a single compaction task.
-     * @return A bucket trimmed to the maxThreshold newest sstables.
-     */
-    @VisibleForTesting
-    static List<SSTableReader> trimToThreshold(Set<SSTableReader> bucket, int maxThreshold)
-    {
-        List<SSTableReader> ssTableReaders = new ArrayList<>(bucket);
-
-        // Trim the largest sstables off the end to meet the maxThreshold
-        Collections.sort(ssTableReaders, SSTableReader.sizeComparator);
-
-        return ImmutableList.copyOf(Iterables.limit(ssTableReaders, maxThreshold));
-    }
-
-    @Override
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
-    {
-        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(sstables);
-        if (Iterables.isEmpty(filteredSSTables))
-            return null;
-        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
-        if (txn == null)
-            return null;
-        return Collections.singleton(new TimeWindowCompactionTask(cfs, txn, gcBefore, options.ignoreOverlaps));
-    }
-
-    @Override
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
-    {
-        assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
-
-        LifecycleTransaction modifier = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
-        if (modifier == null)
-        {
-            logger.debug("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
-            return null;
-        }
-
-        return new TimeWindowCompactionTask(cfs, modifier, gcBefore, options.ignoreOverlaps).setUserDefined(true);
+        return ret;
     }
 
-    public int getEstimatedRemainingTasks()
+    boolean ignoreOverlaps()
     {
-        return this.estimatedRemainingTasks;
+        return options.ignoreOverlaps;
     }
 
     public long getMaxSSTableBytes()
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
index 4f1fe6a0b998..57221ab03d53 100644
--- a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
@@ -28,9 +28,9 @@ public class TimeWindowCompactionTask extends CompactionTask
 {
     private final boolean ignoreOverlaps;
 
-    public TimeWindowCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean ignoreOverlaps)
+    public TimeWindowCompactionTask(TimeWindowCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore, boolean ignoreOverlaps)
     {
-        super(cfs, txn, gcBefore);
+        super(strategy, txn, gcBefore, false);
         this.ignoreOverlaps = ignoreOverlaps;
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java
index a811a3d3cf9d..f333438b40f8 100644
--- a/src/java/org/apache/cassandra/db/compaction/Verifier.java
+++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java
@@ -19,6 +19,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 
 import org.apache.cassandra.db.*;
@@ -252,7 +253,7 @@ public void verify()
             {
 
                 if (verifyInfo.isStopRequested())
-                    throw new CompactionInterruptedException(verifyInfo.getCompactionInfo());
+                    throw new CompactionInterruptedException(verifyInfo.getProgress());
 
                 rowStart = dataFile.getFilePointer();
                 outputHandler.debug("Reading row at " + rowStart);
@@ -480,7 +481,7 @@ private void markAndThrow(Throwable cause, boolean mutateRepaired)
             try
             {
                 sstable.mutateRepairedAndReload(ActiveRepairService.UNREPAIRED_SSTABLE, sstable.getPendingRepair(), sstable.isTransient());
-                cfs.getTracker().notifySSTableRepairedStatusChanged(Collections.singleton(sstable));
+                cfs.getTracker().notifySSTableRepairedStatusChanged(ImmutableList.of(sstable));
             }
             catch(IOException ioe)
             {
@@ -494,12 +495,12 @@ private void markAndThrow(Throwable cause, boolean mutateRepaired)
             throw new RuntimeException(e);
     }
 
-    public CompactionInfo.Holder getVerifyInfo()
+    public AbstractTableOperation getVerifyInfo()
     {
         return verifyInfo;
     }
 
-    private static class VerifyInfo extends CompactionInfo.Holder
+    private static class VerifyInfo extends AbstractTableOperation
     {
         private final RandomAccessReader dataFile;
         private final SSTableReader sstable;
@@ -514,17 +515,17 @@ public VerifyInfo(RandomAccessReader dataFile, SSTableReader sstable, Lock fileR
             verificationCompactionId = UUIDGen.getTimeUUID();
         }
 
-        public CompactionInfo getCompactionInfo()
+        public OperationProgress getProgress()
         {
             fileReadLock.lock();
             try
             {
-                return new CompactionInfo(sstable.metadata(),
-                                          OperationType.VERIFY,
-                                          dataFile.getFilePointer(),
-                                          dataFile.length(),
-                                          verificationCompactionId,
-                                          ImmutableSet.of(sstable));
+                return new OperationProgress(sstable.metadata(),
+                                             OperationType.VERIFY,
+                                             dataFile.getFilePointer(),
+                                             dataFile.length(),
+                                             verificationCompactionId,
+                                             ImmutableSet.of(sstable));
             }
             catch (Exception e)
             {
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
index d363dcf3daf3..55ccb3260380 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
@@ -233,4 +233,9 @@ public CompactionAwareWriter setRepairedAt(long repairedAt)
         this.sstableWriter.setRepairedAt(repairedAt);
         return this;
     }
+
+    public long bytesWritten()
+    {
+        return sstableWriter.bytesWritten();
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
index 70b8e1768f0d..78f5453db2cc 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
@@ -126,14 +126,18 @@ static Throwable markObsolete(List<LogTransaction.Obsoletion> obsoletions, Throw
         return accumulate;
     }
 
-    static Throwable prepareForObsoletion(Iterable<SSTableReader> readers, LogTransaction txnLogs, List<LogTransaction.Obsoletion> obsoletions, Throwable accumulate)
+    static Throwable prepareForObsoletion(Iterable<SSTableReader> readers,
+                                          LogTransaction txnLogs,
+                                          List<LogTransaction.Obsoletion> obsoletions,
+                                          Tracker tracker,
+                                          Throwable accumulate)
     {
         Map<SSTable, LogRecord> logRecords = txnLogs.makeRemoveRecords(readers);
         for (SSTableReader reader : readers)
         {
             try
             {
-                obsoletions.add(new LogTransaction.Obsoletion(reader, txnLogs.obsoleted(reader, logRecords.get(reader))));
+                obsoletions.add(new LogTransaction.Obsoletion(reader, txnLogs.obsoleted(reader, logRecords.get(reader), tracker)));
             }
             catch (Throwable t)
             {
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
index e8f8000cddf2..99da6f9d95b0 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
@@ -103,7 +103,7 @@ public String toString()
         }
     }
 
-    public final Tracker tracker;
+    private final Tracker tracker;
     // The transaction logs keep track of new and old sstable files
     private final LogTransaction log;
     // the original readers this transaction was opened over, and that it guards
@@ -155,13 +155,13 @@ public static LifecycleTransaction offline(OperationType operationType, Iterable
     public static LifecycleTransaction offline(OperationType operationType)
     {
         Tracker dummy = Tracker.newDummyTracker();
-        return new LifecycleTransaction(dummy, new LogTransaction(operationType, dummy), Collections.emptyList());
+        return new LifecycleTransaction(dummy, new LogTransaction(operationType), Collections.emptyList());
     }
 
     @SuppressWarnings("resource") // log closed during postCleanup
     LifecycleTransaction(Tracker tracker, OperationType operationType, Iterable<? extends SSTableReader> readers)
     {
-        this(tracker, new LogTransaction(operationType, tracker), readers);
+        this(tracker, new LogTransaction(operationType), readers);
     }
 
     LifecycleTransaction(Tracker tracker, LogTransaction log, Iterable<? extends SSTableReader> readers)
@@ -192,6 +192,11 @@ public UUID opId()
         return log.id();
     }
 
+    public Set<SSTableReader> getCompacting()
+    {
+        return tracker.getCompacting();
+    }
+
     public void doPrepare()
     {
         // note for future: in anticompaction two different operations use the same Transaction, and both prepareToCommit()
@@ -202,7 +207,7 @@ public void doPrepare()
 
         // prepare for compaction obsolete readers as long as they were part of the original set
         // since those that are not original are early readers that share the same desc with the finals
-        maybeFail(prepareForObsoletion(filterIn(logged.obsolete, originals), log, obsoletions = new ArrayList<>(), null));
+        maybeFail(prepareForObsoletion(filterIn(logged.obsolete, originals), log, obsoletions = new ArrayList<>(), tracker, null));
         log.prepareToCommit();
     }
 
@@ -253,7 +258,7 @@ public Throwable doAbort(Throwable accumulate)
         Iterable<SSTableReader> obsolete = filterOut(concatUniq(staged.update, logged.update), originals);
         logger.trace("Obsoleting {}", obsolete);
 
-        accumulate = prepareForObsoletion(obsolete, log, obsoletions = new ArrayList<>(), accumulate);
+        accumulate = prepareForObsoletion(obsolete, log, obsoletions = new ArrayList<>(), tracker, accumulate);
         // it's safe to abort even if committed, see maybeFail in doCommit() above, in this case it will just report
         // a failure to abort, which is useful information to have for debug
         accumulate = log.abort(accumulate);
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
index fd916864a879..3e4e37639490 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LogTransaction.java
@@ -28,12 +28,15 @@
 import java.util.concurrent.TimeUnit;
 import java.util.function.Predicate;
 
+import javax.annotation.Nullable;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.util.concurrent.Runnables;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.codahale.metrics.Counter;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.Directories;
@@ -107,7 +110,6 @@ public CorruptTransactionLogException(String message, LogFile txnFile)
         }
     }
 
-    private final Tracker tracker;
     private final LogFile txnFile;
     // We need an explicit lock because the transaction tidier cannot store a reference to the transaction
     private final Object lock;
@@ -120,12 +122,6 @@ public CorruptTransactionLogException(String message, LogFile txnFile)
 
     LogTransaction(OperationType opType)
     {
-        this(opType, null);
-    }
-
-    LogTransaction(OperationType opType, Tracker tracker)
-    {
-        this.tracker = tracker;
         this.txnFile = new LogFile(opType, UUIDGen.getTimeUUID());
         this.lock = new Object();
         this.selfRef = new Ref<>(this, new TransactionTidier(txnFile, lock));
@@ -165,13 +161,13 @@ void untrackNew(SSTable table)
     @VisibleForTesting
     SSTableTidier obsoleted(SSTableReader sstable)
     {
-        return obsoleted(sstable, LogRecord.make(Type.REMOVE, sstable));
+        return obsoleted(sstable, LogRecord.make(Type.REMOVE, sstable), null);
     }
 
     /**
      * Schedule a reader for deletion as soon as it is fully unreferenced.
      */
-    SSTableTidier obsoleted(SSTableReader reader, LogRecord logRecord)
+    SSTableTidier obsoleted(SSTableReader reader, LogRecord logRecord, @Nullable Tracker tracker)
     {
         synchronized (lock)
         {
@@ -183,7 +179,7 @@ SSTableTidier obsoleted(SSTableReader reader, LogRecord logRecord)
                 if (txnFile.contains(Type.REMOVE, reader, logRecord))
                     throw new IllegalArgumentException();
 
-                return new SSTableTidier(reader, true, this);
+                return new SSTableTidier(reader, true, this, tracker);
             }
 
             txnFile.addRecord(logRecord);
@@ -191,7 +187,7 @@ SSTableTidier obsoleted(SSTableReader reader, LogRecord logRecord)
             if (tracker != null)
                 tracker.notifyDeleting(reader);
 
-            return new SSTableTidier(reader, false, this);
+            return new SSTableTidier(reader, false, this, tracker);
         }
     }
 
@@ -349,21 +345,23 @@ public static class SSTableTidier implements Runnable
         // must not retain a reference to the SSTableReader, else leak detection cannot kick in
         private final Descriptor desc;
         private final long sizeOnDisk;
-        private final Tracker tracker;
         private final boolean wasNew;
         private final Object lock;
         private final Ref<LogTransaction> parentRef;
         private final UUID txnId;
+        private final boolean onlineTxn;
+        private final Counter totalDiskSpaceUsed;
 
-        public SSTableTidier(SSTableReader referent, boolean wasNew, LogTransaction parent)
+        public SSTableTidier(SSTableReader referent, boolean wasNew, LogTransaction parent, Tracker tracker)
         {
             this.desc = referent.descriptor;
             this.sizeOnDisk = referent.bytesOnDisk();
-            this.tracker = parent.tracker;
             this.wasNew = wasNew;
             this.lock = parent.lock;
             this.parentRef = parent.selfRef.tryRef();
             this.txnId = parent.id();
+            this.onlineTxn = tracker != null && !tracker.isDummy();
+            this.totalDiskSpaceUsed = tracker != null && tracker.cfstore != null ? tracker.cfstore.metric.totalDiskSpaceUsed : null;
 
             if (this.parentRef == null)
                 throw new IllegalStateException("Transaction already completed");
@@ -371,7 +369,7 @@ public SSTableTidier(SSTableReader referent, boolean wasNew, LogTransaction pare
 
         public void run()
         {
-            if (tracker != null && !tracker.isDummy())
+            if (onlineTxn)
                 SystemKeyspace.clearSSTableReadMeter(desc.ksname, desc.cfname, desc.generation);
 
             synchronized (lock)
@@ -399,8 +397,8 @@ else if (!wasNew)
                     return;
                 }
 
-                if (tracker != null && tracker.cfstore != null && !wasNew)
-                    tracker.cfstore.metric.totalDiskSpaceUsed.dec(sizeOnDisk);
+                if (totalDiskSpaceUsed != null && !wasNew)
+                    totalDiskSpaceUsed.dec(sizeOnDisk);
 
                 // release the referent to the parent so that the all transaction files can be released
                 parentRef.release();
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index d5b68b58d4d7..26f2b4ccec8f 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
@@ -264,7 +264,7 @@ public Throwable dropSSTables(Throwable accumulate)
      */
     public Throwable dropSSTables(final Predicate<SSTableReader> remove, OperationType operationType, Throwable accumulate)
     {
-        try (LogTransaction txnLogs = new LogTransaction(operationType, this))
+        try (LogTransaction txnLogs = new LogTransaction(operationType))
         {
             Pair<View, View> result = apply(view -> {
                 Set<SSTableReader> toremove = copyOf(filter(view.sstables, and(remove, notIn(view.compacting))));
@@ -277,7 +277,7 @@ public Throwable dropSSTables(final Predicate<SSTableReader> remove, OperationTy
             // It is important that any method accepting/returning a Throwable never throws an exception, and does its best
             // to complete the instructions given to it
             List<LogTransaction.Obsoletion> obsoletions = new ArrayList<>();
-            accumulate = prepareForObsoletion(removed, txnLogs, obsoletions, accumulate);
+            accumulate = prepareForObsoletion(removed, txnLogs, obsoletions, this, accumulate);
             try
             {
                 txnLogs.finish();
@@ -402,14 +402,14 @@ public Set<SSTableReader> getCompacting()
         return view.get().compacting;
     }
 
-    public Iterable<SSTableReader> getUncompacting()
+    public Iterable<SSTableReader> getNoncompacting()
     {
         return view.get().select(SSTableSet.NONCOMPACTING);
     }
 
-    public Iterable<? extends SSTableReader> getUncompacting(Iterable<? extends SSTableReader> candidates)
+    public Iterable<? extends SSTableReader> getNoncompacting(Iterable<? extends SSTableReader> candidates)
     {
-        return view.get().getUncompacting(candidates);
+        return view.get().getNoncompacting(candidates);
     }
 
     public void maybeIncrementallyBackup(final Iterable<SSTableReader> sstables)
diff --git a/src/java/org/apache/cassandra/db/lifecycle/View.java b/src/java/org/apache/cassandra/db/lifecycle/View.java
index 5fe39364c70e..79113c3163e1 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/View.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/View.java
@@ -170,7 +170,8 @@ public Iterable<SSTableReader> select(SSTableSet sstableSet)
         }
     }
 
-    public Iterable<? extends SSTableReader> getUncompacting(Iterable<? extends SSTableReader> candidates)
+
+    public Iterable<? extends SSTableReader> getNoncompacting(Iterable<? extends SSTableReader> candidates)
     {
         return filter(candidates, (Predicate<SSTableReader>) sstable -> !compacting.contains(sstable));
     }
diff --git a/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java b/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java
index d9e90367d2d3..09f3ae3bbf18 100644
--- a/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java
+++ b/src/java/org/apache/cassandra/db/partitions/PurgeFunction.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.db.partitions;
 
 import java.util.function.LongPredicate;
-import java.util.function.Predicate;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.*;
diff --git a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
index 45089e2441b6..add2c9a80792 100644
--- a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
+++ b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
@@ -39,10 +39,8 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
-import org.apache.cassandra.db.compaction.ActiveCompactionsTracker;
 import org.apache.cassandra.db.compaction.CompactionController;
 import org.apache.cassandra.db.compaction.CompactionIterator;
-import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.View;
@@ -102,9 +100,9 @@ public static int getDefaultGcBefore(ColumnFamilyStore cfs, int nowInSec)
 
     private static class ValidationCompactionIterator extends CompactionIterator
     {
-        public ValidationCompactionIterator(List<ISSTableScanner> scanners, ValidationCompactionController controller, int nowInSec, ActiveCompactionsTracker activeCompactions)
+        public ValidationCompactionIterator(List<ISSTableScanner> scanners, ValidationCompactionController controller, int nowInSec)
         {
-            super(OperationType.VALIDATION, scanners, controller, nowInSec, UUIDGen.getTimeUUID(), activeCompactions);
+            super(OperationType.VALIDATION, scanners, controller, nowInSec, UUIDGen.getTimeUUID());
         }
     }
 
@@ -224,7 +222,7 @@ public CassandraValidationIterator(ColumnFamilyStore cfs, Collection<Range<Token
 
         controller = new ValidationCompactionController(cfs, getDefaultGcBefore(cfs, nowInSec));
         scanners = cfs.getCompactionStrategyManager().getScanners(sstables, ranges);
-        ci = new ValidationCompactionIterator(scanners.scanners, controller, nowInSec, CompactionManager.instance.active);
+        ci = new ValidationCompactionIterator(scanners.scanners, controller, nowInSec);
 
         long allPartitions = 0;
         rangePartitionCounts = Maps.newHashMapWithExpectedSize(ranges.size());
diff --git a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
index f6762556e985..484dec6c9180 100644
--- a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
+++ b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
@@ -43,7 +43,7 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -145,8 +145,8 @@ public boolean apply(SSTableReader sstable)
                 }
                 return false;
             }
-            Collection<CompactionInfo> cis = CompactionManager.instance.active.getCompactionsForSSTable(sstable, OperationType.ANTICOMPACTION);
-            if (cis != null && !cis.isEmpty())
+            Collection<AbstractTableOperation.OperationProgress> ops = CompactionManager.instance.active.getOperationsForSSTable(sstable, OperationType.ANTICOMPACTION);
+            if (ops != null && !ops.isEmpty())
             {
                 // todo: start tracking the parent repair session id that created the anticompaction to be able to give a better error messsage here:
                 StringBuilder sb = new StringBuilder();
@@ -155,8 +155,10 @@ public boolean apply(SSTableReader sstable)
                 sb.append(" has failed because it encountered intersecting sstables belonging to another incremental repair session. ");
                 sb.append("This is caused by starting multiple conflicting incremental repairs at the same time. ");
                 sb.append("Conflicting anticompactions: ");
-                for (CompactionInfo ci : cis)
-                    sb.append(ci.getTaskId() == null ? "no compaction id" : ci.getTaskId()).append(':').append(ci.getSSTables()).append(',');
+                for (AbstractTableOperation.OperationProgress op : ops)
+                {
+                    sb.append(op.operationId() == null ? "no compaction id" : op.operationId()).append(':').append(op.sstables()).append(',');
+                }
                 throw new SSTableAcquisitionException(sb.toString());
             }
             return true;
diff --git a/src/java/org/apache/cassandra/db/view/ViewBuilderTask.java b/src/java/org/apache/cassandra/db/view/ViewBuilderTask.java
index c84c6978dcce..3739b325aba8 100644
--- a/src/java/org/apache/cassandra/db/view/ViewBuilderTask.java
+++ b/src/java/org/apache/cassandra/db/view/ViewBuilderTask.java
@@ -29,7 +29,6 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Function;
 import com.google.common.base.Objects;
-import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterators;
 import com.google.common.collect.PeekingIterator;
 import com.google.common.util.concurrent.Futures;
@@ -44,8 +43,7 @@
 import org.apache.cassandra.db.ReadQuery;
 import org.apache.cassandra.db.SinglePartitionReadCommand;
 import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.compaction.CompactionInfo;
-import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
@@ -63,7 +61,7 @@
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Refs;
 
-public class ViewBuilderTask extends CompactionInfo.Holder implements Callable<Long>
+public class ViewBuilderTask extends AbstractTableOperation implements Callable<Long>
 {
     private static final Logger logger = LoggerFactory.getLogger(ViewBuilderTask.class);
 
@@ -191,12 +189,12 @@ private void finish()
             // If it's stopped due to a compaction interruption we should throw that exception.
             // Otherwise we assume that the task has been stopped due to a schema update and we can finish successfully.
             if (isCompactionInterrupted)
-                throw new StoppedException(ksName, view.name, getCompactionInfo());
+                throw new StoppedException(ksName, view.name, getProgress());
         }
     }
 
     @Override
-    public CompactionInfo getCompactionInfo()
+    public OperationProgress getProgress()
     {
         // we don't know the sstables at construction of ViewBuilderTask and we could change this to return once we know the
         // but since we basically only cancel view builds on truncation where we cancel all compactions anyway, this seems reasonable
@@ -205,13 +203,13 @@ public CompactionInfo getCompactionInfo()
         if (range.left.getPartitioner().splitter().isPresent())
         {
             long progress = prevToken == null ? 0 : Math.round(prevToken.getPartitioner().splitter().get().positionInRange(prevToken, range) * 1000);
-            return CompactionInfo.withoutSSTables(baseCfs.metadata(), OperationType.VIEW_BUILD, progress, 1000, Unit.RANGES, compactionId);
+            return OperationProgress.withoutSSTables(baseCfs.metadata(), OperationType.VIEW_BUILD, progress, 1000, Unit.RANGES, compactionId);
         }
 
         // When there is no splitter, estimate based on number of total keys but
         // take the max with keysBuilt + 1 to avoid having more completed than total
         long keysTotal = Math.max(keysBuilt + 1, baseCfs.estimatedKeysForRange(range));
-        return CompactionInfo.withoutSSTables(baseCfs.metadata(), OperationType.VIEW_BUILD, keysBuilt, keysTotal, Unit.KEYS, compactionId);
+        return OperationProgress.withoutSSTables(baseCfs.metadata(), OperationType.VIEW_BUILD, keysBuilt, keysTotal, Unit.KEYS, compactionId);
     }
 
     @Override
@@ -248,7 +246,7 @@ static class StoppedException extends CompactionInterruptedException
     {
         private final String ksName, viewName;
 
-        private StoppedException(String ksName, String viewName, CompactionInfo info)
+        private StoppedException(String ksName, String viewName, OperationProgress info)
         {
             super(info);
             this.ksName = ksName;
diff --git a/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java b/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java
index 20033dfed4cb..4ddb73201870 100644
--- a/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java
+++ b/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.db.virtual;
 
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.marshal.DoubleType;
 import org.apache.cassandra.db.marshal.LongType;
@@ -58,21 +58,21 @@ public DataSet data()
     {
         SimpleDataSet result = new SimpleDataSet(metadata());
 
-        for (CompactionInfo task : CompactionManager.instance.getSSTableTasks())
+        for (AbstractTableOperation.OperationProgress task : CompactionManager.instance.getSSTableTasks())
         {
-            long completed = task.getCompleted();
-            long total = task.getTotal();
+            long completed = task.completed();
+            long total = task.total();
 
             double completionRatio = total == 0L ? 1.0 : (((double) completed) / total);
 
-            result.row(task.getKeyspace().orElse("*"),
-                       task.getTable().orElse("*"),
-                       task.getTaskId())
+            result.row(task.keyspace().orElse("*"),
+                       task.table().orElse("*"),
+                       task.operationId())
                   .column(COMPLETION_RATIO, completionRatio)
-                  .column(KIND, task.getTaskType().toString().toLowerCase())
+                  .column(KIND, task.operationType().toString().toLowerCase())
                   .column(PROGRESS, completed)
                   .column(TOTAL, total)
-                  .column(UNIT, task.getUnit().toString().toLowerCase());
+                  .column(UNIT, task.unit().toString().toLowerCase());
         }
 
         return result;
diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java b/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java
index 73dc3345a250..eeaf0ce81d70 100644
--- a/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexBuilder.java
@@ -17,12 +17,12 @@
  */
 package org.apache.cassandra.index;
 
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 
 /**
  * Manages building an entire index from column family data. Runs on to compaction manager.
  */
-public abstract class SecondaryIndexBuilder extends CompactionInfo.Holder
+public abstract class SecondaryIndexBuilder extends AbstractTableOperation
 {
     public abstract void build();
 
diff --git a/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java b/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
index 3c005c42525b..b17bb65f9413 100644
--- a/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
@@ -23,7 +23,6 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.index.Index;
@@ -52,14 +51,14 @@ public CollatedViewIndexBuilder(ColumnFamilyStore cfs, Set<Index> indexers, Redu
         this.sstables = sstables;
     }
 
-    public CompactionInfo getCompactionInfo()
+    public OperationProgress getProgress()
     {
-        return new CompactionInfo(cfs.metadata(),
-                OperationType.INDEX_BUILD,
-                iter.getBytesRead(),
-                iter.getTotalBytes(),
-                compactionId,
-                sstables);
+        return new OperationProgress(cfs.metadata(),
+                                     OperationType.INDEX_BUILD,
+                                     iter.getBytesRead(),
+                                     iter.getTotalBytes(),
+                                     compactionId,
+                                     sstables);
     }
 
     public void build()
@@ -70,7 +69,7 @@ public void build()
             while (iter.hasNext())
             {
                 if (isStopRequested())
-                    throw new CompactionInterruptedException(getCompactionInfo());
+                    throw new CompactionInterruptedException(getProgress());
                 DecoratedKey key = iter.next();
                 cfs.indexManager.indexPartition(key, indexers, pageSize);
             }
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
index 729afd3955f8..28d05eeb8539 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
@@ -42,7 +42,6 @@
 
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.DeletionTime;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -67,7 +66,7 @@
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Ref;
 
-import static org.apache.cassandra.db.compaction.CompactionInfo.StopTrigger.TRUNCATE;
+import static org.apache.cassandra.db.compaction.TableOperation.StopTrigger.TRUNCATE;
 
 /**
  * Multiple storage-attached indexes can start building concurrently. We need to make sure:
@@ -168,7 +167,7 @@ private boolean indexSSTable(SSTableReader sstable, Set<StorageAttachedIndex> in
                 {
                     if (isStopRequested())
                     {
-                        throw new CompactionInterruptedException(getCompactionInfo());
+                        throw new CompactionInterruptedException(getProgress());
                     }
 
                     final DecoratedKey key = sstable.decorateKey(keys.key());
@@ -263,14 +262,14 @@ else if (t instanceof CompactionInterruptedException)
     }
 
     @Override
-    public CompactionInfo getCompactionInfo()
+    public OperationProgress getProgress()
     {
-        return new CompactionInfo(metadata,
-                                  OperationType.INDEX_BUILD,
-                                  bytesProcessed,
-                                  totalSizeInBytes,
-                                  compactionId,
-                                  sstables.keySet());
+        return new OperationProgress(metadata,
+                                     OperationType.INDEX_BUILD,
+                                     bytesProcessed,
+                                     totalSizeInBytes,
+                                     compactionId,
+                                     sstables.keySet());
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
index 6152deb5e27b..8cbb9a12cd7e 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java
@@ -30,7 +30,6 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.marshal.AbstractType;
@@ -86,7 +85,7 @@ public void build()
                     while (!keys.isExhausted())
                     {
                         if (isStopRequested())
-                            throw new CompactionInterruptedException(getCompactionInfo());
+                            throw new CompactionInterruptedException(getProgress());
 
                         final DecoratedKey key = sstable.decorateKey(keys.key());
                         final long keyPosition = keys.keyPosition();
@@ -123,14 +122,15 @@ public void build()
         }
     }
 
-    public CompactionInfo getCompactionInfo()
+    @Override
+    public OperationProgress getProgress()
     {
-        return new CompactionInfo(cfs.metadata(),
-                                  OperationType.INDEX_BUILD,
-                                  bytesProcessed,
-                                  totalSizeInBytes,
-                                  compactionId,
-                                  sstables.keySet());
+        return new OperationProgress(cfs.metadata(),
+                                     OperationType.INDEX_BUILD,
+                                     bytesProcessed,
+                                     totalSizeInBytes,
+                                     compactionId,
+                                     sstables.keySet());
     }
 
     private void completeSSTable(PerSSTableIndexWriter indexWriter, SSTableReader sstable, Collection<ColumnIndex> indexes)
diff --git a/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java b/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java
index af661b7d66c9..0f1897ded8be 100644
--- a/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/ISSTableScanner.java
@@ -34,13 +34,14 @@
  */
 public interface ISSTableScanner extends UnfilteredPartitionIterator
 {
-    public long getLengthInBytes();
-    public long getCompressedLengthInBytes();
-    public long getCurrentPosition();
-    public long getBytesScanned();
-    public Set<SSTableReader> getBackingSSTables();
+    long getLengthInBytes();
+    long getCompressedLengthInBytes();
+    long getCurrentPosition();
+    long getBytesScanned();
+    Set<SSTableReader> getBackingSSTables();
+    int level();
 
-    public static void closeAllAndPropagate(Collection<ISSTableScanner> scanners, Throwable throwable)
+    static void closeAllAndPropagate(Collection<ISSTableScanner> scanners, Throwable throwable)
     {
         for (ISSTableScanner scanner: scanners)
         {
@@ -66,6 +67,5 @@ public static void closeAllAndPropagate(Collection<ISSTableScanner> scanners, Th
         {
             Throwables.propagate(throwable);
         }
-
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
index c98529266411..900f811acc41 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryManager.java
@@ -215,7 +215,7 @@ private Pair<Long, Map<TableId, LifecycleTransaction>> getRestributionTransactio
                 {
                     View view = cfStore.getTracker().getView();
                     allSSTables = ImmutableSet.copyOf(SSTableReader.selectOnlyBigTableReaders(view.select(SSTableSet.CANONICAL)));
-                    nonCompacting = ImmutableSet.copyOf(SSTableReader.selectOnlyBigTableReaders(view.getUncompacting(allSSTables)));
+                    nonCompacting = ImmutableSet.copyOf(SSTableReader.selectOnlyBigTableReaders(view.getNoncompacting(allSSTables)));
                 }
                 while (null == (txn = cfStore.getTracker().tryModify(nonCompacting, OperationType.INDEX_SUMMARY)));
 
diff --git a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
index 453d27414290..cd930108dc7a 100644
--- a/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
+++ b/src/java/org/apache/cassandra/io/sstable/IndexSummaryRedistribution.java
@@ -34,10 +34,9 @@
 import org.slf4j.LoggerFactory;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.metrics.StorageMetrics;
@@ -48,7 +47,7 @@
 
 import static org.apache.cassandra.io.sstable.Downsampling.BASE_SAMPLING_LEVEL;
 
-public class IndexSummaryRedistribution extends CompactionInfo.Holder
+public class IndexSummaryRedistribution extends AbstractTableOperation
 {
     private static final Logger logger = LoggerFactory.getLogger(IndexSummaryRedistribution.class);
 
@@ -100,7 +99,7 @@ public List<SSTableReader> redistributeSummaries() throws IOException
         for (SSTableReader sstable : redistribute)
         {
             if (isStopRequested())
-                throw new CompactionInterruptedException(getCompactionInfo());
+                throw new CompactionInterruptedException(getProgress());
 
             if (sstable.getReadMeter() != null)
             {
@@ -153,7 +152,7 @@ private List<SSTableReader> adjustSamplingLevels(List<SSTableReader> sstables,
         for (SSTableReader sstable : sstables)
         {
             if (isStopRequested())
-                throw new CompactionInterruptedException(getCompactionInfo());
+                throw new CompactionInterruptedException(getProgress());
 
             int minIndexInterval = sstable.metadata().params.minIndexInterval;
             int maxIndexInterval = sstable.metadata().params.maxIndexInterval;
@@ -250,7 +249,7 @@ else if (targetNumEntries < currentNumEntries * DOWNSAMPLE_THESHOLD && newSampli
         for (ResampleEntry entry : toDownsample)
         {
             if (isStopRequested())
-                throw new CompactionInterruptedException(getCompactionInfo());
+                throw new CompactionInterruptedException(getProgress());
 
             SSTableReader sstable = entry.sstable;
             logger.trace("Re-sampling index summary for {} from {}/{} to {}/{} of the original number of entries",
@@ -329,9 +328,14 @@ public int compare(ResampleEntry o1, ResampleEntry o2)
         return Pair.create(willNotDownsample, toDownsample.subList(noDownsampleCutoff, toDownsample.size()));
     }
 
-    public CompactionInfo getCompactionInfo()
+    public OperationProgress getProgress()
     {
-        return CompactionInfo.withoutSSTables(null, OperationType.INDEX_SUMMARY, (memoryPoolBytes - remainingSpace), memoryPoolBytes, Unit.BYTES, compactionId);
+        return OperationProgress.withoutSSTables(null,
+                                                 OperationType.INDEX_SUMMARY,
+                                                 (memoryPoolBytes - remainingSpace),
+                                                 memoryPoolBytes,
+                                                 Unit.BYTES,
+                                                 compactionId);
     }
 
     public boolean isGlobal()
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index a375781837fe..82dbe22dbacc 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -171,6 +171,11 @@ public String getKeyspaceName()
         return descriptor.ksname;
     }
 
+    public int getGeneration()
+    {
+        return descriptor.generation;
+    }
+
     public List<String> getAllFilePaths()
     {
         List<String> ret = new ArrayList<>(components.size());
@@ -426,4 +431,14 @@ public static void validateRepairedMetadata(long repairedAt, UUID pendingRepair,
                                     "isTransient can only be true for sstables pending repair");
 
     }
+
+    public DecoratedKey getFirst()
+    {
+        return first;
+    }
+
+    public DecoratedKey getLast()
+    {
+        return last;
+    }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
index a390648bf1bc..693735bdb144 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableRewriter.java
@@ -64,6 +64,7 @@ public class SSTableRewriter extends Transactional.AbstractTransactional impleme
     private final List<SSTableReader> preparedForCommit = new ArrayList<>();
 
     private long currentlyOpenedEarlyAt; // the position (in MB) in the target file we last (re)opened at
+    private long bytesWritten; // the bytes written by previous writers, or zero if the current writer is the first writer
 
     private final List<SSTableWriter> writers = new ArrayList<>();
     private final boolean keepOriginals; // true if we do not want to obsolete the originals
@@ -118,6 +119,11 @@ public SSTableWriter currentWriter()
         return writer;
     }
 
+    public long bytesWritten()
+    {
+        return bytesWritten + (writer == null ? 0 : writer.getFilePointer());
+    }
+
     public RowIndexEntry append(UnfilteredRowIterator partition)
     {
         // we do this before appending to ensure we can resetAndTruncate() safely if the append fails
@@ -324,6 +330,7 @@ public void switchWriter(SSTableWriter newWriter)
         }
 
         currentlyOpenedEarlyAt = 0;
+        bytesWritten += writer.getFilePointer();
         writer = newWriter;
     }
 
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index f7e34a25cce9..47a9ae68a5df 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -231,11 +231,11 @@ private static ScheduledThreadPoolExecutor initSyncExecutor()
     // it's just an object, which we use regular Object equality on; we introduce a special class just for easy recognition
     public static final class UniqueIdentifier {}
 
-    public static final Comparator<SSTableReader> sstableComparator = (o1, o2) -> o1.first.compareTo(o2.first);
+    public static final Comparator<SSTableReader> firstKeyComparator = (o1, o2) -> o1.getFirst().compareTo(o2.getFirst());
 
     public static final Comparator<SSTableReader> generationReverseComparator = (o1, o2) -> -Integer.compare(o1.descriptor.generation, o2.descriptor.generation);
 
-    public static final Ordering<SSTableReader> sstableOrdering = Ordering.from(sstableComparator);
+    public static final Ordering<SSTableReader> firstKeyOrdering = Ordering.from(firstKeyComparator);
 
     public static final Comparator<? super SSTableReader> sizeComparator = (o1, o2) -> Longs.compare(o1.onDiskLength(), o2.onDiskLength());
 
@@ -1096,6 +1096,21 @@ public RestorableMeter getReadMeter()
         return readMeter;
     }
 
+    /**
+     * Called by {@link org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy} and other compaction
+     * strategies to determine the read hotness of this sstable. This method returns a "read hotness" which is
+     * calculated by looking at the last two hours read rate and dividing this number by the estimated number of keys.
+     * <p/>
+     * Note that some system tables do not have read meters, in which case this method will return zero.
+     *
+     * @return the last two hours read rate per estimated key
+     */
+    public double hotness()
+    {
+        // system tables don't have read meters, just use 0.0 for the hotness
+        return readMeter == null ? 0.0 : readMeter.twoHourRate() / estimatedKeys();
+    }
+
     public int getIndexSummarySamplingLevel()
     {
         return indexSummary.getSamplingLevel();
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
index 62e3ed7a40c3..1c5db73693e9 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableScanner.java
@@ -246,6 +246,11 @@ public Set<SSTableReader> getBackingSSTables()
         return ImmutableSet.of(sstable);
     }
 
+    public int level()
+    {
+        return sstable.getSSTableLevel();
+    }
+
 
     public TableMetadata metadata()
     {
@@ -425,6 +430,11 @@ public Set<SSTableReader> getBackingSSTables()
             return ImmutableSet.of(sstable);
         }
 
+        public int level()
+        {
+            return 0;
+        }
+
         public TableMetadata metadata()
         {
             return sstable.metadata();
diff --git a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
index 46e5940e42e2..bc5d01c28e70 100644
--- a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
@@ -26,17 +26,20 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.db.compaction.ActiveCompactions;
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionManager;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.db.compaction.CompactionStrategyStatistics;
+import org.apache.cassandra.db.compaction.TableOperation;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadata;
 
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
 
 /**
- * Metrics for compaction.
+ * Metrics for the compaction executor. Note that several different operations execute on the compaction
+ * executor, for example index or view building. These operations are abstracted by {@link AbstractTableOperation}
+ * but previously we would refer to these operations as "compactions", so this incorrect name may still be
+ * found in the metrics that are exported to the users.
  */
 public class CompactionMetrics
 {
@@ -47,14 +50,13 @@ public class CompactionMetrics
     /** Estimated number of compactions remaining to perform, group by keyspace and then table name */
     public final Gauge<Map<String, Map<String, Integer>>> pendingTasksByTableName;
 
-    /** Number of completed compactions since server [re]start */
+    /** Number of completed operations since server [re]start */
     public final Gauge<Long> completedTasks;
-    /** Total number of compactions since server [re]start */
+    /** Total number of operations since server [re]start */
     public final Meter totalCompactionsCompleted;
-    /** Total number of bytes compacted since server [re]start */
+    /** Total number of bytes processed by operations since server [re]start */
     public final Counter bytesCompacted;
 
-
     /** Total number of compactions that have had sstables drop out of them */
     public final Counter compactionsReduced;
 
@@ -64,74 +66,69 @@ public class CompactionMetrics
     /** Total number of compactions which have outright failed due to lack of disk space */
     public final Counter compactionsAborted;
 
+    /** The compaction strategy information for each table. */
+    public final Gauge<List<CompactionStrategyStatistics>> aggregateCompactions;
+
     public CompactionMetrics(final ThreadPoolExecutor... collectors)
     {
-        pendingTasks = Metrics.register(factory.createMetricName("PendingTasks"), new Gauge<Integer>()
-        {
-            public Integer getValue()
+        pendingTasks = Metrics.register(factory.createMetricName("PendingTasks"), () -> {
+            int n = 0;
+            // add the estimated number of compactions that still need to be done
+            for (String keyspaceName : Schema.instance.getKeyspaces())
             {
-                int n = 0;
-                // add estimate number of compactions need to be done
-                for (String keyspaceName : Schema.instance.getKeyspaces())
-                {
-                    for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
-                        n += cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
-                }
-                // add number of currently running compactions
-                return n + CompactionManager.instance.active.getCompactions().size();
+                for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+                    n += cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
             }
+            // add the number of currently running operations (compactions and any other tracked table operation)
+            return n + CompactionManager.instance.active.getTableOperations().size();
         });
 
-        pendingTasksByTableName = Metrics.register(factory.createMetricName("PendingTasksByTableName"),
-            new Gauge<Map<String, Map<String, Integer>>>()
-        {
-            @Override
-            public Map<String, Map<String, Integer>> getValue() 
+        pendingTasksByTableName = Metrics.register(factory.createMetricName("PendingTasksByTableName"), () -> {
+            Map<String, Map<String, Integer>> resultMap = new HashMap<>();
+            // estimated number of compactions that still need to be done, per keyspace and table
+            for (String keyspaceName : Schema.instance.getKeyspaces())
             {
-                Map<String, Map<String, Integer>> resultMap = new HashMap<>();
-                // estimation of compactions need to be done
-                for (String keyspaceName : Schema.instance.getKeyspaces())
+                for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
                 {
-                    for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+                    int taskNumber = cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
+                    if (taskNumber > 0)
                     {
-                        int taskNumber = cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
-                        if (taskNumber > 0)
+                        if (!resultMap.containsKey(keyspaceName))
                         {
-                            if (!resultMap.containsKey(keyspaceName))
-                            {
-                                resultMap.put(keyspaceName, new HashMap<>());
-                            }
-                            resultMap.get(keyspaceName).put(cfs.getTableName(), taskNumber);
+                            resultMap.put(keyspaceName, new HashMap<>());
                         }
+                        resultMap.get(keyspaceName).put(cfs.getTableName(), taskNumber);
                     }
                 }
+            }
 
-                // currently running compactions
-                for (CompactionInfo.Holder compaction : CompactionManager.instance.active.getCompactions())
+            // currently running compactions
+            // TODO DB-2701 - this includes all operations (previous behaviour), if we wanted only real
+            // compactions we could remove this block of code and call getTotalCompactions() from the strategy managers
+            for (TableOperation op : CompactionManager.instance.active.getTableOperations())
+            {
+                TableMetadata metaData = op.getProgress().metadata();
+                if (metaData == null)
                 {
-                    TableMetadata metaData = compaction.getCompactionInfo().getTableMetadata();
-                    if (metaData == null)
-                    {
-                        continue;
-                    }
-                    if (!resultMap.containsKey(metaData.keyspace))
-                    {
-                        resultMap.put(metaData.keyspace, new HashMap<>());
-                    }
+                    continue;
+                }
+                if (!resultMap.containsKey(metaData.keyspace))
+                {
+                    resultMap.put(metaData.keyspace, new HashMap<>());
+                }
 
-                    Map<String, Integer> tableNameToCountMap = resultMap.get(metaData.keyspace);
-                    if (tableNameToCountMap.containsKey(metaData.name))
-                    {
-                        tableNameToCountMap.put(metaData.name,
-                                                tableNameToCountMap.get(metaData.name) + 1);
-                    }
-                    else
-                    {
-                        tableNameToCountMap.put(metaData.name, 1);
-                    }
+                Map<String, Integer> tableNameToCountMap = resultMap.get(metaData.keyspace);
+                if (tableNameToCountMap.containsKey(metaData.name))
+                {
+                    tableNameToCountMap.put(metaData.name,
+                                            tableNameToCountMap.get(metaData.name) + 1);
+                }
+                else
+                {
+                    tableNameToCountMap.put(metaData.name, 1);
                 }
-                return resultMap;
             }
+            return resultMap;
         });
 
         completedTasks = Metrics.register(factory.createMetricName("CompletedTasks"), new Gauge<Long>()
@@ -151,5 +148,24 @@ public Long getValue()
         compactionsReduced = Metrics.counter(factory.createMetricName("CompactionsReduced"));
         sstablesDropppedFromCompactions = Metrics.counter(factory.createMetricName("SSTablesDroppedFromCompaction"));
         compactionsAborted = Metrics.counter(factory.createMetricName("CompactionsAborted"));
+
+        aggregateCompactions = Metrics.register(factory.createMetricName("AggregateCompactions"), this::getAggregateCompactions);
+    }
+
+    /**
+     * Collects the statistics reported by the compaction strategy manager of every table in every keyspace.
+     * Backs the "AggregateCompactions" gauge; presumably only strategies with compactions in progress report (confirm).
+     *
+     * @return a new list concatenating {@code getStrategyStatistics()} of each table's strategy manager, possibly empty
+     */
+    List<CompactionStrategyStatistics> getAggregateCompactions()
+    {
+        List<CompactionStrategyStatistics> ret = new ArrayList<>();
+        for (String keyspaceName : Schema.instance.getKeyspaces())
+        {
+            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+                ret.addAll(cfs.getCompactionStrategyManager().getStrategyStatistics());
+        }
+        return ret;
     }
 }
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java
index 605edba16fe3..558816725b69 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -1601,6 +1601,7 @@ public Object getCompactionMetric(String metricName)
                 case "CompletedTasks":
                 case "PendingTasks":
                 case "PendingTasksByTableName":
+                case "AggregateCompactions":
                     return JMX.newMBeanProxy(mbeanServerConn,
                             new ObjectName("org.apache.cassandra.metrics:type=Compaction,name=" + metricName),
                             CassandraMetricsRegistry.JmxGaugeMBean.class).getValue();
diff --git a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
index e815d74aa17c..a3abeea5de65 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
@@ -26,10 +26,10 @@
 import io.airlift.airline.Command;
 import io.airlift.airline.Option;
 
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionManagerMBean;
+import org.apache.cassandra.db.compaction.CompactionStrategyStatistics;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.compaction.CompactionInfo.Unit;
+import org.apache.cassandra.db.compaction.TableOperation;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.tools.NodeProbe;
 import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
@@ -45,6 +45,11 @@ public class CompactionStats extends NodeToolCmd
             description = "Display bytes in human readable form, i.e. KiB, MiB, GiB, TiB")
     private boolean humanReadable = false;
 
+    @Option(title = "aggregate",
+    name = {"-A", "--aggregate"},
+    description = "Show the compaction aggregates for the compactions in progress, e.g. the levels for LCS or the buckets for STCS and TWCS.")
+    private boolean aggregate = false;
+
     @Override
     public void execute(NodeProbe probe)
     {
@@ -58,6 +63,7 @@ public void execute(NodeProbe probe)
             for (Entry<String, Integer> tableEntry : ksEntry.getValue().entrySet())
                 numTotalPendingTask += tableEntry.getValue();
         }
+
         out.println("pending tasks: " + numTotalPendingTask);
         for (Entry<String, Map<String, Integer>> ksEntry : pendingTaskNumberByTable.entrySet())
         {
@@ -72,6 +78,11 @@ public void execute(NodeProbe probe)
         }
         out.println();
         reportCompactionTable(cm.getCompactions(), probe.getCompactionThroughput(), humanReadable, out);
+
+        if (aggregate)
+        {
+            reportAggregateCompactions(probe);
+        }
     }
 
     public static void reportCompactionTable(List<Map<String,String>> compactions, int compactionThroughput, boolean humanReadable, PrintStream out)
@@ -84,17 +95,17 @@ public static void reportCompactionTable(List<Map<String,String>> compactions, i
             table.add("id", "compaction type", "keyspace", "table", "completed", "total", "unit", "progress");
             for (Map<String, String> c : compactions)
             {
-                long total = Long.parseLong(c.get(CompactionInfo.TOTAL));
-                long completed = Long.parseLong(c.get(CompactionInfo.COMPLETED));
-                String taskType = c.get(CompactionInfo.TASK_TYPE);
-                String keyspace = c.get(CompactionInfo.KEYSPACE);
-                String columnFamily = c.get(CompactionInfo.COLUMNFAMILY);
-                String unit = c.get(CompactionInfo.UNIT);
-                boolean toFileSize = humanReadable && Unit.isFileSize(unit);
+                long total = Long.parseLong(c.get(TableOperation.Progress.TOTAL));
+                long completed = Long.parseLong(c.get(TableOperation.Progress.COMPLETED));
+                String taskType = c.get(TableOperation.Progress.OPERATION_TYPE);
+                String keyspace = c.get(TableOperation.Progress.KEYSPACE);
+                String columnFamily = c.get(TableOperation.Progress.COLUMNFAMILY);
+                String unit = c.get(TableOperation.Progress.UNIT);
+                boolean toFileSize = humanReadable && TableOperation.Unit.isFileSize(unit);
                 String completedStr = toFileSize ? FileUtils.stringifyFileSize(completed) : Long.toString(completed);
                 String totalStr = toFileSize ? FileUtils.stringifyFileSize(total) : Long.toString(total);
                 String percentComplete = total == 0 ? "n/a" : new DecimalFormat("0.00").format((double) completed / total * 100) + "%";
-                String id = c.get(CompactionInfo.COMPACTION_ID);
+                String id = c.get(TableOperation.Progress.OPERATION_ID);
                 table.add(id, taskType, keyspace, columnFamily, completedStr, totalStr, unit, percentComplete);
                 if (taskType.equals(OperationType.COMPACTION.toString()))
                     remainingBytes += total - completed;
@@ -111,4 +122,22 @@ public static void reportCompactionTable(List<Map<String,String>> compactions, i
         }
     }
 
+    /**
+     * Prints the compaction strategy statistics exposed by the "AggregateCompactions" metric, one entry per line.
+     * Nothing is printed when the metric returns an empty list.
+     *
+     * @param probe the node probe used to fetch the metric and to obtain the output stream
+     */
+    private static void reportAggregateCompactions(NodeProbe probe)
+    {
+        List<CompactionStrategyStatistics> statistics = (List<CompactionStrategyStatistics>) probe.getCompactionMetric("AggregateCompactions");
+        if (statistics.isEmpty())
+            return;
+
+        // Use the probe's output stream instead of System.out so redirected/captured nodetool output includes this view
+        PrintStream out = probe.output().out;
+        out.println("Aggregated view:");
+        for (CompactionStrategyStatistics stat : statistics)
+            out.println(stat.toString());
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/ActiveCompactionsTracker.java b/src/java/org/apache/cassandra/utils/NonThrowingCloseable.java
similarity index 64%
rename from src/java/org/apache/cassandra/db/compaction/ActiveCompactionsTracker.java
rename to src/java/org/apache/cassandra/utils/NonThrowingCloseable.java
index c1bbbd8e67bf..684d6bd30346 100644
--- a/src/java/org/apache/cassandra/db/compaction/ActiveCompactionsTracker.java
+++ b/src/java/org/apache/cassandra/utils/NonThrowingCloseable.java
@@ -16,19 +16,14 @@
  * limitations under the License.
  */
 
-package org.apache.cassandra.db.compaction;
+package org.apache.cassandra.utils;
 
-public interface ActiveCompactionsTracker
-{
-    public void beginCompaction(CompactionInfo.Holder ci);
-    public void finishCompaction(CompactionInfo.Holder ci);
-
-    public static final ActiveCompactionsTracker NOOP = new ActiveCompactionsTracker()
-    {
-        public void beginCompaction(CompactionInfo.Holder ci)
-        {}
+import java.io.Closeable;
 
-        public void finishCompaction(CompactionInfo.Holder ci)
-        {}
-    };
-}
+/**
+ * A {@link Closeable} whose {@link #close()} declares no checked exception, so callers need not handle {@code IOException}.
+ */
+public interface NonThrowingCloseable extends Closeable
+{
+    void close();
+}
\ No newline at end of file
diff --git a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
index 45b3b33e8cd4..225819cd0bb7 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/FailingRepairTest.java
@@ -351,6 +351,11 @@ public Set<SSTableReader> getBackingSSTables()
             return Collections.emptySet();
         }
 
+        public int level()
+        {
+            return 0;
+        }
+
         public TableMetadata metadata()
         {
             return null;
diff --git a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
index 5bc4768be63e..60dffb12418a 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
@@ -127,7 +127,7 @@ protected void testCompaction(int sstableCount, int partitionsPerSSTable, int ro
         try (LifecycleTransaction txn = store.getTracker().tryModify(sstables, OperationType.COMPACTION))
         {
             assert txn != null : "Cannot markCompacting all sstables";
-            new CompactionTask(store, txn, gcBefore).execute(ActiveCompactionsTracker.NOOP);
+            CompactionTask.forTesting(store, txn, gcBefore).execute();
         }
         System.out.println(String.format("%s: sstables=%d rowsper=%d colsper=%d: %d ms",
                                          this.getClass().getName(),
diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
index a72e75e4c423..d142eda289c3 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
@@ -29,7 +29,6 @@
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
@@ -40,7 +39,6 @@
 import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
 
 import static org.junit.Assert.assertFalse;
@@ -97,7 +95,7 @@ public void testParallelLeveledCompaction() throws Exception
                 {
                     public void run()
                     {
-                        nextTask.execute(ActiveCompactionsTracker.NOOP);
+                        nextTask.execute();
                     }
                 });
             }
diff --git a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
index ece90e017b4a..d3af45072043 100644
--- a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
+++ b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
@@ -56,7 +56,6 @@
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterators;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.io.util.UnbufferedDataOutputStreamPlus;
 import org.apache.cassandra.net.MessagingService;
@@ -388,7 +387,7 @@ private static void measure(Workload workload) throws Throwable
         }
 
         ColumnFamilyStore cfs = workload.getCfs();
-        ActiveCompactions active = new ActiveCompactions();
+        ActiveOperations active = new ActiveOperations();
         Set<SSTableReader> sstables = cfs.getLiveSSTables();
 
         CompactionTasks tasks = cfs.getCompactionStrategyManager()
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index 7d89c68b20e5..a41b04098d4c 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -43,7 +43,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.compaction.ActiveCompactionsTracker;
 import org.apache.cassandra.db.compaction.CompactionTasks;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -262,7 +261,7 @@ public static void compact(ColumnFamilyStore cfs, Collection<SSTableReader> ssta
         try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstables, gcBefore))
         {
             for (AbstractCompactionTask task : tasks)
-                task.execute(ActiveCompactionsTracker.NOOP);
+                task.execute();
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionInfoTest.java b/test/unit/org/apache/cassandra/db/compaction/AbstractTableOperationTest.java
similarity index 70%
rename from test/unit/org/apache/cassandra/db/compaction/CompactionInfoTest.java
rename to test/unit/org/apache/cassandra/db/compaction/AbstractTableOperationTest.java
index c0196e80ab23..792e8f047f82 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionInfoTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AbstractTableOperationTest.java
@@ -20,9 +20,7 @@
 
 import java.util.ArrayList;
 import java.util.UUID;
-import java.util.regex.Pattern;
 
-import org.junit.Assert;
 import org.junit.Test;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
@@ -31,15 +29,15 @@
 import org.apache.cassandra.schema.TableId;
 import org.assertj.core.api.Assertions;
 
-public class CompactionInfoTest extends AbstractPendingAntiCompactionTest
+public class AbstractTableOperationTest extends AbstractPendingAntiCompactionTest
 {
     @Test
-    public void testCompactionInfoToStringContainsTaskId()
+    public void testAbstractTableOperationToStringContainsTaskId()
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
         UUID expectedTaskId = UUID.randomUUID();
-        CompactionInfo compactionInfo = new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, 0, 1000, expectedTaskId, new ArrayList<>());
-        Assertions.assertThat(compactionInfo.toString())
+        AbstractTableOperation.OperationProgress task = new AbstractTableOperation.OperationProgress(cfs.metadata(), OperationType.COMPACTION, 0, 1000, expectedTaskId, new ArrayList<>());
+        Assertions.assertThat(task.toString())
                   .contains(expectedTaskId.toString());
     }
 
@@ -49,8 +47,9 @@ public void testCompactionInfoToStringFormat()
         UUID tableId = UUID.randomUUID();
         UUID taskId = UUID.randomUUID();
         ColumnFamilyStore cfs = MockSchema.newCFS(builder -> builder.id(TableId.fromUUID(tableId)));
-        CompactionInfo compactionInfo = new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, 0, 1000, taskId, new ArrayList<>());
-        Assertions.assertThat(compactionInfo.toString())
-                  .isEqualTo("Compaction(%s, 0 / 1000 bytes)@%s(mockks, mockcf1)", taskId, tableId);
+        AbstractTableOperation.OperationProgress task = new AbstractTableOperation.OperationProgress(cfs.metadata(), OperationType.COMPACTION, 0, 1000, taskId, new ArrayList<>());
+        Assertions.assertThat(task.toString())
+                  .isEqualTo("Compaction(%s, 0 / 1000 bytes)@%s(%s, %s)",
+                             taskId, tableId, cfs.getKeyspaceName(), cfs.getTableName());
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/ActiveOperationsTest.java
similarity index 80%
rename from test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
rename to test/unit/org/apache/cassandra/db/compaction/ActiveOperationsTest.java
index 4a859e7f8275..30116d3601a2 100644
--- a/test/unit/org/apache/cassandra/db/compaction/ActiveCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/ActiveOperationsTest.java
@@ -51,13 +51,14 @@
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NonThrowingCloseable;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
-public class ActiveCompactionsTest extends CQLTester
+public class ActiveOperationsTest extends CQLTester
 {
     @Test
     public void testActiveCompactionTrackingRaceWithIndexBuilder() throws Throwable
@@ -94,7 +95,7 @@ public void testActiveCompactionTrackingRaceWithIndexBuilder() throws Throwable
             });
             Future<?> f2 = es.submit(() -> {
                 Uninterruptibles.awaitUninterruptibly(trigger);
-                CompactionManager.instance.active.getCompactionsForSSTable(null, null);
+                CompactionManager.instance.active.getOperationsForSSTable(null, null);
             });
             trigger.countDown();
             FBUtilities.waitOnFutures(Arrays.asList(f1, f2));
@@ -119,12 +120,12 @@ public void testSecondaryIndexTracking() throws Throwable
         Set<SSTableReader> sstables = getCurrentColumnFamilyStore().getLiveSSTables();
         SecondaryIndexBuilder builder = idx.getBuildTaskSupport().getIndexBuildTask(getCurrentColumnFamilyStore(), Collections.singleton(idx), sstables, false);
 
-        MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+        MockTableOperations mockActiveCompactions = new MockTableOperations();
         CompactionManager.instance.submitIndexBuild(builder, mockActiveCompactions).get();
 
         assertTrue(mockActiveCompactions.finished);
-        assertNotNull(mockActiveCompactions.holder);
-        assertEquals(sstables, mockActiveCompactions.holder.getCompactionInfo().getSSTables());
+        assertNotNull(mockActiveCompactions.operation);
+        assertEquals(sstables, mockActiveCompactions.operation.getProgress().sstables());
     }
 
     @Test
@@ -142,13 +143,13 @@ public void testIndexSummaryRedistributionTracking() throws Throwable
         {
             Map<TableId, LifecycleTransaction> transactions = ImmutableMap.<TableId, LifecycleTransaction>builder().put(getCurrentColumnFamilyStore().metadata().id, txn).build();
             IndexSummaryRedistribution isr = new IndexSummaryRedistribution(transactions, 0, 1000);
-            MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+            MockTableOperations mockActiveCompactions = new MockTableOperations();
             CompactionManager.instance.runIndexSummaryRedistribution(isr, mockActiveCompactions);
             assertTrue(mockActiveCompactions.finished);
-            assertNotNull(mockActiveCompactions.holder);
+            assertNotNull(mockActiveCompactions.operation);
             // index redistribution operates over all keyspaces/tables, we always cancel them
-            assertTrue(mockActiveCompactions.holder.getCompactionInfo().getSSTables().isEmpty());
-            assertTrue(mockActiveCompactions.holder.getCompactionInfo().shouldStop((sstable) -> false));
+            assertTrue(mockActiveCompactions.operation.getProgress().sstables().isEmpty());
+            assertTrue(mockActiveCompactions.operation.shouldStop((sstable) -> false));
         }
     }
 
@@ -168,12 +169,12 @@ public void testViewBuildTracking() throws Throwable
         Token token = DatabaseDescriptor.getPartitioner().getMinimumToken();
         ViewBuilderTask vbt = new ViewBuilderTask(getCurrentColumnFamilyStore(), view, new Range<>(token, token), token, 0);
 
-        MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+        MockTableOperations mockActiveCompactions = new MockTableOperations();
         CompactionManager.instance.submitViewBuilder(vbt, mockActiveCompactions).get();
         assertTrue(mockActiveCompactions.finished);
-        assertTrue(mockActiveCompactions.holder.getCompactionInfo().getSSTables().isEmpty());
+        assertTrue(mockActiveCompactions.operation.getProgress().sstables().isEmpty());
         // this should stop for all compactions, even if it doesn't pick any sstables;
-        assertTrue(mockActiveCompactions.holder.getCompactionInfo().shouldStop((sstable) -> false));
+        assertTrue(mockActiveCompactions.operation.shouldStop((sstable) -> false));
     }
 
     @Test
@@ -190,13 +191,13 @@ public void testScrubOne() throws Throwable
         SSTableReader sstable = Iterables.getFirst(getCurrentColumnFamilyStore().getLiveSSTables(), null);
         try (LifecycleTransaction txn = getCurrentColumnFamilyStore().getTracker().tryModify(sstable, OperationType.SCRUB))
         {
-            MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+            MockTableOperations mockActiveCompactions = new MockTableOperations();
             CompactionManager.instance.scrubOne(getCurrentColumnFamilyStore(), txn, true, false, false, mockActiveCompactions);
 
             assertTrue(mockActiveCompactions.finished);
-            assertEquals(mockActiveCompactions.holder.getCompactionInfo().getSSTables(), Sets.newHashSet(sstable));
-            assertFalse(mockActiveCompactions.holder.getCompactionInfo().shouldStop((s) -> false));
-            assertTrue(mockActiveCompactions.holder.getCompactionInfo().shouldStop((s) -> true));
+            assertEquals(mockActiveCompactions.operation.getProgress().sstables(), Sets.newHashSet(sstable));
+            assertFalse(mockActiveCompactions.operation.shouldStop((s) -> false));
+            assertTrue(mockActiveCompactions.operation.shouldStop((s) -> true));
         }
 
     }
@@ -213,36 +214,33 @@ public void testVerifyOne() throws Throwable
         }
 
         SSTableReader sstable = Iterables.getFirst(getCurrentColumnFamilyStore().getLiveSSTables(), null);
-        MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+        MockTableOperations mockActiveCompactions = new MockTableOperations();
         CompactionManager.instance.verifyOne(getCurrentColumnFamilyStore(), sstable, new Verifier.Options.Builder().build(), mockActiveCompactions);
         assertTrue(mockActiveCompactions.finished);
-        assertEquals(mockActiveCompactions.holder.getCompactionInfo().getSSTables(), Sets.newHashSet(sstable));
-        assertFalse(mockActiveCompactions.holder.getCompactionInfo().shouldStop((s) -> false));
-        assertTrue(mockActiveCompactions.holder.getCompactionInfo().shouldStop((s) -> true));
+        assertEquals(mockActiveCompactions.operation.getProgress().sstables(), Sets.newHashSet(sstable));
+        assertFalse(mockActiveCompactions.operation.shouldStop((s) -> false));
+        assertTrue(mockActiveCompactions.operation.shouldStop((s) -> true));
     }
 
     @Test
     public void testSubmitCacheWrite() throws ExecutionException, InterruptedException
     {
         AutoSavingCache.Writer writer = CacheService.instance.keyCache.getWriter(100);
-        MockActiveCompactions mockActiveCompactions = new MockActiveCompactions();
+        MockTableOperations mockActiveCompactions = new MockTableOperations();
         CompactionManager.instance.submitCacheWrite(writer, mockActiveCompactions).get();
         assertTrue(mockActiveCompactions.finished);
-        assertTrue(mockActiveCompactions.holder.getCompactionInfo().getSSTables().isEmpty());
+        assertTrue(mockActiveCompactions.operation.getProgress().sstables().isEmpty());
     }
 
-    private static class MockActiveCompactions implements ActiveCompactionsTracker
+    private static class MockTableOperations implements TableOperationObserver
     {
-        public CompactionInfo.Holder holder;
+        public TableOperation operation;
         public boolean finished = false;
-        public void beginCompaction(CompactionInfo.Holder ci)
-        {
-            holder = ci;
-        }
 
-        public void finishCompaction(CompactionInfo.Holder ci)
+        public NonThrowingCloseable onOperationStart(TableOperation op)
         {
-            finished = true;
+            this.operation = op;
+            return () -> finished = true;
         }
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
new file mode 100644
index 000000000000..d6b51b4a5f00
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
@@ -0,0 +1,426 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.TreeMap;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.schema.TableMetadata;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import static org.junit.Assert.*;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.when;
+
+public class BackgroundCompactionsTest
+{
+    private final String keyspace = "ks";
+    private final String table = "table";
+
+    @Mock
+    private ColumnFamilyStore cfs;
+
+    @Mock
+    private AbstractCompactionStrategy strategy;
+
+    @Mock
+    private CompactionStrategyManager strategyManager;
+
+    @Mock
+    private CompactionLogger compactionLogger;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        DatabaseDescriptor.daemonInitialization(); // because of all the static initialization in CFS
+    }
+
+    @Before
+    public void setUp()
+    {
+        MockitoAnnotations.initMocks(this);
+
+        TableMetadata metadata = TableMetadata.builder(keyspace, table)
+                                              .addPartitionKeyColumn("pk", AsciiType.instance)
+                                              .build();
+
+        when(cfs.metadata()).thenReturn(metadata);
+        when(cfs.getKeyspaceName()).thenReturn(keyspace);
+        when(cfs.getTableName()).thenReturn(table);
+        when(cfs.getCompactionStrategyManager()).thenReturn(strategyManager);
+        when(strategyManager.compactionLogger()).thenReturn(compactionLogger);
+        when(compactionLogger.enabled()).thenReturn(true);
+    }
+
+    private CompactionAggregate mockAggregate(long key, int numCompactions, int numCompacting)
+    {
+        if (numCompacting > numCompactions)
+            throw new IllegalArgumentException("Cannot have more compactions in progress than total compactions");
+
+        CompactionAggregate ret = Mockito.mock(CompactionAggregate.class);
+        when(ret.getKey()).thenReturn(key);
+
+        List<CompactionPick> compactions = new ArrayList<>(numCompactions);
+        for (int i = 0; i < numCompactions; i++)
+            compactions.add(Mockito.mock(CompactionPick.class));
+
+        when(ret.numEstimatedCompactions()).thenReturn(numCompactions);
+        when(ret.getActive()).thenReturn(compactions);
+        when(ret.getInProgress()).thenReturn(compactions.subList(0, numCompacting));
+        when(ret.toString()).thenReturn(String.format("Key: %d, compactions: %d/%d", key, numCompactions, numCompacting));
+
+        return ret;
+    }
+
+    @Test
+    public void testNoCompaction()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+
+        CompactionStrategyStatistics statistics = backgroundCompactions.getStatistics();
+        assertNotNull(statistics);
+        assertTrue(statistics.aggregates().isEmpty());
+        assertEquals(keyspace, statistics.keyspace());
+        assertEquals(table, statistics.table());
+        assertEquals(strategy.getClass().getSimpleName(), statistics.strategy());
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testNullPendingCompactions()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setPending(null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testDuplicatePendingCompactions()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        List<CompactionAggregate> pending = new ArrayList<>(0);
+        for (int i = 0; i < 5; i++)
+            pending.add(mockAggregate(1, 1, 0));
+
+        // All five aggregates share key 1; setPending is expected to reject aggregates with duplicate keys
+        backgroundCompactions.setPending(pending);
+    }
+
+    @Test
+    public void testPendingCompactions()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        List<CompactionAggregate> pending = new ArrayList<>(0);
+        for (int i = 0; i < 5; i++)
+            pending.add(mockAggregate(i, 1, 0));
+
+        backgroundCompactions.setPending(pending);
+
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+
+        assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        // Remove the previous pending compactions, none should be kept since they don't have in progress compactions
+        backgroundCompactions.setPending(ImmutableList.of());
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+    }
+
+    @Test
+    public void testCompactionFromPending()
+    {
+        // Add some pending compactions, and then submit one of them, the most common case
+
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        List<CompactionAggregate> pending = new ArrayList<>(0);
+        for (int i = 0; i < 5; i++)
+            pending.add(mockAggregate(i, 1, 0));
+
+        backgroundCompactions.setPending(pending);
+
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+
+        assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        UUID uuid = UUID.randomUUID();
+        CompactionAggregate aggregate = pending.get(0);
+        CompactionPick compaction = Mockito.mock(CompactionPick.class);
+        when(aggregate.getSelected()).thenReturn(compaction);
+        when(aggregate.getMatching(any(TreeMap.class))).thenReturn(aggregate);
+        when(aggregate.getActive()).thenReturn(ImmutableList.of(compaction)); // ensure the aggregate already has the compaction
+
+        backgroundCompactions.setSubmitted(uuid, aggregate);
+
+        Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+
+        when(pending.get(0).numEstimatedCompactions()).thenReturn(0);
+        assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
+        when(progress.operationId()).thenReturn(uuid);
+
+        backgroundCompactions.setInProgress(progress);
+        Mockito.verify(compaction, times(1)).setProgress(eq(progress));
+
+        assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        // Remove the previous pending compactions, the one submitted should be kept
+        backgroundCompactions.setPending(ImmutableList.of());
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(1, backgroundCompactions.getTotalCompactions());
+
+        backgroundCompactions.setCompleted(uuid);
+
+        Mockito.verify(compaction, times(1)).setCompleted();
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+    }
+
+    @Test
+    public void testCompactionWithMatchingPending()
+    {
+        // Add some pending compactions, and then submit a compaction from an aggregate that is not in the pending
+        // but for which there is a matching aggregate, this would happen if two threads raced and created equivalent
+        // but not identical pending aggregates
+
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        List<CompactionAggregate> pending = new ArrayList<>(0);
+        for (int i = 0; i < 5; i++)
+            pending.add(mockAggregate(i, 1, 0));
+
+        backgroundCompactions.setPending(pending);
+
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+
+        assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        UUID uuid = UUID.randomUUID();
+        CompactionAggregate aggregate = mockAggregate(0, 1, 0);
+        CompactionPick compaction = Mockito.mock(CompactionPick.class);
+        when(aggregate.getSelected()).thenReturn(compaction);
+        when(aggregate.getMatching(any(TreeMap.class))).thenReturn(pending.get(0));
+        when(pending.get(0).getActive()).thenReturn(ImmutableList.of()); // ensure the matching aggregate does not have the compaction
+        when(pending.get(0).withAdditionalCompactions(any(Collection.class))).thenReturn(pending.get(0));
+
+        backgroundCompactions.setSubmitted(uuid, aggregate);
+
+        Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+
+        when(pending.get(0).numEstimatedCompactions()).thenReturn(0);
+        assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
+        when(progress.operationId()).thenReturn(uuid);
+
+        backgroundCompactions.setInProgress(progress);
+        Mockito.verify(compaction, times(1)).setProgress(eq(progress));
+
+        assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        // Remove the previous pending compactions, the one submitted should be kept
+        backgroundCompactions.setPending(ImmutableList.of());
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(1, backgroundCompactions.getTotalCompactions());
+
+        backgroundCompactions.setCompleted(uuid);
+
+        Mockito.verify(compaction, times(1)).setCompleted();
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+    }
+
+    @Test
+    public void testCompactionNotInPending()
+    {
+        // Submit a compaction that is not part of a pending aggregate, this normally happens for tombstone compactions,
+        // in this case the pending aggregates are empty but a tombstone compaction is submitted
+
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        backgroundCompactions.setPending(ImmutableList.of());
+
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(0));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+
+        UUID uuid = UUID.randomUUID();
+        CompactionAggregate aggregate = mockAggregate(-1, 0, 0);
+        CompactionPick compaction = Mockito.mock(CompactionPick.class);
+        when(aggregate.getSelected()).thenReturn(compaction);
+        when(aggregate.getMatching(any(TreeMap.class))).thenReturn(null);
+
+        backgroundCompactions.setSubmitted(uuid, aggregate);
+
+        Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(1, backgroundCompactions.getTotalCompactions());
+
+        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
+        when(progress.operationId()).thenReturn(uuid);
+
+        backgroundCompactions.setInProgress(progress);
+        Mockito.verify(compaction, times(1)).setProgress(eq(progress));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(1, backgroundCompactions.getTotalCompactions());
+
+        // Set an empty list of pending compactions again, the one submitted should be kept
+        backgroundCompactions.setPending(ImmutableList.of());
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(1, backgroundCompactions.getTotalCompactions());
+
+        backgroundCompactions.setCompleted(uuid);
+
+        Mockito.verify(compaction, times(1)).setCompleted();
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+
+        assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(0, backgroundCompactions.getTotalCompactions());
+    }
+
+    @Test
+    public void testReplacePending()
+    {
+        // Add some pending aggregates, then replace them with aggregates with different keys, verify that only
+        // those with compactions in progress are kept; the keys of the old and new aggregates partially overlap
+
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+
+        List<CompactionAggregate> pending = new ArrayList<>(0);
+        int key = 0;
+        for (int i = 0; i < 5; i++)
+        {
+            pending.add(mockAggregate(key++, 1, 0)); // these aggregates have no compactions in progress
+        }
+
+        // these aggregates each have one compaction in progress
+        for (int i = 0; i < 5; i++)
+        {
+            CompactionAggregate aggregateWithComps = mockAggregate(key++, 1, 1);
+            when(aggregateWithComps.withOnlyTheseCompactions(any(Collection.class))).thenReturn(aggregateWithComps);
+            when(aggregateWithComps.getMatching(any(TreeMap.class))).thenCallRealMethod();
+            pending.add(aggregateWithComps);
+        }
+
+        backgroundCompactions.setPending(pending);
+
+        assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
+
+        pending.clear();
+
+        key -= 2; // overlap the old and new aggregates by 2 keys
+
+        for (int i = 0; i < 5; i++)
+        {
+            // those that overlap the key need to report 2 compactions because they take the one from the old aggregate
+            // when addCompacting is called
+            CompactionAggregate aggregate = mockAggregate(key++, i < 2 ? 2 : 1, 0);
+            when(aggregate.withAdditionalCompactions(any(Collection.class))).thenReturn(aggregate);
+            pending.add(aggregate);
+        }
+
+        backgroundCompactions.setPending(pending);
+
+        // the extra compactions are those from the old aggregates with a compaction, regardless of whether
+        // the keys overlapped or not (when the keys overlap, the new one has a compaction added; when they do
+        // not, the old aggregate is used)
+        assertEquals(pending.size() + 5, backgroundCompactions.getEstimatedRemainingTasks());
+        assertEquals(pending.size() + 5, backgroundCompactions.getTotalCompactions());
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSetSubmittedNoId()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setSubmitted(null, Mockito.mock(CompactionAggregate.class));
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSetSubmittedNoAggregate()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setSubmitted(UUID.randomUUID(), null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSetSubmittedDuplicateId()
+    {
+        UUID uuid = UUID.randomUUID();
+        CompactionAggregate aggregate = mockAggregate(1, 1, 0);
+        when(aggregate.getSelected()).thenReturn(CompactionPick.EMPTY);
+
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setSubmitted(uuid, aggregate);
+        backgroundCompactions.setSubmitted(uuid, aggregate);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSetInProgressNoProgress()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setInProgress(null);
+    }
+
+    @Test(expected = IllegalArgumentException.class)
+    public void testSetCompletedNoId()
+    {
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        backgroundCompactions.setCompleted(null);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
index 551fef16bb44..8426be121d63 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.io.Closeable;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
@@ -35,7 +36,6 @@
 import com.google.common.util.concurrent.Uninterruptibles;
 import org.junit.Test;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -58,6 +58,8 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.PreviewKind;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NonThrowingCloseable;
+import org.apache.cassandra.utils.Throwables;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -82,9 +84,9 @@ public void cancelTest() throws InterruptedException
         {
             tct.start();
 
-            List<CompactionInfo.Holder> activeCompactions = getActiveCompactionsForTable(cfs);
+            List<TableOperation> activeCompactions = getActiveCompactionsForTable(cfs);
             assertEquals(1, activeCompactions.size());
-            assertEquals(activeCompactions.get(0).getCompactionInfo().getSSTables(), toMarkCompacting);
+            assertEquals(activeCompactions.get(0).getProgress().sstables(), toMarkCompacting);
             // predicate requires the non-compacting sstables, should not cancel the one currently compacting:
             cfs.runWithCompactionsDisabled(() -> null, (sstable) -> !toMarkCompacting.contains(sstable), false, false, true);
             assertEquals(1, activeCompactions.size());
@@ -128,12 +130,12 @@ public void multipleCompactionsCancelTest() throws InterruptedException
         {
             tcts.forEach(TestCompactionTask::start);
 
-            List<CompactionInfo.Holder> activeCompactions = getActiveCompactionsForTable(cfs);
+            List<TableOperation> activeCompactions = getActiveCompactionsForTable(cfs);
             assertEquals(2, activeCompactions.size());
 
             Set<Set<SSTableReader>> compactingSSTables = new HashSet<>();
-            compactingSSTables.add(activeCompactions.get(0).getCompactionInfo().getSSTables());
-            compactingSSTables.add(activeCompactions.get(1).getCompactionInfo().getSSTables());
+            compactingSSTables.add(activeCompactions.get(0).getProgress().sstables());
+            compactingSSTables.add(activeCompactions.get(1).getProgress().sstables());
             Set<Set<SSTableReader>> expectedSSTables = new HashSet<>();
             expectedSSTables.add(new HashSet<>(sstables.subList(0, 3)));
             expectedSSTables.add(new HashSet<>(sstables.subList(6, 9)));
@@ -141,7 +143,7 @@ public void multipleCompactionsCancelTest() throws InterruptedException
 
             cfs.runWithCompactionsDisabled(() -> null, (sstable) -> false, false, false, true);
             assertEquals(2, activeCompactions.size());
-            assertTrue(activeCompactions.stream().noneMatch(CompactionInfo.Holder::isStopRequested));
+            assertTrue(activeCompactions.stream().noneMatch(TableOperation::isStopRequested));
 
             CountDownLatch cdl = new CountDownLatch(1);
             // start a compaction which only needs the sstables where first token is > 50 - these are the sstables compacted by tcts.get(1)
@@ -150,15 +152,15 @@ public void multipleCompactionsCancelTest() throws InterruptedException
             activeCompactions = getActiveCompactionsForTable(cfs);
             assertEquals(2, activeCompactions.size());
             Thread.sleep(500);
-            for (CompactionInfo.Holder holder : activeCompactions)
+            for (TableOperation compaction : activeCompactions)
             {
-                if (holder.getCompactionInfo().getSSTables().containsAll(sstables.subList(6, 9)))
-                    assertTrue(holder.isStopRequested());
+                if (compaction.getProgress().sstables().containsAll(sstables.subList(6, 9)))
+                    assertTrue(compaction.isStopRequested());
                 else
-                    assertFalse(holder.isStopRequested());
+                    assertFalse(compaction.isStopRequested());
             }
             tcts.get(1).abort();
-            assertEquals(1, CompactionManager.instance.active.getCompactions().size());
+            assertEquals(1, CompactionManager.instance.active.getTableOperations().size());
             cdl.await();
             t.join();
         }
@@ -186,7 +188,7 @@ public void testSubrangeCompaction() throws InterruptedException
         {
             tcts.forEach(TestCompactionTask::start);
 
-            List<CompactionInfo.Holder> activeCompactions = getActiveCompactionsForTable(cfs);
+            List<TableOperation> activeCompactions = getActiveCompactionsForTable(cfs);
             assertEquals(4, activeCompactions.size());
             Range<Token> range = new Range<>(token(0), token(49));
             Thread t = new Thread(() -> {
@@ -205,17 +207,17 @@ public void testSubrangeCompaction() throws InterruptedException
             Thread.sleep(500);
             assertEquals(4, getActiveCompactionsForTable(cfs).size());
             List<TestCompactionTask> toAbort = new ArrayList<>();
-            for (CompactionInfo.Holder holder : getActiveCompactionsForTable(cfs))
+            for (TableOperation compaction : getActiveCompactionsForTable(cfs))
             {
-                if (holder.getCompactionInfo().getSSTables().stream().anyMatch(sstable -> sstable.intersects(Collections.singleton(range))))
+                if (compaction.getProgress().sstables().stream().anyMatch(sstable -> sstable.intersects(Collections.singleton(range))))
                 {
-                    assertTrue(holder.isStopRequested());
+                    assertTrue(compaction.isStopRequested());
                     for (TestCompactionTask tct : tcts)
-                        if (tct.sstables.equals(holder.getCompactionInfo().getSSTables()))
+                        if (tct.sstables.equals(compaction.getProgress().sstables()))
                             toAbort.add(tct);
                 }
                 else
-                    assertFalse(holder.isStopRequested());
+                    assertFalse(compaction.isStopRequested());
             }
             assertEquals(2, toAbort.size());
             toAbort.forEach(TestCompactionTask::abort);
@@ -250,7 +252,7 @@ public void testAnticompaction() throws InterruptedException, ExecutionException
         {
             tcts.forEach(TestCompactionTask::start);
             nonAffectedTcts.forEach(TestCompactionTask::start);
-            List<CompactionInfo.Holder> activeCompactions = getActiveCompactionsForTable(cfs);
+            List<TableOperation> activeCompactions = getActiveCompactionsForTable(cfs);
             assertEquals(5, activeCompactions.size());
             // make sure that sstables are fully contained so that the metadata gets mutated
             Range<Token> range = new Range<>(token(-1), token(49));
@@ -265,17 +267,17 @@ public void testAnticompaction() throws InterruptedException, ExecutionException
             Future<?> fut = pac.run();
             Thread.sleep(600);
             List<TestCompactionTask> toAbort = new ArrayList<>();
-            for (CompactionInfo.Holder holder : getActiveCompactionsForTable(cfs))
+            for (TableOperation compaction : getActiveCompactionsForTable(cfs))
             {
-                if (holder.getCompactionInfo().getSSTables().stream().anyMatch(sstable -> sstable.intersects(Collections.singleton(range)) && !sstable.isRepaired() && !sstable.isPendingRepair()))
+                if (compaction.getProgress().sstables().stream().anyMatch(sstable -> sstable.intersects(Collections.singleton(range)) && !sstable.isRepaired() && !sstable.isPendingRepair()))
                 {
-                    assertTrue(holder.isStopRequested());
+                    assertTrue(compaction.isStopRequested());
                     for (TestCompactionTask tct : tcts)
-                        if (tct.sstables.equals(holder.getCompactionInfo().getSSTables()))
+                        if (tct.sstables.equals(compaction.getProgress().sstables()))
                             toAbort.add(tct);
                 }
                 else
-                    assertFalse(holder.isStopRequested());
+                    assertFalse(compaction.isStopRequested());
             }
             assertEquals(2, toAbort.size());
             toAbort.forEach(TestCompactionTask::abort);
@@ -324,11 +326,11 @@ public boolean hasNext()
         indexBuildStarted.await();
         assertEquals(1, getActiveCompactionsForTable(cfs).size());
         boolean foundCompaction = false;
-        for (CompactionInfo.Holder holder : getActiveCompactionsForTable(cfs))
+        for (TableOperation compaction : getActiveCompactionsForTable(cfs))
         {
-            if (holder.getCompactionInfo().getSSTables().equals(new HashSet<>(sstables)))
+            if (compaction.getProgress().sstables().equals(new HashSet<>(sstables)))
             {
-                assertFalse(holder.isStopRequested());
+                assertFalse(compaction.isStopRequested());
                 foundCompaction = true;
             }
         }
@@ -338,11 +340,11 @@ public boolean hasNext()
         compactionsStopped.await();
         assertEquals(1, getActiveCompactionsForTable(cfs).size());
         foundCompaction = false;
-        for (CompactionInfo.Holder holder : getActiveCompactionsForTable(cfs))
+        for (TableOperation compaction : getActiveCompactionsForTable(cfs))
         {
-            if (holder.getCompactionInfo().getSSTables().equals(new HashSet<>(sstables)))
+            if (compaction.getProgress().sstables().equals(new HashSet<>(sstables)))
             {
-                assertTrue(holder.isStopRequested());
+                assertTrue(compaction.isStopRequested());
                 foundCompaction = true;
             }
         }
@@ -385,6 +387,7 @@ private static class TestCompactionTask
         private CompactionController controller;
         private CompactionIterator ci;
         private List<ISSTableScanner> scanners;
+        private Closeable closeable;
 
         public TestCompactionTask(ColumnFamilyStore cfs, Set<SSTableReader> sstables)
         {
@@ -399,7 +402,8 @@ public void start()
             assertNotNull(txn);
             controller = new CompactionController(cfs, sstables, Integer.MIN_VALUE);
             ci = new CompactionIterator(txn.opType(), scanners, controller, FBUtilities.nowInSeconds(), UUID.randomUUID());
-            CompactionManager.instance.active.beginCompaction(ci);
+            TableOperation op = ci.getOperation();
+            closeable = CompactionManager.instance.active.onOperationStart(op);
         }
 
         public void abort()
@@ -412,8 +416,8 @@ public void abort()
                 txn.abort();
             if (scanners != null)
                 scanners.forEach(ISSTableScanner::close);
-            CompactionManager.instance.active.finishCompaction(ci);
-
+            if (closeable != null)
+                Throwables.maybeFail(Throwables.close(null, closeable));
         }
     }
 
@@ -498,14 +502,14 @@ public void testStandardCompactionTaskCancellation() throws Throwable
 
         try
         {
-            ct.execute(new ActiveCompactions()
+            ct.execute(new ActiveOperations()
             {
                 @Override
-                public void beginCompaction(CompactionInfo.Holder ci)
+                public NonThrowingCloseable onOperationStart(TableOperation op)
                 {
                     waitForBeginCompaction.countDown();
                     Uninterruptibles.awaitUninterruptibly(waitForStart);
-                    super.beginCompaction(ci);
+                    return super.onOperationStart(op);
                 }
             });
             fail("execute should throw CompactionInterruptedException");
@@ -521,11 +525,11 @@ public void beginCompaction(CompactionInfo.Holder ci)
         }
     }
 
-    private List<CompactionInfo.Holder> getActiveCompactionsForTable(ColumnFamilyStore cfs)
+    private List<TableOperation> getActiveCompactionsForTable(ColumnFamilyStore cfs)
     {
-        return CompactionManager.instance.active.getCompactions()
+        return CompactionManager.instance.active.getTableOperations()
                                                 .stream()
-                                                .filter(holder -> holder.getCompactionInfo().getTable().orElse("unknown").equalsIgnoreCase(cfs.name))
+                                                .filter(operation -> operation.getProgress().table().orElse("unknown").equalsIgnoreCase(cfs.name))
                                                 .collect(Collectors.toList());
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
index ff3f210ec2ad..3dd25542683c 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionIteratorTest.java
@@ -334,12 +334,13 @@ public void transformTest()
                                                               Lists.transform(content, x -> new Scanner(x)),
                                                               controller, NOW, null))
         {
+            TableOperation op = iter.getOperation();
             assertTrue(iter.hasNext());
             UnfilteredRowIterator rows = iter.next();
             assertTrue(rows.hasNext());
             assertNotNull(rows.next());
 
-            iter.stop();
+            op.stop();
             try
             {
                 // Will call Transformation#applyToRow
@@ -367,7 +368,8 @@ public void transformPartitionTest()
                                                               Lists.transform(content, x -> new Scanner(x)),
                                                               controller, NOW, null))
         {
-            iter.stop();
+            TableOperation op = iter.getOperation();
+            op.stop();
             try
             {
                 // Will call Transformation#applyToPartition
@@ -455,6 +457,12 @@ public Set<SSTableReader> getBackingSSTables()
         {
             return ImmutableSet.of();
         }
+
+        @Override
+        public int level()
+        {
+            return 0;
+        }
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
index 9f2bc2ea75fb..3a804b63bfbe 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
@@ -252,7 +252,7 @@ public void cleanupCompactionFinalized()
         Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
-        compactionTask.execute(ActiveCompactionsTracker.NOOP);
+        compactionTask.execute();
 
         Assert.assertTrue(repairedContains(sstable));
         Assert.assertFalse(unrepairedContains(sstable));
@@ -293,7 +293,7 @@ public void cleanupCompactionFailed()
         Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
-        compactionTask.execute(ActiveCompactionsTracker.NOOP);
+        compactionTask.execute();
 
         Assert.assertFalse(repairedContains(sstable));
         Assert.assertTrue(unrepairedContains(sstable));
@@ -330,7 +330,7 @@ public void finalizedSessionTransientCleanup()
         Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
-        compactionTask.execute(ActiveCompactionsTracker.NOOP);
+        compactionTask.execute();
 
         Assert.assertTrue(cfs.getLiveSSTables().isEmpty());
         Assert.assertFalse(hasPendingStrategiesFor(repairID));
@@ -361,7 +361,7 @@ public void failedSessionTransientCleanup()
         Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
-        compactionTask.execute(ActiveCompactionsTracker.NOOP);
+        compactionTask.execute();
 
         Assert.assertFalse(cfs.getLiveSSTables().isEmpty());
         Assert.assertFalse(hasPendingStrategiesFor(repairID));
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
index 7856500bfea9..3ef18b3f69dc 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
@@ -161,6 +161,7 @@ public void testSSTablesAssignedToCorrectCompactionStrategy(int numSSTables, int
         logger.debug("Boundaries for {} disks is {}", numDisks, Arrays.toString(boundaries));
         CompactionStrategyManager csm = new CompactionStrategyManager(cfs, mockBoundaryManager::getBoundaries,
                                                                       true);
+        csm.reload(cfs.metadata().params.compaction);
 
         // Check that SSTables are assigned to the correct Compaction Strategy
         for (SSTableReader reader : cfs.getLiveSSTables())
@@ -364,6 +365,7 @@ public void groupSSTables() throws Exception
                                                        10, 10);
 
         CompactionStrategyManager csm = new CompactionStrategyManager(cfs, () -> boundaries, true);
+        csm.reload(cfs.metadata().params.compaction);
 
         List<GroupedSSTableContainer> grouped = csm.groupSSTables(Iterables.concat( transientRepairs, pendingRepair, repaired, unrepaired));
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
new file mode 100644
index 000000000000..aad554d39e51
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
@@ -0,0 +1,807 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.SSTableSet;
+import org.apache.cassandra.db.lifecycle.Tracker;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.dht.Bounds;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+
+import static org.apache.cassandra.db.compaction.LeveledManifest.MAX_COMPACTING_L0;
+import static org.junit.Assert.*;
+import static org.mockito.ArgumentMatchers.anyIterable;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.when;
+
+/**
+ * Test for the compaction statistics for all strategies that support them.
+ */
+public class CompactionStrategyStatisticsTest
+{
+    private static final double epsilon = 0.00000001;
+    private static final Random random = new Random(87689624525L);
+    private static final AtomicInteger generation = new AtomicInteger(1);
+
+    private final String keyspace = "ks";
+    private final String table = "table";
+    private final int minCompactionThreshold = 4;
+    private final int maxCompactionThreshold = 32;
+    private final long minSSTableSize = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE;
+
+    private long repairedAt;
+
+    @Mock
+    private ColumnFamilyStore cfs;
+
+    @Mock
+    private Tracker dataTracker;
+
+    @Mock
+    private CompactionStrategyManager strategyManager;
+
+    private CompactionLogger compactionLogger;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        DatabaseDescriptor.daemonInitialization(); // because of all the static initialization in CFS
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+    }
+
+    @Before
+    public void setUp()
+    {
+        MockitoAnnotations.initMocks(this);
+
+        TableMetadata metadata = TableMetadata.builder(keyspace, table)
+                                              .addPartitionKeyColumn("pk", AsciiType.instance)
+                                              .build();
+        repairedAt = System.currentTimeMillis();
+
+        when(cfs.getMinimumCompactionThreshold()).thenReturn(minCompactionThreshold);
+        when(cfs.getMaximumCompactionThreshold()).thenReturn(maxCompactionThreshold);
+        when(cfs.metadata()).thenReturn(metadata);
+        when(cfs.getKeyspaceName()).thenReturn(keyspace);
+        when(cfs.getTableName()).thenReturn(table);
+        when(cfs.getTracker()).thenReturn(dataTracker);
+        when(cfs.getPartitioner()).thenReturn(DatabaseDescriptor.getPartitioner());
+        when(cfs.getCompactionStrategyManager()).thenReturn(strategyManager);
+
+        // use a real compaction logger to execute that code too even though we don't really check
+        // the content of the files, at least we cover the code. The files will be overwritten next
+        // time the test is run or by a gradle clean task, so they will not grow indefinitely
+        compactionLogger = new CompactionLogger(cfs, strategyManager);
+        compactionLogger.enable();
+        when(strategyManager.compactionLogger()).thenReturn(compactionLogger);
+    }
+
+    private void addSizeTieredOptions(Map<String, String> options)
+    {
+        options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, Long.toString(minSSTableSize));
+        options.put(SizeTieredCompactionStrategyOptions.BUCKET_LOW_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_LOW));
+        options.put(SizeTieredCompactionStrategyOptions.BUCKET_HIGH_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_HIGH));
+    }
+
+    private void addTimeTieredOptions(Map<String, String> options)
+    {
+        addSizeTieredOptions(options);
+
+        options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, TimeUnit.MILLISECONDS.toString());
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "30");
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MINUTES");
+        options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, Long.toString(Long.MAX_VALUE)); // disable check for expired sstables
+    }
+
+    private void addLeveledOptions(Map<String, String> options, long maxSSTableSizeBytes)
+    {
+        addSizeTieredOptions(options);
+
+        options.put(LeveledCompactionStrategy.SSTABLE_SIZE_OPTION, Long.toString(maxSSTableSizeBytes >> 20)); // Bytes to MB
+        options.put(LeveledCompactionStrategy.LEVEL_FANOUT_SIZE_OPTION, "10");
+    }
+
+    private SSTableReader mockSSTable(int level, long bytesOnDisk, long timestamp, double hotness, DecoratedKey first, DecoratedKey last)
+    {
+        SSTableReader ret = Mockito.mock(SSTableReader.class);
+
+        when(ret.bytesOnDisk()).thenReturn(bytesOnDisk);
+        when(ret.onDiskLength()).thenReturn(bytesOnDisk);
+        when(ret.uncompressedLength()).thenReturn(bytesOnDisk); // let's assume no compression
+        when(ret.hotness()).thenReturn(hotness);
+        when(ret.getSSTableLevel()).thenReturn(level);
+        when(ret.getMaxTimestamp()).thenReturn(timestamp);
+        when(ret.getMinTimestamp()).thenReturn(timestamp);
+        when(ret.getFirst()).thenReturn(first);
+        when(ret.getLast()).thenReturn(last);
+        when(ret.isMarkedSuspect()).thenReturn(false);
+        when(ret.isRepaired()).thenReturn(true);
+        when(ret.getRepairedAt()).thenReturn(repairedAt);
+        when(ret.getGeneration()).thenReturn(generation.getAndIncrement());
+        when(ret.toString()).thenReturn(String.format("Bytes on disk: %s, level %d, hotness %f, timestamp %d, first %s, last %s",
+                                                      FBUtilities.prettyPrintMemory(bytesOnDisk), level, hotness, timestamp, first, last));
+
+        return ret;
+    }
+
+    private List<SSTableReader> mockSSTables(int numSSTables, long bytesOnDisk, double hotness, long timestamp)
+    {
+        IPartitioner partitioner = cfs.getPartitioner();
+        DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+        DecoratedKey last = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+
+        List<SSTableReader> sstables = new ArrayList<>();
+        for (int i = 0; i < numSSTables; i++)
+        {
+            long b = (long)(bytesOnDisk * 0.8 + bytesOnDisk * 0.05 * random.nextDouble()); // leave 5% variability
+            double h = hotness * 0.8 + hotness * 0.05 * random.nextDouble(); // leave 5% variability
+            sstables.add(mockSSTable(0, b, timestamp, h, first, last));
+        }
+
+        return sstables;
+    }
+
+    private List<SSTableReader> mockNonOverlappingSSTables(int numSSTables, int level, long bytesOnDisk)
+    {
+        IPartitioner partitioner = cfs.getPartitioner(); // mocked same as DD.getPartitioner()
+        if (!partitioner.splitter().isPresent())
+            fail(String.format("Cannot split ranges with current partitioner %s", partitioner));
+
+        Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken());
+        Splitter.WeightedRange weightedRange = new Splitter.WeightedRange(1.0, range);
+        Splitter splitter = partitioner.splitter().get();
+        List<Token> boundaries = splitter.splitOwnedRanges(numSSTables,
+                                                           ImmutableList.of(weightedRange),
+                                                           false);
+        assertEquals(numSSTables, boundaries.size());
+        boundaries.add(0, partitioner.getMinimumToken());
+        ByteBuffer emptyBuffer = ByteBuffer.allocate(0);
+
+        long timestamp = System.currentTimeMillis();
+        List<SSTableReader> sstables = new ArrayList<>(numSSTables);
+        for (int i = 0; i < numSSTables; i++)
+        {
+            DecoratedKey first = new BufferDecoratedKey(boundaries.get(i).increaseSlightly(), emptyBuffer);
+            DecoratedKey last =  new BufferDecoratedKey(boundaries.get(i+1), emptyBuffer);
+            sstables.add(mockSSTable(level, bytesOnDisk, timestamp, 0., first, last));
+
+            timestamp+=10;
+        }
+
+        return sstables;
+    }
+
+    /**
+     * @return the sum of the uncompressed lengths of the given sstables, zero if there are none
+     */
+    private long totUncompressedLength(Collection<SSTableReader> sstables)
+    {
+        // LongStream#sum yields 0 for an empty collection
+        return sstables.stream().mapToLong(SSTableReader::uncompressedLength).sum();
+    }
+
+    private double totHotness(Collection<SSTableReader> sstables)
+    {
+        // plain left-to-right accumulation of the hotness reported by each sstable
+        double total = 0;
+        for (SSTableReader reader : sstables)
+            total += reader.hotness();
+        return total;
+    }
+
+    /**
+     * Creates 5 STCS buckets with a single compaction pick (<= max threshold tables) and
+     * increasing hotness so that the highest test bucket will be compacted first.
+     */
+    @Test
+    public void testSizeTieredCompactionStrategy_fiveBucketsOnePick()
+    {
+        Map<String, String> options = new HashMap<>();
+        addSizeTieredOptions(options);
+
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+
+        final int numCompactions = 5;
+        long minSize = minSSTableSize;
+        double hotness = 1000;
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numCompactions);
+        for (int i = 0; i < numCompactions; i++)
+        {
+            List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold,
+                                                        minSize,
+                                                        hotness,
+                                                        System.currentTimeMillis());
+            testBuckets.add(sstables);
+
+            minSize *= 10;
+            hotness *= 2;
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+    /**
+     * Creates a single STCS bucket with enough sstables to fill 5 picks.
+     */
+    @Test
+    public void testSizeTieredCompactionStrategy_oneBucketFivePicks()
+    {
+        Map<String, String> options = new HashMap<>();
+        addSizeTieredOptions(options);
+
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+
+        final int numCompactions = 5;
+        long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 2;
+        double hotness = 1000;
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numCompactions);
+        for (int i = 0; i < numCompactions; i++)
+        {
+            List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold,
+                                                        size,
+                                                        hotness,
+                                                        System.currentTimeMillis());
+            testBuckets.add(sstables);
+            hotness *= 2;
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+    /**
+     * Creates 3 STCS buckets with enough sstables to have 2 compactions per bucket and increasing
+     * hotness so that the highest test buckets will be compacted first.
+     */
+    @Test
+    public void testSizeTieredCompactionStrategy_threeBucketsTwoPicks()
+    {
+        Map<String, String> options = new HashMap<>();
+        addSizeTieredOptions(options);
+
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+
+        long minSize = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE;
+        double hotness = 1000;
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(3 * 2); // one entry per (bucket, pick) pair
+        for (int i = 0; i < 3; i++) // STCS buckets
+        {
+            for (int j = 0; j < 2; j++) // picks
+            {
+                List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold,
+                                                            minSize,
+                                                            hotness,
+                                                            System.currentTimeMillis());
+                testBuckets.add(sstables);
+                hotness *= 2; // every later pick is hotter than all the previous ones
+            }
+
+            minSize *= 10; // next bucket holds sstables that are ten times larger
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+
+    /**
+     * Creates 5 TWCS buckets with increasing timestamp so that the higher buckets will be compacted first.
+     * Each bucket only has a single compaction pick (<= max threshold tables).
+     */
+    @Test
+    public void testTimeWindowCompactionStrategy_fiveBucketsOnePick()
+    {
+        Map<String, String> options = new HashMap<>();
+        addTimeTieredOptions(options);
+
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+
+        final int numCompactions = 5;
+        long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 5;
+        double hotness = 1000;
+        long timestamp = System.currentTimeMillis() - TimeUnit.HOURS.toMillis(10); // 10 hours ago
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numCompactions);
+        for (int i = 0; i < numCompactions; i++)
+        {
+            List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold, size, hotness, timestamp);
+            testBuckets.add(sstables);
+
+            timestamp += TimeUnit.HOURS.toMillis(2); // 2 hour step against the 30 minute window set by addTimeTieredOptions
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+    /**
+     * Creates a single TWCS bucket with enough sstables to fill 5 picks.
+     */
+    @Test
+    public void testTimeWindowCompactionStrategy_oneBucketFivePicks()
+    {
+        Map<String, String> options = new HashMap<>();
+        addTimeTieredOptions(options);
+
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+
+        final int numCompactions = 5;
+        long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 5;
+        double hotness = 100;
+        long timestamp = System.currentTimeMillis(); // same timestamp for every sstable: a single bucket
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numCompactions);
+        for (int i = 0; i < numCompactions; i++)
+        {
+            List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold, size, hotness, timestamp);
+            testBuckets.add(sstables);
+
+            hotness *= 2; // hottest tables should be picked first because TWCS uses STCS in the latest bucket
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+    /**
+     * Creates 3 TWCS buckets with enough sstables to have 2 compactions per bucket.
+     */
+    @Test
+    public void testTimeWindowCompactionStrategy_threeBucketsTwoPicks()
+    {
+        Map<String, String> options = new HashMap<>();
+        addTimeTieredOptions(options);
+
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+
+        long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 10;
+        double hotness = 1000;
+        long timestamp = System.currentTimeMillis() - TimeUnit.HOURS.toMillis(10); // 10 hours ago
+
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(3 * 2);
+        for (int i = 0; i < 3; i++)
+        {
+            for (int j = 0; j < 2; j++)
+            {
+                List<SSTableReader> sstables = mockSSTables(maxCompactionThreshold, size, hotness, timestamp);
+                testBuckets.add(sstables);
+
+                hotness *= 2; // hottest tables should be picked first in the newest bucket because of STCS
+                size -= SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE; // smaller sstables are picked first in other TWCS buckets
+            }
+
+            size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 10;
+            timestamp += TimeUnit.HOURS.toMillis(2);
+        }
+
+        testCompactionStatistics(testBuckets, strategy);
+    }
+
+    /**
+     * A utility method for determining the overlapping sstables similarly to what {@link LeveledManifest} does
+     * when selecting sstables from the next level that overlap with a candidate of the previous level.
+     *
+     * @param sstable the sstable from the previous level
+     * @param candidates the candidate sstables from the next level
+     *
+     * @return a set containing the sstable passed in and all the sstables that overlap from the candidates
+     */
+    private static Set<SSTableReader> overlapping(SSTableReader sstable, List<SSTableReader> candidates)
+    {
+        Map<SSTableReader, Bounds<Token>> candidatesWithBounds = LeveledManifest.genBounds(candidates);
+        return Sets.union(Collections.singleton(sstable), LeveledManifest.overlappingWithBounds(sstable, candidatesWithBounds));
+    }
+
+    /**
+     * Creates 3 LCS levels (L0, L1 and L2). Checks that L1 and L2 are able to compact in parallel but L0 gets
+     * blocked by the L1 compaction. Once the L1 and L2 compactions have finished, the L0 compaction can proceed.
+     */
+    @Test
+    public void testLeveledCompactionStrategy_threeLevels()
+    {
+        Map<String, String> options = new HashMap<>();
+        long maxSSTableSize = 160L << 20; // 160 MB in bytes
+        addLeveledOptions(options, maxSSTableSize);
+
+        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(cfs, options);
+
+        final int numLevels = 3;
+        List<List<SSTableReader>> ssTablesByLevel = new ArrayList<>(numLevels);
+        for (int i = 0; i < numLevels; i++)
+        {
+            // level zero maximum size is 4 times maxSSTableSize, and for other levels it is
+            // the fan-out size (10) to the power of the level number, times maxSSTableSize.
+            long maxLevelSize = (long) ((i == 0 ? 4 : Math.pow(10, i)) * maxSSTableSize);
+
+            // we add one to ensure the score will be > 1 so that only one sstable (and no more!) will be selected for compaction
+            int numSSTables = (int) Math.ceil((double) maxLevelSize / maxSSTableSize) + 1;
+
+            List<SSTableReader> sstables = mockNonOverlappingSSTables(numSSTables, i, maxSSTableSize);
+            ssTablesByLevel.add(sstables);
+        }
+
+        // all sstables flattened
+        Set<SSTableReader> sstables = ssTablesByLevel.stream().flatMap(Collection::stream).collect(Collectors.toSet());
+
+        // Organize the sstables into the expected compactions
+        // LCS will always compact the highest level first unless L0 has more than 32 sstables in which case
+        // it compacts using STCS
+        List<Collection<SSTableReader>> compactions = new ArrayList<>(numLevels);
+
+        // L0 will compact all its sstables and the ones of L1 since they all overlap and the total is below the max threshold
+        compactions.add(Sets.union(Sets.newLinkedHashSet(ssTablesByLevel.get(0)), Sets.newLinkedHashSet(ssTablesByLevel.get(1))));
+
+        // L1 will compact the first sstable because the score is > 1 plus the overlapping sstables from L2
+        compactions.add(overlapping(ssTablesByLevel.get(1).get(0), ssTablesByLevel.get(2)));
+
+        // L2 will compact the first sstable because the score is > 1 but no other overlapping sstables since L3 is empty
+        compactions.add(overlapping(ssTablesByLevel.get(2).get(0), ImmutableList.of()));
+
+        // L2 and L1 compactions can proceed in parallel but L0 will refuse to compact due to overlapping sstables in L1
+        // already compacting, hence we can only test 2 compactions initially
+        testCompactionStatistics(sstables, compactions, 2, strategy);
+
+        // Now check L0 compaction can proceed, the other levels won't compact since the score should be < 1
+        ssTablesByLevel.get(1).remove(0); // the first one must have been compacted
+        Set<SSTableReader> candidates = Sets.union(Sets.newLinkedHashSet(ssTablesByLevel.get(0)), Sets.newLinkedHashSet(ssTablesByLevel.get(1)));
+        long totLength = totUncompressedLength(candidates);
+        UUID id = mockCompaction(strategy, sstables, candidates, Collections.emptySet());
+
+        verifyStatistics(strategy,
+                         1,
+                         1,
+                         candidates.size(),
+                         candidates.size(),
+                         totLength,
+                         0,
+                         0,
+                         0);
+
+        CompactionProgress progress = mockCompactionProgress(candidates, id);
+        strategy.getBackgroundCompactions().setInProgress(progress);
+
+        verifyStatistics(strategy,
+                         1,
+                         1,
+                         candidates.size(),
+                         candidates.size(),
+                         totLength,
+                         totLength,
+                         totLength,
+                         0);
+
+        strategy.backgroundCompactions.setCompleted(id);
+
+        // Now we should have L1 again...
+    }
+
+    /**
+     * Test the case where L0 has enough sstables to trigger STCS, plus also add some tables in L1.
+     */
+    @Test
+    public void testLeveledCompactionStrategy_stcsL0()
+    {
+        Map<String, String> options = new HashMap<>();
+        long maxSSTableSize = 160L << 20; // 160 MB in bytes
+        addLeveledOptions(options, maxSSTableSize);
+
+        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(cfs, options);
+
+        int level = 1;
+        long maxLevelSize = (long) (Math.pow(10, level) * maxSSTableSize);
+        int numSSTables = (int) Math.ceil((double) maxLevelSize / maxSSTableSize) + 1; // one extra sstable on top of the level maximum
+        List<SSTableReader> l1SSTables = mockNonOverlappingSSTables(numSSTables, level, maxSSTableSize);
+
+        List<SSTableReader> l0SSTables = mockSSTables(MAX_COMPACTING_L0 + 1, maxSSTableSize, 0.0, System.currentTimeMillis());
+
+        Set<SSTableReader> sstables = Sets.newHashSet(l0SSTables);
+        sstables.addAll(l1SSTables);
+
+        // Organize the sstables into the expected compactions
+        // LCS will always compact the highest level first unless L0 has more than 32 sstables in which case
+        // it compacts using STCS
+        List<Collection<SSTableReader>> compactions = new ArrayList<>(2);
+
+        // L1 will compact the first sstable because the score is > 1 but no other overlapping sstables since L2 is empty
+        compactions.add(overlapping(l1SSTables.get(0), ImmutableList.of()));
+
+        // L0 should use STCS to compact them all up to the max threshold, since all sstables have the same hotness,
+        // they will be sorted by size
+        l0SSTables.sort(Comparator.comparingLong(SSTableReader::onDiskLength));
+        compactions.add(l0SSTables.subList(0, Math.min(maxCompactionThreshold, l0SSTables.size())));
+
+        testCompactionStatistics(sstables, compactions, compactions.size(), strategy);
+    }
+
+    private void testCompactionStatistics(List<Collection<SSTableReader>> compactions, AbstractCompactionStrategy strategy)
+    {
+        Set<SSTableReader> allSSTables = compactions.stream().flatMap(Collection::stream).collect(Collectors.toSet()); // union of every expected compaction
+        testCompactionStatistics(allSSTables, compactions, compactions.size(), strategy); // all compactions are expected to run in parallel
+    }
+
+    /**
+     * Tests the statistics for a given strategy. It is expected that the compactions passed in will contain a set of sstables
+     * to be compacted together, with the highest index being picked first, then the second highest and so forth.
+     * @param sstables the sstables that are added to the strategy before any compaction is requested
+     * @param compactions sstables grouped by compaction, each compaction is expected to be compacted fully (no splitting currently
+     *                    supported), the highest index compaction should be picked first by the strategy
+     * @param numExpectedCompactions the expected number of compactions that can occur in parallel
+     * @param strategy the compaction strategy
+     */
+    private void testCompactionStatistics(Set<SSTableReader> sstables,
+                                          List<Collection<SSTableReader>> compactions,
+                                          int numExpectedCompactions,
+                                          AbstractCompactionStrategy strategy)
+    {
+        // Add the tables to the strategy
+        for (SSTableReader sstable : sstables)
+            strategy.addSSTable(sstable);
+
+        List<SSTableReader> sstablesForCompaction = compactions.stream().flatMap(Collection::stream).collect(Collectors.toList());
+
+        int numSSTables = sstablesForCompaction.size();
+        long totLength = totUncompressedLength(sstablesForCompaction);
+        double totHotness = totHotness(sstablesForCompaction);
+
+        Set<SSTableReader> compacting = new HashSet<>();
+        List<Pair<Set<SSTableReader>, UUID>> submittedCompactions = new ArrayList<>(compactions.size());
+
+        long totRead = 0;
+        long totWritten = 0;
+        int numSSTablesCompacting = 0;
+        int numCompactions = compactions.size();
+        int numCompactionsInProgress = 0;
+
+        // Create a compaction task and start the compaction for each bucket starting with the highest index
+        for (int i = 0; i < numExpectedCompactions; i++)
+        {
+            int compactingLevel = compactions.size() - i - 1;
+            Set<SSTableReader> candidates = Sets.newHashSet(compactions.get(compactingLevel));
+
+            UUID id = mockCompaction(strategy, sstables, candidates, compacting);
+
+            numCompactionsInProgress++;
+            numSSTablesCompacting += candidates.size();
+            submittedCompactions.add(Pair.create(candidates, id));
+
+            // after mocking the compaction the list of pending compactions has been updated in the strategy
+            // and this will be reflected in the statistics but the compaction task has not started yet
+            verifyStatistics(strategy,
+                             numCompactions,
+                             numCompactionsInProgress,
+                             numSSTables,
+                             numSSTablesCompacting,
+                             totLength,
+                             totRead,
+                             totWritten,
+                             totHotness);
+
+            // Now we simulate starting the compaction task
+            CompactionProgress progress = mockCompactionProgress(candidates, id);
+            strategy.getBackgroundCompactions().setInProgress(progress);
+
+            // The compaction has started and so we must update the following expected values
+            totRead += progress.uncompressedBytesRead();
+            totWritten += progress.uncompressedBytesWritten();
+
+            // Now check that the statistics reflect the compaction in progress
+            verifyStatistics(strategy,
+                             numCompactions,
+                             numCompactionsInProgress,
+                             numSSTables,
+                             numSSTablesCompacting,
+                             totLength,
+                             totRead,
+                             totWritten,
+                             totHotness);
+
+            // update compacting for the next iteration
+            compacting.addAll(candidates);
+        }
+
+        // Terminate the compactions one by one by marking them as completed and removing their sstables, and check
+        // that the statistics are updated
+        for (Pair<Set<SSTableReader>, UUID> pair : submittedCompactions)
+        {
+            Set<SSTableReader> compSSTables = pair.left;
+            long totSSTablesLen = totUncompressedLength(compSSTables);
+            strategy.getBackgroundCompactions().setCompleted(pair.right);
+
+            numCompactions--;
+            numCompactionsInProgress--;
+            numSSTables -= compSSTables.size();
+            numSSTablesCompacting -= compSSTables.size();
+
+            totLength -= totSSTablesLen;
+            totRead -= totSSTablesLen;
+            totWritten -= totSSTablesLen;
+            totHotness -= totHotness(compSSTables);
+
+            for (SSTableReader sstable : pair.left)
+                strategy.removeSSTable(sstable);
+
+            sstables.removeAll(pair.left);
+            compacting.removeAll(pair.left);
+
+            verifyStatistics(strategy,
+                             numCompactions,
+                             numCompactionsInProgress,
+                             numSSTables,
+                             numSSTablesCompacting,
+                             totLength,
+                             totRead,
+                             totWritten,
+                             totHotness);
+        }
+    }
+
+    private UUID mockCompaction(AbstractCompactionStrategy strategy, Set<SSTableReader> live, Set<SSTableReader> candidates, Set<SSTableReader> compacting)
+    {
+        final AtomicReference<LifecycleTransaction> startedTxn = new AtomicReference<>();
+        final UUID operationId = UUID.randomUUID();
+
+        // The tracker may hand out only one mocked transaction: its originals are the candidates and its id is operationId
+        when(dataTracker.tryModify(anyIterable(), eq(OperationType.COMPACTION))).thenAnswer(invocation -> {
+            assertNull(startedTxn.get());
+            LifecycleTransaction mockTxn = Mockito.mock(LifecycleTransaction.class);
+            when(mockTxn.getCompacting()).thenReturn(Sets.union(compacting, candidates));
+            when(mockTxn.originals()).thenReturn(candidates);
+            when(mockTxn.opId()).thenReturn(operationId);
+            startedTxn.set(mockTxn);
+            return mockTxn;
+        });
+
+        // The candidates are reported as compacting only once the transaction above has been handed out
+        when(cfs.getCompactingSSTables()).thenAnswer(invocation -> startedTxn.get() != null ? Sets.union(compacting, candidates) : compacting);
+        when(cfs.getNoncompactingSSTables()).thenAnswer(invocation -> Sets.difference(live, startedTxn.get() != null ? Sets.union(compacting, candidates) : compacting));
+        when(cfs.getNoncompactingSSTables(anyIterable())).thenAnswer(invocation -> Sets.difference(Sets.newHashSet((Iterable<SSTableReader>)invocation.getArguments()[0]),
+                                                                                                  startedTxn.get() != null ? Sets.union(compacting, candidates) : compacting));
+        when(cfs.getSSTables(eq(SSTableSet.LIVE))).thenReturn(live);
+
+        // The strategy is expected to produce a background task for the state mocked above
+        int nowInSeconds = (int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());
+        assertNotNull(strategy.getNextBackgroundTask(nowInSeconds));
+        return operationId;
+    }
+
+    private CompactionProgress mockCompactionProgress(Set<SSTableReader> compacting, UUID id)
+    {
+        // Bytes read and bytes written are both reported as the total uncompressed length of the input sstables
+        final long totalLength = totUncompressedLength(compacting);
+        final CompactionProgress mockProgress = Mockito.mock(CompactionProgress.class);
+
+        when(mockProgress.durationInNanos()).thenReturn(TimeUnit.SECONDS.toNanos(30));
+        when(mockProgress.uncompressedBytesWritten()).thenReturn(totalLength);
+        when(mockProgress.uncompressedBytesRead()).thenReturn(totalLength);
+        when(mockProgress.inSSTables()).thenReturn(compacting);
+        when(mockProgress.operationId()).thenReturn(id);
+        return mockProgress;
+    }
+
+    private void verifyStatistics(AbstractCompactionStrategy strategy,
+                                  int expectedCompactions,
+                                  int expectedCompacting,
+                                  int expectedSSTables,
+                                  int expectedSSTablesCompacting,
+                                  long expectedTotBytes,
+                                  long expectedReadBytes,
+                                  long expectedWrittenBytes,
+                                  double expectedTotHotness)
+    {
+        final CompactionStrategyStatistics statistics = strategy.getStatistics();
+        System.out.println(statistics.toString());
+
+        // The header must name the keyspace and table of this test and the class of the strategy
+        assertEquals(keyspace, statistics.keyspace());
+        assertEquals(table, statistics.table());
+        assertEquals(strategy.getClass().getSimpleName(), statistics.strategy());
+        assertEquals(expectedCompactions, strategy.getTotalCompactions());
+
+        // Totals summed over every aggregate reported in the statistics
+        int sumCompactions = 0;
+        int sumInProgress = 0;
+        int sumCandidateSSTables = 0;
+        int sumCompactingSSTables = 0;
+        long sumTotBytes = 0;
+        long sumReadBytes = 0;
+        long sumWrittenBytes = 0;
+        double sumHotness = 0;
+
+        for (CompactionAggregateStatistics aggregate : statistics.aggregates())
+        {
+            sumCompactions += aggregate.numCompactions();
+            sumInProgress += aggregate.numCompactionsInProgress();
+            sumCandidateSSTables += aggregate.numCandidateSSTables();
+            sumCompactingSSTables += aggregate.numCompactingSSTables();
+
+            if (!(aggregate instanceof TieredCompactionStatistics))
+            {
+                // anything that is not tiered is cast to leveled statistics, no hotness is read from those
+                LeveledCompactionStatistics leveled = (LeveledCompactionStatistics) aggregate;
+                sumTotBytes += leveled.tot();
+                sumReadBytes += leveled.read();
+                sumWrittenBytes += leveled.written();
+            }
+            else
+            {
+                TieredCompactionStatistics tiered = (TieredCompactionStatistics) aggregate;
+                sumTotBytes += tiered.tot();
+                sumReadBytes += tiered.read();
+                sumWrittenBytes += tiered.written();
+                sumHotness += tiered.hotness;
+            }
+        }
+
+        assertEquals(expectedCompactions, sumCompactions);
+        assertEquals(expectedCompacting, sumInProgress);
+
+        // LCS won't report pending sstables but only pending tasks, so the sstable totals are not checked for it
+        if (!(strategy instanceof LeveledCompactionStrategy))
+        {
+            assertEquals(expectedSSTables, sumCandidateSSTables);
+            assertEquals(expectedSSTablesCompacting, sumCompactingSSTables);
+            assertEquals(expectedTotBytes, sumTotBytes);
+        }
+
+        assertEquals(expectedReadBytes, sumReadBytes);
+        assertEquals(expectedWrittenBytes, sumWrittenBytes);
+
+        if (sumHotness > 0)
+            assertEquals(expectedTotHotness, sumHotness, epsilon);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
index 049415c85968..a5620ab1d457 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
@@ -41,8 +41,19 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NonThrowingCloseable;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.concurrent.Transactional;
+import org.mockito.ArgumentCaptor;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 public class CompactionTaskTest
 {
@@ -66,7 +77,7 @@ public void setUp() throws Exception
     }
 
     @Test
-    public void compactionInterruption() throws Exception
+    public void compactionDisabled() throws Exception
     {
         cfs.getCompactionStrategyManager().disable();
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (1, 1);");
@@ -81,7 +92,8 @@ public void compactionInterruption() throws Exception
 
         LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
         Assert.assertNotNull(txn);
-        CompactionTask task = new CompactionTask(cfs, txn, 0);
+
+        AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0);
         Assert.assertNotNull(task);
         cfs.getCompactionStrategyManager().pause();
         try
@@ -96,6 +108,41 @@ public void compactionInterruption() throws Exception
         Assert.assertEquals(Transactional.AbstractTransactional.State.ABORTED, txn.state());
     }
 
+    @Test
+    public void compactionInterruption()
+    {
+        cfs.getCompactionStrategyManager().disable();
+        Set<SSTableReader> sstables = generateData(2, 2);
+
+        LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
+        assertNotNull(txn);
+
+        AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0);
+        assertNotNull(task);
+
+        // The mocked observer stops the operation as soon as it is notified that the operation has started
+        NonThrowingCloseable closeable = Mockito.mock(NonThrowingCloseable.class);
+        TableOperationObserver observer = Mockito.mock(TableOperationObserver.class);
+        when(observer.onOperationStart(any(TableOperation.class))).thenAnswer(invocation -> {
+            invocation.<TableOperation>getArgument(0).stop();
+            return closeable;
+        });
+
+        try
+        {
+            task.execute(observer);
+            Assert.fail("Expected CompactionInterruptedException");
+        }
+        catch (CompactionInterruptedException e)
+        {
+            // pass
+        }
+
+        // The closeable returned by the observer must have been closed once and the transaction aborted
+        verify(closeable, times(1)).close();
+        assertEquals(Transactional.AbstractTransactional.State.ABORTED, txn.state());
+    }
+
     private static void mutateRepaired(SSTableReader sstable, long repairedAt, UUID pendingRepair, boolean isTransient) throws IOException
     {
         sstable.descriptor.getMetadataSerializer().mutateRepairMetadata(sstable.descriptor, repairedAt, pendingRepair, isTransient);
@@ -139,7 +186,7 @@ public void mixedSSTableFailure() throws Exception
             {
                 txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
                 Assert.assertNotNull(txn);
-                CompactionTask task = new CompactionTask(cfs, txn, 0);
+                AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0);
                 Assert.fail("Expected IllegalArgumentException");
             }
             catch (IllegalArgumentException e)
@@ -154,4 +201,40 @@ public void mixedSSTableFailure() throws Exception
             Collections.rotate(toCompact, 1);
         }
     }
+
+    @Test
+    public void testCompactionReporting()
+    {
+        cfs.getCompactionStrategyManager().disable();
+        LifecycleTransaction txn = cfs.getTracker().tryModify(generateData(2, 2), OperationType.COMPACTION);
+        assertNotNull(txn);
+        // Both observers are mocks so that the calls made while the task executes can be verified afterwards
+        CompactionObserver compactionObserver = Mockito.mock(CompactionObserver.class);
+        TableOperationObserver tableOperationObserver = Mockito.mock(TableOperationObserver.class);
+        AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0, compactionObserver);
+        assertNotNull(task);
+        task.execute(tableOperationObserver);
+
+        final ArgumentCaptor<CompactionProgress> progressCaptor = ArgumentCaptor.forClass(CompactionProgress.class);
+        final ArgumentCaptor<TableOperation> operationCaptor = ArgumentCaptor.forClass(AbstractTableOperation.class);
+        verify(tableOperationObserver, times(1)).onOperationStart(operationCaptor.capture());
+        verify(compactionObserver, times(1)).setInProgress(progressCaptor.capture());
+        verify(compactionObserver, times(1)).setCompleted(eq(txn.opId()));
+    }
+
+
+    private Set<SSTableReader> generateData(int numSSTables, int numKeys)
+    {
+        // Each batch of numKeys inserts is flushed on its own and is expected to produce one sstable
+        for (int batch = 0; batch < numSSTables; batch++)
+        {
+            for (int key = batch * numKeys, end = key + numKeys; key < end; key++)
+                QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (?, ?);", key, key);
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        }
+
+        Set<SSTableReader> liveSSTables = cfs.getLiveSSTables();
+        Assert.assertEquals(numSSTables, liveSSTables.size());
+        return liveSSTables;
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
index e49923b0b387..d13e41f96514 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
@@ -154,10 +154,10 @@ private void createLowGCGraceTable(){
 
     @Test
     @BMRule(name = "Stop all compactions",
-    targetClass = "CompactionTask",
-    targetMethod = "runMayThrow",
+    targetClass = "CompactionTask$CompactionOperation",
+    targetMethod = "<init>",
     targetLocation = "AT INVOKE getCompactionAwareWriter",
-    action = "$ci.stop()")
+    action = "$this.op.stop()")
     public void testStopUserDefinedCompactionRepaired() throws Throwable
     {
         testStopCompactionRepaired((cfs) -> {
@@ -168,10 +168,10 @@ public void testStopUserDefinedCompactionRepaired() throws Throwable
 
     @Test
     @BMRule(name = "Stop all compactions",
-    targetClass = "CompactionTask",
-    targetMethod = "runMayThrow",
+    targetClass = "CompactionTask$CompactionOperation",
+    targetMethod = "<init>",
     targetLocation = "AT INVOKE getCompactionAwareWriter",
-    action = "$ci.stop()")
+    action = "$this.op.stop()")
     public void testStopSubRangeCompactionRepaired() throws Throwable
     {
         testStopCompactionRepaired((cfs) -> {
@@ -205,7 +205,7 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
         }
 
         assertTrue(cfs.getTracker().getCompacting().isEmpty());
-        assertTrue(CompactionManager.instance.active.getCompactions().stream().noneMatch(h -> h.getCompactionInfo().getTableMetadata().equals(cfs.metadata)));
+        assertTrue(CompactionManager.instance.active.getTableOperations().stream().noneMatch(h -> h.getProgress().metadata().equals(cfs.metadata)));
 
         try
         {
@@ -220,7 +220,7 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
         }
 
         assertTrue(cfs.getTracker().getCompacting().isEmpty());
-        assertTrue(CompactionManager.instance.active.getCompactions().stream().noneMatch(h -> h.getCompactionInfo().getTableMetadata().equals(cfs.metadata)));
+        assertTrue(CompactionManager.instance.active.getTableOperations().stream().noneMatch(h -> h.getProgress().metadata().equals(cfs.metadata)));
 
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
index 4bbc526989fd..5c4576e09e03 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
@@ -420,7 +420,7 @@ public void testLCSThresholdParams() throws Throwable
         AbstractCompactionTask act = lcs.getNextBackgroundTask(0);
         // we should be compacting all 50 sstables:
         assertEquals(50, act.transaction.originals().size());
-        act.execute(ActiveCompactionsTracker.NOOP);
+        act.execute();
     }
 
     @Test
@@ -457,7 +457,7 @@ public void testSTCSinL0() throws Throwable
         assertEquals(0, ((LeveledCompactionTask)act).getLevel());
         assertTrue(act.transaction.originals().stream().allMatch(s -> s.getSSTableLevel() == 0));
         txn.abort(); // unmark the l1 sstable compacting
-        act.execute(ActiveCompactionsTracker.NOOP);
+        act.execute();
     }
 
     @Test
@@ -521,7 +521,7 @@ public void testAbortNotifications() throws Throwable
         // sstables have been removed.
         try
         {
-            AbstractCompactionTask task = new NotifyingCompactionTask((LeveledCompactionTask) lcs.getNextBackgroundTask(0));
+            AbstractCompactionTask task = new NotifyingCompactionTask(lcs, (LeveledCompactionTask) lcs.getNextBackgroundTask(0));
             task.execute(CompactionManager.instance.active);
             fail("task should throw exception");
         }
@@ -544,9 +544,9 @@ public void testAbortNotifications() throws Throwable
 
     private static class NotifyingCompactionTask extends LeveledCompactionTask
     {
-        public NotifyingCompactionTask(LeveledCompactionTask task)
+        public NotifyingCompactionTask(LeveledCompactionStrategy lcs, LeveledCompactionTask task)
         {
-            super(task.cfs, task.transaction, task.getLevel(), task.gcBefore, task.getLevel(), false);
+            super(lcs, task.transaction, task.getLevel(), task.gcBefore, task.getLevel(), false);
         }
 
         @Override
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
index 887ebddfee4f..032ab97d1a9a 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
@@ -305,7 +305,7 @@ public void testMinorCompactionPurge()
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
         {
-            Iterables.getOnlyElement(tasks).execute(ActiveCompactionsTracker.NOOP);
+            Iterables.getOnlyElement(tasks).execute();
         }
 
         // verify that minor compaction does GC when key is provably not
@@ -357,7 +357,7 @@ public void testMinTimestampPurge()
         // compact the sstables with the c1/c2 data and the c1 tombstone
         try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
         {
-            Iterables.getOnlyElement(tasks).execute(ActiveCompactionsTracker.NOOP);
+            Iterables.getOnlyElement(tasks).execute();
         }
 
         // We should have both the c1 and c2 tombstones still. Since the min timestamp in the c2 tombstone
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index 5dc3388038ee..a6a57d4a1ab0 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
@@ -23,9 +23,14 @@
 import java.util.Collection;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
 
+import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableList;
 import org.junit.BeforeClass;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -74,10 +79,15 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.NonThrowingCloseable;
+import org.mockito.Mockito;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.when;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class CompactionsTest
@@ -565,4 +575,71 @@ public void testConcurrencySettings()
         CompactionManager.instance.setConcurrentCompactors(1);
         assertEquals(1, CompactionManager.instance.getCoreCompactorThreads());
     }
+
+    @Test
+    public void testCompactionsCanBeInterrupted() throws Exception
+    {
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD1);
+        store.clearUnsafe();
+
+        // disable compaction while flushing
+        store.disableAutoCompaction();
+
+        // Write a bit of data
+        for (int j = 0; j < 2; j++)
+        {
+            for (int i = 1; i < 100; i++)
+            {
+                new RowUpdateBuilder(store.metadata(), 0, ByteBufferUtil.bytes("key" + i))
+                .clustering("Column1")
+                .add("val", ByteBufferUtil.bytes("abcd"))
+                .build()
+                .apply();
+            }
+
+            store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        }
+
+        assertTrue(store.getLiveSSTables().size() >= 2);
+
+        // Enable compaction but do not submit any background compactions
+        store.getCompactionStrategyManager().enable();
+
+        CountDownLatch compactionRegistered = new CountDownLatch(1);
+        CountDownLatch resumeCompaction = new CountDownLatch(1);
+        TableOperationObserver obs = Mockito.mock(TableOperationObserver.class);
+
+        when(obs.onOperationStart(any(TableOperation.class))).thenAnswer(invocation -> {
+            NonThrowingCloseable ret = CompactionManager.instance.active.onOperationStart(invocation.getArgument(0));
+            compactionRegistered.countDown(); // this makes sure we don't attempt to interrupt a compaction before it has registered
+            resumeCompaction.await(); // this will block the compaction just after it has registered so that we can interrupt it before it even starts
+            return ret;
+        });
+
+        List<Future<?>> compactions = CompactionManager.instance.submitMaximal(store, FBUtilities.nowInSeconds(), false, obs);
+        assertEquals("Expected one compaction to be submitted", 1, compactions.size());
+
+        // Wait for compaction to register with its operation observer (the metrics), await returns false on timeout
+        boolean registered = compactionRegistered.await(1, TimeUnit.MINUTES);
+        // Interrupt the compaction, this only works if CompactionManager.instance.active.onOperationStart() has already been called
+        boolean ret = registered && CompactionManager.instance.interruptCompactionFor(ImmutableList.of(store.metadata()));
+
+        // Let the compaction continue running, this is done before asserting so that a failure cannot leave it blocked forever
+        resumeCompaction.countDown();
+        assertTrue("Compaction should have registered with its observer before the timeout", registered);
+        assertTrue("Compaction should have been interrupted", ret);
+
+        // Make sure the compaction was interrupted
+        try
+        {
+            compactions.get(0).get();
+            fail("Compaction should have been interrupted");
+        }
+        catch(Throwable t)
+        {
+            t = Throwables.getRootCause(t);
+            assertTrue("Expected CompactionInterruptedException but got: " + t, t instanceof CompactionInterruptedException);
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
index 17bb5ee9040e..dfdda303e16a 100644
--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
@@ -117,7 +117,7 @@ public void truncateSTandardLeveled()
      * Ensure that the grouping operation preserves the levels of grouped tables
      */
     @Test
-    public void testGrouperLevels() throws Exception{
+    public void testGrouperLevels() throws Exception {
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
         //Need entropy to prevent compression so size is predictable with compression enabled/disabled
@@ -513,7 +513,7 @@ public void testTokenRangeCompaction() throws Exception
     }
 
     @Test
-    public void testCompactionCandidateOrdering() throws Exception
+    public void testCompactionCandidateOrdering()
     {
         // add some data
         byte [] b = new byte[100 * 1024];
@@ -733,7 +733,7 @@ public void randomMultiLevelAddTest()
             assertTrue(level.stream().allMatch(s -> s.getSSTableLevel() == lvl));
             if (i > 0)
             {
-                level.sort(SSTableReader.sstableComparator);
+                level.sort(SSTableReader.firstKeyComparator);
                 SSTableReader prev = null;
                 for (SSTableReader sstable : level)
                 {
@@ -796,7 +796,7 @@ private static int[] canAdd(LeveledManifest lm, List<SSTableReader> newSSTables,
             for (SSTableReader sstable : lvlGroup.getValue())
             {
                 newLevel.add(sstable);
-                newLevel.sort(SSTableReader.sstableComparator);
+                newLevel.sort(SSTableReader.firstKeyComparator);
 
                 SSTableReader prev = null;
                 boolean kept = true;
@@ -848,7 +848,7 @@ public void testHighestLevelHasMoreDataThanSupported()
 
         // compaction for L8 sstables is not supposed to be run because there is no upper level to promote sstables
         // that's why we expect compaction candidates for L7 only
-        Collection<SSTableReader> compactionCandidates = lm.getCompactionCandidates().sstables;
+        Collection<SSTableReader> compactionCandidates = lm.getCompactionCandidate().sstables;
         assertThat(compactionCandidates).containsAll(sstablesOnL7);
         assertThat(compactionCandidates).doesNotContainAnyElementsOf(sstablesOnL8);
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
index ca64e4a2a34d..ba5c203cb98a 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
@@ -50,7 +50,7 @@ public void basicTest() throws Throwable
             if (txn != null)
             {
                 SingleSSTableLCSTask task = new SingleSSTableLCSTask(cfs, txn, 2);
-                task.executeInternal(null);
+                task.executeInternal();
             }
         }
         assertEquals(1, cfs.getLiveSSTables().size());
@@ -100,16 +100,20 @@ private void compactionTestHelper(boolean singleSSTUplevel) throws Throwable
         // now we have a bunch of data in L0, first compaction will be a normal one, containing all sstables:
         LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
         AbstractCompactionTask act = lcs.getNextBackgroundTask(0);
-        act.execute(ActiveCompactionsTracker.NOOP);
+        act.execute();
 
         // now all sstables are laid out non-overlapping in L1, this means that the rest of the compactions
         // will be single sstable ones, make sure that we use SingleSSTableLCSTask if singleSSTUplevel is true:
-        while (lcs.getEstimatedRemainingTasks() > 0)
+        while ((act = lcs.getNextBackgroundTask(0)) != null)
         {
-            act = lcs.getNextBackgroundTask(0);
+            assertTrue(lcs.getTotalCompactions() > 0);
             assertEquals(singleSSTUplevel, act instanceof SingleSSTableLCSTask);
-            act.execute(ActiveCompactionsTracker.NOOP);
+            act.execute();
         }
+
+        assertEquals(0, lcs.getTotalCompactions());
+        assertEquals(0, lcs.getEstimatedRemainingTasks());
+
         assertEquals(0, lcs.getLevelSize(0));
         int l1size = lcs.getLevelSize(1);
         // this should be 10, but it might vary a bit depending on partition sizes etc
@@ -137,7 +141,7 @@ public void corruptMetadataTest() throws Throwable
             if (txn != null)
             {
                 SingleSSTableLCSTask task = new SingleSSTableLCSTask(cfs, txn, 2);
-                task.executeInternal(null);
+                task.executeInternal();
             }
         }
         catch (Throwable t)
diff --git a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
index bf761d8eb637..54a96ea3413f 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategyTest.java
@@ -19,11 +19,17 @@
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Random;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
+import com.google.common.collect.ImmutableList;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -35,21 +41,27 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.metrics.RestorableMeter;
 import org.apache.cassandra.schema.KeyspaceParams;
-import org.apache.cassandra.utils.Pair;
+import org.mockito.Mockito;
 
-import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.getBuckets;
-import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.mostInterestingBucket;
-import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.trimToThresholdWithHotness;
 import static org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy.validateOptions;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.when;
 
 public class SizeTieredCompactionStrategyTest
 {
     public static final String KEYSPACE1 = "SizeTieredCompactionStrategyTest";
     private static final String CF_STANDARD1 = "Standard1";
 
+    private static final Random random = new Random(98752945723L);
+
+    private final int minThreshold = 4; //same as the default
+    private final int maxThreshold = 32; //same as the default
+    private final double bucketLow = 0.5; //same as the default
+    private final double bucketHigh = 1.5; //same as the default
+    private final int minSSTableSize = 10; // small enough not to interfere
+
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
     {
@@ -92,61 +104,66 @@ public void testOptionsValidation() throws ConfigurationException
     @Test
     public void testGetBuckets()
     {
-        List<Pair<String, Long>> pairs = new ArrayList<Pair<String, Long>>();
-        String[] strings = { "a", "bbbb", "cccccccc", "cccccccc", "bbbb", "a" };
-        for (String st : strings)
+        List<SSTableReader> sstables = new ArrayList<>();
+        long[] sstableLengths = { 1L, 4L, 8L, 8L, 4L, 1L };
+        for (long len : sstableLengths)
         {
-            Pair<String, Long> pair = Pair.create(st, new Long(st.length()));
-            pairs.add(pair);
+            SSTableReader sstable = Mockito.mock(SSTableReader.class);
+            when(sstable.onDiskLength()).thenReturn(len);
+            when(sstable.hotness()).thenReturn(0.);
+            sstables.add(sstable);
         }
 
-        List<List<String>> buckets = getBuckets(pairs, 1.5, 0.5, 2);
+        SizeTieredCompactionStrategyOptions stcsOptions = new SizeTieredCompactionStrategyOptions(2, bucketLow, bucketHigh);
+        SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables, stcsOptions, minThreshold, maxThreshold);
+        List<List<SSTableReader>> buckets = sizeTieredBuckets.buckets();
         assertEquals(3, buckets.size());
-
-        for (List<String> bucket : buckets)
+        for (List<SSTableReader> bucket : buckets)
         {
             assertEquals(2, bucket.size());
-            assertEquals(bucket.get(0).length(), bucket.get(1).length());
-            assertEquals(bucket.get(0).charAt(0), bucket.get(1).charAt(0));
         }
 
-        pairs.clear();
+        sstables.clear();
         buckets.clear();
 
-        String[] strings2 = { "aaa", "bbbbbbbb", "aaa", "bbbbbbbb", "bbbbbbbb", "aaa" };
-        for (String st : strings2)
+        long[] sstableLengths2 = { 3L, 8L, 3L, 8L, 8L, 3L };
+        for (long len : sstableLengths2)
         {
-            Pair<String, Long> pair = Pair.create(st, new Long(st.length()));
-            pairs.add(pair);
+            SSTableReader sstable = Mockito.mock(SSTableReader.class);
+            when(sstable.onDiskLength()).thenReturn(len);
+            when(sstable.hotness()).thenReturn(0.);
+            sstables.add(sstable);
         }
 
-        buckets = getBuckets(pairs, 1.5, 0.5, 2);
+        sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables, stcsOptions, minThreshold, maxThreshold);
+        buckets = sizeTieredBuckets.buckets();
         assertEquals(2, buckets.size());
-
-        for (List<String> bucket : buckets)
+        for (List<SSTableReader> bucket : buckets)
         {
             assertEquals(3, bucket.size());
-            assertEquals(bucket.get(0).charAt(0), bucket.get(1).charAt(0));
-            assertEquals(bucket.get(1).charAt(0), bucket.get(2).charAt(0));
         }
 
         // Test the "min" functionality
-        pairs.clear();
+        sstables.clear();
         buckets.clear();
 
-        String[] strings3 = { "aaa", "bbbbbbbb", "aaa", "bbbbbbbb", "bbbbbbbb", "aaa" };
-        for (String st : strings3)
+        long[] sstableLengths3 = { 3L, 8L, 3L, 8L, 8L, 3L };
+        for (long len : sstableLengths3)
         {
-            Pair<String, Long> pair = Pair.create(st, new Long(st.length()));
-            pairs.add(pair);
+            SSTableReader sstable = Mockito.mock(SSTableReader.class);
+            when(sstable.onDiskLength()).thenReturn(len);
+            when(sstable.hotness()).thenReturn(0.);
+            sstables.add(sstable);
         }
 
-        buckets = getBuckets(pairs, 1.5, 0.5, 10);
+        stcsOptions = new SizeTieredCompactionStrategyOptions(10, bucketLow, bucketHigh);
+        sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables, stcsOptions, minThreshold, maxThreshold);
+        buckets = sizeTieredBuckets.buckets();
         assertEquals(1, buckets.size());
     }
 
     @Test
-    public void testPrepBucket() throws Exception
+    public void testSingleBucketWith3IdenticalFilesRealSSTables()
     {
         String ksname = KEYSPACE1;
         String cfname = "Standard1";
@@ -169,10 +186,11 @@ public void testPrepBucket() throws Exception
         }
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
+        SizeTieredCompactionStrategyOptions stcsOptions = new SizeTieredCompactionStrategyOptions();
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
-        Pair<List<SSTableReader>, Double> bucket;
+        SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstrs.subList(0, 2), stcsOptions, 4, 32);
 
-        List<SSTableReader> interestingBucket = mostInterestingBucket(Collections.singletonList(sstrs.subList(0, 2)), 4, 32);
+        List<SSTableReader> interestingBucket = new ArrayList<>(CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates()).sstables);
         assertTrue("nothing should be returned when all buckets are below the min threshold", interestingBucket.isEmpty());
 
         sstrs.get(0).overrideReadMeter(new RestorableMeter(100.0, 100.0));
@@ -182,10 +200,303 @@ public void testPrepBucket() throws Exception
         long estimatedKeys = sstrs.get(0).estimatedKeys();
 
         // if we have more than the max threshold, the coldest should be dropped
-        bucket = trimToThresholdWithHotness(sstrs, 2);
-        assertEquals("one bucket should have been dropped", 2, bucket.left.size());
+        sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstrs, stcsOptions, 1, 2);
+        sizeTieredBuckets.aggregate();
+
+        List<CompactionPick> compactions = sizeTieredBuckets.getCompactions();
+        CompactionPick selected = CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates());
+        if (!selected.isEmpty())
+            assertEquals(selected, compactions.get(0));
+        List<CompactionPick> pending = compactions.isEmpty() ? ImmutableList.of() : compactions.subList(1, compactions.size());
+
+        assertEquals("one bucket should have been dropped", 2, selected.sstables.size());
+        assertEquals("there should be one pending task", 1, pending.size());
+
         double expectedBucketHotness = (200.0 + 300.0) / estimatedKeys;
-        assertEquals(String.format("bucket hotness (%f) should be close to %f", bucket.right, expectedBucketHotness),
-                     expectedBucketHotness, bucket.right, 1.0);
+        assertEquals(String.format("bucket hotness (%f) should be close to %f",
+                                   CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates()).hotness, expectedBucketHotness),
+                     expectedBucketHotness, CompactionAggregate.getSelected(sizeTieredBuckets.getAggregates()).hotness, 1.0);
+    }
+
+
+    @Test
+    public void testTwoBucketsDifferentHotness()
+    {
+        List<SSTableReader> bucket1 = mockBucket(8, 2000, 100);
+        List<SSTableReader> bucket2 = mockBucket(8, 1000, 200); // hottest bucket with hotness 200 per table should be selected
+
+        List<SSTableReader> sstables = Stream.concat(bucket1.stream(), bucket2.stream()).collect(Collectors.toList());
+        for (int i = 0; i < 5; i++)
+        {
+            Collections.shuffle(sstables, random);
+            testBuckets(sstables, bucket2, ImmutableList.of(bucket1), 2);
+        }
+    }
+
+    @Test
+    public void testTwoBucketsSameHotness()
+    {
+        List<SSTableReader> bucket1 = mockBucket(8, 1000, 100);
+        List<SSTableReader> bucket2 = mockBucket(8, 4000, 100); // bucket with largest sstables should be selected if same hotness
+
+        List<SSTableReader> sstables = Stream.concat(bucket1.stream(), bucket2.stream()).collect(Collectors.toList());
+        for (int i = 0; i < 5; i++)
+        {
+            Collections.shuffle(sstables, random);
+            testBuckets(sstables, bucket2, ImmutableList.of(bucket1), 2);
+        }
+    }
+
+    @Test
+    public void testSplitLargeBucketExactMultiple()
+    {
+        List<SSTableReader> bucket1 = mockBucket(maxThreshold, 1000, 100);
+        List<SSTableReader> bucket2 = mockBucket(maxThreshold, 1000, 200);
+        List<SSTableReader> bucket3 = mockBucket(maxThreshold, 1000, 300);
+        List<SSTableReader> bucket4 = mockBucket(maxThreshold, 1000, 400); // hottest bucket
+
+        List<SSTableReader> largeBucket = new ArrayList<>(maxThreshold * 4);
+        largeBucket.addAll(bucket1);
+        largeBucket.addAll(bucket2);
+        largeBucket.addAll(bucket3);
+        largeBucket.addAll(bucket4);
+
+        Collections.shuffle(largeBucket, random);
+
+        testBuckets(largeBucket, bucket4, ImmutableList.of(bucket3, bucket2, bucket1), 1);
+    }
+
+    @Test
+    public void testSplitLargeBucketNotExactMultiple()
+    {
+        List<SSTableReader> bucket1 = mockBucket(maxThreshold / 2, 1000, 100);
+        List<SSTableReader> bucket2 = mockBucket(maxThreshold, 1000, 200);
+        List<SSTableReader> bucket3 = mockBucket(maxThreshold, 1000, 300);
+        List<SSTableReader> bucket4 = mockBucket(maxThreshold, 1000, 400); // hottest bucket
+
+        List<SSTableReader> largeBucket = new ArrayList<>(maxThreshold * 4);
+        largeBucket.addAll(bucket1);
+        largeBucket.addAll(bucket2);
+        largeBucket.addAll(bucket3);
+        largeBucket.addAll(bucket4);
+
+        Collections.shuffle(largeBucket, random);
+
+        testBuckets(largeBucket, bucket4, ImmutableList.of(bucket3, bucket2, bucket1), 1);
+    }
+
+    @Test
+    public void testSplitLargeBucketWithLeftOverBelowMinThreshold()
+    {
+        List<SSTableReader> bucket1 = mockBucket(minThreshold - 1, 1000, 100); // should be ignored
+        List<SSTableReader> bucket2 = mockBucket(maxThreshold, 1000, 200); // hottest bucket
+
+        List<SSTableReader> largeBucket = new ArrayList<>(maxThreshold * 4);
+        largeBucket.addAll(bucket1);
+        largeBucket.addAll(bucket2);
+
+        Collections.shuffle(largeBucket, random);
+
+        testBuckets(largeBucket, bucket2, ImmutableList.of(), 1);
+    }
+
+    @Test
+    public void testIgnoreBucketsBelowMinThreshold()
+    {
+        List<SSTableReader> sstables = new ArrayList<>();
+        long bytesOnDisk = 1000;
+        double hotness = 200;
+        for (int i = 0; i < minThreshold; i++)
+        {
+            sstables.addAll(mockBucket(i, bytesOnDisk, hotness));
+            bytesOnDisk *= 2;
+            hotness *= 2;
+        }
+
+        // all buckets with sstables should be considered and so the number of expected aggregates
+        // is minThreshold - 1 (because one has no sstables)
+        testBuckets(sstables, ImmutableList.of(), ImmutableList.of(), minThreshold - 1);
+    }
+
+    @Test
+    public void testIgnoreBucketsBelowMinThresholdExceptOne()
+    {
+        List<SSTableReader> sstables = new ArrayList<>();
+        long bytesOnDisk = 1000;
+        double hotness = 200;
+        for (int i = 0; i < minThreshold; i++)
+        {
+            sstables.addAll(mockBucket(i, bytesOnDisk, hotness));
+            bytesOnDisk *= 2;
+            hotness *= 2;
+        }
+
+        List<SSTableReader> bucket = mockBucket(minThreshold, bytesOnDisk, hotness);
+        sstables.addAll(bucket); // this is the only bucket that should be picked up
+
+        // all buckets with sstables should be considered and so the number of expected aggregates
+        // is minThreshold (because one has no sstables)
+        testBuckets(sstables, bucket, ImmutableList.of(), minThreshold);
+    }
+
+    @Test
+    public void testManySmallSSTables()
+    {
+        // SSTables smaller than minSSTableSize should all be grouped in the same bucket
+
+        int minSSTableSize = 1000;
+        List<SSTableReader> sstables = new ArrayList<>();
+
+        for (int i = 0; i < 10; i++)
+        {
+            List<SSTableReader> bucket = mockBucket(minThreshold + random.nextInt(maxThreshold), random.nextInt(minSSTableSize), 100);
+            sstables.addAll(bucket);
+        }
+
+        Collections.sort(sstables, Comparator.comparing(sstable -> sstable.onDiskLength()));
+
+        List<List<SSTableReader>> buckets = new ArrayList<>();
+        int i = 0;
+        while ((sstables.size() - i) >= minThreshold)
+        {
+            buckets.add(sstables.subList(i, Math.min(i+ maxThreshold, sstables.size())));
+            i += maxThreshold;
+        }
+
+        SizeTieredCompactionStrategyOptions stcsOptions = new SizeTieredCompactionStrategyOptions(minSSTableSize, bucketLow, bucketHigh);
+        testBuckets(stcsOptions, sstables, buckets.get(0), buckets.subList(1, buckets.size()), 1);
+    }
+
+    @Test
+    public void testThreeBucketsOnlyLargestSizeHasComps()
+    {
+        List<SSTableReader> bucket1 = mockBucket(2, 1000, 0); // no compaction
+        List<SSTableReader> bucket2 = mockBucket(2, 4000, 0); // no compaction
+        List<SSTableReader> bucket3 = mockBucket(4, 8000, 0); // one compaction
+
+        List<SSTableReader> sstables = new ArrayList<>(bucket1.size() + bucket2.size() + bucket3.size());
+        sstables.addAll(bucket1);
+        sstables.addAll(bucket2);
+        sstables.addAll(bucket3);
+
+        for (int i = 0; i < 5; i++)
+        {
+            Collections.shuffle(sstables, random);
+            testBuckets(sstables, bucket3, ImmutableList.of(), 3);
+        }
+    }
+
+    @Test
+    public void testThreeBucketsOnlySmallestSizeHasComps()
+    {
+        List<SSTableReader> bucket1 = mockBucket(4, 1000, 0); // one compaction
+        List<SSTableReader> bucket2 = mockBucket(2, 4000, 0); // no compaction
+        List<SSTableReader> bucket3 = mockBucket(2, 8000, 0); // no compaction
+
+        List<SSTableReader> sstables = new ArrayList<>(bucket1.size() + bucket2.size() + bucket3.size());
+        sstables.addAll(bucket1);
+        sstables.addAll(bucket2);
+        sstables.addAll(bucket3);
+
+        for (int i = 0; i < 5; i++)
+        {
+            Collections.shuffle(sstables, random);
+            testBuckets(sstables, bucket1, ImmutableList.of(), 3);
+        }
+    }
+
+    /**
+     * Sort the buckets by calling {@link SizeTieredCompactionStrategy.SizeTieredBuckets#aggregate()} and then verify
+     * that the selected bucket is {@code expectedSelected} and that the pending buckets are {@code expectedPending}.
+     * @param sstables - the input sstables to aggregate into buckets
+     * @param expectedSelected - the expected bucket that should be selected for compaction
+     * @param expectedPending - the expected pending buckets
+     * @param numExpectedAggregates - the expected number of aggregates returned by {@code getAggregates()}
+     */
+    private void testBuckets(List<SSTableReader> sstables, List<SSTableReader> expectedSelected, List<List<SSTableReader>> expectedPending, int numExpectedAggregates)
+    {
+        SizeTieredCompactionStrategyOptions stcsOptions = new SizeTieredCompactionStrategyOptions(minSSTableSize, bucketLow, bucketHigh);
+        testBuckets(stcsOptions, sstables, expectedSelected, expectedPending, numExpectedAggregates);
+    }
+
+    private void testBuckets(SizeTieredCompactionStrategyOptions stcsOptions,
+                             List<SSTableReader> sstables,
+                             List<SSTableReader> expectedSelected,
+                             List<List<SSTableReader>> expectedPending,
+                             int numExpectedAggregates)
+    {
+        SizeTieredCompactionStrategy.SizeTieredBuckets buckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(sstables,
+                                                                                                                    stcsOptions,
+                                                                                                                    minThreshold,
+                                                                                                                    maxThreshold);
+        buckets.aggregate();
+
+        List<CompactionPick> compactions = buckets.getCompactions();
+        CompactionPick selected = CompactionAggregate.getSelected(buckets.getAggregates());
+        if (!selected.isEmpty())
+            assertEquals(selected, compactions.get(0));
+        List<CompactionPick> pending = compactions.isEmpty() ? ImmutableList.of() : compactions.subList(1, compactions.size());
+
+        compareBucketToCandidate(expectedSelected, selected);
+        assertEquals(expectedPending.size(), pending.size());
+
+        for (int i = 0; i < expectedPending.size(); i++)
+            compareBucketToCandidate(expectedPending.get(i), pending.get(i));
+
+        assertEquals(numExpectedAggregates, buckets.getAggregates().size());
+    }
+
+    private List<SSTableReader> mockBucket(int numSSTables, long bytesOnDisk, double hotness)
+    {
+        // Builds numSSTables mocked sstables that all share the same size on disk and hotness
+        List<SSTableReader> ret = new ArrayList<>(numSSTables);
+        for (int i = 0; i < numSSTables; i++)
+            ret.add(mockSSTable(bytesOnDisk, hotness));
+
+        return ret;
+    }
+
+    private SSTableReader mockSSTable(long bytesOnDisk, double hotness)
+    {
+        SSTableReader ret = Mockito.mock(SSTableReader.class);
+        when(ret.hotness()).thenReturn(hotness);
+        when(ret.onDiskLength()).thenReturn(bytesOnDisk);
+        when(ret.bytesOnDisk()).thenReturn(bytesOnDisk);
+        when(ret.toString()).thenReturn(String.format("Bytes on disk: %d, hotness %f, hashcode %d", bytesOnDisk, hotness, ret.hashCode()));
+
+        return ret;
+    }
+
+    private void compareBucketToCandidate(Collection<SSTableReader> bucket, CompactionPick candidate)
+    {
+        // Order both sides by hash code: hotness and length on disk cannot serve as sort keys
+        // because several sstables may share exactly the same values for both
+        Comparator<SSTableReader> byHashCode = Comparator.comparingInt(SSTableReader::hashCode);
+        List<SSTableReader> expected = new ArrayList<>(bucket);
+        List<SSTableReader> actual = new ArrayList<>(candidate.sstables);
+        expected.sort(byHashCode);
+        actual.sort(byHashCode);
+
+        assertEquals(expected, actual);
+        assertEquals(getBucketHotness(bucket), candidate.hotness, 0.000001);
+        assertEquals(bucket.size() > 0 ? getBucketSize(bucket) / (double) bucket.size() : 0, candidate.avgSizeInBytes, 1);
+    }
+
+    private double getBucketHotness(Collection<SSTableReader> bucket)
+    {
+        double ret = 0;
+        for (SSTableReader sstable : bucket)
+            ret += sstable.hotness();
+
+        return ret;
+    }
+
+    private long getBucketSize(Collection<SSTableReader> bucket)
+    {
+        long ret = 0;
+        for (SSTableReader sstable : bucket)
+            ret += sstable.onDiskLength();
+
+        return ret;
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
index 2ca490a6f13f..ec44e5b01ef2 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
@@ -22,13 +22,18 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 
-import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Iterables;
 
 import org.junit.BeforeClass;
 import org.junit.Test;
+
+import static java.util.concurrent.TimeUnit.HOURS;
+import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.getBucketAggregates;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
@@ -45,10 +50,8 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.KeyspaceParams;
-import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.getWindowBoundsInMillis;
-import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.newestBucket;
 import static org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy.validateOptions;
 import static org.apache.cassandra.utils.FBUtilities.nowInSeconds;
 
@@ -136,19 +139,19 @@ public void testTimeWindows()
     {
         long tstamp1 = 1451001601000L; // 2015-12-25 @ 00:00:01, in milliseconds
         long tstamp2 = 1451088001000L; // 2015-12-26 @ 00:00:01, in milliseconds
-        Long lowHour = 1451001600000L; // 2015-12-25 @ 00:00:00, in milliseconds
+        long lowHour = 1451001600000L; // 2015-12-25 @ 00:00:00, in milliseconds
 
         // A 1 hour window should round down to the beginning of the hour
-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp1).left.compareTo(lowHour));
+        assertEquals(lowHour, getWindowBoundsInMillis(HOURS, 1, tstamp1));
 
         // A 1 minute window should round down to the beginning of the hour
-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1).left.compareTo(lowHour));
+        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1));
 
         // A 1 day window should round down to the beginning of the hour
-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1).left.compareTo(lowHour));
+        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1));
 
         // The 2 day window of 2015-12-25 + 2015-12-26 should round down to the beginning of 2015-12-25
-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2).left.compareTo(lowHour));
+        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2));
     }
 
     @Test
@@ -186,30 +189,29 @@ public void testPrepBucket()
 
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        HashMultimap<Long, SSTableReader> buckets = HashMultimap.create();
+        TreeMap<Long, List<SSTableReader>> buckets = new TreeMap<>(Long::compare);
         List<SSTableReader> sstrs = new ArrayList<>(cfs.getLiveSSTables());
 
         // We'll put 3 sstables into the newest bucket
         for (int i = 0; i < 3; i++)
         {
-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp);
-            buckets.put(bounds.left, sstrs.get(i));
+            TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), tstamp, TimeUnit.HOURS, 1);
         }
 
-        TimeWindowCompactionStrategy.NewestBucket newBucket = newestBucket(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
-        assertTrue("incoming bucket should not be accepted when it has below the min threshold SSTables", newBucket.sstables.isEmpty());
-        assertEquals("there should be no estimated remaining tasks when bucket is below min threshold SSTables", 0, newBucket.estimatedRemainingTasks);
-
+        List<CompactionAggregate> aggregates = getBucketAggregates(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(HOURS, 1, System.currentTimeMillis()));
+        Set<CompactionPick> compactions = toCompactions(aggregates);
+        assertTrue("No selected compactions when fewer than min threshold SSTables in the newest bucket", CompactionAggregate.getSelected(aggregates).isEmpty());
+        assertTrue("No compactions when fewer than min threshold SSTables in the newest bucket", compactions.isEmpty());
 
-        newBucket = newestBucket(buckets, 2, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
-        assertFalse("incoming bucket should be accepted when it is larger than the min threshold SSTables", newBucket.sstables.isEmpty());
-        assertEquals("there should be one estimated remaining task when bucket is larger than the min threshold SSTables", 1, newBucket.estimatedRemainingTasks);
+        aggregates = getBucketAggregates(buckets, 2, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(HOURS, 1, System.currentTimeMillis()));
+        compactions = toCompactions(aggregates);
+        assertFalse("There should be one selected compaction when bucket is larger than the min but smaller than max threshold", CompactionAggregate.getSelected(aggregates).isEmpty());
+        assertEquals("There should be one compaction when bucket is larger than the min but smaller than max threshold", 1,  compactions.size());
 
         // And 2 into the second bucket (1 hour back)
         for (int i = 3; i < 5; i++)
         {
-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp2);
-            buckets.put(bounds.left, sstrs.get(i));
+            TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), tstamp2, TimeUnit.HOURS, 1);
         }
 
         assertEquals("an sstable with a single value should have equal min/max timestamps", sstrs.get(0).getMinTimestamp(), sstrs.get(0).getMaxTimestamp());
@@ -234,15 +236,15 @@ public void testPrepBucket()
         sstrs = new ArrayList<>(cfs.getLiveSSTables());
         for (int i = 0; i < 40; i++)
         {
-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, sstrs.get(i).getMaxTimestamp());
-            buckets.put(bounds.left, sstrs.get(i));
+            TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), sstrs.get(i).getMaxTimestamp(), TimeUnit.HOURS, 1);
         }
 
-        newBucket = newestBucket(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
-        assertEquals("new bucket should be trimmed to max threshold of 32", newBucket.sstables.size(), 32);
+        aggregates = getBucketAggregates(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(HOURS, 1, System.currentTimeMillis()));
+        compactions = toCompactions(aggregates);
+        assertEquals("new bucket should be split by max threshold of 32", buckets.keySet().size() + 1, compactions.size());
 
-        // one per bucket because they are all eligible and one more for the sstables that were trimmed
-        assertEquals("there should be one estimated remaining task per eligible bucket", buckets.keySet().size() + 1, newBucket.estimatedRemainingTasks);
+        CompactionPick selected = CompactionAggregate.getSelected(aggregates);
+        assertEquals("first pick should be trimmed to max threshold of 32", 32, selected.sstables.size());
     }
 
 
@@ -360,4 +362,9 @@ public void testDropOverlappingExpiredSSTables() throws InterruptedException
         twcs.shutdown();
         t.transaction.abort();
     }
+
+    private static Set<CompactionPick> toCompactions(List<CompactionAggregate> aggregates)
+    {
+        return aggregates.stream().flatMap(aggr -> aggr.getActive().stream()).collect(Collectors.toSet());
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java b/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
new file mode 100644
index 000000000000..727afea02a84
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Sets;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.marshal.LongType;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+import org.apache.cassandra.schema.TableMetadata;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Checks that compaction strategies still hand out tasks after sstables were removed from the tracker without a removal notification.
+ */
+public class ZombieSSTablesTest
+{
+    private static final String KEYSPACE1 = "BlacklistingCompactionsTest";
+    private static final String STANDARD_STCS = "Standard_STCS";
+    private static final String STANDARD_LCS = "Standard_LCS";
+    private static final String STANDARD_TWCS = "Standard_TWCS";
+    private static final String MAXIMAL = "_Maximal";    // suffix of the tables used by the *Maximal tests
+    private static int maxValueSize;    // value saved in defineSchema() and restored in tearDown()
+
+    @After
+    public void leakDetect() throws InterruptedException
+    {
+        System.gc();    // NOTE(review): presumably gives leaked references a chance to be reported after GC - confirm
+        System.gc();
+        System.gc();
+        Thread.sleep(10);
+    }
+
+    @BeforeClass
+    public static void defineSchema() throws ConfigurationException
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE1,
+                KeyspaceParams.simple(1),
+                makeTable(STANDARD_STCS).compaction(CompactionParams.DEFAULT),
+                makeTable(STANDARD_LCS).compaction(CompactionParams.lcs(Collections.emptyMap())),
+                makeTable(STANDARD_TWCS).compaction(CompactionParams.twcs(Collections.emptyMap())),
+                makeTable(STANDARD_STCS + MAXIMAL).compaction(CompactionParams.DEFAULT),
+                makeTable(STANDARD_LCS + MAXIMAL).compaction(CompactionParams.lcs(Collections.emptyMap())),
+                makeTable(STANDARD_TWCS + MAXIMAL).compaction(CompactionParams.twcs(Collections.emptyMap())));
+
+        maxValueSize = DatabaseDescriptor.getMaxValueSize();    // remember the current limit so tearDown() can restore it
+        DatabaseDescriptor.setMaxValueSize(1024 * 1024);
+    }
+
+    /**
+     * Returns a table metadata builder; we use types with fixed size to increase the chance of detecting corrupt data
+     */
+    private static TableMetadata.Builder makeTable(String tableName)
+    {
+        return SchemaLoader.standardCFMD(KEYSPACE1, tableName, 1, LongType.instance, LongType.instance, LongType.instance);
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.setMaxValueSize(maxValueSize);
+    }
+
+    @Test
+    public void testWithSizeTieredCompactionStrategy() throws Exception
+    {
+        testZombieSSTables(STANDARD_STCS);
+    }
+
+    @Test
+    public void testWithLeveledCompactionStrategy() throws Exception
+    {
+        testZombieSSTables(STANDARD_LCS);
+    }
+
+    @Test
+    public void testWithTimeWindowCompactionStrategy() throws Exception
+    {
+        testZombieSSTables(STANDARD_TWCS);
+    }
+
+    @Test
+    public void testWithSizeTieredCompactionStrategyMaximal() throws Exception
+    {
+        testZombieSSTablesMaximal(STANDARD_STCS);
+    }
+
+    @Test
+    public void testWithLeveledCompactionStrategyMaximal() throws Exception
+    {
+        testZombieSSTablesMaximal(STANDARD_LCS);
+    }
+
+    @Test
+    public void testWithTimeWindowCompactionStrategyMaximal() throws Exception
+    {
+        testZombieSSTablesMaximal(STANDARD_TWCS);
+    }
+
+    private void prepareZombieSSTables(ColumnFamilyStore cfs) throws Exception
+    {
+        final int ROWS_PER_SSTABLE = 10;
+        final int SSTABLES = 15;
+        final int SSTABLES_TO_DELETE = 2;
+
+        cfs.truncateBlocking();
+
+        // disable compaction while flushing
+        cfs.disableAutoCompaction();
+        // flush SSTABLES sstables of ROWS_PER_SSTABLE rows each, reusing the same partition keys in every sstable,
+        // and verify the max timestamp and the read result count after every flush
+        long maxTimestampExpected = Long.MIN_VALUE;
+        Set<DecoratedKey> inserted = new HashSet<>();
+
+        for (int j = 0; j < SSTABLES; j++)
+        {
+            for (int i = 0; i < ROWS_PER_SSTABLE; i++)
+            {
+                DecoratedKey key = Util.dk(String.valueOf(i));
+                long timestamp = j * ROWS_PER_SSTABLE + i;
+                new RowUpdateBuilder(cfs.metadata(), timestamp, key.getKey())
+                        .clustering(Long.valueOf(i))
+                        .add("val", Long.valueOf(i))
+                        .build()
+                        .applyUnsafe();
+                maxTimestampExpected = Math.max(timestamp, maxTimestampExpected);
+                inserted.add(key);
+            }
+            cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+            CompactionsTest.assertMaxTimestamp(cfs, maxTimestampExpected);
+            assertEquals(inserted.toString(), inserted.size(), Util.getAll(Util.cmd(cfs).build()).size());
+        }
+
+        Collection<SSTableReader> sstables = cfs.getLiveSSTables();
+
+        // remove SSTABLES_TO_DELETE sstables from the tracker, but make it so that the compaction strategy still thinks
+        // they are present by not sending the removal notification (this can normally happen due to a race between the
+        // add and remove notification for an sstable).
+        Set<SSTableReader> toDrop = sstables.stream().limit(SSTABLES_TO_DELETE).collect(Collectors.toSet());
+        cfs.getTracker().removeUnsafe(toDrop);
+        toDrop.stream().forEach(sstable -> sstable.selfRef().release());    // avoid leak
+        assertTrue(Sets.intersection(cfs.getLiveSSTables(), toDrop).isEmpty());
+    }
+
+    private void testZombieSSTablesMaximal(String tableName) throws Exception
+    {
+        // runs against the dedicated "<tableName>_Maximal" table created in defineSchema()
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(tableName + MAXIMAL);
+
+        prepareZombieSSTables(cfs);
+        Collection<AbstractCompactionTask> maximalTasks = cfs.getCompactionStrategyManager().getMaximalTasks(0, false);
+        assertNotNull(maximalTasks);
+        assertFalse(maximalTasks.isEmpty());
+        maximalTasks.stream().forEach(task -> task.transaction.abort());    // avoid leak
+    }
+
+    private void testZombieSSTables(String tableName) throws Exception
+    {
+        // once some sstables are gone from the tracker, the strategy must still offer a background task
+        Keyspace keyspace = Keyspace.open(KEYSPACE1);
+        final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(tableName);
+
+        prepareZombieSSTables(cfs);
+
+        CompactionStrategyManager compactionStrategyManager = cfs.getCompactionStrategyManager();
+        compactionStrategyManager.enable();
+        AbstractCompactionTask nextBackgroundTask = compactionStrategyManager.getNextBackgroundTask(0);
+        assertNotNull(nextBackgroundTask);
+        nextBackgroundTask.transaction.abort();    // avoid leak
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java b/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java
index 7acd3e62f2f9..0be935180bbb 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/HelpersTest.java
@@ -169,7 +169,7 @@ public void testMarkObsolete()
         Iterable<SSTableReader> readersToKeep = Lists.newArrayList(MockSchema.sstable(3, cfs), MockSchema.sstable(4, cfs));
 
         List<LogTransaction.Obsoletion> obsoletions = new ArrayList<>();
-        Helpers.prepareForObsoletion(readers, txnLogs, obsoletions, null);
+        Helpers.prepareForObsoletion(readers, txnLogs, obsoletions, null, null);
         assertNotNull(obsoletions);
         assertEquals(2, obsoletions.size());
 
@@ -200,7 +200,7 @@ public void testObsoletionPerformance()
         }
         long start = System.currentTimeMillis();
 
-        Helpers.prepareForObsoletion(readers.subList(0, 500), txnLogs, new ArrayList<>(),null );
+        Helpers.prepareForObsoletion(readers.subList(0, 500), txnLogs, new ArrayList<>(),null, null);
         txnLogs.finish();
         long time = System.currentTimeMillis() - start;
         assertTrue(time < 20000);
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
index eb162d59b9f7..3bf2b2b0fb19 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/ViewTest.java
@@ -103,8 +103,8 @@ public void testCompaction()
         Assert.assertFalse(View.permitCompacting(readers.subList(0, 2)).apply(cur));
         Assert.assertFalse(View.permitCompacting(readers.subList(0, 1)).apply(cur));
         Assert.assertFalse(View.permitCompacting(readers.subList(1, 2)).apply(cur));
-        Assert.assertTrue(readers.subList(2, 5).containsAll(copyOf(cur.getUncompacting(readers))));
-        Assert.assertEquals(3, copyOf(cur.getUncompacting(readers)).size());
+        Assert.assertTrue(readers.subList(2, 5).containsAll(copyOf(cur.getNoncompacting(readers))));
+        Assert.assertEquals(3, copyOf(cur.getNoncompacting(readers)).size());
         Assert.assertTrue(ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING)).containsAll(readers.subList(2, 5)));
         Assert.assertEquals(3, ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING)).size());
 
@@ -132,8 +132,8 @@ public void testCompaction()
         Assert.assertFalse(View.permitCompacting(readers.subList(1, 2)).apply(cur));
         testFailure(View.updateCompacting(emptySet(), readers.subList(1, 2)), cur);
         testFailure(View.updateCompacting(copyOf(readers.subList(0, 2)), emptySet()), cur);
-        Assert.assertTrue(copyOf(concat(readers.subList(0, 1), readers.subList(2, 5))).containsAll(copyOf(cur.getUncompacting(readers))));
-        Assert.assertEquals(4, copyOf(cur.getUncompacting(readers)).size());
+        Assert.assertTrue(copyOf(concat(readers.subList(0, 1), readers.subList(2, 5))).containsAll(copyOf(cur.getNoncompacting(readers))));
+        Assert.assertEquals(4, copyOf(cur.getNoncompacting(readers)).size());
         Set<SSTableReader> nonCompacting = ImmutableSet.copyOf(cur.select(SSTableSet.NONCOMPACTING));
         Assert.assertTrue(nonCompacting.containsAll(readers.subList(2, 5)));
         Assert.assertTrue(nonCompacting.containsAll(readers.subList(0, 1)));
diff --git a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
index dc46c27cb78e..69bcc048b8df 100644
--- a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
+++ b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java
@@ -18,6 +18,8 @@
 
 package org.apache.cassandra.db.repair;
 
+import java.io.Closeable;
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -36,6 +38,7 @@
 
 import javax.annotation.Nullable;
 
+import com.google.common.base.Throwables;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
 import com.google.common.util.concurrent.FutureCallback;
@@ -53,12 +56,13 @@
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.compaction.AbstractPendingRepairTest;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionController;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.CompactionIterator;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.TableOperation;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner;
@@ -76,6 +80,7 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.streaming.PreviewKind;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.NonThrowingCloseable;
 import org.apache.cassandra.utils.UUIDGen;
 import org.apache.cassandra.utils.WrappedRunnable;
 import org.apache.cassandra.utils.concurrent.Transactional;
@@ -458,27 +463,30 @@ public void testBlockedAcquisition() throws ExecutionException, InterruptedExcep
         {
             try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
                  CompactionController controller = new CompactionController(cfs, sstables, 0);
-                 CompactionIterator ci = CompactionManager.getAntiCompactionIterator(scanners, controller, 0, UUID.randomUUID(), CompactionManager.instance.active, () -> false))
+                 CompactionIterator ci = new CompactionIterator(OperationType.ANTICOMPACTION, scanners, controller, 0, UUIDGen.getTimeUUID()))
             {
-                // `ci` is our imaginary ongoing anticompaction which makes no progress until after 30s
-                // now we try to start a new AC, which will try to cancel all ongoing compactions
-
-                CompactionManager.instance.active.beginCompaction(ci);
-                PendingAntiCompaction pac = new PendingAntiCompaction(prsid, Collections.singleton(cfs), atEndpoint(FULL_RANGE, NO_RANGES), 0, 0, es, () -> false);
-                ListenableFuture fut = pac.run();
-                try
-                {
-                    fut.get(30, TimeUnit.SECONDS);
-                    fail("the future should throw exception since we try to start a new anticompaction when one is already running");
-                }
-                catch (ExecutionException e)
+                TableOperation op = ci.getOperation();
+                try (NonThrowingCloseable cls = CompactionManager.instance.active.onOperationStart(op))
                 {
-                    assertTrue(e.getCause() instanceof PendingAntiCompaction.SSTableAcquisitionException);
-                }
+                    // `ci` is our imaginary ongoing anticompaction which makes no progress until after 30s
+                    // now we try to start a new AC, which will try to cancel all ongoing compactions
+
+                    PendingAntiCompaction pac = new PendingAntiCompaction(prsid, Collections.singleton(cfs), atEndpoint(FULL_RANGE, NO_RANGES), 0, 0, es, () -> false);
+                    ListenableFuture fut = pac.run();
+                    try
+                    {
+                        fut.get(30, TimeUnit.SECONDS);
+                        fail("the future should throw exception since we try to start a new anticompaction when one is already running");
+                    }
+                    catch (ExecutionException e)
+                    {
+                        assertTrue(e.getCause() instanceof PendingAntiCompaction.SSTableAcquisitionException);
+                    }
 
-                assertEquals(1, getCompactionsFor(cfs).size());
-                for (CompactionInfo.Holder holder : getCompactionsFor(cfs))
-                    assertFalse(holder.isStopRequested());
+                    assertEquals(1, getCompactionsFor(cfs).size());
+                    for (TableOperation compaction : getCompactionsFor(cfs))
+                        assertFalse(compaction.isStopRequested());
+                }
             }
         }
         finally
@@ -488,13 +496,13 @@ public void testBlockedAcquisition() throws ExecutionException, InterruptedExcep
         }
     }
 
-    private List<CompactionInfo.Holder> getCompactionsFor(ColumnFamilyStore cfs)
+    private List<TableOperation> getCompactionsFor(ColumnFamilyStore cfs)
     {
-        List<CompactionInfo.Holder> compactions = new ArrayList<>();
-        for (CompactionInfo.Holder holder : CompactionManager.instance.active.getCompactions())
+        List<TableOperation> compactions = new ArrayList<>();
+        for (TableOperation compaction : CompactionManager.instance.active.getTableOperations())
         {
-            if (holder.getCompactionInfo().getTableMetadata().equals(cfs.metadata()))
-                compactions.add(holder);
+            if (compaction.getProgress().metadata().equals(cfs.metadata()))
+                compactions.add(compaction);
         }
         return compactions;
     }
@@ -512,48 +520,50 @@ public void testUnblockedAcquisition() throws ExecutionException, InterruptedExc
         {
             try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.ANTICOMPACTION);
                  CompactionController controller = new CompactionController(cfs, sstables, 0);
-                 CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners, controller, 0, UUID.randomUUID()))
+                 CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners, controller, 0, UUID.randomUUID());)
             {
-                // `ci` is our imaginary ongoing anticompaction which makes no progress until after 5s
-                // now we try to start a new AC, which will try to cancel all ongoing compactions
-
-                CompactionManager.instance.active.beginCompaction(ci);
-                PendingAntiCompaction pac = new PendingAntiCompaction(prsid, Collections.singleton(cfs), atEndpoint(FULL_RANGE, NO_RANGES), es, () -> false);
-                ListenableFuture fut = pac.run();
-                try
-                {
-                    fut.get(5, TimeUnit.SECONDS);
-                }
-                catch (TimeoutException e)
-                {
-                    // expected, we wait 1 minute for compactions to get cancelled in runWithCompactionsDisabled, but we are not iterating
-                    // CompactionIterator so the compaction is not actually cancelled
-                }
-                try
-                {
-                    assertTrue(ci.hasNext());
-                    ci.next();
-                    fail("CompactionIterator should be abortable");
-                }
-                catch (CompactionInterruptedException e)
-                {
-                    CompactionManager.instance.active.finishCompaction(ci);
-                    txn.abort();
-                    // expected
-                }
-                CountDownLatch cdl = new CountDownLatch(1);
-                Futures.addCallback(fut, new FutureCallback<Object>()
+                TableOperation op = ci.getOperation();
+                try (NonThrowingCloseable cls = CompactionManager.instance.active.onOperationStart(op))
                 {
-                    public void onSuccess(@Nullable Object o)
+                    // `ci` is our imaginary ongoing anticompaction which makes no progress until after 5s
+                    // now we try to start a new AC, which will try to cancel all ongoing compactions
+
+                    PendingAntiCompaction pac = new PendingAntiCompaction(prsid, Collections.singleton(cfs), atEndpoint(FULL_RANGE, NO_RANGES), es, () -> false);
+                    ListenableFuture fut = pac.run();
+                    try
                     {
-                        cdl.countDown();
+                        fut.get(5, TimeUnit.SECONDS);
                     }
-
-                    public void onFailure(Throwable throwable)
+                    catch (TimeoutException e)
+                    {
+                        // expected, we wait 1 minute for compactions to get cancelled in runWithCompactionsDisabled, but we are not iterating
+                        // CompactionIterator so the compaction is not actually cancelled
+                    }
+                    try
+                    {
+                        assertTrue(ci.hasNext());
+                        ci.next();
+                        fail("CompactionIterator should be abortable");
+                    }
+                    catch (CompactionInterruptedException e)
                     {
+                        txn.abort();
+                        // expected
                     }
-                }, MoreExecutors.directExecutor());
-                assertTrue(cdl.await(1, TimeUnit.MINUTES));
+                    CountDownLatch cdl = new CountDownLatch(1);
+                    Futures.addCallback(fut, new FutureCallback<Object>()
+                    {
+                        public void onSuccess(@Nullable Object o)
+                        {
+                            cdl.countDown();
+                        }
+
+                        public void onFailure(Throwable throwable)
+                        {
+                        }
+                    }, MoreExecutors.directExecutor());
+                    assertTrue(cdl.await(1, TimeUnit.MINUTES));
+                }
             }
         }
         finally
@@ -600,11 +610,11 @@ public void testSSTablePredicateOngoingAntiCompaction()
 
     private void tryPredicate(ColumnFamilyStore cfs, List<SSTableReader> compacting, List<SSTableReader> expectedLive, boolean shouldFail)
     {
-        CompactionInfo.Holder holder = new CompactionInfo.Holder()
+        TableOperation operation = new AbstractTableOperation()
         {
-            public CompactionInfo getCompactionInfo()
+            public OperationProgress getProgress()
             {
-                return new CompactionInfo(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 1000, UUID.randomUUID(), compacting);
+                return new OperationProgress(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 1000, UUID.randomUUID(), compacting);
             }
 
             public boolean isGlobal()
@@ -612,8 +622,7 @@ public boolean isGlobal()
                 return false;
             }
         };
-        CompactionManager.instance.active.beginCompaction(holder);
-        try
+        try(Closeable c = CompactionManager.instance.active.onOperationStart(operation))
         {
             PendingAntiCompaction.AntiCompactionPredicate predicate =
             new PendingAntiCompaction.AntiCompactionPredicate(Collections.singleton(new Range<>(new Murmur3Partitioner.LongToken(0), new Murmur3Partitioner.LongToken(100))),
@@ -623,15 +632,11 @@ public boolean isGlobal()
                 fail("should fail - we try to grab already anticompacting sstables for anticompaction");
             assertEquals(live, new HashSet<>(expectedLive));
         }
-        catch (PendingAntiCompaction.SSTableAcquisitionException e)
+        catch (PendingAntiCompaction.SSTableAcquisitionException | IOException e)
         {
             if (!shouldFail)
                 fail("We should not fail filtering sstables");
         }
-        finally
-        {
-            CompactionManager.instance.active.finishCompaction(holder);
-        }
     }
 
     @Test
@@ -641,11 +646,11 @@ public void testRetries() throws InterruptedException, ExecutionException
         cfs.addSSTable(MockSchema.sstable(1, true, cfs));
         CountDownLatch cdl = new CountDownLatch(5);
         ExecutorService es = Executors.newFixedThreadPool(1);
-        CompactionInfo.Holder holder = new CompactionInfo.Holder()
+        AbstractTableOperation operation = new AbstractTableOperation()
         {
-            public CompactionInfo getCompactionInfo()
+            public OperationProgress getProgress()
             {
-                return new CompactionInfo(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 0, UUID.randomUUID(), cfs.getLiveSSTables());
+                return new OperationProgress(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 0, UUID.randomUUID(), cfs.getLiveSSTables());
             }
 
             public boolean isGlobal()
@@ -653,7 +658,7 @@ public boolean isGlobal()
                 return false;
             }
         };
-        try
+        try (Closeable c = CompactionManager.instance.active.onOperationStart(operation))
         {
             PendingAntiCompaction.AntiCompactionPredicate acp = new PendingAntiCompaction.AntiCompactionPredicate(FULL_RANGE, UUID.randomUUID())
             {
@@ -666,30 +671,32 @@ public boolean apply(SSTableReader sstable)
                     return true;
                 }
             };
-            CompactionManager.instance.active.beginCompaction(holder);
             PendingAntiCompaction.AcquisitionCallable acquisitionCallable = new PendingAntiCompaction.AcquisitionCallable(cfs, UUID.randomUUID(), 10, 1, acp);
             Future f = es.submit(acquisitionCallable);
             cdl.await();
             assertNotNull(f.get());
         }
+        catch (IOException ex)
+        {
+            throw Throwables.propagate(ex);
+        }
         finally
         {
             es.shutdown();
-            CompactionManager.instance.active.finishCompaction(holder);
         }
     }
 
     @Test
-    public void testRetriesTimeout() throws InterruptedException, ExecutionException
+    public void testRetriesTimeout() throws InterruptedException, ExecutionException, IOException
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
         cfs.addSSTable(MockSchema.sstable(1, true, cfs));
         ExecutorService es = Executors.newFixedThreadPool(1);
-        CompactionInfo.Holder holder = new CompactionInfo.Holder()
+        TableOperation operation = new AbstractTableOperation()
         {
-            public CompactionInfo getCompactionInfo()
+            public OperationProgress getProgress()
             {
-                return new CompactionInfo(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 0, UUID.randomUUID(), cfs.getLiveSSTables());
+                return new OperationProgress(cfs.metadata(), OperationType.ANTICOMPACTION, 0, 0, UUID.randomUUID(), cfs.getLiveSSTables());
             }
 
             public boolean isGlobal()
@@ -697,7 +704,7 @@ public boolean isGlobal()
                 return false;
             }
         };
-        try
+        try (Closeable c = CompactionManager.instance.active.onOperationStart(operation))
         {
             PendingAntiCompaction.AntiCompactionPredicate acp = new PendingAntiCompaction.AntiCompactionPredicate(FULL_RANGE, UUID.randomUUID())
             {
@@ -707,7 +714,6 @@ public boolean apply(SSTableReader sstable)
                     throw new PendingAntiCompaction.SSTableAcquisitionException("blah");
                 }
             };
-            CompactionManager.instance.active.beginCompaction(holder);
             PendingAntiCompaction.AcquisitionCallable acquisitionCallable = new PendingAntiCompaction.AcquisitionCallable(cfs, UUID.randomUUID(), 2, 1000, acp);
             Future fut = es.submit(acquisitionCallable);
             assertNull(fut.get());
@@ -715,7 +721,6 @@ public boolean apply(SSTableReader sstable)
         finally
         {
             es.shutdown();
-            CompactionManager.instance.active.finishCompaction(holder);
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java b/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
index c9dde2ae694e..7b31dfb65af6 100644
--- a/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
+++ b/test/unit/org/apache/cassandra/index/SecondaryIndexManagerTest.java
@@ -34,7 +34,6 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.SystemKeyspace;
-import org.apache.cassandra.db.compaction.CompactionInfo;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.notifications.SSTableAddedNotification;
@@ -792,9 +791,9 @@ public void build()
                             }
 
                             @Override
-                            public CompactionInfo getCompactionInfo()
+                            public OperationProgress getProgress()
                             {
-                                return builder.getCompactionInfo();
+                                return builder.getProgress();
                             }
                         };
                     }
diff --git a/test/unit/org/apache/cassandra/index/sai/SAITester.java b/test/unit/org/apache/cassandra/index/sai/SAITester.java
index 3fb0cedf7199..a07bb3c7ea30 100644
--- a/test/unit/org/apache/cassandra/index/sai/SAITester.java
+++ b/test/unit/org/apache/cassandra/index/sai/SAITester.java
@@ -96,7 +96,7 @@ public class SAITester extends CQLTester
 
     protected static final Injections.Counter INDEX_BUILD_COUNTER = Injections.newCounter("IndexBuildCounter")
                                                                               .add(newInvokePoint().onClass(CompactionManager.class)
-                                                                                                   .onMethod("submitIndexBuild", "SecondaryIndexBuilder", "ActiveCompactionsTracker"))
+                                                                                                   .onMethod("submitIndexBuild", "SecondaryIndexBuilder", "TableOperationObserver"))
                                                                               .build();
 
     protected static final Injections.Counter perSSTableValidationCounter = Injections.newCounter("PerSSTableValidationCounter")
diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
index 4585f0a8e10b..957d324336b4 100644
--- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryManagerTest.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.io.sstable;
 
+import java.io.Closeable;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
@@ -43,9 +44,10 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.RowUpdateBuilder;
-import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.AbstractTableOperation;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.TableOperation;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
@@ -120,9 +122,9 @@ public void beforeTest()
     @After
     public void afterTest()
     {
-        for (CompactionInfo.Holder holder : CompactionManager.instance.active.getCompactions())
+        for (TableOperation operation : CompactionManager.instance.active.getTableOperations())
         {
-            holder.stop();
+            operation.stop();
         }
 
         String ksname = KEYSPACE1;
@@ -646,11 +648,11 @@ public void testCancelIndexHelper(Consumer<ColumnFamilyStore> cancelFunction) th
         final AtomicReference<CompactionInterruptedException> exception = new AtomicReference<>();
         // barrier to control when redistribution runs
         final CountDownLatch barrier = new CountDownLatch(1);
-        CompactionInfo.Holder ongoingCompaction = new CompactionInfo.Holder()
+        AbstractTableOperation ongoingCompaction = new AbstractTableOperation()
         {
-            public CompactionInfo getCompactionInfo()
+            public OperationProgress getProgress()
             {
-                return new CompactionInfo(cfs.metadata(), OperationType.UNKNOWN, 0, 0, UUID.randomUUID(), compacting);
+                return new OperationProgress(cfs.metadata(), OperationType.UNKNOWN, 0, 0, UUID.randomUUID(), compacting);
             }
 
             public boolean isGlobal()
@@ -658,10 +660,9 @@ public boolean isGlobal()
                 return false;
             }
         };
-        try (LifecycleTransaction ignored = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN))
+        try (LifecycleTransaction ignored = cfs.getTracker().tryModify(compacting, OperationType.UNKNOWN);
+             Closeable c = CompactionManager.instance.active.onOperationStart(ongoingCompaction))
         {
-            CompactionManager.instance.active.beginCompaction(ongoingCompaction);
-
             Thread t = NamedThreadFactory.createThread(new Runnable()
             {
                 public void run()
@@ -698,13 +699,9 @@ public void run()
             barrier.countDown();
             t.join();
         }
-        finally
-        {
-            CompactionManager.instance.active.finishCompaction(ongoingCompaction);
-        }
 
         assertNotNull("Expected compaction interrupted exception", exception.get());
-        assertTrue("Expected no active compactions", CompactionManager.instance.active.getCompactions().isEmpty());
+        assertTrue("Expected no active compactions", CompactionManager.instance.active.getTableOperations().isEmpty());
 
         Set<SSTableReader> beforeRedistributionSSTables = new HashSet<>(allSSTables);
         Set<SSTableReader> afterCancelSSTables = selectOnlyBigTableReaders(cfs.getLiveSSTables(), Collectors.toSet());
diff --git a/update-history/STAR-801/33-35e12f4887 STAR-410 (#101) b/update-history/STAR-801/33-35e12f4887 STAR-410 (#101)
new file mode 100644
index 000000000000..f6c3dcb88393
--- /dev/null
+++ b/update-history/STAR-801/33-35e12f4887 STAR-410 (#101)	
@@ -0,0 +1,137 @@
+--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
++++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+@@ -272,15 +272,10 @@
+                 continue; // mostly this just avoids polluting the debug log with zero scores
+             // we want to calculate score excluding compacting ones
+             Set<SSTableReader> sstablesInLevel = Sets.newHashSet(sstables);
+-<<<<<<<
+             Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getCompactingSSTables());
+-            double score = (double) SSTableReader.getTotalBytes(remaining) / (double)maxBytesForLevel(i, maxSSTableSizeInBytes);
+-=======
+-            Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getTracker().getCompacting());
+             long remainingBytesForLevel = SSTableReader.getTotalBytes(remaining);
+             long maxBytesForLevel = maxBytesForLevel(i, maxSSTableSizeInBytes);
+             double score = (double) remainingBytesForLevel / (double) maxBytesForLevel;
+->>>>>>>
+             logger.trace("Compaction score for level {} is {}", i, score);
+ 
+             if (score > 1.001)
+--- a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
++++ b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
+@@ -139,38 +139,19 @@
+     {
+         long tstamp1 = 1451001601000L; // 2015-12-25 @ 00:00:01, in milliseconds
+         long tstamp2 = 1451088001000L; // 2015-12-26 @ 00:00:01, in milliseconds
+-<<<<<<<
+-        Long lowHour = 1451001600000L; // 2015-12-25 @ 00:00:00, in milliseconds
+-
+-        // A 1 hour window should round down to the beginning of the hour
+-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp1).left.compareTo(lowHour));
+-
+-        // A 1 minute window should round down to the beginning of the hour
+-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1).left.compareTo(lowHour));
+-
+-        // A 1 day window should round down to the beginning of the hour
+-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1).left.compareTo(lowHour));
+-
+-        // The 2 day window of 2015-12-25 + 2015-12-26 should round down to the beginning of 2015-12-25
+-        assertEquals(0, getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2).left.compareTo(lowHour));
+-=======
+         long lowHour = 1451001600000L; // 2015-12-25 @ 00:00:00, in milliseconds
+ 
+         // A 1 hour window should round down to the beginning of the hour
+-        assertTrue(getWindowBoundsInMillis(HOURS, 1, tstamp1) == lowHour);
++        assertEquals(lowHour, getWindowBoundsInMillis(HOURS, 1, tstamp1));
+ 
+         // A 1 minute window should round down to the beginning of the hour
+-        assertTrue(getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1) == lowHour);
++        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.MINUTES, 1, tstamp1));
+ 
+         // A 1 day window should round down to the beginning of the hour
+-        assertTrue(getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1) == lowHour);
++        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.DAYS, 1, tstamp1));
+ 
+         // The 2 day window of 2015-12-25 + 2015-12-26 should round down to the beginning of 2015-12-25
+-        assertTrue(getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2) == lowHour);
+-
+-
+-        return;
+->>>>>>>
++        assertEquals(lowHour, getWindowBoundsInMillis(TimeUnit.DAYS, 2, tstamp2));
+     }
+ 
+     @Test
+@@ -214,20 +195,6 @@
+         // We'll put 3 sstables into the newest bucket
+         for (int i = 0; i < 3; i++)
+         {
+-<<<<<<<
+-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp);
+-            buckets.put(bounds.left, sstrs.get(i));
+-        }
+-
+-        TimeWindowCompactionStrategy.NewestBucket newBucket = newestBucket(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
+-        assertTrue("incoming bucket should not be accepted when it has below the min threshold SSTables", newBucket.sstables.isEmpty());
+-        assertEquals("there should be no estimated remaining tasks when bucket is below min threshold SSTables", 0, newBucket.estimatedRemainingTasks);
+-
+-
+-        newBucket = newestBucket(buckets, 2, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
+-        assertFalse("incoming bucket should be accepted when it is larger than the min threshold SSTables", newBucket.sstables.isEmpty());
+-        assertEquals("there should be one estimated remaining task when bucket is larger than the min threshold SSTables", 1, newBucket.estimatedRemainingTasks);
+-=======
+             TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), tstamp, TimeUnit.HOURS, 1);
+         }
+ 
+@@ -240,17 +207,11 @@
+         compactions = toCompactions(aggregates);
+         assertFalse("There should be one selected compaction when bucket is larger than the min but smaller than max threshold", CompactionAggregate.getSelected(aggregates).isEmpty());
+         assertEquals("There should be one compaction when bucket is larger than the min but smaller than max threshold", 1,  compactions.size());
+->>>>>>>
+ 
+         // And 2 into the second bucket (1 hour back)
+         for (int i = 3; i < 5; i++)
+         {
+-<<<<<<<
+-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, tstamp2);
+-            buckets.put(bounds.left, sstrs.get(i));
+-=======
+             TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), tstamp2, TimeUnit.HOURS, 1);
+->>>>>>>
+         }
+ 
+         assertEquals("an sstable with a single value should have equal min/max timestamps", sstrs.get(0).getMinTimestamp(), sstrs.get(0).getMaxTimestamp());
+@@ -275,21 +236,12 @@
+         sstrs = new ArrayList<>(cfs.getLiveSSTables());
+         for (int i = 0; i < 40; i++)
+         {
+-<<<<<<<
+-            Pair<Long, Long> bounds = getWindowBoundsInMillis(TimeUnit.HOURS, 1, sstrs.get(i).getMaxTimestamp());
+-            buckets.put(bounds.left, sstrs.get(i));
+-        }
+-
+-        newBucket = newestBucket(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(TimeUnit.HOURS, 1, System.currentTimeMillis()).left);
+-        assertEquals("new bucket should be trimmed to max threshold of 32", newBucket.sstables.size(), 32);
+-=======
+             TimeWindowCompactionStrategy.addToBuckets(buckets, sstrs.get(i), sstrs.get(i).getMaxTimestamp(), TimeUnit.HOURS, 1);
+         }
+ 
+         aggregates = getBucketAggregates(buckets, 4, 32, new SizeTieredCompactionStrategyOptions(), getWindowBoundsInMillis(HOURS, 1, System.currentTimeMillis()));
+         compactions = toCompactions(aggregates);
+         assertEquals("new bucket should be split by max threshold of 32", buckets.keySet().size() + 1, compactions.size());
+->>>>>>>
+ 
+         CompactionPick selected = CompactionAggregate.getSelected(aggregates);
+         assertEquals("first pick should be trimmed to max threshold of 32", 32, selected.sstables.size());
+diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+index 64cf161a31..dfdda303e1 100644
+--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
++++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+@@ -848,7 +848,7 @@ public class LeveledCompactionStrategyTest
+ 
+         // compaction for L8 sstables is not supposed to be run because there is no upper level to promote sstables
+         // that's why we expect compaction candidates for L7 only
+-        Collection<SSTableReader> compactionCandidates = lm.getCompactionCandidates().sstables;
++        Collection<SSTableReader> compactionCandidates = lm.getCompactionCandidate().sstables;
+         assertThat(compactionCandidates).containsAll(sstablesOnL7);
+         assertThat(compactionCandidates).doesNotContainAnyElementsOf(sstablesOnL8);
+     }

From 4c99b0fdfbe690ad6e180a904d34069f8807e753 Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Tue, 27 Apr 2021 16:02:43 -0700
Subject: [PATCH 071/151] STAR-409 Port guardrails from astra branch (#124)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* CDB-15: Introduce basic framework for Guardrails

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Sylvain Lebresne <lebresne@gmail.com>

* CDB-18: Introduce Guardrails for MV/2i

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-19: Introduce guardrails for Table number and Table properties

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-22: Introduce disallowed write consistencies guardrail

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-21: Introduce Guardrail for partition size

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>

* CDB-20: Introduce Guardrail for partition keys in a SELECT

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>

* CDB-17: Introduce Guardrails for CollectionSizeWarnThreshold / FieldsPerUDT / ItemsPerCollection

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-3: Redact guardrail user data in error messages that are passed to listeners
CDB-16: Introduce disk usage guardrails

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>
Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>

* CDB-30: Add Guardrail for cartesian product of IN queries

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-31: Add guardrail for user-provided timestamps
CDB-23: Introduce Guardrail for read-before-write list operations
CDB-3: Generalize/improve conditions on which Guardrails are suspended

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-3: Make sure tablePropertiesDisallowed Guardrail is also applied to MV creation/alteration
CDB-3: Fix remaining test failures
CDB-3: Fix CompactionStressTest / CQLSSTableWriterTest

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

* CDB-3: Add missing guardrail settings to cassandra.yaml and missing enabled check for tables limit GR

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>

* STAR-409 Fix ClientRequestSizeMetricsTest by ensuring session connected before clearing metrics

* STAR-409 Add call to SSTableWriter.guardCollectionSize in PartitionWriter.writePartition

Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Sylvain Lebresne <lebresne@gmail.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>
Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>
(cherry picked from commit 0676af036fc17d1e9d9aa23e1305ea1dcacd2040)
(cherry picked from commit d187a01bfd17aa58ef109869073f674a57ca0d1f)
---
 conf/cassandra.yaml                           | 103 +++
 .../org/apache/cassandra/config/Config.java   |   4 +
 .../cassandra/config/DatabaseDescriptor.java  |  18 +
 .../apache/cassandra/cql3/CQLStatement.java   |   4 +-
 src/java/org/apache/cassandra/cql3/Lists.java |  55 +-
 src/java/org/apache/cassandra/cql3/Maps.java  |  38 +-
 .../apache/cassandra/cql3/QueryProcessor.java |  10 +-
 src/java/org/apache/cassandra/cql3/Sets.java  |  36 +-
 .../cassandra/cql3/UpdateParameters.java      |  20 +-
 .../ClusteringColumnRestrictions.java         |   6 +-
 .../PartitionKeyRestrictions.java             |   3 +-
 .../PartitionKeySingleRestrictionSet.java     |   7 +-
 .../restrictions/StatementRestrictions.java   |  10 +-
 .../cql3/restrictions/TokenFilter.java        |  17 +-
 .../cql3/restrictions/TokenRestriction.java   |  10 +-
 .../cql3/statements/AlterRoleStatement.java   |   6 +-
 .../cql3/statements/BatchStatement.java       |  40 +-
 .../cql3/statements/CQL3CasRequest.java       |  14 +-
 .../cql3/statements/CreateRoleStatement.java  |   6 +-
 .../cql3/statements/DescribeStatement.java    |   2 +-
 .../cql3/statements/DropRoleStatement.java    |   8 +-
 .../statements/ListPermissionsStatement.java  |   8 +-
 .../cql3/statements/ListRolesStatement.java   |   6 +-
 .../statements/ModificationStatement.java     | 105 ++-
 .../PermissionsManagementStatement.java       |   9 +-
 .../cql3/statements/PropertyDefinitions.java  |   8 +
 .../statements/RoleManagementStatement.java   |   7 +-
 .../cql3/statements/SelectStatement.java      |  63 +-
 .../cql3/statements/TruncateStatement.java    |   3 +-
 .../cql3/statements/UseStatement.java         |   3 +-
 .../schema/AlterSchemaStatement.java          |   4 +-
 .../schema/AlterTableStatement.java           |  31 +
 .../statements/schema/AlterTypeStatement.java |  15 +
 .../statements/schema/AlterViewStatement.java |  13 +
 .../schema/CreateIndexStatement.java          |  24 +
 .../schema/CreateTableStatement.java          |  43 +-
 .../schema/CreateTypeStatement.java           |  47 ++
 .../schema/CreateViewStatement.java           |  24 +
 .../statements/schema/TableAttributes.java    |   2 +-
 .../apache/cassandra/db/ConsistencyLevel.java |  29 +-
 .../org/apache/cassandra/db/Directories.java  |  10 +
 .../apache/cassandra/db/MultiCBuilder.java    |  19 +
 .../org/apache/cassandra/db/view/View.java    |   3 +-
 .../cassandra/gms/ApplicationState.java       |   1 +
 .../apache/cassandra/gms/VersionedValue.java  |   5 +
 .../cassandra/guardrails/Guardrail.java       | 780 ++++++++++++++++++
 .../cassandra/guardrails/Guardrails.java      | 234 ++++++
 .../guardrails/GuardrailsConfig.java          | 267 ++++++
 .../io/sstable/CQLSSTableWriter.java          |  21 +-
 .../io/sstable/format/SSTableWriter.java      |  58 ++
 .../io/sstable/format/big/ColumnIndex.java    |   7 +-
 .../format/trieindex/PartitionWriter.java     |   2 +
 .../org/apache/cassandra/schema/Schema.java   |   8 +
 .../cassandra/schema/SchemaConstants.java     |  35 +-
 .../schema/SchemaTransformations.java         | 143 ++++
 .../cassandra/schema/TableMetadata.java       |  66 +-
 .../apache/cassandra/service/CASRequest.java  |   2 +-
 .../apache/cassandra/service/ClientState.java |  15 +
 .../apache/cassandra/service/QueryState.java  |  13 +-
 .../cassandra/service/StorageProxy.java       |  14 +-
 .../cassandra/service/StorageService.java     |   2 +
 .../disk/usage/DiskUsageBroadcaster.java      | 154 ++++
 .../service/disk/usage/DiskUsageMonitor.java  | 178 ++++
 .../service/disk/usage/DiskUsageState.java    |  54 ++
 .../apache/cassandra/utils/Comparables.java   |  58 ++
 .../cassandra/utils/units/RateUnit.java       | 269 ++++++
 .../cassandra/utils/units/RateValue.java      | 173 ++++
 .../cassandra/utils/units/SizeUnit.java       | 356 ++++++++
 .../cassandra/utils/units/SizeValue.java      | 153 ++++
 .../cassandra/utils/units/TimeValue.java      | 146 ++++
 .../apache/cassandra/utils/units/Units.java   | 227 +++++
 .../distributed/impl/Coordinator.java         |  16 +-
 .../cassandra/distributed/impl/Instance.java  |   2 +-
 .../compaction/CompactionAllocationTest.java  |   6 +-
 .../test/microbench/BatchStatementBench.java  |   2 +-
 .../org/apache/cassandra/SchemaLoader.java    |  32 +
 .../config/DatabaseDescriptorRefTest.java     |   7 +-
 .../org/apache/cassandra/cql3/CQLTester.java  | 311 +++++--
 .../org/apache/cassandra/cql3/ListsTest.java  |   3 +-
 .../cql3/PreparedStatementsTest.java          |  10 +-
 .../cql3/ViewFilteringClustering1Test.java    |  10 +-
 .../cql3/ViewFilteringClustering2Test.java    |   8 +-
 .../cassandra/cql3/ViewFilteringPKTest.java   |   6 +-
 .../cassandra/cql3/ViewFilteringTest.java     |  22 +-
 .../apache/cassandra/cql3/ViewSchemaTest.java |  15 +-
 .../db/SinglePartitionSliceCommandTest.java   |   4 +-
 .../GuardrailCollectionSizeTest.java          | 455 ++++++++++
 .../GuardrailColumnValueSizeTest.java         | 557 +++++++++++++
 .../GuardrailColumnsPerTableTest.java         | 151 ++++
 .../guardrails/GuardrailConsistencyTest.java  | 356 ++++++++
 .../guardrails/GuardrailDiskUsageTest.java    | 461 +++++++++++
 .../guardrails/GuardrailFieldsPerUDTTest.java | 110 +++
 .../guardrails/GuardrailInSelectTest.java     | 192 +++++
 .../GuardrailItemsPerCollectionTest.java      | 449 ++++++++++
 ...uardrailMaterializedViewsPerTableTest.java | 118 +++
 .../GuardrailPartitionKeysInSelectTest.java   |  87 ++
 .../GuardrailPartitionSizeTest.java           |  65 ++
 ...railReadBeforeWriteListOperationsTest.java | 189 +++++
 ...GuardrailSecondaryIndexesPerTableTest.java |  99 +++
 .../cassandra/guardrails/GuardrailTester.java | 393 +++++++++
 .../GuardrailUserTimestampsTest.java          | 157 ++++
 .../GuardrailWarningOnSSTableWriteTester.java |  79 ++
 .../guardrails/GuardrailsOnTableTest.java     | 209 +++++
 .../cassandra/guardrails/GuardrailsTest.java  | 506 ++++++++++++
 .../sstable/CQLSSTableWriterClientTest.java   |   1 -
 .../io/sstable/CQLSSTableWriterTest.java      |  39 +-
 .../metrics/ClientRequestSizeMetricsTest.java |   4 +
 .../cassandra/schema/TableMetadataTest.java   | 128 +++
 .../cassandra/utils/units/RateUnitTest.java   | 126 +++
 .../cassandra/utils/units/RateValueTest.java  |  52 ++
 .../cassandra/utils/units/SizeUnitTest.java   | 158 ++++
 .../cassandra/utils/units/SizeValueTest.java  |  39 +
 .../cassandra/utils/units/TimeValueTest.java  |  40 +
 .../cassandra/utils/units/UnitsTest.java      |  56 ++
 .../io/sstable/StressCQLSSTableWriter.java    |  19 +-
 .../cassandra/stress/CompactionStress.java    |   2 +
 ...9 Port guardrails from astra branch (#124) | 196 +++++
 117 files changed, 9952 insertions(+), 329 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/guardrails/Guardrail.java
 create mode 100644 src/java/org/apache/cassandra/guardrails/Guardrails.java
 create mode 100644 src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
 create mode 100644 src/java/org/apache/cassandra/schema/SchemaTransformations.java
 create mode 100644 src/java/org/apache/cassandra/service/disk/usage/DiskUsageBroadcaster.java
 create mode 100644 src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
 create mode 100644 src/java/org/apache/cassandra/service/disk/usage/DiskUsageState.java
 create mode 100644 src/java/org/apache/cassandra/utils/Comparables.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/RateUnit.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/RateValue.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/SizeUnit.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/SizeValue.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/TimeValue.java
 create mode 100644 src/java/org/apache/cassandra/utils/units/Units.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailCollectionSizeTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailColumnValueSizeTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailColumnsPerTableTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailFieldsPerUDTTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailInSelectTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailItemsPerCollectionTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailPartitionKeysInSelectTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailPartitionSizeTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailReadBeforeWriteListOperationsTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailUserTimestampsTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailWarningOnSSTableWriteTester.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
 create mode 100644 test/unit/org/apache/cassandra/schema/TableMetadataTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/RateUnitTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/RateValueTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/SizeUnitTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/SizeValueTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/TimeValueTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/units/UnitsTest.java
 create mode 100644 update-history/STAR-801/32-d187a01bfd STAR-409 Port guardrails from astra branch (#124)

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index ec15f1f40b6d..d25d7f0e6e02 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -1434,3 +1434,106 @@ enable_sasi_indexes: false
 # Enables creation of transiently replicated keyspaces on this node.
 # Transient replication is experimental and is not recommended for production use.
 enable_transient_replication: false
+
+# Apply database-as-a-service defaults.
+#
+# When enabled, some guardrails defaults are modified to values that are appropriate for cloud environments.
+# This includes (but is not limited to) stricter guardrails defaults.
+#
+# This can be used as a convenience to develop and test applications meant to run in a cloud environment.
+# apply_dbaas_defaults: false
+
+# Guardrails settings.
+# guardrails:
+  # When executing a scan, within or across a partition, we need to keep the
+  # tombstones seen in memory so we can return them to the coordinator, which
+  # will use them to make sure other replicas also know about the deleted rows.
+  # With workloads that generate a lot of tombstones, this can cause performance
+  # problems and even exhaust the server heap.
+  # (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets)
+  # Adjust the thresholds here if you understand the dangers and want to
+  # scan more tombstones anyway.  These thresholds may also be adjusted at runtime
+  # using the StorageService mbean.
+  #
+  # Default: tombstone_warn_threshold is 1000, may differ if apply_dbaas_defaults is enabled
+  # Default: tombstone_failure_threshold is 100000, may differ if apply_dbaas_defaults is enabled
+  # tombstone_warn_threshold: 1000
+  # tombstone_failure_threshold: 100000
+
+  # Failure threshold to prevent writing a large column value into Cassandra.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # column_value_size_failure_threshold_in_kb: -1
+
+  # Failure threshold to prevent creating more columns per table than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # columns_per_table_failure_threshold: -1
+
+  # Failure threshold to prevent creating more secondary indexes per table than threshold
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # secondary_index_per_table_failure_threshold: -1
+
+  # Failure threshold to prevent creating more materialized views per table than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # materialized_view_per_table_failure_threshold: -1
+
+  # Warning threshold to warn when creating more tables than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # tables_warn_threshold: -1
+
+  # Failure threshold to prevent creating more tables than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # tables_failure_threshold: -1
+
+  # Prevents creating tables with provided configurations.
+  # Default: all properties are allowed, may differ if apply_dbaas_defaults is enabled
+  # table_properties_disallowed:
+
+  # Whether to allow user-provided timestamps in write requests
+  # Default: true to allow user-provided timestamps, may differ if apply_dbaas_defaults is enabled
+  # user_timestamps_enabled: true
+
+  # Preventing a query with provided consistency levels
+  # Default: all consistency levels are allowed.
+  # write_consistency_levels_disallowed:
+
+  # Log a warning when compacting partitions larger than this value.
+  # Default: 100mb, may differ if apply_dbaas_defaults is enabled
+  # partition_size_warn_threshold_in_mb: 100
+
+  # Failure threshold to prevent IN query containing more partition keys than threshold
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # partition_keys_in_select_failure_threshold: -1
+
+  # Warning threshold to warn when local disk usage exceeds the threshold. Valid values: (1, 100]
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # disk_usage_percentage_warn_threshold: -1
+
+  # Failure threshold to reject write requests if replica disk usage exceeds the threshold. Valid values: (1, 100]
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # disk_usage_percentage_failure_threshold: -1
+
+  # Failure threshold to prevent IN queries whose cartesian product size exceeds the threshold, e.g.
+  # "a IN (1,2,...10) AND b IN (1,2...10)" results in cartesian product of 100.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # in_select_cartesian_product_failure_threshold: -1
+
+  # Whether to allow user-provided timestamps in write request (USING TIMESTAMP ...)
+  # Default: true to allow user-provided timestamp, may differ if apply_dbaas_defaults is enabled
+  # user_timestamps_enabled: true
+
+  # Whether read-before-write operation is allowed on lists, eg. setting list element by index, removing list element
+  # by index. Note: LWT is always allowed.
+  # Default: true to allow read before write operation on lists, may differ if apply_dbaas_defaults is enabled
+  # read_before_write_list_operations_enabled: true
+
+  # Failure threshold to prevent creating more fields in user-defined-type than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # fields_per_udt_failure_threshold: -1
+
+  # Warning threshold to warn when encountering collection data larger than the threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # collection_size_warn_threshold_in_kb: -1
+
+  # Warning threshold to warn when encountering more elements in collection than threshold.
+  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # items_per_collection_warn_threshold: -1
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 490627f58187..482f8905a3e7 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.audit.AuditLogOptions;
 import org.apache.cassandra.fql.FullQueryLoggerOptions;
 import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.guardrails.GuardrailsConfig;
 
 /**
  * A class that contains configuration properties for the cassandra node it runs within.
@@ -505,6 +506,9 @@ public class Config
      */
     public volatile int validation_preview_purge_head_start_in_sec = 60 * 60;
 
+    public boolean apply_dbaas_defaults = false;
+    public GuardrailsConfig guardrails = new GuardrailsConfig();
+
     /**
      * The intial capacity for creating RangeTombstoneList.
      */
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 90f4d48710fa..2e09f150af9d 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -56,6 +56,7 @@
 import org.apache.cassandra.db.commitlog.CommitLogSegmentManagerStandard;
 import org.apache.cassandra.dht.IPartitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.guardrails.GuardrailsConfig;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.util.DiskOptimizationStrategy;
 import org.apache.cassandra.io.util.FileUtils;
@@ -362,6 +363,14 @@ private static void applyAll() throws ConfigurationException
         applyEncryptionContext();
 
         applySslContext();
+
+        applyGuardrailsConfig();
+    }
+
+    private static void applyGuardrailsConfig()
+    {
+        conf.guardrails.applyConfig();
+        conf.guardrails.validate();
     }
 
     private static void applySimpleConfig()
@@ -3400,4 +3409,13 @@ public static void setSAIZeroCopyUsedThreshold(double threshold)
     {
         conf.sai_options.zerocopy_used_threshold = threshold;
     }
+    
+    public static GuardrailsConfig getGuardrailsConfig()
+    {
+        return conf.guardrails;
+    }
+    public static boolean isApplyDbaasDefaults()
+    {
+        return conf.apply_dbaas_defaults;
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/CQLStatement.java b/src/java/org/apache/cassandra/cql3/CQLStatement.java
index c34e27fb2eb4..efc6bcfd39b8 100644
--- a/src/java/org/apache/cassandra/cql3/CQLStatement.java
+++ b/src/java/org/apache/cassandra/cql3/CQLStatement.java
@@ -66,9 +66,9 @@ default Iterable<Function> getFunctions()
     /**
      * Perform additional validation required by the statment. To be overriden by subclasses if needed.
      *
-     * @param state the current client state
+     * @param state the current query state
      */
-    public void validate(ClientState state);
+    public void validate(QueryState state);
 
     /**
      * Execute the statement and return the resulting result or null if there is no result.
diff --git a/src/java/org/apache/cassandra/cql3/Lists.java b/src/java/org/apache/cassandra/cql3/Lists.java
index cd45095ea5c4..38029d1f33a2 100644
--- a/src/java/org/apache/cassandra/cql3/Lists.java
+++ b/src/java/org/apache/cassandra/cql3/Lists.java
@@ -29,6 +29,7 @@
 import java.util.stream.StreamSupport;
 
 import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.cassandra.cql3.functions.Function;
@@ -464,11 +465,14 @@ public void collectMarkerSpecification(VariableSpecifications boundNames)
             idx.collectMarkerSpecification(boundNames);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             // we should not get here for frozen lists
             assert column.type.isMultiCell() : "Attempted to set an individual element on a frozen list";
 
+            Guardrails.readBeforeWriteListOperationsEnabled.ensureEnabled("Setting of list items by index requiring read before write", params.state);
+
             ByteBuffer index = idx.bindAndGet(params.options);
             ByteBuffer value = t.bindAndGet(params.options);
 
@@ -500,6 +504,7 @@ public Appender(ColumnMetadata column, Term t)
             super(column, t);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to append to a frozen list";
@@ -509,26 +514,40 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I
 
         static void doAppend(Term.Terminal value, ColumnMetadata column, UpdateParameters params) throws InvalidRequestException
         {
-            if (column.type.isMultiCell())
+            if (value == null)
             {
+                // for frozen lists, we're overwriting the whole cell value
+                if (!column.type.isMultiCell())
+                    params.addTombstone(column);
+
                 // If we append null, do nothing. Note that for Setter, we've
                 // already removed the previous value so we're good here too
-                if (value == null)
-                    return;
+                return;
+            }
+
+            List<ByteBuffer> elements = ((Value) value).elements;
 
-                for (ByteBuffer buffer : ((Value) value).elements)
+            if (column.type.isMultiCell())
+            {
+                // Guardrails about collection size are only checked for the added elements, without considering
+                // already existing elements. This is done to avoid read-before-write; additional checks are
+                // performed during SSTable write.
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
+
+                int dataSize = 0;
+                for (ByteBuffer buffer : elements)
                 {
                     ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes());
-                    params.addCell(column, CellPath.create(uuid), buffer);
+                    Cell cell = params.addCell(column, CellPath.create(uuid), buffer);
+                    dataSize += cell.dataSize();
                 }
+                Guardrails.collectionSize.guard(dataSize, column.name.toString(), false, params.state);
             }
             else
             {
-                // for frozen lists, we're overwriting the whole cell value
-                if (value == null)
-                    params.addTombstone(column);
-                else
-                    params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
+                Cell cell = params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.collectionSize.guard(cell.dataSize(), column.name.toString(), false, params.state);
             }
         }
     }
@@ -550,10 +569,16 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I
             List<ByteBuffer> toAdd = ((Value) value).elements;
             final int totalCount = toAdd.size();
 
+            // Guardrails about collection size are only checked for the added elements, without considering
+            // already existing elements. This is done to avoid read-before-write; additional checks are
+            // performed during SSTable write.
+            Guardrails.itemsPerCollection.guard(totalCount, column.name.toString(), false, params.state);
+
             // we have to obey MAX_NANOS per batch - in the unlikely event a client has decided to prepend a list with
             // an insane number of entries.
             PrecisionTime pt = null;
             int remainingInBatch = 0;
+            int dataSize = 0;
             for (int i = totalCount - 1; i >= 0; i--)
             {
                 if (remainingInBatch == 0)
@@ -564,8 +589,10 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I
                 }
 
                 ByteBuffer uuid = ByteBuffer.wrap(UUIDGen.getTimeUUIDBytes(pt.millis, (pt.nanos + remainingInBatch--)));
-                params.addCell(column, CellPath.create(uuid), toAdd.get(i));
+                Cell cell = params.addCell(column, CellPath.create(uuid), toAdd.get(i));
+                dataSize += cell.dataSize();
             }
+            Guardrails.collectionSize.guard(dataSize, column.name.toString(), false, params.state);
         }
     }
 
@@ -582,10 +609,13 @@ public boolean requiresRead()
             return true;
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete from a frozen list";
 
+            Guardrails.readBeforeWriteListOperationsEnabled.ensureEnabled("Removal of list items requiring read before write", params.state);
+
             // We want to call bind before possibly returning to reject queries where the value provided is not a list.
             Term.Terminal value = t.bind(params.options);
 
@@ -620,9 +650,12 @@ public boolean requiresRead()
             return true;
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete an item by index from a frozen list";
+
+            Guardrails.readBeforeWriteListOperationsEnabled.ensureEnabled("Removal of list items by index requiring read before write", params.state);
             Term.Terminal index = t.bind(params.options);
             if (index == null)
                 throw new InvalidRequestException("Invalid null value for list index");
diff --git a/src/java/org/apache/cassandra/cql3/Maps.java b/src/java/org/apache/cassandra/cql3/Maps.java
index a4c213c98a3a..4eb073ccb84f 100644
--- a/src/java/org/apache/cassandra/cql3/Maps.java
+++ b/src/java/org/apache/cassandra/cql3/Maps.java
@@ -23,6 +23,7 @@
 import java.util.*;
 import java.util.stream.Collectors;
 
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.db.DecoratedKey;
@@ -366,6 +367,7 @@ public Setter(ColumnMetadata column, Term t)
             super(column, t);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             Term.Terminal value = t.bind(params.options);
@@ -396,6 +398,7 @@ public void collectMarkerSpecification(VariableSpecifications boundNames)
             k.collectMarkerSpecification(boundNames);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to set a value for a single key on a frozen map";
@@ -426,6 +429,7 @@ public Putter(ColumnMetadata column, Term t)
             super(column, t);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to add items to a frozen map";
@@ -436,22 +440,37 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I
 
         static void doPut(Term.Terminal value, ColumnMetadata column, UpdateParameters params) throws InvalidRequestException
         {
+            if (value == null)
+            {
+                // for frozen maps, we're overwriting the whole cell
+                if (!column.type.isMultiCell())
+                    params.addTombstone(column);
+
+                return;
+            }
+
+            Map<ByteBuffer, ByteBuffer> elements = ((Value) value).map;
+
             if (column.type.isMultiCell())
             {
-                if (value == null)
-                    return;
+                // Guardrails about collection size are only checked for the added elements, without considering
+                // already existing elements. This is done to avoid read-before-write; additional checks are
+                // performed during SSTable write.
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
 
-                Map<ByteBuffer, ByteBuffer> elements = ((Value) value).map;
+                int dataSize = 0;
                 for (Map.Entry<ByteBuffer, ByteBuffer> entry : elements.entrySet())
-                    params.addCell(column, CellPath.create(entry.getKey()), entry.getValue());
+                {
+                    Cell cell = params.addCell(column, CellPath.create(entry.getKey()), entry.getValue());
+                    dataSize += cell.dataSize();
+                }
+                Guardrails.collectionSize.guard(dataSize, column.name.toString(), false, params.state);
             }
             else
             {
-                // for frozen maps, we're overwriting the whole cell
-                if (value == null)
-                    params.addTombstone(column);
-                else
-                    params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
+                Cell cell = params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.collectionSize.guard(cell.dataSize(), column.name.toString(), false, params.state);
             }
         }
     }
@@ -463,6 +482,7 @@ public DiscarderByKey(ColumnMetadata column, Term k)
             super(column, k);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete a single key in a frozen map";
diff --git a/src/java/org/apache/cassandra/cql3/QueryProcessor.java b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
index 87829ab358ac..b7836b9aeca2 100644
--- a/src/java/org/apache/cassandra/cql3/QueryProcessor.java
+++ b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
@@ -207,7 +207,7 @@ public ResultMessage processStatement(CQLStatement statement, QueryState querySt
         logger.trace("Process {} @CL.{}", statement, options.getConsistency());
         ClientState clientState = queryState.getClientState();
         statement.authorize(clientState);
-        statement.validate(clientState);
+        statement.validate(queryState);
 
         ResultMessage result;
         if (options.getConsistency() == ConsistencyLevel.NODE_LOCAL)
@@ -310,7 +310,7 @@ public static Prepared prepareInternal(String query) throws RequestValidationExc
 
         // Note: if 2 threads prepare the same query, we'll live so don't bother synchronizing
         CQLStatement statement = parseStatement(query, internalQueryState().getClientState());
-        statement.validate(internalQueryState().getClientState());
+        statement.validate(internalQueryState());
 
         prepared = new Prepared(statement);
         internalStatements.put(query, prepared);
@@ -358,7 +358,7 @@ public static UntypedResultSet executeInternalWithPaging(String query, int pageS
             throw new IllegalArgumentException("Only SELECTs can be paged");
 
         SelectStatement select = (SelectStatement)prepared.statement;
-        QueryPager pager = select.getQuery(makeInternalOptions(prepared.statement, values), FBUtilities.nowInSeconds()).getPager(null, ProtocolVersion.CURRENT);
+        QueryPager pager = select.getQuery(QueryState.forInternalCalls(), makeInternalOptions(prepared.statement, values), FBUtilities.nowInSeconds()).getPager(null, ProtocolVersion.CURRENT);
         return UntypedResultSet.create(select, pager, pageSize);
     }
 
@@ -385,7 +385,7 @@ public static UntypedResultSet executeOnceInternalWithNowAndTimestamp(int nowInS
     private static UntypedResultSet executeOnceInternal(QueryState queryState, String query, Object... values)
     {
         CQLStatement statement = parseStatement(query, queryState.getClientState());
-        statement.validate(queryState.getClientState());
+        statement.validate(queryState);
         ResultMessage result = statement.executeLocally(queryState, makeInternalOptions(statement, values));
         if (result instanceof ResultMessage.Rows)
             return UntypedResultSet.create(((ResultMessage.Rows)result).result);
@@ -536,7 +536,7 @@ public ResultMessage processBatch(BatchStatement batch, QueryState queryState, B
         ClientState clientState = queryState.getClientState().cloneWithKeyspaceIfSet(options.getKeyspace());
         batch.authorize(clientState);
         batch.validate();
-        batch.validate(clientState);
+        batch.validate(queryState);
         return batch.execute(queryState, options, queryStartNanoTime);
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/Sets.java b/src/java/org/apache/cassandra/cql3/Sets.java
index e31841a583cc..f6b33e30902e 100644
--- a/src/java/org/apache/cassandra/cql3/Sets.java
+++ b/src/java/org/apache/cassandra/cql3/Sets.java
@@ -24,6 +24,7 @@
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
 
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.db.DecoratedKey;
@@ -338,6 +339,7 @@ public Adder(ColumnMetadata column, Term t)
             super(column, t);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to add items to a frozen set";
@@ -348,26 +350,40 @@ public void execute(DecoratedKey partitionKey, UpdateParameters params) throws I
 
         static void doAdd(Term.Terminal value, ColumnMetadata column, UpdateParameters params) throws InvalidRequestException
         {
+            if (value == null)
+            {
+                // for frozen sets, we're overwriting the whole cell
+                if (!column.type.isMultiCell())
+                    params.addTombstone(column);
+
+                return;
+            }
+
+            Set<ByteBuffer> elements = ((Value) value).elements;
+
             if (column.type.isMultiCell())
             {
-                if (value == null)
-                    return;
+                // Guardrails about collection size are only checked for the added elements, without considering
+                // already existing elements. This is done to avoid read-before-write; additional checks are
+                // performed during SSTable write.
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
 
-                for (ByteBuffer bb : ((Value) value).elements)
+                int dataSize = 0;
+                for (ByteBuffer bb : elements)
                 {
                     if (bb == ByteBufferUtil.UNSET_BYTE_BUFFER)
                         continue;
 
-                    params.addCell(column, CellPath.create(bb), ByteBufferUtil.EMPTY_BYTE_BUFFER);
+                    Cell cell = params.addCell(column, CellPath.create(bb), ByteBufferUtil.EMPTY_BYTE_BUFFER);
+                    dataSize += cell.dataSize();
                 }
+                Guardrails.collectionSize.guard(dataSize, column.name.toString(), false, params.state);
             }
             else
             {
-                // for frozen sets, we're overwriting the whole cell
-                if (value == null)
-                    params.addTombstone(column);
-                else
-                    params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.itemsPerCollection.guard(elements.size(), column.name.toString(), false, params.state);
+                Cell cell = params.addCell(column, value.get(ProtocolVersion.CURRENT));
+                Guardrails.collectionSize.guard(cell.dataSize(), column.name.toString(), false, params.state);
             }
         }
     }
@@ -380,6 +396,7 @@ public Discarder(ColumnMetadata column, Term t)
             super(column, t);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to remove items from a frozen set";
@@ -405,6 +422,7 @@ public ElementDiscarder(ColumnMetadata column, Term k)
             super(column, k);
         }
 
+        @Override
         public void execute(DecoratedKey partitionKey, UpdateParameters params) throws InvalidRequestException
         {
             assert column.type.isMultiCell() : "Attempted to delete a single element in a frozen set";
diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
index 427230734987..39689776c34a 100644
--- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java
+++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
@@ -20,6 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.Map;
 
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
@@ -27,6 +28,7 @@
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * Groups the parameters of an update query, and make building updates easier.
@@ -36,6 +38,7 @@ public class UpdateParameters
     public final TableMetadata metadata;
     public final RegularAndStaticColumns updatedColumns;
     public final QueryOptions options;
+    public final QueryState state;
 
     private final int nowInSec;
     private final long timestamp;
@@ -54,6 +57,7 @@ public class UpdateParameters
 
     public UpdateParameters(TableMetadata metadata,
                             RegularAndStaticColumns updatedColumns,
+                            QueryState state,
                             QueryOptions options,
                             long timestamp,
                             int nowInSec,
@@ -64,6 +68,7 @@ public UpdateParameters(TableMetadata metadata,
         this.metadata = metadata;
         this.updatedColumns = updatedColumns;
         this.options = options;
+        this.state = state;
 
         this.nowInSec = nowInSec;
         this.timestamp = timestamp;
@@ -138,20 +143,29 @@ public void addTombstone(ColumnMetadata column) throws InvalidRequestException
 
     public void addTombstone(ColumnMetadata column, CellPath path) throws InvalidRequestException
     {
+        if (path != null && column.type.isMultiCell())
+            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), state);
+
         builder.addCell(BufferCell.tombstone(column, timestamp, nowInSec, path));
     }
 
-    public void addCell(ColumnMetadata column, ByteBuffer value) throws InvalidRequestException
+    public Cell addCell(ColumnMetadata column, ByteBuffer value) throws InvalidRequestException
     {
-        addCell(column, null, value);
+        return addCell(column, null, value);
     }
 
-    public void addCell(ColumnMetadata column, CellPath path, ByteBuffer value) throws InvalidRequestException
+    public Cell addCell(ColumnMetadata column, CellPath path, ByteBuffer value) throws InvalidRequestException
     {
+        Guardrails.columnValueSize.guard(value.remaining(), column.name.toString(), state);
+
+        if (path != null && column.type.isMultiCell())
+            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), state);
+
         Cell<?> cell = ttl == LivenessInfo.NO_TTL
                        ? BufferCell.live(column, timestamp, value, path)
                        : BufferCell.expiring(column, timestamp, ttl, nowInSec, value, path);
         builder.addCell(cell);
+        return cell;
     }
 
     public void addCounter(ColumnMetadata column, long increment) throws InvalidRequestException
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
index d5d153e5e797..17b128523020 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
@@ -19,6 +19,7 @@
 
 import java.util.*;
 
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
@@ -50,7 +51,7 @@ private ClusteringColumnRestrictions(ClusteringComparator comparator,
         this.comparator = comparator;
     }
 
-    public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options) throws InvalidRequestException
+    public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options, QueryState queryState) throws InvalidRequestException
     {
         MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN());
         List<SingleRestriction> restrictions = restrictions();
@@ -61,6 +62,9 @@ public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options) thro
 
             if (builder.hasMissingElements())
                 break;
+
+            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
+                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", queryState);
         }
         return builder.build();
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java
index b1edf947fa75..85b038eacb28 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeyRestrictions.java
@@ -23,6 +23,7 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * A set of restrictions on the partition key.
@@ -32,7 +33,7 @@ interface PartitionKeyRestrictions extends Restrictions
 {
     public PartitionKeyRestrictions mergeWith(Restriction restriction);
 
-    public List<ByteBuffer> values(QueryOptions options);
+    public List<ByteBuffer> values(QueryOptions options, QueryState queryState);
 
     public List<ByteBuffer> bounds(Bound b, QueryOptions options);
 
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
index 2ced74127024..5fc76b9ddb19 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
@@ -20,6 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.statements.Bound;
@@ -28,6 +29,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * A set of single restrictions on the partition key.
@@ -71,7 +73,7 @@ public PartitionKeyRestrictions mergeWith(Restriction restriction)
     }
 
     @Override
-    public List<ByteBuffer> values(QueryOptions options)
+    public List<ByteBuffer> values(QueryOptions options, QueryState queryState)
     {
         MultiCBuilder builder = MultiCBuilder.create(comparator, hasIN());
         List<SingleRestriction> restrictions = restrictions();
@@ -82,6 +84,9 @@ public List<ByteBuffer> values(QueryOptions options)
 
             if (builder.hasMissingElements())
                 break;
+
+            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
+                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", queryState);
         }
         return builder.buildSerializedPartitionKeys();
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index 70ec0ddadcfe..41750521fdaa 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -741,11 +741,12 @@ public RowFilter getRowFilter(IndexRegistry indexManager, QueryOptions options)
      * Returns the partition keys for which the data is requested.
      *
      * @param options the query options
+     * @param queryState the query state
      * @return the partition keys for which the data is requested.
      */
-    public List<ByteBuffer> getPartitionKeys(final QueryOptions options)
+    public List<ByteBuffer> getPartitionKeys(final QueryOptions options, QueryState queryState)
     {
-        return partitionKeyRestrictions.values(options);
+        return partitionKeyRestrictions.values(options, queryState);
     }
 
     /**
@@ -863,9 +864,10 @@ public boolean hasClusteringColumnsRestrictions()
      * Returns the requested clustering columns.
      *
      * @param options the query options
+     * @param queryState the query state
      * @return the requested clustering columns
      */
-    public NavigableSet<Clustering<?>> getClusteringColumns(QueryOptions options)
+    public NavigableSet<Clustering<?>> getClusteringColumns(QueryOptions options, QueryState queryState)
     {
         // If this is a names command and the table is a static compact one, then as far as CQL is concerned we have
         // only a single row which internally correspond to the static parts. In which case we want to return an empty
@@ -873,7 +875,7 @@ public NavigableSet<Clustering<?>> getClusteringColumns(QueryOptions options)
         if (table.isStaticCompactTable())
             return BTreeSet.empty(table.comparator);
 
-        return clusteringColumnsRestrictions.valuesAsClustering(options);
+        return clusteringColumnsRestrictions.valuesAsClustering(options, queryState);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
index 84ec5e6b1c3b..96f15d2c9c10 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
@@ -165,9 +165,17 @@ public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
     }
 
     @Override
-    public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
+    public boolean isOnToken()
     {
-        return filter(restrictions.values(options), options);
+        // if all partition key columns have non-token restrictions and do not need filtering,
+        // we can simply use the token range to filter those restrictions and then ignore the token range
+        return needFiltering(tokenRestriction.metadata) || restrictions.size() < tokenRestriction.size();
+    }
+
+    @Override
+    public List<ByteBuffer> values(QueryOptions options, QueryState queryState) throws InvalidRequestException
+    {
+        return filter(restrictions.values(options, queryState), options, queryState);
     }
 
     @Override
@@ -184,13 +192,14 @@ public PartitionKeyRestrictions mergeWith(Restriction restriction) throws Invali
      *
      * @param values the values returned by the decorated restriction
      * @param options the query options
+     * @param queryState the query state
      * @return the values matching the token restriction
      * @throws InvalidRequestException if the request is invalid
      */
-    private List<ByteBuffer> filter(List<ByteBuffer> values, QueryOptions options) throws InvalidRequestException
+    private List<ByteBuffer> filter(List<ByteBuffer> values, QueryOptions options, QueryState queryState) throws InvalidRequestException
     {
         RangeSet<Token> rangeSet = tokenRestriction.hasSlice() ? toRangeSet(tokenRestriction, options)
-                                                               : toRangeSet(tokenRestriction.values(options));
+                                                               : toRangeSet(tokenRestriction.values(options, queryState));
 
         return filterWithRangeSet(rangeSet, values);
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
index 62c93236b567..f9838d48e9e5 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
@@ -32,6 +32,7 @@
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.index.IndexRegistry;
+import org.apache.cassandra.service.QueryState;
 
 import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
@@ -214,7 +215,10 @@ protected PartitionKeyRestrictions doMergeWith(TokenRestriction otherRestriction
         @Override
         public List<ByteBuffer> bounds(Bound b, QueryOptions options) throws InvalidRequestException
         {
-            return values(options);
+            // QueryState is used by the inSelectCartesianProduct guardrail to skip non-ordinary users.
+            // We pass null here to avoid threading QueryState through many more methods: an EQ token
+            // restriction cannot generate a large cartesian product.
+            return values(options, null);
         }
 
         @Override
@@ -230,7 +234,7 @@ public boolean isInclusive(Bound b)
         }
 
         @Override
-        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options, QueryState queryState) throws InvalidRequestException
         {
             return Collections.singletonList(value.bindAndGet(options));
         }
@@ -263,7 +267,7 @@ public boolean hasSlice()
         }
 
         @Override
-        public List<ByteBuffer> values(QueryOptions options) throws InvalidRequestException
+        public List<ByteBuffer> values(QueryOptions options, QueryState queryState) throws InvalidRequestException
         {
             throw new UnsupportedOperationException();
         }
diff --git a/src/java/org/apache/cassandra/cql3/statements/AlterRoleStatement.java b/src/java/org/apache/cassandra/cql3/statements/AlterRoleStatement.java
index 7a748e8aaf8b..d7b03d32192d 100644
--- a/src/java/org/apache/cassandra/cql3/statements/AlterRoleStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/AlterRoleStatement.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.cql3.RoleName;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -47,7 +48,8 @@ public AlterRoleStatement(RoleName name, RoleOptions opts, DCPermissions dcPermi
         this.dcPermissions = dcPermissions;
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
         opts.validate();
 
@@ -60,7 +62,7 @@ public void validate(ClientState state) throws RequestValidationException
             throw new InvalidRequestException("ALTER [ROLE|USER] can't be empty");
 
         // validate login here before authorize to avoid leaking user existence to anonymous users.
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
         if (!DatabaseDescriptor.getRoleManager().isExistingRole(role))
             throw new InvalidRequestException(String.format("%s doesn't exist", role.getRoleName()));
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index 80bd437779a4..34da575b562b 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
@@ -31,9 +31,6 @@
 
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
-import org.apache.cassandra.schema.TableId;
-import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.*;
@@ -41,6 +38,9 @@
 import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.metrics.BatchMetrics;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.*;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
@@ -49,7 +49,6 @@
 import org.apache.cassandra.utils.Pair;
 
 import static java.util.function.Predicate.isEqual;
-
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
 
 /**
@@ -252,7 +251,8 @@ private boolean isLogged()
 
     // The batch itself will be validated in either Parsed#prepare() - for regular CQL3 batches,
     //   or in QueryProcessor.processBatch() - for native protocol batches.
-    public void validate(ClientState state) throws InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws InvalidRequestException
     {
         for (ModificationStatement statement : statements)
             statement.validate(state);
@@ -264,7 +264,8 @@ public List<ModificationStatement> getStatements()
     }
 
     @VisibleForTesting
-    public List<? extends IMutation> getMutations(BatchQueryOptions options,
+    public List<? extends IMutation> getMutations(QueryState state,
+                                                  BatchQueryOptions options,
                                                   boolean local,
                                                   long batchTimestamp,
                                                   int nowInSeconds,
@@ -280,7 +281,7 @@ public List<? extends IMutation> getMutations(BatchQueryOptions options,
             ModificationStatement stmt = statements.get(i);
             if (metadata != null && !stmt.metadata.id.equals(metadata.id))
                 metadata = null;
-            List<ByteBuffer> stmtPartitionKeys = stmt.buildPartitionKeyNames(options.forStatement(i));
+            List<ByteBuffer> stmtPartitionKeys = stmt.buildPartitionKeyNames(options.forStatement(i), state);
             partitionKeys.add(stmtPartitionKeys);
             HashMultiset<ByteBuffer> perKeyCountsForTable = partitionCounts.computeIfAbsent(stmt.metadata.id, k -> HashMultiset.create());
             for (int stmtIdx = 0, stmtSize = stmtPartitionKeys.size(); stmtIdx < stmtSize; stmtIdx++)
@@ -305,7 +306,7 @@ public List<? extends IMutation> getMutations(BatchQueryOptions options,
             }
             QueryOptions statementOptions = options.forStatement(i);
             long timestamp = attrs.getTimestamp(batchTimestamp, statementOptions);
-            statement.addUpdates(collector, partitionKeys.get(i), statementOptions, local, timestamp, nowInSeconds, queryStartNanoTime);
+            statement.addUpdates(collector, partitionKeys.get(i), state, statementOptions, local, timestamp, nowInSeconds, queryStartNanoTime);
         }
 
         if (tablesWithZeroGcGs != null)
@@ -403,8 +404,17 @@ public ResultMessage execute(QueryState queryState, BatchQueryOptions options, l
         long timestamp = options.getTimestamp(queryState);
         int nowInSeconds = options.getNowInSeconds(queryState);
 
-        if (options.getConsistency() == null)
+        ConsistencyLevel cl = options.getConsistency();
+        if (cl == null)
             throw new InvalidRequestException("Invalid empty consistency level");
+
+        for (int i = 0; i < statements.size(); i++ )
+        {
+            ModificationStatement statement = statements.get(i);
+            statement.validateConsistency(cl, queryState);
+            statement.validateDiskUsage(queryState, options.forStatement(i));
+        }
+
         if (options.getSerialConsistency() == null)
             throw new InvalidRequestException("Invalid empty serial consistency level");
 
@@ -414,7 +424,7 @@ public ResultMessage execute(QueryState queryState, BatchQueryOptions options, l
         if (updatesVirtualTables)
             executeInternalWithoutCondition(queryState, options, queryStartNanoTime);
         else    
-            executeWithoutConditions(getMutations(options, false, timestamp, nowInSeconds, queryStartNanoTime), options.getConsistency(), queryStartNanoTime);
+            executeWithoutConditions(getMutations(queryState, options, false, timestamp, nowInSeconds, queryStartNanoTime), cl, queryStartNanoTime);
 
         return new ResultMessage.Void();
     }
@@ -459,7 +469,7 @@ private ResultMessage executeWithConditions(BatchQueryOptions options, QueryStat
                                                    casRequest,
                                                    options.getSerialConsistency(),
                                                    options.getConsistency(),
-                                                   state.getClientState(),
+                                                   state,
                                                    options.getNowInSeconds(state),
                                                    queryStartNanoTime))
         {
@@ -486,7 +496,7 @@ private Pair<CQL3CasRequest,Set<ColumnMetadata>> makeCasRequest(BatchQueryOption
             ModificationStatement statement = statements.get(i);
             QueryOptions statementOptions = options.forStatement(i);
             long timestamp = attrs.getTimestamp(batchTimestamp, statementOptions);
-            List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementOptions);
+            List<ByteBuffer> pks = statement.buildPartitionKeyNames(statementOptions, state);
             if (statement.getRestrictions().keyIsInRelation())
                 throw new IllegalArgumentException("Batch with conditions cannot span multiple partitions (you cannot use IN on the partition key)");
             if (key == null)
@@ -521,7 +531,7 @@ else if (!key.getKey().equals(pks.get(0)))
             }
             else
             {
-                Clustering<?> clustering = Iterables.getOnlyElement(statement.createClustering(statementOptions));
+                Clustering<?> clustering = Iterables.getOnlyElement(statement.createClustering(statementOptions, state));
                 if (statement.hasConditions())
                 {
                     statement.addConditions(clustering, casRequest, statementOptions);
@@ -559,7 +569,7 @@ private ResultMessage executeInternalWithoutCondition(QueryState queryState, Bat
         long timestamp = batchOptions.getTimestamp(queryState);
         int nowInSeconds = batchOptions.getNowInSeconds(queryState);
 
-        for (IMutation mutation : getMutations(batchOptions, true, timestamp, nowInSeconds, queryStartNanoTime))
+        for (IMutation mutation : getMutations(queryState, batchOptions, true, timestamp, nowInSeconds, queryStartNanoTime))
             mutation.apply();
         return null;
     }
@@ -576,7 +586,7 @@ private ResultMessage executeInternalWithConditions(BatchQueryOptions options, Q
         long timestamp = options.getTimestamp(state);
         int nowInSeconds = options.getNowInSeconds(state);
 
-        try (RowIterator result = ModificationStatement.casInternal(request, timestamp, nowInSeconds))
+        try (RowIterator result = ModificationStatement.casInternal(request, timestamp, nowInSeconds, state))
         {
             ResultSet resultSet =
                 ModificationStatement.buildCasResultSet(ksName,
diff --git a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
index 563a63907ff3..d29f7a6c6c5e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CQL3CasRequest.java
@@ -34,6 +34,7 @@
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.service.CASRequest;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.Pair;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -228,13 +229,14 @@ private RegularAndStaticColumns updatedColumns()
         return builder.build();
     }
 
-    public PartitionUpdate makeUpdates(FilteredPartition current) throws InvalidRequestException
+    @Override
+    public PartitionUpdate makeUpdates(FilteredPartition current, QueryState state) throws InvalidRequestException
     {
         PartitionUpdate.Builder updateBuilder = new PartitionUpdate.Builder(metadata, key, updatedColumns(), conditions.size());
         for (RowUpdate upd : updates)
-            upd.applyUpdates(current, updateBuilder);
+            upd.applyUpdates(current, updateBuilder, state);
         for (RangeDeletion upd : rangeDeletions)
-            upd.applyUpdates(current, updateBuilder);
+            upd.applyUpdates(current, updateBuilder, state);
 
         PartitionUpdate partitionUpdate = updateBuilder.build();
         IndexRegistry.obtain(metadata).validate(partitionUpdate);
@@ -265,12 +267,13 @@ private RowUpdate(Clustering<?> clustering, ModificationStatement stmt, QueryOpt
             this.nowInSeconds = nowInSeconds;
         }
 
-        void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuilder)
+        void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuilder, QueryState state)
         {
             Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null;
             UpdateParameters params =
                 new UpdateParameters(metadata,
                                      updateBuilder.columns(),
+                                     state,
                                      options,
                                      timestamp,
                                      nowInSeconds,
@@ -297,13 +300,14 @@ private RangeDeletion(Slice slice, ModificationStatement stmt, QueryOptions opti
             this.nowInSeconds = nowInSeconds;
         }
 
-        void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuilder)
+        void applyUpdates(FilteredPartition current, PartitionUpdate.Builder updateBuilder, QueryState state)
         {
             // No slice statements currently require a read, but this maintains consistency with RowUpdate, and future proofs us
             Map<DecoratedKey, Partition> map = stmt.requiresRead() ? Collections.singletonMap(key, current) : null;
             UpdateParameters params =
                 new UpdateParameters(metadata,
                                      updateBuilder.columns(),
+                                     state,
                                      options,
                                      timestamp,
                                      nowInSeconds,
diff --git a/src/java/org/apache/cassandra/cql3/statements/CreateRoleStatement.java b/src/java/org/apache/cassandra/cql3/statements/CreateRoleStatement.java
index 574d661d0e28..a15f0b127f10 100644
--- a/src/java/org/apache/cassandra/cql3/statements/CreateRoleStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/CreateRoleStatement.java
@@ -24,6 +24,7 @@
 import org.apache.cassandra.cql3.RoleName;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -53,7 +54,8 @@ public void authorize(ClientState state) throws UnauthorizedException
         }
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
         opts.validate();
 
@@ -66,7 +68,7 @@ public void validate(ClientState state) throws RequestValidationException
             throw new InvalidRequestException("Role name can't be an empty string");
 
         // validate login here before authorize to avoid leaking role existence to anonymous users.
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if (!ifNotExists && DatabaseDescriptor.getRoleManager().isExistingRole(role))
             throw new InvalidRequestException(String.format("%s already exists", role.getRoleName()));
diff --git a/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java b/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
index 16671dd38d9a..6916c0f525a2 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
@@ -114,7 +114,7 @@ public final void authorize(ClientState state)
     }
 
     @Override
-    public final void validate(ClientState state)
+    public final void validate(QueryState state)
     {
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/DropRoleStatement.java b/src/java/org/apache/cassandra/cql3/statements/DropRoleStatement.java
index 058ab01c04ba..e7c81c2671b4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DropRoleStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DropRoleStatement.java
@@ -24,6 +24,7 @@
 import org.apache.cassandra.cql3.RoleName;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -51,15 +52,16 @@ public void authorize(ClientState state) throws UnauthorizedException
             throw new UnauthorizedException("Only superusers can drop a role with superuser status");
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
         // validate login here before authorize to avoid leaking user existence to anonymous users.
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if (!ifExists && !DatabaseDescriptor.getRoleManager().isExistingRole(role))
             throw new InvalidRequestException(String.format("%s doesn't exist", role.getRoleName()));
 
-        AuthenticatedUser user = state.getUser();
+        AuthenticatedUser user = state.getClientState().getUser();
         if (user != null && user.getName().equals(role.getRoleName()))
             throw new InvalidRequestException("Cannot DROP primary role for current login");
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/ListPermissionsStatement.java b/src/java/org/apache/cassandra/cql3/statements/ListPermissionsStatement.java
index 4b5aa601e2ab..3a39fb0e7da9 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ListPermissionsStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ListPermissionsStatement.java
@@ -30,6 +30,7 @@
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -64,14 +65,15 @@ public ListPermissionsStatement(Set<Permission> permissions, IResource resource,
         this.grantee = grantee.hasName()? RoleResource.role(grantee.getName()) : null;
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
         // a check to ensure the existence of the user isn't being leaked by user existence check.
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if (resource != null)
         {
-            resource = maybeCorrectResource(resource, state);
+            resource = maybeCorrectResource(resource, state.getClientState());
             if (!resource.exists())
                 throw new InvalidRequestException(String.format("%s doesn't exist", resource));
         }
diff --git a/src/java/org/apache/cassandra/cql3/statements/ListRolesStatement.java b/src/java/org/apache/cassandra/cql3/statements/ListRolesStatement.java
index 8a75f8a6c36a..8bebe3425062 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ListRolesStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ListRolesStatement.java
@@ -35,6 +35,7 @@
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
@@ -67,9 +68,10 @@ public ListRolesStatement(RoleName grantee, boolean recursive)
         this.recursive = recursive;
     }
 
-    public void validate(ClientState state) throws UnauthorizedException, InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws UnauthorizedException, InvalidRequestException
     {
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if ((grantee != null) && !DatabaseDescriptor.getRoleManager().isExistingRole(grantee))
             throw new InvalidRequestException(String.format("%s doesn't exist", grantee));
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 7ec3a69fcedd..390cf40d67c7 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -49,9 +49,18 @@
 import org.apache.cassandra.db.rows.RowIterator;
 import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.guardrails.Guardrails;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.schema.ViewMetadata;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageProxy;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster;
 import org.apache.cassandra.service.paxos.Commit;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.triggers.TriggerExecutor;
@@ -250,7 +259,8 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho
             state.ensurePermission(Permission.EXECUTE, function);
     }
 
-    public void validate(ClientState state) throws InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws InvalidRequestException
     {
         checkFalse(hasConditions() && attrs.isTimestampSet(), "Cannot provide custom timestamp for conditional updates");
         checkFalse(isCounter() && attrs.isTimestampSet(), "Cannot provide custom timestamp for counter updates");
@@ -258,6 +268,10 @@ public void validate(ClientState state) throws InvalidRequestException
         checkFalse(isView(), "Cannot directly modify a materialized view");
         checkFalse(isVirtual() && attrs.isTimeToLiveSet(), "Expiring columns are not supported by virtual tables");
         checkFalse(isVirtual() && hasConditions(), "Conditional updates are not supported by virtual tables");
+
+        // there are system queries with USING TIMESTAMP, e.g. SchemaKeyspace#saveSystemKeyspacesSchema
+        if (SchemaConstants.isUserKeyspace(metadata.keyspace) && attrs.isTimestampSet())
+            Guardrails.userTimestampsEnabled.ensureEnabled(state);
     }
 
     public RegularAndStaticColumns updatedColumns()
@@ -315,23 +329,23 @@ public boolean hasIfExistCondition()
         return conditions.isIfExists();
     }
 
-    public List<ByteBuffer> buildPartitionKeyNames(QueryOptions options)
+    public List<ByteBuffer> buildPartitionKeyNames(QueryOptions options, QueryState queryState)
     throws InvalidRequestException
     {
-        List<ByteBuffer> partitionKeys = restrictions.getPartitionKeys(options);
+        List<ByteBuffer> partitionKeys = restrictions.getPartitionKeys(options, queryState);
         for (ByteBuffer key : partitionKeys)
             QueryProcessor.validateKey(key);
 
         return partitionKeys;
     }
 
-    public NavigableSet<Clustering<?>> createClustering(QueryOptions options)
+    public NavigableSet<Clustering<?>> createClustering(QueryOptions options, QueryState queryState)
     throws InvalidRequestException
     {
         if (appliesOnlyToStaticColumns() && !restrictions.hasClusteringColumnsRestrictions())
             return FBUtilities.singleton(CBuilder.STATIC_BUILDER.build(), metadata().comparator);
 
-        return restrictions.getClusteringColumns(options);
+        return restrictions.getClusteringColumns(options, queryState);
     }
 
     /**
@@ -343,6 +357,19 @@ private boolean appliesOnlyToStaticColumns()
         return appliesOnlyToStaticColumns(operations, conditions);
     }
 
+    public void validateDiskUsage(QueryState state, QueryOptions options)
+    {
+        // reject writes if any replica exceeds the disk usage failure limit, or warn if one exceeds the warn limit
+        if (Guardrails.replicaDiskUsage.enabled(state) && DiskUsageBroadcaster.instance.hasStuffedOrFullNode())
+        {
+            for (ByteBuffer keyValue : buildPartitionKeyNames(options, state))
+            {
+                for (InetAddressAndPort replica : StorageService.instance.getNaturalReplicasForToken(keyspace(), keyValue).endpointList())
+                    Guardrails.replicaDiskUsage.guard(replica, state);
+            }
+        }
+    }
+
     /**
      * Checks that the specified operations and conditions only apply to static columns.
      * @return <code>true</code> if the specified operations and conditions only apply to static columns,
@@ -450,23 +477,29 @@ private ResultMessage executeWithoutCondition(QueryState queryState, QueryOption
             return executeInternalWithoutCondition(queryState, options, queryStartNanoTime);
 
         ConsistencyLevel cl = options.getConsistency();
-        if (isCounter())
-            cl.validateCounterForWrite(metadata());
-        else
-            cl.validateForWrite();
+        validateConsistency(cl, queryState);
+        validateDiskUsage(queryState, options);
 
         List<? extends IMutation> mutations =
-            getMutations(options,
-                         false,
-                         options.getTimestamp(queryState),
-                         options.getNowInSeconds(queryState),
-                         queryStartNanoTime);
+            getMutations(queryState,
+            options,
+            false,
+            options.getTimestamp(queryState),
+            options.getNowInSeconds(queryState), queryStartNanoTime);
         if (!mutations.isEmpty())
             StorageProxy.mutateWithTriggers(mutations, cl, false, queryStartNanoTime);
 
         return null;
     }
 
+    public void validateConsistency(ConsistencyLevel cl, QueryState queryState)
+    {
+        if (isCounter())
+            cl.validateCounterForWrite(metadata(), queryState);
+        else
+            cl.validateForWrite(metadata.keyspace, queryState);
+    }
+
     private ResultMessage executeWithCondition(QueryState queryState, QueryOptions options, long queryStartNanoTime)
     {
         CQL3CasRequest request = makeCasRequest(queryState, options);
@@ -477,7 +510,7 @@ private ResultMessage executeWithCondition(QueryState queryState, QueryOptions o
                                                    request,
                                                    options.getSerialConsistency(),
                                                    options.getConsistency(),
-                                                   queryState.getClientState(),
+                                                   queryState,
                                                    options.getNowInSeconds(queryState),
                                                    queryStartNanoTime))
         {
@@ -487,7 +520,7 @@ private ResultMessage executeWithCondition(QueryState queryState, QueryOptions o
 
     private CQL3CasRequest makeCasRequest(QueryState queryState, QueryOptions options)
     {
-        List<ByteBuffer> keys = buildPartitionKeyNames(options);
+        List<ByteBuffer> keys = buildPartitionKeyNames(options, queryState);
         // We don't support IN for CAS operation so far
         checkFalse(restrictions.keyIsInRelation(),
                    "IN on the partition key is not supported with conditional %s",
@@ -501,7 +534,7 @@ private CQL3CasRequest makeCasRequest(QueryState queryState, QueryOptions option
                    "IN on the clustering key columns is not supported with conditional %s",
                     type.isUpdate()? "updates" : "deletions");
 
-        Clustering<?> clustering = Iterables.getOnlyElement(createClustering(options));
+        Clustering<?> clustering = Iterables.getOnlyElement(createClustering(options, queryState));
         CQL3CasRequest request = new CQL3CasRequest(metadata(), key, conditionColumns(), updatesRegularRows(), updatesStaticRow());
 
         addConditions(clustering, request, options);
@@ -619,7 +652,7 @@ public ResultMessage executeInternalWithoutCondition(QueryState queryState, Quer
     {
         long timestamp = options.getTimestamp(queryState);
         int nowInSeconds = options.getNowInSeconds(queryState);
-        for (IMutation mutation : getMutations(options, true, timestamp, nowInSeconds, queryStartNanoTime))
+        for (IMutation mutation : getMutations(queryState, options, true, timestamp, nowInSeconds, queryStartNanoTime))
             mutation.apply();
         return null;
     }
@@ -628,13 +661,13 @@ public ResultMessage executeInternalWithCondition(QueryState state, QueryOptions
     {
         CQL3CasRequest request = makeCasRequest(state, options);
 
-        try (RowIterator result = casInternal(request, options.getTimestamp(state), options.getNowInSeconds(state)))
+        try (RowIterator result = casInternal(request, options.getTimestamp(state), options.getNowInSeconds(state), state))
         {
             return new ResultMessage.Rows(buildCasResultSet(result, state, options));
         }
     }
 
-    static RowIterator casInternal(CQL3CasRequest request, long timestamp, int nowInSeconds)
+    static RowIterator casInternal(CQL3CasRequest request, long timestamp, int nowInSeconds, QueryState state)
     {
         UUID ballot = UUIDGen.getTimeUUIDFromMicros(timestamp);
 
@@ -649,7 +682,7 @@ static RowIterator casInternal(CQL3CasRequest request, long timestamp, int nowIn
         if (!request.appliesTo(current))
             return current.rowIterator();
 
-        PartitionUpdate updates = request.makeUpdates(current);
+        PartitionUpdate updates = request.makeUpdates(current, state);
         updates = TriggerExecutor.instance.execute(updates);
 
         Commit proposal = Commit.newProposal(ballot, updates);
@@ -660,27 +693,31 @@ static RowIterator casInternal(CQL3CasRequest request, long timestamp, int nowIn
     /**
      * Convert statement into a list of mutations to apply on the server
      *
+     * @param state the query state; handed on to {@code buildPartitionKeyNames} and
+     *              {@code addUpdates}
      * @param options value for prepared statement markers
      * @param local if true, any requests (for collections) performed by getMutation should be done locally only.
      * @param timestamp the current timestamp in microseconds to use if no timestamp is user provided.
      *
      * @return list of the mutations
      */
-    private List<? extends IMutation> getMutations(QueryOptions options,
-                                                         boolean local,
-                                                         long timestamp,
-                                                         int nowInSeconds,
-                                                         long queryStartNanoTime)
-    {
-        List<ByteBuffer> keys = buildPartitionKeyNames(options);
+    private List<? extends IMutation> getMutations(QueryState state,
+                                                   QueryOptions options,
+                                                   boolean local,
+                                                   long timestamp,
+                                                   int nowInSeconds,
+                                                   long queryStartNanoTime)
+    {
+        List<ByteBuffer> keys = buildPartitionKeyNames(options, state);
         HashMultiset<ByteBuffer> perPartitionKeyCounts = HashMultiset.create(keys);
         SingleTableUpdatesCollector collector = new SingleTableUpdatesCollector(metadata, updatedColumns, perPartitionKeyCounts);
-        addUpdates(collector, keys, options, local, timestamp, nowInSeconds, queryStartNanoTime);
+        addUpdates(collector, keys, state, options, local, timestamp, nowInSeconds, queryStartNanoTime);
         return collector.toMutations();
     }
 
     final void addUpdates(UpdatesCollector collector,
                           List<ByteBuffer> keys,
+                          QueryState state,
                           QueryOptions options,
                           boolean local,
                           long timestamp,
@@ -697,6 +734,7 @@ final void addUpdates(UpdatesCollector collector,
 
             UpdateParameters params = makeUpdateParameters(keys,
                                                            new ClusteringIndexSliceFilter(slices, false),
+                                                           state,
                                                            options,
                                                            DataLimits.NONE,
                                                            local,
@@ -716,13 +754,13 @@ final void addUpdates(UpdatesCollector collector,
         }
         else
         {
-            NavigableSet<Clustering<?>> clusterings = createClustering(options);
+            NavigableSet<Clustering<?>> clusterings = createClustering(options, state);
 
             // If some of the restrictions were unspecified (e.g. empty IN restrictions) we do not need to do anything.
             if (restrictions.hasClusteringColumnsRestrictions() && clusterings.isEmpty())
                 return;
 
-            UpdateParameters params = makeUpdateParameters(keys, clusterings, options, local, timestamp, nowInSeconds, queryStartNanoTime);
+            UpdateParameters params = makeUpdateParameters(keys, clusterings, state, options, local, timestamp, nowInSeconds, queryStartNanoTime);
 
             for (ByteBuffer key : keys)
             {
@@ -769,6 +807,7 @@ Slices createSlices(QueryOptions options)
 
     private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
                                                   NavigableSet<Clustering<?>> clusterings,
+                                                  QueryState state,
                                                   QueryOptions options,
                                                   boolean local,
                                                   long timestamp,
@@ -778,6 +817,7 @@ private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
         if (clusterings.contains(Clustering.STATIC_CLUSTERING))
             return makeUpdateParameters(keys,
                                         new ClusteringIndexSliceFilter(Slices.ALL, false),
+                                        state,
                                         options,
                                         DataLimits.cqlLimits(1),
                                         local,
@@ -787,6 +827,7 @@ private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
 
         return makeUpdateParameters(keys,
                                     new ClusteringIndexNamesFilter(clusterings, false),
+                                    state,
                                     options,
                                     DataLimits.NONE,
                                     local,
@@ -797,6 +838,7 @@ private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
 
     private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
                                                   ClusteringIndexFilter filter,
+                                                  QueryState state,
                                                   QueryOptions options,
                                                   DataLimits limits,
                                                   boolean local,
@@ -816,6 +858,7 @@ private UpdateParameters makeUpdateParameters(Collection<ByteBuffer> keys,
 
         return new UpdateParameters(metadata(),
                                     updatedColumns(),
+                                    state,
                                     options,
                                     getTimestamp(timestamp, options),
                                     nowInSeconds,
diff --git a/src/java/org/apache/cassandra/cql3/statements/PermissionsManagementStatement.java b/src/java/org/apache/cassandra/cql3/statements/PermissionsManagementStatement.java
index aa7e85ba7307..e29fa56f99e3 100644
--- a/src/java/org/apache/cassandra/cql3/statements/PermissionsManagementStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/PermissionsManagementStatement.java
@@ -27,6 +27,8 @@
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
+
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
 
@@ -43,17 +45,18 @@ protected PermissionsManagementStatement(Set<Permission> permissions, IResource
         this.grantee = RoleResource.role(grantee.getName());
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
         // validate login here before authorize to avoid leaking user existence to anonymous users.
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if (!DatabaseDescriptor.getRoleManager().isExistingRole(grantee))
             throw new InvalidRequestException(String.format("Role %s doesn't exist", grantee.getRoleName()));
 
         // if a keyspace is omitted when GRANT/REVOKE ON TABLE <table>, we need to correct the resource.
         // called both here and in authorize(), as in some cases we do not call the latter.
-        resource = maybeCorrectResource(resource, state);
+        resource = maybeCorrectResource(resource, state.getClientState());
 
         // altering permissions on builtin functions is not supported
         if (resource instanceof FunctionResource
diff --git a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
index c474949d7a19..d1d8acd6feed 100644
--- a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
+++ b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
@@ -143,4 +143,12 @@ public static Integer toInt(String key, String value, Integer defaultValue) thro
             }
         }
     }
+
+    /**
+     * Returns the names of all the properties that are updated by this object, i.e. the key set of {@code properties}.
+     */
+    public Set<String> updatedProperties()
+    {
+        return properties.keySet();
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/RoleManagementStatement.java b/src/java/org/apache/cassandra/cql3/statements/RoleManagementStatement.java
index a5274dd73834..1d8560f3bb44 100644
--- a/src/java/org/apache/cassandra/cql3/statements/RoleManagementStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/RoleManagementStatement.java
@@ -25,6 +25,8 @@
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.UnauthorizedException;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
+
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
 
@@ -44,9 +46,10 @@ public void authorize(ClientState state) throws UnauthorizedException
         super.checkPermission(state, Permission.AUTHORIZE, role);
     }
 
-    public void validate(ClientState state) throws RequestValidationException
+    @Override
+    public void validate(QueryState state) throws RequestValidationException
     {
-        state.ensureNotAnonymous();
+        state.getClientState().ensureNotAnonymous();
 
         if (!DatabaseDescriptor.getRoleManager().isExistingRole(role))
             throw new InvalidRequestException(String.format("%s doesn't exist", role.getRoleName()));
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index 6cb022b98c98..5a344eb42eb2 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -29,8 +29,10 @@
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.cql3.*;
@@ -222,17 +224,25 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho
             state.ensurePermission(Permission.EXECUTE, function);
     }
 
-    public void validate(ClientState state) throws InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws InvalidRequestException
     {
         // Nothing to do, all validation has been done by RawStatement.prepare()
     }
 
+    private void validateQueryOptions(QueryOptions options)
+    {
+        if (SchemaConstants.isUserKeyspace(table.keyspace))
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(options.getConsistency()); // NOTE(review): write-consistency guardrail on the SELECT path, and no QueryState passed - confirm both are intended
+    }
+
     public ResultMessage.Rows execute(QueryState state, QueryOptions options, long queryStartNanoTime)
     {
         ConsistencyLevel cl = options.getConsistency();
         checkNotNull(cl, "Invalid empty consistency level");
 
         cl.validateForRead();
+        validateQueryOptions(options);
 
         int nowInSec = options.getNowInSeconds(state);
         int userLimit = getLimit(options);
@@ -240,7 +250,7 @@ public ResultMessage.Rows execute(QueryState state, QueryOptions options, long q
         int pageSize = options.getPageSize();
 
         Selectors selectors = selection.newSelectors(options);
-        ReadQuery query = getQuery(options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
+        ReadQuery query = getQuery(state, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
 
         if (aggregationSpec == null && (pageSize <= 0 || (query.limits().count() <= pageSize)))
             return execute(query, options, state, selectors, nowInSec, userLimit, queryStartNanoTime);
@@ -256,18 +266,19 @@ public ResultMessage.Rows execute(QueryState state, QueryOptions options, long q
                        queryStartNanoTime);
     }
 
-    public ReadQuery getQuery(QueryOptions options, int nowInSec) throws RequestValidationException
+    public ReadQuery getQuery(QueryState state, QueryOptions options, int nowInSec) throws RequestValidationException
     {
         Selectors selectors = selection.newSelectors(options);
-        return getQuery(options,
-                        selectors.getColumnFilter(),
-                        nowInSec,
-                        getLimit(options),
-                        getPerPartitionLimit(options),
-                        options.getPageSize());
+        return getQuery(state,
+                        options,
+                        selectors.getColumnFilter(),
+                        nowInSec,
+                        getLimit(options), getPerPartitionLimit(options),
+                        options.getPageSize());
     }
 
-    public ReadQuery getQuery(QueryOptions options,
+    public ReadQuery getQuery(QueryState queryState,
+                              QueryOptions options,
                               ColumnFilter columnFilter,
                               int nowInSec,
                               int userLimit,
@@ -279,9 +290,9 @@ public ReadQuery getQuery(QueryOptions options,
         DataLimits limit = getDataLimits(userLimit, perPartitionLimit, pageSize);
 
         if (isPartitionRangeQuery)
-            return getRangeCommand(options, columnFilter, limit, nowInSec);
+            return getRangeCommand(options, columnFilter, limit, nowInSec, queryState);
 
-        return getSliceCommands(options, columnFilter, limit, nowInSec);
+        return getSliceCommands(queryState, options, columnFilter, limit, nowInSec);
     }
 
     private ResultMessage.Rows execute(ReadQuery query,
@@ -438,7 +449,7 @@ public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options
         int pageSize = options.getPageSize();
 
         Selectors selectors = selection.newSelectors(options);
-        ReadQuery query = getQuery(options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
+        ReadQuery query = getQuery(state, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
 
         try (ReadExecutionController executionController = query.executionController())
         {
@@ -505,13 +516,15 @@ public StatementRestrictions getRestrictions()
         return restrictions;
     }
 
-    private ReadQuery getSliceCommands(QueryOptions options, ColumnFilter columnFilter, DataLimits limit, int nowInSec)
+    private ReadQuery getSliceCommands(QueryState queryState, QueryOptions options, ColumnFilter columnFilter, DataLimits limit, int nowInSec)
     {
-        Collection<ByteBuffer> keys = restrictions.getPartitionKeys(options);
+        Collection<ByteBuffer> keys = restrictions.getPartitionKeys(options, queryState);
         if (keys.isEmpty())
             return ReadQuery.empty(table);
 
-        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter);
+        Guardrails.partitionKeysInSelectQuery.guard(keys.size(), "Select query", queryState);
+
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter, queryState);
         if (filter == null || filter.isEmpty(table.comparator))
             return ReadQuery.empty(table);
 
@@ -537,9 +550,10 @@ private ReadQuery getSliceCommands(QueryOptions options, ColumnFilter columnFilt
      */
     public Slices clusteringIndexFilterAsSlices()
     {
+        QueryState state = QueryState.forInternalCalls();
         QueryOptions options = QueryOptions.forInternalCalls(Collections.emptyList());
         ColumnFilter columnFilter = selection.newSelectors(options).getColumnFilter();
-        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter);
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter, state);
         if (filter instanceof ClusteringIndexSliceFilter)
             return ((ClusteringIndexSliceFilter)filter).requestedSlices();
 
@@ -555,9 +569,10 @@ public Slices clusteringIndexFilterAsSlices()
      */
     public SinglePartitionReadCommand internalReadForView(DecoratedKey key, int nowInSec)
     {
+        QueryState state = QueryState.forInternalCalls();
         QueryOptions options = QueryOptions.forInternalCalls(Collections.emptyList());
         ColumnFilter columnFilter = selection.newSelectors(options).getColumnFilter();
-        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter);
+        ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter, state);
         RowFilter rowFilter = getRowFilter(options);
         return SinglePartitionReadCommand.create(table, nowInSec, columnFilter, rowFilter, DataLimits.NONE, key, filter);
     }
@@ -570,9 +585,9 @@ public RowFilter rowFilterForInternalCalls()
         return getRowFilter(QueryOptions.forInternalCalls(Collections.emptyList()));
     }
 
-    private ReadQuery getRangeCommand(QueryOptions options, ColumnFilter columnFilter, DataLimits limit, int nowInSec)
+    private ReadQuery getRangeCommand(QueryOptions options, ColumnFilter columnFilter, DataLimits limit, int nowInSec, QueryState queryState)
     {
-        ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, columnFilter);
+        ClusteringIndexFilter clusteringIndexFilter = makeClusteringIndexFilter(options, columnFilter, queryState);
         if (clusteringIndexFilter == null)
             return ReadQuery.empty(table);
 
@@ -593,7 +608,7 @@ private ReadQuery getRangeCommand(QueryOptions options, ColumnFilter columnFilte
         return command;
     }
 
-    private ClusteringIndexFilter makeClusteringIndexFilter(QueryOptions options, ColumnFilter columnFilter)
+    private ClusteringIndexFilter makeClusteringIndexFilter(QueryOptions options, ColumnFilter columnFilter, QueryState queryState)
     {
         if (parameters.isDistinct)
         {
@@ -616,7 +631,7 @@ private ClusteringIndexFilter makeClusteringIndexFilter(QueryOptions options, Co
             return new ClusteringIndexSliceFilter(slices, isReversed);
         }
 
-        NavigableSet<Clustering<?>> clusterings = getRequestedRows(options);
+        NavigableSet<Clustering<?>> clusterings = getRequestedRows(options, queryState);
         // We can have no clusterings if either we're only selecting the static columns, or if we have
         // a 'IN ()' for clusterings. In that case, we still want to query if some static columns are
         // queried. But we're fine otherwise.
@@ -748,12 +763,12 @@ private int getLimit(Term limit, QueryOptions options)
         return userLimit;
     }
 
-    private NavigableSet<Clustering<?>> getRequestedRows(QueryOptions options) throws InvalidRequestException
+    private NavigableSet<Clustering<?>> getRequestedRows(QueryOptions options, QueryState queryState) throws InvalidRequestException
     {
         // Note: getRequestedColumns don't handle static columns, but due to CASSANDRA-5762
         // we always do a slice for CQL3 tables, so it's ok to ignore them here
         assert !restrictions.isColumnRange();
-        return restrictions.getClusteringColumns(options);
+        return restrictions.getClusteringColumns(options, queryState);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
index 206d11697f95..f6c4864ec0df 100644
--- a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
@@ -52,7 +52,8 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho
         state.ensureTablePermission(keyspace(), name(), Permission.MODIFY);
     }
 
-    public void validate(ClientState state) throws InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws InvalidRequestException
     {
         Schema.instance.validateTable(keyspace(), name());
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/UseStatement.java b/src/java/org/apache/cassandra/cql3/statements/UseStatement.java
index 3013d9f997e8..8220ee2f5a7c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UseStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UseStatement.java
@@ -48,7 +48,8 @@ public void authorize(ClientState state) throws UnauthorizedException
         state.validateLogin();
     }
 
-    public void validate(ClientState state) throws InvalidRequestException
+    @Override
+    public void validate(QueryState state) throws InvalidRequestException
     {
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java
index 161c9c4a93ed..2a254f213494 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterSchemaStatement.java
@@ -29,7 +29,6 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
-import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
@@ -44,7 +43,8 @@ protected AlterSchemaStatement(String keyspaceName)
         this.keyspaceName = keyspaceName;
     }
 
-    public final void validate(ClientState state)
+    @Override
+    public void validate(QueryState state)
     {
         // no-op; validation is performed while executing the statement, in apply()
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
index 5e3bfa22a5e0..5312c63fdcc5 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
@@ -35,6 +35,7 @@
 
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
+import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.auth.Permission;
 
 import org.apache.cassandra.cql3.CQL3Type;
@@ -53,6 +54,8 @@
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.Keyspaces;
+import org.apache.cassandra.guardrails.Guardrails;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableParams;
@@ -60,11 +63,13 @@
 import org.apache.cassandra.schema.Views;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.reads.repair.ReadRepairStrategy;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
 import org.apache.cassandra.utils.NoSpamLogger;
+import org.apache.cassandra.transport.messages.ResultMessage;
 
 import static java.lang.String.format;
 import static java.lang.String.join;
@@ -101,6 +106,11 @@ public Keyspaces apply(Keyspaces schema) throws UnknownHostException
         return schema.withAddedOrUpdated(apply(keyspace, table));
     }
 
+    public ResultMessage execute(QueryState state, boolean locally)
+    {
+        return super.execute(state, locally); // NOTE(review): pure delegation to super and no @Override - confirm this override is needed
+    }
+
     SchemaChange schemaChangeEvent(KeyspacesDiff diff)
     {
         return new SchemaChange(Change.UPDATED, Target.TABLE, keyspaceName, tableName);
@@ -160,9 +170,20 @@ private static class Column
                 this.type = type;
                 this.isStatic = isStatic;
             }
+
         }
 
         private final Collection<Column> newColumns;
+        private QueryState queryState;
+
+        @Override
+        public void validate(QueryState state)
+        {
+            super.validate(state);
+
+            // keep the query state for the columns-per-table guardrail in #apply, which is not handed a QueryState
+            this.queryState = state; // NOTE(review): presumably still unset if #apply runs without a prior validate - confirm
+        }
 
         private AddColumns(String keyspaceName, String tableName, Collection<Column> newColumns)
         {
@@ -178,6 +199,8 @@ public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table)
             TableMetadata tableMetadata = tableBuilder.build();
             tableMetadata.validate();
 
+            Guardrails.columnsPerTable.guard(tableBuilder.numColumns(), tableName, queryState);
+
             return keyspace.withSwapped(keyspace.tables.withSwapped(tableMetadata))
                            .withSwapped(viewsBuilder.build());
         }
@@ -388,6 +411,14 @@ private AlterOptions(String keyspaceName, String tableName, TableAttributes attr
             this.attrs = attrs;
         }
 
+        @Override
+        public void validate(QueryState state)
+        {
+            super.validate(state);
+
+            Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+        }
+
         public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table)
         {
             attrs.validate();
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
index a9887c499b3a..2628b654ecc1 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
@@ -29,10 +29,12 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.Keyspaces;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
@@ -97,6 +99,7 @@ private static final class AddField extends AlterTypeStatement
     {
         private final FieldIdentifier fieldName;
         private final CQL3Type.Raw type;
+        private QueryState state;
 
         private AddField(String keyspaceName, String typeName, FieldIdentifier fieldName, CQL3Type.Raw type)
         {
@@ -105,6 +108,15 @@ private AddField(String keyspaceName, String typeName, FieldIdentifier fieldName
             this.type = type;
         }
 
+        @Override
+        public void validate(QueryState state)
+        {
+            super.validate(state);
+
+            // keep the query state for the fields-per-UDT guardrail in #apply, which is not handed a QueryState
+            this.state = state;
+        }
+
         UserType apply(KeyspaceMetadata keyspace, UserType userType)
         {
             if (userType.fieldPosition(fieldName) >= 0)
@@ -125,6 +137,9 @@ UserType apply(KeyspaceMetadata keyspace, UserType userType)
             List<FieldIdentifier> fieldNames = new ArrayList<>(userType.fieldNames()); fieldNames.add(fieldName);
             List<AbstractType<?>> fieldTypes = new ArrayList<>(userType.fieldTypes()); fieldTypes.add(fieldType);
 
+            int newSize = userType.size() + 1;
+            Guardrails.fieldsPerUDT.guard(newSize, userType.getNameAsString(), state);
+
             return new UserType(keyspaceName, userType.name, fieldNames, fieldTypes, true);
         }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
index 3eba21561a48..8632739ad2f4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
@@ -22,9 +22,11 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.cql3.CQLStatement;
 import org.apache.cassandra.cql3.QualifiedName;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
@@ -33,6 +35,7 @@ public final class AlterViewStatement extends AlterSchemaStatement
 {
     private final String viewName;
     private final TableAttributes attrs;
+    private QueryState state;
 
     public AlterViewStatement(String keyspaceName, String viewName, TableAttributes attrs)
     {
@@ -41,6 +44,14 @@ public AlterViewStatement(String keyspaceName, String viewName, TableAttributes
         this.attrs = attrs;
     }
 
+    @Override
+    public void validate(QueryState state)
+    {
+        super.validate(state);
+        // keep the query state: #apply(Keyspaces) is not handed one but needs it for the guardrail check
+        this.state = state;
+    }
+
     public Keyspaces apply(Keyspaces schema)
     {
         KeyspaceMetadata keyspace = schema.getNullable(keyspaceName);
@@ -54,6 +65,8 @@ public Keyspaces apply(Keyspaces schema)
 
         attrs.validate();
 
+        Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+
         TableParams params = attrs.asAlteredTableParams(view.metadata.params);
 
         if (params.gcGraceSeconds == 0)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
index 52ea9f9bb00f..78bc194481d5 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
@@ -20,6 +20,7 @@
 import java.util.*;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 
@@ -34,10 +35,12 @@
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.index.sasi.SASIIndex;
 import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
@@ -52,6 +55,7 @@ public final class CreateIndexStatement extends AlterSchemaStatement
     private final List<IndexTarget.Raw> rawIndexTargets;
     private final IndexAttributes attrs;
     private final boolean ifNotExists;
+    private QueryState state;
 
     private static final String DSE_INDEX_WARNING = "Index %s was not created. DSE custom index (%s) is not " +
                                                     "supported. Consult the docs on alternatives (SAI indexes, " +
@@ -81,6 +85,15 @@ public CreateIndexStatement(String keyspaceName,
         this.ifNotExists = ifNotExists;
     }
 
+    @Override
+    public void validate(QueryState state)
+    {
+        super.validate(state);
+        // Keep the query state: #apply only receives the schema but needs the state for guardrails validation.
+        // NOTE(review): per-execution data in an instance field; confirm instances are not shared across queries.
+        this.state = state;
+    }
+
     public Keyspaces apply(Keyspaces schema)
     {
         if (isDseIndexCreateStatement())
@@ -120,6 +133,17 @@ public Keyspaces apply(Keyspaces schema)
         if (Keyspace.open(table.keyspace).getReplicationStrategy().hasTransientReplicas())
             throw new InvalidRequestException("Secondary indexes are not supported on transiently replicated keyspaces");
 
+        // guardrails to limit number of secondary indexes per table.
+        if (!attrs.isCustom)
+        {
+            long existingSecondaryIndexes = table.indexes.stream().filter(indexMetadata -> !indexMetadata.isCustom()).count();
+            Guardrails.secondaryIndexesPerTable.guard(existingSecondaryIndexes + 1,
+                                                      Strings.isNullOrEmpty(indexName)
+                                                      ? String.format("on table %s", table.name)
+                                                      : String.format("%s on table %s", indexName, table.name),
+                                                      state);
+        }
+
         List<IndexTarget> indexTargets = Lists.newArrayList(transform(rawIndexTargets, t -> t.prepare(table)));
 
         if (indexTargets.isEmpty() && !attrs.isCustom)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index fd65d3c42eb7..689d9a2d924a 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -33,19 +33,21 @@
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.*;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.AlreadyExistsException;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.reads.repair.ReadRepairStrategy;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
 
-import static java.util.Comparator.comparing;
-
 import static com.google.common.collect.Iterables.concat;
+import static java.util.Comparator.comparing;
 
 public final class CreateTableStatement extends AlterSchemaStatement
 {
@@ -92,6 +94,32 @@ public CreateTableStatement(String keyspaceName,
         this.useCompactStorage = useCompactStorage;
     }
 
+    @Override
+    public void validate(QueryState state)
+    {
+        super.validate(state);
+
+        // Some tools also use CreateTableStatement; the guardrails below make little sense for tools and
+        // require the server to be initialized, so stop here when it is not.
+        if (!Guardrails.ready())
+            return;
+
+        // Guardrail on table properties
+        Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+
+        // Guardrail on columns per table
+        Guardrails.columnsPerTable.guard(rawColumns.size(), tableName, state);
+
+        // Guardrail on number of tables: counting opens every non-internal keyspace, so only count when it applies
+        if (!Guardrails.tablesLimit.enabled(state))
+            return;
+
+        int userTableCount = Schema.instance.getNonInternalKeyspaces().stream()
+                                   .mapToInt(name -> Keyspace.open(name).getColumnFamilyStores().size())
+                                   .sum();
+        Guardrails.tablesLimit.guard(userTableCount + 1, tableName, state);
+    }
+
     public Keyspaces apply(Keyspaces schema)
     {
         KeyspaceMetadata keyspace = schema.getNullable(keyspaceName);
@@ -435,11 +463,16 @@ public String defaultCompactValueName()
     }
 
     public static TableMetadata.Builder parse(String cql, String keyspace)
+    {
+        return parse(cql, keyspace, Types.none());
+    }
+
+    public static TableMetadata.Builder parse(String cql, String keyspace, Types types)
     {
         return CQLFragmentParser.parseAny(CqlParser::createTableStatement, cql, "CREATE TABLE")
-                                .keyspace(keyspace)
-                                .prepare(null) // works around a messy ClientState/QueryProcessor class init deadlock
-                                .builder(Types.none());
+                         .keyspace(keyspace)
+                         .prepare(null) // works around a messy ClientState/QueryProcessor class init deadlock
+                         .builder(types);
     }
 
     public final static class Raw extends CQLStatement.Raw
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
index 7c1717e24c2e..f99a4d4ffef3 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
@@ -23,16 +23,20 @@
 import org.apache.cassandra.audit.AuditLogEntryType;
 import org.apache.cassandra.auth.Permission;
 import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.CQLFragmentParser;
 import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.CqlParser;
 import org.apache.cassandra.cql3.FieldIdentifier;
 import org.apache.cassandra.cql3.UTName;
 import org.apache.cassandra.db.marshal.AbstractType;
 import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.Keyspaces;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.schema.Types;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
@@ -61,6 +65,14 @@ public CreateTypeStatement(String keyspaceName,
         this.ifNotExists = ifNotExists;
     }
 
+    @Override
+    public void validate(QueryState state)
+    {
+        super.validate(state);
+        // Guardrail on the number of fields declared for the new user type
+        Guardrails.fieldsPerUDT.guard(fieldNames.size(), typeName, state);
+    }
+
     public Keyspaces apply(Keyspaces schema)
     {
         KeyspaceMetadata keyspace = schema.getNullable(keyspaceName);
@@ -120,6 +132,35 @@ public String toString()
         return String.format("%s (%s, %s)", getClass().getSimpleName(), keyspaceName, typeName);
     }
 
+    public static UserType parse(String cql, String keyspace)
+    {
+        return parse(cql, keyspace, Types.none());
+    }
+
+    public static UserType parse(String cql, String keyspace, Types userTypes)
+    {
+        CreateTypeStatement.Raw raw = CQLFragmentParser.parseAny(CqlParser::createTypeStatement, cql, "CREATE TYPE");
+        // preparing with a null state works around a messy ClientState/QueryProcessor class init deadlock
+        CreateTypeStatement statement = raw.keyspace(keyspace).prepare(null);
+        return statement.createType(userTypes);
+    }
+
+    /**
+     * Creates the {@link UserType} defined by this statement.
+     *
+     * @param existingTypes the user types already defined in the keyspace the new type belongs to; the fields of
+     *                      the new type may refer to them.
+     * @return the newly built type.
+     */
+    public UserType createType(Types existingTypes)
+    {
+        List<AbstractType<?>> preparedTypes =
+            rawFieldTypes.stream()
+                         .map(rawType -> rawType.prepare(keyspaceName, existingTypes).getType())
+                         .collect(toList());
+        return new UserType(keyspaceName, bytes(typeName), fieldNames, preparedTypes, true);
+    }
+
     public static final class Raw extends CQLStatement.Raw
     {
         private final UTName name;
@@ -134,6 +175,12 @@ public Raw(UTName name, boolean ifNotExists)
             this.ifNotExists = ifNotExists;
         }
 
+        public Raw keyspace(String keyspace)
+        {
+            name.setKeyspace(keyspace);
+            return this;
+        }
+
         public CreateTypeStatement prepare(ClientState state)
         {
             String keyspaceName = name.hasKeyspace() ? name.getKeyspace() : state.getKeyspace();
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
index 7e51eb2ad818..3a5691c195b9 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
@@ -18,6 +18,8 @@
 package org.apache.cassandra.cql3.statements.schema;
 
 import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
 
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
@@ -36,9 +38,11 @@
 import org.apache.cassandra.db.view.View;
 import org.apache.cassandra.exceptions.AlreadyExistsException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Event.SchemaChange;
 import org.apache.cassandra.transport.Event.SchemaChange.Change;
 import org.apache.cassandra.transport.Event.SchemaChange.Target;
@@ -64,6 +68,7 @@ public final class CreateViewStatement extends AlterSchemaStatement
     private final TableAttributes attrs;
 
     private final boolean ifNotExists;
+    private QueryState state;
 
     public CreateViewStatement(String keyspaceName,
                                String tableName,
@@ -96,6 +101,15 @@ public CreateViewStatement(String keyspaceName,
         this.ifNotExists = ifNotExists;
     }
 
+    @Override
+    public void validate(QueryState state)
+    {
+        super.validate(state);
+        // Keep the query state: #apply only receives the schema but needs the state for guardrails validation.
+        // NOTE(review): per-execution data in an instance field; confirm instances are not shared across queries.
+        this.state = state;
+    }
+
     public Keyspaces apply(Keyspaces schema)
     {
         if (!DatabaseDescriptor.getEnableMaterializedViews())
@@ -137,6 +151,16 @@ public Keyspaces apply(Keyspaces schema)
         if (table.isView())
             throw ire("Materialized views cannot be created against other materialized views");
 
+        // Guardrail on table properties
+        Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+
+        // guardrails to limit number of mvs per table.
+        Set<ViewMetadata> baseTableViews = StreamSupport.stream(keyspace.views.forTable(table.id).spliterator(), false)
+                                                        .collect(Collectors.toCollection(HashSet::new));
+        Guardrails.materializedViewsPerTable.guard(baseTableViews.size() + 1,
+                                                   String.format("%s on table %s", viewName, table.name),
+                                                   state);
+
         if (table.params.gcGraceSeconds == 0)
         {
             throw ire("Cannot create materialized view '%s' for base table " +
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 85fe0fcd499b..7faac95d54e4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -40,7 +40,7 @@
 public final class TableAttributes extends PropertyDefinitions
 {
     public static final String ID = "id";
-    private static final Set<String> validKeywords;
+    public static final Set<String> validKeywords;
     private static final Set<String> obsoleteKeywords;
 
     private static final Set<String> UNSUPPORTED_DSE_COMPACTION_STRATEGIES = ImmutableSet.of(
diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index fbaf3fd4d65a..2f02665c3058 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
@@ -17,14 +17,18 @@
  */
 package org.apache.cassandra.db;
 
+import java.util.Locale;
 
 import com.carrotsearch.hppc.ObjectIntHashMap;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.locator.Endpoints;
+import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.locator.NetworkTopologyStrategy;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolException;
 
 import static org.apache.cassandra.locator.Replicas.addToCountPerDc;
@@ -74,6 +78,11 @@ private ConsistencyLevel(int code, boolean isDCLocal)
         this.isDCLocal = isDCLocal;
     }
 
+    public static ConsistencyLevel fromString(String str)
+    {
+        return valueOf(str.toUpperCase(Locale.US));
+    }
+
     public static ConsistencyLevel fromCode(int code)
     {
         if (code < 0 || code >= codeIdx.length)
@@ -207,8 +216,11 @@ public void validateForRead() throws InvalidRequestException
         }
     }
 
-    public void validateForWrite() throws InvalidRequestException
+    public void validateForWrite(String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
+        if (SchemaConstants.isUserKeyspace(keyspaceName))
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+
         switch (this)
         {
             case SERIAL:
@@ -218,8 +230,11 @@ public void validateForWrite() throws InvalidRequestException
     }
 
     // This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
-    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy) throws InvalidRequestException
+    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName) throws InvalidRequestException
     {
+        if (SchemaConstants.isUserKeyspace(keyspaceName))
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this);
+
         switch (this)
         {
             case EACH_QUORUM:
@@ -231,8 +246,11 @@ public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy
         }
     }
 
-    public void validateForCas() throws InvalidRequestException
+    public void validateForCas(String keyspaceName) throws InvalidRequestException
     {
+        if (SchemaConstants.isUserKeyspace(keyspaceName))
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this);
+
         if (!isSerialConsistency())
             throw new InvalidRequestException("Invalid consistency for conditional update. Must be one of SERIAL or LOCAL_SERIAL");
     }
@@ -242,8 +260,11 @@ public boolean isSerialConsistency()
         return this == SERIAL || this == LOCAL_SERIAL;
     }
 
-    public void validateCounterForWrite(TableMetadata metadata) throws InvalidRequestException
+    public void validateCounterForWrite(TableMetadata metadata, QueryState queryState) throws InvalidRequestException
     {
+        if (SchemaConstants.isUserKeyspace(metadata.keyspace))
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+
         if (this == ConsistencyLevel.ANY)
             throw new InvalidRequestException("Consistency level ANY is not yet supported for counter table " + metadata.name);
 
diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java
index cf4238c67de5..83c7cf92db66 100644
--- a/src/java/org/apache/cassandra/db/Directories.java
+++ b/src/java/org/apache/cassandra/db/Directories.java
@@ -621,6 +621,16 @@ public long getAvailableSpace()
             return availableSpace > 0 ? availableSpace : 0;
         }
 
+        public long getTotalSpace()
+        {
+            return FileUtils.getTotalSpace(location);
+        }
+
+        public long getSpaceUsed()
+        {
+            return getTotalSpace() - getAvailableSpace();
+        }
+
         @Override
         public boolean equals(Object o)
         {
diff --git a/src/java/org/apache/cassandra/db/MultiCBuilder.java b/src/java/org/apache/cassandra/db/MultiCBuilder.java
index 787755192678..b85b450c7a89 100644
--- a/src/java/org/apache/cassandra/db/MultiCBuilder.java
+++ b/src/java/org/apache/cassandra/db/MultiCBuilder.java
@@ -135,6 +135,14 @@ public int remainingCount()
         return comparator.size() - size;
     }
 
+    /**
+     * Returns the number of clusterings that {@link #build()} would return if it were called now.
+     *
+     * @return the current number of build results
+     */
+    public abstract int buildSize();
+
+
     /**
      * Checks if the clusterings contains null elements.
      *
@@ -265,6 +273,12 @@ public MultiCBuilder addAllElementsToAll(List<List<ByteBuffer>> values)
             return addEachElementToAll(values.get(0));
         }
 
+        @Override
+        public int buildSize()
+        {
+            return hasMissingElements ? 0 : 1;
+        }
+
         public NavigableSet<Clustering<?>> build()
         {
             built = true;
@@ -427,6 +441,11 @@ public MultiCBuilder addAllElementsToAll(List<List<ByteBuffer>> values)
             return this;
         }
 
+        public int buildSize()
+        {
+            return hasMissingElements ? 0 : elementsList.size();
+        }
+
         public NavigableSet<Clustering<?>> build()
         {
             built = true;
diff --git a/src/java/org/apache/cassandra/db/view/View.java b/src/java/org/apache/cassandra/db/view/View.java
index d813d0e66885..cc7941a18099 100644
--- a/src/java/org/apache/cassandra/db/view/View.java
+++ b/src/java/org/apache/cassandra/db/view/View.java
@@ -35,6 +35,7 @@
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.schema.ViewMetadata;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.FBUtilities;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -201,7 +202,7 @@ private List<RawSelector> selectClause()
     ReadQuery getReadQuery()
     {
         if (query == null)
-            query = getSelectStatement().getQuery(QueryOptions.forInternalCalls(Collections.emptyList()), FBUtilities.nowInSeconds());
+            query = getSelectStatement().getQuery(QueryState.forInternalCalls(), QueryOptions.forInternalCalls(Collections.emptyList()), FBUtilities.nowInSeconds());
 
         return query;
     }
diff --git a/src/java/org/apache/cassandra/gms/ApplicationState.java b/src/java/org/apache/cassandra/gms/ApplicationState.java
index f5a0670e8dab..33a5632b0330 100644
--- a/src/java/org/apache/cassandra/gms/ApplicationState.java
+++ b/src/java/org/apache/cassandra/gms/ApplicationState.java
@@ -47,6 +47,7 @@ public enum ApplicationState
     INTERNAL_ADDRESS_AND_PORT, //Replacement for INTERNAL_IP with up to two ports
     NATIVE_ADDRESS_AND_PORT, //Replacement for RPC_ADDRESS
     STATUS_WITH_PORT, //Replacement for STATUS
+    DISK_USAGE,
     /**
      * The set of sstable versions on this node. This will usually be only the "current" sstable format (the one with
      * which new sstables are written), but may contain more on newly upgraded nodes before `upgradesstable` has been
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 938ba5348c3f..532ced35f403 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -172,6 +172,11 @@ public VersionedValue load(double load)
             return new VersionedValue(String.valueOf(load));
         }
 
+        public VersionedValue diskUsage(String state)
+        {
+            return new VersionedValue(state);
+        }
+
         public VersionedValue schema(UUID newVersion)
         {
             return new VersionedValue(newVersion.toString());
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrail.java b/src/java/org/apache/cassandra/guardrails/Guardrail.java
new file mode 100644
index 000000000000..55e97e7d092d
--- /dev/null
+++ b/src/java/org/apache/cassandra/guardrails/Guardrail.java
@@ -0,0 +1,780 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+import java.util.function.BooleanSupplier;
+import java.util.function.Function;
+import java.util.function.LongSupplier;
+import java.util.function.Predicate;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Sets;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.utils.NoSpamLogger;
+import org.apache.cassandra.utils.units.SizeUnit;
+import org.apache.cassandra.utils.units.Units;
+
+import static java.lang.String.format;
+
+/**
+ * General class defining a given guardrail (that guards against some particular usage/condition).
+ *
+ * <p>Some guardrails only emit warnings when triggered, while others fail the query that triggers them. Some may do
+ * one or the other based on specific thresholds.
+ *
+ * <p>Note that all the defined classes support live updates, which is why each guardrail class ctor takes suppliers of
+ * the condition the guardrail acts on rather than the condition itself. Which does imply that said suppliers should
+ * be fast and non-blocking to avoid surprises. Note that this does not mean live updates are exposed to the user,
+ * just that the implementation is up to it if we ever want to expose it.
+ */
+public abstract class Guardrail
+{
+    private static final NoSpamLogger logger = NoSpamLogger.getLogger(LoggerFactory.getLogger(Guardrail.class),
+                                                                      10, TimeUnit.MINUTES);
+
+    private static final String REDACTED = "<redacted>";
+
+    public final String name;
+
+    /**
+     * whether to throw {@link InvalidRequestException} on {@link #fail(String)}
+     */
+    private boolean throwOnFailure = true;
+
+    /**
+     * minimum logging and triggering interval to avoid spamming downstream
+     */
+    private long minNotifyIntervalInMs = 0;
+
+    /**
+     * time of last warning in milliseconds
+     */
+    private volatile long lastWarnInMs = 0;
+
+    /**
+     * time of last failure in milliseconds
+     */
+    private volatile long lastFailInMs = 0;
+
+    protected Guardrail(String name)
+    {
+        this.name = name;
+    }
+
+    protected void warn(String message)
+    {
+        warn(message, message);
+    }
+
+    protected void warn(String fullMessage, String redactedMessage)
+    {
+        if (!skipNotifyingOnWarning())
+        {
+            logger.warn(fullMessage);
+            // ClientWarn simply ignores the message when this is not running as part of a user query
+            // (its internal "state" is null in that case)
+            ClientWarn.instance.warn(fullMessage);
+            for (Guardrails.Listener guardrailListener : Guardrails.listeners)
+                guardrailListener.onWarningTriggered(name, redactedMessage);
+        }
+    }
+
+    protected void fail(String message)
+    {
+        fail(message, message);
+    }
+
+    protected void fail(String fullMessage, String redactedMessage)
+    {
+        if (!skipNotifyingOnFailure())
+        {
+            logger.error(fullMessage);
+            for (Guardrails.Listener listener : Guardrails.listeners)
+                listener.onFailureTriggered(name, redactedMessage);
+        }
+        // throttling only mutes the log and the listeners: the failure is still thrown unless throwing is disabled
+        if (throwOnFailure)
+            throw new InvalidRequestException(fullMessage);
+    }
+
+    /**
+     * Do not throw {@link InvalidRequestException} if guardrail failure is triggered.
+     * <p>
+     * Note: this method is not thread safe and should only be used during guardrail initialization
+     *
+     * @return current guardrail
+     */
+    Guardrail noExceptionOnFailure()
+    {
+        this.throwOnFailure = false;
+        return this;
+    }
+
+    /**
+     * Sets the minimum notify interval. Not thread safe: should only be used during guardrail initialization.
+     *
+     * @param minNotifyIntervalInMs frequency of logging and triggering listener to avoid spamming,
+     *                              default 0 means always log and trigger listeners.
+     * @return current guardrail
+     */
+    Guardrail minNotifyIntervalInMs(long minNotifyIntervalInMs)
+    {
+        assert minNotifyIntervalInMs >= 0;
+        this.minNotifyIntervalInMs = minNotifyIntervalInMs;
+        return this;
+    }
+
+    /**
+     * Resets the last notify times to make sure downstream is notified the next time {@link #warn(String, String)}
+     * or {@link #fail(String)} is called.
+     */
+    @VisibleForTesting
+    public void resetLastNotifyTime()
+    {
+        lastFailInMs = 0;
+        lastWarnInMs = 0;
+    }
+
+    /**
+     * @return {@code true} if this failure is throttled, i.e. must neither be logged nor passed to the listeners.
+     */
+    private boolean skipNotifyingOnFailure()
+    {
+        // a zero interval disables throttling altogether: every failure is notified
+        if (minNotifyIntervalInMs == 0)
+            return false;
+
+        long currentTimeMs = System.currentTimeMillis();
+
+        // the last notified failure is still too recent
+        if (currentTimeMs - lastFailInMs < minNotifyIntervalInMs)
+            return true;
+
+        // this failure gets notified: remember when, to throttle the following ones
+        lastFailInMs = currentTimeMs;
+
+        return false;
+    }
+
+    /**
+     * @return {@code true} if this warning is throttled, i.e. must neither be logged nor passed to the listeners.
+     */
+    private boolean skipNotifyingOnWarning()
+    {
+        // a zero interval disables throttling altogether: every warning is notified
+        if (minNotifyIntervalInMs == 0)
+            return false;
+
+        long currentTimeMs = System.currentTimeMillis();
+
+        // the last notified warning is still too recent
+        if (currentTimeMs - lastWarnInMs < minNotifyIntervalInMs)
+            return true;
+
+        // this warning gets notified: remember when, to throttle the following ones
+        lastWarnInMs = currentTimeMs;
+
+        return false;
+    }
+
+    /**
+     * Checks whether this guardrail is enabled or not. It is enabled when guardrails are globally enabled and ready
+     * (see {@link Guardrails#ready()}), and the authenticated user (if specified) is neither system nor superuser.
+     *
+     * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+     *                   A {@code null} value means that the check should be done regardless of the query.
+     * @return {@code true} if this guardrail is enabled and ready, {@code false} otherwise.
+     */
+    public boolean enabled(@Nullable QueryState queryState)
+    {
+        return Guardrails.enabled() && Guardrails.ready() && (null == queryState || queryState.isOrdinaryUser());
+    }
+
+    /**
+     * A guardrail based on numeric threshold(s).
+     *
+     * <p>A {@link Threshold} guardrail defines (up to) 2 thresholds, one at which a warning is issued, and a higher one
+     * at which a failure is triggered. Only one of those thresholds can be activated if desired.
+     *
+     * <p>This guardrail only handles guarding positive values.
+     */
+    public static class Threshold extends Guardrail
+    {
+        /**
+         * A function used to build the error message of a triggered {@link Threshold} guardrail.
+         */
+        public interface ErrorMessageProvider
+        {
+            /**
+             * Called when the guardrail is triggered to build the corresponding error message.
+             *
+             * @param isWarning       whether the trigger is a warning one; otherwise it is failure one.
+             * @param what            a string, provided by the call to the {@link #guard} method, describing what the guardrail
+             *                        has been applied to (and that has triggered it).
+             * @param valueString     the value that triggered the guardrail (as a string).
+             * @param thresholdString the threshold that was passed to trigger the guardrail (as a string).
+             */
+            public String createMessage(boolean isWarning, String what, String valueString, String thresholdString);
+        }
+
+        final LongSupplier warnThreshold;
+        final LongSupplier failThreshold;
+        final ErrorMessageProvider errorMessageProvider;
+
+        /**
+         * Creates a new {@link Threshold} guardrail.
+         *
+         * @param name                 the name of the guardrail (for identification in {@link Guardrails.Listener} events).
+         * @param warnThreshold        a supplier of the threshold above which a warning should be triggered. This cannot be
+         *                             null, but {@code () -> -1L} can be provided if no warning threshold is desired.
+         * @param failThreshold        a supplier of the threshold above which a failure should be triggered. This cannot be
+         *                             null, but {@code () -> -1L} can be provided if no failure threshold is desired.
+         * @param errorMessageProvider a function to generate the error message if the guardrail is triggered
+         *                             (being it for a warning or a failure).
+         */
+        Threshold(String name,
+                  LongSupplier warnThreshold,
+                  LongSupplier failThreshold,
+                  ErrorMessageProvider errorMessageProvider)
+        {
+            super(name);
+            this.warnThreshold = warnThreshold;
+            this.failThreshold = failThreshold;
+            this.errorMessageProvider = errorMessageProvider;
+        }
+
+        protected String errMsg(boolean isWarning, String what, long value, long thresholdValue)
+        {
+            return errorMessageProvider.createMessage(isWarning,
+                                                      what,
+                                                      Long.toString(value),
+                                                      Long.toString(thresholdValue));
+        }
+
+        protected String redactedErrMsg(boolean isWarning, long value, long thresholdValue)
+        {
+            return errorMessageProvider.createMessage(isWarning,
+                                                      REDACTED,
+                                                      Long.toString(value),
+                                                      Long.toString(thresholdValue));
+        }
+
+        private long failValue()
+        {
+            long failValue = failThreshold.getAsLong();
+            return failValue < 0 ? Long.MAX_VALUE : failValue;
+        }
+
+        private long warnValue()
+        {
+            long warnValue = warnThreshold.getAsLong();
+            return warnValue < 0 ? Long.MAX_VALUE : warnValue;
+        }
+
+        /**
+         * Checks whether this guardrail is enabled or not. This will be enabled if guardrails are globally enabled
+         * ({@link Guardrails#enabled()}), and if any of the thresholds is positive.
+         *
+         * @return {@code true} if this guardrail is enabled, {@code false} otherwise.
+         */
+        public boolean enabled()
+        {
+            return super.enabled(null) && (failThreshold.getAsLong() >= 0 || warnThreshold.getAsLong() >= 0);
+        }
+
+        /**
+         * Checks whether this guardrail is enabled or not. This will be enabled if guardrails are
+         * ({@link Guardrails#ready()} ()}), the keyspace (if specified) is not an internal one, and if any of the
+         * thresholds is positive.
+         *
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         * @return {@code true} if this guardrail is enabled, {@code false} otherwise.
+         */
+        public boolean enabled(@Nullable QueryState queryState)
+        {
+            return super.enabled(queryState) && (failThreshold.getAsLong() >= 0 || warnThreshold.getAsLong() >= 0);
+        }
+
+        /**
+         * Checks whether the provided value would trigger a warning or failure if passed to {@link #guard}.
+         *
+         * <p>This method is optional (does not have to be called) but can be used in the case where the "what"
+         * argument to {@link #guard} is expensive to build to save doing so in the common case (of the guardrail
+         * not being triggered).
+         *
+         * @param value the value to test.
+         * @return {@code true} if {@code value} is above the warning or failure thresholds of this guardrail, {@code false} otherwise.
+         */
+        public boolean triggersOn(long value)
+        {
+            return enabled(null) && (value > Math.min(failValue(), warnValue()));
+        }
+
+        /**
+         * Checks whether the provided value would trigger a warning or failure if passed to {@link #guard}.
+         *
+         * <p>This method is optional (does not have to be called) but can be used in the case where the "what"
+         * argument to {@link #guard} is expensive to build to save doing so in the common case (of the guardrail
+         * not being triggered).
+         *
+         * @param value      the value to test.
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         * @return {@code true} if {@code value} is above the warning or failure thresholds of this guardrail, {@code false} otherwise.
+         */
+        public boolean triggersOn(long value, @Nullable QueryState queryState)
+        {
+            return enabled(queryState) && (value > Math.min(failValue(), warnValue()));
+        }
+
+        /**
+         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
+         *
+         * @param value the value to check.
+         * @param what  a string describing what {@code value} is a value of used in the error message if the
+         *              guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
+         *              argument must describe which column of which row is triggering the guardrail for convenience). Note that
+         *              this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
+         *              this method behind a {@link #triggersOn} call.
+         */
+        public void guard(long value, String what)
+        {
+            guard(value, what, false);
+        }
+
+        /**
+         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
+         *
+         * @param value            the value to check.
+         * @param what             a string describing what {@code value} is a value of used in the error message if the
+         *                         guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
+         *                         argument must describe which column of which row is triggering the guardrail for convenience). Note that
+         *                         this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
+         *                         this method behind a {@link #triggersOn} call.
+         * @param containsUserData a boolean describing if {@code what} contains user data. If this is the case,
+         *                         {@code what} will only be included in the log messages and client warning. It will not be included in the
+         *                         error messages that are passed to listeners and exceptions. We have to exclude the user data from exceptions
+         *                         because they will be sent as Diagnostic Events in the future.
+         */
+        public void guard(long value, String what, boolean containsUserData)
+        {
+            guard(value, what, containsUserData, null);
+        }
+
+        /**
+         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
+         *
+         * @param value            the value to check.
+         * @param what             a string describing what {@code value} is a value of used in the error message if the
+         *                         guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
+         *                         argument must describe which column of which row is triggering the guardrail for convenience). Note that
+         *                         this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
+         *                         this method behind a {@link #triggersOn} call.
+         * @param queryState       the queryState, used to skip the check if the query is internal or is done by a superuser.
+         */
+        public void guard(long value, String what, @Nullable QueryState queryState)
+        {
+            guard(value, what, false, queryState);
+        }
+
+        /**
+         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
+         *
+         * @param value            the value to check.
+         * @param what             a string describing what {@code value} is a value of used in the error message if the
+         *                         guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
+         *                         argument must describe which column of which row is triggering the guardrail for convenience). Note that
+         *                         this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
+         *                         this method behind a {@link #triggersOn} call.
+         * @param containsUserData a boolean describing if {@code what} contains user data. If this is the case,
+         *                         {@code what} will only be included in the log messages and client warning. It will not be included in the
+         *                         error messages that are passed to listeners and exceptions.
+         * @param queryState       the queryState, used to skip the check if the query is internal or is done by a superuser.
+         */
+        public void guard(long value, String what, boolean containsUserData, @Nullable QueryState queryState)
+        {
+            if (!enabled(queryState))
+                return;
+
+            long failValue = failValue();
+            if (value > failValue)
+            {
+                String fullMsg = errMsg(false, what, value, failValue);
+                fail(fullMsg, containsUserData ? redactedErrMsg(false, value, failValue) : fullMsg);
+            }
+            else
+            {
+                long warnValue = warnValue();
+                if (value > warnValue)
+                {
+                    String fullMsg = errMsg(true, what, value, warnValue);
+                    warn(fullMsg, containsUserData ? redactedErrMsg(true, value, warnValue) : fullMsg);
+                }
+            }
+        }
+    }
+
+    /**
+     * A {@link Threshold} guardrail for values that are sizes expressed in bytes.
+     *
+     * <p>It behaves exactly like a {@link Threshold}; only the messages differ, as the value and the threshold are
+     * rendered with {@code Units.toString(..., SizeUnit.BYTES)} rather than as plain numbers.
+     */
+    public static class SizeThreshold extends Threshold
+    {
+        SizeThreshold(String name,
+                      LongSupplier warnThreshold,
+                      LongSupplier failThreshold,
+                      ErrorMessageProvider errorMessageProvider)
+        {
+            super(name, warnThreshold, failThreshold, errorMessageProvider);
+        }
+
+        /** Builds the message including {@code what}, with both sizes formatted as byte quantities. */
+        @Override
+        protected String errMsg(boolean isWarning, String what, long value, long thresholdValue)
+        {
+            String formattedValue = Units.toString(value, SizeUnit.BYTES);
+            String formattedThreshold = Units.toString(thresholdValue, SizeUnit.BYTES);
+            return errorMessageProvider.createMessage(isWarning, what, formattedValue, formattedThreshold);
+        }
+
+        /** Builds the same message as {@link #errMsg}, but with {@code REDACTED} in place of {@code what}. */
+        @Override
+        protected String redactedErrMsg(boolean isWarning, long value, long thresholdValue)
+        {
+            String formattedValue = Units.toString(value, SizeUnit.BYTES);
+            String formattedThreshold = Units.toString(thresholdValue, SizeUnit.BYTES);
+            return errorMessageProvider.createMessage(isWarning, REDACTED, formattedValue, formattedThreshold);
+        }
+    }
+
+    /**
+     * A {@link Threshold} guardrail whose values represent a percentage: it works exactly as a {@link Threshold}, but
+     * renders the value and the threshold with a trailing {@code %} in both its full and its redacted messages.
+     */
+    public static class PercentageThreshold extends Threshold
+    {
+        PercentageThreshold(String name, LongSupplier warnThreshold, LongSupplier failThreshold, ErrorMessageProvider errorMessageProvider)
+        {
+            super(name, warnThreshold, failThreshold, errorMessageProvider);
+        }
+
+        @Override
+        protected String errMsg(boolean isWarning, String what, long value, long thresholdValue)
+        {
+            return errorMessageProvider.createMessage(isWarning, what,
+                                                      String.format("%d%%", value), String.format("%d%%", thresholdValue));
+        }
+
+        @Override
+        protected String redactedErrMsg(boolean isWarning, long value, long thresholdValue)
+        {
+            return errMsg(isWarning, REDACTED, value, thresholdValue); // reuse the percentage formatting of errMsg
+        }
+    }
+
+    /**
+     * A guardrail that completely disables the use of a particular feature.
+     *
+     * <p>Note that this guardrail only triggers failures (if the feature is disabled) so is only meant for
+     * query-based guardrails (we're happy to reject queries deemed dangerous, but we don't want to create a guardrail
+     * that breaks compaction for instance).
+     */
+    public static class DisableFlag extends Guardrail
+    {
+        private final BooleanSupplier disabled;
+        private final String what;
+
+        /**
+         * Creates a new {@link DisableFlag} guardrail.
+         *
+         * @param name     the name of the guardrail (for identification in {@link Guardrails.Listener} events).
+         * @param disabled a supplier of boolean indicating whether the feature guarded by this guardrail must be
+         *                 disabled.
+         * @param what     the feature that is guarded by this guardrail (for reporting in error messages),
+         *                 {@link #ensureEnabled(String, QueryState)} can specify a different {@code what}.
+         */
+        DisableFlag(String name, BooleanSupplier disabled, String what)
+        {
+            super(name);
+            this.disabled = disabled;
+            this.what = what;
+        }
+
+        /**
+         * Triggers a failure if this guardrail is disabled, regardless of the query being executed.
+         *
+         * <p>This must be called when the feature guarded by this guardrail is used to ensure such use is in fact
+         * allowed. A {@code null} query state is passed on: an internal one would skip the check entirely.
+         */
+        public void ensureEnabled()
+        {
+            ensureEnabled(what, null);
+        }
+
+        /**
+         * Triggers a failure if this guardrail is disabled.
+         *
+         * <p>This must be called when the feature guarded by this guardrail is used to ensure such use is in fact
+         * allowed.
+         *
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void ensureEnabled(@Nullable QueryState queryState)
+        {
+            ensureEnabled(what, queryState);
+        }
+
+        /**
+         * Triggers a failure if this guardrail is disabled.
+         *
+         * <p>This must be called when the feature guarded by this guardrail is used to ensure such use is in fact
+         * allowed.
+         *
+         * @param what       the feature that is guarded by this guardrail (for reporting in error messages).
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void ensureEnabled(String what, @Nullable QueryState queryState)
+        {
+            if (enabled(queryState) && disabled.getAsBoolean())
+                fail(what + " is not allowed");
+        }
+    }
+
+    /**
+     * A guardrail that rejects the use of specific values.
+     *
+     * <p>Note that like {@link DisableFlag}, this guardrail only triggers failures and is thus only for query-based
+     * guardrails.
+     *
+     * @param <T> the type of the values of which certain are disallowed.
+     */
+    public static class DisallowedValues<T> extends Guardrail
+    {
+        /*
+         * Implementation note: as mentioned in the class Javadoc and for consistency with the other Guardrail
+         * implementation of this class (and to generally avoid surprises), this implementation ensures that live
+         * changes to the underlying guardrail setting gets reflected. This is the reason for the relative
+         * "complexity" of this class.
+         */
+
+        private final Supplier<Set<String>> rawSupplier;
+        private final Function<String, T> parser;
+        private final String what;
+
+        private volatile Set<T> cachedDisallowed;
+        private volatile Set<String> cachedRaw;
+
+        /**
+         * Creates a new {@link DisallowedValues} guardrail.
+         *
+         * @param name          the name of the guardrail (for identification in {@link Guardrails.Listener} events).
+         * @param disallowedRaw a supplier of the values that are disallowed in raw (string) form. The set returned by
+         *                      this supplier <b>must</b> be immutable (we don't use {@code ImmutableSet} because we
+         *                      want to feed values from {@link GuardrailsConfig} directly and having ImmutableSet
+         *                      there would currently be annoying (because populated automatically by snakeYaml)).
+         * @param parser        a function to parse the value to disallow from string.
+         * @param what          what represents the value disallowed (for reporting in error messages).
+         */
+        DisallowedValues(String name, Supplier<Set<String>> disallowedRaw, Function<String, T> parser, String what)
+        {
+            super(name);
+            this.rawSupplier = disallowedRaw;
+            this.parser = parser;
+            this.what = what;
+
+            if (Guardrails.ready())
+                ensureUpToDate();
+        }
+
+        private void ensureUpToDate()
+        {
+            Set<String> current = rawSupplier.get();
+            // Same as below, this shouldn't happen if settings have been properly sanitized, but throw a meaningful
+            // error if there is a bug.
+            if (current == null)
+                throw new RuntimeException(format("Invalid null setting for guardrail on %s. This is a bug and should not have happened.", what));
+
+            // Note that this will fail on first call (as we want), as currentRaw will be null but not current
+            if (current == cachedRaw)
+                return;
+
+            try
+            {
+                // Setting cachedAllowed first so that on a parse failure we leave everything as it previously
+                // was (not that we'd expect that matter but ...).
+                cachedDisallowed = current.stream()
+                                          .map(parser)
+                                          .collect(Collectors.toCollection(HashSet::new));
+                cachedRaw = current;
+            }
+            catch (Exception e)
+            {
+                // This catches parsing errors. Hopefully, this shouldn't happen as guardrails settings should have
+                // been sanitized, but ...
+                // Also, we catch the exception to add a meaningful error message, but rethrow otherwise: if a
+                // guardrail has been configured, it's presumably to avoid bad things to go in, so we don't want to
+                // take the risk of letting it go if there is a misconfiguration.
+                throw new RuntimeException(format("Error parsing configured setting for guardrail on %s. This "
+                                                  + "is a bug and should not have happened."
+                                                  + "The failing setting is %s", what, current), e);
+            }
+        }
+
+        /**
+         * Triggers a failure if the provided value is disallowed by this guardrail.
+         *
+         * @param value the value to check.
+         */
+        public void ensureAllowed(T value)
+        {
+            ensureAllowed(value, null);
+        }
+
+        /**
+         * Triggers a failure if any of the provided values is disallowed by this guardrail.
+         *
+         * @param values the values to check.
+         */
+        public void ensureAllowed(Set<T> values)
+        {
+            ensureAllowed(values, null);
+        }
+
+        /**
+         * Triggers a failure if the provided value is disallowed by this guardrail.
+         *
+         * @param value      the value to check.
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void ensureAllowed(T value, @Nullable QueryState queryState)
+        {
+            if (!enabled(queryState))
+                return;
+
+            ensureUpToDate();
+            if (cachedDisallowed.contains(value))
+                fail(format("Provided value %s is not allowed for %s (disallowed values are: %s)",
+                            value, what, cachedRaw));
+        }
+
+        /**
+         * Triggers a failure if any of the provided values is disallowed by this guardrail.
+         *
+         * @param values     the values to check.
+         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void ensureAllowed(Set<T> values, @Nullable QueryState queryState)
+        {
+            if (!enabled(queryState))
+                return;
+
+            ensureUpToDate();
+
+            Set<T> intersection = Sets.intersection(values, cachedDisallowed);
+            if (!intersection.isEmpty())
+                fail(format("Provided values %s are not allowed for %s (disallowed values are: %s)",
+                            intersection.stream().sorted().collect(Collectors.toList()), what, cachedRaw));
+        }
+    }
+
+    /**
+     * A guardrail driven by a pair of predicates.
+     *
+     * <p>One predicate decides whether a value triggers a failure and the other whether it triggers a warning. The
+     * failure predicate is consulted first; when it matches, the warning predicate is not evaluated.
+     *
+     * @param <T> the type of the values tested against the predicates.
+     */
+    public static class Predicates<T> extends Guardrail
+    {
+        /**
+         * Builds the warning or error message of a triggered {@link Predicates} guardrail.
+         */
+        public interface MessageProvider<T>
+        {
+            /**
+             * Invoked once the guardrail has been triggered, to produce the message to report.
+             *
+             * @param isWarning {@code true} when a warning was triggered, {@code false} when a failure was.
+             * @param value     the value that triggered the guardrail.
+             */
+            String createMessage(boolean isWarning, T value);
+        }
+
+        private final Predicate<T> warnPredicate;
+        private final Predicate<T> failurePredicate;
+        private final MessageProvider<T> messageProvider;
+
+        /**
+         * Creates a new {@link Predicates} guardrail.
+         *
+         * @param name             the guardrail name (identifies it in {@link Guardrails.Listener} events).
+         * @param warnPredicate    tested against a value to decide whether it triggers a warning.
+         * @param failurePredicate tested against a value to decide whether it triggers a failure.
+         * @param messageProvider  produces the warning or error message once the guardrail is triggered.
+         */
+        Predicates(String name, Predicate<T> warnPredicate, Predicate<T> failurePredicate, MessageProvider<T> messageProvider)
+        {
+            super(name);
+            this.messageProvider = messageProvider;
+            this.failurePredicate = failurePredicate;
+            this.warnPredicate = warnPredicate;
+        }
+
+        /**
+         * Checks the provided value against the predicates, triggering a failure or a warning if appropriate.
+         *
+         * @param value      the value to check.
+         * @param queryState the query state, used to skip the check if the query is internal or is done by a superuser.
+         *                   A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void guard(T value, @Nullable QueryState queryState)
+        {
+            if (!enabled(queryState))
+                return;
+
+            if (failurePredicate.test(value))
+            {
+                fail(messageProvider.createMessage(false, value));
+                return;
+            }
+
+            if (warnPredicate.test(value))
+                warn(messageProvider.createMessage(true, value));
+        }
+    }
+}
+
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
new file mode 100644
index 000000000000..672fc30aa893
--- /dev/null
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.List;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.guardrails.Guardrail.DisableFlag;
+import org.apache.cassandra.guardrails.Guardrail.DisallowedValues;
+import org.apache.cassandra.guardrails.Guardrail.PercentageThreshold;
+import org.apache.cassandra.guardrails.Guardrail.Predicates;
+import org.apache.cassandra.guardrails.Guardrail.SizeThreshold;
+import org.apache.cassandra.guardrails.Guardrail.Threshold;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster;
+
+import static java.lang.String.format;
+
+/**
+ * Entry point for Guardrails, storing the defined guardrails and providing a few global methods over them.
+ */
+public abstract class Guardrails
+{
+    private static final GuardrailsConfig config = DatabaseDescriptor.getGuardrailsConfig(); // fetched once, at class initialization
+
+    public static final Threshold columnValueSize = new SizeThreshold("column_value_size",
+                                                                      () -> -1L, // not needed so far
+                                                                      () -> config.column_value_size_failure_threshold_in_kb * 1024L, // NOTE(review): a disabled (-1) setting becomes -1024 here; confirm any negative value disables
+                                                                      (x, what, v, t) -> format("Value of %s of size %s is greater than the maximum allowed (%s)",
+                                                                                                what, v, t));
+
+    public static final Threshold columnsPerTable = new Threshold("columns_per_table",
+                                                                  () -> -1L, // not needed so far
+                                                                  () -> config.columns_per_table_failure_threshold,
+                                                                  (x, what, v, t) -> format("Tables cannot have more than %s columns, but %s provided for table %s",
+                                                                                            t, v, what));
+
+    public static final DisableFlag userTimestampsEnabled = new DisableFlag("user_provided_timestamps",
+                                                                            () -> !config.user_timestamps_enabled,
+                                                                            "User provided timestamps (USING TIMESTAMP)");
+
+    public static final DisallowedValues<ConsistencyLevel> disallowedWriteConsistencies = new DisallowedValues<>("disallowed_write_consistency_levels",
+                                                                                                                 () -> config.write_consistency_levels_disallowed,
+                                                                                                                 ConsistencyLevel::fromString,
+                                                                                                                 "Consistency Level");
+
+    public static final Threshold secondaryIndexesPerTable = new Threshold("secondary_indexes_per_table",
+                                                                           () -> -1, // -1: no warn threshold, as for the other failure-only guardrails
+                                                                           () -> config.secondary_index_per_table_failure_threshold,
+                                                                           (x, what, v, t) -> format("Tables cannot have more than %s secondary indexes, failed to create secondary index %s",
+                                                                                                     t, what));
+
+    public static final Threshold materializedViewsPerTable = new Threshold("materialized_views_per_table",
+                                                                            () -> -1, // -1: no warn threshold, as for the other failure-only guardrails
+                                                                            () -> config.materialized_view_per_table_failure_threshold,
+                                                                            (x, what, v, t) -> format("Tables cannot have more than %s materialized views, failed to create materialized view %s",
+                                                                                                      t, what));
+
+    public static final Threshold tablesLimit = new Threshold("number_of_tables",
+                                                              () -> config.tables_warn_threshold,
+                                                              () -> config.tables_failure_threshold,
+                                                              (isWarning, what, v, t) -> isWarning
+                                                                                         ? format("Creating table %s, current number of tables %s exceeds warning threshold of %s.",
+                                                                                                  what, v, t)
+                                                                                         : format("Cannot have more than %s tables, failed to create table %s",
+                                                                                                  t, what));
+
+    public static final DisallowedValues<String> disallowedTableProperties = new DisallowedValues<>("disallowed_table_properties",
+                                                                                                    () -> config.table_properties_disallowed,
+                                                                                                    String::toLowerCase,
+                                                                                                    "Table Properties");
+
+    @SuppressWarnings("unchecked") // for the cast of the chained call's result back to Predicates<InetAddressAndPort>
+    public static final Predicates<InetAddressAndPort> replicaDiskUsage =
+    (Predicates<InetAddressAndPort>) new Predicates<>("replica_disk_usage",
+                                                      DiskUsageBroadcaster.instance::isStuffed,
+                                                      DiskUsageBroadcaster.instance::isFull,
+                                                      // not using `what` because it represents replica address which should be hidden from client.
+                                                      (isWarning, what) -> isWarning
+                                                                           ? "Replica disk usage exceeds warn threshold"
+                                                                           : "Write request failed because disk usage exceeds failure threshold")
+                                     .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
+
+    public static final PercentageThreshold localDiskUsage =
+    (PercentageThreshold) new PercentageThreshold("local_disk_usage",
+                                                  () -> config.disk_usage_percentage_warn_threshold,
+                                                  () -> config.disk_usage_percentage_failure_threshold,
+                                                  (isWarning, what, v, t) -> isWarning
+                                                                             ? format("Local disk usage %s(%s) exceeds warn threshold of %s", v, what, t)
+                                                                             : format("Local disk usage %s(%s) exceeds failure threshold of %s, will stop accepting writes", v, what, t))
+                          .noExceptionOnFailure()
+                          .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30)); // same 30 minute interval as replicaDiskUsage
+
+    public static final Threshold partitionSize =
+    new SizeThreshold("partition_size",
+                      () -> config.partition_size_warn_threshold_in_mb * 1024L * 1024L, // MB setting, scaled by 1024 * 1024
+                      () -> -1L, // -1: no failure threshold, this guardrail only warns
+                      (x, what, v, t) -> format("Detected partition %s of size %s is greater than the maximum recommended size (%s)",
+                                                what, v, t));
+
+    public static final Threshold partitionKeysInSelectQuery =
+    new Threshold("partition_keys_in_select_query",
+                  () -> -1L, // -1: no warn threshold
+                  () -> config.partition_keys_in_select_failure_threshold,
+                  (x, what, v, t) -> format("%s cannot be completed because it selects %s partitions keys - more than the maximum allowed %s", what, v, t));
+
+    public static final Threshold fieldsPerUDT =
+    new Threshold("fields_per_udt",
+                  () -> -1L, // not needed so far
+                  () -> config.fields_per_udt_failure_threshold,
+                  (x, what, v, t) -> format("User types cannot have more than %s columns, but %s provided for type %s",
+                                            t, v, what));
+
+    public static final Threshold collectionSize =
+    new SizeThreshold("collection_size",
+                      () -> config.collection_size_warn_threshold_in_kb * 1024L, // KB setting, scaled by 1024
+                      () -> -1L, // not needed so far
+                      (x, what, v, t) -> format("Detected collection %s of size %s, greater than the maximum recommended size (%s)",
+                                                what, v, t));
+
+    public static final Threshold itemsPerCollection =
+    new Threshold("items_per_collection",
+                  () -> config.items_per_collection_warn_threshold,
+                  () -> -1L, // not needed so far
+                  (x, what, v, t) -> format("Detected collection %s with %s items, greater than the maximum recommended (%s)",
+                                            what, v, t));
+
+    public static final Threshold inSelectCartesianProduct =
+    new Threshold("in_select_cartesian_product",
+                  () -> -1L, // -1: no warn threshold
+                  () -> config.in_select_cartesian_product_failure_threshold,
+                  (x, what, v, t) -> format("The query cannot be completed because cartesian product of all values in IN conditions is greater than %s", t));
+
+    public static final DisableFlag readBeforeWriteListOperationsEnabled =
+    new DisableFlag("read_before_write_list_operations",
+                    () -> !config.read_before_write_list_operations_enabled,
+                    "List operation requiring read before write");
+
+    static final List<Listener> listeners = new CopyOnWriteArrayList<>(); // modified through register() and unregister()
+
+    private Guardrails() // private: everything this class exposes is static
+    {
+    }
+
+    /**
+     * Whether guardrails are enabled globally or not.
+     *
+     * @return {@code true} if guardrails are enabled (each one then applies based on its individual setting),
+     * {@code false} otherwise, including when the setting is {@code null} (in which case no guardrail will trigger).
+     */
+    public static boolean enabled()
+    {
+        return Boolean.TRUE.equals(config.enabled); // 'enabled' is a boxed Boolean: never unbox a null value
+    }
+
+    /**
+     * Whether guardrails are ready, which is the case once the daemon is initialized.
+     *
+     * @return the value of {@link DatabaseDescriptor#isDaemonInitialized()}: {@code true} if the daemon is initialized,
+     * {@code false} otherwise (presumably no guardrail triggers in that case; to be confirmed against the callers).
+     */
+    public static boolean ready()
+    {
+        return DatabaseDescriptor.isDaemonInitialized();
+    }
+
+    /**
+     * Register a {@link Listener}.
+     *
+     * <p>Note that listeners are called in the order they are registered, and on the thread on which the guardrail
+     * is triggered.
+     *
+     * @param listener the listener to register. If the same listener is registered twice (or more), its methods will
+     *                 be called twice (or more) for every trigger.
+     */
+    public static void register(Listener listener)
+    {
+        listeners.add(listener); // appended to the list; no check for an existing registration
+    }
+
+    /**
+     * Unregister a previously registered listener.
+     *
+     * @param listener the listener to unregister. If it was not registered before, this is a no-op. If it was
+     *                 registered more than once, only one of its registrations is removed.
+     */
+    public static void unregister(Listener listener)
+    {
+        listeners.remove(listener); // List#remove(Object) removes the first matching element only
+    }
+
+    /**
+     * Callback contract for external code that wants to be told whenever a guardrail is triggered.
+     *
+     * <p>An implementation only takes effect once it has been passed to {@link #register}.
+     */
+    public interface Listener
+    {
+        /**
+         * Invoked when a guardrail raises a warning.
+         *
+         * @param guardrailName a name describing the guardrail that triggered.
+         * @param message       the warning message produced for this trigger.
+         */
+        void onWarningTriggered(String guardrailName, String message);
+
+        /**
+         * Invoked when a guardrail raises a failure.
+         *
+         * @param guardrailName a name describing the guardrail that triggered.
+         * @param message       the failure message produced for this trigger.
+         */
+        void onFailureTriggered(String guardrailName, String message);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
new file mode 100644
index 000000000000..ee4c0e6e1601
--- /dev/null
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -0,0 +1,267 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Sets;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.schema.TableAttributes;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+import static java.lang.String.format;
+
+/**
+ * Configuration settings for guardrails (populated from the Yaml file).
+ *
+ * <p>Note that the settings here must only be used by the {@link Guardrails} class and not directly by the code
+ * checking each guarded constraint (which, again, should use the higher level abstractions defined in
+ * {@link Guardrails}).
+ *
+ * <p>This contains a main setting, {@code enabled}, controlling if guardrails are globally active or not, and
+ * individual setting to control each guardrail. We have 2 variants of guardrails, soft (warn) and hard (fail) limits,
+ * each guardrail having either one of the variant or both (note in particular that hard limits only make sense for
+ * guardrails triggering during query execution. For other guardrails, say one triggering during compaction, failing
+ * does not make sense).
+ *
+ * <p>If {@code enabled == false}, no limits should be enforced, be it soft or hard. Additionally, each individual
+ * setting should have a specific value (typically -1 for numeric settings), that allows to disable the corresponding
+ * guardrail.
+ *
+ * <p>The default values for each guardrail settings should reflect what is mandated for C* aaS environment.
+ *
+ * <p>For consistency, guardrails based on a simple numeric threshold should use the naming scheme
+ * {@code <what_is_guarded>_warn_threshold} for soft limits and {@code <what_is_guarded>_failure_threshold} for hard
+ * ones, and if the value has a unit, that unit should be added at the end (for instance,
+ * {@code <what_is_guarded>_failure_threshold_in_kb}). For "boolean" guardrails that disable a feature, use
+ * {@code <what_is_guarded>_enabled}. Other types of guardrails can use appropriate suffixes but should start with
+ * {@code <what_is_guarded>}.
+ */
+public class GuardrailsConfig
+{
+    public static final Long NO_LIMIT = -1L; // -1 is accepted by validatePositiveNumeric as "guardrail disabled"
+
+    public Boolean enabled = false; // global switch, returned by Guardrails#enabled()
+
+    public Long column_value_size_failure_threshold_in_kb; // like all settings below: null until given a value
+    public Long columns_per_table_failure_threshold;
+
+    public Long tables_warn_threshold;
+    public Long tables_failure_threshold;
+    public Set<String> table_properties_disallowed; // checked against TableAttributes.validKeywords
+
+    public Boolean user_timestamps_enabled;
+
+    public Long secondary_index_per_table_failure_threshold;
+    public Long materialized_view_per_table_failure_threshold;
+
+    public Set<String> write_consistency_levels_disallowed; // each entry must parse with ConsistencyLevel.fromString
+
+    public Integer partition_size_warn_threshold_in_mb;
+    public Integer partition_keys_in_select_failure_threshold;
+
+    // Limit number of terms and their cartesian product in IN query
+    public Integer in_select_cartesian_product_failure_threshold;
+
+    public Long fields_per_udt_failure_threshold;
+    public Long collection_size_warn_threshold_in_kb;
+    public Long items_per_collection_warn_threshold;
+
+    public Integer disk_usage_percentage_warn_threshold; // percentage: validated to be at most 100
+    public Integer disk_usage_percentage_failure_threshold; // percentage: validated to be at most 100
+
+    public Boolean read_before_write_list_operations_enabled;
+
+    /**
+     * Validate that the value provided for each guardrail setting is valid.
+     *
+     * <p>Settings are unboxed while being checked, so unset ones must first be defaulted (see {@link #applyConfig()}).
+     *
+     * @throws ConfigurationException if any of the settings has an invalid value.
+     */
+    public void validate()
+    {
+        validateStrictlyPositiveInteger(column_value_size_failure_threshold_in_kb, "column_value_size_failure_threshold_in_kb");
+        validateStrictlyPositiveInteger(columns_per_table_failure_threshold, "columns_per_table_failure_threshold");
+
+        validateStrictlyPositiveInteger(tables_warn_threshold, "tables_warn_threshold");
+        validateStrictlyPositiveInteger(tables_failure_threshold, "tables_failure_threshold");
+        validateWarnLowerThanFail(tables_warn_threshold, tables_failure_threshold, "tables");
+        validateStrictlyPositiveInteger(partition_size_warn_threshold_in_mb, "partition_size_warn_threshold_in_mb");
+        validateStrictlyPositiveInteger(partition_keys_in_select_failure_threshold, "partition_keys_in_select_failure_threshold");
+
+        validateStrictlyPositiveInteger(fields_per_udt_failure_threshold, "fields_per_udt_failure_threshold");
+        validateStrictlyPositiveInteger(collection_size_warn_threshold_in_kb, "collection_size_warn_threshold_in_kb");
+        validateStrictlyPositiveInteger(items_per_collection_warn_threshold, "items_per_collection_warn_threshold");
+
+        validateStrictlyPositiveInteger(in_select_cartesian_product_failure_threshold, "in_select_cartesian_product_failure_threshold");
+
+        validateDisallowedTableProperties();
+        validateDiskUsageThreshold();
+
+        for (String rawCL : write_consistency_levels_disallowed)
+        {
+            try
+            {
+                ConsistencyLevel.fromString(rawCL);
+            }
+            catch (Exception e)
+            {
+                // Name the setting exactly as it is spelled in the configuration, and keep the cause for diagnosis.
+                throw new ConfigurationException(format("Invalid value for write_consistency_levels_disallowed guardrail: "
+                                                        + "'%s' does not parse as a Consistency Level", rawCL),
+                                                 e);
+            }
+        }
+    }
+
+    /**
+     * Fill in every guardrail setting that was not specified in yaml: cloud defaults are used if
+     * {@link DatabaseDescriptor#isApplyDbaasDefaults()} is true, on-prem defaults otherwise.
+     */
+    public void applyConfig()
+    {
+        enforceDefault(user_timestamps_enabled, value -> user_timestamps_enabled = value, true, true);
+
+        enforceDefault(column_value_size_failure_threshold_in_kb, value -> column_value_size_failure_threshold_in_kb = value, NO_LIMIT, 5 * 1024L);
+
+        enforceDefault(columns_per_table_failure_threshold, value -> columns_per_table_failure_threshold = value, NO_LIMIT, 20L);
+        enforceDefault(secondary_index_per_table_failure_threshold, value -> secondary_index_per_table_failure_threshold = value, NO_LIMIT, 1L);
+        enforceDefault(materialized_view_per_table_failure_threshold, value -> materialized_view_per_table_failure_threshold = value, NO_LIMIT, 2L);
+        enforceDefault(tables_warn_threshold, value -> tables_warn_threshold = value, NO_LIMIT, 100L);
+        enforceDefault(tables_failure_threshold, value -> tables_failure_threshold = value, NO_LIMIT, 200L);
+
+        // LinkedHashSets are only used to keep a stable ordering in error messages
+        Set<String> dbaasDisallowedLevels = new LinkedHashSet<>(Arrays.asList("ANY", "ONE", "LOCAL_ONE"));
+        enforceDefault(write_consistency_levels_disallowed,
+                       value -> write_consistency_levels_disallowed = value,
+                       Collections.<String>emptySet(),
+                       dbaasDisallowedLevels);
+
+        // On dbaas, every valid table property but default_time_to_live is disallowed by default
+        Set<String> dbaasDisallowedProperties = new LinkedHashSet<>(TableAttributes.validKeywords.stream().sorted().filter(p -> !p.equals("default_time_to_live")).collect(Collectors.toList()));
+        enforceDefault(table_properties_disallowed, value -> table_properties_disallowed = value, Collections.<String>emptySet(), dbaasDisallowedProperties);
+
+        enforceDefault(partition_size_warn_threshold_in_mb, value -> partition_size_warn_threshold_in_mb = value, 100, 100);
+        enforceDefault(partition_keys_in_select_failure_threshold, value -> partition_keys_in_select_failure_threshold = value, NO_LIMIT.intValue(), 20);
+
+        enforceDefault(fields_per_udt_failure_threshold, value -> fields_per_udt_failure_threshold = value, NO_LIMIT, 10L);
+        enforceDefault(collection_size_warn_threshold_in_kb, value -> collection_size_warn_threshold_in_kb = value, NO_LIMIT, 5 * 1024L);
+        enforceDefault(items_per_collection_warn_threshold, value -> items_per_collection_warn_threshold = value, NO_LIMIT, 20L);
+
+        // for node status
+        enforceDefault(disk_usage_percentage_warn_threshold, value -> disk_usage_percentage_warn_threshold = value, NO_LIMIT.intValue(), 70);
+        enforceDefault(disk_usage_percentage_failure_threshold, value -> disk_usage_percentage_failure_threshold = value, NO_LIMIT.intValue(), 80);
+
+        enforceDefault(in_select_cartesian_product_failure_threshold, value -> in_select_cartesian_product_failure_threshold = value, NO_LIMIT.intValue(), 25);
+        enforceDefault(read_before_write_list_operations_enabled, value -> read_before_write_list_operations_enabled = value, true, false);
+    }
+
+    private void validateDisallowedTableProperties()
+    {
+        // The configured names are lower-cased before being compared with the keywords TableAttributes accepts.
+        Set<String> lowerCased = table_properties_disallowed.stream().map(String::toLowerCase).collect(Collectors.toSet());
+        Set<String> unknown = Sets.difference(lowerCased, TableAttributes.validKeywords);
+        if (!unknown.isEmpty())
+            throw new ConfigurationException(format("Invalid value for table_properties_disallowed guardrail: "
+                                                    + "'%s' do not parse as valid table properties", unknown));
+    }
+
+    private void validateStrictlyPositiveInteger(long value, String name)
+    {
+        // We use 'long' for generality, but most numeric guardrails cannot effectively be more than an 'int' for various
+        // internal reasons, hence the Integer.MAX_VALUE upper bound. Not that any should ever come close in practice ...
+        // Zero is rejected: in most cases it does not make sense (allowing 0 tables or columns is not exactly useful).
+        validatePositiveNumeric(value, Integer.MAX_VALUE, false, name); // -1 is still accepted, as the "disabled" value
+    }
+
+    private void validatePositiveNumeric(long value, long maxValue, boolean allowZero, String name)
+    {
+        // Rejects values above maxValue, zero unless allowZero is set, and anything below -1 (checked in that order).
+        String error = null; // stays null when the value is acceptable
+        if (value > maxValue)
+            error = format("Invalid value %d for guardrail %s: maximum allowed value is %d", value, name, maxValue);
+        else if (value == 0 && !allowZero)
+            error = format("Invalid value for guardrail %s: 0 is not allowed", name);
+        else if (value < -1L) // -1 is the general "disabling" flag; anything lower is rejected to avoid mistakes
+            error = format("Invalid value %d for guardrail %s: negative values are not "
+                           + "allowed, outside of -1 which disables the guardrail", value, name);
+
+        if (error != null)
+            throw new ConfigurationException(error);
+    }
+
+    private void validateWarnLowerThanFail(long warnValue, long failValue, String guardName)
+    {
+        // A threshold set to -1 is disabled, in which case there is nothing to compare.
+        boolean bothEnabled = warnValue != -1 && failValue != -1;
+        // Equal thresholds are accepted; only a warn threshold above the failure one is rejected.
+        if (bothEnabled && warnValue > failValue)
+            throw new ConfigurationException(format("The warn threshold %d for the %s guardrail should be lower "
+                                                    + "than the failure threshold %d",
+                                                    warnValue, guardName, failValue));
+    }
+
+    /**
+     * Enforce the default value selected by {@link DatabaseDescriptor#isApplyDbaasDefaults()} when a setting is unset.
+     *
+     * @param current       current config value defined in yaml, or {@code null} if it was not specified
+     * @param optionSetter  setter used to update the given config
+     * @param onPremDefault default value for on-prem
+     * @param dbaasDefault  default value for constellation DB-as-a-service
+     * @param <T>           the type of the config value
+     */
+    private static <T> void enforceDefault(T current, Consumer<T> optionSetter, T onPremDefault, T dbaasDefault)
+    {
+        if (current == null)
+        {
+            T fallback = DatabaseDescriptor.isApplyDbaasDefaults() ? dbaasDefault : onPremDefault;
+            optionSetter.accept(fallback);
+        }
+    }
+
+    /**
+     * @return {@code true} if the given disk usage threshold is negative, which disables the disk usage guardrail
+     */
+    public static boolean diskUsageGuardrailDisabled(double value)
+    {
+        return value < 0;
+    }
+
+    /**
+     * Validate the disk usage thresholds: each must be -1 (disabled) or within [1, 100], with warn not above failure.
+     *
+     * @throws ConfigurationException if any of the settings has an invalid value.
+     */
+    @VisibleForTesting
+    public void validateDiskUsageThreshold()
+    {
+        validatePositiveNumeric(disk_usage_percentage_warn_threshold, 100, false, "disk_usage_percentage_warn_threshold"); // percentage: capped at 100, zero rejected
+        validatePositiveNumeric(disk_usage_percentage_failure_threshold, 100, false, "disk_usage_percentage_failure_threshold");
+        validateWarnLowerThanFail(disk_usage_percentage_warn_threshold, disk_usage_percentage_failure_threshold, "disk_usage_percentage"); // skipped when either is -1
+    }
+}
diff --git a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
index 0ac189c9e77e..999e1bd11f2c 100644
--- a/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/CQLSSTableWriter.java
@@ -50,7 +50,7 @@
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.schema.*;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -235,20 +235,21 @@ public CQLSSTableWriter rawAddRow(List<ByteBuffer> values)
         if (values.size() != boundNames.size())
             throw new InvalidRequestException(String.format("Invalid number of arguments, expecting %d values but got %d", boundNames.size(), values.size()));
 
+        QueryState state = QueryState.forInternalCalls();
         QueryOptions options = QueryOptions.forInternalCalls(null, values);
-        List<ByteBuffer> keys = insert.buildPartitionKeyNames(options);
-        SortedSet<Clustering<?>> clusterings = insert.createClustering(options);
+        List<ByteBuffer> keys = insert.buildPartitionKeyNames(options, state);
+        SortedSet<Clustering<?>> clusterings = insert.createClustering(options, state);
 
         long now = System.currentTimeMillis();
         // Note that we asks indexes to not validate values (the last 'false' arg below) because that triggers a 'Keyspace.open'
         // and that forces a lot of initialization that we don't want.
         UpdateParameters params = new UpdateParameters(insert.metadata,
                                                        insert.updatedColumns(),
+                                                       QueryState.forInternalCalls(),
                                                        options,
                                                        insert.getTimestamp(TimeUnit.MILLISECONDS.toMicros(now), options),
                                                        (int) TimeUnit.MILLISECONDS.toSeconds(now),
-                                                       insert.getTimeToLive(options),
-                                                       Collections.emptyMap());
+                                                       insert.getTimeToLive(options), Collections.emptyMap());
 
         try
         {
@@ -558,9 +559,9 @@ private Types createTypes(String keyspace)
          */
         private TableMetadata createTable(Types types)
         {
-            ClientState state = ClientState.forInternalCalls();
-            CreateTableStatement statement = schemaStatement.prepare(state);
-            statement.validate(ClientState.forInternalCalls());
+            QueryState state = QueryState.forInternalCalls();
+            CreateTableStatement statement = schemaStatement.prepare(state.getClientState());
+            statement.validate(state);
 
             TableMetadata.Builder builder = statement.builder(types);
             if (partitioner != null)
@@ -576,8 +577,8 @@ private TableMetadata createTable(Types types)
          */
         private UpdateStatement prepareInsert()
         {
-            ClientState state = ClientState.forInternalCalls();
-            UpdateStatement insert = (UpdateStatement) insertStatement.prepare(state);
+            QueryState state = QueryState.forInternalCalls();
+            UpdateStatement insert = (UpdateStatement) insertStatement.prepare(state.getClientState());
             insert.validate(state);
 
             if (insert.hasConditions())
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 1e807dbb7ed1..18455493380f 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.io.sstable.format;
 
+import java.nio.ByteBuffer;
 import java.util.*;
 import java.util.function.Consumer;
 
@@ -30,9 +31,14 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DeletionPurger;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.ComplexColumnData;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.Unfiltered;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.FSWriteError;
 import org.apache.cassandra.io.sstable.Component;
@@ -43,7 +49,9 @@
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.utils.FBUtilities;
@@ -423,6 +431,47 @@ public interface SSTableSizeParameters
         long dataSize();
     }
 
+    /**
+     * Applies the collectionSize and itemsPerCollection guardrails to each multi-cell collection column of a row; non-row unfiltereds and internal keyspaces are ignored.
+     */
+    public static void guardCollectionSize(UnfilteredRowIterator partition, Unfiltered unfiltered)
+    {
+        if (!unfiltered.isRow() || SchemaConstants.isInternalKeyspace(partition.metadata().keyspace))
+            return;
+        if (!Guardrails.collectionSize.enabled() && !Guardrails.itemsPerCollection.enabled())
+            return;
+
+        Row row = (Row) unfiltered;
+        for (ColumnMetadata column : row.columns())
+        {
+            if (!column.type.isCollection() || !column.type.isMultiCell())
+                continue;
+            ComplexColumnData cells = row.getComplexColumnData(column);
+            if (cells == null)
+                continue;
+            // NOTE(review): presumably drops deleted/expired cells so only live data is measured - confirm.
+            ComplexColumnData liveCells = cells.purge(DeletionPurger.PURGE_ALL, FBUtilities.nowInSeconds());
+            if (liveCells == null)
+                continue;
+            int cellsSize = liveCells.dataSize();
+            int cellsCount = liveCells.cellsCount();
+            // Skip building the message unless at least one of the two guardrails triggers for this column.
+            if (!Guardrails.collectionSize.triggersOn(cellsSize) &&
+                !Guardrails.itemsPerCollection.triggersOn(cellsCount))
+                continue;
+
+            TableMetadata metadata = partition.metadata();
+            ByteBuffer key = partition.partitionKey().getKey();
+            String keyString = metadata.primaryKeyAsCQLLiteral(key, row.clustering());
+            String msg = String.format("%s in row %s in table %s",
+                                       column.name.toString(),
+                                       keyString,
+                                       metadata);
+            Guardrails.collectionSize.guard(cellsSize, msg, true);
+            Guardrails.itemsPerCollection.guard(cellsCount, msg, true);
+        }
+    }
+
     public static abstract class Factory
     {
         public abstract long estimateSize(SSTableSizeParameters parameters);
@@ -442,6 +491,15 @@ public abstract SSTableWriter open(Descriptor descriptor,
 
     protected void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
     {
+        if (SchemaConstants.isInternalKeyspace(metadata().keyspace))
+            return;
+
+        if (Guardrails.partitionSize.triggersOn(rowSize))
+        {
+            String keyString = metadata().partitionKeyAsCQLLiteral(key.getKey());
+            Guardrails.partitionSize.guard(rowSize, String.format("%s in %s", keyString, metadata), true);
+        }
+
         if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
         {
             String keyString = metadata().partitionKeyType.getString(key.getKey());
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
index 399c273cfbcc..5c6d465301b3 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/ColumnIndex.java
@@ -34,6 +34,7 @@
 import org.apache.cassandra.io.sstable.format.big.IndexInfo;
 import org.apache.cassandra.io.sstable.format.big.BigTableRowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.util.DataOutputBuffer;
 import org.apache.cassandra.io.util.SequentialWriter;
@@ -120,7 +121,11 @@ public void buildRowIndex(UnfilteredRowIterator iterator) throws IOException
         this.headerLength = writer.position() - initialPosition;
 
         while (iterator.hasNext())
-            add(iterator.next());
+        {
+            Unfiltered unfiltered = iterator.next();
+            SSTableWriter.guardCollectionSize(iterator, unfiltered);
+            add(unfiltered);
+        }
 
         finish();
     }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
index 263e73a24a90..b0f55c15c629 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/PartitionWriter.java
@@ -34,6 +34,7 @@
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.rows.UnfilteredSerializer;
 import org.apache.cassandra.io.sstable.format.SSTableFlushObserver;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.format.trieindex.RowIndexReader.IndexInfo;
 import org.apache.cassandra.io.util.SequentialWriter;
@@ -112,6 +113,7 @@ public long writePartition(UnfilteredRowIterator partition) throws IOException
         while (partition.hasNext())
         {
             Unfiltered unfiltered = partition.next();
+            SSTableWriter.guardCollectionSize(partition, unfiltered);
             addUnfiltered(unfiltered);
         }
 
diff --git a/src/java/org/apache/cassandra/schema/Schema.java b/src/java/org/apache/cassandra/schema/Schema.java
index 2a7a451b558a..08426cf47610 100644
--- a/src/java/org/apache/cassandra/schema/Schema.java
+++ b/src/java/org/apache/cassandra/schema/Schema.java
@@ -335,6 +335,14 @@ public List<String> getUserKeyspaces()
         return ImmutableList.copyOf(Sets.difference(getNonSystemKeyspacesSet(), SchemaConstants.REPLICATED_SYSTEM_KEYSPACE_NAMES));
     }
 
+    /**
+     * @return a new list of the keyspaces from {@link #getUserKeyspaces()} for which {@link SchemaConstants#isInternalKeyspace} is false.
+     */
+    public List<String> getNonInternalKeyspaces()
+    {
+        return getUserKeyspaces().stream().filter(ks -> !SchemaConstants.isInternalKeyspace(ks)).collect(Collectors.toList());
+    }
+
     /**
      * Get metadata about keyspace inner ColumnFamilies
      *
diff --git a/src/java/org/apache/cassandra/schema/SchemaConstants.java b/src/java/org/apache/cassandra/schema/SchemaConstants.java
index 7b6b7de4906d..8c6c4bbb42f7 100644
--- a/src/java/org/apache/cassandra/schema/SchemaConstants.java
+++ b/src/java/org/apache/cassandra/schema/SchemaConstants.java
@@ -42,14 +42,18 @@ public final class SchemaConstants
     public static final String VIRTUAL_SCHEMA = "system_virtual_schema";
 
     public static final String VIRTUAL_VIEWS = "system_views";
+    public static final String SCHEMA_VIRTUAL_KEYSPACE_NAME = "system_virtual_schema";
+    public static final String SYSTEM_VIEWS_KEYSPACE_NAME = "system_views";
 
     /* system keyspace names (the ones with LocalStrategy replication strategy) */
-    public static final Set<String> LOCAL_SYSTEM_KEYSPACE_NAMES =
-        ImmutableSet.of(SYSTEM_KEYSPACE_NAME, SCHEMA_KEYSPACE_NAME);
+    public static final Set<String> LOCAL_SYSTEM_KEYSPACE_NAMES = ImmutableSet.of(SYSTEM_KEYSPACE_NAME, SCHEMA_KEYSPACE_NAME);
 
     /* replicate system keyspace names (the ones with a "true" replication strategy) */
-    public static final Set<String> REPLICATED_SYSTEM_KEYSPACE_NAMES =
-        ImmutableSet.of(TRACE_KEYSPACE_NAME, AUTH_KEYSPACE_NAME, DISTRIBUTED_KEYSPACE_NAME);
+    public static final Set<String> REPLICATED_SYSTEM_KEYSPACE_NAMES = ImmutableSet.of(TRACE_KEYSPACE_NAME, AUTH_KEYSPACE_NAME, DISTRIBUTED_KEYSPACE_NAME);
+
+    /* virtual keyspace names */
+    public static final Set<String> VIRTUAL_KEYSPACE_NAMES = ImmutableSet.of(SCHEMA_VIRTUAL_KEYSPACE_NAME, SYSTEM_VIEWS_KEYSPACE_NAME);
+
     /**
      * longest permissible KS or CF name.  Our main concern is that filename not be more than 255 characters;
      * the filename will contain both the KS and CF names. Since non-schema-name components only take up
@@ -108,4 +112,27 @@ public static boolean isSystemKeyspace(String keyspaceName)
                 || isReplicatedSystemKeyspace(keyspaceName)
                 || isVirtualSystemKeyspace(keyspaceName);
     }
+    
+    /**
+     * @return whether the lower-cased keyspace name is one of {@link #VIRTUAL_KEYSPACE_NAMES} (system_virtual_schema, system_views)
+     */
+    public static boolean isVirtualKeyspace(String keyspaceName)
+    {
+        return VIRTUAL_KEYSPACE_NAMES.contains(keyspaceName.toLowerCase());
+    }
+
+    /** @return whether the keyspace is a local system, replicated system or virtual keyspace */
+    public static boolean isInternalKeyspace(String keyspaceName)
+    {
+        boolean systemKeyspace = isLocalSystemKeyspace(keyspaceName) || isReplicatedSystemKeyspace(keyspaceName);
+        return systemKeyspace || isVirtualKeyspace(keyspaceName);
+    }
+    
+    /**
+     * @return whether or not the keyspace is a user keyspace, i.e. {@link #isInternalKeyspace} is false for it
+     */
+    public static boolean isUserKeyspace(String keyspaceName)
+    {
+        return !isInternalKeyspace(keyspaceName);
+    }
 }
diff --git a/src/java/org/apache/cassandra/schema/SchemaTransformations.java b/src/java/org/apache/cassandra/schema/SchemaTransformations.java
new file mode 100644
index 000000000000..a3950d54feac
--- /dev/null
+++ b/src/java/org/apache/cassandra/schema/SchemaTransformations.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+
+import org.apache.cassandra.db.marshal.UserType;
+import org.apache.cassandra.exceptions.AlreadyExistsException;
+
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+
+/**
+ * Factory and utility methods to create simple schema transformations.
+ */
+public class SchemaTransformations
+{
+    /**
+     * Creates a schema transformation that adds the provided keyspace.
+     *
+     * @param keyspace       the keyspace to add.
+     * @param ignoreIfExists if {@code true}, the transformation is a no-op if a keyspace with the same name as
+     *                       {@code keyspace} already exists in the schema the transformation is applied on. Otherwise,
+     *                       the transformation throws an {@link AlreadyExistsException} in that case.
+     * @return the created transformation.
+     */
+    public static SchemaTransformation addKeyspace(KeyspaceMetadata keyspace, boolean ignoreIfExists)
+    {
+        return schema ->
+        {
+            KeyspaceMetadata existing = schema.getNullable(keyspace.name);
+            if (existing != null)
+            {
+                if (ignoreIfExists)
+                    return schema;
+
+                throw new AlreadyExistsException(keyspace.name);
+            }
+
+            return schema.withAddedOrUpdated(keyspace);
+        };
+    }
+
+    /**
+     * Creates a schema transformation that adds the provided table.
+     *
+     * @param table          the table to add.
+     * @param ignoreIfExists if {@code true}, the transformation is a no-op if a table with the same name as
+     *                       {@code table} already exists in the schema the transformation is applied on. Otherwise,
+     *                       the transformation throws an {@link AlreadyExistsException} in that case.
+     * @return the created transformation.
+     */
+    public static SchemaTransformation addTable(TableMetadata table, boolean ignoreIfExists)
+    {
+        return schema ->
+        {
+            KeyspaceMetadata keyspace = schema.getNullable(table.keyspace);
+            if (keyspace == null)
+                throw invalidRequest("Keyspace '%s' doesn't exist", table.keyspace);
+
+            if (keyspace.hasTable(table.name))
+            {
+                if (ignoreIfExists)
+                    return schema;
+
+                throw new AlreadyExistsException(table.keyspace, table.name);
+            }
+            // Validation is only reached for a table that is actually going to be added.
+            table.validate();
+
+            return schema.withAddedOrUpdated(keyspace.withSwapped(keyspace.tables.with(table)));
+        };
+    }
+
+    /**
+     * Creates a schema transformation that either adds the provided type, or "updates" (really, replaces) it with the
+     * provided type.
+     *
+     * <p>Please note that this is usually <b>unsafe</b>: if the type exists, this replaces it without any particular check
+     * and so could replace it with an incompatible version. This is used internally however for hard-coded tables
+     * (System ones, including DSE ones) to force the "last version".
+     *
+     * @param type the type to add/update.
+     * @return the created transformation.
+     */
+    public static SchemaTransformation addOrUpdateType(UserType type)
+    {
+        return schema ->
+        {
+            KeyspaceMetadata keyspace = schema.getNullable(type.keyspace);
+            if (null == keyspace)
+                throw invalidRequest("Keyspace '%s' doesn't exist", type.keyspace);
+            // Replace the type when one with that name is already present, otherwise add it.
+            Types newTypes = keyspace.types.get(type.name).isPresent()
+                             ? keyspace.types.withUpdatedUserType(type)
+                             : keyspace.types.with(type);
+            return schema.withAddedOrUpdated(keyspace.withSwapped(newTypes));
+        };
+    }
+
+    /**
+     * Creates a schema transformation that adds the provided view.
+     *
+     * @param view           the view to add.
+     * @param ignoreIfExists if {@code true}, the transformation is a no-op if a view with the same name as
+     *                       {@code view} already exists in the schema the transformation is applied on. Otherwise,
+     *                       the transformation throws an {@link AlreadyExistsException} in that case.
+     * @return the created transformation.
+     */
+    public static SchemaTransformation addView(ViewMetadata view, boolean ignoreIfExists)
+    {
+        return schema ->
+        {
+            KeyspaceMetadata keyspace = schema.getNullable(view.keyspace());
+            if (keyspace == null)
+                throw invalidRequest("Cannot add view to non existing keyspace '%s'", view.keyspace());
+
+            if (keyspace.hasView(view.name()))
+            {
+                if (ignoreIfExists)
+                    return schema;
+
+                throw new AlreadyExistsException(view.keyspace(), view.name());
+            }
+
+            return schema.withAddedOrUpdated(keyspace.withSwapped(keyspace.views.with(view)));
+        };
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/schema/TableMetadata.java b/src/java/org/apache/cassandra/schema/TableMetadata.java
index a5530ed629e3..60aa2fbb7bd8 100644
--- a/src/java/org/apache/cassandra/schema/TableMetadata.java
+++ b/src/java/org/apache/cassandra/schema/TableMetadata.java
@@ -25,7 +25,6 @@
 
 import com.google.common.base.MoreObjects;
 import com.google.common.collect.*;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -40,6 +39,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.service.reads.SpeculativeRetryPolicy;
+import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.AbstractIterator;
 import org.github.jamm.Unmetered;
 
@@ -698,6 +698,65 @@ public String toDebugString()
                           .toString();
     }
 
+    /**
+     * Returns a string representation of a partition key in a CQL-friendly format.
+     * <p>
+     * For non-composite types it returns the result of {@link org.apache.cassandra.cql3.CQL3Type#toCQLLiteral}
+     * applied to the partition key.
+     * For composite types it applies {@link org.apache.cassandra.cql3.CQL3Type#toCQLLiteral} to each subkey and combines
+     * the results into a tuple.
+     *
+     * @param partitionKey the serialized partition key to render
+     * @return CQL-like string representation of the partition key
+     */
+    public String partitionKeyAsCQLLiteral(ByteBuffer partitionKey)
+    {
+        return primaryKeyAsCQLLiteral(partitionKey, Clustering.EMPTY);
+    }
+
+    /**
+     * Renders a primary key (partition key followed by clustering values) as CQL literals.
+     *
+     * @param partitionKey the serialized partition key; split into its components when the key type is composite
+     * @param clustering   the clustering whose values are appended after the partition key literals
+     * @return the single literal when exactly one literal was produced, otherwise a parenthesised, comma-separated list
+     */
+    public String primaryKeyAsCQLLiteral(ByteBuffer partitionKey, Clustering clustering)
+    {
+        int clusteringSize = clustering.size();
+        String[] literals;
+        int next;
+
+        if (partitionKeyType instanceof CompositeType)
+        {
+            CompositeType compositeType = (CompositeType) partitionKeyType;
+            ByteBuffer[] components = compositeType.split(partitionKey);
+            int componentCount = compositeType.types.size();
+            literals = new String[componentCount + clusteringSize];
+            for (next = 0; next < componentCount; next++)
+                literals[next] = asCQLLiteral(compositeType.types.get(next), components[next]);
+        }
+        else
+        {
+            literals = new String[1 + clusteringSize];
+            literals[0] = asCQLLiteral(partitionKeyType, partitionKey);
+            next = 1;
+        }
+
+        for (int j = 0; j < clusteringSize; j++, next++)
+            literals[next] = asCQLLiteral(clusteringColumns().get(j).type, clustering.bufferAt(j));
+
+        if (next == 1)
+            return literals[0];
+
+        return "(" + String.join(", ", literals) + ")";
+    }
+
+    private static String asCQLLiteral(AbstractType<?> type, ByteBuffer value)
+    {
+        return type.asCQL3Type().toCQLLiteral(value, ProtocolVersion.CURRENT); // literal rendered with ProtocolVersion.CURRENT
+    }
+
     public static final class Builder
     {
         final String keyspace;
@@ -984,6 +1043,11 @@ public Iterable<ColumnMetadata> columns()
             return columns.values();
         }
 
+        public int numColumns()
+        {
+            return columns.size(); // size of the same map whose values columns() exposes
+        }
+
         public Set<String> columnNames()
         {
             return columns.values().stream().map(c -> c.name.toString()).collect(toSet());
diff --git a/src/java/org/apache/cassandra/service/CASRequest.java b/src/java/org/apache/cassandra/service/CASRequest.java
index 88fb9bd70d88..3404451c8f8c 100644
--- a/src/java/org/apache/cassandra/service/CASRequest.java
+++ b/src/java/org/apache/cassandra/service/CASRequest.java
@@ -42,5 +42,5 @@ public interface CASRequest
      * The updates to perform of a CAS success. The values fetched using the readFilter()
      * are passed as argument.
      */
-    public PartitionUpdate makeUpdates(FilteredPartition current) throws InvalidRequestException;
+    public PartitionUpdate makeUpdates(FilteredPartition current, QueryState state) throws InvalidRequestException;
 }
diff --git a/src/java/org/apache/cassandra/service/ClientState.java b/src/java/org/apache/cassandra/service/ClientState.java
index 496cabdb8ec2..da0e563938a2 100644
--- a/src/java/org/apache/cassandra/service/ClientState.java
+++ b/src/java/org/apache/cassandra/service/ClientState.java
@@ -151,6 +151,13 @@ protected ClientState(ClientState source)
         this.driverVersion = source.driverVersion;
     }
 
+    private ClientState(AuthenticatedUser user)
+    {
+        this.isInternal = false; // not an internal state
+        this.remoteAddress = null; // no remote address is associated with it
+        this.user = user; // user is supplied directly by the caller
+    }
+
     /**
      * @return a ClientState object for internal C* calls (not limited by any kind of auth).
      */
@@ -174,6 +181,14 @@ public static ClientState forExternalCalls(SocketAddress remoteAddress)
         return new ClientState((InetSocketAddress)remoteAddress);
     }
 
+    /**
+     * @return a ClientState object for external (non-internal) calls with the given user already set as its user.
+     */
+    public static ClientState forExternalCalls(AuthenticatedUser user)
+    {
+        return new ClientState(user);
+    }
+
     /**
      * Clone this ClientState object, but use the provided keyspace instead of the
      * keyspace in this ClientState object.
diff --git a/src/java/org/apache/cassandra/service/QueryState.java b/src/java/org/apache/cassandra/service/QueryState.java
index adb13b505df9..60cebf309ea3 100644
--- a/src/java/org/apache/cassandra/service/QueryState.java
+++ b/src/java/org/apache/cassandra/service/QueryState.java
@@ -19,7 +19,7 @@
 
 import java.net.InetAddress;
 
-import org.apache.cassandra.transport.ClientStat;
+import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.utils.FBUtilities;
 
 /**
@@ -115,4 +115,15 @@ public InetAddress getClientAddress()
     {
         return clientState.getClientAddress();
     }
+
+    /**
+     * Tells whether the user of this query's client state is a regular one.
+     *
+     * @return {@code false} if there is no user, or if the user is a system or super user; {@code true} otherwise.
+     */
+    public boolean isOrdinaryUser()
+    {
+        AuthenticatedUser current = getClientState().getUser();
+        return current != null && !(current.isSystem() || current.isSuper());
+    }
 }
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 72801a9d129b..361b57fc7aac 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -280,7 +280,7 @@ public static RowIterator cas(String keyspaceName,
                                   CASRequest request,
                                   ConsistencyLevel consistencyForPaxos,
                                   ConsistencyLevel consistencyForCommit,
-                                  ClientState state,
+                                  QueryState state,
                                   int nowInSeconds,
                                   long queryStartNanoTime)
     throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException, CasWriteUnknownResultException
@@ -289,6 +289,8 @@ public static RowIterator cas(String keyspaceName,
         try
         {
             TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
+            consistencyForPaxos.validateForCas(keyspaceName);
+            consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName);
 
             Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () ->
             {
@@ -311,7 +313,7 @@ public static RowIterator cas(String keyspaceName,
                 }
 
                 // Create the desired updates
-                PartitionUpdate updates = request.makeUpdates(current);
+                PartitionUpdate updates = request.makeUpdates(current, state);
 
                 long size = updates.dataSize();
                 casWriteMetrics.mutationSize.update(size);
@@ -334,7 +336,7 @@ public static RowIterator cas(String keyspaceName,
                            consistencyForPaxos,
                            consistencyForCommit,
                            consistencyForCommit,
-                           state,
+                           state.getClientState(),
                            queryStartNanoTime,
                            casWriteMetrics,
                            updateProposer);
@@ -434,9 +436,9 @@ private static RowIterator doPaxos(TableMetadata metadata,
         AbstractReplicationStrategy latestRs = keyspace.getReplicationStrategy();
         try
         {
-            consistencyForPaxos.validateForCas();
-            consistencyForReplayCommits.validateForCasCommit(latestRs);
-            consistencyForCommit.validateForCasCommit(latestRs);
+            consistencyForPaxos.validateForCas(metadata.keyspace);
+            consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace);
+            consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace);
 
             long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
             while (System.nanoTime() - queryStartNanoTime < timeoutNanos)
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 8d07f655860f..c4e910ebc889 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -102,6 +102,7 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.schema.ViewMetadata;
+import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster;
 import org.apache.cassandra.streaming.*;
 import org.apache.cassandra.tracing.TraceKeyspace;
 import org.apache.cassandra.transport.ClientResourceLimits;
@@ -977,6 +978,7 @@ else if (isReplacingSameAddress())
             // gossip Schema.emptyVersion forcing immediate check for schema updates (see MigrationManager#maybeScheduleSchemaPull)
             Schema.instance.updateVersionAndAnnounce(); // Ensure we know our own actual Schema UUID in preparation for updates
             LoadBroadcaster.instance.startBroadcasting();
+            DiskUsageBroadcaster.instance.startBroadcasting();
             HintsService.instance.startDispatch();
             BatchlogManager.instance.start();
         }
diff --git a/src/java/org/apache/cassandra/service/disk/usage/DiskUsageBroadcaster.java b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageBroadcaster.java
new file mode 100644
index 000000000000..9cc8d30af88c
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageBroadcaster.java
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service.disk.usage;
+
+
+import java.net.InetAddress;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.EndpointState;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.gms.IEndpointStateChangeSubscriber;
+import org.apache.cassandra.gms.VersionedValue;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.NoSpamLogger;
+
+/**
+ * Starts {@link DiskUsageMonitor} to monitor the local disk usage state and broadcasts each new state via Gossip.
+ * At the same time, caches the disk usage state that other endpoints publish via Gossip.
+ */
+public class DiskUsageBroadcaster implements IEndpointStateChangeSubscriber
+{
+    private static final Logger logger = LoggerFactory.getLogger(DiskUsageBroadcaster.class);
+    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 10, TimeUnit.MINUTES);
+
+    public static final DiskUsageBroadcaster instance = new DiskUsageBroadcaster(DiskUsageMonitor.instance);
+
+    private final DiskUsageMonitor monitor;
+    private final ConcurrentMap<InetAddressAndPort, DiskUsageState> usageInfo = new ConcurrentHashMap<>();
+    private volatile boolean hasStuffedOrFullNode = false;
+
+    @VisibleForTesting
+    public DiskUsageBroadcaster(DiskUsageMonitor monitor)
+    {
+        this.monitor = monitor;
+        Gossiper.instance.register(this);
+    }
+
+    /**
+     * @return true if any node in the cluster is STUFFED OR FULL
+     */
+    public boolean hasStuffedOrFullNode()
+    {
+        return hasStuffedOrFullNode;
+    }
+
+    /**
+     * @return true if given node's disk usage is FULL; nodes without a cached state are treated as NOT_AVAILABLE
+     */
+    public boolean isFull(InetAddressAndPort endpoint)
+    {
+        return usageInfo.getOrDefault(endpoint, DiskUsageState.NOT_AVAILABLE).isFull();
+    }
+
+    /**
+     * @return true if given node's disk usage is STUFFED; nodes without a cached state are treated as NOT_AVAILABLE
+     */
+    public boolean isStuffed(InetAddressAndPort endpoint)
+    {
+        return usageInfo.getOrDefault(endpoint, DiskUsageState.NOT_AVAILABLE).isStuffed();
+    }
+
+    public void startBroadcasting()
+    {
+        monitor.start(newState -> {
+            // Callback handed to the monitor: publishes the given state as the local DISK_USAGE application state.
+            if (logger.isTraceEnabled())
+                logger.trace("Disseminating disk usage info: {}", newState);
+
+            Gossiper.instance.addLocalApplicationState(ApplicationState.DISK_USAGE, StorageService.instance.valueFactory.diskUsage(newState.name()));
+        });
+    }
+
+    public void onChange(InetAddressAndPort endpoint, ApplicationState state, VersionedValue value)
+    {
+        if (state != ApplicationState.DISK_USAGE)
+            return;
+        // Fall back to NOT_AVAILABLE when the gossiped value is not a known DiskUsageState name.
+        DiskUsageState usageState = DiskUsageState.NOT_AVAILABLE;
+        try
+        {
+            usageState = DiskUsageState.valueOf(value.value);
+        }
+        catch (IllegalArgumentException e)
+        {
+            noSpamLogger.warn("Found unknown DiskUsageState: {}. Using default state {} instead.", value.value, usageState); // constant template so the rate limit is keyed on the statement, not on each gossiped value
+        }
+        usageInfo.put(endpoint, usageState);
+        // Refresh the cluster-wide flag from the state just received and from the cached states.
+        hasStuffedOrFullNode = usageState.isStuffedOrFull() || usageInfo.values().stream().anyMatch(DiskUsageState::isStuffedOrFull);
+    }
+
+    public void onJoin(InetAddressAndPort endpoint, EndpointState epState)
+    {
+        updateDiskUsage(endpoint, epState);
+    }
+
+    public void beforeChange(InetAddressAndPort endpoint, EndpointState currentState, ApplicationState newStateKey, VersionedValue newValue)
+    {
+    }
+
+    public void onAlive(InetAddressAndPort endpoint, EndpointState state)
+    {
+        updateDiskUsage(endpoint, state);
+    }
+
+    public void onDead(InetAddressAndPort endpoint, EndpointState state)
+    {
+        // do nothing, as we don't care about dead node
+    }
+
+    public void onRestart(InetAddressAndPort endpoint, EndpointState state)
+    {
+        updateDiskUsage(endpoint, state);
+    }
+
+    public void onRemove(InetAddressAndPort endpoint)
+    {
+        usageInfo.remove(endpoint);
+        hasStuffedOrFullNode = usageInfo.values().stream().anyMatch(DiskUsageState::isStuffedOrFull);
+    }
+
+    private void updateDiskUsage(InetAddressAndPort endpoint, EndpointState state)
+    {
+        VersionedValue localValue = state.getApplicationState(ApplicationState.DISK_USAGE);
+        // Only react when the endpoint state actually carries a DISK_USAGE value.
+        if (localValue != null)
+        {
+            onChange(endpoint, ApplicationState.DISK_USAGE, localValue);
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
new file mode 100644
index 000000000000..f1659bd5816c
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service.disk.usage;
+
+import java.math.BigDecimal;
+import java.math.RoundingMode;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Consumer;
+import java.util.function.LongSupplier;
+import java.util.function.Supplier;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.guardrails.Guardrails;
+import org.apache.cassandra.guardrails.GuardrailsConfig;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.Schema;
+
+/**
+ * Schedule periodic task to monitor local disk usage and notify {@link DiskUsageBroadcaster} if local state changed.
+ */
+public class DiskUsageMonitor
+{
+    private static final Logger logger = LoggerFactory.getLogger(DiskUsageMonitor.class);
+
+    private static final int MONITOR_INTERVAL = Integer.parseInt(System.getProperty(Config.PROPERTY_PREFIX + "disk_usage.monitor_interval_ms", Integer.valueOf(30 * 1000).toString()));
+
+    public static DiskUsageMonitor instance = new DiskUsageMonitor();
+
+    private final GuardrailsConfig config;
+    private final LongSupplier warnValueSupplier;
+    private final LongSupplier failValueSupplier;
+    private final Supplier<Directories.DataDirectories> dataDirectoriesSupplier;
+
+    private volatile DiskUsageState localState = DiskUsageState.NOT_AVAILABLE;
+
+    @VisibleForTesting
+    public DiskUsageMonitor()
+    {
+        this.config = DatabaseDescriptor.getGuardrailsConfig();
+        this.warnValueSupplier = () -> config.disk_usage_percentage_warn_threshold; // field is re-read on every call
+        this.failValueSupplier = () -> config.disk_usage_percentage_failure_threshold; // field is re-read on every call
+        this.dataDirectoriesSupplier = () -> Directories.dataDirectories; // resolved on each call, not at construction
+    }
+
+    /**
+     * Start monitoring local disk usage and call notifier when local disk usage state changed.
+     */
+    public void start(Consumer<DiskUsageState> notifier)
+    {
+        // The task is scheduled even while the guardrail is disabled, and re-checks the switch on every run, so that
+        // the guardrail can be turned on later without restarting the node.
+        Runnable monitorTask = () -> {
+            if (Guardrails.localDiskUsage.enabled())
+                updateLocalState(getDiskUsage(), notifier);
+        };
+
+        ScheduledExecutors.scheduledTasks.scheduleAtFixedRate(monitorTask, 0, MONITOR_INTERVAL, TimeUnit.MILLISECONDS);
+    }
+
+    @VisibleForTesting
+    public void updateLocalState(double usageRatio, Consumer<DiskUsageState> notifier)
+    {
+        // The ratio is turned into a percentage and rounded up to the next whole number before being evaluated.
+        long roundedUpPercentage = (long) Math.ceil(usageRatio * 100);
+        DiskUsageState newState = getState(roundedUpPercentage);
+
+        // The guardrail is evaluated on every call, whether or not the state changed.
+        Guardrails.localDiskUsage.guard(roundedUpPercentage, newState.toString(), false);
+
+        // The notifier is only invoked on an actual state transition.
+        if (newState != localState)
+        {
+            localState = newState;
+            notifier.accept(newState);
+        }
+    }
+
+    /**
+     * @return the state last stored by {@link #updateLocalState}, or {@code NOT_AVAILABLE} if none was stored yet
+     */
+    @VisibleForTesting
+    public DiskUsageState state()
+    {
+        return localState;
+    }
+
+    /**
+     * @return ratio of used space (data directories plus live memtable data) to the total space of the data directories
+     */
+    private double getDiskUsage()
+    {
+        // using BigDecimal to handle large file system
+        BigDecimal used = BigDecimal.ZERO;
+        BigDecimal total = BigDecimal.ZERO;
+
+        for (Directories.DataDirectory dir : dataDirectoriesSupplier.get())
+        {
+            used = used.add(BigDecimal.valueOf(dir.getSpaceUsed()));
+            total = total.add(BigDecimal.valueOf(dir.getTotalSpace()));
+        }
+        // memtable data is counted as used space as well
+        used = used.add(BigDecimal.valueOf(getAllMemtableSize()));
+
+        if (logger.isTraceEnabled())
+            logger.trace("Disk Usage Guardrail: current disk usage = {}, total disk space = {}.",
+                         FileUtils.stringifyFileSize(used.doubleValue()),
+                         FileUtils.stringifyFileSize(total.doubleValue()));
+        // NOTE(review): divide() throws ArithmeticException when total is zero (e.g. no data directories) - confirm
+        return used.divide(total, 5, RoundingMode.UP).doubleValue();
+    }
+
+    /**
+     * Adds up the live data size of every memtable, across all tables of all keyspaces known to the schema.
+     *
+     * @return the summed live data size reported by the memtables
+     */
+    @VisibleForTesting
+    public long getAllMemtableSize()
+    {
+        long totalLiveSize = 0;
+
+        for (String ksName : Schema.instance.getKeyspaces())
+        {
+            for (ColumnFamilyStore store : Keyspace.open(ksName).getColumnFamilyStores())
+            {
+                for (Memtable current : store.getTracker().getView().getAllMemtables())
+                    totalLiveSize += current.getLiveDataSize();
+            }
+        }
+        return totalLiveSize;
+    }
+
+    /**
+     * Maps a usage percentage onto a state; a threshold that is disabled is never considered exceeded, and the state
+     * is {@code NOT_AVAILABLE} when both thresholds are disabled.
+     */
+    @VisibleForTesting
+    public DiskUsageState getState(long usagePercentage)
+    {
+        long warnThreshold = warnValueSupplier.getAsLong();
+        long failThreshold = failValueSupplier.getAsLong();
+        boolean warnEnabled = !GuardrailsConfig.diskUsageGuardrailDisabled(warnThreshold);
+        boolean failEnabled = !GuardrailsConfig.diskUsageGuardrailDisabled(failThreshold);
+
+        if (!warnEnabled && !failEnabled)
+            return DiskUsageState.NOT_AVAILABLE;
+        if (failEnabled && usagePercentage > failThreshold)
+            return DiskUsageState.FULL;
+        if (warnEnabled && usagePercentage > warnThreshold)
+            return DiskUsageState.STUFFED;
+        return DiskUsageState.SPACIOUS;
+    }
+}
diff --git a/src/java/org/apache/cassandra/service/disk/usage/DiskUsageState.java b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageState.java
new file mode 100644
index 000000000000..4df34173e415
--- /dev/null
+++ b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageState.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.service.disk.usage;
+
+public enum DiskUsageState
+{
+    NOT_AVAILABLE("Not Available"), // either disk usage guardrail is not enabled or gossip state is not ready
+    SPACIOUS("Spacious"),           // below disk_usage_percentage_warn_threshold
+    STUFFED("Stuffed"),             // exceeds disk_usage_percentage_warn_threshold but below disk_usage_percentage_failure_threshold
+    FULL("Full");                   // exceeds disk_usage_percentage_failure_threshold
+
+    private final String label; // display text returned by toString(); name() still yields the constant's identifier
+
+    DiskUsageState(String label)
+    {
+        this.label = label;
+    }
+
+    public boolean isFull()
+    {
+        return FULL == this;
+    }
+
+    public boolean isStuffed()
+    {
+        return STUFFED == this;
+    }
+
+    public boolean isStuffedOrFull()
+    {
+        return STUFFED == this || FULL == this; // true for the 2 states that exceed a threshold
+    }
+
+    @Override
+    public String toString()
+    {
+        return label;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/Comparables.java b/src/java/org/apache/cassandra/utils/Comparables.java
new file mode 100644
index 000000000000..ead35b5a804d
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/Comparables.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils;
+
+import java.util.Comparator;
+
+/**
+ * Utility methods linked to comparing comparable values.
+ */
+public class Comparables
+{
+    /**
+     * Returns the maximum of 2 comparable values. On ties, returns the first argument. {@code a} must not be null.
+     */
+    public static <T extends Comparable<? super T>> T max(T a, T b)
+    {
+        return a.compareTo(b) < 0 ? b : a;
+    }
+
+    /**
+     * Returns the maximum of 2 values given a comparator accepting those values. On ties, returns the first argument.
+     */
+    public static <T> T max(T a, T b, Comparator<? super T> comparator)
+    {
+        return comparator.compare(a, b) < 0 ? b : a;
+    }
+
+    /**
+     * Returns the minimum of 2 comparable values. On ties, returns the first argument. {@code a} must not be null.
+     */
+    public static <T extends Comparable<? super T>> T min(T a, T b)
+    {
+        return a.compareTo(b) > 0 ? b : a;
+    }
+
+    /**
+     * Returns the minimum of 2 values given a comparator accepting those values. On ties, returns the first argument.
+     */
+    public static <T> T min(T a, T b, Comparator<? super T> comparator)
+    {
+        return comparator.compare(a, b) > 0 ? b : a;
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/utils/units/RateUnit.java b/src/java/org/apache/cassandra/utils/units/RateUnit.java
new file mode 100644
index 000000000000..4dd1612e2eaf
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/RateUnit.java
@@ -0,0 +1,269 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.utils.Comparables;
+
+/**
+ * Represents the unit of a rate of transfer/work in term of byte sizes dealt with in a given time. As such, a
+ * {@link RateUnit} unit is simply the combination of a {@link SizeUnit} and a {@link TimeUnit}.
+ * <p>
+ * Note that while the code is relatively generic in that it can manipulate any combination of size unit and time unit, we
+ * pre-declare only a handful of the most common rates (only in seconds in practice).
+ */
+public class RateUnit implements Comparable<RateUnit>
+{
+    /**
+     * Bytes per Seconds.
+     */
+    public static final RateUnit B_S = RateUnit.of(SizeUnit.BYTES, TimeUnit.SECONDS);
+    /**
+     * KiloBytes per Seconds.
+     */
+    public static final RateUnit KB_S = RateUnit.of(SizeUnit.KILOBYTES, TimeUnit.SECONDS);
+    /**
+     * MegaBytes per Seconds.
+     */
+    public static final RateUnit MB_S = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.SECONDS);
+    /**
+     * GigaBytes per Seconds.
+     */
+    public static final RateUnit GB_S = RateUnit.of(SizeUnit.GIGABYTES, TimeUnit.SECONDS);
+    /**
+     * TeraBytes per Seconds.
+     */
+    public static final RateUnit TB_S = RateUnit.of(SizeUnit.TERABYTES, TimeUnit.SECONDS);
+
+    public final SizeUnit sizeUnit;
+    public final TimeUnit timeUnit;
+
+    private RateUnit(SizeUnit sizeUnit, TimeUnit timeUnit)
+    {
+        this.sizeUnit = sizeUnit;
+        this.timeUnit = timeUnit;
+    }
+
+    public static RateUnit of(SizeUnit sizeUnit, TimeUnit timeUnit)
+    {
+        return new RateUnit(sizeUnit, timeUnit);
+    }
+
+    /**
+     * Convert the given rate in the given unit to this unit. Conversions from finer to coarser granularities truncate,
+     * so lose precision, conversions from coarser to finer granularities with arguments that would numerically overflow
+     * saturate to <tt>Long.MIN_VALUE</tt> if negative or <tt>Long.MAX_VALUE</tt> if positive.
+     * <p>
+     * For example, to convert 10 megabytes per seconds to bytes per seconds, use: {@code B_S.convert(10L, MB_S)}.
+     *
+     * @param sourceRate the rate to convert in the given {@code sourceUnit}.
+     * @param sourceUnit the unit of the {@code sourceRate} argument
+     * @return the converted rate in this unit, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     */
+    public long convert(long sourceRate, RateUnit sourceUnit)
+    {
+        // We need to convert the size unit and the time unit. For the time unit, since it's a rate, we basically want
+        // to do the opposite of converting from the sourceUnit to the destinationUnit, so we convert from the
+        // destinationUnit to the sourceUnit, even though the value is obviously not in the destination unit in the
+        // first place.
+        // The order we apply the conversion matters however: say we convert '10 MB/s' to 'GB/days': if we were to apply
+        // the size conversion first, we'd get 0, since 10MB is 0GB. So we should apply the time conversion first
+        // ('10 MB/s' is '10 * 3600 * 24 MB/days') and then do the size conversion. Conversely, when converting
+        // '10 MB/s' to 'B/ms', we shouldn't convert by time first, as 10ms is 0s (we do the inverse conversion).
+        // In practice, if the source size unit is smaller than the destination one, we want to apply the time conversion
+        // first, otherwise, we can apply the size one first.
+        if (sourceUnit.sizeUnit.compareTo(sizeUnit) < 0)
+            return sizeUnit.convert(sourceUnit.timeUnit.convert(sourceRate, timeUnit), sourceUnit.sizeUnit);
+
+        return sourceUnit.timeUnit.convert(sizeUnit.convert(sourceRate, sourceUnit.sizeUnit), timeUnit);
+    }
+
+    /**
+     * Returns a Human Readable representation of the provided value in this unit.
+     * <p>
+     * Note that this method may discard precision for the sake of returning a more human readable value. In other
+     * words, if {@code value} is large, it will be converted to a bigger, more readable unit, even if this implies
+     * truncating the value.
+     *
+     * @param value the value in this unit.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    public String toHumanReadableString(long value)
+    {
+        return Units.toString(value, this);
+    }
+
+    public String toString(long value)
+    {
+        return Units.formatValue(value) + this;
+    }
+
+    static String toString(SizeUnit sizeUnit, TimeUnit timeUnit)
+    {
+        return String.format("%s/%s", sizeUnit.symbol, Units.TIME_UNIT_SYMBOL_FCT.apply(timeUnit));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(sizeUnit, timeUnit);
+    }
+
+    @Override
+    public boolean equals(Object other)
+    {
+        if (!(other instanceof RateUnit))
+            return false;
+
+        RateUnit that = (RateUnit) other;
+        return this.sizeUnit == that.sizeUnit && this.timeUnit == that.timeUnit;
+    }
+
+    @Override
+    public String toString()
+    {
+        return toString(sizeUnit, timeUnit);
+    }
+
+    /**
+     * Given a value in this unit, returns the smallest (most fine grained) unit in which that value can be represented
+     * without overflowing.
+     *
+     * @param value the value in this unit.
+     * @return the smallest unit, potentially this unit, at which the value can be represented without overflowing. If
+     * {@code value == Long.MAX_VALUE}, then this unit is returned.
+     */
+    RateUnit smallestRepresentableUnit(long value)
+    {
+        // This is kind of subtle because we get a smaller unit than this one by either decreasing the size unit
+        // or increasing the time unit, and both don't have the same effect, so we need to find the best
+        // combination of both operations that doesn't overflow our value.
+        // For instance, consider v1=(Long.MAX_VALUE-1 / 1000), then the smallest representable unit for
+        // v1 MB/ms is MB/s (kB/ms doesn't work), while for v2=(Long.MAX_VALUE-1 / 1024) MB/ms, the smallest
+        // representable unit is actually kB/ms (it's also representable as MB/s, but it's a bigger unit).
+        //
+        // So we try each option (decreasing size or increasing time), check whether it overflows (Long.MAX_VALUE
+        // marks an overflow) and, if it doesn't, apply it recursively. We then compare the units returned by both
+        // recursive calls to find the smallest one.
+        if (value == Long.MAX_VALUE)
+            return this;
+
+        SizeUnit nextSizeUnit = next(sizeUnit);
+        TimeUnit nextTimeUnit = next(timeUnit);
+
+        long vSize = nextSizeUnit == null ? Long.MAX_VALUE : nextSizeUnit.convert(value, sizeUnit);
+        // Reminder that because the time divides the rate, the conversion should be applied in reverse
+        long vTime = nextTimeUnit == null ? Long.MAX_VALUE : timeUnit.convert(value, nextTimeUnit);
+
+        RateUnit smallestWithSize = vSize == Long.MAX_VALUE
+                                    ? this
+                                    : RateUnit.of(nextSizeUnit, timeUnit).smallestRepresentableUnit(vSize);
+        RateUnit smallestWithTime = vTime == Long.MAX_VALUE
+                                    ? this
+                                    : RateUnit.of(sizeUnit, nextTimeUnit).smallestRepresentableUnit(vTime);
+
+        return Comparables.min(smallestWithSize, smallestWithTime);
+    }
+
+    private static SizeUnit next(SizeUnit unit)
+    {
+        int position = unit.ordinal(); // the result is the unit declared just before this one, or null for the first
+        return position > 0 ? SizeUnit.values()[position - 1] : null;
+    }
+
+    private static TimeUnit next(TimeUnit unit)
+    {
+        TimeUnit[] all = TimeUnit.values(); // the result is the unit declared just after this one, or null for the last
+        return unit.ordinal() + 1 < all.length ? all[unit.ordinal() + 1] : null;
+    }
+
+    public int compareTo(RateUnit that)
+    {
+        // Comparing rate units is a tad tricky. We're asking which is the bigger "transfer rate" between 1 of this unit
+        // and 1 of 'that' unit. This is easier when either the size unit or the time unit is the same in both.
+        if (this.sizeUnit == that.sizeUnit)
+            return that.timeUnit.compareTo(this.timeUnit); // 1 MB/h is smaller/slower than 1 MB/s
+
+        if (this.timeUnit == that.timeUnit)
+            return this.sizeUnit.compareTo(that.sizeUnit); // 1 MB/s is smaller/slower than 1 TB/s
+
+        // Otherwise, we have to compute by how much it differs in size versus by how much it differs in time.
+        if (this.sizeUnit.compareTo(that.sizeUnit) < 0)
+        {
+            if (this.timeUnit.compareTo(that.timeUnit) < 0)
+            {
+                // this = 1 B/ms and that = 1 MB/s
+                // How much we'll multiply 'that' to get it into 'this' size unit
+                long thatScale = valueDiff(this.sizeUnit, that.sizeUnit);
+                // How much we'll multiply 'this' to get it into 'that' time unit
+                long thisScale = valueDiff(this.timeUnit, that.timeUnit);
+                // 'that' is bigger if it is bigger once put in the same unit as 'this', that is if we'll multiply it
+                // by a bigger value
+                return Long.compare(thisScale, thatScale);
+            }
+            else
+            {
+                // this = 1 B/s and that = 1 MB/ms
+                // That transfers more data in less time, it's definitely faster (bigger)
+                return -1;
+            }
+        }
+        else
+        {
+            if (this.timeUnit.compareTo(that.timeUnit) < 0)
+            {
+                // This transfers more data in less time, it's definitely faster (bigger)
+                return 1;
+            }
+            else
+            {
+                // this = 1 MB/s and that = 1 B/ms
+                // How much we'll multiply 'this' to get it into 'that' size unit
+                long thisScale = valueDiff(that.sizeUnit, this.sizeUnit);
+                // How much we'll multiply 'that' to get it into 'this' time unit
+                long thatScale = valueDiff(that.timeUnit, this.timeUnit);
+                // 'this' is bigger if it is bigger once put in the same unit as 'that', that is if we'll multiply it
+                // by a bigger value
+                return Long.compare(thisScale, thatScale);
+            }
+        }
+    }
+
+    /**
+     * The ratio between 2 size units min and max, where min < max: each step between units is a factor of 1024.
+     */
+    private static long valueDiff(SizeUnit min, SizeUnit max)
+    {
+        return 1L << (10 * (max.ordinal() - min.ordinal())); // 1024^steps, so that B -> MB is 1024 * 1024 and not 2048
+    }
+
+    /**
+     * Product of the {@code Units.TIME_UNIT_SCALE_FCT} scales of every unit from min (inclusive) to max (exclusive).
+     */
+    private static long valueDiff(TimeUnit min, TimeUnit max)
+    {
+        TimeUnit[] all = TimeUnit.values();
+        long val = 1;
+        for (int i = min.ordinal(); i < max.ordinal(); i++)
+            val *= Units.TIME_UNIT_SCALE_FCT.applyAsLong(all[i]); // presumably the factor from all[i] to the next unit - confirm in Units
+        return val;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/units/RateValue.java b/src/java/org/apache/cassandra/utils/units/RateValue.java
new file mode 100644
index 000000000000..102dea76930f
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/RateValue.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import org.apache.cassandra.utils.Comparables;
+
+/**
+ * A {@code RateValue} represents a particular rate in a particular {@link RateUnit}.
+ * <p>
+ * Note that this can only represent non-negative rates.
+ */
+public class RateValue implements Comparable<RateValue>
+{
+    public static final RateValue ZERO = new RateValue(0, RateUnit.B_S);
+
+    public final long value;
+    public final RateUnit unit;
+
+    private RateValue(long value, RateUnit unit)
+    {
+        assert value >= 0 && value != Long.MAX_VALUE;
+        this.value = value;
+        this.unit = unit;
+    }
+
+    /**
+     * Creates a new {@link RateValue} for the provided value in the provided unit.
+     *
+     * @param value the value in {@code unit}, which must be non-negative and strictly less than {@code Long.MAX_VALUE}
+     *              (the latter being used to represent overflows).
+     * @param unit  the unit of {@code value}.
+     * @return a newly created {@link RateValue} for {@code value} in {@code unit}.
+     * @throws IllegalArgumentException if {@code value} is negative or equal to {@code Long.MAX_VALUE}.
+     */
+    public static RateValue of(long value, RateUnit unit)
+    {
+        if (value < 0)
+            throw new IllegalArgumentException("Invalid negative value for a rate: " + value);
+        if (value == Long.MAX_VALUE)
+            throw new IllegalArgumentException("Invalid value for a rate, cannot be Long.MAX_VALUE");
+        return new RateValue(value, unit);
+    }
+
+    /**
+     * Computes the rate corresponding to "processing" {@code size} in {@code duration}.
+     *
+     * @param size     the size processed.
+     * @param duration the duration of the process.
+     * @return the rate corresponding to processing {@code size} in {@code duration}.
+     */
+    public static RateValue compute(SizeValue size, TimeValue duration)
+    {
+        SizeUnit bestSizeUnit = size.smallestRepresentableUnit(); // NOTE(review): the division below truncates, and throws ArithmeticException if duration.value is 0
+        return RateValue.of(size.in(bestSizeUnit) / duration.value, RateUnit.of(bestSizeUnit, duration.unit));
+    }
+
+    /**
+     * Returns the value this represents in the provided unit.
+     *
+     * @param destinationUnit the unit to return the value in.
+     * @return the value this represent in {@code unit}.
+     */
+    public long in(RateUnit destinationUnit)
+    {
+        return destinationUnit.convert(value, unit);
+    }
+
+    public RateValue convert(RateUnit destinationUnit)
+    {
+        return RateValue.of(in(destinationUnit), destinationUnit);
+    }
+
+    /**
+     * Returns the time required to "process" the provided size at this rate.
+     */
+    public TimeValue timeFor(SizeValue size)
+    {
+        // Convert both the rate and size in the smallest unit in which they don't overflow: this will ensure the most
+        // precise return value.
+        RateUnit smallestForRate = smallestRepresentableUnit();
+        SizeUnit smallestForSize = size.smallestRepresentableUnit();
+        // NOTE(review): the divisor below is presumably 0 for a zero rate (e.g. ZERO), i.e. ArithmeticException - confirm
+        SizeUnit toConvert = Comparables.max(smallestForSize, smallestForRate.sizeUnit);
+        return TimeValue.of(size.in(toConvert) / toConvert.convert(value, unit.sizeUnit), unit.timeUnit);
+    }
+
+    private RateUnit smallestRepresentableUnit()
+    {
+        return unit.smallestRepresentableUnit(value);
+    }
+
+    /**
+     * Returns a string representation of this value in the unit it was created with.
+     */
+    public String toRawString()
+    {
+        return unit.toString(value);
+    }
+
+    /**
+     * Returns a Human Readable representation of this value.
+     * <p>
+     * Note that this method may discard precision for the sake of returning a more human readable value. In other
+     * words, this will display the value in a bigger unit than the one it was created with if that improves readability,
+     * even if this implies truncating the value.
+     *
+     * @return a potentially truncated but human readable representation of this value.
+     */
+    @Override
+    public String toString()
+    {
+        return unit.toHumanReadableString(value);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        // Make sure that equals() => same hashCode()
+        return Long.hashCode(in(smallestRepresentableUnit()));
+    }
+
+    /**
+     * Checks the equality of this value with another value.
+     * <p>
+     * Two {@link RateValue} are equal if they represent exactly the same number of bytes in the same number of time.
+     *
+     * @param other the value to check equality with.
+     * @return whether this value and {@code other} represent the same rate.
+     */
+    @Override
+    public boolean equals(Object other)
+    {
+        if (!(other instanceof RateValue))
+            return false;
+
+        RateValue that = (RateValue) other;
+
+        // Convert both value to the most precise unit in which they can both be represented without overflowing and
+        // check we get the same value. If both don't have the same smallest representable unit, they can't be
+        // representing the same number of bytes.
+        RateUnit smallest = this.smallestRepresentableUnit();
+        return smallest.equals(that.smallestRepresentableUnit()) && this.in(smallest) == that.in(smallest);
+    }
+
+    public int compareTo(RateValue that)
+    {
+        // Values can only be compared numerically once expressed in a common unit.
+        RateUnit mine = this.smallestRepresentableUnit();
+        RateUnit theirs = that.smallestRepresentableUnit();
+
+        // Different smallest representable units: the value that overflows "first", i.e. the one whose smallest
+        // representable unit is the bigger of the 2, is the bigger value.
+        if (!mine.equals(theirs))
+            return mine.compareTo(theirs) > 0 ? 1 : -1;
+
+        return Long.compare(this.in(mine), that.in(theirs));
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/units/SizeUnit.java b/src/java/org/apache/cassandra/utils/units/SizeUnit.java
new file mode 100644
index 000000000000..783097050a37
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/SizeUnit.java
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import com.google.common.annotations.VisibleForTesting;
+
+/**
+ * A {@code SizeUnit} represents byte sizes at a given unit of granularity and provides utility methods to convert
+ * across units. A {@code SizeUnit} does not maintain size information (see {@link SizeValue}), but only represents
+ * the unit itself. A kilobyte is defined as 1024 bytes, a megabyte as 1024 kilobytes, etc...
+ */
+public enum SizeUnit
+{
+    BYTES("B")
+    {
+        public long convert(long s, SizeUnit u)
+        {
+            return u.toBytes(s);
+        }
+
+        public long toBytes(long s)
+        {
+            return s;
+        }
+
+        public long toKiloBytes(long s)
+        {
+            return s / (C1 / C0);
+        }
+
+        public long toMegaBytes(long s)
+        {
+            return s / (C2 / C0);
+        }
+
+        public long toGigaBytes(long s)
+        {
+            return s / (C3 / C0);
+        }
+
+        public long toTeraBytes(long s)
+        {
+            return s / (C4 / C0);
+        }
+    },
+    KILOBYTES("kB")
+    {
+        public long convert(long s, SizeUnit u)
+        {
+            return u.toKiloBytes(s);
+        }
+
+        public long toBytes(long s)
+        {
+            return x(s, C1 / C0, MAX / (C1 / C0));
+        }
+
+        public long toKiloBytes(long s)
+        {
+            return s;
+        }
+
+        public long toMegaBytes(long s)
+        {
+            return s / (C2 / C1);
+        }
+
+        public long toGigaBytes(long s)
+        {
+            return s / (C3 / C1);
+        }
+
+        public long toTeraBytes(long s)
+        {
+            return s / (C4 / C1);
+        }
+    },
+    MEGABYTES("MB")
+    {
+        public long convert(long s, SizeUnit u)
+        {
+            return u.toMegaBytes(s);
+        }
+
+        public long toBytes(long s)
+        {
+            return x(s, C2 / C0, MAX / (C2 / C0));
+        }
+
+        public long toKiloBytes(long s)
+        {
+            return x(s, C2 / C1, MAX / (C2 / C1));
+        }
+
+        public long toMegaBytes(long s)
+        {
+            return s;
+        }
+
+        public long toGigaBytes(long s)
+        {
+            return s / (C3 / C2);
+        }
+
+        public long toTeraBytes(long s)
+        {
+            return s / (C4 / C2);
+        }
+    },
+    GIGABYTES("GB")
+    {
+        public long convert(long s, SizeUnit u)
+        {
+            return u.toGigaBytes(s);
+        }
+
+        public long toBytes(long s)
+        {
+            return x(s, C3 / C0, MAX / (C3 / C0));
+        }
+
+        public long toKiloBytes(long s)
+        {
+            return x(s, C3 / C1, MAX / (C3 / C1));
+        }
+
+        public long toMegaBytes(long s)
+        {
+            return x(s, C3 / C2, MAX / (C3 / C2));
+        }
+
+        public long toGigaBytes(long s)
+        {
+            return s;
+        }
+
+        public long toTeraBytes(long s)
+        {
+            return s / (C4 / C3);
+        }
+    },
+    TERABYTES("TB")
+    {
+        public long convert(long s, SizeUnit u)
+        {
+            return u.toTeraBytes(s);
+        }
+
+        public long toBytes(long s)
+        {
+            return x(s, C4 / C0, MAX / (C4 / C0));
+        }
+
+        public long toKiloBytes(long s)
+        {
+            return x(s, C4 / C1, MAX / (C4 / C1));
+        }
+
+        public long toMegaBytes(long s)
+        {
+            return x(s, C4 / C2, MAX / (C4 / C2));
+        }
+
+        public long toGigaBytes(long s)
+        {
+            return x(s, C4 / C3, MAX / (C4 / C3));
+        }
+
+        public long toTeraBytes(long s)
+        {
+            return s;
+        }
+    };
+
+    /**
+     * The string symbol for that unit
+     **/
+    public final String symbol;
+
+    SizeUnit(String symbol)
+    {
+        this.symbol = symbol;
+    }
+
+    // Handy constants for conversion methods (all are visible for testing)
+    static final long C0 = 1L;
+    static final long C1 = C0 * 1024L;
+    static final long C2 = C1 * 1024L;
+    static final long C3 = C2 * 1024L;
+    static final long C4 = C3 * 1024L;
+
+    private static final long MAX = Long.MAX_VALUE;
+
+    /**
+     * Multiplies {@code d} by {@code m}, saturating instead of overflowing: when {@code d} is above {@code over} the
+     * result is {@code Long.MAX_VALUE}, and when it is below {@code -over} the result is {@code Long.MIN_VALUE}.
+     * The name is deliberately short to keep the conversion methods of the constants above readable.
+     */
+    @VisibleForTesting
+    static long x(long d, long m, long over)
+    {
+        boolean fits = d <= over && d >= -over;
+        return fits ? d * m : (d > over ? Long.MAX_VALUE : Long.MIN_VALUE);
+    }
+
+    /**
+     * Convert the given size in the given unit to this unit. Conversions from finer to coarser granularities truncate,
+     * so lose precision. For example converting {@code 1023} bytes to kilobytes results in {@code 0}. Conversions from
+     * coarser to finer granularities with arguments that would numerically overflow saturate to <tt>Long.MIN_VALUE</tt>
+     * if negative or <tt>Long.MAX_VALUE</tt> if positive.
+     * <p>
+     * For example, to convert 10 megabytes to bytes, use: {@code SizeUnit.BYTES.convert(10L, SizeUnit.MEGABYTES)}.
+     *
+     * @param sourceSize the size in the given {@code sourceUnit}.
+     * @param sourceUnit the unit of the {@code sourceSize} argument
+     * @return the converted size in this unit, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     */
+    public abstract long convert(long sourceSize, SizeUnit sourceUnit);
+
+    /**
+     * Equivalent to {@code BYTES.convert(size, this)}.
+     *
+     * @param size the size
+     * @return the converted size, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     * @see #convert
+     */
+    public abstract long toBytes(long size);
+
+    /**
+     * Equivalent to {@code KILOBYTES.convert(size, this)}.
+     *
+     * @param size the size
+     * @return the converted size, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     * @see #convert
+     */
+    public abstract long toKiloBytes(long size);
+
+    /**
+     * Equivalent to {@code MEGABYTES.convert(size, this)}.
+     *
+     * @param size the size
+     * @return the converted size, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     * @see #convert
+     */
+    public abstract long toMegaBytes(long size);
+
+    /**
+     * Equivalent to {@code GIGABYTES.convert(size, this)}.
+     *
+     * @param size the size
+     * @return the converted size, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     * @see #convert
+     */
+    public abstract long toGigaBytes(long size);
+
+    /**
+     * Equivalent to {@code TERABYTES.convert(size, this)}.
+     *
+     * @param size the size
+     * @return the converted size, or {@code Long.MIN_VALUE} if conversion would negatively overflow, or
+     * {@code Long.MAX_VALUE} if it would positively overflow.
+     * @see #convert
+     */
+    public abstract long toTeraBytes(long size);
+
+    /**
+     * Creates a {@link SizeValue} using the provided {@code value} and this unit.
+     *
+     * @param value the value.
+     * @return a new {@link SizeValue} for {@code value} at this unit.
+     */
+    public SizeValue value(long value)
+    {
+        return SizeValue.of(value, this);
+    }
+
+    /**
+     * Returns a Human Readable representation of the provided value in this unit.
+     * <p>
+     * Note that this method may discard precision for the sake of returning a more human readable value. In other
+     * words, if {@code value} is large, it will be converted to a bigger, more readable unit, even if this implies
+     * truncating the value.
+     *
+     * @param value the value in this unit.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    public String toHumanReadableString(long value)
+    {
+        return Units.toString(value, this);
+    }
+
+    /**
+     * Returns a string representation particularly suitable for logging a value of this unit.
+     * <p>
+     * The returned representation combines the value displayed in bytes (for the sake of script parsing the log, so
+     * they don't have to bother with unit conversion), followed by the representation from {@link #toHumanReadableString} for
+     * humans.
+     *
+     * @param value the value in this unit.
+     * @return a string representation suitable for logging the value.
+     */
+    public String toLogString(long value)
+    {
+        return Units.toLogString(value, this);
+    }
+
+    /**
+     * Returns a string representation of a value in this unit.
+     *
+     * @param value the value in this unit.
+     * @return a string representation of {@code value} in this unit.
+     */
+    public String toString(long value)
+    {
+        return Units.formatValue(value) + symbol;
+    }
+
+    /**
+     * Given a value in this unit, returns the smallest (most fine grained) unit in which that value can be represented
+     * without overflowing, that is without the conversion saturating to {@code Long.MAX_VALUE}.
+     *
+     * @param value the value in this unit.
+     * @return the smallest unit, potentially this unit, at which the value can be represented without overflowing. If
+     * {@code value == Long.MAX_VALUE}, then this unit is returned.
+     */
+    SizeUnit smallestRepresentableUnit(long value)
+    {
+        int i = ordinal();
+        while (i > 0 && value <= MAX / C1) // only step down while the next scaling cannot saturate
+        {
+            value = x(value, C1, MAX / C1);
+            i--;
+        }
+        return SizeUnit.values()[i];
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/units/SizeValue.java b/src/java/org/apache/cassandra/utils/units/SizeValue.java
new file mode 100644
index 000000000000..1ec494d7d096
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/SizeValue.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+/**
+ * A {@code SizeValue} represents a particular size in a particular {@link SizeUnit}.
+ * <p>
+ * Note that this can only represent non-negative sizes (zero included).
+ */
+public class SizeValue implements Comparable<SizeValue>
+{
+    public static final SizeValue ZERO = new SizeValue(0, SizeUnit.BYTES);
+
+    public final long value;
+    public final SizeUnit unit;
+
+    private SizeValue(long value, SizeUnit unit)
+    {
+        assert value >= 0 && value != Long.MAX_VALUE;
+        this.value = value;
+        this.unit = unit;
+    }
+
+    /**
+     * Creates a new {@link SizeValue} for the provided value in the provided unit.
+     *
+     * @param value the value in {@code unit}, which must be positive and strictly less than {@code Long.MAX_VALUE}
+     *              (the latter being used to represent overflows).
+     * @param unit  the unit of {@code value}.
+     * @return a newly created {@link SizeValue} for {@code value} in {@code unit}.
+     * @throws IllegalArgumentException if {@code value} is negative or equal to {@code Long.MAX_VALUE}.
+     */
+    public static SizeValue of(long value, SizeUnit unit)
+    {
+        if (value < 0)
+            throw new IllegalArgumentException("Invalid negative value for a size in bytes: " + value);
+        if (value == Long.MAX_VALUE)
+            throw new IllegalArgumentException("Invalid value for a size in bytes, cannot be Long.MAX_VALUE");
+        return new SizeValue(value, unit);
+    }
+
+    /**
+     * Returns the value this represents in the provided unit.
+     *
+     * @param destinationUnit the unit to return the value in.
+     * @return the value this represent in {@code unit}.
+     */
+    public long in(SizeUnit destinationUnit)
+    {
+        return destinationUnit.convert(value, unit);
+    }
+
+    SizeUnit smallestRepresentableUnit()
+    {
+        return unit.smallestRepresentableUnit(value);
+    }
+
+    /**
+     * Returns a string representation of this value in the unit it was created with.
+     */
+    public String toRawString()
+    {
+        return unit.toString(value);
+    }
+
+    /**
+     * Returns a string representation particularly suitable for logging the value.
+     * <p>
+     * The returned representation combines the value displayed in bytes (for the sake of script parsing the log, so
+     * they don't have to bother with unit conversion), followed by the representation from
+     * {@link SizeUnit#toHumanReadableString(long)} for humans.
+     *
+     * @return a string representation suitable for logging the value.
+     */
+    public String toLogString()
+    {
+        return unit.toLogString(value);
+    }
+
+    /**
+     * Returns a Human Readable representation of this value.
+     * <p>
+     * Note that this method may discard precision for the sake of returning a more human readable value. In other
+     * words, this will display the value in a bigger unit than the one it was created with if that improves
+     * readability, even if this implies truncating the value.
+     *
+     * @return a potentially truncated but human readable representation of this value.
+     */
+    @Override
+    public String toString()
+    {
+        return unit.toHumanReadableString(value);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        // Make sure that equals() => same hashCode()
+        return Long.hashCode(in(smallestRepresentableUnit()));
+    }
+
+    /**
+     * Tells whether {@code other} denotes the same size as this value.
+     * <p>
+     * The unit each value was created with is irrelevant: two {@link SizeValue} are equal as soon as they
+     * represent exactly the same number of bytes.
+     *
+     * @param other the object to compare this value to.
+     * @return {@code true} if {@code other} is a {@link SizeValue} representing the same number of bytes.
+     */
+    @Override
+    public boolean equals(Object other)
+    {
+        if (other instanceof SizeValue)
+        {
+            SizeValue candidate = (SizeValue) other;
+            // Values can only match if they share the same smallest representable unit; when they do, compare
+            // them once both are expressed in that unit.
+            SizeUnit finest = smallestRepresentableUnit();
+            return finest == candidate.smallestRepresentableUnit() && in(finest) == candidate.in(finest);
+        }
+        return false;
+    }
+
+    public int compareTo(SizeValue that)
+    {
+        SizeUnit mine = smallestRepresentableUnit();
+        SizeUnit theirs = that.smallestRepresentableUnit();
+
+        // Different smallest representable units: the value that stops being representable "earlier" (i.e. whose
+        // smallest representable unit is the coarser one) is considered the bigger of the two.
+        if (mine != theirs)
+            return mine.compareTo(theirs) > 0 ? 1 : -1;
+
+        // Same unit on both sides: the converted magnitudes can be compared directly.
+        return Long.compare(in(mine), that.in(theirs));
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/units/TimeValue.java b/src/java/org/apache/cassandra/utils/units/TimeValue.java
new file mode 100644
index 000000000000..90885a73271b
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/TimeValue.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * A {@code TimeValue} represents a particular duration in a particular {@link TimeUnit}.
+ */
+public class TimeValue implements Comparable<TimeValue>
+{
+    public static final TimeValue ZERO = new TimeValue(0, TimeUnit.NANOSECONDS);
+
+    final long value;
+    final TimeUnit unit;
+
+    private TimeValue(long value, TimeUnit unit)
+    {
+        this.value = value;
+        this.unit = unit;
+    }
+
+    /**
+     * Creates a new {@link TimeValue} for the provided value in the provided unit.
+     *
+     * @param value the value in {@code unit}.
+     * @param unit  the unit of {@code value}.
+     * @return a newly created {@link TimeValue} for {@code value} in {@code unit}.
+     */
+    public static TimeValue of(long value, TimeUnit unit)
+    {
+        return new TimeValue(value, unit);
+    }
+
+    /**
+     * Returns the value this represents in the provided unit.
+     *
+     * @param destinationUnit the unit to return the value in.
+     * @return the value this represent in {@code unit}.
+     */
+    public long in(TimeUnit destinationUnit)
+    {
+        return destinationUnit.convert(value, unit);
+    }
+
+    static TimeUnit smallestRepresentableUnit(long value, TimeUnit unit)
+    {
+        // Walk towards finer units, but only step into a unit if converting the value to it does not saturate to
+        // Long.MAX_VALUE: the returned unit must be able to hold the value without overflowing. In particular,
+        // a value equal to Long.MAX_VALUE keeps its original unit.
+        TimeUnit[] units = TimeUnit.values();
+        TimeUnit u = unit;
+
+        for (int i = unit.ordinal() - 1; i >= 0 && units[i].convert(value, unit) != Long.MAX_VALUE; i--)
+            u = units[i];
+
+        return u;
+    }
+
+    private TimeUnit smallestRepresentableUnit()
+    {
+        return smallestRepresentableUnit(value, unit);
+    }
+
+    /**
+     * Returns a string representation of this value in the unit it was created with.
+     */
+    public String toRawString()
+    {
+        return Units.formatValue(value) + Units.TIME_UNIT_SYMBOL_FCT.apply(unit);
+    }
+
+    /**
+     * Returns a Human Readable representation of this value.
+     * <p>
+     * Note that this method may discard precision for the sake of returning a more human readable value. In other
+     * words, this will display the value in a bigger unit than the one it was created with if that improves
+     * readability, even if this implies truncating the value.
+     *
+     * @return a potentially truncated but human readable representation of this value.
+     */
+    @Override
+    public String toString()
+    {
+        return Units.toString(value, unit);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        // Make sure that equals() => same hashCode()
+        return Long.hashCode(in(smallestRepresentableUnit()));
+    }
+
+    /**
+     * Checks the equality of this value with another value.
+     * <p>
+     * Two {@link TimeValue} are equal if they represent exactly the same number of nanoseconds.
+     *
+     * @param other the value to check equality with; an object that is not a {@link TimeValue} is never equal.
+     * @return whether this value and {@code other} represent the same number of nanoseconds.
+     */
+    @Override
+    public boolean equals(Object other)
+    {
+        if (!(other instanceof TimeValue))
+            return false;
+
+        TimeValue that = (TimeValue) other;
+
+        // Express both values in the smallest representable unit of this value and check we get the same number.
+        // If the two values don't have the same smallest representable unit, they can't be representing the same
+        // duration.
+        TimeUnit smallest = this.smallestRepresentableUnit();
+        return smallest == that.smallestRepresentableUnit() && this.in(smallest) == that.in(smallest);
+    }
+
+    public int compareTo(TimeValue that)
+    {
+        // To compare, we need to have the same unit.
+        TimeUnit thisSmallest = this.smallestRepresentableUnit();
+        TimeUnit thatSmallest = that.smallestRepresentableUnit();
+
+        if (thisSmallest == thatSmallest)
+            return Long.compare(this.in(thisSmallest), that.in(thatSmallest));
+
+        // If one value overflow "before" (it has a bigger smallest representable unit) the other one, then that value
+        // is bigger.
+        return thisSmallest.compareTo(thatSmallest) > 0 ? 1 : -1;
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/units/Units.java b/src/java/org/apache/cassandra/utils/units/Units.java
new file mode 100644
index 000000000000..482e25555fc7
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/units/Units.java
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.concurrent.TimeUnit;
+import java.util.function.Function;
+import java.util.function.ToLongFunction;
+
+/**
+ * Static methods used to work with units.
+ * <p>
+ * This is mostly useful for {@link TimeUnit}, as for other units the methods provided are more directly accessible in
+ * the unit class itself (we can't modify {@link TimeUnit}), but it contains methods for all units for symmetry.
+ */
+public class Units
+{
+    static final ToLongFunction<TimeUnit> TIME_UNIT_SCALE_FCT = u ->
+    {
+        switch (u)
+        {
+            case NANOSECONDS:
+            case MICROSECONDS:
+            case MILLISECONDS:
+                return 1000L;
+            case SECONDS:
+            case MINUTES:
+                return 60L;
+            case HOURS:
+                return 24L;
+            case DAYS:
+                return 365; // Never actually use but well...
+            default:
+                throw new AssertionError();
+        }
+    };
+    static final Function<TimeUnit, String> TIME_UNIT_SYMBOL_FCT = u ->
+    {
+        switch (u)
+        {
+            case NANOSECONDS:
+                return "ns";
+            case MICROSECONDS:
+                return "us";
+            case MILLISECONDS:
+                return "ms";
+            case SECONDS:
+                return "s";
+            case MINUTES:
+                return "m";
+            case HOURS:
+                return "h";
+            case DAYS:
+                return "d";
+            default:
+                throw new AssertionError();
+        }
+    };
+
+    private static final ToLongFunction<SizeUnit> SIZE_UNIT_SCALE_FCT = u -> SizeUnit.C1;
+    private static final Function<SizeUnit, String> SIZE_UNIT_SYMBOL_FCT = u -> u.symbol;
+
+
+    /**
+     * Returns a Human Readable representation of the provided duration given the unit of said duration.
+     * <p>
+     * This method strives to produce a short and human readable representation and may trade precision for that. In
+     * other words, if the value is large, this will display the value in a bigger unit than the one provided to improve
+     * readability and this even this imply truncating the value.
+     *
+     * @param value the value to build a string of.
+     * @param unit  the unit of {@code value}.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    public static String toString(long value, TimeUnit unit)
+    {
+        return toString(value, unit, TimeUnit.class, TIME_UNIT_SCALE_FCT, TIME_UNIT_SYMBOL_FCT);
+    }
+
+    /**
+     * Returns a Human Readable representation of the provided size given the unit of said size.
+     * <p>
+     * This method strives to produce a short and human readable representation and may trade precision for that. In
+     * other words, if the value is large, this will display the value in a bigger unit than the one provided to improve
+     * readability and this even this imply truncating the value.
+     *
+     * @param value the value to build a string of.
+     * @param unit  the unit of {@code value}.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    public static String toString(long value, SizeUnit unit)
+    {
+        return toString(value, unit, SizeUnit.class, SIZE_UNIT_SCALE_FCT, SIZE_UNIT_SYMBOL_FCT);
+    }
+
+    /**
+     * Returns a string representation for a size value (in a particular unit) that is suitable for logging the value.
+     * <p>
+     * The returned representation combines the value displayed in bytes (for the sake of script parsing the log, so
+     * they don't have to bother with unit conversion), followed by the representation from {@link #toString} for
+     * humans.
+     *
+     * @param value a size in {@code unit}.
+     * @param unit  the unit for {@code value}.
+     * @return a string representation suitable for logging the value.
+     */
+    public static String toLogString(long value, SizeUnit unit)
+    {
+        return String.format("%s (%s)", SizeUnit.BYTES.toString(unit.toBytes(value)), toString(value, unit));
+    }
+
+    /**
+     * Returns a Human Readable representation of the provided rate given the unit of said rate.
+     * <p>
+     * This method strives to produce a short and human readable representation and may trade precision for that. In
+     * other words, if the value is large, this will display the value in a bigger size unit than the one provided to
+     * improve readability, even if this implies truncating the value. The rate is always displayed "per second".
+     *
+     * @param value the value to build a string of.
+     * @param unit  the unit of {@code value}.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    public static String toString(long value, RateUnit unit)
+    {
+        // There are theoretically multiple options for any given (large) value since we can play on both the size
+        // and time unit. In practice it's much more common to reason with rates 'per second', so we convert the value
+        // to seconds and play only on the size unit; the symbol must then use seconds too, not the original time unit.
+        value = RateUnit.of(unit.sizeUnit, TimeUnit.SECONDS).convert(value, unit);
+        return toString(value, unit.sizeUnit, SizeUnit.class, SIZE_UNIT_SCALE_FCT, u -> RateUnit.toString(u, TimeUnit.SECONDS));
+    }
+
+    /**
+     * Formats a value in a human readable way, adding a comma (',') to separate every group of three digits.
+     * <p>
+     * For instance, {@code formatValue(4693234L) == "4,693,234"}
+     *
+     * @param value the value to format.
+     * @return a more human readable representation of {@code value}.
+     */
+    static String formatValue(long value)
+    {
+        return String.format("%,d", value);
+    }
+
+    /**
+     * The number of comma to use to format {@code digits} digit using ',' on every thousands.
+     */
+    private static int commaCount(int digits)
+    {
+        return (digits - 1) / 3;
+    }
+
+    /**
+     * Returns a Human Readable representation of the provided value given its unit.
+     * <p>
+     * This method strives to produce a short and human readable representation and may trade precision for that. In
+     * other words, if the value is large, this will display the value in a bigger unit than the one provided to
+     * improve readability, even if this implies truncating the value.
+     *
+     * @param value     the value to build a string of.
+     * @param unit      the unit of {@code value}; callers in this class pass a {@link TimeUnit} or a {@link SizeUnit}.
+     * @param klass     the enum class of {@code unit}, used to list the units in declaration order.
+     * @param scaleFct  a function returning, for a unit, the factor needed to go from that unit to the next one.
+     * @param symbolFct a function returning the symbol to display for a unit.
+     * @param <E>       the unit enum type.
+     * @return a potentially truncated but human readable representation of {@code value}.
+     */
+    private static <E extends Enum<E>> String toString(long value,
+                                                       E unit,
+                                                       Class<E> klass,
+                                                       ToLongFunction<E> scaleFct,
+                                                       Function<E, String> symbolFct)
+    {
+        E[] enumVals = klass.getEnumConstants();
+
+        long v = value;
+        int i = unit.ordinal();
+        long remainder = 0;
+        // The scale is the factor needed to go from the current unit to the next one
+        long scale = scaleFct.applyAsLong(unit);
+
+        while (i < enumVals.length - 1 && v >= scale)
+        {
+            remainder = v % scale;
+            v = v / scale;
+            unit = enumVals[++i];
+            scale = scaleFct.applyAsLong(unit);
+        }
+
+        // If the value is small (<10), include one decimal so the precision is not too truncated. Otherwise, don't
+        // bother, it's less relevant.
+        if (v >= 10 || remainder == 0)
+            return fmt(v, unit, symbolFct);
+
+        // Note that scale is the scale of the current unit, but remainder relates to the previous unit. Also note that
+        // we can only get here if remainder != 0, so the loop ran at least once and the previous unit exists.
+        long prevScale = scaleFct.applyAsLong(enumVals[i - 1]);
+        int decimal = Math.round(((float) remainder / prevScale) * 10);
+        if (decimal == 0)
+            return fmt(v, unit, symbolFct);
+
+        // If the remainder is at least 0.95 of the previous scale, decimal rounds to 10: just bump the value by 1
+        if (decimal == 10)
+            return fmt(v + 1, unit, symbolFct);
+
+        return formatValue(v) + '.' + decimal + symbolFct.apply(unit);
+    }
+
+    private static <E extends Enum<E>> String fmt(long value, E unit, Function<E, String> symbolFct)
+    {
+        return formatValue(value) + symbolFct.apply(unit);
+    }
+}
diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
index e31ce2c63517..32ca5a5636b8 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
@@ -47,6 +47,7 @@
 import org.apache.cassandra.transport.ClientStat;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -94,7 +95,7 @@ private SimpleQueryResult executeInternal(String query, ConsistencyLevel consist
         for (Object boundValue : boundValues)
             boundBBValues.add(ByteBufferUtil.objectToBytes(boundValue));
 
-        prepared.validate(QueryState.forInternalCalls().getClientState());
+        prepared.validate(QueryState.forInternalCalls());
 
         // Start capturing warnings on this thread. Note that this will implicitly clear out any previous 
         // warnings as it sets a new State instance on the ThreadLocal.
@@ -135,20 +136,19 @@ public QueryResult executeWithPagingWithResult(String query, ConsistencyLevel co
             throw new IllegalArgumentException("Page size should be strictly positive but was " + pageSize);
 
         return instance.sync(() -> {
-            ClientState clientState = makeFakeClientState();
+            QueryState state = new QueryState(makeFakeClientState());
             ConsistencyLevel consistencyLevel = ConsistencyLevel.valueOf(consistencyLevelOrigin.name());
-            CQLStatement prepared = QueryProcessor.getStatement(query, clientState);
+            CQLStatement prepared = QueryProcessor.getStatement(query, state.getClientState());
             final List<ByteBuffer> boundBBValues = new ArrayList<>();
             for (Object boundValue : boundValues)
                 boundBBValues.add(ByteBufferUtil.objectToBytes(boundValue));
 
-            prepared.validate(clientState);
+            prepared.validate(state);
             assert prepared instanceof SelectStatement : "Only SELECT statements can be executed with paging";
 
             long nanoTime = System.nanoTime();
             SelectStatement selectStatement = (SelectStatement) prepared;
 
-            QueryState queryState = new QueryState(clientState);
             QueryOptions initialOptions = QueryOptions.create(toCassandraCL(consistencyLevel),
                                                               boundBBValues,
                                                               false,
@@ -159,9 +159,9 @@ public QueryResult executeWithPagingWithResult(String query, ConsistencyLevel co
                                                               selectStatement.keyspace());
 
 
-            ResultMessage.Rows initialRows = selectStatement.execute(queryState, initialOptions, nanoTime);
+            ResultMessage.Rows initialRows = selectStatement.execute(state, initialOptions, nanoTime);
             Iterator<Object[]> iter = new Iterator<Object[]>() {
-                ResultMessage.Rows rows = selectStatement.execute(queryState, initialOptions, nanoTime);
+                ResultMessage.Rows rows = selectStatement.execute(state, initialOptions, nanoTime);
                 Iterator<Object[]> iter = RowUtil.toIter(rows);
 
                 public boolean hasNext()
@@ -181,7 +181,7 @@ public boolean hasNext()
                                                                    ProtocolVersion.CURRENT,
                                                                    selectStatement.keyspace());
 
-                    rows = selectStatement.execute(queryState, nextOptions, nanoTime);
+                    rows = selectStatement.execute(state, nextOptions, nanoTime);
                     iter = Iterators.forArray(RowUtil.toObjects(initialRows.result.metadata.names, rows.result.rows));
 
                     return hasNext();
diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
index 95767965efe1..37c79ea71014 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java
@@ -247,7 +247,7 @@ public void schemaChangeInternal(String query)
                 QueryState queryState = new QueryState(state);
 
                 CQLStatement statement = QueryProcessor.parseStatement(query, queryState.getClientState());
-                statement.validate(state);
+                statement.validate(queryState);
 
                 QueryOptions options = QueryOptions.forInternalCalls(Collections.emptyList());
                 statement.executeLocally(queryState, options);
diff --git a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
index d3af45072043..ec3271139c84 100644
--- a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
+++ b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
@@ -486,7 +486,7 @@ public void setup()
                         if (!overlap || f == 0)
                         {
                             QueryOptions options = QueryProcessor.makeInternalOptions(select, new Object[]{f});
-                            ReadQuery query = select.getQuery(options, queryState.getNowInSeconds());
+                            ReadQuery query = select.getQuery(queryState, options, queryState.getNowInSeconds());
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
@@ -601,7 +601,7 @@ public void setup()
                         if (!overlap || f == 0)
                         {
                             QueryOptions options = QueryProcessor.makeInternalOptions(select, new Object[]{key});
-                            ReadQuery query = select.getQuery(options, queryState.getNowInSeconds());
+                            ReadQuery query = select.getQuery(queryState, options, queryState.getNowInSeconds());
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
@@ -701,7 +701,7 @@ public void setup()
                         if (!overlap || f == 0)
                         {
                             QueryOptions options = QueryProcessor.makeInternalOptions(select, new Object[]{key});
-                            ReadQuery query = select.getQuery(options, queryState.getNowInSeconds());
+                            ReadQuery query = select.getQuery(queryState, options, queryState.getNowInSeconds());
                             reads.add(() -> runQuery(query, cfs.metadata.get()));
                         }
                     }
diff --git a/test/microbench/org/apache/cassandra/test/microbench/BatchStatementBench.java b/test/microbench/org/apache/cassandra/test/microbench/BatchStatementBench.java
index b79f154682e7..70490d16da04 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/BatchStatementBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/BatchStatementBench.java
@@ -124,7 +124,7 @@ public void setup() throws Throwable
     @Benchmark
     public void bench()
     {
-        bs.getMutations(bqo, false, nowInSec, nowInSec, queryStartTime);
+        bs.getMutations(QueryState.forInternalCalls(), bqo, false, nowInSec, nowInSec, queryStartTime);
     }
 
 
diff --git a/test/unit/org/apache/cassandra/SchemaLoader.java b/test/unit/org/apache/cassandra/SchemaLoader.java
index 1f46562822d1..ac06d730dd26 100644
--- a/test/unit/org/apache/cassandra/SchemaLoader.java
+++ b/test/unit/org/apache/cassandra/SchemaLoader.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.config.*;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
+import org.apache.cassandra.cql3.statements.schema.CreateTypeStatement;
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.db.RowUpdateBuilder;
 import org.apache.cassandra.db.marshal.*;
@@ -748,6 +749,37 @@ public static void cleanupSavedCaches()
         ServerTestUtils.cleanupSavedCaches();
     }
 
+    /**
+     * Creates a table from its CQL definition, first creating its keyspace (simple strategy, 1 replica) and any
+     * given UDTs if needed. This method does not complain if any of the created entities already exists.
+     *
+     * @param keyspace  the keyspace holding the table
+     * @param schemaCQL the CQL definition of the table to create
+     * @param typesCQL  the CQL definitions of the UDTs to create before the table, processed in the given order
+     */
+    public static void load(String keyspace, String schemaCQL, String... typesCQL)
+    {
+        KeyspaceMetadata ksm = KeyspaceMetadata.create(keyspace,
+                                                       KeyspaceParams.simple(1),
+                                                       Tables.none(),
+                                                       Views.none(),
+                                                       Types.none(),
+                                                       Functions.none());
+        MigrationManager.announce(SchemaTransformations.addKeyspace(ksm, true), true);
+
+        for (String typeCQL : typesCQL)
+        {
+            Types types = Schema.instance.getKeyspaceMetadata(keyspace).types;
+            SchemaTransformation t = SchemaTransformations.addOrUpdateType(CreateTypeStatement.parse(typeCQL,
+                                                                                                     keyspace, types));
+            MigrationManager.announce(t, true);
+        }
+
+        Types types = Schema.instance.getKeyspaceMetadata(keyspace).types;
+        TableMetadata metadata = CreateTableStatement.parse(schemaCQL, keyspace, types).build();
+        MigrationManager.announce(SchemaTransformations.addTable(metadata, true), true);
+    }
+
     private static CompressionParams compressionParams(int chunkLength)
     {
         String algo = System.getProperty("cassandra.test.compression.algo", "lz4").toLowerCase();
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
index c03bb09a2425..a33b4375de9f 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
@@ -122,9 +122,11 @@ public class DatabaseDescriptorRefTest
     "org.apache.cassandra.exceptions.ConfigurationException",
     "org.apache.cassandra.exceptions.RequestValidationException",
     "org.apache.cassandra.exceptions.CassandraException",
+    "org.apache.cassandra.exceptions.SyntaxException",
     "org.apache.cassandra.exceptions.TransportException",
     "org.apache.cassandra.fql.FullQueryLogger",
     "org.apache.cassandra.fql.FullQueryLoggerOptions",
+    "org.apache.cassandra.guardrails.GuardrailsConfig",
     "org.apache.cassandra.locator.IEndpointSnitch",
     "org.apache.cassandra.io.FSWriteError",
     "org.apache.cassandra.io.FSError",
@@ -166,8 +168,11 @@ public class DatabaseDescriptorRefTest
     "org.apache.cassandra.ConsoleAppenderBeanInfo",
     "org.apache.cassandra.ConsoleAppenderCustomizer",
     "org.apache.cassandra.locator.InetAddressAndPort",
+    "org.apache.cassandra.cql3.statements.PropertyDefinitions",
     "org.apache.cassandra.cql3.statements.schema.AlterKeyspaceStatement",
-    "org.apache.cassandra.cql3.statements.schema.CreateKeyspaceStatement"
+    "org.apache.cassandra.cql3.statements.schema.CreateKeyspaceStatement",
+    "org.apache.cassandra.cql3.statements.schema.TableAttributes",
+    "org.apache.cassandra.schema.TableParams$Option"
     };
 
     static final Set<String> checkedClasses = new HashSet<>(Arrays.asList(validClasses));
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 9bd9be5feb83..26fab6c4acea 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -29,6 +29,9 @@
 import java.rmi.server.RMISocketFactory;
 import java.util.*;
 import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Consumer;
@@ -57,7 +60,6 @@
 import com.datastax.driver.core.*;
 import com.datastax.driver.core.DataType;
 import com.datastax.driver.core.ResultSet;
-
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.concurrent.ScheduledExecutors;
@@ -70,18 +72,34 @@
 import org.apache.cassandra.locator.TokenMetadata;
 import org.apache.cassandra.metrics.ClientMetrics;
 import org.apache.cassandra.schema.*;
+import org.apache.cassandra.audit.AuditLogManager;
+import org.apache.cassandra.auth.CassandraAuthorizer;
+import org.apache.cassandra.auth.CassandraRoleManager;
+import org.apache.cassandra.auth.PasswordAuthenticator;
+import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.EncryptionOptions;
 import org.apache.cassandra.cql3.functions.FunctionName;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.db.marshal.TupleType;
+import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
+import org.apache.cassandra.db.virtual.VirtualSchemaKeyspace;
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.SyntaxException;
+import org.apache.cassandra.index.SecondaryIndexManager;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.locator.AbstractEndpointSnitch;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.metrics.ClientMetrics;
+import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.security.ThreadAwareSecurityManager;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.serializers.TypeSerializer;
 import org.apache.cassandra.service.ClientState;
@@ -94,6 +112,7 @@
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JMXServerUtils;
 import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.Pair;
 
 import static com.datastax.driver.core.SocketOptions.DEFAULT_CONNECT_TIMEOUT_MILLIS;
 import static com.datastax.driver.core.SocketOptions.DEFAULT_READ_TIMEOUT_MILLIS;
@@ -116,6 +135,7 @@ public abstract class CQLTester
     public static final String DATA_CENTER = ServerTestUtils.DATA_CENTER;
     public static final String DATA_CENTER_REMOTE = ServerTestUtils.DATA_CENTER_REMOTE;
     public static final String RACK1 = ServerTestUtils.RACK1;
+    private static final User SUPER_USER = new User("cassandra", "cassandra");
 
     private static org.apache.cassandra.transport.Server server;
     private static JMXConnectorServer jmxServer;
@@ -128,8 +148,13 @@ public abstract class CQLTester
     protected static final int nativePort;
     protected static final InetAddress nativeAddr;
     protected static final Set<InetAddressAndPort> remoteAddrs = new HashSet<>();
-    private static final Map<ProtocolVersion, Cluster> clusters = new HashMap<>();
-    protected static final Map<ProtocolVersion, Session> sessions = new HashMap<>();
+    private static final Map<Pair<User, ProtocolVersion>, Cluster> clusters = new HashMap<>();
+    protected static final Map<Pair<User, ProtocolVersion>, Session> sessions = new HashMap<>();
+
+    // needed in GuardrailsOnTableTest to check whether dropping schema tasks have been finished
+    protected static final ThreadPoolExecutor schemaCleanup =
+    new ThreadPoolExecutor(1, 1, 0L, TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>());
+
 
     public static final List<ProtocolVersion> PROTOCOL_VERSIONS = new ArrayList<>(ProtocolVersion.SUPPORTED.size());
 
@@ -165,9 +190,11 @@ public static final ProtocolVersion getDefaultVersion()
 
     private List<String> keyspaces = new ArrayList<>();
     private List<String> tables = new ArrayList<>();
+    private List<String> views = new ArrayList<>();
     private List<String> types = new ArrayList<>();
     private List<String> functions = new ArrayList<>();
     private List<String> aggregates = new ArrayList<>();
+    private User user;
 
     // We don't use USE_PREPARED_VALUES in the code below so some test can foce value preparation (if the result
     // is not expected to be the same without preparation)
@@ -333,58 +360,61 @@ public void afterTest() throws Throwable
 
         final List<String> keyspacesToDrop = copy(keyspaces);
         final List<String> tablesToDrop = copy(tables);
+        final List<String> viewsToDrop = copy(views);
         final List<String> typesToDrop = copy(types);
         final List<String> functionsToDrop = copy(functions);
         final List<String> aggregatesToDrop = copy(aggregates);
         keyspaces = null;
         tables = null;
+        views = null;
         types = null;
         functions = null;
         aggregates = null;
+        user = null;
 
         // We want to clean up after the test, but dropping a table is rather long so just do that asynchronously
-        ScheduledExecutors.optionalTasks.execute(new Runnable()
-        {
-            public void run()
+        schemaCleanup.execute(() -> {
+            try
             {
-                try
-                {
-                    for (int i = tablesToDrop.size() - 1; i >= 0; i--)
-                        schemaChange(String.format("DROP TABLE IF EXISTS %s.%s", KEYSPACE, tablesToDrop.get(i)));
+                logger.debug("Dropping {} materialized view created in previous test", viewsToDrop.size());
+                for (int i = viewsToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP MATERIALIZED VIEW IF EXISTS %s.%s", KEYSPACE, viewsToDrop.get(i)));
 
-                    for (int i = aggregatesToDrop.size() - 1; i >= 0; i--)
-                        schemaChange(String.format("DROP AGGREGATE IF EXISTS %s", aggregatesToDrop.get(i)));
+                for (int i = tablesToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP TABLE IF EXISTS %s.%s", KEYSPACE, tablesToDrop.get(i)));
 
-                    for (int i = functionsToDrop.size() - 1; i >= 0; i--)
-                        schemaChange(String.format("DROP FUNCTION IF EXISTS %s", functionsToDrop.get(i)));
+                for (int i = aggregatesToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP AGGREGATE IF EXISTS %s", aggregatesToDrop.get(i)));
 
-                    for (int i = typesToDrop.size() - 1; i >= 0; i--)
-                        schemaChange(String.format("DROP TYPE IF EXISTS %s.%s", KEYSPACE, typesToDrop.get(i)));
+                for (int i = functionsToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP FUNCTION IF EXISTS %s", functionsToDrop.get(i)));
 
-                    for (int i = keyspacesToDrop.size() - 1; i >= 0; i--)
-                        schemaChange(String.format("DROP KEYSPACE IF EXISTS %s", keyspacesToDrop.get(i)));
+                for (int i = typesToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP TYPE IF EXISTS %s.%s", KEYSPACE, typesToDrop.get(i)));
 
-                    // Dropping doesn't delete the sstables. It's not a huge deal but it's cleaner to cleanup after us
-                    // Thas said, we shouldn't delete blindly before the TransactionLogs.SSTableTidier for the table we drop
-                    // have run or they will be unhappy. Since those taks are scheduled on StorageService.tasks and that's
-                    // mono-threaded, just push a task on the queue to find when it's empty. No perfect but good enough.
+                for (int i = keyspacesToDrop.size() - 1; i >= 0; i--)
+                    schemaChange(String.format("DROP KEYSPACE IF EXISTS %s", keyspacesToDrop.get(i)));
 
-                    final CountDownLatch latch = new CountDownLatch(1);
-                    ScheduledExecutors.nonPeriodicTasks.execute(new Runnable()
-                    {
-                        public void run()
-                        {
-                            latch.countDown();
-                        }
-                    });
-                    latch.await(2, TimeUnit.SECONDS);
-
-                    removeAllSSTables(KEYSPACE, tablesToDrop);
-                }
-                catch (Exception e)
+                // Dropping doesn't delete the sstables. It's not a huge deal but it's cleaner to clean up after us.
+                // That said, we shouldn't delete blindly before the TransactionLogs.SSTableTidier for the table we drop
+                // have run or they will be unhappy. Since those tasks are scheduled on StorageService.tasks and that's
+                // mono-threaded, just push a task on the queue to find when it's empty. Not perfect but good enough.
+
+                final CountDownLatch latch = new CountDownLatch(1);
+                ScheduledExecutors.nonPeriodicTasks.execute(new Runnable()
                 {
-                    throw new RuntimeException(e);
-                }
+                    public void run()
+                    {
+                        latch.countDown();
+                    }
+                });
+                latch.await(2, TimeUnit.SECONDS);
+
+                removeAllSSTables(KEYSPACE, tablesToDrop);
+            }
+            catch (Exception e)
+            {
+                throw new RuntimeException(e);
             }
         });
     }
@@ -425,6 +455,15 @@ public static List<String> buildCassandraStressArgs(List<String> args)
         return allArgs;
     }
 
+    protected static void requireAuthentication()
+    {
+        System.setProperty("cassandra.superuser_setup_delay_ms", "-1");
+
+        DatabaseDescriptor.setAuthenticator(new PasswordAuthenticator());
+        DatabaseDescriptor.setRoleManager(new CassandraRoleManager());
+        DatabaseDescriptor.setAuthorizer(new CassandraAuthorizer());
+    }
+
     // lazy initialization for all tests that require Java Driver
     protected static void requireNetwork() throws ConfigurationException
     {
@@ -475,39 +514,6 @@ private static void initializeNetwork(Consumer<Server.Builder> decorator, Consum
         server = serverBuilder.build();
         ClientMetrics.instance.init(Collections.singleton(server));
         server.start();
-
-        for (ProtocolVersion version : PROTOCOL_VERSIONS)
-        {
-            if (clusters.containsKey(version))
-                continue;
-
-            SocketOptions socketOptions = new SocketOptions()
-                                          .setConnectTimeoutMillis(Integer.getInteger("cassandra.test.driver.connection_timeout_ms", DEFAULT_CONNECT_TIMEOUT_MILLIS)) // default is 5000
-                                          .setReadTimeoutMillis(Integer.getInteger("cassandra.test.driver.read_timeout_ms", DEFAULT_READ_TIMEOUT_MILLIS)); // default is 12000
-
-            logger.info("Timeouts: {} / {}", socketOptions.getConnectTimeoutMillis(), socketOptions.getReadTimeoutMillis());
-
-            Cluster.Builder builder = Cluster.builder()
-                                             .withoutJMXReporting()
-                                             .addContactPoints(nativeAddr)
-                                             .withClusterName("Test Cluster")
-                                             .withPort(nativePort)
-                                             .withSocketOptions(socketOptions);
-
-            if (clusterConfigurator != null)
-                clusterConfigurator.accept(builder);
-
-            if (version.isBeta())
-                builder = builder.allowBetaProtocolVersion();
-            else
-                builder = builder.withProtocolVersion(com.datastax.driver.core.ProtocolVersion.fromInt(version.asInt()));
-
-            Cluster cluster = builder.build();
-            clusters.put(version, cluster);
-            sessions.put(version, cluster.connect());
-
-            logger.info("Started Java Driver instance for protocol version {}", version);
-        }
     }
 
     protected void dropPerTestKeyspace() throws Throwable
@@ -586,6 +592,13 @@ public void compact(String keyspace, String table)
             store.forceMajorCompaction();
     }
 
+    public void compact(String keyspace)
+    {
+        ColumnFamilyStore store = getCurrentColumnFamilyStore(keyspace);
+        if (store != null)
+            store.forceMajorCompaction();
+    }
+
     public void disableCompaction()
     {
         disableCompaction(KEYSPACE);
@@ -655,6 +668,13 @@ protected String currentTable()
         return tables.get(tables.size() - 1);
     }
 
+    protected String currentView()
+    {
+        if (views.isEmpty())
+            return null;
+        return views.get(views.size() - 1);
+    }
+
     protected String currentKeyspace()
     {
         if (keyspaces.isEmpty())
@@ -684,14 +704,20 @@ public static void disablePreparedReuseForTest()
 
     protected String createType(String query)
     {
-        String typeName = String.format("type_%02d", seqNumber.getAndIncrement());
+        String typeName = createTypeName();
         String fullQuery = String.format(query, KEYSPACE + "." + typeName);
-        types.add(typeName);
         logger.info(fullQuery);
         schemaChange(fullQuery);
         return typeName;
     }
 
+    protected String createTypeName()
+    {
+        String typeName = String.format("type_%02d", seqNumber.getAndIncrement());
+        types.add(typeName);
+        return typeName;
+    }
+
     protected String createFunctionName(String keyspace)
     {
         return String.format("%s.function_%02d", keyspace, seqNumber.getAndIncrement());
@@ -777,6 +803,13 @@ public String createTable(String query)
         return createTable(KEYSPACE, query);
     }
 
+    protected String createViewName()
+    {
+        String currentView = "view_" + seqNumber.getAndIncrement();
+        views.add(currentView);
+        return currentView;
+    }
+
     protected String createTable(String keyspace, String query)
     {
         String currentTable = createTableName();
@@ -820,6 +853,12 @@ protected void dropTable(String query)
         dropFormattedTable(String.format(query, KEYSPACE + "." + currentTable()));
     }
 
+    public void dropView(String view)
+    {
+        dropFormattedTable(String.format("DROP MATERIALIZED VIEW IF EXISTS %s.%s", KEYSPACE, view));
+        views.remove(view);
+    }
+
     protected void dropFormattedTable(String formattedQuery)
     {
         logger.info(formattedQuery);
@@ -986,7 +1025,7 @@ protected static ResultMessage schemaChange(String query)
             QueryState queryState = new QueryState(state);
 
             CQLStatement statement = QueryProcessor.parseStatement(query, queryState.getClientState());
-            statement.validate(state);
+            statement.validate(queryState);
 
             QueryOptions options = QueryOptions.forInternalCalls(Collections.<ByteBuffer>emptyList());
 
@@ -1014,6 +1053,11 @@ protected com.datastax.driver.core.ResultSet executeNet(String query, Object...
         return sessionNet().execute(formatQuery(query), values);
     }
 
+    protected com.datastax.driver.core.ResultSet executeNet(Statement statement)
+    {
+        return executeNet(getDefaultVersion(), statement);
+    }
+
     protected com.datastax.driver.core.ResultSet executeNet(ProtocolVersion protocolVersion, Statement statement)
     {
         return sessionNet(protocolVersion).execute(statement);
@@ -1029,6 +1073,29 @@ protected com.datastax.driver.core.ResultSet executeNetWithPaging(String query,
         return sessionNet().execute(new SimpleStatement(formatQuery(query)).setFetchSize(pageSize));
     }
 
+    /**
+     * Use the specified user for executing the queries over the network.
+     * @param username the user name
+     * @param password the user password
+     */
+    public void useUser(String username, String password)
+    {
+        this.user = new User(username, password);
+    }
+
+    /**
+     * Use the super user for executing the queries over the network.
+     */
+    public void useSuperUser()
+    {
+        this.user = SUPER_USER;
+    }
+
+    public boolean isSuperUser()
+    {
+        return SUPER_USER.equals(user);
+    }
+
     public Session sessionNet()
     {
         return sessionNet(getDefaultVersion());
@@ -1038,7 +1105,55 @@ protected Session sessionNet(ProtocolVersion protocolVersion)
     {
         requireNetwork();
 
-        return sessions.get(protocolVersion);
+        return getSession(protocolVersion);
+    }
+
+    protected Session getSession(ProtocolVersion protocolVersion)
+    {
+        Cluster cluster = getCluster(protocolVersion);
+        return sessions.computeIfAbsent(Pair.create(user, protocolVersion), userProto -> cluster.connect());
+    }
+
+    private Cluster getCluster(ProtocolVersion protocolVersion)
+    {
+        return clusters.computeIfAbsent(Pair.create(user, protocolVersion),
+                                        userProto -> initClientCluster(user, protocolVersion));
+    }
+
+    private static Cluster initClientCluster(User user, ProtocolVersion version)
+    {
+        Pair<User, ProtocolVersion> key = Pair.create(user, version);
+        Cluster cluster = clusters.get(key);
+        if (cluster != null)
+            return cluster;
+
+        Cluster.Builder builder = clusterBuilder(version);
+        if (user != null)
+            builder.withCredentials(user.username, user.password);
+        cluster = builder.build();
+
+        logger.info("Started Java Driver instance for protocol version {}", version);
+
+        return cluster;
+    }
+
+    public static Cluster.Builder clusterBuilder(ProtocolVersion version)
+    {
+        Cluster.Builder builder = clusterBuilder();
+        if (version.isBeta())
+            builder = builder.allowBetaProtocolVersion();
+        else
+            builder = builder.withProtocolVersion(com.datastax.driver.core.ProtocolVersion.fromInt(version.asInt()));
+        return builder;
+    }
+
+    public static Cluster.Builder clusterBuilder()
+    {
+        return Cluster.builder()
+                      .addContactPoints(nativeAddr)
+                      .withPort(nativePort)
+                      .withClusterName("Test Cluster")
+                      .withoutJMXReporting();
     }
 
     protected SimpleClient newSimpleClient(ProtocolVersion version) throws IOException
@@ -1140,9 +1255,9 @@ protected void assertRowsNet(ProtocolVersion protocolVersion, ResultSet result,
             for (int j = 0; j < meta.size(); j++)
             {
                 DataType type = meta.getType(j);
-                com.datastax.driver.core.TypeCodec<Object> codec = clusters.get(protocolVersion).getConfiguration()
-                                                                                                .getCodecRegistry()
-                                                                                                .codecFor(type);
+                com.datastax.driver.core.TypeCodec<Object> codec = getCluster(protocolVersion).getConfiguration()
+                                                                                              .getCodecRegistry()
+                                                                                              .codecFor(type);
                 ByteBuffer expectedByteValue = codec.serialize(expected[j], com.datastax.driver.core.ProtocolVersion.fromInt(protocolVersion.asInt()));
                 int expectedBytes = expectedByteValue == null ? -1 : expectedByteValue.remaining();
                 ByteBuffer actualValue = actual.getBytesUnsafe(meta.getName(j));
@@ -1843,7 +1958,7 @@ protected Object map(Object...values)
     protected com.datastax.driver.core.TupleType tupleTypeOf(ProtocolVersion protocolVersion, com.datastax.driver.core.DataType...types)
     {
         requireNetwork();
-        return clusters.get(protocolVersion).getMetadata().newTupleType(types);
+        return getCluster(protocolVersion).getMetadata().newTupleType(types);
     }
 
     // Attempt to find an AbstracType from a value (for serialization/printing sake).
@@ -2116,4 +2231,44 @@ protected void failed(Throwable e, Description description)
                 random.printSeedOnFailure();
         }
     }
+
+    private static class User // (username, password) pair with value-based equals/hashCode
+    {
+        /**
+         * The user name; part of this user's identity in {@link #equals(Object)} and {@link #hashCode()}.
+         */
+        public final String username;
+
+        /**
+         * The user password; also part of the identity, so the same name with another password is a different user.
+         */
+        public final String password;
+
+        public User(String username, String password)
+        {
+            this.username = username;
+            this.password = password;
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hashCode(username, password); // hashes the same two fields that equals compares
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (!(o instanceof User)) // also rejects null
+                return false;
+
+            User u = (User) o;
+
+            return Objects.equal(username, u.username)
+                   && Objects.equal(password, u.password);
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/ListsTest.java b/test/unit/org/apache/cassandra/cql3/ListsTest.java
index 92dcd96f35ae..5779b0817c86 100644
--- a/test/unit/org/apache/cassandra/cql3/ListsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ListsTest.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
@@ -142,7 +143,7 @@ private void testPrepender_execute(List<ByteBuffer> terms)
         ByteBuffer keyBuf = ByteBufferUtil.bytes("key");
         DecoratedKey key = Murmur3Partitioner.instance.decorateKey(keyBuf);
         UpdateParameters parameters =
-            new UpdateParameters(metaData, null, QueryOptions.DEFAULT, System.currentTimeMillis(), FBUtilities.nowInSeconds(), 1000, Collections.emptyMap());
+            new UpdateParameters(metaData, null, QueryState.forInternalCalls(), QueryOptions.DEFAULT, System.currentTimeMillis(), FBUtilities.nowInSeconds(), 1000, Collections.emptyMap());
         Clustering<?> clustering = Clustering.make(ByteBufferUtil.bytes(1));
         parameters.newRow(clustering);
         prepender.execute(key, parameters);
diff --git a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
index ef705bdf294d..b98ddf5e46d3 100644
--- a/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/PreparedStatementsTest.java
@@ -58,7 +58,7 @@ public void setup()
     @Test
     public void testInvalidatePreparedStatementsOnDrop()
     {
-        Session session = sessions.get(ProtocolVersion.V5);
+        Session session = getSession(ProtocolVersion.V5);
         session.execute(dropKsStatement);
         session.execute(createKsStatement);
 
@@ -102,7 +102,7 @@ public void testInvalidatePreparedStatementOnAlterV4()
 
     private void testInvalidatePreparedStatementOnAlter(ProtocolVersion version, boolean supportsMetadataChange)
     {
-        Session session = sessions.get(version);
+        Session session = getSession(version);
         String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int);";
         String alterTableStatement = "ALTER TABLE " + KEYSPACE + ".qp_cleanup ADD d int;";
 
@@ -162,7 +162,7 @@ public void testInvalidatePreparedStatementOnAlterUnchangedMetadataV5()
 
     private void testInvalidatePreparedStatementOnAlterUnchangedMetadata(ProtocolVersion version)
     {
-        Session session = sessions.get(version);
+        Session session = getSession(version);
         String createTableStatement = "CREATE TABLE IF NOT EXISTS " + KEYSPACE + ".qp_cleanup (a int PRIMARY KEY, b int, c int);";
         String alterTableStatement = "ALTER TABLE " + KEYSPACE + ".qp_cleanup ADD d int;";
 
@@ -200,7 +200,7 @@ private void testInvalidatePreparedStatementOnAlterUnchangedMetadata(ProtocolVer
     @Test
     public void testStatementRePreparationOnReconnect()
     {
-        Session session = sessions.get(ProtocolVersion.V5);
+        Session session = getSession(ProtocolVersion.V5);
         session.execute("USE " + keyspace());
 
         session.execute(dropKsStatement);
@@ -241,7 +241,7 @@ public void testStatementRePreparationOnReconnect()
     @Test
     public void prepareAndExecuteWithCustomExpressions() throws Throwable
     {
-        Session session = sessions.get(ProtocolVersion.V5);
+        Session session = getSession(ProtocolVersion.V5);
 
         session.execute(dropKsStatement);
         session.execute(createKsStatement);
diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
index 6d4e4874bca4..f721883e17b6 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
@@ -84,9 +84,9 @@ private void createView(String name, String query) throws Throwable
         ViewFilteringTest.createView(name, query, views, version, this);
     }
 
-    private void dropView(String name) throws Throwable
+    private void dropMaterializedView(String name) throws Throwable
     {
-        ViewFilteringTest.dropView(name, views, version, this);
+        ViewFilteringTest.dropMaterializedView(name, views, version, this);
     }
 
     @Test
@@ -193,7 +193,7 @@ public void testClusteringKeyEQRestrictions() throws Throwable
                                     row(0, 1, 1, 0)
             );
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
@@ -301,7 +301,7 @@ public void testClusteringKeySliceRestrictions() throws Throwable
                                     row(0, 1, 1, 0)
             );
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
@@ -418,7 +418,7 @@ public void testClusteringKeyINRestrictions() throws Throwable
                                     row(0, 1, 1, 0)
             );
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
index d1ba84203638..d6abdd9805ea 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
@@ -84,9 +84,9 @@ private void createView(String name, String query) throws Throwable
         ViewFilteringTest.createView(name, query, views, version, this);
     }
 
-    private void dropView(String name) throws Throwable
+    private void dropMaterializedView(String name) throws Throwable
     {
-        ViewFilteringTest.dropView(name, views, version, this);
+        ViewFilteringTest.dropMaterializedView(name, views, version, this);
     }
 
     @Test
@@ -197,7 +197,7 @@ public void testClusteringKeyMultiColumnRestrictions() throws Throwable
                                     row(0, 1, 1, 0)
             );
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
@@ -322,7 +322,7 @@ public void testClusteringKeyFilteringRestrictions() throws Throwable
                                     row(4, 4, 1, 1)
             );
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
index 09d220d3ce45..d4d246aadeb7 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
@@ -84,9 +84,9 @@ private void createView(String name, String query) throws Throwable
         ViewFilteringTest.createView(name, query, views, version, this);
     }
 
-    private void dropView(String name) throws Throwable
+    private void dropMaterializedView(String name) throws Throwable
     {
-        ViewFilteringTest.dropView(name, views, version, this);
+        ViewFilteringTest.dropMaterializedView(name, views, version, this);
     }
 
     @Test
@@ -650,7 +650,7 @@ public void testPartitionKeyAndClusteringKeyFilteringRestrictions() throws Throw
             execute("DELETE FROM %s WHERE a = ?", 1);
             assertEmpty(execute("SELECT a, b, c, d FROM mv_test" + i));
 
-            dropView("mv_test" + i);
+            dropMaterializedView("mv_test" + i);
             dropTable("DROP TABLE %s");
         }
     }
diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
index 2d4cbb65ab30..1b818c9d1668 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
@@ -134,12 +134,12 @@ private void updateView(String query, Object... params) throws Throwable
         }
     }
 
-    private void dropView(String name) throws Throwable
+    private void dropMaterializedView(String name) throws Throwable
     {
-        dropView(name, views, version, this);
+        dropMaterializedView(name, views, version, this);
     }
 
-    public static void dropView(String name, List<String> views, ProtocolVersion version, CQLTester tester) throws Throwable
+    public static void dropMaterializedView(String name, List<String> views, ProtocolVersion version, CQLTester tester) throws Throwable
     {
         tester.executeNet(version, "DROP MATERIALIZED VIEW " + name);
         views.remove(name);
@@ -365,12 +365,12 @@ public void testViewFiltering(boolean flush) throws Throwable
         assertRowCount(execute("SELECT * FROM mv_test5"), 0);
         assertRowCount(execute("SELECT * FROM mv_test6"), 0);
 
-        dropView("mv_test1");
-        dropView("mv_test2");
-        dropView("mv_test3");
-        dropView("mv_test4");
-        dropView("mv_test5");
-        dropView("mv_test6");
+        dropMaterializedView("mv_test1");
+        dropMaterializedView("mv_test2");
+        dropMaterializedView("mv_test3");
+        dropMaterializedView("mv_test4");
+        dropMaterializedView("mv_test5");
+        dropMaterializedView("mv_test6");
         dropTable("DROP TABLE %s");
     }
 
@@ -757,7 +757,7 @@ public void testMVCreationWithNonPrimaryRestrictions() throws Throwable
 
         try {
             createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL AND d = 1 PRIMARY KEY (a, b, c)");
-            dropView("mv_test");
+            dropMaterializedView("mv_test");
         } catch(Exception e) {
             throw new RuntimeException("MV creation with non primary column restrictions failed.", e);
         }
@@ -862,7 +862,7 @@ public void testNonPrimaryRestrictions() throws Throwable
                                 row(0, 1, 1, 0)
         );
 
-        dropView("mv_test");
+        dropMaterializedView("mv_test");
         dropTable("DROP TABLE %s");
     }
 
diff --git a/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java b/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java
index 0323b49ee78c..3542e3939ec6 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewSchemaTest.java
@@ -29,25 +29,24 @@
 import java.util.List;
 import java.util.UUID;
 
+import org.junit.After;
 import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
 
 import com.datastax.driver.core.exceptions.OperationTimedOutException;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
 import org.apache.cassandra.concurrent.SEPExecutor;
 import org.apache.cassandra.concurrent.Stage;
+import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.Schema;
-import org.apache.cassandra.db.SystemKeyspace;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.serializers.SimpleDateSerializer;
 import org.apache.cassandra.serializers.TimeSerializer;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.BeforeClass;
-import org.junit.Test;
-
-import com.datastax.driver.core.exceptions.InvalidQueryException;
 
 
 public class ViewSchemaTest extends CQLTester
diff --git a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
index 960ee2abf97c..b21606ad69d6 100644
--- a/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/SinglePartitionSliceCommandTest.java
@@ -24,6 +24,7 @@
 import java.math.BigInteger;
 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
@@ -71,6 +72,7 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.btree.BTreeSet;
@@ -491,7 +493,7 @@ public static UnfilteredRowIterator getIteratorFromSinglePartition(String q)
     {
         SelectStatement stmt = (SelectStatement) QueryProcessor.parseStatement(q).prepare(ClientState.forInternalCalls());
 
-        SinglePartitionReadQuery.Group<SinglePartitionReadCommand> query = (SinglePartitionReadQuery.Group<SinglePartitionReadCommand>) stmt.getQuery(QueryOptions.DEFAULT, 0);
+        SinglePartitionReadQuery.Group<SinglePartitionReadCommand> query = (SinglePartitionReadQuery.Group<SinglePartitionReadCommand>) stmt.getQuery(QueryState.forInternalCalls(), QueryOptions.DEFAULT, 0);
         Assert.assertEquals(1, query.queries.size());
         SinglePartitionReadCommand command = Iterables.getOnlyElement(query.queries);
         try (ReadExecutionController controller = ReadExecutionController.forCommand(command);
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailCollectionSizeTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailCollectionSizeTest.java
new file mode 100644
index 000000000000..27939650f6f8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailCollectionSizeTest.java
@@ -0,0 +1,455 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static java.nio.ByteBuffer.allocate;
+
+/**
+ * Tests the guardrail for the max size of collections.
+ */
+public class GuardrailCollectionSizeTest extends GuardrailWarningOnSSTableWriteTester
+{
+    private static final int THRESHOLD_IN_KB = 1;
+    private static final int THRESHOLD_IN_BYTES = THRESHOLD_IN_KB * 1024;
+    private static final String SSTABLE_WRITE_WARN_MESSAGE = "Detected collection <redacted> of size";
+
+    private long defaultCollectionSize;
+
+    @Before
+    public void before()
+    {
+        defaultCollectionSize = config().collection_size_warn_threshold_in_kb; // saved so that after() can restore it
+        config().collection_size_warn_threshold_in_kb = (long) THRESHOLD_IN_KB; // every test runs against this threshold
+    }
+
+    @After
+    public void after()
+    {
+        config().collection_size_warn_threshold_in_kb = defaultCollectionSize; // undo the override made in before()
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.collection_size_warn_threshold_in_kb = v,
+                                                 "collection_size_warn_threshold_in_kb");
+    }
+
+    @Test
+    public void testSetSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set(allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", set(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush(); // covers the null, empty, one-byte and half-threshold sets written above
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (4, ?)", set(allocate(THRESHOLD_IN_BYTES))); // one element as large as the threshold
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)", // two elements, each under the threshold, whose sizes add up to threshold + 1
+                    set(allocate(THRESHOLD_IN_BYTES / 2), allocate(THRESHOLD_IN_BYTES / 2 + 1)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenSetSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<set<text>>)");
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set((allocate(1))));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", set(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)", set(allocate(THRESHOLD_IN_BYTES)));
+
+        // the size of frozen collections is not checked during sstable write, so the oversized set above must not warn on flush
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testSetSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", set(allocate(1)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", set(allocate(1)));
+        assertNotWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", set(allocate(THRESHOLD_IN_BYTES / 2 + 1)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testSetSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", set(allocate(1)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", set(allocate(1)));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", set(allocate(THRESHOLD_IN_BYTES / 2 + 1)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+
+        assertValid("DELETE v FROM %s WHERE k = 1");
+        assertNotWarnedOnCompact();
+    }
+
+    @Test
+    public void testListSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (4, ?)", list(allocate(THRESHOLD_IN_BYTES)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)",
+                    list(allocate(THRESHOLD_IN_BYTES / 2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenListSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<list<text>>)");
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)", list(allocate(THRESHOLD_IN_BYTES)));
+
+        // the size of frozen collections is not checked during sstable write, so the oversized list above must not warn on flush
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testListSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)"); // NOTE(review): no disableCompaction(), unlike the set/map variants - confirm
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", list(allocate(1)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", list(allocate(1)));
+        assertNotWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", list(allocate(THRESHOLD_IN_BYTES / 2))); // append the second half
+        assertWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertValid("UPDATE %s SET v = ? + v WHERE k = 2", list(allocate(THRESHOLD_IN_BYTES / 2))); // same sizes as k = 1, but prepending
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testListSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", list(allocate(1)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", list(allocate(1)));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+
+        assertValid("DELETE v[1] FROM %s WHERE k = 1");
+        assertNotWarnedOnCompact();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = ? + v WHERE k = 2", list(allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+    }
+
+    @Test
+    public void testMapSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(allocate(1), allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)",
+                    map(allocate(THRESHOLD_IN_BYTES / 2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (6, ?)",
+                    map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2),
+                        allocate(2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (7, ?)",
+                    map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1),
+                        allocate(THRESHOLD_IN_BYTES / 2 + 1), allocate(1)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (8, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, v) VALUES (9, ?)", map(allocate(THRESHOLD_IN_BYTES), allocate(1)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenMapSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<map<text, text>>)");
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(allocate(1), allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (5, ?)",
+                    map(allocate(THRESHOLD_IN_BYTES / 2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (6, ?)",
+                    map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2),
+                        allocate(2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (7, ?)",
+                    map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1),
+                        allocate(THRESHOLD_IN_BYTES / 2 + 1), allocate(1)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (8, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES)));
+        assertWarns("INSERT INTO %s (k, v) VALUES (9, ?)", map(allocate(THRESHOLD_IN_BYTES), allocate(1)));
+
+        // the size of frozen collections is not checked during sstable write, so the oversized maps above must not warn on flush
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testMapSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", map(allocate(1), allocate(1)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", map(allocate(1), allocate(1)));
+        assertNotWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", map(allocate(2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 2", map(allocate(THRESHOLD_IN_BYTES / 2 + 1), allocate(1)));
+        assertWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 3", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 4", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testMapSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        disableCompaction();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", map(allocate(1), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", map(allocate(1), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", map(allocate(2), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+
+        truncate();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 2", map(allocate(THRESHOLD_IN_BYTES / 2 + 1), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+
+        truncate();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 3", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+
+        truncate();
+
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", map(allocate(THRESHOLD_IN_BYTES / 2), allocate(1)));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 4", map(allocate(1), allocate(THRESHOLD_IN_BYTES / 2 + 1)));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+    }
+
+    @Test
+    public void testMultipleCollections() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "   k int PRIMARY KEY, " +
+                    "   s set<text>," +
+                    "   l list<text>," +
+                    "   m map<text, text>," +
+                    "   fs frozen<set<text>>," +
+                    "   fl frozen<list<text>>," +
+                    "   fm frozen<map<text, text>>" +
+                    ")");
+
+        // the guardrail is per collection: it isn't triggered when only the combined size of a row's collections is over the threshold
+        assertValid("INSERT INTO %s (k, s, fs, l, fl, m, fm) VALUES (0, ?, ?, ?, ?, ?, ?)",
+                    set(allocate(THRESHOLD_IN_BYTES / 2)),
+                    set(allocate(THRESHOLD_IN_BYTES / 2)),
+                    list(allocate(THRESHOLD_IN_BYTES / 2)),
+                    list(allocate(THRESHOLD_IN_BYTES / 2)),
+                    map(allocate(THRESHOLD_IN_BYTES / 4),
+                        allocate(THRESHOLD_IN_BYTES / 4)),
+                    map(allocate(THRESHOLD_IN_BYTES / 4),
+                        allocate(THRESHOLD_IN_BYTES / 4)));
+        assertNotWarnedOnFlush();
+
+        // the guardrail will produce a log message for each column exceeding the threshold, not just for the first one
+        assertWarns(Arrays.asList("Detected collection s of size",
+                                  "Detected collection fs of size",
+                                  "Detected collection l of size",
+                                  "Detected collection fl of size",
+                                  "Detected collection m of size",
+                                  "Detected collection fm of size"),
+                    "INSERT INTO %s (k, s, fs, l, fl, m, fm) VALUES (0, ?, ?, ?, ?, ?, ?)",
+                    set(allocate(THRESHOLD_IN_BYTES)),
+                    set(allocate(THRESHOLD_IN_BYTES)),
+                    list(allocate(THRESHOLD_IN_BYTES)),
+                    list(allocate(THRESHOLD_IN_BYTES)),
+                    map(allocate(THRESHOLD_IN_BYTES),
+                        allocate(THRESHOLD_IN_BYTES)),
+                    map(allocate(THRESHOLD_IN_BYTES),
+                        allocate(THRESHOLD_IN_BYTES)));
+
+        // only the three non-frozen collections (s, l, m) will produce a warning during sstable write, hence three messages
+        assertWarnedOnSSTableWrite(false,
+                                   SSTABLE_WRITE_WARN_MESSAGE,
+                                   SSTABLE_WRITE_WARN_MESSAGE,
+                                   SSTABLE_WRITE_WARN_MESSAGE);
+    }
+
+    @Test
+    public void testCompositePartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 text, v set<text>, PRIMARY KEY((k1, k2)))");
+
+        assertValid("INSERT INTO %s (k1, k2, v) VALUES (0, 'a', ?)", set(allocate(1)));
+        assertNotWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k1, k2, v) VALUES (1, 'b', ?)", set(allocate(THRESHOLD_IN_BYTES)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testCompositeClusteringKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, c2 text, v set<text>, PRIMARY KEY(k, c1, c2))");
+
+        assertValid("INSERT INTO %s (k, c1, c2, v) VALUES (1, 10, 'a', ?)", set(allocate(1)));
+        assertNotWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, c1, c2, v) VALUES (2, 20, 'b', ?)", set(allocate(THRESHOLD_IN_BYTES)));
+        assertWarnedOnFlush();
+
+        assertWarns("INSERT INTO %s (k, c1, c2, v) VALUES (3, 30, 'c', ?)", set(allocate(THRESHOLD_IN_BYTES)));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testSuperUser() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+
+        // regular user should be warned
+        assertWarns("INSERT INTO %s (k, v) VALUES (1, ?)", set(allocate(THRESHOLD_IN_BYTES)));
+
+        // super user shouldn't be warned
+        useSuperUser();
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set(allocate(THRESHOLD_IN_BYTES)));
+
+        // the sstable write should produce warnings because the keyspace is not internal, regardless of the user
+        assertWarnedOnSSTableWrite(false, SSTABLE_WRITE_WARN_MESSAGE, SSTABLE_WRITE_WARN_MESSAGE);
+    }
+
+    private void truncate() throws Throwable
+    {
+        execute("TRUNCATE %s");
+    }
+
+    private void assertWarns(String query, Object... args) throws Throwable
+    {
+        String warning = "Detected collection v of size";
+        assertWarns(warning, query, args);
+    }
+
+    private void assertWarnedOnFlush()
+    {
+        assertWarnedOnFlush(SSTABLE_WRITE_WARN_MESSAGE);
+    }
+
+    private void assertWarnedOnCompact()
+    {
+        assertWarnedOnCompact(SSTABLE_WRITE_WARN_MESSAGE);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailColumnValueSizeTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailColumnValueSizeTest.java
new file mode 100644
index 000000000000..d77f413c1fdc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailColumnValueSizeTest.java
@@ -0,0 +1,557 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Arrays;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.utils.units.SizeUnit;
+import org.apache.cassandra.utils.units.Units;
+
+import static java.lang.String.format;
+import static java.nio.ByteBuffer.allocate;
+
+/**
+ * Tests the guardrail for max column value size.
+ */
+public class GuardrailColumnValueSizeTest extends GuardrailTester
+{
+    private static final int THRESHOLD_IN_KB = 1;
+    private static final int THRESHOLD_IN_BYTES = THRESHOLD_IN_KB * 1024;
+
+    private long defaultColumnValueSizeThreshold;
+
+    @Before
+    public void before()
+    {
+        defaultColumnValueSizeThreshold = config().column_value_size_failure_threshold_in_kb;
+        config().column_value_size_failure_threshold_in_kb = (long) THRESHOLD_IN_KB;
+    }
+
+    @After
+    public void after()
+    {
+        config().column_value_size_failure_threshold_in_kb = defaultColumnValueSizeThreshold;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.column_value_size_failure_threshold_in_kb = v,
+                                                 "column_value_size_failure_threshold_in_kb");
+    }
+
+    @Test
+    public void testSimplePartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text PRIMARY KEY, v int)");
+
+        testNoThreshold("INSERT INTO %s (k, v) VALUES (?, 0)");
+        testNoThreshold("UPDATE %s SET v = 1 WHERE k = ?");
+        testNoThreshold("DELETE v FROM %s WHERE k = ?");
+        testNoThreshold("DELETE FROM %s WHERE k = ?");
+    }
+
+    @Test
+    public void testComplexPartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 text, k2 text, v int, PRIMARY KEY((k1, k2)))");
+
+        testNoThreshold2("INSERT INTO %s (k1, k2, v) VALUES (?, ?, 0)");
+        testNoThreshold2("UPDATE %s SET v = 1 WHERE k1 = ? AND k2 = ?");
+        testNoThreshold2("DELETE v FROM %s WHERE k1 = ? AND k2 = ?");
+        testNoThreshold2("DELETE FROM %s WHERE k1 = ? AND k2 = ?");
+    }
+
+    @Test
+    public void testSimpleClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c text, v int, PRIMARY KEY(k, c))");
+
+        testNoThreshold("INSERT INTO %s (k, c, v) VALUES (0, ?, 0)");
+        testNoThreshold("UPDATE %s SET v = 1 WHERE k = 0 AND c = ?");
+        testNoThreshold("DELETE v FROM %s WHERE k = 0 AND c = ?");
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c = ?");
+    }
+
+    @Test
+    public void testComplexClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 text, c2 text, v int, PRIMARY KEY(k, c1, c2))");
+
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c1 = ?");
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c1 > ?");
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c1 < ?");
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c1 >= ?");
+        testNoThreshold("DELETE FROM %s WHERE k = 0 AND c1 <= ?");
+
+        testNoThreshold2("INSERT INTO %s (k, c1, c2, v) VALUES (0, ?, ?, 0)");
+        testNoThreshold2("UPDATE %s SET v = 1 WHERE k = 0 AND c1 = ? AND c2 = ?");
+        testNoThreshold2("DELETE v FROM %s WHERE k = 0 AND c1 = ? AND c2 = ?");
+        testNoThreshold2("DELETE FROM %s WHERE k = 0 AND c1 = ? AND c2 = ?");
+        testNoThreshold2("DELETE FROM %s WHERE k = 0 AND c1 = ? AND c2 > ?");
+        testNoThreshold2("DELETE FROM %s WHERE k = 0 AND c1 = ? AND c2 < ?");
+        testNoThreshold2("DELETE FROM %s WHERE k = 0 AND c1 = ? AND c2 >= ?");
+        testNoThreshold2("DELETE FROM %s WHERE k = 0 AND c1 = ? AND c2 <= ?");
+    }
+
+    @Test
+    public void testRegularColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
+
+        testThreshold("v", "INSERT INTO %s (k, v) VALUES (0, ?)");
+        testThreshold("v", "UPDATE %s SET v = ? WHERE k = 0");
+    }
+
+    @Test
+    public void testStaticColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c int, s text STATIC, r int, PRIMARY KEY(k, c))");
+
+        testThreshold("s", "INSERT INTO %s (k, s) VALUES (0, ?)");
+        testThreshold("s", "INSERT INTO %s (k, c, s, r) VALUES (0, 0, ?, 0)");
+        testThreshold("s", "UPDATE %s SET s = ? WHERE k = 0");
+        testThreshold("s", "UPDATE %s SET s = ?, r = 0 WHERE k = 0 AND c = 0");
+    }
+
+    @Test
+    public void testTuple() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v tuple<text, text>)");
+
+        testThreshold2("v", "INSERT INTO %s (k, v) VALUES (0, (?, ?))", 8);
+        testThreshold2("v", "UPDATE %s SET v = (?, ?) WHERE k = 0", 8);
+    }
+
+    @Test
+    public void testUDT() throws Throwable
+    {
+        String udt = createType("CREATE TYPE %s (a text, b text)");
+        createTable(format("CREATE TABLE %%s (k int PRIMARY KEY, v %s)", udt));
+
+        testThreshold("v", "INSERT INTO %s (k, v) VALUES (0, {a: ?})");
+        testThreshold("v", "INSERT INTO %s (k, v) VALUES (0, {b: ?})");
+        testThreshold("v", "UPDATE %s SET v = {a: ?} WHERE k = 0");
+        testThreshold("v", "UPDATE %s SET v = {b: ?} WHERE k = 0");
+        testThreshold("v", "UPDATE %s SET v.a = ? WHERE k = 0");
+        testThreshold("v", "UPDATE %s SET v.b = ? WHERE k = 0");
+        testThreshold2("v", "INSERT INTO %s (k, v) VALUES (0, {a: ?, b: ?})");
+        testThreshold2("v", "UPDATE %s SET v.a = ?, v.b = ? WHERE k = 0");
+        testThreshold2("v", "UPDATE %s SET v = {a: ?, b: ?} WHERE k = 0");
+    }
+
+    @Test
+    public void testFrozenUDT() throws Throwable
+    {
+        String udt = createType("CREATE TYPE %s (a text, b text)");
+        createTable(format("CREATE TABLE %%s (k int PRIMARY KEY, v frozen<%s>)", udt));
+
+        testThreshold("v", "INSERT INTO %s (k, v) VALUES (0, {a: ?})", 8);
+        testThreshold("v", "INSERT INTO %s (k, v) VALUES (0, {b: ?})", 8);
+        testThreshold("v", "UPDATE %s SET v = {a: ?} WHERE k = 0", 8);
+        testThreshold("v", "UPDATE %s SET v = {b: ?} WHERE k = 0", 8);
+        testThreshold2("v", "INSERT INTO %s (k, v) VALUES (0, {a: ?, b: ?})", 8);
+        testThreshold2("v", "UPDATE %s SET v = {a: ?, b: ?} WHERE k = 0", 8);
+    }
+
+    @Test
+    public void testNestedUDT() throws Throwable
+    {
+        String inner = createType("CREATE TYPE %s (c text, d text)");
+        String outer = createType(format("CREATE TYPE %%s (a text, b frozen<%s>)", inner));
+        createTable(format("CREATE TABLE %%s (k int PRIMARY KEY, v %s)", outer));
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, {a: ?, b: {c: ?, d: ?}})",
+                                          "UPDATE %s SET v = {a: ?, b: {c: ?, d: ?}} WHERE k = 0"))
+        {
+            assertValid(query, allocate(0), allocate(0), allocate(0));
+            assertValid(query, allocate(THRESHOLD_IN_BYTES), allocate(0), allocate(0));
+            assertValid(query, allocate(0), allocate(THRESHOLD_IN_BYTES - 8), allocate(0));
+            assertValid(query, allocate(0), allocate(0), allocate(THRESHOLD_IN_BYTES - 8));
+            assertGuardrailFailed("v", query, allocate(THRESHOLD_IN_BYTES + 1), allocate(0), allocate(0));
+            assertGuardrailFailed("v", query, allocate(0), allocate(THRESHOLD_IN_BYTES - 7), allocate(0));
+            assertGuardrailFailed("v", query, allocate(0), allocate(0), allocate(THRESHOLD_IN_BYTES - 7));
+        }
+    }
+
+    @Test
+    public void testList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0",
+                                          "UPDATE %s SET v = v + ? WHERE k = 0"))
+        {
+            assertValid(query, list(allocate(1)));
+            assertValid(query, list(allocate(THRESHOLD_IN_BYTES)));
+            assertValid(query, list(allocate(THRESHOLD_IN_BYTES), allocate(THRESHOLD_IN_BYTES)));
+            assertGuardrailFailed("v", query, list(allocate(THRESHOLD_IN_BYTES + 1)));
+        }
+
+        testThreshold("v", "UPDATE %s SET v[0] = ? WHERE k = 0");
+
+        String query = "UPDATE %s SET v = v - ? WHERE k = 0";
+        assertValid(query, list(allocate(1)));
+        assertValid(query, list(allocate(THRESHOLD_IN_BYTES)));
+        assertValid(query, list(allocate(THRESHOLD_IN_BYTES + 1))); // No-op: such a value could never have been written
+    }
+
+    @Test
+    public void testFrozenList() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<list<text>>)");
+
+        // the serialized size of a frozen list is the size of its serialized elements, plus a 32-bit integer prefix for
+        // the number of elements, and another 32-bit integer for the size of each element
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0"))
+        {
+            assertValid(query, list(allocate(1)));
+            assertValid(query, list(allocate(THRESHOLD_IN_BYTES - 8)));
+            assertValid(query, list(allocate((THRESHOLD_IN_BYTES - 12) / 2), allocate((THRESHOLD_IN_BYTES - 12) / 2)));
+            assertGuardrailFailed("v", query, list(allocate(THRESHOLD_IN_BYTES - 7)));
+            assertGuardrailFailed("v", query, list(allocate(THRESHOLD_IN_BYTES - 12), allocate(1)));
+        }
+    }
+
+    @Test
+    public void testSet() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0",
+                                          "UPDATE %s SET v = v + ? WHERE k = 0",
+                                          "UPDATE %s SET v = v - ? WHERE k = 0"))
+        {
+            assertValid(query, set(allocate(0)));
+            assertValid(query, set(allocate(THRESHOLD_IN_BYTES)));
+            assertValid(query, set(allocate(THRESHOLD_IN_BYTES), allocate(THRESHOLD_IN_BYTES)));
+            assertGuardrailFailed("v", query, set(allocate(THRESHOLD_IN_BYTES + 1)));
+        }
+    }
+
+    @Test
+    public void testSetWithClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, c2 int, v set<text>, PRIMARY KEY(k, c1, c2))");
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, c1, c2, v) VALUES (0, 0, 0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0 AND c1 = 0 AND c2 = 0",
+                                          "UPDATE %s SET v = v + ? WHERE k = 0 AND c1 = 0 AND c2 = 0",
+                                          "UPDATE %s SET v = v - ? WHERE k = 0 AND c1 = 0 AND c2 = 0"))
+        {
+            assertValid(query, set(allocate(0)));
+            assertValid(query, set(allocate(THRESHOLD_IN_BYTES)));
+            assertValid(query, set(allocate(THRESHOLD_IN_BYTES), allocate(THRESHOLD_IN_BYTES)));
+            assertGuardrailFailed("v", query, set(allocate(THRESHOLD_IN_BYTES + 1)));
+        }
+    }
+
+    @Test
+    public void testFrozenSet() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<set<text>>)");
+
+        // the serialized size of a frozen set is the size of its serialized elements, plus a 32-bit integer prefix for
+        // the number of elements, and another 32-bit integer for the size of each element
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0"))
+        {
+            assertValid(query, set(allocate(1)));
+            assertValid(query, set(allocate(THRESHOLD_IN_BYTES - 8)));
+            assertValid(query, set(allocate((THRESHOLD_IN_BYTES - 12) / 2), allocate((THRESHOLD_IN_BYTES - 12) / 2)));
+            assertGuardrailFailed("v", query, set(allocate(THRESHOLD_IN_BYTES - 7)));
+            assertGuardrailFailed("v", query, set(allocate(THRESHOLD_IN_BYTES - 12), allocate(1)));
+        }
+    }
+
+    @Test
+    public void testMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0",
+                                          "UPDATE %s SET v = v + ? WHERE k = 0"))
+        {
+            assertValid(query, map(allocate(0), allocate(0)));
+            assertValid(query, map(allocate(THRESHOLD_IN_BYTES), allocate(0)));
+            assertValid(query, map(allocate(0), allocate(THRESHOLD_IN_BYTES)));
+            assertValid(query, map(allocate(THRESHOLD_IN_BYTES), allocate(THRESHOLD_IN_BYTES)));
+            assertGuardrailFailed("v", query, map(allocate(THRESHOLD_IN_BYTES + 1), allocate(1)));
+            assertGuardrailFailed("v", query, map(allocate(1), allocate(THRESHOLD_IN_BYTES + 1)));
+            assertGuardrailFailed("v", query, map(allocate(THRESHOLD_IN_BYTES + 1), allocate(THRESHOLD_IN_BYTES + 1)));
+        }
+
+        testThreshold2("v", "UPDATE %s SET v[?] = ? WHERE k = 0");
+
+        String query = "UPDATE %s SET v = v - ? WHERE k = 0";
+        assertValid(query, set(allocate(0)));
+        assertValid(query, set(allocate(THRESHOLD_IN_BYTES)));
+        assertGuardrailFailed("v", query, set(allocate(THRESHOLD_IN_BYTES + 1)));
+    }
+
+    @Test
+    public void testMapWithClustering() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, c2 int, v map<text, text>, PRIMARY KEY(k, c1, c2))");
+
+        String query = "INSERT INTO %s (k, c1, c2, v) VALUES (0, 0, 0, ?)";
+        assertValid(query, map(allocate(1), allocate(1)));
+        assertValid(query, map(allocate(THRESHOLD_IN_BYTES), allocate(1)));
+        assertValid(query, map(allocate(1), allocate(THRESHOLD_IN_BYTES)));
+        assertGuardrailFailed("v", query, map(allocate(THRESHOLD_IN_BYTES + 1), allocate(1)));
+        assertGuardrailFailed("v", query, map(allocate(1), allocate(THRESHOLD_IN_BYTES + 1)));
+    }
+
+    @Test
+    public void testFrozenMap() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<map<text, text>>)");
+
+        // the serialized size of a frozen map is the size of its serialized keys and values, plus a 32-bit integer
+        // prefix for the number of key-value pairs, and another 32-bit integer for the size of each key and value
+
+        for (String query : Arrays.asList("INSERT INTO %s (k, v) VALUES (0, ?)",
+                                          "UPDATE %s SET v = ? WHERE k = 0"))
+        {
+            assertValid(query, map(allocate(1), allocate(1)));
+            assertValid(query, map(allocate(THRESHOLD_IN_BYTES - 13), allocate(1)));
+            assertValid(query, map(allocate(1), allocate(THRESHOLD_IN_BYTES - 13)));
+            assertGuardrailFailed("v", query, map(allocate(THRESHOLD_IN_BYTES - 12), allocate(1)));
+            assertGuardrailFailed("v", query, map(allocate(1), allocate(THRESHOLD_IN_BYTES - 12)));
+        }
+    }
+
+    @Test
+    public void testBatch() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, c text, r text, s text STATIC, PRIMARY KEY(k, c))");
+
+        // partition key
+        testNoThreshold("BEGIN BATCH INSERT INTO %s (k, c, r) VALUES (?, '0', '0'); APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH UPDATE %s SET r = '0' WHERE k = ? AND c = '0'; APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH DELETE r FROM %s WHERE k = ? AND c = '0'; APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH DELETE FROM %s WHERE k = ?; APPLY BATCH;");
+
+        // static column
+        testThreshold("s", "BEGIN BATCH INSERT INTO %s (k, s) VALUES ('0', ?); APPLY BATCH;");
+        testThreshold("s", "BEGIN BATCH INSERT INTO %s (k, s, c, r) VALUES ('0', ?, '0', '0'); APPLY BATCH;");
+        testThreshold("s", "BEGIN BATCH UPDATE %s SET s = ? WHERE k = '0'; APPLY BATCH;");
+        testThreshold("s", "BEGIN BATCH UPDATE %s SET s = ?, r = '0' WHERE k = '0' AND c = '0'; APPLY BATCH;");
+
+        // clustering key
+        testNoThreshold("BEGIN BATCH INSERT INTO %s (k, c, r) VALUES ('0', ?, '0'); APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH UPDATE %s SET r = '0' WHERE k = '0' AND c = ?; APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH DELETE r FROM %s WHERE k = '0' AND c = ?; APPLY BATCH;");
+        testNoThreshold("BEGIN BATCH DELETE FROM %s WHERE k = '0' AND c = ?; APPLY BATCH;");
+
+        // regular column
+        testThreshold("r", "BEGIN BATCH INSERT INTO %s (k, c, r) VALUES ('0', '0', ?); APPLY BATCH;");
+        testThreshold("r", "BEGIN BATCH UPDATE %s SET r = ? WHERE k = '0' AND c = '0'; APPLY BATCH;");
+    }
+
+    @Test
+    public void testCASWithIfNotExistsCondition() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, c text, v text, s text STATIC, PRIMARY KEY(k, c))");
+
+        // partition key
+        testNoThreshold("INSERT INTO %s (k, c, v) VALUES (?, '0', '0') IF NOT EXISTS");
+
+        // clustering key
+        testNoThreshold("INSERT INTO %s (k, c, v) VALUES ('0', ?, '0') IF NOT EXISTS");
+
+        // static column
+        assertValid("INSERT INTO %s (k, s) VALUES ('1', ?) IF NOT EXISTS", allocate(1));
+        assertValid("INSERT INTO %s (k, s) VALUES ('2', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES));
+        assertValid("INSERT INTO %s (k, s) VALUES ('2', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES + 1)); // not applied
+        assertGuardrailFailed("s", "INSERT INTO %s (k, s) VALUES ('3', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES + 1));
+
+        // regular column
+        assertValid("INSERT INTO %s (k, c, v) VALUES ('4', '0', ?) IF NOT EXISTS", allocate(1));
+        assertValid("INSERT INTO %s (k, c, v) VALUES ('5', '0', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES));
+        assertValid("INSERT INTO %s (k, c, v) VALUES ('5', '0', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES + 1)); // not applied
+        assertGuardrailFailed("v", "INSERT INTO %s (k, c, v) VALUES ('6', '0', ?) IF NOT EXISTS", allocate(THRESHOLD_IN_BYTES + 1));
+    }
+
+    @Test
+    public void testCASWithIfExistsCondition() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, c text, v text, s text STATIC, PRIMARY KEY(k, c))");
+
+        // partition key, the CAS updates with values beyond the threshold are not applied so they don't come to fail
+        testNoThreshold("UPDATE %s SET v = '0' WHERE k = ? AND c = '0' IF EXISTS");
+
+        // clustering key, the CAS updates with values beyond the threshold are not applied so they don't come to fail
+        testNoThreshold("UPDATE %s SET v = '0' WHERE k = '0' AND c = ? IF EXISTS");
+
+        // static column, only the applied CAS updates can fire the guardrail
+        assertValid("INSERT INTO %s (k, s) VALUES ('0', '0')");
+        testThreshold("s", "UPDATE %s SET s = ? WHERE k = '0' IF EXISTS");
+        assertValid("DELETE FROM %s WHERE k = '0'");
+        testNoThreshold("UPDATE %s SET s = ? WHERE k = '0' IF EXISTS");
+
+        // regular column, only the applied CAS updates can fire the guardrail
+        assertValid("INSERT INTO %s (k, c) VALUES ('0', '0')");
+        testThreshold("v", "UPDATE %s SET v = ? WHERE k = '0' AND c = '0' IF EXISTS");
+        assertValid("DELETE FROM %s WHERE k = '0' AND c = '0'");
+        testNoThreshold("UPDATE %s SET v = ? WHERE k = '0' AND c = '0' IF EXISTS");
+    }
+
+    @Test
+    public void testCASWithColumnsCondition() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v text)");
+
+        // updates are always accepted for values smaller than the threshold, independently of whether they are applied
+        assertValid("DELETE FROM %s WHERE k = 0");
+        assertValid("UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(1));
+        assertValid("UPDATE %s SET v = '0' WHERE k = 0");
+        assertValid("UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(1));
+
+        // updates are always accepted for values equal to the threshold, independently of whether they are applied
+        assertValid("DELETE FROM %s WHERE k = 0");
+        assertValid("UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(THRESHOLD_IN_BYTES));
+        assertValid("UPDATE %s SET v = '0' WHERE k = 0");
+        assertValid("UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(THRESHOLD_IN_BYTES));
+
+        // updates beyond the threshold fail only if the update is applied
+        assertValid("DELETE FROM %s WHERE k = 0");
+        assertValid("UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(THRESHOLD_IN_BYTES + 1));
+        assertValid("UPDATE %s SET v = '0' WHERE k = 0");
+        assertGuardrailFailed("v", "UPDATE %s SET v = ? WHERE k = 0 IF v = '0'", allocate(THRESHOLD_IN_BYTES + 1));
+    }
+
+    @Test
+    public void testSelect() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k text, c text, r text, s text STATIC, PRIMARY KEY(k, c))");
+
+        testNoThreshold("SELECT * FROM %s WHERE k = ?");
+        testNoThreshold("SELECT * FROM %s WHERE k = '0' AND c = ?");
+        testNoThreshold("SELECT * FROM %s WHERE c = ? ALLOW FILTERING");
+        testNoThreshold("SELECT * FROM %s WHERE s = ? ALLOW FILTERING");
+        testNoThreshold("SELECT * FROM %s WHERE r = ? ALLOW FILTERING");
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is not applied for the specified 1-placeholder CQL query.
+     *
+     * @param query a CQL statement (modification or SELECT) with exactly one placeholder
+     */
+    private void testNoThreshold(String query) throws Throwable
+    {
+        assertValid(query, allocate(1));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES + 1));
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is not applied for the specified 2-placeholder CQL query.
+     *
+     * @param query a CQL modification statement with exactly two placeholders
+     */
+    private void testNoThreshold2(String query) throws Throwable
+    {
+        assertValid(query, allocate(1), allocate(1));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES), allocate(1));
+        assertValid(query, allocate(1), allocate(THRESHOLD_IN_BYTES));
+        assertValid(query, allocate((THRESHOLD_IN_BYTES)), allocate((THRESHOLD_IN_BYTES)));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES + 1), allocate(1));
+        assertValid(query, allocate(1), allocate(THRESHOLD_IN_BYTES + 1));
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is applied for the specified 1-placeholder CQL query.
+     *
+     * @param column the name of the column referenced by the query placeholder
+     * @param query  a CQL query with exactly one placeholder
+     */
+    private void testThreshold(String column, String query) throws Throwable
+    {
+        testThreshold(column, query, 0);
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is applied for the specified 1-placeholder CQL query.
+     *
+     * @param column             the name of the column referenced by the query placeholder
+     * @param query              a CQL query with exactly one placeholder
+     * @param serializationBytes the extra bytes added to the placeholder value by its wrapping column type serializer
+     */
+    private void testThreshold(String column, String query, int serializationBytes) throws Throwable
+    {
+        int threshold = THRESHOLD_IN_BYTES - serializationBytes;
+        assertValid(query, allocate(1));
+        assertValid(query, allocate(threshold));
+        assertGuardrailFailed(column, query, allocate(threshold + 1));
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is applied for the specified 2-placeholder CQL query.
+     *
+     * @param column the name of the column referenced by the placeholders
+     * @param query  a CQL query with exactly two placeholders
+     */
+    private void testThreshold2(String column, String query) throws Throwable
+    {
+        assertValid(query, allocate(1), allocate(1));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES), allocate(1));
+        assertValid(query, allocate(1), allocate(THRESHOLD_IN_BYTES));
+        assertValid(query, allocate((THRESHOLD_IN_BYTES)), allocate((THRESHOLD_IN_BYTES)));
+        assertGuardrailFailed(column, query, allocate(THRESHOLD_IN_BYTES + 1), allocate(1));
+        assertGuardrailFailed(column, query, allocate(1), allocate(THRESHOLD_IN_BYTES + 1));
+    }
+
+    /**
+     * Tests that the max column size guardrail threshold is applied for the specified 2-placeholder query.
+     *
+     * @param column             the name of the column referenced by the placeholders
+     * @param query              a CQL query with exactly two placeholders
+     * @param serializationBytes the extra bytes added to the sum of the placeholder values by their wrapping serializer
+     */
+    private void testThreshold2(String column, String query, int serializationBytes) throws Throwable
+    {
+        assertValid(query, allocate(1), allocate(1));
+        assertValid(query, allocate(THRESHOLD_IN_BYTES - serializationBytes - 1), allocate(1));
+        assertValid(query, allocate(1), allocate(THRESHOLD_IN_BYTES - serializationBytes - 1));
+        assertValid(query, allocate((THRESHOLD_IN_BYTES / 2) - serializationBytes), allocate((THRESHOLD_IN_BYTES / 2)));
+        assertGuardrailFailed(column, query, allocate(THRESHOLD_IN_BYTES - serializationBytes), allocate(1));
+        assertGuardrailFailed(column, query, allocate(1), allocate(THRESHOLD_IN_BYTES - serializationBytes));
+    }
+
+    private void assertGuardrailFailed(String column, String query, Object... values) throws Throwable
+    {
+        String errorMessage = format("Value of %s of size %s is greater than the maximum allowed (%s)",
+                                     column,
+                                     Units.toString(THRESHOLD_IN_BYTES + 1, SizeUnit.BYTES),
+                                     Units.toString(THRESHOLD_IN_BYTES, SizeUnit.BYTES));
+        assertFails(errorMessage, query, values);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailColumnsPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailColumnsPerTableTest.java
new file mode 100644
index 000000000000..d2b57c804b2e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailColumnsPerTableTest.java
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+import static java.lang.String.format;
+
+/**
+ * Tests the guardrail for the max number of columns in a table.
+ */
+public class GuardrailColumnsPerTableTest extends GuardrailTester
+{
+    private static final long COLUMNS_PER_TABLE_THRESHOLD = 3; // failure threshold installed by before() for every test
+    // Name used when testing CREATE TABLE statements that should fail. It has to be provided explicitly because
+    // assertFails, which is used to assert the failure, does not know that it is a CREATE TABLE and would thus
+    // reuse the name of the previously created table, which is not what we want.
+    private static final String FAIL_TABLE = "failure_table_creation_test";
+
+    private long defaultColumnsPerTableThreshold;
+
+    @Before
+    public void before()
+    {
+        defaultColumnsPerTableThreshold = config().columns_per_table_failure_threshold; // restored by after()
+        config().columns_per_table_failure_threshold = COLUMNS_PER_TABLE_THRESHOLD;
+    }
+
+    @After
+    public void after()
+    {
+        config().columns_per_table_failure_threshold = defaultColumnsPerTableThreshold;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.columns_per_table_failure_threshold = v,
+                                                 "columns_per_table_failure_threshold");
+    }
+
+    @Test
+    public void testCreateTable() throws Throwable
+    {
+        // partition key on skinny table
+        assertValid(format("CREATE TABLE %s (k1 int, v int, PRIMARY KEY((k1)))", createTableName()));
+        assertValid(format("CREATE TABLE %s (k1 int, k2 int, v int, PRIMARY KEY((k1, k2)))", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k1 int, k2 int, k3 int, v int, PRIMARY KEY((k1, k2, k3)))", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k1 int, k2 int, k3 int, k4 int, v int, PRIMARY KEY((k1, k2, k3, k4)))", FAIL_TABLE);
+
+        // partition key on wide table
+        assertValid(format("CREATE TABLE %s (k1 int, c int, v int, PRIMARY KEY(k1, c))", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k1 int, k2 int, c int, v int, PRIMARY KEY((k1, k2), c))", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k1 int, k2 int, k3 int, c int, v int, PRIMARY KEY((k1, k2, k3), c))", FAIL_TABLE);
+
+        // clustering key
+        assertValid(format("CREATE TABLE %s (k int, c1 int, v int, PRIMARY KEY(k, c1))", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k int, c1 int, c2 int, v int, PRIMARY KEY(k, c1, c2))", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k int, c1 int, c2 int, c3 int, v int, PRIMARY KEY(k, c1, c2, c3))", FAIL_TABLE);
+
+        // static column
+        assertValid(format("CREATE TABLE %s (k int, c int, s1 int STATIC, PRIMARY KEY(k, c))", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k int, c int, s1 int STATIC, s2 int STATIC, PRIMARY KEY(k, c))", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k int, c int, s1 int STATIC, s2 int STATIC, s3 int STATIC, PRIMARY KEY(k, c))", FAIL_TABLE);
+
+        // regular column on skinny table
+        assertValid(format("CREATE TABLE %s (k int PRIMARY KEY, v1 int)", createTableName()));
+        assertValid(format("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int, v3 int)", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int, v3 int, v4 int)", FAIL_TABLE);
+
+        // regular column on wide table
+        assertValid(format("CREATE TABLE %s (k int, c int, v1 int, PRIMARY KEY(k, c))", createTableName()));
+        assertFails(4, "CREATE TABLE %s (k int, c int, v1 int, v2 int, PRIMARY KEY(k, c))", FAIL_TABLE);
+        assertFails(5, "CREATE TABLE %s (k int, c int, v1 int, v2 int, v3 int, PRIMARY KEY(k, c))", FAIL_TABLE);
+
+        // udt: the table is expected to be accepted even though the UDT itself has four fields
+        String udt = createType("CREATE TYPE %s (a int, b int, c int, d int)");
+        assertValid(format("CREATE TABLE %s (k int PRIMARY KEY, v %s)", createTableName(), udt));
+    }
+
+    @Test
+    public void testAlterTableAddColumn() throws Throwable
+    {
+        // skinny table under threshold
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int)");
+        assertValid("ALTER TABLE %s ADD v2 int");
+        assertFails(4, "ALTER TABLE %s ADD v3 int");
+
+        // skinny table at threshold
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)");
+        assertFails(4, "ALTER TABLE %s ADD v3 int");
+
+        // wide table
+        createTable("CREATE TABLE %s (k int, c int, v1 int, PRIMARY KEY(k, c))");
+        assertFails(4, "ALTER TABLE %s ADD v2 int");
+        assertFails(4, "ALTER TABLE %s ADD s int STATIC");
+
+        // udt: adding a four-field UDT column is expected to be accepted as the table's third column
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int)");
+        String udt = createType("CREATE TYPE %s (a int, b int, c int, d int)");
+        assertValid("ALTER TABLE %s ADD v2 " + udt);
+    }
+
+    /**
+     * Verifies that it is possible to drop columns from a table that has more columns than the current threshold.
+     */
+    @Test
+    public void testAlterTableDropColumn() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v1 int, v2 int)");
+        DatabaseDescriptor.getGuardrailsConfig().columns_per_table_failure_threshold = 2L; // now below the table's 3 columns
+        assertValid("ALTER TABLE %s DROP v2");
+        assertFails(3, "ALTER TABLE %s ADD v2 int"); // re-adding the column would exceed the lowered threshold again
+    }
+
+    private void assertFails(int columns, String query) throws Throwable
+    {
+        assertFails(columns, query, currentTable());
+    }
+
+    private void assertFails(int columns, String query, String tableName) throws Throwable
+    {
+        String errorMessage = format("Tables cannot have more than %s columns, but %s provided for table %s",
+                                     DatabaseDescriptor.getGuardrailsConfig().columns_per_table_failure_threshold, // live value: tests may change it
+                                     columns,
+                                     tableName);
+        assertFails(errorMessage, format(query, keyspace() + '.' + tableName));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
new file mode 100644
index 000000000000..1805c730e842
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.Set;
+import java.util.function.Supplier;
+
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.ProtocolVersion;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+public class GuardrailConsistencyTest extends GuardrailTester
+{
+    private static final Set<String> DISALLOWED_WRITE_CLS = new LinkedHashSet<>(Arrays.asList(
+    ConsistencyLevel.ANY.toString(),
+    ConsistencyLevel.ONE.toString(),
+    ConsistencyLevel.TWO.toString(),
+    ConsistencyLevel.THREE.toString(),
+    ConsistencyLevel.QUORUM.toString(),
+    ConsistencyLevel.ALL.toString(),
+    ConsistencyLevel.EACH_QUORUM.toString(),
+    ConsistencyLevel.LOCAL_ONE.toString()));
+
+    private static final Set<String> SERIAL_CLS = new LinkedHashSet<>(Arrays.asList(
+    ConsistencyLevel.SERIAL.toString(),
+    ConsistencyLevel.LOCAL_SERIAL.toString()
+    ));
+    private static final Set<String> SERIAL_ONLY = new LinkedHashSet<>(Collections.singletonList(ConsistencyLevel.SERIAL.toString()));
+    private static final Set<String> LOCAL_SERIAL_ONLY = new LinkedHashSet<>(Collections.singletonList(ConsistencyLevel.LOCAL_SERIAL.toString()));
+
+    private static Set<String> defaultDisallowedWriteConsistencyLevels; // captured in setup(), restored in tearDown()
+    private Supplier<QueryState> queryState; // supplies the QueryState every statement of the current test runs with
+
+    @BeforeClass
+    public static void setup()
+    {
+        defaultDisallowedWriteConsistencyLevels = DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = defaultDisallowedWriteConsistencyLevels;
+    }
+
+    @Before
+    public void setupTest()
+    {
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+        queryState = this::userQueryState;
+        disableConsistencyLevels(DISALLOWED_WRITE_CLS); // baseline for every test; individual tests may replace it
+    }
+
+    private void disableConsistencyLevels(Set<String> consistencyLevels)
+    {
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = consistencyLevels;
+    }
+
+    private void executeWithConsistency(String query, ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        QueryOptions queryOptions = queryOptions(cl, serialCl);
+        QueryState state = queryState.get();
+        CQLStatement statement = QueryProcessor.getStatement(formatQuery(query), state.getClientState());
+        statement.execute(state, queryOptions, System.nanoTime());
+    }
+
+    private void insert(ConsistencyLevel cl)
+    {
+        executeWithConsistency("INSERT INTO %s (k, c, v) VALUES (1, 2, 'val')", cl, null);
+    }
+
+    private void lwtInsert(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') IF NOT EXISTS", cl, serialCl);
+    }
+
+    @Test
+    public void testInsertWithDisallowedConsistency()
+    {
+        assertThatThrownBy(() -> insert(ConsistencyLevel.ONE))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+    }
+
+    @Test
+    public void testLWTInsertWithDisallowedConsistency1()
+    {
+        disableConsistencyLevels(SERIAL_ONLY);
+        assertThatThrownBy(() -> lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+    }
+
+    @Test
+    public void testLWTInsertWithDisallowedConsistency2()
+    {
+        disableConsistencyLevels(SERIAL_CLS);
+        assertThatThrownBy(() -> lwtInsert(ConsistencyLevel.LOCAL_QUORUM, null))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+    }
+
+    @Test
+    public void testInsertWithAllowedConsistency()
+    {
+        // test that it does not throw
+        insert(ConsistencyLevel.LOCAL_QUORUM);
+
+        disableConsistencyLevels(SERIAL_ONLY);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+    }
+
+    @Test
+    public void testLWTUpdateWithDisallowedConsistency()
+    {
+        disableConsistencyLevels(SERIAL_ONLY);
+        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+    }
+
+    @Test
+    public void testLWTUpdateWithDisallowedConsistency1() // NOTE(review): body is identical to testLWTUpdateWithDisallowedConsistency
+    {
+        disableConsistencyLevels(SERIAL_ONLY);
+        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+    }
+
+    @Test
+    public void testLWTUpdateWithDisallowedConsistency2()
+    {
+        disableConsistencyLevels(SERIAL_CLS);
+        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, null))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+    }
+
+    @Test
+    public void testUpdateWithAllowedConsistency()
+    {
+        // test that it does not throw
+        update(ConsistencyLevel.LOCAL_QUORUM);
+
+        disableConsistencyLevels(SERIAL_ONLY);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+    }
+
+    @Test
+    public void testUpdateWithDisallowedConsistency()
+    {
+        assertThatThrownBy(() -> update(ConsistencyLevel.ONE))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+    }
+
+    @Test
+    public void testDeleteWithDisallowedConsistency()
+    {
+        assertThatThrownBy(() -> delete(ConsistencyLevel.ONE))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+    }
+
+    @Test
+    public void testLWTDeleteWithAllowedConsistency1() // NOTE(review): despite the name, this asserts that a disallowed serial CL is rejected
+    {
+        disableConsistencyLevels(SERIAL_ONLY);
+        assertThatThrownBy(() -> lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+    }
+
+    @Test
+    public void testLWTDeleteWithAllowedConsistency2() // NOTE(review): despite the name, this asserts that a disallowed serial CL is rejected
+    {
+        disableConsistencyLevels(SERIAL_CLS);
+        assertThatThrownBy(() -> lwtDelete(ConsistencyLevel.LOCAL_QUORUM, null))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+    }
+
+    @Test
+    public void testDeleteWithAllowedConsistency()
+    {
+        // test that it does not throw
+        delete(ConsistencyLevel.LOCAL_QUORUM);
+
+        disableConsistencyLevels(SERIAL_ONLY);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+    }
+
+    @Test
+    public void testLWTBatchWithDisallowedConsistency1()
+    {
+        disableConsistencyLevels(SERIAL_ONLY);
+        assertThatThrownBy(() -> lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+    }
+
+    @Test
+    public void testLWTBatchWithDisallowedConsistency2()
+    {
+        disableConsistencyLevels(SERIAL_CLS);
+        assertThatThrownBy(() -> lwtBatch(ConsistencyLevel.LOCAL_QUORUM, null))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+    }
+
+    @Test
+    public void testBatchWithAllowedConsistency()
+    {
+        // test that it does not throw
+        batch(ConsistencyLevel.LOCAL_QUORUM);
+
+        disableConsistencyLevels(SERIAL_ONLY);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+    }
+
+    @Test
+    public void testBatchWithDisallowedConsistency()
+    {
+        assertThatThrownBy(() -> batch(ConsistencyLevel.ONE))
+        .isInstanceOf(InvalidRequestException.class)
+        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+    }
+
+    private QueryOptions queryOptions(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        return QueryOptions.create(cl,
+                                   Collections.emptyList(),
+                                   false,
+                                   1,
+                                   null,
+                                   serialCl,
+                                   ProtocolVersion.CURRENT,
+                                   KEYSPACE);
+    }
+
+    private void update(ConsistencyLevel cl)
+    {
+        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2", cl, null);
+    }
+
+    private void lwtUpdate(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2 IF EXISTS", cl, serialCl);
+    }
+
+    private void delete(ConsistencyLevel cl)
+    {
+        executeWithConsistency("DELETE FROM %s WHERE k=1", cl, null);
+    }
+
+    private void lwtDelete(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("DELETE FROM %s WHERE k=1 AND c=2 IF EXISTS", cl, serialCl);
+    }
+
+    private void batch(ConsistencyLevel cl)
+    {
+        executeWithConsistency("BEGIN BATCH " +
+                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') " +
+                               "APPLY BATCH", cl, null);
+    }
+
+    private void lwtBatch(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("BEGIN BATCH " +
+                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') IF NOT EXISTS " +
+                               "APPLY BATCH", cl, serialCl);
+    }
+
+    @Test
+    public void testSuperUser()
+    {
+        queryState = this::superQueryState;
+        testExcludedUser();
+    }
+
+    @Test
+    public void testSystemUser()
+    {
+        queryState = this::internalQueryState;
+        testExcludedUser();
+    }
+
+    private void testExcludedUser() // runs with DISALLOWED_WRITE_CLS active (see setupTest); passes only if nothing below throws
+    {
+        insert(ConsistencyLevel.ONE);
+        insert(ConsistencyLevel.LOCAL_QUORUM);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        update(ConsistencyLevel.ONE);
+        update(ConsistencyLevel.LOCAL_QUORUM);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        delete(ConsistencyLevel.ONE);
+        delete(ConsistencyLevel.LOCAL_QUORUM);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+
+        batch(ConsistencyLevel.ONE);
+        batch(ConsistencyLevel.LOCAL_QUORUM);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+    }
+}
+
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
new file mode 100644
index 000000000000..ed1086645865
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
@@ -0,0 +1,461 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.net.UnknownHostException;
+import java.util.Arrays;
+import java.util.function.Consumer;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.BatchStatement;
+import com.datastax.driver.core.ConsistencyLevel;
+import com.datastax.driver.core.SimpleStatement;
+import com.datastax.driver.core.Statement;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.gms.ApplicationState;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.gms.VersionedValue;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster;
+import org.apache.cassandra.service.disk.usage.DiskUsageMonitor;
+import org.apache.cassandra.service.disk.usage.DiskUsageState;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.service.disk.usage.DiskUsageState.FULL;
+import static org.apache.cassandra.service.disk.usage.DiskUsageState.NOT_AVAILABLE;
+import static org.apache.cassandra.service.disk.usage.DiskUsageState.SPACIOUS;
+import static org.apache.cassandra.service.disk.usage.DiskUsageState.STUFFED;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class GuardrailDiskUsageTest extends GuardrailTester
+{
+    private static Integer defaultDiskUsagePercentageWarnThreshold; // captured in beforeClass(), restored in afterClass()
+    private static Integer defaultDiskUsagePercentageFailThreshold; // captured in beforeClass(), restored in afterClass()
+
+    @BeforeClass
+    public static void beforeClass()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        defaultDiskUsagePercentageWarnThreshold = guardrails.disk_usage_percentage_warn_threshold; // restored in afterClass()
+        defaultDiskUsagePercentageFailThreshold = guardrails.disk_usage_percentage_failure_threshold;
+        guardrails.disk_usage_percentage_warn_threshold = -1; // -1 is the value testConfigValidation treats as "disabled"
+        guardrails.disk_usage_percentage_failure_threshold = -1;
+    }
+
+    @AfterClass
+    public static void afterClass()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().disk_usage_percentage_warn_threshold = defaultDiskUsagePercentageWarnThreshold; // undo beforeClass() and any per-test changes
+        DatabaseDescriptor.getGuardrailsConfig().disk_usage_percentage_failure_threshold = defaultDiskUsagePercentageFailThreshold;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+
+        // a warn threshold of 0 is below the accepted range
+        guardrails.disk_usage_percentage_warn_threshold = 0;
+        guardrails.disk_usage_percentage_failure_threshold = 80;
+        assertConfigFails(guardrails::validateDiskUsageThreshold, "0 is not allowed");
+
+        // a failure threshold above 100 is beyond the accepted range
+        guardrails.disk_usage_percentage_warn_threshold = 1;
+        guardrails.disk_usage_percentage_failure_threshold = 110;
+        assertConfigFails(guardrails::validateDiskUsageThreshold, "maximum allowed value is 100");
+
+        // the warn threshold must not exceed the failure threshold
+        guardrails.disk_usage_percentage_warn_threshold = 60;
+        guardrails.disk_usage_percentage_failure_threshold = 50;
+        assertConfigFails(guardrails::validateDiskUsageThreshold, "60 for the disk_usage_percentage guardrail should be" +
+                                                                  " lower than the failure threshold 50");
+
+        // only the warn threshold disabled: validation passes
+        guardrails.disk_usage_percentage_warn_threshold = -1;
+        guardrails.disk_usage_percentage_failure_threshold = 100;
+        guardrails.validateDiskUsageThreshold();
+        assertTrue(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_warn_threshold));
+        assertFalse(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_failure_threshold));
+
+        // only the failure threshold disabled: validation passes
+        guardrails.disk_usage_percentage_warn_threshold = 20;
+        guardrails.disk_usage_percentage_failure_threshold = -1;
+        guardrails.validateDiskUsageThreshold();
+        assertFalse(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_warn_threshold));
+        assertTrue(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_failure_threshold));
+
+        // both thresholds disabled: validation passes (this is also the state beforeClass() installs)
+        guardrails.disk_usage_percentage_warn_threshold = -1;
+        guardrails.disk_usage_percentage_failure_threshold = -1;
+        guardrails.validateDiskUsageThreshold();
+        assertTrue(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_warn_threshold));
+        assertTrue(GuardrailsConfig.diskUsageGuardrailDisabled(guardrails.disk_usage_percentage_failure_threshold));
+    }
+
+    @Test
+    public void testDiskUsageState()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.disk_usage_percentage_warn_threshold = 50;
+        guardrails.disk_usage_percentage_failure_threshold = 90;
+        DiskUsageMonitor monitor = DiskUsageMonitor.instance;
+        // up to and including the warn threshold
+        assertEquals(SPACIOUS, monitor.getState(10));
+        assertEquals(SPACIOUS, monitor.getState(50));
+
+        // above the warn threshold, up to and including the failure threshold
+        assertEquals(STUFFED, monitor.getState(51));
+        assertEquals(STUFFED, monitor.getState(56));
+        assertEquals(STUFFED, monitor.getState(90));
+
+        // above the failure threshold
+        assertEquals(FULL, monitor.getState(91));
+        assertEquals(FULL, monitor.getState(100));
+    }
+
+    @Test
+    public void testDiskUsageDetectorWarnDisabled()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.disk_usage_percentage_warn_threshold = -1;
+        guardrails.disk_usage_percentage_failure_threshold = 90;
+        DiskUsageMonitor monitor = DiskUsageMonitor.instance;
+        // with the warn threshold disabled, everything up to the failure threshold is SPACIOUS
+        assertEquals(SPACIOUS, monitor.getState(0));
+        assertEquals(SPACIOUS, monitor.getState(50));
+        assertEquals(SPACIOUS, monitor.getState(90));
+
+        // above the failure threshold
+        assertEquals(FULL, monitor.getState(91));
+        assertEquals(FULL, monitor.getState(100));
+    }
+
+    @Test
+    public void testDiskUsageDetectorFailDisabled()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.disk_usage_percentage_warn_threshold = 50;
+        guardrails.disk_usage_percentage_failure_threshold = -1;
+        DiskUsageMonitor monitor = DiskUsageMonitor.instance;
+        // at the warn threshold
+        assertEquals(SPACIOUS, monitor.getState(50));
+
+        // with the failure threshold disabled, everything above the warn threshold is STUFFED
+        assertEquals(STUFFED, monitor.getState(51));
+        assertEquals(STUFFED, monitor.getState(80));
+        assertEquals(STUFFED, monitor.getState(100));
+    }
+
+    @Test
+    public void testDiskUsageGuardrailDisabled()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.disk_usage_percentage_warn_threshold = -1;
+        guardrails.disk_usage_percentage_failure_threshold = -1;
+        DiskUsageMonitor monitor = DiskUsageMonitor.instance; // both thresholds disabled: every usage maps to NOT_AVAILABLE
+        assertEquals(NOT_AVAILABLE, monitor.getState(0));
+        assertEquals(NOT_AVAILABLE, monitor.getState(60));
+        assertEquals(NOT_AVAILABLE, monitor.getState(100));
+    }
+
+    @Test
+    public void testMemtableSizeIncluded() throws Throwable
+    {
+        DiskUsageMonitor monitor = new DiskUsageMonitor();
+
+        createTable(keyspace(), "CREATE TABLE %s (key text primary key, value text) with compression = { 'enabled' : false };");
+
+        long memtableSizeBefore = monitor.getAllMemtableSize();
+        int rows = 10;
+        int mb = 1024 * 1024;
+
+        for (int i = 0; i < rows; i++)
+        {
+            char[] chars = new char[mb]; // one value of 1M characters per row
+            Arrays.fill(chars, (char) i);
+            String value = String.copyValueOf(chars);
+            execute("INSERT INTO %s (key, value) VALUES(?, ?)", String.valueOf(i), value); // key column is text, so bind a String
+        }
+
+        // verify memtables are included
+        long memtableSizeAfterInsert = monitor.getAllMemtableSize();
+        assertTrue("Expect at least 10MB more data, but got before: " + memtableSizeBefore + " and after: " + memtableSizeAfterInsert,
+                   memtableSizeAfterInsert - memtableSizeBefore >= rows * mb);
+
+        // verify the memtable size goes back to its initial value (within a 1MB tolerance) after a flush
+        flush();
+        long memtableSizeAfterFlush = monitor.getAllMemtableSize();
+        assertEquals(memtableSizeBefore, memtableSizeAfterFlush, mb);
+    }
+
+    @Test
+    public void testMonitorLogsOnStateChange()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.disk_usage_percentage_warn_threshold = 50;
+        guardrails.disk_usage_percentage_failure_threshold = 90;
+
+        Guardrails.localDiskUsage.resetLastNotifyTime();
+
+        DiskUsageMonitor diskMonitor = new DiskUsageMonitor();
+
+        // move to SPACIOUS: nothing is logged
+        assertMonitorStateTransition(0.50, SPACIOUS, diskMonitor);
+
+        // move to STUFFED: a warning is expected
+        assertMonitorStateTransition(0.50001, STUFFED, diskMonitor, true, "Local disk usage 51%(Stuffed) exceeds warn threshold of 50%");
+
+        // stay STUFFED: nothing is logged because of the min log interval
+        assertMonitorStateTransition(0.90, STUFFED, diskMonitor);
+
+        // move to FULL: a failure is expected
+        assertMonitorStateTransition(0.90001, FULL, diskMonitor, false, "Local disk usage 91%(Full) exceeds failure threshold of 90%, will stop accepting writes");
+
+        // stay FULL: nothing is logged because of the min log interval
+        assertMonitorStateTransition(0.99, FULL, diskMonitor);
+
+        // back to STUFFED: no warning because of the min log interval
+        assertMonitorStateTransition(0.90, STUFFED, diskMonitor);
+
+        // back to FULL: nothing is logged because of the min log interval
+        assertMonitorStateTransition(0.900001, FULL, diskMonitor);
+
+        // back to STUFFED: nothing is logged because of the min log interval
+        assertMonitorStateTransition(0.90, STUFFED, diskMonitor);
+
+        // move to SPACIOUS: nothing is logged
+        assertMonitorStateTransition(0.50, SPACIOUS, diskMonitor);
+    }
+
+    @Test
+    public void testDiskUsageBroadcaster() throws UnknownHostException
+    {
+        DiskUsageBroadcaster broadcaster = new DiskUsageBroadcaster(null);
+        Gossiper.instance.unregister(broadcaster); // the test feeds onChange/onRemove by hand below
+
+        InetAddressAndPort node1 = InetAddressAndPort.getByName("127.0.0.1");
+        InetAddressAndPort node2 = InetAddressAndPort.getByName("127.0.0.2");
+        InetAddressAndPort node3 = InetAddressAndPort.getByName("127.0.0.3");
+
+        // before any state is received, no node is reported as stuffed or full
+        assertFalse(broadcaster.hasStuffedOrFullNode());
+        assertFalse(broadcaster.isFull(node1));
+        assertFalse(broadcaster.isFull(node2));
+        assertFalse(broadcaster.isFull(node3));
+
+        // adding 1st node: SPACIOUS, cluster has no stuffed or full node
+        broadcaster.onChange(node1, ApplicationState.DISK_USAGE, value(SPACIOUS));
+        assertFalse(broadcaster.hasStuffedOrFullNode());
+        assertFalse(broadcaster.isFull(node1));
+
+        // adding 2nd node with the wrong ApplicationState: the FULL value must be ignored
+        broadcaster.onChange(node2, ApplicationState.RACK, value(FULL));
+        assertFalse(broadcaster.hasStuffedOrFullNode());
+        assertFalse(broadcaster.isFull(node2));
+
+        // adding 2nd node: STUFFED
+        broadcaster.onChange(node2, ApplicationState.DISK_USAGE, value(STUFFED));
+        assertTrue(broadcaster.hasStuffedOrFullNode());
+        assertTrue(broadcaster.isStuffed(node2));
+
+        // adding 3rd node: FULL
+        broadcaster.onChange(node3, ApplicationState.DISK_USAGE, value(FULL));
+        assertTrue(broadcaster.hasStuffedOrFullNode());
+        assertTrue(broadcaster.isFull(node3));
+
+        // remove 2nd node, cluster still has the FULL 3rd node
+        broadcaster.onRemove(node2);
+        assertTrue(broadcaster.hasStuffedOrFullNode());
+        assertFalse(broadcaster.isStuffed(node2));
+
+        // remove 3rd node, cluster has no stuffed or full node left
+        broadcaster.onRemove(node3);
+        assertFalse(broadcaster.hasStuffedOrFullNode());
+        assertFalse(broadcaster.isFull(node3));
+    }
+
+    @Test
+    public void testWriteRequests() throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s (key int primary key, value int)");
+
+        InetAddressAndPort local = FBUtilities.getBroadcastAddressAndPort();
+        InetAddressAndPort node1 = InetAddressAndPort.getByName("127.0.0.11");
+        InetAddressAndPort node2 = InetAddressAndPort.getByName("127.0.0.21");
+        InetAddressAndPort node3 = InetAddressAndPort.getByName("127.0.0.31");
+
+        // avoid noise due to the real disk usage of test machines
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        GuardrailsConfig config = DatabaseDescriptor.getGuardrailsConfig();
+        config.disk_usage_percentage_warn_threshold = 98;
+        config.disk_usage_percentage_failure_threshold = 99;
+
+        String warnMessage = "Replica disk usage exceeds warn threshold";
+        String errorMessage = "Write request failed because disk usage exceeds failure threshold";
+
+        CheckedFunction select = () -> {
+            Statement statement = new SimpleStatement("SELECT * FROM " + keyspace() + "." + table);
+            statement.setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM);
+            executeNet(statement);
+        };
+        CheckedFunction insert = () -> {
+            Statement statement = new SimpleStatement("INSERT INTO " + keyspace() + "." + table + " (key, value) VALUES(0, 0)");
+            statement.setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM);
+            executeNet(statement);
+        };
+        CheckedFunction batch = () -> {
+            BatchStatement batchStatement = new BatchStatement();
+            batchStatement.add(new SimpleStatement("INSERT INTO " + keyspace() + "." + table + " (key, value) VALUES(1, 1)"));
+            batchStatement.add(new SimpleStatement("INSERT INTO " + keyspace() + "." + table + " (key, value) VALUES(2, 2)"));
+            batchStatement.setConsistencyLevel(ConsistencyLevel.LOCAL_QUORUM);
+            executeNet(batchStatement);
+        };
+
+        // default state, read and write requests work fine
+        assertTrue(Guardrails.enabled());
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+
+        // verify node1 NOT_AVAILABLE won't affect writes
+        DiskUsageBroadcaster.instance.onChange(node1, ApplicationState.DISK_USAGE, value(NOT_AVAILABLE));
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+
+        // verify node2 SPACIOUS won't affect writes
+        DiskUsageBroadcaster.instance.onChange(node2, ApplicationState.DISK_USAGE, value(SPACIOUS));
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+
+        // verify node3 STUFFED won't trigger a warning as it's not a write replica
+        DiskUsageBroadcaster.instance.onChange(node3, ApplicationState.DISK_USAGE, value(STUFFED));
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+
+        // verify node3 FULL won't affect writes as it's not a write replica
+        DiskUsageBroadcaster.instance.onChange(node3, ApplicationState.DISK_USAGE, value(FULL));
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+
+        // verify local node STUFFED: reads are unaffected, writes log a warning
+        DiskUsageBroadcaster.instance.onChange(local, ApplicationState.DISK_USAGE, value(STUFFED));
+        assertValid(select);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertWarns(insert, warnMessage);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertWarns(batch, warnMessage);
+
+        // verify local node FULL: reads are unaffected, writes are rejected
+        DiskUsageBroadcaster.instance.onChange(local, ApplicationState.DISK_USAGE, value(FULL));
+        assertValid(select);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertFails(insert, errorMessage);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertFails(batch, errorMessage);
+
+        // super user can still write while the local node is FULL
+        useSuperUser();
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertValid(select);
+        assertValid(insert);
+        assertValid(batch);
+        useUser(USERNAME, PASSWORD); // back to the regular user for the remaining checks
+
+        // verify local node back to STUFFED won't reject writes, only warn
+        DiskUsageBroadcaster.instance.onChange(local, ApplicationState.DISK_USAGE, value(STUFFED));
+        assertValid(select);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertWarns(insert, warnMessage);
+        Guardrails.replicaDiskUsage.resetLastNotifyTime();
+        assertWarns(batch, warnMessage);
+    }
+
+    private VersionedValue value(DiskUsageState state)
+    {
+        String stateName = state.name(); // the versioned value is built from the enum constant name
+        return StorageService.instance.valueFactory.diskUsage(stateName);
+    }
+
+    /** Applies {@code usageRatio} and expects {@code state} afterwards, with neither a warning nor a failure logged. */
+    private void assertMonitorStateTransition(double usageRatio, DiskUsageState state, DiskUsageMonitor monitor)
+    {
+        assertMonitorStateTransition(usageRatio, state, monitor, false, null);
+    }
+
+    /**
+     * Applies {@code usageRatio}, expects {@code state}, and expects {@code msg} as a warning ({@code isWarn}) or a failure; null means no logging.
+     */
+    private void assertMonitorStateTransition(double usageRatio, DiskUsageState state, DiskUsageMonitor monitor,
+                                              boolean isWarn, String msg)
+    {
+        DiskUsageState previous = monitor.state();
+        Consumer<DiskUsageState> notifier = notified -> {
+            if (previous == state) // a notification is only legitimate when the state really changes
+                fail("Expect no notification if state remains the same");
+            assertEquals(state, notified);
+        };
+        monitor.updateLocalState(usageRatio, notifier);
+        assertEquals(state, monitor.state());
+
+        if (msg != null && isWarn)
+        {
+            listener.assertWarned(msg);
+            listener.assertNotFailed();
+        }
+        else
+        {
+            if (msg == null)
+                listener.assertNotFailed();
+            else
+                listener.assertFailed(msg);
+            listener.assertNotWarned();
+        }
+
+        listener.clear();
+    }
+
+
+    protected void assertConfigFails(Runnable runnable, String message)
+    {
+        try
+        {
+            runnable.run(); // expected to throw ConfigurationException; any other exception propagates to the caller
+            fail("Expected failure");
+        }
+        catch (ConfigurationException e)
+        {
+            String actualMessage = e.getMessage(); // may be null when the exception carries no message
+            assertTrue(String.format("Failure message '%s' does not contain expected message '%s'", actualMessage, message),
+                       actualMessage != null && actualMessage.contains(message)); // null-safe: assertion failure, not an NPE
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailFieldsPerUDTTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailFieldsPerUDTTest.java
new file mode 100644
index 000000000000..4e93bda1c78e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailFieldsPerUDTTest.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+import static java.lang.String.format;
+
+/**
+ * Tests the guardrail for max number of fields in a UDT.
+ */
+public class GuardrailFieldsPerUDTTest extends GuardrailTester
+{
+    private static final long FIELDS_PER_UDT_THRESHOLD = 2;
+
+    private long savedThreshold; // value found in the config by before(), put back by after()
+
+    @Before
+    public void before()
+    {
+        savedThreshold = config().fields_per_udt_failure_threshold;
+        config().fields_per_udt_failure_threshold = FIELDS_PER_UDT_THRESHOLD;
+    }
+
+    @After
+    public void after()
+    {
+        config().fields_per_udt_failure_threshold = savedThreshold;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.fields_per_udt_failure_threshold = v,
+                                                 "fields_per_udt_failure_threshold");
+    }
+
+    @Test
+    public void testCreateType() throws Throwable
+    {
+        assertValid("CREATE TYPE %s (a int)");
+        assertValid("CREATE TYPE %s (a int, b int)"); // exactly at the threshold of 2 fields
+        assertFails("CREATE TYPE %s (a int, b int, c int)", 3);
+        assertFails("CREATE TYPE %s (a int, b int, c int, d int)", 4);
+    }
+
+    @Test
+    public void testAlterTypeAddField() throws Throwable
+    {
+        String type = createType("CREATE TYPE %s (a int)");
+
+        assertValid("ALTER TYPE %s ADD b int", type);
+        assertFails("ALTER TYPE %s ADD c int", type, 3);
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        String type = createTypeName();
+        String create = format("CREATE TYPE %s (a int, b int, c int)", type);
+        String alter = format("ALTER TYPE %s ADD d int", type);
+        testExcludedUsers(create, alter, format("DROP TYPE %s", type));
+    }
+
+    private void assertValid(String query) throws Throwable
+    {
+        assertValid(query, createTypeName());
+    }
+
+    private void assertValid(String query, String typeName) throws Throwable
+    {
+        super.assertValid(format(query, typeName));
+    }
+
+    private void assertFails(String query, int numFields) throws Throwable
+    {
+        // run the statement against a freshly generated type name
+        assertFails(query, createTypeName(), numFields);
+    }
+
+    private void assertFails(String query, String typeName, int numFields) throws Throwable
+    {
+        String statement = format(query, typeName);
+        String expectedError = format("User types cannot have more than %s columns, but %s provided for type %s",
+                                      DatabaseDescriptor.getGuardrailsConfig().fields_per_udt_failure_threshold,
+                                      numFields, typeName);
+        assertFails(expectedError, statement);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailInSelectTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailInSelectTest.java
new file mode 100644
index 000000000000..7eae95ab4c2d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailInSelectTest.java
@@ -0,0 +1,192 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Arrays;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.assertj.core.api.Assertions;
+
+public class GuardrailInSelectTest extends GuardrailTester
+{
+    private static final int CARTESIAN_PRODUCT_LIMIT = 25;
+    private static final int PARTITION_KEYS_LIMIT = 500;
+    private static final String CARTESIAN_PRODUCT_ERROR = "The query cannot be completed because cartesian product of all values in IN conditions is greater than " + CARTESIAN_PRODUCT_LIMIT;
+
+    // thresholds found in the config before this class ran; tearDown() puts them back
+    private static int savedCartesianProductLimit;
+    private static int savedPartitionKeysLimit;
+
+    @BeforeClass
+    public static void setup()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+
+        savedCartesianProductLimit = guardrails.in_select_cartesian_product_failure_threshold;
+        guardrails.in_select_cartesian_product_failure_threshold = CARTESIAN_PRODUCT_LIMIT;
+
+        savedPartitionKeysLimit = guardrails.partition_keys_in_select_failure_threshold;
+        guardrails.partition_keys_in_select_failure_threshold = PARTITION_KEYS_LIMIT;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        GuardrailsConfig guardrails = DatabaseDescriptor.getGuardrailsConfig();
+        guardrails.in_select_cartesian_product_failure_threshold = savedCartesianProductLimit;
+        guardrails.partition_keys_in_select_failure_threshold = savedPartitionKeysLimit;
+    }
+
+    @Before
+    public void initSchema()
+    {
+        createTable("CREATE TABLE %s (pk1 int, pk2 int, ck1 int, ck2 int, PRIMARY KEY((pk1, pk2), ck1, ck2))");
+    }
+
+    @Test
+    public void testPkCartesianProduct() throws Throwable
+    {
+        // partition key products at or below the limit
+        testPkCartesianProduct(5, 5);
+        testPkCartesianProduct(2, 12);
+        testPkCartesianProduct(8, 3);
+
+        // partition key products above the limit
+        testPkCartesianProduct(1, 26);
+        testPkCartesianProduct(5, 6);
+        testPkCartesianProduct(26, 1);
+
+        // above the limit, but issued as super user
+        useSuperUser();
+        testPkCartesianProduct(26, 1);
+    }
+
+    @Test
+    public void testCkCartesianProduct() throws Throwable
+    {
+        // clustering products at or below the limit
+        testCkCartesianProduct(3, 8);
+        testCkCartesianProduct(5, 5);
+
+        // clustering products above the limit
+        testCkCartesianProduct(1, 26);
+        testCkCartesianProduct(5, 6);
+        testCkCartesianProduct(6, 5);
+        testCkCartesianProduct(26, 1);
+
+        // above the limit, but issued as super user
+        useSuperUser();
+        testCkCartesianProduct(26, 1);
+    }
+
+    @Test
+    public void testPkCkCartesianProduct() throws Throwable
+    {
+        // both products at or below the limit
+        testCartesianProduct(1, 10, 1, 10);
+        testCartesianProduct(10, 1, 10, 1);
+        testCartesianProduct(5, 5, 5, 5);
+
+        // one of the two products above the limit
+        testCartesianProduct(5, 6, 5, 5);
+        testCartesianProduct(6, 5, 5, 5);
+        testCartesianProduct(5, 5, 6, 5);
+        testCartesianProduct(5, 5, 5, 6);
+
+        // above the limit, but issued as super user
+        useSuperUser();
+        testCartesianProduct(5, 5, 5, 6);
+    }
+
+    /** Varies only the partition key IN terms; each clustering column gets a single term. */
+    private void testPkCartesianProduct(int pk1Terms, int pk2Terms) throws Throwable
+    {
+        testCartesianProduct(pk1Terms, pk2Terms, 1, 1);
+    }
+
+    /** Varies only the clustering IN terms; each partition key column gets a single term. */
+    private void testCkCartesianProduct(int ck1Terms, int ck2Terms) throws Throwable
+    {
+        testCartesianProduct(1, 1, ck1Terms, ck2Terms);
+    }
+
+    /**
+     * Runs the IN query with the given number of terms per column, once with literals and once with bind markers.
+     */
+    private void testCartesianProduct(int pk1, int pk2, int ck1, int ck2) throws Throwable
+    {
+        String template = "SELECT * FROM %%s WHERE pk1 in (%s) AND pk2 in (%s) AND ck1 in (%s) AND ck2 in (%s);";
+        String literalQuery = String.format(template, terms(pk1), terms(pk2), terms(ck1), terms(ck2));
+        String boundQuery = String.format(template, markers(pk1), markers(pk2), markers(ck1), markers(ck2));
+        Object[] bound = bindValues(pk1, pk2, ck1, ck2);
+
+        // success is expected when neither product exceeds the limit, or when running as super user
+        if (Math.max(pk1 * pk2, ck1 * ck2) <= CARTESIAN_PRODUCT_LIMIT || isSuperUser())
+        {
+            executeNet(literalQuery);
+            executeNet(boundQuery, bound);
+            return;
+        }
+
+        Assertions.assertThatThrownBy(() -> executeNet(literalQuery))
+                  .hasMessage(CARTESIAN_PRODUCT_ERROR);
+        Assertions.assertThatThrownBy(() -> executeNet(boundQuery, bound))
+                  .hasMessage(CARTESIAN_PRODUCT_ERROR);
+    }
+
+    @Test
+    public void testPkCartesianProductMultiColumnBelowThreshold() throws Throwable
+    {
+        String tuples = IntStream.range(0, 5).mapToObj(i -> String.format("(%d, %d)", i, i + 1)).collect(Collectors.joining(", "));
+        String select = "SELECT * FROM %s WHERE (pk1, pk2) in (" + tuples + ")";
+        assertInvalidMessage("Multi-column relations can only be applied to clustering columns but was applied to: pk1", select);
+    }
+
+    /** Builds the comma-separated literals {@code 0, 1, ..., count - 1}. */
+    private static String terms(int count)
+    {
+        assert count > 0;
+        return IntStream.range(0, count).mapToObj(i -> Integer.toString(i)).collect(Collectors.joining(", "));
+    }
+
+    private static Object[] bindValues(int... termCounts)
+    {
+        // for every count c, in argument order, contributes the boxed values 0..c-1
+        return Arrays.stream(termCounts)
+                     .flatMap(count -> IntStream.range(0, count))
+                     .boxed()
+                     .toArray();
+    }
+
+    /** Builds {@code count} comma-separated bind markers. */
+    private static String markers(int count)
+    {
+        assert count > 0;
+        return IntStream.range(0, count).mapToObj(i -> "?").collect(Collectors.joining(", "));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailItemsPerCollectionTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailItemsPerCollectionTest.java
new file mode 100644
index 000000000000..99495d22646e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailItemsPerCollectionTest.java
@@ -0,0 +1,449 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.stream.Collector;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static java.lang.String.format;
+
+/**
+ * Tests the guardrail for the max number of items in collections.
+ */
+public class GuardrailItemsPerCollectionTest extends GuardrailWarningOnSSTableWriteTester
+{
+    private static final int THRESHOLD = 4; // installed as items_per_collection_warn_threshold by before()
+    private static final String SSTABLE_WRITE_WARN_MESSAGE = String.format(
+    "Detected collection <redacted> with %d items", THRESHOLD + 1); // message for a collection one item over the threshold
+    // configuration values captured by before() and restored by after()
+    private long defaultItemsPerCollection;
+    private boolean defaultReadBeforeWriteListOperationsEnabled;
+
+    @Before
+    public void before()
+    {
+        // remember the current settings first so that after() can restore them
+        defaultItemsPerCollection = config().items_per_collection_warn_threshold;
+        defaultReadBeforeWriteListOperationsEnabled = config().read_before_write_list_operations_enabled;
+        config().items_per_collection_warn_threshold = (long) THRESHOLD;
+        config().read_before_write_list_operations_enabled = true;
+    }
+
+    @After
+    public void after()
+    {
+        config().read_before_write_list_operations_enabled = defaultReadBeforeWriteListOperationsEnabled; // undo before()
+        config().items_per_collection_warn_threshold = defaultItemsPerCollection;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.items_per_collection_warn_threshold = v, // setter for the property under test
+                                                 "items_per_collection_warn_threshold"); // name of that property
+    }
+
+    @Test
+    public void testSetSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        // NOTE(review): set(n) is assumed to build a set of n items; the helper is not visible here
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(0));
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", set(THRESHOLD)); // at the threshold: still no warning expected
+        assertNotWarnedOnFlush();
+
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (5, ?)", set(THRESHOLD + 1)); // over the threshold: client warning
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenSetSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<set<text>>)");
+        // same client-side expectations as for the non-frozen set
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(0));
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", set(THRESHOLD));
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (5, ?)", set(THRESHOLD + 1));
+
+        // the size of frozen collections is not checked during sstable write
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testSetSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        // k = 0: insert plus append are expected to stay within the threshold
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", set(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", set(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        // k = 0: an oversized insert warns on the client, but an element is removed before the flush
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (0, ?)", set(THRESHOLD + 1));
+        assertValid("UPDATE %s SET v = v - ? WHERE k = 0", set(1));
+        assertNotWarnedOnFlush();
+        // k = 1: neither write warns on the client, yet a warning is expected on flush
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", set(1, THRESHOLD + 1));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testSetSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+        disableCompaction();
+        // k = 0: the writes are flushed one by one; no warning is expected on flush or on compaction
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", set(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", set(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 1: a threshold-sized set minus one element; bound as a set to match the set<text> column
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v - ? WHERE k = 1", set(1));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 1 again: neither flush warns, but a warning is expected on compaction
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", set(THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", set(1, THRESHOLD + 1));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+        // deleting the whole collection must not warn on the next compaction
+        assertValid("DELETE v FROM %s WHERE k = 1");
+        assertNotWarnedOnCompact();
+    }
+
+    @Test
+    public void testListSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+        // NOTE(review): list(n) is assumed to build a list of n items; the helper is not visible here
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", list(THRESHOLD - 1));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", list(THRESHOLD)); // at the threshold: still no warning expected
+        assertNotWarnedOnFlush();
+
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (5, ?)", list(THRESHOLD + 1)); // over the threshold: client warning
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenListSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<list<text>>)");
+        // same client-side expectations as for the non-frozen list
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", list(THRESHOLD - 1));
+        assertValid("INSERT INTO %s (k, v) VALUES (4, ?)", list(THRESHOLD));
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (5, ?)", list(THRESHOLD + 1));
+
+        // the size of frozen collections is not checked during sstable write
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testListSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+        disableCompaction();
+        // k = 0: insert plus append are expected to stay within the threshold
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", list(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", list(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        // k = 0: oversized insert (bound as a list to match the list<text> column), then one element is removed by index
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (0, ?)", list(THRESHOLD + 1));
+        assertValid("UPDATE %s SET v[?] = null WHERE k = 0", THRESHOLD);
+        assertNotWarnedOnFlush();
+        // k = 1: appending past the threshold is expected to warn on flush only
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 1", list(1, THRESHOLD + 1));
+        assertWarnedOnFlush();
+        // k = 2: same expectation when prepending instead of appending
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(1));
+        assertValid("UPDATE %s SET v = ? + v WHERE k = 2", list(1, THRESHOLD + 1));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testListSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v list<text>)");
+        disableCompaction();
+        // k = 0: insert plus append, no warning expected on flush or on compaction
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", list(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", list(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 1: threshold-sized list with its first element removed, no warning expected
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", list(THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v[?] = null WHERE k = 1", 0);
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 2: neither flush warns, but a warning is expected on compaction
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", list(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 2", list(1, THRESHOLD + 1));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+        // k = 2: after deleting one element, the next compaction must not warn
+        assertValid("DELETE v[1] FROM %s WHERE k = 2");
+        assertNotWarnedOnCompact();
+        // k = 3: same as k = 2, but prepending instead of appending
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", list(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = ? + v WHERE k = 3", list(1, THRESHOLD + 1));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+    }
+
+    @Test
+    public void testMapSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        // NOTE(review): map(n) is assumed to build a map of n entries; the helper is not visible here
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(THRESHOLD)); // at the threshold: still no warning expected
+        assertNotWarnedOnFlush();
+
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (4, ?)", map(THRESHOLD + 1)); // over the threshold: client warning
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testFrozenMapSize() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen<map<text, text>>)");
+        // same client-side expectations as for the non-frozen map
+        assertValid("INSERT INTO %s (k, v) VALUES (0, null)");
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map());
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(1));
+        assertValid("INSERT INTO %s (k, v) VALUES (3, ?)", map(THRESHOLD));
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (4, ?)", map(THRESHOLD + 1));
+
+        // the size of frozen collections is not checked during sstable write
+        assertNotWarnedOnFlush();
+    }
+
+    @Test
+    public void testMapSizeWithUpdates() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        // k = 0: 1 entry plus THRESHOLD - 1 added entries gives THRESHOLD in total, so the flush must not warn
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", map(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", map(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        // k = 1: THRESHOLD + 1 entries warn the client, but key 0 is then removed, so the flush must not warn
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (1, ?)", map(THRESHOLD + 1));
+        assertValid("UPDATE %s SET v = v - ? WHERE k = 1", set(1));
+        assertNotWarnedOnFlush();
+        // k = 2: two writes that are each valid add up to THRESHOLD + 1 entries, so the flush must warn
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(1));
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 2", map(1, THRESHOLD + 1));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testMapSizeAfterCompaction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v map<text, text>)");
+        disableCompaction();
+        // k = 0: writes of 1 and THRESHOLD - 1 entries (THRESHOLD in total); no flush or compaction warning
+        assertValid("INSERT INTO %s (k, v) VALUES (0, ?)", map(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 0", map(1, THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 1: THRESHOLD entries, then key 0 is removed; no flush or compaction warning
+        assertValid("INSERT INTO %s (k, v) VALUES (1, ?)", map(THRESHOLD));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v - ? WHERE k = 1", set(1));
+        assertNotWarnedOnFlush();
+        assertNotWarnedOnCompact();
+        // k = 2: writes of 1 and THRESHOLD entries pass each flush check, but the compaction must warn
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", map(1));
+        assertNotWarnedOnFlush();
+        assertValid("UPDATE %s SET v = v + ? WHERE k = 2", map(1, THRESHOLD + 1));
+        assertNotWarnedOnFlush();
+        assertWarnedOnCompact();
+        // after deleting the whole map of k = 2, the next compaction must not warn
+        assertValid("DELETE v FROM %s WHERE k = 2");
+        assertNotWarnedOnCompact();
+    }
+
+    @Test
+    public void testMultipleCollections() throws Throwable
+    {
+        createTable("CREATE TABLE %s (" +
+                    "   k int PRIMARY KEY, " +
+                    "   s set<text>," +
+                    "   l list<text>," +
+                    "   m map<text, text>," +
+                    "   fs frozen<set<text>>," +
+                    "   fl frozen<list<text>>," +
+                    "   fm frozen<map<text, text>>" +
+                    ")");
+
+        // the guardrail applies per collection: the combined size of a row's collections may exceed the threshold
+        assertValid("INSERT INTO %s (k, s, l, m, fs, fl, fm) VALUES (0, ?, ?, ?, ?, ?, ?)",
+                    set(THRESHOLD), list(THRESHOLD), map(THRESHOLD),
+                    set(THRESHOLD), list(THRESHOLD), map(THRESHOLD));
+        assertNotWarnedOnFlush();
+
+        // the guardrail will produce a log message for each column exceeding the threshold, not just for the first one
+        assertWarns(Arrays.asList(format("Detected collection s with %d items", THRESHOLD + 1),
+                                  format("Detected collection l with %d items", THRESHOLD + 2),
+                                  format("Detected collection m with %d items", THRESHOLD + 3),
+                                  format("Detected collection fs with %d items", THRESHOLD + 4),
+                                  format("Detected collection fl with %d items", THRESHOLD + 5),
+                                  format("Detected collection fm with %d items", THRESHOLD + 6)),
+                    "INSERT INTO %s (k, s, l, m, fs, fl, fm) VALUES (1, ?, ?, ?, ?, ?, ?)",
+                    set(THRESHOLD + 1), list(THRESHOLD + 2), map(THRESHOLD + 3),
+                    set(THRESHOLD + 4), list(THRESHOLD + 5), map(THRESHOLD + 6));
+
+        // only the non-frozen collections (s, l, m) are expected to produce a warning during sstable write
+        assertWarnedOnSSTableWrite(false,
+                                   format("Detected collection <redacted> with %d items", THRESHOLD + 1),
+                                   format("Detected collection <redacted> with %d items", THRESHOLD + 2),
+                                   format("Detected collection <redacted> with %d items", THRESHOLD + 3));
+    }
+
+    @Test
+    public void testCompositePartitionKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k1 int, k2 text, v set<text>, PRIMARY KEY((k1, k2)))");
+        // set-size checks on a table with a composite partition key (k1, k2): 1 and THRESHOLD items must pass
+        assertValid("INSERT INTO %s (k1, k2, v) VALUES (0, 'a', ?)", set(1));
+        assertNotWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k1, k2, v) VALUES (1, 'a', ?)", set(THRESHOLD));
+        assertNotWarnedOnFlush();
+        // THRESHOLD + 1 items must warn the client and must warn again on flush
+        assertWarnedOnClient("INSERT INTO %s (k1, k2, v) VALUES (2, 'c', ?)", set(THRESHOLD + 1));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testCompositeClusteringKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int, c1 int, c2 text, v set<text>, PRIMARY KEY(k, c1, c2))");
+        // set-size checks on a table with a composite clustering key (c1, c2): 1 and THRESHOLD items must pass
+        assertValid("INSERT INTO %s (k, c1, c2, v) VALUES (1, 10, 'a', ?)", set(1));
+        assertNotWarnedOnFlush();
+
+        assertValid("INSERT INTO %s (k, c1, c2, v) VALUES (2, 20, 'b', ?)", set(THRESHOLD));
+        assertNotWarnedOnFlush();
+        // THRESHOLD + 1 items must warn the client and must warn again on flush
+        assertWarnedOnClient("INSERT INTO %s (k, c1, c2, v) VALUES (3, 30, 'c', ?)", set(THRESHOLD + 1));
+        assertWarnedOnFlush();
+    }
+
+    @Test
+    public void testSuperUser() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, v set<text>)");
+
+        // a regular user should be warned
+        assertWarnedOnClient("INSERT INTO %s (k, v) VALUES (1, ?)", set(THRESHOLD + 1));
+
+        // a super user shouldn't be warned
+        useSuperUser();
+        assertValid("INSERT INTO %s (k, v) VALUES (2, ?)", set(THRESHOLD + 1));
+
+        // both rows should produce sstable write warnings because the keyspace is not internal, regardless of the user
+        assertWarnedOnSSTableWrite(false,
+                                   format("Detected collection <redacted> with %d items", THRESHOLD + 1),
+                                   format("Detected collection <redacted> with %d items", THRESHOLD + 1));
+    }
+
+    private static Set<Integer> set(int numElements)
+    {
+        return collection(0, numElements, Collectors.toSet()); // the integers in [0, numElements)
+    }
+
+    private static Set<Integer> set(int startInclusive, int endExclusive)
+    {
+        return collection(startInclusive, endExclusive, Collectors.<Integer>toSet()); // set of the range's integers
+    }
+
+    private static List<Integer> list(int numElements)
+    {
+        return collection(0, numElements, Collectors.toList()); // the integers in [0, numElements), ascending
+    }
+
+    private static List<Integer> list(int startInclusive, int endExclusive)
+    {
+        return collection(startInclusive, endExclusive, Collectors.<Integer>toList()); // list of the range's integers
+    }
+
+    private static Map<Integer, Integer> map(int numElements)
+    {
+        return collection(0, numElements, Collectors.toMap(x -> x, x -> x)); // i -> i for i in [0, numElements)
+    }
+
+    private static Map<Integer, Integer> map(int startInclusive, int endExclusive)
+    {
+        return collection(startInclusive, endExclusive, Collectors.toMap(key -> key, value -> value)); // i -> i
+    }
+
+    private static <R, A> R collection(int startInclusive, int endExclusive, Collector<Integer, A, R> collector)
+    {
+        return IntStream.range(startInclusive, endExclusive).mapToObj(Integer::valueOf).collect(collector); // boxed range
+    }
+
+    private void assertWarnedOnClient(String query, Object... args) throws Throwable
+    {
+        // the expected client warning reports column v holding THRESHOLD + 1 items
+        assertWarns(Collections.singletonList(format("Detected collection v with %d items", THRESHOLD + 1)), query, args);
+    }
+
+    private void assertWarnedOnFlush()
+    {
+        assertWarnedOnFlush(SSTABLE_WRITE_WARN_MESSAGE); // message-taking overload, SSTABLE_WRITE_WARN_MESSAGE expected
+    }
+
+    private void assertWarnedOnCompact()
+    {
+        assertWarnedOnCompact(SSTABLE_WRITE_WARN_MESSAGE); // message-taking overload, SSTABLE_WRITE_WARN_MESSAGE expected
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
new file mode 100644
index 000000000000..10de77224ecf
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Collections;
+import java.util.Set;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+import static java.lang.String.format;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+
+public class GuardrailMaterializedViewsPerTableTest extends GuardrailTester
+{
+    private static final String CREATE_TABLE = "CREATE TABLE %s (k int primary key, v int)";
+    private static final String CREATE_VIEW = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
+                                              "WHERE k is NOT NULL AND v IS NOT NULL PRIMARY KEY (v, k)";
+
+    private Long defaultMVPerTableFailureThreshold;
+
+    @Before
+    public void before()
+    {
+        // remember the configured threshold, then limit each table to a single view for these tests
+        defaultMVPerTableFailureThreshold = config().materialized_view_per_table_failure_threshold;
+        config().materialized_view_per_table_failure_threshold = 1L;
+        createTable(CREATE_TABLE);
+    }
+
+    @After
+    public void after()
+    {
+        config().materialized_view_per_table_failure_threshold = defaultMVPerTableFailureThreshold;
+    }
+
+    @Test
+    public void testCreateView() throws Throwable
+    {
+        String firstView = assertCreateViewSucceeds();
+        assertNumViews(1);
+
+        assertCreateViewFails();
+        assertNumViews(1);
+
+        // once the first view is dropped, a view can be created again
+        dropView(firstView);
+        assertNumViews(0);
+
+        assertCreateViewSucceeds();
+        assertNumViews(1);
+
+        // views of the previous table should not count against another base table
+        createTable(CREATE_TABLE);
+        assertNumViews(0);
+
+        assertCreateViewSucceeds();
+        assertNumViews(1);
+
+        assertCreateViewFails();
+        assertNumViews(1);
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        testExcludedUsers("CREATE MATERIALIZED VIEW excluded_1 AS SELECT * FROM %s " +
+                          "  WHERE k is NOT NULL AND v1 IS NOT NULL PRIMARY KEY (v1, k)",
+                          "CREATE MATERIALIZED VIEW excluded_2 AS SELECT * FROM %s " +
+                          "  WHERE k is NOT NULL AND v2 IS NOT NULL PRIMARY KEY (v2, k)",
+                          "DROP MATERIALIZED VIEW excluded_1",
+                          "DROP MATERIALIZED VIEW excluded_2");
+    }
+
+    private String assertCreateViewSucceeds() throws Throwable
+    {
+        String view = createViewName();
+        assertValid(format(CREATE_VIEW, view));
+        return view;
+    }
+
+    private void assertNumViews(int count)
+    {
+        assertEquals(count, getCurrentColumnFamilyStore().viewManager.size());
+    }
+
+    private void assertCreateViewFails() throws Throwable
+    {
+        String view = createViewName();
+        // the failure message is expected to name both the rejected view and the current table
+        String failure = format("failed to create materialized view %s on table %s", view, currentTable());
+        assertFails(failure, format(CREATE_VIEW, view));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionKeysInSelectTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionKeysInSelectTest.java
new file mode 100644
index 000000000000..db689e20b949
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionKeysInSelectTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.assertj.core.api.Assertions;
+
+public class GuardrailPartitionKeysInSelectTest extends GuardrailTester
+{
+    private static int defaultPartitionKeysInSelectQuery; // threshold saved in setup() and restored in tearDown()
+
+    @BeforeClass
+    public static void setup()
+    {
+        defaultPartitionKeysInSelectQuery = DatabaseDescriptor.getGuardrailsConfig().partition_keys_in_select_failure_threshold;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().partition_keys_in_select_failure_threshold = defaultPartitionKeysInSelectQuery;
+    }
+
+    @Before
+    public void setUp() throws Throwable
+    {
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+        DatabaseDescriptor.getGuardrailsConfig().partition_keys_in_select_failure_threshold = 3;
+    }
+
+    @Test
+    public void testFilterOnFewPartitions() throws Throwable
+    {
+        // 2 partition keys are within the failure threshold of 3 set in setUp(), so the query must be valid
+        assertValid("SELECT * FROM %s WHERE k IN (1,2)");
+    }
+
+    @Test
+    public void testFilterOnManyPartitions() throws Throwable
+    {
+        assertFails("Select query cannot be completed because it selects 5 partitions keys - more than the maximum allowed 3",
+                    "SELECT * FROM %s WHERE k IN (1,2,3,4,5)");
+    }
+
+    @Test
+    public void testFilterOnOneRepeatedPartitions() throws Throwable
+    {
+        // a single key repeated 5 times must still be valid with the threshold of 3
+        assertValid("SELECT * FROM %s WHERE k IN (1,1,1,1,1)");
+    }
+
+    @Test
+    public void testFilterOnClusteringColumns() throws Throwable
+    {
+        // the 5-value IN restrictions are on the clustering column c, not on the partition key: both must be valid
+        assertValid("SELECT * FROM %s WHERE c IN (1,2,3,4,5) ALLOW FILTERING");
+        assertValid("SELECT * FROM %s WHERE k = 3 AND c IN (1,2,3,4,5)");
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        testExcludedUsers("SELECT * FROM %s WHERE k IN (1,2,3,4,5)");
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionSizeTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionSizeTest.java
new file mode 100644
index 000000000000..6242a3bbb6e2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailPartitionSizeTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+public class GuardrailPartitionSizeTest extends GuardrailWarningOnSSTableWriteTester
+{
+    private static int partitionSizeThreshold; // threshold saved in setup() and restored in tearDown()
+
+    @Before
+    public void setup()
+    {
+        partitionSizeThreshold = DatabaseDescriptor.getGuardrailsConfig().partition_size_warn_threshold_in_mb;
+        DatabaseDescriptor.getGuardrailsConfig().partition_size_warn_threshold_in_mb = 1;
+    }
+
+    @After
+    public void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().partition_size_warn_threshold_in_mb = partitionSizeThreshold;
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((guardrails, value) -> guardrails.partition_size_warn_threshold_in_mb = value.intValue(),
+                                                 "partition_size_warn_threshold_in_mb");
+    }
+
+    @Test
+    public void testCompactLargePartition() throws Throwable
+    {
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+        disableCompaction();
+
+        // write 23000 rows into the single partition k = 100
+        for (int row = 0; row < 23000; row++)
+            assertValid("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 100, row, "long string for large partition test");
+        // with the warn threshold at 1MB (see setup), compacting that partition is expected to warn
+        assertWarnedOnCompact("Detected partition <redacted> of size 1.1MB is greater than the maximum recommended size (1MB)");
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailReadBeforeWriteListOperationsTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailReadBeforeWriteListOperationsTest.java
new file mode 100644
index 000000000000..170c7b5c791a
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailReadBeforeWriteListOperationsTest.java
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+/**
+ * Tests the guardrail for disabling read-before-write list operations, run once with the flag off and once on.
+ */
+@RunWith(Parameterized.class)
+public class GuardrailReadBeforeWriteListOperationsTest extends GuardrailTester
+{
+    private static boolean defaultEnabled; // flag value saved in before() and restored in after()
+    private final boolean enabled;         // flag value under test, passed to the constructor
+
+    public GuardrailReadBeforeWriteListOperationsTest(boolean enabled)
+    {
+        this.enabled = enabled;
+    }
+
+    @Parameterized.Parameters(name = "read_before_write_list_operations_enabled={0}")
+    public static Collection<Object> generateData()
+    {
+        return Arrays.asList(false, true); // one run per flag value
+    }
+
+    @Before
+    public void before()
+    {
+        defaultEnabled = config().read_before_write_list_operations_enabled;
+        config().read_before_write_list_operations_enabled = enabled;
+
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<int>)");
+    }
+
+    @After
+    public void after()
+    {
+        config().read_before_write_list_operations_enabled = defaultEnabled;
+    }
+
+    @Test
+    public void testInsertFullValue() throws Throwable
+    {
+        // insert from scratch
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2])");
+        assertRows(row(0, list(1, 2)));
+
+        // insert overriding previous value
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [2, 3])");
+        assertRows(row(0, list(2, 3)));
+    }
+
+    @Test
+    public void testUpdateFullValue() throws Throwable
+    {
+        // update from scratch
+        assertValid("UPDATE %s SET l = [1, 2] WHERE k = 0");
+        assertRows(row(0, list(1, 2)));
+
+        // update overriding previous value
+        assertValid("UPDATE %s SET l = [2, 3] WHERE k = 0");
+        assertRows(row(0, list(2, 3)));
+    }
+
+    @Test
+    public void testDeleteFullValue() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2])");
+        assertValid("DELETE l FROM %s WHERE k = 0");
+        assertRows(row(0, null));
+    }
+
+    @Test
+    public void testAppend() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2])");
+        assertValid("UPDATE %s SET l = l + [3, 4] WHERE k = 0");
+        assertRows(row(0, list(1, 2, 3, 4)));
+    }
+
+    @Test
+    public void testPrepend() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2])");
+        assertValid("UPDATE %s SET l = [3, 4] + l WHERE k = 0");
+        assertRows(row(0, list(3, 4, 1, 2)));
+    }
+
+    @Test
+    public void testUpdateByIndex() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2, 3])");
+        testGuardrail("UPDATE %s SET l[1] = 4 WHERE k = 0",
+                      "Setting of list items by index requiring read before write is not allowed",
+                      row(0, list(1, 4, 3)));
+    }
+
+    @Test
+    public void testDeleteByIndex() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2, 3])");
+        testGuardrail("DELETE l[1] FROM %s WHERE k = 0",
+                      "Removal of list items by index requiring read before write is not allowed",
+                      row(0, list(1, 3)));
+    }
+
+    @Test
+    public void testDeleteByItem() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2, 3])");
+        testGuardrail("UPDATE %s SET l = l - [2] WHERE k = 0",
+                      "Removal of list items requiring read before write is not allowed",
+                      row(0, list(1, 3)));
+    }
+
+    @Test
+    public void testBatch() throws Throwable
+    {
+        assertValid("INSERT INTO %s (k, l) VALUES (0, [1, 2, 3])");
+        // set by index, delete by index and delete by item, each wrapped in a batch
+        testGuardrail("BEGIN BATCH UPDATE %s SET l[1] = 0 WHERE k = 0; APPLY BATCH",
+                      "Setting of list items by index requiring read before write is not allowed",
+                      row(0, list(1, 0, 3)));
+
+        testGuardrail("BEGIN BATCH DELETE l[1] FROM %s WHERE k = 0; APPLY BATCH",
+                      "Removal of list items by index requiring read before write is not allowed",
+                      row(0, list(1, 3)));
+
+        testGuardrail("BEGIN BATCH UPDATE %s SET l = l - [3] WHERE k = 0; APPLY BATCH",
+                      "Removal of list items requiring read before write is not allowed",
+                      row(0, list(1)));
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        testExcludedUsers("INSERT INTO %s (k, l) VALUES (0, [1, 2, 3, 4, 5])",
+                          "UPDATE %s SET l[1] = 4 WHERE k = 0",
+                          "DELETE l[1] FROM %s WHERE k = 0",
+                          "INSERT INTO %s (k, l) VALUES (0, [1, 2, 3])",
+                          "UPDATE %s SET l = l - [2] WHERE k = 0",
+                          "BEGIN BATCH UPDATE %s SET l[1] = 0 WHERE k = 0; APPLY BATCH",
+                          "BEGIN BATCH DELETE l[1] FROM %s WHERE k = 0; APPLY BATCH",
+                          "BEGIN BATCH UPDATE %s SET l = l - [3] WHERE k = 0; APPLY BATCH");
+    }
+
+    private void assertRows(Object[]... rows) throws Throwable
+    {
+        assertRowsNet(executeNet("SELECT * FROM %s"), rows); // compares the whole table against the expected rows
+    }
+
+    private void testGuardrail(String query, String expectedMessage, Object[]... rows) throws Throwable
+    {
+        if (enabled) // flag on: the query must be valid and leave the table with the given rows
+        {
+            assertValid(query);
+            assertRows(rows);
+        }
+        else // flag off: the query must fail with the expected message
+        {
+            assertFails(expectedMessage, query);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
new file mode 100644
index 000000000000..5fd3b8f01922
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import com.google.common.base.Strings;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
+
+public class GuardrailSecondaryIndexesPerTableTest extends GuardrailTester
+{
+    private Long defaultSIPerTableFailureThreshold;
+
+    @Before
+    public void before()
+    {
+        defaultSIPerTableFailureThreshold = config().secondary_index_per_table_failure_threshold;
+        config().secondary_index_per_table_failure_threshold = 1L;
+    }
+
+    @After
+    public void after()
+    {
+        config().secondary_index_per_table_failure_threshold = defaultSIPerTableFailureThreshold;
+    }
+
+    @Test
+    public void testCreateIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        String firstIndex = createIndex("CREATE INDEX ON %s(v1)");
+        assertNumIndexes(1);
+
+        assertFails("", "v2", 1);
+        assertFails("custom_index_name", "v2", 1);
+        assertNumIndexes(1);
+
+        // a custom index is not affected by the 2i guardrail
+        assertValid("CREATE CUSTOM INDEX ON %s (v2) USING 'org.apache.cassandra.index.sasi.SASIIndex'");
+        assertNumIndexes(2);
+
+        // once the first index is dropped, a new index can be created again
+        dropIndex(format("DROP INDEX %s.%s", keyspace(), firstIndex));
+        assertNumIndexes(1);
+
+        execute("CREATE INDEX ON %s(v2)");
+        assertNumIndexes(2);
+
+        // indexes of the previous table should not count against another base table
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        assertValid("CREATE INDEX ON %s(v1)");
+        assertNumIndexes(1);
+
+        assertFails("custom_index_name2", "v2", 1);
+        assertNumIndexes(1);
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        testExcludedUsers("CREATE INDEX excluded_1 ON %s(v1)", "CREATE INDEX excluded_2 ON %s(v2)",
+                          "DROP INDEX excluded_1", "DROP INDEX excluded_2");
+    }
+
+    private void assertNumIndexes(int count)
+    {
+        assertEquals(count, getCurrentColumnFamilyStore().indexManager.listIndexes().size());
+    }
+
+    // Asserts that creating a regular index (named when indexName is non-empty) on the column fails.
+    // NOTE(review): 'indexes' is never read; presumably it only keeps this overload distinct from the two-String assertFails.
+    private void assertFails(String indexName, String column, int indexes) throws Throwable
+    {
+        String namePart = Strings.isNullOrEmpty(indexName) ? "" : indexName + " ";
+        String failure = format("failed to create secondary index %son table %s", namePart, currentTable());
+        assertFails(failure, format("CREATE INDEX %s ON %%s(%s)", indexName, column));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
new file mode 100644
index 000000000000..a959d10a5259
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
@@ -0,0 +1,393 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.function.BiConsumer;
+import java.util.function.Consumer;
+import javax.annotation.Nullable;
+
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.auth.AuthenticatedUser;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.QueryState;
+
+import static java.lang.String.format;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public abstract class GuardrailTester extends CQLTester
+{
+    static final String USERNAME = "guardrail_user";
+    static final String PASSWORD = "guardrail_password";
+
+    private static boolean guardRailsEnabled;
+    private static Set<String> tablePropertiesDisallowed;
+
+    protected TestListener listener;
+
+    @BeforeClass
+    public static void setupGuardrailTester()
+    {
+        guardRailsEnabled = DatabaseDescriptor.getGuardrailsConfig().enabled;
+        DatabaseDescriptor.getGuardrailsConfig().enabled = true;
+
+        tablePropertiesDisallowed = DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed;
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = Collections.emptySet();
+
+        requireAuthentication();
+        requireNetwork();
+    }
+
+    @AfterClass
+    public static void tearDownGuardrailTester()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().enabled = guardRailsEnabled;
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = tablePropertiesDisallowed;
+    }
+
+    /**
+     * Creates an ordinary user that is not excluded from guardrails, that is, a user that is not super not internal.
+     */
+    @Before
+    public void beforeGuardrailTest() throws Throwable
+    {
+        useSuperUser();
+        executeNet(format("CREATE USER IF NOT EXISTS %s WITH PASSWORD '%s'", USERNAME, PASSWORD));
+        executeNet(format("GRANT ALL ON KEYSPACE %s TO %s", KEYSPACE, USERNAME));
+        useUser(USERNAME, PASSWORD);
+
+        listener = new TestListener(null);
+        Guardrails.register(listener);
+
+        execute("USE " + keyspace());
+        executeNet("USE " + keyspace());
+    }
+
+    @After
+    public void afterGuardrailTest() throws Throwable
+    {
+        Guardrails.unregister(listener);
+
+        useSuperUser();
+        executeNet("DROP USER " + USERNAME);
+    }
+
+    QueryState userQueryState()
+    {
+        return queryState(new AuthenticatedUser(USERNAME));
+    }
+
+    QueryState superQueryState()
+    {
+        return queryState(new AuthenticatedUser("cassandra"));
+    }
+
+    QueryState internalQueryState()
+    {
+        return QueryState.forInternalCalls();
+    }
+
+    private QueryState queryState(AuthenticatedUser user)
+    {
+        ClientState clientState = ClientState.forExternalCalls(user);
+        return new QueryState(clientState);
+    }
+
+    static GuardrailsConfig config()
+    {
+        return DatabaseDescriptor.getGuardrailsConfig();
+    }
+
+    private void assertValidProperty(BiConsumer<GuardrailsConfig, Long> setter, long value)
+    {
+        setter.accept(config(), value);
+        config().validate();
+    }
+
+    private void assertInvalidPositiveProperty(BiConsumer<GuardrailsConfig, Long> setter,
+                                               long value,
+                                               long maxValue,
+                                               boolean allowZero,
+                                               String name)
+    {
+        try
+        {
+            assertValidProperty(setter, value);
+            fail(format("Expected configuration exception for guardrail %s value: %d", name, value));
+        }
+        catch (ConfigurationException e)
+        {
+            String expectedMessage = null;
+
+            if (value > maxValue)
+                expectedMessage = format("Invalid value %d for guardrail %s: maximum allowed value is %d",
+                                         value, name, maxValue);
+            if (value == 0 && !allowZero)
+                expectedMessage = format("Invalid value for guardrail %s: 0 is not allowed", name);
+
+            if (value < -1L)
+                expectedMessage = format("Invalid value %d for guardrail %s: negative values are not "
+                                         + "allowed, outside of -1 which disables the guardrail",
+                                         value, name);
+
+            assertEquals(format("Exception message '%s' does not contain '%s'", e.getMessage(), expectedMessage),
+                         expectedMessage, e.getMessage());
+        }
+    }
+
+    private void assertInvalidStrictlyPositiveProperty(BiConsumer<GuardrailsConfig, Long> setter, long value, String name)
+    {
+        assertInvalidPositiveProperty(setter, value, Integer.MAX_VALUE, false, name);
+    }
+
+    void testValidationOfStrictlyPositiveProperty(BiConsumer<GuardrailsConfig, Long> setter, String name)
+    {
+        assertInvalidStrictlyPositiveProperty(setter, Integer.MIN_VALUE, name);
+        assertInvalidStrictlyPositiveProperty(setter, -2, name);
+        assertValidProperty(setter, -1); // disabled
+        assertInvalidStrictlyPositiveProperty(setter, 0, name);
+        assertValidProperty(setter, 1);
+        assertValidProperty(setter, 2);
+        assertValidProperty(setter, Integer.MAX_VALUE);
+    }
+
+    protected void testExcludedUsers(String... queries) throws Throwable
+    {
+        assertSuperuserIsExcluded(queries);
+        assertInternalQueriesAreExcluded(queries);
+    }
+
+    protected void assertInternalQueriesAreExcluded(String... queries) throws Throwable
+    {
+        for (String query : queries)
+        {
+            listener.clear();
+            try
+            {
+                execute(query);
+                listener.assertNotWarned();
+                listener.assertNotFailed();
+            }
+            finally
+            {
+                listener.clear();
+            }
+        }
+    }
+
+    protected void assertSuperuserIsExcluded(String... queries) throws Throwable
+    {
+        useSuperUser();
+        executeNet("USE " + keyspace());
+
+        for (String query : queries)
+            assertValid(query);
+
+        useUser(USERNAME, PASSWORD);
+        executeNet("USE " + keyspace());
+    }
+
+    protected void assertValid(CheckedFunction function) throws Throwable
+    {
+        try
+        {
+            function.apply();
+            listener.assertNotWarned();
+            listener.assertNotFailed();
+        }
+        finally
+        {
+            listener.clear();
+        }
+    }
+
+    protected void assertValid(String query, Object... args) throws Throwable
+    {
+        assertValid(() -> executeNet(query, args));
+    }
+
+    protected void assertWarns(CheckedFunction function, String... messages) throws Throwable
+    {
+        listener.clear();
+        try
+        {
+            function.apply();
+            listener.assertWarned(messages);
+            listener.assertNotFailed();
+        }
+        finally
+        {
+            listener.clear();
+        }
+    }
+
+    void assertWarns(List<String> messages, String query, Object... args) throws Throwable
+    {
+        assertWarns(() -> executeNet(query, args), messages.toArray(new String[0]));
+    }
+
+    protected void assertWarns(String message, String query, Object... args) throws Throwable
+    {
+        assertWarns(() -> executeNet(query, args), message);
+    }
+
+    protected void assertFails(CheckedFunction function, String... messages) throws Throwable
+    {
+        listener.clear();
+        try
+        {
+            function.apply();
+            fail("Expected failure");
+        }
+        catch (InvalidQueryException e)
+        {
+            listener.assertFailed(messages);
+            listener.assertNotWarned();
+        }
+        finally
+        {
+            listener.clear();
+        }
+    }
+
+    protected void assertFails(String message, String query, Object... args) throws Throwable
+    {
+        assertFails(() -> executeNet(query, args), message);
+    }
+
+    protected void assertConfigFails(Runnable runnable, String message)
+    {
+        try
+        {
+            runnable.run();
+            fail("Expected failure");
+        }
+        catch (ConfigurationException e)
+        {
+            String actualMessage = e.getMessage();
+            assertTrue(String.format("Failure message '%s' does not contain expected message '%s'", actualMessage, message),
+                       actualMessage.contains(message));
+        }
+    }
+
+    static class TestListener implements Guardrails.Listener
+    {
+        @Nullable
+        private final Guardrail guardrail;
+        private List<String> failures = new CopyOnWriteArrayList<>();
+        private List<String> warnings = new CopyOnWriteArrayList<>();
+
+        private TestListener(@Nullable Guardrail guardrail)
+        {
+            this.guardrail = guardrail;
+        }
+
+        synchronized void assertFailed(String... expectedMessages)
+        {
+            assertFalse("Expected to fail, but no failures were received", failures == null || failures.isEmpty());
+            assertEquals(format("Expected %d failures, but found %d messages: %s)",
+                                expectedMessages.length, failures.size(), failures),
+                         expectedMessages.length, failures.size());
+
+            for (int i = 0; i < failures.size(); i++)
+            {
+                String actual = failures.get(i);
+                String expected = expectedMessages[i];
+                assertTrue(format("Failure message '%s' does not contain expected message '%s'", actual, expected),
+                           actual.contains(expected));
+            }
+        }
+
+        synchronized void assertNotFailed()
+        {
+            assertTrue(format("No failures expected, but found %d: %s", failures.size(), failures), failures.isEmpty());
+        }
+
+        synchronized void assertWarned(String... expectedMessages)
+        {
+            assertFalse("Expected to warn, but no warnings were received", warnings == null || warnings.isEmpty());
+            assertEquals(format("Expected %d warnings, but found %d messages: %s)",
+                                expectedMessages.length, warnings.size(), warnings),
+                         expectedMessages.length, warnings.size());
+
+            for (int i = 0; i < warnings.size(); i++)
+            {
+                String actual = warnings.get(i);
+                String expected = expectedMessages[i];
+                assertTrue(format("Warning message '%s' does not contain expected message '%s'", actual, expected),
+                           actual.contains(expected));
+            }
+        }
+
+        synchronized void assertContainsWarns(String... expectedMessages)
+        {
+            assertFalse(warnings.isEmpty());
+            for (String msg : expectedMessages)
+            {
+                assertTrue(String.format("Warning messages '%s' don't contain the expected '%s'", warnings, msg),
+                           warnings.stream().anyMatch(m -> m.contains(msg)));
+            }
+        }
+
+        synchronized void assertNotWarned()
+        {
+            assertTrue(format("No warnings expected, but found %d: %s", warnings.size(), warnings), warnings.isEmpty());
+        }
+
+        synchronized void clear()
+        {
+            failures.clear();
+            warnings.clear();
+        }
+
+        @Override
+        public synchronized void onWarningTriggered(String guardrailName, String message)
+        {
+            if (guardrail == null || guardrailName.equals(guardrail.name))
+            {
+                warnings.add(message);
+            }
+        }
+
+        @Override
+        public void onFailureTriggered(String guardrailName, String message)
+        {
+            if (guardrail == null || guardrailName.equals(guardrail.name))
+            {
+                failures.add(message);
+            }
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailUserTimestampsTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailUserTimestampsTest.java
new file mode 100644
index 000000000000..7ced3354df71
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailUserTimestampsTest.java
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+/** Exercises the {@code user_timestamps_enabled} guardrail setting with statements that use {@code USING TIMESTAMP}. */
+public class GuardrailUserTimestampsTest extends GuardrailTester
+{
+    public static final String NOT_ALLOWED_MSG = "User provided timestamps (USING TIMESTAMP) is not allowed";
+    private static boolean userTimestampsEnabled;
+    /** Remembers the configured {@code user_timestamps_enabled} value, which {@link #tearDown()} puts back. */
+    @BeforeClass
+    public static void setup()
+    {
+        userTimestampsEnabled = DatabaseDescriptor.getGuardrailsConfig().user_timestamps_enabled;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().user_timestamps_enabled = userTimestampsEnabled;
+    }
+
+    @Before
+    public void setupTest()
+    {
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+    }
+
+    private void setGuardrails(boolean userTimestampsEnabled)
+    {
+        DatabaseDescriptor.getGuardrailsConfig().user_timestamps_enabled = userTimestampsEnabled;
+    }
+    /** Sets the flag, then runs an INSERT with {@code USING TIMESTAMP} under {@code assertValid}. */
+    private void insert(boolean userTimestampsEnabled) throws Throwable
+    {
+        setGuardrails(userTimestampsEnabled);
+        assertValid("INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') USING TIMESTAMP 1");
+    }
+
+    @Test
+    public void testInsertWithDisabledUserTimestamps()
+    {
+        assertThatThrownBy(() -> insert(false))
+        .hasMessage(NOT_ALLOWED_MSG);
+    }
+
+    @Test
+    public void testInsertWithEnabledUserTimestamps() throws Throwable
+    {
+        // test that it does not throw
+        insert(true);
+    }
+
+    private void update(boolean userTimestampsEnabled) throws Throwable
+    {
+        setGuardrails(userTimestampsEnabled);
+        assertValid("UPDATE %s USING TIMESTAMP 1 SET v = 'val2' WHERE k = 1 and c = 2");
+    }
+
+    @Test
+    public void testUpdateWithDisabledUserTimestamps() throws Throwable
+    {
+        assertThatThrownBy(() -> update(false))
+        .hasMessage(NOT_ALLOWED_MSG);
+    }
+
+    @Test
+    public void testUpdateWithEnabledUserTimestamps() throws Throwable
+    {
+        // test that it does not throw
+        update(true);
+    }
+
+    private void delete(boolean userTimestampsEnabled) throws Throwable
+    {
+        setGuardrails(userTimestampsEnabled);
+        assertValid("DELETE FROM %s USING TIMESTAMP 1 WHERE k=1");
+    }
+
+    @Test
+    public void testDeleteWithDisabledUserTimestamps()
+    {
+        assertThatThrownBy(() -> delete(false))
+        .hasMessage(NOT_ALLOWED_MSG);
+    }
+
+    @Test
+    public void testDeleteWithEnabledUserTimestamps() throws Throwable
+    {
+        // test that it does not throw
+        delete(true);
+    }
+    // NOTE(review): if the first batch below throws, the second (per-statement timestamp) batch is never executed
+    private void batch(boolean userTimestampsEnabled) throws Throwable
+    {
+        setGuardrails(userTimestampsEnabled);
+        assertValid("BEGIN BATCH USING TIMESTAMP 1 " +
+                    "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') " +
+                    "APPLY BATCH");
+        assertValid("BEGIN BATCH " +
+                    "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') USING TIMESTAMP 1 " +
+                    "APPLY BATCH");
+    }
+
+    @Test
+    public void testBatchWithDisabledUserTimestamps()
+    {
+        assertThatThrownBy(() -> batch(false))
+        .hasMessage(NOT_ALLOWED_MSG);
+    }
+
+    @Test
+    public void testBatchWithEnabledUserTimestamps() throws Throwable
+    {
+        // test that it does not throw
+        batch(true);
+    }
+    /** Runs the statements through {@code testExcludedUsers} (superuser and internal queries) with the flag off and on. */
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        for (boolean userTimestampsEnabled : new boolean[]{ false, true })
+        {
+            setGuardrails(userTimestampsEnabled);
+            testExcludedUsers("INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') USING TIMESTAMP 1",
+                              "UPDATE %s USING TIMESTAMP 1 SET v = 'val2' WHERE k = 1 and c = 2",
+                              "DELETE FROM %s USING TIMESTAMP 1 WHERE k=1",
+                              "BEGIN BATCH USING TIMESTAMP 1 INSERT INTO %s (k, c, v) VALUES (1, 2, 'v'); APPLY BATCH",
+                              "BEGIN BATCH INSERT INTO %s (k, c, v) VALUES (1, 2, 'v') USING TIMESTAMP 1; APPLY BATCH");
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailWarningOnSSTableWriteTester.java b/test/unit/org/apache/cassandra/guardrails/GuardrailWarningOnSSTableWriteTester.java
new file mode 100644
index 000000000000..32f07fe9948d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailWarningOnSSTableWriteTester.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+/** Base class with assertions on the guardrail warnings recorded while flushing, and optionally compacting, a keyspace. */
+public abstract class GuardrailWarningOnSSTableWriteTester extends GuardrailTester
+{
+    void assertNotWarnedOnFlush()
+    {
+        assertNotWarnedOnSSTableWrite(false, keyspace());
+    }
+
+    void assertNotWarnedOnCompact()
+    {
+        assertNotWarnedOnSSTableWrite(true, keyspace());
+    }
+    /** Flushes {@code keyspace}, compacting it too if {@code compact}, and asserts the listener recorded no warning. */
+    void assertNotWarnedOnSSTableWrite(boolean compact, String keyspace)
+    {
+        try
+        {
+            listener.clear();
+            writeSSTables(keyspace, compact);
+            listener.assertNotWarned();
+        }
+        finally
+        {
+            listener.clear();
+        }
+    }
+
+    void assertWarnedOnFlush(String... expectedMessages)
+    {
+        assertWarnedOnSSTableWrite(false, expectedMessages);
+    }
+
+    void assertWarnedOnCompact(String... expectedMessages)
+    {
+        assertWarnedOnSSTableWrite(true, expectedMessages);
+    }
+    /** Flushes the current keyspace, compacting it too if {@code compact}, and asserts each message is in some warning. */
+    void assertWarnedOnSSTableWrite(boolean compact, String... expectedMessages)
+    {
+        try
+        {
+            listener.clear();
+            writeSSTables(keyspace(), compact);
+            listener.assertContainsWarns(expectedMessages);
+        }
+        finally
+        {
+            listener.clear();
+        }
+    }
+    /** Flushes {@code keyspace} and then, only when {@code compact} is set, compacts it. */
+    private void writeSSTables(String keyspace, boolean compact)
+    {
+        flush(keyspace);
+        if (compact)
+        {
+            compact(keyspace);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
new file mode 100644
index 000000000000..6ae268213375
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Collectors;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.schema.TableAttributes;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.schema.Schema;
+
+import static java.lang.String.format;
+
+/** Tests the guardrails on the number of tables and on the properties usable in table and materialized view DDL. */
+public class GuardrailsOnTableTest extends GuardrailTester
+{
+    private static final String CREATE_TABLE = "CREATE TABLE %s.%s(pk int, ck int, v int, PRIMARY KEY(pk, ck)) %s";
+    private static final String CREATE_VIEW = "CREATE MATERIALIZED VIEW %s.%s as select * from %s.%s where pk is not null and ck is not null primary key (ck, pk) %s";
+    private static final String ALTER_VIEW = "ALTER MATERIALIZED VIEW %s.%s WITH %s";
+    private static long defaultTablesSoftLimit;
+    private static long defaultTableHardLimit;
+    private static long defaultMVPerTableFailureThreshold;
+    private static Set<String> defaultTablePropertiesDisallowed;
+
+    /** Saves the settings that {@link #after()} restores, then sets the MV limit to 100 and disallows all properties but one. */
+    @Before
+    public void before()
+    {
+        defaultTablesSoftLimit = DatabaseDescriptor.getGuardrailsConfig().tables_warn_threshold;
+        defaultTableHardLimit = DatabaseDescriptor.getGuardrailsConfig().tables_failure_threshold;
+        defaultMVPerTableFailureThreshold = DatabaseDescriptor.getGuardrailsConfig().materialized_view_per_table_failure_threshold;
+        defaultTablePropertiesDisallowed = DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed;
+
+        // must be set on the config: overwriting the saved default would make after() "restore" 100 instead
+        DatabaseDescriptor.getGuardrailsConfig().materialized_view_per_table_failure_threshold = 100;
+        // only allow "gc_grace_seconds"
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed =
+        TableAttributes.validKeywords.stream()
+                                     .filter(p -> !p.equals("gc_grace_seconds"))
+                                     .map(String::toUpperCase)
+                                     .collect(Collectors.toSet());
+    }
+
+    @After
+    public void after()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().tables_warn_threshold = defaultTablesSoftLimit;
+        DatabaseDescriptor.getGuardrailsConfig().tables_failure_threshold = defaultTableHardLimit;
+        DatabaseDescriptor.getGuardrailsConfig().materialized_view_per_table_failure_threshold = defaultMVPerTableFailureThreshold;
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = defaultTablePropertiesDisallowed;
+    }
+
+    /** Checks the warn and failure thresholds on the table count, and that superuser and internal queries bypass them. */
+    @Test
+    public void testTableLimit() throws Throwable
+    {
+        // check previous async dropping schema tasks have been finished...
+        int waitInSeconds = 30;
+        while (schemaCleanup.getActiveCount() > 0 && waitInSeconds-- >= 0)
+            Thread.sleep(1000);
+
+        int currentTables = Schema.instance.getNonInternalKeyspaces().stream().map(Keyspace::open)
+                                           .mapToInt(keyspace -> keyspace.getColumnFamilyStores().size()).sum();
+        long warn = currentTables + 1;
+        long fail = currentTables + 3;
+        DatabaseDescriptor.getGuardrailsConfig().tables_warn_threshold = warn;
+        DatabaseDescriptor.getGuardrailsConfig().tables_failure_threshold = fail;
+
+        assertValid(this::create);
+        assertWarns(this::create, format("current number of tables %d exceeds warning threshold of %d", currentTables + 2, warn));
+        assertWarns(this::create, format("current number of tables %d exceeds warning threshold of %d", currentTables + 3, warn));
+        assertFails(this::create, format("Cannot have more than %s tables, failed to create table", fail));
+
+        // test super user
+        String ks1 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
+        for (int i = 0; i <= fail + 1; i++)
+            assertSuperuserIsExcluded(format(CREATE_TABLE, ks1, "t" + i, ""));
+
+        // test internal queries
+        String ks2 = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
+        for (int i = 0; i <= fail + 1; i++)
+            assertInternalQueriesAreExcluded(format(CREATE_TABLE, ks2, "t" + i, ""));
+    }
+
+    /** Checks that table DDL using disallowed properties fails for ordinary users but not for superuser or internal queries. */
+    @Test
+    public void testTableProperties() throws Throwable
+    {
+        // table properties is not allowed
+        assertValid(this::create);
+        assertFails(() -> create("with id = " + UUID.randomUUID()), "[id]");
+        assertFails(() -> create("with compression = { 'enabled': 'false' }"), "[compression]");
+        assertFails(() -> create("with compression = { 'enabled': 'false' } AND id = " + UUID.randomUUID()), "[compression, id]");
+        assertFails(() -> create("with compaction = { 'class': 'SizeTieredCompactionStrategy' }"), "[compaction]");
+        assertFails(() -> create("with gc_grace_seconds = 1000 and compression = { 'enabled': 'false' }"), "[compression]");
+        assertValid(() -> create("with gc_grace_seconds = 1000"));
+
+        // alter column is allowed
+        assertValid(this::create);
+        assertValid("ALTER TABLE %s ADD v1 int");
+        assertValid("ALTER TABLE %s DROP v1");
+        assertValid("ALTER TABLE %s RENAME pk to pk1");
+
+        // alter table properties except "gc_grace_seconds" is not allowed
+        assertValid("ALTER TABLE %s WITH gc_grace_seconds = 1000");
+        assertFails("[compaction, default_time_to_live]",
+                    "ALTER TABLE %s WITH compaction = { 'class': 'SizeTieredCompactionStrategy' } AND default_time_to_live = 1");
+        assertFails("[compaction, crc_check_chance]",
+                    "ALTER TABLE %s WITH compaction = { 'class': 'SizeTieredCompactionStrategy' } AND crc_check_chance = 1");
+
+        // skip table properties guardrails for super user
+        assertSuperuserIsExcluded(
+        format(CREATE_TABLE, keyspace(), createTableName(), "WITH compaction = { 'class': 'SizeTieredCompactionStrategy' }"),
+        format(CREATE_TABLE, keyspace(), createTableName(), "WITH gc_grace_seconds = 1000"),
+        "ALTER TABLE %s WITH gc_grace_seconds = 1000 and default_time_to_live = 1000",
+        "ALTER TABLE %s WITH compaction = { 'class': 'SizeTieredCompactionStrategy' }");
+
+        // skip table properties guardrails for internal queries
+        assertInternalQueriesAreExcluded(
+        format(CREATE_TABLE, keyspace(), createTableName(), "WITH compaction = { 'class': 'SizeTieredCompactionStrategy' }"),
+        format(CREATE_TABLE, keyspace(), createTableName(), "WITH gc_grace_seconds = 1000"),
+        "ALTER TABLE %s WITH gc_grace_seconds = 1000 and default_time_to_live = 1000",
+        "ALTER TABLE %s WITH compaction = { 'class': 'SizeTieredCompactionStrategy' }");
+    }
+
+    /** Checks that view DDL using disallowed properties fails for ordinary users, and ALTER not for superuser or internal queries. */
+    @Test
+    public void testViewProperties() throws Throwable
+    {
+        // view properties is not allowed
+        assertValid(this::create);
+        assertValid(() -> createMV(""));
+        assertFails(() -> createMV("with compression = { 'enabled': 'false' }"), "[compression]");
+        assertValid(() -> createMV("with gc_grace_seconds = 1000"));
+        String viewName = currentView();
+
+        // alter mv properties except "gc_grace_seconds" is not allowed
+        assertValid(format(ALTER_VIEW, keyspace(), viewName, "gc_grace_seconds = 1000"));
+        assertFails("[compaction, default_time_to_live]",
+                    format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' } AND default_time_to_live = 1"));
+        assertFails("[compaction, crc_check_chance]",
+                    format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' } AND crc_check_chance = 1"));
+
+        // skip table properties guardrails for super user
+        assertSuperuserIsExcluded(
+        format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' }"),
+        format(ALTER_VIEW, keyspace(), viewName, "gc_grace_seconds = 1000"),
+        format(ALTER_VIEW, keyspace(), viewName, "gc_grace_seconds = 1000 and crc_check_chance = 1"),
+        format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' }"));
+
+        // skip table properties guardrails for internal queries
+        assertInternalQueriesAreExcluded(
+        format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' }"),
+        format(ALTER_VIEW, keyspace(), viewName, "gc_grace_seconds = 1000"),
+        format(ALTER_VIEW, keyspace(), viewName, "gc_grace_seconds = 1000 and crc_check_chance = 1"),
+        format(ALTER_VIEW, keyspace(), viewName, "compaction = { 'class': 'SizeTieredCompactionStrategy' }"));
+    }
+
+    @Test
+    public void testInvalidTableProperties()
+    {
+        GuardrailsConfig config = DatabaseDescriptor.getGuardrailsConfig();
+        // validation is expected to reject "ID1", reporting it in lower case
+        config.table_properties_disallowed = new HashSet<>(Arrays.asList("ID1", "gc_grace_seconds"));
+        assertConfigFails(config::validate, "[id1]");
+        // these mixed-case names are expected to pass validation
+        config.table_properties_disallowed = new HashSet<>(Arrays.asList("ID", "Gc_Grace_Seconds"));
+        config.validate();
+    }
+
+    private void create() throws Throwable
+    {
+        create("");
+    }
+
+    private void create(String withClause) throws Throwable
+    {
+        executeNet(format(CREATE_TABLE, keyspace(), createTableName(), withClause));
+    }
+
+    private void createMV(String withClause) throws Throwable
+    {
+        executeNet(format(CREATE_VIEW, keyspace(), createViewName(), keyspace(), currentTable(), withClause));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
new file mode 100644
index 000000000000..387da460e1bc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
@@ -0,0 +1,506 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.auth.AuthenticatedUser;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.QueryState;
+
+import static java.lang.String.format;
+import static org.apache.cassandra.guardrails.Guardrail.DisableFlag;
+import static org.apache.cassandra.guardrails.Guardrail.DisallowedValues;
+import static org.apache.cassandra.guardrails.Guardrail.Threshold;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class GuardrailsTest extends GuardrailTester
+{
+    private static QueryState userQueryState, systemQueryState, superQueryState;
+
+    @BeforeClass
+    public static void setup()
+    {
+        systemQueryState = QueryState.forInternalCalls();
+        userQueryState = new QueryState(ClientState.forExternalCalls(AuthenticatedUser.ANONYMOUS_USER));
+        superQueryState = new QueryState(ClientState.forExternalCalls(new AuthenticatedUser("cassandra")));
+    }
+
+    private TriggerCollector createAndAddCollector()
+    {
+        TriggerCollector collector = new TriggerCollector();
+        Guardrails.register(collector); // the assert helpers in this class unregister it in a finally block
+        return collector;
+    }
+
+    private void assertWarn(Runnable runnable, String fullMessage, String redactedMessage)
+    {
+        // Runs the action and expects exactly one client warning containing fullMessage plus exactly one listener
+        // warning event equal to redactedMessage, and no failure event. Logging of the warning is not validated.
+        ClientWarn.instance.captureWarnings();
+        TriggerCollector collector = createAndAddCollector();
+        try
+        {
+            runnable.run();
+
+            // Client Warnings
+            List<String> warnings = ClientWarn.instance.getWarnings();
+            assertFalse("Expected to warn, but no warning was received", warnings == null || warnings.isEmpty());
+            assertEquals(format("Got more than 1 warning (got %d => %s)", warnings.size(), warnings),
+                         1,
+                         warnings.size());
+            String warning = warnings.get(0);
+            assertTrue(format("Warning log message '%s' does not contain expected message '%s'", warning, fullMessage),
+                       warning.contains(fullMessage));
+
+            // Listeners
+            assertTrue("Expected to warn, but failure event(s) triggered: " + collector.failuresTriggered,
+                       collector.failuresTriggered.isEmpty());
+            assertFalse("Expected to warn, but no warning event triggered",
+                        collector.warningsTriggered.isEmpty());
+            assertEquals(format("Got more than 1 warning event (got %s)", collector.warningsTriggered),
+                         1,
+                         collector.warningsTriggered.size());
+            assertTrue(format("Redacted warning message '%s' does not contain expected message '%s'",
+                              collector.warningsTriggered.entrySet().iterator().next().getValue(),
+                              redactedMessage),
+                       collector.warningsTriggered.containsValue(redactedMessage));
+        }
+        finally
+        {
+            ClientWarn.instance.resetWarnings();
+            Guardrails.unregister(collector);
+        }
+    }
+
+    private void assertWarn(Runnable runnable, String message)
+    {
+        assertWarn(runnable, message, message);
+    }
+
+    private void assertFails(Runnable runnable, String fullMessage, String redactedMessage)
+    {
+        assertFails(runnable, fullMessage, redactedMessage, true, true);
+    }
+
+    private void assertFails(Runnable runnable, String fullMessage, String redactedMessage, boolean notified, boolean thrown)
+    {
+        ClientWarn.instance.captureWarnings();
+        TriggerCollector collector = createAndAddCollector();
+        try
+        {
+            runnable.run();
+
+            if (thrown)
+                fail("Expected to fail, but it did not");
+        }
+        catch (InvalidRequestException e)
+        {
+            assertTrue("Expect no exception thrown", thrown);
+
+            assertTrue(format("Full error message '%s' does not contain expected message '%s'", e.getMessage(), fullMessage),
+                       e.getMessage().contains(fullMessage));
+
+            // Listeners (NOTE: only verified on this path, i.e. when the exception is actually thrown)
+            assertTrue("Expected to fail, but warn event(s) triggered: " + collector.warningsTriggered,
+                       collector.warningsTriggered.isEmpty());
+
+            if (notified)
+            {
+                assertFalse("Expected to fail, but no fail event triggered",
+                            collector.failuresTriggered.isEmpty());
+                assertEquals(format("Got more than 1 fail event (got %s)", collector.failuresTriggered),
+                             1,
+                             collector.failuresTriggered.size());
+                assertTrue(format("Redacted error message '%s' does not contain expected message '%s'",
+                                  collector.failuresTriggered.entrySet().iterator().next().getValue(),
+                                  redactedMessage),
+                           collector.failuresTriggered.containsValue(redactedMessage));
+            }
+            else
+            {
+                assertTrue(format("Expected no fail event triggered, but got %s", collector.failuresTriggered),
+                           collector.failuresTriggered.isEmpty());
+            }
+        }
+        finally
+        {
+            Guardrails.unregister(collector);
+        }
+
+        try
+        {
+            // A failure must never surface as a client warning, whether or not an exception was thrown
+            List<String> warnings = ClientWarn.instance.getWarnings();
+            if (warnings == null) // will always be the case in practice currently, but being defensive if this changes
+                warnings = Collections.emptyList();
+
+            assertTrue(format("Expect no warning messages but got %s", warnings), warnings.isEmpty());
+        }
+        finally
+        {
+            ClientWarn.instance.resetWarnings();
+        }
+    }
+
+    private void assertFails(Runnable runnable, String message)
+    {
+        assertFails(runnable, message, message);
+    }
+
+    private void assertNoWarnOrFails(Runnable runnable)
+    {
+        ClientWarn.instance.captureWarnings();
+        TriggerCollector collector = createAndAddCollector();
+        // Expects the action to produce no client warning, no listener event of either kind, and no exception
+        try
+        {
+            runnable.run();
+            List<String> warnings = ClientWarn.instance.getWarnings();
+            if (warnings == null) // will always be the case in practice currently, but being defensive if this changes
+                warnings = Collections.emptyList();
+            assertTrue("Expected to not warning, but got the following warnings: " + warnings,
+                       warnings.isEmpty());
+
+            assertTrue("Expected to not warning, but got the following warnings: " + collector.warningsTriggered,
+                       collector.warningsTriggered.isEmpty());
+
+            assertTrue("Expected to not failure, but got the following failure: " + collector.failuresTriggered,
+                       collector.failuresTriggered.isEmpty());
+        }
+        catch (InvalidRequestException e)
+        {
+            fail("Expected not to fail, but failed with error message: " + e.getMessage());
+        }
+        finally
+        {
+            ClientWarn.instance.resetWarnings();
+            Guardrails.unregister(collector);
+        }
+    }
+
+    @Test
+    public void testDisabledThreshold()
+    {
+        Threshold.ErrorMessageProvider errorMessageProvider = (isWarn, what, v, t) -> "Should never trigger";
+        testDisabledThreshold(new Threshold("e", () -> -1, () -> -1, errorMessageProvider));
+    }
+
+    private void testDisabledThreshold(Threshold guard)
+    {
+        assertFalse(guard.enabled(userQueryState));
+
+        assertFalse(guard.triggersOn(1, userQueryState));
+        assertFalse(guard.triggersOn(10, userQueryState));
+        assertFalse(guard.triggersOn(11, userQueryState));
+        assertFalse(guard.triggersOn(50, userQueryState));
+        assertFalse(guard.triggersOn(110, userQueryState));
+
+        for (boolean containsUserData : Arrays.asList(true, false))
+        {
+            assertNoWarnOrFails(() -> guard.guard(5, "Z", containsUserData, null));
+            assertNoWarnOrFails(() -> guard.guard(25, "A", containsUserData, userQueryState));
+            assertNoWarnOrFails(() -> guard.guard(100, "B", containsUserData, userQueryState));
+            assertNoWarnOrFails(() -> guard.guard(101, "X", containsUserData, userQueryState));
+            assertNoWarnOrFails(() -> guard.guard(200, "Y", containsUserData, userQueryState));
+        }
+    }
+
+    @Test
+    public void testThreshold()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> 10,
+                                        () -> 100,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+
+        assertTrue(guard.enabled(userQueryState));
+
+        assertFalse(guard.triggersOn(1, userQueryState));
+        assertFalse(guard.triggersOn(10, userQueryState));
+        assertTrue(guard.triggersOn(11, userQueryState));
+        assertTrue(guard.triggersOn(50, userQueryState));
+        assertTrue(guard.triggersOn(110, userQueryState));
+
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", false, userQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", true, userQueryState));
+
+        assertWarn(() -> guard.guard(25, "A", false, userQueryState), "Warning: for A, 25 > 10");
+        assertWarn(() -> guard.guard(25, "A", true, userQueryState),
+                   "Warning: for A, 25 > 10",
+                   "Warning: for <redacted>, 25 > 10");
+
+        assertWarn(() -> guard.guard(100, "B", false, userQueryState), "Warning: for B, 100 > 10");
+        assertWarn(() -> guard.guard(100, "B", true, userQueryState),
+                   "Warning: for B, 100 > 10",
+                   "Warning: for <redacted>, 100 > 10");
+
+        assertFails(() -> guard.guard(101, "X", false, userQueryState), "Failure: for X, 101 > 100");
+        assertFails(() -> guard.guard(101, "X", true, userQueryState),
+                    "Failure: for X, 101 > 100",
+                    "Failure: for <redacted>, 101 > 100");
+
+        assertFails(() -> guard.guard(200, "Y", false, userQueryState), "Failure: for Y, 200 > 100");
+        assertFails(() -> guard.guard(200, "Y", true, userQueryState),
+                    "Failure: for Y, 200 > 100",
+                    "Failure: for <redacted>, 200 > 100");
+
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", false, userQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", true, userQueryState));
+    }
+
+    @Test
+    public void testWarnOnlyThreshold()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> 10,
+                                        () -> -1L,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+
+        assertTrue(guard.enabled(userQueryState));
+
+        assertFalse(guard.triggersOn(10, userQueryState));
+        assertTrue(guard.triggersOn(11, userQueryState));
+
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", false, userQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", true, userQueryState));
+
+        assertWarn(() -> guard.guard(11, "A", false, userQueryState), "Warning: for A, 11 > 10");
+        assertWarn(() -> guard.guard(11, "A", true, userQueryState),
+                   "Warning: for A, 11 > 10",
+                   "Warning: for <redacted>, 11 > 10");
+    }
+
+    @Test
+    public void testFailureOnlyThreshold()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> -1L,
+                                        () -> 10,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+
+        assertTrue(guard.enabled(userQueryState));
+
+        assertFalse(guard.triggersOn(10, userQueryState));
+        assertTrue(guard.triggersOn(11, userQueryState));
+
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", false, userQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "Z", true, userQueryState));
+        assertFails(() -> guard.guard(11, "A", false, userQueryState), "Failure: for A, 11 > 10");
+        assertFails(() -> guard.guard(11, "A", true, userQueryState),
+                    "Failure: for A, 11 > 10",
+                    "Failure: for <redacted>, 11 > 10");
+    }
+
+    @Test
+    public void testNotThrowOnFailure()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> 5L,
+                                        () -> 10,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+        guard.noExceptionOnFailure();
+
+        assertTrue(guard.triggersOn(11, userQueryState));
+        assertFails(() -> guard.guard(11, "A", true, userQueryState),
+                    "Failure: for A, 11 > 10", "Failure: for <redacted>, 11 > 10", true, false);
+    }
+
+    @Test
+    public void testMinLogInterval()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> 5,
+                                        () -> 10,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+
+        guard.minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
+
+        // should trigger on first warn and error
+        assertWarn(() -> guard.guard(6, "A", true, userQueryState), "Warning: for A, 6 > 5", "Warning: for <redacted>, 6 > 5");
+        assertFails(() -> guard.guard(11, "B", true, userQueryState),
+                    "Failure: for B, 11 > 10", "Failure: for <redacted>, 11 > 10", true, true);
+
+        // should not trigger on second warn and error within minimum notify interval
+        assertNoWarnOrFails(() -> guard.guard(6, "A", true, userQueryState));
+        assertFails(() -> guard.guard(11, "B", true, userQueryState),
+                    "Failure: for B, 11 > 10", "Failure: for <redacted>, 11 > 10", false, true);
+    }
+
+    @Test
+    public void testThresholdUsers()
+    {
+        Threshold guard = new Threshold("x",
+                                        () -> 10,
+                                        () -> 100,
+                                        (isWarn, what, v, t) -> format("%s: for %s, %s > %s",
+                                                                       isWarn ? "Warning" : "Failure", what, v, t));
+
+        // value under both thresholds
+        assertNoWarnOrFails(() -> guard.guard(5, "x", true, null));
+        assertNoWarnOrFails(() -> guard.guard(5, "x", true, userQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "x", true, systemQueryState));
+        assertNoWarnOrFails(() -> guard.guard(5, "x", true, superQueryState));
+
+        // value over warning threshold
+        assertWarn(() -> guard.guard(100, "y", true, null),
+                   "Warning: for y, 100 > 10", "Warning: for <redacted>, 100 > 10");
+        assertWarn(() -> guard.guard(100, "y", true, userQueryState),
+                   "Warning: for y, 100 > 10", "Warning: for <redacted>, 100 > 10");
+        assertNoWarnOrFails(() -> guard.guard(100, "y", true, systemQueryState));
+        assertNoWarnOrFails(() -> guard.guard(100, "y", true, superQueryState));
+
+        // value over failure threshold
+        assertFails(() -> guard.guard(101, "z", true, null),
+                    "Failure: for z, 101 > 100", "Failure: for <redacted>, 101 > 100");
+        assertFails(() -> guard.guard(101, "z", true, userQueryState),
+                    "Failure: for z, 101 > 100", "Failure: for <redacted>, 101 > 100");
+        assertNoWarnOrFails(() -> guard.guard(101, "z", true, systemQueryState));
+        assertNoWarnOrFails(() -> guard.guard(101, "z", true, superQueryState));
+    }
+
+    @Test
+    public void testDisableFlag()
+    {
+        assertFails(() -> new DisableFlag("x", () -> true, "X").ensureEnabled(userQueryState), "X is not allowed");
+        assertNoWarnOrFails(() -> new DisableFlag("x", () -> false, "X").ensureEnabled(userQueryState));
+
+        assertFails(() -> new DisableFlag("x", () -> true, "X").ensureEnabled("Y", userQueryState), "Y is not allowed");
+        assertNoWarnOrFails(() -> new DisableFlag("x", () -> false, "X").ensureEnabled("Y", userQueryState));
+    }
+
+    @Test
+    public void testDisableFlagUsers()
+    {
+        DisableFlag enabled = new DisableFlag("x", () -> false, "X");
+        assertNoWarnOrFails(() -> enabled.ensureEnabled(null));
+        assertNoWarnOrFails(() -> enabled.ensureEnabled(userQueryState));
+        assertNoWarnOrFails(() -> enabled.ensureEnabled(systemQueryState));
+        assertNoWarnOrFails(() -> enabled.ensureEnabled(superQueryState));
+
+        DisableFlag disabled = new DisableFlag("x", () -> true, "X");
+        assertFails(() -> disabled.ensureEnabled(null), "X is not allowed");
+        assertFails(() -> disabled.ensureEnabled(userQueryState), "X is not allowed");
+        assertNoWarnOrFails(() -> disabled.ensureEnabled(systemQueryState));
+        assertNoWarnOrFails(() -> disabled.ensureEnabled(superQueryState));
+    }
+
+    @Test
+    public void testDisallowedValues()
+    {
+        // Using a LinkedHashSet below to ensure the order of the values in the error messages checked below is not random
+        DisallowedValues<Integer> disallowed = new DisallowedValues<>(
+        "x",
+        () -> new LinkedHashSet<>(Arrays.asList("4", "6", "20")),
+        Integer::valueOf,
+        "integer");
+
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(3, userQueryState));
+        assertFails(() -> disallowed.ensureAllowed(4, userQueryState),
+                    "Provided value 4 is not allowed for integer (disallowed values are: [4, 6, 20])");
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(10, userQueryState));
+        assertFails(() -> disallowed.ensureAllowed(20, userQueryState),
+                    "Provided value 20 is not allowed for integer (disallowed values are: [4, 6, 20])");
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(200, userQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(set(1, 2, 3), userQueryState));
+
+        assertFails(() -> disallowed.ensureAllowed(set(4, 6), null),
+                    "Provided values [4, 6] are not allowed for integer (disallowed values are: [4, 6, 20])");
+        assertFails(() -> disallowed.ensureAllowed(set(4, 5, 6, 7), null),
+                    "Provided values [4, 6] are not allowed for integer (disallowed values are: [4, 6, 20])");
+    }
+
+    @Test
+    public void testDisallowedValuesUsers()
+    {
+        DisallowedValues<Integer> disallowed = new DisallowedValues<>(
+        "x",
+        () -> Collections.singleton("2"),
+        Integer::valueOf,
+        "integer");
+
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(1, null));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(1, userQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(1, systemQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(1, superQueryState));
+
+        String message = "Provided value 2 is not allowed for integer (disallowed values are: [2])";
+        assertFails(() -> disallowed.ensureAllowed(2, null), message);
+        assertFails(() -> disallowed.ensureAllowed(2, userQueryState), message);
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(2, systemQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(2, superQueryState));
+
+        Set<Integer> allowedValues = set(1);
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(allowedValues, null));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(allowedValues, userQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(allowedValues, systemQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(allowedValues, superQueryState));
+
+        Set<Integer> disallowedValues = set(2);
+        message = "Provided values [2] are not allowed for integer (disallowed values are: [2])";
+        assertFails(() -> disallowed.ensureAllowed(disallowedValues, null), message);
+        assertFails(() -> disallowed.ensureAllowed(disallowedValues, userQueryState), message);
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(disallowedValues, systemQueryState));
+        assertNoWarnOrFails(() -> disallowed.ensureAllowed(disallowedValues, superQueryState));
+    }
+
+    private Set<Integer> set(Integer... values)
+    {
+        return new LinkedHashSet<>(Arrays.asList(values)); // insertion-ordered: keeps asserted messages deterministic
+    }
+
+    private static class TriggerCollector implements Guardrails.Listener
+    {
+        final Map<String, String> warningsTriggered = new HashMap<>(); // guardrail name -> last warning message
+        final Map<String, String> failuresTriggered = new HashMap<>(); // guardrail name -> last failure message
+        // NOTE: repeated events from the same guardrail overwrite each other, so map size counts distinct names
+        @Override
+        public void onWarningTriggered(String guardrailName, String message)
+        {
+            warningsTriggered.put(guardrailName, message);
+        }
+
+        @Override
+        public void onFailureTriggered(String guardrailName, String message)
+        {
+            failuresTriggered.put(guardrailName, message);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java
index 61ac017430e2..1e7c9930e923 100644
--- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterClientTest.java
@@ -40,7 +40,6 @@ public class CQLSSTableWriterClientTest
     public void setUp()
     {
         this.testDirectory = Files.createTempDir();
-        DatabaseDescriptor.daemonInitialization();
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
index 31c588b340e5..c211ae05bfde 100644
--- a/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/CQLSSTableWriterTest.java
@@ -41,12 +41,15 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.functions.UDHelper;
 import org.apache.cassandra.cql3.functions.types.*;
+import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.exceptions.*;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.*;
@@ -55,6 +58,18 @@
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.fail;
 
+/**
+ * Tests for {@link CQLSSTableWriter}.
+ *
+ * Please note: most tests here both create sstables and try to load them, so for the last part, we need to make sure
+ * we have properly "loaded" the table (which we do with {@link SchemaLoader#load(String, String, String...)}). But
+ * a small subtlety is that this <b>must</b> be called before we call {@link CQLSSTableWriter#builder} because
+ * otherwise the guardrail validation in {@link CreateTableStatement#validate(QueryState)} ends up breaking because
+ * the {@link ColumnFamilyStore} is not loaded yet. This would not be a problem in real usage of
+ * {@link CQLSSTableWriter} because the latter only calls {@link DatabaseDescriptor#clientInitialization}, not
+ * {@link DatabaseDescriptor#daemonInitialization}, so said guardrail validation doesn't execute, but this test does
+ * manually call {@link DatabaseDescriptor#daemonInitialization}, so the ordering matters here.
+ */
 public class CQLSSTableWriterTest
 {
     private static final AtomicInteger idGen = new AtomicInteger(0);
@@ -101,6 +116,9 @@ public void testUnsortedWriter() throws Exception
                           + "  v2 int"
                           + ")";
             String insert = "INSERT INTO " + qualifiedTable + " (k, v1, v2) VALUES (?, ?, ?)";
+
+            SchemaLoader.load(keyspace, schema);
+
             CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                       .inDirectory(dataDir)
                                                       .forTable(schema)
@@ -155,6 +173,7 @@ public void testForbidCounterUpdates() throws Exception
         String insert = String.format("UPDATE " + qualifiedTable + " SET my_counter = my_counter - ? WHERE my_id = ?");
         try
         {
+            SchemaLoader.load(keyspace, schema);
             CQLSSTableWriter.builder().inDirectory(dataDir)
                             .forTable(schema)
                             .withPartitioner(Murmur3Partitioner.instance)
@@ -178,6 +197,7 @@ public void testSyncWithinPartition() throws Exception
                       + "  v blob"
                       + ")";
         String insert = "INSERT INTO " + qualifiedTable + " (k, v) VALUES (?, ?)";
+        SchemaLoader.load(keyspace, schema);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .using(insert)
@@ -212,6 +232,7 @@ public void testSyncNoEmptyRows() throws Exception
                         + "  PRIMARY KEY (k)"
                         + ")";
         String insert = "INSERT INTO " + qualifiedTable + " (k, c) VALUES (?, ?)";
+        SchemaLoader.load(keyspace, schema);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
@@ -252,6 +273,7 @@ public void run()
                     + "  PRIMARY KEY (k, v)"
                     + ")";
             String insert = "INSERT INTO " + qualifiedTable + " (k, v) VALUES (?, ?)";
+            SchemaLoader.load(keyspace, schema);
             CQLSSTableWriter writer = CQLSSTableWriter.builder()
                     .inDirectory(dataDir)
                     .forTable(schema)
@@ -310,10 +332,13 @@ public void testWritesWithUdts() throws Exception
                               + "  PRIMARY KEY (k)"
                               + ")";
 
+        String type1 = "CREATE TYPE " + keyspace + ".tuple2 (a int, b int)";
+        String type2 = "CREATE TYPE " + keyspace + ".tuple3 (a int, b int, c int)";
+        SchemaLoader.load(keyspace, schema, type1, type2);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
-                                                  .withType("CREATE TYPE " + keyspace + ".tuple2 (a int, b int)")
-                                                  .withType("CREATE TYPE " + keyspace + ".tuple3 (a int, b int, c int)")
+                                                  .withType(type1)
+                                                  .withType(type2)
                                                   .forTable(schema)
                                                   .using("INSERT INTO " + keyspace + "." + table + " (k, v1, v2) " +
                                                          "VALUES (?, ?, ?)").build();
@@ -375,10 +400,13 @@ public void testWritesWithDependentUdts() throws Exception
                               + "  PRIMARY KEY (k)"
                               + ")";
 
+        String type1 = "CREATE TYPE " + keyspace + ".tuple2 (a int, b int)";
+        String type2 = "CREATE TYPE " + keyspace + ".nested_tuple (c int, tpl frozen<tuple2>)";
+        SchemaLoader.load(keyspace, schema, type1, type2);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
-                                                  .withType("CREATE TYPE " + keyspace + ".nested_tuple (c int, tpl frozen<tuple2>)")
-                                                  .withType("CREATE TYPE " + keyspace + ".tuple2 (a int, b int)")
+                                                  .withType(type2)
+                                                  .withType(type1)
                                                   .forTable(schema)
                                                   .using("INSERT INTO " + keyspace + "." + table + " (k, v1) " +
                                                          "VALUES (?, ?)")
@@ -433,6 +461,7 @@ public void testUnsetValues() throws Exception
                               + "  PRIMARY KEY (k, c1, c2)"
                               + ")";
 
+        SchemaLoader.load(keyspace, schema);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
@@ -529,6 +558,7 @@ public void testUpdateStatement() throws Exception
                               + "  PRIMARY KEY (k, c1, c2)"
                               + ")";
 
+        SchemaLoader.load(keyspace, schema);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
@@ -571,6 +601,7 @@ public void testNativeFunctions() throws Exception
                               + "  PRIMARY KEY (k, c1, c2)"
                               + ")";
 
+        SchemaLoader.load(keyspace, schema);
         CQLSSTableWriter writer = CQLSSTableWriter.builder()
                                                   .inDirectory(dataDir)
                                                   .forTable(schema)
diff --git a/test/unit/org/apache/cassandra/metrics/ClientRequestSizeMetricsTest.java b/test/unit/org/apache/cassandra/metrics/ClientRequestSizeMetricsTest.java
index 1f2f771e5c41..9ad4492ab49f 100644
--- a/test/unit/org/apache/cassandra/metrics/ClientRequestSizeMetricsTest.java
+++ b/test/unit/org/apache/cassandra/metrics/ClientRequestSizeMetricsTest.java
@@ -70,6 +70,10 @@ public void testReadAndWriteMetricsAreRecordedDuringNativeRequests() throws Thro
         try
         {
             reinitializeNetwork(builder -> builder.withQueryOptions(new QueryOptions().setMetadataEnabled(false)));
+
+            // Ensure the driver session has been connected
+            sessionNet(version);
+
             // We want to ignore all the messages sent by the driver upon connection as well as
             // the event sent upon schema updates
             clearMetrics();
diff --git a/test/unit/org/apache/cassandra/schema/TableMetadataTest.java b/test/unit/org/apache/cassandra/schema/TableMetadataTest.java
new file mode 100644
index 000000000000..e15f7dd70d54
--- /dev/null
+++ b/test/unit/org/apache/cassandra/schema/TableMetadataTest.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.schema;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+import org.junit.Test;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.marshal.BooleanType;
+import org.apache.cassandra.db.marshal.CompositeType;
+import org.apache.cassandra.db.marshal.FloatType;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.IntegerType;
+import org.apache.cassandra.db.marshal.TupleType;
+import org.apache.cassandra.db.marshal.UTF8Type;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+
+public class TableMetadataTest
+{
+    @Test
+    public void testPartitionKeyAsCQLLiteral()
+    {
+        String keyspaceName = "keyspace";
+        String tableName = "table";
+        CompositeType compositeType1 = CompositeType.getInstance(UTF8Type.instance, UTF8Type.instance, UTF8Type.instance);
+        TableMetadata metadata1 = TableMetadata.builder(keyspaceName, tableName)
+                                               .addPartitionKeyColumn("key", compositeType1)
+                                               .build();
+
+        String keyAsCQLLiteral = metadata1.partitionKeyAsCQLLiteral(compositeType1.decompose("test:", "composite!", "type)"));
+        assertThat(keyAsCQLLiteral).isEqualTo("('test:', 'composite!', 'type)')");
+
+        CompositeType compositeType2 = CompositeType.getInstance(new TupleType(Arrays.asList(FloatType.instance,
+                                                                                             UTF8Type.instance)),
+                                                                 IntegerType.instance);
+        TableMetadata metadata2 = TableMetadata.builder(keyspaceName, tableName)
+                                               .addPartitionKeyColumn("key", compositeType2)
+                                               .build();
+        String keyAsCQLLiteral2 = metadata2.partitionKeyAsCQLLiteral(compositeType2.decompose(TupleType.buildValue(FloatType.instance.decompose(0.33f),
+                                                                                                                   UTF8Type.instance.decompose("tuple test")),
+                                                                                              BigInteger.valueOf(10)));
+        assertThat(keyAsCQLLiteral2).isEqualTo("((0.33, 'tuple test'), 10)");
+
+        TableMetadata metadata3 = TableMetadata.builder(keyspaceName, tableName).addPartitionKeyColumn("key", UTF8Type.instance).build();
+        assertEquals("'non-composite test'", metadata3.partitionKeyAsCQLLiteral(UTF8Type.instance.decompose("non-composite test")));
+    }
+
+    @Test
+    public void testPrimaryKeyAsCQLLiteral()
+    {
+        String keyspaceName = "keyspace";
+        String tableName = "table";
+
+        TableMetadata metadata;
+
+        // one partition key column, no clustering key
+        metadata = TableMetadata.builder(keyspaceName, tableName)
+                                .addPartitionKeyColumn("key", UTF8Type.instance)
+                                .build();
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("Test"), Clustering.EMPTY)).isEqualTo("'Test'");
+
+        // two partition key columns, no clustering key
+        metadata = TableMetadata.builder(keyspaceName, tableName)
+                                .addPartitionKeyColumn("k1", UTF8Type.instance)
+                                .addPartitionKeyColumn("k2", Int32Type.instance)
+                                .build();
+        assertThat(metadata.primaryKeyAsCQLLiteral(CompositeType.getInstance(UTF8Type.instance, Int32Type.instance)
+                                                                .decompose("Test", -12), Clustering.EMPTY)).isEqualTo("('Test', -12)");
+
+        // one partition key column, one clustering key column
+        metadata = TableMetadata.builder(keyspaceName, tableName)
+                                .addPartitionKeyColumn("key", UTF8Type.instance)
+                                .addClusteringColumn("clustering", UTF8Type.instance)
+                                .build();
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"),
+                                                   Clustering.make(UTF8Type.instance.decompose("Cluster")))).isEqualTo("('k', 'Cluster')");
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"), Clustering.EMPTY)).isEqualTo("'k'");
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"), Clustering.STATIC_CLUSTERING)).isEqualTo("'k'");
+
+        // one partition key column, two clustering key columns
+        metadata = TableMetadata.builder(keyspaceName, tableName)
+                                .addPartitionKeyColumn("key", UTF8Type.instance)
+                                .addClusteringColumn("c1", UTF8Type.instance)
+                                .addClusteringColumn("c2", UTF8Type.instance)
+                                .build();
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"),
+                                                   Clustering.make(UTF8Type.instance.decompose("c1"),
+                                                                   UTF8Type.instance.decompose("c2")))).isEqualTo("('k', 'c1', 'c2')");
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"), Clustering.EMPTY)).isEqualTo("'k'");
+        assertThat(metadata.primaryKeyAsCQLLiteral(UTF8Type.instance.decompose("k"), Clustering.STATIC_CLUSTERING)).isEqualTo("'k'");
+
+        // two partition key columns, two clustering key columns
+        CompositeType composite = CompositeType.getInstance(Int32Type.instance, BooleanType.instance);
+        metadata = TableMetadata.builder(keyspaceName, tableName)
+                                .addPartitionKeyColumn("k1", Int32Type.instance)
+                                .addPartitionKeyColumn("k2", BooleanType.instance)
+                                .addClusteringColumn("c1", UTF8Type.instance)
+                                .addClusteringColumn("c2", UTF8Type.instance)
+                                .build();
+        assertThat(metadata.primaryKeyAsCQLLiteral(composite.decompose(0, true),
+                                                   Clustering.make(UTF8Type.instance.decompose("Cluster_1"),
+                                                                   UTF8Type.instance.decompose("Cluster_2"))))
+        .isEqualTo("(0, true, 'Cluster_1', 'Cluster_2')");
+
+        assertThat(metadata.primaryKeyAsCQLLiteral(composite.decompose(1, true), Clustering.EMPTY)).isEqualTo("(1, true)");
+        assertThat(metadata.primaryKeyAsCQLLiteral(composite.decompose(2, true), Clustering.STATIC_CLUSTERING)).isEqualTo("(2, true)");
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/units/RateUnitTest.java b/test/unit/org/apache/cassandra/utils/units/RateUnitTest.java
new file mode 100644
index 000000000000..f96d25cf718e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/RateUnitTest.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RateUnitTest
+{
+    @Test
+    public void testConvert()
+    {
+        assertThat(RateUnit.B_S.convert(10L, RateUnit.KB_S)).isEqualTo(10 * 1024);
+        assertThat(RateUnit.B_S.convert(10L, RateUnit.MB_S)).isEqualTo(10 * 1024 * 1024);
+        assertThat(RateUnit.MB_S.convert(10L, RateUnit.GB_S)).isEqualTo(10 * 1024);
+        assertThat(RateUnit.GB_S.convert(10L, RateUnit.TB_S)).isEqualTo(10 * 1024);
+        assertThat(RateUnit.MB_S.convert(10L, RateUnit.B_S)).isEqualTo(0);
+
+        RateUnit B_MS = RateUnit.of(SizeUnit.BYTES, TimeUnit.MILLISECONDS);
+        // 10 kB/s == 10,240 B/s == 10.24 B/ms, which truncates to 10 B/ms
+        assertThat(B_MS.convert(10L, RateUnit.KB_S)).isEqualTo(10L);
+
+        RateUnit GB_D = RateUnit.of(SizeUnit.GIGABYTES, TimeUnit.DAYS);
+        // 10 MB/s == 10 * 3600 MB/h == 36,000 MB/h == 864,000 MB/days == 843 GB/days
+        assertThat(GB_D.convert(10L, RateUnit.MB_S)).isEqualTo(843L);
+
+        RateUnit GB_MS = RateUnit.of(SizeUnit.GIGABYTES, TimeUnit.MILLISECONDS);
+        // 10 MB/s == 0 GB/ms
+        assertThat(GB_MS.convert(10L, RateUnit.MB_S)).isEqualTo(0L);
+
+        RateUnit B_H = RateUnit.of(SizeUnit.BYTES, TimeUnit.HOURS);
+        // 10 MB/s == 10 * 1024 * 1024 B/s = 10 * 1024 * 1024 * 3600 B/hours
+        assertThat(B_H.convert(10L, RateUnit.MB_S)).isEqualTo(10L * 1024 * 1024 * 3600);
+    }
+
+    @Test
+    public void testToHumanReadableString()
+    {
+        assertThat(RateUnit.B_S.toHumanReadableString(10)).isEqualTo("10B/s");
+        assertThat(RateUnit.B_S.toHumanReadableString(1024L)).isEqualTo("1kB/s");
+        assertThat(RateUnit.B_S.toHumanReadableString(1150L)).isEqualTo("1.1kB/s");
+        assertThat(RateUnit.B_S.toHumanReadableString(4 * 1024 * 1024L)).isEqualTo("4MB/s");
+        assertThat(RateUnit.GB_S.toHumanReadableString(2600 * 1024L)).isEqualTo("2,600TB/s");
+    }
+
+    @Test
+    public void testCompare()
+    {
+        assertThat(RateUnit.MB_S.compareTo(RateUnit.MB_S)).isEqualTo(0);
+
+        assertThat(RateUnit.MB_S.compareTo(RateUnit.GB_S)).isLessThan(0);
+        assertThat(RateUnit.MB_S.compareTo(RateUnit.TB_S)).isLessThan(0);
+
+        assertThat(RateUnit.MB_S.compareTo(RateUnit.KB_S)).isGreaterThan(0);
+        assertThat(RateUnit.MB_S.compareTo(RateUnit.B_S)).isGreaterThan(0);
+
+        RateUnit MB_MS = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.MILLISECONDS);
+        RateUnit MB_NS = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.NANOSECONDS);
+
+        assertThat(RateUnit.MB_S.compareTo(MB_MS)).isLessThan(0);
+        assertThat(RateUnit.MB_S.compareTo(MB_NS)).isLessThan(0);
+
+        RateUnit KB_MS = RateUnit.of(SizeUnit.KILOBYTES, TimeUnit.MILLISECONDS);
+        RateUnit KB_NS = RateUnit.of(SizeUnit.KILOBYTES, TimeUnit.NANOSECONDS);
+
+        // 1 MB/s = 1024 kB/s > 1000 kB/s = 1kB/ms
+        assertThat(RateUnit.MB_S.compareTo(KB_MS)).isGreaterThan(0);
+        // 1 MB/s = 1024 kB/s < 1000 * 1000 * 1000 kB/s = 1kB/ns
+        assertThat(RateUnit.MB_S.compareTo(KB_NS)).isLessThan(0);
+
+        RateUnit MB_M = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.MINUTES);
+        RateUnit GB_D = RateUnit.of(SizeUnit.GIGABYTES, TimeUnit.DAYS);
+        // 1 MB/m = 1440 MB/d > 1024 MB/d = 1 GB/d
+        assertThat(MB_M.compareTo(GB_D)).isGreaterThan(0);
+
+        RateUnit GB_MS = RateUnit.of(SizeUnit.GIGABYTES, TimeUnit.MILLISECONDS);
+        // 1 MB/s < 1 GB/ms
+        assertThat(RateUnit.MB_S.compareTo(GB_MS)).isLessThan(0);
+    }
+
+    @Test
+    public void testSmallestRepresentation()
+    {
+        // The smallest unit
+        RateUnit B_D = RateUnit.of(SizeUnit.BYTES, TimeUnit.DAYS);
+
+        // A few simple cases that all resolve to the smallest unit
+        assertThat(B_D.smallestRepresentableUnit(1)).isEqualTo(B_D);
+        assertThat(B_D.smallestRepresentableUnit(Long.MAX_VALUE)).isEqualTo(B_D);
+        assertThat(RateUnit.B_S.smallestRepresentableUnit(1)).isEqualTo(B_D);
+        assertThat(RateUnit.KB_S.smallestRepresentableUnit(1)).isEqualTo(B_D);
+
+        assertThat(RateUnit.MB_S.smallestRepresentableUnit(Long.MAX_VALUE - 10)).isEqualTo(RateUnit.MB_S);
+        assertThat(RateUnit.MB_S.smallestRepresentableUnit((Long.MAX_VALUE / 1024) - 10)).isEqualTo(RateUnit.KB_S);
+
+        // Slightly more subtle cases
+        long v1 = (Long.MAX_VALUE - 1) / 1000;
+        long v2 = (Long.MAX_VALUE - 1) / 1024;
+        long v3 = (Long.MAX_VALUE - 1) / (1000 * 60);
+
+        RateUnit MB_MS = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.MILLISECONDS);
+        RateUnit KB_MS = RateUnit.of(SizeUnit.KILOBYTES, TimeUnit.MILLISECONDS);
+        RateUnit MB_M = RateUnit.of(SizeUnit.MEGABYTES, TimeUnit.MINUTES);
+        assertThat(MB_MS.smallestRepresentableUnit(v1)).isEqualTo(RateUnit.MB_S);
+        assertThat(MB_MS.smallestRepresentableUnit(v2)).isEqualTo(KB_MS);
+        assertThat(MB_MS.smallestRepresentableUnit(v3)).isEqualTo(MB_M);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/units/RateValueTest.java b/test/unit/org/apache/cassandra/utils/units/RateValueTest.java
new file mode 100644
index 000000000000..67bf12c7d238
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/RateValueTest.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class RateValueTest
+{
+    @Test
+    public void testCompute()
+    {
+        assertThat(RateValue.compute(SizeValue.of(10, SizeUnit.MEGABYTES), TimeValue.of(2, TimeUnit.SECONDS)))
+        .isEqualTo(RateValue.of(5, RateUnit.MB_S).convert(RateUnit.B_S));
+
+        assertThat(RateValue.compute(SizeValue.of(10, SizeUnit.MEGABYTES), TimeValue.of(5, TimeUnit.SECONDS)))
+        .isEqualTo(RateValue.of(2, RateUnit.MB_S));
+
+        assertThat(RateValue.compute(SizeValue.of(10, SizeUnit.MEGABYTES), TimeValue.of(10, TimeUnit.SECONDS)))
+        .isEqualTo(RateValue.of(1, RateUnit.MB_S));
+
+        // Reminder that 1MB = 1024KB, so 0.5MB == 512KB
+        assertThat(RateValue.compute(SizeValue.of(10, SizeUnit.MEGABYTES), TimeValue.of(20, TimeUnit.SECONDS)))
+        .isEqualTo(RateValue.of(512, RateUnit.KB_S));
+    }
+
+    @Test
+    public void testTimeFor()
+    {
+        RateValue rate = RateValue.of(1, RateUnit.MB_S);
+        assertThat(rate.timeFor(SizeValue.of(50, SizeUnit.MEGABYTES))).isEqualTo(TimeValue.of(50, TimeUnit.SECONDS));
+        assertThat(rate.timeFor(SizeValue.of(93, SizeUnit.MEGABYTES))).isEqualTo(TimeValue.of(93, TimeUnit.SECONDS));
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/units/SizeUnitTest.java b/test/unit/org/apache/cassandra/utils/units/SizeUnitTest.java
new file mode 100644
index 000000000000..92c7d13fefa0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/SizeUnitTest.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import org.junit.Test;
+
+import static org.apache.cassandra.utils.units.SizeUnit.BYTES;
+import static org.apache.cassandra.utils.units.SizeUnit.C0;
+import static org.apache.cassandra.utils.units.SizeUnit.C1;
+import static org.apache.cassandra.utils.units.SizeUnit.C2;
+import static org.apache.cassandra.utils.units.SizeUnit.C3;
+import static org.apache.cassandra.utils.units.SizeUnit.C4;
+import static org.apache.cassandra.utils.units.SizeUnit.GIGABYTES;
+import static org.apache.cassandra.utils.units.SizeUnit.KILOBYTES;
+import static org.apache.cassandra.utils.units.SizeUnit.MEGABYTES;
+import static org.apache.cassandra.utils.units.SizeUnit.TERABYTES;
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SizeUnitTest
+{
+    @Test
+    public void testConvert()
+    {
+        // We know convert delegates to the other methods, so we don't go overboard on testing it with all units. We
+        // just use that test for a few random conversions.
+        assertThat(GIGABYTES.convert(100, BYTES)).isEqualTo(0);
+        assertThat(GIGABYTES.convert(100 * C3, BYTES)).isEqualTo(100);
+        assertThat(GIGABYTES.convert(100 * C4, BYTES)).isEqualTo(100 * C1);
+        assertThat(BYTES.convert(100 * C4, GIGABYTES)).isEqualTo(Long.MAX_VALUE);
+    }
+
+    @Test
+    public void testToBytes()
+    {
+        testToBytes(BYTES, C0);
+        testToBytes(KILOBYTES, C1);
+        testToBytes(MEGABYTES, C2);
+        testToBytes(GIGABYTES, C3);
+        testToBytes(TERABYTES, C4);
+
+        // Test overflow
+        assertThat(TERABYTES.toBytes(Long.MAX_VALUE / 10)).isEqualTo(Long.MAX_VALUE);
+    }
+
+    private void testToBytes(SizeUnit unit, long constant)
+    {
+        assertThat(unit.toBytes(1)).isEqualTo(1 * constant);
+        assertThat(unit.toBytes(1023)).isEqualTo(1023 * constant);
+        assertThat(unit.toBytes(1024)).isEqualTo(1024 * constant);
+        assertThat(unit.toBytes(2049)).isEqualTo(2049 * constant);
+    }
+
+    @Test
+    public void testToKiloBytes()
+    {
+        assertThat(BYTES.toKiloBytes(1)).isEqualTo(0);
+        assertThat(BYTES.toKiloBytes(1023)).isEqualTo(0);
+        assertThat(BYTES.toKiloBytes(1024)).isEqualTo(1);
+        assertThat(BYTES.toKiloBytes(2049)).isEqualTo(2);
+
+        testToKiloBytes(KILOBYTES, C0);
+        testToKiloBytes(MEGABYTES, C1);
+        testToKiloBytes(GIGABYTES, C2);
+        testToKiloBytes(TERABYTES, C3);
+    }
+
+    private void testToKiloBytes(SizeUnit unit, long constant)
+    {
+        assertThat(unit.toKiloBytes(1)).isEqualTo(1 * constant);
+        assertThat(unit.toKiloBytes(1023)).isEqualTo(1023 * constant);
+        assertThat(unit.toKiloBytes(1024)).isEqualTo(1024 * constant);
+        assertThat(unit.toKiloBytes(2049)).isEqualTo(2049 * constant);
+    }
+
+    @Test
+    public void testToMegaBytes() throws Exception
+    {
+        testToMegaBytes(BYTES, 0);
+
+        assertThat(KILOBYTES.toMegaBytes(1)).isEqualTo(0);
+        assertThat(KILOBYTES.toMegaBytes(1023)).isEqualTo(0);
+        assertThat(KILOBYTES.toMegaBytes(1024)).isEqualTo(1);
+        assertThat(KILOBYTES.toMegaBytes(2049)).isEqualTo(2);
+
+        testToMegaBytes(MEGABYTES, C0);
+        testToMegaBytes(GIGABYTES, C1);
+        testToMegaBytes(TERABYTES, C2);
+    }
+
+    private void testToMegaBytes(SizeUnit unit, long constant)
+    {
+        assertThat(unit.toMegaBytes(1)).isEqualTo(1 * constant);
+        assertThat(unit.toMegaBytes(1023)).isEqualTo(1023 * constant);
+        assertThat(unit.toMegaBytes(1024)).isEqualTo(1024 * constant);
+        assertThat(unit.toMegaBytes(2049)).isEqualTo(2049 * constant);
+    }
+
+    @Test
+    public void testToGigaBytes()
+    {
+        testToGigaBytes(BYTES, 0);
+        testToGigaBytes(KILOBYTES, 0);
+
+        assertThat(MEGABYTES.toGigaBytes(1)).isEqualTo(0);
+        assertThat(MEGABYTES.toGigaBytes(1023)).isEqualTo(0);
+        assertThat(MEGABYTES.toGigaBytes(1024)).isEqualTo(1);
+        assertThat(MEGABYTES.toGigaBytes(2049)).isEqualTo(2);
+
+        testToGigaBytes(GIGABYTES, C0);
+        testToGigaBytes(TERABYTES, C1);
+    }
+
+    private void testToGigaBytes(SizeUnit unit, long constant)
+    {
+        assertThat(unit.toGigaBytes(1)).isEqualTo(1 * constant);
+        assertThat(unit.toGigaBytes(1023)).isEqualTo(1023 * constant);
+        assertThat(unit.toGigaBytes(1024)).isEqualTo(1024 * constant);
+        assertThat(unit.toGigaBytes(2049)).isEqualTo(2049 * constant);
+    }
+
+    @Test
+    public void testToTeraBytes()
+    {
+        testToTeraBytes(BYTES, 0);
+        testToTeraBytes(KILOBYTES, 0);
+        testToTeraBytes(MEGABYTES, 0);
+
+        assertThat(GIGABYTES.toTeraBytes(1)).isEqualTo(0);
+        assertThat(GIGABYTES.toTeraBytes(1023)).isEqualTo(0);
+        assertThat(GIGABYTES.toTeraBytes(1024)).isEqualTo(1);
+        assertThat(GIGABYTES.toTeraBytes(2049)).isEqualTo(2);
+
+        testToTeraBytes(TERABYTES, C0);
+    }
+
+    private void testToTeraBytes(SizeUnit unit, long constant)
+    {
+        assertThat(unit.toTeraBytes(1)).isEqualTo(1 * constant);
+        assertThat(unit.toTeraBytes(1023)).isEqualTo(1023 * constant);
+        assertThat(unit.toTeraBytes(1024)).isEqualTo(1024 * constant);
+        assertThat(unit.toTeraBytes(2049)).isEqualTo(2049 * constant);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/units/SizeValueTest.java b/test/unit/org/apache/cassandra/utils/units/SizeValueTest.java
new file mode 100644
index 000000000000..cfc845b93616
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/SizeValueTest.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class SizeValueTest
+{
+    @Test
+    public void testToString()
+    {
+        assertThat(SizeUnit.BYTES.value(10).toString()).isEqualTo("10B");
+        assertThat(SizeUnit.BYTES.value(1900).toString()).isEqualTo("1.9kB");
+        assertThat(SizeUnit.BYTES.value(2000).toString()).isEqualTo("2kB");
+        assertThat(SizeUnit.BYTES.value(2200).toString()).isEqualTo("2.1kB");
+        assertThat(SizeUnit.BYTES.value(42_345L).toString()).isEqualTo("41kB");
+        assertThat(SizeUnit.BYTES.value(100_334_345L).toString()).isEqualTo("95MB");
+        assertThat(SizeUnit.BYTES.value(345_100_334_345L).toString()).isEqualTo("321GB");
+        assertThat(SizeUnit.BYTES.value(2_345_100_334_345L).toString()).isEqualTo("2.1TB");
+        assertThat(SizeUnit.BYTES.value(98_345_100_334_345L).toString()).isEqualTo("89TB");
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/units/TimeValueTest.java b/test/unit/org/apache/cassandra/utils/units/TimeValueTest.java
new file mode 100644
index 000000000000..6e38c2d2ee88
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/TimeValueTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class TimeValueTest
+{
+    @Test
+    public void testToString()
+    {
+        assertThat(TimeValue.of(10L, TimeUnit.MILLISECONDS).toString()).isEqualTo("10ms");
+        assertThat(TimeValue.of(1000L, TimeUnit.MILLISECONDS).toString()).isEqualTo("1s");
+        assertThat(TimeValue.of(1200L, TimeUnit.MILLISECONDS).toString()).isEqualTo("1.2s");
+        assertThat(TimeValue.of(42_324L, TimeUnit.MILLISECONDS).toString()).isEqualTo("42s");
+        assertThat(TimeValue.of(60_000L, TimeUnit.MILLISECONDS).toString()).isEqualTo("1m");
+        assertThat(TimeValue.of(250_000L, TimeUnit.MILLISECONDS).toString()).isEqualTo("4.2m");
+        assertThat(TimeValue.of(3_600_000L, TimeUnit.MILLISECONDS).toString()).isEqualTo("1h");
+        assertThat(TimeValue.of(24 * 10_200_000L, TimeUnit.MILLISECONDS).toString()).isEqualTo("2.8d");
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/units/UnitsTest.java b/test/unit/org/apache/cassandra/utils/units/UnitsTest.java
new file mode 100644
index 000000000000..e55aa6ae1993
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/units/UnitsTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.units;
+
+import org.junit.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class UnitsTest
+{
+    @Test
+    public void testFormatValue()
+    {
+        assertThat(Units.formatValue(0L)).isEqualTo("0");
+        // No comma
+        assertThat(Units.formatValue(1L)).isEqualTo("1");
+        assertThat(Units.formatValue(-1L)).isEqualTo("-1");
+        assertThat(Units.formatValue(10L)).isEqualTo("10");
+        assertThat(Units.formatValue(-10L)).isEqualTo("-10");
+        assertThat(Units.formatValue(999L)).isEqualTo("999");
+        assertThat(Units.formatValue(-999L)).isEqualTo("-999");
+
+        // One comma
+        assertThat(Units.formatValue(1_000L)).isEqualTo("1,000");
+        assertThat(Units.formatValue(-1_000L)).isEqualTo("-1,000");
+        assertThat(Units.formatValue(12_345L)).isEqualTo("12,345");
+        assertThat(Units.formatValue(-12_345L)).isEqualTo("-12,345");
+        assertThat(Units.formatValue(999_999L)).isEqualTo("999,999");
+        assertThat(Units.formatValue(-999_999L)).isEqualTo("-999,999");
+
+        // Two commas
+        assertThat(Units.formatValue(1_000_000L)).isEqualTo("1,000,000");
+        assertThat(Units.formatValue(-1_000_000L)).isEqualTo("-1,000,000");
+        assertThat(Units.formatValue(999_999_999L)).isEqualTo("999,999,999");
+        assertThat(Units.formatValue(-999_999_999L)).isEqualTo("-999,999,999");
+
+        // Lots of commas
+        assertThat(Units.formatValue(123_456_789_123_456_789L)).isEqualTo("123,456,789,123,456,789");
+        assertThat(Units.formatValue(-123_456_789_123_456_789L)).isEqualTo("-123,456,789,123,456,789");
+    }
+}
\ No newline at end of file
diff --git a/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java b/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java
index bc6756b498d8..4373d745d696 100644
--- a/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java
+++ b/tools/stress/src/org/apache/cassandra/io/sstable/StressCQLSSTableWriter.java
@@ -54,7 +54,7 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.schema.Types;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
@@ -241,20 +241,21 @@ public StressCQLSSTableWriter rawAddRow(List<ByteBuffer> values)
         if (values.size() != boundNames.size())
             throw new InvalidRequestException(String.format("Invalid number of arguments, expecting %d values but got %d", boundNames.size(), values.size()));
 
+        QueryState state = QueryState.forInternalCalls();
         QueryOptions options = QueryOptions.forInternalCalls(null, values);
-        List<ByteBuffer> keys = insert.buildPartitionKeyNames(options);
-        SortedSet<Clustering<?>> clusterings = insert.createClustering(options);
+        List<ByteBuffer> keys = insert.buildPartitionKeyNames(options, state);
+        SortedSet<Clustering<?>> clusterings = insert.createClustering(options, state);
 
         long now = System.currentTimeMillis();
         // Note that we asks indexes to not validate values (the last 'false' arg below) because that triggers a 'Keyspace.open'
         // and that forces a lot of initialization that we don't want.
         UpdateParameters params = new UpdateParameters(insert.metadata(),
                                                        insert.updatedColumns(),
+                                                       state,
                                                        options,
                                                        insert.getTimestamp(TimeUnit.MILLISECONDS.toMicros(now), options),
                                                        (int) TimeUnit.MILLISECONDS.toSeconds(now),
-                                                       insert.getTimeToLive(options),
-                                                       Collections.emptyMap());
+                                                       insert.getTimeToLive(options), Collections.emptyMap());
 
         try
         {
@@ -601,8 +602,8 @@ public static ColumnFamilyStore createOfflineTable(CreateTableStatement.Raw sche
             if (tableMetadata != null)
                 return Schema.instance.getColumnFamilyStoreInstance(tableMetadata.id);
 
-            ClientState state = ClientState.forInternalCalls();
-            CreateTableStatement statement = schemaStatement.prepare(state);
+            QueryState state = QueryState.forInternalCalls();
+            CreateTableStatement statement = schemaStatement.prepare(state.getClientState());
             statement.validate(state);
 
             //Build metadata with a portable tableId
@@ -634,8 +635,8 @@ private static TableId deterministicId(String keyspace, String table)
          */
         private UpdateStatement prepareInsert()
         {
-            ClientState state = ClientState.forInternalCalls();
-            CQLStatement cqlStatement = insertStatement.prepare(state);
+            QueryState state = QueryState.forInternalCalls();
+            CQLStatement cqlStatement = insertStatement.prepare(state.getClientState());
             UpdateStatement insert = (UpdateStatement) cqlStatement;
             insert.validate(state);
 
diff --git a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
index 6bfe620954d3..7afc4f123f03 100644
--- a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
+++ b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
@@ -33,6 +33,7 @@
 import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.SystemKeyspace;
 import org.apache.cassandra.db.commitlog.CommitLog;
 import org.apache.cassandra.db.compaction.CompactionManager;
@@ -76,6 +77,7 @@ public abstract class CompactionStress implements Runnable
     static
     {
         DatabaseDescriptor.daemonInitialization();
+        Keyspace.setInitialized();
         CommitLog.instance.start();
     }
 
diff --git a/update-history/STAR-801/32-d187a01bfd STAR-409 Port guardrails from astra branch (#124) b/update-history/STAR-801/32-d187a01bfd STAR-409 Port guardrails from astra branch (#124)
new file mode 100644
index 000000000000..1b2a70f1b5f2
--- /dev/null
+++ b/update-history/STAR-801/32-d187a01bfd STAR-409 Port guardrails from astra branch (#124)	
@@ -0,0 +1,196 @@
+diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+index 3708f21a40..2f02665c30 100644
+--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
++++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+@@ -230,7 +230,7 @@ public enum ConsistencyLevel
+     }
+ 
+     // This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
+-    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy) throws InvalidRequestException
++    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName) throws InvalidRequestException
+     {
+         if (SchemaConstants.isUserKeyspace(keyspaceName))
+             Guardrails.disallowedWriteConsistencies.ensureAllowed(this);
+diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
+index 2b7de2ef15..361b57fc7a 100644
+--- a/src/java/org/apache/cassandra/service/StorageProxy.java
++++ b/src/java/org/apache/cassandra/service/StorageProxy.java
+@@ -290,7 +290,7 @@ public class StorageProxy implements StorageProxyMBean
+         {
+             TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
+             consistencyForPaxos.validateForCas(keyspaceName);
+-            consistencyForCommit.validateForCasCommit(keyspaceName);
++            consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName);
+ 
+             Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () ->
+             {
+@@ -437,8 +437,8 @@ public class StorageProxy implements StorageProxyMBean
+         try
+         {
+             consistencyForPaxos.validateForCas(metadata.keyspace);
+-            consistencyForReplayCommits.validateForCasCommit(latestRs);
+-            consistencyForCommit.validateForCasCommit(latestRs);
++            consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace);
++            consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace);
+ 
+             long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
+             while (System.nanoTime() - queryStartNanoTime < timeoutNanos)
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
+index 6d4e4874bc..f721883e17 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering1Test.java
+@@ -84,9 +84,9 @@ public class ViewFilteringClustering1Test extends CQLTester
+         ViewFilteringTest.createView(name, query, views, version, this);
+     }
+ 
+-    private void dropView(String name) throws Throwable
++    private void dropMaterializedView(String name) throws Throwable
+     {
+-        ViewFilteringTest.dropView(name, views, version, this);
++        ViewFilteringTest.dropMaterializedView(name, views, version, this);
+     }
+ 
+     @Test
+@@ -193,7 +193,7 @@ public class ViewFilteringClustering1Test extends CQLTester
+                                     row(0, 1, 1, 0)
+             );
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+@@ -301,7 +301,7 @@ public class ViewFilteringClustering1Test extends CQLTester
+                                     row(0, 1, 1, 0)
+             );
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+@@ -418,7 +418,7 @@ public class ViewFilteringClustering1Test extends CQLTester
+                                     row(0, 1, 1, 0)
+             );
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
+index d1ba842036..d6abdd9805 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringClustering2Test.java
+@@ -84,9 +84,9 @@ public class ViewFilteringClustering2Test extends CQLTester
+         ViewFilteringTest.createView(name, query, views, version, this);
+     }
+ 
+-    private void dropView(String name) throws Throwable
++    private void dropMaterializedView(String name) throws Throwable
+     {
+-        ViewFilteringTest.dropView(name, views, version, this);
++        ViewFilteringTest.dropMaterializedView(name, views, version, this);
+     }
+ 
+     @Test
+@@ -197,7 +197,7 @@ public class ViewFilteringClustering2Test extends CQLTester
+                                     row(0, 1, 1, 0)
+             );
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+@@ -322,7 +322,7 @@ public class ViewFilteringClustering2Test extends CQLTester
+                                     row(4, 4, 1, 1)
+             );
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
+index 09d220d3ce..d4d246aade 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringPKTest.java
+@@ -84,9 +84,9 @@ public class ViewFilteringPKTest extends CQLTester
+         ViewFilteringTest.createView(name, query, views, version, this);
+     }
+ 
+-    private void dropView(String name) throws Throwable
++    private void dropMaterializedView(String name) throws Throwable
+     {
+-        ViewFilteringTest.dropView(name, views, version, this);
++        ViewFilteringTest.dropMaterializedView(name, views, version, this);
+     }
+ 
+     @Test
+@@ -650,7 +650,7 @@ public class ViewFilteringPKTest extends CQLTester
+             execute("DELETE FROM %s WHERE a = ?", 1);
+             assertEmpty(execute("SELECT a, b, c, d FROM mv_test" + i));
+ 
+-            dropView("mv_test" + i);
++            dropMaterializedView("mv_test" + i);
+             dropTable("DROP TABLE %s");
+         }
+     }
+diff --git a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+index 2d4cbb65ab..1b818c9d16 100644
+--- a/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
++++ b/test/unit/org/apache/cassandra/cql3/ViewFilteringTest.java
+@@ -134,12 +134,12 @@ public class ViewFilteringTest extends CQLTester
+         }
+     }
+ 
+-    private void dropView(String name) throws Throwable
++    private void dropMaterializedView(String name) throws Throwable
+     {
+-        dropView(name, views, version, this);
++        dropMaterializedView(name, views, version, this);
+     }
+ 
+-    public static void dropView(String name, List<String> views, ProtocolVersion version, CQLTester tester) throws Throwable
++    public static void dropMaterializedView(String name, List<String> views, ProtocolVersion version, CQLTester tester) throws Throwable
+     {
+         tester.executeNet(version, "DROP MATERIALIZED VIEW " + name);
+         views.remove(name);
+@@ -365,12 +365,12 @@ public class ViewFilteringTest extends CQLTester
+         assertRowCount(execute("SELECT * FROM mv_test5"), 0);
+         assertRowCount(execute("SELECT * FROM mv_test6"), 0);
+ 
+-        dropView("mv_test1");
+-        dropView("mv_test2");
+-        dropView("mv_test3");
+-        dropView("mv_test4");
+-        dropView("mv_test5");
+-        dropView("mv_test6");
++        dropMaterializedView("mv_test1");
++        dropMaterializedView("mv_test2");
++        dropMaterializedView("mv_test3");
++        dropMaterializedView("mv_test4");
++        dropMaterializedView("mv_test5");
++        dropMaterializedView("mv_test6");
+         dropTable("DROP TABLE %s");
+     }
+ 
+@@ -757,7 +757,7 @@ public class ViewFilteringTest extends CQLTester
+ 
+         try {
+             createView("mv_test", "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE a IS NOT NULL AND b IS NOT NULL AND c IS NOT NULL AND d = 1 PRIMARY KEY (a, b, c)");
+-            dropView("mv_test");
++            dropMaterializedView("mv_test");
+         } catch(Exception e) {
+             throw new RuntimeException("MV creation with non primary column restrictions failed.", e);
+         }
+@@ -862,7 +862,7 @@ public class ViewFilteringTest extends CQLTester
+                                 row(0, 1, 1, 0)
+         );
+ 
+-        dropView("mv_test");
++        dropMaterializedView("mv_test");
+         dropTable("DROP TABLE %s");
+     }
+ 

From c0100b281aa1b61a35b3336c21b0b216a1139d15 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Thu, 29 Apr 2021 09:55:12 +0200
Subject: [PATCH 072/151] STAR-454 ignore 'nodesync' table option (#126)

The table creation command completes successfully (applied == true).
The `nodesync` table option is ignored, and an additional warning is
returned to the caller.

(cherry picked from commit eaeb61283ce845a345954efe92a3989e2990c1be)
(cherry picked from commit 47bc95e77f2138c86fdb5cdd0af8488824b6c8f8)
---
 .../schema/CreateTableStatement.java          |  9 +-
 .../statements/schema/TableAttributes.java    |  5 +-
 ...bleStatementCompactionStrategiesTest.java} |  2 +-
 .../CreateTableStatementNodeSyncTest.java     | 85 +++++++++++++++++++
 4 files changed, 96 insertions(+), 5 deletions(-)
 rename test/unit/org/apache/cassandra/cql3/statements/{CreateTableStatementTest.java => CreateTableStatementCompactionStrategiesTest.java} (97%)
 create mode 100644 test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementNodeSyncTest.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index 689d9a2d924a..0992eb58cbc2 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -400,7 +400,7 @@ else if (!builder.hasRegularColumns())
     @Override
     public Set<String> clientWarnings(KeyspacesDiff diff)
     {
-        Set<String> warnings = new HashSet<>();
+        ImmutableSet.Builder<String> warnings = ImmutableSet.builder();
 
         int tableCount = Schema.instance.getNumberOfTables();
         if (tableCount > DatabaseDescriptor.tableCountWarnThreshold())
@@ -422,7 +422,12 @@ public Set<String> clientWarnings(KeyspacesDiff diff)
                          "Inspect your schema and adjust other table properties if needed.");
         }
 
-        return warnings;
+        if (attrs.hasProperty("nodesync"))
+        {
+            warnings.add("The unsupported 'nodesync' table option was ignored.");
+        }
+
+        return warnings.build();
     }
 
     private static class DefaultNames
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 7faac95d54e4..5001dd2a53e6 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -41,7 +41,9 @@ public final class TableAttributes extends PropertyDefinitions
 {
     public static final String ID = "id";
     public static final Set<String> validKeywords;
-    private static final Set<String> obsoleteKeywords;
+    private static final Set<String> obsoleteKeywords = ImmutableSet.of(
+        "nodesync"
+    );
 
     private static final Set<String> UNSUPPORTED_DSE_COMPACTION_STRATEGIES = ImmutableSet.of(
         "org.apache.cassandra.db.compaction.TieredCompactionStrategy",
@@ -57,7 +59,6 @@ public final class TableAttributes extends PropertyDefinitions
             validBuilder.add(option.toString());
         validBuilder.add(ID);
         validKeywords = validBuilder.build();
-        obsoleteKeywords = ImmutableSet.of();
     }
 
     public void validate()
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementCompactionStrategiesTest.java
similarity index 97%
rename from test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java
rename to test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementCompactionStrategiesTest.java
index 1ef2fde5af7f..c43163a43436 100644
--- a/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementTest.java
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementCompactionStrategiesTest.java
@@ -37,7 +37,7 @@
 import static org.junit.Assert.assertTrue;
 
 @RunWith(Parameterized.class)
-public class CreateTableStatementTest extends CQLTester
+public class CreateTableStatementCompactionStrategiesTest extends CQLTester
 {
     @Parameterized.Parameters(name = "compactionStrategy = {0}")
     public static Set<String> strategies()
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementNodeSyncTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementNodeSyncTest.java
new file mode 100644
index 000000000000..18db9caf9e9b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementNodeSyncTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class CreateTableStatementNodeSyncTest extends CQLTester
+{
+    @Parameterized.Parameters(name = "tableOptions = {0}")
+    public static Set<String> tableOptions()
+    {
+        return ImmutableSet.of(
+            "WITH nodesync = { 'enabled' : 'true', 'incremental' : 'true' }",
+            "WITH nodesync = { 'enabled' : 'true' }",
+            "WITH nodesync = { 'enabled' : 'false' }",
+            "WITH nodesync = { 'enabled' : 'true', 'deadline_target_sec': 60 }"
+        );
+    }
+
+    @Parameterized.Parameter()
+    public String tableOptions;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1));
+    }
+
+    @Test
+    public void dseNodesyncShouldBeIgnoredWithWarning() throws Throwable
+    {
+        String tableName = createTableName();
+
+        // should not throw
+        ResultSet rows = executeNet(String.format("CREATE TABLE ks.%s (k int PRIMARY KEY, v int) %s", tableName, tableOptions));
+
+        assertTrue(rows.wasApplied());
+
+        String warning = rows.getAllExecutionInfo().get(0).getWarnings().get(0);
+        assertThat(warning, containsString("The unsupported 'nodesync' table option was ignored."));
+
+        assertNoNodesyncTableParamater(tableName);
+    }
+
+    private void assertNoNodesyncTableParamater(String tableName) throws Throwable
+    {
+        ResultSet result = executeNet("DESCRIBE TABLE ks." + tableName);
+
+        String createStatement = result.one().getString("create_statement");
+        assertThat(createStatement, not(containsString("nodesync")));
+    }
+}

From f401273dd53fe6273d3d95da572e8626d4f09747 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Thu, 29 Apr 2021 10:58:47 +0200
Subject: [PATCH 073/151] STAR-455 ignore VERTEX|EDGE LABEL in CREATE TABLE
 statement (#128)

* STAR-455 ignore VERTEX|EDGE LABEL in CREATE TABLE statement

The DSE-specific graph clauses (VERTEX LABEL and EDGE LABEL) are added
to the CQL syntax. When the parser encounters one of them, it records a
marker table property (dse_vertex_label_property or
dse_edge_label_property) on the statement.
The node returns a warning to the caller, ignores the marker
property, and creates the table.

* STAR-455 ignore graph_engine property in CREATE KEYSPACE cmd

The property is ignored and the keyspace is created (applied == true).
A warning is returned to the caller.

(cherry picked from commit b2a8a809d8cc11580e6027bc5838394f00f86481)
(cherry picked from commit 25389746ebb04c383ddcd768760add9fe95735c4)
---
 src/antlr/Lexer.g                             |  4 +
 src/antlr/Parser.g                            |  8 ++
 .../schema/CreateKeyspaceStatement.java       |  5 +
 .../schema/CreateTableStatement.java          | 10 ++
 .../statements/schema/KeyspaceAttributes.java |  2 +-
 .../statements/schema/TableAttributes.java    |  4 +-
 .../CreateKeyspaceStatementTest.java          | 59 ++++++++++++
 .../CreateTableStatementGraphTest.java        | 91 +++++++++++++++++++
 8 files changed, 181 insertions(+), 2 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/cql3/statements/CreateKeyspaceStatementTest.java
 create mode 100644 test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementGraphTest.java

diff --git a/src/antlr/Lexer.g b/src/antlr/Lexer.g
index c1f6f125abc3..8950690b5d15 100644
--- a/src/antlr/Lexer.g
+++ b/src/antlr/Lexer.g
@@ -216,6 +216,10 @@ K_DEFAULT:     D E F A U L T;
 K_UNSET:       U N S E T;
 K_LIKE:        L I K E;
 
+K_EDGE:        E D G E;
+K_VERTEX:      V E R T E X;
+K_LABEL:       L A B E L;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g
index b3ba7b34c04d..ea5e11ac633f 100644
--- a/src/antlr/Parser.g
+++ b/src/antlr/Parser.g
@@ -797,6 +797,11 @@ tableProperty[CreateTableStatement.Raw stmt]
     : property[stmt.attrs]
     | K_COMPACT K_STORAGE { $stmt.setCompactStorage(); }
     | K_CLUSTERING K_ORDER K_BY '(' tableClusteringOrder[stmt] (',' tableClusteringOrder[stmt])* ')'
+    | K_VERTEX K_LABEL ( noncol_ident )? {stmt.attrs.addProperty("dse_vertex_label_property", "vertex");}
+    | K_EDGE K_LABEL ( noncol_ident ) ?
+             K_FROM noncol_ident '(' ident (',' ident)* ')'
+             K_TO noncol_ident '(' ident (',' ident)* ')'
+             {stmt.attrs.addProperty("dse_edge_label_property", "edge");}
     ;
 
 tableClusteringOrder[CreateTableStatement.Raw stmt]
@@ -1900,5 +1905,8 @@ basic_unreserved_keyword returns [String str]
         | K_MBEANS
         | K_REPLACE
         | K_UNSET
+        | K_EDGE
+        | K_VERTEX
+        | K_LABEL
         ) { $str = $k.text; }
     ;
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java
index 806b50a2bdb0..2dab938bcd60 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java
@@ -133,6 +133,11 @@ Set<String> clientWarnings(KeyspacesDiff diff)
             clientWarnings.add(msg);
         }
 
+        if (attrs.hasProperty("graph_engine"))
+        {
+            clientWarnings.add("The unsupported graph property 'graph_engine' was ignored.");
+        }
+
         return clientWarnings;
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index 0992eb58cbc2..96500e8b78ab 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -427,6 +427,16 @@ public Set<String> clientWarnings(KeyspacesDiff diff)
             warnings.add("The unsupported 'nodesync' table option was ignored.");
         }
 
+        if (attrs.hasProperty("dse_vertex_label_property"))
+        {
+            warnings.add("The unsupported graph table property was ignored (VERTEX LABEL).");
+        }
+
+        if (attrs.hasProperty("dse_edge_label_property"))
+        {
+            warnings.add("The unsupported graph table property was ignored (EDGE LABEL).");
+        }
+
         return warnings.build();
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java
index 42fcaf4e69e8..88764a93f1a7 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java
@@ -38,7 +38,7 @@ public final class KeyspaceAttributes extends PropertyDefinitions
         for (Option option : Option.values())
             validBuilder.add(option.toString());
         validKeywords = validBuilder.build();
-        obsoleteKeywords = ImmutableSet.of();
+        obsoleteKeywords = ImmutableSet.of("graph_engine");
     }
 
     public void validate()
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 5001dd2a53e6..9d6c66e49c65 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -42,7 +42,9 @@ public final class TableAttributes extends PropertyDefinitions
     public static final String ID = "id";
     public static final Set<String> validKeywords;
     private static final Set<String> obsoleteKeywords = ImmutableSet.of(
-        "nodesync"
+        "nodesync",
+        "dse_vertex_label_property",
+        "dse_edge_label_property"
     );
 
     private static final Set<String> UNSUPPORTED_DSE_COMPACTION_STRATEGIES = ImmutableSet.of(
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateKeyspaceStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateKeyspaceStatementTest.java
new file mode 100644
index 000000000000..08281150806d
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateKeyspaceStatementTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.cql3.CQLTester;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+public class CreateKeyspaceStatementTest extends CQLTester
+{
+    @Test
+    public void ignoreUnsupportedGraphEngineProperty() throws Throwable
+    {
+        String keyspaceName = createKeyspaceName();
+        String keyspaceOptions = "graph_engine = 'Core'";
+
+        // should not throw
+        ResultSet rows = executeNet(String.format("CREATE KEYSPACE %s WITH REPLICATION = " +
+                                                  "{ 'class' : 'SimpleStrategy', 'replication_factor' : '1' } AND %s",
+                                                  keyspaceName, keyspaceOptions));
+
+        assertTrue(rows.wasApplied());
+
+        String warning = rows.getAllExecutionInfo().get(0).getWarnings().get(0);
+        assertThat(warning, containsString("The unsupported graph property 'graph_engine' was ignored."));
+
+        assertNoGraphEngineKeyspaceProperty(keyspaceName);
+    }
+
+    private void assertNoGraphEngineKeyspaceProperty(String tableName) throws Throwable
+    {
+        ResultSet result = executeNet("DESCRIBE KEYSPACE " + tableName);
+
+        String createStatement = result.one().getString("create_statement");
+        assertThat(createStatement, not(containsString("graph_engine")));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementGraphTest.java b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementGraphTest.java
new file mode 100644
index 000000000000..f594f11c0880
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/statements/CreateTableStatementGraphTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.statements;
+
+import java.util.Set;
+
+import com.google.common.collect.ImmutableSet;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.Matchers.not;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class CreateTableStatementGraphTest extends CQLTester
+{
+    @Parameterized.Parameters(name = "tableOptions = {0}")
+    public static Set<String> tableOptions()
+    {
+        return ImmutableSet.of(
+            "VERTEX LABEL",
+            "vertex label",
+            "VERTEX LABEL person_label",
+            "VERTEX LABEL personlabel",
+            "VERTEX LABEL \"personlabel\"",
+            "VERTEX LABEL personlabel AND CLUSTERING ORDER BY (v DESC)",
+            "CLUSTERING ORDER BY (v DESC) AND VERTEX LABEL",
+            "EDGE LABEL person_authored_book FROM person(name,person_id) TO book(name, book_id,  cover)",
+            "EDGE LABEL person_authored_book FROM person(name) TO book(cover)",
+            "VERTEX LABEL AND EDGE LABEL person_authored_book FROM person(name) TO book(cover)"
+        );
+    }
+
+    @Parameterized.Parameter()
+    public String tableOptions;
+
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        SchemaLoader.createKeyspace("ks", KeyspaceParams.simple(1));
+    }
+
+    @Test
+    public void dseGraphShouldBeIgnoredWithWarning() throws Throwable
+    {
+        String tableName = createTableName();
+
+        // should not throw
+        ResultSet rows = executeNet(String.format("CREATE TABLE ks.%s (k int, v int, PRIMARY KEY (k, v)) WITH %s", tableName, tableOptions));
+
+        assertTrue(rows.wasApplied());
+
+        String warning = rows.getAllExecutionInfo().get(0).getWarnings().get(0);
+        assertThat(warning, containsString("The unsupported graph table property was ignored"));
+
+        assertNoGraphLabels(tableName);
+    }
+
+    private void assertNoGraphLabels(String tableName) throws Throwable
+    {
+        ResultSet result = executeNet("DESCRIBE TABLE ks." + tableName);
+
+        String createStatement = result.one().getString("create_statement");
+        assertThat(createStatement.toUpperCase(), not(containsString("LABEL")));
+    }
+}

From 4a304000581ca382cb54c949c3cd2a935f7eae0e Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Tue, 4 May 2021 08:57:08 +0200
Subject: [PATCH 074/151] STAR-400: Add a smoke test for mixed sstable formats
 (ds-trunk) (#136)

* STAR-400: Add a smoke test for mixed sstable formats

A smoke test which operates on a table whose sstables are in both formats - big and bti. We try to read and compact those sstables.

* address review comments

(cherry picked from commit def65b86df31010ef581729153544c91f871b7d8)
(cherry picked from commit 5d3739b69384243700e8fb806dedecf1b1fc4263)
---
 .../sstable/MutlipleSSTableFormatsTest.java   | 170 ++++++++++++++++++
 1 file changed, 170 insertions(+)
 create mode 100644 test/unit/org/apache/cassandra/io/sstable/MutlipleSSTableFormatsTest.java

diff --git a/test/unit/org/apache/cassandra/io/sstable/MutlipleSSTableFormatsTest.java b/test/unit/org/apache/cassandra/io/sstable/MutlipleSSTableFormatsTest.java
new file mode 100644
index 000000000000..67bb0392b5be
--- /dev/null
+++ b/test/unit/org/apache/cassandra/io/sstable/MutlipleSSTableFormatsTest.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.big.BigFormat;
+import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
+import org.assertj.core.api.Assertions;
+
+
+public class MutlipleSSTableFormatsTest extends CQLTester
+{
+    private final static Logger logger = LoggerFactory.getLogger(MutlipleSSTableFormatsTest.class);
+    private final static int cnt = 100;
+    private final static int overlap = 70;
+    private final static int deletionCount = 30;
+
+    private final long seed = System.nanoTime();
+    private Random random;
+
+    private String savedProp;
+
+    @Before
+    public void before() {
+        savedProp = System.getProperty(SSTableFormat.FORMAT_DEFAULT_PROP);
+        random = new Random(seed);
+        logger.info("Using random seed = {}", seed);
+    }
+
+    @After
+    public void after() {
+        if (savedProp == null)
+            System.getProperties().remove(SSTableFormat.FORMAT_DEFAULT_PROP);
+        else
+            System.setProperty(SSTableFormat.FORMAT_DEFAULT_PROP, savedProp);
+    }
+
+    private Map<Integer, Integer> createSSTables() throws Throwable {
+        Map<Integer, Integer> content = Maps.newHashMap();
+
+        createTable("CREATE TABLE %s (id INT, val INT, PRIMARY KEY (id))");
+        disableCompaction();
+
+        int offset = 0;
+        for (SSTableFormat.Type formatType : SSTableFormat.Type.values())
+        {
+            System.setProperty(SSTableFormat.FORMAT_DEFAULT_PROP, formatType.name);
+
+            for (int i = 0; i < cnt; i++)
+            {
+                int v = random.nextInt();
+                content.put(i + offset, v);
+                execute("INSERT INTO %s (id, val) VALUES (?, ?)", i + offset, v);
+            }
+            offset += cnt - overlap;
+
+            flush();
+        }
+
+        for (SSTableFormat.Type formatType : SSTableFormat.Type.values())
+        {
+            System.setProperty(SSTableFormat.FORMAT_DEFAULT_PROP, formatType.name);
+
+            for (int i = 0; i < deletionCount; i++)
+            {
+                int key = random.nextInt(offset + overlap);
+                content.remove(key);
+                execute("DELETE FROM %s WHERE id = ?", key);
+            }
+
+            flush();
+        }
+
+        List<SSTableFormat.Type> createdFormats = createdFormats();
+        Assertions.assertThat(createdFormats).hasSameElementsAs(Sets.newHashSet(SSTableFormat.Type.values()));
+
+        return content;
+    }
+
+    private void checkRead(Map<Integer, Integer> content) throws Throwable {
+        for (Map.Entry<Integer, Integer> entry : content.entrySet())
+        {
+            UntypedResultSet r = execute("SELECT val FROM %s WHERE id = ?", entry.getKey());
+            Assertions.assertThat(r.one().getInt("val")).isEqualTo(entry.getValue());
+        }
+
+        Iterator<UntypedResultSet.Row> it = execute("SELECT id, val FROM %s").iterator();
+        Map<Integer, Integer> results = Maps.newHashMap();
+        while (it.hasNext()) {
+            UntypedResultSet.Row row = it.next();
+            results.put(row.getInt("id"), row.getInt("val"));
+        }
+        Assertions.assertThat(results).isEqualTo(content);
+    }
+
+    @Test
+    public void testRead() throws Throwable
+    {
+        Map<Integer, Integer> content = createSSTables();
+        checkRead(content);
+    }
+
+    @Test
+    public void testCompactionToBigFormat() throws Throwable
+    {
+        testCompaction(BigFormat.instance);
+    }
+
+    @Test
+    public void testCompactionToBtiFormat() throws Throwable
+    {
+        testCompaction(TrieIndexFormat.instance);
+    }
+
+    private void testCompaction(SSTableFormat format) throws Throwable
+    {
+        Map<Integer, Integer> content = createSSTables();
+        System.setProperty(SSTableFormat.FORMAT_DEFAULT_PROP, format.getType().name);
+        enableCompaction();
+        compact();
+        List<SSTableFormat.Type> createdFormats = createdFormats();
+        Assertions.assertThat(createdFormats).hasSize(1);
+        Assertions.assertThat(createdFormats.get(0)).isEqualTo(format.getType());
+        checkRead(content);
+    }
+
+    private List<SSTableFormat.Type> createdFormats()
+    {
+        return ColumnFamilyStore.getIfExists(KEYSPACE, currentTable())
+                                .getLiveSSTables()
+                                .stream()
+                                .map(sstr -> sstr.descriptor.formatType)
+                                .collect(Collectors.toList());
+    }
+
+}

From b91909df3f54ef1a97517838ce629a1a09aa8a86 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Wed, 5 May 2021 00:09:54 +0200
Subject: [PATCH 075/151] STAR-539: Fix storing stats metadata for big table
 format (ds-trunk) (#141)

* STAR-539: Fix storing stats metadata for big table format

The problem was that during an earlier refactoring one detail was missed: for the min and max clustering bounds we should store clustering components only until the first null component is encountered.

* Improve test coverage

(cherry picked from commit e0c0accb61dede076954f7d39ba494e5f2249551)
(cherry picked from commit 88d7d00d112d8e09476512765e385684490e43e1)
---
 .../io/sstable/metadata/StatsMetadata.java    | 24 +++++++++--
 .../metadata/MetadataSerializerTest.java      | 42 +++++++++++++++----
 2 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
index e6bd14cb25ea..093306b3a0e3 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/StatsMetadata.java
@@ -26,6 +26,7 @@
 import java.util.UUID;
 
 import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.builder.EqualsBuilder;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 import org.slf4j.Logger;
@@ -320,11 +321,11 @@ public int serializedSize(Version version, StatsMetadata component) throws IOExc
                 // min column names
                 size += 4;
                 ClusteringBound<?> minClusteringValues = component.coveredClustering.start();
-                size += minClusteringValues.size() * 2 /* short length */ + minClusteringValues.dataSize();
+                size += countUntilNull(minClusteringValues.getBufferArray()) * 2 /* short length */ + minClusteringValues.dataSize();
                 // max column names
                 size += 4;
                 ClusteringBound<?> maxClusteringValues = component.coveredClustering.end();
-                size += maxClusteringValues.size() * 2 /* short length */ + maxClusteringValues.dataSize();
+                size += countUntilNull(maxClusteringValues.getBufferArray()) * 2 /* short length */ + maxClusteringValues.dataSize();
             }
 
             size += TypeSizes.sizeof(component.hasLegacyCounterShards);
@@ -406,13 +407,21 @@ public void serialize(Version version, StatsMetadata component, DataOutputPlus o
             else
             {
                 ClusteringBound<?> minClusteringValues = component.coveredClustering.start();
-                out.writeInt(minClusteringValues.size());
+                out.writeInt(countUntilNull(minClusteringValues.getBufferArray()));
                 for (ByteBuffer value : minClusteringValues.getBufferArray())
+                {
+                    if (value == null)
+                        break;
                     ByteBufferUtil.writeWithShortLength(value, out);
+                }
                 ClusteringBound<?> maxClusteringValues = component.coveredClustering.end();
-                out.writeInt(maxClusteringValues.size());
+                out.writeInt(countUntilNull(maxClusteringValues.getBufferArray()));
                 for (ByteBuffer value : maxClusteringValues.getBufferArray())
+                {
+                    if (value == null)
+                        break;
                     ByteBufferUtil.writeWithShortLength(value, out);
+                }
             }
 
             out.writeBoolean(component.hasLegacyCounterShards);
@@ -633,5 +642,12 @@ public StatsMetadata deserialize(Version version, DataInputPlus in) throws IOExc
                                      isTransient,
                                      maxColumnValueLengths);
         }
+
+        private int countUntilNull(ByteBuffer[] bufferArray)
+        {
+            int firstNull = ArrayUtils.indexOf(bufferArray, null); // -1 when the array holds no null component
+            return firstNull >= 0 ? firstNull : bufferArray.length;
+        }
+
     }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
index 5c0cd5410b18..3ec8c7ebcae6 100644
--- a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
@@ -25,21 +25,24 @@
 import java.util.EnumSet;
 import java.util.Map;
 
+import org.junit.Assume;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Clustering;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.commitlog.IntervalSet;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.dht.RandomPartitioner;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.Version;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
-import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
 import org.apache.cassandra.io.util.BufferedDataOutputStreamPlus;
 import org.apache.cassandra.io.util.DataOutputStreamPlus;
 import org.apache.cassandra.io.util.FileUtils;
@@ -65,9 +68,9 @@ public void testSerialization() throws IOException
         Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
 
         MetadataSerializer serializer = new MetadataSerializer();
-        File statsFile = serialize(originalMetadata, serializer, TrieIndexFormat.latestVersion);
+        File statsFile = serialize(originalMetadata, serializer, SSTableFormat.Type.current().info.getLatestVersion());
 
-        Descriptor desc = new Descriptor(statsFile.getParentFile(), "", "", 0, TrieIndexFormat.instance.getType());
+        Descriptor desc = new Descriptor(statsFile.getParentFile(), "", "", 0, SSTableFormat.Type.current());
         try (RandomAccessReader in = RandomAccessReader.open(statsFile))
         {
             Map<MetadataType, MetadataComponent> deserialized = serializer.deserialize(desc, in, EnumSet.allOf(MetadataType.class));
@@ -123,12 +126,14 @@ public Map<MetadataType, MetadataComponent> constructMetadata()
         CommitLogPosition club = new CommitLogPosition(11L, 12);
         CommitLogPosition cllb = new CommitLogPosition(9L, 12);
 
-        TableMetadata cfm = SchemaLoader.standardCFMD("ks1", "cf1").build();
+        TableMetadata cfm = SchemaLoader.clusteringSASICFMD("ks1", "cf1").build();
         MetadataCollector collector = new MetadataCollector(cfm.comparator)
                                       .commitLogIntervals(new IntervalSet<>(cllb, club));
 
         String partitioner = RandomPartitioner.class.getCanonicalName();
         double bfFpChance = 0.1;
+        collector.updateClusteringValues(Clustering.make(UTF8Type.instance.decompose("abc"), Int32Type.instance.decompose(123)));
+        collector.updateClusteringValues(Clustering.make(UTF8Type.instance.decompose("cba"), null));
         return collector.finalizeMetadata(partitioner, bfFpChance, 0, null, false, SerializationHeader.make(cfm, Collections.emptyList()));
     }
 
@@ -158,25 +163,48 @@ private void testVersions(String... versions) throws Throwable
     @Test
     public void testMVersions() throws Throwable
     {
+        Assume.assumeTrue(SSTableFormat.Type.current() == SSTableFormat.Type.BIG);
         testVersions("ma", "mb", "mc", "md", "me");
     }
 
     @Test
     public void testNVersions() throws Throwable
     {
+        Assume.assumeTrue(SSTableFormat.Type.current() == SSTableFormat.Type.BIG);
         testVersions("na", "nb");
     }
 
+    @Test
+    public void testAVersions() throws Throwable
+    {
+        Assume.assumeTrue(SSTableFormat.Type.current() == SSTableFormat.Type.BTI);
+        testVersions("aa", "ac", "ad");
+    }
+
+    @Test
+    public void testBVersions() throws Throwable
+    {
+        Assume.assumeTrue(SSTableFormat.Type.current() == SSTableFormat.Type.BTI);
+        testVersions("ba", "bb");
+    }
+
+    @Test
+    public void testCVersions() throws Throwable
+    {
+        Assume.assumeTrue(SSTableFormat.Type.current() == SSTableFormat.Type.BTI);
+        testVersions("ca");
+    }
+
     public void testOldReadsNew(String oldV, String newV) throws IOException
     {
         Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
 
         MetadataSerializer serializer = new MetadataSerializer();
         // Write metadata in two minor formats.
-        File statsFileLb = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(newV));
-        File statsFileLa = serialize(originalMetadata, serializer, BigFormat.instance.getVersion(oldV));
+        File statsFileLb = serialize(originalMetadata, serializer, SSTableFormat.Type.current().info.getVersion(newV));
+        File statsFileLa = serialize(originalMetadata, serializer, SSTableFormat.Type.current().info.getVersion(oldV));
         // Reading both as earlier version should yield identical results.
-        SSTableFormat.Type stype = SSTableFormat.Type.BIG;
+        SSTableFormat.Type stype = SSTableFormat.Type.current();
         Descriptor desc = new Descriptor(stype.info.getVersion(oldV), statsFileLb.getParentFile(), "", "", 0, stype);
         try (RandomAccessReader inLb = RandomAccessReader.open(statsFileLb);
              RandomAccessReader inLa = RandomAccessReader.open(statsFileLa))

From 00da732ee5b3507902e0a73a6f3f3ad7021be8b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BBytka?= <jakub.zytka@datastax.com>
Date: Wed, 5 May 2021 12:16:46 +0200
Subject: [PATCH 076/151] STAR-396: Multi-buffer, sharded memtable trie (#131)

* STAR-396: TrieMemtable sharding; Multi-buffer memtable trie

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
(cherry picked from commit 62ead9eb0fcf22ed39ad449398b56aed0a23505c)
(cherry picked from commit 2eb26043abe8b882d9790abe187e9ad7019d21c8)
---
 .../cassandra/db/ColumnFamilyStore.java       |  42 +-
 .../cassandra/db/DiskBoundaryManager.java     |  59 ++-
 .../memtable/AbstractAllocatorMemtable.java   |  20 +-
 .../db/memtable/AbstractMemtable.java         |  15 +-
 .../db/memtable/DefaultMemtableFactory.java   |  46 +-
 .../cassandra/db/memtable/Memtable.java       |  13 +-
 .../db/memtable/ShardBoundaries.java          | 129 ++++++
 .../db/memtable/SkipListMemtable.java         |  28 ++
 .../cassandra/db/memtable/TrieMemtable.java   | 405 ++++++++++++++++--
 .../db/memtable/TrieMemtableConfigMXBean.java |  24 ++
 .../cassandra/db/tries/MemtableReadTrie.java  |  74 +++-
 .../cassandra/db/tries/MemtableTrie.java      | 173 ++++----
 .../cassandra/metrics/MinMaxAvgMetric.java    |  94 ++++
 .../cassandra/metrics/TableMetrics.java       |   7 +-
 .../metrics/TrieMemtableMetricsView.java      |  90 ++++
 .../cassandra/schema/MemtableParams.java      |   2 +-
 .../cassandra/schema/TableMetadata.java       |   7 +
 .../test/microbench/instance/WriteTest.java   |  21 +-
 .../cql3/validation/operations/AlterTest.java |  11 +
 .../cassandra/db/commitlog/CommitLogTest.java |  19 +-
 .../db/memtable/TrieMemtableConfigTest.java   |  62 +++
 .../db/tries/MemtableTrieTestBase.java        |   2 +-
 .../metrics/TrieMemtableMetricsTest.java      | 215 ++++++++++
 23 files changed, 1374 insertions(+), 184 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
 create mode 100644 src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
 create mode 100644 src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
 create mode 100644 src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
 create mode 100644 test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
 create mode 100644 test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 4e41dace4d59..06846d0eae69 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -51,6 +51,7 @@
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.memtable.Flushing;
 import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.db.memtable.ShardBoundaries;
 import org.apache.cassandra.db.streaming.CassandraStreamManager;
 import org.apache.cassandra.db.repair.CassandraTableRepairManager;
 import org.apache.cassandra.db.view.TableViews;
@@ -60,6 +61,7 @@
 import org.apache.cassandra.db.rows.CellPath;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.StartupException;
 import org.apache.cassandra.index.SecondaryIndexManager;
@@ -242,6 +244,7 @@ public enum FlushReason
 
     @VisibleForTesting
     final DiskBoundaryManager diskBoundaryManager = new DiskBoundaryManager();
+    volatile ShardBoundaries cachedShardBoundaries = null; // volatile: localRangeSplits reads and replaces this cache without locking
 
     private volatile boolean neverPurgeTombstones = false;
 
@@ -419,7 +422,7 @@ public ColumnFamilyStore(Keyspace keyspace,
             indexManager.addIndex(info, true);
         }
 
-        metric = new TableMetrics(this);
+        metric = new TableMetrics(this, memtableFactory.createMemtableMetrics(metadata));
 
         if (data.loadsstables)
         {
@@ -1346,6 +1349,43 @@ private UpdateTransaction newUpdateTransaction(PartitionUpdate update, Cassandra
                : UpdateTransaction.NO_OP;
     }
 
+    public ShardBoundaries localRangeSplits(int shardCount)
+    {
+        if (shardCount == 1 || !getPartitioner().splitter().isPresent() || SchemaConstants.isLocalSystemKeyspace(keyspace.getName()))
+            return ShardBoundaries.NONE;
+
+        ShardBoundaries shardBoundaries = cachedShardBoundaries;
+        if (shardBoundaries == null ||
+            shardBoundaries.shardCount() != shardCount ||
+            shardBoundaries.ringVersion != StorageService.instance.getTokenMetadata().getRingVersion())
+        {
+            DiskBoundaryManager.VersionedRangesAtEndpoint versionedLocalRanges = DiskBoundaryManager.getVersionedLocalRanges(this);
+            Set<Range<Token>> localRanges = versionedLocalRanges.rangesAtEndpoint.ranges();
+            List<Splitter.WeightedRange> weightedRanges;
+            if (localRanges.isEmpty())
+                weightedRanges = ImmutableList.of(new Splitter.WeightedRange(1.0, new Range<>(getPartitioner().getMinimumToken(), getPartitioner().getMaximumToken())));
+            else
+            {
+                weightedRanges = new ArrayList<>(localRanges.size());
+                for (Range<Token> r : localRanges)
+                {
+                    // WeightedRange supports only unwrapped ranges as it relies
+                    // on right - left == num tokens equality
+                    for (Range<Token> u: r.unwrap())
+                        weightedRanges.add(new Splitter.WeightedRange(1.0, u));
+                }
+                weightedRanges.sort(Comparator.comparing(Splitter.WeightedRange::left));
+            }
+
+            List<Token> boundaries = getPartitioner().splitter().get().splitOwnedRanges(shardCount, weightedRanges, false);
+            shardBoundaries = new ShardBoundaries(boundaries.subList(0, boundaries.size() - 1),
+                                                  versionedLocalRanges.ringVersion);
+            cachedShardBoundaries = shardBoundaries;
+            logger.info("Memtable shard boundaries for {}.{}: {}", keyspace.getName(), getTableName(), boundaries);
+        }
+        return shardBoundaries;
+    }
+
     /**
      * @param sstables
      * @return sstables whose key range overlaps with that of the given sstables, not including itself.
diff --git a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
index cc617da702e2..0de745d3cf80 100644
--- a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
+++ b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
@@ -21,7 +21,6 @@
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.List;
-import java.util.stream.Collectors;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -68,7 +67,19 @@ public void invalidate()
            diskBoundaries.invalidate();
     }
 
-    private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
+    static class VersionedRangesAtEndpoint
+    {
+        public final RangesAtEndpoint rangesAtEndpoint;
+        public final long ringVersion;
+
+        VersionedRangesAtEndpoint(RangesAtEndpoint rangesAtEndpoint, long ringVersion)
+        {
+            this.rangesAtEndpoint = rangesAtEndpoint;
+            this.ringVersion = ringVersion;
+        }
+    }
+
+    public static VersionedRangesAtEndpoint getVersionedLocalRanges(ColumnFamilyStore cfs)
     {
         RangesAtEndpoint localRanges;
 
@@ -78,23 +89,20 @@ private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
         {
             tmd = StorageService.instance.getTokenMetadata();
             ringVersion = tmd.getRingVersion();
-            if (StorageService.instance.isBootstrapMode()
-                && !StorageService.isReplacingSameAddress()) // When replacing same address, the node marks itself as UN locally
-            {
-                PendingRangeCalculatorService.instance.blockUntilFinished();
-                localRanges = tmd.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
-            }
-            else
-            {
-                // Reason we use use the future settled TMD is that if we decommission a node, we want to stream
-                // from that node to the correct location on disk, if we didn't, we would put new files in the wrong places.
-                // We do this to minimize the amount of data we need to move in rebalancedisks once everything settled
-                localRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(tmd.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
-            }
+            localRanges = getLocalRanges(cfs, tmd);
             logger.debug("Got local ranges {} (ringVersion = {})", localRanges, ringVersion);
         }
         while (ringVersion != tmd.getRingVersion()); // if ringVersion is different here it means that
-                                                     // it might have changed before we calculated localRanges - recalculate
+        // it might have changed before we calculated localRanges - recalculate
+
+        return new VersionedRangesAtEndpoint(localRanges, ringVersion);
+    }
+
+    private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
+    {
+        VersionedRangesAtEndpoint rangesAtEndpoint = getVersionedLocalRanges(cfs);
+        RangesAtEndpoint localRanges = rangesAtEndpoint.rangesAtEndpoint;
+        long ringVersion = rangesAtEndpoint.ringVersion;
 
         int directoriesVersion;
         Directories.DataDirectory[] dirs;
@@ -113,6 +121,25 @@ private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
         return new DiskBoundaries(cfs, dirs, positions, ringVersion, directoriesVersion);
     }
 
+    private static RangesAtEndpoint getLocalRanges(ColumnFamilyStore cfs, TokenMetadata tmd)
+    {
+        RangesAtEndpoint localRanges;
+        if (StorageService.instance.isBootstrapMode()
+        && !StorageService.isReplacingSameAddress()) // When replacing same address, the node marks itself as UN locally
+        {
+            PendingRangeCalculatorService.instance.blockUntilFinished();
+            localRanges = tmd.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
+        }
+        else
+        {
+            // Reason we use the future settled TMD is that if we decommission a node, we want to stream
+            // from that node to the correct location on disk, if we didn't, we would put new files in the wrong places.
+            // We do this to minimize the amount of data we need to move in rebalancedisks once everything settled
+            localRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(tmd.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
+        }
+        return localRanges;
+    }
+
     /**
      * Returns a list of disk boundaries, the result will differ depending on whether vnodes are enabled or not.
      *
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
index e13fc6543362..6636a4783f0d 100644
--- a/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractAllocatorMemtable.java
@@ -144,24 +144,16 @@ public void discard()
 
     public String toString()
     {
-        return String.format("Memtable-%s@%s(%s serialized bytes, %s ops, %.0f%%/%.0f%% of on/off-heap limit)",
+        MemoryUsage usage = Memtable.getMemoryUsage(this);
+        return String.format("Memtable-%s@%s(%s serialized bytes, %s ops, %s)",
                              metadata.get().name,
                              hashCode(),
-                             FBUtilities.prettyPrintMemory(liveDataSize.get()),
-                             currentOperations,
-                             100 * allocator.onHeap().ownershipRatio(),
-                             100 * allocator.offHeap().ownershipRatio());
-    }
-
-    /**
-     * For testing only. Give this memtable too big a size to make it always fail flushing.
-     */
-    @VisibleForTesting
-    public void makeUnflushable()
-    {
-        liveDataSize.addAndGet(1024L * 1024 * 1024 * 1024 * 1024);
+                             FBUtilities.prettyPrintMemory(getLiveDataSize()),
+                             getOperations(),
+                             usage);
     }
 
+    @Override
     public void addMemoryUsageTo(MemoryUsage stats)
     {
         stats.ownershipRatioOnHeap += getAllocator().onHeap().ownershipRatio();
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
index 53e52ed95ebf..51c871a4a0ee 100644
--- a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
@@ -37,7 +37,6 @@
 
 public abstract class AbstractMemtable implements Memtable
 {
-    protected final AtomicLong liveDataSize = new AtomicLong(0);
     protected final AtomicLong currentOperations = new AtomicLong(0);
     protected final ColumnsCollector columnsCollector;
     protected final StatsCollector statsCollector = new StatsCollector();
@@ -57,11 +56,6 @@ public TableMetadata metadata()
         return metadata.get();
     }
 
-    public long getLiveDataSize()
-    {
-        return liveDataSize.get();
-    }
-
     public long getOperations()
     {
         return currentOperations.get();
@@ -125,6 +119,15 @@ public void update(RegularAndStaticColumns columns)
                 update(r);
         }
 
+        public void update(ColumnsCollector other)
+        {
+            for (Map.Entry<ColumnMetadata, AtomicBoolean> v : other.predefined.entrySet())
+                if (v.getValue().get())
+                    update(v.getKey());
+
+            extra.addAll(other.extra);
+        }
+
         private void update(ColumnMetadata definition)
         {
             AtomicBoolean present = predefined.get(definition);
diff --git a/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java b/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
index 9213fab91f85..d35a21922374 100644
--- a/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
+++ b/src/java/org/apache/cassandra/db/memtable/DefaultMemtableFactory.java
@@ -18,13 +18,51 @@
 
 package org.apache.cassandra.db.memtable;
 
+import java.util.concurrent.atomic.AtomicReference;
+
+import org.apache.cassandra.db.commitlog.CommitLogPosition;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.schema.TableMetadataRef;
+
 /**
  * This class exists solely to avoid initialization of the default memtable class.
  * Some tests want to setup table parameters before initializing DatabaseDescriptor -- this allows them to do so.
  */
-public class DefaultMemtableFactory
+public class DefaultMemtableFactory implements Memtable.Factory
 {
-    // We can't use TrieMemtable.FACTORY as that requires DatabaseDescriptor to have been initialized.
-    public static final Memtable.Factory INSTANCE = TrieMemtable::new;
-//    public static final Memtable.Factory INSTANCE = SkipListMemtable::new;
+    @Override
+    public Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadaRef, Memtable.Owner owner)
+    {
+        return TrieMemtable.FACTORY.create(commitLogLowerBound, metadaRef, owner);
+    }
+
+    @Override
+    public boolean writesShouldSkipCommitLog()
+    {
+        return TrieMemtable.FACTORY.writesShouldSkipCommitLog();
+    }
+
+    @Override
+    public boolean writesAreDurable()
+    {
+        return TrieMemtable.FACTORY.writesAreDurable();
+    }
+
+    @Override
+    public boolean streamToMemtable()
+    {
+        return TrieMemtable.FACTORY.streamToMemtable();
+    }
+
+    @Override
+    public boolean streamFromMemtable()
+    {
+        return TrieMemtable.FACTORY.streamFromMemtable();
+    }
+
+    @Override
+    public TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
+    {
+        return TrieMemtable.FACTORY.createMemtableMetrics(metadataRef);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/Memtable.java b/src/java/org/apache/cassandra/db/memtable/Memtable.java
index 8513ec684a4b..6d36e5da0d60 100644
--- a/src/java/org/apache/cassandra/db/memtable/Memtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/Memtable.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.metrics.TableMetrics;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.utils.FBUtilities;
@@ -133,6 +134,15 @@ default boolean streamFromMemtable()
         {
             return false;
         }
+
+        /**
+         * Memtable metrics lifecycle matches table lifecycle. It is the table
+         * that owns the metrics and decides when to release them.
+         */
+        default TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
+        {
+            return null;
+        }
     }
 
     /**
@@ -151,8 +161,9 @@ interface Owner
          * freed by a flush.
          */
         Iterable<Memtable> getIndexMemtables();
-    }
 
+        ShardBoundaries localRangeSplits(int shardCount);
+    }
 
     // Main write and read operations
 
diff --git a/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java b/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
new file mode 100644
index 000000000000..87d283527f97
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.db.memtable;
+
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.dht.Token;
+
+/**
+ * Holds boundaries (tokens) used to map a particular token (so partition key) to a shard id.
+ * In practice, each keyspace has its associated boundaries, see {@link Keyspace}.
+ * <p>
+ * Technically, if we use {@code n} shards, this is a list of {@code n-1} tokens and each token {@code tk} gets assigned
+ * to the shard ID corresponding to the slot of the smallest token in the list that is greater than {@code tk}, or
+ * {@code n-1} if {@code tk} is not smaller than any token in the list.
+ */
+public class ShardBoundaries
+{
+    private static final Token[] EMPTY_TOKEN_ARRAY = new Token[0];
+
+    // Special boundaries that map all tokens to one shard.
+    // These boundaries will be used in either of these cases:
+    // - there is only 1 shard configured
+    // - the default partitioner doesn't support splitting
+    // - the keyspace is local system keyspace
+    public static final ShardBoundaries NONE = new ShardBoundaries(EMPTY_TOKEN_ARRAY, -1);
+
+    private final Token[] boundaries;
+    public final long ringVersion;
+
+    @VisibleForTesting
+    public ShardBoundaries(Token[] boundaries, long ringVersion)
+    {
+        this.boundaries = boundaries;
+        this.ringVersion = ringVersion;
+    }
+
+    public ShardBoundaries(List<Token> boundaries, long ringVersion)
+    {
+        this(boundaries.toArray(EMPTY_TOKEN_ARRAY), ringVersion);
+    }
+
+    /**
+     * Computes the shard to use for the provided token.
+     */
+    public int getShardForToken(Token tk)
+    {
+        for (int i = 0; i < boundaries.length; i++)
+        {
+            if (tk.compareTo(boundaries[i]) < 0)
+                return i;
+        }
+        return boundaries.length;
+    }
+
+    /**
+     * Computes the shard to use for the provided key.
+     */
+    public int getShardForKey(DecoratedKey key)
+    {
+        // Boundaries are missing if the node is not sufficiently initialized yet
+        if (boundaries.length == 0)
+            return 0;
+
+        assert (key.getPartitioner() == DatabaseDescriptor.getPartitioner());
+        return getShardForToken(key.getToken());
+    }
+
+    /**
+     * The number of shards that these boundaries support, that is how many different shard ids {@link #getShardForToken} might
+     * possibly return.
+     *
+     * @return the number of shards supported by these boundaries.
+     */
+    public int shardCount()
+    {
+        return boundaries.length + 1;
+    }
+
+    @Override
+    public String toString()
+    {
+        if (boundaries.length == 0)
+            return "shard 0: (min, max)";
+
+        StringBuilder sb = new StringBuilder();
+        sb.append("shard 0: (min, ").append(boundaries[0]).append(") ");
+        for (int i = 0; i < boundaries.length - 1; i++)
+            sb.append("shard ").append(i+1).append(": (").append(boundaries[i]).append(", ").append(boundaries[i+1]).append("] ");
+        sb.append("shard ").append(boundaries.length).append(": (").append(boundaries[boundaries.length-1]).append(", max)");
+        return sb.toString();
+    }
+
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        ShardBoundaries that = (ShardBoundaries) o;
+
+        return Arrays.equals(boundaries, that.boundaries);
+    }
+
+    public int hashCode()
+    {
+        return Arrays.hashCode(boundaries);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
index cd2b7eaee634..1255716aecfb 100644
--- a/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/SkipListMemtable.java
@@ -22,6 +22,7 @@
 import java.util.Map;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
+import java.util.concurrent.atomic.AtomicLong;
 import java.util.concurrent.atomic.AtomicReference;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -70,6 +71,8 @@ public class SkipListMemtable extends AbstractAllocatorMemtable
     // actually only store DecoratedKey.
     private final ConcurrentNavigableMap<PartitionPosition, AtomicBTreePartition> partitions = new ConcurrentSkipListMap<>();
 
+    private final AtomicLong liveDataSize = new AtomicLong(0);
+
     SkipListMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
     {
         super(commitLogLowerBound, metadataRef, owner);
@@ -98,6 +101,11 @@ public Iterable<Memtable> getIndexMemtables()
             {
                 return Collections.emptyList();
             }
+
+            public ShardBoundaries localRangeSplits(int shardCount)
+            {
+                return null; // not implemented
+            }
         });
     }
 
@@ -106,6 +114,12 @@ protected Factory factory()
         return FACTORY;
     }
 
+    @Override
+    public void addMemoryUsageTo(MemoryUsage stats)
+    {
+        super.addMemoryUsageTo(stats);
+    }
+
     public boolean isClean()
     {
         return partitions.isEmpty();
@@ -338,4 +352,18 @@ public UnfilteredRowIterator next()
             return filter.getUnfilteredRowIterator(columnFilter, entry.getValue());
         }
     }
+
+    public long getLiveDataSize()
+    {
+        return liveDataSize.get();
+    }
+
+    /**
+     * For testing only. Give this memtable too big a size to make it always fail flushing.
+     */
+    @VisibleForTesting
+    public void makeUnflushable()
+    {
+        liveDataSize.addAndGet(1024L * 1024 * 1024 * 1024 * 1024);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
index ebbfe7cd78d6..e0523b219019 100644
--- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
@@ -17,11 +17,14 @@
  */
 package org.apache.cassandra.db.memtable;
 
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
 import java.util.NavigableSet;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.locks.ReentrantLock;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Throwables;
@@ -37,6 +40,7 @@
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.DeletionInfo;
 import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.RegularAndStaticColumns;
 import org.apache.cassandra.db.Slices;
 import org.apache.cassandra.db.commitlog.CommitLogPosition;
 import org.apache.cassandra.db.filter.ClusteringIndexFilter;
@@ -47,6 +51,7 @@
 import org.apache.cassandra.db.partitions.ImmutableBTreePartition;
 import org.apache.cassandra.db.partitions.Partition;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
+import org.apache.cassandra.db.rows.EncodingStats;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.tries.MemtableTrie;
@@ -57,20 +62,27 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.metrics.TableMetrics;
+import org.apache.cassandra.metrics.TrieMemtableMetricsView;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MBeanWrapper;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 import org.apache.cassandra.utils.memory.EnsureOnHeap;
+import org.apache.cassandra.utils.memory.MemtableAllocator;
 
 public class TrieMemtable extends AbstractAllocatorMemtable
 {
     private static final Logger logger = LoggerFactory.getLogger(TrieMemtable.class);
+    public static final String TRIE_MEMTABLE_CONFIG_OBJECT_NAME = "org.apache.cassandra.db:type=TrieMemtableConfig";
 
-    public static final Factory FACTORY = TrieMemtable::new;
+    public static final Factory FACTORY = new TrieMemtable.Factory();
 
     /** Buffer type to use for memtable tries (on- vs off-heap) */
     public static final BufferType BUFFER_TYPE;
+
     static
     {
         switch (DatabaseDescriptor.getMemtableAllocationType())
@@ -86,6 +98,8 @@ public class TrieMemtable extends AbstractAllocatorMemtable
         default:
             throw new AssertionError();
         }
+
+        MBeanWrapper.instance.registerMBean(new TrieMemtableConfig(), TRIE_MEMTABLE_CONFIG_OBJECT_NAME, MBeanWrapper.OnException.LOG);
     }
 
     /** If keys is below this length, we will use a recursive procedure for inserting data in the memtable trie. */
@@ -99,15 +113,62 @@ public class TrieMemtable extends AbstractAllocatorMemtable
     // thread calls cfs.switchMemtableIfCurrent.
     private AtomicBoolean switchRequested = new AtomicBoolean(false);
 
-    // We index the memtable by PartitionPosition only for the purpose of being able
-    // to select key range using Token.KeyBound. However put() ensures that we
-    // actually only store DecoratedKey.
-    private final MemtableTrie<BTreePartitionData> partitions = new MemtableTrie<>(BUFFER_TYPE);
+
+    // The boundaries for the keyspace as they were calculated when the memtable was created.
+    // The boundaries will be NONE for system keyspaces or if StorageService is not yet initialized.
+    // The fact that this is fixed for the duration of the memtable lifetime guarantees we'll always pick the same
+    // shard for a given key, even if we race with the StorageService initialization or with topology changes.
+    private final ShardBoundaries boundaries;
+
+    /**
+     * Core-specific memtable regions. All writes must go through the specific core. The data structures used
+     * are concurrent-read safe, thus reads can be carried out from any thread.
+     */
+    private final MemtableShard[] shards;
+
+    /**
+     * A merged view of the memtable map. Used for partition range queries and flush.
+     * For efficiency we serve single partition requests off the shard which offers more direct MemtableTrie methods.
+     */
+    private final Trie<BTreePartitionData> mergedTrie;
+
+    private final TrieMemtableMetricsView metrics;
+
+    @VisibleForTesting
+    public static final String SHARD_COUNT_PROPERTY = "cassandra.trie.memtable.shard.count";
+
+    private static volatile int SHARD_COUNT = Integer.getInteger(SHARD_COUNT_PROPERTY, FBUtilities.getAvailableProcessors());
 
     // only to be used by init(), to setup the very first memtable for the cfs
     TrieMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
     {
         super(commitLogLowerBound, metadataRef, owner);
+        this.boundaries = owner.localRangeSplits(getShardCount());
+        this.metrics = new TrieMemtableMetricsView(metadataRef.keyspace, metadataRef.name);
+        this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics);
+        this.mergedTrie = makeMergedTrie(shards);
+    }
+
+    private static MemtableShard[] generatePartitionShards(int splits,
+                                                           TableMetadataRef metadata,
+                                                           TrieMemtableMetricsView metrics)
+    {
+        if (splits == 1)
+            return new MemtableShard[] { new MemtableShard(0, metadata, metrics) };
+
+        MemtableShard[] partitionMapContainer = new MemtableShard[splits];
+        for (int i = 0; i < splits; i++)
+            partitionMapContainer[i] = new MemtableShard(i, metadata, metrics);
+
+        return partitionMapContainer;
+    }
+
+    private static Trie<BTreePartitionData> makeMergedTrie(MemtableShard[] shards)
+    {
+        List<Trie<BTreePartitionData>> tries = new ArrayList<>(shards.length);
+        for (MemtableShard shard : shards)
+            tries.add(shard.data);
+        return Trie.mergeDistinct(tries);
     }
 
     protected Factory factory()
@@ -117,7 +178,37 @@ protected Factory factory()
 
     public boolean isClean()
     {
-        return partitions.isEmpty();
+        for (MemtableShard shard : shards)
+            if (!shard.isEmpty())
+                return false;
+        return true;
+    }
+
+    @VisibleForTesting
+    @Override
+    public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPosition> commitLogUpperBound)
+    {
+        super.switchOut(writeBarrier, commitLogUpperBound);
+
+        for (MemtableShard shard : shards)
+            shard.allocator.setDiscarding();
+    }
+
+    @Override
+    public void discard()
+    {
+        super.discard();
+        // metrics here are not thread safe, but I think we can live with that
+        metrics.lastFlushShardDataSizes.reset();
+        for (MemtableShard shard : shards)
+        {
+            metrics.lastFlushShardDataSizes.update(shard.liveDataSize());
+        }
+        for (MemtableShard shard : shards)
+        {
+            shard.allocator.setDiscarded();
+            shard.data.discardBuffers();
+        }
     }
 
     /**
@@ -128,46 +219,86 @@ public boolean isClean()
      */
     public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
     {
-        BTreePartitionUpdater updater = new BTreePartitionUpdater(allocator, opGroup, indexer);
         DecoratedKey key = update.partitionKey();
+        MemtableShard shard = shards[boundaries.getShardForKey(key)];
+        long colUpdateTimeDelta = shard.put(key, update, indexer, opGroup);
 
-        // TODO: Improve locking.
-        synchronized (this)
+        if (shard.data.reachedAllocatedSizeThreshold() && !switchRequested.getAndSet(true))
         {
-            long onHeap = partitions.sizeOnHeap();
-            long offHeap = partitions.sizeOffHeap();
-
-            try
-            {
-                partitions.putSingleton(key, update, updater::mergePartitions, key.getKeyLength() < MAX_RECURSIVE_KEY_LENGTH);
-            }
-            catch (MemtableTrie.SpaceExhaustedException e)
-            {
-                // This should never really happen as a flush would be triggered long before this limit is reached.
-                throw Throwables.propagate(e);
-            }
-
-            allocator.offHeap().adjust(partitions.sizeOffHeap() - offHeap, opGroup);
-            allocator.onHeap().adjust(partitions.sizeOnHeap() - onHeap, opGroup);
-            updateMin(minTimestamp, update.stats().minTimestamp);
-            liveDataSize.addAndGet(updater.dataSize);
-            columnsCollector.update(update.columns());
-            statsCollector.update(update.stats());
-            currentOperations.addAndGet(update.operationCount());
+            logger.info("Scheduling flush due to trie size limit reached.");
+            owner.signalFlushRequired(this, ColumnFamilyStore.FlushReason.MEMTABLE_LIMIT);
         }
 
-        if (partitions.reachedAllocatedSizeThreshold() && !switchRequested.getAndSet(true))
+        return colUpdateTimeDelta;
+    }
+
+    @Override
+    public void addMemoryUsageTo(MemoryUsage stats)
+    {
+        super.addMemoryUsageTo(stats);
+        for (MemtableShard shard : shards)
         {
-            logger.info("Scheduling flush due to trie size limit reached.");
-            owner.signalFlushRequired(this, ColumnFamilyStore.FlushReason.MEMTABLE_LIMIT);
+            stats.ownsOnHeap += shard.allocator.onHeap().owns();
+            stats.ownsOffHeap += shard.allocator.offHeap().owns();
+            stats.ownershipRatioOnHeap += shard.allocator.onHeap().ownershipRatio();
+            stats.ownershipRatioOffHeap += shard.allocator.offHeap().ownershipRatio();
         }
+    }
 
-        return updater.colUpdateTimeDelta;
+    /**
+     * Sums the live data size of every shard. Shards are read one after another with no synchronization across
+     * them, so the total is not an atomic snapshot, but for metrics purposes this should be good enough.
+     */
+    @Override
+    public long getLiveDataSize()
+    {
+        long total = 0L;
+        for (MemtableShard shard : shards)
+            total += shard.liveDataSize();
+        return total;
     }
 
+    @Override
+    public long getOperations()
+    {
+        long total = 0L;
+        for (MemtableShard shard : shards)
+            total += shard.currentOperations();
+        return total;
+    }
+
+    @Override
     public long partitionCount()
     {
-        return partitions.valuesCount();
+        int total = 0;
+        for (MemtableShard shard : shards)
+            total += shard.size();
+        return total;
+    }
+
+    @Override
+    public long getMinTimestamp()
+    {
+        long min = Long.MAX_VALUE;
+        for (MemtableShard shard : shards)
+            min =  Long.min(min, shard.minTimestamp());
+        return min;
+    }
+
+    @Override
+    RegularAndStaticColumns columns()
+    {
+        for (MemtableShard shard : shards)
+            columnsCollector.update(shard.columnsCollector);
+        return columnsCollector.get();
+    }
+
+    @Override
+    EncodingStats encodingStats()
+    {
+        for (MemtableShard shard : shards)
+            statsCollector.update(shard.statsCollector.get());
+        return statsCollector.get();
     }
 
     public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFilter columnFilter, final DataRange dataRange)
@@ -185,7 +316,7 @@ public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFil
         boolean includeStart = isBound || keyRange instanceof IncludingExcludingBounds;
         boolean includeStop = isBound || keyRange instanceof Range;
 
-        Trie<BTreePartitionData> subMap = partitions.subtrie(left, includeStart, right, includeStop);
+        Trie<BTreePartitionData> subMap = mergedTrie.subtrie(left, includeStart, right, includeStop);
 
         return new MemtableUnfilteredPartitionIterator(metadata(),
                                                        allocator.ensureOnHeap(),
@@ -196,7 +327,8 @@ public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFil
 
     public Partition getPartition(DecoratedKey key)
     {
-        BTreePartitionData data = partitions.get(key);
+        int shardIndex = boundaries.getShardForKey(key);
+        BTreePartitionData data = shards[shardIndex].data.get(key);
         if (data != null)
             return createPartition(metadata(), allocator.ensureOnHeap(), key, data);
         else
@@ -219,7 +351,7 @@ private static MemtablePartition getPartitionFromTrieEntry(TableMetadata metadat
 
     public FlushCollection<MemtablePartition> getFlushSet(PartitionPosition from, PartitionPosition to)
     {
-        Trie<BTreePartitionData> toFlush = partitions.subtrie(from, true, to, false);
+        Trie<BTreePartitionData> toFlush = mergedTrie.subtrie(from, true, to, false);
         long keySize = 0;
         int keyCount = 0;
 
@@ -271,6 +403,157 @@ public long partitionKeySize()
         };
     }
 
+    static class MemtableShard
+    {
+        // The following fields are volatile as we have to make sure that when we
+        // collect results from all sub-ranges, the thread accessing the value
+        // is guaranteed to see the changes to the values.
+
+        // The smallest timestamp for all partitions stored in this shard
+        private volatile long minTimestamp = Long.MAX_VALUE;
+
+        private volatile long liveDataSize = 0;
+
+        private volatile long currentOperations = 0;
+
+        private ReentrantLock writeLock = new ReentrantLock();
+
+        // Content map for the given shard. This is implemented as a memtable trie which uses the prefix-free
+        // byte-comparable ByteSource representations of the keys to address the partitions.
+        //
+        // This map is used in a single-producer, multi-consumer fashion: only one thread will insert items but
+        // several threads may read from it and iterate over it. Iterators are created when the first item of
+        // a flow is requested for example, and then used asynchronously when subsequent items are requested.
+        //
+        // Therefore, iterators should not throw ConcurrentModificationExceptions if the underlying map is modified
+        // during iteration, they should provide a weakly consistent view of the map instead.
+        //
+        // Also, this data is backed by memtable memory, when accessing it callers must specify if it can be accessed
+        // unsafely, meaning that the memtable will not be discarded as long as the data is used, or whether the data
+        // should be copied on heap for off-heap allocators.
+        @VisibleForTesting
+        final MemtableTrie<BTreePartitionData> data;
+
+        private final ColumnsCollector columnsCollector;
+
+        private final StatsCollector statsCollector;
+
+        private final MemtableAllocator allocator;
+
+        private final TrieMemtableMetricsView metrics;
+
+        MemtableShard(int shardId, TableMetadataRef metadata, TrieMemtableMetricsView metrics)
+        {
+            this(metadata, AbstractAllocatorMemtable.MEMORY_POOL.newAllocator(), metrics);
+        }
+
+        @VisibleForTesting
+        MemtableShard(TableMetadataRef metadata, MemtableAllocator allocator, TrieMemtableMetricsView metrics)
+        {
+            this.data = new MemtableTrie<>(BUFFER_TYPE);
+            this.columnsCollector = new AbstractMemtable.ColumnsCollector(metadata.get().regularAndStaticColumns());
+            this.statsCollector = new AbstractMemtable.StatsCollector();
+            this.allocator = allocator;
+            this.metrics = metrics;
+        }
+
+        public long put(DecoratedKey key, PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
+        {
+            BTreePartitionUpdater updater = new BTreePartitionUpdater(allocator, opGroup, indexer);
+            boolean locked = writeLock.tryLock();
+            if (locked)
+            {
+                metrics.uncontendedPuts.inc();
+            }
+            else
+            {
+                metrics.contendedPuts.inc();
+                long lockStartTime = System.nanoTime();
+                writeLock.lock();
+                metrics.contentionTime.addNano(System.nanoTime() - lockStartTime);
+            }
+            try
+            {
+                try
+                {
+                    long onHeap = data.sizeOnHeap();
+                    long offHeap = data.sizeOffHeap();
+                    // Use the fast recursive put if we know the key is small enough to not cause a stack overflow.
+                    try
+                    {
+                        data.putSingleton(key,
+                                          update,
+                                          updater::mergePartitions,
+                                          key.getKeyLength() < MAX_RECURSIVE_KEY_LENGTH);
+                    }
+                    catch (MemtableTrie.SpaceExhaustedException e)
+                    {
+                        // This should never really happen as a flush would be triggered long before this limit is reached.
+                        throw Throwables.propagate(e);
+                    }
+                    allocator.offHeap().adjust(data.sizeOffHeap() - offHeap, opGroup);
+                    allocator.onHeap().adjust(data.sizeOnHeap() - onHeap, opGroup);
+                }
+                finally
+                {
+                    updateMinTimestamp(update.stats().minTimestamp);
+                    updateLiveDataSize(updater.dataSize);
+                    updateCurrentOperations(update.operationCount());
+
+                    // TODO: lambov 2021-03-30: check if stats are further optimisable
+                    columnsCollector.update(update.columns());
+                    statsCollector.update(update.stats());
+                }
+            }
+            finally
+            {
+                writeLock.unlock();
+            }
+            return updater.colUpdateTimeDelta;
+        }
+
+        public boolean isEmpty()
+        {
+            return data.isEmpty();
+        }
+
+        private void updateMinTimestamp(long timestamp)
+        {
+            if (timestamp < minTimestamp)
+                minTimestamp = timestamp;
+        }
+
+        void updateLiveDataSize(long size)
+        {
+            liveDataSize = liveDataSize + size;
+        }
+
+        private void updateCurrentOperations(long op)
+        {
+            currentOperations = currentOperations + op;
+        }
+
+        public int size()
+        {
+            return data.valuesCount();
+        }
+
+        long minTimestamp()
+        {
+            return minTimestamp;
+        }
+
+        long liveDataSize()
+        {
+            return liveDataSize;
+        }
+
+        long currentOperations()
+        {
+            return currentOperations;
+        }
+    }
+
     static class MemtableUnfilteredPartitionIterator extends AbstractUnfilteredPartitionIterator implements Memtable.MemtableUnfilteredPartitionIterator
     {
         private final TableMetadata metadata;
@@ -405,4 +688,78 @@ public Iterator<Row> iterator()
             return ensureOnHeap.applyToPartition(super.iterator());
         }
     }
+
+    /** Factory creating {@link TrieMemtable} instances and the metrics handle that goes with them. */
+    static class Factory implements Memtable.Factory
+    {
+        /** Builds a {@link TrieMemtable}, handing the three arguments straight to its constructor. */
+        public Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound,
+                               TableMetadataRef metadataRef,
+                               Owner owner)
+        {
+            return new TrieMemtable(commitLogLowerBound, metadataRef, owner);
+        }
+
+        /**
+         * Creates the trie memtable metrics view for the given table.
+         *
+         * @return a handle that invokes the view's {@code release()} method
+         */
+        @Override
+        public TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
+        {
+            TrieMemtableMetricsView metrics = new TrieMemtableMetricsView(metadataRef.keyspace, metadataRef.name);
+            return metrics::release;
+        }
+    }
+
+    /** MXBean implementation that updates the static {@code SHARD_COUNT}. */
+    private static class TrieMemtableConfig implements TrieMemtableConfigMXBean
+    {
+        /**
+         * Sets the shard count from its textual form.
+         *
+         * @param shardCount {@code "auto"} (case-insensitive) to use the number of available processors, or a
+         *                   positive decimal integer; any other value is logged and leaves the count unchanged
+         */
+        @Override
+        public void setShardCount(String shardCount)
+        {
+            int requested;
+            if ("auto".equalsIgnoreCase(shardCount))
+            {
+                requested = FBUtilities.getAvailableProcessors();
+            }
+            else
+            {
+                try
+                {
+                    // parseInt yields the primitive directly; Integer.valueOf would box only to unbox again
+                    requested = Integer.parseInt(shardCount);
+                }
+                catch (NumberFormatException ex)
+                {
+                    logger.warn("Unable to parse {} as valid value for shard count", shardCount);
+                    return;
+                }
+
+                // NOTE(review): SHARD_COUNT presumably sizes the shard set of newly created memtables, so a value
+                // below 1 is refused instead of being published -- confirm against its consumers.
+                if (requested < 1)
+                {
+                    logger.warn("Ignoring shard count {}: the value must be a positive integer", shardCount);
+                    return;
+                }
+            }
+            SHARD_COUNT = requested;
+            logger.info("Requested setting shard count to {}; set to: {}", shardCount, SHARD_COUNT);
+        }
+    }
+
+    /** Returns the current value of {@code SHARD_COUNT}. */
+    @VisibleForTesting
+    public static int getShardCount()
+    {
+        return SHARD_COUNT;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
new file mode 100644
index 000000000000..6080a31a4056
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+/**
+ * Management interface through which the trie memtable shard count can be changed.
+ */
+public interface TrieMemtableConfigMXBean
+{
+    /**
+     * Requests a new shard count.
+     *
+     * @param numShards the requested shard count in textual form; how it is parsed is up to the implementation
+     */
+    void setShardCount(String numShards);
+}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index f4c8ec9820f8..7aad6a7773b4 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -183,20 +183,69 @@ Block offsets used to identify node types (by comparing them to the node 'pointe
 
     volatile int root;
 
-    final UnsafeBuffer buffer;
+    /*
+     EXPANDABLE DATA STORAGE
+
+     The tries will need more and more space in buffers and content lists as they grow. Instead of using ArrayList-like
+     reallocation with copying, which may be prohibitively expensive for large buffers, we use a sequence of
+     buffers/content arrays that double in size on every expansion.
+
+     For a given address x the index of the buffer can be found with the following calculation:
+        index_of_most_significant_set_bit(x / min_size + 1)
+     (relying on sum (2^i) for i in [0, n-1] == 2^n - 1) which can be performed quickly on modern hardware.
 
-    volatile AtomicReferenceArray<T> contentArray;
+     Finding the offset within the buffer is then
+        x + min - (min << buffer_index)
 
-    MemtableReadTrie(UnsafeBuffer buffer, AtomicReferenceArray<T> contentArray, int root)
+     The allocated space starts at 256 bytes for the buffer and 16 entries for the content list.
+
+     Note that a buffer is not allowed to split 32-byte blocks (code assumes same buffer can be used for all bytes
+     inside the block).
+
+     TODO: implement delay and retry on space hitting the 2GB barrier.
+     */
+
+    static final int BUF_START_SHIFT = 8; // log2 of the first buffer chunk's size
+    static final int BUF_START_SIZE = 1 << BUF_START_SHIFT; // 256 bytes
+
+    static final int CONTENTS_START_SHIFT = 4; // log2 of the first content chunk's length
+    static final int CONTENTS_START_SIZE = 1 << CONTENTS_START_SHIFT; // 16 entries
+
+    final UnsafeBuffer[] buffers; // one slot per buffer chunk, indexed by getChunkIdx
+    final AtomicReferenceArray<T>[] contentArrays; // one slot per content chunk, indexed by getChunkIdx
+
+    MemtableReadTrie(UnsafeBuffer[] buffers, AtomicReferenceArray<T>[] contentArrays, int root)
     {
-        this.buffer = buffer;
-        this.contentArray = contentArray;
+        this.buffers = buffers; // both chunk arrays are kept by reference, not copied
+        this.contentArrays = contentArrays;
         this.root = root;
     }
 
     /*
      Buffer, content list and block management
      */
+    int getChunkIdx(int pos, int minChunkShift, int minChunkSize)
+    {
+        return 31 - minChunkShift - Integer.numberOfLeadingZeros(pos + minChunkSize); // index of the top set bit of (pos + minChunkSize), less minChunkShift
+    }
+
+    int getChunkOffset(int pos, int chunkIndex, int minChunkSize)
+    {
+        return pos + minChunkSize - (minChunkSize << chunkIndex); // chunk i begins at address (minChunkSize << i) - minChunkSize
+    }
+
+    UnsafeBuffer getBuffer(int pos)
+    {
+        // the chunk index doubles as the slot in the buffers array
+        return buffers[getChunkIdx(pos, BUF_START_SHIFT, BUF_START_SIZE)];
+    }
+
+    int getOffset(int pos)
+    {
+        final int chunk = getChunkIdx(pos, BUF_START_SHIFT, BUF_START_SIZE);
+        return getChunkOffset(pos, chunk, BUF_START_SIZE); // position relative to the start of that chunk
+    }
+
 
     /** Pointer offset for a node pointer. */
     int offset(int pos)
@@ -206,19 +255,22 @@ int offset(int pos)
 
     final int getByte(int pos)
     {
-        return buffer.getByte(pos) & 0xFF;
+        return getBuffer(pos).getByte(getOffset(pos)) & 0xFF; // chunk lookup, then in-chunk read; the mask gives the unsigned value
     }
 
     final int getShort(int pos)
     {
-        return buffer.getShort(pos) & 0xFFFF;
+        return getBuffer(pos).getShort(getOffset(pos)) & 0xFFFF; // chunk lookup, then in-chunk read; the mask gives the unsigned value
     }
 
-    final int getInt(int pos) { return buffer.getInt(pos); }
+    final int getInt(int pos) { return getBuffer(pos).getInt(getOffset(pos)); } // chunk lookup, then read at the in-chunk offset
 
     T getContent(int index)
     {
-        return contentArray.get(index);
+        // content indexes use the same chunking scheme as buffer positions, with their own start size
+        final int chunk = getChunkIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE);
+        final int slot = getChunkOffset(index, chunk, CONTENTS_START_SIZE);
+        return contentArrays[chunk].get(slot);
     }
 
     /*
@@ -633,9 +685,9 @@ public BaseNode<L> getUniqueDescendant(L parentLink, TransitionsReceiver receive
             int child = node;
             do
             {
-                final int pointerPos =  chainBlockChildPointer(child);
+                final int pointerPos = chainBlockChildPointer(child);
                 if (receiver != null)
-                    receiver.add(buffer, child, pointerPos - child);
+                    receiver.add(getBuffer(child), getOffset(child), pointerPos - child);
                 // jump directly to the child at the end of the chain
                 child = getInt(pointerPos);
                 // and continue jumping as long as the resulting node is a chain
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index 8e9bc1ca933d..a7d908989548 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -24,13 +24,11 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
-import org.slf4j.LoggerFactory;
-
 import org.agrona.concurrent.UnsafeBuffer;
 import org.apache.cassandra.io.compress.BufferType;
+import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.github.jamm.MemoryLayoutSpecification;
 
@@ -75,18 +73,16 @@ public class MemtableTrie<T> extends MemtableReadTrie<T>
     static
     {
         MemtableTrie<Object> empty = new MemtableTrie<>(BufferType.ON_HEAP);
-        EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty)
-                             - empty.contentArray.length() * MemoryLayoutSpecification.SPEC.getReferenceSize()
-                             - empty.buffer.capacity();
+        EMPTY_SIZE_ON_HEAP = ObjectSizes.measureDeep(empty); // a fresh trie holds no chunks yet, so nothing is subtracted
         empty = new MemtableTrie<>(BufferType.OFF_HEAP);
-        EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty)
-                              - empty.contentArray.length() * MemoryLayoutSpecification.SPEC.getReferenceSize()
-                              - empty.buffer.capacity();
+        EMPTY_SIZE_OFF_HEAP = ObjectSizes.measureDeep(empty); // same measurement for the off-heap variant
     }
 
     public MemtableTrie(BufferType bufferType)
     {
-        super(new UnsafeBuffer(bufferType.allocate(INITIAL_BUFFER_CAPACITY)), new AtomicReferenceArray<>(16), NONE);
+        super(new UnsafeBuffer[31 - BUF_START_SHIFT],  // last one is 1G for a total of ~2G bytes
+              new AtomicReferenceArray[29 - CONTENTS_START_SHIFT],  // takes at least 4 bytes to write pointer to one content -> 4 times smaller than buffers
+              NONE);  // both arrays start with every chunk slot null; chunks are created on demand
         this.bufferType = bufferType;
         assert INITIAL_BUFFER_CAPACITY % BLOCK_SIZE == 0;
     }
@@ -101,40 +97,51 @@ public SpaceExhaustedException()
         }
     }
 
+    final void putInt(int pos, int value)
+    {
+        getBuffer(pos).putInt(getOffset(pos), value); // plain write into the chunk that holds pos
+    }
+
+    final void putIntOrdered(int pos, int value)
+    {
+        getBuffer(pos).putIntOrdered(getOffset(pos), value); // ordered write into the chunk that holds pos
+    }
+
+    final void putIntVolatile(int pos, int value)
+    {
+        getBuffer(pos).putIntVolatile(getOffset(pos), value); // volatile write into the chunk that holds pos
+    }
+
+    final void putShort(int pos, short value)
+    {
+        getBuffer(pos).putShort(getOffset(pos), value); // plain write into the chunk that holds pos
+    }
+
+    final void putShortVolatile(int pos, short value)
+    {
+        getBuffer(pos).putShortVolatile(getOffset(pos), value); // must be the volatile variant: callers rely on it to publish the write
+    }
+
+    final void putByte(int pos, byte value)
+    {
+        getBuffer(pos).putByte(getOffset(pos), value); // plain write into the chunk that holds pos
+    }
+
+
     private int allocateBlock() throws SpaceExhaustedException
     {
         // Note: If this method is modified, please run MemtableTrieTest.testOver1GSize to verify it acts correctly
         // close to the 2G limit.
         int v = allocatedPos;
-        if (buffer.capacity() == v)
+        if (getOffset(v) == 0) // v sits at the first byte of a chunk, so that chunk has not been allocated yet
         {
-            int newSize;
-            if (v >= ALLOCATED_SIZE_THRESHOLD)
-            {
-                // we don't expect to write much after the threshold has been reached
-                // to avoid allocating too much space which will be left unused,
-                // grow by 10% of the limit, rounding up to BLOCK_SIZE
-                newSize = (v + ALLOCATED_SIZE_THRESHOLD / 10 + BLOCK_SIZE - 1) & -BLOCK_SIZE;
-                // If we do this repeatedly and the calculated size grows over 2G, it will overflow and result in a
-                // negative integer. In that case, cap it to a size that can be allocated.
-                if (newSize < 0)
-                {
-                    newSize = 0x7FFFFF00;   // 2G - 256 bytes
-                    if (newSize == allocatedPos)    // already at limit
-                        throw new SpaceExhaustedException();
-                    LoggerFactory.getLogger(getClass()).debug("Growing memtable trie to maximum size {}",
-                                                              FBUtilities.prettyPrintMemory(newSize));
-                }
-                else
-                    LoggerFactory.getLogger(getClass()).debug("Growing memtable trie by 10% over the {} limit to {}",
-                                                              FBUtilities.prettyPrintMemory(ALLOCATED_SIZE_THRESHOLD),
-                                                              FBUtilities.prettyPrintMemory(newSize));
-            } else
-                newSize = v * 2;
-
-            ByteBuffer newBuffer = bufferType.allocate(newSize);
-            buffer.getBytes(0, newBuffer, v);
-            buffer.wrap(newBuffer);
+            int leadBit = getChunkIdx(v, BUF_START_SHIFT, BUF_START_SIZE);
+            if (leadBit >= buffers.length) // every chunk slot is used; getChunkIdx tops out at 31 - BUF_START_SHIFT, so "== 31" never fired
+                throw new SpaceExhaustedException();
+
+            assert buffers[leadBit] == null;
+            ByteBuffer newBuffer = bufferType.allocate(BUF_START_SIZE << leadBit); // chunk sizes double with the chunk index
+            buffers[leadBit] = new UnsafeBuffer(newBuffer);
             // The above does not contain any happens-before enforcing writes, thus at this point the new buffer may be
             // invisible to any concurrent readers. Touching the volatile root pointer (which any new read must go
             // through) enforces a happens-before that makes it visible to all new reads (note: when the write completes
@@ -150,21 +157,37 @@ private int allocateBlock() throws SpaceExhaustedException
     private int addContent(T value)
     {
         int index = contentCount++;
-        if (index == contentArray.length())
+        int leadBit = getChunkIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE);
+        int ofs = getChunkOffset(index, leadBit, CONTENTS_START_SIZE);
+        AtomicReferenceArray<T> array = contentArrays[leadBit];
+        if (array == null)
         {
-            AtomicReferenceArray<T> newContent = new AtomicReferenceArray<>(index * 2);
-            for (int i = 0; i < contentArray.length(); ++i)
-                newContent.lazySet(i, contentArray.get(i));
-            contentArray = newContent;  // This is a volatile set, hence all previous stores must become visible
+            assert ofs == 0; // indexes grow by one, so a missing chunk is first reached at its first slot
+            contentArrays[leadBit] = array = new AtomicReferenceArray<>(CONTENTS_START_SIZE << leadBit); // NOTE(review): plain store (the replaced code used a volatile field) -- confirm publication
         }
-        contentArray.lazySet(index, value); // no need for a volatile set here; at this point the item is not referenced
-                                            // by any node in the trie, and a volatile set will be made to reference it.
+        array.lazySet(ofs, value); // no need for a volatile set here; at this point the item is not referenced
+                                   // by any node in the trie, and a volatile set will be made to reference it.
         return index;
     }
 
     private void setContent(int index, T value)
     {
-        contentArray.set(index, value);
+        // locate the chunk and slot for this index, then store with AtomicReferenceArray.set
+        final int chunk = getChunkIdx(index, CONTENTS_START_SHIFT, CONTENTS_START_SIZE);
+        final int slot = getChunkOffset(index, chunk, CONTENTS_START_SIZE);
+        contentArrays[chunk].set(slot, value);
+    }
+
+    public void discardBuffers()
+    {
+        // On-heap buffers need no cleaning. For any other buffer type, every chunk that
+        // has been allocated (non-null slot) is handed to FileUtils.clean.
+        if (bufferType != BufferType.ON_HEAP)
+        {
+            for (UnsafeBuffer chunk : buffers)
+                if (chunk != null)
+                    FileUtils.clean(chunk.byteBuffer());
+        }
     }
 
     // Write methods
@@ -200,7 +223,7 @@ private int attachChild(int node, int trans, int newChild) throws SpaceExhausted
                 // If this is the last character in a Chain block, we can modify the child in-place
                 if (trans == getByte(node))
                 {
-                    buffer.putIntVolatile(node + 1, newChild);
+                    putIntVolatile(node + 1, newChild);
                     return node;
                 }
                 // else pass through
@@ -219,7 +242,7 @@ private void attachChildToSplit(int node, int trans, int newChild) throws SpaceE
         if (isNull(mid))
         {
             mid = allocateBlock();
-            buffer.putIntOrdered(midPos, mid);  // ordered write to ensure no uncleaned state is visible to readers
+            putIntOrdered(midPos, mid);  // ordered write to ensure no uncleaned state is visible to readers
             // i.e. if block is reused it may need to be set to all zero. if this is not ordered the writes clearing
             // it may execute after this link is created, and readers could see old content.
             // Not currently necessary (we don't reuse), but let's avoid the surprise when we start doing so.
@@ -230,11 +253,11 @@ private void attachChildToSplit(int node, int trans, int newChild) throws SpaceE
         if (isNull(tail))
         {
             tail = allocateBlock();
-            buffer.putIntOrdered(tailPos, tail); // as above
+            putIntOrdered(tailPos, tail); // as above
         }
 
         int childPos = tail + splitNodeChildIndex(trans) * 4;
-        buffer.putIntVolatile(childPos, newChild);
+        putIntVolatile(childPos, newChild);
     }
 
     /**
@@ -250,7 +273,7 @@ private int attachChildToSparse(int node, int trans, int newChild) throws SpaceE
                 break;
             if ((getByte(node + SPARSE_BYTES_OFFSET + i)) == trans)
             {
-                buffer.putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
+                putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
                 return node;
             }
         }
@@ -270,10 +293,10 @@ private int attachChildToSparse(int node, int trans, int newChild) throws SpaceE
         }
 
         // Add a new transition. They are not kept in order, so append it at the first free position.
-        buffer.putByte(node + SPARSE_BYTES_OFFSET + i,  (byte) trans);
+        putByte(node + SPARSE_BYTES_OFFSET + i,  (byte) trans);
 
         // Update order word.
-        int order = buffer.getShort(node + SPARSE_ORDER_OFFSET) & 0xFFFF;
+        int order = getShort(node + SPARSE_ORDER_OFFSET) & 0xFFFF;
         int newOrder = insertInOrderWord(order, i, trans, node + SPARSE_BYTES_OFFSET);
 
         // Sparse nodes have two access modes: via the order word, when listing transitions, or directly to characters
@@ -285,11 +308,11 @@ private int attachChildToSparse(int node, int trans, int newChild) throws SpaceE
         // correct value (see getSparseChild).
 
         // setting child enables reads to start seeing the new branch
-        buffer.putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
+        putIntVolatile(node + SPARSE_CHILDREN_OFFSET + i * 4, newChild);
 
         // some readers will decide whether to check the pointer based on the order word
         // write that volatile to make sure they see the new change too
-        buffer.putShortVolatile(node + SPARSE_ORDER_OFFSET,  (short) newOrder);
+        putShortVolatile(node + SPARSE_ORDER_OFFSET,  (short) newOrder);
         return node;
     }
 
@@ -331,7 +354,7 @@ private void attachChildToSplitNonVolatile(int node, int trans, int newChild) th
         if (isNull(mid))
         {
             mid = allocateBlock();
-            buffer.putInt(midPos, mid);
+            putInt(midPos, mid);
         }
 
         int tailPos = mid + splitNodeTailIndex(trans) * 4;
@@ -339,11 +362,11 @@ private void attachChildToSplitNonVolatile(int node, int trans, int newChild) th
         if (isNull(tail))
         {
             tail = allocateBlock();
-            buffer.putInt(tailPos, tail);
+            putInt(tailPos, tail);
         }
 
         int childPos = tail + splitNodeChildIndex(trans) * 4;
-        buffer.putInt(childPos, newChild);
+        putInt(childPos, newChild);
     }
 
     /**
@@ -397,11 +420,11 @@ private int createSparseNode(int byte1, int child1, int byte2, int child2) throw
         }
 
         int node = allocateBlock() + SPARSE_OFFSET;
-        buffer.putByte(node + SPARSE_BYTES_OFFSET + 0,  (byte) byte1);
-        buffer.putByte(node + SPARSE_BYTES_OFFSET + 1,  (byte) byte2);
-        buffer.putInt(node + SPARSE_CHILDREN_OFFSET + 0 * 4, child1);
-        buffer.putInt(node + SPARSE_CHILDREN_OFFSET + 1 * 4, child2);
-        buffer.putShort(node + SPARSE_ORDER_OFFSET,  (short) (1 * 6 + 0));
+        putByte(node + SPARSE_BYTES_OFFSET + 0,  (byte) byte1);
+        putByte(node + SPARSE_BYTES_OFFSET + 1,  (byte) byte2);
+        putInt(node + SPARSE_CHILDREN_OFFSET + 0 * 4, child1);
+        putInt(node + SPARSE_CHILDREN_OFFSET + 1 * 4, child2);
+        putShort(node + SPARSE_ORDER_OFFSET,  (short) (1 * 6 + 0));
         // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be
         // put in an existing node or the root. That action ends in a happens-before enforcing write.
         return node;
@@ -415,8 +438,8 @@ private int createSparseNode(int byte1, int child1, int byte2, int child2) throw
     private int createNewChainNode(int transitionByte, int newChild) throws SpaceExhaustedException
     {
         int newNode = allocateBlock() + LAST_POINTER_OFFSET - 1;
-        buffer.putByte(newNode, (byte) transitionByte);
-        buffer.putInt(newNode + 1, newChild);
+        putByte(newNode, (byte) transitionByte);
+        putInt(newNode + 1, newChild);
         // Note: this does not need a volatile write as it is a new node, returning a new pointer, which needs to be
         // put in an existing node or the root. That action ends in a happens-before enforcing write.
         return newNode;
@@ -430,7 +453,7 @@ private int expandOrCreateChainNode(int transitionByte, int newChild) throws Spa
         {
             // attach as a new character in child node
             int newNode = newChild - 1;
-            buffer.putByte(newNode, (byte) transitionByte);
+            putByte(newNode, (byte) transitionByte);
             return newNode;
         }
 
@@ -458,17 +481,17 @@ private int createContentNode(int contentIndex, int child, boolean isSafeChain)
             // creating the embedded node may overwrite information that is still needed by concurrent readers or the
             // mutation process itself.
             node = (child & -BLOCK_SIZE) | PREFIX_OFFSET;
-            buffer.putByte(node + PREFIX_FLAGS_OFFSET, (byte) offset);
+            putByte(node + PREFIX_FLAGS_OFFSET, (byte) offset);
         }
         else
         {
             // Full prefix node
             node = allocateBlock() + PREFIX_OFFSET;
-            buffer.putByte(node + PREFIX_FLAGS_OFFSET, (byte) 0xFF);
-            buffer.putInt(node + PREFIX_POINTER_OFFSET, child);
+            putByte(node + PREFIX_FLAGS_OFFSET, (byte) 0xFF);
+            putInt(node + PREFIX_POINTER_OFFSET, child);
         }
 
-        buffer.putInt(node + PREFIX_CONTENT_OFFSET, contentIndex);
+        putInt(node + PREFIX_CONTENT_OFFSET, contentIndex);
         return node;
     }
 
@@ -483,7 +506,7 @@ private int updatePrefixNodeChild(int node, int child) throws SpaceExhaustedExce
         if (!isEmbeddedPrefixNode(node))
         {
             // This attaches the child branch and makes it reachable -- the write must be volatile.
-            buffer.putIntVolatile(node + PREFIX_POINTER_OFFSET, child);
+            putIntVolatile(node + PREFIX_POINTER_OFFSET, child);
             return node;
         }
         else
@@ -615,7 +638,7 @@ private int applyContent(U mutationContent, UpsertTransformer<T, U> transformer)
                     existingContentIndex = getInt(existingPreContentNode + PREFIX_CONTENT_OFFSET);
                 }
 
-                final T existingContent = contentArray.get(existingContentIndex);
+                final T existingContent = getContent(existingContentIndex);
                 T combinedContent = transformer.apply(existingContent, mutationContent);
                 setContent(existingContentIndex, combinedContent);
                 if (combinedContent != null)
@@ -644,7 +667,7 @@ private int applyContent(U mutationContent, UpsertTransformer<T, U> transformer)
 
             // Otherwise modify in place
             if (updatedPostContentNode != existingPostContentNode) // to use volatile write but also ensure we don't corrupt embedded nodes
-                buffer.putIntVolatile(existingPreContentNode + PREFIX_POINTER_OFFSET, updatedPostContentNode);
+                putIntVolatile(existingPreContentNode + PREFIX_POINTER_OFFSET, updatedPostContentNode);
             assert contentIndex == existingContentIndex;
             return existingPreContentNode;
         }
@@ -867,14 +890,14 @@ int advanceAllocatedPos(int wantedPos) throws SpaceExhaustedException
     /** Returns the off heap size of the memtable trie itself, not counting any space taken by referenced content. */
     public long sizeOffHeap()
     {
-        return bufferType == BufferType.ON_HEAP ? 0 : buffer.capacity();
+        return bufferType == BufferType.ON_HEAP ? 0 : allocatedPos; // NOTE(review): allocatedPos, not the summed capacity of the allocated chunks -- confirm intended
     }
 
     /** Returns the on heap size of the memtable trie itself, not counting any space taken by referenced content. */
     public long sizeOnHeap()
     {
         return contentCount * MemoryLayoutSpecification.SPEC.getReferenceSize() +
-               (bufferType == BufferType.ON_HEAP ? buffer.capacity() + EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP);
+               (bufferType == BufferType.ON_HEAP ? allocatedPos + EMPTY_SIZE_ON_HEAP : EMPTY_SIZE_OFF_HEAP); // NOTE(review): allocatedPos, not the summed capacity of the allocated chunks -- confirm intended
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java b/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
new file mode 100644
index 000000000000..7d69fa13ec06
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.metrics;
+
+import com.codahale.metrics.Gauge;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+/**
+ * Accumulates {@code long} samples and exposes their minimum, maximum, average, standard deviation and count
+ * as gauges named {@code namePrefix} followed by "Min", "Max", "Avg", "StdDev" and "NumSamples".
+ *
+ * NOTE(review): the accumulators are plain fields with no synchronisation; presumably updates come from a single
+ * thread and slightly stale gauge reads are acceptable -- confirm against callers.
+ */
+public class MinMaxAvgMetric
+{
+    private final MetricNameFactory factory;
+    private final String namePrefix;
+
+    final Gauge<Long> minGauge;
+    final Gauge<Long> maxGauge;
+    final Gauge<Double> avgGauge;
+    final Gauge<Double> stddevGauge;
+    final Gauge<Integer> numSamplesGauge;
+
+    // Extremes start at the sentinels reset() installs, so update() is correct even without a prior reset().
+    private long min = Long.MAX_VALUE;
+    private long max = Long.MIN_VALUE;
+    private long sum;
+    // Kept as a double: the square of a sample above ~3e9 no longer fits in a long.
+    private double sumSquares;
+    private int numSamples;
+
+    /**
+     * Registers the five gauges with the metrics registry.
+     *
+     * @param factory    produces the metric names the gauges are registered (and later removed) under
+     * @param namePrefix prefix shared by all five gauge names
+     */
+    public MinMaxAvgMetric(MetricNameFactory factory, String namePrefix)
+    {
+        this.factory = factory;
+        this.namePrefix = namePrefix;
+
+        // While no samples are recorded min/max hold sentinels; report 0 then, matching the average and deviation.
+        minGauge = Metrics.register(factory.createMetricName(namePrefix + "Min"), () -> numSamples > 0 ? min : 0L);
+        maxGauge = Metrics.register(factory.createMetricName(namePrefix + "Max"), () -> numSamples > 0 ? max : 0L);
+        avgGauge = Metrics.register(factory.createMetricName(namePrefix + "Avg"), () -> numSamples > 0 ? ((double) sum) / numSamples : 0);
+        stddevGauge = Metrics.register(factory.createMetricName(namePrefix + "StdDev"), () -> stddev());
+        numSamplesGauge = Metrics.register(factory.createMetricName(namePrefix + "NumSamples"), () -> numSamples);
+    }
+
+    /** Removes the five gauges from the metrics registry. */
+    public void release()
+    {
+        Metrics.remove(factory.createMetricName(namePrefix + "Min"));
+        Metrics.remove(factory.createMetricName(namePrefix + "Max"));
+        Metrics.remove(factory.createMetricName(namePrefix + "Avg"));
+        Metrics.remove(factory.createMetricName(namePrefix + "StdDev"));
+        Metrics.remove(factory.createMetricName(namePrefix + "NumSamples"));
+    }
+
+    /** Discards all accumulated samples. */
+    public void reset()
+    {
+        sum = 0;
+        sumSquares = 0;
+        max = Long.MIN_VALUE;
+        min = Long.MAX_VALUE;
+        numSamples = 0;
+    }
+
+    /** Records one sample. */
+    public void update(long value)
+    {
+        max = Math.max(max, value);
+        min = Math.min(min, value);
+        sum += value;
+        sumSquares += (double) value * value;
+        numSamples++;
+    }
+
+    /** Population standard deviation of the recorded samples, or 0 when there are none. */
+    private Double stddev()
+    {
+        if (numSamples <= 0)
+            return 0.0;
+
+        double avgSquare = sumSquares / numSamples;
+        double avg = ((double) sum) / numSamples;
+        // Rounding can leave the difference marginally negative, which would turn the square root into NaN.
+        return Math.sqrt(Math.max(0.0, avgSquare - avg * avg));
+    }
+}
diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java
index 341b7dac6f72..7473d9c1f50d 100644
--- a/src/java/org/apache/cassandra/metrics/TableMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java
@@ -377,11 +377,16 @@ public static long[] addHistogram(long[] sums, long[] buckets)
      *
      * @param cfs ColumnFamilyStore to measure metrics
      */
-    public TableMetrics(final ColumnFamilyStore cfs)
+    public TableMetrics(final ColumnFamilyStore cfs, ReleasableMetric memtableMetrics)
     {
         factory = new TableMetricNameFactory(cfs, "Table");
         aliasFactory = new TableMetricNameFactory(cfs, "ColumnFamily");
 
+        if (memtableMetrics != null)
+        {
+            all.add(memtableMetrics);
+        }
+
         samplers = new EnumMap<>(SamplerType.class);
         topReadPartitionFrequency = new FrequencySampler<ByteBuffer>()
         {
diff --git a/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java b/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
new file mode 100644
index 000000000000..934350399945
--- /dev/null
+++ b/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.metrics;
+
+import com.codahale.metrics.Counter;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+/** Metrics of the trie memtable of a single table, registered under the {@code TrieMemtable} metric type. */
+public class TrieMemtableMetricsView
+{
+    private static final String UNCONTENDED_PUTS = "Uncontended memtable puts";
+    private static final String CONTENDED_PUTS = "Contended memtable puts";
+    private static final String CONTENTION_TIME = "Contention time";
+    private static final String LAST_FLUSH_SHARD_SIZES = "Shard sizes during last flush";
+
+    /** the number of memtable puts that did not need to wait on write lock */
+    public final Counter uncontendedPuts;
+
+    /** the number of memtable puts that needed to wait on write lock */
+    public final Counter contendedPuts;
+
+    /** shard put contention measurements */
+    public final LatencyMetrics contentionTime;
+
+    /** shard sizes distribution */
+    public final MinMaxAvgMetric lastFlushShardDataSizes;
+
+    private final TrieMemtableMetricNameFactory factory;
+
+    public TrieMemtableMetricsView(String keyspace, String table)
+    {
+        factory = new TrieMemtableMetricNameFactory(keyspace, table);
+        uncontendedPuts = Metrics.counter(factory.createMetricName(UNCONTENDED_PUTS));
+        contendedPuts = Metrics.counter(factory.createMetricName(CONTENDED_PUTS));
+        contentionTime = new LatencyMetrics(factory, CONTENTION_TIME);
+        lastFlushShardDataSizes = new MinMaxAvgMetric(factory, LAST_FLUSH_SHARD_SIZES);
+    }
+
+    public void release()
+    {
+        Metrics.remove(factory.createMetricName(UNCONTENDED_PUTS));
+        Metrics.remove(factory.createMetricName(CONTENDED_PUTS));
+        contentionTime.release();
+        lastFlushShardDataSizes.release();
+    }
+
+    static class TrieMemtableMetricNameFactory implements MetricNameFactory
+    {
+        private final String keyspace;
+        private final String table;
+
+        TrieMemtableMetricNameFactory(String keyspace, String table)
+        {
+            this.keyspace = keyspace;
+            this.table = table;
+        }
+
+        @Override
+        public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
+        {
+            String groupName = TableMetrics.class.getPackage().getName();
+            String type = "TrieMemtable";
+            StringBuilder mbeanName = new StringBuilder();
+            mbeanName.append(groupName).append(":");
+            mbeanName.append("type=").append(type);
+            mbeanName.append(",keyspace=").append(keyspace);
+            mbeanName.append(",scope=").append(table);
+            mbeanName.append(",name=").append(metricName);
+
+            return new CassandraMetricsRegistry.MetricName(groupName, type, metricName, keyspace + "." + table, mbeanName.toString());
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/schema/MemtableParams.java b/src/java/org/apache/cassandra/schema/MemtableParams.java
index f9430c22564d..f5ec4d7d0a96 100644
--- a/src/java/org/apache/cassandra/schema/MemtableParams.java
+++ b/src/java/org/apache/cassandra/schema/MemtableParams.java
@@ -60,7 +60,7 @@ public String toString()
     private MemtableParams()
     {
         this.options = ImmutableMap.of();
-        this.factory = DefaultMemtableFactory.INSTANCE;
+        this.factory = new DefaultMemtableFactory();
     }
 
     public MemtableParams(Map<String, String> options)
diff --git a/src/java/org/apache/cassandra/schema/TableMetadata.java b/src/java/org/apache/cassandra/schema/TableMetadata.java
index 60aa2fbb7bd8..c87bdb60b118 100644
--- a/src/java/org/apache/cassandra/schema/TableMetadata.java
+++ b/src/java/org/apache/cassandra/schema/TableMetadata.java
@@ -920,6 +920,13 @@ public Builder flags(Set<Flag> val)
             return this;
         }
 
+        /** Hands {@code val} to {@code params.memtable(...)} and returns this builder so that calls can be chained. */
+        public Builder memtable(MemtableParams val)
+        {
+            params.memtable(val);
+            return this;
+        }
+
         public Builder isCounter(boolean val)
         {
             return flag(Flag.COUNTER, val);
diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
index e7a98d882e29..d6a21097aeb4 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
@@ -26,6 +26,9 @@
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;
+import java.util.function.Supplier;
+import java.util.stream.IntStream;
+import java.util.stream.LongStream;
 
 import com.google.common.base.Throwables;
 
@@ -74,8 +77,8 @@ public enum EndOp
 
     String writeStatement;
 
-    @Param({"1"})
-    int threadCount = 1;
+    @Param({"32"})
+    int threadCount;
 
     ExecutorService executorService;
 
@@ -143,7 +146,7 @@ public Object[] writeArguments(long i)
         return new Object[] { i, i, i };
     }
 
-    public void performWrite(long ofs, long count) throws Throwable
+    public void performWrite(long ofs, int count) throws Throwable
     {
         if (useNet)
         {
@@ -161,16 +164,16 @@ public void performWrite(long ofs, long count) throws Throwable
         }
     }
 
-    public void performWriteSerial(long ofs, long count) throws Throwable
+    public void performWriteSerial(long ofs, int count) throws Throwable
     {
         for (long i = ofs; i < ofs + count; ++i)
             execute(writeStatement, writeArguments(i));
     }
 
-    public void performWriteThreads(long ofs, long count) throws Throwable
+    public void performWriteThreads(long ofs, int count) throws Throwable
     {
         List<Future<Integer>> futures = new ArrayList<>();
-        for (long i = 0; i < count; ++i)
+        for (int i = 0; i < count; ++i)
         {
             long pos = ofs + i;
             futures.add(executorService.submit(() ->
@@ -186,19 +189,19 @@ public void performWriteThreads(long ofs, long count) throws Throwable
                 }
             }));
         }
-        long done = 0;
+        int done = 0;
         for (Future<Integer> f : futures)
             done += f.get();
         assert count == done;
     }
 
-    public void performWriteSerialNet(long ofs, long count) throws Throwable
+    public void performWriteSerialNet(long ofs, int count) throws Throwable
     {
         for (long i = ofs; i < ofs + count; ++i)
             sessionNet().execute(writeStatement, writeArguments(i));
     }
 
-    public void performWriteThreadsNet(long ofs, long count) throws Throwable
+    public void performWriteThreadsNet(long ofs, int count) throws Throwable
     {
         List<Future<Integer>> futures = new ArrayList<>();
         for (long i = 0; i < count; ++i)
diff --git a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
index 3ccac4034202..d7ecd37e81f1 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/operations/AlterTest.java
@@ -20,11 +20,13 @@
 import java.util.List;
 import java.util.UUID;
 
+import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
 import com.datastax.driver.core.PreparedStatement;
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
@@ -53,6 +55,15 @@
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class AlterTest extends CQLTester
 {
+    @BeforeClass
+    public static void setUpClass()
+    {
+        // AlterTest runs with the Murmur3 partitioner but injects OrderPreservingPartitioner.StringToken into
+        // TokenMetadata; pin the trie memtable to one shard first (presumably so no shard boundaries are computed).
+        System.setProperty(TrieMemtable.SHARD_COUNT_PROPERTY, "1");
+        CQLTester.setUpClass(); // this method hides the static CQLTester.setUpClass, so invoke it explicitly
+    }
+
     @Test
     public void testDropColumnAsPreparedStatement() throws Throwable
     {
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
index e433c39523af..74cd402bb8bc 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogTest.java
@@ -30,6 +30,7 @@
 import java.util.zip.Checksum;
 
 import com.google.common.base.Throwables;
+import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Iterables;
 import com.google.common.io.Files;
 
@@ -40,10 +41,11 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.memtable.AbstractAllocatorMemtable;
 import org.apache.cassandra.db.memtable.Memtable;
+import org.apache.cassandra.db.memtable.SkipListMemtable;
 import org.apache.cassandra.io.compress.ZstdCompressor;
 import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.MemtableParams;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -131,22 +133,25 @@ public static void beforeClass() throws ConfigurationException
         SchemaLoader.prepareServer();
         StorageService.instance.getTokenMetadata().updateHostId(UUID.randomUUID(), FBUtilities.getBroadcastAddressAndPort());
 
+        MemtableParams skipListMemtable = MemtableParams.fromMap(ImmutableMap.of("class", "SkipListMemtable"));
+
         TableMetadata.Builder custom =
             TableMetadata.builder(KEYSPACE1, CUSTOM1)
                          .addPartitionKeyColumn("k", IntegerType.instance)
                          .addClusteringColumn("c1", MapType.getInstance(UTF8Type.instance, UTF8Type.instance, false))
                          .addClusteringColumn("c2", SetType.getInstance(UTF8Type.instance, false))
-                         .addStaticColumn("s", IntegerType.instance);
+                         .addStaticColumn("s", IntegerType.instance)
+                         .memtable(skipListMemtable);
 
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance).memtable(skipListMemtable),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance).memtable(skipListMemtable),
                                     custom);
         SchemaLoader.createKeyspace(KEYSPACE2,
                                     KeyspaceParams.simpleTransient(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance));
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD1, 0, AsciiType.instance, BytesType.instance).memtable(skipListMemtable),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, STANDARD2, 0, AsciiType.instance, BytesType.instance).memtable(skipListMemtable));
         CompactionManager.instance.disableAutoCompaction();
 
         testKiller = new KillerForTests();
@@ -944,7 +949,7 @@ public void testOutOfOrderFlushRecovery(BiConsumer<ColumnFamilyStore, Memtable>
 
             Memtable current = cfs.getTracker().getView().getCurrentMemtable();
             if (i == 2)
-                ((AbstractAllocatorMemtable) current).makeUnflushable();
+                ((SkipListMemtable) current).makeUnflushable();
 
             flushAction.accept(cfs, current);
         }
diff --git a/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
new file mode 100644
index 000000000000..ef472fef8251
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.memtable;
+
+import java.io.IOException;
+import javax.management.Attribute;
+import javax.management.AttributeNotFoundException;
+import javax.management.InstanceNotFoundException;
+import javax.management.InvalidAttributeValueException;
+import javax.management.MBeanException;
+import javax.management.MalformedObjectNameException;
+import javax.management.ObjectName;
+import javax.management.ReflectionException;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.db.memtable.TrieMemtable.TRIE_MEMTABLE_CONFIG_OBJECT_NAME;
+import static org.junit.Assert.assertEquals;
+
+/**
+ * Checks that the trie memtable shard count can be changed through the {@code ShardCount} attribute of the
+ * MBean registered under {@link TrieMemtable#TRIE_MEMTABLE_CONFIG_OBJECT_NAME}.
+ */
+public class TrieMemtableConfigTest extends CQLTester
+{
+    /** Starts the JMX server and opens the MBean server connection that the tests write attributes through. */
+    @BeforeClass
+    public static void setup() throws Exception
+    {
+        startJMXServer();
+        createMBeanServerConnection();
+    }
+
+    /** Setting {@code ShardCount} to the string "7" over JMX must make {@code getShardCount()} report 7. */
+    @Test
+    public void testShardCountSetByJMX() throws MalformedObjectNameException, ReflectionException, AttributeNotFoundException, InstanceNotFoundException, MBeanException, IOException, InvalidAttributeValueException
+    {
+        jmxConnection.setAttribute(new ObjectName(TRIE_MEMTABLE_CONFIG_OBJECT_NAME), new Attribute("ShardCount", "7"));
+        assertEquals(7, TrieMemtable.getShardCount());
+    }
+
+    /** Setting {@code ShardCount} to "auto" must make {@code getShardCount()} report the available processor count. */
+    @Test
+    public void testAutoShardCount() throws MalformedObjectNameException, ReflectionException, AttributeNotFoundException, InstanceNotFoundException, MBeanException, IOException, InvalidAttributeValueException
+    {
+        jmxConnection.setAttribute(new ObjectName(TRIE_MEMTABLE_CONFIG_OBJECT_NAME), new Attribute("ShardCount", "auto"));
+        assertEquals(FBUtilities.getAvailableProcessors(), TrieMemtable.getShardCount());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
index 2dec4746d3a9..480cfb2e0e8f 100644
--- a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
@@ -284,7 +284,7 @@ public void testDirect()
                             .mapToInt(src1 -> ByteComparable.length(src1, VERSION))
                             .sum();
         long ts = ObjectSizes.measureDeep(content);
-        long onh = ObjectSizes.measureDeep(trie.contentArray);
+        long onh = ObjectSizes.measureDeep(trie.contentArrays);
         System.out.format("Trie size on heap %,d off heap %,d measured %,d keys %,d treemap %,d\n",
                           trie.sizeOnHeap(), trie.sizeOffHeap(), onh, keysize, ts);
         System.out.format("per entry on heap %.2f off heap %.2f measured %.2f keys %.2f treemap %.2f\n",
diff --git a/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java b/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java
new file mode 100644
index 000000000000..de6aa8cd8ba9
--- /dev/null
+++ b/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java
@@ -0,0 +1,215 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.metrics;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.datastax.driver.core.Cluster;
+import com.datastax.driver.core.Session;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.config.OverrideConfigurationLoader;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.Schema;
+import org.apache.cassandra.service.EmbeddedCassandraService;
+import org.apache.cassandra.service.StorageService;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMRules;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+import static org.hamcrest.Matchers.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
+
+/** Exercises the metrics exposed through {@link TrieMemtableMetricsView} against an embedded Cassandra node. */
+@RunWith(BMUnitRunner.class)
+public class TrieMemtableMetricsTest extends SchemaLoader
+{
+    private static final int NUM_SHARDS = 13;
+
+    private static final Logger logger = LoggerFactory.getLogger(TrieMemtableMetricsTest.class);
+    private static Session session;
+
+    private static final String KEYSPACE = "triememtable";
+    private static final String TABLE = "metricstest";
+
+    @BeforeClass
+    public static void loadSchema() throws ConfigurationException
+    {
+        // shadow superclass method; setup() calls it directly after tinkering with the Config
+    }
+
+    @BeforeClass
+    public static void setup() throws ConfigurationException, IOException
+    {
+        OverrideConfigurationLoader.override(config -> config.partitioner = "Murmur3Partitioner");
+        System.setProperty("cassandra.trie.memtable.shard.count", "" + NUM_SHARDS);
+
+        SchemaLoader.loadSchema();
+
+        Schema.instance.clear();
+
+        EmbeddedCassandraService cassandra = new EmbeddedCassandraService();
+        cassandra.start();
+
+        Cluster cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build();
+        session = cluster.connect();
+
+        session.execute(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };", KEYSPACE));
+    }
+
+    private ColumnFamilyStore recreateTable()
+    {
+        return recreateTable(TABLE);
+    }
+
+    private ColumnFamilyStore recreateTable(String table)
+    {
+        session.execute(String.format("DROP TABLE IF EXISTS %s.%s", KEYSPACE, table));
+        session.execute(String.format("CREATE TABLE IF NOT EXISTS %s.%s (id int, val1 text, val2 text, PRIMARY KEY(id, val1)) WITH MEMTABLE = {'class':'TrieMemtable'};", KEYSPACE, table));
+        return ColumnFamilyStore.getIfExists(KEYSPACE, table);
+    }
+
+    @Test
+    public void testRegularStatementsAreCounted()
+    {
+        ColumnFamilyStore cfs = recreateTable();
+        TrieMemtableMetricsView metrics = getMemtableMetrics(cfs);
+        assertEquals(0, metrics.contendedPuts.getCount());
+        assertEquals(0, metrics.uncontendedPuts.getCount());
+
+        for (int i = 0; i < 10; i++)
+        {
+            session.execute(String.format("INSERT INTO %s.%s (id, val1, val2) VALUES (%d, '%s', '%s')", KEYSPACE, TABLE, i, "val" + i, "val" + i));
+        }
+
+        long allPuts = metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount();
+        assertEquals(10, allPuts);
+    }
+
+    @Test
+    public void testFlushRelatedMetrics() throws IOException, ExecutionException, InterruptedException
+    {
+        ColumnFamilyStore cfs = recreateTable();
+        TrieMemtableMetricsView metrics = getMemtableMetrics(cfs);
+
+        StorageService.instance.forceKeyspaceFlush(KEYSPACE, TABLE);
+        assertEquals(0, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+
+        writeAndFlush(10);
+        assertEquals(10, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+
+        // verify that metrics survive flush / memtable switching
+        writeAndFlush(10);
+        assertEquals(20, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+        assertEquals(metrics.lastFlushShardDataSizes.toString(), NUM_SHARDS, (int) metrics.lastFlushShardDataSizes.numSamplesGauge.getValue());
+    }
+
+    @Test
+    @BMRules(rules = { @BMRule(name = "Delay memtable update",
+    targetClass = "MemtableTrie",
+    targetMethod = "putSingleton",
+    action = "java.lang.Thread.sleep(10)")})
+    public void testContentionMetrics() throws IOException, ExecutionException, InterruptedException
+    {
+        ColumnFamilyStore cfs = recreateTable();
+        TrieMemtableMetricsView metrics = getMemtableMetrics(cfs);
+        assertEquals(0, (int) metrics.lastFlushShardDataSizes.numSamplesGauge.getValue());
+
+        StorageService.instance.forceKeyspaceFlush(KEYSPACE, TABLE);
+
+        writeAndFlush(100);
+
+        ByteArrayOutputStream stream = new ByteArrayOutputStream();
+        metrics.contentionTime.latency.getSnapshot().dump(stream);
+
+        assertEquals(100, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+        assertThat(metrics.contendedPuts.getCount(), greaterThan(0L));
+        assertThat(metrics.contentionTime.totalLatency.getCount(), greaterThan(0L));
+    }
+
+    @Test
+    public void testMetricsCleanupOnDrop()
+    {
+        String tableName = TABLE + "_metrics_cleanup";
+        CassandraMetricsRegistry registry = CassandraMetricsRegistry.Metrics;
+        Supplier<Stream<String>> metrics = () -> registry.getNames().stream().filter(m -> m.contains(tableName));
+
+        // no metrics before creating
+        assertEquals(0, metrics.get().count());
+
+        recreateTable(tableName);
+        // some metrics
+        assertTrue(metrics.get().count() > 0);
+
+        session.execute(String.format("DROP TABLE IF EXISTS %s.%s", KEYSPACE, tableName));
+        // no metrics after drop
+        assertEquals(metrics.get().collect(Collectors.joining(",")), 0, metrics.get().count());
+    }
+
+    private TrieMemtableMetricsView getMemtableMetrics(ColumnFamilyStore cfs)
+    {
+        return new TrieMemtableMetricsView(cfs.keyspace.getName(), cfs.name);
+    }
+
+    private void writeAndFlush(int rows) throws IOException, ExecutionException, InterruptedException
+    {
+        logger.info("writing {} rows", rows);
+        Future<?>[] futures = new Future<?>[rows];
+        for (int i = 0; i < rows; i++)
+        {
+            logger.info("writing {} row", i);
+            futures[i] = session.executeAsync(String.format("INSERT INTO %s.%s (id, val1, val2) VALUES (%d, '%s', '%s')", KEYSPACE, TABLE, i, "val" + i, "val" + i));
+        }
+        for (int i = 0; i < rows; i++)
+        {
+            futures[i].get();
+            logger.info("writing {} row completed", i);
+        }
+        logger.info("forcing flush");
+        StorageService.instance.forceKeyspaceFlush(KEYSPACE, TABLE);
+        logger.info("table flushed");
+    }
+
+    @AfterClass
+    public static void teardown()
+    {
+        // closing the Cluster also closes the Session created from it
+        if (session != null)
+            session.getCluster().close();
+    }
+}

From 7e081acbd6c4fd95a1263bc9840b523ac388a2d2 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Thu, 6 May 2021 08:51:37 +0200
Subject: [PATCH 077/151] STAR-539: Extra fix, missed running bigtable tests
 again (#143)

The problem is that we cannot test serialization / deserialization this way with null clusterings, because they get truncated during serialization. Thus, limit serialization / deserialization tests to the cases where there is no null in clusterings.

(cherry picked from commit 0a5fb20341a7daf41e8f4dbcb1b34d547d1b8715)
(cherry picked from commit c8964c64efda766a77e445500c24e22198e9c7ac)
---
 .../io/sstable/metadata/MetadataSerializerTest.java    | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
index 3ec8c7ebcae6..d290e7093f23 100644
--- a/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/metadata/MetadataSerializerTest.java
@@ -65,7 +65,7 @@ public static void initDD()
     @Test
     public void testSerialization() throws IOException
     {
-        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
+        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata(false);
 
         MetadataSerializer serializer = new MetadataSerializer();
         File statsFile = serialize(originalMetadata, serializer, SSTableFormat.Type.current().info.getLatestVersion());
@@ -85,7 +85,7 @@ public void testSerialization() throws IOException
     @Test
     public void testHistogramSterilization() throws IOException
     {
-        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
+        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata(false);
 
         // Modify the histograms to overflow:
         StatsMetadata originalStats = (StatsMetadata) originalMetadata.get(MetadataType.STATS);
@@ -121,7 +121,7 @@ public File serialize(Map<MetadataType, MetadataComponent> metadata, MetadataSer
         return statsFile;
     }
 
-    public Map<MetadataType, MetadataComponent> constructMetadata()
+    public Map<MetadataType, MetadataComponent> constructMetadata(boolean withNulls)
     {
         CommitLogPosition club = new CommitLogPosition(11L, 12);
         CommitLogPosition cllb = new CommitLogPosition(9L, 12);
@@ -133,7 +133,7 @@ public Map<MetadataType, MetadataComponent> constructMetadata()
         String partitioner = RandomPartitioner.class.getCanonicalName();
         double bfFpChance = 0.1;
         collector.updateClusteringValues(Clustering.make(UTF8Type.instance.decompose("abc"), Int32Type.instance.decompose(123)));
-        collector.updateClusteringValues(Clustering.make(UTF8Type.instance.decompose("cba"), null));
+        collector.updateClusteringValues(Clustering.make(UTF8Type.instance.decompose("cba"), withNulls ? null : Int32Type.instance.decompose(234)));
         return collector.finalizeMetadata(partitioner, bfFpChance, 0, null, false, SerializationHeader.make(cfm, Collections.emptyList()));
     }
 
@@ -197,7 +197,7 @@ public void testCVersions() throws Throwable
 
     public void testOldReadsNew(String oldV, String newV) throws IOException
     {
-        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata();
+        Map<MetadataType, MetadataComponent> originalMetadata = constructMetadata(true);
 
         MetadataSerializer serializer = new MetadataSerializer();
         // Write metadata in two minor formats.

From 4c29fc971b999d2356458b7481540cc1ba0905a4 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 6 May 2021 16:07:51 +0100
Subject: [PATCH 078/151] STAR-549: Only register index components that exist
 on disk with SSTable

(cherry picked from commit ee59af85f5a4bba09892fe28ddf7112a0921d781)
(cherry picked from commit 3212a6674cc61c3f0c60cd22f6a754ed45f104f7)
---
 .../sai/StorageAttachedIndexBuilder.java      |  2 +-
 .../index/sai/StorageAttachedIndexGroup.java  | 10 +++
 .../sai/functional/GroupComponentsTest.java   | 85 +++++++++++++++++++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 test/unit/org/apache/cassandra/index/sai/functional/GroupComponentsTest.java

diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
index 28d05eeb8539..91cbbdfcaa6c 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexBuilder.java
@@ -324,7 +324,7 @@ private void completeSSTable(SSTableFlushObserver indexWriter,
         }
 
         // register custom index components into existing sstables
-        sstable.registerComponents(group.getComponents(existing), tracker);
+        sstable.registerComponents(group.getLiveComponents(sstable, existing), tracker);
         Set<StorageAttachedIndex> incomplete = group.onSSTableChanged(Collections.emptyList(), Collections.singleton(sstable), existing, false, false);
 
         if (!incomplete.isEmpty())
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
index e8b08c24ae3a..d6702c0753d4 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
@@ -232,6 +232,16 @@ static Set<Component> getComponents(Collection<StorageAttachedIndex> indices)
         return components;
     }
 
+    // This differs from getComponents in that it only returns index components that exist on disk.
+    // It avoids errors being logged by the SSTable.readTOC method when we have an empty index.
+    @VisibleForTesting
+    public static Set<Component> getLiveComponents(SSTableReader sstable, Collection<StorageAttachedIndex> indices)
+    {
+        return getComponents(indices).stream()
+                                     .filter(component -> sstable.descriptor.fileFor(component).exists())
+                                     .collect(Collectors.toSet());
+    }
+
     @Override
     public void handleNotification(INotification notification, Object sender)
     {
diff --git a/test/unit/org/apache/cassandra/index/sai/functional/GroupComponentsTest.java b/test/unit/org/apache/cassandra/index/sai/functional/GroupComponentsTest.java
new file mode 100644
index 000000000000..6af4b7dc8eeb
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/functional/GroupComponentsTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.functional;
+
+import java.util.Collection;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.index.sai.SAITester;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.StorageAttachedIndexGroup;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+import static org.junit.Assert.assertEquals;
+
+public class GroupComponentsTest extends SAITester
+{
+    @Test
+    public void getLiveComponentsForEmptyIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int primary key, value int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        execute("INSERT INTO %s (pk) VALUES (1)");
+        flush();
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+        Set<SSTableReader> sstables = cfs.getLiveSSTables();
+
+        assertEquals(1, sstables.size());
+
+        Set<Component> components = group.getLiveComponents(sstables.iterator().next(), getIndexesFromGroup(group));
+
+        // 4 per-sstable components and column complete marker
+        assertEquals(5, components.size());
+    }
+
+    @Test
+    public void getLiveComponentsForPopulatedIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int primary key, value int)");
+        createIndex("CREATE CUSTOM INDEX ON %s(value) USING 'StorageAttachedIndex'");
+        waitForIndexQueryable();
+        execute("INSERT INTO %s (pk, value) VALUES (1, 1)");
+        flush();
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        StorageAttachedIndexGroup group = StorageAttachedIndexGroup.getIndexGroup(cfs);
+        Set<SSTableReader> sstables = cfs.getLiveSSTables();
+
+        assertEquals(1, sstables.size());
+
+        Set<Component> components = group.getLiveComponents(sstables.iterator().next(), getIndexesFromGroup(group));
+
+        // 4 per-sstable components and 4 column components
+        assertEquals(8, components.size());
+    }
+
+    private Collection<StorageAttachedIndex> getIndexesFromGroup(StorageAttachedIndexGroup group)
+    {
+        return group.getIndexes().stream().map(index -> (StorageAttachedIndex)index).collect(Collectors.toList());
+    }
+}

From ce958c67252e8c5fc75f5f59438b8d31209462dc Mon Sep 17 00:00:00 2001
From: Jakub Zytka <jakub.zytka@datastax.com>
Date: Fri, 7 May 2021 11:05:50 +0200
Subject: [PATCH 079/151] STAR-553: update last flush metrics in TrieMemtable +
 cosmetics

(cherry picked from commit 5081fa14af122ebac82bfff997be7ff483bfe2d3)
(cherry picked from commit a3280794778883852d1b84fe3123b609af8a9160)
---
 .../cassandra/db/memtable/TrieMemtable.java   | 20 ++++++++--------
 .../db/memtable/TrieMemtableConfigMXBean.java |  2 ++
 .../cassandra/metrics/MinMaxAvgMetric.java    | 20 ++++++++--------
 .../metrics/TrieMemtableMetricsView.java      | 23 ++++++++++++++++++-
 .../db/memtable/TrieMemtableConfigTest.java   |  4 ++--
 .../metrics/TrieMemtableMetricsTest.java      | 11 ++++-----
 6 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
index e0523b219019..d11eb0dc0649 100644
--- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
@@ -143,10 +143,11 @@ public class TrieMemtable extends AbstractAllocatorMemtable
     TrieMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
     {
         super(commitLogLowerBound, metadataRef, owner);
-        this.boundaries = owner.localRangeSplits(getShardCount());
-        this.metrics = new TrieMemtableMetricsView(metadataRef.keyspace, metadataRef.name);
+        this.boundaries = owner.localRangeSplits(SHARD_COUNT);
+        this.metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name);
         this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics);
         this.mergedTrie = makeMergedTrie(shards);
+        logger.debug("Created memtable with {} shards", this.shards.length);
     }
 
     private static MemtableShard[] generatePartitionShards(int splits,
@@ -701,12 +702,13 @@ public Memtable create(AtomicReference<CommitLogPosition> commitLogLowerBound,
         @Override
         public TableMetrics.ReleasableMetric createMemtableMetrics(TableMetadataRef metadataRef)
         {
-            TrieMemtableMetricsView metrics = new TrieMemtableMetricsView(metadataRef.keyspace, metadataRef.name);
+            TrieMemtableMetricsView metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name);
             return metrics::release;
         }
     }
 
-    private static class TrieMemtableConfig implements TrieMemtableConfigMXBean
+    @VisibleForTesting
+    public static class TrieMemtableConfig implements TrieMemtableConfigMXBean
     {
         @Override
         public void setShardCount(String shardCount)
@@ -729,11 +731,11 @@ public void setShardCount(String shardCount)
             }
             logger.info("Requested setting shard count to {}; set to: {}", shardCount, SHARD_COUNT);
         }
-    }
 
-    @VisibleForTesting
-    public static int getShardCount()
-    {
-        return SHARD_COUNT;
+        @Override
+        public String getShardCount()
+        {
+            return "" + SHARD_COUNT;
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
index 6080a31a4056..85123666ad91 100644
--- a/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtableConfigMXBean.java
@@ -21,4 +21,6 @@
 public interface TrieMemtableConfigMXBean
 {
     public void setShardCount(String numShards);
+
+    public String getShardCount();
 }
diff --git a/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java b/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
index 7d69fa13ec06..541bf7df295e 100644
--- a/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
+++ b/src/java/org/apache/cassandra/metrics/MinMaxAvgMetric.java
@@ -44,20 +44,20 @@ public MinMaxAvgMetric(MetricNameFactory factory, String namePrefix)
         this.factory = factory;
         this.namePrefix = namePrefix;
 
-        minGauge = Metrics.register(factory.createMetricName(namePrefix + "Min"), () -> min);
-        maxGauge = Metrics.register(factory.createMetricName(namePrefix + "Max"), () -> max);
-        avgGauge = Metrics.register(factory.createMetricName(namePrefix + "Avg"), () -> numSamples > 0 ? ((double) sum) / numSamples : 0);
-        stddevGauge = Metrics.register(factory.createMetricName(namePrefix + "StdDev"), () -> stddev());
-        numSamplesGauge = Metrics.register(factory.createMetricName(namePrefix + "NumSamples"), () -> numSamples);
+        minGauge = Metrics.register(factory.createMetricName(namePrefix + " Min"), () -> min);
+        maxGauge = Metrics.register(factory.createMetricName(namePrefix + " Max"), () -> max);
+        avgGauge = Metrics.register(factory.createMetricName(namePrefix + " Avg"), () -> numSamples > 0 ? ((double) sum) / numSamples : 0);
+        stddevGauge = Metrics.register(factory.createMetricName(namePrefix + " StdDev"), () -> stddev());
+        numSamplesGauge = Metrics.register(factory.createMetricName(namePrefix + " NumSamples"), () -> numSamples);
     }
 
     public void release()
     {
-        Metrics.remove(factory.createMetricName(namePrefix + "Min"));
-        Metrics.remove(factory.createMetricName(namePrefix + "Max"));
-        Metrics.remove(factory.createMetricName(namePrefix + "Avg"));
-        Metrics.remove(factory.createMetricName(namePrefix + "StdDev"));
-        Metrics.remove(factory.createMetricName(namePrefix + "NumSamples"));
+        Metrics.remove(factory.createMetricName(namePrefix + " Min"));
+        Metrics.remove(factory.createMetricName(namePrefix + " Max"));
+        Metrics.remove(factory.createMetricName(namePrefix + " Avg"));
+        Metrics.remove(factory.createMetricName(namePrefix + " StdDev"));
+        Metrics.remove(factory.createMetricName(namePrefix + " NumSamples"));
     }
 
     public void reset()
diff --git a/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java b/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
index 934350399945..d0e49ff5eeea 100644
--- a/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
+++ b/src/java/org/apache/cassandra/metrics/TrieMemtableMetricsView.java
@@ -18,6 +18,9 @@
 
 package org.apache.cassandra.metrics;
 
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
+
 import com.codahale.metrics.Counter;
 
 import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
@@ -29,6 +32,8 @@ public class TrieMemtableMetricsView
     private static final String CONTENTION_TIME = "Contention time";
     private static final String LAST_FLUSH_SHARD_SIZES = "Shard sizes during last flush";
 
+    private static final Map<String, TrieMemtableMetricsView> perTableMetrics = new ConcurrentHashMap<>();
+
     // the number of memtable puts that did not need to wait on write lock
     public final Counter uncontendedPuts;
 
@@ -42,9 +47,18 @@ public class TrieMemtableMetricsView
     public final MinMaxAvgMetric lastFlushShardDataSizes;
 
     private final TrieMemtableMetricNameFactory factory;
+    private final String keyspace;
+    private final String table;
+
+    public static TrieMemtableMetricsView getOrCreate(String keyspace, String table)
+    {
+        return perTableMetrics.computeIfAbsent(getKey(keyspace, table), k -> new TrieMemtableMetricsView(keyspace, table));
+    }
 
-    public TrieMemtableMetricsView(String keyspace, String table)
+    private TrieMemtableMetricsView(String keyspace, String table)
     {
+        this.keyspace = keyspace;
+        this.table = table;
         factory = new TrieMemtableMetricNameFactory(keyspace, table);
         
         uncontendedPuts = Metrics.counter(factory.createMetricName(UNCONTENDED_PUTS));
@@ -55,6 +69,8 @@ public TrieMemtableMetricsView(String keyspace, String table)
 
     public void release()
     {
+        perTableMetrics.remove(getKey(keyspace, table));
+
         Metrics.remove(factory.createMetricName(UNCONTENDED_PUTS));
         Metrics.remove(factory.createMetricName(CONTENDED_PUTS));
         contentionTime.release();
@@ -87,4 +103,9 @@ public CassandraMetricsRegistry.MetricName createMetricName(String metricName)
             return new CassandraMetricsRegistry.MetricName(groupName, type, metricName, keyspace + "." + table, mbeanName.toString());
         }
     }
+
+    private static String getKey(String keyspace, String table)
+    {
+        return keyspace + "." + table;
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
index ef472fef8251..6474d9eb2635 100644
--- a/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
+++ b/test/unit/org/apache/cassandra/db/memtable/TrieMemtableConfigTest.java
@@ -50,13 +50,13 @@ public static void setup() throws Exception
     public void testShardCountSetByJMX() throws MalformedObjectNameException, ReflectionException, AttributeNotFoundException, InstanceNotFoundException, MBeanException, IOException, InvalidAttributeValueException
     {
         jmxConnection.setAttribute(new ObjectName(TRIE_MEMTABLE_CONFIG_OBJECT_NAME), new Attribute("ShardCount", "7"));
-        assertEquals(7, TrieMemtable.getShardCount());
+        assertEquals(7, Integer.parseInt(new TrieMemtable.TrieMemtableConfig().getShardCount()));
     }
 
     @Test
     public void testAutoShardCount() throws MalformedObjectNameException, ReflectionException, AttributeNotFoundException, InstanceNotFoundException, MBeanException, IOException, InvalidAttributeValueException
     {
         jmxConnection.setAttribute(new ObjectName(TRIE_MEMTABLE_CONFIG_OBJECT_NAME), new Attribute("ShardCount", "auto"));
-        assertEquals(FBUtilities.getAvailableProcessors(), TrieMemtable.getShardCount());
+        assertEquals(FBUtilities.getAvailableProcessors(), Integer.parseInt(new TrieMemtable.TrieMemtableConfig().getShardCount()));
     }
 }
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java b/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java
index de6aa8cd8ba9..31423b7a1aec 100644
--- a/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java
+++ b/test/unit/org/apache/cassandra/metrics/TrieMemtableMetricsTest.java
@@ -39,9 +39,6 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.OverrideConfigurationLoader;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.dht.Murmur3Partitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Splitter;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.service.EmbeddedCassandraService;
@@ -134,11 +131,13 @@ public void testFlushRelatedMetrics() throws IOException, ExecutionException, In
 
         writeAndFlush(10);
         assertEquals(10, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+        Long maxShardSize = metrics.lastFlushShardDataSizes.maxGauge.getValue();
 
         // verify that metrics survive flush / memtable switching
-        writeAndFlush(10);
-        assertEquals(20, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
+        writeAndFlush(100);
+        assertEquals(110, metrics.contendedPuts.getCount() + metrics.uncontendedPuts.getCount());
         assertEquals(metrics.lastFlushShardDataSizes.toString(), NUM_SHARDS, (int) metrics.lastFlushShardDataSizes.numSamplesGauge.getValue());
+        assertThat(metrics.lastFlushShardDataSizes.maxGauge.getValue(), greaterThan(maxShardSize));
     }
 
     @Test
@@ -185,7 +184,7 @@ public void testMetricsCleanupOnDrop()
 
     private TrieMemtableMetricsView getMemtableMetrics(ColumnFamilyStore cfs)
     {
-        return new TrieMemtableMetricsView(cfs.keyspace.getName(), cfs.name);
+        return TrieMemtableMetricsView.getOrCreate(cfs.keyspace.getName(), cfs.name);
     }
 
     private void writeAndFlush(int rows) throws IOException, ExecutionException, InterruptedException

From 2492b498d13076fd78bc96d4d09295e359b982bd Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Mon, 10 May 2021 17:07:24 +0200
Subject: [PATCH 080/151] STAR-530 SchemaKeyspace is compatible with
 riptano/6.8-cndb (#133)

* STAR-530 SchemaKeyspace is compatible with riptano/6.8-cndb

Add the missing columns to SchemaKeyspace, which makes its sstables
compatible with riptano/6.8-cndb (used by CNDB). This makes a
full-stop upgrade from Astra serverless to dse-db possible.

Additionally, fix reading of commit logs coming from riptano/6.8-cndb.
Commit log version 680 is read with the MessagingService.VERSION_40
messaging version; in practice the choice does not matter, as the
version is never used in deserialization.

* STAR-530 add UCS mock

Add temp UCS mock to support full-stop upgrades from CNDB to
dse-db. The mock should be removed by UCS feature PR.

(cherry picked from commit 03ad8bab34840501cacf1bcee2028ab2a62cbcdf)
(cherry picked from commit a4598b5346c9fbf3f46a1ffe96e004df412f3310)
---
 .../db/commitlog/CommitLogDescriptor.java     |  3 ++
 .../compaction/UnifiedCompactionStrategy.java | 34 +++++++++++++++++++
 .../cassandra/schema/SchemaKeyspace.java      |  9 +++++
 3 files changed, 46 insertions(+)
 create mode 100644 src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java

diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
index 700f12a242ca..9120a3925f04 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
@@ -60,6 +60,8 @@ public class CommitLogDescriptor
     // We don't support anything pre-3.0
     public static final int VERSION_30 = 6;
     public static final int VERSION_40 = 7;
+    // For compatibility with CNDB
+    public static final int VERSION_DSE_68 = 680;
 
     /**
      * Increment this number if there is a changes in the commit log disc layout or MessagingVersion changes.
@@ -208,6 +210,7 @@ public int getMessagingVersion()
             case VERSION_30:
                 return MessagingService.VERSION_30;
             case VERSION_40:
+            case VERSION_DSE_68:
                 return MessagingService.VERSION_40;
             default:
                 throw new IllegalStateException("Unknown commitlog version " + version);
diff --git a/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java
new file mode 100644
index 000000000000..98a2bbda0577
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Map;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+
+/**
+ * TODO: STAR-13 will introduce true UCS, please remove me.
+ */
+public class UnifiedCompactionStrategy extends SizeTieredCompactionStrategy
+{
+    public UnifiedCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    {
+        super(cfs, options);
+    }
+}
diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
index fecfbbc95b56..ae33f5330397 100644
--- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
+++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java
@@ -110,6 +110,7 @@ private SchemaKeyspace()
               + "keyspace_name text,"
               + "durable_writes boolean,"
               + "replication frozen<map<text, text>>,"
+              + "graph_engine text,"
               + "PRIMARY KEY ((keyspace_name)))");
 
     private static final TableMetadata Tables =
@@ -134,6 +135,7 @@ private SchemaKeyspace()
               + "max_index_interval int,"
               + "memtable_flush_period_in_ms int,"
               + "min_index_interval int,"
+              + "nodesync frozen<map<text, text>>,"
               + "read_repair_chance double," // no longer used, left for drivers' sake
               + "speculative_retry text,"
               + "additional_write_policy text,"
@@ -153,6 +155,7 @@ private SchemaKeyspace()
               + "kind text,"
               + "position int,"
               + "type text,"
+              + "required_for_liveness boolean,"
               + "PRIMARY KEY ((keyspace_name), table_name, column_name))");
 
     private static final TableMetadata DroppedColumns =
@@ -202,10 +205,12 @@ private SchemaKeyspace()
               + "max_index_interval int,"
               + "memtable_flush_period_in_ms int,"
               + "min_index_interval int,"
+              + "nodesync frozen<map<text, text>>,"
               + "read_repair_chance double," // no longer used, left for drivers' sake
               + "speculative_retry text,"
               + "additional_write_policy text,"
               + "cdc boolean,"
+              + "version int,"
               + "read_repair text,"
               + "PRIMARY KEY ((keyspace_name), view_name))");
 
@@ -242,6 +247,9 @@ private SchemaKeyspace()
               + "language text,"
               + "return_type text,"
               + "called_on_null_input boolean,"
+              + "deterministic boolean,"
+              + "monotonic boolean,"
+              + "monotonic_on frozen<list<text>>,"
               + "PRIMARY KEY ((keyspace_name), function_name, argument_types))");
 
     private static final TableMetadata Aggregates =
@@ -256,6 +264,7 @@ private SchemaKeyspace()
               + "return_type text,"
               + "state_func text,"
               + "state_type text,"
+              + "deterministic boolean,"
               + "PRIMARY KEY ((keyspace_name), aggregate_name, argument_types))");
 
     private static final List<TableMetadata> ALL_TABLE_METADATA =

From c5804252d67971a6860124bfe904987ce4830af4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20=C5=BBytka?= <jakub.zytka@datastax.com>
Date: Tue, 11 May 2021 10:01:12 +0200
Subject: [PATCH 081/151] STAR-555: fix commitLogUpperBound initialization in
 AbstractMemtableWithCommitlog

Fix commitLogUpperBound initialization in AbstractMemtableWithCommitlog so that it is always available when
writeBarrier gets initialized

Harden Memtable API so that it is apparent that getting commitLogUpperBound is valid only after it is fully established

Co-authored-by: Daniel Jatnieks <jatnieks@pobox.com>
(cherry picked from commit e2fb3a1532967de62dc2885e14551f356793196b)
(cherry picked from commit a4482ec698472b17f2549d7c6785d311a3be224a)
---
 .../cassandra/db/ColumnFamilyStore.java       |  2 +-
 .../db/memtable/AbstractMemtable.java         |  5 +-
 .../AbstractMemtableWithCommitlog.java        |  8 ++-
 .../cassandra/db/memtable/Flushing.java       |  2 +-
 .../cassandra/db/memtable/Memtable.java       |  2 +-
 .../db/memtable/PersistentMemoryMemtable.java |  4 +-
 .../db/commitlog/CommitLogCQLTest.java        | 71 +++++++++++++++++++
 7 files changed, 83 insertions(+), 11 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index 06846d0eae69..fb523d2dfc30 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -994,7 +994,7 @@ public CommitLogPosition call()
             // If a flush errored out but the error was ignored, make sure we don't discard the commit log.
             if (flushFailure == null && mainMemtable != null)
             {
-                commitLogUpperBound = mainMemtable.getCommitLogUpperBound();
+                commitLogUpperBound = mainMemtable.getFinalCommitLogUpperBound();
                 CommitLog.instance.discardCompletedSegments(metadata.id, mainMemtable.getCommitLogLowerBound(), commitLogUpperBound);
             }
 
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
index 51c871a4a0ee..b6d4a13b2031 100644
--- a/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtable.java
@@ -33,7 +33,6 @@
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableMetadataRef;
-import org.apache.cassandra.utils.concurrent.OpOrder;
 
 public abstract class AbstractMemtable implements Memtable
 {
@@ -193,9 +192,9 @@ public CommitLogPosition commitLogLowerBound()
             return AbstractMemtable.this.getCommitLogLowerBound();
         }
 
-        public CommitLogPosition commitLogUpperBound()
+        public LastCommitLogPosition commitLogUpperBound()
         {
-            return AbstractMemtable.this.getCommitLogUpperBound();
+            return AbstractMemtable.this.getFinalCommitLogUpperBound();
         }
 
         public EncodingStats encodingStats()
diff --git a/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java b/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
index 55f08a42a152..a782fece2be2 100644
--- a/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
+++ b/src/java/org/apache/cassandra/db/memtable/AbstractMemtableWithCommitlog.java
@@ -57,8 +57,8 @@ public void switchOut(OpOrder.Barrier writeBarrier, AtomicReference<CommitLogPos
         // This can prepare the memtable data for deletion; it will still be used while the flush is proceeding.
         // A setDiscarded call will follow.
         assert this.writeBarrier == null;
-        this.writeBarrier = writeBarrier;
         this.commitLogUpperBound = commitLogUpperBound;
+        this.writeBarrier = writeBarrier;
     }
 
     public void discard()
@@ -112,9 +112,11 @@ public CommitLogPosition getCommitLogLowerBound()
         return commitLogLowerBound.get();
     }
 
-    public CommitLogPosition getCommitLogUpperBound()
+    public LastCommitLogPosition getFinalCommitLogUpperBound()
     {
-        return commitLogUpperBound.get();
+        assert commitLogUpperBound != null : "Commit log upper bound should be set before flushing";
+        assert commitLogUpperBound.get() instanceof LastCommitLogPosition : "Commit log upper bound has not been sealed yet? " + commitLogUpperBound.get();
+        return (LastCommitLogPosition) commitLogUpperBound.get();
     }
 
     public boolean mayContainDataBefore(CommitLogPosition position)
diff --git a/src/java/org/apache/cassandra/db/memtable/Flushing.java b/src/java/org/apache/cassandra/db/memtable/Flushing.java
index 55677b0552c8..8332a1bc9d1e 100644
--- a/src/java/org/apache/cassandra/db/memtable/Flushing.java
+++ b/src/java/org/apache/cassandra/db/memtable/Flushing.java
@@ -226,7 +226,7 @@ private void writeSortedContents()
                             logger.info("Completed flushing {} ({}) for commitlog position {}",
                                         writer.getFilename(),
                                         FBUtilities.prettyPrintMemory(bytesFlushed),
-                                        toFlush.memtable().getCommitLogUpperBound());
+                                        toFlush.memtable().getFinalCommitLogUpperBound());
                             // Update the metrics
                             metrics.bytesFlushed.inc(bytesFlushed);
                         }
diff --git a/src/java/org/apache/cassandra/db/memtable/Memtable.java b/src/java/org/apache/cassandra/db/memtable/Memtable.java
index 6d36e5da0d60..eb92743089cd 100644
--- a/src/java/org/apache/cassandra/db/memtable/Memtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/Memtable.java
@@ -382,7 +382,7 @@ default boolean isEmpty()
     CommitLogPosition getCommitLogLowerBound();
 
     /** The commit log position at the time that this memtable was switched out */
-    CommitLogPosition getCommitLogUpperBound();
+    LastCommitLogPosition getFinalCommitLogUpperBound();
 
     /** True if the memtable can contain any data that was written before the given commit log position */
     boolean mayContainDataBefore(CommitLogPosition position);
diff --git a/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java b/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
index eaa235218b95..c44f1bd12449 100644
--- a/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/PersistentMemoryMemtable.java
@@ -180,10 +180,10 @@ public CommitLogPosition getCommitLogLowerBound()
         return CommitLogPosition.NONE;
     }
 
-    public CommitLogPosition getCommitLogUpperBound()
+    public LastCommitLogPosition getFinalCommitLogUpperBound()
     {
         // We don't maintain commit log positions
-        return CommitLogPosition.NONE;
+        return new LastCommitLogPosition(CommitLogPosition.NONE);
     }
 
     public boolean isClean()
diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
index 531ca87bee27..af0626bbcfd3 100644
--- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogCQLTest.java
@@ -18,12 +18,21 @@
 
 package org.apache.cassandra.db.commitlog;
 
+import java.io.ByteArrayOutputStream;
+import java.io.PrintStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
+import org.junit.Assert;
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.ColumnFamilyStore;
 
 public class CommitLogCQLTest extends CQLTester
@@ -56,4 +65,66 @@ public void testTruncateSegmentDiscard() throws Throwable
         active.retainAll(CommitLog.instance.segmentManager.getActiveSegments());
         assert active.isEmpty();
     }
+    
+    @Test
+    public void testSwitchMemtable() throws Throwable
+    {
+        createTable("CREATE TABLE %s (idx INT, data TEXT, PRIMARY KEY(idx));");
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        
+        AtomicBoolean shouldStop = new AtomicBoolean(false);
+        ConcurrentLinkedQueue<Throwable> errors = new ConcurrentLinkedQueue<>();
+        List<Thread> threads = new ArrayList<>();
+        
+        final String stmt = String.format("INSERT INTO %s.%s (idx, data) VALUES(?, ?)", KEYSPACE, currentTable());
+        for (int i = 0; i < 10; ++i)
+        {
+            threads.add(new Thread("" + i)
+            {
+                public void run()
+                {
+                    try
+                    {
+                        while (!shouldStop.get())
+                        {
+                            for (int i = 0; i < 50; i++)
+                            {
+                                QueryProcessor.executeInternal(stmt, i, Integer.toString(i));
+                            }
+                            cfs.dumpMemtable(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+                        }
+                    }
+                    catch (Throwable t)
+                    {
+                        errors.add(t);
+                        shouldStop.set(true);
+                    }
+                }
+            });
+        }
+
+        for (Thread t : threads)
+            t.start();
+
+        Thread.sleep(15_000);
+        shouldStop.set(true);
+        
+        for (Thread t : threads)
+            t.join();
+
+        if (!errors.isEmpty())
+        {
+            StringBuilder sb = new StringBuilder();
+            for(Throwable error: errors)
+            {
+                sb.append("Got error during memtable switching:\n");
+                sb.append(error.getMessage() + "\n");
+                ByteArrayOutputStream os = new ByteArrayOutputStream();
+                PrintStream ps = new PrintStream(os);
+                error.printStackTrace(ps);
+                sb.append(os.toString("UTF-8"));
+            }
+            Assert.fail(sb.toString());
+        }
+    }
 }

From 1f5b4246b2332bf8266db8e2d9ec3bc4d310b62e Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 13 May 2021 07:49:50 -0700
Subject: [PATCH 082/151] STAR-511 Port guardrails: ignore table properties
 (#134)

* STAR-511: Disallow obsolete table properties

Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>

* STAR-511: Add new guardrail that ignores table properties

Co-authored-by: Sylvain Lebresne <lebresne@gmail.com>

Co-authored-by: Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com>
Co-authored-by: Sylvain Lebresne <lebresne@gmail.com>
(cherry picked from commit 117bbc92990c7db571320e294990406087bd2fea)
(cherry picked from commit 301b93dca10ce9a65bc5b42443a6a527d7110d3a)
---
 .../cassandra/cql3/UntypedResultSet.java      |  14 +-
 .../cql3/statements/PropertyDefinitions.java  |   5 +
 .../cql3/statements/SelectStatement.java      |  32 ++--
 .../schema/AlterTableStatement.java           |   1 +
 .../statements/schema/AlterViewStatement.java |   1 +
 .../schema/CreateTableStatement.java          |   3 +-
 .../schema/CreateViewStatement.java           |   3 +-
 .../statements/schema/TableAttributes.java    |   6 +
 .../apache/cassandra/db/ConsistencyLevel.java |   8 +-
 .../cassandra/db/MultiRangeReadCommand.java   |   4 +-
 .../db/PartitionRangeReadCommand.java         |   4 +-
 .../org/apache/cassandra/db/ReadQuery.java    |   8 +-
 .../db/SinglePartitionReadCommand.java        |   8 +-
 .../cassandra/db/VirtualTableReadQuery.java   |   4 +-
 .../VirtualTableSinglePartitionReadQuery.java |   8 +-
 .../cassandra/guardrails/Guardrail.java       | 177 +++++++++++++-----
 .../cassandra/guardrails/Guardrails.java      |   6 +
 .../guardrails/GuardrailsConfig.java          |  24 ++-
 .../cassandra/service/StorageProxy.java       |  32 ++--
 .../service/pager/AbstractQueryPager.java     |   6 +-
 .../service/pager/AggregationQueryPager.java  |  26 +--
 .../service/pager/MultiPartitionPager.java    |  14 +-
 .../cassandra/service/pager/QueryPager.java   |   8 +-
 .../guardrails/GuardrailsOnTableTest.java     |  33 +++-
 .../cassandra/guardrails/GuardrailsTest.java  |  30 +++
 ...guardrails: ignore table properties (#134) |  46 +++++
 26 files changed, 363 insertions(+), 148 deletions(-)
 create mode 100644 update-history/STAR-801/21-301b93dca1 STAR-511 Port guardrails: ignore table properties (#134)

diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
index f4ac99fdc4a4..767a13976dce 100644
--- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
@@ -31,7 +31,7 @@
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.pager.QueryPager;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.AbstractIterator;
@@ -62,11 +62,11 @@ public static UntypedResultSet create(SelectStatement select, QueryPager pager,
     @VisibleForTesting
     public static UntypedResultSet create(SelectStatement select,
                                           ConsistencyLevel cl,
-                                          ClientState clientState,
+                                          QueryState queryState,
                                           QueryPager pager,
                                           int pageSize)
     {
-        return new FromDistributedPager(select, cl, clientState, pager, pageSize);
+        return new FromDistributedPager(select, cl, queryState, pager, pageSize);
     }
 
     public boolean isEmpty()
@@ -227,19 +227,19 @@ private static class FromDistributedPager extends UntypedResultSet
     {
         private final SelectStatement select;
         private final ConsistencyLevel cl;
-        private final ClientState clientState;
+        private final QueryState queryState;
         private final QueryPager pager;
         private final int pageSize;
         private final List<ColumnSpecification> metadata;
 
         private FromDistributedPager(SelectStatement select,
                                      ConsistencyLevel cl,
-                                     ClientState clientState,
+                                     QueryState queryState,
                                      QueryPager pager, int pageSize)
         {
             this.select = select;
             this.cl = cl;
-            this.clientState = clientState;
+            this.queryState = queryState;
             this.pager = pager;
             this.pageSize = pageSize;
             this.metadata = select.getResultMetadata().requestNames();
@@ -269,7 +269,7 @@ protected Row computeNext()
                         if (pager.isExhausted())
                             return endOfData();
 
-                        try (PartitionIterator iter = pager.fetchPage(pageSize, cl, clientState, System.nanoTime()))
+                        try (PartitionIterator iter = pager.fetchPage(pageSize, cl, queryState, System.nanoTime()))
                         {
                             currentPage = select.process(iter, nowInSec).rows.iterator();
                         }
diff --git a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
index d1d8acd6feed..0123aec79145 100644
--- a/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
+++ b/src/java/org/apache/cassandra/cql3/statements/PropertyDefinitions.java
@@ -151,4 +151,9 @@ public Set<String> updatedProperties()
     {
         return properties.keySet();
     }
+
+    public void removeProperty(String name)
+    {
+        properties.remove(name);
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index 5a344eb42eb2..efd9ad4a76c2 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -230,34 +230,34 @@ public void validate(QueryState state) throws InvalidRequestException
         // Nothing to do, all validation has been done by RawStatement.prepare()
     }
 
-    private void validateQueryOptions(QueryOptions options)
+    private void validateQueryOptions(QueryState queryState, QueryOptions options)
     {
         if (SchemaConstants.isUserKeyspace(table.keyspace))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(options.getConsistency());
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(options.getConsistency(), queryState);
     }
 
-    public ResultMessage.Rows execute(QueryState state, QueryOptions options, long queryStartNanoTime)
+    public ResultMessage.Rows execute(QueryState queryState, QueryOptions options, long queryStartNanoTime)
     {
         ConsistencyLevel cl = options.getConsistency();
         checkNotNull(cl, "Invalid empty consistency level");
 
         cl.validateForRead();
-        validateQueryOptions(options);
+        validateQueryOptions(queryState, options);
 
-        int nowInSec = options.getNowInSeconds(state);
+        int nowInSec = options.getNowInSeconds(queryState);
         int userLimit = getLimit(options);
         int userPerPartitionLimit = getPerPartitionLimit(options);
         int pageSize = options.getPageSize();
 
         Selectors selectors = selection.newSelectors(options);
-        ReadQuery query = getQuery(state, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
+        ReadQuery query = getQuery(queryState, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
 
         if (aggregationSpec == null && (pageSize <= 0 || (query.limits().count() <= pageSize)))
-            return execute(query, options, state, selectors, nowInSec, userLimit, queryStartNanoTime);
+            return execute(query, options, queryState, selectors, nowInSec, userLimit, queryStartNanoTime);
 
         QueryPager pager = getPager(query, options);
 
-        return execute(Pager.forDistributedQuery(pager, cl, state.getClientState()),
+        return execute(Pager.forDistributedQuery(pager, cl, queryState),
                        options,
                        selectors,
                        pageSize,
@@ -297,12 +297,12 @@ public ReadQuery getQuery(QueryState queryState,
 
     private ResultMessage.Rows execute(ReadQuery query,
                                        QueryOptions options,
-                                       QueryState state,
+                                       QueryState queryState,
                                        Selectors selectors,
                                        int nowInSec,
                                        int userLimit, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
     {
-        try (PartitionIterator data = query.execute(options.getConsistency(), state.getClientState(), queryStartNanoTime))
+        try (PartitionIterator data = query.execute(options.getConsistency(), queryState, queryStartNanoTime))
         {
             return processResults(data, options, selectors, nowInSec, userLimit);
         }
@@ -329,9 +329,9 @@ public static Pager forInternalQuery(QueryPager pager, ReadExecutionController e
             return new InternalPager(pager, executionController);
         }
 
-        public static Pager forDistributedQuery(QueryPager pager, ConsistencyLevel consistency, ClientState clientState)
+        public static Pager forDistributedQuery(QueryPager pager, ConsistencyLevel consistency, QueryState queryState)
         {
-            return new NormalPager(pager, consistency, clientState);
+            return new NormalPager(pager, consistency, queryState);
         }
 
         public boolean isExhausted()
@@ -349,18 +349,18 @@ public PagingState state()
         public static class NormalPager extends Pager
         {
             private final ConsistencyLevel consistency;
-            private final ClientState clientState;
+            private final QueryState queryState;
 
-            private NormalPager(QueryPager pager, ConsistencyLevel consistency, ClientState clientState)
+            private NormalPager(QueryPager pager, ConsistencyLevel consistency, QueryState queryState)
             {
                 super(pager);
                 this.consistency = consistency;
-                this.clientState = clientState;
+                this.queryState = queryState;
             }
 
             public PartitionIterator fetchPage(int pageSize, long queryStartNanoTime)
             {
-                return pager.fetchPage(pageSize, consistency, clientState, queryStartNanoTime);
+                return pager.fetchPage(pageSize, consistency, queryState, queryStartNanoTime);
             }
         }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
index 5312c63fdcc5..33669b1a572a 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
@@ -417,6 +417,7 @@ public void validate(QueryState state)
             super.validate(state);
 
             Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+            Guardrails.ignoredTableProperties.maybeIgnoreAndWarn(attrs.updatedProperties(), attrs::removeProperty, state);
         }
 
         public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
index 8632739ad2f4..5247c6bbdab4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterViewStatement.java
@@ -66,6 +66,7 @@ public Keyspaces apply(Keyspaces schema)
         attrs.validate();
 
         Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+        Guardrails.ignoredTableProperties.maybeIgnoreAndWarn(attrs.updatedProperties(), attrs::removeProperty, state);
 
         TableParams params = attrs.asAlteredTableParams(view.metadata.params);
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index 96500e8b78ab..fe5c04a44e96 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -103,8 +103,9 @@ public void validate(QueryState state)
         // require the server to be initialized, so skipping them if it isn't.
         if (Guardrails.ready())
         {
-            // Guardrail on table properties
+            // Guardrails on table properties
             Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+            Guardrails.ignoredTableProperties.maybeIgnoreAndWarn(attrs.updatedProperties(), attrs::removeProperty, state);
 
             // Guardrail on columns per table
             Guardrails.columnsPerTable.guard(rawColumns.size(), tableName, state);
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
index 3a5691c195b9..157b343ac3f9 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
@@ -151,8 +151,9 @@ public Keyspaces apply(Keyspaces schema)
         if (table.isView())
             throw ire("Materialized views cannot be created against other materialized views");
 
-        // Guardrail on table properties
+        // Guardrails on table properties
         Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
+        Guardrails.ignoredTableProperties.maybeIgnoreAndWarn(attrs.updatedProperties(), attrs::removeProperty, state);
 
         // guardrails to limit number of mvs per table.
         Set<ViewMetadata> baseTableViews = StreamSupport.stream(keyspace.views.forTable(table.id).spliterator(), false)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 9d6c66e49c65..23901efaf640 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -21,6 +21,7 @@
 import java.util.Set;
 
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
 
 import org.apache.cassandra.cql3.statements.PropertyDefinitions;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -112,6 +113,11 @@ boolean hasUnsupportedDseCompaction()
         }
     }
 
+    public static Set<String> allKeywords()
+    {
+        return Sets.union(validKeywords, obsoleteKeywords);
+    }
+
     private TableParams build(TableParams.Builder builder)
     {
         if (hasOption(Option.BLOOM_FILTER_FP_CHANCE))
diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index 2f02665c3058..fa5960568e9d 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
@@ -230,10 +230,10 @@ public void validateForWrite(String keyspaceName, QueryState queryState) throws
     }
 
     // This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
-    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName) throws InvalidRequestException
+    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
         if (SchemaConstants.isUserKeyspace(keyspaceName))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this);
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         switch (this)
         {
@@ -246,10 +246,10 @@ public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy
         }
     }
 
-    public void validateForCas(String keyspaceName) throws InvalidRequestException
+    public void validateForCas(String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
         if (SchemaConstants.isUserKeyspace(keyspaceName))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this);
+            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         if (!isSerialConsistency())
             throw new InvalidRequestException("Invalid consistency for conditional update. Must be one of SERIAL or LOCAL_SERIAL");
diff --git a/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
index 271f8e29d6c6..644fd5c3e897 100644
--- a/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
@@ -43,7 +43,7 @@
 import org.apache.cassandra.metrics.TableMetrics;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.service.pager.QueryPager;
 import org.apache.cassandra.service.reads.ReadCallback;
@@ -335,7 +335,7 @@ protected void appendCQLWhereClause(StringBuilder sb)
     }
 
     @Override
-    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+    public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
     {
         // MultiRangeReadCommand should only be executed on the replica side
         throw new UnsupportedOperationException();
diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
index dc95e787cb11..0dac5b8d6c79 100644
--- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
@@ -41,7 +41,7 @@
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.metrics.TableMetrics;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.tracing.Tracing;
 
@@ -263,7 +263,7 @@ public boolean isReversed()
         return dataRange.isReversed();
     }
 
-    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+    public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
     {
         return StorageProxy.getRangeSlice(this, consistency, queryStartNanoTime);
     }
diff --git a/src/java/org/apache/cassandra/db/ReadQuery.java b/src/java/org/apache/cassandra/db/ReadQuery.java
index bd20c26d9fd4..41ac15ee85cf 100644
--- a/src/java/org/apache/cassandra/db/ReadQuery.java
+++ b/src/java/org/apache/cassandra/db/ReadQuery.java
@@ -23,7 +23,7 @@
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.pager.QueryPager;
 import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.transport.ProtocolVersion;
@@ -48,7 +48,7 @@ public ReadExecutionController executionController()
                 return ReadExecutionController.empty();
             }
 
-            public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+            public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
             {
                 return EmptyIterators.partition();
             }
@@ -140,12 +140,12 @@ public ColumnFilter columnFilter()
      * Executes the query at the provided consistency level.
      *
      * @param consistency the consistency level to achieve for the query.
-     * @param clientState the {@code ClientState} for the query. In practice, this can be null unless
+     * @param queryState the {@code QueryState} for the query. In practice, this can be null unless
      * {@code consistency} is a serial consistency.
      *
      * @return the result of the query.
      */
-    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException;
+    public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException;
 
     /**
      * Execute the query for internal queries (that is, it basically executes the query locally).
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index e44c50104d8f..9525854b4254 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -382,12 +382,12 @@ public SinglePartitionReadCommand forPaging(Clustering<?> lastReturned, DataLimi
                       lastReturned == null ? clusteringIndexFilter() : clusteringIndexFilter.forPaging(metadata().comparator, lastReturned, false));
     }
 
-    public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+    public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
     {
         if (clusteringIndexFilter.isEmpty(metadata().comparator))
             return EmptyIterators.partition();
 
-        return StorageProxy.read(Group.one(this), consistency, clientState, queryStartNanoTime);
+        return StorageProxy.read(Group.one(this), consistency, queryState, queryStartNanoTime);
     }
 
     protected void recordLatency(TableMetrics metric, long latencyNanos)
@@ -1112,9 +1112,9 @@ public static Group one(SinglePartitionReadCommand command)
             return new Group(Collections.singletonList(command), command.limits());
         }
 
-        public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+        public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
         {
-            return StorageProxy.read(this, consistency, clientState, queryStartNanoTime);
+            return StorageProxy.read(this, consistency, queryState, queryStartNanoTime);
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/VirtualTableReadQuery.java b/src/java/org/apache/cassandra/db/VirtualTableReadQuery.java
index ad22a587a53e..7154c4b811df 100644
--- a/src/java/org/apache/cassandra/db/VirtualTableReadQuery.java
+++ b/src/java/org/apache/cassandra/db/VirtualTableReadQuery.java
@@ -24,7 +24,7 @@
 import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * Base class for the {@code ReadQuery} implementations use to query virtual tables.
@@ -48,7 +48,7 @@ public ReadExecutionController executionController()
 
     @Override
     public PartitionIterator execute(ConsistencyLevel consistency,
-                                     ClientState clientState,
+                                     QueryState queryState,
                                      long queryStartNanoTime) throws RequestExecutionException
     {
         return executeInternal(executionController());
diff --git a/src/java/org/apache/cassandra/db/VirtualTableSinglePartitionReadQuery.java b/src/java/org/apache/cassandra/db/VirtualTableSinglePartitionReadQuery.java
index ba9441ae7f3e..1369c782abc9 100644
--- a/src/java/org/apache/cassandra/db/VirtualTableSinglePartitionReadQuery.java
+++ b/src/java/org/apache/cassandra/db/VirtualTableSinglePartitionReadQuery.java
@@ -34,7 +34,7 @@
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * A read query that selects a (part of a) single partition of a virtual table.
@@ -181,13 +181,13 @@ public static Group one(VirtualTableSinglePartitionReadQuery query)
             return new Group(Collections.singletonList(query), query.limits());
         }
 
-        public PartitionIterator execute(ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestExecutionException
+        public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestExecutionException
         {
             if (queries.size() == 1)
-                return queries.get(0).execute(consistency, clientState, queryStartNanoTime);
+                return queries.get(0).execute(consistency, queryState, queryStartNanoTime);
 
             return PartitionIterators.concat(queries.stream()
-                                                    .map(q -> q.execute(consistency, clientState, queryStartNanoTime))
+                                                    .map(q -> q.execute(consistency, queryState, queryStartNanoTime))
                                                     .collect(Collectors.toList()));
         }
     }
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrail.java b/src/java/org/apache/cassandra/guardrails/Guardrail.java
index 55e97e7d092d..879f48f82416 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrail.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrail.java
@@ -22,6 +22,7 @@
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.function.BooleanSupplier;
+import java.util.function.Consumer;
 import java.util.function.Function;
 import java.util.function.LongSupplier;
 import java.util.function.Predicate;
@@ -30,6 +31,7 @@
 import javax.annotation.Nullable;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Sets;
 import org.slf4j.LoggerFactory;
 
@@ -572,14 +574,11 @@ public void ensureEnabled(String what, @Nullable QueryState queryState)
     }
 
     /**
-     * A guardrail that rejects the use of specific values.
-     *
-     * <p>Note that like {@link DisableFlag}, this guardrail only triggers failures and is thus only for query-based
-     * guardrails.
+     * Base class for guardrails that are triggered based on a set of values.
      *
-     * @param <T> the type of the values of which certain are disallowed.
+     * @param <T> the type of the values that trigger the guardrail.
      */
-    public static class DisallowedValues<T> extends Guardrail
+    private static abstract class ValuesBaseGuardrail<T> extends Guardrail
     {
         /*
          * Implementation note: as mentioned in the class Javadoc and for consistency with the other Guardrail
@@ -590,23 +589,13 @@ public static class DisallowedValues<T> extends Guardrail
 
         private final Supplier<Set<String>> rawSupplier;
         private final Function<String, T> parser;
-        private final String what;
+        protected final String what;
 
-        private volatile Set<T> cachedDisallowed;
+        private volatile ImmutableSet<T> cachedValues;
         private volatile Set<String> cachedRaw;
 
-        /**
-         * Creates a new {@link DisallowedValues} guardrail.
-         *
-         * @param name          the name of the guardrail (for identification in {@link Guardrails.Listener} events).
-         * @param disallowedRaw a supplier of the values that are disallowed in raw (string) form. The set returned by
-         *                      this supplier <b>must</b> be immutable (we don't use {@code ImmutableSet} because we
-         *                      want to feed values from {@link GuardrailsConfig} directly and having ImmutableSet
-         *                      there would currently be annoying (because populated automatically by snakeYaml)).
-         * @param parser        a function to parse the value to disallow from string.
-         * @param what          what represents the value disallowed (for reporting in error messages).
-         */
-        DisallowedValues(String name, Supplier<Set<String>> disallowedRaw, Function<String, T> parser, String what)
+        protected ValuesBaseGuardrail(
+            String name, Supplier<Set<String>> disallowedRaw, Function<String, T> parser, String what)
         {
             super(name);
             this.rawSupplier = disallowedRaw;
@@ -617,13 +606,14 @@ public static class DisallowedValues<T> extends Guardrail
                 ensureUpToDate();
         }
 
-        private void ensureUpToDate()
+        protected void ensureUpToDate()
         {
             Set<String> current = rawSupplier.get();
             // Same as below, this shouldn't happen if settings have been properly sanitized, but throw a meaningful
             // error if there is a bug.
             if (current == null)
-                throw new RuntimeException(format("Invalid null setting for guardrail on %s. This is a bug and should not have happened.", what));
+                throw new RuntimeException(format("Invalid null setting for guardrail on %s. This should not have"
+                                                  + " happened", what));
 
             // Note that this will fail on first call (as we want), as currentRaw will be null but not current
             if (current == cachedRaw)
@@ -633,9 +623,9 @@ private void ensureUpToDate()
             {
                 // Setting cachedAllowed first so that on a parse failure we leave everything as it previously
                 // was (not that we'd expect that matter but ...).
-                cachedDisallowed = current.stream()
-                                          .map(parser)
-                                          .collect(Collectors.toCollection(HashSet::new));
+                cachedValues = current.stream()
+                                      .map(parser)
+                                      .collect(ImmutableSet.toImmutableSet());
                 cachedRaw = current;
             }
             catch (Exception e)
@@ -651,62 +641,96 @@ private void ensureUpToDate()
             }
         }
 
+        protected Set<T> matchingValues(Set<T> values) {
+            return Sets.intersection(values, cachedValues);
+        }
+
+        protected String triggerValuesString()
+        {
+            return cachedRaw.toString();
+        }
+
         /**
-         * Triggers a failure if the provided value is disallowed by this guardrail.
+         * Checks whether the provided value would trigger this guardrail.
          *
-         * @param value the value to check.
+         * <p>This method is optional (does not have to be called) but can be used when some of the arguments
+         * to the actual guardrail method are expensive to build, to avoid that cost in the common case (of the
+         * guardrail not being triggered).
+         *
+         * @param value the value to test.
+         * @param state the query state, used to skip the check if the query is internal or is done by a superuser.
+         * @return {@code true} if {@code value} is not allowed by this guardrail,
+         * {@code false} otherwise.
          */
-        public void ensureAllowed(T value)
+        public boolean triggersOn(T value, @Nullable QueryState state)
         {
-            ensureAllowed(value, null);
+            if (!enabled(state))
+                return false;
+
+            ensureUpToDate();
+            return cachedValues.contains(value);
         }
+    }
 
+
+    /**
+     * A guardrail that rejects the use of specific values.
+     *
+     * <p>Note that like {@link DisableFlag}, this guardrail only triggers failures and is thus only for query-based
+     * guardrails.
+     *
+     * @param <T> the type of the values, some of which are disallowed.
+     */
+    public static class DisallowedValues<T> extends ValuesBaseGuardrail<T>
+    {
         /**
-         * Triggers a failure if any of the provided values is disallowed by this guardrail.
+         * Creates a new {@link DisallowedValues} guardrail.
          *
-         * @param values the values to check.
+         * @param name the name of the guardrail (for identification in {@link Guardrails.Listener} events).
+         * @param disallowedRaw a supplier of the values that are disallowed in raw (string) form. The set returned by
+         *                      this supplier <b>must</b> be immutable (we don't use {@code ImmutableSet} because we
+         *                      want to feed values from {@link GuardrailsConfig} directly and having ImmutableSet
+         *                      there would currently be annoying (because populated automatically by snakeYaml)).
+         * @param parser a function to parse the value to disallow from string.
+         * @param what what represents the value disallowed (for reporting in error messages).
          */
-        public void ensureAllowed(Set<T> values)
+        DisallowedValues(String name, Supplier<Set<String>> disallowedRaw, Function<String, T> parser, String what)
         {
-            ensureAllowed(values, null);
+            super(name, disallowedRaw, parser, what);
         }
 
         /**
          * Triggers a failure if the provided value is disallowed by this guardrail.
          *
-         * @param value      the value to check.
-         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
-         *                   A {@code null} value means that the check should be done regardless of the query.
+         * @param value the value to check.
+         * @param state the query state, used to skip the check if the query is internal or is done by a superuser.
+         * A {@code null} value means that the check should be done regardless of the query.
          */
-        public void ensureAllowed(T value, @Nullable QueryState queryState)
+        public void ensureAllowed(T value, @Nullable QueryState state)
         {
-            if (!enabled(queryState))
-                return;
-
-            ensureUpToDate();
-            if (cachedDisallowed.contains(value))
+            if (triggersOn(value, state))
                 fail(format("Provided value %s is not allowed for %s (disallowed values are: %s)",
-                            value, what, cachedRaw));
+                            value, what, triggerValuesString()));
         }
 
         /**
          * Triggers a failure if any of the provided values is disallowed by this guardrail.
          *
-         * @param values     the values to check.
-         * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
-         *                   A {@code null} value means that the check should be done regardless of the query.
+         * @param values the values to check.
+         * @param state the query state, used to skip the check if the query is internal or is done by a superuser.
+         * A {@code null} value means that the check should be done regardless of the query.
          */
-        public void ensureAllowed(Set<T> values, @Nullable QueryState queryState)
+        public void ensureAllowed(Set<T> values, @Nullable QueryState state)
         {
-            if (!enabled(queryState))
+            if (!enabled(state))
                 return;
 
             ensureUpToDate();
 
-            Set<T> intersection = Sets.intersection(values, cachedDisallowed);
-            if (!intersection.isEmpty())
+            Set<T> disallowed = matchingValues(values);
+            if (!disallowed.isEmpty())
                 fail(format("Provided values %s are not allowed for %s (disallowed values are: %s)",
-                            intersection.stream().sorted().collect(Collectors.toList()), what, cachedRaw));
+                            disallowed.stream().sorted().collect(Collectors.toList()), what, triggerValuesString()));
         }
     }
 
@@ -776,5 +800,56 @@ else if (warnPredicate.test(value))
             }
         }
     }
+    
+    /**
+     * A guardrail that warns about and ignores some specific values.
+     *
+     * @param <T> the type of the values, some of which are ignored.
+     */
+    public static class IgnoredValues<T> extends ValuesBaseGuardrail<T>
+    {
+        /**
+         * Creates a new {@link IgnoredValues} guardrail.
+         *
+         * @param name the name of the guardrail (for identification in {@link Guardrails.Listener} events).
+         * @param ignoredRaw a supplier of the values that are ignored in raw (string) form. The set returned by
+         *                      this supplier <b>must</b> be immutable (we don't use {@code ImmutableSet} because we
+         *                      want to feed values from {@link GuardrailsConfig} directly and having ImmutableSet
+         *                      there would currently be annoying (because populated automatically by snakeYaml)).
+         * @param parser a function to parse the value to ignore from string.
+         * @param what what represents the value ignored (for reporting in error messages).
+         */
+        IgnoredValues(String name, Supplier<Set<String>> ignoredRaw, Function<String, T> parser, String what)
+        {
+            super(name, ignoredRaw, parser, what);
+        }
+
+        /**
+         * Checks for values ignored by this guardrail and, when it finds some, logs a warning and triggers an action
+         * to ignore them.
+         *
+         * @param values the values to check.
+         * @param ignoreAction an action called on the subset of {@code values} that should be ignored. This action
+         * should do whatever is necessary to make sure the value is ignored.
+         * @param state the query state, used to skip the check if the query is internal or is done by a superuser.
+         * A {@code null} value means that the check should be done regardless of the query.
+         */
+        public void maybeIgnoreAndWarn(Set<T> values, Consumer<T> ignoreAction, @Nullable QueryState state)
+        {
+            if (!enabled(state))
+                return;
+
+            ensureUpToDate();
+
+            Set<T> toIgnore = matchingValues(values);
+            if (toIgnore.isEmpty())
+                return;
+
+            warn(format("Ignoring provided values %s as they are not supported for %s (ignored values are: %s)",
+                        toIgnore.stream().sorted().collect(Collectors.toList()), what, triggerValuesString()));
+            for (T value : toIgnore)
+                ignoreAction.accept(value);
+        }
+    }
 }
 
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index 672fc30aa893..0686b4cee6f9 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.guardrails.Guardrail.DisableFlag;
 import org.apache.cassandra.guardrails.Guardrail.DisallowedValues;
+import org.apache.cassandra.guardrails.Guardrail.IgnoredValues;
 import org.apache.cassandra.guardrails.Guardrail.PercentageThreshold;
 import org.apache.cassandra.guardrails.Guardrail.Predicates;
 import org.apache.cassandra.guardrails.Guardrail.SizeThreshold;
@@ -89,6 +90,11 @@ public abstract class Guardrails
                                                                                                     String::toLowerCase,
                                                                                                     "Table Properties");
 
+    public static final IgnoredValues<String> ignoredTableProperties = new IgnoredValues<>("ignored_table_properties",
+                                                                                                               () -> config.table_properties_ignored,
+                                                                                                               String::toLowerCase,
+                                                                                                               "Table Properties");
+
     @SuppressWarnings("unchecked")
     public static final Predicates<InetAddressAndPort> replicaDiskUsage =
     (Predicates<InetAddressAndPort>) new Predicates<>("replica_disk_usage",
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
index ee4c0e6e1601..be47fc54bfbb 100644
--- a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -73,6 +73,7 @@ public class GuardrailsConfig
     public Long tables_warn_threshold;
     public Long tables_failure_threshold;
     public Set<String> table_properties_disallowed;
+    public Set<String> table_properties_ignored;
 
     public Boolean user_timestamps_enabled;
 
@@ -122,6 +123,7 @@ public void validate()
         validateStrictlyPositiveInteger(in_select_cartesian_product_failure_threshold, "in_select_cartesian_product_failure_threshold");
 
         validateDisallowedTableProperties();
+        validateIgnoredTableProperties();
 
         validateDiskUsageThreshold();
 
@@ -164,7 +166,15 @@ public void applyConfig()
         enforceDefault(table_properties_disallowed,
                        v -> table_properties_disallowed = v,
                        Collections.<String>emptySet(),
-                       new LinkedHashSet<>(TableAttributes.validKeywords.stream().sorted().filter(p -> !p.equals("default_time_to_live")).collect(Collectors.toList())));
+                       Collections.<String>emptySet());
+
+        enforceDefault(table_properties_ignored,
+                       v -> table_properties_ignored = v,
+                       Collections.<String>emptySet(),
+                       new LinkedHashSet<>(TableAttributes.allKeywords().stream()
+                                                          .sorted()
+                                                          .filter(p -> !p.equals("default_time_to_live"))
+                                                          .collect(Collectors.toList())));
 
         enforceDefault(partition_size_warn_threshold_in_mb, v -> partition_size_warn_threshold_in_mb = v, 100, 100);
         enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT.intValue(), 20);
@@ -184,13 +194,23 @@ public void applyConfig()
     private void validateDisallowedTableProperties()
     {
         Set<String> diff = Sets.difference(table_properties_disallowed.stream().map(String::toLowerCase).collect(Collectors.toSet()),
-                                           TableAttributes.validKeywords);
+                                           TableAttributes.allKeywords());
 
         if (!diff.isEmpty())
             throw new ConfigurationException(format("Invalid value for table_properties_disallowed guardrail: "
                                                     + "'%s' do not parse as valid table properties", diff.toString()));
     }
 
+    private void validateIgnoredTableProperties()
+    {
+        Set<String> diff = Sets.difference(table_properties_ignored.stream().map(String::toLowerCase).collect(Collectors.toSet()),
+                                           TableAttributes.allKeywords());
+
+        if (!diff.isEmpty())
+            throw new ConfigurationException(format("Invalid value for table_properties_ignored guardrail: "
+                                                    + "'%s' do not parse as valid table properties", diff.toString()));
+    }
+
     private void validateStrictlyPositiveInteger(long value, String name)
     {
         // We use 'long' for generality, but most numeric guardrails cannot effectively be more than an 'int' for various
diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java
index 361b57fc7aac..0cebda4702dd 100644
--- a/src/java/org/apache/cassandra/service/StorageProxy.java
+++ b/src/java/org/apache/cassandra/service/StorageProxy.java
@@ -289,8 +289,8 @@ public static RowIterator cas(String keyspaceName,
         try
         {
             TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
-            consistencyForPaxos.validateForCas(keyspaceName);
-            consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName);
+            consistencyForPaxos.validateForCas(keyspaceName, state);
+            consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName, state);
 
             Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () ->
             {
@@ -336,7 +336,7 @@ public static RowIterator cas(String keyspaceName,
                            consistencyForPaxos,
                            consistencyForCommit,
                            consistencyForCommit,
-                           state.getClientState(),
+                           state,
                            queryStartNanoTime,
                            casWriteMetrics,
                            updateProposer);
@@ -409,7 +409,7 @@ private static void recordCasContention(TableMetadata table,
      *     {@link ConsistencyLevel#LOCAL_SERIAL}).
      * @param consistencyForReplayCommits the consistency for the commit phase of "replayed" in-progress operations.
      * @param consistencyForCommit the consistency for the commit phase of _this_ operation update.
-     * @param state the client state.
+     * @param queryState the query state.
      * @param queryStartNanoTime the nano time for the start of the query this is part of. This is the base time for
      *     timeouts.
      * @param casMetrics the metrics to update for this operation.
@@ -425,7 +425,7 @@ private static RowIterator doPaxos(TableMetadata metadata,
                                        ConsistencyLevel consistencyForPaxos,
                                        ConsistencyLevel consistencyForReplayCommits,
                                        ConsistencyLevel consistencyForCommit,
-                                       ClientState state,
+                                       QueryState queryState,
                                        long queryStartNanoTime,
                                        CASClientRequestMetrics casMetrics,
                                        Supplier<Pair<PartitionUpdate, RowIterator>> createUpdateProposal)
@@ -436,9 +436,9 @@ private static RowIterator doPaxos(TableMetadata metadata,
         AbstractReplicationStrategy latestRs = keyspace.getReplicationStrategy();
         try
         {
-            consistencyForPaxos.validateForCas(metadata.keyspace);
-            consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace);
-            consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace);
+            consistencyForPaxos.validateForCas(metadata.keyspace, queryState);
+            consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace, queryState);
+            consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace, queryState);
 
             long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
             while (System.nanoTime() - queryStartNanoTime < timeoutNanos)
@@ -453,7 +453,7 @@ private static RowIterator doPaxos(TableMetadata metadata,
                                                                     consistencyForPaxos,
                                                                     consistencyForReplayCommits,
                                                                     casMetrics,
-                                                                    state);
+                                                                    queryState.getClientState());
 
                 final UUID ballot = pair.ballot;
                 contentions += pair.contentions;
@@ -1698,10 +1698,10 @@ public static RowIterator readOne(SinglePartitionReadCommand command, Consistenc
         return readOne(command, consistencyLevel, null, queryStartNanoTime);
     }
 
-    public static RowIterator readOne(SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel, ClientState state, long queryStartNanoTime)
+    public static RowIterator readOne(SinglePartitionReadCommand command, ConsistencyLevel consistencyLevel, QueryState queryState, long queryStartNanoTime)
     throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
     {
-        return PartitionIterators.getOnlyElement(read(SinglePartitionReadCommand.Group.one(command), consistencyLevel, state, queryStartNanoTime), command);
+        return PartitionIterators.getOnlyElement(read(SinglePartitionReadCommand.Group.one(command), consistencyLevel, queryState, queryStartNanoTime), command);
     }
 
     public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, long queryStartNanoTime)
@@ -1716,7 +1716,7 @@ public static PartitionIterator read(SinglePartitionReadCommand.Group group, Con
      * Performs the actual reading of a row out of the StorageService, fetching
      * a specific set of column names from a given column family.
      */
-    public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ClientState state, long queryStartNanoTime)
+    public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, QueryState queryState, long queryStartNanoTime)
     throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException
     {
         if (StorageService.instance.isBootstrapMode() && !systemKeyspaceQuery(group.queries))
@@ -1727,14 +1727,14 @@ public static PartitionIterator read(SinglePartitionReadCommand.Group group, Con
         }
 
         return consistencyLevel.isSerialConsistency()
-             ? readWithPaxos(group, consistencyLevel, state, queryStartNanoTime)
+             ? readWithPaxos(group, consistencyLevel, queryState, queryStartNanoTime)
              : readRegular(group, consistencyLevel, queryStartNanoTime);
     }
 
-    private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, ClientState state, long queryStartNanoTime)
+    private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, QueryState queryState, long queryStartNanoTime)
     throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException
     {
-        assert state != null;
+        assert queryState != null;
         if (group.queries.size() > 1)
             throw new InvalidRequestException("SERIAL/LOCAL_SERIAL consistency may only be requested for one partition at a time");
 
@@ -1769,7 +1769,7 @@ private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group
                         consistencyLevel,
                         consistencyForReplayCommitsOrFetch,
                         ConsistencyLevel.ANY,
-                        state,
+                        queryState,
                         start,
                         casReadMetrics,
                         updateProposer);
diff --git a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
index cc8f4340b4ba..7f0896a445a4 100644
--- a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
@@ -23,7 +23,7 @@
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 
 abstract class AbstractQueryPager<T extends ReadQuery> implements QueryPager
@@ -59,7 +59,7 @@ public ReadExecutionController executionController()
         return query.executionController();
     }
 
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime)
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime)
     {
         if (isExhausted())
             return EmptyIterators.partition();
@@ -72,7 +72,7 @@ public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, C
             exhausted = true;
             return EmptyIterators.partition();
         }
-        return Transformation.apply(readQuery.execute(consistency, clientState, queryStartNanoTime), pager);
+        return Transformation.apply(readQuery.execute(consistency, queryState, queryStartNanoTime), pager);
     }
 
     public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController)
diff --git a/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java b/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
index dbc4fc045afc..5a484aaa8566 100644
--- a/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
@@ -27,7 +27,7 @@
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.rows.RowIterator;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * {@code QueryPager} that takes care of fetching the pages for aggregation queries.
@@ -52,13 +52,13 @@ public AggregationQueryPager(QueryPager subPager, DataLimits limits)
     @Override
     public PartitionIterator fetchPage(int pageSize,
                                        ConsistencyLevel consistency,
-                                       ClientState clientState,
+                                       QueryState queryState,
                                        long queryStartNanoTime)
     {
         if (limits.isGroupByLimit())
-            return new GroupByPartitionIterator(pageSize, consistency, clientState, queryStartNanoTime);
+            return new GroupByPartitionIterator(pageSize, consistency, queryState, queryStartNanoTime);
 
-        return new AggregationPartitionIterator(pageSize, consistency, clientState, queryStartNanoTime);
+        return new AggregationPartitionIterator(pageSize, consistency, queryState, queryStartNanoTime);
     }
 
     @Override
@@ -113,7 +113,7 @@ public class GroupByPartitionIterator implements PartitionIterator
 
         // For "normal" queries
         private final ConsistencyLevel consistency;
-        private final ClientState clientState;
+        private final QueryState queryState;
 
         // For internal queries
         private final ReadExecutionController executionController;
@@ -156,11 +156,11 @@ public class GroupByPartitionIterator implements PartitionIterator
         private long queryStartNanoTime;
 
         public GroupByPartitionIterator(int pageSize,
-                                         ConsistencyLevel consistency,
-                                         ClientState clientState,
+                                        ConsistencyLevel consistency,
+                                        QueryState queryState,
                                         long queryStartNanoTime)
         {
-            this(pageSize, consistency, clientState, null, queryStartNanoTime);
+            this(pageSize, consistency, queryState, null, queryStartNanoTime);
         }
 
         public GroupByPartitionIterator(int pageSize,
@@ -172,13 +172,13 @@ public GroupByPartitionIterator(int pageSize,
 
         private GroupByPartitionIterator(int pageSize,
                                          ConsistencyLevel consistency,
-                                         ClientState clientState,
+                                         QueryState queryState,
                                          ReadExecutionController executionController,
                                          long queryStartNanoTime)
         {
             this.pageSize = handlePagingOff(pageSize);
             this.consistency = consistency;
-            this.clientState = clientState;
+            this.queryState = queryState;
             this.executionController = executionController;
             this.queryStartNanoTime = queryStartNanoTime;
         }
@@ -287,7 +287,7 @@ protected int computeSubPageSize(int pageSize, int counted)
          */
         private final PartitionIterator fetchSubPage(int subPageSize)
         {
-            return consistency != null ? subPager.fetchPage(subPageSize, consistency, clientState, queryStartNanoTime)
+            return consistency != null ? subPager.fetchPage(subPageSize, consistency, queryState, queryStartNanoTime)
                                        : subPager.fetchPageInternal(subPageSize, executionController);
         }
 
@@ -400,10 +400,10 @@ public final class AggregationPartitionIterator extends GroupByPartitionIterator
     {
         public AggregationPartitionIterator(int pageSize,
                                             ConsistencyLevel consistency,
-                                            ClientState clientState,
+                                            QueryState queryState,
                                             long queryStartNanoTime)
         {
-            super(pageSize, consistency, clientState, queryStartNanoTime);
+            super(pageSize, consistency, queryState, queryStartNanoTime);
         }
 
         public AggregationPartitionIterator(int pageSize,
diff --git a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
index ca16967b6c79..83dfbc91d81c 100644
--- a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
+++ b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.service.pager;
 
+import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.AbstractIterator;
 
@@ -28,7 +29,6 @@
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
-import org.apache.cassandra.service.ClientState;
 
 /**
  * Pager over a list of SinglePartitionReadQuery.
@@ -148,10 +148,10 @@ public ReadExecutionController executionController()
     }
 
     @SuppressWarnings("resource") // iter closed via countingIter
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
     {
         int toQuery = Math.min(remaining, pageSize);
-        return new PagersIterator(toQuery, consistency, clientState, null, queryStartNanoTime);
+        return new PagersIterator(toQuery, consistency, queryState, null, queryStartNanoTime);
     }
 
     @SuppressWarnings("resource") // iter closed via countingIter
@@ -170,7 +170,7 @@ private class PagersIterator extends AbstractIterator<RowIterator> implements Pa
 
         // For "normal" queries
         private final ConsistencyLevel consistency;
-        private final ClientState clientState;
+        private final QueryState queryState;
 
         // For internal queries
         private final ReadExecutionController executionController;
@@ -178,11 +178,11 @@ private class PagersIterator extends AbstractIterator<RowIterator> implements Pa
         private int pagerMaxRemaining;
         private int counted;
 
-        public PagersIterator(int pageSize, ConsistencyLevel consistency, ClientState clientState, ReadExecutionController executionController, long queryStartNanoTime)
+        public PagersIterator(int pageSize, ConsistencyLevel consistency, QueryState queryState, ReadExecutionController executionController, long queryStartNanoTime)
         {
             this.pageSize = pageSize;
             this.consistency = consistency;
-            this.clientState = clientState;
+            this.queryState = queryState;
             this.executionController = executionController;
             this.queryStartNanoTime = queryStartNanoTime;
         }
@@ -213,7 +213,7 @@ protected RowIterator computeNext()
                 int toQuery = pageSize - counted;
                 result = consistency == null
                        ? pagers[current].fetchPageInternal(toQuery, executionController)
-                       : pagers[current].fetchPage(toQuery, consistency, clientState, queryStartNanoTime);
+                       : pagers[current].fetchPage(toQuery, consistency, queryState, queryStartNanoTime);
             }
             return result.next();
         }
diff --git a/src/java/org/apache/cassandra/service/pager/QueryPager.java b/src/java/org/apache/cassandra/service/pager/QueryPager.java
index 5d2399744b50..e9b0b158e76d 100644
--- a/src/java/org/apache/cassandra/service/pager/QueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/QueryPager.java
@@ -24,7 +24,7 @@
 import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
-import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
 
 /**
  * Perform a query, paging it by page of a given size.
@@ -54,7 +54,7 @@ public ReadExecutionController executionController()
             return ReadExecutionController.empty();
         }
 
-        public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
+        public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
         {
             return EmptyIterators.partition();
         }
@@ -90,11 +90,11 @@ public QueryPager withUpdatedLimit(DataLimits newLimits)
      *
      * @param pageSize the maximum number of elements to return in the next page.
      * @param consistency the consistency level to achieve for the query.
-     * @param clientState the {@code ClientState} for the query. In practice, this can be null unless
+     * @param queryState the {@code QueryState} for the query. In practice, this can be null unless
      * {@code consistency} is a serial consistency.
      * @return the page of result.
      */
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, ClientState clientState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException;
+    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException;
 
     /**
      * Starts a new read operation.
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
index 6ae268213375..4a5bbe9038f8 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
@@ -22,6 +22,7 @@
 import java.util.HashSet;
 import java.util.Set;
 import java.util.UUID;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
 
 import org.junit.After;
@@ -34,6 +35,7 @@
 import org.apache.cassandra.schema.Schema;
 
 import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
 
 public class GuardrailsOnTableTest extends GuardrailTester
 {
@@ -44,6 +46,7 @@ public class GuardrailsOnTableTest extends GuardrailTester
     private static long defaultTableHardLimit;
     private static long defaultMVPerTableFailureThreshold;
     private static Set<String> defaultTablePropertiesDisallowed;
+    private static Set<String> defaultTablePropertiesIgnored;
 
     @Before
     public void before()
@@ -52,14 +55,18 @@ public void before()
         defaultTableHardLimit = DatabaseDescriptor.getGuardrailsConfig().tables_failure_threshold;
         defaultMVPerTableFailureThreshold = DatabaseDescriptor.getGuardrailsConfig().materialized_view_per_table_failure_threshold;
         defaultTablePropertiesDisallowed = DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed;
+        defaultTablePropertiesIgnored = DatabaseDescriptor.getGuardrailsConfig().table_properties_ignored;
 
-        // only allow "gc_grace_seconds"
         defaultMVPerTableFailureThreshold = 100;
+        // only allow "gc_grace_seconds" and "comment"
+        Set<String> allowed = new HashSet<>(Arrays.asList("gc_grace_seconds", "comment"));
         DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed =
         TableAttributes.validKeywords.stream()
-                                     .filter(p -> !p.equals("gc_grace_seconds"))
+                                     .filter(p -> !allowed.contains(p))
                                      .map(String::toUpperCase)
                                      .collect(Collectors.toSet());
+        // but actually ignore "comment"
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_ignored = new HashSet<>(Arrays.asList("comment"));
     }
 
     @After
@@ -69,6 +76,7 @@ public void after()
         DatabaseDescriptor.getGuardrailsConfig().tables_failure_threshold = defaultTableHardLimit;
         DatabaseDescriptor.getGuardrailsConfig().materialized_view_per_table_failure_threshold = defaultMVPerTableFailureThreshold;
         DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = defaultTablePropertiesDisallowed;
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_ignored = defaultTablePropertiesIgnored;
     }
 
     @Test
@@ -111,15 +119,28 @@ public void testTableLimit() throws Throwable
     @Test
     public void testTableProperties() throws Throwable
     {
-        // table properties is not allowed
+        // most table properties are not allowed
         assertValid(this::create);
         assertFails(() -> create("with id = " + UUID.randomUUID()), "[id]");
         assertFails(() -> create("with compression = { 'enabled': 'false' }"), "[compression]");
         assertFails(() -> create("with compression = { 'enabled': 'false' } AND id = " + UUID.randomUUID()), "[compression, id]");
         assertFails(() -> create("with compaction = { 'class': 'SizeTieredCompactionStrategy' }"), "[compaction]");
         assertFails(() -> create("with gc_grace_seconds = 1000 and compression = { 'enabled': 'false' }"), "[compression]");
+
+        // though gc_grace_seconds alone is
         assertValid(() -> create("with gc_grace_seconds = 1000"));
 
+        // and comment is "ignored". So it should warn, and getting the comment on the created table should be empty,
+        // not the one we set.
+        AtomicReference<String> tableName = new AtomicReference<>();
+        assertWarns(() -> tableName.set(create("with comment = 'my table'")), "[comment]");
+        com.datastax.driver.core.ResultSet rs =
+        executeNet("SELECT comment FROM system_schema.tables WHERE keyspace_name=? AND table_name=?",
+                   keyspace(),
+                   tableName.get());
+        com.datastax.driver.core.Row r = rs.one();
+        assertEquals("", r.getString("comment"));
+
         // alter column is allowed
         assertValid(this::create);
         assertValid("ALTER TABLE %s ADD v1 int");
@@ -197,9 +218,11 @@ private void create() throws Throwable
         create("");
     }
 
-    private void create(String withClause) throws Throwable
+    private String create(String withClause) throws Throwable
     {
-        executeNet(format(CREATE_TABLE, keyspace(), createTableName(), withClause));
+        String name = createTableName();
+        executeNet(format(CREATE_TABLE, keyspace(), name, withClause));
+        return name;
     }
 
     private void createMV(String withClause) throws Throwable
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
index 387da460e1bc..7afa1228b9a9 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsTest.java
@@ -447,6 +447,36 @@ public void testDisallowedValues()
                     "Provided values [4, 6] are not allowed for integer (disallowed values are: [4, 6, 20])");
     }
 
+    @Test
+    public void testIgnoredValues()
+    {
+        // Using a LinkedHashSet below to ensure the order in the error message checked below is not random
+        Guardrail.IgnoredValues<Integer> ignored = new Guardrail.IgnoredValues<>(
+        "x",
+        () -> new LinkedHashSet<>(Arrays.asList("4", "6", "20")),
+        Integer::valueOf,
+        "integer");
+
+        Set<Integer> triggeredOn = new HashSet<>();
+        assertNoWarnOrFails(() -> ignored.maybeIgnoreAndWarn(set(3), triggeredOn::add, userQueryState));
+        assertEquals(set(), triggeredOn);
+
+        assertWarn(() -> ignored.maybeIgnoreAndWarn(set(4), triggeredOn::add, userQueryState),
+                   "Ignoring provided values [4] as they are not supported for integer (ignored values are: [4, 6, 20])");
+        assertEquals(set(4), triggeredOn);
+        triggeredOn.clear();
+
+        assertWarn(() -> ignored.maybeIgnoreAndWarn(set(4, 6), triggeredOn::add, null),
+                   "Ignoring provided values [4, 6] as they are not supported for integer (ignored values are: [4, 6, 20])");
+        assertEquals(set(4, 6), triggeredOn);
+        triggeredOn.clear();
+
+        assertWarn(() -> ignored.maybeIgnoreAndWarn(set(4, 5, 6, 7), triggeredOn::add, null),
+                   "Ignoring provided values [4, 6] as they are not supported for integer (ignored values are: [4, 6, 20])");
+        assertEquals(set(4, 6), triggeredOn);
+        triggeredOn.clear();
+    }
+
     @Test
     public void testDisallowedValuesUsers()
     {
diff --git a/update-history/STAR-801/21-301b93dca1 STAR-511 Port guardrails: ignore table properties (#134) b/update-history/STAR-801/21-301b93dca1 STAR-511 Port guardrails: ignore table properties (#134)
new file mode 100644
index 000000000000..f5abfcdd5b71
--- /dev/null
+++ b/update-history/STAR-801/21-301b93dca1 STAR-511 Port guardrails: ignore table properties (#134)	
@@ -0,0 +1,46 @@
+--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
++++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+@@ -230,11 +230,7 @@
+     }
+ 
+     // This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
+-<<<<<<<
+-    public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName) throws InvalidRequestException
+-=======
+     public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName, QueryState queryState) throws InvalidRequestException
+->>>>>>>
+     {
+         if (SchemaConstants.isUserKeyspace(keyspaceName))
+             Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+--- a/src/java/org/apache/cassandra/service/StorageProxy.java
++++ b/src/java/org/apache/cassandra/service/StorageProxy.java
+@@ -289,13 +289,8 @@
+         try
+         {
+             TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
+-<<<<<<<
+-            consistencyForPaxos.validateForCas(keyspaceName);
+-            consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName);
+-=======
+             consistencyForPaxos.validateForCas(keyspaceName, state);
+             consistencyForCommit.validateForCasCommit(Keyspace.open(keyspaceName).getReplicationStrategy(), keyspaceName, state);
+->>>>>>>
+ 
+             Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () ->
+             {
+@@ -441,15 +436,9 @@
+         AbstractReplicationStrategy latestRs = keyspace.getReplicationStrategy();
+         try
+         {
+-<<<<<<<
+-            consistencyForPaxos.validateForCas(metadata.keyspace);
+-            consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace);
+-            consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace);
+-=======
+             consistencyForPaxos.validateForCas(metadata.keyspace, queryState);
+             consistencyForReplayCommits.validateForCasCommit(latestRs, metadata.keyspace, queryState);
+             consistencyForCommit.validateForCasCommit(latestRs, metadata.keyspace, queryState);
+->>>>>>>
+ 
+             long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
+             while (System.nanoTime() - queryStartNanoTime < timeoutNanos)

From 0bd7ea3e9298485cf26c61dee4e76f08bbee4b76 Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 13 May 2021 08:58:25 -0700
Subject: [PATCH 083/151] STAR-513 Add Guardrail to disable TRUNCATE TABLE
 (#138)

Co-authored-by: Stefania Alborghetti <stef1927@users.noreply.github.com>
(cherry picked from commit df12ae10368526485dcc6e4314c2b2ce1a9efe9a)
(cherry picked from commit e0ffa4c68835d8ced03c18b9889e88b688060df0)
---
 .../cql3/statements/TruncateStatement.java    |  3 +
 .../cassandra/guardrails/Guardrails.java      |  4 +
 .../guardrails/GuardrailsConfig.java          |  4 +
 .../GuardrailTruncateTableTest.java           | 83 +++++++++++++++++++
 4 files changed, 94 insertions(+)
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailTruncateTableTest.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
index f6c4864ec0df..6b6b99f8b9dc 100644
--- a/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/TruncateStatement.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ClientState;
@@ -55,6 +56,8 @@ public void authorize(ClientState state) throws InvalidRequestException, Unautho
     @Override
     public void validate(QueryState state) throws InvalidRequestException
     {
+        Guardrails.truncateTableEnabled.ensureEnabled(state);
+
         Schema.instance.validateTable(keyspace(), name());
     }
 
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index 0686b4cee6f9..3c31dcd3cf95 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -59,6 +59,10 @@ public abstract class Guardrails
                                                                             () -> !config.user_timestamps_enabled,
                                                                             "User provided timestamps (USING TIMESTAMP)");
 
+    public static final DisableFlag truncateTableEnabled = new DisableFlag("truncate_table",
+                                                                           () -> !config.truncate_table_enabled,
+                                                                           "TRUNCATE table");
+
     public static final DisallowedValues<ConsistencyLevel> disallowedWriteConsistencies = new DisallowedValues<>("disallowed_write_consistency_levels",
                                                                                                                  () -> config.write_consistency_levels_disallowed,
                                                                                                                  ConsistencyLevel::fromString,
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
index be47fc54bfbb..d0ddbdf96821 100644
--- a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -97,6 +97,8 @@ public class GuardrailsConfig
 
     public Boolean read_before_write_list_operations_enabled;
 
+    public Boolean truncate_table_enabled;
+
     /**
      * Validate that the value provided for each guardrail setting is valid.
      *
@@ -147,6 +149,8 @@ public void validate()
      */
     public void applyConfig()
     {
+        enforceDefault(truncate_table_enabled, v -> truncate_table_enabled = v, true, true);
+
         enforceDefault(user_timestamps_enabled, v -> user_timestamps_enabled = v, true, true);
 
         enforceDefault(column_value_size_failure_threshold_in_kb, v -> column_value_size_failure_threshold_in_kb = v, NO_LIMIT, 5 * 1024L);
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailTruncateTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailTruncateTableTest.java
new file mode 100644
index 000000000000..9c93fbd648ae
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailTruncateTableTest.java
@@ -0,0 +1,83 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+public class GuardrailTruncateTableTest extends GuardrailTester
+{
+    private static boolean truncateTableEnabled;
+
+    @BeforeClass
+    public static void setup()
+    {
+        truncateTableEnabled = DatabaseDescriptor.getGuardrailsConfig().truncate_table_enabled;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        setGuardrails(truncateTableEnabled);
+    }
+
+    private static void setGuardrails(boolean truncate_table_enabled)
+    {
+        DatabaseDescriptor.getGuardrailsConfig().truncate_table_enabled = truncate_table_enabled;
+    }
+
+    private void testTruncate(boolean truncateTableEnabled) throws Throwable
+    {
+        setGuardrails(truncateTableEnabled);
+        createTable("CREATE TABLE %s (a int, b int, c int, PRIMARY KEY(a, b))");
+
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 0, 0);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 0, 1, 1);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 0, 2);
+        execute("INSERT INTO %s (a, b, c) VALUES (?, ?, ?)", 1, 1, 3);
+
+        assertRows(execute("SELECT * FROM %s"), row(1, 0, 2), row(1, 1, 3), row(0, 0, 0), row(0, 1, 1));
+
+        assertValid("TRUNCATE %s");
+
+        assertEmpty(execute("SELECT * FROM %s"));
+    }
+
+    @Test
+    public void testEnabledTruncateTable() throws Throwable
+    {
+        testTruncate(true);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testDisabledTruncateTable() throws Throwable
+    {
+        testTruncate(false);
+    }
+}
\ No newline at end of file

From 164f30f9185dc21451e86f4b2eb8db2a8e9b5f7c Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 13 May 2021 09:55:31 -0700
Subject: [PATCH 084/151] STAR-514 Add guardrail to disable counter columns
 (#139)

Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>
(cherry picked from commit 82351749bd01e346441e747aac0c7d96f10b9239)
(cherry picked from commit caf45c16690dd41fdef88cfd02be0aeb89ffe289)
---
 .../schema/CreateTableStatement.java          |  4 +
 .../cassandra/guardrails/Guardrails.java      | 12 ++-
 .../guardrails/GuardrailsConfig.java          |  4 +
 .../guardrails/GuardrailCounterTest.java      | 75 +++++++++++++++++++
 4 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailCounterTest.java

diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index fe5c04a44e96..0068cb7f3a3c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -107,6 +107,10 @@ public void validate(QueryState state)
             Guardrails.disallowedTableProperties.ensureAllowed(attrs.updatedProperties(), state);
             Guardrails.ignoredTableProperties.maybeIgnoreAndWarn(attrs.updatedProperties(), attrs::removeProperty, state);
 
+            // Guardrail on counter
+            if (rawColumns.values().stream().anyMatch(CQL3Type.Raw::isCounter))
+                Guardrails.counterEnabled.ensureEnabled(state);
+
             // Guardrail on columns per table
             Guardrails.columnsPerTable.guard(rawColumns.size(), tableName, state);
 
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index 3c31dcd3cf95..eebecfb33ae4 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -94,10 +94,14 @@ public abstract class Guardrails
                                                                                                     String::toLowerCase,
                                                                                                     "Table Properties");
 
-    public static final IgnoredValues<String> ignoredTableProperties = new IgnoredValues<>("ignored_table_properties",
-                                                                                                               () -> config.table_properties_ignored,
-                                                                                                               String::toLowerCase,
-                                                                                                               "Table Properties");
+    public static final IgnoredValues<String> ignoredTableProperties = new IgnoredValues<>("ignored_table_properties", 
+                                                                                           () -> config.table_properties_ignored, 
+                                                                                           String::toLowerCase, 
+                                                                                           "Table Properties");
+    
+    public static final DisableFlag counterEnabled = new DisableFlag("counter",
+                                                                     () -> !config.counter_enabled,
+                                                                     "Counter");
 
     @SuppressWarnings("unchecked")
     public static final Predicates<InetAddressAndPort> replicaDiskUsage =
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
index d0ddbdf96821..7f30f1e15491 100644
--- a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -77,6 +77,8 @@ public class GuardrailsConfig
 
     public Boolean user_timestamps_enabled;
 
+    public Boolean counter_enabled;
+
     public Long secondary_index_per_table_failure_threshold;
     public Long materialized_view_per_table_failure_threshold;
 
@@ -183,6 +185,8 @@ public void applyConfig()
         enforceDefault(partition_size_warn_threshold_in_mb, v -> partition_size_warn_threshold_in_mb = v, 100, 100);
         enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT.intValue(), 20);
 
+        enforceDefault(counter_enabled, v -> counter_enabled = v, true, true);
+
         enforceDefault(fields_per_udt_failure_threshold, v -> fields_per_udt_failure_threshold = v, NO_LIMIT, 10L);
         enforceDefault(collection_size_warn_threshold_in_kb, v -> collection_size_warn_threshold_in_kb = v, NO_LIMIT, 5 * 1024L);
         enforceDefault(items_per_collection_warn_threshold, v -> items_per_collection_warn_threshold = v, NO_LIMIT, 20L);
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailCounterTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailCounterTest.java
new file mode 100644
index 000000000000..39a75d8f58cc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailCounterTest.java
@@ -0,0 +1,75 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+public class GuardrailCounterTest extends GuardrailTester
+{
+    private static boolean counterEnabled;
+
+    @BeforeClass
+    public static void setup()
+    {
+        counterEnabled = DatabaseDescriptor.getGuardrailsConfig().counter_enabled;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().counter_enabled = counterEnabled;
+    }
+
+    private void setGuardrails(boolean counterEnabled)
+    {
+        DatabaseDescriptor.getGuardrailsConfig().counter_enabled = counterEnabled;
+    }
+
+    private void testCounter(boolean counterEnabled) throws Throwable
+    {
+        setGuardrails(counterEnabled);
+
+        executeNet(String.format("CREATE TABLE %s (pk int PRIMARY KEY, c counter)", createTableName()));
+        execute("UPDATE %s SET c = c + 1 WHERE pk = 10");
+        assertRows(execute("SELECT c FROM %s WHERE pk = 10"), row(1L));
+    }
+
+    @Test
+    public void testCounterEnabled() throws Throwable
+    {
+        testCounter(true);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testCounterDisabled() throws Throwable
+    {
+        testCounter(false);
+    }
+}
\ No newline at end of file

From 753fb6916ba32673a479b8c8e39cec447aa97012 Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Thu, 13 May 2021 13:18:08 -0700
Subject: [PATCH 085/151] STAR-515 Add guardrail for SASI and STAR-512 Add
 guardrails for total and per-table indexes for SAI (#140)

* STAR-512 Add guardrails for total and per-table indexes for SAI

Co-authored-by: Paulo Ricardo Motta Gomes <pauloricardomg@users.noreply.github.com>

* STAR-515 Add guardrail for SASI

Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>

Co-authored-by: Paulo Ricardo Motta Gomes <pauloricardomg@users.noreply.github.com>
Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>
(cherry picked from commit e2ae18ead10d7186a76bb1a8299bfcd4ddf3820e)
(cherry picked from commit eaf8c59b3820a575742370aa6b4936d3783bc49b)
---
 conf/cassandra.yaml                           |  10 +-
 .../cassandra/config/DatabaseDescriptor.java  |   7 +
 .../schema/CreateIndexStatement.java          |  78 ++++++-
 .../cassandra/guardrails/Guardrails.java      |  18 ++
 .../guardrails/GuardrailsConfig.java          |  58 +++--
 ...uardrailMaterializedViewsPerTableTest.java |   4 +-
 .../guardrails/GuardrailSAIIndexesTest.java   | 213 ++++++++++++++++++
 .../GuardrailSASIIndexesPerTableTest.java     |  96 ++++++++
 ...GuardrailSecondaryIndexesPerTableTest.java |   4 +-
 .../guardrails/GuardrailsOnTableTest.java     |   2 +-
 10 files changed, 456 insertions(+), 34 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailSASIIndexesPerTableTest.java

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index d25d7f0e6e02..f50084ea502b 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -1468,10 +1468,18 @@ enable_transient_replication: false
   # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
   # columns_per_table_failure_threshold: -1
 
-  # Failure threshold to prevent creating more secondary indexes per table than threshold
+  # Failure threshold to prevent creating more secondary indexes per table than threshold (does not apply to CUSTOM INDEX StorageAttachedIndex)
   # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
   # secondary_index_per_table_failure_threshold: -1
 
+  # Failure threshold for number of StorageAttachedIndex per table (only applies to CUSTOM INDEX StorageAttachedIndex)
+  # Default is 10 (same when apply_dbaas_defaults is enabled)
+  # sai_indexes_per_table_failure_threshold: 10
+  #
+  # Failure threshold for total number of StorageAttachedIndex across all keyspaces (only applies to CUSTOM INDEX StorageAttachedIndex)
+  # Default is 100 (same when apply_dbaas_defaults is enabled)
+  # sai_indexes_total_failure_threshold: 100
+
   # Failure threshold to prevent creating more materialized views per table than threshold.
   # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
   # materialized_view_per_table_failure_threshold: -1
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 2e09f150af9d..a22f8cb45e0f 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -3414,6 +3414,13 @@ public static GuardrailsConfig getGuardrailsConfig()
     {
         return conf.guardrails;
     }
+
+    @VisibleForTesting
+    public static boolean setApplyDbaasDefaults(boolean dbaasDefaults)
+    {
+        return conf.apply_dbaas_defaults = dbaasDefaults;
+    }
+
     public static boolean isApplyDbaasDefaults()
     {
         return conf.apply_dbaas_defaults;
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
index 78bc194481d5..c781480a538d 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateIndexStatement.java
@@ -18,9 +18,9 @@
 package org.apache.cassandra.cql3.statements.schema;
 
 import java.util.*;
+import java.util.stream.StreamSupport;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Strings;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
 
@@ -35,6 +35,7 @@
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.marshal.MapType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.guardrails.Guardrail;
 import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.index.sasi.SASIIndex;
 import org.apache.cassandra.schema.*;
@@ -133,17 +134,6 @@ public Keyspaces apply(Keyspaces schema)
         if (Keyspace.open(table.keyspace).getReplicationStrategy().hasTransientReplicas())
             throw new InvalidRequestException("Secondary indexes are not supported on transiently replicated keyspaces");
 
-        // guardrails to limit number of secondary indexes per table.
-        if (!attrs.isCustom)
-        {
-            long existingSecondaryIndexes = table.indexes.stream().filter(indexMetadata -> !indexMetadata.isCustom()).count();
-            Guardrails.secondaryIndexesPerTable.guard(existingSecondaryIndexes + 1,
-                                                      Strings.isNullOrEmpty(indexName)
-                                                      ? String.format("on table %s", table.name)
-                                                      : String.format("%s on table %s", indexName, table.name),
-                                                      state);
-        }
-
         List<IndexTarget> indexTargets = Lists.newArrayList(transform(rawIndexTargets, t -> t.prepare(table)));
 
         if (indexTargets.isEmpty() && !attrs.isCustom)
@@ -170,6 +160,27 @@ public Keyspaces apply(Keyspaces schema)
 
         IndexMetadata index = IndexMetadata.fromIndexTargets(indexTargets, name, kind, options);
 
+        String className = index.getIndexClassName();
+        IndexGuardrails guardRails = IndexGuardrails.forClassName(className);
+        String indexDescription = indexName == null ? String.format("on table %s", table.name) : String.format("%s on table %s", indexName, table.name);
+
+        // Guardrail to limit number of secondary indexes (per table)
+        if (guardRails.hasPerTableThreshold())
+        {
+            long indexesOnSameTable = table.indexes.stream().filter(other -> className.equals(other.getIndexClassName())).count();
+            guardRails.perTableThreshold.guard(indexesOnSameTable + 1, indexDescription,false, state);
+        }
+
+        // Guardrail to limit number of secondary indexes (total)
+        if (guardRails.hasTotalThreshold())
+        {
+            long indexesOnAllTables = StreamSupport.stream(Keyspace.all().spliterator(), false).flatMap(ks -> ks.getColumnFamilyStores().stream())
+                                                   .flatMap(ks -> ks.indexManager.listIndexes().stream())
+                                                   .map(i -> i.getIndexMetadata().getIndexClassName())
+                                                   .filter(otherClassName -> className.equals(otherClassName)).count();
+            guardRails.totalThreshold.guard(indexesOnAllTables + 1, indexDescription, false, state);
+        }
+
         // check to disallow creation of an index which duplicates an existing one in all but name
         IndexMetadata equalIndex = tryFind(table.indexes, i -> i.equalsWithoutName(index)).orNull();
         if (null != equalIndex)
@@ -322,4 +333,47 @@ public CreateIndexStatement prepare(ClientState state)
             return new CreateIndexStatement(keyspaceName, tableName.getName(), indexName.getName(), rawIndexTargets, attrs, ifNotExists);
         }
     }
+
+    enum IndexGuardrails
+    {
+        LEGACY(Guardrails.secondaryIndexesPerTable, null),
+        SAI(Guardrails.indexesPerTableSai, Guardrails.indexesTotalSai),
+        SASI(Guardrails.indexesPerTableSasi, null),
+        UNKNOWN(null, null);
+
+        final Guardrail.Threshold perTableThreshold;
+        final Guardrail.Threshold totalThreshold;
+
+        IndexGuardrails(Guardrail.Threshold perTableThreshold, Guardrail.Threshold totalThreshold)
+        {
+            this.perTableThreshold = perTableThreshold;
+            this.totalThreshold = totalThreshold;
+        }
+
+        boolean hasPerTableThreshold()
+        {
+            return perTableThreshold != null;
+        }
+
+        boolean hasTotalThreshold()
+        {
+            return totalThreshold != null;
+        }
+
+        static IndexGuardrails forClassName(String className)
+        {
+            switch (className)
+            {
+                case "org.apache.cassandra.index.internal.CassandraIndex":
+                    return IndexGuardrails.LEGACY;
+                case "org.apache.cassandra.index.sasi.SASIIndex":
+                    return IndexGuardrails.SASI;
+                case "org.apache.cassandra.index.sai.StorageAttachedIndex":
+                    return IndexGuardrails.SAI;
+                default:
+                    return IndexGuardrails.UNKNOWN;
+            }
+        }
+
+    }
 }
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index eebecfb33ae4..8c8da194f71a 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -74,6 +74,24 @@ public abstract class Guardrails
                                                                            (x, what, v, t) -> format("Tables cannot have more than %s secondary indexes, failed to create secondary index %s",
                                                                                                      t, what));
 
+    public static final Threshold indexesPerTableSasi = new Threshold("sasi_indexes_per_table_failure_threshold",
+                                                                      () -> -1,
+                                                                      () -> config.sasi_indexes_per_table_failure_threshold,
+                                                                      (x, what, v, t) -> format("Tables cannot have more than %s SASI indexes, failed to create SASI index %s",
+                                                                                                t, what));
+
+    public static final Threshold indexesPerTableSai = new Threshold("sai_indexes_per_table_failure_threshold",
+                                                                     () -> -1,
+                                                                     () -> config.sai_indexes_per_table_failure_threshold,
+                                                                     (x, what, v, t) -> format("Tables cannot have more than %s StorageAttachedIndex secondary indexes, failed to create secondary index %s",
+                                                                                               t, what));
+
+    public static final Threshold indexesTotalSai = new Threshold("sai_indexes_total_failure_threshold",
+                                                                  () -> -1,
+                                                                  () -> config.sai_indexes_total_failure_threshold,
+                                                                  (x, what, v, t) -> format("Cannot have more than %s StorageAttachedIndex secondary indexes across all keyspaces, failed to create secondary index %s",
+                                                                                            t, what));
+
     public static final Threshold materializedViewsPerTable = new Threshold("materialized_views_per_table",
                                                                             () -> -1,
                                                                             () -> config.materialized_view_per_table_failure_threshold,
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
index 7f30f1e15491..995cd2d290de 100644
--- a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -28,6 +28,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Sets;
 
+import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.statements.schema.TableAttributes;
 import org.apache.cassandra.db.ConsistencyLevel;
@@ -63,7 +64,13 @@
  */
 public class GuardrailsConfig
 {
-    public static final Long NO_LIMIT = -1L;
+    public static final String INDEX_GUARDRAILS_TABLE_FAILURE_THRESHOLD = Config.PROPERTY_PREFIX + "index.guardrails.table_failure_threshold";
+    public static final String INDEX_GUARDRAILS_TOTAL_FAILURE_THRESHOLD = Config.PROPERTY_PREFIX + "index.guardrails.total_failure_threshold";
+
+    public static final int NO_LIMIT = -1;
+    public static final int UNSET = -2;
+    public static final int DEFAULT_INDEXES_PER_TABLE_THRESHOLD = 10;
+    public static final int DEFAULT_INDEXES_TOTAL_THRESHOLD = 100;
 
     public Boolean enabled = false;
 
@@ -79,8 +86,13 @@ public class GuardrailsConfig
 
     public Boolean counter_enabled;
 
-    public Long secondary_index_per_table_failure_threshold;
-    public Long materialized_view_per_table_failure_threshold;
+    // Legacy 2i guardrail
+    public Integer secondary_index_per_table_failure_threshold;
+    public Integer sasi_indexes_per_table_failure_threshold;
+    // SAI indexes guardrail
+    public Integer sai_indexes_per_table_failure_threshold;
+    public Integer sai_indexes_total_failure_threshold;
+    public Integer materialized_view_per_table_failure_threshold;
 
     public Set<String> write_consistency_levels_disallowed;
 
@@ -149,19 +161,21 @@ public void validate()
      * If {@link DatabaseDescriptor#isApplyDbaasDefaults()} is true, apply cloud defaults to guardrails settings that
      * are not specified in yaml; otherwise, apply on-prem defaults to guardrails settings that are not specified in yaml;
      */
+    @VisibleForTesting
     public void applyConfig()
     {
         enforceDefault(truncate_table_enabled, v -> truncate_table_enabled = v, true, true);
 
         enforceDefault(user_timestamps_enabled, v -> user_timestamps_enabled = v, true, true);
 
-        enforceDefault(column_value_size_failure_threshold_in_kb, v -> column_value_size_failure_threshold_in_kb = v, NO_LIMIT, 5 * 1024L);
+        enforceDefault(column_value_size_failure_threshold_in_kb, v -> column_value_size_failure_threshold_in_kb = v, -1L, 5 * 1024L);
 
-        enforceDefault(columns_per_table_failure_threshold, v -> columns_per_table_failure_threshold = v, NO_LIMIT, 20L);
-        enforceDefault(secondary_index_per_table_failure_threshold, v -> secondary_index_per_table_failure_threshold = v, NO_LIMIT, 1L);
-        enforceDefault(materialized_view_per_table_failure_threshold, v -> materialized_view_per_table_failure_threshold = v, NO_LIMIT, 2L);
-        enforceDefault(tables_warn_threshold, v -> tables_warn_threshold = v, NO_LIMIT, 100L);
-        enforceDefault(tables_failure_threshold, v -> tables_failure_threshold = v, NO_LIMIT, 200L);
+        enforceDefault(columns_per_table_failure_threshold, v -> columns_per_table_failure_threshold = v, -1L, 20L);
+        enforceDefault(secondary_index_per_table_failure_threshold, v -> secondary_index_per_table_failure_threshold = v, NO_LIMIT, 1);
+        enforceDefault(sasi_indexes_per_table_failure_threshold, v -> sasi_indexes_per_table_failure_threshold = v, NO_LIMIT, 0);
+        enforceDefault(materialized_view_per_table_failure_threshold, v -> materialized_view_per_table_failure_threshold = v, NO_LIMIT, 2);
+        enforceDefault(tables_warn_threshold, v -> tables_warn_threshold = v, -1L, 100L);
+        enforceDefault(tables_failure_threshold, v -> tables_failure_threshold = v, -1L, 200L);
 
         // We use a LinkedHashSet just for the sake of preserving the ordering in error messages
         enforceDefault(write_consistency_levels_disallowed,
@@ -183,20 +197,32 @@ public void applyConfig()
                                                           .collect(Collectors.toList())));
 
         enforceDefault(partition_size_warn_threshold_in_mb, v -> partition_size_warn_threshold_in_mb = v, 100, 100);
-        enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT.intValue(), 20);
+        enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT, 20);
 
         enforceDefault(counter_enabled, v -> counter_enabled = v, true, true);
 
-        enforceDefault(fields_per_udt_failure_threshold, v -> fields_per_udt_failure_threshold = v, NO_LIMIT, 10L);
-        enforceDefault(collection_size_warn_threshold_in_kb, v -> collection_size_warn_threshold_in_kb = v, NO_LIMIT, 5 * 1024L);
-        enforceDefault(items_per_collection_warn_threshold, v -> items_per_collection_warn_threshold = v, NO_LIMIT, 20L);
+        enforceDefault(fields_per_udt_failure_threshold, v -> fields_per_udt_failure_threshold = v, -1L, 10L);
+        enforceDefault(collection_size_warn_threshold_in_kb, v -> collection_size_warn_threshold_in_kb = v, -1L, 5 * 1024L);
+        enforceDefault(items_per_collection_warn_threshold, v -> items_per_collection_warn_threshold = v, -1L, 20L);
 
         // for node status
-        enforceDefault(disk_usage_percentage_warn_threshold, v -> disk_usage_percentage_warn_threshold = v, NO_LIMIT.intValue(), 70);
-        enforceDefault(disk_usage_percentage_failure_threshold, v -> disk_usage_percentage_failure_threshold = v, NO_LIMIT.intValue(), 80);
+        enforceDefault(disk_usage_percentage_warn_threshold, v -> disk_usage_percentage_warn_threshold = v, NO_LIMIT, 70);
+        enforceDefault(disk_usage_percentage_failure_threshold, v -> disk_usage_percentage_failure_threshold = v, NO_LIMIT, 80);
 
-        enforceDefault(in_select_cartesian_product_failure_threshold, v -> in_select_cartesian_product_failure_threshold = v, NO_LIMIT.intValue(), 25);
+        enforceDefault(in_select_cartesian_product_failure_threshold, v -> in_select_cartesian_product_failure_threshold = v, NO_LIMIT, 25);
         enforceDefault(read_before_write_list_operations_enabled, v -> read_before_write_list_operations_enabled = v, true, false);
+
+        // SAI per-table failure threshold (may be overridden via system property)
+        Integer overrideTableFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TABLE_FAILURE_THRESHOLD, UNSET);
+        if (overrideTableFailureThreshold != UNSET)
+            sai_indexes_per_table_failure_threshold = overrideTableFailureThreshold;
+        enforceDefault(sai_indexes_per_table_failure_threshold, v -> sai_indexes_per_table_failure_threshold = v, DEFAULT_INDEXES_PER_TABLE_THRESHOLD, DEFAULT_INDEXES_PER_TABLE_THRESHOLD);
+
+        // SAI total failure threshold (may be overridden via system property)
+        Integer overrideTotalFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TOTAL_FAILURE_THRESHOLD, UNSET);
+        if (overrideTotalFailureThreshold != UNSET)
+            sai_indexes_total_failure_threshold = overrideTotalFailureThreshold;
+        enforceDefault(sai_indexes_total_failure_threshold, v -> sai_indexes_total_failure_threshold = v, DEFAULT_INDEXES_TOTAL_THRESHOLD, DEFAULT_INDEXES_TOTAL_THRESHOLD);
     }
 
     private void validateDisallowedTableProperties()
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
index 10de77224ecf..6dfb017dfd74 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailMaterializedViewsPerTableTest.java
@@ -40,13 +40,13 @@ public class GuardrailMaterializedViewsPerTableTest extends GuardrailTester
     private static final String CREATE_VIEW = "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s " +
                                               "WHERE k is NOT NULL AND v IS NOT NULL PRIMARY KEY (v, k)";
 
-    private Long defaultMVPerTableFailureThreshold;
+    private int defaultMVPerTableFailureThreshold;
 
     @Before
     public void before()
     {
         defaultMVPerTableFailureThreshold = config().materialized_view_per_table_failure_threshold;
-        config().materialized_view_per_table_failure_threshold = 1L;
+        config().materialized_view_per_table_failure_threshold = 1;
 
         createTable(CREATE_TABLE);
     }
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
new file mode 100644
index 000000000000..eb9d4e21a22e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.stream.StreamSupport;
+
+import com.google.common.base.Strings;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+
+import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
+
+public class GuardrailSAIIndexesTest extends GuardrailTester
+{
+    private int defaultSAIPerTableFailureThreshold;
+    private int defaultSAITotalFailureThreshold;
+
+    @Before
+    public void before()
+    {
+        defaultSAIPerTableFailureThreshold = config().sai_indexes_per_table_failure_threshold;
+        defaultSAITotalFailureThreshold = config().sai_indexes_total_failure_threshold;
+        config().sai_indexes_per_table_failure_threshold = 1;
+        config().sai_indexes_total_failure_threshold = 2;
+    }
+
+    @After
+    public void after()
+    {
+        config().sai_indexes_per_table_failure_threshold = defaultSAIPerTableFailureThreshold;
+        config().sai_indexes_total_failure_threshold = defaultSAITotalFailureThreshold;
+    }
+
+    @Test
+    public void testDefaultsOnPrem()
+    {
+        testDefaults(false);
+    }
+
+    @Test
+    public void testDefaultsDBAAS()
+    {
+        testDefaults(true);
+    }
+
+    public void testDefaults(boolean dbaas)
+    {
+        boolean previous = DatabaseDescriptor.isApplyDbaasDefaults();
+        try
+        {
+            DatabaseDescriptor.setApplyDbaasDefaults(dbaas);
+
+            GuardrailsConfig config = new GuardrailsConfig();
+            config.applyConfig();
+
+            assertEquals(GuardrailsConfig.DEFAULT_INDEXES_PER_TABLE_THRESHOLD, (int) config.sai_indexes_per_table_failure_threshold);
+            assertEquals(GuardrailsConfig.DEFAULT_INDEXES_TOTAL_THRESHOLD, (int) config.sai_indexes_total_failure_threshold);
+        }
+        finally
+        {
+            DatabaseDescriptor.setApplyDbaasDefaults(previous);
+        }
+    }
+
+    @Test
+    public void testPerTableFailureThreshold() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int, v3 int)");
+        String indexName = createIndex(getCreateIndexStatement("v1"));
+        assertIndexesOnCurrentTable(1);
+
+        assertIndexCreationFails("", "v2");
+        assertIndexCreationFails("custom_index_name", "v2");
+        assertIndexesOnCurrentTable(1);
+
+        // guardrail should not affect indexes of other types
+        assertValid(getDifferentCreateIndexStatement("idx2", "v2"));
+        assertIndexesOnCurrentTable(2);
+
+        // drop the first index, we should be able to create new index again
+        dropIndex(format("DROP INDEX %s.%s", keyspace(), indexName));
+        assertIndexesOnCurrentTable(1);
+
+        execute(getCreateIndexStatement("v3"));
+        assertIndexesOnCurrentTable(2);
+
+        // previous guardrail should not apply to another base table
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        assertValid(getCreateIndexStatement("v1"));
+        assertIndexesOnCurrentTable(1);
+
+        assertIndexCreationFails("custom_index_name2", "v2");
+        assertIndexesOnCurrentTable(1);
+    }
+
+    @Test
+    public void testTotalFailureThreshold() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        String indexName = createIndex(getCreateIndexStatement("v1"));
+        assertTotalIndexesOfTheSameType(1);
+        assertGlobalIndexes(1);
+
+        // Create index on new table
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        createIndex(getCreateIndexStatement("v1"));
+        assertTotalIndexesOfTheSameType(2);
+        assertGlobalIndexes(2);
+
+        // Trying create new indexes on current table should fail
+        assertIndexCreationFails("", "v2");
+        assertIndexCreationFails("custom_index_name", "v2");
+        assertTotalIndexesOfTheSameType(2);
+        assertGlobalIndexes(2);
+
+        // Trying to create indexes on new table should also fail
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        assertIndexCreationFails("", "v1");
+
+        // Trying to create different index type should not fail
+        assertValid(getDifferentCreateIndexStatement("idx2", "v2"));
+        assertTotalIndexesOfTheSameType(2);
+        assertGlobalIndexes(3);
+
+        // drop the first index, we should be able to create new index again
+        dropIndex(format("DROP INDEX %s.%s", keyspace(), indexName));
+        assertTotalIndexesOfTheSameType(1);
+        assertGlobalIndexes(2);
+
+        // Now index creation should succeed
+        createIndex(getCreateIndexStatement("v1"));
+        assertTotalIndexesOfTheSameType(2);
+        assertGlobalIndexes(3);
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+        testExcludedUsers(getCreateIndexStatement("excluded_1", "v1"),
+                          getCreateIndexStatement("excluded_2", "v2"),
+                          "DROP INDEX excluded_1",
+                          "DROP INDEX excluded_2");
+    }
+
+    private void assertIndexesOnCurrentTable(int count)
+    {
+        assertEquals(count, getCurrentColumnFamilyStore().indexManager.listIndexes().size());
+    }
+
+    private void assertGlobalIndexes(int count)
+    {
+        int totalIndexes = StreamSupport.stream(Keyspace.all().spliterator(), false).flatMap(k -> k.getColumnFamilyStores().stream()).mapToInt(t -> t.indexManager.listIndexes().size()).sum();
+        assertEquals(count, totalIndexes);
+    }
+
+    private void assertTotalIndexesOfTheSameType(int count)
+    {
+        int totalIndexes = (int) StreamSupport.stream(Keyspace.all().spliterator(), false).flatMap(k -> k.getColumnFamilyStores().stream())
+                                              .flatMap(t -> t.indexManager.listIndexes().stream())
+                                              .filter(i -> i.getIndexMetadata().getIndexClassName().equals(getIndexClassName())).count();
+        assertEquals(count, totalIndexes);
+    }
+
+    private void assertIndexCreationFails(String indexName, String column) throws Throwable
+    {
+        String expectedMessage = String.format("failed to create secondary index %son table %s",
+                                               Strings.isNullOrEmpty(indexName) ? "" : indexName + " ", currentTable());
+        assertFails(expectedMessage, getCreateIndexStatement(indexName, column));
+    }
+
+    protected String getIndexClassName()
+    {
+        return StorageAttachedIndex.class.getName();
+    }
+
+    String getCreateIndexStatement(String column)
+    {
+        return String.format("CREATE CUSTOM INDEX ON %%s (%s) USING '%s'", column, StorageAttachedIndex.class.getCanonicalName());
+    }
+
+    String getCreateIndexStatement(String indexName, String column)
+    {
+        return String.format("CREATE CUSTOM INDEX %s ON %%s (%s) USING '%s'", indexName, column, StorageAttachedIndex.class.getCanonicalName());
+    }
+
+    String getDifferentCreateIndexStatement(String indexName, String column)
+    {
+        return String.format("CREATE INDEX %s ON %%s (%s)", indexName, column);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailSASIIndexesPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailSASIIndexesPerTableTest.java
new file mode 100644
index 000000000000..b74af3cb9919
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailSASIIndexesPerTableTest.java
@@ -0,0 +1,96 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Please see the included license file for details.
+ */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class GuardrailSASIIndexesPerTableTest extends GuardrailTester
+{
+    private int defaultSASIPerTableFailureThreshold;
+
+    @Before
+    public void before()
+    {
+        defaultSASIPerTableFailureThreshold = config().sasi_indexes_per_table_failure_threshold;
+    }
+
+    @After
+    public void after()
+    {
+        config().sasi_indexes_per_table_failure_threshold = defaultSASIPerTableFailureThreshold;
+    }
+
+    @Test
+    public void testCreateIndex() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+
+        config().sasi_indexes_per_table_failure_threshold = 0;
+        assertCreationFailed("v1");
+        assertNumIndexes(0);
+
+        config().sasi_indexes_per_table_failure_threshold = 1;
+        createIndex(getCreateIndexStatement("v1"));
+        assertNumIndexes(1);
+        assertCreationFailed("v2");
+        assertNumIndexes(1);
+    }
+
+    @Test
+    public void testExcludedUsers() throws Throwable
+    {
+        createTable("CREATE TABLE %s (k int primary key, v1 int, v2 int)");
+
+        config().sasi_indexes_per_table_failure_threshold = 0;
+        testExcludedUsers(getCreateIndexStatement("excluded_1", "v1"),
+                          getCreateIndexStatement("excluded_2", "v2"),
+                          "DROP INDEX excluded_1",
+                          "DROP INDEX excluded_2");
+    }
+
+    private void assertNumIndexes(int count)
+    {
+        assertEquals(count, getCurrentColumnFamilyStore().indexManager.listIndexes().size());
+    }
+
+    private void assertCreationFailed(String column) throws Throwable
+    {
+        String expectedMessage = String.format("failed to create SASI index on table %s", currentTable());
+        assertFails(expectedMessage, getCreateIndexStatement(column));
+    }
+
+    private String getCreateIndexStatement(String column)
+    {
+        return String.format("CREATE CUSTOM INDEX ON %%s (%s) USING 'org.apache.cassandra.index.sasi.SASIIndex'", column);
+    }
+
+    private String getCreateIndexStatement(String indexName, String column)
+    {
+        return String.format("CREATE CUSTOM INDEX %s ON %%s (%s) USING 'org.apache.cassandra.index.sasi.SASIIndex'", indexName, column);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
index 5fd3b8f01922..f6165f91234b 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailSecondaryIndexesPerTableTest.java
@@ -29,13 +29,13 @@
 
 public class GuardrailSecondaryIndexesPerTableTest extends GuardrailTester
 {
-    private Long defaultSIPerTableFailureThreshold;
+    private int defaultSIPerTableFailureThreshold;
 
     @Before
     public void before()
     {
         defaultSIPerTableFailureThreshold = config().secondary_index_per_table_failure_threshold;
-        config().secondary_index_per_table_failure_threshold = 1L;
+        config().secondary_index_per_table_failure_threshold = 1;
     }
 
     @After
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
index 4a5bbe9038f8..df85876dcd9d 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
@@ -44,7 +44,7 @@ public class GuardrailsOnTableTest extends GuardrailTester
     private static final String ALTER_VIEW = "ALTER MATERIALIZED VIEW %s.%s WITH %s";
     private static long defaultTablesSoftLimit;
     private static long defaultTableHardLimit;
-    private static long defaultMVPerTableFailureThreshold;
+    private static int defaultMVPerTableFailureThreshold;
     private static Set<String> defaultTablePropertiesDisallowed;
     private static Set<String> defaultTablePropertiesIgnored;
 

From c7f0c550d3b09394f23d4a17d7934a23657866d0 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Wed, 19 May 2021 12:01:58 +0100
Subject: [PATCH 086/151] STAR-674: Map disk_access_mode auto to standard like
 DSE

(cherry picked from commit a76fd0a98796488f454980f3100dda3477f4c238)
(cherry picked from commit 3d3e7a563c35df52cab9b94f09d50a4e3cb63062)
---
 src/java/org/apache/cassandra/config/DatabaseDescriptor.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index a22f8cb45e0f..7584433a7b45 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -420,7 +420,7 @@ else if (!Double.isNaN(conf.commitlog_sync_batch_window_in_ms))
         /* evaluate the DiskAccessMode Config directive, which also affects indexAccessMode selection */
         if (conf.disk_access_mode == Config.DiskAccessMode.auto)
         {
-            conf.disk_access_mode = hasLargeAddressSpace() ? Config.DiskAccessMode.mmap : Config.DiskAccessMode.standard;
+            conf.disk_access_mode = Config.DiskAccessMode.standard;
             indexAccessMode = conf.disk_access_mode;
             logger.info("DiskAccessMode 'auto' determined to be {}, indexAccessMode is {}", conf.disk_access_mode, indexAccessMode);
         }

From d529aa339190c1591de1a372ed775b06439fc69b Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Wed, 19 May 2021 16:07:56 +0100
Subject: [PATCH 087/151] STAR-674: Use offheap_objects for
 memtable_allocation_type like DSE

We also need to reduce the GC logging level (from TRACE to DEBUG) for
some events otherwise we'll overwhelm the system as it attempts to write
events to disk. See STAR-658.

(cherry picked from commit 1c38f5088d61a45c4a0b06412789208e3d665749)
(cherry picked from commit 122bf53bdd4e68f070cd0a0f9bde14eb45e89af1)
---
 conf/cassandra-env.sh                            | 2 +-
 conf/cassandra.yaml                              | 4 ++--
 src/java/org/apache/cassandra/config/Config.java | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conf/cassandra-env.sh b/conf/cassandra-env.sh
index dd088df81fd7..5360c33e4dff 100644
--- a/conf/cassandra-env.sh
+++ b/conf/cassandra-env.sh
@@ -99,7 +99,7 @@ if [ $JAVA_VERSION -ge 11 ] ; then
     if [ "$?" = "1" ] ; then # [X] to prevent ccm from replacing this line
         # only add -Xlog:gc if it's not mentioned in jvm-server.options file
         mkdir -p ${CASSANDRA_LOG_DIR}
-        JVM_OPTS="$JVM_OPTS -Xlog:gc=info,heap*=trace,age*=debug,safepoint=info,promotion*=trace:file=${CASSANDRA_LOG_DIR}/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760"
+        JVM_OPTS="$JVM_OPTS -Xlog:gc=info,heap*=debug,age*=debug,safepoint=info,promotion*=debug:file=${CASSANDRA_LOG_DIR}/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760"
     fi
 else
     # Java 8
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index f50084ea502b..9bb9eca4bbcf 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -546,7 +546,7 @@ concurrent_materialized_view_writes: 32
 #
 # offheap_objects
 #    off heap objects
-memtable_allocation_type: heap_buffers
+memtable_allocation_type: offheap_objects
 
 # Limit memory usage for Merkle tree calculations during repairs. The default
 # is 1/16th of the available heap. The main tradeoff is that smaller trees
@@ -1544,4 +1544,4 @@ enable_transient_replication: false
 
   # Warning threshold to warn when encountering more elements in collection than threshold.
   # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
-  # items_per_collection_warn_threshold: -1
\ No newline at end of file
+  # items_per_collection_warn_threshold: -1
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 482f8905a3e7..4d67acfca139 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -342,7 +342,7 @@ public class Config
 
     public boolean inter_dc_tcp_nodelay = true;
 
-    public MemtableAllocationType memtable_allocation_type = MemtableAllocationType.heap_buffers;
+    public MemtableAllocationType memtable_allocation_type = MemtableAllocationType.offheap_objects;
 
     public volatile int tombstone_warn_threshold = 1000;
     public volatile int tombstone_failure_threshold = 100000;

From 80abaa774b91c1178ec76c235948c2c8a491b1de Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Thu, 20 May 2021 14:16:06 +0100
Subject: [PATCH 088/151] STAR-674: Switch from CMS to G1 GC like DSE

(cherry picked from commit ccf9d94c7e13f3e05118ac83cb1033156fe6b479)
(cherry picked from commit 580bd44d0322e92e954f1736bacac10d3f924edb)
---
 conf/jvm11-server.options | 30 +++++++++++++++---------------
 conf/jvm8-server.options  | 30 +++++++++++++++---------------
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/conf/jvm11-server.options b/conf/jvm11-server.options
index 7e784678535c..7d58d1c5f820 100644
--- a/conf/jvm11-server.options
+++ b/conf/jvm11-server.options
@@ -11,34 +11,34 @@
 
 
 ### CMS Settings
--XX:+UseConcMarkSweepGC
--XX:+CMSParallelRemarkEnabled
--XX:SurvivorRatio=8
--XX:MaxTenuringThreshold=1
--XX:CMSInitiatingOccupancyFraction=75
--XX:+UseCMSInitiatingOccupancyOnly
--XX:CMSWaitDuration=10000
--XX:+CMSParallelInitialMarkEnabled
--XX:+CMSEdenChunksRecordAlways
+#-XX:+UseConcMarkSweepGC
+#-XX:+CMSParallelRemarkEnabled
+#-XX:SurvivorRatio=8
+#-XX:MaxTenuringThreshold=1
+#-XX:CMSInitiatingOccupancyFraction=75
+#-XX:+UseCMSInitiatingOccupancyOnly
+#-XX:CMSWaitDuration=10000
+#-XX:+CMSParallelInitialMarkEnabled
+#-XX:+CMSEdenChunksRecordAlways
 ## some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
--XX:+CMSClassUnloadingEnabled
+#-XX:+CMSClassUnloadingEnabled
 
 
 
 ### G1 Settings
 ## Use the Hotspot garbage-first collector.
-#-XX:+UseG1GC
-#-XX:+ParallelRefProcEnabled
+-XX:+UseG1GC
+-XX:+ParallelRefProcEnabled
 
 #
 ## Have the JVM do less remembered set work during STW, instead
 ## preferring concurrent GC. Reduces p99.9 latency.
-#-XX:G1RSetUpdatingPauseTimePercent=5
+-XX:G1RSetUpdatingPauseTimePercent=5
 #
 ## Main G1GC tunable: lowering the pause target will lower throughput and vise versa.
 ## 200ms is the JVM default and lowest viable setting
 ## 1000ms increases throughput. Keep it smaller than the timeouts in cassandra.yaml.
-#-XX:MaxGCPauseMillis=500
+-XX:MaxGCPauseMillis=500
 
 ## Optional G1 Settings
 # Save CPU time on large (>= 16GB) heaps by delaying region scanning
@@ -80,7 +80,7 @@
 # Java 11 (and newer) GC logging options:
 # See description of https://bugs.openjdk.java.net/browse/JDK-8046148 for details about the syntax
 # The following is the equivalent to -XX:+PrintGCDetails -XX:+UseGCLogFileRotation -XX:NumberOfGCLogFiles=10 -XX:GCLogFileSize=10M
-#-Xlog:gc=info,heap*=trace,age*=debug,safepoint=info,promotion*=trace:file=/var/log/cassandra/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760
+-Xlog:gc=info,heap*=debug,age*=debug,safepoint=info,promotion*=debug:file=/var/log/cassandra/gc.log:time,uptime,pid,tid,level:filecount=10,filesize=10485760
 
 # Notes for Java 8 migration:
 #
diff --git a/conf/jvm8-server.options b/conf/jvm8-server.options
index 6214669eabdc..14d02617cd9e 100644
--- a/conf/jvm8-server.options
+++ b/conf/jvm8-server.options
@@ -18,33 +18,33 @@
 #################
 
 ### CMS Settings
--XX:+UseParNewGC
--XX:+UseConcMarkSweepGC
--XX:+CMSParallelRemarkEnabled
--XX:SurvivorRatio=8
--XX:MaxTenuringThreshold=1
--XX:CMSInitiatingOccupancyFraction=75
--XX:+UseCMSInitiatingOccupancyOnly
--XX:CMSWaitDuration=10000
--XX:+CMSParallelInitialMarkEnabled
--XX:+CMSEdenChunksRecordAlways
+#-XX:+UseParNewGC
+#-XX:+UseConcMarkSweepGC
+#-XX:+CMSParallelRemarkEnabled
+#-XX:SurvivorRatio=8
+#-XX:MaxTenuringThreshold=1
+#-XX:CMSInitiatingOccupancyFraction=75
+#-XX:+UseCMSInitiatingOccupancyOnly
+#-XX:CMSWaitDuration=10000
+#-XX:+CMSParallelInitialMarkEnabled
+#-XX:+CMSEdenChunksRecordAlways
 ## some JVMs will fill up their heap when accessed via JMX, see CASSANDRA-6541
--XX:+CMSClassUnloadingEnabled
+#-XX:+CMSClassUnloadingEnabled
 
 ### G1 Settings
 ## Use the Hotspot garbage-first collector.
-#-XX:+UseG1GC
-#-XX:+ParallelRefProcEnabled
+-XX:+UseG1GC
+-XX:+ParallelRefProcEnabled
 
 #
 ## Have the JVM do less remembered set work during STW, instead
 ## preferring concurrent GC. Reduces p99.9 latency.
-#-XX:G1RSetUpdatingPauseTimePercent=5
+-XX:G1RSetUpdatingPauseTimePercent=5
 #
 ## Main G1GC tunable: lowering the pause target will lower throughput and vise versa.
 ## 200ms is the JVM default and lowest viable setting
 ## 1000ms increases throughput. Keep it smaller than the timeouts in cassandra.yaml.
-#-XX:MaxGCPauseMillis=500
+-XX:MaxGCPauseMillis=500
 
 ## Optional G1 Settings
 # Save CPU time on large (>= 16GB) heaps by delaying region scanning

From 7ce7a83f38cfe7b9519c85525f8255451d3530d8 Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Tue, 25 May 2021 07:15:43 -0700
Subject: [PATCH 089/151] STAR-426 Add specific CQL statement to record dropped
 columns (#156)

Co-authored-by: Sylvain Lebresne <lebresne@gmail.com>
(cherry picked from commit 2a48ae1d278e7e5df2cdea03186b790935d7c953)
---
 src/antlr/Lexer.g                             |   4 +
 src/antlr/Parser.g                            |  12 +-
 .../schema/AlterTableStatement.java           |  17 ++-
 .../schema/CreateTableStatement.java          |   2 +
 .../statements/schema/TableAttributes.java    |  20 +++
 .../cassandra/db/marshal/AbstractType.java    |   8 ++
 .../cassandra/schema/ColumnMetadata.java      |  74 ++++++++++-
 .../cassandra/schema/DroppedColumn.java       |  57 +++++++++
 .../cassandra/schema/TableMetadata.java       |  79 ++++--------
 .../apache/cassandra/schema/ViewMetadata.java |   2 +-
 .../statements/DescribeStatementTest.java     |  48 ++++---
 .../cassandra/db/SchemaCQLHelperTest.java     | 120 ++++++++++++++----
 12 files changed, 333 insertions(+), 110 deletions(-)

diff --git a/src/antlr/Lexer.g b/src/antlr/Lexer.g
index 8950690b5d15..ba1e15a29d8d 100644
--- a/src/antlr/Lexer.g
+++ b/src/antlr/Lexer.g
@@ -220,6 +220,10 @@ K_EDGE:        E D G E;
 K_VERTEX:      V E R T E X;
 K_LABEL:       L A B E L;
 
+K_DROPPED:     D R O P P E D;
+K_COLUMN:      C O L U M N;
+K_RECORD:      R E C O R D;
+
 // Case-insensitive alpha characters
 fragment A: ('a'|'A');
 fragment B: ('b'|'B');
diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g
index ea5e11ac633f..30e1074a4cfe 100644
--- a/src/antlr/Parser.g
+++ b/src/antlr/Parser.g
@@ -794,6 +794,7 @@ tablePartitionKey[CreateTableStatement.Raw stmt]
     ;
 
 tableProperty[CreateTableStatement.Raw stmt]
+    @init { boolean isStatic = false; }
     : property[stmt.attrs]
     | K_COMPACT K_STORAGE { $stmt.setCompactStorage(); }
     | K_CLUSTERING K_ORDER K_BY '(' tableClusteringOrder[stmt] (',' tableClusteringOrder[stmt])* ')'
@@ -802,6 +803,12 @@ tableProperty[CreateTableStatement.Raw stmt]
              K_FROM noncol_ident '(' ident (',' ident)* ')'
              K_TO noncol_ident '(' ident (',' ident)* ')'
              {stmt.attrs.addProperty("dse_edge_label_property", "edge");}
+    | K_DROPPED K_COLUMN K_RECORD
+          k=ident v=comparatorType (K_STATIC {isStatic = true;})?
+          K_USING K_TIMESTAMP t=INTEGER
+      {
+          stmt.attrs.addDroppedColumnRecord(k, v, isStatic, Long.parseLong($t.text));
+      }
     ;
 
 tableClusteringOrder[CreateTableStatement.Raw stmt]
@@ -947,7 +954,7 @@ alterTableStatement returns [AlterTableStatement.Raw stmt]
       | K_DROP (        id=ident { $stmt.drop(id);  }
                | ('('  id1=ident { $stmt.drop(id1); }
                  ( ',' idn=ident { $stmt.drop(idn); } )* ')') )
-               ( K_USING K_TIMESTAMP t=INTEGER { $stmt.timestamp(Long.parseLong(Constants.Literal.integer($t.text).getText())); } )?
+               ( K_USING K_TIMESTAMP t=INTEGER { $stmt.dropTimestamp(Long.parseLong($t.text)); } )?
 
       | K_RENAME id1=ident K_TO toId1=ident { $stmt.rename(id1, toId1); }
          ( K_AND idn=ident K_TO toIdn=ident { $stmt.rename(idn, toIdn); } )*
@@ -1908,5 +1915,8 @@ basic_unreserved_keyword returns [String str]
         | K_EDGE
         | K_VERTEX
         | K_LABEL
+        | K_DROPPED
+        | K_COLUMN
+        | K_RECORD
         ) { $str = $k.text; }
     ;
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
index 33669b1a572a..1447eb9f48a3 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
@@ -35,7 +35,6 @@
 
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
-import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.auth.Permission;
 
 import org.apache.cassandra.cql3.CQL3Type;
@@ -51,11 +50,11 @@
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.DroppedColumn;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.schema.Keyspaces;
 import org.apache.cassandra.guardrails.Guardrails;
-import org.apache.cassandra.schema.*;
 import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.TableParams;
@@ -444,7 +443,11 @@ public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table)
                 throw ire("read_repair must be set to 'NONE' for transiently replicated keyspaces");
             }
 
-            return keyspace.withSwapped(keyspace.tables.withSwapped(table.withSwapped(params)));
+            TableMetadata.Builder builder = table.unbuild().params(params);
+            for (DroppedColumn.Raw record : attrs.droppedColumnRecords())
+                builder.recordColumnDrop(record.prepare(keyspaceName, tableName));
+
+            return keyspace.withSwapped(keyspace.tables.withSwapped(builder.build()));
         }
     }
 
@@ -561,7 +564,7 @@ private enum Kind
 
         // DROP
         private final Set<ColumnIdentifier> droppedColumns = new HashSet<>();
-        private Long timestamp = null; // will use execution timestamp if not provided by query
+        private Long dropTimestamp = null; // will use execution timestamp if not provided by query
 
         // RENAME
         private final Map<ColumnIdentifier, ColumnIdentifier> renamedColumns = new HashMap<>();
@@ -583,7 +586,7 @@ public AlterTableStatement prepare(ClientState state)
             {
                 case          ALTER_COLUMN: return new AlterColumn(keyspaceName, tableName);
                 case           ADD_COLUMNS: return new AddColumns(keyspaceName, tableName, addedColumns);
-                case          DROP_COLUMNS: return new DropColumns(keyspaceName, tableName, droppedColumns, timestamp);
+                case          DROP_COLUMNS: return new DropColumns(keyspaceName, tableName, droppedColumns, dropTimestamp);
                 case        RENAME_COLUMNS: return new RenameColumns(keyspaceName, tableName, renamedColumns);
                 case         ALTER_OPTIONS: return new AlterOptions(keyspaceName, tableName, attrs);
                 case  DROP_COMPACT_STORAGE: return new DropCompactStorage(keyspaceName, tableName);
@@ -614,9 +617,9 @@ public void dropCompactStorage()
             kind = Kind.DROP_COMPACT_STORAGE;
         }
 
-        public void timestamp(long timestamp)
+        public void dropTimestamp(long timestamp)
         {
-            this.timestamp = timestamp;
+            this.dropTimestamp = timestamp;
         }
 
         public void rename(ColumnIdentifier from, ColumnIdentifier to)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index 0068cb7f3a3c..cc163dd68807 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -324,6 +324,8 @@ public TableMetadata.Builder builder(Types types)
                     builder.addRegularColumn(column, type.getType());
             });
         }
+        for (DroppedColumn.Raw record : attrs.droppedColumnRecords())
+            builder.recordColumnDrop(record.prepare(keyspaceName, tableName));
         return builder;
     }
 
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
index 23901efaf640..13344b1de450 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java
@@ -17,18 +17,24 @@
  */
 package org.apache.cassandra.cql3.statements.schema;
 
+import java.util.Collection;
+import java.util.HashMap;
 import java.util.Map;
 import java.util.Set;
 
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Sets;
 
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.statements.PropertyDefinitions;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.exceptions.SyntaxException;
 import org.apache.cassandra.schema.CachingParams;
 import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.schema.DroppedColumn;
 import org.apache.cassandra.schema.MemtableParams;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableParams;
@@ -64,12 +70,26 @@ public final class TableAttributes extends PropertyDefinitions
         validKeywords = validBuilder.build();
     }
 
+    private final Map<ColumnIdentifier, DroppedColumn.Raw> droppedColumnRecords = new HashMap<>();
+
     public void validate()
     {
         validate(validKeywords, obsoleteKeywords);
         build(TableParams.builder()).validate();
     }
 
+    public void addDroppedColumnRecord(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, long timestamp)
+    {
+        DroppedColumn.Raw newRecord = new DroppedColumn.Raw(name, type, isStatic, timestamp);
+        if (droppedColumnRecords.put(name, newRecord) != null)
+            throw new InvalidRequestException(String.format("Cannot have multiple dropped column record for column %s", name));
+    }
+
+    public Collection<DroppedColumn.Raw> droppedColumnRecords()
+    {
+        return droppedColumnRecords.values();
+    }
+
     TableParams asNewTableParams()
     {
         return build(TableParams.builder());
diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
index bc6918118549..4c48e7d058f8 100644
--- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java
+++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java
@@ -542,6 +542,14 @@ public <V> boolean referencesUserType(V name, ValueAccessor<V> accessor)
         return false;
     }
 
+    /**
+     * Whether this type is or contains any UDT.
+     */
+    public final boolean referencesUserTypes()
+    {
+        return isUDT() || subTypes().stream().anyMatch(AbstractType::referencesUserTypes);
+    }
+
     /**
      * Returns an instance of this type with all references to the provided user type recursively replaced with its new
      * definition.
diff --git a/src/java/org/apache/cassandra/schema/ColumnMetadata.java b/src/java/org/apache/cassandra/schema/ColumnMetadata.java
index 37fbfa248275..db21935d2f0e 100644
--- a/src/java/org/apache/cassandra/schema/ColumnMetadata.java
+++ b/src/java/org/apache/cassandra/schema/ColumnMetadata.java
@@ -33,9 +33,10 @@
 import org.apache.cassandra.db.marshal.*;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.MarshalException;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.github.jamm.Unmetered;
 
+import static java.lang.String.format;
+
 @Unmetered
 public class ColumnMetadata extends ColumnSpecification implements Selectable, Comparable<ColumnMetadata>
 {
@@ -73,6 +74,11 @@ public boolean isPrimaryKeyKind()
 
     }
 
+    /**
+     * Whether this is a dropped column.
+     */
+    private final boolean isDropped;
+
     public final Kind kind;
 
     /*
@@ -145,6 +151,25 @@ public static ColumnMetadata staticColumn(String keyspace, String table, String
         return new ColumnMetadata(keyspace, table, ColumnIdentifier.getInterned(name, true), type, NO_POSITION, Kind.STATIC);
     }
 
+    /**
+     * Rebuild the metadata for a dropped column from its recorded data.
+     *
+     * <p>Please note that this method expects that the provided arguments are those of a dropped column, and in
+     * particular that the type uses no UDT (any should have been expanded). If a column is being dropped, prefer
+     * {@link #asDropped()} to transform the existing column to a dropped one as this deals with type expansion directly.
+     */
+    public static ColumnMetadata droppedColumn(String keyspace,
+                                               String table,
+                                               ColumnIdentifier name,
+                                               AbstractType<?> type,
+                                               Kind kind)
+    {
+        assert !kind.isPrimaryKeyKind();
+        assert !type.referencesUserTypes()
+        : format("In %s.%s, dropped column %s type should not contain UDT; got %s" , keyspace, table, name, type);
+        return new ColumnMetadata(keyspace, table, name, type, NO_POSITION, kind, true);
+    }
+
     public ColumnMetadata(TableMetadata table, ByteBuffer name, AbstractType<?> type, int position, Kind kind)
     {
         this(table.keyspace,
@@ -162,6 +187,17 @@ public ColumnMetadata(String ksName,
                           AbstractType<?> type,
                           int position,
                           Kind kind)
+    {
+        this(ksName, cfName, name, type, position, kind, false);
+    }
+    
+    public ColumnMetadata(String ksName,
+                          String cfName,
+                          ColumnIdentifier name,
+                          AbstractType<?> type,
+                          int position,
+                          Kind kind,
+                          boolean isDropped)
     {
         super(ksName, cfName, name, type);
         assert name != null && type != null && kind != null;
@@ -173,6 +209,7 @@ public ColumnMetadata(String ksName,
         this.cellComparator = cellPathComparator == null ? ColumnData.comparator : (a, b) -> cellPathComparator.compare(a.path(), b.path());
         this.asymmetricCellPathComparator = cellPathComparator == null ? null : (a, b) -> cellPathComparator.compare(((Cell<?>)a).path(), (CellPath) b);
         this.comparisonOrder = comparisonOrder(kind, isComplex(), Math.max(0, position), name);
+        this.isDropped = isDropped;
     }
 
     private static Comparator<CellPath> makeCellPathComparator(Kind kind, AbstractType<?> type)
@@ -225,19 +262,48 @@ public boolean isPlaceholder()
         return false;
     }
 
+    /**
+     * Whether that is the column metadata of a dropped column.
+     */
+    public boolean isDropped()
+    {
+        return isDropped;
+    }
+
     public ColumnMetadata copy()
     {
-        return new ColumnMetadata(ksName, cfName, name, type, position, kind);
+        return new ColumnMetadata(ksName, cfName, name, type, position, kind, isDropped);
     }
 
     public ColumnMetadata withNewName(ColumnIdentifier newName)
     {
-        return new ColumnMetadata(ksName, cfName, newName, type, position, kind);
+        return new ColumnMetadata(ksName, cfName, newName, type, position, kind, isDropped);
     }
 
     public ColumnMetadata withNewType(AbstractType<?> newType)
     {
-        return new ColumnMetadata(ksName, cfName, name, newType, position, kind);
+        return new ColumnMetadata(ksName, cfName, name, newType, position, kind, isDropped);
+    }
+
+    /**
+     * Transforms this (non-dropped) column metadata into one suitable when the column is dropped.
+     *
+     * <p>This should be used when a column is dropped to create the relevant {@link DroppedColumn} record.
+     *
+     * @return the transformed metadata. It will be equivalent to {@code this} except that 1) its {@link #isDropped}
+     * method will return {@code true} and 2) any UDT within the column type will have been expanded to tuples (see
+     * {@link AbstractType#expandUserTypes()}).
+     */
+    ColumnMetadata asDropped()
+    {
+        assert !isDropped : this + " was already dropped";
+        return new ColumnMetadata(ksName,
+                                  cfName,
+                                  name,
+                                  type.expandUserTypes(),
+                                  position,
+                                  kind,
+                                  true);
     }
 
     public boolean isPartitionKey()
diff --git a/src/java/org/apache/cassandra/schema/DroppedColumn.java b/src/java/org/apache/cassandra/schema/DroppedColumn.java
index 90dfe651f7e0..d59383a9a04f 100644
--- a/src/java/org/apache/cassandra/schema/DroppedColumn.java
+++ b/src/java/org/apache/cassandra/schema/DroppedColumn.java
@@ -20,11 +20,24 @@
 import com.google.common.base.MoreObjects;
 import com.google.common.base.Objects;
 
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.db.marshal.AbstractType;
+
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
+
 public final class DroppedColumn
 {
     public final ColumnMetadata column;
     public final long droppedTime; // drop timestamp, in microseconds, yet with millisecond granularity
 
+    /**
+     * Creates a new dropped column record.
+     *
+     * @param column the metadata for the dropped column. This <b>must</b> be a dropped metadata, that is we should
+     * have {@code column.isDropped() == true}.
+     * @param droppedTime the time at which the column was dropped, in microseconds.
+     */
     public DroppedColumn(ColumnMetadata column, long droppedTime)
     {
         this.column = column;
@@ -51,9 +64,53 @@ public int hashCode()
         return Objects.hashCode(column, droppedTime);
     }
 
+    public String toCQLString() // renders this record as a "DROPPED COLUMN RECORD ..." table option clause
+    {
+        return String.format("DROPPED COLUMN RECORD %s %s%s USING TIMESTAMP %d",
+                             column.name.toCQLString(), // quote the identifier when needed so the clause re-parses to the same column
+                             column.type.asCQL3Type(),
+                             column.isStatic() ? " static" : "",
+                             droppedTime);
+    }
+
     @Override
     public String toString()
     {
         return MoreObjects.toStringHelper(this).add("column", column).add("droppedTime", droppedTime).toString();
     }
+
+    /**
+     * A parsed dropped column record (from CREATE TABLE ... WITH DROPPED COLUMN RECORD ...).
+     */
+    public static final class Raw
+    {
+        private final ColumnIdentifier name;
+        private final CQL3Type.Raw type;
+        private final boolean isStatic;
+        private final long timestamp;
+
+        public Raw(ColumnIdentifier name, CQL3Type.Raw type, boolean isStatic, long timestamp)
+        {
+            this.name = name;
+            this.type = type;
+            this.isStatic = isStatic;
+            this.timestamp = timestamp;
+        }
+
+        public DroppedColumn prepare(String keyspace, String table)
+        {
+            ColumnMetadata.Kind kind = isStatic ? ColumnMetadata.Kind.STATIC : ColumnMetadata.Kind.REGULAR;
+            AbstractType<?> parsedType = type.prepare(keyspace).getType();
+            if (parsedType.referencesUserTypes())
+                throw invalidRequest("Invalid type %s for DROPPED COLUMN RECORD on %s: dropped column types should "
+                                     + "not have user types", type, name);
+
+            ColumnMetadata droppedColumn = ColumnMetadata.droppedColumn(keyspace,
+                                                                        table,
+                                                                        name,
+                                                                        parsedType,
+                                                                        kind);
+            return new DroppedColumn(droppedColumn, timestamp);
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/schema/TableMetadata.java b/src/java/org/apache/cassandra/schema/TableMetadata.java
index c87bdb60b118..bfec66eb32e8 100644
--- a/src/java/org/apache/cassandra/schema/TableMetadata.java
+++ b/src/java/org/apache/cassandra/schema/TableMetadata.java
@@ -19,7 +19,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
-import java.util.Map.Entry;
 
 import javax.annotation.Nullable;
 
@@ -1041,7 +1040,19 @@ public Builder recordDeprecatedSystemColumn(String name, AbstractType<?> type)
 
         public Builder recordColumnDrop(ColumnMetadata column, long timeMicros)
         {
-            droppedColumns.put(column.name.bytes, new DroppedColumn(column.withNewType(column.type.expandUserTypes()), timeMicros));
+            return recordColumnDrop(new DroppedColumn(column.asDropped(), timeMicros));
+        }
+
+        public Builder recordColumnDrop(DroppedColumn dropped) // registers a dropped-column record, keyed by column name
+        {
+            DroppedColumn previous = droppedColumns.get(dropped.column.name.bytes);
+            if (previous != null && previous.droppedTime > dropped.droppedTime) // refuse to replace a newer record with an older one
+                throw new ConfigurationException(String.format("Invalid dropped column record for column %s in %s at "
+                                                               + "%d: pre-existing record at %d is newer",
+                                                               dropped.column.name, this.name, dropped.droppedTime,
+                                                               previous.droppedTime)); // order matches the message: rejected record first, existing one second
+
+            droppedColumns.put(dropped.column.name.bytes, dropped);
             return this;
         }
 
@@ -1261,7 +1272,7 @@ public void appendCqlTo(CqlBuilder builder,
         builder.append(" WITH ")
                .increaseIndent();
 
-        appendTableOptions(builder, internals);
+        appendTableOptions(builder, internals, includeDroppedColumns);
 
         builder.decreaseIndent();
 
@@ -1270,9 +1281,6 @@ public void appendCqlTo(CqlBuilder builder,
             builder.newLine()
                    .append("*/");
         }
-
-        if (includeDroppedColumns)
-            appendDropColumns(builder);
     }
 
     private void appendColumnDefinitions(CqlBuilder builder,
@@ -1283,37 +1291,16 @@ private void appendColumnDefinitions(CqlBuilder builder,
         while (iter.hasNext())
         {
             ColumnMetadata column = iter.next();
-            // If the column has been re-added after a drop, we don't include it right away. Instead, we'll add the
-            // dropped one first below, then we'll issue the DROP and then the actual ADD for this column, thus
-            // simulating the proper sequence of events.
-            if (includeDroppedColumns && droppedColumns.containsKey(column.name.bytes))
-                continue;
-
             column.appendCqlTo(builder);
 
             if (hasSingleColumnPrimaryKey && column.isPartitionKey())
                 builder.append(" PRIMARY KEY");
 
-            if (!hasSingleColumnPrimaryKey || (includeDroppedColumns && !droppedColumns.isEmpty()) || iter.hasNext())
+            if (!hasSingleColumnPrimaryKey || iter.hasNext())
                 builder.append(',');
 
             builder.newLine();
         }
-
-        if (includeDroppedColumns)
-        {
-            Iterator<DroppedColumn> iterDropped = droppedColumns.values().iterator();
-            while (iterDropped.hasNext())
-            {
-                DroppedColumn dropped = iterDropped.next();
-                dropped.column.appendCqlTo(builder);
-
-                if (!hasSingleColumnPrimaryKey || iter.hasNext())
-                    builder.append(',');
-
-                builder.newLine();
-            }
-        }
     }
 
     void appendPrimaryKey(CqlBuilder builder)
@@ -1344,7 +1331,7 @@ void appendPrimaryKey(CqlBuilder builder)
                .newLine();
     }
 
-    void appendTableOptions(CqlBuilder builder, boolean internals)
+    void appendTableOptions(CqlBuilder builder, boolean internals, boolean includeDroppedColumns)
     {
         if (internals)
             builder.append("ID = ")
@@ -1368,6 +1355,8 @@ void appendTableOptions(CqlBuilder builder, boolean internals)
         }
         else
         {
+            if (includeDroppedColumns)
+                appendDropColumns(builder);
             params.appendCqlTo(builder);
         }
         builder.append(";");
@@ -1375,31 +1364,11 @@ void appendTableOptions(CqlBuilder builder, boolean internals)
 
     private void appendDropColumns(CqlBuilder builder)
     {
-        for (Entry<ByteBuffer, DroppedColumn> entry : droppedColumns.entrySet())
+        for (DroppedColumn dropped : droppedColumns.values())
         {
-            DroppedColumn dropped = entry.getValue();
-
-            builder.newLine()
-                   .append("ALTER TABLE ")
-                   .append(toString())
-                   .append(" DROP ")
-                   .append(dropped.column.name)
-                   .append(" USING TIMESTAMP ")
-                   .append(dropped.droppedTime)
-                   .append(';');
-
-            ColumnMetadata column = getColumn(entry.getKey());
-            if (column != null)
-            {
-                builder.newLine()
-                       .append("ALTER TABLE ")
-                       .append(toString())
-                       .append(" ADD ");
-
-                column.appendCqlTo(builder);
-
-                builder.append(';');
-            }
+            builder.append(dropped.toCQLString())
+                   .newLine()
+                   .append("AND ");
         }
     }
 
@@ -1572,13 +1541,13 @@ public void appendCqlTo(CqlBuilder builder,
                    .append("*/");
         }
 
-        void appendTableOptions(CqlBuilder builder, boolean internals)
+        void appendTableOptions(CqlBuilder builder, boolean internals, boolean includeDroppedColumns)
         {
             builder.append("COMPACT STORAGE")
                    .newLine()
                    .append("AND ");
 
-            super.appendTableOptions(builder, internals);
+            super.appendTableOptions(builder, internals, includeDroppedColumns);
         }
 
         public static ColumnMetadata getCompactValueColumn(RegularAndStaticColumns columns)
diff --git a/src/java/org/apache/cassandra/schema/ViewMetadata.java b/src/java/org/apache/cassandra/schema/ViewMetadata.java
index 0053249d4248..e068e947ca86 100644
--- a/src/java/org/apache/cassandra/schema/ViewMetadata.java
+++ b/src/java/org/apache/cassandra/schema/ViewMetadata.java
@@ -198,7 +198,7 @@ public void appendCqlTo(CqlBuilder builder,
                .append(" WITH ")
                .increaseIndent();
 
-        metadata.appendTableOptions(builder, internals);
+        metadata.appendTableOptions(builder, internals, false);
     }
 
     @Override
diff --git a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
index f90bce6b2868..086a5945c99c 100644
--- a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
+++ b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java
@@ -477,25 +477,37 @@ public void testDescribeTableWithInternals() throws Throwable
                                       "    v3 int,\n" +
                                       "    PRIMARY KEY ((pk1, pk2), c)\n" +
                                       ") WITH ID = " + id + "\n" +
-                                      "    AND CLUSTERING ORDER BY (c ASC)\n" +
-                                      "    AND " + tableParametersCql();
+                                      "    AND CLUSTERING ORDER BY (c ASC)\n";
 
         assertRowsNet(executeDescribeNet("DESCRIBE TABLE " + KEYSPACE + "." + table + " WITH INTERNALS"),
                       row(KEYSPACE,
                           "table",
                           table,
-                          tableCreateStatement));
+                          tableCreateStatement +
+                          "    AND " + tableParametersCql()));
 
         String dropStatement = "ALTER TABLE " + KEYSPACE + "." + table + " DROP v3 USING TIMESTAMP 1589286942065000;";
 
         execute(dropStatement);
 
+        String tableCreateStatementAfterDrop = "CREATE TABLE " + KEYSPACE + "." + table + " (\n" +
+                                      "    pk1 text,\n" +
+                                      "    pk2 int,\n" +
+                                      "    c int,\n" +
+                                      "    s decimal static,\n" +
+                                      "    v1 text,\n" +
+                                      "    v2 int,\n" +
+                                      "    PRIMARY KEY ((pk1, pk2), c)\n" +
+                                      ") WITH ID = " + id + "\n" +
+                                      "    AND CLUSTERING ORDER BY (c ASC)\n";
+
         assertRowsNet(executeDescribeNet("DESCRIBE TABLE " + KEYSPACE + "." + table + " WITH INTERNALS"),
                       row(KEYSPACE,
                           "table",
                           table,
-                          tableCreateStatement + "\n" +
-                          dropStatement));
+                          tableCreateStatementAfterDrop +
+                          "    AND DROPPED COLUMN RECORD v3 int USING TIMESTAMP 1589286942065000" + "\n" +
+                          "    AND " + tableParametersCql()));
 
         String addStatement = "ALTER TABLE " + KEYSPACE + "." + table + " ADD v3 int;";
 
@@ -505,9 +517,9 @@ public void testDescribeTableWithInternals() throws Throwable
                       row(KEYSPACE,
                           "table",
                           table,
-                          tableCreateStatement + "\n" +
-                          dropStatement + "\n" +
-                          addStatement));
+                          tableCreateStatement +
+                          "    AND DROPPED COLUMN RECORD v3 int USING TIMESTAMP 1589286942065000" + "\n" +
+                          "    AND " + tableParametersCql()));
     }
 
     @Test
@@ -522,25 +534,31 @@ public void testPrimaryKeyPositionWithAndWithoutInternals() throws Throwable
                                       "    v1 text,\n" +
                                       "    v2 int,\n" +
                                       "    v3 int\n" +
-                                      ") WITH ID = " + id + "\n" +
-                                      "    AND " + tableParametersCql();
-
+                                      ") WITH ID = " + id;
+        
         assertRowsNet(executeDescribeNet("DESCRIBE TABLE " + KEYSPACE + "." + table + " WITH INTERNALS"),
                       row(KEYSPACE,
                           "table",
                           table,
-                          tableCreateStatement));
-
+                          tableCreateStatement + "\n" +
+                          "    AND " + tableParametersCql()));
         String dropStatement = "ALTER TABLE " + KEYSPACE + "." + table + " DROP v3 USING TIMESTAMP 1589286942065000;";
 
         execute(dropStatement);
 
+        String tableCreateStatementAfterDrop = "CREATE TABLE " + KEYSPACE + "." + table + " (\n" +
+                                      "    pk text PRIMARY KEY,\n" +
+                                      "    v1 text,\n" +
+                                      "    v2 int\n" +
+                                      ") WITH ID = " + id;
+
         assertRowsNet(executeDescribeNet("DESCRIBE TABLE " + KEYSPACE + "." + table + " WITH INTERNALS"),
                       row(KEYSPACE,
                           "table",
                           table,
-                          tableCreateStatement + "\n" +
-                          dropStatement));
+                          tableCreateStatementAfterDrop + "\n" +
+                          "    AND DROPPED COLUMN RECORD v3 int USING TIMESTAMP 1589286942065000" + "\n" +
+                          "    AND " + tableParametersCql()));
 
         String tableCreateStatementWithoutDroppedColumn = "CREATE TABLE " + KEYSPACE + "." + table + " (\n" +
                                                           "    pk text PRIMARY KEY,\n" +
diff --git a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
index d801f80ddc10..72693ae577f3 100644
--- a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
+++ b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java
@@ -49,9 +49,11 @@
 
 import static org.hamcrest.CoreMatchers.allOf;
 import static org.hamcrest.CoreMatchers.containsString;
+import static org.hamcrest.CoreMatchers.notNullValue;
 import static org.hamcrest.CoreMatchers.startsWith;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.fail;
 
 public class SchemaCQLHelperTest extends CQLTester
 {
@@ -128,8 +130,8 @@ public void testUserTypesCQL()
     @Test
     public void testDroppedColumnsCQL()
     {
-        String keyspace = "cql_test_keyspace_dropped_columns";
-        String table = "test_table_dropped_columns";
+        String keyspace = createKeyspaceName();
+        String table = createTableName();
 
         TableMetadata.Builder builder =
         TableMetadata.builder(keyspace, table)
@@ -159,29 +161,56 @@ public void testDroppedColumnsCQL()
 
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
 
-        String expected = "CREATE TABLE IF NOT EXISTS cql_test_keyspace_dropped_columns.test_table_dropped_columns (\n" +
+        String expected = "CREATE TABLE IF NOT EXISTS " + keyspace + '.' + table + " (\n" +
                           "    pk1 varint,\n" +
                           "    ck1 varint,\n" +
-                          "    reg1 varint,\n" +
-                          "    reg3 varint,\n" +
-                          "    reg2 varint,\n" +
-                          "    st1 varint static,\n" +
                           "    PRIMARY KEY (pk1, ck1)\n) WITH ID =";
         String actual = SchemaCQLHelper.getTableMetadataAsCQL(cfs.metadata(), true, true, true);
 
         assertThat(actual,
                    allOf(startsWith(expected),
-                         containsString("ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg1 USING TIMESTAMP 10000;"),
-                         containsString("ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg3 USING TIMESTAMP 30000;"),
-                         containsString("ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP reg2 USING TIMESTAMP 20000;"),
-                         containsString("ALTER TABLE cql_test_keyspace_dropped_columns.test_table_dropped_columns DROP st1 USING TIMESTAMP 5000;")));
+                         containsString("DROPPED COLUMN RECORD reg1 varint USING TIMESTAMP 10000"),
+                         containsString("DROPPED COLUMN RECORD reg2 varint USING TIMESTAMP 20000"),
+                         containsString("DROPPED COLUMN RECORD reg3 varint USING TIMESTAMP 30000"),
+                         containsString("DROPPED COLUMN RECORD st1 varint static USING TIMESTAMP 5000")));
+    }
+
+    @Test
+    public void testDroppedColumnsCQLWithEarlierTimestamp()
+    {
+        String keyspace = createKeyspaceName();
+        String table = createTableName();
+
+        TableMetadata.Builder builder =
+        TableMetadata.builder(keyspace, table)
+                     .addPartitionKeyColumn("pk1", IntegerType.instance)
+                     .addClusteringColumn("ck1", IntegerType.instance)
+                     .addStaticColumn("st1", IntegerType.instance)
+                     .addRegularColumn("reg1", IntegerType.instance)
+                     .addRegularColumn("reg2", IntegerType.instance)
+                     .addRegularColumn("reg3", IntegerType.instance);
+
+        ColumnMetadata st1 = builder.getColumn(ByteBufferUtil.bytes("st1"));
+        builder.removeRegularOrStaticColumn(st1.name);
+
+        String expectedMessage = String.format("Invalid dropped column record for column st1 in %s at 5000: pre-existing record at 1000 is newer", table);
+        try
+        {
+            builder.recordColumnDrop(st1, 5000)
+                   .recordColumnDrop(st1, 1000);
+            fail("Expected an ConfigurationException: " + expectedMessage);
+        }
+        catch (ConfigurationException e)
+        {
+            assertThat(e.getMessage(), containsString(expectedMessage));
+        }
     }
 
     @Test
     public void testReaddedColumns()
     {
-        String keyspace = "cql_test_keyspace_readded_columns";
-        String table = "test_table_readded_columns";
+        String keyspace = createKeyspaceName();
+        String table = createTableName();
 
         TableMetadata.Builder builder =
         TableMetadata.builder(keyspace, table)
@@ -209,21 +238,19 @@ public void testReaddedColumns()
 
         // when re-adding, column is present as both column and as dropped column record.
         String actual = SchemaCQLHelper.getTableMetadataAsCQL(cfs.metadata(), true, true, true);
-        String expected = "CREATE TABLE IF NOT EXISTS cql_test_keyspace_readded_columns.test_table_readded_columns (\n" +
+        String expected = "CREATE TABLE IF NOT EXISTS " + keyspace + '.' + table + " (\n" +
                           "    pk1 varint,\n" +
                           "    ck1 varint,\n" +
-                          "    reg2 varint,\n" +
-                          "    reg1 varint,\n" +
                           "    st1 varint static,\n" +
+                          "    reg1 varint,\n" +
+                          "    reg2 varint,\n" +
                           "    PRIMARY KEY (pk1, ck1)\n" +
                           ") WITH ID";
 
         assertThat(actual,
                    allOf(startsWith(expected),
-                         containsString("ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns DROP reg1 USING TIMESTAMP 10000;"),
-                         containsString("ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns ADD reg1 varint;"),
-                         containsString("ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns DROP st1 USING TIMESTAMP 20000;"),
-                         containsString("ALTER TABLE cql_test_keyspace_readded_columns.test_table_readded_columns ADD st1 varint static;")));
+                         containsString("DROPPED COLUMN RECORD reg1 varint USING TIMESTAMP 10000"),
+                         containsString("DROPPED COLUMN RECORD st1 varint static USING TIMESTAMP 20000")));
     }
 
     @Test
@@ -295,7 +322,8 @@ public void testCfmOptionsCQL()
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
 
         assertThat(SchemaCQLHelper.getTableMetadataAsCQL(cfs.metadata(), true, true, true),
-                   containsString("CLUSTERING ORDER BY (cl1 ASC)\n" +
+                   containsString("AND CLUSTERING ORDER BY (cl1 ASC)\n" +
+                            "    AND DROPPED COLUMN RECORD reg1 ascii USING TIMESTAMP " + droppedTimestamp +"\n" +
                             "    AND additional_write_policy = 'ALWAYS'\n" +
                             "    AND bloom_filter_fp_chance = 1.0\n" +
                             "    AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" +
@@ -433,16 +461,14 @@ public void testSnapshot() throws Throwable
                           "    ck1 varint,\n" +
                           "    ck2 varint,\n" +
                           "    reg2 int,\n" +
-                          "    reg1 " + typeC+ ",\n" +
                           "    reg3 int,\n" +
+                          "    reg1 " + typeC + ",\n" +
                           "    PRIMARY KEY ((pk1, pk2), ck1, ck2)\n" +
                           ") WITH ID = " + cfs.metadata.id + "\n" +
-                          "    AND CLUSTERING ORDER BY (ck1 ASC, ck2 DESC)";
+                          "    AND CLUSTERING ORDER BY (ck1 ASC, ck2 DESC)" + "\n" +
+                          "    AND DROPPED COLUMN RECORD reg3 int USING TIMESTAMP 10000";
 
-        assertThat(schema,
-                   allOf(startsWith(expected),
-                         containsString("ALTER TABLE " + keyspace() + "." + tableName + " DROP reg3 USING TIMESTAMP 10000;"),
-                         containsString("ALTER TABLE " + keyspace() + "." + tableName + " ADD reg3 int;")));
+        assertThat(schema, startsWith(expected));
 
         assertThat(schema, containsString("CREATE INDEX IF NOT EXISTS " + tableName + "_reg2_idx ON " + keyspace() + '.' + tableName + " (reg2);"));
 
@@ -479,4 +505,44 @@ public void testBooleanCompositeKey() throws Throwable
         execute("insert into %s (t_id, id, ck, nk) VALUES (true, true, false, true)");
         assertRows(execute("select t_id, id, ck, nk from %s"), row(true, false, false, true), row(true, true, false, true));
     }
+    
+    @Test
+    public void testParseCreateTableWithDroppedColumns()
+    {
+        String keyspace = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
+        String createTable = "CREATE TABLE IF NOT EXISTS %s (\n" +
+                             "    pk1 varint,\n" +
+                             "    ck1 varint,\n" +
+                             "    PRIMARY KEY (pk1, ck1)\n" +
+                             ") WITH ID = 552f4510-b8fd-11eb-aef4-518b3b328020\n" +
+                             "    AND CLUSTERING ORDER BY (ck1 ASC)\n" +
+                             "    AND DROPPED COLUMN RECORD reg1 varint USING TIMESTAMP 10000\n" +
+                             "    AND DROPPED COLUMN RECORD st1 varint static USING TIMESTAMP 5000\n";
+        createTable(keyspace, createTable);
+    }
+
+    @Test
+    public void testParseCreateTableWithDuplicateDroppedColumns()
+    {
+        String keyspace = createKeyspace("CREATE KEYSPACE %s WITH replication={ 'class' : 'SimpleStrategy', 'replication_factor' : 1 }");
+        String createTable = "CREATE TABLE IF NOT EXISTS %s (\n" +
+                             "    pk1 varint,\n" +
+                             "    ck1 varint,\n" +
+                             "    PRIMARY KEY (pk1, ck1)\n" +
+                             ") WITH ID = 552f4510-b8fd-11eb-aef4-518b3b328020\n" +
+                             "    AND CLUSTERING ORDER BY (ck1 ASC)\n" +
+                             "    AND DROPPED COLUMN RECORD reg1 varint USING TIMESTAMP 10000\n" +
+                             "    AND DROPPED COLUMN RECORD reg1 varint static USING TIMESTAMP 5000\n";
+        try
+        {
+            createTable(keyspace, createTable);
+            fail("Expected an InvalidRequestException: Cannot have multiple dropped column record for column reg1");
+        }
+        catch (RuntimeException e)
+        {
+            assertThat(e.getCause(), notNullValue());
+            assertThat(e.getCause().getMessage(),
+                       containsString("Cannot have multiple dropped column record for column"));
+        }
+    }
 }

From bd77281c7a61114fc95e298ab2aa31be25142b4a Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Tue, 25 May 2021 16:20:35 +0200
Subject: [PATCH 090/151] STAR-725 Bloom false positive rate includes true
 negatives (#164)

This is a port of CASSANDRA-15384, which was only partially ported in the
last oss -> dse-db port: only the corresponding dtest was brought over, and
it now fails consistently because the actual fix is missing from dse-db.

Patch by Jaroslaw Grabowski; reviewed by brandonwilliams and paulo for
CASSANDRA-15384

Before this change the bloom filter false positive rate was calculated
without true negatives which resulted in high rates. In an extreme case,
where all queries return no data, the false positive rate could go up to
1.0.

This change includes true negatives in the [recent] bloom filter false
positive ratio.

(cherry picked from commit c0fa15431d3a27677ec1e74da0b419f628356d9f)
---
 .../io/sstable/format/trieindex/TrieIndexSSTableReader.java     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
index 88fcb5e1e50c..dd25f1018a84 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
@@ -334,6 +334,8 @@ public RowIndexEntry getExactPosition(DecoratedKey dk,
         {
             listener.onSSTableSkipped(this, SkippingReason.BLOOM_FILTER);
             Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
+            if (updateStats)
+                bloomFilterTracker.addTrueNegative();
             return null;
         }
 

From bd6096e7c7c9a587976bb3ff49f4ac8cf57d5846 Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Mon, 24 May 2021 15:27:45 +0100
Subject: [PATCH 091/151] STAR-431: Add option to prevent any file-I/O from
 cqlsh

Co-authored-by: Robert Stupp <snazy@snazy.de>
(cherry picked from commit d7e5ff43b037684c8606cfb2b7c515bb8d51cc20)
---
 bin/cqlsh.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 74c9c4dc40eb..a7c5d629c0fc 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -177,6 +177,7 @@ def find_zip(libprefix):
 DEFAULT_SSL = False
 DEFAULT_CONNECT_TIMEOUT_SECONDS = 5
 DEFAULT_REQUEST_TIMEOUT_SECONDS = 10
+DEFAULT_NO_FILE_IO = False
 
 DEFAULT_FLOAT_PRECISION = 5
 DEFAULT_DOUBLE_PRECISION = 5
@@ -231,6 +232,8 @@ def find_zip(libprefix):
                   help='Specify the default request timeout in seconds (default: %default seconds).')
 parser.add_option("-t", "--tty", action='store_true', dest='tty',
                   help='Force tty mode (command prompt).')
+parser.add_option("--no-file-io", action='store_true', dest='no_file_io',
+                  help='Disable cqlsh commands that perform file I/O.')
 
 optvalues = optparse.Values()
 (options, arguments) = parser.parse_args(sys.argv[1:], values=optvalues)
@@ -422,6 +425,7 @@ class Shell(cmd.Cmd):
     last_hist = None
     shunted_query_out = None
     use_paging = True
+    no_file_io = False
 
     default_page_size = 100
 
@@ -442,7 +446,8 @@ def __init__(self, hostname, port, color=False,
                  request_timeout=DEFAULT_REQUEST_TIMEOUT_SECONDS,
                  protocol_version=None,
                  connect_timeout=DEFAULT_CONNECT_TIMEOUT_SECONDS,
-                 is_subshell=False):
+                 is_subshell=False,
+                 no_file_io=False):
         cmd.Cmd.__init__(self, completekey=completekey)
         self.hostname = hostname
         self.port = port
@@ -532,6 +537,7 @@ def __init__(self, hostname, port, color=False,
         self.statement_error = False
         self.single_statement = single_statement
         self.is_subshell = is_subshell
+        self.no_file_io = no_file_io
 
     @property
     def batch_mode(self):
@@ -1553,6 +1559,10 @@ def do_copy(self, parsed):
         on a line by itself to end the data input.
         """
 
+        if self.no_file_io:
+            self.printerr('No file I/O permitted')
+            return
+
         ks = self.cql_unprotect_name(parsed.get_binding('ksname', None))
         if ks is None:
             ks = self.current_keyspace
@@ -1637,6 +1647,11 @@ def do_source(self, parsed):
 
         See also the --file option to cqlsh.
         """
+
+        if self.no_file_io:
+            self.printerr('No file I/O permitted')
+            return
+
         fname = parsed.get_binding('fname')
         fname = os.path.expanduser(self.cql_unprotect_value(fname))
         try:
@@ -1697,6 +1712,11 @@ def do_capture(self, parsed):
         To inspect the current capture configuration, use CAPTURE with no
         arguments.
         """
+
+        if self.no_file_io:
+            self.printerr('No file I/O permitted')
+            return
+
         fname = parsed.get_binding('fname')
         if fname is None:
             if self.shunted_query_out is not None:
@@ -1894,6 +1914,11 @@ def do_clear(self, parsed):
     do_cls = do_clear
 
     def do_debug(self, parsed):
+
+        if self.no_file_io:
+            self.printerr('No file I/O permitted')
+            return
+
         import pdb
         pdb.set_trace()
 
@@ -2159,6 +2184,7 @@ def read_options(cmdlineargs, environment):
     optvalues.connect_timeout = option_with_default(configs.getint, 'connection', 'timeout', DEFAULT_CONNECT_TIMEOUT_SECONDS)
     optvalues.request_timeout = option_with_default(configs.getint, 'connection', 'request_timeout', DEFAULT_REQUEST_TIMEOUT_SECONDS)
     optvalues.execute = None
+    optvalues.no_file_io = option_with_default(configs.getboolean, 'ui', 'no_file_io', False)
 
     (options, arguments) = parser.parse_args(cmdlineargs, values=optvalues)
     # Make sure some user values read from the command line are in unicode
@@ -2329,6 +2355,7 @@ def main(options, hostname, port):
                       single_statement=options.execute,
                       request_timeout=options.request_timeout,
                       connect_timeout=options.connect_timeout,
+                      no_file_io=options.no_file_io,
                       encoding=options.encoding)
     except KeyboardInterrupt:
         sys.exit('Connection aborted.')

From d51809875b936f5919bd6d382c570f0483f7eb5b Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Fri, 21 May 2021 11:35:37 +0100
Subject: [PATCH 092/151] STAR-674: Enable trickle_fsync by default like DSE

(cherry picked from commit fc60821edb0e61ff0d1c6e5ce52c8a8e3d455e78)
---
 conf/cassandra.yaml                              | 2 +-
 src/java/org/apache/cassandra/config/Config.java | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 9bb9eca4bbcf..15be354aab8e 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -633,7 +633,7 @@ index_summary_resize_interval_in_minutes: 60
 # buffers. Enable this to avoid sudden dirty buffer flushing from
 # impacting read latencies. Almost always a good idea on SSDs; not
 # necessarily on platters.
-trickle_fsync: false
+trickle_fsync: true
 trickle_fsync_interval_in_kb: 10240
 
 # TCP port, for commands and data
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 4d67acfca139..82e6770e3d43 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -292,7 +292,7 @@ public class Config
     public ParameterizedClass hints_compression;
 
     public volatile boolean incremental_backups = false;
-    public boolean trickle_fsync = false;
+    public boolean trickle_fsync = true;
     public int trickle_fsync_interval_in_kb = 10240;
 
     public volatile int sstable_preemptive_open_interval_in_mb = 50;

From 0b65dd40abea3d2493083cca603dae658bb46eff Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt@codeblueprint.co.uk>
Date: Wed, 26 May 2021 15:37:37 +0100
Subject: [PATCH 093/151] STAR-431: Add option to prevent any file-I/O from
 cqlsh (apply from correct version of cqlsh.py this time) (#170)

(cherry picked from commit 38a49150cd8d2ea7acbe54095a7fd8c9c3f5d5cf)
---
 bin/cqlsh.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index a7c5d629c0fc..94fea40cf357 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -425,7 +425,7 @@ class Shell(cmd.Cmd):
     last_hist = None
     shunted_query_out = None
     use_paging = True
-    no_file_io = False
+    no_file_io = DEFAULT_NO_FILE_IO
 
     default_page_size = 100
 
@@ -446,8 +446,8 @@ def __init__(self, hostname, port, color=False,
                  request_timeout=DEFAULT_REQUEST_TIMEOUT_SECONDS,
                  protocol_version=None,
                  connect_timeout=DEFAULT_CONNECT_TIMEOUT_SECONDS,
-                 is_subshell=False,
-                 no_file_io=False):
+                 no_file_io=DEFAULT_NO_FILE_IO,
+                 is_subshell=False):
         cmd.Cmd.__init__(self, completekey=completekey)
         self.hostname = hostname
         self.port = port
@@ -536,8 +536,8 @@ def __init__(self, hostname, port, color=False,
         self.empty_lines = 0
         self.statement_error = False
         self.single_statement = single_statement
-        self.is_subshell = is_subshell
         self.no_file_io = no_file_io
+        self.is_subshell = is_subshell
 
     @property
     def batch_mode(self):
@@ -1677,6 +1677,7 @@ def do_source(self, parsed):
                          max_trace_wait=self.max_trace_wait, ssl=self.ssl,
                          request_timeout=self.session.default_timeout,
                          connect_timeout=self.conn.connect_timeout,
+                         no_file_io=self.no_file_io,
                          is_subshell=True)
         # duplicate coverage related settings in subshell
         if self.coverage:
@@ -2184,7 +2185,7 @@ def read_options(cmdlineargs, environment):
     optvalues.connect_timeout = option_with_default(configs.getint, 'connection', 'timeout', DEFAULT_CONNECT_TIMEOUT_SECONDS)
     optvalues.request_timeout = option_with_default(configs.getint, 'connection', 'request_timeout', DEFAULT_REQUEST_TIMEOUT_SECONDS)
     optvalues.execute = None
-    optvalues.no_file_io = option_with_default(configs.getboolean, 'ui', 'no_file_io', False)
+    optvalues.no_file_io = option_with_default(configs.getboolean, 'ui', 'no_file_io', DEFAULT_NO_FILE_IO)
 
     (options, arguments) = parser.parse_args(cmdlineargs, values=optvalues)
     # Make sure some user values read from the command line are in unicode

From d478c9ff8a416eb2b732347e1ec3a7d296407288 Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Thu, 27 May 2021 13:17:05 +0200
Subject: [PATCH 094/151] STAR-429 cqlsh disable cqlsh_history logging (#165)

We can disable saving of the history either via command-line parameter
--disable-history, or by setting disabled = True in the history section of the
cqlshrc. Both options will read existing history, and just won't save new commands.

Co-authored-by: Alex Ott alex.ott@datastax.com
(cherry picked from commit ff1bc1f395bdc00c7c54b8e763d595366a1a3f72)
---
 bin/cqlsh.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 94fea40cf357..f9322cd30e7a 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -234,6 +234,7 @@ def find_zip(libprefix):
                   help='Force tty mode (command prompt).')
 parser.add_option("--no-file-io", action='store_true', dest='no_file_io',
                   help='Disable cqlsh commands that perform file I/O.')
+parser.add_option('--disable-history', action='store_true', help='Disable saving of history', default=False)
 
 optvalues = optparse.Values()
 (options, arguments) = parser.parse_args(sys.argv[1:], values=optvalues)
@@ -2186,6 +2187,7 @@ def read_options(cmdlineargs, environment):
     optvalues.request_timeout = option_with_default(configs.getint, 'connection', 'request_timeout', DEFAULT_REQUEST_TIMEOUT_SECONDS)
     optvalues.execute = None
     optvalues.no_file_io = option_with_default(configs.getboolean, 'ui', 'no_file_io', DEFAULT_NO_FILE_IO)
+    optvalues.disable_history = option_with_default(configs.getboolean, 'history', 'disabled', False)
 
     (options, arguments) = parser.parse_args(cmdlineargs, values=optvalues)
     # Make sure some user values read from the command line are in unicode
@@ -2270,8 +2272,8 @@ def init_history():
         readline.set_completer_delims(delims)
 
 
-def save_history():
-    if readline is not None:
+def save_history(history_disabled=False):
+    if readline is not None and not history_disabled:
         try:
             readline.write_history_file(HISTORY)
         except IOError:
@@ -2377,7 +2379,7 @@ def handle_sighup():
         signal.signal(signal.SIGHUP, handle_sighup)
 
     shell.cmdloop()
-    save_history()
+    save_history(options.disable_history)
 
     if shell.batch_mode and shell.statement_error:
         sys.exit(2)

From 81d62c315fd2297b5dea39af38cbd378631a4efa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomek=20=C5=81asica?= <tlasica@life.pl>
Date: Tue, 1 Jun 2021 13:24:08 +0200
Subject: [PATCH 095/151] STAR-432. Add options for consistency-level and
 serial-consistency-level (#173)

* STAR-432. Add options for consistency-level and serial-consistency-level

Allows overriding the default consistency-level (ONE) and the default serial-consistency-level (SERIAL) on the command line. If provided value is incorrect cqlsh will not start.

Co-authored-by: Robert Stupp <snazy@snazy.de>
(cherry picked from commit d778304357021c2b9813fd1194227b48266045b2)
---
 bin/cqlsh.py | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index f9322cd30e7a..0afb7841c800 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -230,6 +230,10 @@ def find_zip(libprefix):
                   help='Specify the connection timeout in seconds (default: %default seconds).')
 parser.add_option("--request-timeout", default=DEFAULT_REQUEST_TIMEOUT_SECONDS, dest='request_timeout',
                   help='Specify the default request timeout in seconds (default: %default seconds).')
+parser.add_option("--consistency-level", dest='consistency_level',
+                  help='Specify the initial consistency level.')
+parser.add_option("--serial-consistency-level", dest='serial_consistency_level',
+                  help='Specify the initial serial consistency level.')
 parser.add_option("-t", "--tty", action='store_true', dest='tty',
                   help='Force tty mode (command prompt).')
 parser.add_option("--no-file-io", action='store_true', dest='no_file_io',
@@ -429,11 +433,14 @@ class Shell(cmd.Cmd):
     no_file_io = DEFAULT_NO_FILE_IO
 
     default_page_size = 100
+    consistency_level = None
+    serial_consistency_level = None
 
     def __init__(self, hostname, port, color=False,
                  username=None, password=None, encoding=None, stdin=None, tty=True,
                  completekey=DEFAULT_COMPLETEKEY, browser=None, use_conn=None,
                  cqlver=None, keyspace=None,
+                 consistency_level=None, serial_consistency_level=None,
                  tracing_enabled=False, expand_enabled=False,
                  display_nanotime_format=DEFAULT_NANOTIME_FORMAT,
                  display_timestamp_format=DEFAULT_TIMESTAMP_FORMAT,
@@ -463,6 +470,14 @@ def __init__(self, hostname, port, color=False,
         self.tracing_enabled = tracing_enabled
         self.page_size = self.default_page_size
         self.expand_enabled = expand_enabled
+
+        if not consistency_level:
+            raise Exception('Argument consistency_level must not be None')
+        if not serial_consistency_level:
+            raise Exception('Argument serial_consistency_level must not be None')
+        self.consistency_level = consistency_level
+        self.serial_consistency_level = serial_consistency_level
+
         if use_conn:
             self.conn = use_conn
         else:
@@ -531,9 +546,6 @@ def __init__(self, hostname, port, color=False,
             self.show_line_nums = True
         self.stdin = stdin
         self.query_out = sys.stdout
-        self.consistency_level = cassandra.ConsistencyLevel.ONE
-        self.serial_consistency_level = cassandra.ConsistencyLevel.SERIAL
-
         self.empty_lines = 0
         self.statement_error = False
         self.single_statement = single_statement
@@ -1668,6 +1680,8 @@ def do_source(self, parsed):
                          username=username, password=password,
                          encoding=self.encoding, stdin=f, tty=False, use_conn=self.conn,
                          cqlver=self.cql_version, keyspace=self.current_keyspace,
+                         consistency_level=self.consistency_level,
+                         serial_consistency_level=self.serial_consistency_level,
                          tracing_enabled=self.tracing_enabled,
                          display_nanotime_format=self.display_nanotime_format,
                          display_timestamp_format=self.display_timestamp_format,
@@ -2180,6 +2194,9 @@ def read_options(cmdlineargs, environment):
     optvalues.ssl = option_with_default(configs.getboolean, 'connection', 'ssl', DEFAULT_SSL)
     optvalues.encoding = option_with_default(configs.get, 'ui', 'encoding', UTF8)
 
+    optvalues.consistency_level = option_with_default(configs.get, 'cql', 'consistency_level', 'ONE')
+    optvalues.serial_consistency_level = option_with_default(configs.get, 'cql', 'serial_consistency_level', 'SERIAL')
+
     optvalues.tty = option_with_default(configs.getboolean, 'ui', 'tty', sys.stdin.isatty())
     optvalues.protocol_version = option_with_default(configs.getint, 'protocol', 'version', None)
     optvalues.cqlversion = option_with_default(configs.get, 'cql', 'version', None)
@@ -2196,6 +2213,24 @@ def read_options(cmdlineargs, environment):
     options.password = maybe_ensure_text(options.password)
     options.keyspace = maybe_ensure_text(options.keyspace)
 
+    serial_levels = [cassandra.ConsistencyLevel.SERIAL, cassandra.ConsistencyLevel.LOCAL_SERIAL]
+
+    try:
+        cl = cassandra.ConsistencyLevel.name_to_value[options.consistency_level.upper()]
+        if cl in serial_levels:
+            raise KeyError
+        options.consistency_level = cl
+    except KeyError:
+        parser.error('"{}" is not a valid consistency level'.format(options.consistency_level))
+
+    try:
+        cl = cassandra.ConsistencyLevel.name_to_value[options.serial_consistency_level.upper()]
+        if cl not in serial_levels:
+            raise KeyError
+        options.serial_consistency_level = cl
+    except KeyError:
+        parser.error('"{}" is not a valid serial consistency level'.format(options.serial_consistency_level))
+
     hostname = option_with_default(configs.get, 'connection', 'hostname', DEFAULT_HOST)
     port = option_with_default(configs.get, 'connection', 'port', DEFAULT_PORT)
 
@@ -2347,6 +2382,8 @@ def main(options, hostname, port):
                       protocol_version=options.protocol_version,
                       cqlver=options.cqlversion,
                       keyspace=options.keyspace,
+                      consistency_level=options.consistency_level,
+                      serial_consistency_level=options.serial_consistency_level,
                       display_timestamp_format=options.time_format,
                       display_nanotime_format=options.nanotime_format,
                       display_date_format=options.date_format,

From b90b43306e89172ae7882cb041157abf7e30740e Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Tue, 8 Jun 2021 07:07:18 -0700
Subject: [PATCH 096/151] STAR-543 Port remaining guardrails and sync with cndb
 (#180)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Zhao Yang <jasonstack.zhao@gmail.com>
Co-authored-by: Paulo Ricardo Motta Gomes <pauloricardomg@users.noreply.github.com>
Co-authored-by: Eduard Tudenhöfner <etudenhoefner@gmail.com>
Co-authored-by: Andrés de la Peña <adelapena@users.noreply.github.com>
Co-authored-by: Benjamin Lerer <blerer@users.noreply.github.com>
(cherry picked from commit a83e0e87c5521ec24e76de57db03dc0ed093c73f)
---
 conf/cassandra.yaml                           | 165 ++++----
 doc/source/configuration/cass_yaml_file.rst   |   3 +-
 .../org/apache/cassandra/config/Config.java   |  39 +-
 .../cassandra/config/DatabaseDescriptor.java  | 117 +++---
 .../cassandra/cql3/BatchQueryOptions.java     |   4 +-
 .../apache/cassandra/cql3/QueryOptions.java   |  17 +-
 .../cassandra/cql3/UpdateParameters.java      |   6 +-
 .../ClusteringColumnRestrictions.java         |   6 +-
 .../PartitionKeySingleRestrictionSet.java     |   6 +-
 .../cql3/statements/BatchStatement.java       |  93 +++--
 .../statements/ModificationStatement.java     |   2 +-
 .../cql3/statements/SelectStatement.java      |   2 +-
 .../schema/AlterTableStatement.java           |   2 +-
 .../statements/schema/AlterTypeStatement.java |   2 +-
 .../schema/CreateTableStatement.java          |   4 +-
 .../schema/CreateTypeStatement.java           |   2 +-
 .../schema/CreateViewStatement.java           |   1 +
 .../apache/cassandra/db/ConsistencyLevel.java |  32 +-
 .../org/apache/cassandra/db/ReadCommand.java  |  67 +--
 .../TombstoneOverwhelmingException.java       |   2 +-
 .../cassandra/guardrails/Guardrail.java       | 276 ++++++-------
 .../cassandra/guardrails/Guardrails.java      | 313 ++++++++------
 .../guardrails/GuardrailsConfig.java          | 387 ++++++++++++------
 .../io/sstable/format/SSTableWriter.java      |  20 +-
 .../apache/cassandra/service/QueryState.java  |   4 +-
 .../cassandra/service/StorageService.java     |  16 +-
 .../service/disk/usage/DiskUsageMonitor.java  |   4 +-
 .../transport/messages/BatchMessage.java      |   4 +-
 .../transport/messages/ExecuteMessage.java    |   4 +-
 .../transport/messages/OptionsMessage.java    |   3 +
 .../transport/messages/QueryMessage.java      |   4 +-
 .../transport/messages/StartupMessage.java    |   1 +
 .../distributed/test/ReadFailureTest.java     |  18 +-
 .../config/DatabaseDescriptorTest.java        |   8 +-
 .../entities/SecondaryIndexTest.java          |  15 +-
 .../miscellaneous/TombstonesTest.java         |  20 +-
 .../cassandra/fql/FullQueryLoggerTest.java    |   2 +-
 .../guardrails/GuardrailConsistencyTest.java  | 238 +++++------
 .../guardrails/GuardrailDiskUsageTest.java    |   5 +-
 .../guardrails/GuardrailLoggedBatchTest.java  |  88 ++++
 .../guardrails/GuardrailSAIIndexesTest.java   |   6 +-
 .../cassandra/guardrails/GuardrailTester.java |  19 +-
 .../guardrails/GuardrailsOnTableTest.java     |  10 +-
 .../cassandra/service/ClientWarningsTest.java |  42 +-
 .../service/ProtocolBetaVersionTest.java      |   1 -
 .../transport/ClientResourceLimitsTest.java   |   2 +-
 .../transport/MessagePayloadTest.java         |   2 +-
 .../cassandra/transport/SerDeserTest.java     | 103 ++++-
 48 files changed, 1303 insertions(+), 884 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailLoggedBatchTest.java

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 15be354aab8e..d70d025eb910 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -1243,18 +1243,6 @@ transparent_data_encryption_options:
 # SAFETY THRESHOLDS #
 #####################
 
-# When executing a scan, within or across a partition, we need to keep the
-# tombstones seen in memory so we can return them to the coordinator, which
-# will use them to make sure other replicas also know about the deleted rows.
-# With workloads that generate a lot of tombstones, this can cause performance
-# problems and even exaust the server heap.
-# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets)
-# Adjust the thresholds here if you understand the dangers and want to
-# scan more tombstones anyway.  These thresholds may also be adjusted at runtime
-# using the StorageService mbean.
-tombstone_warn_threshold: 1000
-tombstone_failure_threshold: 100000
-
 # Filtering and secondary index queries at read consistency levels above ONE/LOCAL_ONE use a
 # mechanism called replica filtering protection to ensure that results from stale replicas do
 # not violate consistency. (See CASSANDRA-8272 and CASSANDRA-15907 for more details.) This
@@ -1275,19 +1263,6 @@ replica_filtering_protection:
     cached_rows_warn_threshold: 2000
     cached_rows_fail_threshold: 32000
 
-# Log WARN on any multiple-partition batch size exceeding this value. 5kb per batch by default.
-# Caution should be taken on increasing the size of this threshold as it can lead to node instability.
-batch_size_warn_threshold_in_kb: 5
-
-# Fail any multiple-partition batch exceeding this value. 50kb (10x warn threshold) by default.
-batch_size_fail_threshold_in_kb: 50
-
-# Log WARN on any batches not of type LOGGED than span across more partitions than this limit
-unlogged_batch_across_partitions_warn_threshold: 10
-
-# Log a warning when compacting partitions larger than this value
-compaction_large_partition_warning_threshold_mb: 100
-
 # GC Pauses greater than 200 ms will be logged at INFO level
 # This threshold can be adjusted to minimize logging if necessary
 # gc_log_threshold_in_ms: 200
@@ -1435,16 +1410,20 @@ enable_sasi_indexes: false
 # Transient replication is experimental and is not recommended for production use.
 enable_transient_replication: false
 
-# Apply database-as-a-service defaults.
-#
-# When enabled, some guardrails defaults are modified to values that are appropriate for cloud environments.
-# This includes (but is not limited to) stricter guardrails defaults.
-#
-# This can be used as an convenience to develop and test applications meant to run in a cloud environment.
-# apply_dbaas_defaults: false
+  # Emulates DataStax Constellation database-as-a-service defaults.
+  #
+  # When enabled, some defaults are modified to match those used by DataStax Constellation (DataStax cloud data
+  # platform). This includes (but is not limited to) stricter guardrails defaults.
+  #
+  # This can be used as a convenience to develop and test applications meant to run on DataStax Constellation.
+  #
+  # Warning: when enabled, the updated defaults reflect those of DataStax Constellation _at the time_ of the currently
+  #                 used DSE release. This is a best-effort emulation of said defaults. Further, all nodes must use the same
+  #                 config value.
+  # emulate_dbaas_defaults: false
 
-# Guardrails settings.
-# guardrails:
+  # Guardrails settings.
+  # guardrails:
   # When executing a scan, within or across a partition, we need to keep the
   # tombstones seen in memory so we can return them to the coordinator, which
   # will use them to make sure other replicas also know about the deleted rows.
@@ -1455,93 +1434,113 @@ enable_transient_replication: false
   # scan more tombstones anyway.  These thresholds may also be adjusted at runtime
   # using the StorageService mbean.
   #
-  # Default: tombstone_warn_threshold is 1000, may differ if apply_dbaas_defaults is enabled
-  # Default: tombstone_failure_threshold is 100000, may differ if apply_dbaas_defaults is enabled
+  # Default tombstone_warn_threshold is 1000, may differ if emulate_dbaas_defaults is enabled
+  # Default tombstone_failure_threshold is 100000, may differ if emulate_dbaas_defaults is enabled
   # tombstone_warn_threshold: 1000
   # tombstone_failure_threshold: 100000
 
-  # Failure threshold to prevent writing large a column value into Cassandra.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Log a warning when compacting partitions larger than this value.
+  # Default value is 100mb, may differ if emulate_dbaas_defaults is enabled
+  # partition_size_warn_threshold_in_mb: 100
+
+  # Log WARN on any multiple-partition batch size that exceeds this value. 64kb per batch by default.
+  # Use caution when increasing the size of this threshold as it can lead to node instability.
+  # Default value is 64kb, may differ if emulate_dbaas_defaults is enabled
+  # batch_size_warn_threshold_in_kb: 64
+
+  # Fail any multiple-partition batch that exceeds this value. The calculated default is 640kb (10x warn threshold).
+  # Default value is 640kb, may differ if emulate_dbaas_defaults is enabled
+  # batch_size_fail_threshold_in_kb: 640
+
+  # Log WARN on any batches not of type LOGGED that span across more partitions than this limit.
+  # Default value is 10, may differ if emulate_dbaas_defaults is enabled
+  # unlogged_batch_across_partitions_warn_threshold: 10
+
+  # Failure threshold to prevent writing large column value into Cassandra.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # column_value_size_failure_threshold_in_kb: -1
 
   # Failure threshold to prevent creating more columns per table than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # columns_per_table_failure_threshold: -1
 
-  # Failure threshold to prevent creating more secondary indexes per table than threshold (does not apply to CUSTOM INDEX StorageAttachedIndex)
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Failure threshold to prevent creating more fields in user-defined-type than threshold.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
+  # fields_per_udt_failure_threshold: -1
+
+  # Warning threshold to warn when encountering larger size of collection data than threshold.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
+  # collection_size_warn_threshold_in_kb: -1
+
+  # Warning threshold to warn when encountering more elements in collection than threshold.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
+  # items_per_collection_warn_threshold: -1
+
+  # Whether read-before-write operation is allowed, eg. setting list element by index, removing list element
+  # by index. Note: LWT is always allowed.
+  # Default true to allow read before write operation, may differ if emulate_dbaas_defaults is enabled
+  # read_before_write_list_operations_enabled: true
+
+  # Failure threshold to prevent creating more secondary indexes per table than threshold (does not apply to CUSTOM INDEX StorageAttachedIndex)
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # secondary_index_per_table_failure_threshold: -1
 
   # Failure threshold for number of StorageAttachedIndex per table (only applies to CUSTOM INDEX StorageAttachedIndex)
-  # Default is 10 (same when apply_dbaas_defaults is enabled)
+  # Default is 10 (same when emulate_dbaas_defaults is enabled)
   # sai_indexes_per_table_failure_threshold: 10
   #
   # Failure threshold for total number of StorageAttachedIndex across all keyspaces (only applies to CUSTOM INDEX StorageAttachedIndex)
-  # Default is 10 (same when apply_dbaas_defaults is enabled)
+  # Default is 100 (same when emulate_dbaas_defaults is enabled)
   # sai_indexes_total_failure_threshold: 100
 
   # Failure threshold to prevent creating more materialized views per table than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # materialized_view_per_table_failure_threshold: -1
 
   # Warn threshold to warn creating more tables than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # tables_warn_threshold: -1
 
   # Failure threshold to prevent creating more tables than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # tables_failure_threshold: -1
 
-  # Prevents creating tables with provided configurations.
-  # Default: all properties are allowed, may differ if apply_dbaas_defaults is enabled
+  # Prevents creating tables with the provided configurations.
+  # Default all properties are allowed, may differ if emulate_dbaas_defaults is enabled
   # table_properties_disallowed:
 
-  # Whether to allow user-provided timestamps in write requests
-  # Default: true to allow user-provided timestamps, may differ if apply_dbaas_defaults is enabled
+  # Whether to allow user-provided timestamp in write request
+  # Default true to allow user-provided timestamp, may differ if emulate_dbaas_defaults is enabled
   # user_timestamps_enabled: true
 
-  # Preventing a query with provided consistency levels
-  # Default: all consistency levels are allowed.
+  # Prevents queries with the provided consistency levels
+  # Default all consistency levels are allowed.
   # write_consistency_levels_disallowed:
 
-  # Log a warning when compacting partitions larger than this value.
-  # Default: 100mb, may differ if apply_dbaas_defaults is enabled
-  # partition_size_warn_threshold_in_mb: 100
+  # Failure threshold to prevent providing larger paging by bytes than threshold, also serves as a hard paging limit
+  # when paging by rows is used.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
+  # page_size_failure_threshold_in_kb: -1
+
+  # Failure threshold to prevent IN query creating size of cartesian product exceeding threshold, eg.
+  # "a in (1,2,...10) and b in (1,2...10)" results in cartesian product of 100.
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
+  # in_select_cartesian_product_failure_threshold: -1
 
   # Failure threshold to prevent IN query containing more partition keys than threshold
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # partition_keys_in_select_failure_threshold: -1
 
   # Warning threshold to warn when local disk usage exceeding threshold. Valid values: (1, 100]
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # disk_usage_percentage_warn_threshold: -1
 
   # Failure threshold to reject write requests if replica disk usage exceeding threshold. Valid values: (1, 100]
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
+  # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled
   # disk_usage_percentage_failure_threshold: -1
 
-  # Failure threshold to prevent IN query creating size of cartesian product exceeding threshold, eg.
-  # "a IN (1,2,...10) AND b IN (1,2...10)" results in cartesian product of 100.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
-  # in_select_cartesian_product_failure_threshold: -1
-
-  # Whether to allow user-provided timestamps in write request (USING TIMESTAMP ...)
-  # Default: true to allow user-provided timestamp, may differ if apply_dbaas_defaults is enabled
-  # user_timestamps_enabled: true
-
-  # Whether read-before-write operation is allowed on lists, eg. setting list element by index, removing list element
-  # by index. Note: LWT is always allowed.
-  # Default: true to allow read before write operation on lists, may differ if apply_dbaas_defaults is enabled
-  # read_before_write_list_operations_enabled: true
-
-  # Failure threshold to prevent creating more fields in user-defined-type than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
-  # fields_per_udt_failure_threshold: -1
-
-  # Warning threshold to warn when encountering larger size of collection data than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
-  # collection_size_warn_threshold_in_kb: -1
-
-  # Warning threshold to warn when encountering more elements in collection than threshold.
-  # Default: -1 to disable, may differ if apply_dbaas_defaults is enabled
-  # items_per_collection_warn_threshold: -1
+  # Allows configuring max disk size of data directories when calculating thresholds for disk_usage_percentage_warn_threshold
+  # and disk_usage_percentage_failure_threshold. Valid values: (1, max available disk size of all data directories]
+  # Default -1 to disable and use the physically available disk size of data directories during calculations.
+  # may differ if emulate_dbaas_defaults is enabled
+  # disk_usage_max_disk_size_in_gb: -1
diff --git a/doc/source/configuration/cass_yaml_file.rst b/doc/source/configuration/cass_yaml_file.rst
index e3babbcd7fdf..a553da71686f 100644
--- a/doc/source/configuration/cass_yaml_file.rst
+++ b/doc/source/configuration/cass_yaml_file.rst
@@ -1861,13 +1861,14 @@ Log WARN on any batches not of type LOGGED than span across more partitions than
 
 *Default Value:* 10
 
-``compaction_large_partition_warning_threshold_mb``
+``partition_size_warn_threshold_in_mb``
 ---------------------------------------------------
 
 Log a warning when compacting partitions larger than this value
 
 *Default Value:* 100
 
+
 ``gc_log_threshold_in_ms``
 --------------------------
 *This option is commented out by default.*
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 82e6770e3d43..09cb48d2c863 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -214,12 +214,28 @@ public class Config
     /* if the size of columns or super-columns are more than this, indexing will kick in */
     public int column_index_size_in_kb = 64;
     public volatile int column_index_cache_size_in_kb = 2;
-    public volatile int batch_size_warn_threshold_in_kb = 5;
-    public volatile int batch_size_fail_threshold_in_kb = 50;
-    public Integer unlogged_batch_across_partitions_warn_threshold = 10;
+    /**
+     * @deprecated Migrated to 'guardrails.batch_size_warn_threshold_in_kb'
+     */
+    @Deprecated
+    public int batch_size_warn_threshold_in_kb = 0;
+    /**
+     * @deprecated Migrated to 'guardrails.batch_size_fail_threshold_in_kb'
+     */
+    @Deprecated
+    public int batch_size_fail_threshold_in_kb = 0;
+    /**
+     * @deprecated Migrated to 'guardrails.unlogged_batch_across_partitions_warn_threshold'
+     */
+    @Deprecated
+    public Integer unlogged_batch_across_partitions_warn_threshold = 0;
     public volatile Integer concurrent_compactors;
     public volatile int compaction_throughput_mb_per_sec = 16;
-    public volatile int compaction_large_partition_warning_threshold_mb = 100;
+    /**
+     * @deprecated Migrated to 'guardrails.partition_size_warn_threshold_in_mb'
+     */
+    @Deprecated
+    public int compaction_large_partition_warning_threshold_mb = 0;
     public int min_free_space_per_drive_in_mb = 50;
 
     public volatile int concurrent_materialized_view_builders = 1;
@@ -344,8 +360,16 @@ public class Config
 
     public MemtableAllocationType memtable_allocation_type = MemtableAllocationType.offheap_objects;
 
-    public volatile int tombstone_warn_threshold = 1000;
-    public volatile int tombstone_failure_threshold = 100000;
+    /**
+     * @deprecated Migrated to 'guardrails.tombstone_warn_threshold'
+     */
+    @Deprecated
+    public int tombstone_warn_threshold = 0;
+    /**
+     * @deprecated Migrated to 'guardrails.tombstone_failure_threshold'
+     */
+    @Deprecated
+    public int tombstone_failure_threshold = 0;
 
     public final ReplicaFilteringProtectionOptions replica_filtering_protection = new ReplicaFilteringProtectionOptions();
 
@@ -506,7 +530,8 @@ public class Config
      */
     public volatile int validation_preview_purge_head_start_in_sec = 60 * 60;
 
-    public boolean apply_dbaas_defaults = false;
+    public boolean emulate_dbaas_defaults = false;
+    
     public GuardrailsConfig guardrails = new GuardrailsConfig();
 
     /**
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 7584433a7b45..2a763ad75230 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -177,6 +177,10 @@ public static void daemonInitialization(Supplier<Config> config) throws Configur
 
         setConfig(config.get());
         applyAll();
+
+        createAllDirectories();
+        applyGuardrails(); // requires created directories
+
         AuthConfig.applyAuth();
     }
 
@@ -363,14 +367,13 @@ private static void applyAll() throws ConfigurationException
         applyEncryptionContext();
 
         applySslContext();
-
-        applyGuardrailsConfig();
     }
 
-    private static void applyGuardrailsConfig()
+    private static void applyGuardrails()
     {
         conf.guardrails.applyConfig();
         conf.guardrails.validate();
+        getGuardrailsConfig().validateAfterDataDirectoriesExist();
     }
 
     private static void applySimpleConfig()
@@ -764,6 +767,48 @@ else if (conf.repair_session_space_in_mb > (int) (Runtime.getRuntime().maxMemory
         if (conf.user_defined_function_fail_timeout < conf.user_defined_function_warn_timeout)
             throw new ConfigurationException("user_defined_function_warn_timeout must less than user_defined_function_fail_timeout", false);
 
+        if (conf.compaction_large_partition_warning_threshold_mb != 0)
+        {
+            logger.warn("Found deprecated property 'compaction_large_partition_warning_threshold_mb' in config - migrate to `guardrails.partition_size_warn_threshold_in_mb`. " +
+                        "The value of 'guardrails.partition_size_warn_threshold_in_mb' is overwritten by 'compaction_large_partition_warning_threshold_mb'.");
+            getGuardrailsConfig().partition_size_warn_threshold_in_mb = conf.compaction_large_partition_warning_threshold_mb;
+        }
+
+        if (conf.tombstone_failure_threshold != 0)
+        {
+            logger.warn("Found deprecated property 'tombstone_failure_threshold' in config - migrate to 'guardrails.tombstone_failure_threshold'. " +
+                        "The value of 'guardrails.tombstone_failure_threshold' is overwritten by 'tombstone_failure_threshold'.");
+            getGuardrailsConfig().tombstone_failure_threshold = conf.tombstone_failure_threshold;
+        }
+
+        if (conf.tombstone_warn_threshold != 0)
+        {
+            logger.warn("Found deprecated property 'tombstone_warn_threshold' in config - migrate to 'guardrails.tombstone_warn_threshold'. " +
+                        "The value of 'guardrails.tombstone_warn_threshold' is overwritten by 'tombstone_warn_threshold'.");
+            getGuardrailsConfig().tombstone_warn_threshold = conf.tombstone_warn_threshold;
+        }
+
+        if (conf.batch_size_fail_threshold_in_kb != 0)
+        {
+            logger.warn("Found deprecated property 'batch_size_fail_threshold_in_kb' in config - migrate to 'guardrails.batch_size_fail_threshold_in_kb'. " +
+                        "The value of 'guardrails.batch_size_fail_threshold_in_kb' is overwritten by 'batch_size_fail_threshold_in_kb'.");
+            getGuardrailsConfig().batch_size_fail_threshold_in_kb = conf.batch_size_fail_threshold_in_kb;
+        }
+
+        if (conf.batch_size_warn_threshold_in_kb != 0)
+        {
+            logger.warn("Found deprecated property 'batch_size_warn_threshold_in_kb' in config - migrate to 'guardrails.batch_size_warn_threshold_in_kb'. " +
+                        "The value of 'guardrails.batch_size_warn_threshold_in_kb' is overwritten by 'batch_size_warn_threshold_in_kb'.");
+            getGuardrailsConfig().batch_size_warn_threshold_in_kb = conf.batch_size_warn_threshold_in_kb;
+        }
+
+        if (conf.unlogged_batch_across_partitions_warn_threshold != 0)
+        {
+            logger.warn("Found deprecated property 'unlogged_batch_across_partitions_warn_threshold' in config - migrate to 'guardrails.unlogged_batch_across_partitions_warn_threshold'. " +
+                        "The value of 'guardrails.unlogged_batch_across_partitions_warn_threshold' is overwritten by 'unlogged_batch_across_partitions_warn_threshold'.");
+            getGuardrailsConfig().unlogged_batch_across_partitions_warn_threshold = conf.unlogged_batch_across_partitions_warn_threshold;
+        }
+
         if (conf.commitlog_segment_size_in_mb <= 0)
             throw new ConfigurationException("commitlog_segment_size_in_mb must be positive, but was "
                     + conf.commitlog_segment_size_in_mb, false);
@@ -1492,42 +1537,6 @@ public static void setColumnIndexCacheSize(int val)
         conf.column_index_cache_size_in_kb = val;
     }
 
-    public static int getBatchSizeWarnThreshold()
-    {
-        return (int) ByteUnit.KIBI_BYTES.toBytes(conf.batch_size_warn_threshold_in_kb);
-    }
-
-    public static int getBatchSizeWarnThresholdInKB()
-    {
-        return conf.batch_size_warn_threshold_in_kb;
-    }
-
-    public static long getBatchSizeFailThreshold()
-    {
-        return ByteUnit.KIBI_BYTES.toBytes(conf.batch_size_fail_threshold_in_kb);
-    }
-
-    public static int getBatchSizeFailThresholdInKB()
-    {
-        return conf.batch_size_fail_threshold_in_kb;
-    }
-
-    public static int getUnloggedBatchAcrossPartitionsWarnThreshold()
-    {
-        return conf.unlogged_batch_across_partitions_warn_threshold;
-    }
-
-    public static void setBatchSizeWarnThresholdInKB(int threshold)
-    {
-        checkValidForByteConversion(threshold, "batch_size_warn_threshold_in_kb", ByteUnit.KIBI_BYTES);
-        conf.batch_size_warn_threshold_in_kb = threshold;
-    }
-
-    public static void setBatchSizeFailThresholdInKB(int threshold)
-    {
-        conf.batch_size_fail_threshold_in_kb = threshold;
-    }
-
     public static Collection<String> getInitialTokens()
     {
         return tokensFromString(System.getProperty(Config.PROPERTY_PREFIX + "initial_token", conf.initial_token));
@@ -1808,8 +1817,6 @@ public static void setCompactionThroughputMbPerSec(int value)
         conf.compaction_throughput_mb_per_sec = value;
     }
 
-    public static long getCompactionLargePartitionWarningThreshold() { return ByteUnit.MEBI_BYTES.toBytes(conf.compaction_large_partition_warning_threshold_mb); }
-
     public static int getConcurrentValidations()
     {
         return conf.concurrent_validations;
@@ -1968,26 +1975,6 @@ public static int getMaxMutationSize()
         return (int) ByteUnit.KIBI_BYTES.toBytes(conf.max_mutation_size_in_kb);
     }
 
-    public static int getTombstoneWarnThreshold()
-    {
-        return conf.tombstone_warn_threshold;
-    }
-
-    public static void setTombstoneWarnThreshold(int threshold)
-    {
-        conf.tombstone_warn_threshold = threshold;
-    }
-
-    public static int getTombstoneFailureThreshold()
-    {
-        return conf.tombstone_failure_threshold;
-    }
-
-    public static void setTombstoneFailureThreshold(int threshold)
-    {
-        conf.tombstone_failure_threshold = threshold;
-    }
-
     public static int getCachedReplicaRowsWarnThreshold()
     {
         return conf.replica_filtering_protection.cached_rows_warn_threshold;
@@ -3416,13 +3403,13 @@ public static GuardrailsConfig getGuardrailsConfig()
     }
 
     @VisibleForTesting
-    public static boolean setApplyDbaasDefaults(boolean dbaasDefaults)
+    public static boolean setEmulateDbaasDefaults(boolean dbaasDefaults)
     {
-        return conf.apply_dbaas_defaults = dbaasDefaults;
+        return conf.emulate_dbaas_defaults = dbaasDefaults;
     }
 
-    public static boolean isApplyDbaasDefaults()
+    public static boolean isEmulateDbaasDefaults()
     {
-        return conf.apply_dbaas_defaults;
+        return conf.emulate_dbaas_defaults;
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java b/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
index ac8f179fe43f..28178c7a32fe 100644
--- a/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/BatchQueryOptions.java
@@ -69,9 +69,9 @@ public String getKeyspace()
         return wrapped.getKeyspace();
     }
 
-    public ConsistencyLevel getSerialConsistency()
+    public ConsistencyLevel getSerialConsistency(QueryState state)
     {
-        return wrapped.getSerialConsistency();
+        return wrapped.getSerialConsistency(state);
     }
 
     public List<Object> getQueryOrIdList()
diff --git a/src/java/org/apache/cassandra/cql3/QueryOptions.java b/src/java/org/apache/cassandra/cql3/QueryOptions.java
index d3b1a03cca9e..9d4bb48737d6 100644
--- a/src/java/org/apache/cassandra/cql3/QueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/QueryOptions.java
@@ -38,6 +38,8 @@
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
 
+import javax.annotation.Nullable;
+
 /**
  * Options for a query.
  */
@@ -188,9 +190,10 @@ public PagingState getPagingState()
     }
 
     /**  Serial consistency for conditional updates. */
-    public ConsistencyLevel getSerialConsistency()
+    public ConsistencyLevel getSerialConsistency(@Nullable QueryState state)
     {
-        return getSpecificOptions().serialConsistency;
+        ConsistencyLevel cl = getSpecificOptions().serialConsistency;
+        return cl != null ? cl : ConsistencyLevel.defaultSerialConsistency(state);
     }
 
     public long getTimestamp(QueryState state)
@@ -395,7 +398,7 @@ private SpecificOptions(int pageSize,
         {
             this.pageSize = pageSize;
             this.state = state;
-            this.serialConsistency = serialConsistency == null ? ConsistencyLevel.SERIAL : serialConsistency;
+            this.serialConsistency = serialConsistency;
             this.timestamp = timestamp;
             this.keyspace = keyspace;
             this.nowInSeconds = nowInSeconds;
@@ -471,7 +474,7 @@ public QueryOptions decode(ByteBuf body, ProtocolVersion version)
             {
                 int pageSize = flags.contains(Flag.PAGE_SIZE) ? body.readInt() : -1;
                 PagingState pagingState = flags.contains(Flag.PAGING_STATE) ? PagingState.deserialize(CBUtil.readValueNoCopy(body), version) : null;
-                ConsistencyLevel serialConsistency = flags.contains(Flag.SERIAL_CONSISTENCY) ? CBUtil.readConsistencyLevel(body) : ConsistencyLevel.SERIAL;
+                ConsistencyLevel serialConsistency = flags.contains(Flag.SERIAL_CONSISTENCY) ? CBUtil.readConsistencyLevel(body) : null;
                 long timestamp = Long.MIN_VALUE;
                 if (flags.contains(Flag.TIMESTAMP))
                 {
@@ -506,7 +509,7 @@ public void encode(QueryOptions options, ByteBuf dest, ProtocolVersion version)
             if (flags.contains(Flag.PAGING_STATE))
                 CBUtil.writeValue(options.getPagingState().serialize(version), dest);
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
-                CBUtil.writeConsistencyLevel(options.getSerialConsistency(), dest);
+                CBUtil.writeConsistencyLevel(options.getSerialConsistency(null), dest);
             if (flags.contains(Flag.TIMESTAMP))
                 dest.writeLong(options.getSpecificOptions().timestamp);
             if (flags.contains(Flag.KEYSPACE))
@@ -535,7 +538,7 @@ public int encodedSize(QueryOptions options, ProtocolVersion version)
             if (flags.contains(Flag.PAGING_STATE))
                 size += CBUtil.sizeOfValue(options.getPagingState().serializedSize(version));
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
-                size += CBUtil.sizeOfConsistencyLevel(options.getSerialConsistency());
+                size += CBUtil.sizeOfConsistencyLevel(options.getSerialConsistency(null));
             if (flags.contains(Flag.TIMESTAMP))
                 size += 8;
             if (flags.contains(Flag.KEYSPACE))
@@ -557,7 +560,7 @@ private EnumSet<Flag> gatherFlags(QueryOptions options, ProtocolVersion version)
                 flags.add(Flag.PAGE_SIZE);
             if (options.getPagingState() != null)
                 flags.add(Flag.PAGING_STATE);
-            if (options.getSerialConsistency() != ConsistencyLevel.SERIAL)
+            if (options.getSpecificOptions().serialConsistency != null)
                 flags.add(Flag.SERIAL_CONSISTENCY);
             if (options.getSpecificOptions().timestamp != Long.MIN_VALUE)
                 flags.add(Flag.TIMESTAMP);
diff --git a/src/java/org/apache/cassandra/cql3/UpdateParameters.java b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
index 39689776c34a..9767977b42b6 100644
--- a/src/java/org/apache/cassandra/cql3/UpdateParameters.java
+++ b/src/java/org/apache/cassandra/cql3/UpdateParameters.java
@@ -144,7 +144,7 @@ public void addTombstone(ColumnMetadata column) throws InvalidRequestException
     public void addTombstone(ColumnMetadata column, CellPath path) throws InvalidRequestException
     {
         if (path != null && column.type.isMultiCell())
-            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), state);
+            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), false, state);
 
         builder.addCell(BufferCell.tombstone(column, timestamp, nowInSec, path));
     }
@@ -156,10 +156,10 @@ public Cell addCell(ColumnMetadata column, ByteBuffer value) throws InvalidReque
 
     public Cell addCell(ColumnMetadata column, CellPath path, ByteBuffer value) throws InvalidRequestException
     {
-        Guardrails.columnValueSize.guard(value.remaining(), column.name.toString(), state);
+        Guardrails.columnValueSize.guard(value.remaining(), column.name.toString(), false, state);
 
         if (path != null && column.type.isMultiCell())
-            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), state);
+            Guardrails.columnValueSize.guard(path.dataSize(), column.name.toString(), false, state);
 
         Cell<?> cell = ttl == LivenessInfo.NO_TTL
                        ? BufferCell.live(column, timestamp, value, path)
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
index 17b128523020..1e11e54db0dd 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
@@ -60,11 +60,11 @@ public NavigableSet<Clustering<?>> valuesAsClustering(QueryOptions options, Quer
             SingleRestriction r = restrictions.get(i);
             r.appendTo(builder, options);
 
+            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
+                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", false, queryState);
+
             if (builder.hasMissingElements())
                 break;
-
-            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
-                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", queryState);
         }
         return builder.build();
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
index 5fc76b9ddb19..f6ecc6923c7c 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
@@ -82,11 +82,11 @@ public List<ByteBuffer> values(QueryOptions options, QueryState queryState)
             SingleRestriction r = restrictions.get(i);
             r.appendTo(builder, options);
 
+            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
+                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", false, queryState);
+
             if (builder.hasMissingElements())
                 break;
-
-            if (hasIN() && Guardrails.inSelectCartesianProduct.enabled(queryState))
-                Guardrails.inSelectCartesianProduct.guard(builder.buildSize(), "IN Select", queryState);
         }
         return builder.buildSerializedPartitionKeys();
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
index 34da575b562b..1fafabbc465e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/BatchStatement.java
@@ -18,7 +18,15 @@
 package org.apache.cassandra.cql3.statements;
 
 import java.nio.ByteBuffer;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
 import com.google.common.annotations.VisibleForTesting;
@@ -32,16 +40,35 @@
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
 import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.cql3.*;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.Attributes;
+import org.apache.cassandra.cql3.BatchQueryOptions;
+import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.ResultSet;
+import org.apache.cassandra.cql3.VariableSpecifications;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.IMutation;
+import org.apache.cassandra.db.RegularAndStaticColumns;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
 import org.apache.cassandra.db.partitions.PartitionUpdate;
 import org.apache.cassandra.db.rows.RowIterator;
-import org.apache.cassandra.exceptions.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.RequestExecutionException;
+import org.apache.cassandra.exceptions.RequestValidationException;
+import org.apache.cassandra.exceptions.UnauthorizedException;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.metrics.BatchMetrics;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableId;
 import org.apache.cassandra.schema.TableMetadata;
-import org.apache.cassandra.service.*;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.ClientWarn;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.FBUtilities;
@@ -78,10 +105,6 @@ public enum Type
 
     private static final Logger logger = LoggerFactory.getLogger(BatchStatement.class);
 
-    private static final String UNLOGGED_BATCH_WARNING = "Unlogged batch covering {} partitions detected " +
-                                                         "against table{} {}. You should use a logged batch for " +
-                                                         "atomicity, or asynchronous writes for performance.";
-
     private static final String LOGGED_BATCH_LOW_GCGS_WARNING = "Executing a LOGGED BATCH on table{} {}, configured with a " +
                                                                 "gc_grace_seconds of 0. The gc_grace_seconds is used to TTL " +
                                                                 "batchlog entries, so setting gc_grace_seconds too low on " +
@@ -254,6 +277,9 @@ private boolean isLogged()
     @Override
     public void validate(QueryState state) throws InvalidRequestException
     {
+        if (isLogged())
+            Guardrails.loggedBatchEnabled.ensureEnabled(state);
+
         for (ModificationStatement statement : statements)
             statement.validate(state);
     }
@@ -325,16 +351,15 @@ public List<? extends IMutation> getMutations(QueryState state,
      *
      * @param mutations - the batch mutations.
      */
-    private static void verifyBatchSize(Collection<? extends IMutation> mutations) throws InvalidRequestException
+    private static void verifyBatchSize(Collection<? extends IMutation> mutations, QueryState queryState) throws InvalidRequestException
     {
         // We only warn for batch spanning multiple mutations (#10876)
         if (mutations.size() <= 1)
             return;
 
-        long warnThreshold = DatabaseDescriptor.getBatchSizeWarnThreshold();
         long size = IMutation.dataSize(mutations);
 
-        if (size > warnThreshold)
+        if (Guardrails.batchSize.triggersOn(size, queryState))
         {
             Set<String> tableNames = new HashSet<>();
             for (IMutation mutation : mutations)
@@ -343,27 +368,11 @@ private static void verifyBatchSize(Collection<? extends IMutation> mutations) t
                     tableNames.add(update.metadata().toString());
             }
 
-            long failThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
-
-            String format = "Batch for {} is of size {}, exceeding specified threshold of {} by {}.{}";
-            if (size > failThreshold)
-            {
-                Tracing.trace(format, tableNames, FBUtilities.prettyPrintMemory(size), FBUtilities.prettyPrintMemory(failThreshold),
-                              FBUtilities.prettyPrintMemory(size - failThreshold), " (see batch_size_fail_threshold_in_kb)");
-                logger.error(format, tableNames, FBUtilities.prettyPrintMemory(size), FBUtilities.prettyPrintMemory(failThreshold),
-                             FBUtilities.prettyPrintMemory(size - failThreshold), " (see batch_size_fail_threshold_in_kb)");
-                throw new InvalidRequestException("Batch too large");
-            }
-            else if (logger.isWarnEnabled())
-            {
-                logger.warn(format, tableNames, FBUtilities.prettyPrintMemory(size), FBUtilities.prettyPrintMemory(warnThreshold),
-                            FBUtilities.prettyPrintMemory(size - warnThreshold), "");
-            }
-            ClientWarn.instance.warn(MessageFormatter.arrayFormat(format, new Object[] {tableNames, size, warnThreshold, size - warnThreshold, ""}).getMessage());
+            Guardrails.batchSize.guard(size, tableNames.toString(), false, queryState);
         }
     }
 
-    private void verifyBatchType(Collection<? extends IMutation> mutations)
+    private void verifyBatchType(Collection<? extends IMutation> mutations, QueryState queryState)
     {
         if (!isLogged() && mutations.size() > 1)
         {
@@ -382,13 +391,9 @@ private void verifyBatchType(Collection<? extends IMutation> mutations)
 
             // CASSANDRA-11529: log only if we have more than a threshold of keys, this was also suggested in the
             // original ticket that introduced this warning, CASSANDRA-9282
-            if (keySet.size() > DatabaseDescriptor.getUnloggedBatchAcrossPartitionsWarnThreshold())
+            if (Guardrails.unloggedBatchAcrossPartitions.triggersOn(keySet.size(), queryState))
             {
-                NoSpamLogger.log(logger, NoSpamLogger.Level.WARN, 1, TimeUnit.MINUTES, UNLOGGED_BATCH_WARNING,
-                                 keySet.size(), tableNames.size() == 1 ? "" : "s", tableNames);
-
-                ClientWarn.instance.warn(MessageFormatter.arrayFormat(UNLOGGED_BATCH_WARNING, new Object[]{keySet.size(),
-                                                    tableNames.size() == 1 ? "" : "s", tableNames}).getMessage());
+                Guardrails.unloggedBatchAcrossPartitions.guard(keySet.size(), tableNames.toString(), false, queryState);
             }
         }
     }
@@ -415,7 +420,7 @@ public ResultMessage execute(QueryState queryState, BatchQueryOptions options, l
             statement.validateDiskUsage(queryState, options.forStatement(i));
         }
 
-        if (options.getSerialConsistency() == null)
+        if (options.getSerialConsistency(queryState) == null)
             throw new InvalidRequestException("Invalid empty serial consistency level");
 
         if (hasConditions)
@@ -424,18 +429,22 @@ public ResultMessage execute(QueryState queryState, BatchQueryOptions options, l
         if (updatesVirtualTables)
             executeInternalWithoutCondition(queryState, options, queryStartNanoTime);
         else    
-            executeWithoutConditions(getMutations(queryState, options, false, timestamp, nowInSeconds, queryStartNanoTime), cl, queryStartNanoTime);
+            executeWithoutConditions(getMutations(queryState, options, false, timestamp, nowInSeconds, queryStartNanoTime), 
+                                     queryState, cl, queryStartNanoTime);
 
         return new ResultMessage.Void();
     }
 
-    private void executeWithoutConditions(List<? extends IMutation> mutations, ConsistencyLevel cl, long queryStartNanoTime) throws RequestExecutionException, RequestValidationException
+    private void executeWithoutConditions(List<? extends IMutation> mutations,
+                                          QueryState queryState,
+                                          ConsistencyLevel cl,
+                                          long queryStartNanoTime) throws RequestExecutionException, RequestValidationException
     {
         if (mutations.isEmpty())
             return;
 
-        verifyBatchSize(mutations);
-        verifyBatchType(mutations);
+        verifyBatchSize(mutations, queryState);
+        verifyBatchType(mutations, queryState);
 
         updatePartitionsPerBatchMetrics(mutations.size());
 
@@ -467,7 +476,7 @@ private ResultMessage executeWithConditions(BatchQueryOptions options, QueryStat
                                                    tableName,
                                                    casRequest.key,
                                                    casRequest,
-                                                   options.getSerialConsistency(),
+                                                   options.getSerialConsistency(state),
                                                    options.getConsistency(),
                                                    state,
                                                    options.getNowInSeconds(state),
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 390cf40d67c7..413181b6ab9c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -508,7 +508,7 @@ private ResultMessage executeWithCondition(QueryState queryState, QueryOptions o
                                                    columnFamily(),
                                                    request.key,
                                                    request,
-                                                   options.getSerialConsistency(),
+                                                   options.getSerialConsistency(queryState),
                                                    options.getConsistency(),
                                                    queryState,
                                                    options.getNowInSeconds(queryState),
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index efd9ad4a76c2..f913b6912fc4 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -522,7 +522,7 @@ private ReadQuery getSliceCommands(QueryState queryState, QueryOptions options,
         if (keys.isEmpty())
             return ReadQuery.empty(table);
 
-        Guardrails.partitionKeysInSelectQuery.guard(keys.size(), "Select query", queryState);
+        Guardrails.partitionKeysInSelectQuery.guard(keys.size(), "Select query", false, queryState);
 
         ClusteringIndexFilter filter = makeClusteringIndexFilter(options, columnFilter, queryState);
         if (filter == null || filter.isEmpty(table.comparator))
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
index 1447eb9f48a3..0cba90607b4e 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java
@@ -198,7 +198,7 @@ public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table)
             TableMetadata tableMetadata = tableBuilder.build();
             tableMetadata.validate();
 
-            Guardrails.columnsPerTable.guard(tableBuilder.numColumns(), tableName, queryState);
+            Guardrails.columnsPerTable.guard(tableBuilder.numColumns(), tableName, false, queryState);
 
             return keyspace.withSwapped(keyspace.tables.withSwapped(tableMetadata))
                            .withSwapped(viewsBuilder.build());
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
index 2628b654ecc1..47081fa012b0 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTypeStatement.java
@@ -138,7 +138,7 @@ UserType apply(KeyspaceMetadata keyspace, UserType userType)
             List<AbstractType<?>> fieldTypes = new ArrayList<>(userType.fieldTypes()); fieldTypes.add(fieldType);
 
             int newSize = userType.size() + 1;
-            Guardrails.fieldsPerUDT.guard(newSize, userType.getNameAsString(), state);
+            Guardrails.fieldsPerUDT.guard(newSize, userType.getNameAsString(), false, state);
 
             return new UserType(keyspaceName, userType.name, fieldNames, fieldTypes, true);
         }
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
index cc163dd68807..0078af4289bc 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTableStatement.java
@@ -112,7 +112,7 @@ public void validate(QueryState state)
                 Guardrails.counterEnabled.ensureEnabled(state);
 
             // Guardrail on columns per table
-            Guardrails.columnsPerTable.guard(rawColumns.size(), tableName, state);
+            Guardrails.columnsPerTable.guard(rawColumns.size(), tableName, false, state);
 
             if (Guardrails.tablesLimit.enabled(state))
             {
@@ -120,7 +120,7 @@ public void validate(QueryState state)
                 int totalUserTables = Schema.instance.getNonInternalKeyspaces().stream().map(Keyspace::open)
                                                      .mapToInt(keyspace -> keyspace.getColumnFamilyStores().size())
                                                      .sum();
-                Guardrails.tablesLimit.guard(totalUserTables + 1, tableName, state);
+                Guardrails.tablesLimit.guard(totalUserTables + 1, tableName, false, state);
             }
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
index f99a4d4ffef3..ef82920e722c 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateTypeStatement.java
@@ -70,7 +70,7 @@ public void validate(QueryState state)
     {
         super.validate(state);
 
-        Guardrails.fieldsPerUDT.guard(fieldNames.size(), typeName, state);
+        Guardrails.fieldsPerUDT.guard(fieldNames.size(), typeName, false, state);
     }
 
     public Keyspaces apply(Keyspaces schema)
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
index 157b343ac3f9..8104cd9da305 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
@@ -160,6 +160,7 @@ public Keyspaces apply(Keyspaces schema)
                                                         .collect(Collectors.toCollection(HashSet::new));
         Guardrails.materializedViewsPerTable.guard(baseTableViews.size() + 1,
                                                    String.format("%s on table %s", viewName, table.name),
+                                                   false,
                                                    state);
 
         if (table.params.gcGraceSeconds == 0)
diff --git a/src/java/org/apache/cassandra/db/ConsistencyLevel.java b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
index fa5960568e9d..8cbfb47a425c 100644
--- a/src/java/org/apache/cassandra/db/ConsistencyLevel.java
+++ b/src/java/org/apache/cassandra/db/ConsistencyLevel.java
@@ -19,10 +19,11 @@
 
 import java.util.Locale;
 
+import javax.annotation.Nullable;
+
 import com.carrotsearch.hppc.ObjectIntHashMap;
 import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.locator.Endpoints;
-import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.exceptions.InvalidRequestException;
@@ -218,8 +219,7 @@ public void validateForRead() throws InvalidRequestException
 
     public void validateForWrite(String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
-        if (SchemaConstants.isUserKeyspace(keyspaceName))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+        Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         switch (this)
         {
@@ -232,8 +232,7 @@ public void validateForWrite(String keyspaceName, QueryState queryState) throws
     // This is the same than validateForWrite really, but we include a slightly different error message for SERIAL/LOCAL_SERIAL
     public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy, String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
-        if (SchemaConstants.isUserKeyspace(keyspaceName))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+        Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         switch (this)
         {
@@ -248,8 +247,7 @@ public void validateForCasCommit(AbstractReplicationStrategy replicationStrategy
 
     public void validateForCas(String keyspaceName, QueryState queryState) throws InvalidRequestException
     {
-        if (SchemaConstants.isUserKeyspace(keyspaceName))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+        Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         if (!isSerialConsistency())
             throw new InvalidRequestException("Invalid consistency for conditional update. Must be one of SERIAL or LOCAL_SERIAL");
@@ -262,8 +260,7 @@ public boolean isSerialConsistency()
 
     public void validateCounterForWrite(TableMetadata metadata, QueryState queryState) throws InvalidRequestException
     {
-        if (SchemaConstants.isUserKeyspace(metadata.keyspace))
-            Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
+        Guardrails.disallowedWriteConsistencies.ensureAllowed(this, queryState);
 
         if (this == ConsistencyLevel.ANY)
             throw new InvalidRequestException("Consistency level ANY is not yet supported for counter table " + metadata.name);
@@ -278,4 +275,21 @@ private void requireNetworkTopologyStrategy(AbstractReplicationStrategy replicat
             throw new InvalidRequestException(String.format("consistency level %s not compatible with replication strategy (%s)",
                                                             this, replicationStrategy.getClass().getName()));
     }
+
+    /**
+     * Returns the strictest serial consistency level allowed by Guardrails.
+     *
+     * @param state the query state, used to skip the guardrails check if the query is internal or is done by a superuser.
+     * @return the strictest allowed serial consistency level
+     * @throws InvalidRequestException if all serial consistency levels are disallowed
+     */
+    public static ConsistencyLevel defaultSerialConsistency(@Nullable QueryState state) throws InvalidRequestException
+    {
+        if (DatabaseDescriptor.getRawConfig() == null || !Guardrails.disallowedWriteConsistencies.triggersOn(ConsistencyLevel.SERIAL, state))
+            return ConsistencyLevel.SERIAL;
+        else if (!Guardrails.disallowedWriteConsistencies.triggersOn(ConsistencyLevel.LOCAL_SERIAL, state))
+            return ConsistencyLevel.LOCAL_SERIAL;
+
+        throw new InvalidRequestException("Serial consistency levels are disallowed by disallowedWriteConsistencies Guardrail");
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index 1de5f4c68289..2ada437b9fd7 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -34,8 +34,10 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.*;
 import org.apache.cassandra.db.filter.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.guardrails.Guardrail;
+import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.net.MessageFlag;
 import org.apache.cassandra.net.Verb;
 import org.apache.cassandra.db.partitions.*;
@@ -61,7 +63,6 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.schema.SchemaProvider;
 import org.apache.cassandra.service.ActiveRepairService;
-import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -546,6 +547,17 @@ public ReadExecutionController executionController()
         return ReadExecutionController.forCommand(this);
     }
 
+    /**
+     * Whether the tombstone guardrail ({@link Guardrails#scannedTombstones}) should be respected for this query.
+     *
+     * @return {@code true} if the tombstone thresholds should be respected for the query. If {@code false}, no
+     * tombstone warning will ever be logged, and the query will never fail due to tombstones.
+     */
+    protected boolean shouldRespectTombstoneThresholds()
+    {
+        return !SchemaConstants.isLocalSystemKeyspace(ReadCommand.this.metadata().keyspace);
+    }
+
     /**
      * Wraps the provided iterator so that metrics on what is scanned by the command are recorded.
      * This also log warning/trow TombstoneOverwhelmingException if appropriate.
@@ -554,17 +566,21 @@ private UnfilteredPartitionIterator withMetricsRecording(UnfilteredPartitionIter
     {
         class MetricRecording extends Transformation<UnfilteredRowIterator>
         {
-            private final int failureThreshold = DatabaseDescriptor.getTombstoneFailureThreshold();
-            private final int warningThreshold = DatabaseDescriptor.getTombstoneWarnThreshold();
-
-            private final boolean respectTombstoneThresholds = !SchemaConstants.isLocalSystemKeyspace(ReadCommand.this.metadata().keyspace);
             private final boolean enforceStrictLiveness = metadata().enforceStrictLiveness();
 
             private int liveRows = 0;
-            private int tombstones = 0;
+            private final Guardrail.Threshold.GuardedCounter tombstones = createTombstoneCounter();
 
             private DecoratedKey currentKey;
 
+            private Guardrail.Threshold.GuardedCounter createTombstoneCounter()
+            {
+                Guardrail.Threshold guardrail = shouldRespectTombstoneThresholds()
+                                                ? Guardrails.scannedTombstones
+                                                : Guardrail.Threshold.NEVER_TRIGGERED;
+                return guardrail.newCounter(ReadCommand.this::toCQLString, true, null);
+            }
+
             @Override
             public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator iter)
             {
@@ -613,13 +629,18 @@ public RangeTombstoneMarker applyToMarker(RangeTombstoneMarker marker)
 
             private void countTombstone(ClusteringPrefix<?> clustering)
             {
-                ++tombstones;
-                if (tombstones > failureThreshold && respectTombstoneThresholds)
+                try
+                {
+                    tombstones.add(1);
+                }
+                catch (InvalidRequestException e)
                 {
-                    String query = ReadCommand.this.toCQLString();
-                    Tracing.trace("Scanned over {} tombstones for query {}; query aborted (see tombstone_failure_threshold)", failureThreshold, query);
                     metric.tombstoneFailures.inc();
-                    throw new TombstoneOverwhelmingException(tombstones, query, ReadCommand.this.metadata(), currentKey, clustering);
+                    throw new TombstoneOverwhelmingException(tombstones.get(),
+                                                             ReadCommand.this.toCQLString(),
+                                                             ReadCommand.this.metadata(),
+                                                             currentKey,
+                                                             clustering);
                 }
             }
 
@@ -628,27 +649,13 @@ public void onClose()
             {
                 recordLatency(metric, System.nanoTime() - startTimeNanos);
 
-                metric.tombstoneScannedHistogram.update(tombstones);
+                metric.tombstoneScannedHistogram.update(tombstones.get());
                 metric.liveScannedHistogram.update(liveRows);
 
-                boolean warnTombstones = tombstones > warningThreshold && respectTombstoneThresholds;
-                if (warnTombstones)
-                {
-                    String msg = String.format(
-                            "Read %d live rows and %d tombstone cells for query %1.512s; token %s (see tombstone_warn_threshold)",
-                            liveRows, tombstones, ReadCommand.this.toCQLString(), currentKey.getToken());
-                    ClientWarn.instance.warn(msg);
-                    if (tombstones < failureThreshold)
-                    {
-                        metric.tombstoneWarnings.inc();
-                    }
-
-                    logger.warn(msg);
-                }
+                if (tombstones.checkAndTriggerWarning())
+                    metric.tombstoneWarnings.inc();
 
-                Tracing.trace("Read {} live rows and {} tombstone cells{}",
-                        liveRows, tombstones,
-                        (warnTombstones ? " (see tombstone_warn_threshold)" : ""));
+                Tracing.trace("Read {} live rows and {} tombstone ones", liveRows, tombstones.get());
             }
         }
 
diff --git a/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java b/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java
index 28d49ae370c5..d59acbc77676 100644
--- a/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java
+++ b/src/java/org/apache/cassandra/db/filter/TombstoneOverwhelmingException.java
@@ -26,7 +26,7 @@
 
 public class TombstoneOverwhelmingException extends RuntimeException
 {
-    public TombstoneOverwhelmingException(int numTombstones, String query, TableMetadata metadata, DecoratedKey lastPartitionKey, ClusteringPrefix<?> lastClustering)
+    public TombstoneOverwhelmingException(long numTombstones, String query, TableMetadata metadata, DecoratedKey lastPartitionKey, ClusteringPrefix<?> lastClustering)
     {
         super(String.format("Scanned over %d tombstones during query '%s' (last scanned row token was %s and partion key was (%s)); query aborted",
                             numTombstones, query, lastPartitionKey.getToken(), makePKString(metadata, lastPartitionKey.getKey(), lastClustering)));
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrail.java b/src/java/org/apache/cassandra/guardrails/Guardrail.java
index 879f48f82416..795237490861 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrail.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrail.java
@@ -18,7 +18,6 @@
 
 package org.apache.cassandra.guardrails;
 
-import java.util.HashSet;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.function.BooleanSupplier;
@@ -38,6 +37,7 @@
 import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.units.SizeUnit;
 import org.apache.cassandra.utils.units.Units;
@@ -59,64 +59,55 @@ public abstract class Guardrail
 {
     private static final NoSpamLogger logger = NoSpamLogger.getLogger(LoggerFactory.getLogger(Guardrail.class),
                                                                       10, TimeUnit.MINUTES);
-
     private static final String REDACTED = "<redacted>";
 
+    /** A name identifying the guardrail (mainly for shipping with Insights events). */
     public final String name;
 
-    /**
-     * whether to throw {@link InvalidRequestException} on {@link this#fail(String)}
-     */
+    /** whether to throw {@link InvalidRequestException} on {@link #fail(String)} */
     private boolean throwOnFailure = true;
 
-    /**
-     * minimum logging and triggering interval to avoid spamming downstream
-     */
+    /** minimum logging and triggering interval to avoid spamming downstream */
     private long minNotifyIntervalInMs = 0;
 
-    /**
-     * time of last warning in milliseconds
-     */
+    /** time of last warning in milliseconds */
     private volatile long lastWarnInMs = 0;
 
-    /**
-     * time of last failure in milliseconds
-     */
+    /** time of last failure in milliseconds */
     private volatile long lastFailInMs = 0;
 
-    protected Guardrail(String name)
+    Guardrail(String name)
     {
         this.name = name;
     }
 
-    protected void warn(String message)
-    {
-        warn(message, message);
-    }
-
     protected void warn(String fullMessage, String redactedMessage)
     {
-        if (skipNotifyingOnWarning())
+        if (skipNotifying(true))
             return;
 
         logger.warn(fullMessage);
         // Note that ClientWarn will simply ignore the message if we're not running this as part of a user query
         // (the internal "state" will be null)
         ClientWarn.instance.warn(fullMessage);
+        // Similarly, tracing will also ignore the message if we're not running tracing on the current thread.
+        Tracing.trace(fullMessage);
         for (Guardrails.Listener listener : Guardrails.listeners)
             listener.onWarningTriggered(name, redactedMessage);
     }
 
-    protected void fail(String message)
+    protected void warn(String fullMessage)
     {
-        fail(message, message);
+        warn(fullMessage, fullMessage);
     }
 
     protected void fail(String fullMessage, String redactedMessage)
     {
-        if (!skipNotifyingOnFailure())
+        if (!skipNotifying(false))
         {
             logger.error(fullMessage);
+            // Tracing will ignore the message if we're not running tracing on the current thread.
+            Tracing.trace(fullMessage);
             for (Guardrails.Listener listener : Guardrails.listeners)
                 listener.onFailureTriggered(name, redactedMessage);
         }
@@ -125,9 +116,14 @@ protected void fail(String fullMessage, String redactedMessage)
             throw new InvalidRequestException(fullMessage);
     }
 
+    protected void fail(String message)
+    {
+        fail(message, message);
+    }
+
     /**
      * do no throw {@link InvalidRequestException} if guardrail failure is triggered.
-     * <p>
+     *
      * Note: this method is not thread safe and should only be used during guardrail initialization
      *
      * @return current guardrail
@@ -164,42 +160,25 @@ public void resetLastNotifyTime()
     }
 
     /**
-     * @return true if guardrail should not log message and trigger listeners; otherwise, update lastFailInMs respectively.
-     */
-    private boolean skipNotifyingOnFailure()
-    {
-        if (minNotifyIntervalInMs == 0)
-            return false;
-
-        long nowInMs = System.currentTimeMillis();
-        long timeElapsedInMs = nowInMs - lastFailInMs;
-
-        boolean skip = timeElapsedInMs < minNotifyIntervalInMs;
-
-        if (!skip)
-        {
-            lastFailInMs = nowInMs;
-        }
-
-        return skip;
-    }
-
-    /**
-     * @return true if guardrail should not log message and trigger listeners; otherwise, update lastWarnInMs respectively.
+     * @return true if guardrail should not log message and trigger listeners; otherwise, update lastWarnInMs or
+     * lastFailInMs respectively.
      */
-    private boolean skipNotifyingOnWarning()
+    private boolean skipNotifying(boolean isWarn)
     {
         if (minNotifyIntervalInMs == 0)
             return false;
 
         long nowInMs = System.currentTimeMillis();
-        long timeElapsedInMs = nowInMs - lastWarnInMs;
+        long timeElapsedInMs = nowInMs - (isWarn ? lastWarnInMs : lastFailInMs);
 
         boolean skip = timeElapsedInMs < minNotifyIntervalInMs;
 
         if (!skip)
         {
-            lastWarnInMs = nowInMs;
+            if (isWarn)
+                lastWarnInMs = nowInMs;
+            else
+                lastFailInMs = nowInMs;
         }
 
         return skip;
@@ -215,7 +194,7 @@ private boolean skipNotifyingOnWarning()
      */
     public boolean enabled(@Nullable QueryState queryState)
     {
-        return Guardrails.enabled() && Guardrails.ready() && (null == queryState || queryState.isOrdinaryUser());
+        return Guardrails.ready() && (queryState == null || queryState.isOrdinaryUser());
     }
 
     /**
@@ -228,6 +207,11 @@ public boolean enabled(@Nullable QueryState queryState)
      */
     public static class Threshold extends Guardrail
     {
+        /**
+         * A {@link Threshold} with both failure and warning thresholds disabled, so that it can never be triggered.
+         */
+        public static final Threshold NEVER_TRIGGERED = new Threshold("never_triggered", () -> -1L, () -> -1L, null);
+
         /**
          * A function used to build the error message of a triggered {@link Threshold} guardrail.
          */
@@ -242,7 +226,7 @@ public interface ErrorMessageProvider
              * @param valueString     the value that triggered the guardrail (as a string).
              * @param thresholdString the threshold that was passed to trigger the guardrail (as a string).
              */
-            public String createMessage(boolean isWarning, String what, String valueString, String thresholdString);
+            String createMessage(boolean isWarning, String what, String valueString, String thresholdString);
         }
 
         final LongSupplier warnThreshold;
@@ -299,17 +283,6 @@ private long warnValue()
             return warnValue < 0 ? Long.MAX_VALUE : warnValue;
         }
 
-        /**
-         * Checks whether this guardrail is enabled or not. This will be enabled if guardrails are globally enabled
-         * ({@link Guardrails#enabled()}), and if any of the thresholds is positive.
-         *
-         * @return {@code true} if this guardrail is enabled, {@code false} otherwise.
-         */
-        public boolean enabled()
-        {
-            return super.enabled(null) && (failThreshold.getAsLong() >= 0 || warnThreshold.getAsLong() >= 0);
-        }
-
         /**
          * Checks whether this guardrail is enabled or not. This will be enabled if guardrails are
          * ({@link Guardrails#ready()} ()}), the keyspace (if specified) is not an internal one, and if any of the
@@ -324,21 +297,6 @@ public boolean enabled(@Nullable QueryState queryState)
             return super.enabled(queryState) && (failThreshold.getAsLong() >= 0 || warnThreshold.getAsLong() >= 0);
         }
 
-        /**
-         * Checks whether the provided value would trigger a warning or failure if passed to {@link #guard}.
-         *
-         * <p>This method is optional (does not have to be called) but can be used in the case where the "what"
-         * argument to {@link #guard} is expensive to build to save doing so in the common case (of the guardrail
-         * not being triggered).
-         *
-         * @param value the value to test.
-         * @return {@code true} if {@code value} is above the warning or failure thresholds of this guardrail, {@code false} otherwise.
-         */
-        public boolean triggersOn(long value)
-        {
-            return enabled(null) && (value > Math.min(failValue(), warnValue()));
-        }
-
         /**
          * Checks whether the provided value would trigger a warning or failure if passed to {@link #guard}.
          *
@@ -349,28 +307,14 @@ public boolean triggersOn(long value)
          * @param value      the value to test.
          * @param queryState the queryState, used to skip the check if the query is internal or is done by a superuser.
          *                   A {@code null} value means that the check should be done regardless of the query.
-         * @return {@code true} if {@code value} is above the warning or failure thresholds of this guardrail, {@code false} otherwise.
+         * @return {@code true} if {@code value} is above the warning or failure thresholds of this guardrail,
+         * {@code false} otherwise.
          */
         public boolean triggersOn(long value, @Nullable QueryState queryState)
         {
             return enabled(queryState) && (value > Math.min(failValue(), warnValue()));
         }
 
-        /**
-         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
-         *
-         * @param value the value to check.
-         * @param what  a string describing what {@code value} is a value of used in the error message if the
-         *              guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
-         *              argument must describe which column of which row is triggering the guardrail for convenience). Note that
-         *              this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
-         *              this method behind a {@link #triggersOn} call.
-         */
-        public void guard(long value, String what)
-        {
-            guard(value, what, false);
-        }
-
         /**
          * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
          *
@@ -382,63 +326,123 @@ public void guard(long value, String what)
          *                         this method behind a {@link #triggersOn} call.
          * @param containsUserData a boolean describing if {@code what} contains user data. If this is the case,
          *                         {@code what} will only be included in the log messages and client warning. It will not be included in the
-         *                         error messages that are passed to listeners and exceptions. We have to exclude the user data from exceptions
-         *                         because they will be sent as Diagnostic Events in the future.
+         *                         error messages that are passed to listeners and exceptions. We have to exclude the user data from
+         *                         exceptions because they are sent to Insights.
+         * @param queryState       the query state, used to skip the check if the query is internal or is done by a superuser.
+         *                         A {@code null} value means that the check should be done regardless of the query.
          */
-        public void guard(long value, String what, boolean containsUserData)
+        public void guard(long value, String what, boolean containsUserData, @Nullable QueryState queryState)
         {
-            guard(value, what, containsUserData, null);
+            if (!enabled(queryState))
+                return;
+
+            long failValue = failValue();
+            if (value > failValue)
+            {
+                triggerFail(value, failValue, what, containsUserData);
+                return;
+            }
+
+            long warnValue = warnValue();
+            if (value > warnValue)
+                triggerWarn(value, warnValue, what, containsUserData);
+        }
+
+        private void triggerFail(long value, long failValue, String what, boolean containsUserData)
+        {
+            String fullMsg = errMsg(false, what, value, failValue);
+            fail(fullMsg, containsUserData ? redactedErrMsg(false, value, failValue) : fullMsg);
+        }
+
+        private void triggerWarn(long value, long warnValue, String what, boolean containsUserData)
+        {
+            String fullMsg = errMsg(true, what, value, warnValue);
+            warn(fullMsg, containsUserData ? redactedErrMsg(true, value, warnValue) : fullMsg);
         }
 
         /**
-         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
+         * Creates a new {@link GuardedCounter} guarded by this threshold guardrail.
          *
-         * @param value            the value to check.
-         * @param what             a string describing what {@code value} is a value of used in the error message if the
-         *                         guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
-         *                         argument must describe which column of which row is triggering the guardrail for convenience). Note that
-         *                         this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
-         *                         this method behind a {@link #triggersOn} call.
-         * @param queryState       the queryState, used to skip the check if the query is internal or is done by a superuser.
+         * @param whatFct          a function called when either a warning or failure is triggered by the created counter to
+         *                         describe the value. This is equivalent to the {@code what} argument of {@link #guard} but is a function to
+         *                         allow the output string to be computed lazily (only if a failure/warn ends up being triggered).
+         * @param containsUserData if a warning or failure is triggered by the created counter and the {@code whatFct}
+         *                         is called, indicates whether the created string contains user data. This is the exact equivalent to the
+         *                         similarly named argument of {@link #guard}.
+         * @param queryState       the query state, used to skip the check if the query is internal or is done by a superuser.
+         *                         A {@code null} value means that the check should be done regardless of the query.
+         * @return the newly created guarded counter.
          */
-        public void guard(long value, String what, @Nullable QueryState queryState)
+        public GuardedCounter newCounter(Supplier<String> whatFct, boolean containsUserData, @Nullable QueryState queryState)
         {
-            guard(value, what, false, queryState);
+            Threshold threshold = enabled(queryState) ? this : NEVER_TRIGGERED;
+            return threshold.new GuardedCounter(whatFct, containsUserData);
         }
 
         /**
-         * Apply the guardrail to the provided value, triggering a warning or failure if appropriate.
-         *
-         * @param value            the value to check.
-         * @param what             a string describing what {@code value} is a value of used in the error message if the
-         *                         guardrail is triggered (for instance, say the guardrail guards the size of column values, then this
-         *                         argument must describe which column of which row is triggering the guardrail for convenience). Note that
-         *                         this is only used if the guardrail triggers, so if it is expensive to build, you can put the call to
-         *                         this method behind a {@link #triggersOn} call.
-         * @param containsUserData a boolean describing if {@code what} contains user data. If this is the case,
-         *                         {@code what} will only be included in the log messages and client warning. It will not be included in the
-         *                         error messages that are passed to listeners and exceptions.
-         * @param queryState       the queryState, used to skip the check if the query is internal or is done by a superuser.
+         * A facility for when the value to guard is built incrementally, but we want to trigger failures as soon
+         * as the failure threshold is reached, but only trigger the warning on the final value (and so only if the
+         * failure threshold hasn't also been reached).
+         * <p>
+         * Note that instances are neither thread safe nor reusable.
          */
-        public void guard(long value, String what, boolean containsUserData, @Nullable QueryState queryState)
+        public class GuardedCounter
         {
-            if (!enabled(queryState))
-                return;
+            private final long warnValue;
+            private final long failValue;
+            private final Supplier<String> what;
+            private final boolean containsUserData;
 
-            long failValue = failValue();
-            if (value > failValue)
+            private long accumulated;
+
+            private GuardedCounter(Supplier<String> what, boolean containsUserData)
             {
-                String fullMsg = errMsg(false, what, value, failValue);
-                fail(fullMsg, containsUserData ? redactedErrMsg(false, value, failValue) : fullMsg);
+                // We capture the warn and fail value at the time of the counter construction to ensure we use
+                // stable values during the counter lifetime (and reading a final field is possibly a tad faster).
+                this.warnValue = warnValue();
+                this.failValue = failValue();
+                this.what = what;
+                this.containsUserData = containsUserData;
             }
-            else
+
+            /**
+             * The currently accumulated value of the counter.
+             */
+            public long get()
+            {
+                return accumulated;
+            }
+
+            /**
+             * Add the provided increment to the counter, triggering a failure if the counter after this addition
+             * crosses the failure threshold.
+             *
+             * @param increment the increment to add.
+             */
+            public void add(long increment)
+            {
+                accumulated += increment;
+                if (accumulated > failValue)
+                    triggerFail(accumulated, failValue, what.get(), containsUserData);
+            }
+
+            /**
+             * Trigger the warn if the currently accumulated counter value crosses warning threshold and the failure
+             * has not been triggered yet.
+             * <p>
+             * This is generally meant to be called when the guarded value is complete.
+             *
+             * @return {@code true} and trigger a warning if the current counter value is greater than the warning
+             * threshold and less than or equal to the failure threshold, {@code false} otherwise.
+             */
+            public boolean checkAndTriggerWarning()
             {
-                long warnValue = warnValue();
-                if (value > warnValue)
+                if (accumulated > warnValue && accumulated <= failValue)
                 {
-                    String fullMsg = errMsg(true, what, value, warnValue);
-                    warn(fullMsg, containsUserData ? redactedErrMsg(true, value, warnValue) : fullMsg);
+                    triggerWarn(accumulated, warnValue, what.get(), containsUserData);
+                    return true;
                 }
+                return false;
             }
         }
     }
@@ -536,14 +540,6 @@ public static class DisableFlag extends Guardrail
          *
          * <p>This must be called when the feature guarded by this guardrail is used to ensure such use is in fact
          * allowed.
-         */
-        public void ensureEnabled()
-        {
-            ensureEnabled(what, QueryState.forInternalCalls());
-        }
-
-        /**
-         * Triggers a failure if this guardrail is disabled.
          *
          * <p>This must be called when the feature guarded by this guardrail is used to ensure such use is in fact
          * allowed.
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index 8c8da194f71a..3dabca591409 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -43,117 +43,46 @@ public abstract class Guardrails
 {
     private static final GuardrailsConfig config = DatabaseDescriptor.getGuardrailsConfig();
 
-    public static final Threshold columnValueSize = new SizeThreshold("column_value_size",
-                                                                      () -> -1L, // not needed so far
-                                                                      () -> config.column_value_size_failure_threshold_in_kb * 1024L,
-                                                                      (x, what, v, t) -> format("Value of %s of size %s is greater than the maximum allowed (%s)",
-                                                                                                what, v, t));
-
-    public static final Threshold columnsPerTable = new Threshold("columns_per_table",
-                                                                  () -> -1L, // not needed so far
-                                                                  () -> config.columns_per_table_failure_threshold,
-                                                                  (x, what, v, t) -> format("Tables cannot have more than %s columns, but %s provided for table %s",
-                                                                                            t, v, what));
-
-    public static final DisableFlag userTimestampsEnabled = new DisableFlag("user_provided_timestamps",
-                                                                            () -> !config.user_timestamps_enabled,
-                                                                            "User provided timestamps (USING TIMESTAMP)");
-
-    public static final DisableFlag truncateTableEnabled = new DisableFlag("truncate_table",
-                                                                           () -> !config.truncate_table_enabled,
-                                                                           "TRUNCATE table");
-
-    public static final DisallowedValues<ConsistencyLevel> disallowedWriteConsistencies = new DisallowedValues<>("disallowed_write_consistency_levels",
-                                                                                                                 () -> config.write_consistency_levels_disallowed,
-                                                                                                                 ConsistencyLevel::fromString,
-                                                                                                                 "Consistency Level");
-
-    public static final Threshold secondaryIndexesPerTable = new Threshold("secondary_indexes_per_table",
-                                                                           () -> -1,
-                                                                           () -> config.secondary_index_per_table_failure_threshold,
-                                                                           (x, what, v, t) -> format("Tables cannot have more than %s secondary indexes, failed to create secondary index %s",
-                                                                                                     t, what));
-
-    public static final Threshold indexesPerTableSasi = new Threshold("sasi_indexes_per_table_failure_threshold",
-                                                                      () -> -1,
-                                                                      () -> config.sasi_indexes_per_table_failure_threshold,
-                                                                      (x, what, v, t) -> format("Tables cannot have more than %s SASI indexes, failed to create SASI index %s",
-                                                                                                t, what));
-
-    public static final Threshold indexesPerTableSai = new Threshold("sai_indexes_per_table_failure_threshold",
-                                                                     () -> -1,
-                                                                     () -> config.sai_indexes_per_table_failure_threshold,
-                                                                     (x, what, v, t) -> format("Tables cannot have more than %s StorageAttachedIndex secondary indexes, failed to create secondary index %s",
-                                                                                               t, what));
-
-    public static final Threshold indexesTotalSai = new Threshold("sai_indexes_total_failure_threshold",
-                                                                  () -> -1,
-                                                                  () -> config.sai_indexes_total_failure_threshold,
-                                                                  (x, what, v, t) -> format("Cannot have more than %s StorageAttachedIndex secondary indexes across all keyspaces, failed to create secondary index %s",
-                                                                                            t, what));
-
-    public static final Threshold materializedViewsPerTable = new Threshold("materialized_views_per_table",
-                                                                            () -> -1,
-                                                                            () -> config.materialized_view_per_table_failure_threshold,
-                                                                            (x, what, v, t) -> format("Tables cannot have more than %s materialized views, failed to create materialized view %s",
-                                                                                                      t, what));
-
-    public static final Threshold tablesLimit = new Threshold("number_of_tables",
-                                                              () -> config.tables_warn_threshold,
-                                                              () -> config.tables_failure_threshold,
-                                                              (isWarning, what, v, t) -> isWarning
-                                                                                         ? format("Creating table %s, current number of tables %s exceeds warning threshold of %s.",
-                                                                                                  what, v, t)
-                                                                                         : format("Cannot have more than %s tables, failed to create table %s",
-                                                                                                  t, what));
-
-    public static final DisallowedValues<String> disallowedTableProperties = new DisallowedValues<>("disallowed_table_properties",
-                                                                                                    () -> config.table_properties_disallowed,
-                                                                                                    String::toLowerCase,
-                                                                                                    "Table Properties");
-
-    public static final IgnoredValues<String> ignoredTableProperties = new IgnoredValues<>("ignored_table_properties", 
-                                                                                           () -> config.table_properties_ignored, 
-                                                                                           String::toLowerCase, 
-                                                                                           "Table Properties");
-    
-    public static final DisableFlag counterEnabled = new DisableFlag("counter",
-                                                                     () -> !config.counter_enabled,
-                                                                     "Counter");
-
-    @SuppressWarnings("unchecked")
-    public static final Predicates<InetAddressAndPort> replicaDiskUsage =
-    (Predicates<InetAddressAndPort>) new Predicates<>("replica_disk_usage",
-                                                      DiskUsageBroadcaster.instance::isStuffed,
-                                                      DiskUsageBroadcaster.instance::isFull,
-                                                      // not using `what` because it represents replica address which should be hidden from client.
-                                                      (isWarning, what) -> isWarning
-                                                                           ? "Replica disk usage exceeds warn threshold"
-                                                                           : "Write request failed because disk usage exceeds failure threshold")
-                                     .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
-
-    public static final PercentageThreshold localDiskUsage =
-    (PercentageThreshold) new PercentageThreshold("local_disk_usage",
-                                                  () -> config.disk_usage_percentage_warn_threshold,
-                                                  () -> config.disk_usage_percentage_failure_threshold,
-                                                  (isWarning, what, v, t) -> isWarning
-                                                                             ? format("Local disk usage %s(%s) exceeds warn threshold of %s", v, what, t)
-                                                                             : format("Local disk usage %s(%s) exceeds failure threshold of %s, will stop accepting writes", v, what, t))
-                          .noExceptionOnFailure()
-                          .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
-
-    public static final Threshold partitionSize =
-    new SizeThreshold("partition_size",
-                      () -> config.partition_size_warn_threshold_in_mb * 1024L * 1024L,
-                      () -> -1L,
-                      (x, what, v, t) -> format("Detected partition %s of size %s is greater than the maximum recommended size (%s)",
+    public static final Threshold tablesLimit =
+    new Threshold("number_of_tables",
+                  () -> config.tables_warn_threshold,
+                  () -> config.tables_failure_threshold,
+                  (isWarning, what, v, t) -> isWarning
+                                             ? format("Creating table %s, current number of tables %s exceeds warning threshold of %s.",
+                                                      what, v, t)
+                                             : format("Cannot have more than %s tables, failed to create table %s",
+                                                      t, what));
+
+    public static final DisallowedValues<String> disallowedTableProperties =
+    new DisallowedValues<>("disallowed_table_properties",
+                           () -> config.table_properties_disallowed,
+                           String::toLowerCase,
+                           "Table Properties");
+
+    public static final IgnoredValues<String> ignoredTableProperties =
+    new IgnoredValues<>("ignored_table_properties",
+                        () -> config.table_properties_ignored,
+                        String::toLowerCase,
+                        "Table Properties");
+
+    public static final DisableFlag counterEnabled =
+    new DisableFlag("counter",
+                    () -> !config.counter_enabled,
+                    "Counter");
+
+    public static final Threshold columnValueSize =
+    new SizeThreshold("column_value_size",
+                      () -> -1L, // not needed so far
+                      () -> config.column_value_size_failure_threshold_in_kb * 1024L,
+                      (x, what, v, t) -> format("Value of %s of size %s is greater than the maximum allowed (%s)",
                                                 what, v, t));
 
-    public static final Threshold partitionKeysInSelectQuery =
-    new Threshold("partition_keys_in_select_query",
-                  () -> -1L,
-                  () -> config.partition_keys_in_select_failure_threshold,
-                  (x, what, v, t) -> format("%s cannot be completed because it selects %s partitions keys - more than the maximum allowed %s", what, v, t));
+    public static final Threshold columnsPerTable =
+    new Threshold("columns_per_table",
+                  () -> -1L, // not needed so far
+                  () -> config.columns_per_table_failure_threshold,
+                  (x, what, v, t) -> format("Tables cannot have more than %s columns, but %s provided for table %s",
+                                            t, v, what));
 
     public static final Threshold fieldsPerUDT =
     new Threshold("fields_per_udt",
@@ -176,33 +105,145 @@ public abstract class Guardrails
                   (x, what, v, t) -> format("Detected collection %s with %s items, greater than the maximum recommended (%s)",
                                             what, v, t));
 
+    public static final DisableFlag readBeforeWriteListOperationsEnabled =
+    new DisableFlag("read_before_write_list_operations",
+                    () -> !config.read_before_write_list_operations_enabled,
+                    "List operation requiring read before write");
+
+    public static final DisableFlag userTimestampsEnabled =
+    new DisableFlag("user_provided_timestamps",
+                    () -> !config.user_timestamps_enabled,
+                    "User provided timestamps (USING TIMESTAMP)");
+
+    public static final DisableFlag loggedBatchEnabled =
+    new DisableFlag("logged_batch",
+                    () -> !config.logged_batch_enabled,
+                    "LOGGED batch");
+
+    public static final DisableFlag truncateTableEnabled =
+    new DisableFlag("truncate_table",
+                    () -> !config.truncate_table_enabled,
+                    "TRUNCATE table");
+
+    public static final DisallowedValues<ConsistencyLevel> disallowedWriteConsistencies =
+    new DisallowedValues<>("disallowed_write_consistency_levels",
+                           () -> config.write_consistency_levels_disallowed,
+                           ConsistencyLevel::fromString,
+                           "Write Consistency Level");
+
+    public static final Threshold secondaryIndexesPerTable =
+    new Threshold("secondary_indexes_per_table",
+                  () -> -1,
+                  () -> config.secondary_index_per_table_failure_threshold,
+                  (x, what, v, t) -> format("Tables cannot have more than %s secondary indexes, failed to create secondary index %s",
+                                            t, what));
+
+    public static final Threshold indexesPerTableSasi =
+    new Threshold("sasi_indexes_per_table_failure_threshold",
+                  () -> -1,
+                  () -> config.sasi_indexes_per_table_failure_threshold,
+                  (x, what, v, t) -> format("Tables cannot have more than %s SASI indexes, failed to create SASI index %s",
+                                            t, what));
+
+    public static final Threshold indexesPerTableSai =
+    new Threshold("sai_indexes_per_table_failure_threshold",
+                  () -> -1,
+                  () -> config.sai_indexes_per_table_failure_threshold,
+                  (x, what, v, t) -> format("Tables cannot have more than %s StorageAttachedIndex secondary indexes, failed to create secondary index %s",
+                                            t, what));
+
+    public static final Threshold indexesTotalSai =
+    new Threshold("sai_indexes_total_failure_threshold",
+                  () -> -1,
+                  () -> config.sai_indexes_total_failure_threshold,
+                  (x, what, v, t) -> format("Cannot have more than %s StorageAttachedIndex secondary indexes across all keyspaces, failed to create secondary index %s",
+                                            t, what));
+
+    public static final Threshold materializedViewsPerTable =
+    new Threshold("materialized_views_per_table",
+                  () -> -1,
+                  () -> config.materialized_view_per_table_failure_threshold,
+                  (x, what, v, t) -> format("Tables cannot have more than %s materialized views, failed to create materialized view %s",
+                                            t, what));
+
+    // TODO Unused until STAR-762 implements paging by bytes and can port pagesize related DB-3208 guardrails
+    public static final Threshold pageSize =
+    new SizeThreshold("page_size",
+                      () -> -1L,
+                      () -> config.page_size_failure_threshold_in_kb * 1024L,
+                      (x, what, v, t) -> format("Page size %s - %s is greater than the maximum allowed (%s)",
+                                                what, v, t));
+
+    public static final Threshold partitionSize =
+    new SizeThreshold("partition_size",
+                      () -> config.partition_size_warn_threshold_in_mb * 1024L * 1024L,
+                      () -> -1L,
+                      (x, what, v, t) -> format("Detected partition %s of size %s is greater than the maximum recommended size (%s)",
+                                                what, v, t));
+
+    public static final Threshold partitionKeysInSelectQuery =
+    new Threshold("partition_keys_in_select_query",
+                  () -> -1L,
+                  () -> config.partition_keys_in_select_failure_threshold,
+                  (x, what, v, t) -> format("%s cannot be completed because it selects %s partitions keys - more than the maximum allowed %s", what, v, t));
+
     public static final Threshold inSelectCartesianProduct =
     new Threshold("in_select_cartesian_product",
                   () -> -1L,
                   () -> config.in_select_cartesian_product_failure_threshold,
                   (x, what, v, t) -> format("The query cannot be completed because cartesian product of all values in IN conditions is greater than %s", t));
 
-    public static final DisableFlag readBeforeWriteListOperationsEnabled =
-    new DisableFlag("read_before_write_list_operations",
-                    () -> !config.read_before_write_list_operations_enabled,
-                    "List operation requiring read before write");
+    @SuppressWarnings("unchecked")
+    public static final Predicates<InetAddressAndPort> replicaDiskUsage =
+    (Predicates<InetAddressAndPort>) new Predicates<>("replica_disk_usage",
+                                               DiskUsageBroadcaster.instance::isStuffed,
+                                               DiskUsageBroadcaster.instance::isFull,
+                                               // not using `what` because it represents replica address which should be hidden from client.
+                                               (isWarning, what) -> isWarning
+                                                                    ? "Replica disk usage exceeds warn threshold"
+                                                                    : "Write request failed because disk usage exceeds failure threshold")
+                              .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
+
+    public static final PercentageThreshold localDiskUsage =
+    (PercentageThreshold) new PercentageThreshold("local_disk_usage",
+                                                  () -> config.disk_usage_percentage_warn_threshold,
+                                                  () -> config.disk_usage_percentage_failure_threshold,
+                                                  (isWarning, what, v, t) -> isWarning
+                                                                             ? format("Local disk usage %s(%s) exceeds warn threshold of %s", v, what, t)
+                                                                             : format("Local disk usage %s(%s) exceeds failure threshold of %s, will stop accepting writes", v, what, t))
+                          .noExceptionOnFailure()
+                          .minNotifyIntervalInMs(TimeUnit.MINUTES.toMillis(30));
+
+    public static final Threshold scannedTombstones =
+    new Threshold("scanned_tombstones",
+                  () -> config.tombstone_warn_threshold,
+                  () -> config.tombstone_failure_threshold,
+                  (isWarning, what, v, t) -> isWarning ?
+                                             format("Scanned over %s tombstone rows for query %1.512s - more than the warning threshold %s", v, what, t) :
+                                             format("Scanned over %s tombstone rows during query %1.512s - more than the maximum allowed %s; query aborted", v, what, t));
+
+
+    public static final Threshold batchSize =
+    new SizeThreshold("batch_size",
+                      config::getBatchSizeWarnThreshold,
+                      config::getBatchSizeFailThreshold,
+                      (isWarning, what, v, t) -> isWarning
+                                                 ? format("Batch for %s is of size %s, exceeding specified warning threshold %s", what, v, t)
+                                                 : format("Batch for %s is of size %s, exceeding specified failure threshold %s", what, v, t));
+
+    public static final Threshold unloggedBatchAcrossPartitions =
+    new Threshold("unlogged_batch_across_partitions",
+                  () -> config.unlogged_batch_across_partitions_warn_threshold,
+                  () -> -1L,
+                  (x, what, v, t) -> format("Unlogged batch covering %s partitions detected " +
+                                            "against table%s %s. You should use a logged batch for " +
+                                            "atomicity, or asynchronous writes for performance.",
+                                            v, what.contains(", ") ? "s" : "", what));
 
     static final List<Listener> listeners = new CopyOnWriteArrayList<>();
 
     private Guardrails()
-    {
-    }
-
-    /**
-     * Whether guardrails are enabled globally or not.
-     *
-     * @return {@code true} if guardrails are enabled (applies based on their individual setting), {@code false}
-     * otherwise (in which case no guardrail will trigger).
-     */
-    public static boolean enabled()
-    {
-        return config.enabled;
-    }
+    {}
 
     /**
      * Whether guardrails are ready.
@@ -222,7 +263,7 @@ public static boolean ready()
      * is triggered.
      *
      * @param listener the listener to register. If the same listener is registered twice (or more), its method will be
-     *                 called twice (or more) for every trigger.
+     * called twice (or more) for every trigger.
      */
     public static void register(Listener listener)
     {
@@ -233,7 +274,7 @@ public static void register(Listener listener)
      * Unregister a previously registered listener.
      *
      * @param listener the listener to unregister. If it was not registered before, this is a no-op. If it was
-     *                 registered more than once, only one of the instance is unregistered.
+     * registered more than once, only one of the instances is unregistered.
      */
     public static void unregister(Listener listener)
     {
@@ -241,25 +282,35 @@ public static void unregister(Listener listener)
     }
 
     /**
-     * Interface for external listeners interested in being notified when a guardrail is triggered.
+     * Interface for external listeners interested in being notified when a guardrail is triggered.
      *
      * <p>Listeners should be registered through the {@link #register} method to take effect.
+     *
+     * <p>Note: this provides a mechanism to generate events when guardrails are triggered.
      */
     public interface Listener
     {
         /**
          * Called when a guardrail triggers a warning.
          *
+         * <p>This method is called on the thread on which the guardrail is triggered.
+         * Therefore, if any blocking work is to be done, the method should submit it asynchronously on a
+         * separate dedicated thread.
+         *
          * @param guardrailName a name describing the guardrail.
-         * @param message       the message corresponding to the guardrail trigger.
+         * @param message the message corresponding to the guardrail trigger.
          */
         public void onWarningTriggered(String guardrailName, String message);
 
         /**
          * Called when a guardrail triggers a failure.
          *
+         * <p>This method is called on the thread on which the guardrail is triggered.
+         * Therefore, if any blocking work is to be done, the method should submit it asynchronously on a
+         * separate dedicated thread.
+         *
          * @param guardrailName a name describing the guardrail.
-         * @param message       the message corresponding to the guardrail trigger.
+         * @param message the message corresponding to the guardrail trigger.
          */
         public void onFailureTriggered(String guardrailName, String message);
     }
diff --git a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
index 995cd2d290de..d5c14983d0d6 100644
--- a/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
+++ b/src/java/org/apache/cassandra/guardrails/GuardrailsConfig.java
@@ -26,13 +26,16 @@
 import java.util.stream.Collectors;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Sets;
 
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.statements.schema.TableAttributes;
 import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.units.SizeUnit;
 
 import static java.lang.String.format;
 
@@ -43,17 +46,14 @@
  * checking each guarded constraint (which, again, should use the higher level abstractions defined in
  * {@link Guardrails}).
  *
- * <p>This contains a main setting, {@code enabled}, controlling if guardrails are globally active or not, and
- * individual setting to control each guardrail. We have 2 variants of guardrails, soft (warn) and hard (fail) limits,
- * each guardrail having either one of the variant or both (note in particular that hard limits only make sense for
- * guardrails triggering during query execution. For other guardrails, say one triggering during compaction, failing
- * does not make sense).
+ * <p>We have 2 variants of guardrails, soft (warn) and hard (fail) limits, each guardrail having either one of the
+ * variants or both (note in particular that hard limits only make sense for guardrails triggering during query
+ * execution. For other guardrails, say one triggering during compaction, failing does not make sense).
  *
- * <p>If {@code enabled == false}, no limits should be enforced, be it soft or hard. Additionally, each individual
- * setting should have a specific value (typically -1 for numeric settings), that allows to disable the corresponding
- * guardrail.
+ * <p>Additionally, each individual setting should have a specific value (typically -1 for numeric settings),
+ * that allows disabling the corresponding guardrail.
  *
- * <p>The default values for each guardrail settings should reflect what is mandated for C* aaS environment.
+ * <p>The default values for each guardrail setting should reflect what is mandated for DCaaS.
  *
  * <p>For consistency, guardrails based on a simple numeric threshold should use the naming scheme
  * {@code <what_is_guarded>_warn_threshold} for soft limits and {@code <what_is_guarded>_failure_threshold} for hard
@@ -72,46 +72,154 @@ public class GuardrailsConfig
     public static final int DEFAULT_INDEXES_PER_TABLE_THRESHOLD = 10;
     public static final int DEFAULT_INDEXES_TOTAL_THRESHOLD = 100;
 
-    public Boolean enabled = false;
+    public volatile Long column_value_size_failure_threshold_in_kb;
+    public volatile Long columns_per_table_failure_threshold;
+    public volatile Long fields_per_udt_failure_threshold;
+    public volatile Long collection_size_warn_threshold_in_kb;
+    public volatile Long items_per_collection_warn_threshold;
+    public volatile Boolean read_before_write_list_operations_enabled;
 
-    public Long column_value_size_failure_threshold_in_kb;
-    public Long columns_per_table_failure_threshold;
+    // Legacy 2i guardrail
+    public volatile Integer secondary_index_per_table_failure_threshold;
+    public volatile Integer sasi_indexes_per_table_failure_threshold;
+    // SAI indexes guardrail
+    public volatile Integer sai_indexes_per_table_failure_threshold;
+    public volatile Integer sai_indexes_total_failure_threshold;
+    public volatile Integer materialized_view_per_table_failure_threshold;
 
-    public Long tables_warn_threshold;
-    public Long tables_failure_threshold;
-    public Set<String> table_properties_disallowed;
-    public Set<String> table_properties_ignored;
+    public volatile Long tables_warn_threshold;
+    public volatile Long tables_failure_threshold;
+    // N.B. Not safe for concurrent modification
+    public volatile Set<String> table_properties_disallowed;
+    public volatile Set<String> table_properties_ignored;
 
-    public Boolean user_timestamps_enabled;
+    public volatile Boolean user_timestamps_enabled;
 
-    public Boolean counter_enabled;
+    public volatile Boolean logged_batch_enabled;
 
-    // Legacy 2i guardrail
-    public Integer secondary_index_per_table_failure_threshold;
-    public Integer sasi_indexes_per_table_failure_threshold;
-    // SAI indexes guardrail
-    public Integer sai_indexes_per_table_failure_threshold;
-    public Integer sai_indexes_total_failure_threshold;
-    public Integer materialized_view_per_table_failure_threshold;
+    public volatile Boolean truncate_table_enabled;
 
-    public Set<String> write_consistency_levels_disallowed;
+    public volatile Boolean counter_enabled;
 
-    public Integer partition_size_warn_threshold_in_mb;
-    public Integer partition_keys_in_select_failure_threshold;
+    public volatile Set<String> write_consistency_levels_disallowed;
+
+    // For paging by bytes having a page bigger than this threshold will result in a failure
+    // For paging by rows the result will be silently cut short if it is bigger than the threshold
+    public volatile Integer page_size_failure_threshold_in_kb;
 
     // Limit number of terms and their cartesian product in IN query
-    public Integer in_select_cartesian_product_failure_threshold;
+    public volatile Integer in_select_cartesian_product_failure_threshold;
+    public volatile Integer partition_keys_in_select_failure_threshold;
+
+    // represent percentage of disk space, -1 means disabled
+    public volatile Integer disk_usage_percentage_warn_threshold;
+    public volatile Integer disk_usage_percentage_failure_threshold;
+    public volatile Long disk_usage_max_disk_size_in_gb;
+
+    // When executing a scan, within or across a partition, we need to keep the
+    // tombstones seen in memory so we can return them to the coordinator, which
+    // will use them to make sure other replicas also know about the deleted rows.
+    // With workloads that generate a lot of tombstones, this can cause performance
+    // problems and even exhaust the server heap.
+    // (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets)
+    // Adjust the thresholds here if you understand the dangers and want to
+    // scan more tombstones anyway. These thresholds may also be adjusted at runtime
+    // using the StorageService mbean.
+    public volatile Integer tombstone_warn_threshold;
+    public volatile Integer tombstone_failure_threshold;
+
+    // Log WARN on any multiple-partition batch size that exceeds this value. 64kb per batch by default.
+    // Use caution when increasing the size of this threshold as it can lead to node instability.
+    public volatile Integer batch_size_warn_threshold_in_kb;
+    // Fail any multiple-partition batch that exceeds this value. The default is 640kb (10x warn threshold).
+    public volatile Integer batch_size_fail_threshold_in_kb;
+    // Log WARN on any batches not of type LOGGED that span across more partitions than this limit.
+    public volatile Integer unlogged_batch_across_partitions_warn_threshold;
+
+    public volatile Integer partition_size_warn_threshold_in_mb;
+
+    /**
+     * If {@link DatabaseDescriptor#isEmulateDbaasDefaults()} is true, apply cloud defaults to guardrails settings that
+     * are not specified in yaml; otherwise, apply on-prem defaults to guardrails settings that are not specified in yaml.
+     */
+    @VisibleForTesting
+    public void applyConfig()
+    {
+        // for read requests
+        enforceDefault(page_size_failure_threshold_in_kb, v -> page_size_failure_threshold_in_kb = v, NO_LIMIT, 512);
 
-    public Long fields_per_udt_failure_threshold;
-    public Long collection_size_warn_threshold_in_kb;
-    public Long items_per_collection_warn_threshold;
+        enforceDefault(in_select_cartesian_product_failure_threshold, v -> in_select_cartesian_product_failure_threshold = v, NO_LIMIT, 25);
+        enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT, 20);
 
-    public Integer disk_usage_percentage_warn_threshold;
-    public Integer disk_usage_percentage_failure_threshold;
+        enforceDefault(tombstone_warn_threshold, v -> tombstone_warn_threshold = v, 1000, 1000);
+        enforceDefault(tombstone_failure_threshold, v -> tombstone_failure_threshold = v, 100000, 100000);
 
-    public Boolean read_before_write_list_operations_enabled;
+        // for write requests
+        enforceDefault(logged_batch_enabled, v -> logged_batch_enabled = v, true, true);
+        enforceDefault(batch_size_warn_threshold_in_kb, v -> batch_size_warn_threshold_in_kb = v, 64, 64);
+        enforceDefault(batch_size_fail_threshold_in_kb, v -> batch_size_fail_threshold_in_kb = v, 640, 640);
+        enforceDefault(unlogged_batch_across_partitions_warn_threshold, v -> unlogged_batch_across_partitions_warn_threshold = v, 10, 10);
+
+        enforceDefault(truncate_table_enabled, v -> truncate_table_enabled = v, true, true);
+
+        enforceDefault(user_timestamps_enabled, v -> user_timestamps_enabled = v, true, true);
+
+        enforceDefault(column_value_size_failure_threshold_in_kb, v -> column_value_size_failure_threshold_in_kb = v, -1L, 5 * 1024L);
+
+        enforceDefault(read_before_write_list_operations_enabled, v -> read_before_write_list_operations_enabled = v, true, false);
+
+        // We use a LinkedHashSet just for the sake of preserving the ordering in error messages
+        enforceDefault(write_consistency_levels_disallowed,
+                       v -> write_consistency_levels_disallowed = ImmutableSet.copyOf(v),
+                       Collections.<String>emptySet(),
+                       new LinkedHashSet<>(Arrays.asList("ANY", "ONE", "LOCAL_ONE")));
 
-    public Boolean truncate_table_enabled;
+        // for schema
+        enforceDefault(counter_enabled, v -> counter_enabled = v, true, true);
+
+        enforceDefault(fields_per_udt_failure_threshold, v -> fields_per_udt_failure_threshold = v, -1L, 10L);
+        enforceDefault(collection_size_warn_threshold_in_kb, v -> collection_size_warn_threshold_in_kb = v, -1L, 5 * 1024L);
+        enforceDefault(items_per_collection_warn_threshold, v -> items_per_collection_warn_threshold = v, -1L, 20L);
+
+        enforceDefault(columns_per_table_failure_threshold, v -> columns_per_table_failure_threshold = v, -1L, 50L);
+        enforceDefault(secondary_index_per_table_failure_threshold, v -> secondary_index_per_table_failure_threshold = v, NO_LIMIT, 1);
+        enforceDefault(sasi_indexes_per_table_failure_threshold, v -> sasi_indexes_per_table_failure_threshold = v, NO_LIMIT, 0);
+        enforceDefault(materialized_view_per_table_failure_threshold, v -> materialized_view_per_table_failure_threshold = v, NO_LIMIT, 2);
+        enforceDefault(tables_warn_threshold, v -> tables_warn_threshold = v, -1L, 100L);
+        enforceDefault(tables_failure_threshold, v -> tables_failure_threshold = v, -1L, 200L);
+
+        enforceDefault(table_properties_disallowed,
+                       v -> table_properties_disallowed = ImmutableSet.copyOf(v),
+                       Collections.<String>emptySet(),
+                       Collections.<String>emptySet());
+
+        enforceDefault(table_properties_ignored,
+                       v -> table_properties_ignored = ImmutableSet.copyOf(v),
+                       Collections.<String>emptySet(),
+                       new LinkedHashSet<>(TableAttributes.allKeywords().stream()
+                                                          .sorted()
+                                                          .filter(p -> !p.equals("default_time_to_live"))
+                                                          .collect(Collectors.toList())));
+
+        // for node status
+        enforceDefault(disk_usage_percentage_warn_threshold, v -> disk_usage_percentage_warn_threshold = v, NO_LIMIT, 70);
+        enforceDefault(disk_usage_percentage_failure_threshold, v -> disk_usage_percentage_failure_threshold = v, NO_LIMIT, 80);
+        enforceDefault(disk_usage_max_disk_size_in_gb, v -> disk_usage_max_disk_size_in_gb = v, (long) NO_LIMIT, (long) NO_LIMIT);
+
+        enforceDefault(partition_size_warn_threshold_in_mb, v -> partition_size_warn_threshold_in_mb = v, 100, 100);
+
+        // SAI Table Failure threshold (may be overridden via system property)
+        Integer overrideTableFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TABLE_FAILURE_THRESHOLD, UNSET);
+        if (overrideTableFailureThreshold != UNSET)
+            sai_indexes_per_table_failure_threshold = overrideTableFailureThreshold;
+        enforceDefault(sai_indexes_per_table_failure_threshold, v -> sai_indexes_per_table_failure_threshold = v, DEFAULT_INDEXES_PER_TABLE_THRESHOLD, DEFAULT_INDEXES_PER_TABLE_THRESHOLD);
+
+        // SAI Total Failure threshold (may be overridden via system property)
+        Integer overrideTotalFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TOTAL_FAILURE_THRESHOLD, UNSET);
+        if (overrideTotalFailureThreshold != UNSET)
+            sai_indexes_total_failure_threshold = overrideTotalFailureThreshold;
+        enforceDefault(sai_indexes_total_failure_threshold, v -> sai_indexes_total_failure_threshold = v, DEFAULT_INDEXES_TOTAL_THRESHOLD, DEFAULT_INDEXES_TOTAL_THRESHOLD);
+    }
 
     /**
      * Validate that the value provided for each guardrail setting is valid.
@@ -126,23 +234,37 @@ public void validate()
         validateStrictlyPositiveInteger(columns_per_table_failure_threshold,
                                         "columns_per_table_failure_threshold");
 
+        validateStrictlyPositiveInteger(fields_per_udt_failure_threshold,
+                                        "fields_per_udt_failure_threshold");
+
+        validateStrictlyPositiveInteger(collection_size_warn_threshold_in_kb,
+                                        "collection_size_warn_threshold_in_kb");
+
+        validateStrictlyPositiveInteger(items_per_collection_warn_threshold,
+                                        "items_per_collection_warn_threshold");
+
         validateStrictlyPositiveInteger(tables_warn_threshold, "tables_warn_threshold");
         validateStrictlyPositiveInteger(tables_failure_threshold, "tables_failure_threshold");
         validateWarnLowerThanFail(tables_warn_threshold, tables_failure_threshold, "tables");
+
+        validateDisallowedTableProperties();
+        validateIgnoredTableProperties();
+
+        validateStrictlyPositiveInteger(page_size_failure_threshold_in_kb, "page_size_failure_threshold_in_kb");
+
         validateStrictlyPositiveInteger(partition_size_warn_threshold_in_mb, "partition_size_warn_threshold_in_mb");
-        validateStrictlyPositiveInteger(partition_keys_in_select_failure_threshold, "partition_keys_in_select_failure_threshold");
 
-        validateStrictlyPositiveInteger(fields_per_udt_failure_threshold, "fields_per_udt_failure_threshold");
-        validateStrictlyPositiveInteger(collection_size_warn_threshold_in_kb, "collection_size_warn_threshold_in_kb");
-        validateStrictlyPositiveInteger(items_per_collection_warn_threshold, "items_per_collection_warn_threshold");
+        validateStrictlyPositiveInteger(partition_keys_in_select_failure_threshold, "partition_keys_in_select_failure_threshold");
 
         validateStrictlyPositiveInteger(in_select_cartesian_product_failure_threshold, "in_select_cartesian_product_failure_threshold");
 
-        validateDisallowedTableProperties();
-        validateIgnoredTableProperties();
-
         validateDiskUsageThreshold();
 
+        validateTombstoneThreshold(tombstone_warn_threshold, tombstone_failure_threshold);
+
+        validateBatchSizeThreshold(batch_size_warn_threshold_in_kb, batch_size_fail_threshold_in_kb);
+        validateStrictlyPositiveInteger(unlogged_batch_across_partitions_warn_threshold, "unlogged_batch_across_partitions_warn_threshold");
+
         for (String rawCL : write_consistency_levels_disallowed)
         {
             try
@@ -151,78 +273,80 @@ public void validate()
             }
             catch (Exception e)
             {
-                throw new ConfigurationException(format("Invalid value for write_consistency_level_disallowed guardrail: "
+                throw new ConfigurationException(format("Invalid value for write_consistency_levels_disallowed guardrail: "
                                                         + "'%s' does not parse as a Consistency Level", rawCL));
             }
         }
     }
 
     /**
-     * If {@link DatabaseDescriptor#isApplyDbaasDefaults()} is true, apply cloud defaults to guardrails settings that
-     * are not specified in yaml; otherwise, apply on-prem defaults to guardrails settings that are not specified in yaml;
+     * This validation method should only be called after {@link DatabaseDescriptor#createAllDirectories()} has been called.
      */
-    @VisibleForTesting
-    public void applyConfig()
+    public void validateAfterDataDirectoriesExist()
     {
-        enforceDefault(truncate_table_enabled, v -> truncate_table_enabled = v, true, true);
-
-        enforceDefault(user_timestamps_enabled, v -> user_timestamps_enabled = v, true, true);
-
-        enforceDefault(column_value_size_failure_threshold_in_kb, v -> column_value_size_failure_threshold_in_kb = v, -1L, 5 * 1024L);
-
-        enforceDefault(columns_per_table_failure_threshold, v -> columns_per_table_failure_threshold = v, -1L, 20L);
-        enforceDefault(secondary_index_per_table_failure_threshold, v -> secondary_index_per_table_failure_threshold = v, NO_LIMIT, 1);
-        enforceDefault(sasi_indexes_per_table_failure_threshold, v -> sasi_indexes_per_table_failure_threshold = v, NO_LIMIT, 0);
-        enforceDefault(materialized_view_per_table_failure_threshold, v -> materialized_view_per_table_failure_threshold = v, NO_LIMIT, 2);
-        enforceDefault(tables_warn_threshold, v -> tables_warn_threshold = v, -1L, 100L);
-        enforceDefault(tables_failure_threshold, v -> tables_failure_threshold = v, -1L, 200L);
-
-        // We use a LinkedHashSet just for the sake of preserving the ordering in error messages
-        enforceDefault(write_consistency_levels_disallowed,
-                       v -> write_consistency_levels_disallowed = v,
-                       Collections.<String>emptySet(),
-                       new LinkedHashSet<>(Arrays.asList("ANY", "ONE", "LOCAL_ONE")));
-
-        enforceDefault(table_properties_disallowed,
-                       v -> table_properties_disallowed = v,
-                       Collections.<String>emptySet(),
-                       Collections.<String>emptySet());
-
-        enforceDefault(table_properties_ignored,
-                       v -> table_properties_ignored = v,
-                       Collections.<String>emptySet(),
-                       new LinkedHashSet<>(TableAttributes.allKeywords().stream()
-                                                          .sorted()
-                                                          .filter(p -> !p.equals("default_time_to_live"))
-                                                          .collect(Collectors.toList())));
+        validateDiskUsageMaxSize();
+    }
 
-        enforceDefault(partition_size_warn_threshold_in_mb, v -> partition_size_warn_threshold_in_mb = v, 100, 100);
-        enforceDefault(partition_keys_in_select_failure_threshold, v -> partition_keys_in_select_failure_threshold = v, NO_LIMIT, 20);
+    @VisibleForTesting
+    public void validateDiskUsageMaxSize()
+    {
+        long totalDiskSizeInGb = 0L;
+        for (Directories.DataDirectory directory : Directories.dataDirectories.getAllDirectories())
+        {
+            totalDiskSizeInGb += SizeUnit.BYTES.toGigaBytes(directory.getTotalSpace());
+        }
 
-        enforceDefault(counter_enabled, v -> counter_enabled = v, true, true);
+        if (totalDiskSizeInGb == 0L)
+        {
+            totalDiskSizeInGb = Long.MAX_VALUE;
+        }
+        validatePositiveNumeric(disk_usage_max_disk_size_in_gb, totalDiskSizeInGb, false, "disk_usage_max_disk_size_in_gb");
+    }
 
-        enforceDefault(fields_per_udt_failure_threshold, v -> fields_per_udt_failure_threshold = v, -1L, 10L);
-        enforceDefault(collection_size_warn_threshold_in_kb, v -> collection_size_warn_threshold_in_kb = v, -1L, 5 * 1024L);
-        enforceDefault(items_per_collection_warn_threshold, v -> items_per_collection_warn_threshold = v, -1L, 20L);
+    /**
+     * Enforce default value based on {@link DatabaseDescriptor#isEmulateDbaasDefaults()} if
+     * it's not specified in yaml
+     *
+     * @param current current config value defined in yaml
+     * @param optionSetter setter to update the given config
+     * @param onPremDefault default value for on-prem
+     * @param dbaasDefault default value for constellation DB-as-a-service
+     * @param <T>
+     */
+    private static <T> void enforceDefault(T current, Consumer<T> optionSetter, T onPremDefault, T dbaasDefault)
+    {
+        if (current != null)
+            return;
 
-        // for node status
-        enforceDefault(disk_usage_percentage_warn_threshold, v -> disk_usage_percentage_warn_threshold = v, NO_LIMIT, 70);
-        enforceDefault(disk_usage_percentage_failure_threshold, v -> disk_usage_percentage_failure_threshold = v, NO_LIMIT, 80);
+        optionSetter.accept(DatabaseDescriptor.isEmulateDbaasDefaults() ? dbaasDefault : onPremDefault);
+    }
 
-        enforceDefault(in_select_cartesian_product_failure_threshold, v -> in_select_cartesian_product_failure_threshold = v, NO_LIMIT, 25);
-        enforceDefault(read_before_write_list_operations_enabled, v -> read_before_write_list_operations_enabled = v, true, false);
+    /**
+     * @return true if given disk usage threshold disables disk usage guardrail
+     */
+    public static boolean diskUsageGuardrailDisabled(double value)
+    {
+        return value < 0;
+    }
 
-        // SAI Table Failure threshold (maye be overridden via system property)
-        Integer overrideTableFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TABLE_FAILURE_THRESHOLD, UNSET);
-        if (overrideTableFailureThreshold != UNSET)
-            sai_indexes_per_table_failure_threshold = overrideTableFailureThreshold;
-        enforceDefault(sai_indexes_per_table_failure_threshold, v -> sai_indexes_per_table_failure_threshold = v, DEFAULT_INDEXES_PER_TABLE_THRESHOLD, DEFAULT_INDEXES_PER_TABLE_THRESHOLD);
+    /**
+     * Validate that the values provided for disk usage are valid.
+     *
+     * @throws ConfigurationException if any of the settings has an invalid setting.
+     */
+    @VisibleForTesting
+    public void validateDiskUsageThreshold()
+    {
+        validatePositiveNumeric(disk_usage_percentage_warn_threshold, 100, false, "disk_usage_percentage_warn_threshold");
+        validatePositiveNumeric(disk_usage_percentage_failure_threshold, 100, false, "disk_usage_percentage_failure_threshold");
+        validateWarnLowerThanFail(disk_usage_percentage_warn_threshold, disk_usage_percentage_failure_threshold, "disk_usage_percentage");
+    }
 
-        // SAI Table Failure threshold (maye be overridden via system property)
-        Integer overrideTotalFailureThreshold = Integer.getInteger(INDEX_GUARDRAILS_TOTAL_FAILURE_THRESHOLD, UNSET);
-        if (overrideTotalFailureThreshold != UNSET)
-            sai_indexes_total_failure_threshold = overrideTotalFailureThreshold;
-        enforceDefault(sai_indexes_total_failure_threshold, v -> sai_indexes_total_failure_threshold = v, DEFAULT_INDEXES_TOTAL_THRESHOLD, DEFAULT_INDEXES_TOTAL_THRESHOLD);
+    public void validateTombstoneThreshold(long warnThreshold, long failureThreshold)
+    {
+        validateStrictlyPositiveInteger(warnThreshold, "tombstone_warn_threshold");
+        validateStrictlyPositiveInteger(failureThreshold, "tombstone_failure_threshold");
+        validateWarnLowerThanFail(warnThreshold, failureThreshold, "tombstone_threshold");
     }
 
     private void validateDisallowedTableProperties()
@@ -247,7 +371,7 @@ private void validateIgnoredTableProperties()
 
     private void validateStrictlyPositiveInteger(long value, String name)
     {
-        // We use 'long' for generality, but most numeric guardrails cannot effectively be more than an 'int' for various
+        // We use 'long' for generality, but most numeric guardrails cannot effectively be more than an 'int' for various
         // internal reasons. Not that any should ever come close in practice ...
         // Also, in most cases, zero does not make sense (allowing 0 tables or columns is not exactly useful).
         validatePositiveNumeric(value, Integer.MAX_VALUE, false, name);
@@ -280,42 +404,45 @@ private void validateWarnLowerThanFail(long warnValue, long failValue, String gu
                                                     warnValue, guardName, failValue));
     }
 
-    /**
-     * Enforce default value based on {@link DatabaseDescriptor#isApplyDbaasDefaults()} if
-     * it's not specified in yaml
-     *
-     * @param current       current config value defined in yaml
-     * @param optionSetter  setter to updated given config
-     * @param onPremDefault default value for on-prem
-     * @param dbaasDefault  default value for constellation DB-as-a-service
-     * @param <T>
-     */
-    private static <T> void enforceDefault(T current, Consumer<T> optionSetter, T onPremDefault, T dbaasDefault)
+    public void setTombstoneFailureThreshold(int threshold)
     {
-        if (current != null)
-            return;
+        validateTombstoneThreshold(tombstone_warn_threshold, threshold);
+        tombstone_failure_threshold = threshold;
+    }
 
-        optionSetter.accept(DatabaseDescriptor.isApplyDbaasDefaults() ? dbaasDefault : onPremDefault);
+    public void setTombstoneWarnThreshold(int threshold)
+    {
+        validateTombstoneThreshold(threshold, tombstone_failure_threshold);
+        tombstone_warn_threshold = threshold;
     }
 
-    /**
-     * @return true if given disk usage threshold disables disk usage guardrail
-     */
-    public static boolean diskUsageGuardrailDisabled(double value)
+
+    public void validateBatchSizeThreshold(long warnThreshold, long failureThreshold)
     {
-        return value < 0;
+        validateStrictlyPositiveInteger(warnThreshold, "batch_size_warn_threshold_in_kb");
+        validateStrictlyPositiveInteger(failureThreshold, "batch_size_fail_threshold_in_kb");
+        validateWarnLowerThanFail(warnThreshold, failureThreshold, "batch_size_threshold");
     }
 
-    /**
-     * Validate that the values provided for disk usage are valid.
-     *
-     * @throws ConfigurationException if any of the settings has an invalid setting.
-     */
-    @VisibleForTesting
-    public void validateDiskUsageThreshold()
+    public int getBatchSizeWarnThreshold()
     {
-        validatePositiveNumeric(disk_usage_percentage_warn_threshold, 100, false, "disk_usage_percentage_warn_threshold");
-        validatePositiveNumeric(disk_usage_percentage_failure_threshold, 100, false, "disk_usage_percentage_failure_threshold");
-        validateWarnLowerThanFail(disk_usage_percentage_warn_threshold, disk_usage_percentage_failure_threshold, "disk_usage_percentage");
+        return batch_size_warn_threshold_in_kb * 1024;
+    }
+
+    public int getBatchSizeFailThreshold()
+    {
+        return batch_size_fail_threshold_in_kb * 1024;
+    }
+
+    public void setBatchSizeWarnThresholdInKB(int threshold)
+    {
+        validateBatchSizeThreshold(threshold, batch_size_fail_threshold_in_kb);
+        batch_size_warn_threshold_in_kb = threshold;
+    }
+
+    public void setBatchSizeFailThresholdInKB(int threshold)
+    {
+        validateBatchSizeThreshold(batch_size_warn_threshold_in_kb, threshold);
+        batch_size_fail_threshold_in_kb = threshold;
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
index 18455493380f..21bd73215a4c 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableWriter.java
@@ -436,7 +436,7 @@ public static void guardCollectionSize(UnfilteredRowIterator partition, Unfilter
         if (!unfiltered.isRow() || SchemaConstants.isInternalKeyspace(partition.metadata().keyspace))
             return;
 
-        if (!Guardrails.collectionSize.enabled() && !Guardrails.itemsPerCollection.enabled())
+        if (!Guardrails.collectionSize.enabled(null) && !Guardrails.itemsPerCollection.enabled(null))
             return;
 
         Row row = (Row) unfiltered;
@@ -456,8 +456,8 @@ public static void guardCollectionSize(UnfilteredRowIterator partition, Unfilter
             int cellsSize = liveCells.dataSize();
             int cellsCount = liveCells.cellsCount();
 
-            if (!Guardrails.collectionSize.triggersOn(cellsSize) &&
-                !Guardrails.itemsPerCollection.triggersOn(cellsCount))
+            if (!Guardrails.collectionSize.triggersOn(cellsSize, null) &&
+                !Guardrails.itemsPerCollection.triggersOn(cellsCount, null))
                 continue;
 
             TableMetadata metadata = partition.metadata();
@@ -467,8 +467,8 @@ public static void guardCollectionSize(UnfilteredRowIterator partition, Unfilter
                                        column.name.toString(),
                                        keyString,
                                        metadata);
-            Guardrails.collectionSize.guard(cellsSize, msg, true);
-            Guardrails.itemsPerCollection.guard(cellsCount, msg, true);
+            Guardrails.collectionSize.guard(cellsSize, msg, true, null);
+            Guardrails.itemsPerCollection.guard(cellsCount, msg, true, null);
         }
     }
 
@@ -494,16 +494,10 @@ protected void maybeLogLargePartitionWarning(DecoratedKey key, long rowSize)
         if (SchemaConstants.isInternalKeyspace(metadata().keyspace))
             return;
 
-        if (Guardrails.partitionSize.triggersOn(rowSize))
+        if (Guardrails.partitionSize.triggersOn(rowSize, null))
         {
             String keyString = metadata().partitionKeyAsCQLLiteral(key.getKey());
-            Guardrails.partitionSize.guard(rowSize, String.format("%s in %s", keyString, metadata), true);
-        }
-
-        if (rowSize > DatabaseDescriptor.getCompactionLargePartitionWarningThreshold())
-        {
-            String keyString = metadata().partitionKeyType.getString(key.getKey());
-            logger.warn("Writing large partition {}/{}:{} ({}) to sstable {}", metadata.keyspace, metadata.name, keyString, FBUtilities.prettyPrintMemory(rowSize), getFilename());
+            Guardrails.partitionSize.guard(rowSize, String.format("%s in %s", keyString, metadata), true, null);
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/service/QueryState.java b/src/java/org/apache/cassandra/service/QueryState.java
index 60cebf309ea3..c72111fb50cb 100644
--- a/src/java/org/apache/cassandra/service/QueryState.java
+++ b/src/java/org/apache/cassandra/service/QueryState.java
@@ -123,7 +123,7 @@ public InetAddress getClientAddress()
      */
     public boolean isOrdinaryUser()
     {
-        AuthenticatedUser user = this.getClientState().getUser();
-        return null != user && !user.isSystem() && !user.isSuper();
+        AuthenticatedUser user = getClientState().getUser();
+        return !getClientState().isInternal && null != user && !user.isSystem() && !user.isSuper();
     }
 }
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index c4e910ebc889..e8943fe09c4b 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -5646,23 +5646,23 @@ public void setMigrateKeycacheOnCompaction(boolean invalidateKeyCacheOnCompactio
 
     public int getTombstoneWarnThreshold()
     {
-        return DatabaseDescriptor.getTombstoneWarnThreshold();
+        return DatabaseDescriptor.getGuardrailsConfig().tombstone_warn_threshold;
     }
 
     public void setTombstoneWarnThreshold(int threshold)
     {
-        DatabaseDescriptor.setTombstoneWarnThreshold(threshold);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneWarnThreshold(threshold);
         logger.info("updated tombstone_warn_threshold to {}", threshold);
     }
 
     public int getTombstoneFailureThreshold()
     {
-        return DatabaseDescriptor.getTombstoneFailureThreshold();
+        return DatabaseDescriptor.getGuardrailsConfig().tombstone_failure_threshold;
     }
 
     public void setTombstoneFailureThreshold(int threshold)
     {
-        DatabaseDescriptor.setTombstoneFailureThreshold(threshold);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneFailureThreshold(threshold);
         logger.info("updated tombstone_failure_threshold to {}", threshold);
     }
 
@@ -5701,23 +5701,23 @@ public void setColumnIndexCacheSize(int cacheSizeInKB)
 
     public int getBatchSizeFailureThreshold()
     {
-        return DatabaseDescriptor.getBatchSizeFailThresholdInKB();
+        return DatabaseDescriptor.getGuardrailsConfig().batch_size_fail_threshold_in_kb;
     }
 
     public void setBatchSizeFailureThreshold(int threshold)
     {
-        DatabaseDescriptor.setBatchSizeFailThresholdInKB(threshold);
+        DatabaseDescriptor.getGuardrailsConfig().setBatchSizeFailThresholdInKB(threshold);
         logger.info("updated batch_size_fail_threshold_in_kb to {}", threshold);
     }
 
     public int getBatchSizeWarnThreshold()
     {
-        return DatabaseDescriptor.getBatchSizeWarnThresholdInKB();
+        return DatabaseDescriptor.getGuardrailsConfig().batch_size_warn_threshold_in_kb;
     }
 
     public void setBatchSizeWarnThreshold(int threshold)
     {
-        DatabaseDescriptor.setBatchSizeWarnThresholdInKB(threshold);
+        DatabaseDescriptor.getGuardrailsConfig().setBatchSizeWarnThresholdInKB(threshold);
         logger.info("Updated batch_size_warn_threshold_in_kb to {}", threshold);
     }
 
diff --git a/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
index f1659bd5816c..cbb284808abb 100644
--- a/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
+++ b/src/java/org/apache/cassandra/service/disk/usage/DiskUsageMonitor.java
@@ -75,7 +75,7 @@ public void start(Consumer<DiskUsageState> notifier)
         // start the scheduler regardless guardrail is enabled, so we can enable it later without a restart
         ScheduledExecutors.scheduledTasks.scheduleAtFixedRate(() -> {
 
-            if (!Guardrails.localDiskUsage.enabled())
+            if (!Guardrails.localDiskUsage.enabled(null))
                 return;
 
             updateLocalState(getDiskUsage(), notifier);
@@ -90,7 +90,7 @@ public void updateLocalState(double usageRatio, Consumer<DiskUsageState> notifie
 
         DiskUsageState state = getState(percentageCeiling);
 
-        Guardrails.localDiskUsage.guard(percentageCeiling, state.toString(), false);
+        Guardrails.localDiskUsage.guard(percentageCeiling, state.toString(), false, null);
 
         // if state remains unchanged, no need to notify peers
         if (state == localState)
diff --git a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
index e76096013719..c59d1d3e8c30 100644
--- a/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/BatchMessage.java
@@ -236,8 +236,8 @@ private void traceQuery(QueryState state)
         ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
         if (options.getConsistency() != null)
             builder.put("consistency_level", options.getConsistency().name());
-        if (options.getSerialConsistency() != null)
-            builder.put("serial_consistency_level", options.getSerialConsistency().name());
+        if (options.getSerialConsistency(state) != null)
+            builder.put("serial_consistency_level", options.getSerialConsistency(state).name());
 
         // TODO we don't have [typed] access to CQL bind variables here.  CASSANDRA-4560 is open to add support.
         Tracing.instance.begin("Execute batch of CQL3 queries", state.getClientAddress(), builder.build());
diff --git a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
index 3b9899621cfa..2fc06a69f774 100644
--- a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
@@ -186,8 +186,8 @@ private void traceQuery(QueryState state, QueryHandler.Prepared prepared)
             builder.put("page_size", Integer.toString(options.getPageSize()));
         if (options.getConsistency() != null)
             builder.put("consistency_level", options.getConsistency().name());
-        if (options.getSerialConsistency() != null)
-            builder.put("serial_consistency_level", options.getSerialConsistency().name());
+        if (options.getSerialConsistency(state) != null)
+            builder.put("serial_consistency_level", options.getSerialConsistency(state).name());
 
         builder.put("query", prepared.rawCQLStatement);
 
diff --git a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
index 828548d3f90b..cfb653134c4b 100644
--- a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
@@ -18,12 +18,14 @@
 package org.apache.cassandra.transport.messages;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import io.netty.buffer.ByteBuf;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Compressor;
@@ -73,6 +75,7 @@ protected Message.Response execute(QueryState state, long queryStartNanoTime, bo
         supported.put(StartupMessage.CQL_VERSION, cqlVersions);
         supported.put(StartupMessage.COMPRESSION, compressions);
         supported.put(StartupMessage.PROTOCOL_VERSIONS, ProtocolVersion.supportedVersions());
+        supported.put(StartupMessage.EMULATE_DBAAS_DEFAULTS, Collections.singletonList(String.valueOf(DatabaseDescriptor.isEmulateDbaasDefaults())));
 
         return new SupportedMessage(supported);
     }
diff --git a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
index 71d7c731eca0..b347fc2fb3c4 100644
--- a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
@@ -131,8 +131,8 @@ private void traceQuery(QueryState state)
             builder.put("page_size", Integer.toString(options.getPageSize()));
         if (options.getConsistency() != null)
             builder.put("consistency_level", options.getConsistency().name());
-        if (options.getSerialConsistency() != null)
-            builder.put("serial_consistency_level", options.getSerialConsistency().name());
+        if (options.getSerialConsistency(state) != null)
+            builder.put("serial_consistency_level", options.getSerialConsistency(state).name());
 
         Tracing.instance.begin("Execute CQL3 query", state.getClientAddress(), builder.build());
     }
diff --git a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
index 172768c5465d..1bae7f05f711 100644
--- a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
@@ -40,6 +40,7 @@ public class StartupMessage extends Message.Request
     public static final String DRIVER_NAME = "DRIVER_NAME";
     public static final String DRIVER_VERSION = "DRIVER_VERSION";
     public static final String THROW_ON_OVERLOAD = "THROW_ON_OVERLOAD";
+    public static final String EMULATE_DBAAS_DEFAULTS = "EMULATE_DBAAS_DEFAULTS";
 
     public static final Message.Codec<StartupMessage> codec = new Message.Codec<StartupMessage>()
     {
diff --git a/test/distributed/org/apache/cassandra/distributed/test/ReadFailureTest.java b/test/distributed/org/apache/cassandra/distributed/test/ReadFailureTest.java
index be8db6c7782f..3a76a91f2fc0 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/ReadFailureTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/ReadFailureTest.java
@@ -21,17 +21,17 @@
 import org.apache.commons.lang3.exception.ExceptionUtils;
 import org.junit.Test;
 
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
-
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
-import org.apache.cassandra.distributed.api.ICluster;
 import org.apache.cassandra.distributed.Cluster;
 import org.apache.cassandra.distributed.api.ConsistencyLevel;
+import org.apache.cassandra.distributed.api.ICluster;
 import org.apache.cassandra.exceptions.RequestFailureReason;
 
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
 public class ReadFailureTest extends TestBaseImpl
 {
     static final int TOMBSTONE_FAIL_THRESHOLD = 20;
@@ -49,7 +49,13 @@ public class ReadFailureTest extends TestBaseImpl
     @Test
     public void testSpecExecRace() throws Throwable
     {
-        try (Cluster cluster = init(Cluster.build().withNodes(2).withConfig(config -> config.set("tombstone_failure_threshold", TOMBSTONE_FAIL_THRESHOLD)).start()))
+        try (Cluster cluster = init(Cluster.build()
+                                           .withNodes(2)
+                                           .withConfig(config -> {
+                                               config.set("tombstone_warn_threshold", -1L);
+                                               config.set("tombstone_failure_threshold", TOMBSTONE_FAIL_THRESHOLD);
+                                           })
+                                           .start()))
         {
             // Create a table with the spec exec policy set to a low percentile so it's more likely to produce a spec exec racing with the local request.
             // Not using 'Always' because that actually uses a different class/mechanism and doesn't exercise the bug
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
index 8a1bf0558ba8..cccc7cc1af95 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
@@ -313,15 +313,15 @@ public void testExceptionsForInvalidConfigValues() {
 
         try
         {
-            DatabaseDescriptor.setBatchSizeWarnThresholdInKB(-1);
-            fail("Should have received a ConfigurationException batch_size_warn_threshold_in_kb = -1");
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeWarnThresholdInKB(-2);
+            fail("Should have received a ConfigurationException batch_size_warn_threshold_in_kb = -2");
         }
         catch (ConfigurationException ignored) { }
-        Assert.assertEquals(5120, DatabaseDescriptor.getBatchSizeWarnThreshold());
+        Assert.assertEquals(65536, DatabaseDescriptor.getGuardrailsConfig().getBatchSizeWarnThreshold());
 
         try
         {
-            DatabaseDescriptor.setBatchSizeWarnThresholdInKB(2 * 1024 * 1024);
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeWarnThresholdInKB(2 * 1024 * 1024);
             fail("Should have received a ConfigurationException batch_size_warn_threshold_in_kb = 2GiB");
         }
         catch (ConfigurationException ignored) { }
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
index 3b5a6105aab6..68dc32afd060 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/SecondaryIndexTest.java
@@ -772,17 +772,17 @@ public void testIndexOnPartitionKeyInsertValueOver64k() throws Throwable
         // (the non-conditional batch doesn't hit this because
         // BatchStatement::executeLocally skips the size check but CAS
         // path does not)
-        long batchSizeThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
+        int batchSizeThreshold = DatabaseDescriptor.getGuardrailsConfig().batch_size_fail_threshold_in_kb;
         try
         {
-            DatabaseDescriptor.setBatchSizeFailThresholdInKB( (TOO_BIG / 1024) * 2);
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeFailThresholdInKB((TOO_BIG / 1024) * 2);
             succeedInsert("BEGIN BATCH\n" +
                           "INSERT INTO %s (a, b, c) VALUES (1, 1, ?) IF NOT EXISTS;\n" +
                           "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
         }
         finally
         {
-            DatabaseDescriptor.setBatchSizeFailThresholdInKB((int) (batchSizeThreshold / 1024));
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeFailThresholdInKB(batchSizeThreshold);
         }
     }
 
@@ -825,17 +825,20 @@ public void testIndexOnClusteringColumnInsertValueOver64k() throws Throwable
         // (the non-conditional batch doesn't hit this because
         // BatchStatement::executeLocally skips the size check but CAS
         // path does not)
-        long batchSizeThreshold = DatabaseDescriptor.getBatchSizeFailThreshold();
+        int batchSizeThreshold = DatabaseDescriptor.getGuardrailsConfig().batch_size_fail_threshold_in_kb;
+        int diskUsageThreshold = DatabaseDescriptor.getGuardrailsConfig().disk_usage_percentage_failure_threshold;
         try
         {
-            DatabaseDescriptor.setBatchSizeFailThresholdInKB( (TOO_BIG / 1024) * 2);
+            DatabaseDescriptor.getGuardrailsConfig().disk_usage_percentage_failure_threshold = -1;
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeFailThresholdInKB( (TOO_BIG / 1024) * 2);
             succeedInsert("BEGIN BATCH\n" +
                           "INSERT INTO %s (a, b, c) VALUES (1, 1, ?) IF NOT EXISTS;\n" +
                           "APPLY BATCH", ByteBuffer.allocate(TOO_BIG));
         }
         finally
         {
-            DatabaseDescriptor.setBatchSizeFailThresholdInKB((int)(batchSizeThreshold / 1024));
+            DatabaseDescriptor.getGuardrailsConfig().setBatchSizeFailThresholdInKB(batchSizeThreshold);
+            DatabaseDescriptor.getGuardrailsConfig().disk_usage_percentage_failure_threshold = diskUsageThreshold;
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java
index 85048ae2c141..7983fdc820a7 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/TombstonesTest.java
@@ -40,25 +40,25 @@
  */
 public class TombstonesTest extends CQLTester
 {
-    static final int ORIGINAL_FAILURE_THRESHOLD = DatabaseDescriptor.getTombstoneFailureThreshold();
+    static final int ORIGINAL_FAILURE_THRESHOLD = DatabaseDescriptor.getGuardrailsConfig().tombstone_failure_threshold;
     static final int FAILURE_THRESHOLD = 100;
 
-    static final int ORIGINAL_WARN_THRESHOLD = DatabaseDescriptor.getTombstoneFailureThreshold();
+    static final int ORIGINAL_WARN_THRESHOLD = DatabaseDescriptor.getGuardrailsConfig().tombstone_warn_threshold;
     static final int WARN_THRESHOLD = 50;
 
     @BeforeClass
     public static void setUp() throws Throwable
     {
         DatabaseDescriptor.daemonInitialization();
-        DatabaseDescriptor.setTombstoneFailureThreshold(FAILURE_THRESHOLD);
-        DatabaseDescriptor.setTombstoneWarnThreshold(WARN_THRESHOLD);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneWarnThreshold(WARN_THRESHOLD);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneFailureThreshold(FAILURE_THRESHOLD);
     }
 
     @AfterClass
     public static void tearDown()
     {
-        DatabaseDescriptor.setTombstoneFailureThreshold(ORIGINAL_FAILURE_THRESHOLD);
-        DatabaseDescriptor.setTombstoneWarnThreshold(ORIGINAL_WARN_THRESHOLD);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneFailureThreshold(ORIGINAL_FAILURE_THRESHOLD);
+        DatabaseDescriptor.getGuardrailsConfig().setTombstoneWarnThreshold(ORIGINAL_WARN_THRESHOLD);
     }
 
     @Test
@@ -72,13 +72,13 @@ public void testBelowThresholdSelect() throws Throwable
 
         // insert exactly the amount of tombstones that shouldn't trigger an exception
         for (int i = 0; i < FAILURE_THRESHOLD; i++)
-            execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);");
+            execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'");
 
         try
         {
             execute("SELECT * FROM %s WHERE a = 'key';");
             assertEquals(oldFailures, cfs.metric.tombstoneFailures.getCount());
-            assertEquals(oldWarnings, cfs.metric.tombstoneWarnings.getCount());
+            assertEquals(oldWarnings + 1, cfs.metric.tombstoneWarnings.getCount());
         }
         catch (Throwable e)
         {
@@ -96,7 +96,7 @@ public void testBeyondThresholdSelect() throws Throwable
 
         // insert exactly the amount of tombstones that *SHOULD* trigger an exception
         for (int i = 0; i < FAILURE_THRESHOLD + 1; i++)
-            execute("INSERT INTO %s (a, b, c) VALUES ('key', 'column" + i + "', null);");
+            execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'");
 
         try
         {
@@ -218,7 +218,7 @@ public void testBeyondWarnThresholdSelect() throws Throwable
 
         // insert the number of tombstones that *SHOULD* trigger an Warning
         for (int i = 0; i < WARN_THRESHOLD + 1; i++)
-            execute("INSERT INTO %s (a, b, c ) VALUES ('key', 'cc" + i + "',  null);");
+            execute("DELETE FROM %s WHERE a = 'key' and b = '" + i + "'");
         try
         {
             execute("SELECT * FROM %s WHERE a = 'key';");
diff --git a/test/unit/org/apache/cassandra/fql/FullQueryLoggerTest.java b/test/unit/org/apache/cassandra/fql/FullQueryLoggerTest.java
index 73be0b493e9c..31e553967f11 100644
--- a/test/unit/org/apache/cassandra/fql/FullQueryLoggerTest.java
+++ b/test/unit/org/apache/cassandra/fql/FullQueryLoggerTest.java
@@ -678,7 +678,7 @@ private static void compareQueryOptions(QueryOptions a, QueryOptions b)
         assertEquals(a.getConsistency(), b.getConsistency());
         assertEquals(a.getPagingState(), b.getPagingState());
         assertEquals(a.getValues(), b.getValues());
-        assertEquals(a.getSerialConsistency(), b.getSerialConsistency());
+        assertEquals(a.getSerialConsistency(null), b.getSerialConsistency(null));
     }
 
     private void configureFQL() throws Exception
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
index 1805c730e842..d6678aafca00 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
@@ -18,12 +18,12 @@
 
 package org.apache.cassandra.guardrails;
 
-import java.util.Arrays;
 import java.util.Collections;
-import java.util.LinkedHashSet;
 import java.util.Set;
 import java.util.function.Supplier;
 
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Sets;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
@@ -38,11 +38,9 @@
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 
-import static org.assertj.core.api.Assertions.assertThatThrownBy;
-
 public class GuardrailConsistencyTest extends GuardrailTester
 {
-    private static final Set<String> DISALLOWED_WRITE_CLS = new LinkedHashSet<>(Arrays.asList(
+    private static Set<String> disallowedConsistencyLevels = ImmutableSet.of(
     ConsistencyLevel.ANY.toString(),
     ConsistencyLevel.ONE.toString(),
     ConsistencyLevel.TWO.toString(),
@@ -50,14 +48,12 @@ public class GuardrailConsistencyTest extends GuardrailTester
     ConsistencyLevel.QUORUM.toString(),
     ConsistencyLevel.ALL.toString(),
     ConsistencyLevel.EACH_QUORUM.toString(),
-    ConsistencyLevel.LOCAL_ONE.toString()));
-
-    private static final Set<String> SERIAL_CLS = new LinkedHashSet<>(Arrays.asList(
+    ConsistencyLevel.LOCAL_ONE.toString()
+    );
+    private static Set<String> serialConsistencyLevels = ImmutableSet.of(
     ConsistencyLevel.SERIAL.toString(),
     ConsistencyLevel.LOCAL_SERIAL.toString()
-    ));
-    private static final Set<String> SERIAL_ONLY = new LinkedHashSet<>(Collections.singletonList(ConsistencyLevel.SERIAL.toString()));
-    private static final LinkedHashSet<String> LOCAL_SERIAL_ONLY = new LinkedHashSet<>(Collections.singletonList(ConsistencyLevel.LOCAL_SERIAL.toString()));
+    );
 
     private static Set<String> defaultDisallowedWriteConsistencyLevels;
     private Supplier<QueryState> queryState;
@@ -79,12 +75,24 @@ public void setupTest()
     {
         createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
         queryState = this::userQueryState;
-        disableConsistencyLevels(DISALLOWED_WRITE_CLS);
+        disableConsistencyLevels(disallowedConsistencyLevels);
     }
 
     private void disableConsistencyLevels(Set<String> consistencyLevels)
     {
-        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = consistencyLevels;
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = ImmutableSet.copyOf(consistencyLevels);
+    }
+
+    private QueryOptions queryOptions(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        return QueryOptions.create(cl,
+                                   Collections.emptyList(),
+                                   false,
+                                   1,
+                                   null,
+                                   serialCl,
+                                   ProtocolVersion.CURRENT,
+                                   KEYSPACE);
     }
 
     private void executeWithConsistency(String query, ConsistencyLevel cl, ConsistencyLevel serialCl)
@@ -105,30 +113,24 @@ private void lwtInsert(ConsistencyLevel cl, ConsistencyLevel serialCl)
         executeWithConsistency("INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') IF NOT EXISTS", cl, serialCl);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testInsertWithDisallowedConsistency()
     {
-        assertThatThrownBy(() -> insert(ConsistencyLevel.ONE))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+        insert(ConsistencyLevel.ONE);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testLWTInsertWithDisallowedConsistency1()
     {
-        disableConsistencyLevels(SERIAL_ONLY);
-        assertThatThrownBy(() -> lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testLWTInsertWithDisallowedConsistency2()
     {
-        disableConsistencyLevels(SERIAL_CLS);
-        assertThatThrownBy(() -> lwtInsert(ConsistencyLevel.LOCAL_QUORUM, null))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+        disableConsistencyLevels(serialConsistencyLevels);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
     @Test
@@ -137,38 +139,43 @@ public void testInsertWithAllowedConsistency()
         // test that it does not throw
         insert(ConsistencyLevel.LOCAL_QUORUM);
 
-        disableConsistencyLevels(SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
         lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, null);
 
-        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.LOCAL_SERIAL.toString()));
         lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtInsert(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
-    @Test
-    public void testLWTUpdateWithDisallowedConsistency()
+    private void update(ConsistencyLevel cl)
     {
-        disableConsistencyLevels(SERIAL_ONLY);
-        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2", cl, null);
     }
 
-    @Test
+    private void lwtUpdate(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2 IF EXISTS", cl, serialCl);
+    }
+
+    @Test(expected = InvalidRequestException.class)
+    public void testUpdateWithDisallowedConsistency()
+    {
+        update(ConsistencyLevel.ONE);
+    }
+
+    @Test(expected = InvalidRequestException.class)
     public void testLWTUpdateWithDisallowedConsistency1()
     {
-        disableConsistencyLevels(SERIAL_ONLY);
-        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testLWTUpdateWithDisallowedConsistency2()
     {
-        disableConsistencyLevels(SERIAL_CLS);
-        assertThatThrownBy(() -> lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, null))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+        disableConsistencyLevels(serialConsistencyLevels);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
     @Test
@@ -177,45 +184,43 @@ public void testUpdateWithAllowedConsistency()
         // test that it does not throw
         update(ConsistencyLevel.LOCAL_QUORUM);
 
-        disableConsistencyLevels(SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
         lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, null);
 
-        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.LOCAL_SERIAL.toString()));
         lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtUpdate(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
-    @Test
-    public void testUpdateWithDisallowedConsistency()
+    private void delete(ConsistencyLevel cl)
     {
-        assertThatThrownBy(() -> update(ConsistencyLevel.ONE))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+        executeWithConsistency("DELETE FROM %s WHERE k=1", cl, null);
     }
 
-    @Test
+    private void lwtDelete(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    {
+        executeWithConsistency("DELETE FROM %s WHERE k=1 AND c=2 IF EXISTS", cl, serialCl);
+    }
+
+    @Test(expected = InvalidRequestException.class)
     public void testDeleteWithDisallowedConsistency()
     {
-        assertThatThrownBy(() -> delete(ConsistencyLevel.ONE))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
+        delete(ConsistencyLevel.ONE);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testLWTDeleteWithAllowedConsistency1()
     {
-        disableConsistencyLevels(SERIAL_ONLY);
-        assertThatThrownBy(() -> lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testLWTDeleteWithAllowedConsistency2()
     {
-        disableConsistencyLevels(SERIAL_CLS);
-        assertThatThrownBy(() -> lwtDelete(ConsistencyLevel.LOCAL_QUORUM, null))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+        disableConsistencyLevels(serialConsistencyLevels);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
     @Test
@@ -224,96 +229,61 @@ public void testDeleteWithAllowedConsistency()
         // test that it does not throw
         delete(ConsistencyLevel.LOCAL_QUORUM);
 
-        disableConsistencyLevels(SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
         lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, null);
 
-        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.LOCAL_SERIAL.toString()));
         lwtDelete(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtDelete(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
-    @Test
-    public void testLWTBatchWithDisallowedConsistency1()
-    {
-        disableConsistencyLevels(SERIAL_ONLY);
-        assertThatThrownBy(() -> lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL])");
-    }
-
-    @Test
-    public void testLWTBatchWithDisallowedConsistency2()
+    private void batch(ConsistencyLevel cl)
     {
-        disableConsistencyLevels(SERIAL_CLS);
-        assertThatThrownBy(() -> lwtBatch(ConsistencyLevel.LOCAL_QUORUM, null))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value SERIAL is not allowed for Consistency Level (disallowed values are: [SERIAL, LOCAL_SERIAL])");
+        executeWithConsistency("BEGIN BATCH " +
+                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') " +
+                               "APPLY BATCH", cl, null);
     }
 
-    @Test
-    public void testBatchWithAllowedConsistency()
+    private void lwtBatch(ConsistencyLevel cl, ConsistencyLevel serialCl)
     {
-        // test that it does not throw
-        batch(ConsistencyLevel.LOCAL_QUORUM);
-
-        disableConsistencyLevels(SERIAL_ONLY);
-        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
-
-        disableConsistencyLevels(LOCAL_SERIAL_ONLY);
-        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        executeWithConsistency("BEGIN BATCH " +
+                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') IF NOT EXISTS " +
+                               "APPLY BATCH", cl, serialCl);
     }
 
-    @Test
+    @Test(expected = InvalidRequestException.class)
     public void testBatchWithDisallowedConsistency()
     {
-        assertThatThrownBy(() -> batch(ConsistencyLevel.ONE))
-        .isInstanceOf(InvalidRequestException.class)
-        .hasMessage("Provided value ONE is not allowed for Consistency Level (disallowed values are: [ANY, ONE, TWO, THREE, QUORUM, ALL, EACH_QUORUM, LOCAL_ONE])");
-    }
-
-    private QueryOptions queryOptions(ConsistencyLevel cl, ConsistencyLevel serialCl)
-    {
-        return QueryOptions.create(cl,
-                                   Collections.emptyList(),
-                                   false,
-                                   1,
-                                   null,
-                                   serialCl,
-                                   ProtocolVersion.CURRENT,
-                                   KEYSPACE);
-    }
-
-    private void update(ConsistencyLevel cl)
-    {
-        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2", cl, null);
+        batch(ConsistencyLevel.ONE);
     }
 
-    private void lwtUpdate(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    @Test(expected = InvalidRequestException.class)
+    public void testLWTBatchWithDisallowedConsistency1()
     {
-        executeWithConsistency("UPDATE %s SET v = 'val2' WHERE k = 1 and c = 2 IF EXISTS", cl, serialCl);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
     }
 
-    private void delete(ConsistencyLevel cl)
+    @Test(expected = InvalidRequestException.class)
+    public void testLWTBatchWithDisallowedConsistency2()
     {
-        executeWithConsistency("DELETE FROM %s WHERE k=1", cl, null);
+        disableConsistencyLevels(serialConsistencyLevels);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, null);
     }
-
-    private void lwtDelete(ConsistencyLevel cl, ConsistencyLevel serialCl)
+    @Test
+    public void testBatchWithAllowedConsistency()
     {
-        executeWithConsistency("DELETE FROM %s WHERE k=1 AND c=2 IF EXISTS", cl, serialCl);
-    }
+        // test that it does not throw
+        batch(ConsistencyLevel.LOCAL_QUORUM);
 
-    private void batch(ConsistencyLevel cl)
-    {
-        executeWithConsistency("BEGIN BATCH " +
-                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') " +
-                               "APPLY BATCH", cl, null);
-    }
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.SERIAL.toString()));
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, null);
 
-    private void lwtBatch(ConsistencyLevel cl, ConsistencyLevel serialCl)
-    {
-        executeWithConsistency("BEGIN BATCH " +
-                               "INSERT INTO %s (k, c, v) VALUES (1, 2, 'val') IF NOT EXISTS " +
-                               "APPLY BATCH", cl, serialCl);
+        disableConsistencyLevels(ImmutableSet.of(ConsistencyLevel.LOCAL_SERIAL.toString()));
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
+        lwtBatch(ConsistencyLevel.LOCAL_QUORUM, null);
     }
 
     @Test
@@ -332,6 +302,7 @@ public void testSystemUser()
 
     private void testExcludedUser()
     {
+        disableConsistencyLevels(Sets.union(defaultDisallowedWriteConsistencyLevels, serialConsistencyLevels));
         insert(ConsistencyLevel.ONE);
         insert(ConsistencyLevel.LOCAL_QUORUM);
         lwtInsert(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.SERIAL);
@@ -353,4 +324,3 @@ private void testExcludedUser()
         lwtBatch(ConsistencyLevel.LOCAL_QUORUM, ConsistencyLevel.LOCAL_SERIAL);
     }
 }
-
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
index ed1086645865..83cf70191db5 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
@@ -336,8 +336,11 @@ public void testWriteRequests() throws Throwable
             executeNet(batchStatement);
         };
 
+        // a delay is needed after granting permissions
+        Thread.sleep(1000);
+
         // default state, write request works fine
-        assertTrue(Guardrails.enabled());
+        assertTrue(Guardrails.ready());
         assertValid(select);
         assertValid(insert);
         assertValid(batch);
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailLoggedBatchTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailLoggedBatchTest.java
new file mode 100644
index 000000000000..2ac096ed7720
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailLoggedBatchTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+
+import org.junit.AfterClass;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.datastax.driver.core.BatchStatement;
+import com.datastax.driver.core.SimpleStatement;
+import com.datastax.driver.core.exceptions.InvalidQueryException;
+import org.apache.cassandra.config.DatabaseDescriptor;
+
+public class GuardrailLoggedBatchTest extends GuardrailTester
+{
+    private static boolean loggedBatchEnabled;
+
+    @BeforeClass
+    public static void setup()
+    {
+        loggedBatchEnabled = DatabaseDescriptor.getGuardrailsConfig().logged_batch_enabled;
+    }
+
+    @AfterClass
+    public static void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().logged_batch_enabled = loggedBatchEnabled;
+    }
+
+    @Before
+    public void setupTest()
+    {
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+    }
+
+    private void setGuardrails(boolean logged_batch_enabled)
+    {
+        DatabaseDescriptor.getGuardrailsConfig().logged_batch_enabled = logged_batch_enabled;
+    }
+
+    private void insertBatchAndAssertValid(boolean loggedBatchEnabled, boolean logged) throws Throwable
+    {
+        setGuardrails(loggedBatchEnabled);
+
+        BatchStatement batch = new BatchStatement(logged ? BatchStatement.Type.LOGGED : BatchStatement.Type.UNLOGGED);
+        batch.add(new SimpleStatement(String.format("INSERT INTO %s.%s (k, c, v) VALUES (1, 2, 'val')", keyspace(), currentTable())));
+        batch.add(new SimpleStatement(String.format("INSERT INTO %s.%s (k, c, v) VALUES (3, 4, 'val')", keyspace(), currentTable())));
+
+        assertValid(batch);
+    }
+
+    @Test
+    public void testInsertUnloggedBatch() throws Throwable
+    {
+        insertBatchAndAssertValid(false, false);
+        insertBatchAndAssertValid(true, false);
+    }
+
+    @Test(expected = InvalidQueryException.class)
+    public void testDisabledLoggedBatch() throws Throwable
+    {
+        insertBatchAndAssertValid(false, true);
+    }
+
+    @Test
+    public void testEnabledLoggedBatch() throws Throwable
+    {
+        insertBatchAndAssertValid(true, true);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
index eb9d4e21a22e..b2d9a5bf28f0 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailSAIIndexesTest.java
@@ -67,10 +67,10 @@ public void testDefaultsDBAAS()
 
     public void testDefaults(boolean dbaas)
     {
-        boolean previous = DatabaseDescriptor.isApplyDbaasDefaults();
+        boolean previous = DatabaseDescriptor.isEmulateDbaasDefaults();
         try
         {
-            DatabaseDescriptor.setApplyDbaasDefaults(dbaas);
+            DatabaseDescriptor.setEmulateDbaasDefaults(dbaas);
 
             GuardrailsConfig config = new GuardrailsConfig();
             config.applyConfig();
@@ -80,7 +80,7 @@ public void testDefaults(boolean dbaas)
         }
         finally
         {
-            DatabaseDescriptor.setApplyDbaasDefaults(previous);
+            DatabaseDescriptor.setEmulateDbaasDefaults(previous);
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
index a959d10a5259..36d5b5b1516f 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
@@ -18,30 +18,28 @@
 
 package org.apache.cassandra.guardrails;
 
-import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.CopyOnWriteArrayList;
 import java.util.function.BiConsumer;
-import java.util.function.Consumer;
 import javax.annotation.Nullable;
 
+import com.google.common.collect.ImmutableSet;
 import org.junit.After;
 import org.junit.AfterClass;
 import org.junit.Before;
 import org.junit.BeforeClass;
 
+import com.datastax.driver.core.Statement;
 import com.datastax.driver.core.exceptions.InvalidQueryException;
 import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.ClientState;
-import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.QueryState;
 
 import static java.lang.String.format;
-import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
@@ -52,7 +50,6 @@ public abstract class GuardrailTester extends CQLTester
     static final String USERNAME = "guardrail_user";
     static final String PASSWORD = "guardrail_password";
 
-    private static boolean guardRailsEnabled;
     private static Set<String> tablePropertiesDisallowed;
 
     protected TestListener listener;
@@ -60,11 +57,8 @@ public abstract class GuardrailTester extends CQLTester
     @BeforeClass
     public static void setupGuardrailTester()
     {
-        guardRailsEnabled = DatabaseDescriptor.getGuardrailsConfig().enabled;
-        DatabaseDescriptor.getGuardrailsConfig().enabled = true;
-
         tablePropertiesDisallowed = DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed;
-        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = Collections.emptySet();
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = ImmutableSet.of();
 
         requireAuthentication();
         requireNetwork();
@@ -73,7 +67,6 @@ public static void setupGuardrailTester()
     @AfterClass
     public static void tearDownGuardrailTester()
     {
-        DatabaseDescriptor.getGuardrailsConfig().enabled = guardRailsEnabled;
         DatabaseDescriptor.getGuardrailsConfig().table_properties_disallowed = tablePropertiesDisallowed;
     }
 
@@ -86,6 +79,7 @@ public void beforeGuardrailTest() throws Throwable
         useSuperUser();
         executeNet(format("CREATE USER IF NOT EXISTS %s WITH PASSWORD '%s'", USERNAME, PASSWORD));
         executeNet(format("GRANT ALL ON KEYSPACE %s TO %s", KEYSPACE, USERNAME));
+
         useUser(USERNAME, PASSWORD);
 
         listener = new TestListener(null);
@@ -238,6 +232,11 @@ protected void assertValid(String query, Object... args) throws Throwable
         assertValid(() -> executeNet(query, args));
     }
 
+    protected void assertValid(Statement query) throws Throwable
+    {
+        assertValid(() -> executeNet(query));
+    }
+
     protected void assertWarns(CheckedFunction function, String... messages) throws Throwable
     {
         listener.clear();
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
index df85876dcd9d..67fd75ca09cb 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailsOnTableTest.java
@@ -23,8 +23,8 @@
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.atomic.AtomicReference;
-import java.util.stream.Collectors;
 
+import com.google.common.collect.ImmutableSet;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -64,9 +64,9 @@ public void before()
         TableAttributes.validKeywords.stream()
                                      .filter(p -> !allowed.contains(p))
                                      .map(String::toUpperCase)
-                                     .collect(Collectors.toSet());
+                                     .collect(ImmutableSet.toImmutableSet());
         // but actually ignore "comment"
-        DatabaseDescriptor.getGuardrailsConfig().table_properties_ignored = new HashSet<>(Arrays.asList("comment"));
+        DatabaseDescriptor.getGuardrailsConfig().table_properties_ignored = ImmutableSet.of("comment");
     }
 
     @After
@@ -206,10 +206,10 @@ public void testInvalidTableProperties()
     {
         GuardrailsConfig config = DatabaseDescriptor.getGuardrailsConfig();
 
-        config.table_properties_disallowed = new HashSet<>(Arrays.asList("ID1", "gc_grace_seconds"));
+        config.table_properties_disallowed = ImmutableSet.of("ID1", "gc_grace_seconds");
         assertConfigFails(config::validate, "[id1]");
 
-        config.table_properties_disallowed = new HashSet<>(Arrays.asList("ID", "Gc_Grace_Seconds"));
+        config.table_properties_disallowed = ImmutableSet.of("ID", "Gc_Grace_Seconds");
         config.validate();
     }
 
diff --git a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
index 27ca4f67e8ac..23f98776b7e2 100644
--- a/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
+++ b/test/unit/org/apache/cassandra/service/ClientWarningsTest.java
@@ -18,10 +18,13 @@
 package org.apache.cassandra.service;
 
 import java.util.Collection;
+import java.util.Set;
 import java.util.stream.Collectors;
 
+import com.google.common.collect.ImmutableSet;
 import org.apache.commons.lang3.StringUtils;
 
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -33,6 +36,7 @@
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.guardrails.GuardrailsConfig;
 import org.apache.cassandra.transport.Message;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.transport.SimpleClient;
@@ -44,6 +48,11 @@
 @RunWith(Parameterized.class)
 public class ClientWarningsTest extends CQLTester
 {
+    private static int defaultBatchSizeWarnThreshold;
+    private static Set<String> defaultWriteConsistencyLevelsDisallowed;
+    private static Set<String> defaultTable_properties_disallowed;
+    private static Integer defaultDisk_usage_percentage_failure_threshold;
+
     @Parameterized.Parameter
     public ProtocolVersion version;
 
@@ -60,9 +69,32 @@ public static Collection<Object[]> versions()
     public static void setUp()
     {
         requireNetwork();
-        DatabaseDescriptor.setBatchSizeWarnThresholdInKB(1);
+
+        GuardrailsConfig guardrailsConfig = DatabaseDescriptor.getGuardrailsConfig();
+        
+        // Save current settings
+        defaultBatchSizeWarnThreshold = guardrailsConfig.batch_size_warn_threshold_in_kb;
+        defaultWriteConsistencyLevelsDisallowed = guardrailsConfig.write_consistency_levels_disallowed;
+        defaultTable_properties_disallowed = guardrailsConfig.table_properties_disallowed;
+        defaultDisk_usage_percentage_failure_threshold = guardrailsConfig.disk_usage_percentage_failure_threshold;
+
+        guardrailsConfig.setBatchSizeWarnThresholdInKB(1);
+        guardrailsConfig.write_consistency_levels_disallowed = ImmutableSet.of();
+        guardrailsConfig.table_properties_disallowed = ImmutableSet.of();
+        guardrailsConfig.disk_usage_percentage_failure_threshold = -1;
     }
 
+    @AfterClass 
+    public static void teardown()
+    {
+        // Restore previous settings
+        GuardrailsConfig guardrailsConfig = DatabaseDescriptor.getGuardrailsConfig();
+        guardrailsConfig.setBatchSizeWarnThresholdInKB(defaultBatchSizeWarnThreshold);
+        guardrailsConfig.write_consistency_levels_disallowed = defaultWriteConsistencyLevelsDisallowed;
+        guardrailsConfig.table_properties_disallowed = defaultTable_properties_disallowed;
+        guardrailsConfig.disk_usage_percentage_failure_threshold = defaultDisk_usage_percentage_failure_threshold;
+    }
+    
     @Test
     public void testUnloggedBatch() throws Exception
     {
@@ -77,7 +109,7 @@ public void testUnloggedBatch() throws Exception
             Message.Response resp = client.execute(query);
             assertNull(resp.getWarnings());
 
-            query = new QueryMessage(createBatchStatement2(DatabaseDescriptor.getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
+            query = new QueryMessage(createBatchStatement2(DatabaseDescriptor.getGuardrailsConfig().getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
             resp = client.execute(query);
             assertEquals(1, resp.getWarnings().size());
         }
@@ -93,11 +125,11 @@ public void testLargeBatch() throws Exception
         {
             client.connect(false);
 
-            QueryMessage query = new QueryMessage(createBatchStatement2(DatabaseDescriptor.getBatchSizeWarnThreshold() / 2 + 1), QueryOptions.DEFAULT);
+            QueryMessage query = new QueryMessage(createBatchStatement2(DatabaseDescriptor.getGuardrailsConfig().getBatchSizeWarnThreshold() / 2 + 1), QueryOptions.DEFAULT);
             Message.Response resp = client.execute(query);
             assertEquals(1, resp.getWarnings().size());
 
-            query = new QueryMessage(createBatchStatement(DatabaseDescriptor.getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
+            query = new QueryMessage(createBatchStatement(DatabaseDescriptor.getGuardrailsConfig().getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
             resp = client.execute(query);
             assertNull(resp.getWarnings());
         }
@@ -153,7 +185,7 @@ public void testLargeBatchWithProtoV2() throws Exception
         {
             client.connect(false);
 
-            QueryMessage query = new QueryMessage(createBatchStatement(DatabaseDescriptor.getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
+            QueryMessage query = new QueryMessage(createBatchStatement(DatabaseDescriptor.getGuardrailsConfig().getBatchSizeWarnThreshold()), QueryOptions.DEFAULT);
             Message.Response resp = client.execute(query);
             assertNull(resp.getWarnings());
         }
diff --git a/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java b/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java
index a7551f42b310..fabc392bf648 100644
--- a/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java
+++ b/test/unit/org/apache/cassandra/service/ProtocolBetaVersionTest.java
@@ -35,7 +35,6 @@ public class ProtocolBetaVersionTest extends CQLTester
     public static void setUp()
     {
         requireNetwork();
-        DatabaseDescriptor.setBatchSizeWarnThresholdInKB(1);
     }
 
     private ProtocolVersion getBetaVersion()
diff --git a/test/unit/org/apache/cassandra/transport/ClientResourceLimitsTest.java b/test/unit/org/apache/cassandra/transport/ClientResourceLimitsTest.java
index c52f43e27213..5b41bc95420f 100644
--- a/test/unit/org/apache/cassandra/transport/ClientResourceLimitsTest.java
+++ b/test/unit/org/apache/cassandra/transport/ClientResourceLimitsTest.java
@@ -63,7 +63,7 @@ public class ClientResourceLimitsTest extends CQLTester
         QueryOptions.DEFAULT.skipMetadata(),
         QueryOptions.DEFAULT.getPageSize(),
         QueryOptions.DEFAULT.getPagingState(),
-        QueryOptions.DEFAULT.getSerialConsistency(),
+        QueryOptions.DEFAULT.getSerialConsistency(null),
         ProtocolVersion.V5,
         KEYSPACE);
 
diff --git a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java
index 6c9ef4fe255b..0295e7126ea2 100644
--- a/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java
+++ b/test/unit/org/apache/cassandra/transport/MessagePayloadTest.java
@@ -140,7 +140,7 @@ public void testMessagePayloadBeta() throws Throwable
                   QueryOptions.DEFAULT.skipMetadata(),
                   QueryOptions.DEFAULT.getPageSize(),
                   QueryOptions.DEFAULT.getPagingState(),
-                  QueryOptions.DEFAULT.getSerialConsistency(),
+                  QueryOptions.DEFAULT.getSerialConsistency(null),
                   ProtocolVersion.V5,
                   KEYSPACE);
                 QueryMessage queryMessage = new QueryMessage("CREATE TABLE atable (pk int PRIMARY KEY, v text)",
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index 75523e1587cd..a6de876f6d30 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -20,6 +20,7 @@
 import java.nio.ByteBuffer;
 import java.util.*;
 
+import com.google.common.collect.ImmutableSet;
 import org.apache.commons.lang3.RandomStringUtils;
 
 import io.netty.buffer.Unpooled;
@@ -33,6 +34,7 @@
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.marshal.*;
+import org.apache.cassandra.exceptions.InvalidRequestException;
 import org.apache.cassandra.serializers.CollectionSerializer;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
@@ -388,7 +390,7 @@ private void queryOptionsSerDeserTest(ProtocolVersion version, QueryOptions opti
 
         assertNotNull(decodedOptions);
         assertEquals(options.getConsistency(), decodedOptions.getConsistency());
-        assertEquals(options.getSerialConsistency(), decodedOptions.getSerialConsistency());
+        assertEquals(options.getSerialConsistency(null), decodedOptions.getSerialConsistency(null));
         assertEquals(options.getPageSize(), decodedOptions.getPageSize());
         assertEquals(options.getProtocolVersion(), decodedOptions.getProtocolVersion());
         assertEquals(options.getValues(), decodedOptions.getValues());
@@ -399,6 +401,105 @@ private void queryOptionsSerDeserTest(ProtocolVersion version, QueryOptions opti
         assertEquals(options.getNowInSeconds(state), decodedOptions.getNowInSeconds(state));
     }
 
+    @Test
+    public void defaultSerialCLGuardrailsTest()
+    {
+        for(ProtocolVersion version : ProtocolVersion.SUPPORTED)
+        {
+            defaultSerialCLGuardrailsTest(version, new LinkedHashSet<>(), ConsistencyLevel.SERIAL);
+            defaultSerialCLGuardrailsTest(version,
+                                          new LinkedHashSet<>(Arrays.asList(ConsistencyLevel.LOCAL_SERIAL.toString())),
+                                          ConsistencyLevel.SERIAL);
+            defaultSerialCLGuardrailsTest(version,
+                                          new LinkedHashSet<>(Arrays.asList(ConsistencyLevel.SERIAL.toString())),
+                                          ConsistencyLevel.LOCAL_SERIAL);
+            defaultSerialCLGuardrailsTest(version,
+                                          new LinkedHashSet<>(Arrays.asList(ConsistencyLevel.SERIAL.toString(),
+                                                                   ConsistencyLevel.LOCAL_SERIAL.toString())),
+                                          null);
+        }
+    }
+
+    private void defaultSerialCLGuardrailsTest(ProtocolVersion version,
+                                               Set<String> writeConsistencyLevelsDisallowed,
+                                               ConsistencyLevel expectedDecodedSerialConsistency)
+    {
+        Set<String> previousConsistencyLevels =  DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed;
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = ImmutableSet.copyOf(writeConsistencyLevelsDisallowed);
+
+        QueryOptions queryOptions = QueryOptions.create(ConsistencyLevel.ALL,
+                                                        Collections.singletonList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 })),
+                                                        false,
+                                                        5000,
+                                                        Util.makeSomePagingState(version),
+                                                        null,
+                                                        version,
+                                                        null);
+        ByteBuf buf = Unpooled.buffer(QueryOptions.codec.encodedSize(queryOptions, version));
+        QueryOptions.codec.encode(queryOptions, buf, version);
+        QueryOptions decodedOptions = QueryOptions.codec.decode(buf, version);
+        if (expectedDecodedSerialConsistency != null)
+        {
+            assertEquals(expectedDecodedSerialConsistency, decodedOptions.getSerialConsistency(null));
+        }
+        else
+        {
+            try
+            {
+                decodedOptions.getSerialConsistency(null);
+                throw new AssertionError("Decoding should have failed with InvalidRequestException");
+            }
+            catch (InvalidRequestException e)
+            {
+                assertEquals("Serial consistency levels are disallowed by disallowedWriteConsistencies Guardrail",
+                             e.getMessage());
+            }
+        }
+
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = ImmutableSet.copyOf(previousConsistencyLevels);
+    }
+
+    @Test
+    public void specifiedSerialCLGuardrailsTest()
+    {
+        // The write consistency level guardrail check happens before query execution. Here we only validate that,
+        // if QueryOptions has an explicitly set serial consistency, the same consistency level remains after
+        // encoding/decoding, even if that level is forbidden by the guardrail.
+
+        Set<String> serialCLs = new LinkedHashSet<>(Arrays.asList(ConsistencyLevel.LOCAL_SERIAL.toString(), ConsistencyLevel.SERIAL.toString()));
+        for(ProtocolVersion version : ProtocolVersion.SUPPORTED)
+        {
+            specifiedSerialCLGuardrailsTest(version, ConsistencyLevel.SERIAL, new LinkedHashSet<>(), ConsistencyLevel.SERIAL);
+            specifiedSerialCLGuardrailsTest(version, ConsistencyLevel.SERIAL, serialCLs, ConsistencyLevel.SERIAL);
+            specifiedSerialCLGuardrailsTest(version, ConsistencyLevel.LOCAL_SERIAL, new LinkedHashSet<>(), ConsistencyLevel.LOCAL_SERIAL);
+            specifiedSerialCLGuardrailsTest(version, ConsistencyLevel.LOCAL_SERIAL, serialCLs, ConsistencyLevel.LOCAL_SERIAL);
+        }
+    }
+
+    private void specifiedSerialCLGuardrailsTest(ProtocolVersion version,
+                                                 ConsistencyLevel specifiedSerialConsistency,
+                                                 Set<String> writeConsistencyLevelsDisallowed,
+                                                 ConsistencyLevel expectedDecodedSerialConsistency)
+    {
+        Set<String> previousConsistencyLevels =  DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed;
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = ImmutableSet.copyOf(writeConsistencyLevelsDisallowed);
+
+        QueryOptions queryOptions = QueryOptions.create(ConsistencyLevel.ALL,
+                                                        Collections.singletonList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 })),
+                                                        false,
+                                                        5000,
+                                                        Util.makeSomePagingState(version),
+                                                        specifiedSerialConsistency,
+                                                        version,
+                                                        null);
+        ByteBuf buf = Unpooled.buffer(QueryOptions.codec.encodedSize(queryOptions, version));
+        QueryOptions.codec.encode(queryOptions, buf, version);
+        QueryOptions decodedOptions = QueryOptions.codec.decode(buf, version);
+        assertEquals(expectedDecodedSerialConsistency, decodedOptions.getSerialConsistency(null));
+
+        DatabaseDescriptor.getGuardrailsConfig().write_consistency_levels_disallowed = ImmutableSet.copyOf(previousConsistencyLevels);
+    }
+
     // return utf8 string that contains no ascii chars
     public static String randomUTF8(int count)
     {

From b6ca4e39623c8a599ac009e1fc22484750e22b34 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Tue, 8 Jun 2021 19:38:05 +0200
Subject: [PATCH 097/151] STAR-403: Implement basic SSTable read/write
 benchmarks (#187)

* STAR-403: Implement simple SSTable benchmarks

* Apply review comments

* STAR-403: Fix WriteTest and ReadWriteTest

* STAR-403: Fix SSTableReaderBench - it was too heavy

Co-authored-by: Jacek Lewandowski <lewandowski.jacek@gmail.com>
(cherry picked from commit 2568ab50a47319c34964642744840529d13a6558)
---
 .../test/microbench/ReadWriteTest.java        |   2 +
 .../test/microbench/instance/WriteTest.java   |  19 +-
 .../sstable/AbstractSSTableBench.java         | 120 +++++++++
 .../sstable/SSTableReaderBench.java           | 228 ++++++++++++++++++
 .../sstable/SSTableWriterBench.java           | 130 ++++++++++
 5 files changed, 491 insertions(+), 8 deletions(-)
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/sstable/AbstractSSTableBench.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java

diff --git a/test/microbench/org/apache/cassandra/test/microbench/ReadWriteTest.java b/test/microbench/org/apache/cassandra/test/microbench/ReadWriteTest.java
index 066c2894f9e6..d8c600a3ea1b 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/ReadWriteTest.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/ReadWriteTest.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.commitlog.CommitLog;
 import org.openjdk.jmh.annotations.*;
 
 @BenchmarkMode(Mode.Throughput)
@@ -65,6 +66,7 @@ public void setup() throws Throwable
     @TearDown(Level.Trial)
     public void teardown() throws IOException, ExecutionException, InterruptedException
     {
+        CommitLog.instance.shutdownBlocking();
         CQLTester.cleanup();
     }
 
diff --git a/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
index d6a21097aeb4..b2538e7f2bcd 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/instance/WriteTest.java
@@ -40,6 +40,8 @@
 import org.apache.cassandra.db.memtable.Memtable;
 import org.apache.cassandra.utils.FBUtilities;
 import org.openjdk.jmh.annotations.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
@@ -50,6 +52,8 @@
 @State(Scope.Benchmark)
 public class WriteTest extends CQLTester
 {
+    private final static Logger logger = LoggerFactory.getLogger(WriteTest.class);
+
     static String keyspace;
     String table;
     ColumnFamilyStore cfs;
@@ -63,8 +67,8 @@ public enum EndOp
         INMEM, TRUNCATE, FLUSH
     }
 
-    @Param({"1000000"})
-    int count = 1_000_000;
+    @Param({"100000"})
+    int count = 100_000;
 
     @Param({"INMEM", "TRUNCATE", "FLUSH"})
     EndOp flush = EndOp.INMEM;
@@ -87,10 +91,9 @@ public void setup() throws Throwable
     {
         rand = new Random(1);
         executorService = Executors.newFixedThreadPool(threadCount);
-        CQLTester.setUpClass();
-        CQLTester.prepareServer();
         DatabaseDescriptor.setAutoSnapshot(false);
-        System.err.println("setupClass done.");
+        CQLTester.setUpClass();
+        logger.info("setupClass done.");
         String memtableSetup = "";
         if (!memtableClass.isEmpty())
             memtableSetup = String.format(" AND memtable = { 'class': '%s' }", memtableClass);
@@ -106,8 +109,8 @@ public void setup() throws Throwable
             executeNet(getDefaultVersion(), "use " + keyspace + ";");
         }
         writeStatement = "INSERT INTO " + table + "(userid,picid,commentid)VALUES(?,?,?)";
-        System.err.println("Prepared, batch " + BATCH + " threads " + threadCount + " flush " + flush);
-        System.err.println("Disk access mode " + DatabaseDescriptor.getDiskAccessMode() +
+        logger.info("Prepared, batch " + BATCH + " threads " + threadCount + " flush " + flush);
+        logger.info("Disk access mode " + DatabaseDescriptor.getDiskAccessMode() +
                            " index " + DatabaseDescriptor.getIndexAccessMode());
 
         cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
@@ -233,7 +236,7 @@ public void teardown() throws InterruptedException
         executorService.awaitTermination(15, TimeUnit.SECONDS);
         Memtable memtable = cfs.getTracker().getView().getCurrentMemtable();
         Memtable.MemoryUsage usage = Memtable.getMemoryUsage(memtable);
-        System.err.format("\n%s in %s mode: %d ops, %s serialized bytes, %s\n",
+        logger.info("\n{} in {} mode: {} ops, {} serialized bytes, {}\n",
                           memtable.getClass().getSimpleName(),
                           DatabaseDescriptor.getMemtableAllocationType(),
                           memtable.getOperations(),
diff --git a/test/microbench/org/apache/cassandra/test/microbench/sstable/AbstractSSTableBench.java b/test/microbench/org/apache/cassandra/test/microbench/sstable/AbstractSSTableBench.java
new file mode 100644
index 000000000000..103a503da832
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/sstable/AbstractSSTableBench.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.sstable;
+
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.ServerTestUtils;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.marshal.BytesType;
+import org.apache.cassandra.db.rows.EncodingStats;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+public abstract class AbstractSSTableBench
+{
+    private final static Logger logger = LoggerFactory.getLogger(AbstractSSTableBench.class);
+
+    public static final String KEYSPACE = "SSTableWriterBench";
+    public static final String TABLE = "table";
+    public static final String TABLE_WITH_CLUSTERING = "table_with_clustering";
+
+    public SSTableFormat getFormat(String formatName)
+    {
+        return SSTableFormat.Type.valueOf(formatName).info;
+    }
+
+    public Keyspace prepareMetadata()
+    {
+        ServerTestUtils.daemonInitialization();
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KEYSPACE,
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE, 0, BytesType.instance, BytesType.instance, BytesType.instance),
+                                    SchemaLoader.standardCFMD(KEYSPACE, TABLE_WITH_CLUSTERING, 0, BytesType.instance, BytesType.instance, BytesType.instance));
+
+        CommitLog.instance.stopUnsafe(true);
+        return Keyspace.open(KEYSPACE);
+    }
+
+    /**
+     * Decorates one {@code keySize}-byte key per integer in {@code [min; max)} and returns the keys sorted.
+     */
+    public DecoratedKey[] prepareDecoratedKeys(int min, int max, int keySize)
+    {
+        final int count = max - min;
+        final DecoratedKey[] decorated = new DecoratedKey[count];
+        for (int offset = 0; offset < count; offset++)
+        {
+            ByteBuffer raw = ByteBuffer.allocate(keySize).putInt(0, min + offset);
+            decorated[offset] = Murmur3Partitioner.instance.decorateKey(raw.duplicate());
+        }
+        // Callers receive the keys in DecoratedKey's natural (Comparable) order.
+        Arrays.sort(decorated);
+        return decorated;
+    }
+
+    /**
+     * Allocates one {@code KEY_SIZE}-byte clustering key buffer per integer in {@code [min; max)}.
+     */
+    public ByteBuffer[] prepareBuffers(int min, int max, int KEY_SIZE)
+    {
+        final int count = max - min;
+        final ByteBuffer[] buffers = new ByteBuffer[count];
+        for (int offset = 0; offset < count; offset++)
+        {
+            // Absolute put: the buffer's position stays at 0, so the whole buffer remains readable.
+            buffers[offset] = ByteBuffer.allocate(KEY_SIZE).putInt(0, min + offset);
+        }
+        return buffers;
+    }
+
+    public SSTableWriter createWriter(ColumnFamilyStore table, SSTableFormat format, LifecycleTransaction txn) throws Exception
+    {
+        Path tableDir = Files.createTempDirectory(getClass().getSimpleName());
+        Descriptor desc = table.newSSTableDescriptor(tableDir.toFile(), format.getType());
+
+        return SSTableWriter.create(desc,
+                                    0,
+                                    0,
+                                    null,
+                                    false,
+                                    new SerializationHeader(true, table.metadata(), table.metadata().regularAndStaticColumns(), EncodingStats.NO_STATS),
+                                    table.indexManager.listIndexGroups(),
+                                    txn);
+    }
+
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
new file mode 100644
index 000000000000..a0ea19ced63c
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.sstable;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import org.apache.commons.math3.primes.Primes;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.UpdateBuilder;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.SSTable;
+import org.apache.cassandra.io.sstable.format.PartitionIndexIterator;
+import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.util.FileUtils;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 1, time = 5)
+@Measurement(iterations = 3, time = 5)
+@Fork(value = 1, jvmArgsAppend = "-Xmx4G")
+@Threads(1)
+@State(Scope.Benchmark)
+public class SSTableReaderBench extends AbstractSSTableBench
+{
+    private final static Logger logger = LoggerFactory.getLogger(SSTableReaderBench.class);
+
+    int KEY_SIZE = 8;
+    int P_KEYS = 2 << 14;
+    int C_KEYS = 2 << 10;
+    int VAL_SIZE = 1;
+
+    public ByteBuffer[] ckeys;
+    public DecoratedKey[] pkeys;
+    public DecoratedKey[] nonpkeys;
+
+    private SSTableReader sstr;
+    private SSTableWriter sstw;
+
+    private int idx = 0;
+
+    private final int step = P_KEYS / 2 - 1;
+
+    @Param({ "table", "table_with_clustering" })
+    public String tableName;
+
+    @Param({ "BIG", "BTI" })
+    public String formatName;
+    private ColumnFamilyStore table;
+    private LifecycleTransaction txn;
+
+    @Setup(Level.Trial)
+    public void setup() throws Exception
+    {
+        assert Integer.highestOneBit(P_KEYS) == Integer.lowestOneBit(P_KEYS);
+        assert Integer.highestOneBit(C_KEYS) == Integer.lowestOneBit(C_KEYS);
+        Keyspace ks = prepareMetadata();
+        table = ks.getColumnFamilyStore(tableName);
+        pkeys = prepareDecoratedKeys(0, P_KEYS, KEY_SIZE);
+        nonpkeys = prepareDecoratedKeys(P_KEYS, P_KEYS * 2, KEY_SIZE);
+        ckeys = prepareBuffers(0, C_KEYS, KEY_SIZE);
+
+        txn = LifecycleTransaction.offline(OperationType.WRITE);
+        sstw = prepareTable(getFormat(formatName), table, txn);
+    }
+
+    @Setup(Level.Iteration)
+    public void setupIteration() throws Exception
+    {
+        sstr = prepareReader(getFormat(formatName), sstw);
+    }
+
+    /**
+     * Quasi-random walk over the keys: moves forward by a little less than half of them, wrapping around.
+     */
+    private int nextIdx()
+    {
+        int advanced = idx + step;
+        idx = advanced < P_KEYS ? advanced : advanced - P_KEYS;
+        return idx;
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public void getEQPosition()
+    {
+        sstr.getPosition(pkeys[nextIdx()], SSTableReader.Operator.EQ);
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public void getEQPositionNonExisting()
+    {
+        sstr.getPosition(nonpkeys[nextIdx()], SSTableReader.Operator.EQ);
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public void getGTPosition()
+    {
+        sstr.getPosition(pkeys[nextIdx()], SSTableReader.Operator.GT);
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.Throughput)
+    public void getGTPositionNonExisting()
+    {
+        sstr.getPosition(nonpkeys[nextIdx()], SSTableReader.Operator.GT);
+    }
+
+    @Benchmark
+    @BenchmarkMode(Mode.AverageTime)
+    public void iterateOverAllKeys() throws Exception
+    {
+        try (PartitionIndexIterator it = sstr.allKeysIterator())
+        {
+            while (!it.isExhausted()) it.advance();
+        }
+    }
+
+    /** Reads every row of every partition, closing each partition iterator once it has been drained. */
+    @Benchmark
+    @BenchmarkMode(Mode.AverageTime)
+    public void fullScanTest()
+    {
+        try (ISSTableScanner scanner = sstr.getScanner())
+        {
+            while (scanner.hasNext())
+            {
+                try (UnfilteredRowIterator rowIt = scanner.next())
+                {
+                    while (rowIt.hasNext()) rowIt.next();
+                }
+            }
+        }
+    }
+
+    @TearDown(Level.Iteration)
+    public void tearDownIteration()
+    {
+        sstr.selfRef().release();
+    }
+
+    @TearDown(Level.Trial)
+    public void tearDown()
+    {
+        txn.finish();
+        txn.close();
+        FileUtils.deleteRecursive(sstr.descriptor.directory);
+    }
+
+    private SSTableWriter prepareTable(SSTableFormat format, ColumnFamilyStore table, LifecycleTransaction txn) throws Exception
+    {
+        try (SSTableWriter tableWriter = createWriter(table, format, txn))
+        {
+            for (int i = 0; i < P_KEYS; i++)
+            {
+                UpdateBuilder builder = UpdateBuilder.create(table.metadata(), pkeys[i].getKey().duplicate()).withTimestamp(1);
+                if (table.metadata().clusteringColumns().isEmpty())
+                    builder.newRow().add("val", ByteBuffer.allocate(VAL_SIZE));
+                else
+                    for (int j = 0; j < C_KEYS; j++)
+                        builder.newRow(ckeys[j].duplicate()).add("val", ByteBuffer.allocate(VAL_SIZE));
+
+                tableWriter.append(builder.build().unfilteredIterator());
+            }
+
+            tableWriter.prepareToCommit();
+            Throwable t = tableWriter.commit(null);
+            if (t != null)
+                throw new Exception(t);
+
+            logger.info("Created the following files: \n{}", Arrays.stream(tableWriter.descriptor.directory.listFiles())
+                                                                   .map(f -> f.getName() + " - " + FileUtils.stringifyFileSize(f.length()))
+                                                                   .collect(Collectors.joining("\n")));
+
+            return tableWriter;
+        }
+    }
+
+    private SSTableReader prepareReader(SSTableFormat format, SSTableWriter tableWriter)
+    {
+        return format.getReaderFactory().open(tableWriter.descriptor, SSTable.componentsFor(tableWriter.descriptor), table.metadata, false, true);
+    }
+}
+
+
diff --git a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java
new file mode 100644
index 000000000000..529f64699361
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.sstable;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.UpdateBuilder;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.cassandra.schema.TableMetadata;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Threads;
+import org.openjdk.jmh.annotations.Warmup;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@Warmup(iterations = 1, time = 5)
+@Measurement(iterations = 3, time = 5)
+@Fork(value = 1, jvmArgsAppend = "-Xmx4G")
+@Threads(1)
+@State(Scope.Benchmark)
+public class SSTableWriterBench extends AbstractSSTableBench
+{
+    @Param({ "table", "table_with_clustering" })
+    public String tableName;
+
+    @Param({ "BIG", "BTI" })
+    public String formatName;
+
+    int KEY_SIZE = 8;
+    int P_KEYS = 1000;
+    int C_KEYS = 1000;
+    int VAL_SIZE = 1;
+
+    public ByteBuffer[] ckeys = new ByteBuffer[C_KEYS];
+    public DecoratedKey[] pkeys = new DecoratedKey[P_KEYS];
+
+    private SSTableWriter tableWriter;
+    private TableMetadata tableMetadata;
+    boolean hasClustering = false;
+    private ColumnFamilyStore table;
+    private LifecycleTransaction txn;
+
+    /**
+     * Runs once per trial: prepares the keyspace metadata, builds the partition keys and, when the
+     * selected table has clustering columns, the clustering keys.
+     */
+    @Setup(Level.Trial)
+    public void setupTrial()
+    {
+        Keyspace ks = prepareMetadata();
+        pkeys = prepareDecoratedKeys(0, P_KEYS, KEY_SIZE);
+
+        table = ks.getColumnFamilyStore(tableName);
+        tableMetadata = table.metadata();
+        hasClustering = !tableMetadata.clusteringColumns().isEmpty();
+
+        // Inherited helper: C_KEYS buffers of KEY_SIZE bytes, each holding its own index at offset 0.
+        if (hasClustering)
+        {
+            ckeys = prepareBuffers(0, C_KEYS, KEY_SIZE);
+        }
+    }
+
+    @Setup(Level.Invocation)
+    public void setupInvocation() throws Exception
+    {
+        txn = LifecycleTransaction.offline(OperationType.WRITE);
+        tableWriter = createWriter(table, getFormat(formatName), txn);
+    }
+
+    @Benchmark
+    public void writeWithClusteringTest()
+    {
+        for (int i = 0; i < P_KEYS; i++)
+        {
+            UpdateBuilder builder = UpdateBuilder.create(tableMetadata, pkeys[i].getKey().duplicate()).withTimestamp(1);
+            if (hasClustering)
+                for (int j = 0; j < C_KEYS; j++)
+                    builder.newRow(ckeys[j].duplicate()).add("val", ByteBuffer.allocate(VAL_SIZE));
+            else
+                builder.newRow().add("val", ByteBuffer.allocate(VAL_SIZE));
+
+            tableWriter.append(builder.build().unfilteredIterator());
+        }
+    }
+
+    @TearDown(Level.Invocation)
+    public void tearDown()
+    {
+        tableWriter.abort();
+        tableWriter.close();
+        txn.close();
+
+        FileUtils.deleteRecursive(tableWriter.descriptor.directory);
+    }
+}

From b708759218177d652a3cdb1535888123c96098dd Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Tue, 8 Jun 2021 16:56:03 -0700
Subject: [PATCH 098/151] STAR-654 Fix permissions race in
 GuardrailDiskUsageTest.testWriteRequests (#183)

(cherry picked from commit 019918125b1b5a2ba2eb7abb7c5d0c4f636061f5)
---
 .../cassandra/guardrails/GuardrailDiskUsageTest.java   |  3 ---
 .../apache/cassandra/guardrails/GuardrailTester.java   | 10 ++++++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
index 83cf70191db5..97f08a10b081 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailDiskUsageTest.java
@@ -336,9 +336,6 @@ public void testWriteRequests() throws Throwable
             executeNet(batchStatement);
         };
 
-        // delay needed after grant permissions
-        Thread.sleep(1000);
-
         // default state, write request works fine
         assertTrue(Guardrails.ready());
         assertValid(select);
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
index 36d5b5b1516f..39309a112467 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
@@ -21,6 +21,7 @@
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.CopyOnWriteArrayList;
+import java.util.concurrent.TimeUnit;
 import java.util.function.BiConsumer;
 import javax.annotation.Nullable;
 
@@ -31,6 +32,7 @@
 import org.junit.BeforeClass;
 
 import com.datastax.driver.core.Statement;
+import com.datastax.driver.core.SimpleStatement;
 import com.datastax.driver.core.exceptions.InvalidQueryException;
 import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -38,6 +40,7 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
+import org.awaitility.Awaitility;
 
 import static java.lang.String.format;
 import static org.junit.Assert.assertEquals;
@@ -80,6 +83,13 @@ public void beforeGuardrailTest() throws Throwable
         executeNet(format("CREATE USER IF NOT EXISTS %s WITH PASSWORD '%s'", USERNAME, PASSWORD));
         executeNet(format("GRANT ALL ON KEYSPACE %s TO %s", KEYSPACE, USERNAME));
 
+        // Make sure keyspace permissions have been applied
+        Awaitility.await()
+                  .atMost(10, TimeUnit.SECONDS)
+                  .with()
+                  .pollInterval(500, TimeUnit.MILLISECONDS)
+                  .until(() -> !executeNet(new SimpleStatement("LIST ALL OF " + USERNAME)).all().isEmpty());
+        
         useUser(USERNAME, PASSWORD);
 
         listener = new TestListener(null);

From 0a42d630495f8158625a311aae04757f872f5abd Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Wed, 9 Jun 2021 11:24:46 +0100
Subject: [PATCH 099/151] STAR-786: Migrate SAI microbenchmark tests to
 Stargazer

(cherry picked from commit b12ffc951af4f5ae5bf6b52779750fa3e28c7e28)
---
 .../index/sai/disk/v1/PostingsWriter.java     |   2 +-
 .../AbstractTrieMemoryIndexBenchmark.java     | 136 ++++++++++++++
 .../memory/ReadTrieMemoryIndexBenchmark.java  | 141 +++++++++++++++
 .../memory/WriteTrieMemoryIndexBenchmark.java | 104 +++++++++++
 .../index/sai/v1/AbstractOnDiskBenchmark.java | 169 ++++++++++++++++++
 .../sai/v1/BlockPackedReaderBenchmark.java    | 115 ++++++++++++
 .../sai/v1/MergePostingListBenchmark.java     |  98 ++++++++++
 .../index/sai/v1/PostingsReaderBenchmark.java | 121 +++++++++++++
 8 files changed, 885 insertions(+), 1 deletion(-)
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/AbstractTrieMemoryIndexBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/ReadTrieMemoryIndexBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/WriteTrieMemoryIndexBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/AbstractOnDiskBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/BlockPackedReaderBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/MergePostingListBenchmark.java
 create mode 100644 test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/PostingsReaderBenchmark.java

diff --git a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
index 2bfe5eca33ee..b86003dbc815 100644
--- a/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
+++ b/src/java/org/apache/cassandra/index/sai/disk/v1/PostingsWriter.java
@@ -100,7 +100,7 @@ public class PostingsWriter implements Closeable
     private long totalPostings;
 
     @VisibleForTesting
-    PostingsWriter(IndexComponents components, boolean segmented) throws IOException
+    public PostingsWriter(IndexComponents components, boolean segmented) throws IOException
     {
         this(components, BLOCK_SIZE, segmented);
     }
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/AbstractTrieMemoryIndexBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/AbstractTrieMemoryIndexBenchmark.java
new file mode 100644
index 000000000000..b81967eca1ef
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/AbstractTrieMemoryIndexBenchmark.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.memory;
+
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+import java.util.function.LongConsumer;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.schema.IndexTarget;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.db.marshal.UUIDType;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.ColumnContext;
+import org.apache.cassandra.index.sai.StorageAttachedIndex;
+import org.apache.cassandra.index.sai.analyzer.NonTokenizingOptions;
+import org.apache.cassandra.index.sai.memory.MemoryIndex;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.IndexMetadata;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.UUIDGen;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Setup;
+
+public abstract class AbstractTrieMemoryIndexBenchmark
+{
+    private static final String KEYSPACE = "test_keyspace";
+    private static final String TABLE = "test_table";
+    private static final String PARTITION_KEY = "key";
+    private static final String STRING_COLUMN = "string";
+    private static final String STRING_INDEX = "string_index";
+    private static final String INTEGER_COLUMN = "integer";
+    private static final String INTEGER_INDEX = "integer_index";
+    private static final int RANDOM_STRING_SIZE = 64 * 1024 * 1024;
+
+    private char[] randomChars = new char[RANDOM_STRING_SIZE];
+
+    protected int randomSeed;
+
+    protected ColumnContext stringContext;
+    protected ColumnContext integerContext;
+
+    protected MemoryIndex stringIndex;
+    protected MemoryIndex integerIndex;
+
+    protected ByteBuffer[] stringTerms;
+    protected ByteBuffer[] integerTerms;
+    protected DecoratedKey[] partitionKeys;
+
+    @Setup(Level.Trial)
+    public void initialiseConfig()
+    {
+        DatabaseDescriptor.daemonInitialization();
+        Random random = new Random();
+        randomSeed = random.nextInt();
+        for (int i = 0; i < RANDOM_STRING_SIZE; i++)
+        {
+            randomChars[i] = (char)('a' + random.nextInt(26));
+        }
+
+        ColumnMetadata string = ColumnMetadata.regularColumn(KEYSPACE, TABLE, STRING_COLUMN, UTF8Type.instance);
+        ColumnMetadata integer = ColumnMetadata.regularColumn(KEYSPACE, TABLE, INTEGER_COLUMN, Int32Type.instance);
+        TableMetadata table = TableMetadata.builder(KEYSPACE, TABLE)
+                .addPartitionKeyColumn(PARTITION_KEY, UTF8Type.instance)
+                .addRegularColumn(STRING_COLUMN, UTF8Type.instance)
+                .addRegularColumn(INTEGER_COLUMN, Int32Type.instance)
+                .partitioner(Murmur3Partitioner.instance)
+                .caching(CachingParams.CACHE_NOTHING)
+                .build();
+
+        Map<String, String> stringOptions = new HashMap<>();
+        stringOptions.put(IndexTarget.CUSTOM_INDEX_OPTION_NAME, StorageAttachedIndex.class.getCanonicalName());
+        stringOptions.put(NonTokenizingOptions.CASE_SENSITIVE, "true");
+        stringOptions.put(IndexTarget.TARGET_OPTION_NAME, STRING_COLUMN);
+
+        Map<String, String> integerOptions = new HashMap<>();
+        integerOptions.put(IndexTarget.CUSTOM_INDEX_OPTION_NAME, StorageAttachedIndex.class.getCanonicalName());
+        integerOptions.put(IndexTarget.TARGET_OPTION_NAME, INTEGER_COLUMN);
+
+        IndexMetadata stringMetadata = IndexMetadata.fromSchemaMetadata(STRING_INDEX, IndexMetadata.Kind.CUSTOM, stringOptions);
+        stringContext = new ColumnContext(table, stringMetadata);
+
+        IndexMetadata integerMetadata = IndexMetadata.fromSchemaMetadata(INTEGER_INDEX, IndexMetadata.Kind.CUSTOM, integerOptions);
+        integerContext = new ColumnContext(table, integerMetadata);
+    }
+
+
+    /**
+     * Populates {@code stringTerms}, {@code integerTerms} and {@code partitionKeys}: one 64-character
+     * random string and one sequential integer per term, plus {@code numberOfTerms / rowsPerPartition} keys.
+     */
+    protected void initialiseColumnData(int numberOfTerms, int rowsPerPartition)
+    {
+        final int termLength = 64;
+        Random rng = new Random(randomSeed);
+        int keyCount = numberOfTerms / rowsPerPartition;
+
+        stringTerms = new ByteBuffer[numberOfTerms];
+        integerTerms = new ByteBuffer[numberOfTerms];
+        partitionKeys = new DecoratedKey[keyCount];
+        for (int term = 0; term < numberOfTerms; term++)
+        {
+            stringTerms[term] = UTF8Type.instance.decompose(generateRandomString(rng, termLength));
+            integerTerms[term] = Int32Type.instance.decompose(term);
+        }
+
+        for (int key = 0; key < keyCount; key++)
+            partitionKeys[key] = Murmur3Partitioner.instance.decorateKey(UUIDType.instance.decompose(UUIDGen.getTimeUUID()));
+    }
+
+    private String generateRandomString(Random random, int length)
+    {
+        return new String(randomChars, random.nextInt(RANDOM_STRING_SIZE - length), length);
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/ReadTrieMemoryIndexBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/ReadTrieMemoryIndexBenchmark.java
new file mode 100644
index 000000000000..52d8c19767f7
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/ReadTrieMemoryIndexBenchmark.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.memory;
+
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.DataRange;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.index.sai.memory.TrieMemoryIndex;
+import org.apache.cassandra.index.sai.plan.Expression;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+@Fork(1)
+@Warmup(iterations = 5, time = 3)
+@Measurement(iterations = 10, time = 3)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@BenchmarkMode(Mode.AverageTime)
+@State(Scope.Thread)
+public class ReadTrieMemoryIndexBenchmark extends AbstractTrieMemoryIndexBenchmark
+{
+    private static final int NUMBER_OF_SEARCHES = 1000;
+    private static final AbstractBounds<PartitionPosition> ALL_DATA_RANGE = DataRange.allData(Murmur3Partitioner.instance).keyRange();
+
+    @Param({ "1000", "10000", "100000", "1000000" })
+    protected int numberOfTerms;
+
+    @Param({ "1", "10", "100"})
+    protected int rowsPerPartition;
+
+    private Random random;
+    private Expression[] stringEqualityExpressions;
+    private Expression[] integerEqualityExpressions;
+    private Expression[] integerRangeExpressions;
+
+    @Setup(Level.Iteration)
+    public void initialiseIndexes()
+    {
+        initialiseColumnData(numberOfTerms, rowsPerPartition);
+        stringIndex = new TrieMemoryIndex(stringContext);
+        integerIndex = new TrieMemoryIndex(integerContext);
+
+        int rowCount = 0;
+        int keyCount = 0;
+        for (int i = 0; i < numberOfTerms; i++)
+        {
+            stringIndex.add(partitionKeys[keyCount], Clustering.EMPTY, stringTerms[i]);
+            integerIndex.add(partitionKeys[keyCount], Clustering.EMPTY, integerTerms[i]);
+            if (++rowCount == rowsPerPartition)
+            {
+                rowCount = 0;
+                keyCount++;
+            }
+        }
+        random = new Random(randomSeed);
+
+        stringEqualityExpressions =  new Expression[NUMBER_OF_SEARCHES];
+        integerEqualityExpressions  =  new Expression[NUMBER_OF_SEARCHES];
+        integerRangeExpressions = new Expression[NUMBER_OF_SEARCHES];
+
+        for (int i = 0; i < NUMBER_OF_SEARCHES; i++)
+        {
+            stringEqualityExpressions[i] = new Expression(stringContext).add(Operator.EQ, stringTerms[random.nextInt(numberOfTerms)]);
+            integerEqualityExpressions[i] = new Expression(integerContext).add(Operator.EQ, integerTerms[random.nextInt(numberOfTerms)]);
+
+            int lowerValue = random.nextInt(numberOfTerms - 10);
+
+            integerRangeExpressions[i] = new Expression(integerContext)
+            {{
+                operation = Op.RANGE;
+                lower = new Bound(Int32Type.instance.decompose(lowerValue), Int32Type.instance, true);
+                upper = new Bound(Int32Type.instance.decompose(lowerValue + 10), Int32Type.instance, true);
+            }};
+        }
+    }
+
+    @Benchmark
+    public long stringEqualityBenchmark()
+    {
+        // NOTE(review): search results are discarded and the method always returns 0 - confirm this is intended.
+        for (Expression expression : stringEqualityExpressions)
+        {
+            stringIndex.search(expression, ALL_DATA_RANGE);
+        }
+        return 0L;
+    }
+
+    @Benchmark
+    public long integerEqualityBenchmark()
+    {
+        // NOTE(review): search results are discarded and the method always returns 0 - confirm this is intended.
+        for (Expression expression : integerEqualityExpressions)
+        {
+            integerIndex.search(expression, ALL_DATA_RANGE);
+        }
+        return 0L;
+    }
+
+    @Benchmark
+    public long integerRangeBenchmark()
+    {
+        // NOTE(review): search results are discarded and the method always returns 0 - confirm this is intended.
+        for (Expression expression : integerRangeExpressions)
+        {
+            integerIndex.search(expression, ALL_DATA_RANGE);
+        }
+        return 0L;
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/WriteTrieMemoryIndexBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/WriteTrieMemoryIndexBenchmark.java
new file mode 100644
index 000000000000..6f16e69eb3f0
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/memory/WriteTrieMemoryIndexBenchmark.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.memory;
+
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.index.sai.memory.TrieMemoryIndex;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+@Fork(1)
+@Warmup(iterations = 5, time = 3)
+@Measurement(iterations = 10, time = 3)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@BenchmarkMode(Mode.AverageTime)
+@State(Scope.Thread)
+public class WriteTrieMemoryIndexBenchmark extends AbstractTrieMemoryIndexBenchmark
+{
+    @Param({ "1000", "10000", "100000", "1000000" })
+    protected int numberOfTerms;
+
+    @Param({ "1", "10", "100"})
+    protected int rowsPerPartition;
+
+    @Setup(Level.Iteration)
+    public void initialiseColumnData()
+    {
+        initialiseColumnData(numberOfTerms, rowsPerPartition);
+    }
+
+    @Setup(Level.Invocation)
+    public void initialiseIndexes()
+    {
+        stringIndex = new TrieMemoryIndex(stringContext);
+        integerIndex = new TrieMemoryIndex(integerContext);
+    }
+
+    @Benchmark
+    public long writeStringIndex()
+    {
+        long size = 0;
+        int rowCount = 0;
+        int keyCount = 0;
+        for (ByteBuffer term : stringTerms)
+        {
+            stringIndex.add(partitionKeys[keyCount], Clustering.EMPTY, term);
+            if (++rowCount == rowsPerPartition)
+            {
+                rowCount = 0;
+                keyCount++;
+            }
+            size++;
+        }
+        return size;
+    }
+
+    @Benchmark
+    public long writeIntegerIndex()
+    {
+        long size = 0;
+        int rowCount = 0;
+        int keyCount = 0;
+        for (ByteBuffer term : integerTerms)
+        {
+            integerIndex.add(partitionKeys[keyCount], Clustering.EMPTY, term);
+            if (++rowCount == rowsPerPartition)
+            {
+                rowCount = 0;
+                keyCount++;
+            }
+            size++;
+        }
+        return size;
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/AbstractOnDiskBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/AbstractOnDiskBenchmark.java
new file mode 100644
index 000000000000..c2cb9468a433
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/AbstractOnDiskBenchmark.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.v1;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.Random;
+import java.util.stream.IntStream;
+
+import org.apache.cassandra.cache.ChunkCache;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.index.sai.disk.SSTableComponentsWriter;
+import org.apache.cassandra.index.sai.disk.io.IndexComponents;
+import org.apache.cassandra.index.sai.disk.v1.BlockPackedReader;
+import org.apache.cassandra.index.sai.disk.v1.MetadataSource;
+import org.apache.cassandra.index.sai.disk.v1.PostingsReader;
+import org.apache.cassandra.index.sai.disk.v1.PostingsWriter;
+import org.apache.cassandra.index.sai.metrics.QueryEventListener;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.util.FileHandle;
+import org.apache.cassandra.io.util.FileUtils;
+import org.apache.lucene.store.IndexInput;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.TearDown;
+
+public abstract class AbstractOnDiskBenchmark
+{
+    private static Random random = new Random();
+
+    private Descriptor descriptor;
+
+    IndexComponents groupComponents;
+    private FileHandle token;
+
+    private IndexComponents indexComponents;
+    private FileHandle postings;
+    private long summaryPosition;
+
+    /**
+     * @return num of rows to be stored in per-sstable components
+     */
+    public abstract int numRows();
+
+    /**
+     * @return num of postings to be written in posting file
+     */
+    public abstract int numPostings();
+
+    /**
+     * To be called before executing each @Benchmark method
+     */
+    public abstract void beforeInvocation() throws Throwable;
+
+    /**
+     * To be called after executing each @Benchmark method
+     */
+    public abstract void afterInvocation() throws Throwable;
+
+    protected int toPosting(int id)
+    {
+        return id;
+    }
+
+    protected long toToken(long id)
+    {
+        return id * 16_013L + random.nextInt(16_000);
+    }
+
+    protected long toOffset(long id)
+    {
+        return id * 16_013L + random.nextInt(16_000);
+    }
+
+    @Setup(Level.Trial)
+    public void perTrialSetup() throws IOException
+    {
+        DatabaseDescriptor.daemonInitialization(); // required to use ChunkCache
+        assert ChunkCache.instance != null;
+
+        descriptor = new Descriptor(Files.createTempDirectory("jmh").toFile(), "ks", this.getClass().getSimpleName(), 1);
+        groupComponents = IndexComponents.perSSTable(descriptor, null);
+        indexComponents = IndexComponents.create("col", descriptor, null);
+
+        // write per-sstable components: token and offset
+        writeSSTableComponents(numRows());
+        token = groupComponents.createFileHandle(IndexComponents.TOKEN_VALUES);
+
+        // write postings
+        summaryPosition = writePostings(numPostings());
+        postings = indexComponents.createFileHandle(indexComponents.postingLists);
+    }
+
+    @TearDown(Level.Trial)
+    public void perTrialTearDown()
+    {
+        token.close();
+        postings.close();
+        FileUtils.deleteRecursive(descriptor.directory);
+    }
+
+    @Setup(Level.Invocation)
+    public void perInvocationSetup() throws Throwable
+    {
+        beforeInvocation();
+    }
+
+    @TearDown(Level.Invocation)
+    public void perInvocationTearDown() throws Throwable
+    {
+        afterInvocation();
+    }
+
+    private long writePostings(int rows) throws IOException
+    {
+        final int[] postings = IntStream.range(0, rows).map(this::toPosting).toArray();
+        final ArrayPostingList postingList = new ArrayPostingList(postings);
+
+        try (PostingsWriter writer = new PostingsWriter(indexComponents, false))
+        {
+            long summaryPosition = writer.write(postingList);
+            writer.complete();
+
+            return summaryPosition;
+        }
+    }
+
+    protected final PostingsReader openPostingsReader() throws IOException
+    {
+        IndexInput input = indexComponents.openInput(postings);
+        IndexInput summaryInput = indexComponents.openInput(postings);
+
+        PostingsReader.BlocksSummary summary = new PostingsReader.BlocksSummary(summaryInput, summaryPosition);
+        return new PostingsReader(input, summary, QueryEventListener.PostingListEventListener.NO_OP);
+    }
+
+    private void writeSSTableComponents(int rows) throws IOException
+    {
+        SSTableComponentsWriter writer = new SSTableComponentsWriter(descriptor, null);
+        for (int i = 0; i < rows; i++)
+            writer.recordCurrentTokenOffset(toToken(i), toOffset(i));
+
+        writer.complete();
+    }
+
+    protected final LongArray openRowIdToTokenReader() throws IOException
+    {
+        MetadataSource source = MetadataSource.loadGroupMetadata(groupComponents);
+        return new BlockPackedReader(token, IndexComponents.TOKEN_VALUES, groupComponents, source).open();
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/BlockPackedReaderBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/BlockPackedReaderBenchmark.java
new file mode 100644
index 000000000000..f519ed9bde8b
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/BlockPackedReaderBenchmark.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.v1;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.apache.cassandra.io.util.Rebufferer;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+@Fork(value = 1, jvmArgsAppend = {
+        //        "-XX:+UnlockCommercialFeatures", "-XX:+FlightRecorder","-XX:+UnlockDiagnosticVMOptions", "-XX:+DebugNonSafepoints",
+        //        "-XX:StartFlightRecording=duration=60s,filename=./BlockPackedReaderBenchmark.jfr,name=profile,settings=profile",
+        //                            "-XX:FlightRecorderOptions=settings=/home/jake/workspace/cassandra/profiling-advanced.jfc,samplethreads=true"
+})
+@Warmup(iterations = 3)
+@Measurement(iterations = 5, timeUnit = TimeUnit.NANOSECONDS)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+public class BlockPackedReaderBenchmark extends AbstractOnDiskBenchmark
+{
+    private static final int NUM_INVOCATIONS = 10_000;
+
+    @Param({ "1", "10", "100", "1000"})
+    public int skippingDistance;
+
+    protected LongArray rowIdToToken;
+    private int[] rowIds;
+    private long[] tokenValues;
+
+    @Override
+    public int numRows()
+    {
+        return 10_000_000;
+    }
+
+    @Override
+    public int numPostings()
+    {
+        return 10_000_000;
+    }
+
+    @Override
+    public void beforeInvocation() throws Throwable
+    {
+        // rowIdToToken.findTokenRowID keeps track of last position, so it must be per-benchmark-method-invocation.
+        rowIdToToken = openRowIdToTokenReader();
+
+        rowIds = new int[NUM_INVOCATIONS];
+        tokenValues = new long[NUM_INVOCATIONS];
+
+        for (int i = 0; i < rowIds.length; i++)
+        {
+            rowIds[i] = toPosting(i * skippingDistance);
+            tokenValues[i] = toToken(rowIds[i]);
+        }
+    }
+
+    @Override
+    public void afterInvocation() throws Throwable
+    {
+        rowIdToToken.close();
+    }
+
+    @Benchmark
+    @OperationsPerInvocation(NUM_INVOCATIONS)
+    @BenchmarkMode({ Mode.Throughput, Mode.AverageTime })
+    public void get(Blackhole bh)
+    {
+        // One lookup per row id prepared in beforeInvocation(); each result is handed to the Blackhole.
+        for (int rowId : rowIds)
+        {
+            bh.consume(rowIdToToken.get(rowId));
+        }
+    }
+
+    @Benchmark
+    @OperationsPerInvocation(NUM_INVOCATIONS)
+    @BenchmarkMode({ Mode.Throughput, Mode.AverageTime })
+    public void findTokenRowID(Blackhole bh)
+    {
+        // One search per token prepared in beforeInvocation(); each result is handed to the Blackhole.
+        for (long tokenValue : tokenValues)
+        {
+            bh.consume(rowIdToToken.findTokenRowID(tokenValue));
+        }
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/MergePostingListBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/MergePostingListBenchmark.java
new file mode 100644
index 000000000000..c3cdf7b04345
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/MergePostingListBenchmark.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.v1;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.PriorityQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import com.google.common.primitives.Ints;
+
+import org.apache.cassandra.index.sai.disk.PostingList;
+import org.apache.cassandra.index.sai.disk.v1.MergePostingList;
+import org.apache.cassandra.index.sai.utils.ArrayPostingList;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+@Fork(1)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5)
+@State(Scope.Thread)
+public class MergePostingListBenchmark
+{
+    List<int[]> splitPostingLists = new ArrayList<>();
+    PostingList merge;
+
+    @Setup(Level.Trial)
+    public void generatePostings()
+    {
+        final AtomicInteger rowId = new AtomicInteger();
+        final int[] postings = IntStream.generate(() -> rowId.addAndGet(7))
+                                        .limit(1_000_000)
+                                        .toArray();
+
+        // split postings into multiple lists
+        final Map<Integer, List<Integer>> splitPostings = Arrays.stream(postings)
+                                                                .boxed()
+                                                                .collect(Collectors.groupingBy(it -> it % 1000));
+
+        for (List<Integer> split : splitPostings.values())
+        {
+            splitPostingLists.add(Ints.toArray(split));
+        }
+    }
+
+    @Setup(Level.Invocation)
+    public void mergePostings()
+    {
+        final PriorityQueue<PostingList.PeekablePostingList> lists = new PriorityQueue<>(Comparator.comparingLong(PostingList.PeekablePostingList::peek));
+        for (int[] postings : splitPostingLists)
+        {
+            lists.add(new ArrayPostingList(postings).peekable());
+        }
+        merge = MergePostingList.merge(lists);
+    }
+
+    @Benchmark
+    @BenchmarkMode({ Mode.All })
+    public void nextPostingIteration(Blackhole bh) throws IOException
+    {
+        long id;
+        while ((id = merge.nextPosting()) != PostingList.END_OF_STREAM)
+        {
+            bh.consume(id);
+        }
+    }
+}
diff --git a/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/PostingsReaderBenchmark.java b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/PostingsReaderBenchmark.java
new file mode 100644
index 000000000000..54a3d29e724f
--- /dev/null
+++ b/test/microbench/org/apache/cassandra/test/microbench/index/sai/v1/PostingsReaderBenchmark.java
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.test.microbench.index.sai.v1;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.index.sai.disk.v1.PostingsReader;
+import org.apache.cassandra.index.sai.utils.LongArray;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OperationsPerInvocation;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+
+@Fork(1)
+@Warmup(iterations = 3)
+@Measurement(iterations = 5, timeUnit = TimeUnit.MILLISECONDS)
+@OutputTimeUnit(TimeUnit.MICROSECONDS)
+@State(Scope.Thread)
+public class PostingsReaderBenchmark extends AbstractOnDiskBenchmark
+{
+    private static final int NUM_INVOCATIONS = 10_000;
+
+    @Param({ "1", "10", "100", "1000"})
+    public int skippingDistance;
+
+    protected LongArray rowIdToToken;
+    protected PostingsReader reader;
+    private int[] rowIds;
+    protected long[] tokenValues;
+
+    @Override
+    public int numRows()
+    {
+        return 10_000_000;
+    }
+
+    @Override
+    public int numPostings()
+    {
+        return 10_000_000;
+    }
+
+    @Override
+    public void beforeInvocation() throws Throwable
+    {
+        // rowIdToToken.findTokenRowID keeps track of last position, so it must be per-benchmark-method-invocation.
+        rowIdToToken = openRowIdToTokenReader();
+        reader = openPostingsReader();
+
+        tokenValues = new long[NUM_INVOCATIONS];
+        rowIds = new int[NUM_INVOCATIONS];
+        for (int i = 0; i < tokenValues.length; i++)
+        {
+            rowIds[i] = toPosting(i * skippingDistance);
+            tokenValues[i] = toToken(i * skippingDistance);
+        }
+    }
+
+    @Override
+    public void afterInvocation() throws Throwable
+    {
+        rowIdToToken.close();
+        reader.close();
+    }
+
+    @Benchmark
+    @OperationsPerInvocation(NUM_INVOCATIONS)
+    @BenchmarkMode({ Mode.Throughput, Mode.AverageTime })
+    public void skipAndRequestNext(Blackhole bh) throws Throwable
+    {
+        // For every token prepared in beforeInvocation(): resolve it to a row id, then advance the
+        // postings reader to that row id. The row id is looked up afresh for each token.
+        for (long token : tokenValues)
+        {
+            // Step 1: token -> row id.
+            int rowId = (int) rowIdToToken.findTokenRowID(token);
+
+            // Step 2: advance the reader to that row id and hand the result to the Blackhole.
+            bh.consume(reader.advance(rowId));
+
+        }
+    }
+
+    @Benchmark
+    @OperationsPerInvocation(NUM_INVOCATIONS)
+    @BenchmarkMode({ Mode.Throughput, Mode.AverageTime })
+    public void advance(Blackhole bh) throws Throwable
+    {
+        // Advance the postings reader to each row id prepared in beforeInvocation(), in array order.
+        // rowIds and tokenValues are both allocated with NUM_INVOCATIONS entries there.
+        for (int rowId : rowIds)
+        {
+            // Hand the result to the Blackhole.
+            bh.consume(reader.advance(rowId));
+        }
+    }
+}

From cfee60b432ac0d944f6c9ea2e3843a66c44a8e2b Mon Sep 17 00:00:00 2001
From: Jaroslaw Grabowski <jaroslaw.grabowski@datastax.com>
Date: Fri, 11 Jun 2021 22:27:15 +0200
Subject: [PATCH 100/151] STAR-766 fix testGetScannerForNoIntersectingRanges

Since CASSANDRA-5249 getScanner returns
EmptySSTableScanner when there are no intersecting ranges.
Empty scanner always returns false from hasNext().

Why did testGetScannerForNoIntersectingRanges pass even though
hasNext() returns false? Another test had created a second
sstable for which the scanner was not empty, and thus
the assertion passed. In other words, the test failed when
executed in isolation.

This commit creates a dedicated table for the test to avoid
other test interference. Additionally the number of sstables
is verified in the test.
Instead of an assertion based on hasNext, an assertion on
EmptySSTableScanner is made.

(cherry picked from commit c097960cd4445e3d1243402a4d86ce952a3e6d9f)
---
 .../io/sstable/SSTableReaderTest.java         | 30 ++++++++++---------
 ... fix testGetScannerForNoIntersectingRanges | 18 +++++++++++
 2 files changed, 34 insertions(+), 14 deletions(-)
 create mode 100644 update-history/STAR-801/3-c097960cd4 STAR-766 fix testGetScannerForNoIntersectingRanges

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 4eac572f204a..4328899abf86 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -66,6 +66,8 @@
 import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
 import org.apache.cassandra.io.sstable.metadata.MetadataType;
 import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+import org.apache.cassandra.io.sstable.format.big.BigTableScanner;
+import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexScanner;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.MmappedRegions;
 import org.apache.cassandra.schema.CachingParams;
@@ -83,10 +85,12 @@
 
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.apache.cassandra.io.sstable.format.SSTableReader.selectOnlyBigTableReaders;
+import static org.hamcrest.Matchers.instanceOf;
 import static org.hamcrest.Matchers.is;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
@@ -95,6 +99,7 @@ public class SSTableReaderTest
     public static final String KEYSPACE1 = "SSTableReaderTest";
     public static final String CF_STANDARD = "Standard1";
     public static final String CF_STANDARD2 = "Standard2";
+    public static final String CF_STANDARD3 = "Standard3";
     public static final String CF_COMPRESSED = "Compressed";
     public static final String CF_INDEXED = "Indexed1";
     public static final String CF_STANDARDLOWINDEXINTERVAL = "StandardLowIndexInterval";
@@ -117,6 +122,7 @@ public static void defineSchema() throws Exception
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD2)
                                                 .minIndexInterval(8)
                                                 .maxIndexInterval(8),  // ensure close key count estimation
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD3),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_COMPRESSED).compression(CompressionParams.DEFAULT),
                                     SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED, true),
                                     SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARDLOWINDEXINTERVAL)
@@ -603,7 +609,7 @@ public void testLoadingSummaryUsesCorrectPartitioner() throws Exception
     public void testGetScannerForNoIntersectingRanges() throws Exception
     {
         Keyspace keyspace = Keyspace.open(KEYSPACE1);
-        ColumnFamilyStore store = keyspace.getColumnFamilyStore("Standard1");
+        ColumnFamilyStore store = keyspace.getColumnFamilyStore(CF_STANDARD3);
         partitioner = store.getPartitioner();
 
         new RowUpdateBuilder(store.metadata(), 0, "k1")
@@ -613,19 +619,15 @@ public void testGetScannerForNoIntersectingRanges() throws Exception
             .applyUnsafe();
 
         store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
-        boolean foundScanner = false;
-        for (SSTableReader s : store.getLiveSSTables())
-        {
-            try (ISSTableScanner scanner = s.getScanner(new Range<>(t(0), t(1))))
-            {
-                if (scanner.hasNext())
-                {
-                    scanner.next(); // throws exception pre 5407
-                    foundScanner = true;
-                }
-            }
-        }
-        assertTrue(foundScanner);
+
+        Set<SSTableReader> liveSSTables = store.getLiveSSTables();
+        assertEquals("The table should have only one sstable", 1, liveSSTables.size());
+        // the requested range should not intersect the sstable: expect the format's empty scanner and always close it
+        try (ISSTableScanner scanner = liveSSTables.iterator().next().getScanner(new Range<>(t(0), t(1))))
+        {
+            boolean isBig = SSTableFormat.Type.current() == SSTableFormat.Type.BIG;
+            assertThat(scanner, instanceOf(isBig ? BigTableScanner.EmptySSTableScanner.class : TrieIndexScanner.EmptySSTableScanner.class));
+        }
     }
 
     @Test
diff --git a/update-history/STAR-801/3-c097960cd4 STAR-766 fix testGetScannerForNoIntersectingRanges b/update-history/STAR-801/3-c097960cd4 STAR-766 fix testGetScannerForNoIntersectingRanges
new file mode 100644
index 000000000000..70faa06f1ef5
--- /dev/null
+++ b/update-history/STAR-801/3-c097960cd4 STAR-766 fix testGetScannerForNoIntersectingRanges	
@@ -0,0 +1,18 @@
+--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
++++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+@@ -62,15 +62,12 @@
+ import org.apache.cassandra.index.Index;
+ import org.apache.cassandra.io.FSReadError;
+ import org.apache.cassandra.io.sstable.format.SSTableFormat;
+-<<<<<<<
+ import org.apache.cassandra.io.sstable.format.SSTableReader;
+ import org.apache.cassandra.io.sstable.metadata.MetadataComponent;
+ import org.apache.cassandra.io.sstable.metadata.MetadataType;
+ import org.apache.cassandra.io.sstable.metadata.ValidationMetadata;
+-=======
+ import org.apache.cassandra.io.sstable.format.big.BigTableScanner;
+ import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexScanner;
+->>>>>>>
+ import org.apache.cassandra.io.util.FileDataInput;
+ import org.apache.cassandra.io.util.MmappedRegions;
+ import org.apache.cassandra.schema.CachingParams;

From 756eb7d5dd467539287380670954099d85f2b90f Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Mon, 14 Jun 2021 19:44:51 +0200
Subject: [PATCH 101/151] STAR-795: Reduce test sizes (#196)

(cherry picked from commit 05f422a1754c25ecdca1a47f357f5bb99e46b3a9)
---
 .../test/microbench/sstable/SSTableReaderBench.java       | 8 ++++----
 .../test/microbench/sstable/SSTableWriterBench.java       | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
index a0ea19ced63c..ddefb7771a89 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableReaderBench.java
@@ -66,10 +66,10 @@ public class SSTableReaderBench extends AbstractSSTableBench
 {
     private final static Logger logger = LoggerFactory.getLogger(SSTableReaderBench.class);
 
-    int KEY_SIZE = 8;
-    int P_KEYS = 2 << 14;
-    int C_KEYS = 2 << 10;
-    int VAL_SIZE = 1;
+    public final static int KEY_SIZE = 8;
+    public final static int P_KEYS = 2 << 10;
+    public final static int C_KEYS = 2 << 10;
+    public final static int VAL_SIZE = 1;
 
     public ByteBuffer[] ckeys;
     public DecoratedKey[] pkeys;
diff --git a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java
index 529f64699361..01a81669a3b2 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/sstable/SSTableWriterBench.java
@@ -60,10 +60,10 @@ public class SSTableWriterBench extends AbstractSSTableBench
     @Param({ "BIG", "BTI" })
     public String formatName;
 
-    int KEY_SIZE = 8;
-    int P_KEYS = 1000;
-    int C_KEYS = 1000;
-    int VAL_SIZE = 1;
+    public final static int KEY_SIZE = 8;
+    public final static int P_KEYS = 1000;
+    public final static int C_KEYS = 1000;
+    public final static int VAL_SIZE = 1;
 
     public ByteBuffer[] ckeys = new ByteBuffer[C_KEYS];
     public DecoratedKey[] pkeys = new DecoratedKey[P_KEYS];

From 4ed6c2858ec020e7b132fa145eb7e922e29d7653 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomek=20=C5=81asica?= <tlasica@life.pl>
Date: Wed, 16 Jun 2021 12:58:51 +0200
Subject: [PATCH 102/151] STAR-765. Add --secure-connect-bundle option to
 cqlsh. (#190)

* STAR-765. Add --secure-connect-bundle option to cqlsh.

This option is required for the cloud connections e.g. Astra.
Summary of changes:
* when it is provided, all the connection parameters are read from it and the host and port settings are ignored;
* it can be provided via the --secure-connect-bundle command-line option or
* via the cqlshrc::connection::secure_connect_bundle option, and should point to the bundle .zip file;
* when connecting to the cloud, the default consistency level is LOCAL_QUORUM;
* a special EndpointWhiteListRoundRobinPolicy is used so that any of the endpoints discovered after a successful connection can be used;

In addition this change:
* fixes some potential STAR-432 leftovers
* fixes LOGIN command behavior (broken in original code)

Tests are added in dtest.

Co-authored with Aleksandr Sorokoumov <aleksandr.sorokoumov@gmail.com> and Alan Boudreault <alan@kovaro.ca>.

(cherry picked from commit 92d950a1d1ce592ca842086016ef57b1efe5441f)
---
 bin/cqlsh.py                         |  75 +++++++++----
 conf/cqlshrc.sample.cloud            |  17 +++
 pylib/cqlshlib/copyutil.py           |  44 +++++---
 pylib/cqlshlib/driver.py             | 153 +++++++++++++++++++++++++++
 pylib/cqlshlib/test/test_copyutil.py |  41 +++----
 5 files changed, 276 insertions(+), 54 deletions(-)
 create mode 100644 conf/cqlshrc.sample.cloud
 create mode 100644 pylib/cqlshlib/driver.py

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 0afb7841c800..6ff6e333f0a2 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -141,12 +141,10 @@ def find_zip(libprefix):
              'Error: %s\n' % (sys.executable, sys.path, e))
 
 from cassandra.auth import PlainTextAuthProvider
-from cassandra.cluster import Cluster
 from cassandra.cqltypes import cql_typename
 from cassandra.marshal import int64_unpack
 from cassandra.metadata import (ColumnMetadata, KeyspaceMetadata,
                                 TableMetadata, protect_name, protect_names)
-from cassandra.policies import WhiteListRoundRobinPolicy
 from cassandra.query import SimpleStatement, ordered_dict_factory, TraceUnavailable
 from cassandra.util import datetime_from_timestamp
 
@@ -169,6 +167,8 @@ def find_zip(libprefix):
 from cqlshlib.geotypes import patch_geotypes_import_conversion  # nopep8
 from cqlshlib.daterangetype import patch_daterange_import_conversion  # nopep
 
+from cqlshlib.driver import cluster_factory
+
 patch_geotypes_import_conversion(ImportConversion)
 patch_daterange_import_conversion(ImportConversion)
 
@@ -210,6 +210,8 @@ def find_zip(libprefix):
 parser.add_option("-u", "--username", help="Authenticate as user.")
 parser.add_option("-p", "--password", help="Authenticate using password.")
 parser.add_option('-k', '--keyspace', help='Authenticate to the given keyspace.')
+parser.add_option('-b', '--secure-connect-bundle',
+                  help="Connect using secure connect bundle. If this option is specified host, port settings are ignored.")
 parser.add_option("-f", "--file", help="Execute commands from FILE, then exit")
 parser.add_option('--debug', action='store_true',
                   help='Show additional debugging information')
@@ -440,6 +442,7 @@ def __init__(self, hostname, port, color=False,
                  username=None, password=None, encoding=None, stdin=None, tty=True,
                  completekey=DEFAULT_COMPLETEKEY, browser=None, use_conn=None,
                  cqlver=None, keyspace=None,
+                 secure_connect_bundle=None,
                  consistency_level=None, serial_consistency_level=None,
                  tracing_enabled=False, expand_enabled=False,
                  display_nanotime_format=DEFAULT_NANOTIME_FORMAT,
@@ -455,8 +458,10 @@ def __init__(self, hostname, port, color=False,
                  protocol_version=None,
                  connect_timeout=DEFAULT_CONNECT_TIMEOUT_SECONDS,
                  no_file_io=DEFAULT_NO_FILE_IO,
-                 is_subshell=False):
+                 is_subshell=False,
+                 debug=False):
         cmd.Cmd.__init__(self, completekey=completekey)
+        self.debug = debug
         self.hostname = hostname
         self.port = port
         self.auth_provider = None
@@ -478,19 +483,27 @@ def __init__(self, hostname, port, color=False,
         self.consistency_level = consistency_level
         self.serial_consistency_level = serial_consistency_level
 
+        self.secure_connect_bundle = secure_connect_bundle
+
         if use_conn:
             self.conn = use_conn
         else:
             kwargs = {}
             if protocol_version is not None:
                 kwargs['protocol_version'] = protocol_version
-            self.conn = Cluster(contact_points=(self.hostname,), port=self.port, cql_version=cqlver,
-                                auth_provider=self.auth_provider,
-                                ssl_options=sslhandling.ssl_settings(hostname, CONFIG_FILE) if ssl else None,
-                                load_balancing_policy=WhiteListRoundRobinPolicy([self.hostname]),
-                                control_connection_timeout=connect_timeout,
-                                connect_timeout=connect_timeout,
-                                **kwargs)
+            self.conn = cluster_factory(
+                self.hostname,
+                port=self.port,
+                cql_version=cqlver,
+                auth_provider=self.auth_provider,
+                ssl_options=sslhandling.ssl_settings(hostname, CONFIG_FILE) if ssl else None,
+                control_connection_timeout=connect_timeout,
+                connect_timeout=connect_timeout,
+                secure_connect_bundle=secure_connect_bundle,
+                application_name=description,
+                application_version=version,
+                **kwargs)
+
         self.owns_connection = not use_conn
 
         if keyspace:
@@ -514,7 +527,9 @@ def __init__(self, hostname, port, color=False,
 
         self.session.default_timeout = request_timeout
         self.session.row_factory = ordered_dict_factory
-        self.session.default_consistency_level = cassandra.ConsistencyLevel.ONE
+        self.session.default_consistency_level = self.consistency_level
+        self.session.default_serial_consistency_level = self.serial_consistency_level
+
         self.get_connection_versions()
         self.set_expanded_cql_version(self.connection_versions['cql'])
 
@@ -1871,8 +1886,9 @@ def do_login(self, parsed):
 
         LOGIN <username> (<password>)
 
-           Login using the specified username. If password is specified, it will be used
-           otherwise, you will be prompted to enter.
+           Login using the specified username.
+           If password is specified it should be wrapped with single quotes.
+           If not specified you will be prompted to enter.
         """
         username = parsed.get_binding('username')
         password = parsed.get_binding('password')
@@ -1883,13 +1899,16 @@ def do_login(self, parsed):
 
         auth_provider = PlainTextAuthProvider(username=username, password=password)
 
-        conn = Cluster(contact_points=(self.hostname,), port=self.port, cql_version=self.conn.cql_version,
-                       protocol_version=self.conn.protocol_version,
-                       auth_provider=auth_provider,
-                       ssl_options=self.conn.ssl_options,
-                       load_balancing_policy=WhiteListRoundRobinPolicy([self.hostname]),
-                       control_connection_timeout=self.conn.connect_timeout,
-                       connect_timeout=self.conn.connect_timeout)
+        conn = cluster_factory(
+            self.hostname,
+            port=self.port,
+            cql_version=self.conn.cql_version,
+            protocol_version=self.conn.protocol_version,
+            auth_provider=auth_provider,
+            ssl_options=self.conn.ssl_options,
+            control_connection_timeout=self.conn.connect_timeout,
+            connect_timeout=self.conn.connect_timeout,
+            secure_connect_bundle=self.secure_connect_bundle)
 
         if self.current_keyspace:
             session = conn.connect(self.current_keyspace)
@@ -1900,6 +1919,7 @@ def do_login(self, parsed):
         session.default_timeout = self.session.default_timeout
         session.row_factory = self.session.row_factory
         session.default_consistency_level = self.session.default_consistency_level
+        session.default_serial_consistency_level = self.session.default_serial_consistency_level
         session.max_trace_wait = self.session.max_trace_wait
 
         # Update after we've connected in case we fail to authenticate
@@ -2165,6 +2185,7 @@ def read_options(cmdlineargs, environment):
     optvalues.username = option_with_default(configs.get, 'authentication', 'username')
     optvalues.password = option_with_default(rawconfigs.get, 'authentication', 'password')
     optvalues.keyspace = option_with_default(configs.get, 'authentication', 'keyspace')
+    optvalues.secure_connect_bundle = option_with_default(configs.get, 'connection', 'secure_connect_bundle')
     optvalues.browser = option_with_default(configs.get, 'ui', 'browser', None)
     optvalues.completekey = option_with_default(configs.get, 'ui', 'completekey',
                                                 DEFAULT_COMPLETEKEY)
@@ -2194,7 +2215,7 @@ def read_options(cmdlineargs, environment):
     optvalues.ssl = option_with_default(configs.getboolean, 'connection', 'ssl', DEFAULT_SSL)
     optvalues.encoding = option_with_default(configs.get, 'ui', 'encoding', UTF8)
 
-    optvalues.consistency_level = option_with_default(configs.get, 'cql', 'consistency_level', 'ONE')
+    optvalues.consistency_level = option_with_default(configs.get, 'cql', 'consistency_level', None)
     optvalues.serial_consistency_level = option_with_default(configs.get, 'cql', 'serial_consistency_level', 'SERIAL')
 
     optvalues.tty = option_with_default(configs.getboolean, 'ui', 'tty', sys.stdin.isatty())
@@ -2215,6 +2236,11 @@ def read_options(cmdlineargs, environment):
 
     serial_levels = [cassandra.ConsistencyLevel.SERIAL, cassandra.ConsistencyLevel.LOCAL_SERIAL]
 
+    # If unspecified, set the proper default CL
+    default_cl = 'LOCAL_QUORUM' if options.secure_connect_bundle else 'ONE'
+    if options.consistency_level is None:
+        options.consistency_level = default_cl
+
     try:
         cl = cassandra.ConsistencyLevel.name_to_value[options.consistency_level.upper()]
         if cl in serial_levels:
@@ -2334,8 +2360,11 @@ def main(options, hostname, port):
     if options.debug:
         sys.stderr.write("Using CQL driver: %s\n" % (cassandra,))
         sys.stderr.write("Using connect timeout: %s seconds\n" % (options.connect_timeout,))
+        sys.stderr.write("Using consistency level: %s\n" % (cassandra.ConsistencyLevel.value_to_name[options.consistency_level],))
         sys.stderr.write("Using '%s' encoding\n" % (options.encoding,))
         sys.stderr.write("Using ssl: %s\n" % (options.ssl,))
+        if options.secure_connect_bundle:
+            sys.stderr.write("Using secure connect bundle: %s\n" % (options.secure_connect_bundle, ))
 
     # create timezone based on settings, environment or auto-detection
     timezone = None
@@ -2379,9 +2408,11 @@ def main(options, hostname, port):
                       tty=options.tty,
                       completekey=options.completekey,
                       browser=options.browser,
+                      debug=options.debug,
                       protocol_version=options.protocol_version,
                       cqlver=options.cqlversion,
                       keyspace=options.keyspace,
+                      secure_connect_bundle=options.secure_connect_bundle,
                       consistency_level=options.consistency_level,
                       serial_consistency_level=options.serial_consistency_level,
                       display_timestamp_format=options.time_format,
@@ -2403,8 +2434,6 @@ def main(options, hostname, port):
         sys.exit('Connection error: %s' % (e,))
     except VersionNotSupported as e:
         sys.exit('Unsupported CQL version: %s' % (e,))
-    if options.debug:
-        shell.debug = True
     if options.coverage:
         shell.coverage = True
         import signal
diff --git a/conf/cqlshrc.sample.cloud b/conf/cqlshrc.sample.cloud
new file mode 100644
index 000000000000..62528670c48b
--- /dev/null
+++ b/conf/cqlshrc.sample.cloud
@@ -0,0 +1,17 @@
+; Copyright DataStax, Inc.
+;
+; Licensed under the Apache License, Version 2.0 (the "License");
+; you may not use this file except in compliance with the License.
+; You may obtain a copy of the License at
+;
+; http://www.apache.org/licenses/LICENSE-2.0
+;
+; Unless required by applicable law or agreed to in writing, software
+; distributed under the License is distributed on an "AS IS" BASIS,
+; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; See the License for the specific language governing permissions and
+; limitations under the License.
+;
+; Sample ~/.cqlshrc file with cloud configuration.
+[connection]
+secure_connect_bundle = /path/to/creds.zip
diff --git a/pylib/cqlshlib/copyutil.py b/pylib/cqlshlib/copyutil.py
index 3bcd26a98362..a81d16721e20 100644
--- a/pylib/cqlshlib/copyutil.py
+++ b/pylib/cqlshlib/copyutil.py
@@ -42,7 +42,6 @@
 from io import BytesIO, StringIO
 from select import select
 from uuid import UUID
-from .util import profile_on, profile_off
 
 from six import ensure_str, ensure_text
 from six.moves import configparser
@@ -50,23 +49,27 @@
 from six.moves.queue import Queue
 
 from cassandra import OperationTimedOut
-from cassandra.cluster import Cluster, DefaultConnection
+from cassandra.cluster import DefaultConnection
+from cassandra.connection import SniEndPoint
 from cassandra.cqltypes import ReversedType, UserType, BytesType, VarcharType
 from cassandra.metadata import protect_name, protect_names, protect_value
 from cassandra.policies import RetryPolicy, WhiteListRoundRobinPolicy, DCAwareRoundRobinPolicy, FallthroughRetryPolicy
 from cassandra.query import BatchStatement, BatchType, SimpleStatement, tuple_factory
 from cassandra.util import Date, Time
-from cqlshlib.util import profile_on, profile_off
 
 from cqlshlib.cql3handling import CqlRuleSet
 from cqlshlib.displaying import NO_COLOR_MAP
+from cqlshlib.driver import cluster_factory
 from cqlshlib.formatting import format_value_default, CqlType, DateTimeFormat, EMPTY, get_formatter, BlobType
 from cqlshlib.sslhandling import ssl_settings
+from cqlshlib.util import profile_on, profile_off
+
 
 PROFILE_ON = False
 STRACE_ON = False
 DEBUG = False  # This may be set to True when initializing the task
 IS_LINUX = platform.system() == 'Linux'
+IS_WINDOWS = platform.system() == 'Windows'
 
 CopyOptions = namedtuple('CopyOptions', 'copy dialect unrecognized')
 
@@ -494,6 +497,7 @@ def make_params(self):
                     port=shell.port,
                     ssl=shell.ssl,
                     auth_provider=shell.auth_provider,
+                    parent_cluster=shell.conn if not IS_WINDOWS else None,
                     cql_version=shell.conn.cql_version,
                     config_file=self.config_file,
                     protocol_version=self.protocol_version,
@@ -716,9 +720,9 @@ def make_range(prev, curr):
         def make_range_data(replicas=None):
             hosts = []
             if replicas:
-                for r in replicas:
-                    if r.is_up is not False and r.datacenter == local_dc:
-                        hosts.append(r.address)
+                # when connected to a cloud cluster r.address is the proxy
+                # so we need to use host_id as correct in both cloud/not cloud
+                hosts = [r.host_id for r in replicas if r.is_up is not False and r.datacenter == local_dc]
             if not hosts:
                 hosts.append(hostname)  # fallback to default host if no replicas in current dc
             return {'hosts': tuple(hosts), 'attempts': 0, 'rows': 0, 'workerno': -1}
@@ -1409,6 +1413,7 @@ def __init__(self, params, target):
         self.connect_timeout = params['connect_timeout']
         self.cql_version = params['cql_version']
         self.auth_provider = params['auth_provider']
+        self.parent_cluster = params['parent_cluster']
         self.ssl = params['ssl']
         self.protocol_version = params['protocol_version']
         self.config_file = params['config_file']
@@ -1676,23 +1681,36 @@ def connect(self, host):
             session.add_request()
             return session
 
-        new_cluster = Cluster(
-            contact_points=(host,),
+        endpoint = self._endpoint_for_host(host)
+
+        new_cluster = cluster_factory(
+            endpoint,
+            whitelist_lbp=endpoint,
+            cloud=self.parent_cluster.cloud,
             port=self.port,
             cql_version=self.cql_version,
             protocol_version=self.protocol_version,
             auth_provider=self.auth_provider,
-            ssl_options=ssl_settings(host, self.config_file) if self.ssl else None,
-            load_balancing_policy=WhiteListRoundRobinPolicy([host]),
+            ssl_options=ssl_settings(endpoint, self.config_file) if self.ssl else None,
             default_retry_policy=ExpBackoffRetryPolicy(self),
             compression=None,
             control_connection_timeout=self.connect_timeout,
             connect_timeout=self.connect_timeout,
             idle_heartbeat_interval=0)
+
         session = ExportSession(new_cluster, self)
         self.hosts_to_sessions[host] = session
         return session
 
+    def _endpoint_for_host(self, host):
+        endpoint = host
+        if isinstance(host, UUID):
+            for h in self.parent_cluster.metadata.all_hosts():
+                if h.host_id == host:
+                    endpoint = h.endpoint if isinstance(h.endpoint, SniEndPoint) else h.endpoint.address
+                    break
+        return endpoint
+
     def attach_callbacks(self, token_range, future, session):
         metadata = session.cluster.metadata
         ks_meta = metadata.keyspaces[self.ks]
@@ -2353,8 +2371,9 @@ def __init__(self, params):
     @property
     def session(self):
         if not self._session:
-            cluster = Cluster(
-                contact_points=(self.hostname,),
+            cluster = cluster_factory(
+                self.hostname,
+                whitelist_lbp=False,
                 port=self.port,
                 cql_version=self.cql_version,
                 protocol_version=self.protocol_version,
@@ -2366,6 +2385,7 @@ def session(self):
                 control_connection_timeout=self.connect_timeout,
                 connect_timeout=self.connect_timeout,
                 idle_heartbeat_interval=0,
+                cloud=self.parent_cluster.cloud,
                 connection_class=ConnectionWrapper)
 
             self._session = cluster.connect(self.ks)
diff --git a/pylib/cqlshlib/driver.py b/pylib/cqlshlib/driver.py
new file mode 100644
index 000000000000..e0bc7cef951f
--- /dev/null
+++ b/pylib/cqlshlib/driver.py
@@ -0,0 +1,153 @@
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import six
+import stat
+from cassandra.cluster import Cluster
+from cassandra.connection import UnixSocketEndPoint
+from cassandra.policies import WhiteListRoundRobinPolicy, RoundRobinPolicy
+from cassandra.pool import HostDistance
+
+
+class EndpointWhiteListRoundRobinPolicy(WhiteListRoundRobinPolicy):
+    """
+    WhiteListRoundRobinPolicy dedicated to the cloud connection.
+
+    It is using endpoint instead of host address.
+
+    Note:
+    we want to override _allowed_hosts and _allowed_hosts_resolved which is the reason
+    to not call direct super class init but rather the one from RoundRobinPolicy.
+    """
+
+    def __init__(self, hosts):
+        self._allowed_hosts = self._allowed_hosts_resolved = tuple(hosts)
+        RoundRobinPolicy.__init__(self)
+
+    def populate(self, cluster, hosts):
+        self._live_hosts = frozenset(h for h in hosts if h.endpoint in self._allowed_hosts_resolved)
+
+        if len(hosts) <= 1:
+            self._position = 0
+        else:
+            self._position = random.randint(0, len(hosts) - 1)
+
+    def distance(self, host):
+        if host.endpoint in self._allowed_hosts_resolved:
+            return HostDistance.LOCAL
+        else:
+            return HostDistance.IGNORED
+
+    def on_up(self, host):
+        if host.endpoint in self._allowed_hosts_resolved:
+            RoundRobinPolicy.on_up(self, host)
+
+    def on_add(self, host):
+        if host.endpoint in self._allowed_hosts_resolved:
+            RoundRobinPolicy.on_add(self, host)
+
+
+def cluster_factory(host, whitelist_lbp=True, **kwargs):
+    """
+    Cluster factory to create a cassandra or dse cluster for cqlsh.
+
+    :param host: the host for the connection. This can be the hostname string or an EndPoint instance.
+    :param whitelist_lbp: Specify if a WhiteListRoundRobinPolicy should be applied. This can be set to True,
+                          False, or an explicit host / EndPoint instance to whitelist (a unix socket path is
+                          wrapped into a UnixSocketEndPoint). On a cloud cluster, the LBP has to be set after
+                          the nodes discovery. Otherwise, the LBP is set as usual to the execution profile.
+                          Default to True.
+    :kwargs: All other keyword arguments are passed to the Cluster constructor.
+    :return: A Cluster instance.
+    """
+
+    is_cloud_cluster = False
+    is_unix_socket_endpoint = False
+    endpoint = host
+    options = kwargs.copy()
+
+    # Configure the cluster contact point type and address.
+    if is_unix_socket(host):
+        # update endpoint and load balancing policy for unix socket
+        endpoint = UnixSocketEndPoint(host)
+        is_unix_socket_endpoint = True
+
+    # Determine if we are trying to connect to a cloud cluster
+    secure_connect_bundle = options.pop('secure_connect_bundle', None)
+    if secure_connect_bundle:
+        is_cloud_cluster = True
+        options['cloud'] = {'secure_connect_bundle': secure_connect_bundle}
+    elif 'cloud' in options:
+        is_cloud_cluster = options['cloud'] is not None
+
+    # Build the Cluster instance
+    if is_cloud_cluster:
+        return _cloud_cluster_factory(whitelist_lbp, **options)
+
+    if is_unix_socket_endpoint and 'port' in options:
+        del options['port']
+
+    contact_points = (endpoint,)
+
+    if whitelist_lbp:
+        lbp_class = EndpointWhiteListRoundRobinPolicy if is_unix_socket_endpoint else WhiteListRoundRobinPolicy
+        allowed = endpoint if whitelist_lbp is True else whitelist_lbp
+        # the endpoint policy matches on Host.endpoint, so a socket path given as a string must be wrapped
+        if is_unix_socket_endpoint and is_unix_socket(allowed):
+            allowed = UnixSocketEndPoint(allowed)
+        options['load_balancing_policy'] = lbp_class([allowed])
+
+    return Cluster(contact_points=contact_points, **options)
+
+
+def is_unix_socket(hostname):
+    if isinstance(hostname, six.string_types) and os.path.exists(hostname):
+        mode = os.stat(hostname).st_mode
+        return stat.S_ISSOCK(mode)
+    return False
+
+
+def _cloud_cluster_factory(whitelist_lbp, **kwargs):
+    """
+    Create cloud cluster from given options.
+    Please notice that:
+    - cloud should be present in option
+    - contact_points, endpoint_factory, ssl_context, and ssl_options cannot be specified with a cloud configuration (will be removed)
+    - whitelist_lbp can be True (random contact point), False (no policy set), or specific endpoint
+    """
+    options = kwargs.copy()
+    assert 'cloud' in options
+
+    cloud_prohibited_options = ['contact_points', 'endpoint_factory', 'ssl_context', 'ssl_options']
+    for opt_name in cloud_prohibited_options:
+        options.pop(opt_name, None)
+
+    cluster = Cluster(**options)
+
+    if whitelist_lbp:
+
+        if whitelist_lbp is True:
+            # applying load balancing policy as we now know the contact points
+            contact_points = [random.choice(cluster.contact_points)]
+        else:
+            # An explicit host was specified
+            contact_points = [whitelist_lbp]
+
+        for execution_profile in cluster.profile_manager.profiles.values():
+            execution_profile.load_balancing_policy = EndpointWhiteListRoundRobinPolicy(contact_points)
+        cluster.endpoints_resolved = contact_points
+
+    return cluster
diff --git a/pylib/cqlshlib/test/test_copyutil.py b/pylib/cqlshlib/test/test_copyutil.py
index 18b167adbfcf..151fe8756fab 100644
--- a/pylib/cqlshlib/test/test_copyutil.py
+++ b/pylib/cqlshlib/test/test_copyutil.py
@@ -20,10 +20,14 @@
 
 import unittest
 
-from cassandra.metadata import MIN_LONG, Murmur3Token, TokenMap
+from cassandra.metadata import MIN_LONG, Murmur3Token
 from cassandra.policies import SimpleConvictionPolicy
 from cassandra.pool import Host
-from unittest.mock import Mock
+
+try:
+    from unittest.mock import Mock
+except ImportError:
+    from mock import Mock
 
 from cqlshlib.copyutil import ExportTask
 
@@ -44,7 +48,9 @@ def setUp(self):
             Host('10.0.0.2', SimpleConvictionPolicy, 9000),
             Host('10.0.0.3', SimpleConvictionPolicy, 9000),
             Host('10.0.0.4', SimpleConvictionPolicy, 9000)
-    ]
+        ]
+        for h in self.hosts:
+            h.host_id = h.address
 
     def mock_shell(self):
         """
@@ -75,42 +81,39 @@ def _test_get_ranges_murmur3_base(self, opts, expected_ranges):
         }
         # merge override options with standard options
         overridden_opts = dict(self.opts)
-        for k,v in opts.items():
+        for k, v in opts.items():
             overridden_opts[k] = v
         export_task = ExportTask(shell, self.ks, self.table, self.columns, self.fname, overridden_opts, self.protocol_version, self.config_file)
-        assert export_task.get_ranges() == expected_ranges
-
-    def test_get_ranges_murmur3(self):
-        """
-        Test behavior of ExportTask internal get_ranges function
-        """
+        export_ranges = export_task.get_ranges()
+        assert export_ranges == expected_ranges,\
+            "Expected: {e}\n Actual:{a}".format(e=expected_ranges, a=export_ranges)
 
+    def test_murmur3_get_ranges_invalid_input(self):
         # return empty dict and print error if begin_token < min_token
         self._test_get_ranges_murmur3_base({'begintoken': MIN_LONG - 1}, {})
-
-        # return empty dict and print error if begin_token < min_token
         self._test_get_ranges_murmur3_base({'begintoken': 1, 'endtoken': -1}, {})
 
-        # simple case of a single range
-        expected_ranges = {(1,2): {'hosts': ('10.0.0.4', '10.0.0.1', '10.0.0.2'), 'attempts': 0, 'rows': 0, 'workerno': -1}}
+    def test_get_ranges_murmur3_single_range(self):
+        expected_ranges = {(1, 2): {'hosts': ('10.0.0.4', '10.0.0.1', '10.0.0.2'), 'attempts': 0, 'rows': 0, 'workerno': -1}}
         self._test_get_ranges_murmur3_base({'begintoken': 1, 'endtoken': 2}, expected_ranges)
 
-        # simple case of two contiguous ranges
+    def test_get_ranges_murmur3_two_continuous_ranges(self):
         expected_ranges = {
-            (-4611686018427387903,0): {'hosts': ('10.0.0.3', '10.0.0.4', '10.0.0.1'), 'attempts': 0, 'rows': 0, 'workerno': -1},
-            (0,1): {'hosts': ('10.0.0.4', '10.0.0.1', '10.0.0.2'), 'attempts': 0, 'rows': 0, 'workerno': -1}
+            (-4611686018427387903, 0): {'hosts': ('10.0.0.3', '10.0.0.4', '10.0.0.1'), 'attempts': 0, 'rows': 0, 'workerno': -1},
+            (0, 1): {'hosts': ('10.0.0.4', '10.0.0.1', '10.0.0.2'), 'attempts': 0, 'rows': 0, 'workerno': -1}
         }
         self._test_get_ranges_murmur3_base({'begintoken': -4611686018427387903, 'endtoken': 1}, expected_ranges)
 
+    def test_get_ranges_murmur3_begin_token_only(self):
         # specify a begintoken only (endtoken defaults to None)
         expected_ranges = {
-            (4611686018427387905,None): {'hosts': ('10.0.0.1', '10.0.0.2', '10.0.0.3'), 'attempts': 0, 'rows': 0, 'workerno': -1}
+            (4611686018427387905, None): {'hosts': ('10.0.0.1', '10.0.0.2', '10.0.0.3'), 'attempts': 0, 'rows': 0, 'workerno': -1}
         }
         self._test_get_ranges_murmur3_base({'begintoken': 4611686018427387905}, expected_ranges)
 
+    def test_get_ranges_murmur3_end_token_only(self):
         # specify an endtoken only (begintoken defaults to None)
         expected_ranges = {
             (None, MIN_LONG + 1): {'hosts': ('10.0.0.2', '10.0.0.3', '10.0.0.4'), 'attempts': 0, 'rows': 0, 'workerno': -1}
         }
         self._test_get_ranges_murmur3_base({'endtoken': MIN_LONG + 1}, expected_ranges)
-

From 25fa817d823fc317275902d0d156d56f81162adc Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <lewandowski.jacek@gmail.com>
Date: Tue, 22 Jun 2021 09:40:37 +0200
Subject: [PATCH 103/151] STAR-801: Refactor the logic for loading and
 recreating bf

The logic for deciding when to load and when to recreate a bloom filter (bf) was defined separately for the bti and big formats. In addition, the logic for big was changed in STAR-745, and some tests were added to verify it.

In this commit the logic has been extracted to the top-level SSTableReader and is used consistently for both bti and big.

Post STAR-247+STAR-745 change

It should fix the test failure in SSTableReaderTest
---
 .../io/sstable/format/SSTableFormat.java      |  7 +-
 .../io/sstable/format/SSTableReader.java      | 74 ++++++++++++++++-
 .../sstable/format/SSTableReaderBuilder.java  | 45 +++++------
 .../io/sstable/format/big/BigFormat.java      |  8 ++
 .../format/trieindex/TrieIndexFormat.java     | 11 +++
 .../trieindex/TrieIndexSSTableReader.java     | 80 ++++++-------------
 .../apache/cassandra/utils/BloomFilter.java   | 13 ++-
 .../io/sstable/SSTableReaderTest.java         | 16 ++--
 8 files changed, 163 insertions(+), 91 deletions(-)

diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
index 75b461aefd60..53d2d16d0cc2 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableFormat.java
@@ -21,8 +21,6 @@
 
 import com.google.common.base.CharMatcher;
 
-import org.apache.commons.lang3.StringUtils;
-
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.big.BigFormat;
 import org.apache.cassandra.io.sstable.format.trieindex.TrieIndexFormat;
@@ -97,4 +95,9 @@ public static Type validate(String name)
      */
     Set<Component> streamingComponents();
 
+    /**
+     * Returns all primary index components required for index iteration and reading keys
+     */
+    Set<Component> primaryIndexComponents();
+
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index 47a9ae68a5df..ada8d6e16d36 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -46,6 +46,8 @@
 import java.util.concurrent.atomic.AtomicLong;
 import java.util.stream.Collector;
 
+import javax.annotation.Nonnull;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Ordering;
@@ -129,7 +131,6 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.CacheService;
 import org.apache.cassandra.utils.BloomFilter;
-import org.apache.cassandra.utils.BloomFilterSerializer;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.EstimatedHistogram;
 import org.apache.cassandra.utils.ExecutorUtils;
@@ -2587,4 +2588,75 @@ public static void checkRequiredComponents(Descriptor descriptor, Set<Component>
         }
     }
 
+    public static @Nonnull boolean shouldLoadBloomFilter(Descriptor desc, Set<Component> components, double currerntFPChance, double desiredFPChance)
+    {
+        if (!BloomFilter.shouldUseBloomFilter(desiredFPChance))
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} will not be loaded because fpChance={} is neglectable", desc, desiredFPChance);
+
+            return false;
+        }
+        else if (!components.containsAll(desc.getFormat().primaryIndexComponents()))
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} will not be loaded because there are missing primary index components: {}", desc, Sets.difference(desc.getFormat().primaryIndexComponents(), components));
+
+            return false;
+        }
+        else if (!components.contains(Component.FILTER) || Double.isNaN(currerntFPChance))
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} will not be loaded because filter component is missing or sstable lacks validation metadata", desc);
+
+            return false;
+        }
+        else if (!BloomFilter.isFPChanceDiffNeglectable(desiredFPChance, currerntFPChance) && BloomFilter.recreateOnFPChanceChange)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} will not be loaded because fpChance has changed from {} to {} and the filter should be recreated", desc, currerntFPChance, desiredFPChance);
+
+            return false;
+        }
+
+        return true;
+    }
+
+    public static boolean mayRecreateBloomFilter(Descriptor desc, Set<Component> components, double currentFPChance, boolean isOffline, double desiredFPChance)
+    {
+        if (!BloomFilter.shouldUseBloomFilter(desiredFPChance))
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} must not be recreated because fpChance={} is neglectable", desc, desiredFPChance);
+
+            return false;
+        }
+        else if (!components.containsAll(desc.getFormat().primaryIndexComponents()))
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("Bloom filter for {} must not be recreated because there are missing primary index components: {}", desc, Sets.difference(desc.getFormat().primaryIndexComponents(), components));
+
+            return false;
+        }
+        else if (!components.contains(Component.FILTER) || Double.isNaN(currentFPChance))
+        {
+            if (logger.isTraceEnabled() && isOffline)
+                logger.trace("Bloom filter for {} must not be recreated because sstable has been opened in offline mode", desc);
+
+            return !isOffline;
+        }
+        else if (!BloomFilter.isFPChanceDiffNeglectable(desiredFPChance, currentFPChance) && BloomFilter.recreateOnFPChanceChange)
+        {
+            if (logger.isTraceEnabled() && isOffline)
+                logger.trace("Bloom filter for {} must not be recreated because sstable has been opened in offline mode", desc);
+
+            return !isOffline;
+        }
+        else
+        {
+            // bf is enabled and fp chance matches the currently configured value.
+            return true;
+        }
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
index 4b2248612ba9..6f18705bbaed 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReaderBuilder.java
@@ -177,7 +177,10 @@ void buildSummaryAndBloomFilter(boolean recreateBloomFilter,
                                  ? histogramCount
                                  : SSTable.estimateRowsFromIndex(indexIterator); // statistics is supposed to be optional
             if (recreateBloomFilter)
+            {
+                logger.debug("Recreating bloom filter for {} with fpChance={}", descriptor, metadata.params.bloomFilterFpChance);
                 bf = FilterFactory.getFilter(estimatedKeys, metadata.params.bloomFilterFpChance);
+            }
 
             // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
             try (IndexSummaryBuilder summaryBuilder = summaryLoaded ? null : new IndexSummaryBuilder(estimatedKeys, metadata.params.minIndexInterval, Downsampling.BASE_SAMPLING_LEVEL))
@@ -217,6 +220,7 @@ public static IFilter loadBloomFilter(Path path, boolean oldFormat)
     {
         if (Files.exists(path))
         {
+            logger.debug("Loading bloom filter from {}", path);
             IFilter filter = null;
             try (DataInputStream stream = new DataInputStream(new BufferedInputStream(Files.newInputStream(path))))
             {
@@ -347,7 +351,7 @@ void initSummary(String dataFilePath, Set<Component> components, StatsMetadata s
 
     public static class ForRead extends SSTableReaderBuilder
     {
-        private final ValidationMetadata validationMetadata;
+        private volatile ValidationMetadata validationMetadata;
         private final boolean isOffline;
 
         public ForRead(Descriptor descriptor,
@@ -402,36 +406,22 @@ private void load(ValidationMetadata validation,
                           DiskOptimizationStrategy optimizationStrategy,
                           StatsMetadata statsMetadata) throws IOException
         {
-            if (!BloomFilter.shouldUseBloomFilter(metadata.params.bloomFilterFpChance))
-            {
-                // bf is disabled.
-                load(false, !isOffline, optimizationStrategy, statsMetadata, components);
-            }
-            else if (!components.contains(Component.PRIMARY_INDEX)) // What happens if filter component and primary index is missing?
-            {
-                // avoid any reading of the missing primary index component.
-                // this should only happen during StandaloneScrubber
-                load(false, !isOffline, optimizationStrategy, statsMetadata, components);
-            }
-            else if (!components.contains(Component.FILTER) || validation == null)
-            {
-                // bf is enabled, but filter component is missing.
-                load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components);
-            }
-            else if (!BloomFilter.isFPChanceDiffNeglectable(metadata.params.bloomFilterFpChance, validationMetadata.bloomFilterFPChance) && BloomFilter.recreateOnFPChanceChange)
-            {
-                // bf is enabled, but fp chance changed
-                load(!isOffline, !isOffline, optimizationStrategy, statsMetadata, components);
-            }
-            else
-            {
-                // bf is enabled and fp chance matches the currently configured value.
+            double currentFPChance = validation != null ? validation.bloomFilterFPChance : Double.NaN;
+            double desiredFPChance = metadata.params.bloomFilterFpChance;
+
+            if (SSTableReader.shouldLoadBloomFilter(descriptor, components, currentFPChance, desiredFPChance))
                 bf = loadBloomFilter(Paths.get(descriptor.filenameFor(Component.FILTER)), descriptor.version.hasOldBfFormat());
-                load(bf == null, !isOffline, optimizationStrategy, statsMetadata, components);
-            }
+
+            boolean recreateBloomFilter = bf == null && SSTableReader.mayRecreateBloomFilter(descriptor, components, currentFPChance, isOffline, desiredFPChance);
+            load(recreateBloomFilter, !isOffline, optimizationStrategy, statsMetadata, components);
+
             // if the filter was neither loaded nor created, or we encountered some problems, we fallback to pass-through filter
             if (bf == null)
+            {
                 bf = FilterFactory.AlwaysPresent;
+                logger.warn("Could not recreate or deserialize existing bloom filter, continuing with a pass-through " +
+                            "bloom filter but this will significantly impact reads performance");
+            }
         }
 
         /**
@@ -475,6 +465,7 @@ void load(boolean recreateBloomFilter,
                         SSTableReader.saveBloomFilter(descriptor, bf);
                         ValidationMetadata updatedValidationMetadata = new ValidationMetadata(validationMetadata.partitioner, metadata.params.bloomFilterFpChance);
                         descriptor.getMetadataSerializer().updateSSTableMetadata(descriptor, ImmutableMap.of(MetadataType.VALIDATION, updatedValidationMetadata));
+                        validationMetadata = updatedValidationMetadata;
                     }
                 }
             }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
index 20288fb189d0..b28778ba7197 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigFormat.java
@@ -73,6 +73,8 @@ public class BigFormat implements SSTableFormat
                                                                                Component.DIGEST,
                                                                                Component.CRC);
 
+    private final static Set<Component> PRIMARY_INDEX_COMPONENTS = ImmutableSet.of(Component.PRIMARY_INDEX);
+
     private BigFormat()
     {
 
@@ -126,6 +128,12 @@ public Set<Component> streamingComponents()
         return STREAMING_COMPONENTS;
     }
 
+    @Override
+    public Set<Component> primaryIndexComponents()
+    {
+        return PRIMARY_INDEX_COMPONENTS;
+    }
+
     static class WriterFactory extends SSTableWriter.Factory
     {
         @Override
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
index 433d59e57ed9..dc4495aee023 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
@@ -84,6 +84,10 @@ public class TrieIndexFormat implements SSTableFormat
                                                                                Component.FILTER,
                                                                                Component.DIGEST,
                                                                                Component.CRC);
+
+    private final static Set<Component> PRIMARY_INDEX_COMPONENTS = ImmutableSet.of(Component.PARTITION_INDEX,
+                                                                                   Component.ROW_INDEX);
+
     public static final TrieIndexFormat instance = new TrieIndexFormat();
     public static final Version latestVersion = new TrieIndexVersion(TrieIndexVersion.current_version);
     static final ReaderFactory readerFactory = new ReaderFactory();
@@ -143,6 +147,13 @@ public Set<Component> streamingComponents()
     {
         return STREAMING_COMPONENTS;
     }
+
+    @Override
+    public Set<Component> primaryIndexComponents()
+    {
+        return PRIMARY_INDEX_COMPONENTS;
+    }
+
     static class WriterFactory extends SSTableWriter.Factory
     {
         @Override
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
index dd25f1018a84..a1405dabff7d 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
@@ -38,7 +38,7 @@
 import javax.annotation.Nonnull;
 
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -69,6 +69,7 @@
 import org.apache.cassandra.io.sstable.SSTableIdentityIterator;
 import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableReaderBuilder;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SelectionReason;
 import org.apache.cassandra.io.sstable.format.SSTableReadsListener.SkippingReason;
@@ -884,8 +885,10 @@ private static IFilter deserializeBloomFilter(Descriptor descriptor, boolean old
         }
     }
 
-    private static IFilter recreateBloomFilter(Descriptor descriptor, TableMetadata metadata, long estimatedKeysCount, Map<MetadataType, MetadataComponent> sstableMetadata, double fpChance)
+    private static IFilter recreateBloomFilter(Descriptor descriptor, TableMetadata metadata, long estimatedKeysCount, double fpChance)
     {
+        logger.debug("Recreating bloom filter for {} with fpChance={}", descriptor, fpChance);
+
         if (estimatedKeysCount <= 0)
         {
             logger.warn("Cannot recreate bloom filter, cannot estimate number of keys");
@@ -920,9 +923,8 @@ private static IFilter recreateBloomFilter(Descriptor descriptor, TableMetadata
             }
 
             // Update the sstable metadata to contain the current FP chance
-            ValidationMetadata validation = new ValidationMetadata(metadata.partitioner.getClass().getCanonicalName(), fpChance);
-            sstableMetadata.put(MetadataType.VALIDATION, validation);
-            descriptor.getMetadataSerializer().rewriteSSTableMetadata(descriptor, sstableMetadata);
+            ValidationMetadata updatedValidationMetadata = new ValidationMetadata(metadata.partitioner.getClass().getCanonicalName(), fpChance);
+            descriptor.getMetadataSerializer().updateSSTableMetadata(descriptor, ImmutableMap.of(MetadataType.VALIDATION, updatedValidationMetadata));
             return bf;
         }
         catch (Throwable t)
@@ -947,55 +949,30 @@ private static IFilter recreateBloomFilter(Descriptor descriptor, TableMetadata
      * @param fpChance        the current FP chance taken from the table metadata
      */
     @VisibleForTesting
-    static @Nonnull
-    IFilter getBloomFilter(Descriptor descriptor, boolean loadIfNeeded, boolean recreateIfNeeded, TableMetadata metadata, long estimatedKeysCount, Map<MetadataType, MetadataComponent> sstableMetadata, double fpChance)
+    static @Nonnull IFilter getBloomFilter(Descriptor descriptor, Set<Component> components, ValidationMetadata validationMetadata, boolean isOffline, TableMetadata metadata, long estimatedKeysCount)
     {
-        if (Math.abs(1 - fpChance) <= fpChanceTolerance)
-        {
-            if (logger.isTraceEnabled())
-                logger.trace("Returning pass-through bloom filter, FP chance is equal to 1: {}", fpChance);
+        double currentFPChance = validationMetadata != null ? validationMetadata.bloomFilterFPChance : Double.NaN;
+        double desiredFPChance = metadata.params.bloomFilterFpChance;
 
-            return FilterFactory.AlwaysPresent;
-        }
-
-        ValidationMetadata validation = (ValidationMetadata) sstableMetadata.get(MetadataType.VALIDATION);
-        boolean fpChanged = Math.abs(fpChance - validation.bloomFilterFPChance) > fpChanceTolerance;
+        IFilter bf = null;
+        if (SSTableReader.shouldLoadBloomFilter(descriptor, components, currentFPChance, desiredFPChance))
+            bf = SSTableReaderBuilder.loadBloomFilter(Paths.get(descriptor.filenameFor(Component.FILTER)), descriptor.version.hasOldBfFormat());
 
-        if (loadIfNeeded && descriptor.fileFor(Component.FILTER).exists())
+        boolean recreateBloomFilter = bf == null && SSTableReader.mayRecreateBloomFilter(descriptor, components, currentFPChance, isOffline, desiredFPChance);
+        if (recreateBloomFilter)
         {
-            if (logger.isTraceEnabled())
-                logger.trace("Deserializing bloom filter");
-
-            IFilter bf = deserializeBloomFilter(descriptor, descriptor.version.hasOldBfFormat());
-            if (bf != null)
-                return bf;
+            bf = recreateBloomFilter(descriptor, metadata, estimatedKeysCount, desiredFPChance);
         }
 
-        String reason = fpChanged
-                        ? String.format("false positive chance changed from %f to %f", validation.bloomFilterFPChance, fpChance)
-                        : (!descriptor.fileFor(Component.FILTER).exists()
-                           ? "there is no bloom filter file"
-                           : "deserialization failed");
-
-        if (logger.isDebugEnabled())
-            logger.debug("Recreating bloom filter because {}", reason);
-
-        IFilter bf = recreateIfNeeded
-                     ? recreateBloomFilter(descriptor, metadata, estimatedKeysCount, sstableMetadata, fpChance)
-                     : FilterFactory.AlwaysPresent;
-        if (bf != null)
-            return bf;
-
-        logger.warn("Could not recreate or deserialize existing bloom filter, continuing with a pass-through " +
-                    "bloom filter but this will significantly impact reads performance");
-
-        return FilterFactory.AlwaysPresent;
-    }
+        // if the filter was neither loaded nor created, or we encountered some problems, we fallback to pass-through filter
+        if (bf == null)
+        {
+            bf = FilterFactory.AlwaysPresent;
+            logger.warn("Could not recreate or deserialize existing bloom filter, continuing with a pass-through " +
+                        "bloom filter but this will significantly impact reads performance");
+        }
 
-    public static boolean hasBloomFilter(double fpChance)
-    {
-        Preconditions.checkArgument(fpChance <= 1, "FP chance should be less or equal to 1: " + fpChance);
-        return Math.abs(1 - fpChance) > fpChanceTolerance;
+        return bf;
     }
 
     public static TrieIndexSSTableReader open(Descriptor descriptor, Set<Component> components, TableMetadataRef metadata, boolean validate, boolean isOffline)
@@ -1034,19 +1011,14 @@ public static TrieIndexSSTableReader open(Descriptor descriptor, Set<Component>
         long fileLength = descriptor.filenameFor(Component.DATA).length();
         logger.debug("Opening {} ({})", descriptor, FBUtilities.prettyPrintMemory(fileLength));
 
-        double fpChance = metadata.get().params.bloomFilterFpChance;
-
         FileHandle dataFH = null;
         FileHandle rowIdxFH = null;
         PartitionIndex partitionIndex = null;
         IFilter bloomFilter = null;
         boolean compressedData = descriptor.fileFor(Component.COMPRESSION_INFO).exists();
 
-        boolean loadBFIfNeeded = components.contains(Component.FILTER);
-        boolean recreatedBFIfNeeded = !isOffline;
-
         try (FileHandle.Builder dataFHBuilder = defaultDataHandleBuilder(descriptor).compressed(compressedData);
-             @Nonnull IFilter bf = getBloomFilter(descriptor, loadBFIfNeeded, recreatedBFIfNeeded, metadata.get(), statsMetadata.totalRows, sstableMetadata, fpChance))
+             @Nonnull IFilter bf = getBloomFilter(descriptor, components, validationMetadata, isOffline, metadata.get(), statsMetadata.totalRows))
         {
             TrieIndexSSTableReader sstable;
             dataFH = dataFHBuilder.complete();
@@ -1058,7 +1030,7 @@ public static TrieIndexSSTableReader open(Descriptor descriptor, Set<Component>
                      FileHandle.Builder rowIdxFHBuilder = defaultIndexHandleBuilder(descriptor, Component.ROW_INDEX))
                 {
                     rowIdxFH = rowIdxFHBuilder.complete();
-                    partitionIndex = PartitionIndex.load(partitionIdxFHBuilder, metadata.get().partitioner, loadBFIfNeeded && !hasBloomFilter(fpChance));
+                    partitionIndex = PartitionIndex.load(partitionIdxFHBuilder, metadata.get().partitioner, bloomFilter == FilterFactory.AlwaysPresent);
                     sstable = TrieIndexSSTableReader.internalOpen(descriptor,
                                                                   components,
                                                                   metadata,
diff --git a/src/java/org/apache/cassandra/utils/BloomFilter.java b/src/java/org/apache/cassandra/utils/BloomFilter.java
index a59a7fb14a9a..fa220283d7e7 100644
--- a/src/java/org/apache/cassandra/utils/BloomFilter.java
+++ b/src/java/org/apache/cassandra/utils/BloomFilter.java
@@ -19,6 +19,9 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import io.netty.util.concurrent.FastThreadLocal;
 import net.nicoulaj.compilecommand.annotations.Inline;
 import org.apache.cassandra.config.Config;
@@ -29,6 +32,8 @@
 
 public class BloomFilter extends WrappedSharedCloseable implements IFilter
 {
+    private final static Logger logger = LoggerFactory.getLogger(BloomFilter.class);
+
     /**
      * The maximum memory to be used by all loaded bloom filters. If the limit is exceeded, pass-through filter will be
      * used until some filters get unloaded.
@@ -185,7 +190,13 @@ public void addTo(Ref.IdentityCollection identities)
 
     public static boolean shouldUseBloomFilter(double fpChance)
     {
-        return Math.abs(1 - fpChance) > BloomFilter.fpChanceTolerance;
+        if (Math.abs(1 - fpChance) <= BloomFilter.fpChanceTolerance) {
+            if (logger.isTraceEnabled())
+                logger.trace("Returning pass-through bloom filter, FP chance is equal to 1: {}", fpChance);
+            return false;
+        }
+
+        return true;
     }
 
     public static boolean isFPChanceDiffNeglectable(double fpChance1, double fpChance2)
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 4328899abf86..2210335592d4 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -986,7 +986,11 @@ public void testBloomFilterIsCreatedOnLoad() throws IOException
         checkSSTableOpenedWithGivenFPChance(sstable, 1 - BloomFilter.fpChanceTolerance, true, numKeys, true);
 
         // missing primary index file should make BF fail to load and we should install the empty one
-        new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete();
+        if (sstable.descriptor.getFormat().getType() == SSTableFormat.Type.BIG)
+            new File(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX)).delete();
+        else
+            new File(sstable.descriptor.filenameFor(Component.PARTITION_INDEX)).delete();
+
         checkSSTableOpenedWithGivenFPChance(sstable, 0.05, false, numKeys, false);
     }
 
@@ -1006,11 +1010,11 @@ private void checkSSTableOpenedWithGivenFPChance(SSTableReader sstable, double f
             // make sure we wait enough - some JDK implementations use seconds granularity and we need to wait a bit to actually see the change
             Uninterruptibles.sleepUninterruptibly(1, Util.supportedMTimeGranularity);
 
-            target = SSTableReader.open(desc,
-                                        SSTableReader.discoverComponentsFor(desc),
-                                        TableMetadataRef.forOfflineTools(metadata),
-                                        false,
-                                        false);
+            target = desc.getFormat().getReaderFactory().open(desc,
+                                                              SSTableReader.discoverComponentsFor(desc),
+                                                              TableMetadataRef.forOfflineTools(metadata),
+                                                              false,
+                                                              false);
             IFilter bloomFilter = target.getBloomFilter();
             ValidationMetadata validationMetadata = getValidationMetadata(desc);
             Assert.assertNotNull(validationMetadata);

From 1c24e77b52ade9e4bb94016649b9492b7a918cdc Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <lewandowski.jacek@gmail.com>
Date: Tue, 22 Jun 2021 10:37:02 +0200
Subject: [PATCH 104/151] STAR-801: Adjust the failing assertion in
 SSTablesIteratedTest.testNonCompactTableWithStaticColumnValueMissingAndMulticellColumn

The test was introduced recently in OSS to verify the correctness when dealing with static columns.

The failing assertion expects 3 sstables to be read when selecting a non-static value which is, in fact, present in only one sstable. In OSS it is correct that 3 sstables are read, because all of them touch that partition; however, not all of them cover the particular clustering key being queried. The difference results from the read path optimizations applied in STAR-1, which make the filtering of sstables more precise so that more sstables can be skipped. That optimization comes into play in this test case.
---
 .../cql3/validation/miscellaneous/SSTablesIteratedTest.java     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
index 0b441d91cdb0..1689d23ecaae 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/miscellaneous/SSTablesIteratedTest.java
@@ -1031,7 +1031,7 @@ public void testNonCompactTableWithStaticColumnValueMissingAndMulticellColumn()
         executeAndCheck("SELECT s, v FROM %s WHERE pk = 3 AND c = 3", 3, row(3, set(1)));
         executeAndCheck("SELECT v FROM %s WHERE pk = 1 AND c = 1", 3, row(set(3)));
         executeAndCheck("SELECT v FROM %s WHERE pk = 2 AND c = 1", 2, row(set(3)));
-        executeAndCheck("SELECT v FROM %s WHERE pk = 3 AND c = 3", 3, row(set(1)));
+        executeAndCheck("SELECT v FROM %s WHERE pk = 3 AND c = 3", 1, row(set(1)));
         executeAndCheck("SELECT s FROM %s WHERE pk = 1", 3, row((Integer) null));
         executeAndCheck("SELECT s FROM %s WHERE pk = 2", 2, row(1), row(1));
         executeAndCheck("SELECT DISTINCT s FROM %s WHERE pk = 2", 2, row(1));

From d7674a8785bdf1f47f2651aebdfef5cc3a6c9a2a Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <lewandowski.jacek@gmail.com>
Date: Tue, 22 Jun 2021 13:53:43 +0200
Subject: [PATCH 105/151] STAR-801: Fix ScrubberTest failure (and scrubber
 itself)

The partition index iterator is handled a bit differently than the primary index iterator. In particular, it already reads data on load, i.e. when the scrubber is created, so we need to catch the exception thrown by that instantiation and just leave the index iterator null in that case.
---
 .../cassandra/db/compaction/Scrubber.java       | 13 ++++++++++---
 .../unit/org/apache/cassandra/db/ScrubTest.java | 17 +++++++++++++----
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
index 4cf322ac0dd1..0d84b9d13619 100644
--- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java
+++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java
@@ -132,9 +132,16 @@ public Scrubber(ColumnFamilyStore cfs,
                         ? sstable.openDataReader()
                         : sstable.openDataReader(CompactionManager.instance.getRateLimiter());
 
-        this.indexIterator = hasIndexFile
-                             ? openIndexIterator()
-                             : null;
+        try
+        {
+            this.indexIterator = hasIndexFile
+                                 ? openIndexIterator()
+                                 : null;
+        }
+        catch (RuntimeException ex)
+        {
+            outputHandler.warn("Detected corruption in the index file - cannot open index iterator", ex);
+        }
 
         this.scrubInfo = new ScrubInfo(dataFile, sstable, fileAccessLock.readLock());
 
diff --git a/test/unit/org/apache/cassandra/db/ScrubTest.java b/test/unit/org/apache/cassandra/db/ScrubTest.java
index c93d3f4116ad..76c4718538f1 100644
--- a/test/unit/org/apache/cassandra/db/ScrubTest.java
+++ b/test/unit/org/apache/cassandra/db/ScrubTest.java
@@ -241,6 +241,15 @@ public void testScrubCorruptedCounterRow() throws IOException, WriteTimeoutExcep
         assertOrderedAll(cfs, scrubResult.goodRows);
     }
 
+    private String primaryIndexPath(SSTableReader reader)
+    {
+        if (reader.descriptor.getFormat().getType() == SSTableFormat.Type.BIG)
+            return reader.descriptor.filenameFor(Component.PRIMARY_INDEX);
+        if (reader.descriptor.getFormat().getType() == SSTableFormat.Type.BTI)
+            return reader.descriptor.filenameFor(Component.PARTITION_INDEX);
+        else throw new IllegalArgumentException("Unsupported sstable format: " + reader.descriptor.getFormat().getType());
+    }
+
     @Test
     public void testScrubCorruptedRowInSmallFile() throws Throwable
     {
@@ -260,7 +269,7 @@ public void testScrubCorruptedIndex() throws Throwable
     {
         // overwrite a part of the index with garbage
         testCorruptionInSmallFile((sstable, keys) ->
-                                  overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                  overrideWithGarbage(primaryIndexPath(sstable),
                                                       5,
                                                       6,
                                                       (byte) 0x7A),
@@ -273,7 +282,7 @@ public void testScrubCorruptedIndexOnOpen() throws Throwable
     {
         // overwrite the whole index with garbage
         testCorruptionInSmallFile((sstable, keys) ->
-                                  overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                  overrideWithGarbage(primaryIndexPath(sstable),
                                                       0,
                                                       60,
                                                       (byte) 0x7A),
@@ -291,7 +300,7 @@ public void testScrubCorruptedRowCorruptedIndex() throws Throwable
                                                           ByteBufferUtil.bytes(keys[2]),
                                                           ByteBufferUtil.bytes(keys[3]),
                                                           (byte) 0x7A);
-                                      overrideWithGarbage(sstable.descriptor.filenameFor(Component.PRIMARY_INDEX),
+                                      overrideWithGarbage(primaryIndexPath(sstable),
                                                           5,
                                                           6,
                                                           (byte) 0x7A);
@@ -483,7 +492,7 @@ public void testScrubOutOfOrder() throws IOException
             if (new File(desc.filenameFor(Component.COMPRESSION_INFO)).exists())
                 components.add(Component.COMPRESSION_INFO);
             components.add(Component.DATA);
-            components.add(Component.PRIMARY_INDEX);
+            components.addAll(desc.getFormat().primaryIndexComponents());
             components.add(Component.FILTER);
             components.add(Component.STATS);
             components.add(Component.SUMMARY);

From 5e0a9a6e8b06d03fabb2883b842b05f2d49df81d Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Mon, 28 Jun 2021 10:18:38 +0100
Subject: [PATCH 106/151] STAR-228: OR and IN support for SAI (#206)

* STAR-228: OR and IN support for SAI

* Review changes
---
 build.xml                                     |   5 +-
 src/antlr/Parser.g                            |  17 +-
 .../cassandra/cql3/SingleColumnRelation.java  |   2 +-
 .../apache/cassandra/cql3/WhereClause.java    | 415 ++++++++-
 .../ClusteringColumnRestrictions.java         |  11 +-
 .../restrictions/CustomIndexExpression.java   |   2 +-
 .../cql3/restrictions/IndexRestrictions.java  |  13 +
 .../restrictions/MultiColumnRestriction.java  |   8 +-
 .../PartitionKeySingleRestrictionSet.java     |  21 +-
 .../cql3/restrictions/Restriction.java        |   3 +-
 .../cql3/restrictions/RestrictionSet.java     | 129 +--
 .../restrictions/RestrictionSetWrapper.java   |   5 +-
 .../restrictions/SingleColumnRestriction.java |  15 +-
 .../restrictions/StatementRestrictions.java   | 875 +++++++++++-------
 .../cql3/restrictions/TokenFilter.java        |   3 +-
 .../cql3/restrictions/TokenRestriction.java   |   2 +-
 .../statements/ModificationStatement.java     |   2 +-
 .../cql3/statements/SelectStatement.java      |  36 +-
 .../cql3/statements/UpdateStatement.java      |  28 +-
 .../schema/CreateViewStatement.java           |  17 +-
 .../apache/cassandra/db/filter/RowFilter.java | 427 ++++++---
 .../org/apache/cassandra/index/Index.java     |   8 +
 .../cassandra/index/sai/ColumnContext.java    |   3 +
 .../index/sai/StorageAttachedIndexGroup.java  |   6 +
 .../cassandra/index/sai/plan/Expression.java  |   5 +-
 .../cassandra/index/sai/plan/FilterTree.java  | 126 +--
 .../cassandra/index/sai/plan/Operation.java   | 380 +++-----
 .../index/sai/plan/QueryController.java       |  29 +-
 .../plan/StorageAttachedIndexQueryPlan.java   |  12 +-
 .../plan/StorageAttachedIndexSearcher.java    |  29 +-
 .../cql3/WhereClauseExpressionTreeTest.java   | 165 ++++
 .../db/AbstractReadCommandBuilder.java        |   6 +-
 .../org/apache/cassandra/db/CleanupTest.java  |   2 +-
 .../apache/cassandra/db/ReadCommandTest.java  |  12 +-
 .../cassandra/db/filter/RowFilterTest.java    |   8 +-
 .../index/sai/cql/AllowFilteringTest.java     |  13 -
 .../index/sai/cql/ComplexQueryTest.java       | 176 ++++
 .../sai/cql/RandomisedComplexQueryTest.java   | 450 +++++++++
 .../index/sai/plan/OperationTest.java         | 218 ++---
 .../cassandra/index/sasi/SASIIndexTest.java   |   8 +-
 40 files changed, 2532 insertions(+), 1160 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
 rename {src/java => test/unit}/org/apache/cassandra/db/AbstractReadCommandBuilder.java (97%)
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
 create mode 100644 test/unit/org/apache/cassandra/index/sai/cql/RandomisedComplexQueryTest.java

diff --git a/build.xml b/build.xml
index abe4b10c03e1..6c38e4ac77de 100644
--- a/build.xml
+++ b/build.xml
@@ -658,7 +658,8 @@
           <dependency groupId="org.hamcrest" artifactId="hamcrest" version="2.2" scope="test"/>
           <dependency groupId="org.agrona" artifactId="agrona" version="0.9.26" />
           <dependency groupId="org.apache.lucene" artifactId="lucene-core" version="7.5.0" />
-          <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2">
+          <dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
+          <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" version="2.1.2" scope="test">
               <exclusion groupId="junit" artifactId="junit"/>
           </dependency>
         </dependencyManagement>
@@ -737,6 +738,7 @@
         <dependency groupId="org.jacoco" artifactId="org.jacoco.ant"/>
 
         <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
+        <dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
         <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
       </artifact:pom>
 
@@ -823,6 +825,7 @@
         <dependency groupId="org.hdrhistogram" artifactId="HdrHistogram"/>
         <dependency groupId="org.agrona" artifactId="agrona"/>
         <dependency groupId="org.apache.lucene" artifactId="lucene-core"/>
+        <dependency groupId="com.bpodgursky" artifactId="jbool_expressions" version="1.14" scope="test"/>
         <dependency groupId="com.carrotsearch.randomizedtesting" artifactId="randomizedtesting-runner" scope="test"/>
         <dependency groupId="org.hamcrest" artifactId="hamcrest" scope="test"/>
         <dependency groupId="com.esri.geometry" artifactId="esri-geometry-api"/>
diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g
index 30e1074a4cfe..cc386e178403 100644
--- a/src/antlr/Parser.g
+++ b/src/antlr/Parser.g
@@ -443,14 +443,24 @@ sident returns [Selectable.RawIdentifier id]
 
 whereClause returns [WhereClause.Builder clause]
     @init{ $clause = new WhereClause.Builder(); }
-    : relationOrExpression[$clause] (K_AND relationOrExpression[$clause])*
+    : expression[$clause]
     ;
 
-relationOrExpression [WhereClause.Builder clause]
-    : relation[$clause]
+expression [WhereClause.Builder clause]
+    : primaryExpression[$clause] (booleanOp[$clause] primaryExpression[$clause])*
+    ;
+
+primaryExpression [WhereClause.Builder clause]
+    : '(' {clause.startEnclosure(); } expression[$clause] ')' { clause.endEnclosure(); }
+    | relation[$clause]
     | customIndexExpression[$clause]
     ;
 
+booleanOp [WhereClause.Builder clause]
+    : op=K_AND { clause.setCurrentOperator(op.getText()); }
+    | op=K_OR { clause.setCurrentOperator(op.getText()); }
+    ;
+
 customIndexExpression [WhereClause.Builder clause]
     @init{QualifiedName name = new QualifiedName();}
     : 'expr(' idxName[name] ',' t=term ')' { clause.add(new CustomIndexExpression(name, t));}
@@ -1713,7 +1723,6 @@ relation[WhereClause.Builder clauses]
       | type=relationType tupleMarker=markerForTuple /* (a, b, c) >= ? */
           { $clauses.add(MultiColumnRelation.createNonInRelation(ids, type, tupleMarker)); }
       )
-    | '(' relation[$clauses] ')'
     ;
 
 containsOperator returns [Operator o]
diff --git a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
index cf1cb69066e6..d4e1915709f8 100644
--- a/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
+++ b/src/java/org/apache/cassandra/cql3/SingleColumnRelation.java
@@ -150,7 +150,7 @@ public String toCQLString()
             entityAsString = String.format("%s[%s]", entityAsString, mapKey);
 
         if (isIN())
-            return String.format("%s IN %s", entityAsString, Tuples.tupleToString(inValues));
+            return String.format("%s IN %s", entityAsString, inValues == null ? value : Tuples.tupleToString(inValues));
 
         return String.format("%s %s %s", entityAsString, relationType, value);
     }
diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index 16116a2b8ad2..9c69d6d8ae68 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -17,30 +17,33 @@
  */
 package org.apache.cassandra.cql3;
 
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Deque;
 import java.util.List;
 import java.util.Objects;
+import java.util.stream.Collectors;
 
-import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
 
 import org.antlr.runtime.RecognitionException;
 import org.apache.cassandra.cql3.restrictions.CustomIndexExpression;
 
-import static java.lang.String.join;
-
-import static com.google.common.collect.Iterables.concat;
-import static com.google.common.collect.Iterables.transform;
-
+/**
+ * This is a parsed representation of the expression following the WHERE element
+ * in a CQL statement. It is parsed into an arbitrary sized expression tree consisting
+ * of <code>ExpressionElement</code> elements.
+ */
 public final class WhereClause
 {
-    private static final WhereClause EMPTY = new WhereClause(new Builder());
+    private static final WhereClause EMPTY = new WhereClause(ExpressionElement.EMPTY);
 
-    public final List<Relation> relations;
-    public final List<CustomIndexExpression> expressions;
+    private final ExpressionElement rootElement;
 
-    private WhereClause(Builder builder)
+    private WhereClause(ExpressionElement rootElement)
     {
-        relations = builder.relations.build();
-        expressions = builder.expressions.build();
+        this.rootElement = rootElement;
     }
 
     public static WhereClause empty()
@@ -50,26 +53,24 @@ public static WhereClause empty()
 
     public boolean containsCustomExpressions()
     {
-        return !expressions.isEmpty();
+        return rootElement.containsCustomExpressions();
+    }
+
+    public ExpressionElement root()
+    {
+        return rootElement;
     }
 
     /**
      * Renames identifiers in all relations
+     *
      * @param from the old identifier
-     * @param to the new identifier
+     * @param to   the new identifier
      * @return a new WhereClause with with "from" replaced by "to" in all relations
      */
     public WhereClause renameIdentifier(ColumnIdentifier from, ColumnIdentifier to)
     {
-        WhereClause.Builder builder = new WhereClause.Builder();
-
-        relations.stream()
-                 .map(r -> r.renameIdentifier(from, to))
-                 .forEach(builder::add);
-
-        expressions.forEach(builder::add);
-
-        return builder.build();
+        return new WhereClause(rootElement.rename(from, to));
     }
 
     public static WhereClause parse(String cql) throws RecognitionException
@@ -90,9 +91,7 @@ public String toString()
      */
     public String toCQLString()
     {
-        return join(" AND ",
-                    concat(transform(relations, Relation::toCQLString),
-                           transform(expressions, CustomIndexExpression::toCQLString)));
+        return rootElement.toString();
     }
 
     @Override
@@ -105,35 +104,379 @@ public boolean equals(Object o)
             return false;
 
         WhereClause wc = (WhereClause) o;
-        return relations.equals(wc.relations) && expressions.equals(wc.expressions);
+        return rootElement.toString().equals(wc.rootElement.toString());
     }
 
     @Override
     public int hashCode()
     {
-        return Objects.hash(relations, expressions);
+        return Objects.hash(rootElement.toString());
     }
 
+    /**
+     * This receives fragments from the parse operation and builds them into the final <code>WhereClause</code>.
+     *
+     * The received fragments are:
+     * <ul>
+     *     <li><code>add(Relation)</code> - adds a new relation to the current <code>ParseState</code></li>
+     *     <li><code>add(CustomIndexExpression)</code> - adds a new custom index expression to the current <code>ParseState</code></li>
+     *     <li><code>startEnclosure</code> - responds to a '(' and pushes the current <code>ParseState</code> onto the precedence stack</li>
+     *     <li><code>endEnclosure</code> - responds to a ')' and pulls the <code>ParseState</code> associated with the
+     *     matching <code>startEnclosure</code>. It will pull any intermediate precedence states off the stack until it
+     *     reaches the matching enclosure state</li>
+     *     <li><code>setCurrentOperator</code> - changes the operator in the <code>ParseState</code>. If this new operator is
+     *     of a higher precedence than the current operator, the last expression is popped from the <code>ParseState</code> and
+     *     the state is pushed onto the precedence stack</li>
+     *     <li><code>build</code> - always the last call. This builds the resultant <code>ExpressionElement</code> tree from the
+     *     precedence stack and the current <code>ParseState</code></li>
+     * </ul>
+     */
     public static final class Builder
     {
-        ImmutableList.Builder<Relation> relations = new ImmutableList.Builder<>();
-        ImmutableList.Builder<CustomIndexExpression> expressions = new ImmutableList.Builder<>();
+        private final Deque<ParseState> precedenceStack = new ArrayDeque<>();
+        private ParseState parseState = new ParseState();
+
+        public void add(Relation relation)
+        {
+            parseState.push(new RelationElement(relation));
+        }
+
+        public void add(CustomIndexExpression customIndexExpression)
+        {
+            parseState.push(new CustomIndexExpressionElement(customIndexExpression));
+        }
+
+        public void startEnclosure()
+        {
+            pushStack(PushState.ENCLOSURE);
+        }
+
+        public void endEnclosure()
+        {
+            do
+            {
+                ExpressionElement expression = generate();
+                parseState = precedenceStack.pop();
+                parseState.push(expression);
+            }
+            while (parseState.enclosure == PushState.PRECEDENCE);
+        }
+
+        public void setCurrentOperator(String value)
+        {
+            Operator operator = Operator.valueOf(value.toUpperCase());
+            if (parseState.isChangeOfOperator(operator))
+            {
+                if (parseState.higherPrecedence(operator))
+                {
+                    // Where we have a = 1 OR b = 1 AND c = 1. When the operator changes to AND
+                    // we need to pop b = 1 from the parseState, push the parseState containing
+                    // a = 1 OR and then add b = 1 to the new parseState
+                    ExpressionElement last = parseState.pop();
+                    pushStack(PushState.PRECEDENCE);
+                    parseState.push(last);
+                }
+                else
+                {
+                    ExpressionElement element = generate();
+                    if (!precedenceStack.isEmpty() && precedenceStack.peek().enclosure == PushState.PRECEDENCE)
+                        parseState = precedenceStack.pop();
+                    else
+                        parseState.clear();
+                    parseState.push(element);
+                }
+            }
+            parseState.operator = operator;
+        }
+
+        public WhereClause build()
+        {
+            while (!precedenceStack.isEmpty())
+            {
+                ExpressionElement expression = generate();
+                parseState = precedenceStack.pop();
+                parseState.push(expression);
+            }
+            return new WhereClause(generate());
+        }
+
+        private void pushStack(PushState enclosure)
+        {
+            parseState.enclosure = enclosure;
+            precedenceStack.push(parseState);
+            parseState = new ParseState();
+        }
+
+        private ExpressionElement generate()
+        {
+            if (parseState.size() == 1)
+                return parseState.pop();
+            return parseState.asContainer();
+        }
+    }
+
+    /**
+     * Represents the state of the parsing operation at a point of enclosure or precedence change.
+     */
+    public static class ParseState
+    {
+        Operator operator = Operator.NONE;
+        PushState enclosure = PushState.NONE;
+        Deque<ExpressionElement> expressionElements = new ArrayDeque<>();
+
+        void push(ExpressionElement element)
+        {
+            expressionElements.add(element);
+        }
+
+        ExpressionElement pop()
+        {
+            return expressionElements.removeLast();
+        }
 
-        public Builder add(Relation relation)
+        int size()
         {
-            relations.add(relation);
+            return expressionElements.size();
+        }
+
+        ParseState clear()
+        {
+            expressionElements.clear();
             return this;
         }
 
-        public Builder add(CustomIndexExpression expression)
+        boolean isChangeOfOperator(Operator operator)
+        {
+            return this.operator != operator && expressionElements.size() > 1;
+        }
+
+        boolean higherPrecedence(Operator operator)
+        {
+            return operator.compareTo(this.operator) > 0;
+        }
+
+        ContainerElement asContainer()
+        {
+            return operator == Operator.OR ? new OrElement().add(expressionElements) : new AndElement().add(expressionElements);
+        }
+    }
+
+    enum Operator
+    {
+        NONE, OR, AND;
+
+        public String joinValue()
+        {
+            return " " + name() + " ";
+        }
+    }
+
+    /**
+     * This is the reason why the <code>ParseState</code> was pushed onto the precedence stack.
+     */
+    enum PushState
+    {
+        NONE, PRECEDENCE, ENCLOSURE
+    }
+
+    public static abstract class ExpressionElement
+    {
+        private static final ExpressionElement EMPTY = new EmptyElement();
+
+        public List<ContainerElement> operations()
+        {
+            return Collections.emptyList();
+        }
+
+        public boolean isDisjunction()
+        {
+            return false;
+        }
+
+        public List<Relation> relations()
+        {
+            return Collections.emptyList();
+        }
+
+        public List<CustomIndexExpression> expressions()
+        {
+            return Collections.emptyList();
+        }
+
+        public boolean containsCustomExpressions()
+        {
+            return false;
+        }
+
+        public abstract String toEncapsulatedString();
+
+        public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
         {
-            expressions.add(expression);
             return this;
         }
+    }
 
-        public WhereClause build()
+    public static abstract class VariableElement extends ExpressionElement
+    {
+        @Override
+        public String toEncapsulatedString()
+        {
+            return toString();
+        }
+    }
+
+    public static class EmptyElement extends VariableElement
+    {
+        @Override
+        public String toString()
+        {
+            return "";
+        }
+    }
+
+    public static class RelationElement extends VariableElement
+    {
+        private final Relation relation;
+
+        public RelationElement(Relation relation)
+        {
+            this.relation = relation;
+        }
+
+        @Override
+        public List<Relation> relations()
+        {
+            return Lists.newArrayList(relation);
+        }
+
+        @Override
+        public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
+        {
+            return new RelationElement(relation.renameIdentifier(from, to));
+        }
+
+        @Override
+        public String toString()
+        {
+            return relation.toString();
+        }
+    }
+
+    public static class CustomIndexExpressionElement extends VariableElement
+    {
+        private final CustomIndexExpression customIndexExpression;
+
+        public CustomIndexExpressionElement(CustomIndexExpression customIndexExpression)
+        {
+            this.customIndexExpression = customIndexExpression;
+        }
+
+        @Override
+        public List<CustomIndexExpression> expressions()
+        {
+            return Lists.newArrayList(customIndexExpression);
+        }
+
+        @Override
+        public boolean containsCustomExpressions()
+        {
+            return true;
+        }
+
+        @Override
+        public String toString()
         {
-            return new WhereClause(this);
+            return customIndexExpression.toString();
+        }
+    }
+
+    public static abstract class ContainerElement extends ExpressionElement
+    {
+        protected final List<ExpressionElement> children = new ArrayList<>();
+
+        @Override
+        public List<ContainerElement> operations()
+        {
+            return children.stream()
+                           .filter(c -> (c instanceof ContainerElement))
+                           .map(r -> ((ContainerElement) r))
+                           .collect(Collectors.toList());
+        }
+
+        public ContainerElement add(Deque<ExpressionElement> children)
+        {
+            this.children.addAll(children);
+            return this;
+        }
+
+        protected abstract Operator operator();
+
+        @Override
+        public List<Relation> relations()
+        {
+            return children.stream()
+                           .filter(c -> (c instanceof RelationElement))
+                           .map(r -> (((RelationElement) r).relation))
+                           .collect(Collectors.toList());
+        }
+
+        @Override
+        public List<CustomIndexExpression> expressions()
+        {
+            return children.stream()
+                           .filter(c -> (c instanceof CustomIndexExpressionElement))
+                           .map(r -> (((CustomIndexExpressionElement) r).customIndexExpression))
+                           .collect(Collectors.toList());
+        }
+
+        @Override
+        public boolean containsCustomExpressions()
+        {
+            return children.stream().anyMatch(ExpressionElement::containsCustomExpressions);
+        }
+
+        @Override
+        public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
+        {
+            ContainerElement element = isDisjunction() ? new OrElement() : new AndElement();
+            children.stream().map(c -> c.rename(from, to)).forEach(c -> element.children.add(c));
+            return element;
+        }
+
+        @Override
+        public String toString()
+        {
+            return children.stream().map(ExpressionElement::toEncapsulatedString).collect(Collectors.joining(operator().joinValue()));
+        }
+
+        @Override
+        public String toEncapsulatedString()
+        {
+            return children.stream().map(ExpressionElement::toEncapsulatedString).collect(Collectors.joining(operator().joinValue(), "(", ")"));
+        }
+    }
+
+    public static class AndElement extends ContainerElement
+    {
+        @Override
+        protected Operator operator()
+        {
+            return Operator.AND;
+        }
+    }
+
+    public static class OrElement extends ContainerElement
+    {
+        @Override
+        protected Operator operator()
+        {
+            return Operator.OR;
+        }
+
+        @Override
+        public boolean isDisjunction()
+        {
+            return true;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
index 1e11e54db0dd..af4e81465fd9 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/ClusteringColumnRestrictions.java
@@ -127,7 +127,7 @@ public boolean needFiltering()
     }
 
     @Override
-    public void addToRowFilter(RowFilter filter,
+    public void addToRowFilter(RowFilter.Builder filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options) throws InvalidRequestException
     {
@@ -180,6 +180,11 @@ private Builder(TableMetadata table, boolean allowFiltering, IndexRegistry index
         }
 
         public ClusteringColumnRestrictions.Builder addRestriction(Restriction restriction)
+        {
+            return addRestriction(restriction, false);
+        }
+
+        public ClusteringColumnRestrictions.Builder addRestriction(Restriction restriction, boolean isDisjunction)
         {
             SingleRestriction newRestriction = (SingleRestriction) restriction;
             boolean isEmpty = restrictions.isEmpty();
@@ -189,7 +194,7 @@ public ClusteringColumnRestrictions.Builder addRestriction(Restriction restricti
                 SingleRestriction lastRestriction = restrictions.lastRestriction();
                 ColumnMetadata lastRestrictionStart = lastRestriction.getFirstColumn();
                 ColumnMetadata newRestrictionStart = newRestriction.getFirstColumn();
-                restrictions.addRestriction(newRestriction);
+                restrictions.addRestriction(newRestriction, isDisjunction);
 
                 checkFalse(lastRestriction.isSlice() && newRestrictionStart.position() > lastRestrictionStart.position(),
                            "Clustering column \"%s\" cannot be restricted (preceding column \"%s\" is restricted by a non-EQ relation)",
@@ -203,7 +208,7 @@ public ClusteringColumnRestrictions.Builder addRestriction(Restriction restricti
             }
             else
             {
-                restrictions.addRestriction(newRestriction);
+                restrictions.addRestriction(newRestriction, isDisjunction);
             }
 
             return this;
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
index ff6a2a6e25af..7debf1ae3e5d 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/CustomIndexExpression.java
@@ -47,7 +47,7 @@ public void prepareValue(TableMetadata table, AbstractType<?> expressionType, Va
         value.collectMarkerSpecification(boundNames);
     }
 
-    public void addToRowFilter(RowFilter filter, TableMetadata table, QueryOptions options)
+    public void addToRowFilter(RowFilter.Builder filter, TableMetadata table, QueryOptions options)
     {
         filter.addCustomIndexExpression(table,
                                         table.indexes
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
index 10608b85c1c2..812e1816bf90 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/IndexRestrictions.java
@@ -151,6 +151,19 @@ private boolean needFiltering(Index.Group indexGroup, boolean hasMultipleContain
         return false;
     }
 
+    /** Returns {@code true} if any regular or external restriction reports that it needs no filtering for {@code indexGroup}. */
+    public boolean indexBeingUsed(Index.Group indexGroup)
+    {
+        for (Restrictions restrictions : regularRestrictions)
+            if (!restrictions.needsFiltering(indexGroup))
+                return true;
+        for (CustomIndexExpression restriction : externalRestrictions)
+            if (!restriction.needsFiltering(indexGroup))
+                return true;
+
+        return false;
+    }
+
     static InvalidRequestException invalidIndex(QualifiedName indexName, TableMetadata table)
     {
         return new InvalidRequestException(String.format(INVALID_INDEX, indexName.getName(), table));
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
index 3a771333ac6f..453b7e54693d 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/MultiColumnRestriction.java
@@ -211,7 +211,7 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public final void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public final void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             Tuples.Value t = ((Tuples.Value) value.bind(options));
             List<ByteBuffer> values = t.getElements();
@@ -265,7 +265,7 @@ protected boolean isSupportedBy(Index index, ColumnMetadata column)
         }
 
         @Override
-        public final void addToRowFilter(RowFilter filter,
+        public final void addToRowFilter(RowFilter.Builder filter,
                                          IndexRegistry indexRegistry,
                                          QueryOptions options)
         {
@@ -491,7 +491,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public final void addToRowFilter(RowFilter filter,
+        public final void addToRowFilter(RowFilter.Builder filter,
                                          IndexRegistry indexRegistry,
                                          QueryOptions options)
         {
@@ -577,7 +577,7 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public final void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public final void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             throw new UnsupportedOperationException("Secondary indexes do not support IS NOT NULL restrictions");
         }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
index f6ecc6923c7c..9e843ccce7f9 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/PartitionKeySingleRestrictionSet.java
@@ -123,7 +123,7 @@ public boolean isInclusive(Bound b)
     }
 
     @Override
-    public void addToRowFilter(RowFilter filter,
+    public void addToRowFilter(RowFilter.Builder filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options)
     {
@@ -166,12 +166,19 @@ private Builder(ClusteringComparator clusteringComparator) {
             this.clusteringComparator = clusteringComparator;
         }
 
-        public Builder addRestriction(Restriction restriction) {
+        public Builder addRestriction(Restriction restriction)
+        {
             restrictions.add(restriction);
             return this;
         }
 
-        public PartitionKeyRestrictions build() {
+        public PartitionKeyRestrictions build()
+        {
+            return build(false);
+        }
+
+        public PartitionKeyRestrictions build(boolean isDisjunction)
+        {
             RestrictionSet.Builder restrictionSet = RestrictionSet.builder();
 
             for (int i = 0; i < restrictions.size(); i++) {
@@ -181,13 +188,14 @@ public PartitionKeyRestrictions build() {
                 if (restriction.isOnToken())
                     return buildWithTokens(restrictionSet, i);
 
-                restrictionSet.addRestriction((SingleRestriction) restriction);
+                restrictionSet.addRestriction((SingleRestriction) restriction, isDisjunction);
             }
 
             return buildPartitionKeyRestrictions(restrictionSet);
         }
 
-        private PartitionKeyRestrictions buildWithTokens(RestrictionSet.Builder restrictionSet, int i) {
+        private PartitionKeyRestrictions buildWithTokens(RestrictionSet.Builder restrictionSet, int i)
+        {
             PartitionKeyRestrictions merged = buildPartitionKeyRestrictions(restrictionSet);
 
             for (; i < restrictions.size(); i++) {
@@ -199,7 +207,8 @@ private PartitionKeyRestrictions buildWithTokens(RestrictionSet.Builder restrict
             return merged;
         }
 
-        private PartitionKeySingleRestrictionSet buildPartitionKeyRestrictions(RestrictionSet.Builder restrictionSet) {
+        private PartitionKeySingleRestrictionSet buildPartitionKeyRestrictions(RestrictionSet.Builder restrictionSet)
+        {
             return new PartitionKeySingleRestrictionSet(restrictionSet.build(), clusteringComparator);
         }
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
index f523d45096e9..bf181c085646 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/Restriction.java
@@ -18,7 +18,6 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.util.List;
-import java.util.function.Consumer;
 
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -89,7 +88,7 @@ public default boolean isOnToken()
      * @param indexRegistry the index registry
      * @param options the query options
      */
-    public void addToRowFilter(RowFilter filter,
+    public void addToRowFilter(RowFilter.Builder filter,
                                IndexRegistry indexRegistry,
                                QueryOptions options);
 }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
index 8548ea2b8b9d..8f9f1e604126 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSet.java
@@ -18,7 +18,10 @@
 package org.apache.cassandra.cql3.restrictions;
 
 import java.util.*;
-import java.util.function.Consumer;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.Multimap;
 
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.functions.Function;
@@ -50,7 +53,7 @@ private EmptyRestrictionSet()
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options) throws InvalidRequestException
+        public void addToRowFilter(RowFilter.Builder rowFilter, IndexRegistry indexRegistry, QueryOptions options) throws InvalidRequestException
         {
         }
 
@@ -149,7 +152,7 @@ private static final class DefaultRestrictionSet extends RestrictionSet
          * The values as returned from {@link #restrictions()}.
          */
         private final List<SingleRestriction> restrictionsValues;
-        private final Map<ColumnMetadata, SingleRestriction> restrictionsHashMap;
+        private final Multimap<ColumnMetadata, SingleRestriction> restrictionsMap;
         private final int hasBitmap;
         private final int restrictionForKindBitmap;
         private static final int maskHasContains = 1;
@@ -159,7 +162,7 @@ private static final class DefaultRestrictionSet extends RestrictionSet
         private static final int maskHasMultiColumnSlice = 16;
         private static final int maskHasMultipleContains = 32;
 
-        private DefaultRestrictionSet(Map<ColumnMetadata, SingleRestriction> restrictions,
+        private DefaultRestrictionSet(Multimap<ColumnMetadata, SingleRestriction> restrictions,
                                       boolean hasMultiColumnRestrictions)
         {
             this.restrictionsKeys = new ArrayList<>(restrictions.keySet());
@@ -175,52 +178,55 @@ private DefaultRestrictionSet(Map<ColumnMetadata, SingleRestriction> restriction
             for (int i = 0; i < restrictionsKeys.size(); i++)
             {
                 ColumnMetadata col = restrictionsKeys.get(i);
-                SingleRestriction singleRestriction = restrictions.get(col);
-
-                if (singleRestriction.isContains())
-                {
-                    bitmap |= maskHasContains;
-                    ContainsRestriction contains = (ContainsRestriction) singleRestriction;
-                    numberOfContains += (contains.numberOfValues() + contains.numberOfKeys() + contains.numberOfEntries());
-                }
+                Collection<SingleRestriction> columnRestrictions = restrictions.get(col);
 
-                if (hasMultiColumnRestrictions)
+                for (SingleRestriction singleRestriction : columnRestrictions)
                 {
-                    if (singleRestriction.equals(previous))
-                        continue;
-                    previous = singleRestriction;
+                    if (singleRestriction.isContains())
+                    {
+                        bitmap |= maskHasContains;
+                        ContainsRestriction contains = (ContainsRestriction) singleRestriction;
+                        numberOfContains += (contains.numberOfValues() + contains.numberOfKeys() + contains.numberOfEntries());
+                    }
+
+                    if (hasMultiColumnRestrictions)
+                    {
+                        if (singleRestriction.equals(previous))
+                            continue;
+                        previous = singleRestriction;
+                    }
+
+                    restrictionForBitmap |= 1 << col.kind.ordinal();
+
+                    sortedRestrictions.add(singleRestriction);
+
+                    if (singleRestriction.isSlice())
+                    {
+                        bitmap |= maskHasSlice;
+                        if (singleRestriction.isMultiColumn())
+                            bitmap |= maskHasMultiColumnSlice;
+                    }
+
+                    if (singleRestriction.isIN())
+                        bitmap |= maskHasIN;
+                    else if (!singleRestriction.isEQ())
+                        bitmap &= ~maskHasOnlyEqualityRestrictions;
                 }
-
-                restrictionForBitmap |= 1 << col.kind.ordinal();
-
-                sortedRestrictions.add(singleRestriction);
-
-                if (singleRestriction.isSlice())
-                {
-                    bitmap |= maskHasSlice;
-                    if (singleRestriction.isMultiColumn())
-                        bitmap |= maskHasMultiColumnSlice;
-                }
-
-                if (singleRestriction.isIN())
-                    bitmap |= maskHasIN;
-                else if (!singleRestriction.isEQ())
-                    bitmap &= ~maskHasOnlyEqualityRestrictions;
             }
             this.hasBitmap = bitmap | (numberOfContains > 1 ? maskHasMultipleContains : 0);
             this.restrictionForKindBitmap = restrictionForBitmap;
 
             this.restrictionsValues = Collections.unmodifiableList(sortedRestrictions);
-            this.restrictionsHashMap = restrictions;
+            this.restrictionsMap = restrictions;
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter,
+        public void addToRowFilter(RowFilter.Builder rowFilter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options) throws InvalidRequestException
         {
-            for (SingleRestriction restriction : restrictionsHashMap.values())
-                restriction.addToRowFilter(filter, indexRegistry, options);
+            for (SingleRestriction restriction : restrictionsMap.values())
+                restriction.addToRowFilter(rowFilter, indexRegistry, options);
         }
 
         @Override
@@ -257,14 +263,13 @@ public boolean hasRestrictionFor(ColumnMetadata.Kind kind)
         @Override
         public Set<Restriction> getRestrictions(ColumnMetadata columnDef)
         {
-            Restriction existing = restrictionsHashMap.get(columnDef);
-            return existing == null ? Collections.emptySet() : Collections.singleton(existing);
+            return restrictionsMap.get(columnDef).stream().collect(Collectors.<Restriction>toSet()); // empty set when nothing restricts columnDef
         }
 
         @Override
         public boolean hasSupportingIndex(IndexRegistry indexRegistry)
         {
-            for (SingleRestriction restriction : restrictionsHashMap.values())
+            for (SingleRestriction restriction : restrictionsMap.values())
                 if (restriction.hasSupportingIndex(indexRegistry))
                     return true;
             return false;
@@ -273,7 +278,7 @@ public boolean hasSupportingIndex(IndexRegistry indexRegistry)
         @Override
         public boolean needsFiltering(Index.Group indexGroup)
         {
-            for (SingleRestriction restriction : restrictionsHashMap.values())
+            for (SingleRestriction restriction : restrictionsMap.values())
                 if (restriction.needsFiltering(indexGroup))
                     return true;
 
@@ -378,7 +383,7 @@ public static Builder builder()
 
     public static final class Builder
     {
-        private final Map<ColumnMetadata, SingleRestriction> newRestrictions = new HashMap<>();
+        private final Multimap<ColumnMetadata, SingleRestriction> newRestrictions = ArrayListMultimap.create();
         private boolean multiColumn = false;
 
         private ColumnMetadata lastRestrictionColumn;
@@ -388,27 +393,41 @@ private Builder()
         {
         }
 
-        public void addRestriction(SingleRestriction restriction)
+        public void addRestriction(SingleRestriction restriction, boolean isDisjunction)
         {
             List<ColumnMetadata> columnDefs = restriction.getColumnDefs();
-            Set<SingleRestriction> existingRestrictions = getRestrictions(newRestrictions, columnDefs);
 
-            if (existingRestrictions.isEmpty())
+            if (isDisjunction)
             {
-                addRestrictionForColumns(columnDefs, restriction);
+                // Disjunction (OR): restrictions on the same column must stay separate, so
+                // nothing is merged; the new restriction is appended (replace == false) to
+                // whatever is already registered for each of its columns.
+                addRestrictionForColumns(columnDefs, restriction, false);
             }
             else
             {
-                for (SingleRestriction existing : existingRestrictions)
+                // Conjunction (AND): merge the new restriction with every restriction already
+                // registered on its columns. Each merged result is stored with replace == true,
+                // which drops the previous entries for those columns before adding it.
+                Set<SingleRestriction> existingRestrictions = getRestrictions(newRestrictions, columnDefs);
+
+                if (existingRestrictions.isEmpty())
+                {
+                    addRestrictionForColumns(columnDefs, restriction, false);
+                }
+                else
                 {
-                    SingleRestriction newRestriction = existing.mergeWith(restriction);
+                    for (SingleRestriction existing : existingRestrictions)
+                    {
+                        SingleRestriction newRestriction = existing.mergeWith(restriction);
 
-                    addRestrictionForColumns(columnDefs, newRestriction);
+                        addRestrictionForColumns(columnDefs, newRestriction, true);
+                    }
                 }
             }
         }
 
-        private void addRestrictionForColumns(List<ColumnMetadata> columnDefs, SingleRestriction restriction)
+        private void addRestrictionForColumns(List<ColumnMetadata> columnDefs, SingleRestriction restriction, boolean replace)
         {
             for (int i = 0; i < columnDefs.size(); i++)
             {
@@ -418,21 +437,25 @@ private void addRestrictionForColumns(List<ColumnMetadata> columnDefs, SingleRes
                     lastRestrictionColumn = column;
                     lastRestriction = restriction;
                 }
+                // If the restriction is a merger of new restriction and existing restrictions then
+                // we need to remove the existing restrictions for the column before adding it
+                if (replace)
+                    newRestrictions.removeAll(column);
                 newRestrictions.put(column, restriction);
             }
 
             multiColumn |= restriction.isMultiColumn();
         }
 
-        private static Set<SingleRestriction> getRestrictions(Map<ColumnMetadata, SingleRestriction> restrictions,
+        private static Set<SingleRestriction> getRestrictions(Multimap<ColumnMetadata, SingleRestriction> restrictions,
                                                               List<ColumnMetadata> columnDefs)
         {
             Set<SingleRestriction> set = new HashSet<>();
             for (int i = 0; i < columnDefs.size(); i++)
             {
-                SingleRestriction existing = restrictions.get(columnDefs.get(i));
-                if (existing != null)
-                    set.add(existing);
+                // Multimap#get returns an empty collection (never null) for a column that has
+                // no restrictions, so no guard is needed: addAll is then simply a no-op.
+                set.addAll(restrictions.get(columnDefs.get(i)));
             }
             return set;
         }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
index 967e1bad8f3c..b6c943ab065a 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/RestrictionSetWrapper.java
@@ -19,7 +19,6 @@
 
 import java.util.List;
 import java.util.Set;
-import java.util.function.Consumer;
 
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -45,11 +44,11 @@ class RestrictionSetWrapper implements Restrictions
     }
 
     @Override
-    public void addToRowFilter(RowFilter filter,
+    public void addToRowFilter(RowFilter.Builder rowFilter,
                                IndexRegistry indexRegistry,
                                QueryOptions options)
     {
-        restrictions.addToRowFilter(filter, indexRegistry, options);
+        restrictions.addToRowFilter(rowFilter, indexRegistry, options);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
index 0af4d2e730e1..2a7e0f54111f 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/SingleColumnRestriction.java
@@ -21,15 +21,14 @@
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
-import java.util.function.Consumer;
 
+import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.cql3.*;
 import org.apache.cassandra.cql3.Term.Terminal;
 import org.apache.cassandra.cql3.functions.Function;
 import org.apache.cassandra.cql3.statements.Bound;
 import org.apache.cassandra.db.MultiCBuilder;
-import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.serializers.ListSerializer;
@@ -163,7 +162,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter,
+        public void addToRowFilter(RowFilter.Builder filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
@@ -227,7 +226,7 @@ public MultiCBuilder appendTo(MultiCBuilder builder, QueryOptions options)
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter,
+        public void addToRowFilter(RowFilter.Builder filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
@@ -405,7 +404,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             for (Bound b : Bound.values())
                 if (hasBound(b))
@@ -495,7 +494,7 @@ public SingleRestriction doMergeWith(SingleRestriction otherRestriction)
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+        public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
         {
             for (ByteBuffer value : bindAndGet(values, options))
                 filter.add(columnDef, Operator.CONTAINS, value);
@@ -634,7 +633,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter,
+        public void addToRowFilter(RowFilter.Builder filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
@@ -710,7 +709,7 @@ MultiColumnRestriction toMultiColumnRestriction()
         }
 
         @Override
-        public void addToRowFilter(RowFilter filter,
+        public void addToRowFilter(RowFilter.Builder filter,
                                    IndexRegistry indexRegistry,
                                    QueryOptions options)
         {
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index 41750521fdaa..d506f3b26e00 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -19,7 +19,6 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
-import java.util.function.Consumer;
 
 import com.google.common.base.Joiner;
 import com.google.common.collect.ImmutableList;
@@ -42,7 +41,9 @@
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.utils.btree.BTreeSet;
 
-import static org.apache.cassandra.cql3.statements.RequestValidations.*;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkFalse;
+import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
+import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest;
 
 /**
  * The restrictions corresponding to the relations specified on the where-clause of CQL query.
@@ -50,9 +51,9 @@
 public class StatementRestrictions
 {
     public static final String REQUIRES_ALLOW_FILTERING_MESSAGE =
-            "Cannot execute this query as it might involve data filtering and " +
-            "thus may have unpredictable performance. If you want to execute " +
-            "this query despite the performance unpredictability, use ALLOW FILTERING";
+    "Cannot execute this query as it might involve data filtering and " +
+    "thus may have unpredictable performance. If you want to execute " +
+    "this query despite the performance unpredictability, use ALLOW FILTERING";
 
     public static final String HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE =
     "Column '%s' has an index but does not support the operators specified in the query. " +
@@ -64,10 +65,8 @@ public class StatementRestrictions
 
     public static final String INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE = "Index on column %s does not support LIKE restrictions.";
 
-    /**
-     * The type of statement
-     */
-    private final StatementType type;
+    public static final String INDEX_DOES_NOT_SUPPORT_DISJUNCTION =
+    "An index involved in this query does not support disjunctive queries using the OR operator";
 
     /**
      * The Column Family meta data
@@ -96,6 +95,11 @@ public class StatementRestrictions
      */
     private final IndexRestrictions filterRestrictions;
 
+    /**
+     * <code>true</code> if these restrictions form part of an OR query, <code>false</code> otherwise
+     */
+    private boolean isDisjunction;
+
     /**
      * <code>true</code> if the secondary index need to be queried, <code>false</code> otherwise
      */
@@ -112,48 +116,28 @@ public class StatementRestrictions
      */
     private boolean hasRegularColumnsRestrictions;
 
+    private final List<StatementRestrictions> children;
+
     /**
      * Creates a new empty <code>StatementRestrictions</code>.
      *
-     * @param type the type of statement
      * @param table the column family meta data
      * @return a new empty <code>StatementRestrictions</code>.
      */
-    public static StatementRestrictions empty(StatementType type, TableMetadata table)
+    public static StatementRestrictions empty(TableMetadata table)
     {
-        return new StatementRestrictions(type, table, false);
+        return new StatementRestrictions(table, false);
     }
 
-    private StatementRestrictions(StatementType type, TableMetadata table, boolean allowFiltering)
+    private StatementRestrictions(TableMetadata table, boolean allowFiltering)
     {
-        this.type = type;
         this.table = table;
         this.partitionKeyRestrictions = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator()).build();
         this.clusteringColumnsRestrictions = ClusteringColumnRestrictions.builder(table, allowFiltering).build();
         this.nonPrimaryKeyRestrictions = RestrictionSet.builder().build();
         this.notNullColumns = ImmutableSet.of();
         this.filterRestrictions = IndexRestrictions.of();
-    }
-
-    private StatementRestrictions(StatementType type,
-                                  TableMetadata table,
-                                  PartitionKeyRestrictions partitionKeyRestrictions,
-                                  ClusteringColumnRestrictions clusteringColumnsRestrictions,
-                                  RestrictionSet nonPrimaryKeyRestrictions,
-                                  ImmutableSet<ColumnMetadata> notNullColumns,
-                                  boolean usesSecondaryIndexing,
-                                  boolean isKeyRange,
-                                  IndexRestrictions filterRestrictions)
-    {
-        this.type = type;
-        this.table = table;
-        this.partitionKeyRestrictions = partitionKeyRestrictions;
-        this.clusteringColumnsRestrictions = clusteringColumnsRestrictions;
-        this.nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictions;
-        this.notNullColumns = notNullColumns;
-        this.usesSecondaryIndexing = usesSecondaryIndexing;
-        this.isKeyRange = isKeyRange;
-        this.filterRestrictions = filterRestrictions;
+        this.children = Collections.emptyList();
     }
 
     /**
@@ -169,15 +153,17 @@ public StatementRestrictions addIndexRestrictions(Restrictions restrictions)
                                                                   .add(restrictions)
                                                                   .build();
 
-        return new StatementRestrictions(type,
-                                         table,
+        return new StatementRestrictions(table,
                                          partitionKeyRestrictions,
                                          clusteringColumnsRestrictions,
                                          nonPrimaryKeyRestrictions,
                                          notNullColumns,
+                                         isDisjunction,
                                          usesSecondaryIndexing,
                                          isKeyRange,
-                                         newIndexRestrictions);
+                                         hasRegularColumnsRestrictions,
+                                         newIndexRestrictions,
+                                         children);
     }
 
     /**
@@ -194,196 +180,483 @@ public StatementRestrictions addExternalRestrictions(Iterable<CustomIndexExpress
         for (CustomIndexExpression restriction : restrictions)
             newIndexRestrictions.add(restriction);
 
-        return new StatementRestrictions(type,
-                                         table,
+        return new StatementRestrictions(table,
                                          partitionKeyRestrictions,
                                          clusteringColumnsRestrictions,
                                          nonPrimaryKeyRestrictions,
                                          notNullColumns,
+                                         isDisjunction,
                                          usesSecondaryIndexing,
                                          isKeyRange,
-                                         newIndexRestrictions.build());
+                                         hasRegularColumnsRestrictions,
+                                         newIndexRestrictions.build(),
+                                         children);
     }
 
-    public StatementRestrictions(StatementType type,
-                                 TableMetadata table,
-                                 WhereClause whereClause,
-                                 VariableSpecifications boundNames,
-                                 boolean selectsOnlyStaticColumns,
-                                 boolean allowFiltering,
-                                 boolean forView)
+    private StatementRestrictions(TableMetadata table,
+                                  PartitionKeyRestrictions partitionKeyRestrictions,
+                                  ClusteringColumnRestrictions clusteringColumnsRestrictions,
+                                  RestrictionSet nonPrimaryKeyRestrictions,
+                                  ImmutableSet<ColumnMetadata> notNullColumns,
+                                  boolean isDisjunction,
+                                  boolean usesSecondaryIndexing,
+                                  boolean isKeyRange,
+                                  boolean hasRegularColumnsRestrictions,
+                                  IndexRestrictions filterRestrictions,
+                                  List<StatementRestrictions> children)
     {
-        this(type, table, whereClause, boundNames, selectsOnlyStaticColumns, type.allowUseOfSecondaryIndices(), allowFiltering, forView);
+        this.table = table;
+        this.partitionKeyRestrictions = partitionKeyRestrictions;
+        this.clusteringColumnsRestrictions = clusteringColumnsRestrictions;
+        this.nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictions;
+        this.notNullColumns = notNullColumns;
+        this.filterRestrictions = filterRestrictions;
+        this.isDisjunction = isDisjunction;
+        this.usesSecondaryIndexing = usesSecondaryIndexing;
+        this.isKeyRange = isKeyRange;
+        this.hasRegularColumnsRestrictions = hasRegularColumnsRestrictions;
+        this.children = children;
     }
 
-    /*
-     * We want to override allowUseOfSecondaryIndices flag from the StatementType for MV statements
-     * to avoid initing the Keyspace and SecondaryIndexManager.
-     */
-    public StatementRestrictions(StatementType type,
-                                 TableMetadata table,
-                                 WhereClause whereClause,
-                                 VariableSpecifications boundNames,
-                                 boolean selectsOnlyStaticColumns,
-                                 boolean allowUseOfSecondaryIndices,
-                                 boolean allowFiltering,
-                                 boolean forView)
+    public static StatementRestrictions create(StatementType type,
+                                               TableMetadata table,
+                                               WhereClause whereClause,
+                                               VariableSpecifications boundNames,
+                                               boolean selectsOnlyStaticColumns,
+                                               boolean allowFiltering,
+                                               boolean forView)
     {
-        this.type = type;
-        this.table = table;
+        return new Builder(type,
+                           table,
+                           whereClause,
+                           boundNames,
+                           selectsOnlyStaticColumns,
+                           type.allowUseOfSecondaryIndices(),
+                           allowFiltering,
+                           forView).build();
+    }
 
-        IndexRegistry indexRegistry = null;
+    public static StatementRestrictions create(StatementType type,
+                                               TableMetadata table,
+                                               WhereClause whereClause,
+                                               VariableSpecifications boundNames,
+                                               boolean selectsOnlyStaticColumns,
+                                               boolean allowUseOfSecondaryIndices,
+                                               boolean allowFiltering,
+                                               boolean forView)
+    {
+        return new Builder(type,
+                           table,
+                           whereClause,
+                           boundNames,
+                           selectsOnlyStaticColumns,
+                           allowUseOfSecondaryIndices,
+                           allowFiltering,
+                           forView).build();
+    }
 
-        // We want to avoid opening the keyspace during view construction
-        // since we're parsing these for restore and the base table or keyspace might not exist in the current schema.
-        if (allowUseOfSecondaryIndices && type.allowUseOfSecondaryIndices())
-            indexRegistry = IndexRegistry.obtain(table);
+    /**
+     * Build a <code>StatementRestrictions</code> from a <code>WhereClause</code> for a given
+     * <code>StatementType</code>, <code>TableMetadata</code> and <code>VariableSpecifications</code>
+     *
+     * The validation rules for whether the <code>StatementRestrictions</code> are valid depend on a
+     * number of considerations, including whether indexes are being used and whether filtering is being
+     * used.
+     */
+    public static class Builder
+    {
+        private final StatementType type;
+        private final TableMetadata table;
+        private final WhereClause whereClause;
+        private final VariableSpecifications boundNames;
+        private final boolean selectsOnlyStaticColumns;
+        private final boolean allowUseOfSecondaryIndices;
+        private final boolean allowFiltering;
+        private final boolean forView;
+
+        public Builder(StatementType type,
+                       TableMetadata table,
+                       WhereClause whereClause,
+                       VariableSpecifications boundNames,
+                       boolean selectsOnlyStaticColumns,
+                       boolean allowUseOfSecondaryIndices,
+                       boolean allowFiltering,
+                       boolean forView)
+        {
+            this.type = type;
+            this.table = table;
+            this.whereClause = whereClause;
+            this.boundNames = boundNames;
+            this.selectsOnlyStaticColumns = selectsOnlyStaticColumns;
+            this.allowUseOfSecondaryIndices = allowUseOfSecondaryIndices;
+            this.allowFiltering = allowFiltering;
+            this.forView = forView;
+        }
 
-        PartitionKeySingleRestrictionSet.Builder partitionKeyRestrictionSet = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator());
-        ClusteringColumnRestrictions.Builder clusteringColumnsRestrictionSet = ClusteringColumnRestrictions.builder(table, allowFiltering, indexRegistry);
-        RestrictionSet.Builder nonPrimaryKeyRestrictionSet = RestrictionSet.builder();
+        public StatementRestrictions build()
+        {
+            IndexRegistry indexRegistry = null;
 
-        ImmutableSet.Builder<ColumnMetadata> notNullColumnsBuilder = ImmutableSet.builder();
+            // We want to avoid opening the keyspace during view construction
+            // since we're parsing these for restore and the base table or keyspace might not exist in the current schema.
+            if (allowUseOfSecondaryIndices && type.allowUseOfSecondaryIndices())
+                indexRegistry = IndexRegistry.obtain(table);
 
-        /*
-         * WHERE clause. For a given entity, rules are:
-         *   - EQ relation conflicts with anything else (including a 2nd EQ)
-         *   - Can't have more than one LT(E) relation (resp. GT(E) relation)
-         *   - IN relation are restricted to row keys (for now) and conflicts with anything else (we could
-         *     allow two IN for the same entity but that doesn't seem very useful)
-         *   - The value_alias cannot be restricted in any way (we don't support wide rows with indexed value
-         *     in CQL so far)
-         */
-        for (Relation relation : whereClause.relations)
+            return doBuild(whereClause.root(), indexRegistry);
+        }
+
+        StatementRestrictions doBuild(WhereClause.ExpressionElement element, IndexRegistry indexRegistry)
         {
-            if (relation.operator() == Operator.IS_NOT)
-            {
-                if (!forView)
-                    throw invalidRequest("Unsupported restriction: %s", relation);
+            PartitionKeySingleRestrictionSet.Builder partitionKeyRestrictionSet = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator());
+            ClusteringColumnRestrictions.Builder clusteringColumnsRestrictionSet = ClusteringColumnRestrictions.builder(table, allowFiltering, indexRegistry);
+            RestrictionSet.Builder nonPrimaryKeyRestrictionSet = RestrictionSet.builder();
+            ImmutableSet.Builder<ColumnMetadata> notNullColumnsBuilder = ImmutableSet.builder();
 
-                notNullColumnsBuilder.addAll(relation.toRestriction(table, boundNames).getColumnDefs());
-            }
-            else
+            for (Relation relation : element.relations())
             {
-                Restriction restriction = relation.toRestriction(table, boundNames);
+                if (relation.operator() == Operator.IS_NOT)
+                {
+                    if (!forView)
+                        throw invalidRequest("Unsupported restriction: %s", relation);
 
-                if (relation.isLIKE() && (!type.allowUseOfSecondaryIndices() || !restriction.hasSupportingIndex(indexRegistry)))
+                    notNullColumnsBuilder.addAll(relation.toRestriction(table, boundNames).getColumnDefs());
+                }
+                else
                 {
-                    if (getColumnsWithUnsupportedIndexRestrictions(table, ImmutableList.of(restriction)).isEmpty())
+                    Restriction restriction = relation.toRestriction(table, boundNames);
+
+                    if (relation.isLIKE() && (!type.allowUseOfSecondaryIndices() || !restriction.hasSupportingIndex(indexRegistry)))
+                    {
+                        if (getColumnsWithUnsupportedIndexRestrictions(table, ImmutableList.of(restriction)).isEmpty())
+                        {
+                            throw invalidRequest("LIKE restriction is only supported on properly indexed columns. %s is not valid.", relation.toString());
+                        }
+                        else
+                        {
+                            throw invalidRequest(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, restriction.getFirstColumn());
+                        }
+                    }
+
+                    ColumnMetadata def = restriction.getFirstColumn();
+                    if (def.isPartitionKey())
                     {
-                        throw invalidRequest("LIKE restriction is only supported on properly indexed columns. %s is not valid.", relation.toString());
+                        partitionKeyRestrictionSet.addRestriction(restriction);
+                    }
+                    else if (def.isClusteringColumn())
+                    {
+                        clusteringColumnsRestrictionSet.addRestriction(restriction, element.isDisjunction());
                     }
                     else
                     {
-                        throw invalidRequest(INDEX_DOES_NOT_SUPPORT_LIKE_MESSAGE, restriction.getFirstColumn());
+                        nonPrimaryKeyRestrictionSet.addRestriction((SingleRestriction) restriction, element.isDisjunction());
                     }
                 }
+            }
+
+            PartitionKeyRestrictions partitionKeyRestrictions = partitionKeyRestrictionSet.build();
+            ClusteringColumnRestrictions clusteringColumnsRestrictions = clusteringColumnsRestrictionSet.build();
+            RestrictionSet nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictionSet.build();
+            ImmutableSet<ColumnMetadata> notNullColumns = notNullColumnsBuilder.build();
+            boolean hasRegularColumnsRestrictions = nonPrimaryKeyRestrictions.hasRestrictionFor(ColumnMetadata.Kind.REGULAR);
+            boolean usesSecondaryIndexing = false;
+            boolean isKeyRange = false;
 
-                ColumnMetadata def = restriction.getFirstColumn();
-                if (def.isPartitionKey())
+            boolean hasQueriableClusteringColumnIndex = false;
+            boolean hasQueriableIndex = false;
+
+            IndexRestrictions.Builder filterRestrictionsBuilder = IndexRestrictions.builder();
+
+            if (allowUseOfSecondaryIndices)
+            {
+                if (element.containsCustomExpressions())
                 {
-                    partitionKeyRestrictionSet.addRestriction(restriction);
+                    CustomIndexExpression customExpression = prepareCustomIndexExpression(element.expressions(),
+                                                                                          boundNames,
+                                                                                          indexRegistry);
+                    filterRestrictionsBuilder.add(customExpression);
                 }
-                else if (def.isClusteringColumn())
+
+                hasQueriableClusteringColumnIndex = clusteringColumnsRestrictions.hasSupportingIndex(indexRegistry);
+                hasQueriableIndex = element.containsCustomExpressions()
+                                    || hasQueriableClusteringColumnIndex
+                                    || partitionKeyRestrictions.hasSupportingIndex(indexRegistry)
+                                    || nonPrimaryKeyRestrictions.hasSupportingIndex(indexRegistry);
+            }
+
+            // At this point, the select statement is fully constructed, but we still have a few things to validate
+            if (!type.allowPartitionKeyRanges())
+            {
+                checkFalse(partitionKeyRestrictions.isOnToken(),
+                           "The token function cannot be used in WHERE clauses for %s statements", type);
+
+                if (partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table))
+                    throw invalidRequest("Some partition key parts are missing: %s",
+                                         Joiner.on(", ").join(getPartitionKeyUnrestrictedComponents(partitionKeyRestrictions)));
+
+                // slice query
+                checkFalse(partitionKeyRestrictions.hasSlice(),
+                           "Only EQ and IN relation are supported on the partition key (unless you use the token() function)"
+                           + " for %s statements", type);
+            }
+            else
+            {
+                // If there are no partition restrictions or there's only a token restriction, we have to set a key range
+                if (partitionKeyRestrictions.isOnToken())
+                    isKeyRange = true;
+
+                if (partitionKeyRestrictions.isEmpty() && partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table))
                 {
-                    clusteringColumnsRestrictionSet.addRestriction(restriction);
+                    isKeyRange = true;
+                    usesSecondaryIndexing = hasQueriableIndex;
                 }
-                else
+
+                // If there is a queriable index, no special condition is required on the other restrictions.
+                // But we still need to know 2 things:
+                // - If we don't have a queriable index, is the query ok
+                // - Is it queriable without 2ndary index, which is always more efficient
+                // If a component of the partition key is restricted by a relation, all preceding
+                // components must have an EQ. Only the last partition key component can be in an IN relation.
+                // If partition key restrictions exist and this is a disjunction then we may need filtering
+                if (partitionKeyRestrictions.needFiltering(table) || (!partitionKeyRestrictions.isEmpty() && element.isDisjunction()))
                 {
-                    nonPrimaryKeyRestrictionSet.addRestriction((SingleRestriction) restriction);
+                    if (!allowFiltering && !forView && !hasQueriableIndex)
+                        throw new InvalidRequestException(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+
+                    isKeyRange = true;
+                    usesSecondaryIndexing = hasQueriableIndex;
                 }
             }
-        }
 
-        this.partitionKeyRestrictions = partitionKeyRestrictionSet.build();
-        this.clusteringColumnsRestrictions = clusteringColumnsRestrictionSet.build();
-        this.nonPrimaryKeyRestrictions = nonPrimaryKeyRestrictionSet.build();
-        this.notNullColumns = notNullColumnsBuilder.build();
-        this.hasRegularColumnsRestrictions = nonPrimaryKeyRestrictions.hasRestrictionFor(ColumnMetadata.Kind.REGULAR);
+            // Some but not all of the partition key columns have been specified or they form part of a disjunction;
+            // hence we need to turn these restrictions into a row filter.
+            if (usesSecondaryIndexing || partitionKeyRestrictions.needFiltering(table) || element.isDisjunction())
+                filterRestrictionsBuilder.add(partitionKeyRestrictions);
 
-        boolean hasQueriableClusteringColumnIndex = false;
-        boolean hasQueriableIndex = false;
+            if (selectsOnlyStaticColumns && !clusteringColumnsRestrictions.isEmpty())
+            {
+                // If the only updated/deleted columns are static, then we don't need clustering columns.
+                // And in fact, unless it is an INSERT, we reject if clustering columns are provided as that
+                // suggests something unintended. For instance, given:
+                //   CREATE TABLE t (k int, v int, s int static, PRIMARY KEY (k, v))
+                // it can make sense to do:
+                //   INSERT INTO t(k, v, s) VALUES (0, 1, 2)
+                // but both
+                //   UPDATE t SET s = 3 WHERE k = 0 AND v = 1
+                //   DELETE v FROM t WHERE k = 0 AND v = 1
+                // sound like you don't really understand what you are doing.
+                if (type.isDelete() || type.isUpdate())
+                    throw invalidRequest("Invalid restrictions on clustering columns since the %s statement modifies only static columns",
+                                         type);
+            }
 
-        IndexRestrictions.Builder filterRestrictionsBuilder = IndexRestrictions.builder();
+            // Now process and validate the clustering column restrictions
+            checkFalse(!type.allowClusteringColumnSlices() && clusteringColumnsRestrictions.hasSlice(),
+                       "Slice restrictions are not supported on the clustering columns in %s statements", type);
 
-        if (allowUseOfSecondaryIndices)
-        {
-            if (whereClause.containsCustomExpressions())
+            if (!type.allowClusteringColumnSlices()
+                && (!table.isCompactTable() || (table.isCompactTable() && clusteringColumnsRestrictions.isEmpty())))
             {
-                CustomIndexExpression customExpression = prepareCustomIndexExpression(whereClause.expressions,
-                                                                                      boundNames,
-                                                                                      indexRegistry);
-                filterRestrictionsBuilder.add(customExpression);
+                if (!selectsOnlyStaticColumns && (table.clusteringColumns().size() != clusteringColumnsRestrictions.size()))
+                    throw invalidRequest("Some clustering keys are missing: %s",
+                                         Joiner.on(", ").join(getUnrestrictedClusteringColumns(clusteringColumnsRestrictions)));
             }
+            else
+            {
+                checkFalse(clusteringColumnsRestrictions.hasContains() && !hasQueriableIndex && !allowFiltering,
+                           "Clustering columns can only be restricted with CONTAINS with a secondary index or filtering");
 
-            hasQueriableClusteringColumnIndex = clusteringColumnsRestrictions.hasSupportingIndex(indexRegistry);
-            hasQueriableIndex = whereClause.containsCustomExpressions()
-                    || hasQueriableClusteringColumnIndex
-                    || partitionKeyRestrictions.hasSupportingIndex(indexRegistry)
-                    || nonPrimaryKeyRestrictions.hasSupportingIndex(indexRegistry);
-        }
+                if (!clusteringColumnsRestrictions.isEmpty() && clusteringColumnsRestrictions.needFiltering())
+                {
+                    if (hasQueriableIndex || forView)
+                    {
+                        usesSecondaryIndexing = true;
+                    }
+                    else if (!allowFiltering)
+                    {
+                        List<ColumnMetadata> clusteringColumns = table.clusteringColumns();
+                        List<ColumnMetadata> restrictedColumns = clusteringColumnsRestrictions.getColumnDefs();
+
+                        for (int i = 0, m = restrictedColumns.size(); i < m; i++)
+                        {
+                            ColumnMetadata clusteringColumn = clusteringColumns.get(i);
+                            ColumnMetadata restrictedColumn = restrictedColumns.get(i);
+
+                            if (!clusteringColumn.equals(restrictedColumn))
+                            {
+                                throw invalidRequest("PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
+                                                     restrictedColumn.name,
+                                                     clusteringColumn.name);
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Covers indexes on the first clustering column (among others).
+            if (isKeyRange && hasQueriableClusteringColumnIndex)
+                usesSecondaryIndexing = true;
 
-        // At this point, the select statement if fully constructed, but we still have a few things to validate
-        processPartitionKeyRestrictions(hasQueriableIndex, allowFiltering, forView);
+            if (usesSecondaryIndexing || clusteringColumnsRestrictions.needFiltering())
+                filterRestrictionsBuilder.add(clusteringColumnsRestrictions);
 
-        // Some but not all of the partition key columns have been specified;
-        // hence we need turn these restrictions into a row filter.
-        if (usesSecondaryIndexing || partitionKeyRestrictions.needFiltering(table))
-            filterRestrictionsBuilder.add(partitionKeyRestrictions);
+            // Even if usesSecondaryIndexing is false at this point, we'll still have to use one if
+            // there are restrictions not covered by the PK.
+            if (!nonPrimaryKeyRestrictions.isEmpty())
+            {
+                if (!type.allowNonPrimaryKeyInWhereClause())
+                {
+                    Collection<ColumnIdentifier> nonPrimaryKeyColumns =
+                    ColumnMetadata.toIdentifiers(nonPrimaryKeyRestrictions.getColumnDefs());
 
-        if (selectsOnlyStaticColumns && hasClusteringColumnsRestrictions())
+                    throw invalidRequest("Non PRIMARY KEY columns found in where clause: %s ",
+                                         Joiner.on(", ").join(nonPrimaryKeyColumns));
+                }
+                if (hasQueriableIndex)
+                    usesSecondaryIndexing = true;
+                else if (!allowFiltering)
+                    throwRequiresAllowFilteringError(table, clusteringColumnsRestrictions, nonPrimaryKeyRestrictions);
+
+                filterRestrictionsBuilder.add(nonPrimaryKeyRestrictions);
+            }
+
+            if (usesSecondaryIndexing)
+                checkFalse(partitionKeyRestrictions.hasIN(),
+                           "Select on indexed columns and with IN clause for the PRIMARY KEY are not supported");
+
+            ImmutableList.Builder<StatementRestrictions> children = ImmutableList.builder();
+
+            for (WhereClause.ContainerElement container : element.operations())
+                children.add(doBuild(container, indexRegistry));
+
+            return new StatementRestrictions(table,
+                                             partitionKeyRestrictions,
+                                             clusteringColumnsRestrictions,
+                                             nonPrimaryKeyRestrictions,
+                                             notNullColumns,
+                                             element.isDisjunction(),
+                                             usesSecondaryIndexing,
+                                             isKeyRange,
+                                             hasRegularColumnsRestrictions,
+                                             filterRestrictionsBuilder.build(),
+                                             children.build());
+        }
+
+        private Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table,
+                                                                               ClusteringColumnRestrictions clusteringColumnsRestrictions,
+                                                                               RestrictionSet nonPrimaryKeyRestrictions)
         {
-            // If the only updated/deleted columns are static, then we don't need clustering columns.
-            // And in fact, unless it is an INSERT, we reject if clustering colums are provided as that
-            // suggest something unintended. For instance, given:
-            //   CREATE TABLE t (k int, v int, s int static, PRIMARY KEY (k, v))
-            // it can make sense to do:
-            //   INSERT INTO t(k, v, s) VALUES (0, 1, 2)
-            // but both
-            //   UPDATE t SET s = 3 WHERE k = 0 AND v = 1
-            //   DELETE v FROM t WHERE k = 0 AND v = 1
-            // sounds like you don't really understand what your are doing.
-            if (type.isDelete() || type.isUpdate())
-                throw invalidRequest("Invalid restrictions on clustering columns since the %s statement modifies only static columns",
-                                     type);
+            return getColumnsWithUnsupportedIndexRestrictions(table, Iterables.concat(clusteringColumnsRestrictions.restrictions(), nonPrimaryKeyRestrictions.restrictions()));
         }
 
-        processClusteringColumnsRestrictions(hasQueriableIndex,
-                                             selectsOnlyStaticColumns,
-                                             forView,
-                                             allowFiltering);
+        private Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table, Iterable<Restriction> restrictions)
+        {
+            IndexRegistry indexRegistry = IndexRegistry.obtain(table);
+            if (indexRegistry.listIndexes().isEmpty())
+                return Collections.emptySet();
+
+            ImmutableSet.Builder<ColumnMetadata> builder = ImmutableSet.builder();
 
-        // Covers indexes on the first clustering column (among others).
-        if (isKeyRange && hasQueriableClusteringColumnIndex)
-            usesSecondaryIndexing = true;
+            for (Restriction restriction : restrictions)
+            {
+                if (!restriction.hasSupportingIndex(indexRegistry))
+                {
+                    for (Index index : indexRegistry.listIndexes())
+                    {
+                        // If a column restriction has an index which was not picked up by hasSupportingIndex, it means it's an unsupported restriction
+                        for (ColumnMetadata column : restriction.getColumnDefs())
+                        {
+                            if (index.dependsOn(column))
+                                builder.add(column);
+                        }
+                    }
+                }
+            }
 
-        if (usesSecondaryIndexing || clusteringColumnsRestrictions.needFiltering())
-            filterRestrictionsBuilder.add(clusteringColumnsRestrictions);
+            return builder.build();
+        }
 
-        // Even if usesSecondaryIndexing is false at this point, we'll still have to use one if
-        // there is restrictions not covered by the PK.
-        if (!nonPrimaryKeyRestrictions.isEmpty())
+        private void throwRequiresAllowFilteringError(TableMetadata table,
+                                                      ClusteringColumnRestrictions clusteringColumnsRestrictions,
+                                                      RestrictionSet nonPrimaryKeyRestrictions)
         {
-            if (!type.allowNonPrimaryKeyInWhereClause())
+            Set<ColumnMetadata> unsupported = getColumnsWithUnsupportedIndexRestrictions(table,
+                                                                                         clusteringColumnsRestrictions,
+                                                                                         nonPrimaryKeyRestrictions);
+            if (unsupported.isEmpty())
             {
-                Collection<ColumnIdentifier> nonPrimaryKeyColumns =
-                        ColumnMetadata.toIdentifiers(nonPrimaryKeyRestrictions.getColumnDefs());
-
-                throw invalidRequest("Non PRIMARY KEY columns found in where clause: %s ",
-                                     Joiner.on(", ").join(nonPrimaryKeyColumns));
+                throw invalidRequest(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
             }
-            if (hasQueriableIndex)
-                usesSecondaryIndexing = true;
-            else if (!allowFiltering)
-                throwRequiresAllowFilteringError(table);
+            else
+            {
+                // If there's an index on these columns but the restriction is not supported on this index, throw a more specific error message
+                if (unsupported.size() == 1)
+                    throw invalidRequest(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, unsupported.iterator().next()));
+                else
+                    throw invalidRequest(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, unsupported));
+            }
+        }
+
+
+        private CustomIndexExpression prepareCustomIndexExpression(List<CustomIndexExpression> expressions,
+                                                                   VariableSpecifications boundNames,
+                                                                   IndexRegistry indexRegistry)
+        {
+            if (expressions.size() > 1)
+                throw new InvalidRequestException(IndexRestrictions.MULTIPLE_EXPRESSIONS);
+
+            CustomIndexExpression expression = expressions.get(0);
+
+            QualifiedName name = expression.targetIndex;
+
+            if (name.hasKeyspace() && !name.getKeyspace().equals(table.keyspace))
+                throw IndexRestrictions.invalidIndex(expression.targetIndex, table);
+
+            if (!table.indexes.has(expression.targetIndex.getName()))
+                throw IndexRestrictions.indexNotFound(expression.targetIndex, table);
+
+            Index index = indexRegistry.getIndex(table.indexes.get(expression.targetIndex.getName()).get());
+            if (!index.getIndexMetadata().isCustom())
+                throw IndexRestrictions.nonCustomIndexInExpression(expression.targetIndex);
 
-            filterRestrictionsBuilder.add(nonPrimaryKeyRestrictions);
+            AbstractType<?> expressionType = index.customExpressionValueType();
+            if (expressionType == null)
+                throw IndexRestrictions.customExpressionNotSupported(expression.targetIndex);
+
+            expression.prepareValue(table, expressionType, boundNames);
+            return expression;
         }
 
-        filterRestrictions = filterRestrictionsBuilder.build();
+        /**
+         * Returns the partition key components that are not restricted.
+         * @return the partition key components that are not restricted.
+         */
+        private Collection<ColumnIdentifier> getPartitionKeyUnrestrictedComponents(PartitionKeyRestrictions partitionKeyRestrictions)
+        {
+            List<ColumnMetadata> list = new ArrayList<>(table.partitionKeyColumns());
+            list.removeAll(partitionKeyRestrictions.getColumnDefs());
+            return ColumnMetadata.toIdentifiers(list);
+        }
 
-        if (usesSecondaryIndexing)
-            validateSecondaryIndexSelections();
+        /**
+         * Returns the clustering columns that are not restricted.
+         * @return the clustering columns that are not restricted.
+         */
+        private Collection<ColumnIdentifier> getUnrestrictedClusteringColumns(ClusteringColumnRestrictions clusteringColumnsRestrictions)
+        {
+            List<ColumnMetadata> missingClusteringColumns = new ArrayList<>(table.clusteringColumns());
+            missingClusteringColumns.removeAll(clusteringColumnsRestrictions.getColumnDefs());
+            return ColumnMetadata.toIdentifiers(missingClusteringColumns);
+        }
+    }
+
+    public IndexRestrictions filterRestrictions()
+    {
+        return filterRestrictions;
+    }
+
+    public List<StatementRestrictions> children()
+    {
+        return children;
     }
 
     public void throwRequiresAllowFilteringError(TableMetadata table)
@@ -403,11 +676,19 @@ public void throwRequiresAllowFilteringError(TableMetadata table)
         }
     }
 
+    public void throwsRequiresIndexSupportingDisjunctionError(TableMetadata table)
+    {
+        throw invalidRequest(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_DISJUNCTION);
+    }
+
     public void addFunctionsTo(List<Function> functions)
     {
         partitionKeyRestrictions.addFunctionsTo(functions);
         clusteringColumnsRestrictions.addFunctionsTo(functions);
         nonPrimaryKeyRestrictions.addFunctionsTo(functions);
+
+        for (StatementRestrictions child : children)
+            child.addFunctionsTo(functions);
     }
 
     // may be used by QueryHandler implementations
@@ -433,13 +714,16 @@ public Set<ColumnMetadata> nonPKRestrictedColumns(boolean includeNotNullRestrict
 
         if (includeNotNullRestrictions)
         {
-            for (ColumnMetadata def : notNullColumns)
+            for (ColumnMetadata def : notNullColumns())
             {
                 if (!def.isPrimaryKeyColumn())
                     columns.add(def);
             }
         }
 
+        for (StatementRestrictions child : children)
+            columns.addAll(child.nonPKRestrictedColumns(includeNotNullRestrictions));
+
         return columns;
     }
 
@@ -456,10 +740,17 @@ public ImmutableSet<ColumnMetadata> notNullColumns()
      */
     public boolean isRestricted(ColumnMetadata column)
     {
-        if (notNullColumns.contains(column))
+        if (notNullColumns().contains(column))
             return true;
 
-        return getRestrictions(column.kind).getColumnDefs().contains(column);
+        if (getRestrictions(column.kind).getColumnDefs().contains(column))
+            return true;
+
+        for (StatementRestrictions child : children)
+            if (child.isRestricted(column))
+                return true;
+
+        return false;
     }
 
     /**
@@ -480,7 +771,7 @@ public boolean keyIsInRelation()
      */
     public boolean isKeyRange()
     {
-        return this.isKeyRange;
+        return isKeyRange;
     }
 
     /**
@@ -521,52 +812,14 @@ protected Restrictions getRestrictions(ColumnMetadata.Kind kind)
      */
     public boolean usesSecondaryIndexing()
     {
-        return this.usesSecondaryIndexing;
-    }
-
-    protected void processPartitionKeyRestrictions(boolean hasQueriableIndex, boolean allowFiltering, boolean forView)
-    {
-        if (!type.allowPartitionKeyRanges())
-        {
-            checkFalse(partitionKeyRestrictions.isOnToken(),
-                       "The token function cannot be used in WHERE clauses for %s statements", type);
-
-            if (partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table))
-                throw invalidRequest("Some partition key parts are missing: %s",
-                                     Joiner.on(", ").join(getPartitionKeyUnrestrictedComponents()));
-
-            // slice query
-            checkFalse(partitionKeyRestrictions.hasSlice(),
-                    "Only EQ and IN relation are supported on the partition key (unless you use the token() function)"
-                            + " for %s statements", type);
-        }
-        else
-        {
-            // If there are no partition restrictions or there's only token restriction, we have to set a key range
-            if (partitionKeyRestrictions.isOnToken())
-                isKeyRange = true;
-
-            if (partitionKeyRestrictions.isEmpty() && partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table))
-            {
-                isKeyRange = true;
-                usesSecondaryIndexing = hasQueriableIndex;
-            }
+        if (usesSecondaryIndexing)
+            return true;
 
-            // If there is a queriable index, no special condition is required on the other restrictions.
-            // But we still need to know 2 things:
-            // - If we don't have a queriable index, is the query ok
-            // - Is it queriable without 2ndary index, which is always more efficient
-            // If a component of the partition key is restricted by a relation, all preceding
-            // components must have a EQ. Only the last partition key component can be in IN relation.
-            if (partitionKeyRestrictions.needFiltering(table))
-            {
-                if (!allowFiltering && !forView && !hasQueriableIndex)
-                    throw new InvalidRequestException(REQUIRES_ALLOW_FILTERING_MESSAGE);
+        for (StatementRestrictions child : children)
+            if (child.usesSecondaryIndexing()) // method, not field: also covers the child's own children
+                return true;
 
-                isKeyRange = true;
-                usesSecondaryIndexing = hasQueriableIndex;
-            }
-        }
+        return false;
     }
 
     public boolean hasPartitionKeyRestrictions()
@@ -583,17 +836,6 @@ public boolean hasNonPrimaryKeyRestrictions()
         return !nonPrimaryKeyRestrictions.isEmpty();
     }
 
-    /**
-     * Returns the partition key components that are not restricted.
-     * @return the partition key components that are not restricted.
-     */
-    private Collection<ColumnIdentifier> getPartitionKeyUnrestrictedComponents()
-    {
-        List<ColumnMetadata> list = new ArrayList<>(table.partitionKeyColumns());
-        list.removeAll(partitionKeyRestrictions.getColumnDefs());
-        return ColumnMetadata.toIdentifiers(list);
-    }
-
     /**
      * Checks if the restrictions on the partition key are token restrictions.
      *
@@ -616,74 +858,6 @@ public boolean clusteringKeyRestrictionsHasIN()
         return clusteringColumnsRestrictions.hasIN();
     }
 
-    /**
-     * Processes the clustering column restrictions.
-     *
-     * @param hasQueriableIndex <code>true</code> if some of the queried data are indexed, <code>false</code> otherwise
-     * @param selectsOnlyStaticColumns <code>true</code> if the selected or modified columns are all statics,
-     * <code>false</code> otherwise.
-     */
-    private void processClusteringColumnsRestrictions(boolean hasQueriableIndex,
-                                                      boolean selectsOnlyStaticColumns,
-                                                      boolean forView,
-                                                      boolean allowFiltering)
-    {
-        checkFalse(!type.allowClusteringColumnSlices() && clusteringColumnsRestrictions.hasSlice(),
-                   "Slice restrictions are not supported on the clustering columns in %s statements", type);
-
-        if (!type.allowClusteringColumnSlices()
-            && (!table.isCompactTable() || (table.isCompactTable() && !hasClusteringColumnsRestrictions())))
-        {
-            if (!selectsOnlyStaticColumns && hasUnrestrictedClusteringColumns())
-                throw invalidRequest("Some clustering keys are missing: %s",
-                                     Joiner.on(", ").join(getUnrestrictedClusteringColumns()));
-        }
-        else
-        {
-            checkFalse(clusteringColumnsRestrictions.hasContains() && !hasQueriableIndex && !allowFiltering,
-                       "Clustering columns can only be restricted with CONTAINS with a secondary index or filtering");
-
-            if (hasClusteringColumnsRestrictions() && clusteringColumnsRestrictions.needFiltering())
-            {
-                if (hasQueriableIndex || forView)
-                {
-                    usesSecondaryIndexing = true;
-                }
-                else if (!allowFiltering)
-                {
-                    List<ColumnMetadata> clusteringColumns = table.clusteringColumns();
-                    List<ColumnMetadata> restrictedColumns = clusteringColumnsRestrictions.getColumnDefs();
-
-                    for (int i = 0, m = restrictedColumns.size(); i < m; i++)
-                    {
-                        ColumnMetadata clusteringColumn = clusteringColumns.get(i);
-                        ColumnMetadata restrictedColumn = restrictedColumns.get(i);
-
-                        if (!clusteringColumn.equals(restrictedColumn))
-                        {
-                            throw invalidRequest("PRIMARY KEY column \"%s\" cannot be restricted as preceding column \"%s\" is not restricted",
-                                                 restrictedColumn.name,
-                                                 clusteringColumn.name);
-                        }
-                    }
-                }
-            }
-
-        }
-
-    }
-
-    /**
-     * Returns the clustering columns that are not restricted.
-     * @return the clustering columns that are not restricted.
-     */
-    private Collection<ColumnIdentifier> getUnrestrictedClusteringColumns()
-    {
-        List<ColumnMetadata> missingClusteringColumns = new ArrayList<>(table.clusteringColumns());
-        missingClusteringColumns.removeAll(clusteringColumnsRestrictions.getColumnDefs());
-        return ColumnMetadata.toIdentifiers(missingClusteringColumns);
-    }
-
     /**
      * Checks if some clustering columns are not restricted.
      * @return <code>true</code> if some clustering columns are not restricted, <code>false</code> otherwise.
@@ -693,48 +867,12 @@ private boolean hasUnrestrictedClusteringColumns()
         return table.clusteringColumns().size() != clusteringColumnsRestrictions.size();
     }
 
-    private CustomIndexExpression prepareCustomIndexExpression(List<CustomIndexExpression> expressions,
-                                                               VariableSpecifications boundNames,
-                                                               IndexRegistry indexRegistry)
-    {
-        if (expressions.size() > 1)
-            throw new InvalidRequestException(IndexRestrictions.MULTIPLE_EXPRESSIONS);
-
-        CustomIndexExpression expression = expressions.get(0);
-
-        QualifiedName name = expression.targetIndex;
-
-        if (name.hasKeyspace() && !name.getKeyspace().equals(table.keyspace))
-            throw IndexRestrictions.invalidIndex(expression.targetIndex, table);
-
-        if (!table.indexes.has(expression.targetIndex.getName()))
-            throw IndexRestrictions.indexNotFound(expression.targetIndex, table);
-
-        Index index = indexRegistry.getIndex(table.indexes.get(expression.targetIndex.getName()).get());
-        if (!index.getIndexMetadata().isCustom())
-            throw IndexRestrictions.nonCustomIndexInExpression(expression.targetIndex);
-
-        AbstractType<?> expressionType = index.customExpressionValueType();
-        if (expressionType == null)
-            throw IndexRestrictions.customExpressionNotSupported(expression.targetIndex);
-
-        expression.prepareValue(table, expressionType, boundNames);
-        return expression;
-    }
-
     public RowFilter getRowFilter(IndexRegistry indexManager, QueryOptions options)
     {
-        if (filterRestrictions.isEmpty())
+        if (filterRestrictions.isEmpty() && children.isEmpty())
             return RowFilter.NONE;
 
-        RowFilter filter = RowFilter.create();
-        for (Restrictions restrictions : filterRestrictions.getRestrictions())
-            restrictions.addToRowFilter(filter, indexManager, options);
-
-        for (CustomIndexExpression expression : filterRestrictions.getExternalExpressions())
-            expression.addToRowFilter(filter, table, options);
-
-        return filter;
+        return RowFilter.builder().buildFromRestrictions(this, indexManager, table, options);
     }
 
     /**
@@ -785,7 +923,7 @@ private AbstractBounds<PartitionPosition> getPartitionKeyBounds(IPartitioner p,
     {
         // Deal with unrestricted partition key components (special-casing is required to deal with 2i queries on the
         // first component of a composite partition key) queries that filter on the partition key.
-        if (partitionKeyRestrictions.needFiltering(table))
+        if (partitionKeyRestrictions.needFiltering(table) || isDisjunction)
             return new Range<>(p.getMinimumToken().minKeyBound(), p.getMinimumToken().maxKeyBound());
 
         ByteBuffer startKeyBytes = getPartitionKeyBound(Bound.START, options);
@@ -800,13 +938,13 @@ private AbstractBounds<PartitionPosition> getPartitionKeyBounds(IPartitioner p,
         if (partitionKeyRestrictions.isInclusive(Bound.START))
         {
             return partitionKeyRestrictions.isInclusive(Bound.END)
-                    ? new Bounds<>(startKey, finishKey)
-                    : new IncludingExcludingBounds<>(startKey, finishKey);
+                   ? new Bounds<>(startKey, finishKey)
+                   : new IncludingExcludingBounds<>(startKey, finishKey);
         }
 
         return partitionKeyRestrictions.isInclusive(Bound.END)
-                ? new Range<>(startKey, finishKey)
-                : new ExcludingBounds<>(startKey, finishKey);
+               ? new Range<>(startKey, finishKey)
+               : new ExcludingBounds<>(startKey, finishKey);
     }
 
     private AbstractBounds<PartitionPosition> getPartitionKeyBoundsForTokenRestrictions(IPartitioner p,
@@ -830,7 +968,7 @@ private AbstractBounds<PartitionPosition> getPartitionKeyBoundsForTokenRestricti
          */
         int cmp = startToken.compareTo(endToken);
         if (!startToken.isMinimum() && !endToken.isMinimum()
-                && (cmp > 0 || (cmp == 0 && (!includeStart || !includeEnd))))
+            && (cmp > 0 || (cmp == 0 && (!includeStart || !includeEnd))))
             return null;
 
         PartitionPosition start = includeStart ? startToken.minKeyBound() : startToken.maxKeyBound();
@@ -890,6 +1028,11 @@ public NavigableSet<ClusteringBound<?>> getClusteringColumnsBounds(Bound b, Quer
         return clusteringColumnsRestrictions.boundsAsClustering(b, options);
     }
 
+    public boolean isDisjunction()
+    {
+        return isDisjunction;
+    }
+
     /**
      * Checks if the query returns a range of columns.
      *
@@ -903,7 +1046,7 @@ public boolean isColumnRange()
         int numberOfClusteringColumns = table.isStaticCompactTable() ? 0 : table.clusteringColumns().size();
         // it is a range query if it has at least one the column alias for which no relation is defined or is not EQ or IN.
         return clusteringColumnsRestrictions.size() < numberOfClusteringColumns
-            || !clusteringColumnsRestrictions.hasOnlyEqualityRestrictions();
+               || !clusteringColumnsRestrictions.hasOnlyEqualityRestrictions();
     }
 
     /**
@@ -916,15 +1059,43 @@ public boolean needFiltering(TableMetadata table)
         IndexRegistry indexRegistry = IndexRegistry.obtain(table);
         boolean hasClusteringColumnRestrictions = !clusteringColumnsRestrictions.isEmpty();
         boolean hasMultipleContains = nonPrimaryKeyRestrictions.hasMultipleContains();
-        return filterRestrictions.needFiltering(indexRegistry, hasClusteringColumnRestrictions, hasMultipleContains);
+        if (filterRestrictions.needFiltering(indexRegistry, hasClusteringColumnRestrictions, hasMultipleContains))
+            return true;
+
+        for (StatementRestrictions child : children)
+            if (child.needFiltering(table))
+                return true;
+
+        return false;
     }
 
-    public Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table)
+    /** Tells whether a disjunction in these restrictions relies on an index group without disjunction support. */
+    public boolean needsDisjunctionSupport(TableMetadata table)
     {
-        return getColumnsWithUnsupportedIndexRestrictions(table, Iterables.concat(clusteringColumnsRestrictions.restrictions(), nonPrimaryKeyRestrictions.restrictions()));
+        // Neither flagged as a disjunction nor holding nested restrictions: nothing to check.
+        if (!isDisjunction && children.isEmpty())
+            return false;
+
+        IndexRegistry registry = IndexRegistry.obtain(table);
+        for (Index.Group indexGroup : registry.listIndexGroups())
+            if (filterRestrictions.indexBeingUsed(indexGroup) && !indexGroup.supportsDisjunction())
+                return true;
+
+        // The same check is applied recursively to every nested set of restrictions.
+        for (StatementRestrictions nested : children)
+            if (nested.needsDisjunctionSupport(table))
+                return true;
+
+        return false;
+    }
+
+    private Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table)
+    {
+        return getColumnsWithUnsupportedIndexRestrictions(table, Iterables.concat(clusteringColumnsRestrictions.restrictions(),
+                                                                                  nonPrimaryKeyRestrictions.restrictions()));
     }
 
-    public Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table, Iterable<Restriction> restrictions)
+    private Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetadata table, Iterable<Restriction> restrictions)
     {
         IndexRegistry indexRegistry = IndexRegistry.obtain(table);
         if (indexRegistry.listIndexes().isEmpty())
@@ -951,12 +1122,6 @@ public Set<ColumnMetadata> getColumnsWithUnsupportedIndexRestrictions(TableMetad
         return builder.build();
     }
 
-    protected void validateSecondaryIndexSelections()
-    {
-        checkFalse(keyIsInRelation(),
-                   "Select on indexed columns and with IN clause for the PRIMARY KEY are not supported");
-    }
-
     /**
      * Checks that all the primary key columns (partition key and clustering columns) are restricted by an equality
      * relation ('=' or 'IN').
@@ -966,10 +1131,10 @@ protected void validateSecondaryIndexSelections()
     public boolean hasAllPKColumnsRestrictedByEqualities()
     {
         return !isPartitionKeyRestrictionsOnToken()
-                && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table)
-                && (partitionKeyRestrictions.hasOnlyEqualityRestrictions())
-                && !hasUnrestrictedClusteringColumns()
-                && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions());
+               && !partitionKeyRestrictions.hasUnrestrictedPartitionKeyComponents(table)
+               && (partitionKeyRestrictions.hasOnlyEqualityRestrictions())
+               && !hasUnrestrictedClusteringColumns()
+               && (clusteringColumnsRestrictions.hasOnlyEqualityRestrictions());
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
index 96f15d2c9c10..8100777543e1 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenFilter.java
@@ -22,7 +22,6 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
-import java.util.function.Consumer;
 
 import com.google.common.collect.BoundType;
 import com.google.common.collect.ImmutableRangeSet;
@@ -338,7 +337,7 @@ public boolean needsFiltering(Index.Group indexGroup)
     }
 
     @Override
-    public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+    public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
     {
         restrictions.addToRowFilter(filter, indexRegistry, options);
     }
diff --git a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
index f9838d48e9e5..654f33bc7500 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/TokenRestriction.java
@@ -130,7 +130,7 @@ public boolean needsFiltering(Index.Group indexGroup)
     }
 
     @Override
-    public void addToRowFilter(RowFilter filter, IndexRegistry indexRegistry, QueryOptions options)
+    public void addToRowFilter(RowFilter.Builder filter, IndexRegistry indexRegistry, QueryOptions options)
     {
         throw new UnsupportedOperationException("Index expression cannot be created for token restriction");
     }
diff --git a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
index 413181b6ab9c..4ac3a53565ae 100644
--- a/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/ModificationStatement.java
@@ -1008,7 +1008,7 @@ protected StatementRestrictions newRestrictions(TableMetadata metadata,
                 throw new InvalidRequestException(CUSTOM_EXPRESSIONS_NOT_ALLOWED);
 
             boolean applyOnlyToStaticColumns = appliesOnlyToStaticColumns(operations, conditions);
-            return new StatementRestrictions(type, metadata, where, boundNames, applyOnlyToStaticColumns, false, false);
+            return StatementRestrictions.create(type, metadata, where, boundNames, applyOnlyToStaticColumns, false, false);
         }
 
         public List<Pair<ColumnIdentifier, ColumnCondition.Raw>> getConditions()
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index f913b6912fc4..c20baa3ed657 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -194,7 +194,7 @@ static SelectStatement forSelection(TableMetadata table, Selection selection)
                                    VariableSpecifications.empty(),
                                    defaultParameters,
                                    selection,
-                                   StatementRestrictions.empty(StatementType.SELECT, table),
+                                   StatementRestrictions.empty(table),
                                    false,
                                    null,
                                    null,
@@ -285,7 +285,7 @@ public ReadQuery getQuery(QueryState queryState,
                               int perPartitionLimit,
                               int pageSize)
     {
-        boolean isPartitionRangeQuery = restrictions.isKeyRange() || restrictions.usesSecondaryIndexing();
+        boolean isPartitionRangeQuery = restrictions.isKeyRange() || restrictions.usesSecondaryIndexing() || restrictions.isDisjunction();
 
         DataLimits limit = getDataLimits(userLimit, perPartitionLimit, pageSize);
 
@@ -622,6 +622,11 @@ private ClusteringIndexFilter makeClusteringIndexFilter(QueryOptions options, Co
             return new ClusteringIndexSliceFilter(Slices.ALL, false);
         }
 
+        if (restrictions.isDisjunction())
+        {
+            return new ClusteringIndexSliceFilter(Slices.ALL, false);
+        }
+
         if (restrictions.isColumnRange())
         {
             Slices slices = makeSlices(options);
@@ -990,6 +995,8 @@ public SelectStatement prepare(boolean forView) throws InvalidRequestException
                     orderingComparator = Collections.reverseOrder(orderingComparator);
             }
 
+            checkDisjunctionIsSupported(table, restrictions);
+
             checkNeedsFiltering(table, restrictions);
 
             return new SelectStatement(table,
@@ -1079,13 +1086,13 @@ private StatementRestrictions prepareRestrictions(TableMetadata metadata,
                                                           boolean selectsOnlyStaticColumns,
                                                           boolean forView) throws InvalidRequestException
         {
-            return new StatementRestrictions(StatementType.SELECT,
-                                             metadata,
-                                             whereClause,
-                                             boundNames,
-                                             selectsOnlyStaticColumns,
-                                             parameters.allowFiltering,
-                                             forView);
+            return StatementRestrictions.create(StatementType.SELECT,
+                                                metadata,
+                                                whereClause,
+                                                boundNames,
+                                                selectsOnlyStaticColumns,
+                                                parameters.allowFiltering,
+                                                forView);
         }
 
         /** Returns a Term for the limit or null if no limit is set */
@@ -1248,6 +1255,17 @@ private boolean isReversed(TableMetadata table, Map<ColumnMetadata, Boolean> ord
             return isReversed;
         }
 
+        /**
+         * Rejects the query when it contains a disjunction - "value = 1 or value = 2" or "value in (1, 2)" -
+         * and uses a secondary index that the restrictions report as lacking disjunction support.
+         */
+        private void checkDisjunctionIsSupported(TableMetadata table, StatementRestrictions restrictions) throws InvalidRequestException
+        {
+            boolean indexed = restrictions.usesSecondaryIndexing();
+            if (indexed && restrictions.needsDisjunctionSupport(table))
+                restrictions.throwsRequiresIndexSupportingDisjunctionError(table);
+        }
+
         /** If ALLOW FILTERING was not specified, this verifies that it is not needed */
         private void checkNeedsFiltering(TableMetadata table, StatementRestrictions restrictions) throws InvalidRequestException
         {
diff --git a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
index f67db14ea949..3b000362e106 100644
--- a/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/UpdateStatement.java
@@ -176,13 +176,13 @@ protected ModificationStatement prepareInternal(TableMetadata metadata,
 
             boolean applyOnlyToStaticColumns = !hasClusteringColumnsSet && appliesOnlyToStaticColumns(operations, conditions);
 
-            StatementRestrictions restrictions = new StatementRestrictions(type,
-                                                                           metadata,
-                                                                           whereClause.build(),
-                                                                           bindVariables,
-                                                                           applyOnlyToStaticColumns,
-                                                                           false,
-                                                                           false);
+            StatementRestrictions restrictions = StatementRestrictions.create(type,
+                                                                              metadata,
+                                                                              whereClause.build(),
+                                                                              bindVariables,
+                                                                              applyOnlyToStaticColumns,
+                                                                              false,
+                                                                              false);
 
             return new UpdateStatement(type,
                                        bindVariables,
@@ -244,13 +244,13 @@ protected ModificationStatement prepareInternal(TableMetadata metadata,
 
             boolean applyOnlyToStaticColumns = !hasClusteringColumnsSet && appliesOnlyToStaticColumns(operations, conditions);
 
-            StatementRestrictions restrictions = new StatementRestrictions(type,
-                                                                           metadata,
-                                                                           whereClause.build(),
-                                                                           bindVariables,
-                                                                           applyOnlyToStaticColumns,
-                                                                           false,
-                                                                           false);
+            StatementRestrictions restrictions = StatementRestrictions.create(type,
+                                                                              metadata,
+                                                                              whereClause.build(),
+                                                                              bindVariables,
+                                                                              applyOnlyToStaticColumns,
+                                                                              false,
+                                                                              false);
 
             return new UpdateStatement(type,
                                        bindVariables,
diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
index 8104cd9da305..10c0d6c83e85 100644
--- a/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateViewStatement.java
@@ -272,15 +272,14 @@ public Keyspaces apply(Keyspaces schema)
         if (whereClause.containsCustomExpressions())
             throw ire("WHERE clause for materialized view '%s' cannot contain custom index expressions", viewName);
 
-        StatementRestrictions restrictions =
-            new StatementRestrictions(StatementType.SELECT,
-                                      table,
-                                      whereClause,
-                                      VariableSpecifications.empty(),
-                                      false,
-                                      false,
-                                      true,
-                                      true);
+        StatementRestrictions restrictions = StatementRestrictions.create(StatementType.SELECT,
+                                                                          table,
+                                                                          whereClause,
+                                                                          VariableSpecifications.empty(),
+                                                                          false,
+                                                                          false,
+                                                                          true,
+                                                                          true);
 
         List<ColumnIdentifier> nonRestrictedPrimaryKeyColumns =
             Lists.newArrayList(filter(primaryKeyColumns, name -> !restrictions.isRestricted(table.getColumn(name))));
diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
index 1892a51f4fa3..443df8afd55f 100644
--- a/src/java/org/apache/cassandra/db/filter/RowFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/RowFilter.java
@@ -31,6 +31,10 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.restrictions.CustomIndexExpression;
+import org.apache.cassandra.cql3.restrictions.Restrictions;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.context.*;
 import org.apache.cassandra.db.marshal.*;
@@ -39,6 +43,7 @@
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -64,56 +69,23 @@ public abstract class RowFilter implements Iterable<RowFilter.Expression>
     private static final Logger logger = LoggerFactory.getLogger(RowFilter.class);
 
     public static final Serializer serializer = new Serializer();
-    public static final RowFilter NONE = new CQLFilter(Collections.emptyList());
+    public static final RowFilter NONE = CQLFilter.NONE;
 
-    protected final List<Expression> expressions;
+    protected final FilterElement root;
 
-    protected RowFilter(List<Expression> expressions)
+    protected RowFilter(FilterElement root)
     {
-        this.expressions = expressions;
+        this.root = root;
     }
 
-    public static RowFilter create()
+    public FilterElement root()
     {
-        return new CQLFilter(new ArrayList<>());
-    }
-
-    public static RowFilter create(int capacity)
-    {
-        return new CQLFilter(new ArrayList<>(capacity));
-    }
-
-    public SimpleExpression add(ColumnMetadata def, Operator op, ByteBuffer value)
-    {
-        SimpleExpression expression = new SimpleExpression(def, op, value);
-        add(expression);
-        return expression;
-    }
-
-    public void addMapEquality(ColumnMetadata def, ByteBuffer key, Operator op, ByteBuffer value)
-    {
-        add(new MapEqualityExpression(def, key, op, value));
-    }
-
-    public void addCustomIndexExpression(TableMetadata metadata, IndexMetadata targetIndex, ByteBuffer value)
-    {
-        add(CustomExpression.build(metadata, targetIndex, value));
-    }
-
-    private void add(Expression expression)
-    {
-        expression.validate();
-        expressions.add(expression);
-    }
-
-    public void addUserExpression(UserExpression e)
-    {
-        expressions.add(e);
+        return root;
     }
 
     public List<Expression> getExpressions()
     {
-        return expressions;
+        return root.expressions;
     }
 
     /**
@@ -122,7 +94,7 @@ public List<Expression> getExpressions()
      */
     public boolean hasExpressionOnClusteringOrRegularColumns()
     {
-        for (Expression expression : expressions)
+        for (Expression expression : root)
         {
             ColumnMetadata column = expression.column();
             if (column.isClusteringColumn() || column.isRegular())
@@ -143,7 +115,7 @@ public boolean hasExpressionOnClusteringOrRegularColumns()
      */
     public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter, int nowInSec)
     {
-        return expressions.isEmpty() ? iter : Transformation.apply(iter, filter(iter.metadata(), nowInSec));
+        return root.isEmpty() ? iter : Transformation.apply(iter, filter(iter.metadata(), nowInSec));
     }
 
     /**
@@ -156,7 +128,7 @@ public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter, int
      */
     public PartitionIterator filter(PartitionIterator iter, TableMetadata metadata, int nowInSec)
     {
-        return expressions.isEmpty() ? iter : Transformation.apply(iter, filter(metadata, nowInSec));
+        return root.isEmpty() ? iter : Transformation.apply(iter, filter(metadata, nowInSec));
     }
 
     /**
@@ -173,14 +145,9 @@ public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey,
         // We purge all tombstones as the expressions isSatisfiedBy methods expects it
         Row purged = row.purge(DeletionPurger.PURGE_ALL, nowInSec, metadata.enforceStrictLiveness());
         if (purged == null)
-            return expressions.isEmpty();
+            return root.isEmpty();
 
-        for (Expression e : expressions)
-        {
-            if (!e.isSatisfiedBy(metadata, partitionKey, purged))
-                return false;
-        }
-        return true;
+        return root.isSatisfiedBy(metadata, partitionKey, purged);
     }
 
     /**
@@ -189,7 +156,7 @@ public boolean isSatisfiedBy(TableMetadata metadata, DecoratedKey partitionKey,
      */
     public boolean partitionKeyRestrictionsAreSatisfiedBy(DecoratedKey key, AbstractType<?> keyValidator)
     {
-        for (Expression e : expressions)
+        for (Expression e : root)
         {
             if (!e.column.isPartitionKey())
                 continue;
@@ -209,7 +176,7 @@ public boolean partitionKeyRestrictionsAreSatisfiedBy(DecoratedKey key, Abstract
      */
     public boolean clusteringKeyRestrictionsAreSatisfiedBy(Clustering<?> clustering)
     {
-        for (Expression e : expressions)
+        for (Expression e : root)
         {
             if (!e.column.isClusteringColumn())
                 continue;
@@ -228,79 +195,313 @@ public boolean clusteringKeyRestrictionsAreSatisfiedBy(Clustering<?> clustering)
      */
     public RowFilter without(Expression expression)
     {
-        assert expressions.contains(expression);
-        if (expressions.size() == 1)
+        assert root.contains(expression);
+        if (root.size() == 1)
             return RowFilter.NONE;
 
-        List<Expression> newExpressions = new ArrayList<>(expressions.size() - 1);
-        for (Expression e : expressions)
-            if (!e.equals(expression))
-                newExpressions.add(e);
-
-        return withNewExpressions(newExpressions);
+        return new CQLFilter(root.filter(e -> !e.equals(expression)));
     }
 
     public RowFilter withoutExpressions()
     {
-        return withNewExpressions(Collections.emptyList());
+        return NONE;
     }
 
     public RowFilter restrict(Predicate<Expression> filter)
     {
-        return fromExpressions(expressions.stream().filter(filter).collect(Collectors.toList()));
-    }
-
-    private RowFilter fromExpressions(List<Expression> expressions)
-    {
-        return expressions.isEmpty() ? NONE : withNewExpressions(expressions);
+        return new CQLFilter(root.filter(filter));
     }
 
-    protected abstract RowFilter withNewExpressions(List<Expression> expressions);
-
     public boolean isEmpty()
     {
-        return expressions.isEmpty();
+        return root.isEmpty();
     }
 
     public Iterator<Expression> iterator()
     {
-        return expressions.iterator();
+        return root.iterator();
     }
 
     @Override
     public String toString()
     {
-        StringBuilder sb = new StringBuilder();
-        for (int i = 0; i < expressions.size(); i++)
+        return root.toString();
+    }
+
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    public static class Builder
+    {
+        private FilterElement.Builder current = new FilterElement.Builder(false);
+
+        public RowFilter build()
+        {
+            return new CQLFilter(current.build());
+        }
+
+        public RowFilter buildFromRestrictions(StatementRestrictions restrictions, IndexRegistry indexManager, TableMetadata table, QueryOptions options)
+        {
+            return new CQLFilter(doBuild(restrictions, indexManager, table, options));
+        }
+
+        private FilterElement doBuild(StatementRestrictions restrictions, IndexRegistry indexManager, TableMetadata table, QueryOptions options)
         {
-            if (i > 0)
-                sb.append(" AND ");
-            sb.append(expressions.get(i));
+            FilterElement.Builder element = new FilterElement.Builder(restrictions.isDisjunction());
+            this.current = element;
+
+            for (Restrictions restrictionSet : restrictions.filterRestrictions().getRestrictions())
+                restrictionSet.addToRowFilter(this, indexManager, options);
+
+            for (CustomIndexExpression expression : restrictions.filterRestrictions().getExternalExpressions())
+                expression.addToRowFilter(this, table, options);
+
+            for (StatementRestrictions child : restrictions.children())
+                element.children.add(doBuild(child, indexManager, table, options));
+
+            return element.build();
+        }
+
+        public SimpleExpression add(ColumnMetadata def, Operator op, ByteBuffer value)
+        {
+            SimpleExpression expression = new SimpleExpression(def, op, value);
+            add(expression);
+            return expression;
+        }
+
+        public void addMapEquality(ColumnMetadata def, ByteBuffer key, Operator op, ByteBuffer value)
+        {
+            add(new MapEqualityExpression(def, key, op, value));
+        }
+
+        public void addCustomIndexExpression(TableMetadata metadata, IndexMetadata targetIndex, ByteBuffer value)
+        {
+            add(CustomExpression.build(metadata, targetIndex, value));
+        }
+
+        private void add(Expression expression)
+        {
+            expression.validate();
+            current.expressions.add(expression);
+        }
+
+        public void addUserExpression(UserExpression e)
+        {
+            current.expressions.add(e);
         }
-        return sb.toString();
     }
 
-    private static class CQLFilter extends RowFilter
+    public static class FilterElement implements Iterable<Expression>
     {
-        private CQLFilter(List<Expression> expressions)
+        public static final Serializer serializer = new Serializer();
+
+        public static final FilterElement NONE = new FilterElement(false, Collections.emptyList(), Collections.emptyList());
+
+        private boolean isDisjunction;
+
+        private final List<Expression> expressions;
+
+        private final List<FilterElement> children;
+
+        public FilterElement(boolean isDisjunction, List<Expression> expressions, List<FilterElement> children)
         {
-            super(expressions);
+            this.isDisjunction = isDisjunction;
+            this.expressions = expressions;
+            this.children = children;
         }
 
-        protected Transformation<BaseRowIterator<?>> filter(TableMetadata metadata, int nowInSec)
+        public boolean isDisjunction()
+        {
+            return isDisjunction;
+        }
+
+        public List<Expression> expressions()
+        {
+            return expressions;
+        }
+
+        public Iterator<Expression> iterator()
+        {
+            List<Expression> allExpressions = new ArrayList<>(expressions); // own expressions first, then every descendant's
+            for (FilterElement child : children)
+                child.forEach(allExpressions::add); // goes through child.iterator(), so nested levels are flattened too (consistent with size())
+            return allExpressions.iterator();
+        }
+
+        public FilterElement filter(Predicate<Expression> filter)
         {
-            List<Expression> partitionLevelExpressions = new ArrayList<>();
-            List<Expression> rowLevelExpressions = new ArrayList<>();
-            for (Expression e: expressions)
+            FilterElement.Builder builder = new Builder(isDisjunction);
+
+            expressions.stream().filter(filter).forEach(e -> builder.expressions.add(e));
+
+            children.stream().map(c -> c.filter(filter)).forEach(c -> builder.children.add(c));
+
+            return builder.build();
+        }
+
+        public List<FilterElement> children()
+        {
+            return children;
+        }
+
+        public boolean isEmpty()
+        {
+            return expressions.isEmpty() && children.isEmpty();
+        }
+
+        public boolean contains(Expression expression)
+        {
+            return expressions.contains(expression) || children.stream().anyMatch(c -> c.contains(expression)); // recurse into each child, not back into this element
+        }
+
+        public FilterElement partitionLevelTree()
+        {
+            return new FilterElement(isDisjunction,
+                                     expressions.stream()
+                                                  .filter(e -> e.column.isStatic() || e.column.isPartitionKey())
+                                                  .collect(Collectors.toList()),
+                                     children.stream()
+                                               .map(FilterElement::partitionLevelTree)
+                                               .collect(Collectors.toList()));
+        }
+
+        public FilterElement rowLevelTree()
+        {
+            return new FilterElement(isDisjunction,
+                                     expressions.stream()
+                                                  .filter(e -> !e.column.isStatic() && !e.column.isPartitionKey())
+                                                  .collect(Collectors.toList()),
+                                     children.stream()
+                                               .map(FilterElement::rowLevelTree)
+                                               .collect(Collectors.toList()));
+        }
+
+        public int size()
+        {
+            return expressions.size() + children.stream().mapToInt(FilterElement::size).sum();
+        }
+
+        public boolean isSatisfiedBy(TableMetadata table, DecoratedKey key, Row row)
+        {
+            if (isEmpty())
+                return true;
+            if (isDisjunction)
             {
-                if (e.column.isStatic() || e.column.isPartitionKey())
-                    partitionLevelExpressions.add(e);
-                else
-                    rowLevelExpressions.add(e);
+                for (Expression e : expressions)
+                    if (e.isSatisfiedBy(table, key, row))
+                        return true;
+                for (FilterElement child : children)
+                    if (child.isSatisfiedBy(table, key, row))
+                        return true;
+                return false;
+            }
+            else
+            {
+                for (Expression e : expressions)
+                    if (!e.isSatisfiedBy(table, key, row))
+                        return false;
+                for (FilterElement child : children)
+                    if (!child.isSatisfiedBy(table, key, row))
+                        return false;
+                return true;
+            }
+        }
+
+        @Override
+        public String toString()
+        {
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < expressions.size(); i++)
+            {
+                if (sb.length() > 0)
+                    sb.append(isDisjunction ? " OR " : " AND ");
+                sb.append(expressions.get(i));
+            }
+            for (int i = 0; i < children.size(); i++)
+            {
+                if (sb.length() > 0)
+                    sb.append(isDisjunction ? " OR " : " AND ");
+                sb.append("(");
+                sb.append(children.get(i));
+                sb.append(")");
             }
+            return sb.toString();
+        }
 
-            long numberOfRegularColumnExpressions = rowLevelExpressions.size();
-            final boolean filterNonStaticColumns = numberOfRegularColumnExpressions > 0;
+        public static class Builder
+        {
+            private boolean isDisjunction;
+            private final List<Expression> expressions = new ArrayList<>();
+            private final List<FilterElement> children = new ArrayList<>();
+
+            public Builder(boolean isDisjunction)
+            {
+                this.isDisjunction = isDisjunction;
+            }
+
+            public FilterElement build()
+            {
+                return new FilterElement(isDisjunction, expressions, children);
+            }
+        }
+
+        public static class Serializer
+        {
+            public void serialize(FilterElement operation, DataOutputPlus out, int version) throws IOException
+            {
+                out.writeBoolean(operation.isDisjunction);
+                out.writeUnsignedVInt(operation.expressions.size());
+                for (Expression expr : operation.expressions)
+                    Expression.serializer.serialize(expr, out, version);
+                out.writeUnsignedVInt(operation.children.size());
+                for (FilterElement child : operation.children)
+                    serialize(child, out, version);
+            }
+
+            public FilterElement deserialize(DataInputPlus in, int version, TableMetadata metadata) throws IOException
+            {
+                boolean isDisjunction = in.readBoolean();
+                int size = (int)in.readUnsignedVInt();
+                List<Expression> expressions = new ArrayList<>(size);
+                for (int i = 0; i < size; i++)
+                    expressions.add(Expression.serializer.deserialize(in, version, metadata));
+                size = (int)in.readUnsignedVInt();
+                List<FilterElement> children = new ArrayList<>(size);
+                for (int i  = 0; i < size; i++)
+                    children.add(deserialize(in, version, metadata));
+                return new FilterElement(isDisjunction, expressions, children);
+            }
+
+            public long serializedSize(FilterElement operation, int version)
+            {
+                long size = 1 + TypeSizes.sizeofUnsignedVInt(operation.expressions.size());
+                for (Expression expr : operation.expressions)
+                    size += Expression.serializer.serializedSize(expr, version);
+                size += TypeSizes.sizeofUnsignedVInt(operation.children.size());
+                for (FilterElement child : operation.children)
+                    size += serializedSize(child, version);
+                return size;
+            }
+        }
+    }
+
+    private static class CQLFilter extends RowFilter
+    {
+        private static final CQLFilter NONE = new CQLFilter(FilterElement.NONE);
+
+        private CQLFilter(FilterElement operation)
+        {
+            super(operation);
+        }
+
+        protected Transformation<BaseRowIterator<?>> filter(TableMetadata metadata, int nowInSec)
+        {
+            FilterElement partitionLevelOperation = root.partitionLevelTree();
+            FilterElement rowLevelOperation = root.rowLevelTree();
+
+            final boolean filterNonStaticColumns = rowLevelOperation.size() > 0;
 
             return new Transformation<BaseRowIterator<?>>()
             {
@@ -312,12 +513,11 @@ protected BaseRowIterator<?> applyToPartition(BaseRowIterator<?> partition)
                     pk = partition.partitionKey();
 
                     // Short-circuit all partitions that won't match based on static and partition keys
-                    for (Expression e : partitionLevelExpressions)
-                        if (!e.isSatisfiedBy(metadata, partition.partitionKey(), partition.staticRow()))
-                        {
-                            partition.close();
-                            return null;
-                        }
+                    if (!partitionLevelOperation.isSatisfiedBy(metadata, partition.partitionKey(), partition.staticRow()))
+                    {
+                        partition.close();
+                        return null;
+                    }
 
                     BaseRowIterator<?> iterator = partition instanceof UnfilteredRowIterator
                                                   ? Transformation.apply((UnfilteredRowIterator) partition, this)
@@ -338,24 +538,18 @@ public Row applyToRow(Row row)
                     if (purged == null)
                         return null;
 
-                    for (Expression e : rowLevelExpressions)
-                        if (!e.isSatisfiedBy(metadata, pk, purged))
-                            return null;
+                    if (!rowLevelOperation.isSatisfiedBy(metadata, pk, purged))
+                        return null;
 
                     return row;
                 }
             };
         }
-
-        protected RowFilter withNewExpressions(List<Expression> expressions)
-        {
-            return new CQLFilter(expressions);
-        }
     }
 
     public static abstract class Expression
     {
-        private static final Serializer serializer = new Serializer();
+        public static final Serializer serializer = new Serializer();
 
         // Note: the order of this enum matter, it's used for serialization,
         // and this is why we have some UNUSEDX for values we don't use anymore
@@ -490,7 +684,7 @@ public int hashCode()
             return Objects.hashCode(column.name, operator, value);
         }
 
-        private static class Serializer
+        public static class Serializer
         {
             public void serialize(Expression expression, DataOutputPlus out, int version) throws IOException
             {
@@ -601,7 +795,7 @@ public long serializedSize(Expression expression, int version)
      */
     public static class SimpleExpression extends Expression
     {
-        SimpleExpression(ColumnMetadata column, Operator operator, ByteBuffer value)
+        public SimpleExpression(ColumnMetadata column, Operator operator, ByteBuffer value)
         {
             super(column, operator, value);
         }
@@ -747,7 +941,7 @@ protected Kind kind()
      * An expression of the form 'column' ['key'] = 'value' (which is only
      * supported when 'column' is a map).
      */
-    private static class MapEqualityExpression extends Expression
+    public static class MapEqualityExpression extends Expression
     {
         private final ByteBuffer key;
 
@@ -998,29 +1192,20 @@ public static class Serializer
         public void serialize(RowFilter filter, DataOutputPlus out, int version) throws IOException
         {
             out.writeBoolean(false); // Old "is for thrift" boolean
-            out.writeUnsignedVInt(filter.expressions.size());
-            for (Expression expr : filter.expressions)
-                Expression.serializer.serialize(expr, out, version);
-
+            FilterElement.serializer.serialize(filter.root, out, version);
         }
 
         public RowFilter deserialize(DataInputPlus in, int version, TableMetadata metadata) throws IOException
         {
             in.readBoolean(); // Unused
-            int size = (int)in.readUnsignedVInt();
-            List<Expression> expressions = new ArrayList<>(size);
-            for (int i = 0; i < size; i++)
-                expressions.add(Expression.serializer.deserialize(in, version, metadata));
-
-            return new CQLFilter(expressions);
+            FilterElement operation = FilterElement.serializer.deserialize(in, version, metadata);
+            return new CQLFilter(operation);
         }
 
         public long serializedSize(RowFilter filter, int version)
         {
             long size = 1 // unused boolean
-                      + TypeSizes.sizeofUnsignedVInt(filter.expressions.size());
-            for (Expression expr : filter.expressions)
-                size += Expression.serializer.serializedSize(expr, version);
+                        + FilterElement.serializer.serializedSize(filter.root, version);
             return size;
         }
     }
diff --git a/src/java/org/apache/cassandra/index/Index.java b/src/java/org/apache/cassandra/index/Index.java
index 7dd0f53aefe0..4ecdb09c089a 100644
--- a/src/java/org/apache/cassandra/index/Index.java
+++ b/src/java/org/apache/cassandra/index/Index.java
@@ -769,6 +769,14 @@ default boolean supportsMultipleContains()
         {
             return false;
         }
+
+        /**
+         * @return true if this index group supports disjunction queries such as "a = 1 OR a = 2" or "a IN (1, 2)"
+         */
+        default boolean supportsDisjunction()
+        {
+            return false;
+        }
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/sai/ColumnContext.java b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
index 9c86d5ecb917..99221b58540b 100644
--- a/src/java/org/apache/cassandra/index/sai/ColumnContext.java
+++ b/src/java/org/apache/cassandra/index/sai/ColumnContext.java
@@ -390,6 +390,9 @@ public boolean supports(Operator op)
 
         AbstractType<?> validator = getValidator();
 
+        if (operator == Expression.Op.IN)
+            return true;
+
         if (operator != Expression.Op.EQ && EQ_ONLY_TYPES.contains(validator)) return false;
 
         // RANGE only applicable to non-literal indexes
diff --git a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
index d6702c0753d4..fe4dbd5f858b 100644
--- a/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
+++ b/src/java/org/apache/cassandra/index/sai/StorageAttachedIndexGroup.java
@@ -148,6 +148,12 @@ public boolean supportsMultipleContains()
         return true;
     }
 
+    @Override
+    public boolean supportsDisjunction()
+    {
+        return true;
+    }
+
     @Override
     public boolean containsIndex(Index index)
     {
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Expression.java b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
index 20e5f4766832..dfe0c1b7dadc 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/Expression.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/Expression.java
@@ -48,7 +48,7 @@ public class Expression
 
     public enum Op
     {
-        EQ, MATCH, PREFIX, NOT_EQ, RANGE, CONTAINS_KEY, CONTAINS_VALUE;
+        EQ, MATCH, PREFIX, NOT_EQ, RANGE, CONTAINS_KEY, CONTAINS_VALUE, IN;
 
         public static Op valueOf(Operator operator)
         {
@@ -78,6 +78,9 @@ public static Op valueOf(Operator operator)
                 case LIKE_MATCHES:
                     return MATCH;
 
+                case IN:
+                    return IN;
+
                 default:
                     return null;
             }
diff --git a/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java b/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
index e35edbb9685b..7264877cb1b8 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/FilterTree.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.index.sai.plan;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.ListIterator;
@@ -47,131 +48,30 @@ public class FilterTree
 {
     protected final OperationType op;
     protected final ListMultimap<ColumnMetadata, Expression> expressions;
-
-    protected final FilterTree left;
-    protected final FilterTree right;
+    protected final List<FilterTree> children = new ArrayList<>();
 
     FilterTree(OperationType operation,
-               ListMultimap<ColumnMetadata, Expression> expressions,
-               FilterTree left, FilterTree right)
+               ListMultimap<ColumnMetadata, Expression> expressions)
     {
         this.op = operation;
         this.expressions = expressions;
-
-        this.left = left;
-        this.right = right;
     }
 
-    /**
-     * Recursive "satisfies" checks based on operation
-     * and data from the lower level members using depth-first search
-     * and bubbling the results back to the top level caller.
-     *
-     * Most of the work here is done by localSatisfiedBy(Unfiltered, Row, boolean)
-     * see it's comment for details, if there are no local expressions
-     * assigned to Operation it will call satisfiedBy(Row) on it's children.
-     *
-     * Query: first_name = X AND (last_name = Y OR address = XYZ AND street = IL AND city = C) OR (state = 'CA' AND country = 'US')
-     * Row: key1: (first_name: X, last_name: Z, address: XYZ, street: IL, city: C, state: NY, country:US)
-     *
-     * #1                       OR
-     *                        /    \
-     * #2       (first_name) AND   AND (state, country)
-     *                          \
-     * #3            (last_name) OR
-     *                             \
-     * #4                          AND (address, street, city)
-     *
-     *
-     * Evaluation of the key1 is top-down depth-first search:
-     *
-     * --- going down ---
-     * Level #1 is evaluated, OR expression has to pull results from it's children which are at level #2 and OR them together,
-     * Level #2 AND (state, country) could be be evaluated right away, AND (first_name) refers to it's "right" child from level #3
-     * Level #3 OR (last_name) requests results from level #4
-     * Level #4 AND (address, street, city) does logical AND between it's 3 fields, returns result back to level #3.
-     * --- bubbling up ---
-     * Level #3 computes OR between AND (address, street, city) result and it's "last_name" expression
-     * Level #2 computes AND between "first_name" and result of level #3, AND (state, country) which is already computed
-     * Level #1 does OR between results of AND (first_name) and AND (state, country) and returns final result.
-     *
-     * @param key The partition key for the row.
-     * @param currentCluster The row cluster to check.
-     * @param staticRow The static row associated with current cluster.
-     * @return true if give Row satisfied all of the expressions in the tree,
-     *         false otherwise.
-     */
-    public boolean satisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
+    void addChild(FilterTree child)
     {
-        boolean sideL, sideR;
-
-        if (expressions == null || expressions.isEmpty())
-        {
-            sideL =  left != null &&  left.satisfiedBy(key, currentCluster, staticRow);
-            sideR = right != null && right.satisfiedBy(key, currentCluster, staticRow);
-
-            // one of the expressions was skipped
-            // because it had no indexes attached
-            if (left == null)
-                return sideR;
-        }
-        else
-        {
-            sideL = localSatisfiedBy(key, currentCluster, staticRow);
+        children.add(child);
+    }
 
-            // if there is no right it means that this expression
-            // is last in the sequence, we can just return result from local expressions
-            if (right == null)
-                return sideL;
+    public boolean isSatisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
+    {
+        boolean result = localSatisfiedBy(key, currentCluster, staticRow);
 
-            sideR = right.satisfiedBy(key, currentCluster, staticRow);
-        }
+        for (FilterTree child : children)
+            result = op.apply(result, child.isSatisfiedBy(key, currentCluster, staticRow));
 
-        return op.apply(sideL, sideR);
+        return result;
     }
 
-    /**
-     * Check every expression in the analyzed list to figure out if the
-     * columns in the give row match all of the based on the operation
-     * set to the current operation node.
-     *
-     * The algorithm is as follows: for every given expression from analyzed
-     * list get corresponding column from the Row:
-     *   - apply {@link Expression#isSatisfiedBy(ByteBuffer)}
-     *     method to figure out if it's satisfied;
-     *   - apply logical operation between boolean accumulator and current boolean result;
-     *   - if result == false and node's operation is AND return right away;
-     *
-     * After all of the expressions have been evaluated return resulting accumulator variable.
-     *
-     * Example:
-     *
-     * Operation = (op: AND, columns: [first_name = p, 5 < age < 7, last_name: y])
-     * Row = (first_name: pavel, last_name: y, age: 6, timestamp: 15)
-     *
-     * #1 get "first_name" = p (expressions)
-     *      - row-get "first_name"                      => "pavel"
-     *      - compare "pavel" against "p"               => true (current)
-     *      - set accumulator current                   => true (because this is expression #1)
-     *
-     * #2 get "last_name" = y (expressions)
-     *      - row-get "last_name"                       => "y"
-     *      - compare "y" against "y"                   => true (current)
-     *      - set accumulator to accumulator & current  => true
-     *
-     * #3 get 5 < "age" < 7 (expressions)
-     *      - row-get "age"                             => "6"
-     *      - compare 5 < 6 < 7                         => true (current)
-     *      - set accumulator to accumulator & current  => true
-     *
-     * #4 return accumulator => true (row satisfied all of the conditions)
-     *
-     * @param key The partition key for the row.
-     * @param currentCluster The row cluster to check.
-     * @param staticRow The static row associated with current cluster.
-     * @return true if give Row satisfied all of the analyzed expressions,
-     *         false otherwise.
-     */
     private boolean localSatisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
     {
         if (currentCluster == null || !currentCluster.isRow())
@@ -211,6 +111,8 @@ private boolean localSatisfiedBy(DecoratedKey key, Unfiltered currentCluster, Ro
                 // If the operation is an AND then exit early if we get a single false
                 if (op == OperationType.AND && !result)
                     return false;
+                else if (op == OperationType.OR && result)
+                    return true;
             }
         }
         return result;
diff --git a/src/java/org/apache/cassandra/index/sai/plan/Operation.java b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
index 2e91a0d9650a..2d4a8e52d369 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/Operation.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/Operation.java
@@ -24,10 +24,8 @@
 
 package org.apache.cassandra.index.sai.plan;
 
-import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 
@@ -36,21 +34,19 @@
 import com.google.common.collect.Iterables;
 import com.google.common.collect.ListMultimap;
 
-import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.db.filter.RowFilter;
-import org.apache.cassandra.db.rows.Row;
-import org.apache.cassandra.db.rows.Unfiltered;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.index.sai.ColumnContext;
-import org.apache.cassandra.index.sai.SSTableIndex;
-import org.apache.cassandra.index.sai.Token;
 import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
-import org.apache.cassandra.index.sai.utils.RangeIntersectionIterator;
 import org.apache.cassandra.index.sai.utils.RangeIterator;
-import org.apache.cassandra.index.sai.utils.RangeUnionIterator;
 import org.apache.cassandra.index.sai.utils.TypeUtil;
 import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.serializers.ListSerializer;
+import org.apache.cassandra.transport.ProtocolVersion;
 
-public class Operation extends RangeIterator
+public class Operation
 {
     public enum OperationType
     {
@@ -72,24 +68,6 @@ public boolean apply(boolean a, boolean b)
         }
     }
 
-    final FilterTree filterTree;
-    final RangeIterator range;
-
-    final QueryController controller;
-
-    private Operation(RangeIterator range, FilterTree filterTree, QueryController controller)
-    {
-        super(range);
-        this.filterTree = filterTree;
-        this.range = range;
-        this.controller = controller;
-    }
-
-    public boolean satisfiedBy(DecoratedKey key, Unfiltered currentCluster, Row staticRow)
-    {
-        return filterTree.satisfiedBy(key, currentCluster, staticRow);
-    }
-
     @VisibleForTesting
     protected static ListMultimap<ColumnMetadata, Expression> analyzeGroup(QueryController controller,
                                                                            OperationType op,
@@ -210,269 +188,211 @@ private static int getPriority(org.apache.cassandra.cql3.Operator op)
         }
     }
 
-    @Override
-    protected Token computeNext()
+    static RangeIterator buildIterator(QueryController controller)
     {
-        return range != null && range.hasNext() ? range.next() : endOfData();
+        return Node.buildTree(controller.filterOperation()).analyzeTree(controller).rangeIterator(controller);
     }
 
-    @Override
-    protected void performSkipTo(Long nextToken)
+    static FilterTree buildFilter(QueryController controller)
     {
-        if (range != null)
-            range.skipTo(nextToken);
+        return Node.buildTree(controller.filterOperation()).buildFilter(controller);
     }
 
-    @Override
-    public void close() throws IOException
+    public static abstract class Node
     {
-        if (range != null)
-            range.close();
+        ListMultimap<ColumnMetadata, Expression> expressionMap;
 
-        controller.releaseIndexes(filterTree.expressions);
-    }
+        boolean canFilter()
+        {
+            return (expressionMap != null && !expressionMap.isEmpty()) || !children().isEmpty() ;
+        }
 
-    /**
-     * @param controller current query controller
-     * @return tree builder with query expressions added from query controller.
-     */
-    static TreeBuilder initTreeBuilder(QueryController controller)
-    {
-        TreeBuilder tree = new TreeBuilder(controller);
-        tree.add(controller.getExpressions());
-        return tree;
-    }
+        List<Node> children()
+        {
+            return Collections.emptyList();
+        }
 
-    /**
-     * A builder on which like expressions are built as subtrees using {@link OperationType} OR to
-     * keep their correct semantics. Remaining expressions are added into the root AND OperationType.
-     *
-     *  Example:
-     *
-     *   3 Like expressions:
-     *
-     *                    AND (expressions)
-     *                  /   \
-     *                AND   OR (like)
-     *               /   \
-     *      (like) OR   OR (like)
-     *
-     **/
-    public static class TreeBuilder
-    {
-        private final QueryController controller;
-        final Builder root;
-        Builder subtree;
+        void add(Node child)
+        {
+            throw new UnsupportedOperationException();
+        }
 
-        TreeBuilder(QueryController controller)
+        RowFilter.Expression expression()
         {
-            this.controller = controller;
-            this.root = new Builder(OperationType.AND, controller);
-            this.subtree = root;
+            throw new UnsupportedOperationException();
         }
 
-        public TreeBuilder add(Collection<RowFilter.Expression> expressions)
+        abstract void analyze(List<RowFilter.Expression> expressionList, QueryController controller);
+
+        abstract FilterTree filterTree();
+
+        abstract RangeIterator rangeIterator(QueryController controller);
+
+        static Node buildTree(RowFilter.FilterElement filterOperation)
         {
-            if (expressions != null)
-                expressions.forEach(this::add);
-            return this;
+            OperatorNode node = filterOperation.isDisjunction() ? new OrNode() : new AndNode();
+            for (RowFilter.Expression expression : filterOperation.expressions())
+                node.add(buildExpression(expression));
+            for (RowFilter.FilterElement child : filterOperation.children())
+                node.add(buildTree(child));
+            return node;
         }
 
-        public TreeBuilder add(RowFilter.Expression exp)
+        static Node buildExpression(RowFilter.Expression expression)
         {
-            if (exp.operator().isLike())
-                addToSubTree(exp);
+            if (expression.operator() == Operator.IN)
+            {
+                OperatorNode node = new OrNode();
+                int size = ListSerializer.readCollectionSize(expression.getIndexValue(), ByteBufferAccessor.instance, ProtocolVersion.V3);
+                int offset = ListSerializer.sizeOfCollectionSize(size, ProtocolVersion.V3);
+                for (int index = 0; index < size; index++)
+                {
+                    node.add(new ExpressionNode(new RowFilter.SimpleExpression(expression.column(),
+                                                                               Operator.EQ,
+                                                                               ListSerializer.readValue(expression.getIndexValue(),
+                                                                                                        ByteBufferAccessor.instance,
+                                                                                                        offset,
+                                                                                                        ProtocolVersion.V3))));
+                    offset += TypeSizes.INT_SIZE + ByteBufferAccessor.instance.getInt(expression.getIndexValue(), offset);
+                }
+                return node;
+            }
             else
-                root.add(exp);
+                return new ExpressionNode(expression);
+        }
 
+        Node analyzeTree(QueryController controller)
+        {
+            List<RowFilter.Expression> expressionList = new ArrayList<>();
+            doTreeAnalysis(this, expressionList, controller);
+            if (!expressionList.isEmpty())
+                this.analyze(expressionList, controller);
             return this;
         }
 
-        private void addToSubTree(RowFilter.Expression exp)
+        void doTreeAnalysis(Node node, List<RowFilter.Expression> expressions, QueryController controller)
         {
-            Builder likeOperation = new Builder(OperationType.OR, controller);
-            likeOperation.add(exp);
-            if (subtree.right == null)
-            {
-                subtree.setRight(likeOperation);
-            }
-            else if (subtree.left == null)
-            {
-                Builder newSubtree = new Builder(OperationType.AND, controller);
-                subtree.setLeft(newSubtree);
-                newSubtree.setRight(likeOperation);
-                subtree = newSubtree;
-            }
+            if (node.children().isEmpty())
+                expressions.add(node.expression());
             else
             {
-                throw new IllegalStateException("Both trees are full");
+                List<RowFilter.Expression> expressionList = new ArrayList<>();
+                for (Node child : node.children())
+                    doTreeAnalysis(child, expressionList, controller);
+                node.analyze(expressionList, controller);
             }
         }
 
-        public Operation complete()
-        {
-            return root.complete();
-        }
-
-        FilterTree completeFilter()
+        FilterTree buildFilter(QueryController controller)
         {
-            return root.completeFilter();
+            analyzeTree(controller);
+            FilterTree tree = filterTree();
+            for (Node child : children())
+                if (child.canFilter())
+                    tree.addChild(child.buildFilter(controller));
+            return tree;
         }
     }
 
-    public static class Builder
+    public static abstract class OperatorNode extends Node
     {
-        private final QueryController controller;
+        List<Node> children = new ArrayList<>();
 
-        protected final OperationType op;
-        private final List<RowFilter.Expression> expressions;
-
-        protected Builder left, right;
-
-        public Builder(OperationType operation, QueryController controller, RowFilter.Expression... columns)
+        @Override
+        public List<Node> children()
         {
-            this.op = operation;
-            this.controller = controller;
-            this.expressions = new ArrayList<>();
-            Collections.addAll(expressions, columns);
+            return children;
         }
 
-        public Builder setRight(Builder operation)
+        @Override
+        public void add(Node child)
         {
-            this.right = operation;
-            return this;
+            children.add(child);
         }
+    }
 
-        public Builder setLeft(Builder operation)
+    public static class AndNode extends OperatorNode
+    {
+        @Override
+        public void analyze(List<RowFilter.Expression> expressionList, QueryController controller)
         {
-            this.left = operation;
-            return this;
+            expressionMap = analyzeGroup(controller, OperationType.AND, expressionList);
         }
 
-        public void add(RowFilter.Expression e)
+        @Override
+        FilterTree filterTree()
         {
-            expressions.add(e);
+            return new FilterTree(OperationType.AND, expressionMap);
         }
 
-        public void add(Collection<RowFilter.Expression> newExpressions)
+        @Override
+        RangeIterator rangeIterator(QueryController controller)
         {
-            if (expressions != null)
-                expressions.addAll(newExpressions);
+            RangeIterator.Builder builder = controller.getIndexes(OperationType.AND, expressionMap.values());
+            for (Node child : children)
+                if (child.canFilter())
+                    builder.add(child.rangeIterator(controller));
+            return builder.build();
         }
+    }
 
-        @SuppressWarnings("resource")
-        public Operation complete()
+    public static class OrNode extends OperatorNode
+    {
+        @Override
+        public void analyze(List<RowFilter.Expression> expressionList, QueryController controller)
         {
-            if (!expressions.isEmpty())
-            {
-                ListMultimap<ColumnMetadata, Expression> analyzedExpressions = analyzeGroup(controller, op, expressions);
-                RangeIterator.Builder range = controller.getIndexes(op, analyzedExpressions.values());
-
-                Operation rightOp = null;
-                if (right != null)
-                {
-                    rightOp = right.complete();
-                    range.add(rightOp);
-                }
-
-                FilterTree filterTree  = new FilterTree(op, analyzedExpressions, null, rightOp != null ? rightOp.filterTree : null);
-                return new Operation(range.build(), filterTree, controller);
-            }
-            else // when OR is used
-            {
-                Operation leftOp = null, rightOp = null;
-                boolean leftIndexes = false, rightIndexes = false;
+            expressionMap = analyzeGroup(controller, OperationType.OR, expressionList);
+        }
 
-                if (left != null)
-                {
-                    leftOp = left.complete();
-                    leftIndexes = leftOp != null && leftOp.range != null;
-                }
+        @Override
+        FilterTree filterTree()
+        {
+            return new FilterTree(OperationType.OR, expressionMap);
+        }
 
-                if (right != null)
-                {
-                    rightOp = right.complete();
-                    rightIndexes = rightOp != null && rightOp.range != null;
-                }
+        @Override
+        RangeIterator rangeIterator(QueryController controller)
+        {
+            RangeIterator.Builder builder = controller.getIndexes(OperationType.OR, expressionMap.values());
+            for (Node child : children)
+                if (child.canFilter())
+                    builder.add(child.rangeIterator(controller));
+            return builder.build();
+        }
+    }
 
-                RangeIterator join;
-                /**
-                 * Operation should allow one of it's sub-trees to wrap no indexes, that is related  to the fact that we
-                 * have to accept defined-but-not-indexed columns as well as key range as IndexExpressions.
-                 *
-                 * Two cases are possible:
-                 *
-                 * only left child produced indexed iterators, that could happen when there are two columns
-                 * or key range on the right:
-                 *
-                 *                AND
-                 *              /     \
-                 *            OR       \
-                 *           /   \     AND
-                 *          a     b   /   \
-                 *                  key   key
-                 *
-                 * only right child produced indexed iterators:
-                 *
-                 *               AND
-                 *              /    \
-                 *            AND     a
-                 *           /   \
-                 *         key  key
-                 */
-                if (leftIndexes && !rightIndexes)
-                    join = leftOp;
-                else if (!leftIndexes && rightIndexes)
-                    join = rightOp;
-                else if (leftIndexes)
-                {
-                    RangeIterator.Builder builder = op == OperationType.OR
-                                                                 ? RangeUnionIterator.builder()
-                                                                 : RangeIntersectionIterator.selectiveBuilder();
+    public static class ExpressionNode extends Node
+    {
+        RowFilter.Expression expression;
 
-                    join = builder.add(leftOp).add(rightOp).build();
-                }
-                else
-                    throw new AssertionError("both sub-trees have 0 indexes.");
+        @Override
+        public void analyze(List<RowFilter.Expression> expressionList, QueryController controller)
+        {
+            expressionMap = analyzeGroup(controller, OperationType.AND, expressionList);
+        }
 
-                return new Operation(join,
-                                     new FilterTree(op, null,
-                                                    leftOp == null ? null : leftOp.filterTree,
-                                                    leftOp == null ? null : leftOp.filterTree),
-                                     controller);
-            }
+        @Override
+        FilterTree filterTree()
+        {
+            return new FilterTree(OperationType.AND, expressionMap);
         }
 
-        /**
-         * To build a filter tree used to filter data using indexed expressions and non-user-defined expressions.
-         *
-         * Similar to {@link #complete()}, except that this method won't reference {@link SSTableIndex} and avoids
-         * complexity of RangeIterator.
-         *
-         * @return the filter tree
-         */
-        FilterTree completeFilter()
+        public ExpressionNode(RowFilter.Expression expression)
         {
-            if (!expressions.isEmpty())
-            {
-                ListMultimap<ColumnMetadata, Expression> analyzedExpressions = analyzeGroup(controller, op, expressions);
-                if (right != null)
-                {
-                    FilterTree ro = right.completeFilter();
-                    return new FilterTree(op, analyzedExpressions, null, ro);
-                }
-                return new FilterTree(op, analyzedExpressions, null, null);
-            }
-            else
-            {
-                FilterTree leftOperation = left != null ? left.completeFilter() : null;
-                FilterTree rightOperation = right != null ? right.completeFilter() : null;
+            this.expression = expression;
+        }
 
-                if (leftOperation == null && rightOperation == null)
-                    throw new AssertionError("both sub-trees have 0 indexes.");
+        @Override
+        public RowFilter.Expression expression()
+        {
+            return expression;
+        }
 
-                return new FilterTree(op, null, leftOperation, rightOperation);
-            }
+        @Override
+        RangeIterator rangeIterator(QueryController controller)
+        {
+            assert canFilter();
+            return controller.getIndexes(OperationType.AND, expressionMap.values()).build();
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
index 72e7592fe833..b0cfad2527cb 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/QueryController.java
@@ -21,7 +21,6 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.NavigableSet;
@@ -30,7 +29,6 @@
 import java.util.TreeSet;
 import java.util.stream.Collectors;
 
-import com.google.common.collect.ListMultimap;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Sets;
 import org.slf4j.Logger;
@@ -73,17 +71,16 @@ public class QueryController
 
     private final ColumnFamilyStore cfs;
     private final ReadCommand command;
-    private final Set<Collection<Expression>> resources = new HashSet<>();
     private final QueryContext queryContext;
     private final TableQueryMetrics tableQueryMetrics;
-    private final List<RowFilter.Expression> expressions;
+    private final RowFilter.FilterElement filterOperation;
 
     private final List<DataRange> ranges;
     private final AbstractBounds<PartitionPosition> mergeRange;
 
     public QueryController(ColumnFamilyStore cfs,
                            ReadCommand command,
-                           List<RowFilter.Expression> expressions,
+                           RowFilter.FilterElement filterOperation,
                            QueryContext queryContext,
                            TableQueryMetrics tableQueryMetrics)
     {
@@ -91,8 +88,7 @@ public QueryController(ColumnFamilyStore cfs,
         this.command = command;
         this.queryContext = queryContext;
         this.tableQueryMetrics = tableQueryMetrics;
-        this.expressions = expressions;
-
+        this.filterOperation = filterOperation;
         this.ranges = dataRanges(command);
         DataRange first = ranges.get(0);
         DataRange last = ranges.get(ranges.size() - 1);
@@ -104,12 +100,9 @@ public TableMetadata metadata()
         return command.metadata();
     }
 
-    /**
-     * @return non-user defined expressions used in the read command
-     */
-    List<RowFilter.Expression> getExpressions()
+    RowFilter.FilterElement filterOperation()
     {
-        return expressions;
+        return this.filterOperation;
     }
 
     /**
@@ -172,7 +165,6 @@ public UnfilteredRowIterator getPartition(DecoratedKey key, ReadExecutionControl
     /**
      * Build a {@link RangeIterator.Builder} from the given list of expressions by applying given operation (OR/AND).
      * Building of such builder involves index search, results of which are persisted in the internal resources list
-     * and can be released later via {@link QueryController#releaseIndexes(ListMultimap)}}.
      *
      * @param op The operation type to coalesce expressions with.
      * @param expressions The expressions to build range iterator from (expressions with not results are ignored).
@@ -181,9 +173,6 @@ public UnfilteredRowIterator getPartition(DecoratedKey key, ReadExecutionControl
      */
     public RangeIterator.Builder getIndexes(Operation.OperationType op, Collection<Expression> expressions)
     {
-        if (resources.contains(expressions))
-            throw new IllegalArgumentException("Can't process the same expressions multiple times.");
-
         boolean defer = op == Operation.OperationType.OR || RangeIntersectionIterator.shouldDefer(expressions.size());
 
         RangeIterator.Builder builder = op == Operation.OperationType.OR
@@ -209,8 +198,6 @@ public RangeIterator.Builder getIndexes(Operation.OperationType op, Collection<E
             view.forEach(e -> e.getValue().forEach(SSTableIndex::release));
             throw t;
         }
-
-        resources.add(expressions);
         return builder;
     }
 
@@ -226,12 +213,6 @@ private static void releaseQuietly(SSTableIndex index)
         }
     }
 
-    public void releaseIndexes(ListMultimap<?, Expression> expressions)
-    {
-        if (expressions != null)
-            resources.remove(expressions.values());
-    }
-
     /**
      * Used to release all resources and record metrics when query finishes.
      */
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
index 5c11aee35cd1..47e970e1ae5e 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
@@ -40,19 +40,19 @@ public class StorageAttachedIndexQueryPlan implements Index.QueryPlan
     private final ColumnFamilyStore cfs;
     private final TableQueryMetrics queryMetrics;
     private final RowFilter postIndexFilter;
-    private final List<RowFilter.Expression> expressions;
+    private final RowFilter.FilterElement filterOperation;
     private final Set<Index> indexes;
 
     private StorageAttachedIndexQueryPlan(ColumnFamilyStore cfs,
                                           TableQueryMetrics queryMetrics,
                                           RowFilter postIndexFilter,
-                                          List<RowFilter.Expression> expressions,
+                                          RowFilter.FilterElement filterOperation,
                                           ImmutableSet<Index> indexes)
     {
         this.cfs = cfs;
         this.queryMetrics = queryMetrics;
         this.postIndexFilter = postIndexFilter;
-        this.expressions = expressions;
+        this.filterOperation = filterOperation;
         this.indexes = indexes;
     }
 
@@ -65,7 +65,7 @@ public static StorageAttachedIndexQueryPlan create(ColumnFamilyStore cfs,
         ImmutableSet.Builder<Index> selectedIndexesBuilder = ImmutableSet.builder();
         List<RowFilter.Expression> acceptedExpressions = new ArrayList<>();
 
-        for (RowFilter.Expression expression : rowFilter.getExpressions())
+        for (RowFilter.Expression expression : rowFilter)
         {
             // we ignore user-defined expressions here because we don't have a way to translate their #isSatifiedBy
             // method, they will be included in the filter returned by QueryPlan#postIndexQueryFilter()
@@ -92,7 +92,7 @@ public static StorageAttachedIndexQueryPlan create(ColumnFamilyStore cfs,
          * at {@link RowFilter.UserExpression}s like those used by RLAC.
          */
         RowFilter postIndexFilter = rowFilter.restrict(e -> e.isUserDefined());
-        return new StorageAttachedIndexQueryPlan(cfs, queryMetrics, postIndexFilter, acceptedExpressions, selectedIndexes);
+        return new StorageAttachedIndexQueryPlan(cfs, queryMetrics, postIndexFilter, rowFilter.root(), selectedIndexes);
     }
 
     @Override
@@ -119,7 +119,7 @@ public boolean shouldEstimateInitialConcurrency()
     @Override
     public Index.Searcher searcherFor(ReadCommand command)
     {
-        return new StorageAttachedIndexSearcher(cfs, queryMetrics, command, expressions, DatabaseDescriptor.getRangeRpcTimeout(TimeUnit.MILLISECONDS));
+        return new StorageAttachedIndexSearcher(cfs, queryMetrics, command, filterOperation, DatabaseDescriptor.getRangeRpcTimeout(TimeUnit.MILLISECONDS));
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
index 3f55e3f648a6..33bc4fd4dda2 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexSearcher.java
@@ -46,6 +46,7 @@
 import org.apache.cassandra.index.sai.QueryContext;
 import org.apache.cassandra.index.sai.SSTableIndex;
 import org.apache.cassandra.index.sai.metrics.TableQueryMetrics;
+import org.apache.cassandra.index.sai.utils.RangeIterator;
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.AbstractIterator;
@@ -59,12 +60,12 @@ public class StorageAttachedIndexSearcher implements Index.Searcher
     public StorageAttachedIndexSearcher(ColumnFamilyStore cfs,
                                         TableQueryMetrics tableQueryMetrics,
                                         ReadCommand command,
-                                        List<RowFilter.Expression> expressions,
+                                        RowFilter.FilterElement filterOperation,
                                         long executionQuotaMs)
     {
         this.command = command;
         this.queryContext = new QueryContext(executionQuotaMs);
-        this.controller = new QueryController(cfs, command, expressions, queryContext, tableQueryMetrics);
+        this.controller = new QueryController(cfs, command, filterOperation, queryContext, tableQueryMetrics);
     }
 
     @Override
@@ -76,7 +77,7 @@ public ReadCommand command()
     @Override
     public PartitionIterator filterReplicaFilteringProtection(PartitionIterator fullResponse)
     {
-        for (RowFilter.Expression expression : controller.getExpressions())
+        for (RowFilter.Expression expression : controller.filterOperation())
         {
             if (controller.getContext(expression).getAnalyzer().transformValue())
                 return applyIndexFilter(fullResponse, analyzeFilter(), queryContext);
@@ -89,7 +90,7 @@ public PartitionIterator filterReplicaFilteringProtection(PartitionIterator full
     @Override
     public UnfilteredPartitionIterator search(ReadExecutionController executionController) throws RequestTimeoutException
     {
-        return  new ResultRetriever(analyze(), controller, executionController, queryContext);
+        return  new ResultRetriever(analyze(), analyzeFilter(), controller, executionController, queryContext);
     }
 
     /**
@@ -97,9 +98,9 @@ public UnfilteredPartitionIterator search(ReadExecutionController executionContr
      *
      * @return operation
      */
-    private Operation analyze()
+    private RangeIterator analyze()
     {
-        return Operation.initTreeBuilder(controller).complete();
+        return Operation.buildIterator(controller);
     }
 
     /**
@@ -113,7 +114,7 @@ private Operation analyze()
      */
     private FilterTree analyzeFilter()
     {
-        return Operation.initTreeBuilder(controller).completeFilter();
+        return Operation.buildFilter(controller);
     }
 
     private static class ResultRetriever extends AbstractIterator<UnfilteredRowIterator> implements UnfilteredPartitionIterator
@@ -123,7 +124,8 @@ private static class ResultRetriever extends AbstractIterator<UnfilteredRowItera
         private final Iterator<DataRange> keyRanges;
         private AbstractBounds<PartitionPosition> current;
 
-        private final Operation operation;
+        private final RangeIterator operation;
+        private final FilterTree filterTree;
         private final QueryController controller;
         private final ReadExecutionController executionController;
         private final QueryContext queryContext;
@@ -131,13 +133,14 @@ private static class ResultRetriever extends AbstractIterator<UnfilteredRowItera
         private Iterator<DecoratedKey> currentKeys = null;
         private DecoratedKey lastKey;
 
-        private ResultRetriever(Operation operation, QueryController controller,
+        private ResultRetriever(RangeIterator operation, FilterTree filterTree, QueryController controller,
                                 ReadExecutionController executionController, QueryContext queryContext)
         {
             this.keyRanges = controller.dataRanges().iterator();
             this.current = keyRanges.next().keyRange();
 
             this.operation = operation;
+            this.filterTree = filterTree;
             this.controller = controller;
             this.executionController = executionController;
             this.queryContext = queryContext;
@@ -221,7 +224,7 @@ public UnfilteredRowIterator apply(DecoratedKey key)
             {
                 queryContext.partitionsRead++;
 
-                return applyIndexFilter(key, partition, operation.filterTree, queryContext);
+                return applyIndexFilter(key, partition, filterTree, queryContext);
             }
         }
 
@@ -235,14 +238,14 @@ private static UnfilteredRowIterator applyIndexFilter(DecoratedKey key, Unfilter
                 Unfiltered row = partition.next();
 
                 queryContext.rowsFiltered++;
-                if (tree.satisfiedBy(key, row, staticRow))
+                if (tree.isSatisfiedBy(key, row, staticRow))
                     clusters.add(row);
             }
 
             if (clusters.isEmpty())
             {
                 queryContext.rowsFiltered++;
-                if (tree.satisfiedBy(key, staticRow, staticRow))
+                if (tree.isSatisfiedBy(key, staticRow, staticRow))
                     clusters.add(staticRow);
             }
 
@@ -371,7 +374,7 @@ public boolean hasNext()
                         {
                             next = delegate.next();
                             queryContext.rowsFiltered++;
-                            if (tree.satisfiedBy(delegate.partitionKey(), next, staticRow))
+                            if (tree.isSatisfiedBy(delegate.partitionKey(), next, staticRow))
                                 return true;
                         }
                         return false;
diff --git a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
new file mode 100644
index 000000000000..8de2ce40ff1e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import org.junit.Test;
+
+import com.bpodgursky.jbool_expressions.parsers.ExprParser;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.WhereClause;
+import org.apache.cassandra.exceptions.SyntaxException;
+
+import static org.junit.Assert.assertEquals;
+
+public class WhereClauseExpressionTreeTest
+{
+    @Test(expected = SyntaxException.class)
+    public void cannotHaveEmptyWhereClause() throws Throwable
+    {
+        cqlParse(""); // parsing an empty WHERE clause is expected to throw SyntaxException
+    }
+
+    @Test
+    public void singleRelationWithoutEnclosure() throws Throwable
+    {
+        testExpression("a = 1");
+    }
+
+    @Test
+    public void singleRelationWithEnclosure() throws Throwable
+    {
+        testExpression("(a = 1)");
+    }
+
+    @Test
+    public void simpleAndExpressionWithRelationsWithoutEnclosure() throws Throwable
+    {
+        testExpression("a = 1 AND b = 1");
+    }
+
+    @Test
+    public void simpleAndExpressionWithRelationsWithEnclosure() throws Throwable
+    {
+        testExpression("(a = 1 AND b = 1)");
+    }
+
+    @Test
+    public void multipleAndExpressionWithRelations() throws Throwable
+    {
+        testExpression("a = 1 AND b = 1 AND c = 1");
+    }
+
+    @Test
+    public void disjunctionExpression() throws Throwable
+    {
+        testExpression("a = 1 AND b = 1 OR c = 1");
+    }
+
+    @Test
+    public void test() throws Throwable
+    {
+        testExpression("a = 1 OR b = 1 AND c = 1"); // assert against the jbool reference parse instead of printing to stdout
+    }
+
+    @Test
+    public void precedenceIsMaintainedWithoutParentheses() throws Throwable
+    {
+        testExpression("a = 1 AND b = 1 OR c = 1");
+
+        testExpression("a = 1 OR b = 1 AND c = 1");
+
+        testExpression("a = 1 OR b = 1 OR c = 1 AND d = 1 OR e = 1");
+
+        testExpression("a = 1 AND b = 1 AND c = 1 OR d = 1 AND e = 1");
+    }
+
+    @Test
+    public void multipleDisjunctionExpression() throws Throwable
+    {
+        testExpression("(a = 1 AND b = 1) OR (c = 1 AND d = 1)");
+    }
+
+    @Test
+    public void disjunctionExpressionWithPrecedence() throws Throwable
+    {
+        testExpression("a = 1 AND (b = 1 OR (c = 1 AND d = 1 AND e = 1))");
+    }
+
+    @Test
+    public void randomTest() throws Throwable
+    {
+        for (int count = 0, iterations = CQLTester.getRandom().nextIntBetween(100, 1000); count < iterations; count++)
+            testExpression(randomExpression()); // iteration count is drawn once above, not re-rolled on every loop check
+    }
+
+    private void testExpression(String expression) throws Throwable
+    { // expected: jbool parse of the raw text; actual: jbool parse of the CQL parser's rendering of the same text
+        assertEquals("Failed to correctly parse: [" + expression + "]", jboolParse(expression), jboolParse(cqlParse(expression)));
+    }
+
+   private static final String alphabet = "abcdefghijklmnopqrstuvwxyz"; // source of single-letter column names for randomExpression()
+
+   private String randomExpression()
+   {
+       StringBuilder builder = new StringBuilder();
+
+       boolean applyPrecedence = CQLTester.getRandom().nextBoolean(); // when false, no parentheses are emitted at all
+
+       int numberOfElements = CQLTester.getRandom().nextIntBetween(1, 26); // relations to emit; also indexes alphabet, so must stay <= 26
+       int precedenceLevel = 0; // number of currently unclosed "("
+       for (int element = 0; element < numberOfElements - 1; element++)
+       {
+           if (applyPrecedence && CQLTester.getRandom().nextIntBetween(0, 2) == 0)
+           {
+               builder.append("(");
+               precedenceLevel++;
+           }
+           builder.append(alphabet, element, element + 1); // the element-th letter is used as the column name
+           builder.append(" = 1");
+           if (applyPrecedence && CQLTester.getRandom().nextIntBetween(0, 2) == 2 && precedenceLevel > 0)
+           {
+               builder.append(")");
+               precedenceLevel--;
+           }
+           builder.append(CQLTester.getRandom().nextBoolean() ? " AND " : " OR ");
+       }
+       builder.append(alphabet, numberOfElements - 1, numberOfElements); // last relation: no trailing operator
+       builder.append(" = 1");
+       if (applyPrecedence)
+           while (precedenceLevel-- > 0) // close whatever parentheses are still open
+               builder.append(")");
+
+       return builder.toString();
+   }
+
+   private String cqlParse(String expression) throws Throwable
+   {
+       return WhereClause.parse(expression).root().toString(); // string rendering of the root of the parsed WHERE clause
+   }
+
+   private String jboolParse(String expression)
+   {
+       return ExprParser.parse(toJbool(expression)).toString(); // convert to jbool syntax first, then render jbool's parse of it
+   }
+
+   private String toJbool(String cqlExpression)
+   {
+       return cqlExpression.replace("AND", "&").replace("OR", "|").replace(" = 1", ""); // literal swaps: AND -> &, OR -> |, drop " = 1"
+   }
+}
diff --git a/src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java b/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
similarity index 97%
rename from src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java
rename to test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
index 48e7fbda0338..6215ca637de9 100644
--- a/src/java/org/apache/cassandra/db/AbstractReadCommandBuilder.java
+++ b/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
@@ -43,7 +43,7 @@ public abstract class AbstractReadCommandBuilder
     protected boolean reversed = false;
 
     protected Set<ColumnIdentifier> columns;
-    protected final RowFilter filter = RowFilter.create();
+    protected final RowFilter.Builder filter = RowFilter.builder();
 
     private ClusteringBound<?> lowerClusteringBound;
     private ClusteringBound<?> upperClusteringBound;
@@ -233,7 +233,7 @@ public SinglePartitionBuilder(ColumnFamilyStore cfs, DecoratedKey key)
         @Override
         public ReadCommand build()
         {
-            return SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, makeColumnFilter(), filter, makeLimits(), partitionKey, makeFilter());
+            return SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, makeColumnFilter(), filter.build(), makeLimits(), partitionKey, makeFilter());
         }
     }
 
@@ -307,7 +307,7 @@ else if (!startInclusive && endInclusive)
             else
                 bounds = new ExcludingBounds<>(start, end);
 
-            return PartitionRangeReadCommand.create(cfs.metadata(), nowInSeconds, makeColumnFilter(), filter, makeLimits(), new DataRange(bounds, makeFilter()));
+            return PartitionRangeReadCommand.create(cfs.metadata(), nowInSeconds, makeColumnFilter(), filter.build(), makeLimits(), new DataRange(bounds, makeFilter()));
         }
 
         static DecoratedKey makeKey(TableMetadata metadata, Object... partitionKey)
diff --git a/test/unit/org/apache/cassandra/db/CleanupTest.java b/test/unit/org/apache/cassandra/db/CleanupTest.java
index d53241949d01..3285f6dde757 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTest.java
@@ -159,7 +159,7 @@ public void testCleanupWithIndexes() throws IOException, ExecutionException, Int
         while (!cfs.getBuiltIndexes().contains(indexName) && System.nanoTime() - start < TimeUnit.SECONDS.toNanos(10))
             Thread.sleep(10);
 
-        RowFilter cf = RowFilter.create();
+        RowFilter.Builder cf = RowFilter.builder();
         cf.add(cdef, Operator.EQ, VALUE);
         assertEquals(LOOPS, Util.getAll(Util.cmd(cfs).filterOn("birthdate", Operator.EQ, VALUE).build()).size());
 
diff --git a/test/unit/org/apache/cassandra/db/ReadCommandTest.java b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
index 8aca305bdef5..0096e59aad3a 100644
--- a/test/unit/org/apache/cassandra/db/ReadCommandTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadCommandTest.java
@@ -330,7 +330,7 @@ public void testSinglePartitionGroupMerge() throws Exception
         List<ByteBuffer> buffers = new ArrayList<>(groups.length);
         int nowInSeconds = FBUtilities.nowInSeconds();
         ColumnFilter columnFilter = ColumnFilter.allRegularColumnsBuilder(cfs.metadata(), false).build();
-        RowFilter rowFilter = RowFilter.create();
+        RowFilter.Builder rowFilter = RowFilter.builder();
         Slice slice = Slice.make(BufferClusteringBound.BOTTOM, BufferClusteringBound.TOP);
         ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(Slices.with(cfs.metadata().comparator, slice), false);
 
@@ -354,7 +354,7 @@ public void testSinglePartitionGroupMerge() throws Exception
                 {
                     RowUpdateBuilder.deleteRow(cfs.metadata(), FBUtilities.timestampMicros(), ByteBufferUtil.bytes(data[1]), data[2]).apply();
                 }
-                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter, DataLimits.NONE, Util.dk(data[1]), sliceFilter));
+                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter.build(), DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
             cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
@@ -497,7 +497,7 @@ public void testCountDeletedRows() throws Exception
         List<ByteBuffer> buffers = new ArrayList<>(groups.length);
         int nowInSeconds = FBUtilities.nowInSeconds();
         ColumnFilter columnFilter = ColumnFilter.allRegularColumnsBuilder(cfs.metadata(), false).build();
-        RowFilter rowFilter = RowFilter.create();
+        RowFilter.Builder rowFilter = RowFilter.builder();
         Slice slice = Slice.make(BufferClusteringBound.BOTTOM, BufferClusteringBound.TOP);
         ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(
                 Slices.with(cfs.metadata().comparator, slice), false);
@@ -523,7 +523,7 @@ public void testCountDeletedRows() throws Exception
                     RowUpdateBuilder.deleteRow(cfs.metadata(), FBUtilities.timestampMicros(),
                             ByteBufferUtil.bytes(data[1]), data[2]).apply();
                 }
-                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter,
+                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter.build(),
                         DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
@@ -573,7 +573,7 @@ public void testCountWithNoDeletedRow() throws Exception
         List<ByteBuffer> buffers = new ArrayList<>(groups.length);
         int nowInSeconds = FBUtilities.nowInSeconds();
         ColumnFilter columnFilter = ColumnFilter.allRegularColumnsBuilder(cfs.metadata(), false).build();
-        RowFilter rowFilter = RowFilter.create();
+        RowFilter.Builder rowFilter = RowFilter.builder();
         Slice slice = Slice.make(BufferClusteringBound.BOTTOM, BufferClusteringBound.TOP);
         ClusteringIndexSliceFilter sliceFilter = new ClusteringIndexSliceFilter(
                 Slices.with(cfs.metadata().comparator, slice), false);
@@ -599,7 +599,7 @@ public void testCountWithNoDeletedRow() throws Exception
                     RowUpdateBuilder.deleteRow(cfs.metadata(), FBUtilities.timestampMicros(),
                             ByteBufferUtil.bytes(data[1]), data[2]).apply();
                 }
-                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter,
+                commands.add(SinglePartitionReadCommand.create(cfs.metadata(), nowInSeconds, columnFilter, rowFilter.build(),
                         DataLimits.NONE, Util.dk(data[1]), sliceFilter));
             }
 
diff --git a/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java
index 333d3f8c9e73..c2ca7d558008 100644
--- a/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java
+++ b/test/unit/org/apache/cassandra/db/filter/RowFilterTest.java
@@ -63,10 +63,10 @@ public void testCQLFilterClose()
         ColumnMetadata r = metadata.getColumn(new ColumnIdentifier("r", true));
 
         ByteBuffer one = Int32Type.instance.decompose(1);
-        RowFilter filter = RowFilter.NONE.withNewExpressions(new ArrayList<>());
+        RowFilter.Builder filter = RowFilter.builder();
         filter.add(s, Operator.NEQ, one);
         AtomicBoolean closed = new AtomicBoolean();
-        UnfilteredPartitionIterator iter = filter.filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
+        UnfilteredPartitionIterator iter = filter.build().filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
         {
             public DeletionTime partitionLevelDeletion() { return null; }
             public EncodingStats stats() { return null; }
@@ -91,10 +91,10 @@ public void close()
         Assert.assertFalse(iter.hasNext());
         Assert.assertTrue(closed.get());
 
-        filter = RowFilter.NONE.withNewExpressions(new ArrayList<>());
+        filter = RowFilter.builder();
         filter.add(r, Operator.NEQ, one);
         closed.set(false);
-        iter = filter.filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
+        iter = filter.build().filter(new SingletonUnfilteredPartitionIterator(new UnfilteredRowIterator()
         {
             boolean hasNext = true;
             public DeletionTime partitionLevelDeletion() { return null; }
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
index 1c98f9ae14e4..899b7861c600 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/AllowFilteringTest.java
@@ -381,11 +381,6 @@ public void testUnsupportedIndexRestrictions() throws Throwable
         assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, "[b, c]"), "SELECT * FROM %s WHERE c > 'Test' AND b < 'Test3'");
         assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_MULTI, "[b, d]"), "SELECT * FROM %s WHERE d > 'Test' AND b < 'Test3'");
 
-        // IN restriction
-        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "b"), "SELECT * FROM %s WHERE b IN ('Test1', 'Test2')");
-        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "c"), "SELECT * FROM %s WHERE c IN ('Test1', 'Test2')");
-        assertInvalidMessage(String.format(StatementRestrictions.HAS_UNSUPPORTED_INDEX_RESTRICTION_MESSAGE_SINGLE, "d"), "SELECT * FROM %s WHERE d IN ('Test1', 'Test2')");
-
         // The same queries with ALLOW FILTERING should work
 
         // Single restriction
@@ -411,14 +406,6 @@ public void testUnsupportedIndexRestrictions() throws Throwable
                                                                                                                     row("Test2", "Test2", "Test2", "Test2"));
         assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d > 'Test' AND b < 'Test3' ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
                                                                                                                     row("Test2", "Test2", "Test2", "Test2"));
-
-        // IN restriction
-        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE b IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
-                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
-        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE c IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
-                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
-        assertRowsIgnoringOrder(execute("SELECT * FROM %s WHERE d IN ('Test1', 'Test2') ALLOW FILTERING"), row("Test1", "Test1", "Test1", "Test1"),
-                                                                                                                 row("Test2", "Test2", "Test2", "Test2"));
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
new file mode 100644
index 000000000000..1dc18601b843
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import org.junit.Test;
+
+import com.datastax.driver.core.ResultSet;
+import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.index.sai.SAITester;
+
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.junit.Assert.assertEquals;
+
+public class ComplexQueryTest extends SAITester
+{
+    @Test
+    public void basicOrTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, a int, PRIMARY KEY(pk))");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 2, 2);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 3, 3);
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE a = 1 or a = 3");
+
+        assertRowsIgnoringOrder(resultSet, row(1), row(3) );
+    }
+
+    @Test
+    public void basicInTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, a int, PRIMARY KEY(pk))");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 2, 2);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 3, 3);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 4, 4);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 5, 5);
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE a in (1, 3, 5)");
+
+        assertRowsIgnoringOrder(resultSet, row(1), row(3), row(5));
+    }
+
+    @Test
+    public void complexQueryTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, a int, b int, c int, d int, PRIMARY KEY(pk))");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(b) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(c) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(d) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 1, 1, 1, 1, 1);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 2, 2, 1, 1, 1);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 3, 3, 2, 1, 1);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 4, 4, 2, 2, 1);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 5, 5, 3, 2, 1);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 6, 6, 3, 2, 2);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 7, 7, 4, 3, 2);
+        execute("INSERT INTO %s (pk, a, b, c, d) VALUES (?, ?, ?, ?, ?)", 8, 8, 4, 3, 3);
+
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE (a = 1 AND c = 1) OR (b IN (3, 4) AND d = 2)");
+
+        assertRowsIgnoringOrder(resultSet, row(1), row(6), row(7) );
+    }
+
+    @Test
+    public void disjunctionWithIndexOnClusteringKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, a int, PRIMARY KEY(pk, ck))");
+        createIndex("CREATE CUSTOM INDEX ON %s(ck) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, ck, a) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (pk, ck, a) VALUES (?, ?, ?)", 2, 2, 2);
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE a = 1 or ck = 2");
+
+        assertRowsIgnoringOrder(resultSet, row(1), row(2));
+    }
+
+    @Test
+    public void complexQueryWithMultipleClusterings() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck0 int, ck1 int, a int, b int, c int, d int, e int, PRIMARY KEY(pk, ck0, ck1))");
+        createIndex("CREATE CUSTOM INDEX ON %s(ck0) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(ck1) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(b) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(c) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(d) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(e) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, ck0, ck1, a, b, c, d, e) VALUES (?, ?, ?, ?, ?, ? ,?, ?)", 1, 1, 1, 1, 1, 1, 1, 1);
+        execute("INSERT INTO %s (pk, ck0, ck1, a, b, c, d, e) VALUES (?, ?, ?, ?, ?, ? ,?, ?)", 2, 2, 2, 2, 2, 2, 2, 2);
+        execute("INSERT INTO %s (pk, ck0, ck1, a, b, c, d, e) VALUES (?, ?, ?, ?, ?, ? ,?, ?)", 3, 3, 3, 3, 3, 3, 3, 3);
+        execute("INSERT INTO %s (pk, ck0, ck1, a, b, c, d, e) VALUES (?, ?, ?, ?, ?, ? ,?, ?)", 4, 4, 4, 4, 4, 4, 4, 4);
+        execute("INSERT INTO %s (pk, ck0, ck1, a, b, c, d, e) VALUES (?, ?, ?, ?, ?, ? ,?, ?)", 5, 5, 5, 5, 5, 5, 5, 5);
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE b = 6 AND d = 6 OR (a = 6 OR (c = 3 OR ck0 = 5))");
+
+        assertRowsIgnoringOrder(resultSet, row(3), row(5));
+
+        resultSet = execute("SELECT pk FROM %s WHERE ck0 = 1 AND (b = 6 AND c = 6 OR (d = 6 OR e = 6))");
+
+        assertEquals(0 , resultSet.size());
+
+        resultSet = execute("SELECT pk FROM %s WHERE b = 4 OR a = 3 OR c = 5");
+
+        assertRowsIgnoringOrder(resultSet, row(3), row(4), row(5));
+
+    }
+
+    @Test
+    public void complexQueryWithPartitionKeyRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, a int, b int, PRIMARY KEY(pk, ck))");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
+        createIndex("CREATE CUSTOM INDEX ON %s(b) USING 'StorageAttachedIndex'");
+
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 1, 1, 1, 5);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 1, 2, 2, 6);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 1, 3, 7);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 2, 4, 8);
+
+        UntypedResultSet resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 AND (a = 2 OR b = 7)");
+
+        assertRowsIgnoringOrder(resultSet, row(1, 2));
+
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7")).isInstanceOf(InvalidRequestException.class)
+                                                                                                 .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+
+        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7 ALLOW FILTERING");
+
+        assertRowsIgnoringOrder(resultSet, row(1, 1), row(1, 2), row(2, 1));
+    }
+
+    @Test
+    public void indexNotSupportingDisjunctionTest() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, a int, PRIMARY KEY(pk))");
+        createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'org.apache.cassandra.index.sasi.SASIIndex'");
+
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 1, 1);
+        execute("INSERT INTO %s (pk, a) VALUES (?, ?)", 2, 2);
+
+        assertThatThrownBy(() -> execute("SELECT pk FROM %s WHERE a = 1 or a = 2")).isInstanceOf(InvalidRequestException.class)
+                                                                                   .hasMessage(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_DISJUNCTION);
+
+        assertThatThrownBy(() -> execute("SELECT pk FROM %s WHERE a = 1 or a = 2 ALLOW FILTERING")).isInstanceOf(InvalidRequestException.class)
+                                                                                                   .hasMessage(StatementRestrictions.INDEX_DOES_NOT_SUPPORT_DISJUNCTION);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/RandomisedComplexQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/RandomisedComplexQueryTest.java
new file mode 100644
index 000000000000..ab02dff76098
--- /dev/null
+++ b/test/unit/org/apache/cassandra/index/sai/cql/RandomisedComplexQueryTest.java
@@ -0,0 +1,450 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.index.sai.cql;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Streams;
+import com.google.common.collect.TreeMultimap;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQL3Type;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.ColumnIdentifier;
+import org.apache.cassandra.cql3.Constants;
+import org.apache.cassandra.cql3.Relation;
+import org.apache.cassandra.cql3.SingleColumnRelation;
+import org.apache.cassandra.cql3.WhereClause;
+import org.apache.cassandra.index.sai.SAITester;
+import org.assertj.core.util.Lists;
+
+/**
+ * This test produces a random schema, loads it with random data and then runs a series of
+ * random queries against it.
+ *
+ * The purpose of the test is to test that the <code>RowFilter</code> and <code>Operation</code>
+ * classes correctly support complex queries.
+ *
+ * At present the test only supports ascii and int datatypes and only supports EQ expressions.
+ * It is intended that this can be extended in the future to support more functionality.
+ */
+public class RandomisedComplexQueryTest extends SAITester
+{
+    private static final List<TypeInfo> types = Lists.list(TypeInfo.create(CQL3Type.Native.ASCII, () -> CQLTester.getRandom().nextAsciiString(4, 30), true),
+                                                           TypeInfo.create(CQL3Type.Native.INT, () -> CQLTester.getRandom().nextIntBetween(0, 1000), false));
+
+    @Test
+    public void test() throws Throwable
+    {
+        for (int test = 0, rounds = getRandom().nextIntBetween(10, 50); test < rounds; test++) // bound rolled once, not per iteration
+            runRandomTest();
+    }
+
+    private void runRandomTest() throws Throwable
+    {
+        // One round: random schema -> table and indexes -> random rows -> random queries checked against those rows
+        RandomSchema schema = new RandomSchema();
+        createTable(schema.toTableDefinition());
+
+        for (String index : schema.generateIndexStrings())
+            createIndex(index);
+
+        waitForIndexQueryable();
+
+        List<RandomRow> data = schema.generateDataset();
+        String insert = schema.toInsert();
+        for (RandomRow row : data)
+            execute(insert, row.toArray());
+
+        // Roll the query count once; re-rolling it in the loop condition would skew rounds towards fewer queries
+        for (int query = 0, queries = getRandom().nextIntBetween(100, 1000); query < queries; query++)
+            schema.generateQuery().test(this, data);
+    }
+
+    public static class RandomSchema
+    {
+        private final Map<String, RandomColumn> columnMap = new HashMap<>();
+        private final TreeMultimap<String, Object> values = TreeMultimap.create(Ordering.natural(), Ordering.arbitrary());
+
+        private final List<RandomColumn> partitionKeys;
+        private final List<RandomColumn> clusteringKeys;
+        private final List<RandomColumn> normalColumns;
+
+        RandomSchema()
+        {
+            // Bind positions run contiguously across the three groups: partition keys, then clustering keys,
+            // then regular columns, which is the column order used by toInsert() and generateRow()
+            partitionKeys = generateColumns("pk", CQLTester.getRandom().nextIntBetween(1, 3), 0);
+            clusteringKeys = generateColumns("ck", CQLTester.getRandom().nextIntBetween(0, 4), partitionKeys.size());
+            int firstNormalPosition = partitionKeys.size() + clusteringKeys.size();
+            normalColumns = generateColumns("nc", CQLTester.getRandom().nextIntBetween(1, 10), firstNormalPosition);
+        }
+
+        /**
+         * Builds the CREATE TABLE statement for this schema, with a <code>%s</code> placeholder for the table.
+         * Columns are declared in bind order (partition keys, clustering keys, regular columns) and the
+         * partition key is always parenthesised, e.g. <code>PRIMARY KEY ((pk0, pk1), ck0)</code>.
+         */
+        public String toTableDefinition()
+        {
+            // Column definitions, comma separated
+            String columnDefinitions = Streams.concat(partitionKeys.stream(), clusteringKeys.stream(), normalColumns.stream())
+                                              .map(RandomColumn::toColumnDefinition)
+                                              .collect(Collectors.joining(", "));
+
+            // Clustering keys, when there are any, follow the parenthesised partition key
+            String primaryKey = partitionKeys.stream().map(RandomColumn::name).collect(Collectors.joining(", ", "(", ")"));
+            if (!clusteringKeys.isEmpty())
+                primaryKey += ", " + clusteringKeys.stream().map(RandomColumn::name).collect(Collectors.joining(", "));
+
+            return "CREATE TABLE %s (" + columnDefinitions + ", PRIMARY KEY (" + primaryKey + "))";
+        }
+
+        /**
+         * Builds the INSERT statement for this schema, with a <code>%s</code> placeholder for the table:
+         * every column is named in bind order (partition keys, clustering keys, regular columns)
+         * and is given its bind marker in the same order.
+         */
+        public String toInsert()
+        {
+            // Materialise the column order once so that names and markers are derived from the same sequence
+            List<RandomColumn> ordered = Streams.concat(partitionKeys.stream(), clusteringKeys.stream(), normalColumns.stream())
+                                                .collect(Collectors.toList());
+
+            String names = ordered.stream().map(RandomColumn::name).collect(Collectors.joining(", "));
+            String markers = ordered.stream().map(RandomColumn::bindMarker).collect(Collectors.joining(", "));
+
+            return "INSERT INTO %s (" + names + ") VALUES (" + markers + ")";
+        }
+
+        /** Returns one index definition per clustering key followed by one per regular column; partition keys get none. */
+        public List<String> generateIndexStrings()
+        {
+            return Stream.concat(clusteringKeys.stream(), normalColumns.stream())
+                         .map(RandomColumn::toIndexDefinition)
+                         .collect(Collectors.toCollection(ArrayList::new));
+        }
+
+        /** Generates a random number of rows in which no two rows share the same partition key values. */
+        public List<RandomRow> generateDataset()
+        {
+            List<RandomRow> data = new ArrayList<>();
+
+            // Roll the row count once; re-rolling it in the loop condition would skew datasets towards fewer rows
+            for (int row = 0, rows = CQLTester.getRandom().nextIntBetween(100, 1000); row < rows; row++)
+            {
+                RandomRow newRow = generateRow();
+                // Remove any duplicate rows - makes it easier to build result set
+                // It may be possible to handle duplicates in the filtering but for the
+                // time being we just get rid of them. A duplicate is an earlier row whose
+                // partition key values (the leading entries of a row) all equal the new row's.
+                data.removeIf(existing -> {
+                    for (int pk = 0; pk < partitionKeys.size(); pk++)
+                        if (!existing.values.get(pk).equals(newRow.values.get(pk)))
+                            return false;
+                    return true;
+                });
+                data.add(newRow);
+            }
+
+            return data;
+        }
+
+        public RandomQuery generateQuery() throws Throwable
+        {
+            StringBuilder builder = new StringBuilder();
+            // Builds the text of a random WHERE clause: "column = value" terms on distinct columns joined by AND/OR
+            boolean applyPrecedence = CQLTester.getRandom().nextBoolean();
+            // Choose a random number of distinct columns from the clustering and regular columns (partition keys are not used)
+            List<RandomColumn> allColumns = Lists.newArrayList(clusteringKeys);
+            allColumns.addAll(normalColumns);
+            int numberOfElements = CQLTester.getRandom().nextIntBetween(1, allColumns.size());
+            Set<RandomColumn> columns = new HashSet<>();
+            while (columns.size() < numberOfElements)
+                columns.add(allColumns.get(CQLTester.getRandom().nextIntBetween(0, allColumns.size() - 1)));
+            // precedenceLevel counts the parentheses currently open; every term but the last is followed by a connective
+            RandomColumn[] columnArray = columns.toArray(new RandomColumn[] {});
+            int precedenceLevel = 0;
+            for (int element = 0; element < numberOfElements - 1; element++)
+            {
+                if (applyPrecedence && CQLTester.getRandom().nextIntBetween(0, 2) == 0)
+                {
+                    builder.append("(");
+                    precedenceLevel++;
+                }
+                builder.append(columnArray[element].name);
+                builder.append(" = ");
+                builder.append(columnArray[element].randomQueryValue());
+                // Possibly close one open parenthesis after the term (the random draw precedes the open-count check)
+                if (applyPrecedence && CQLTester.getRandom().nextIntBetween(0, 2) == 2 && precedenceLevel > 0)
+                {
+                    builder.append(")");
+                    precedenceLevel--;
+                }
+                builder.append(CQLTester.getRandom().nextBoolean() ? " AND " : " OR ");
+            }
+            builder.append(columnArray[columnArray.length - 1].name);
+            builder.append(" = ");
+            builder.append(columnArray[columnArray.length - 1].randomQueryValue());
+            if (applyPrecedence)
+                while (precedenceLevel-- > 0)
+                    builder.append(")");
+            // The last term carries no connective; any parentheses still open were closed just above
+            return new RandomQuery(this, builder.toString());
+        }
+
+        private RandomRow generateRow()
+        {
+            // One fresh value per column, in bind order: partition keys, clustering keys, regular columns
+            RandomRow row = new RandomRow();
+            Streams.concat(partitionKeys.stream(), clusteringKeys.stream(), normalColumns.stream()).forEach(c -> row.add(c.nextValue()));
+            return row;
+        }
+
+        private List<RandomColumn> generateColumns(String prefix, int count, int bindPosition)
+        {
+            // Columns are named prefix0, prefix1, ... and take consecutive bind positions starting at bindPosition
+            List<RandomColumn> columns = new ArrayList<>(count);
+            while (columns.size() < count)
+                columns.add(new RandomColumn(this, prefix + columns.size(), getRandomType(), bindPosition + columns.size()));
+
+            // Register every new column under its name so that query expressions can look up its bind position
+            columns.forEach(column -> columnMap.put(column.name, column));
+            return columns;
+        }
+
+        private static TypeInfo getRandomType()
+        {
+            return types.get(CQLTester.getRandom().nextIntBetween(0, types.size() - 1)); // random entry of the supported types
+        }
+    }
+
+    public static class RandomQuery
+    {
+        private final RandomSchema schema;
+        private final String query;
+        private final Filter filter;
+
+        /** Parses the given WHERE-clause text and mirrors its expression tree as an in-memory filter. */
+        RandomQuery(RandomSchema schema, String query) throws Throwable
+        {
+            this.schema = schema;
+            this.query = query;
+            // buildFilter reads this.schema, so it has to run after the assignment above
+            this.filter = buildFilter(WhereClause.parse(query).root());
+        }
+
+        /** Executes the query through the tester and checks the result against the rows the filter accepts. */
+        void test(SAITester tester, List<RandomRow> data) throws Throwable
+        {
+            CQLTester.assertRowsIgnoringOrder(tester.execute("SELECT * FROM %s WHERE " + query), expectedRows(data));
+        }
+
+        /** Returns, in dataset order, the rows of <code>data</code> that satisfy this query's filter. */
+        Object[][] expectedRows(List<RandomRow> data)
+        {
+            return data.stream()
+                       .filter(filter::isSatisfiedBy)
+                       .map(RandomRow::toArray)
+                       .toArray(Object[][]::new);
+        }
+
+        /** Recursively converts a parsed element (its relations and its nested operations) into a Filter. */
+        Filter buildFilter(WhereClause.ExpressionElement element)
+        {
+            Filter node = new Filter();
+            node.isDisjunction = element.isDisjunction();
+            for (Relation relation : element.relations())
+                node.expressions.add(new Expression(schema, relation));
+            for (WhereClause.ExpressionElement nested : element.operations())
+                node.children.add(buildFilter(nested));
+            return node;
+        }
+
+        /**
+         * In-memory mirror of one level of the parsed WHERE clause: a list of column expressions plus
+         * nested filters, combined with OR when isDisjunction is set and with AND otherwise.
+         */
+        static class Filter
+        {
+            boolean isDisjunction;
+
+            List<Expression> expressions = new ArrayList<>();
+
+            List<Filter> children = new ArrayList<>();
+
+            boolean isSatisfiedBy(RandomRow row)
+            {
+                // OR: the first accepting term decides "true"; AND: the first rejecting term decides "false".
+                // In both cases the direct expressions are evaluated before the nested filters.
+                for (Expression expression : expressions)
+                    if (expression.isSatisfiedBy(row) == isDisjunction)
+                        return isDisjunction;
+
+                for (Filter child : children)
+                    if (child.isSatisfiedBy(row) == isDisjunction)
+                        return isDisjunction;
+
+                // Nothing short-circuited: an OR found no match, an AND found no mismatch
+                return !isDisjunction;
+            }
+        }
+
+        /**
+         * A single-column relation reduced to its column, the raw text of its literal value and the
+         * column's position within a row. The relation's operator is not inspected here.
+         */
+        static class Expression
+        {
+            ColumnIdentifier column;
+            String value;
+            int bindPosition;
+
+            Expression(RandomSchema schema, Relation relation)
+            {
+                assert relation instanceof SingleColumnRelation;
+                SingleColumnRelation single = (SingleColumnRelation) relation;
+                column = single.getEntity();
+                Constants.Literal literal = (Constants.Literal) single.getValue();
+                value = literal.getRawText();
+                // The schema's column map gives the index of this column's value inside a RandomRow
+                bindPosition = schema.columnMap.get(column.toString()).bindPosition;
+            }
+
+            boolean isSatisfiedBy(RandomRow row)
+            {
+                // Compared textually: the row value's toString() against the literal's raw text
+                return row.values.get(bindPosition).toString().equals(value);
+            }
+        }
+    }
+
+    public static class RandomColumn
+    {
+        private final RandomSchema schema;
+        private final String name;
+        private final TypeInfo type;
+        private final int bindPosition;
+
+        RandomColumn(RandomSchema schema, String name, TypeInfo type, int bindPosition)
+        {
+            this.schema = schema;
+            this.name = name;
+            this.type = type;
+            this.bindPosition = bindPosition;
+        }
+
+        public String name()
+        {
+            return name;
+        }
+        /** Returns the column's CQL definition: its name followed by its type. */
+        public String toColumnDefinition()
+        {
+            return name + " " + type.type.toString();
+        }
+
+        public String bindMarker()
+        {
+            return "?";
+        }
+
+        /** Picks at random one of the values recorded for this column by nextValue() and renders it as a CQL literal. */
+        public String randomQueryValue()
+        {
+            Object[] seen = schema.values.get(name).toArray();
+            int pick = CQLTester.getRandom().nextIntBetween(0, seen.length - 1);
+            return type.toCqlString(seen[pick]);
+        }
+
+        /** Returns the statement creating a StorageAttachedIndex on this column, with %s in place of the table. */
+        public String toIndexDefinition()
+        {
+            return "CREATE CUSTOM INDEX ON %s(" + name + ") USING 'StorageAttachedIndex'";
+        }
+
+        /** Generates a new value of this column's type and records it in the schema's per-column value pool. */
+        public Object nextValue()
+        {
+            Object generated = type.nextValue();
+            schema.values.put(name, generated);
+            return generated;
+        }
+
+        @Override
+        public String toString()
+        {
+            return name;
+        }
+    }
+
+    public static class RandomRow
+    {
+        List<Object> values = new ArrayList<>(); // one entry per column, in the order the values were added
+
+        public void add(Object value)
+        {
+            this.values.add(value);
+        }
+
+        public Object[] toArray()
+        {
+            return this.values.toArray(new Object[0]); // the values as a plain Object[]
+        }
+    }
+
+    public static class TypeInfo
+    {
+        private final CQL3Type.Native type;     // CQL type written into the column definition
+        private final Supplier<?> supplier;     // source of new values for columns of this type
+        private final boolean quoted;           // whether values are rendered inside single quotes
+
+        TypeInfo(CQL3Type.Native type, Supplier<?> supplier, boolean quoted)
+        {
+            this.type = type;
+            this.supplier = supplier;
+            this.quoted = quoted;
+        }
+
+        static TypeInfo create(CQL3Type.Native type, Supplier<?> supplier, boolean quoted)
+        {
+            return new TypeInfo(type, supplier, quoted);
+        }
+
+        public Object nextValue()
+        {
+            return supplier.get();
+        }
+
+        public String toCqlString(Object value)
+        {
+            return quoted ? "'" + String.valueOf(value).replace("'", "''") + "'" : value.toString(); // '' escapes ' in CQL strings
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java b/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
index a6de29535aaf..71098ee0ec11 100644
--- a/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/plan/OperationTest.java
@@ -21,7 +21,6 @@
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -109,13 +108,13 @@ public static void loadSchema() throws ConfigurationException
     public void beforeTest()
     {
         ReadCommand command = PartitionRangeReadCommand.allDataRead(BACKEND.metadata(), FBUtilities.nowInSeconds());
-        controller = new QueryController(BACKEND, command, Collections.emptyList(), new QueryContext(), null);
+        controller = new QueryController(BACKEND, command, null, new QueryContext(), null);
 
         command = PartitionRangeReadCommand.allDataRead(CLUSTERING_BACKEND.metadata(), FBUtilities.nowInSeconds());
-        controllerClustering = new QueryController(CLUSTERING_BACKEND, command, Collections.emptyList(), new QueryContext(), null);
+        controllerClustering = new QueryController(CLUSTERING_BACKEND, command, null, new QueryContext(), null);
 
         command = PartitionRangeReadCommand.allDataRead(STATIC_BACKEND.metadata(), FBUtilities.nowInSeconds());
-        controllerStatic = new QueryController(STATIC_BACKEND, command, Collections.emptyList(), new QueryContext(), null);
+        controllerStatic = new QueryController(STATIC_BACKEND, command, null, new QueryContext(), null);
     }
 
     @After
@@ -291,133 +290,133 @@ public void testSatisfiedBy()
         final ColumnMetadata timestamp = getColumn(UTF8Type.instance.decompose("timestamp"));
         final ColumnMetadata age = getColumn(UTF8Type.instance.decompose("age"));
 
-        Operation.Builder builder = new Operation.Builder(Operation.OperationType.AND, controller, new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)));
-        Operation op = builder.complete();
+        Operation.Node node = new Operation.ExpressionNode(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)));
+        FilterTree filterTree = node.buildFilter(controller);
 
         DecoratedKey key = buildKey("0");
         Unfiltered row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()));
         Row staticRow = buildRow(Clustering.STATIC_CLUSTERING);
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()));
 
         // and reject incorrect value
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(filterTree.isSatisfiedBy(key, row, staticRow));
 
         // range with exclusions - age != 5 AND age > 1 AND age != 6 AND age <= 10
-        builder = new Operation.Builder(Operation.OperationType.AND, controller,
-                                        new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5)),
-                                        new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(1)),
-                                        new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(6)),
-                                        new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(10)));
-        op = builder.complete();
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(5))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(1))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(6))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(10))));
+        filterTree = node.buildFilter(controller);
 
         Set<Integer> exclusions = Sets.newHashSet(0, 1, 5, 6, 11);
         for (int i = 0; i <= 11; i++)
         {
             row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
 
-            boolean result = op.satisfiedBy(key, row, staticRow);
+            boolean result = filterTree.isSatisfiedBy(key, row, staticRow);
             Assert.assertTrue(exclusions.contains(i) != result);
         }
 
         // now let's do something more complex - age = 5 OR age = 6
-        builder = new Operation.Builder(Operation.OperationType.OR, controller,
-                                        new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5)),
-                                        new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(6)));
-
-        op = builder.complete();
+        node = new Operation.OrNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(6))));
+        filterTree = node.buildFilter(controller);
 
         exclusions = Sets.newHashSet(0, 1, 2, 3, 4, 7, 8, 9, 10);
         for (int i = 0; i <= 10; i++)
         {
             row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
 
-            boolean result = op.satisfiedBy(key, row, staticRow);
+            boolean result = filterTree.isSatisfiedBy(key, row, staticRow);
             Assert.assertTrue(exclusions.contains(i) != result);
         }
 
         // now let's test aggregated AND commands
-        builder = new Operation.Builder(Operation.OperationType.AND, controller);
+        node = new Operation.AndNode();
 
         // logical should be ignored by analyzer, but we still what to make sure that it is
         //IndexExpression logical = new IndexExpression(ByteBufferUtil.EMPTY_BYTE_BUFFER, IndexOperator.EQ, ByteBufferUtil.EMPTY_BYTE_BUFFER);
         //logical.setLogicalOp(LogicalIndexOperator.AND);
 
         //builder.add(logical);
-        builder.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(0)));
-        builder.add(new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10)));
-        builder.add(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(7)));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(0))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.NEQ, Int32Type.instance.decompose(7))));
 
-        op = builder.complete();
+        filterTree = node.buildFilter(controller);
 
         exclusions = Sets.newHashSet(7);
         for (int i = 0; i < 10; i++)
         {
             row = buildRow(buildCell(age, instance.decompose(i), System.currentTimeMillis()));
 
-            boolean result = op.satisfiedBy(key, row, staticRow);
+            boolean result = filterTree.isSatisfiedBy(key, row, staticRow);
             Assert.assertTrue(exclusions.contains(i) != result);
         }
 
         // multiple analyzed expressions in the Operation timestamp >= 10 AND age = 5
-        builder = new Operation.Builder(Operation.OperationType.AND, controller);
-        builder.add(new SimpleExpression(timestamp, Operator.GTE, LongType.instance.decompose(10L)));
-        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(timestamp, Operator.GTE, LongType.instance.decompose(10L))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(5))));
 
-        op = builder.complete();
+        filterTree = node.buildFilter(controller);
 
         row = buildRow(buildCell(age, instance.decompose(6), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(11L), System.currentTimeMillis()));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(22L), System.currentTimeMillis()));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         // operation with internal expressions and right child
-        builder = new Operation.Builder(Operation.OperationType.OR, controller,
-                                        new SimpleExpression(timestamp, Operator.GT, LongType.instance.decompose(10L)));
-        builder.setRight(new Operation.Builder(Operation.OperationType.AND, controller,
-                                               new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(0)),
-                                               new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10))));
-        op = builder.complete();
+        node = new Operation.OrNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(timestamp, Operator.GT, LongType.instance.decompose(10L))));
+        Operation.Node child = new Operation.AndNode();
+        child.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.GT, Int32Type.instance.decompose(0))));
+        child.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.LT, Int32Type.instance.decompose(10))));
+        node.add(child);
+        filterTree = node.buildFilter(controller);
 
         row = buildRow(buildCell(age, instance.decompose(5), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(20), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(11L), System.currentTimeMillis()));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(buildCell(age, instance.decompose(0), System.currentTimeMillis()),
                                   buildCell(timestamp, LongType.instance.decompose(9L), System.currentTimeMillis()));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         // and for desert let's try out null and deleted rows etc.
-        builder = new Operation.Builder(Operation.OperationType.AND, controller);
-        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(30)));
-        op = builder.complete();
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(30))));
+        filterTree = node.buildFilter(controller);
 
-        Assert.assertFalse(op.satisfiedBy(key, null, staticRow));
-        Assert.assertFalse(op.satisfiedBy(key, row, null));
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, null, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, null));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         long now = System.currentTimeMillis();
 
@@ -425,15 +424,15 @@ public void testSatisfiedBy()
         Row.Deletion.regular(new DeletionTime(now - 10, (int) (now / 1000))),
         buildCell(age, instance.decompose(6), System.currentTimeMillis()));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         row = buildRow(deletedCell(age, System.currentTimeMillis(), FBUtilities.nowInSeconds()));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(filterTree.isSatisfiedBy(key, row, staticRow));
 
         try
         {
-            Assert.assertFalse(op.satisfiedBy(key, buildRow(), staticRow));
+            Assert.assertFalse(filterTree.isSatisfiedBy(key, buildRow(), staticRow));
         }
         catch (IllegalStateException e)
         {
@@ -507,50 +506,50 @@ public void testSatisfiedByWithClustering()
                                   buildCell(score, DoubleType.instance.decompose(1.0d), System.currentTimeMillis()));
         Row staticRow = buildRow(Clustering.STATIC_CLUSTERING);
 
-        Operation.Builder builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(27)));
-        builder.add(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182)));
+        Operation.Node node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(27))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182))));
 
-        Assert.assertTrue(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
+        node = new Operation.AndNode();
 
-        builder.add(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(28)));
-        builder.add(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182)));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.EQ, Int32Type.instance.decompose(28))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(height, Operator.EQ, Int32Type.instance.decompose(182))));
 
-        Assert.assertFalse(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
-        builder.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(27)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(27))));
 
-        Assert.assertTrue(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("BY")));
-        builder.add(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(28)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("BY"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.GTE, Int32Type.instance.decompose(28))));
 
-        Assert.assertFalse(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
-        builder.add(new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(27)));
-        builder.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(age, Operator.LTE, Int32Type.instance.decompose(27))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182))));
 
-        Assert.assertTrue(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US")));
-        builder.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
-        builder.add(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(location, Operator.EQ, UTF8Type.instance.decompose("US"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d))));
 
-        Assert.assertTrue(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
 
-        builder = new Operation.Builder(Operation.OperationType.AND, controllerClustering);
-        builder.add(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182)));
-        builder.add(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d)));
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(height, Operator.GTE, Int32Type.instance.decompose(182))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(score, Operator.EQ, DoubleType.instance.decompose(1.0d))));
 
-        Assert.assertTrue(builder.complete().satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerClustering).isSatisfiedBy(key, row, staticRow));
     }
 
     private Map<Expression.Op, Expression> convert(Multimap<ColumnMetadata, Expression> expressions)
@@ -579,48 +578,49 @@ public void testSatisfiedByWithStatic()
                                  buildCell(sensorType, UTF8Type.instance.decompose("TEMPERATURE"), System.currentTimeMillis()));
 
         // sensor_type ='TEMPERATURE' AND value = 24.56
-        Operation op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
-                                             new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
-                                             new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+        Operation.Node node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
 
         // sensor_type ='TEMPERATURE' AND value = 30
-        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
-                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
-                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))).complete();
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
 
         // sensor_type ='PRESSURE' OR value = 24.56
-        op = new Operation.Builder(Operation.OperationType.OR, controllerStatic,
-                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
-                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+        node = new Operation.OrNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
 
         // sensor_type ='PRESSURE' OR value = 30
-        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
-                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE")),
-                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))).complete();
+        node = new Operation.OrNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(30.00))));
 
-        Assert.assertFalse(op.satisfiedBy(key, row, staticRow));
+        Assert.assertFalse(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
 
         // (sensor_type = 'TEMPERATURE' OR sensor_type = 'PRESSURE') AND value = 24.56
-        op = new Operation.Builder(Operation.OperationType.OR, controllerStatic,
-                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE")),
-                                   new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE")))
-             .setRight(new Operation.Builder(Operation.OperationType.AND, controllerStatic,
-                                             new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56)))).complete();
+        node = new Operation.AndNode();
+        Operation.Node child = new Operation.OrNode();
+        child.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("TEMPERATURE"))));
+        child.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.EQ, UTF8Type.instance.decompose("PRESSURE"))));
+        node.add(child);
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
 
         // sensor_type = LIKE 'TEMP%'  AND value = 24.56
-        op = new Operation.Builder(Operation.OperationType.AND, controllerStatic,
-                                   new SimpleExpression(sensorType, Operator.LIKE_PREFIX, UTF8Type.instance.decompose("TEMP")),
-                                   new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))).complete();
+        node = new Operation.AndNode();
+        node.add(new Operation.ExpressionNode(new SimpleExpression(sensorType, Operator.LIKE_PREFIX, UTF8Type.instance.decompose("TEMP"))));
+        node.add(new Operation.ExpressionNode(new SimpleExpression(value, Operator.EQ, DoubleType.instance.decompose(24.56))));
 
-        Assert.assertTrue(op.satisfiedBy(key, row, staticRow));
+        Assert.assertTrue(node.buildFilter(controllerStatic).isSatisfiedBy(key, row, staticRow));
     }
 
     private static class SimpleExpression extends RowFilter.Expression
diff --git a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
index 752d77185692..2ee2c212c86b 100644
--- a/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/sasi/SASIIndexTest.java
@@ -1369,14 +1369,14 @@ public void testSearchTimeouts()
 
         ColumnFamilyStore store = loadData(data1, true);
 
-        RowFilter filter = RowFilter.create();
+        RowFilter.Builder filter = RowFilter.builder();
         filter.add(store.metadata().getColumn(firstName), Operator.LIKE_CONTAINS, AsciiType.instance.fromString("a"));
 
         ReadCommand command =
             PartitionRangeReadCommand.create(store.metadata(),
                                              FBUtilities.nowInSeconds(),
                                              ColumnFilter.all(store.metadata()),
-                                             filter,
+                                             filter.build(),
                                              DataLimits.NONE,
                                              DataRange.allData(store.metadata().partitioner));
         try
@@ -2563,7 +2563,7 @@ private static UnfilteredPartitionIterator getIndexed(ColumnFamilyStore store, C
                             ? DataRange.allData(PARTITIONER)
                             : DataRange.forKeyRange(new Range<>(startKey, PARTITIONER.getMinimumToken().maxKeyBound()));
 
-        RowFilter filter = RowFilter.create();
+        RowFilter.Builder filter = RowFilter.builder();
         for (Expression e : expressions)
             filter.add(store.metadata().getColumn(e.name), e.op, e.value);
 
@@ -2571,7 +2571,7 @@ private static UnfilteredPartitionIterator getIndexed(ColumnFamilyStore store, C
             PartitionRangeReadCommand.create(store.metadata(),
                                              FBUtilities.nowInSeconds(),
                                              columnFilter,
-                                             filter,
+                                             filter.build(),
                                              DataLimits.cqlLimits(maxResults),
                                              range);
 

From 7d0c209de0915ce223e2371edb870a56808c7917 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Tue, 13 Jul 2021 10:11:50 +0100
Subject: [PATCH 107/151] STAR-815: Make MultiRangeReadCommand usage
 configurable in SAI (#212)

---
 .../index/sai/plan/StorageAttachedIndexQueryPlan.java          | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
index 47e970e1ae5e..50878558ae96 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
@@ -37,6 +37,7 @@
 
 public class StorageAttachedIndexQueryPlan implements Index.QueryPlan
 {
+    private static final boolean USE_MULTI_RANGE_READ_COMMAND = Boolean.parseBoolean(System.getProperty("cassandra.sai.use_multi_range_read_command", "false"));
     private final ColumnFamilyStore cfs;
     private final TableQueryMetrics queryMetrics;
     private final RowFilter postIndexFilter;
@@ -136,6 +137,6 @@ public RowFilter postIndexQueryFilter()
     @Override
     public boolean supportsMultiRangeReadCommand()
     {
-        return true;
+        return USE_MULTI_RANGE_READ_COMMAND;
     }
 }

From ac0be817fae3c6472532d193a98e2f666ba2cf85 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Thu, 15 Jul 2021 10:30:34 +0100
Subject: [PATCH 108/151] STAR-813: New MessagingService version for Stargazer
 (#218)

---
 .../apache/cassandra/db/CounterMutation.java  |  6 +++++
 .../org/apache/cassandra/db/Mutation.java     |  6 +++++
 .../db/commitlog/CommitLogDescriptor.java     |  6 ++++-
 .../apache/cassandra/db/filter/RowFilter.java | 23 ++++++++++++++++---
 .../cassandra/hints/HintsDescriptor.java      |  5 +++-
 .../format/trieindex/TrieIndexFormat.java     |  2 +-
 .../org/apache/cassandra/net/Message.java     | 11 +++++++++
 .../cassandra/net/MessagingService.java       |  8 +++++--
 .../org/apache/cassandra/net/FramingTest.java |  5 +++-
 9 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/CounterMutation.java b/src/java/org/apache/cassandra/db/CounterMutation.java
index fe1e46e0b1c6..70058886d8af 100644
--- a/src/java/org/apache/cassandra/db/CounterMutation.java
+++ b/src/java/org/apache/cassandra/db/CounterMutation.java
@@ -47,6 +47,7 @@
 import org.apache.cassandra.utils.btree.BTreeSet;
 
 import static java.util.concurrent.TimeUnit.*;
+import static org.apache.cassandra.net.MessagingService.VERSION_SG_10;
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
 import static org.apache.cassandra.net.MessagingService.VERSION_40;
@@ -326,6 +327,7 @@ public long getTimeout(TimeUnit unit)
     private int serializedSize30;
     private int serializedSize3014;
     private int serializedSize40;
+    private int serializedSizeSG10;
 
     public int serializedSize(int version)
     {
@@ -343,6 +345,10 @@ public int serializedSize(int version)
                 if (serializedSize40 == 0)
                     serializedSize40 = (int) serializer.serializedSize(this, VERSION_40);
                 return serializedSize40;
+            case VERSION_SG_10:
+                if (serializedSizeSG10 == 0)
+                    serializedSizeSG10 = (int) serializer.serializedSize(this, VERSION_SG_10);
+                return serializedSizeSG10;
             default:
                 throw new IllegalStateException("Unknown serialization version: " + version);
         }
diff --git a/src/java/org/apache/cassandra/db/Mutation.java b/src/java/org/apache/cassandra/db/Mutation.java
index 8a1ffc123c54..ae8cb6cec561 100644
--- a/src/java/org/apache/cassandra/db/Mutation.java
+++ b/src/java/org/apache/cassandra/db/Mutation.java
@@ -39,6 +39,7 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.apache.cassandra.net.MessagingService.VERSION_SG_10;
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
 import static org.apache.cassandra.net.MessagingService.VERSION_40;
@@ -282,6 +283,7 @@ public String toString(boolean shallow)
     private int serializedSize30;
     private int serializedSize3014;
     private int serializedSize40;
+    private int serializedSizeSG10;
 
     public int serializedSize(int version)
     {
@@ -299,6 +301,10 @@ public int serializedSize(int version)
                 if (serializedSize40 == 0)
                     serializedSize40 = (int) serializer.serializedSize(this, VERSION_40);
                 return serializedSize40;
+            case VERSION_SG_10:
+                if (serializedSizeSG10 == 0)
+                    serializedSizeSG10 = (int) serializer.serializedSize(this, VERSION_SG_10);
+                return serializedSizeSG10;
             default:
                 throw new IllegalStateException("Unknown serialization version: " + version);
         }
diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
index 9120a3925f04..159cd2d696fe 100644
--- a/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
+++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogDescriptor.java
@@ -62,13 +62,15 @@ public class CommitLogDescriptor
     public static final int VERSION_40 = 7;
     // For compatibility with CNDB
     public static final int VERSION_DSE_68 = 680;
+    // Stargazer 1.0 messaging
+    public static final int VERSION_SG_10 = 100;
 
     /**
      * Increment this number if there is a changes in the commit log disc layout or MessagingVersion changes.
      * Note: make sure to handle {@link #getMessagingVersion()}
      */
     @VisibleForTesting
-    public static final int current_version = VERSION_40;
+    public static final int current_version = VERSION_SG_10;
 
     final int version;
     public final long id;
@@ -212,6 +214,8 @@ public int getMessagingVersion()
             case VERSION_40:
             case VERSION_DSE_68:
                 return MessagingService.VERSION_40;
+            case VERSION_SG_10:
+                return MessagingService.VERSION_SG_10;
             default:
                 throw new IllegalStateException("Unknown commitlog version " + version);
         }
diff --git a/src/java/org/apache/cassandra/db/filter/RowFilter.java b/src/java/org/apache/cassandra/db/filter/RowFilter.java
index 443df8afd55f..5885ef8d3b68 100644
--- a/src/java/org/apache/cassandra/db/filter/RowFilter.java
+++ b/src/java/org/apache/cassandra/db/filter/RowFilter.java
@@ -46,6 +46,7 @@
 import org.apache.cassandra.index.IndexRegistry;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.IndexMetadata;
 import org.apache.cassandra.schema.TableMetadata;
@@ -451,10 +452,17 @@ public static class Serializer
         {
             public void serialize(FilterElement operation, DataOutputPlus out, int version) throws IOException
             {
-                out.writeBoolean(operation.isDisjunction);
+                assert (!operation.isDisjunction && operation.children().isEmpty()) || version == MessagingService.VERSION_SG_10 :
+                "Attempting to serialize a disjunct row filter to a node that doesn't support disjunction";
+
                 out.writeUnsignedVInt(operation.expressions.size());
                 for (Expression expr : operation.expressions)
                     Expression.serializer.serialize(expr, out, version);
+
+                if (version < MessagingService.VERSION_SG_10)
+                    return;
+
+                out.writeBoolean(operation.isDisjunction);
                 out.writeUnsignedVInt(operation.children.size());
                 for (FilterElement child : operation.children)
                     serialize(child, out, version);
@@ -462,11 +470,15 @@ public void serialize(FilterElement operation, DataOutputPlus out, int version)
 
             public FilterElement deserialize(DataInputPlus in, int version, TableMetadata metadata) throws IOException
             {
-                boolean isDisjunction = in.readBoolean();
                 int size = (int)in.readUnsignedVInt();
                 List<Expression> expressions = new ArrayList<>(size);
                 for (int i = 0; i < size; i++)
                     expressions.add(Expression.serializer.deserialize(in, version, metadata));
+
+                if (version < MessagingService.VERSION_SG_10)
+                    return new FilterElement(false, expressions, Collections.emptyList());
+
+                boolean isDisjunction = in.readBoolean();
                 size = (int)in.readUnsignedVInt();
                 List<FilterElement> children = new ArrayList<>(size);
                 for (int i  = 0; i < size; i++)
@@ -476,9 +488,14 @@ public FilterElement deserialize(DataInputPlus in, int version, TableMetadata me
 
             public long serializedSize(FilterElement operation, int version)
             {
-                long size = 1 + TypeSizes.sizeofUnsignedVInt(operation.expressions.size());
+                long size = TypeSizes.sizeofUnsignedVInt(operation.expressions.size());
                 for (Expression expr : operation.expressions)
                     size += Expression.serializer.serializedSize(expr, version);
+
+                if (version < MessagingService.VERSION_SG_10)
+                    return size;
+
+                size++; // isDisjunction boolean
                 size += TypeSizes.sizeofUnsignedVInt(operation.children.size());
                 for (FilterElement child : operation.children)
                     size += serializedSize(child, version);
diff --git a/src/java/org/apache/cassandra/hints/HintsDescriptor.java b/src/java/org/apache/cassandra/hints/HintsDescriptor.java
index 1979637779e4..d9c11ab770f0 100644
--- a/src/java/org/apache/cassandra/hints/HintsDescriptor.java
+++ b/src/java/org/apache/cassandra/hints/HintsDescriptor.java
@@ -64,7 +64,8 @@ final class HintsDescriptor
 
     static final int VERSION_30 = 1;
     static final int VERSION_40 = 2;
-    static final int CURRENT_VERSION = VERSION_40;
+    static final int VERSION_SG_10 = 100;
+    static final int CURRENT_VERSION = VERSION_SG_10;
 
     static final String COMPRESSION = "compression";
     static final String ENCRYPTION = "encryption";
@@ -220,6 +221,8 @@ static int messagingVersion(int hintsVersion)
                 return MessagingService.VERSION_30;
             case VERSION_40:
                 return MessagingService.VERSION_40;
+            case VERSION_SG_10:
+                return MessagingService.VERSION_SG_10;
             default:
                 throw new AssertionError();
         }
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
index dc4495aee023..59cff5b36bf3 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexFormat.java
@@ -323,7 +323,7 @@ static class TrieIndexVersion extends Version
             hasAccurateLegacyMinMax = version.compareTo("ac") >= 0;
             hasOriginatingHostId = version.matches("(a[d-z])|(b[b-z])") || version.compareTo("ca") >= 0;
             hasMaxColumnValueLengths = version.matches("b[a-z]"); // DSE only field
-            correspondingMessagingVersion = version.compareTo("ca") >= 0 ? MessagingService.VERSION_40 : MessagingService.VERSION_3014;
+            correspondingMessagingVersion = version.compareTo("ca") >= 0 ? MessagingService.VERSION_SG_10 : MessagingService.VERSION_3014;
         }
 
         // this is for the ab version which was used in the LABS, and then has been renamed to ba
diff --git a/src/java/org/apache/cassandra/net/Message.java b/src/java/org/apache/cassandra/net/Message.java
index 17f157dd967a..dc2508672a06 100644
--- a/src/java/org/apache/cassandra/net/Message.java
+++ b/src/java/org/apache/cassandra/net/Message.java
@@ -48,6 +48,7 @@
 import static org.apache.cassandra.db.TypeSizes.sizeof;
 import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
 import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer;
+import static org.apache.cassandra.net.MessagingService.VERSION_SG_10;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
 import static org.apache.cassandra.net.MessagingService.VERSION_40;
@@ -1318,6 +1319,7 @@ private static <In,Out> IVersionedAsymmetricSerializer<In, Out> getPayloadSerial
     private int serializedSize30;
     private int serializedSize3014;
     private int serializedSize40;
+    private int serializedSizeSG10;
 
     /**
      * Serialized size of the entire message, for the provided messaging version. Caches the calculated value.
@@ -1338,6 +1340,10 @@ public int serializedSize(int version)
                 if (serializedSize40 == 0)
                     serializedSize40 = serializer.serializedSize(this, VERSION_40);
                 return serializedSize40;
+            case VERSION_SG_10:
+                if (serializedSizeSG10 == 0)
+                    serializedSizeSG10 = (int) serializer.serializedSize(this, VERSION_SG_10);
+                return serializedSizeSG10;
             default:
                 throw new IllegalStateException();
         }
@@ -1346,6 +1352,7 @@ public int serializedSize(int version)
     private int payloadSize30   = -1;
     private int payloadSize3014 = -1;
     private int payloadSize40   = -1;
+    private int payloadSizeSG10 = -1;
 
     private int payloadSize(int version)
     {
@@ -1363,6 +1370,10 @@ private int payloadSize(int version)
                 if (payloadSize40 < 0)
                     payloadSize40 = serializer.payloadSize(this, VERSION_40);
                 return payloadSize40;
+            case VERSION_SG_10:
+                if (payloadSizeSG10 < 0)
+                    payloadSizeSG10 = serializer.payloadSize(this, VERSION_SG_10);
+                return payloadSizeSG10;
             default:
                 throw new IllegalStateException();
         }
diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java
index 4d712e86d531..7a92a2dc44f4 100644
--- a/src/java/org/apache/cassandra/net/MessagingService.java
+++ b/src/java/org/apache/cassandra/net/MessagingService.java
@@ -202,8 +202,11 @@ public final class MessagingService extends MessagingServiceMBeanImpl
     public static final int VERSION_30 = 10;
     public static final int VERSION_3014 = 11;
     public static final int VERSION_40 = 12;
+    // Current Stargazer version while we have serialization differences
+    // If differences get merged upstream then we can revert to OS versioning
+    public static final int VERSION_SG_10 = 100;
     public static final int minimum_version = VERSION_30;
-    public static final int current_version = VERSION_40;
+    public static final int current_version = VERSION_SG_10;
     static AcceptVersions accept_messaging = new AcceptVersions(minimum_version, current_version);
     static AcceptVersions accept_streaming = new AcceptVersions(current_version, current_version);
 
@@ -211,7 +214,8 @@ public enum Version
     {
         VERSION_30(10),
         VERSION_3014(11),
-        VERSION_40(12);
+        VERSION_40(12),
+        STARGAZER_10(100);
 
         public final int value;
 
diff --git a/test/unit/org/apache/cassandra/net/FramingTest.java b/test/unit/org/apache/cassandra/net/FramingTest.java
index 78d0a8493eb1..d893c0657965 100644
--- a/test/unit/org/apache/cassandra/net/FramingTest.java
+++ b/test/unit/org/apache/cassandra/net/FramingTest.java
@@ -52,6 +52,8 @@
 import static java.lang.Math.*;
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
+import static org.apache.cassandra.net.MessagingService.VERSION_40;
+import static org.apache.cassandra.net.MessagingService.VERSION_SG_10;
 import static org.apache.cassandra.net.MessagingService.current_version;
 import static org.apache.cassandra.net.MessagingService.minimum_version;
 import static org.apache.cassandra.net.OutboundConnections.LARGE_MESSAGE_THRESHOLD;
@@ -251,13 +253,14 @@ public void testSerializeSizeMatchesEdgeCases() // See CASSANDRA-16103
 
     private void burnRandomLegacy(int count)
     {
+        int[] versions = new int[] { VERSION_30, VERSION_3014, VERSION_40, VERSION_SG_10 };
         SecureRandom seed = new SecureRandom();
         Random random = new Random();
         for (int i = 0 ; i < count ; ++i)
         {
             long innerSeed = seed.nextLong();
             float ratio = seed.nextFloat();
-            int version = minimum_version + random.nextInt(1 + current_version - minimum_version);
+            int version = versions[random.nextInt(4)];
             logger.debug("seed: {}, ratio: {}, version: {}", innerSeed, ratio, version);
             random.setSeed(innerSeed);
             testRandomSequenceOfMessages(random, ratio, version, new FrameDecoderLegacy(GlobalBufferPoolAllocator.instance, version));

From 8cb7905aa079f7e84b480a6f23166192ee94d9d9 Mon Sep 17 00:00:00 2001
From: Aleksandr Sorokoumov <918393+Gerrrr@users.noreply.github.com>
Date: Thu, 15 Jul 2021 15:31:43 +0200
Subject: [PATCH 109/151] STAR-13 Add the Unified Compaction Strategy. (#132)

This is the implementation of UnifiedCompactionStrategy, which is
intended to not only replace all other compaction strategies and CSM,
but also to optimally choose the configuration that will result in the
minimum read and write costs, given a specific workload and dataset size.

The strategy will choose either leveled or tiered merge policies
depending on the workload and the costs associated with user queries and
inserts.

This strategy should be considered experimental at this stage.

--

This patch also introduces a compaction simulation that can be run with:

ant compactionSim

With cmd line arguments:

ant compactionSim--args="-wl R50_W50 -t adaptive"

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Introduce compaction interfaces.

This patch introduces 3 interfaces:

- CompactionStrategy : to encapsulate the behaviour of a compaction strategy
- CompactionStrategyContainer: to encapsulate the additional behavior of CSM
- CompactionStrategyFactory: to create the right compaction container (CSM or the unified CS)

CompactionStrategyManager and AbstractCompactionStrategy now implement these interfaces and have been
encapsulated in the compaction package.

In a future patch, UnifiedCompactionStrategy will also implement these interfaces,
therefore standing on its own, without the need of CSM, which should eventually be removed along
with the legacy strategies.

The factory will take care of instantiating either the new strategy or CSM.

This patch also introduces CompactionStrategyOptions, for option validation.
CompactionParams now uses this class.

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Implement compaction shards

The unified compaction strategy is now decoupled from
CompactionStrategyManager by implementing its own thin compaction
container.

When a compaction candidate needs to be produced, the strategy takes a
snapshot of the eligible sstables and disk boundaries. It then applies
a set of equivalence classes, which implement the same partitioning of
sstables currently performed by CSM.

One of the equivalence classes splits sstables across token range
boundaries, or shards. The number of shards is specified in the
compaction properties, and the shards are created by splitting the
local ranges by this number. sstables are assigned to a shard by
looking at their first partition key.

When flushing or compacting, a specialized writer splits sstables
at the shard boundaries. If the current sstable is larger than a
minimum sstable size that can be specified in the compaction properties,
then it is split when a boundary is reached.

getNextBackgroundTask() now returns a list of tasks, which are processed by
the compaction executor asynchronously.

The following minor updates have been performed:

- the adaptive algorithm now searches for better choices of W every 5 minutes
  rather than 2 minutes;
- the cost calculator now uses a read multiplier of 0.1 rather than 0.25;
- all sstables in a bucket are compacted if their number is >= T. Compactions
  no longer stop at T or F. This may skip levels but has proven very effective
  in tests when switching from tiered to levelled.

The documentation for the unified strategy has been added as a mark down
document.

Reviewed by Branimir Lambov, Dimitar Dimitrov, Justin Chu, Aleksandr Sorokoumov

Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Add a section in the UCS markdown about differences with STCS and LCS.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Update disk boundaries if current boundaries are null or
out of date, even if the corresponding table is just being reloaded
due to a metadata change.

Fixes the failing
TestCompaction_with_UnifiedCompactionStrategy.compaction_strategy_switching_test

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Simplify container creation and reloading and allow inheriting the state of the previous container when switching strategies.

Fixes the failing
TestCompaction_with_UnifiedCompactionStrategy.disable_autocompaction_alter_and_nodetool_test

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Make the enable/disable and isEnabled/isActive behavior of
UnifiedCompactionContainer similar to that of CompactionStrategyManager.
This includes always starting up the backing compaction strategy.
The previous behavior resulted in every compaction task being interrupted
while autocompaction is disabled.

Fixes the failing
TestDiskBalanceAfterJoiningRing.disk_balance_after_joining_ring_ucs_test

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Make compaction shards split inside disks and apply disk boundaries

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Permit early open in UnifiedCompactionStrategy

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Limit the number of concurrently running "oversized"
compactions, in order to limit size amplification. "Oversized"
here is defined as close to the maximum shard size.
- Calculate the limit for the number of concurrent "oversized" compactions
based on a configurable option for max allowed (tolerable) SA as a fraction
of the expected uncompacted dataset size.
- Use reservoir sampling + keeping a non-oversized alternative to
ensure that the limited number of oversized compactions that will
be submitted are uniformly chosen from the available shards.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Remove CompactionStatistics

STAR-13 Port isTrulyWrapAround

Co-authored-by: Sylvain Lebresne <sylvain@datastax.com>

STAR-13 Fix UCS tests

STAR-13 Refactor repair out from compaction strategies

Some major refactorings:
    Move mutateRepaired from CompactionStrategyManager to ColumnFamilyStore
    Move repair-related code in CompactionStrategyContainer, PendingRepairManager to LocalSessions

    CompactionStrategyContainer:
    Added a method to acquire the ReentrantReadWriteLock.WriteLock such that it can be passed and used for mutateRepaired

Co-authored-by: Justin Chu <justin.chu@datastax.com>

STAR-13 Refactor compaction statistics in order to simplify
    the code and reduce duplication and add some of the statistics
    (already available in CompactionAggregateStatistics) to TableMetrics.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Improves the handling of compactions at the lowest levels of the compaction hierarchy:
    - shard-spanning compactions' size is divided by the number of shards spanned for the purposes of deciding in which level to put them;
    - the levels hierarchy starts at the average flush size for the table;
    - selects the tasks to run randomly to give each level and shard equal chance to run its compaction;
    - shard spanning compactions are given more chances to be selected.

    Additionally, this applies a couple of fixes:
    - getNextCompaction was run too many times because the decision whether or not to run it was made at scheduling, which caused many to be scheduled, and they never checked if one didn't run before;
    - early opened sstables could be selected for compaction, causing multiple compaction passes over the same data.
    - Do not run multiple getNextAggregates

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Change the newly introduced compaction metrics to be
    aggregate metrics instead of per-table metrics. This will
    make them easier to record/monitor in Fallout/Grafana, and
    will also enable computing them more efficiently from a
    cached value of the AggregateCompactions metric.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Update UCS defaults

Co-authored-by: Justin Chu <justin.chu@datastax.com>

STAR-13 Track compaction rate in backgroundCompactions

    Also:
    - switch to sample-based exponential moving average, which is much simpler to implement correctly
    at the expense of expressing averaging periods in terms of updates count instead of time;
    - add debug logging of the compaction task count decisions

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Spread compaction threads equally among the levels

    Fixes the problem of long-running higher-level tasks starving level 0 or any level
    from continuing by reserving compaction threads for each of the levels of the
    hierarchy. More specifically, the whole part of the ratio between compaction threads
    and number of levels is reserved for each level, and any remainder is distributed
    randomly as before.

    Replaces the oversized compaction mechanism with a simple limit for the aggregate
    size of running compactions, which is now also applied when single compactions are
    above that limit. This should prevent running out of space at the expense of several
    highest-level tables extra, i.e. slightly higher read amplification, until someone
    reacts to the warning, which I think is a sensible tradeoff.

    Also removes unsupported options from the documentation markdown and adds
    max_space_overhead description.

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Fix Bloom Filter key estimation for ShardedCompactionWriter

STAR-13 Unable to set max_space_overhead in UCS

Co-authored-by: Justin Chu <justin.chu@datastax.com>

STAR-13 Log the number of compacted SSTables, and the shard and
    bucket identifiers when rejecting a compaction bigger than the
    max space overhead.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Fix Bloom Filter tracking

BloomFilterTracker uses meters to avoid the situation when subsequent retrievals
of recent metrics return 0. Tracking is done at CFS instance instead of
per-SSTableReader to reduce overhead. SSTableReaders set correct
BloomFilterTracker in setupOnline. Before correct BloomFilterTracker is set,
SSTableReaders use NoopBloomFilterTracker.

STAR-13 Introduce a limit on the number of sstables in a compaction and
a layout-preserving mode

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Log shard and bucket details on each getShardsWithBuckets()
at TRACE level instead of at DEBUG level.

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 TTL-based SSTable expiration in UCS

STAR-13 Inherit BackgroundCompactions when recreating UCS

STAR-13 Compaction log analyzer

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Always create mocked SSTables with stubOnly

STAR-13 Fix compaction strategy reload

Prevent resetting JMX changes when we create new strategy containers.
This patch makes sure that JMX changes that alter the container type
(CSM->UCS or UCS->CSM) are not overwritten by subsequent metadata changes
that are unrelated to compaction.

STAR-13 Handle IOErrors during background compaction task execution as FSErrors

Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>

STAR-13 Trigger layout compactions automatically when there are more than  F*F SSTables in a bucket

STAR-13 Use descriptor passed to ShardedMultiWriter

STAR-13 Fix condition for triggering layout compactions in case of non-uniform W

STAR-13 computeShardBoundaries handles no splitter and no disk boundaries

STAR-13 Check for partitioner mismatch before splitting local ranges

STAR-13 Use avg bucket size to adjust maxSSTablesToCompact

STAR-13 Limit the number of SSTables to compact in one operation

STAR-13 Switch CompactionsBytemanTest to signals

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Fix ShardedMultiWriterTest with compression

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Move releaseRepairData to LocalSessions

STAR-13 Decouple condition for switching writers from append

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>

STAR-13 Shutdown previous strategy container on reload

STAR-13 Remove unused maxConcurrentOversizedCompactions

STAR-13 Metrics hold reference of a single instance controller

STAR-13 Move Controller params check to validateOptions

STAR-13 Fix target size estimation

STAR-13 Use correct bucket index for picks that contain only expired SSTables

STAR-13 Update shard index on disk change

STAR-13 Remove redundant shutdown call on keyspace drop

STAR-13 Fix amplification estimation

STAR-13 Fix calculation of the number of pending picks per bucket

Co-authored-by: Branimir Lambov <branimir.lambov@datastax.com>
Co-authored-by: Stefania Alborghetti <stefania.alborghetti@datastax.com>
Co-authored-by: Dimitar Dimitrov <dmtrndmtrv@gmail.com>
Co-authored-by: Justin Chu <justin.chu@datastax.com>
---
 build.xml                                     |   30 +
 doc/unified_compaction.md                     |  172 ++
 doc/unified_compaction_level_formula.svg      |   48 +
 .../cassandra/config/DatabaseDescriptor.java  |   49 +-
 .../cassandra/db/ColumnFamilyStore.java       |  414 +++--
 .../cassandra/db/ColumnFamilyStoreMBean.java  |    5 +-
 .../apache/cassandra/db/DiskBoundaries.java   |   88 +-
 .../cassandra/db/DiskBoundaryManager.java     |  100 +-
 .../org/apache/cassandra/db/Keyspace.java     |    7 +-
 .../cassandra/db/MultiRangeReadCommand.java   |    8 +-
 .../db/PartitionRangeReadCommand.java         |    9 +-
 .../org/apache/cassandra/db/ReadCommand.java  |   12 +-
 .../apache/cassandra/db/RepairedDataInfo.java |    2 +-
 .../db/SinglePartitionReadCommand.java        |   26 +-
 .../cassandra/db/SortedLocalRanges.java       |  242 +++
 .../AbstractCompactionStrategy.java           |  673 ++------
 .../db/compaction/AbstractCompactionTask.java |   27 +-
 .../db/compaction/AbstractStrategyHolder.java |   27 +-
 .../db/compaction/ArenaSelector.java          |  233 +++
 .../db/compaction/BackgroundCompactions.java  |  130 +-
 .../cassandra/db/compaction/CleanupTask.java  |   83 +
 .../db/compaction/CompactionAggregate.java    |  533 +++---
 .../CompactionAggregateStatistics.java        |  119 +-
 .../db/compaction/CompactionController.java   |   17 +-
 .../db/compaction/CompactionIterator.java     |    2 +-
 .../db/compaction/CompactionLogger.java       |  208 ++-
 .../db/compaction/CompactionManager.java      |  227 ++-
 .../db/compaction/CompactionObserver.java     |   24 +-
 .../db/compaction/CompactionPick.java         |   82 +-
 .../db/compaction/CompactionProgress.java     |    3 +-
 .../db/compaction/CompactionStatistics.java   |  101 --
 .../db/compaction/CompactionStrategy.java     |  188 +++
 .../CompactionStrategyContainer.java          |  191 +++
 .../compaction/CompactionStrategyFactory.java |  188 +++
 .../compaction/CompactionStrategyHolder.java  |   41 +-
 .../compaction/CompactionStrategyManager.java |  536 +++---
 .../compaction/CompactionStrategyOptions.java |  386 +++++
 .../CompactionStrategyStatistics.java         |   50 +-
 .../db/compaction/CompactionTask.java         |  106 +-
 .../DateTieredCompactionStrategy.java         |   82 +-
 .../LegacyAbstractCompactionStrategy.java     |  344 ++++
 .../LeveledCompactionStatistics.java          |   94 +-
 .../compaction/LeveledCompactionStrategy.java |   87 +-
 .../db/compaction/LeveledCompactionTask.java  |    7 +-
 .../db/compaction/LeveledManifest.java        |   33 +-
 .../db/compaction/PendingRepairHolder.java    |   66 +-
 .../db/compaction/PendingRepairManager.java   |  221 +--
 .../RepairFinishedCompactionTask.java         |  122 ++
 .../db/compaction/SSTableSplitter.java        |   11 +-
 .../db/compaction/SingleSSTableLCSTask.java   |   17 +-
 .../SizeTieredCompactionStatistics.java       |   16 +-
 .../SizeTieredCompactionStrategy.java         |   29 +-
 .../TieredCompactionStatistics.java           |   55 +-
 .../TimeTieredCompactionStatistics.java       |   16 +-
 .../TimeWindowCompactionStrategy.java         |   48 +-
 .../compaction/TimeWindowCompactionTask.java  |    4 +-
 .../UnifiedCompactionContainer.java           |  361 ++++
 .../UnifiedCompactionStatistics.java          |  165 ++
 .../compaction/UnifiedCompactionStrategy.java | 1066 +++++++++++-
 .../cassandra/db/compaction/Upgrader.java     |    9 +-
 .../unified/AdaptiveController.java           |  344 ++++
 .../db/compaction/unified/Controller.java     |  832 ++++++++++
 .../compaction/unified/CostsCalculator.java   |  277 ++++
 .../db/compaction/unified/Environment.java    |   82 +
 .../compaction/unified/RealEnvironment.java   |  127 ++
 .../unified/ShardedCompactionWriter.java      |  142 ++
 .../unified/ShardedMultiWriter.java           |  272 +++
 .../compaction/unified/StaticController.java  |  129 ++
 .../unified/UnifiedCompactionTask.java        |   59 +
 .../writers/CompactionAwareWriter.java        |   77 +-
 .../writers/DefaultCompactionWriter.java      |   32 +-
 .../writers/MajorLeveledCompactionWriter.java |   48 +-
 .../writers/MaxSSTableSizeWriter.java         |   43 +-
 .../SplittingSizeTieredCompactionWriter.java  |   38 +-
 .../cassandra/db/lifecycle/Helpers.java       |    7 +-
 .../db/lifecycle/LifecycleTransaction.java    |    6 +
 .../db/lifecycle/SSTableIntervalTree.java     |    2 +-
 .../cassandra/db/lifecycle/Tracker.java       |   19 +-
 .../cassandra/db/memtable/Flushing.java       |   15 +-
 .../db/memtable/ShardBoundaries.java          |   11 +-
 .../repair/CassandraTableRepairManager.java   |   39 +-
 .../repair/CassandraValidationIterator.java   |    6 +-
 src/java/org/apache/cassandra/dht/Range.java  |   58 +
 .../org/apache/cassandra/dht/Splitter.java    |   97 +-
 .../cassandra/index/sasi/SASIIndex.java       |    2 +-
 .../io/sstable/BloomFilterTracker.java        |  162 +-
 .../io/sstable/SSTableMultiWriter.java        |    4 +-
 .../io/sstable/SSTableTxnWriter.java          |    5 -
 .../cassandra/io/sstable/ScannerList.java     |   74 +
 .../io/sstable/SimpleSSTableMultiWriter.java  |   12 +-
 .../format/RangeAwareSSTableWriter.java       |   28 +-
 .../io/sstable/format/SSTableReader.java      |   82 +-
 .../sstable/format/SSTableZeroCopyWriter.java |   15 +-
 .../io/sstable/format/big/BigTableReader.java |    8 +-
 .../trieindex/TrieIndexSSTableReader.java     |   19 +-
 .../sstable/metadata/MetadataCollector.java   |    7 +-
 .../apache/cassandra/io/util/FileUtils.java   |    2 +-
 .../cassandra/metrics/CompactionMetrics.java  |  159 +-
 .../cassandra/metrics/TableMetrics.java       |  159 +-
 .../repair/consistent/LocalSessions.java      |   67 +-
 .../cassandra/schema/CompactionParams.java    |  172 +-
 .../apache/cassandra/schema/TableParams.java  |    2 +-
 .../cassandra/service/CassandraDaemon.java    |    2 +-
 .../service/DefaultFSErrorHandler.java        |    3 +
 .../cassandra/service/StorageService.java     |    2 +-
 .../tools/CompactionLogAnalyzer.java          |  554 +++++++
 .../org/apache/cassandra/tools/NodeProbe.java |    1 +
 .../cassandra/tools/StandaloneScrubber.java   |   13 +-
 .../tools/nodetool/CompactionStats.java       |   14 +-
 .../nodetool/stats/TableStatsHolder.java      |    2 +-
 .../cassandra/utils/ExpMovingAverage.java     |  107 ++
 .../utils/JVMStabilityInspector.java          |    5 +
 .../apache/cassandra/utils/MovingAverage.java |   26 +
 .../org/apache/cassandra/graph/graph.html     |  568 +++++++
 .../distributed/test/PreviewRepairTest.java   |    6 +-
 .../format/ForwardingSSTableReader.java       |   24 -
 .../db/compaction/LongCompactionsTest.java    |    2 +-
 .../LongLeveledCompactionStrategyTest.java    |   42 +-
 .../compaction/CompactionAllocationTest.java  |    2 +-
 ...ategy-density-blobs-0-Repaired-shard_1.csv |  292 ++++
 ...egy-density-blobs-0-Unrepaired-shard_0.csv |  304 ++++
 test/unit/org/apache/cassandra/Util.java      |   11 +-
 .../config/DatabaseDescriptorRefTest.java     |    1 +
 .../config/DatabaseDescriptorTest.java        |   52 +-
 .../org/apache/cassandra/cql3/CQLTester.java  |    9 +
 .../cql3/CompactionOutOfSpaceTest.java        |  243 +++
 .../cassandra/cql3/GcCompactionTest.java      |    2 +-
 .../org/apache/cassandra/cql3/ViewTest.java   |    4 +-
 .../org/apache/cassandra/db/CleanupTest.java  |   37 +-
 .../cassandra/db/ColumnFamilyStoreTest.java   |   29 +-
 .../cassandra/db/DiskBoundaryManagerTest.java |    6 +-
 .../cassandra/db/SortedLocalRangesTest.java   |  206 +++
 .../compaction/AbstractPendingRepairTest.java |   24 +-
 .../compaction/BackgroundCompactionsTest.java |  123 +-
 .../BaseCompactionStrategyTest.java           |  376 +++++
 .../compaction/CQLUnifiedCompactionTest.java  |  341 ++++
 .../db/compaction/CancelCompactionsTest.java  |   16 +-
 .../compaction/CompactionAwareWriterTest.java |    3 +-
 .../CompactionManagerUpgradeTest.java         |  163 ++
 .../compaction/CompactionSimulationTest.java  | 1453 +++++++++++++++++
 ...ionStrategyContainerPendingRepairTest.java |   60 +
 ...ctionStrategyManagerPendingRepairTest.java |  244 ++-
 .../CompactionStrategyManagerTest.java        |  165 +-
 .../CompactionStrategyStatisticsTest.java     |  528 +++---
 .../db/compaction/CompactionTaskTest.java     |   19 +-
 .../db/compaction/CompactionsBytemanTest.java |  125 +-
 .../db/compaction/CompactionsCQLTest.java     |  187 ++-
 .../db/compaction/CompactionsPurgeTest.java   |    4 +-
 .../db/compaction/CompactionsTest.java        |    8 +-
 .../DateTieredCompactionStrategyTest.java     |   18 +-
 ...LegacyAbstractCompactionStrategyTest.java} |   15 +-
 .../LeveledCompactionStrategyTest.java        |   95 +-
 .../compaction/PendingRepairManagerTest.java  |   70 +-
 .../compaction/SingleSSTableLCSTaskTest.java  |   35 +-
 .../TimeWindowCompactionStrategyTest.java     |   22 +-
 ...dCompactionContainerPendingRepairTest.java |  941 +++++++++++
 .../UnifiedCompactionStrategyTest.java        | 1361 +++++++++++++++
 .../db/compaction/ZombieSSTablesTest.java     |   12 +-
 .../unified/AdaptiveControllerTest.java       |  313 ++++
 .../db/compaction/unified/ControllerTest.java |  178 ++
 .../unified/CostsCalculatorTest.java          |  286 ++++
 .../unified/ShardedCompactionWriterTest.java  |  209 +++
 .../unified/ShardedMultiWriterTest.java       |  121 ++
 .../unified/StaticControllerTest.java         |  189 +++
 .../db/lifecycle/RealTransactionsTest.java    |    4 +-
 .../apache/cassandra/dht/SplitterTest.java    |   64 +-
 .../cassandra/io/BloomFilterTrackerTest.java  |   41 +-
 .../io/sstable/LegacySSTableTest.java         |    7 +-
 .../io/sstable/SSTableReaderTest.java         |    6 +-
 .../io/sstable/SSTableRewriterTest.java       |    9 +-
 .../repair/consistent/LocalSessionTest.java   |    2 +-
 .../consistent/PendingRepairStatTest.java     |    2 +-
 .../tools/CompactionLogAnalyzerTest.java      |  136 ++
 .../cassandra/utils/ExpMovingAverageTest.java |   85 +
 tools/analytics/plot_adaptive.gnu             |   71 +
 tools/analytics/plot_static.gnu               |   59 +
 tools/bin/analyzecompactionlog                |   48 +
 .../cassandra/stress/CompactionStress.java    |   10 +-
 178 files changed, 19055 insertions(+), 3743 deletions(-)
 create mode 100644 doc/unified_compaction.md
 create mode 100644 doc/unified_compaction_level_formula.svg
 create mode 100644 src/java/org/apache/cassandra/db/SortedLocalRanges.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/ArenaSelector.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CleanupTask.java
 delete mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStrategy.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStrategyContainer.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStrategyFactory.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/CompactionStrategyOptions.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategy.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/RepairFinishedCompactionTask.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/UnifiedCompactionContainer.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStatistics.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/AdaptiveController.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/Controller.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/CostsCalculator.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/Environment.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/RealEnvironment.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriter.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/ShardedMultiWriter.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/StaticController.java
 create mode 100644 src/java/org/apache/cassandra/db/compaction/unified/UnifiedCompactionTask.java
 create mode 100644 src/java/org/apache/cassandra/io/sstable/ScannerList.java
 create mode 100644 src/java/org/apache/cassandra/tools/CompactionLogAnalyzer.java
 create mode 100644 src/java/org/apache/cassandra/utils/ExpMovingAverage.java
 create mode 100644 src/java/org/apache/cassandra/utils/MovingAverage.java
 create mode 100644 src/resources/org/apache/cassandra/graph/graph.html
 create mode 100644 test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Repaired-shard_1.csv
 create mode 100644 test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Unrepaired-shard_0.csv
 create mode 100644 test/unit/org/apache/cassandra/cql3/CompactionOutOfSpaceTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/SortedLocalRangesTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/BaseCompactionStrategyTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/CQLUnifiedCompactionTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/CompactionManagerUpgradeTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/CompactionSimulationTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/CompactionStrategyContainerPendingRepairTest.java
 rename test/unit/org/apache/cassandra/db/compaction/{AbstractCompactionStrategyTest.java => LegacyAbstractCompactionStrategyTest.java} (89%)
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionContainerPendingRepairTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/AdaptiveControllerTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/CostsCalculatorTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriterTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/ShardedMultiWriterTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/compaction/unified/StaticControllerTest.java
 create mode 100644 test/unit/org/apache/cassandra/tools/CompactionLogAnalyzerTest.java
 create mode 100644 test/unit/org/apache/cassandra/utils/ExpMovingAverageTest.java
 create mode 100644 tools/analytics/plot_adaptive.gnu
 create mode 100644 tools/analytics/plot_static.gnu
 create mode 100755 tools/bin/analyzecompactionlog

diff --git a/build.xml b/build.xml
index 6c38e4ac77de..537c87da39a7 100644
--- a/build.xml
+++ b/build.xml
@@ -2190,6 +2190,36 @@
 
   </target>
 
+  <target name="compactionSim"
+          depends="build-test"
+          description="A test that simulates compactions to see how strategies behave">
+      <sequential>
+          <mkdir dir="${build.test.dir}/logs"/>
+          <delete>
+              <fileset dir="${build.test.dir}/logs">
+                  <include name="*"/>
+              </fileset>
+          </delete>
+          <java classname="org.apache.cassandra.db.compaction.CompactionSimulationTest"
+                fork="true"
+                failonerror="true"
+                dir="${basedir}">
+              <classpath>
+                  <path refid="cassandra.classpath.test" />
+                  <pathelement location="${test.classes}"/>
+                  <pathelement location="${test.conf}"/>
+                  <fileset dir="${test.lib}">
+                      <include name="**/*.jar" />
+                  </fileset>
+              </classpath>
+              <jvmarg value="-Dstorage-config=${test.conf}"/>
+              <jvmarg value="-Dcassandra.logdir=${build.test.dir}/logs"/>
+              <jvmarg value="-Dlogback.configurationFile=file:///${basedir}/conf/logback.xml"/>
+              <jvmarg value="-Dcassandra.config=file:///${test.conf}/cassandra.yaml"/>
+          </java>
+      </sequential>
+  </target>
+
   <import file="${basedir}/.build/build-resolver.xml"/>
   <import file="${basedir}/.build/build-rat.xml"/>
 </project>
diff --git a/doc/unified_compaction.md b/doc/unified_compaction.md
new file mode 100644
index 000000000000..201de72562d7
--- /dev/null
+++ b/doc/unified_compaction.md
@@ -0,0 +1,172 @@
+<!--
+#
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+-->
+
+## Unified compaction strategy (UCS)
+
+This is a new compaction strategy that unifies tiered and leveled compaction strategies, adds sharding, lends itself to being reconfigured at any time and forms the basis for future compaction improvements including automatic adaptation to the workload.
+
+The strategy is based on the observation that tiered and levelled compaction can be generalized as the same thing if one observes that both form exponentially-growing levels based on the size of sstables (or non-overlapping sstable runs) and trigger a compaction when more than a given number of sstables are present on one level.
+
+UCS groups sstables in levels based on the logarithm of the sstable size, with
+the fanout factor **F** as the base of the logarithm, and with each level triggering a compaction as soon as it has
+**T** sstables. The choice of the parameters **F** and **T**, and of a minimum sstable size, determines the behaviour
+of the strategy. This allows users to choose a levelled strategy by setting **T=2**, or a tiered strategy by choosing **T=F**. Because the two options are mutually exclusive, meet at **F=2** and form a space of options for choosing different ratios of read amplification (RA) vs write amplification (WA) (where levelled compaction improves reads at the expense of writes and approaches a sorted array as **F** increases, and tiered compaction favors writes at the expense of reads and approaches an unsorted log as **F** increases), we combine the two parameters into one integer value, **W**, and set them to be:
+* If **W < 0** then **F = 2 - W** and **T = 2**. This means leveled compactions, high WA but low RA.
+* If **W > 0** then **F = 2 + W** and **T = F**. This means tiered compactions, low WA but high RA.
+* If **W = 0** then **F = T = 2**. This is the middle ground, leveled and tiered compactions behave identically.
+
+Further, because levels can choose different values of **W**, levels can behave differently. For example level
+zero could behave like STCS but higher levels could behave more and more like LCS.
+
+This strategy also introduces compaction shards. Data is partitioned in independent shards that can be compacted in parallel. Shards are defined by splitting the token ranges for which the node is responsible into equally-sized sections.
+
+## Size based levels
+
+Let's explore more closely how sstables are grouped into levels.
+
+Given:
+
+- the fanout factor **F**
+- the sstable flush size **m** (i.e. the average size of sstables written when a memtable is flushed)
+
+then the level **L** for an sstable of size **s** is calculated as follows:
+
+![L = log_F (s/m)](unified_compaction_level_formula.svg)
+
+This means that sstables are assigned to levels as follows:
+
+|Level|Min sstable size|Max sstable size|
+|---|---|---|
+|0|0|**m&#x2219;F**|
+|1|**m&#x2219;F**|**m&#x2219;F<sup>2</sup>**|
+|2|**m&#x2219;F<sup>2</sup>**|**m&#x2219;F<sup>3</sup>**|
+|3|**m&#x2219;F<sup>3</sup>**|**m&#x2219;F<sup>4</sup>**|
+|...|...|...|
+|n|**m&#x2219;F<sup>n</sup>**|**m&#x2219;F<sup>n+1</sup>**|
+
+If we define **T** as the number of sstables in a level that triggers a compaction, then:
+
+* **T = 2** means the strategy is using a leveled merge policy. An sstable enters level **n** with size **>=mF<sup>n</sup>**.
+  When another sstable enters (also with size **>=mF<sup>n</sup>**) they compact and form a new table with size
+  **~2mF<sup>n</sup>**, which keeps the result in the same level for **F > 2**. After this repeats at least **F-2**
+  more times (i.e. F tables enter the level altogether), the compaction result grows to **>= mF<sup>n+1</sup>**
+  and enters the next level.
+* **T = F** means the strategy is using a tiered merge policy. After **F** sstables enter level **n**, each of size **>=mF<sup>n</sup>**, they are compacted together, resulting in an sstable of size **>=mF<sup>n+1</sup>** which belongs to the next level.
+
+Note that the above ignores overwrites and deletions. Given knowledge of the expected proportion of overwrites/deletions, they can also be accounted for (this is not implemented at this time).
+
+For leveled strategies, the write amplification will be proportional to **F-1** times the number of levels whilst
+for tiered strategies it will be proportional only to the number of levels. On the other hand, the read
+amplification will be proportional to the number of levels for leveled strategies and to **F-1** times the number
+of levels for tiered strategies.
+
+The number of levels for our size based scheme can be calculated by substituting the maximal dataset size **D** in our
+equation above, giving a maximal number of levels inversely proportional to the logarithm of **F**.
+
+Therefore when we try to control the overheads of compaction on the database, we have a space of choices for the strategy
+that range from:
+
+* leveled compaction (**T=2**) with high **F** - low number of levels, high read efficiency, high write cost,
+  moving closer to the behaviour of a sorted array as **F** increases;
+* compaction with **T = F = 2** where leveled is the same as tiered and we have a middle ground with logarithmically
+  increasing read and write costs;
+* tiered compaction (**T=F**) with high **F** - very high number of sstables, low read efficiency and low write cost,
+  moving closer to an unsorted log as **F** increases.
+
+## Sharding
+
+Sharding is used to reduce the size of the biggest files that are produced by compaction by splitting sstables at selected token boundaries when the data grows above a given size. This helps both with the maximum space overhead that compaction will require, as well as the number of concurrent compactions that can be executed.
+
+The number of required shards is specified in the compaction options. Based on that, the strategy will select shard boundaries which split the token space handled by the node in equal portions. The flush size **m** used in the calculations above is also divided by the number of shards.
+
+When flushing or when compacting, output sstables will be split along the boundaries of compaction shards as long as
+they are at least as large as a minimum sstable size specified in the compaction options. If sstables are smaller than this size, then they will continue into the next shard.
+The aim is to avoid sstables that are excessively small. For example, if there are four shards
+and if the flush size is twice the minimum sstable size, then assuming uniform data distribution (no hot partitions),
+flushing will create 2 sstables. The first sstable will be in the first shard and the second sstable will likely be
+in the third shard.
+
+This means that some sstables effectively span several shards. We assign such sstables to the shard that contains their start position, but divide the size that we use for the level calculation by the number of spanned shards to reflect the fact that they contain less shard-specific data. This avoids problems with the result of compaction not advancing to the required level because it has shed data belonging to a different shard.
+
+For the example above and a fan factor of 2, the sstables in shards 1 and 3 belong to level 0, because their size is **m * 2** (where **m** is the shard-adjusted flush size), but taking into account that they each span 2 shards, we use the effective size of **m * 2 / 2**. Compacting them with another flushed pair will likely result in 4 sstables of the same size, one in each of the four shards. They, however, will all belong to level 1, as their effective size is **m * 2**.
+
+## Selecting compactions to run
+
+Because of sharding, UCS can do more compactions in parallel. However, it will often not use all available compaction threads.
+
+The reason for this is that UCS splits the available compaction threads equally among the levels of the compaction hierarchy. For example, if there are 16 compaction threads and we have no other work to do but compact the highest level 5, we can only use up to ⌈16/6⌉ = 3 threads to do compactions on level 5. If we permit more, we risk starving the lower levels of resources, which can result in sstables accumulating on level 0 for as long as that compaction takes, which can be a very long time, and breaking our read amplification expectations.
+
+In theory each level requires an equal amount of processing resources: for tiered compaction, every piece of data that enters the system goes through one compaction for each level, and for leveled - through **F-1**. Because of this, UCS reserves an equal number of compaction threads for each level, and assigns tasks to the remainder of threads randomly.
+
+Make sure the number of compaction threads is greater than the number of expected levels to ensure compaction runs smoothly.
+
+## Differences with STCS and LCS
+
+Note that there are some differences between the tiered flavors of UCS (UCS-tiered) and STCS, and between the leveled flavors of UCS (UCS-leveled) and LCS.
+
+#### UCS-tiered vs STCS
+
+SizeTieredCompactionStrategy is pretty close to UCS. However, it defines buckets/levels by looking for sstables of similar size. This can result in some odd selections of buckets, possibly spanning sstables of wildly different sizes, while UCS's selection is more stable and predictable.
+
+STCS triggers a compaction when it finds at least `min_threshold` sstables on some bucket, and it compacts between `min_threshold` and `max_threshold` sstables from that bucket at a time. `min_threshold` is equivalent to UCS's **T = F = W + 2**. UCS drops the upper limit as we have seen that compaction is still efficient with very large numbers of sstables. 
+
+If there are multiple choices to pick SSTables within a bucket, STCS groups them by size while UCS groups them by timestamp. Because of that, STCS easily loses time order and makes whole table expiration less efficient.
+
+#### UCS-leveled vs LCS
+
+At first glance, LeveledCompactionStrategy looks very different in behaviour compared to UCS.
+
+LCS keeps multiple sstables per level which form a sorted run of non-overlapping sstables of small fixed size. So physical sstables on increasing levels increase in number (by a factor of `fanout_size`) instead of size. LCS does that to reduce space amplification and to ensure shorter compaction operations. When it finds that the combined size of a run on a level is higher than expected, it selects some sstables to compact with overlapping ones from the next level of the hierarchy. This eventually pushes the size of the next level over its size limit and triggers higher-level operations.
+
+UCS-leveled keeps one sstable per sharded level in the physical sense. So sstables on increasing levels increase in size (by a factor of **F**, see the **Size based levels** section above). UCS-leveled triggers a compaction when it finds a second sstable on some sharded level. It compacts the two sstables on that level, and the result most often ends up on that level too, but eventually it reaches sufficient size for the next level. This happens at the same time as a run in LCS would outgrow its size limit, thus compactions are in effect triggered at the same time as LCS would trigger them.
+
+The two approaches end up with a very similar effect, with the added benefit for UCS that sstables are structured in a way that can be easily switched to UCS-tiered or a different set of values for the UCS parameters.
+
+UCS deals with the problem of space amplification by sharding on specific token boundaries. LCS's splitting of sstables on a fixed size means that the boundaries usually fall inside sstables on the next level, which tends to cause these sstables to be compacted more often than strictly necessary. This is not acceptable if we need tight write amplification control (i.e. this solution suits UCS-leveled, but not UCS-tiered and is thus not general enough for UCS).
+
+## Configuration
+
+UCS accepts these compaction strategy parameters:
+
+* **static_scaling_parameters**. Typically this will be a single integer, specifying **W** for all levels of the hierarchy. Positive values specify tiered compaction, and negative specify leveled, with fan factor **|W|+2**. Increasing **W** improves write amplification at the expense of reads, and decreasing it improves reads at the expense of writes. The default value is 2, which should be roughly equivalent to using STCS with the default threshold of 4. To use the equivalent of LCS with its default fan factor of 10, set this to -8.<br/>
+  The option also accepts passing a list of integers separated by a comma, in which case different values may be passed for the levels of the hierarchy. The first value will be used to set the value of
+  **W** for the first level, the second for the second level, and so on. The last value in this list will also be used for
+  all remaining levels.
+
+* **num_shards**. This is the number of shards. It is recommended that users set this value. More shards means more parallelism and smaller sstables at the higher levels at the expense of somewhat higher CPU usage.
+  By default, 10 would be used for a single disk.
+  If JBOD / multi-drive, it would be 10 * disks. For example, if there are 5 disks, there would be 50 shards.
+  With data size 10 TB, the shard size would be 200 GB, which is an upper bound for the size of the largest sstables and compaction operations.
+
+* **min_sstable_size_in_mb**. This is the minimum sstable size in MB under which data will not be split on shard boundaries, by default 100. Higher values mean fewer sstables on disk and larger compaction operations on the lowest levels of the hierarchy. Storage-attached secondary indexes will work better with higher minimum sstable sizes.
+
+* **dataset_size_in_gb**. This is the target dataset size, by default the minimum total space for all the data file directories.
+  This is used to calculate the number of levels and therefore the theoretical read and write amplification.
+  It doesn't need to be very accurate but it is recommended
+  that it should be set to a value that is close to the target local dataset size, within a few GBs.
+  If not given, the database will use the total space on the devices containing the data directories, adjusting for the fact that data is equally split among them.
+
+* **max_space_overhead**. The maximum permitted space overhead as a fraction of the dataset size, by default 0.2 i.e. 20%. This cannot be smaller than 1/num_shards and limits the extra space that is required to complete compactions. UCS will only run compactions that will not overrun this limit. E.g. for datasize of 10TB and 20% max overhead, if a 1.1TB compaction is currently running, it will only start the 1.1TB one in the next shard after it completes. This also means that to prevent running out of space UCS will never start compactions that are larger than the limit by themselves; a warning will be issued if this happens as it may cause performance to deteriorate.
+
+* **expired_sstable_check_frequency_seconds**. Determines how often to check for expired SSTables, 10 minutes by default.
+
+* **unsafe_aggressive_sstable_expiration**. Expired sstables will be dropped without checking if their data is shadowing other sstables, by default false. This flag can only be enabled if `cassandra.allow_unsafe_aggressive_sstable_expiration` is true. Turning this flag on can cause correctness issues, e.g. the reappearance of deleted data. See discussions in CASSANDRA-13418, DB-902 for valid use cases and potential problems.
+
+In **cassandra.yaml**:
+
+* **concurrent_compactors**. The number of compaction threads available. Set this to a large number, at minimum the number of expected levels of the compaction hierarchy to make sure that each level is given a dedicated compaction thread. This will avoid latency spikes caused by lower levels of the compaction hierarchy not getting a chance to run.
diff --git a/doc/unified_compaction_level_formula.svg b/doc/unified_compaction_level_formula.svg
new file mode 100644
index 000000000000..492fb81a825c
--- /dev/null
+++ b/doc/unified_compaction_level_formula.svg
@@ -0,0 +1,48 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!-- Generated by CodeCogs with dvisvgm 2.9.1 -->
+<svg version='1.1' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' width='176.333151pt' height='32.408907pt' viewBox='-.239051 -.22797 176.333151 32.408907'>
+    <defs>
+        <path id='g2-70' d='M2.518555-2.582316H3.347447C4.000996-2.582316 4.032877-2.454795 4.032877-2.223661C4.032877-2.16787 4.032877-2.088169 3.977086-1.880946C3.969116-1.849066 3.961146-1.785305 3.961146-1.761395C3.961146-1.753425 3.961146-1.649813 4.080697-1.649813C4.176339-1.649813 4.200249-1.729514 4.224159-1.833126L4.646575-3.53873C4.654545-3.55467 4.678456-3.658281 4.678456-3.666252C4.678456-3.698132 4.654545-3.777833 4.550934-3.777833C4.455293-3.777833 4.439352-3.706102 4.415442-3.610461C4.25604-2.988792 4.072727-2.84533 3.363387-2.84533H2.590286L3.092403-4.853798C3.164134-5.140722 3.172105-5.156663 3.498879-5.156663H4.670486C5.618929-5.156663 5.889913-4.95741 5.889913-4.27198C5.889913-4.104608 5.850062-3.897385 5.850062-3.745953C5.850062-3.650311 5.905853-3.610461 5.969614-3.610461C6.081196-3.610461 6.089166-3.682192 6.105106-3.817684L6.248568-5.172603C6.256538-5.212453 6.256538-5.268244 6.256538-5.308095C6.256538-5.419676 6.160897-5.419676 6.017435-5.419676H1.928767C1.785305-5.419676 1.681694-5.419676 1.681694-5.276214C1.681694-5.156663 1.777335-5.156663 1.912827-5.156663C1.968618-5.156663 2.080199-5.156663 2.215691-5.140722C2.383064-5.124782 2.406974-5.108842 2.406974-5.029141C2.406974-4.98929 2.399004-4.95741 2.375093-4.869738L1.315068-.629639C1.243337-.326775 1.227397-.263014 .637609-.263014C.486177-.263014 .390535-.263014 .390535-.111582C.390535-.079701 .414446 0 .518057 0C.68543 0 .876712-.02391 1.052055-.02391H2.15193C2.303362-.01594 2.590286 0 2.741719 0C2.797509 0 2.909091 0 2.909091-.151432C2.909091-.263014 2.81345-.263014 2.646077-.263014S2.414944-.263014 2.231631-.278954C2.016438-.302864 1.992528-.326775 1.992528-.422416C1.992528-.430386 1.992528-.478207 2.024408-.597758L2.518555-2.582316Z'/>
+        <path id='g1-0' d='M7.878456-2.749689C8.081694-2.749689 8.296887-2.749689 8.296887-2.988792S8.081694-3.227895 7.878456-3.227895H1.41071C1.207472-3.227895 .992279-3.227895 .992279-2.988792S1.207472-2.749689 1.41071-2.749689H7.878456Z'/>
+        <path id='g0-22' d='M2.988792 28.202242H6.790535V27.544707H3.646326V-.478207H2.988792V28.202242Z'/>
+        <path id='g0-23' d='M3.311582 27.544707H.167372V28.202242H3.969116V-.478207H3.311582V27.544707Z'/>
+        <path id='g0-106' d='M2.701868 21.029141H6.085181V20.467248H3.263761V-.478207H2.701868V21.029141Z'/>
+        <path id='g0-107' d='M3.036613 20.467248H.215193V21.029141H3.598506V-.478207H3.036613V20.467248Z'/>
+        <path id='g4-61' d='M8.069738-3.873474C8.237111-3.873474 8.452304-3.873474 8.452304-4.088667C8.452304-4.315816 8.249066-4.315816 8.069738-4.315816H1.028144C.860772-4.315816 .645579-4.315816 .645579-4.100623C.645579-3.873474 .848817-3.873474 1.028144-3.873474H8.069738ZM8.069738-1.649813C8.237111-1.649813 8.452304-1.649813 8.452304-1.865006C8.452304-2.092154 8.249066-2.092154 8.069738-2.092154H1.028144C.860772-2.092154 .645579-2.092154 .645579-1.876961C.645579-1.649813 .848817-1.649813 1.028144-1.649813H8.069738Z'/>
+        <path id='g4-103' d='M1.422665-2.163885C1.984558-1.793275 2.462765-1.793275 2.594271-1.793275C3.670237-1.793275 4.471233-2.606227 4.471233-3.526775C4.471233-3.849564 4.375592-4.303861 3.993026-4.686426C4.459278-5.164633 5.021171-5.164633 5.080946-5.164633C5.128767-5.164633 5.188543-5.164633 5.236364-5.140722C5.116812-5.092902 5.057036-4.97335 5.057036-4.841843C5.057036-4.674471 5.176588-4.531009 5.36787-4.531009C5.463512-4.531009 5.678705-4.590785 5.678705-4.853798C5.678705-5.068991 5.511333-5.403736 5.092902-5.403736C4.471233-5.403736 4.004981-5.021171 3.837609-4.841843C3.478954-5.116812 3.060523-5.272229 2.606227-5.272229C1.530262-5.272229 .729265-4.459278 .729265-3.53873C.729265-2.857285 1.147696-2.414944 1.267248-2.307347C1.123786-2.12802 .908593-1.78132 .908593-1.315068C.908593-.621669 1.327024-.32279 1.422665-.263014C.872727-.107597 .32279 .32279 .32279 .944458C.32279 1.769365 1.446575 2.450809 2.917061 2.450809C4.339726 2.450809 5.523288 1.817186 5.523288 .920548C5.523288 .621669 5.439601-.083686 4.722291-.454296C4.112578-.765131 3.514819-.765131 2.486675-.765131C1.75741-.765131 1.673724-.765131 1.458531-.992279C1.338979-1.111831 1.231382-1.338979 1.231382-1.590037C1.231382-1.793275 1.303113-1.996513 1.422665-2.163885ZM2.606227-2.044334C1.554172-2.044334 1.554172-3.251806 1.554172-3.526775C1.554172-3.741968 1.554172-4.23213 1.75741-4.554919C1.984558-4.901619 2.343213-5.021171 2.594271-5.021171C3.646326-5.021171 3.646326-3.813699 3.646326-3.53873C3.646326-3.323537 3.646326-2.833375 3.443088-2.510585C3.21594-2.163885 2.857285-2.044334 2.606227-2.044334ZM2.929016 2.199751C1.78132 2.199751 .908593 1.613948 .908593 .932503C.908593 .836862 .932503 .37061 1.3868 .059776C1.649813-.107597 1.75741-.107597 2.594271-.107597C3.58655-.107597 4.937484-.107597 4.937484 .932503C4.937484 1.637858 4.028892 2.199751 2.929016 2.199751Z'/>
+        <path id='g4-108' d='M2.056289-8.296887L.394521-8.16538V-7.81868C1.207472-7.81868 1.303113-7.734994 1.303113-7.149191V-.884682C1.303113-.3467 1.171606-.3467 .394521-.3467V0C.729265-.02391 1.315068-.02391 1.673724-.02391S2.630137-.02391 2.964882 0V-.3467C2.199751-.3467 2.056289-.3467 2.056289-.884682V-8.296887Z'/>
+        <path id='g4-110' d='M5.32005-2.905106C5.32005-4.016936 5.32005-4.351681 5.045081-4.734247C4.698381-5.200498 4.136488-5.272229 3.730012-5.272229C2.570361-5.272229 2.116065-4.27995 2.020423-4.040847H2.008468V-5.272229L.382565-5.140722V-4.794022C1.195517-4.794022 1.291158-4.710336 1.291158-4.124533V-.884682C1.291158-.3467 1.159651-.3467 .382565-.3467V0C.6934-.02391 1.338979-.02391 1.673724-.02391C2.020423-.02391 2.666002-.02391 2.976837 0V-.3467C2.211706-.3467 2.068244-.3467 2.068244-.884682V-3.108344C2.068244-4.363636 2.893151-5.033126 3.634371-5.033126S4.542964-4.423412 4.542964-3.694147V-.884682C4.542964-.3467 4.411457-.3467 3.634371-.3467V0C3.945205-.02391 4.590785-.02391 4.925529-.02391C5.272229-.02391 5.917808-.02391 6.228643 0V-.3467C5.630884-.3467 5.332005-.3467 5.32005-.705355V-2.905106Z'/>
+        <path id='g4-111' d='M5.487422-2.558406C5.487422-4.100623 4.315816-5.332005 2.929016-5.332005C1.494396-5.332005 .358655-4.064757 .358655-2.558406C.358655-1.028144 1.554172 .119552 2.917061 .119552C4.327771 .119552 5.487422-1.052055 5.487422-2.558406ZM2.929016-.143462C2.486675-.143462 1.948692-.334745 1.601993-.920548C1.279203-1.458531 1.267248-2.163885 1.267248-2.666002C1.267248-3.120299 1.267248-3.849564 1.637858-4.387547C1.972603-4.901619 2.49863-5.092902 2.917061-5.092902C3.383313-5.092902 3.88543-4.877709 4.208219-4.411457C4.578829-3.861519 4.578829-3.108344 4.578829-2.666002C4.578829-2.247572 4.578829-1.506351 4.267995-.944458C3.93325-.37061 3.383313-.143462 2.929016-.143462Z'/>
+        <path id='g3-70' d='M3.550685-3.897385H4.698381C5.606974-3.897385 5.678705-3.694147 5.678705-3.347447C5.678705-3.19203 5.654795-3.024658 5.595019-2.761644C5.571108-2.713823 5.559153-2.654047 5.559153-2.630137C5.559153-2.546451 5.606974-2.49863 5.69066-2.49863C5.786301-2.49863 5.798257-2.546451 5.846077-2.737733L6.539477-5.523288C6.539477-5.571108 6.503611-5.642839 6.419925-5.642839C6.312329-5.642839 6.300374-5.595019 6.252553-5.391781C6.001494-4.495143 5.762391-4.244085 4.722291-4.244085H3.634371L4.411457-7.340473C4.519054-7.758904 4.542964-7.79477 5.033126-7.79477H6.635118C8.129514-7.79477 8.344707-7.352428 8.344707-6.503611C8.344707-6.43188 8.344707-6.168867 8.308842-5.858032C8.296887-5.810212 8.272976-5.654795 8.272976-5.606974C8.272976-5.511333 8.332752-5.475467 8.404483-5.475467C8.488169-5.475467 8.53599-5.523288 8.5599-5.738481L8.810959-7.830635C8.810959-7.866501 8.834869-7.986052 8.834869-8.009963C8.834869-8.141469 8.727273-8.141469 8.51208-8.141469H2.84533C2.618182-8.141469 2.49863-8.141469 2.49863-7.926276C2.49863-7.79477 2.582316-7.79477 2.785554-7.79477C3.526775-7.79477 3.526775-7.711083 3.526775-7.579577C3.526775-7.519801 3.514819-7.47198 3.478954-7.340473L1.865006-.884682C1.75741-.466252 1.733499-.3467 .896638-.3467C.669489-.3467 .549938-.3467 .549938-.131507C.549938 0 .657534 0 .729265 0C.956413 0 1.195517-.02391 1.422665-.02391H2.976837C3.239851-.02391 3.526775 0 3.789788 0C3.897385 0 4.040847 0 4.040847-.215193C4.040847-.3467 3.969116-.3467 3.706102-.3467C2.761644-.3467 2.737733-.430386 2.737733-.609714C2.737733-.669489 2.761644-.765131 2.785554-.848817L3.550685-3.897385Z'/>
+        <path id='g3-76' d='M4.387547-7.244832C4.495143-7.699128 4.531009-7.81868 5.583064-7.81868C5.905853-7.81868 5.989539-7.81868 5.989539-8.045828C5.989539-8.16538 5.858032-8.16538 5.810212-8.16538C5.571108-8.16538 5.296139-8.141469 5.057036-8.141469H3.455044C3.227895-8.141469 2.964882-8.16538 2.737733-8.16538C2.642092-8.16538 2.510585-8.16538 2.510585-7.938232C2.510585-7.81868 2.618182-7.81868 2.797509-7.81868C3.526775-7.81868 3.526775-7.723039 3.526775-7.591532C3.526775-7.567621 3.526775-7.49589 3.478954-7.316563L1.865006-.884682C1.75741-.466252 1.733499-.3467 .896638-.3467C.669489-.3467 .549938-.3467 .549938-.131507C.549938 0 .621669 0 .860772 0H6.216687C6.479701 0 6.491656-.011955 6.575342-.227148L7.49589-2.773599C7.519801-2.833375 7.543711-2.905106 7.543711-2.940971C7.543711-3.012702 7.483935-3.060523 7.424159-3.060523C7.412204-3.060523 7.352428-3.060523 7.328518-3.012702C7.304608-3.000747 7.304608-2.976837 7.208966-2.749689C6.826401-1.697634 6.288418-.3467 4.267995-.3467H3.120299C2.952927-.3467 2.929016-.3467 2.857285-.358655C2.725778-.37061 2.713823-.394521 2.713823-.490162C2.713823-.573848 2.737733-.645579 2.761644-.753176L4.387547-7.244832Z'/>
+        <path id='g3-109' d='M2.462765-3.502864C2.486675-3.574595 2.785554-4.172354 3.227895-4.554919C3.53873-4.841843 3.945205-5.033126 4.411457-5.033126C4.889664-5.033126 5.057036-4.674471 5.057036-4.196264C5.057036-4.124533 5.057036-3.88543 4.913574-3.323537L4.614695-2.092154C4.519054-1.733499 4.291905-.848817 4.267995-.71731C4.220174-.537983 4.148443-.227148 4.148443-.179328C4.148443-.011955 4.27995 .119552 4.459278 .119552C4.817933 .119552 4.877709-.155417 4.985305-.585803L5.702615-3.443088C5.726526-3.53873 6.348194-5.033126 7.663263-5.033126C8.141469-5.033126 8.308842-4.674471 8.308842-4.196264C8.308842-3.526775 7.84259-2.223661 7.579577-1.506351C7.47198-1.219427 7.412204-1.06401 7.412204-.848817C7.412204-.310834 7.782814 .119552 8.356663 .119552C9.468493 .119552 9.886924-1.637858 9.886924-1.709589C9.886924-1.769365 9.839103-1.817186 9.767372-1.817186C9.659776-1.817186 9.647821-1.78132 9.588045-1.578082C9.313076-.621669 8.870735-.119552 8.392528-.119552C8.272976-.119552 8.081694-.131507 8.081694-.514072C8.081694-.824907 8.225156-1.207472 8.272976-1.338979C8.488169-1.912827 9.026152-3.323537 9.026152-4.016936C9.026152-4.734247 8.607721-5.272229 7.699128-5.272229C6.898132-5.272229 6.252553-4.817933 5.774346-4.112578C5.738481-4.758157 5.34396-5.272229 4.447323-5.272229C3.383313-5.272229 2.82142-4.519054 2.606227-4.220174C2.570361-4.901619 2.080199-5.272229 1.554172-5.272229C1.207472-5.272229 .932503-5.104857 .705355-4.65056C.490162-4.220174 .32279-3.490909 .32279-3.443088S.37061-3.335492 .454296-3.335492C.549938-3.335492 .561893-3.347447 .633624-3.622416C.812951-4.327771 1.0401-5.033126 1.518306-5.033126C1.793275-5.033126 1.888917-4.841843 1.888917-4.483188C1.888917-4.220174 1.769365-3.753923 1.685679-3.383313L1.350934-2.092154C1.303113-1.865006 1.171606-1.327024 1.111831-1.111831C1.028144-.800996 .896638-.239103 .896638-.179328C.896638-.011955 1.028144 .119552 1.207472 .119552C1.350934 .119552 1.518306 .047821 1.613948-.131507C1.637858-.191283 1.745455-.609714 
1.80523-.848817L2.068244-1.924782L2.462765-3.502864Z'/>
+        <path id='g3-115' d='M2.725778-2.391034C2.929016-2.355168 3.251806-2.283437 3.323537-2.271482C3.478954-2.223661 4.016936-2.032379 4.016936-1.458531C4.016936-1.08792 3.682192-.119552 2.295392-.119552C2.044334-.119552 1.147696-.155417 .908593-.812951C1.3868-.753176 1.625903-1.123786 1.625903-1.3868C1.625903-1.637858 1.458531-1.769365 1.219427-1.769365C.956413-1.769365 .609714-1.566127 .609714-1.028144C.609714-.32279 1.327024 .119552 2.283437 .119552C4.100623 .119552 4.638605-1.219427 4.638605-1.841096C4.638605-2.020423 4.638605-2.355168 4.25604-2.737733C3.957161-3.024658 3.670237-3.084433 3.024658-3.21594C2.701868-3.287671 2.187796-3.395268 2.187796-3.93325C2.187796-4.172354 2.402989-5.033126 3.53873-5.033126C4.040847-5.033126 4.531009-4.841843 4.65056-4.411457C4.124533-4.411457 4.100623-3.957161 4.100623-3.945205C4.100623-3.694147 4.327771-3.622416 4.435367-3.622416C4.60274-3.622416 4.937484-3.753923 4.937484-4.25604S4.483188-5.272229 3.550685-5.272229C1.984558-5.272229 1.566127-4.040847 1.566127-3.550685C1.566127-2.642092 2.450809-2.450809 2.725778-2.391034Z'/>
+    </defs>
+    <g id='page1' transform='matrix(1.13 0 0 1.13 -63.986043 -61.019978)'>
+        <use x='56.413267' y='71.133381' xlink:href='#g3-76'/>
+        <use x='67.698603' y='71.133381' xlink:href='#g4-61'/>
+        <use x='80.124084' y='57.863031' xlink:href='#g0-106'/>
+        <use x='86.433781' y='71.133381' xlink:href='#g4-108'/>
+        <use x='89.685442' y='71.133381' xlink:href='#g4-111'/>
+        <use x='95.538433' y='71.133381' xlink:href='#g4-103'/>
+        <use x='101.554006' y='73.956109' xlink:href='#g2-70'/>
+        <use x='114.138072' y='63.045622' xlink:href='#g3-115'/>
+        <rect x='111.775441' y='67.905495' height='.478187' width='10.239267'/>
+        <use x='111.775441' y='79.334043' xlink:href='#g3-109'/>
+        <use x='123.210222' y='57.863031' xlink:href='#g0-107'/>
+        <use x='132.840749' y='71.133381' xlink:href='#g4-61'/>
+        <use x='145.26623' y='54.276444' xlink:href='#g0-22'/>
+        <use x='153.435622' y='63.045622' xlink:href='#g4-108'/>
+        <use x='156.687283' y='63.045622' xlink:href='#g4-110'/>
+        <use x='165.183103' y='63.045622' xlink:href='#g3-115'/>
+        <use x='173.353772' y='63.045622' xlink:href='#g1-0'/>
+        <use x='185.308933' y='63.045622' xlink:href='#g4-108'/>
+        <use x='188.560594' y='63.045622' xlink:href='#g4-110'/>
+        <use x='197.056414' y='63.045622' xlink:href='#g3-109'/>
+        <rect x='153.435622' y='67.905495' height='.478187' width='53.860052'/>
+        <use x='169.890107' y='79.334043' xlink:href='#g4-108'/>
+        <use x='173.141768' y='79.334043' xlink:href='#g4-110'/>
+        <use x='181.637588' y='79.334043' xlink:href='#g3-70'/>
+        <use x='208.491187' y='54.276444' xlink:href='#g0-23'/>
+    </g>
+</svg>
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 2a763ad75230..ed844aea1e8b 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -21,6 +21,7 @@
 import java.io.IOException;
 import java.net.*;
 import java.nio.file.FileStore;
+import java.nio.file.Files;
 import java.nio.file.NoSuchFileException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -31,7 +32,9 @@
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.HashMultiset;
 import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Multiset;
 import com.google.common.primitives.Ints;
 import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.RateLimiter;
@@ -607,7 +610,7 @@ else if (conf.repair_session_space_in_mb > (int) (Runtime.getRuntime().maxMemory
                 }
             }
 
-            logger.info("cdc_enabled is true. Starting casssandra node with Change-Data-Capture enabled.");
+            logger.info("cdc_enabled is true. Starting cassandra node with Change-Data-Capture enabled.");
         }
 
         if (conf.saved_caches_directory == null)
@@ -1924,6 +1927,50 @@ public static String[] getAllDataFileLocations()
         return ArrayUtils.addFirst(conf.data_file_directories, conf.local_system_data_file_directory);
     }
 
+    /**
+     * @return Minimum total space for all the data file directories in GB.
+     *         0 if the total space cannot be determined, or if it is under 1 GB.
+     */
+    public static long getDataFileDirectoriesMinTotalSpaceInGB()
+    {
+        String[] dataDirectories = getAllDataFileLocations();
+        if (dataDirectories.length == 0)
+        {
+            return 0L;
+        }
+
+        Multiset<FileStore> fileStores = HashMultiset.create();
+        for (String dir : dataDirectories)
+        {
+            try
+            {
+                fileStores.add(Files.getFileStore(new File(dir).toPath()));
+            }
+            catch (IOException ioe)
+            {
+                logger.warn("Unable to get FileStore of {}. {}", dir, ioe);
+            }
+        }
+        return getDataFileDirectoriesMinTotalSpaceInGB(fileStores);
+    }
+
+    @VisibleForTesting
+    static long getDataFileDirectoriesMinTotalSpaceInGB(Multiset<FileStore> fileStores)
+    {
+        return fileStores.entrySet().stream().mapToLong(entry -> {
+            long totalSpace = 0L;
+            try
+            {
+                totalSpace = FileUtils.handleLargeFileSystem(entry.getElement().getTotalSpace());
+            }
+            catch (IOException ioe)
+            {
+                logger.warn("Unable to get total space of {}. {}", entry.getElement(), ioe);
+            }
+            return (totalSpace >> 30) / entry.getCount();
+        }).min().orElse(0L) * fileStores.size();
+    }
+
     public static String getCommitLogLocation()
     {
         return conf.commitlog_directory;
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index fb523d2dfc30..00d073808cbe 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -20,8 +20,6 @@
 import java.io.File;
 import java.io.IOException;
 import java.io.PrintStream;
-import java.lang.reflect.Constructor;
-import java.lang.reflect.InvocationTargetException;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.util.*;
@@ -29,7 +27,10 @@
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.regex.Pattern;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
 import javax.management.*;
 import javax.management.openmbean.*;
 
@@ -61,7 +62,6 @@
 import org.apache.cassandra.db.rows.CellPath;
 import org.apache.cassandra.dht.*;
 import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Splitter;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.exceptions.StartupException;
 import org.apache.cassandra.index.SecondaryIndexManager;
@@ -69,6 +69,7 @@
 import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.sstable.BloomFilterTracker;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTable;
@@ -206,7 +207,7 @@ public enum FlushReason
      *
      * We synchronize on the Tracker to ensure isolation when we want to make sure
      * that the memtable we're acting on doesn't change out from under us.  I.e., flush
-     * syncronizes on it to make sure it can submit on both executors atomically,
+     * synchronizes on it to make sure it can submit on both executors atomically,
      * so anyone else who wants to make sure flush doesn't interfere should as well.
      */
     private final Tracker data;
@@ -225,7 +226,8 @@ public enum FlushReason
     private volatile DefaultValue<Integer> maxCompactionThreshold;
     private volatile DefaultValue<Double> crcCheckChance;
 
-    private final CompactionStrategyManager compactionStrategyManager;
+    private final CompactionStrategyFactory strategyFactory;
+    private volatile CompactionStrategyContainer strategyContainer;
 
     private final Directories directories;
 
@@ -242,12 +244,21 @@ public enum FlushReason
 
     private volatile boolean compactionSpaceCheck = true;
 
+    /** The local ranges are used by the {@link DiskBoundaryManager} to create the disk boundaries but can also be
+     * used independently. They are created lazily and invalidated whenever {@link #invalidateLocalRangesAndDiskBoundaries()}
+     * is called.
+     */
+    private volatile SortedLocalRanges localRanges;
+
     @VisibleForTesting
     final DiskBoundaryManager diskBoundaryManager = new DiskBoundaryManager();
     ShardBoundaries cachedShardBoundaries = null;
 
     private volatile boolean neverPurgeTombstones = false;
 
+    // BloomFilterTracker is updated from corresponding {@link SSTableReader}s. Metrics are queried via CFS instance.
+    private final BloomFilterTracker bloomFilterTracker = BloomFilterTracker.createMeterTracker();
+
     public static void shutdownPostFlushExecutor() throws InterruptedException
     {
         postFlushExecutor.shutdown();
@@ -269,7 +280,7 @@ public void reload()
         // only update these runtime-modifiable settings if they have not been modified.
         if (!minCompactionThreshold.isModified())
             for (ColumnFamilyStore cfs : concatWithIndexes())
-                cfs.minCompactionThreshold = new DefaultValue(metadata().params.compaction.minCompactionThreshold());
+                cfs.minCompactionThreshold = new DefaultValue<>(metadata().params.compaction.minCompactionThreshold());
         if (!maxCompactionThreshold.isModified())
             for (ColumnFamilyStore cfs : concatWithIndexes())
                 cfs.maxCompactionThreshold = new DefaultValue(metadata().params.compaction.maxCompactionThreshold());
@@ -277,7 +288,7 @@ public void reload()
             for (ColumnFamilyStore cfs : concatWithIndexes())
                 cfs.crcCheckChance = new DefaultValue(metadata().params.crcCheckChance);
 
-        compactionStrategyManager.maybeReload(metadata());
+        reloadCompactionStrategy(metadata().params.compaction, CompactionStrategyContainer.ReloadReason.METADATA_CHANGE);
 
         indexManager.reload();
 
@@ -289,6 +300,14 @@ public void reload()
             currentMemtable.metadataUpdated();
     }
 
+    /**
+     * Reload the compaction strategy using the given compaction parameters and reason.
+     */
+    private void reloadCompactionStrategy(CompactionParams compactionParams, CompactionStrategyContainer.ReloadReason reason)
+    {
+        strategyContainer = strategyFactory.reload(strategyContainer, compactionParams, reason);
+    }
+
     public static Runnable getBackgroundCompactionTaskSubmitter()
     {
         return () -> {
@@ -298,9 +317,20 @@ public static Runnable getBackgroundCompactionTaskSubmitter()
         };
     }
 
+    @VisibleForTesting
+    public CompactionStrategyFactory getCompactionFactory()
+    {
+        return strategyFactory;
+    }
+
+    public CompactionParams getCompactionParams()
+    {
+        return strategyContainer.getCompactionParams();
+    }
+
     public Map<String, String> getCompactionParameters()
     {
-        return compactionStrategyManager.getCompactionParams().asMap();
+        return getCompactionParams().asMap();
     }
 
     public String getCompactionParametersJson()
@@ -312,9 +342,7 @@ public void setCompactionParameters(Map<String, String> options)
     {
         try
         {
-            CompactionParams compactionParams = CompactionParams.fromMap(options);
-            compactionParams.validate();
-            compactionStrategyManager.setNewLocalCompactionStrategy(compactionParams);
+            reloadCompactionStrategy(CompactionParams.fromMap(options), CompactionStrategyContainer.ReloadReason.JMX_REQUEST);
         }
         catch (Throwable t)
         {
@@ -406,14 +434,10 @@ public ColumnFamilyStore(Keyspace keyspace,
         }
 
         // compaction strategy should be created after the CFS has been prepared
-        compactionStrategyManager = new CompactionStrategyManager(this);
-        compactionStrategyManager.reload(metadata().params.compaction);
-
-        if (maxCompactionThreshold.value() <= 0 || minCompactionThreshold.value() <=0)
-        {
-            logger.warn("Disabling compaction strategy by setting compaction thresholds to 0 is deprecated, set the compaction option 'enabled' to 'false' instead.");
-            this.compactionStrategyManager.disable();
-        }
+        this.strategyFactory = new CompactionStrategyFactory(this);
+        this.strategyContainer = strategyFactory.reload(null,
+                                                        metadata.get().params.compaction,
+                                                        CompactionStrategyContainer.ReloadReason.FULL);
 
         // create the private ColumnFamilyStores for the secondary column indexes
         indexManager = new SecondaryIndexManager(this);
@@ -541,12 +565,12 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long k
 
     public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor, long keyCount, long repairedAt, UUID pendingRepair, boolean isTransient, MetadataCollector metadataCollector, SerializationHeader header, LifecycleNewTracker lifecycleNewTracker)
     {
-        return getCompactionStrategyManager().createSSTableMultiWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadataCollector, header, indexManager.listIndexGroups(), lifecycleNewTracker);
+        return getCompactionStrategy().createSSTableMultiWriter(descriptor, keyCount, repairedAt, pendingRepair, isTransient, metadataCollector, header, indexManager.listIndexGroups(), lifecycleNewTracker);
     }
 
     public boolean supportsEarlyOpen()
     {
-        return compactionStrategyManager.supportsEarlyOpen();
+        return strategyContainer.supportsEarlyOpen();
     }
 
     /** call when dropping or renaming a CF. Performs mbean housekeeping and invalidates CFS to other operations */
@@ -574,7 +598,7 @@ public void invalidate(boolean expectMBean)
             }
         }
 
-        compactionStrategyManager.shutdown();
+        strategyContainer.shutdown();
         SystemKeyspace.removeTruncationRecord(metadata.id);
 
         data.dropSSTables();
@@ -797,20 +821,6 @@ public static void rebuildSecondaryIndex(String ksName, String cfName, String...
         cfs.indexManager.rebuildIndexesBlocking(Sets.newHashSet(Arrays.asList(idxNames)));
     }
 
-    public AbstractCompactionStrategy createCompactionStrategyInstance(CompactionParams compactionParams)
-    {
-        try
-        {
-            Constructor<? extends AbstractCompactionStrategy> constructor =
-                compactionParams.klass().getConstructor(ColumnFamilyStore.class, Map.class);
-            return constructor.newInstance(this, compactionParams.options());
-        }
-        catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
     @Deprecated
     public String getColumnFamilyName()
     {
@@ -1185,7 +1195,7 @@ public Collection<SSTableReader> flushMemtable(ColumnFamilyStore cfs, Memtable m
                     {
                         @SuppressWarnings("resource")
                         SSTableMultiWriter writer = writerIterator.next();
-                        if (writer.getFilePointer() > 0)
+                        if (writer.getBytesWritten() > 0)
                         {
                             writer.setOpenResult(true).prepareToCommit();
                         }
@@ -1230,7 +1240,7 @@ public Collection<SSTableReader> flushMemtable(ColumnFamilyStore cfs, Memtable m
             }
             cfs.replaceFlushed(memtable, sstables);
             reclaim(memtable);
-            cfs.compactionStrategyManager.compactionLogger.flush(sstables);
+            cfs.strategyFactory.getCompactionLogger().flush(sstables);
             logger.debug("Flushed to {} ({} sstables, {}), biggest {}, smallest {}",
                          sstables,
                          sstables.size(),
@@ -1324,6 +1334,7 @@ public void apply(PartitionUpdate update, CassandraWriteContext context, boolean
             metric.topWritePartitionFrequency.addSample(key.getKey(), 1);
             if (metric.topWritePartitionSize.isEnabled()) // dont compute datasize if not needed
                 metric.topWritePartitionSize.addSample(key.getKey(), update.dataSize());
+            metric.bytesInserted.inc(update.dataSize());
             StorageHook.instance.reportWrite(metadata.id, update);
             metric.writeLatency.addNano(System.nanoTime() - start);
             // CASSANDRA-11117 - certain resolution paths on memtable put can result in very
@@ -1359,29 +1370,12 @@ public ShardBoundaries localRangeSplits(int shardCount)
             shardBoundaries.shardCount() != shardCount ||
             shardBoundaries.ringVersion != StorageService.instance.getTokenMetadata().getRingVersion())
         {
-            DiskBoundaryManager.VersionedRangesAtEndpoint versionedLocalRanges = DiskBoundaryManager.getVersionedLocalRanges(this);
-            Set<Range<Token>> localRanges = versionedLocalRanges.rangesAtEndpoint.ranges();
-            List<Splitter.WeightedRange> weightedRanges;
-            if (localRanges.isEmpty())
-                weightedRanges = ImmutableList.of(new Splitter.WeightedRange(1.0, new Range<>(getPartitioner().getMinimumToken(), getPartitioner().getMaximumToken())));
-            else
-            {
-                weightedRanges = new ArrayList<>(localRanges.size());
-                for (Range<Token> r : localRanges)
-                {
-                    // WeightedRange supports only unwrapped ranges as it relies
-                    // on right - left == num tokens equality
-                    for (Range<Token> u: r.unwrap())
-                        weightedRanges.add(new Splitter.WeightedRange(1.0, u));
-                }
-                weightedRanges.sort(Comparator.comparing(Splitter.WeightedRange::left));
-            }
-
-            List<Token> boundaries = getPartitioner().splitter().get().splitOwnedRanges(shardCount, weightedRanges, false);
-            shardBoundaries = new ShardBoundaries(boundaries.subList(0, boundaries.size() - 1),
-                                                  versionedLocalRanges.ringVersion);
+            SortedLocalRanges localRanges = getLocalRanges();
+            List<PartitionPosition> positions = localRanges.split(shardCount);
+            shardBoundaries = new ShardBoundaries(positions.subList(0, positions.size() - 1),
+                                                  localRanges.getRingVersion());
             cachedShardBoundaries = shardBoundaries;
-            logger.info("Memtable shard boundaries for {}.{}: {}", keyspace.getName(), getTableName(), boundaries);
+            logger.info("Memtable shard boundaries for {}.{}: {}", keyspace.getName(), getTableName(), positions);
         }
         return shardBoundaries;
     }
@@ -1639,7 +1633,7 @@ public Tracker getTracker()
 
     public Set<SSTableReader> getLiveSSTables()
     {
-        return data.getView().liveSSTables();
+        return data.getLiveSSTables();
     }
 
     public Iterable<SSTableReader> getSSTables(SSTableSet sstableSet)
@@ -1685,29 +1679,6 @@ public Map<UUID, PendingStat> getPendingRepairStats()
         return stats;
     }
 
-    /**
-     * promotes (or demotes) data attached to an incremental repair session that has either completed successfully,
-     * or failed
-     *
-     * @return session ids whose data could not be released
-     */
-    public CleanupSummary releaseRepairData(Collection<UUID> sessions, boolean force)
-    {
-        if (force)
-        {
-            Predicate<SSTableReader> predicate = sst -> {
-                UUID session = sst.getPendingRepair();
-                return session != null && sessions.contains(session);
-            };
-            return runWithCompactionsDisabled(() -> compactionStrategyManager.releaseRepairData(sessions),
-                                              predicate, false, true, true);
-        }
-        else
-        {
-            return compactionStrategyManager.releaseRepairData(sessions);
-        }
-    }
-
     public boolean isFilterFullyCoveredBy(ClusteringIndexFilter filter,
                                           DataLimits limits,
                                           CachedPartition cached,
@@ -2366,7 +2337,7 @@ public void clearUnsafe()
         {
             cfs.runWithCompactionsDisabled((Callable<Void>) () -> {
                 cfs.data.reset(memtableFactory.create(new AtomicReference<>(CommitLogPosition.NONE), cfs.metadata, cfs));
-                cfs.compactionStrategyManager.forceReload();
+                cfs.reloadCompactionStrategy(metadata().params.compaction, CompactionStrategyContainer.ReloadReason.FULL);
                 return null;
             }, true, false);
         }
@@ -2560,7 +2531,7 @@ private static CompactionManager.CompactionPauser pauseCompactionStrategies(Iter
             for (ColumnFamilyStore cfs : toPause)
             {
                 successfullyPaused.ensureCapacity(successfullyPaused.size() + 1); // to avoid OOM:ing after pausing the strategies
-                cfs.getCompactionStrategyManager().pause();
+                cfs.getCompactionStrategy().pause();
                 successfullyPaused.add(cfs);
             }
             return () -> maybeFail(resumeAll(null, toPause));
@@ -2578,7 +2549,7 @@ private static Throwable resumeAll(Throwable accumulate, Iterable<ColumnFamilySt
         {
             try
             {
-                cfs.getCompactionStrategyManager().resume();
+                cfs.getCompactionStrategy().resume();
             }
             catch (Throwable t)
             {
@@ -2592,8 +2563,7 @@ public LifecycleTransaction markAllCompacting(final OperationType operationType)
     {
         Callable<LifecycleTransaction> callable = () -> {
             assert data.getCompacting().isEmpty() : data.getCompacting();
-            Iterable<SSTableReader> sstables = getLiveSSTables();
-            sstables = AbstractCompactionStrategy.filterSuspectSSTables(sstables);
+            Iterable<SSTableReader> sstables = Iterables.filter(getLiveSSTables(), sstable -> !sstable.isMarkedSuspect());
             LifecycleTransaction modifier = data.tryModify(sstables, operationType);
             assert modifier != null: "something marked things compacting while compactions are disabled";
             return modifier;
@@ -2616,7 +2586,12 @@ public void disableAutoCompaction()
     {
         // we don't use CompactionStrategy.pause since we don't want users flipping that on and off
         // during runWithCompactionsDisabled
-        compactionStrategyManager.disable();
+        strategyContainer.disable();
+    }
+
+    public boolean compactionShouldBeEnabled()
+    {
+        return strategyContainer.getCompactionParams().isEnabled();
     }
 
     public void enableAutoCompaction()
@@ -2626,39 +2601,79 @@ public void enableAutoCompaction()
 
     /**
      * used for tests - to be able to check things after a minor compaction
-     * @param waitForFutures if we should block until autocompaction is done
+     * @param waitForFuture if we should block until autocompaction is done
      */
     @VisibleForTesting
-    public void enableAutoCompaction(boolean waitForFutures)
+    public void enableAutoCompaction(boolean waitForFuture)
     {
-        compactionStrategyManager.enable();
-        List<Future<?>> futures = CompactionManager.instance.submitBackground(this);
-        if (waitForFutures)
-            FBUtilities.waitOnFutures(futures);
+        strategyContainer.enable();
+        Future<?> future = CompactionManager.instance.submitBackground(this);
+        if (waitForFuture)
+            FBUtilities.waitOnFuture(future);
     }
 
     public boolean isAutoCompactionDisabled()
     {
-        return !this.compactionStrategyManager.isEnabled();
+        return !this.strategyContainer.isEnabled();
     }
 
-    /*
-     JMX getters and setters for the Default<T>s.
-       - get/set minCompactionThreshold
-       - get/set maxCompactionThreshold
-       - get     memsize
-       - get     memops
-       - get/set memtime
+    public SortedLocalRanges getLocalRanges()
+    {
+        synchronized (this)
+        {
+            if (localRanges != null && !localRanges.isOutOfDate())
+                return localRanges;
+
+            localRanges = SortedLocalRanges.create(this);
+            return localRanges;
+        }
+    }
+
+    /**
+     * Return the compaction strategy for this CFS. Even though internally the strategy container
+     * implements the strategy, we would like to just expose {@link CompactionStrategy} externally.
+     * This is not currently possible for the reasons explained in {@link #getCompactionStrategyContainer()},
+     * so we expose the container as well, but using a separate method, marked as deprecated.
+     *
+     * @return the compaction strategy for this CFS
      */
+    public CompactionStrategy getCompactionStrategy()
+    {
+        return strategyContainer;
+    }
 
-    public CompactionStrategyManager getCompactionStrategyManager()
+    /**
+     * The reasons for exposing the compaction strategy container are the following:
+     *
+     * - Unit tests
+     * - Repair
+     *
+     * Eventually we would like to only expose the {@link CompactionStrategy}, so for new code call
+     * {@link #getCompactionStrategy()} instead.
+     *
+     * @return the compaction strategy container
+     */
+    @Deprecated
+    @VisibleForTesting
+    public CompactionStrategyContainer getCompactionStrategyContainer()
     {
-        return compactionStrategyManager;
+        return strategyContainer;
     }
 
-    public CompactionLogger getCompactionLogger()
+    /**
+     * This option determines if tombstones should only be removed when the sstable has been repaired.
+     * Because this option was introduced in patch releases (I'm guessing), the compaction parameters were
+     * abused. Eventually this option should be moved out of the compaction parameters. TODO: move it
+     * to the new compaction strategy interface.
+     *
+     * @return true if tombstones can only be removed if the sstable has been repaired
+     */
+    public boolean onlyPurgeRepairedTombstones()
     {
-        return compactionStrategyManager == null ? null : compactionStrategyManager.compactionLogger;
+        // Here we need to ask the strategy container for the parameters in case they were changed over JMX without
+        // changing the schema; for now the strategy container has the up-to-date copy of the params
+        CompactionParams params = strategyContainer.getCompactionParams();
+        return Boolean.parseBoolean(params.options().get(CompactionStrategyOptions.ONLY_PURGE_REPAIRED_TOMBSTONES));
     }
 
     public void setCrcCheckChance(double crcCheckChance)
@@ -2755,6 +2770,21 @@ public double getMeanPartitionSize()
         return count > 0 ? sum * 1.0 / count : 0;
     }
 
+    public double sstablePartitionReadLatency()
+    {
+        return metric == null ? 0 : metric.sstablePartitionReadLatency.get();
+    }
+
+    public double getCompactionTimePerKb()
+    {
+        return metric == null ? 0 : metric.compactionTimePerKb.get();
+    }
+
+    public double getFlushTimePerKb()
+    {
+        return metric == null ? 0 : metric.flushTimePerKb.get();
+    }
+
     public int getMeanRowCount()
     {
         long totalRows = 0;
@@ -2786,6 +2816,77 @@ public DecoratedKey decorateKey(ByteBuffer key)
         return getPartitioner().decorateKey(key);
     }
 
+    public BloomFilterTracker getBloomFilterTracker()
+    {
+        return bloomFilterTracker;
+    }
+
+    public long getBloomFilterFalsePositiveCount()
+    {
+        return bloomFilterTracker.getFalsePositiveCount();
+    }
+
+    public long getBloomFilterTruePositiveCount()
+    {
+        return bloomFilterTracker.getTruePositiveCount();
+    }
+
+    public long getBloomFilterTrueNegativeCount()
+    {
+        return bloomFilterTracker.getTrueNegativeCount();
+    }
+
+    public double getRecentBloomFilterFalsePositiveRate()
+    {
+        return bloomFilterTracker.getRecentFalsePositiveRate();
+    }
+
+    public double getRecentBloomFilterTruePositiveRate()
+    {
+        return bloomFilterTracker.getRecentTruePositiveRate();
+    }
+
+    public double getRecentBloomFilterTrueNegativeRate()
+    {
+        return bloomFilterTracker.getRecentTrueNegativeRate();
+    }
+
+    public double bloomFilterFpRatio()
+    {
+        return metric == null ? 0 : metric.bloomFilterFalseRatio.getValue();
+    }
+
+    public long getReadRequests()
+    {
+        return metric == null ? 0 : metric.readRequests.getCount();
+    }
+
+    public long getBytesInserted()
+    {
+        return metric == null ? 0 : metric.bytesInserted.getCount();
+    }
+
+    /**
+     * @return the write amplification, i.e. (bytes flushed + bytes compacted) / bytes flushed, or 0 if no bytes have been flushed or metrics are unavailable.
+     */
+    public double getWA()
+    {
+        if (metric == null)
+            return 0;
+
+        double bytesCompacted = metric.compactionBytesWritten.getCount();
+        double bytesFlushed = metric.bytesFlushed.getCount();
+        return bytesFlushed <= 0 ? 0 : (bytesFlushed + bytesCompacted) / bytesFlushed;
+    }
+
+    public double getFlushSizeOnDisk()
+    {
+        if (metric == null)
+            return 0;
+
+        return metric.flushSizeOnDisk.get();
+    }
+
     /** true if this CFS contains secondary index data */
     public boolean isIndex()
     {
@@ -2806,17 +2907,20 @@ public List<String> getBuiltIndexes()
 
     public int getUnleveledSSTables()
     {
-        return compactionStrategyManager.getUnleveledSSTables();
+        if (strategyContainer instanceof CompactionStrategyManager)
+            return ((CompactionStrategyManager) strategyContainer).getUnleveledSSTables();
+        else
+            return 0;
     }
 
     public int[] getSSTableCountPerLevel()
     {
-        return compactionStrategyManager.getSSTableCountPerLevel();
+        return strategyContainer.getSSTableCountPerLevel();
     }
 
     public int getLevelFanoutSize()
     {
-        return compactionStrategyManager.getLevelFanoutSize();
+        return strategyContainer.getLevelFanoutSize();
     }
 
     public static class ViewFragment
@@ -2977,8 +3081,14 @@ public DiskBoundaries getDiskBoundaries()
         return diskBoundaryManager.getDiskBoundaries(this);
     }
 
-    public void invalidateDiskBoundaries()
+    public void invalidateLocalRangesAndDiskBoundaries()
     {
+        synchronized (this)
+        {
+            if (localRanges != null)
+                localRanges.invalidate();
+        }
+
         diskBoundaryManager.invalidate();
     }
 
@@ -3103,4 +3213,84 @@ public boolean hasMisplacedSSTables()
         }
         return false;
     }
+
+    private static void verifyMetadata(SSTableReader sstable, long repairedAt, UUID pendingRepair, boolean isTransient)
+    {
+        if (!Objects.equals(pendingRepair, sstable.getPendingRepair()))
+            throw new IllegalStateException(String.format("Failed setting pending repair to %s on %s (pending repair is %s)", pendingRepair, sstable, sstable.getPendingRepair()));
+        if (repairedAt != sstable.getRepairedAt())
+            throw new IllegalStateException(String.format("Failed setting repairedAt to %d on %s (repairedAt is %d)", repairedAt, sstable, sstable.getRepairedAt()));
+        if (isTransient != sstable.isTransient())
+            throw new IllegalStateException(String.format("Failed setting isTransient to %b on %s (isTransient is %b)", isTransient, sstable, sstable.isTransient()));
+    }
+
+    /**
+     * This method is exposed for testing only
+     * NotThreadSafe
+     */
+    @VisibleForTesting
+    public int mutateRepaired(Collection<SSTableReader> sstables, long repairedAt, UUID pendingRepair, boolean isTransient) throws IOException
+    {
+        Set<SSTableReader> changed = new HashSet<>();
+        try
+        {
+            for (SSTableReader sstable: sstables)
+            {
+                sstable.mutateRepairedAndReload(repairedAt, pendingRepair, isTransient);
+                verifyMetadata(sstable, repairedAt, pendingRepair, isTransient);
+                changed.add(sstable);
+            }
+        }
+        finally
+        {
+            // if there was an exception mutating repairedAt, we should still notify for the
+            // sstables that we were able to modify successfully before releasing the lock
+            getTracker().notifySSTableRepairedStatusChanged(changed);
+        }
+        return changed.size();
+    }
+
+    /**
+     * Mutates sstable repairedAt times and notifies listeners of the change with the writeLock held. Prevents races
+     * with other processes between when the metadata is changed and when sstables are moved between strategies.
+     */
+    public int mutateRepaired(@Nullable final ReentrantReadWriteLock.WriteLock writeLock,
+                              Collection<SSTableReader> sstables,
+                              long repairedAt,
+                              UUID pendingRepair,
+                              boolean isTransient) throws IOException
+    {
+        if (writeLock == null)
+            return mutateRepaired(sstables, repairedAt, pendingRepair, isTransient);
+
+        writeLock.lock();
+        try
+        {
+            return mutateRepaired(sstables, repairedAt, pendingRepair, isTransient);
+        }
+        finally
+        {
+            writeLock.unlock();
+        }
+    }
+
+    public boolean hasPendingRepairSSTables(UUID sessionID)
+    {
+        return Iterables.any(data.getLiveSSTables(), pendingRepairPredicate(sessionID));
+    }
+
+    public Set<SSTableReader> getPendingRepairSSTables(UUID sessionID)
+    {
+        return Sets.filter(data.getLiveSSTables(), pendingRepairPredicate(sessionID));
+    }
+
+    public static Predicate<SSTableReader> pendingRepairPredicate(@Nonnull UUID sessionID)
+    {
+        return sstable -> sstable.getPendingRepair() != null && sessionID.equals(sstable.getPendingRepair());
+    }
+
+    public static Predicate<SSTableReader> nonSuspectAndNotInPredicate(Set<SSTableReader> compacting)
+    {
+        return sstable -> !sstable.isMarkedSuspect() && !compacting.contains(sstable);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
index 0360e344d16a..634bf1e3141e 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java
@@ -191,19 +191,20 @@ public List<String> importNewSSTables(Set<String> srcPaths,
 
     @Deprecated
     public void loadNewSSTables();
+
     /**
      * @return the number of SSTables in L0.  Always return 0 if Leveled compaction is not enabled.
      */
     public int getUnleveledSSTables();
 
     /**
-     * @return sstable count for each level. null unless leveled compaction is used.
+     * @return sstable count for each level. empty unless leveled or unified compaction is used.
      *         array index corresponds to level(int[0] is for level 0, ...).
      */
     public int[] getSSTableCountPerLevel();
 
     /**
-     * @return sstable fanout size for level compaction strategy.
+     * @return sstable fanout size for level or unified compaction strategies. Default LCS fanout size otherwise.
      */
     public int getLevelFanoutSize();
 
diff --git a/src/java/org/apache/cassandra/db/DiskBoundaries.java b/src/java/org/apache/cassandra/db/DiskBoundaries.java
index f33b43eb3d80..30d2aada44e6 100644
--- a/src/java/org/apache/cassandra/db/DiskBoundaries.java
+++ b/src/java/org/apache/cassandra/db/DiskBoundaries.java
@@ -20,34 +20,41 @@
 
 import java.util.Collections;
 import java.util.List;
+import java.util.Objects;
+
+import javax.annotation.Nullable;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableList;
 
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.service.StorageService;
 
 public class DiskBoundaries
 {
-    public final List<Directories.DataDirectory> directories;
-    public final ImmutableList<PartitionPosition> positions;
-    final long ringVersion;
+    @Nullable public final List<Directories.DataDirectory> directories;
+    @Nullable private final ImmutableList<PartitionPosition> positions;
+    public final SortedLocalRanges localRanges;
     final int directoriesVersion;
     private final ColumnFamilyStore cfs;
     private volatile boolean isInvalid = false;
 
-    public DiskBoundaries(ColumnFamilyStore cfs, Directories.DataDirectory[] directories, int diskVersion)
+    public DiskBoundaries(ColumnFamilyStore cfs,
+                          @Nullable Directories.DataDirectory[] directories,
+                          SortedLocalRanges localRanges,
+                          int diskVersion)
     {
-        this(cfs, directories, null, -1, diskVersion);
+        this(cfs, directories, null, localRanges, diskVersion);
     }
 
-    @VisibleForTesting
-    public DiskBoundaries(ColumnFamilyStore cfs, Directories.DataDirectory[] directories, List<PartitionPosition> positions, long ringVersion, int diskVersion)
+    public DiskBoundaries(ColumnFamilyStore cfs,
+                          @Nullable Directories.DataDirectory[] directories,
+                          @Nullable List<PartitionPosition> positions,
+                          SortedLocalRanges localRanges,
+                          int diskVersion)
     {
         this.directories = directories == null ? null : ImmutableList.copyOf(directories);
         this.positions = positions == null ? null : ImmutableList.copyOf(positions);
-        this.ringVersion = ringVersion;
+        this.localRanges = localRanges;
         this.directoriesVersion = diskVersion;
         this.cfs = cfs;
     }
@@ -59,17 +66,17 @@ public boolean equals(Object o)
 
         DiskBoundaries that = (DiskBoundaries) o;
 
-        if (ringVersion != that.ringVersion) return false;
-        if (directoriesVersion != that.directoriesVersion) return false;
-        if (!directories.equals(that.directories)) return false;
-        return positions != null ? positions.equals(that.positions) : that.positions == null;
+        return Objects.equals(localRanges, that.localRanges) &&
+               directoriesVersion == that.directoriesVersion &&
+               Objects.equals(directories, that.directories) &&
+               Objects.equals(positions, that.positions);
     }
 
     public int hashCode()
     {
         int result = directories != null ? directories.hashCode() : 0;
         result = 31 * result + (positions != null ? positions.hashCode() : 0);
-        result = 31 * result + (int) (ringVersion ^ (ringVersion >>> 32));
+        result = 31 * result + Objects.hashCode(localRanges); // null-safe, consistent with Objects.equals in equals()
         result = 31 * result + directoriesVersion;
         return result;
     }
@@ -79,7 +86,7 @@ public String toString()
         return "DiskBoundaries{" +
                "directories=" + directories +
                ", positions=" + positions +
-               ", ringVersion=" + ringVersion +
+               ", localRanges=" + localRanges + // plain concatenation renders "null" instead of throwing
                ", directoriesVersion=" + directoriesVersion +
                '}';
     }
@@ -91,9 +98,9 @@ public boolean isOutOfDate()
     {
         if (isInvalid)
             return true;
+
         int currentDiskVersion = DisallowedDirectories.getDirectoriesVersion();
-        long currentRingVersion = StorageService.instance.getTokenMetadata().getRingVersion();
-        return currentDiskVersion != directoriesVersion || (ringVersion != -1 && currentRingVersion != ringVersion);
+        return currentDiskVersion != directoriesVersion || localRanges.isOutOfDate();
     }
 
     public void invalidate()
@@ -101,7 +108,7 @@ public void invalidate()
         this.isInvalid = true;
     }
 
-    public int getDiskIndex(SSTableReader sstable)
+    public int getDiskIndexFromKey(SSTableReader sstable)
     {
         if (positions == null)
         {
@@ -130,7 +137,7 @@ public int getBoundariesFromSSTableDirectory(Descriptor descriptor)
 
     public Directories.DataDirectory getCorrectDiskForSSTable(SSTableReader sstable)
     {
-        return directories.get(getDiskIndex(sstable));
+        return directories.get(getDiskIndexFromKey(sstable));
     }
 
     public Directories.DataDirectory getCorrectDiskForKey(DecoratedKey key)
@@ -138,20 +145,55 @@ public Directories.DataDirectory getCorrectDiskForKey(DecoratedKey key)
         if (positions == null)
             return null;
 
-        return directories.get(getDiskIndex(key));
+        return directories.get(getDiskIndexFromKey(key));
     }
 
     public boolean isInCorrectLocation(SSTableReader sstable, Directories.DataDirectory currentLocation)
     {
-        int diskIndex = getDiskIndex(sstable);
+        int diskIndex = getDiskIndexFromKey(sstable);
         PartitionPosition diskLast = positions.get(diskIndex);
         return directories.get(diskIndex).equals(currentLocation) && sstable.last.compareTo(diskLast) <= 0;
     }
 
-    private int getDiskIndex(DecoratedKey key)
+    /**
+     * Return the number of boundaries. If this instance was created with token boundaries (positions), this is
+     * the number of positions. If it was created with directories only, this is the number of directories; note
+     * that both fields are {@code @Nullable}, so this throws a NullPointerException when neither was supplied.
+     *
+     * @return {@code positions.size()} if positions were supplied, otherwise {@code directories.size()}.
+     */
+    public int getNumBoundaries()
+    {
+        return positions == null ? directories.size() : positions.size();
+    }
+
+    private int getDiskIndexFromKey(DecoratedKey key)
     {
         int pos = Collections.binarySearch(positions, key);
         assert pos < 0;
         return -pos - 1;
     }
+
+    /**
+     * Return the local sorted ranges, which contain the local ranges for this node, sorted.
+     * See {@link SortedLocalRanges}.
+     *
+     * @return the local ranges, see {@link SortedLocalRanges}.
+     */
+    public SortedLocalRanges getLocalRanges()
+    {
+        return localRanges;
+    }
+
+    /**
+     * Returns a non-modifiable list of the disk boundary positions. This will be null if the token space is not split
+     * for the disks (this is not normally the case).
+     *
+     * Extracted as a method (instead of direct access to the final field) to permit mocking in tests.
+     */
+    @Nullable
+    public List<PartitionPosition> getPositions()
+    {
+        return positions;
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
index 0de745d3cf80..04c6384b2a2d 100644
--- a/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
+++ b/src/java/org/apache/cassandra/db/DiskBoundaryManager.java
@@ -19,7 +19,6 @@
 package org.apache.cassandra.db;
 
 import java.util.ArrayList;
-import java.util.Comparator;
 import java.util.List;
 
 import org.slf4j.Logger;
@@ -27,14 +26,8 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Splitter;
 import org.apache.cassandra.dht.Token;
-import org.apache.cassandra.locator.RangesAtEndpoint;
-import org.apache.cassandra.locator.TokenMetadata;
-import org.apache.cassandra.service.PendingRangeCalculatorService;
-import org.apache.cassandra.service.StorageService;
-import org.apache.cassandra.utils.FBUtilities;
 
 public class DiskBoundaryManager
 {
@@ -43,8 +36,6 @@ public class DiskBoundaryManager
 
     public DiskBoundaries getDiskBoundaries(ColumnFamilyStore cfs)
     {
-        if (!cfs.getPartitioner().splitter().isPresent())
-            return new DiskBoundaries(cfs, cfs.getDirectories().getWriteableLocations(), DisallowedDirectories.getDirectoriesVersion());
         if (diskBoundaries == null || diskBoundaries.isOutOfDate())
         {
             synchronized (this)
@@ -52,8 +43,13 @@ public DiskBoundaries getDiskBoundaries(ColumnFamilyStore cfs)
                 if (diskBoundaries == null || diskBoundaries.isOutOfDate())
                 {
                     logger.debug("Refreshing disk boundary cache for {}.{}", cfs.keyspace.getName(), cfs.getTableName());
+                    SortedLocalRanges localRanges = cfs.getLocalRanges();
+
                     DiskBoundaries oldBoundaries = diskBoundaries;
-                    diskBoundaries = getDiskBoundaryValue(cfs);
+                    diskBoundaries = !cfs.getPartitioner().splitter().isPresent()
+                                     ? new DiskBoundaries(cfs, cfs.getDirectories().getWriteableLocations(), localRanges, DisallowedDirectories.getDirectoriesVersion())
+                                     : getDiskBoundaryValue(cfs, localRanges);
+
                     logger.debug("Updating boundaries from {} to {} for {}.{}", oldBoundaries, diskBoundaries, cfs.keyspace.getName(), cfs.getTableName());
                 }
             }
@@ -67,43 +63,9 @@ public void invalidate()
            diskBoundaries.invalidate();
     }
 
-    static class VersionedRangesAtEndpoint
-    {
-        public final RangesAtEndpoint rangesAtEndpoint;
-        public final long ringVersion;
-
-        VersionedRangesAtEndpoint(RangesAtEndpoint rangesAtEndpoint, long ringVersion)
-        {
-            this.rangesAtEndpoint = rangesAtEndpoint;
-            this.ringVersion = ringVersion;
-        }
-    }
 
-    public static VersionedRangesAtEndpoint getVersionedLocalRanges(ColumnFamilyStore cfs)
+    private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs, SortedLocalRanges localRanges)
     {
-        RangesAtEndpoint localRanges;
-
-        long ringVersion;
-        TokenMetadata tmd;
-        do
-        {
-            tmd = StorageService.instance.getTokenMetadata();
-            ringVersion = tmd.getRingVersion();
-            localRanges = getLocalRanges(cfs, tmd);
-            logger.debug("Got local ranges {} (ringVersion = {})", localRanges, ringVersion);
-        }
-        while (ringVersion != tmd.getRingVersion()); // if ringVersion is different here it means that
-        // it might have changed before we calculated localRanges - recalculate
-
-        return new VersionedRangesAtEndpoint(localRanges, ringVersion);
-    }
-
-    private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
-    {
-        VersionedRangesAtEndpoint rangesAtEndpoint = getVersionedLocalRanges(cfs);
-        RangesAtEndpoint localRanges = rangesAtEndpoint.rangesAtEndpoint;
-        long ringVersion = rangesAtEndpoint.ringVersion;
-
         int directoriesVersion;
         Directories.DataDirectory[] dirs;
         do
@@ -113,31 +75,11 @@ private static DiskBoundaries getDiskBoundaryValue(ColumnFamilyStore cfs)
         }
         while (directoriesVersion != DisallowedDirectories.getDirectoriesVersion()); // if directoriesVersion has changed we need to recalculate
 
-        if (localRanges == null || localRanges.isEmpty())
-            return new DiskBoundaries(cfs, dirs, null, ringVersion, directoriesVersion);
+        if (localRanges == null || localRanges.getRanges().isEmpty())
+            return new DiskBoundaries(cfs, dirs, null, localRanges, directoriesVersion);
 
-        List<PartitionPosition> positions = getDiskBoundaries(localRanges, cfs.getPartitioner(), dirs);
-
-        return new DiskBoundaries(cfs, dirs, positions, ringVersion, directoriesVersion);
-    }
-
-    private static RangesAtEndpoint getLocalRanges(ColumnFamilyStore cfs, TokenMetadata tmd)
-    {
-        RangesAtEndpoint localRanges;
-        if (StorageService.instance.isBootstrapMode()
-        && !StorageService.isReplacingSameAddress()) // When replacing same address, the node marks itself as UN locally
-        {
-            PendingRangeCalculatorService.instance.blockUntilFinished();
-            localRanges = tmd.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
-        }
-        else
-        {
-            // Reason we use use the future settled TMD is that if we decommission a node, we want to stream
-            // from that node to the correct location on disk, if we didn't, we would put new files in the wrong places.
-            // We do this to minimize the amount of data we need to move in rebalancedisks once everything settled
-            localRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(tmd.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
-        }
-        return localRanges;
+        List<PartitionPosition> positions = getDiskBoundaries(localRanges.getRanges(), cfs.getPartitioner(), dirs);
+        return new DiskBoundaries(cfs, dirs, positions, localRanges, directoriesVersion);
     }
 
     /**
@@ -149,27 +91,15 @@ private static RangesAtEndpoint getLocalRanges(ColumnFamilyStore cfs, TokenMetad
      *
      * The final entry in the returned list will always be the partitioner maximum tokens upper key bound
      */
-    private static List<PartitionPosition> getDiskBoundaries(RangesAtEndpoint replicas, IPartitioner partitioner, Directories.DataDirectory[] dataDirectories)
+    private static List<PartitionPosition> getDiskBoundaries(List<Splitter.WeightedRange> weightedRanges, IPartitioner partitioner, Directories.DataDirectory[] dataDirectories)
     {
         assert partitioner.splitter().isPresent();
 
         Splitter splitter = partitioner.splitter().get();
-        boolean dontSplitRanges = DatabaseDescriptor.getNumTokens() > 1;
-
-        List<Splitter.WeightedRange> weightedRanges = new ArrayList<>(replicas.size());
-        // note that Range.sort unwraps any wraparound ranges, so we need to sort them here
-        for (Range<Token> r : Range.sort(replicas.onlyFull().ranges()))
-            weightedRanges.add(new Splitter.WeightedRange(1.0, r));
-
-        for (Range<Token> r : Range.sort(replicas.onlyTransient().ranges()))
-            weightedRanges.add(new Splitter.WeightedRange(0.1, r));
-
-        weightedRanges.sort(Comparator.comparing(Splitter.WeightedRange::left));
+        Splitter.SplitType splitType = DatabaseDescriptor.getNumTokens() > 1 ? Splitter.SplitType.PREFER_WHOLE : Splitter.SplitType.ALWAYS_SPLIT;
 
-        List<Token> boundaries = splitter.splitOwnedRanges(dataDirectories.length, weightedRanges, dontSplitRanges);
-        // If we can't split by ranges, split evenly to ensure utilisation of all disks
-        if (dontSplitRanges && boundaries.size() < dataDirectories.length)
-            boundaries = splitter.splitOwnedRanges(dataDirectories.length, weightedRanges, false);
+        List<Token> boundaries = splitter.splitOwnedRanges(dataDirectories.length, weightedRanges, splitType).boundaries;
+        assert boundaries.size() == dataDirectories.length : "Wrong number of boundaries for directories: " + boundaries.size();
 
         List<PartitionPosition> diskBoundaries = new ArrayList<>();
         for (int i = 0; i < boundaries.size() - 1; i++)
diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index 0d4376426596..46e46688496b 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java
@@ -51,7 +51,6 @@
 import org.apache.cassandra.exceptions.WriteTimeoutException;
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.index.SecondaryIndexManager;
-import org.apache.cassandra.index.transactions.UpdateTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.locator.AbstractReplicationStrategy;
 import org.apache.cassandra.metrics.KeyspaceMetrics;
@@ -66,7 +65,6 @@
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.tracing.Tracing;
 import org.apache.cassandra.utils.ByteBufferUtil;
-import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.concurrent.OpOrder;
 
@@ -388,12 +386,12 @@ private void createReplicationStrategy(KeyspaceMetadata ksm)
         if (!ksm.params.replication.equals(replicationParams))
         {
             logger.debug("New replication settings for keyspace {} - invalidating disk boundary caches", ksm.name);
-            columnFamilyStores.values().forEach(ColumnFamilyStore::invalidateDiskBoundaries);
+            columnFamilyStores.values().forEach(ColumnFamilyStore::invalidateLocalRangesAndDiskBoundaries);
         }
         replicationParams = ksm.params.replication;
     }
 
-    // best invoked on the compaction mananger.
+    // best invoked on the compaction manager.
     public void dropCf(TableId tableId)
     {
         assert columnFamilyStores.containsKey(tableId);
@@ -401,7 +399,6 @@ public void dropCf(TableId tableId)
         if (cfs == null)
             return;
 
-        cfs.getCompactionStrategyManager().shutdown();
         CompactionManager.instance.interruptCompactionForCFs(cfs.concatWithIndexes(), (sstable) -> true, true);
         // wait for any outstanding reads/writes that might affect the CFS
         cfs.keyspace.writeOrder.awaitNewBarrier();
diff --git a/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
index 644fd5c3e897..df980fed790c 100644
--- a/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/MultiRangeReadCommand.java
@@ -295,11 +295,17 @@ public boolean isReversed()
     }
 
     @Override
-    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    protected void recordReadLatency(TableMetrics metric, long latencyNanos)
     {
         metric.rangeLatency.addNano(latencyNanos);
     }
 
+    @Override
+    protected void recordReadRequest(TableMetrics metric)
+    {
+        metric.rangeRequests.inc();
+    }
+
     @Override
     public Verb verb()
     {
diff --git a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
index 0dac5b8d6c79..ad2f745f285b 100644
--- a/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
+++ b/src/java/org/apache/cassandra/db/PartitionRangeReadCommand.java
@@ -268,11 +268,16 @@ public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryS
         return StorageProxy.getRangeSlice(this, consistency, queryStartNanoTime);
     }
 
-    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    protected void recordReadLatency(TableMetrics metric, long latencyNanos)
     {
         metric.rangeLatency.addNano(latencyNanos);
     }
 
+    protected void recordReadRequest(TableMetrics metric)
+    {
+        metric.rangeRequests.inc();
+    }
+
     @VisibleForTesting
     public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadExecutionController executionController)
     {
@@ -284,7 +289,7 @@ public UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, Rea
         try
         {
             // avoid iterating over the memtable if we purge all tombstones
-            boolean useMinLocalDeletionTime = cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones();
+            boolean useMinLocalDeletionTime = cfs.onlyPurgeRepairedTombstones();
 
             for (Memtable memtable : view.memtables)
             {
diff --git a/src/java/org/apache/cassandra/db/ReadCommand.java b/src/java/org/apache/cassandra/db/ReadCommand.java
index 2ada437b9fd7..5446ad7174c7 100644
--- a/src/java/org/apache/cassandra/db/ReadCommand.java
+++ b/src/java/org/apache/cassandra/db/ReadCommand.java
@@ -523,7 +523,8 @@ public UnfilteredPartitionIterator searchStorage(Index.Searcher searcher, ReadEx
         return searcher.search(executionController);
     }
 
-    protected abstract void recordLatency(TableMetrics metric, long latencyNanos);
+    protected abstract void recordReadRequest(TableMetrics metric);
+    protected abstract void recordReadLatency(TableMetrics metric, long latencyNanos);
 
     /**
      * Allow to post-process the result of the query after it has been reconciled on the coordinator
@@ -581,6 +582,11 @@ private Guardrail.Threshold.GuardedCounter createTombstoneCounter()
                 return guardrail.newCounter(ReadCommand.this::toCQLString, true, null);
             }
 
+            private MetricRecording()
+            {
+                recordReadRequest(metric);
+            }
+
             @Override
             public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator iter)
             {
@@ -647,7 +653,7 @@ private void countTombstone(ClusteringPrefix<?> clustering)
             @Override
             public void onClose()
             {
-                recordLatency(metric, System.nanoTime() - startTimeNanos);
+                recordReadLatency(metric, System.nanoTime() - startTimeNanos);
 
                 metric.tombstoneScannedHistogram.update(tombstones.get());
                 metric.liveScannedHistogram.update(liveRows);
@@ -743,7 +749,7 @@ class WithoutPurgeableTombstones extends PurgeFunction
             public WithoutPurgeableTombstones()
             {
                 super(nowInSec(), cfs.gcBefore(nowInSec()), oldestUnrepairedTombstone(),
-                      cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
+                      cfs.onlyPurgeRepairedTombstones(),
                       iterator.metadata().enforceStrictLiveness());
             }
 
diff --git a/src/java/org/apache/cassandra/db/RepairedDataInfo.java b/src/java/org/apache/cassandra/db/RepairedDataInfo.java
index c136f26eff71..0496a07982f8 100644
--- a/src/java/org/apache/cassandra/db/RepairedDataInfo.java
+++ b/src/java/org/apache/cassandra/db/RepairedDataInfo.java
@@ -289,7 +289,7 @@ private static class RepairedDataPurger extends PurgeFunction
             super(nowInSec,
                   cfs.gcBefore(nowInSec),
                   oldestUnrepairedTombstone,
-                  cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
+                  cfs.onlyPurgeRepairedTombstones(),
                   cfs.metadata.get().enforceStrictLiveness());
         }
 
diff --git a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
index 9525854b4254..45ace44db1e6 100644
--- a/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
+++ b/src/java/org/apache/cassandra/db/SinglePartitionReadCommand.java
@@ -390,11 +390,16 @@ public PartitionIterator execute(ConsistencyLevel consistency, QueryState queryS
         return StorageProxy.read(Group.one(this), consistency, queryState, queryStartNanoTime);
     }
 
-    protected void recordLatency(TableMetrics metric, long latencyNanos)
+    protected void recordReadLatency(TableMetrics metric, long latencyNanos)
     {
         metric.readLatency.addNano(latencyNanos);
     }
 
+    protected void recordReadRequest(TableMetrics metric)
+    {
+        metric.readRequests.inc();
+    }
+
     @SuppressWarnings("resource") // we close the created iterator through closing the result of this method (and SingletonUnfilteredPartitionIterator ctor cannot fail)
     protected UnfilteredPartitionIterator queryStorage(final ColumnFamilyStore cfs, ReadExecutionController executionController)
     {
@@ -443,7 +448,7 @@ private UnfilteredRowIterator getThroughCache(ColumnFamilyStore cfs, ReadExecuti
                 cfs.metric.rowCacheHit.inc();
                 Tracing.trace("Row cache hit");
                 UnfilteredRowIterator unfilteredRowIterator = clusteringIndexFilter().getUnfilteredRowIterator(columnFilter(), cachedPartition);
-                cfs.metric.updateSSTableIterated(0);
+                cfs.metric.updateSSTableIterated(0, 0);
                 return unfilteredRowIterator;
             }
 
@@ -568,10 +573,10 @@ public UnfilteredRowIterator queryMemtableAndDisk(ColumnFamilyStore cfs, ReadExe
         assert executionController != null && executionController.validForReadOn(cfs);
         Tracing.trace("Executing single-partition query on {}", cfs.name);
 
-        return queryMemtableAndDiskInternal(cfs);
+        return queryMemtableAndDiskInternal(cfs, System.nanoTime());
     }
 
-    private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs)
+    private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs, long startTimeNanos)
     {
         /*
          * We have 2 main strategies:
@@ -587,7 +592,7 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs
          *      and generate a digest over their merge, which procludes an early return.
          */
         if (clusteringIndexFilter() instanceof ClusteringIndexNamesFilter && !queriesMulticellType() && !isTrackingRepairedStatus())
-            return queryMemtableAndSSTablesInTimestampOrder(cfs, (ClusteringIndexNamesFilter)clusteringIndexFilter());
+            return queryMemtableAndSSTablesInTimestampOrder(cfs, (ClusteringIndexNamesFilter)clusteringIndexFilter(), startTimeNanos);
 
         Tracing.trace("Acquiring sstable references");
         ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey()));
@@ -699,7 +704,7 @@ private UnfilteredRowIterator queryMemtableAndDiskInternal(ColumnFamilyStore cfs
 
             StorageHook.instance.reportRead(cfs.metadata().id, partitionKey());
 
-            return withSSTablesIterated(inputCollector.finalizeIterators(cfs, nowInSec(), oldestUnrepairedTombstone), cfs.metric, metricsCollector);
+            return withSSTablesIterated(inputCollector.finalizeIterators(cfs, nowInSec(), oldestUnrepairedTombstone), cfs.metric, metricsCollector, startTimeNanos);
         }
         catch (RuntimeException | Error e)
         {
@@ -765,7 +770,8 @@ private UnfilteredRowIterator makeIteratorWithSkippedNonStaticContent(ColumnFami
     @SuppressWarnings("resource")
     private UnfilteredRowIterator withSSTablesIterated(List<UnfilteredRowIterator> iterators,
                                                        TableMetrics metrics,
-                                                       SSTableReadMetricsCollector metricsCollector)
+                                                       SSTableReadMetricsCollector metricsCollector,
+                                                       long startTimeNanos)
     {
         @SuppressWarnings("resource") //  Closed through the closing of the result of the caller method.
         UnfilteredRowIterator merged = UnfilteredRowIterators.merge(iterators);
@@ -781,7 +787,7 @@ class UpdateSstablesIterated extends Transformation
            public void onPartitionClose()
            {
                int mergedSSTablesIterated = metricsCollector.getMergedSSTables();
-               metrics.updateSSTableIterated(mergedSSTablesIterated);
+               metrics.updateSSTableIterated(mergedSSTablesIterated, System.nanoTime() - startTimeNanos);
                Tracing.trace("Merged data from memtables and {} sstables", mergedSSTablesIterated);
            }
         };
@@ -807,7 +813,7 @@ private boolean queriesMulticellType()
      * no collection or counters are included).
      * This method assumes the filter is a {@code ClusteringIndexNamesFilter}.
      */
-    private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ClusteringIndexNamesFilter filter)
+    private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFamilyStore cfs, ClusteringIndexNamesFilter filter, long startTimeNanos)
     {
         Tracing.trace("Acquiring sstable references");
         ColumnFamilyStore.ViewFragment view = cfs.select(View.select(SSTableSet.LIVE, partitionKey()));
@@ -897,7 +903,7 @@ private UnfilteredRowIterator queryMemtableAndSSTablesInTimestampOrder(ColumnFam
             }
         }
 
-        cfs.metric.updateSSTableIterated(metricsCollector.getMergedSSTables());
+        cfs.metric.updateSSTableIterated(metricsCollector.getMergedSSTables(), System.nanoTime() - startTimeNanos);
 
         if (result == null || result.isEmpty())
             return EmptyIterators.unfilteredRow(metadata(), partitionKey(), false);
diff --git a/src/java/org/apache/cassandra/db/SortedLocalRanges.java b/src/java/org/apache/cassandra/db/SortedLocalRanges.java
new file mode 100644
index 000000000000..55c634ce3e49
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/SortedLocalRanges.java
@@ -0,0 +1,242 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+import java.util.Objects;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.stream.Collectors;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.locator.RangesAtEndpoint;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.service.PendingRangeCalculatorService;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * This class contains the local ranges for a given table, sorted.
+ */
+public class SortedLocalRanges
+{
+    private static final Logger logger = LoggerFactory.getLogger(SortedLocalRanges.class);
+
+    private final StorageService storageService;
+    private final ColumnFamilyStore cfs;
+    private final long ringVersion;
+    private final List<Splitter.WeightedRange> ranges;
+    private final Map<Integer, List<PartitionPosition>> splits;
+
+    private volatile boolean valid;
+
+    public SortedLocalRanges(StorageService storageService, ColumnFamilyStore cfs, long ringVersion, List<Splitter.WeightedRange> ranges)
+    {
+        this.storageService = storageService;
+        this.cfs = cfs;
+        this.ringVersion = ringVersion;
+
+        List<Splitter.WeightedRange> sortedRanges = new ArrayList<>(ranges.size());
+        for (Splitter.WeightedRange range : ranges)
+        {
+            for (Range<Token> unwrapped : range.range().unwrap())
+            {
+                sortedRanges.add(new Splitter.WeightedRange(range.weight(), unwrapped));
+            }
+        }
+        sortedRanges.sort(Comparator.comparing(Splitter.WeightedRange::left));
+
+        this.ranges = sortedRanges;
+        this.splits = new ConcurrentHashMap<>();
+        this.valid = true;
+    }
+
+    /**
+     * Create a set of sorted local ranges based on the current token metadata and ring version.
+     *
+     * This method should preferably only be called by {@link ColumnFamilyStore} because later on,
+     * ranges may need invalidating, see {@link #invalidate()}, and so a reference must be
+     * kept to ranges that are passed around, and current cfs does this.
+     */
+    static SortedLocalRanges create(ColumnFamilyStore cfs)
+    {
+        StorageService storageService = StorageService.instance;
+        RangesAtEndpoint localRanges;
+        List<Splitter.WeightedRange> weightedRanges;
+        long ringVersion;
+        TokenMetadata tmd;
+
+        do
+        {
+            tmd = storageService.getTokenMetadata();
+            ringVersion = tmd.getRingVersion();
+            localRanges = getLocalRanges(cfs, tmd);
+
+            weightedRanges = new ArrayList<>(localRanges.size());
+            for (Range<Token> r : localRanges.onlyFull().ranges())
+                weightedRanges.add(new Splitter.WeightedRange(1.0, r));
+
+            for (Range<Token> r : localRanges.onlyTransient().ranges())
+                weightedRanges.add(new Splitter.WeightedRange(0.1, r));
+
+            if (logger.isTraceEnabled())
+                logger.trace("Got local ranges {} (ringVersion = {})", localRanges, ringVersion);
+        }
+        while (ringVersion != tmd.getRingVersion()); // if ringVersion is different here it means that
+        // it might have changed before we calculated localRanges - recalculate
+
+        return new SortedLocalRanges(storageService, cfs, ringVersion, weightedRanges);
+    }
+
+    private static RangesAtEndpoint getLocalRanges(ColumnFamilyStore cfs, TokenMetadata tmd)
+    {
+        RangesAtEndpoint localRanges;
+        if (StorageService.instance.isBootstrapMode()
+            && !StorageService.isReplacingSameAddress()) // When replacing same address, the node marks itself as UN locally
+        {
+            PendingRangeCalculatorService.instance.blockUntilFinished();
+            localRanges = tmd.getPendingRanges(cfs.keyspace.getName(), FBUtilities.getBroadcastAddressAndPort());
+        }
+        else
+        {
+            // The reason we use the future settled TMD is that if we decommission a node, we want to stream
+            // from that node to the correct location on disk, if we didn't, we would put new files in the wrong places.
+            // We do this to minimize the amount of data we need to move in rebalancedisks once everything settled
+            localRanges = cfs.keyspace.getReplicationStrategy().getAddressReplicas(tmd.cloneAfterAllSettled(), FBUtilities.getBroadcastAddressAndPort());
+        }
+        return localRanges;
+    }
+
+    @VisibleForTesting
+    public static SortedLocalRanges forTesting(ColumnFamilyStore cfs, List<Splitter.WeightedRange> ranges)
+    {
+        return new SortedLocalRanges(null, cfs, 0, ranges);
+    }
+
+    /**
+     * Check whether these local ranges are out of date: either they were explicitly invalidated or the ring version has changed since they were created.
+     */
+    public boolean isOutOfDate()
+    {
+        return !valid || ringVersion != storageService.getTokenMetadata().getRingVersion();
+    }
+
+    public void invalidate()
+    {
+        this.valid = false;
+    }
+
+    public List<Splitter.WeightedRange> getRanges()
+    {
+        return ranges;
+    }
+
+    public long getRingVersion()
+    {
+        return ringVersion;
+    }
+
+    /**
+     * Split the local ranges into the given number of parts.
+     *
+     * @param numParts the number of parts to split into
+     *
+     * @return a list of positions into which the local ranges were split
+     */
+    public List<PartitionPosition> split(int numParts)
+    {
+        return splits.computeIfAbsent(numParts, this::doSplit);
+    }
+
+    private List<PartitionPosition> doSplit(int numParts)
+    {
+        Splitter splitter = cfs.getPartitioner().splitter().orElse(null);
+
+        List<Token> boundaries;
+        if (splitter == null)
+        {
+            logger.debug("Could not split local ranges into {} parts for {}.{} (no splitter)", numParts, cfs.getKeyspaceName(), cfs.getTableName());
+            boundaries = ranges.stream().map(Splitter.WeightedRange::right).collect(Collectors.toList());
+        }
+        else
+        {
+            logger.debug("Splitting local ranges into {} parts for {}.{}", numParts, cfs.getKeyspaceName(), cfs.getTableName());
+            boundaries = splitter.splitOwnedRanges(numParts, ranges, Splitter.SplitType.ALWAYS_SPLIT).boundaries;
+        }
+
+        logger.debug("Boundaries for {}.{}: {} ({} splits)", cfs.getKeyspaceName(), cfs.getTableName(), boundaries, boundaries.size());
+        return boundaries.stream().map(Token::maxKeyBound).collect(Collectors.toList());
+    }
+
+    /**
+     * Returns the intersection of this list with the given range.
+     */
+    public List<Splitter.WeightedRange> subrange(Range<Token> range)
+    {
+        return ranges.stream()
+                     .map(r -> {
+                         Range<Token> subRange = r.range().intersectionNonWrapping(range);
+                         return subRange == null ? null : new Splitter.WeightedRange(r.weight(), subRange);
+                     })
+                     .filter(Objects::nonNull)
+                     .collect(Collectors.toList());
+    }
+
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        SortedLocalRanges that = (SortedLocalRanges) o;
+        if (ringVersion != that.ringVersion)
+            return false;
+
+        if (!cfs.equals(that.cfs))
+            return false;
+
+        return ranges.equals(that.ranges);
+    }
+
+    public int hashCode()
+    {
+        int result = cfs.hashCode();
+        result = 31 * result + Long.hashCode(ringVersion);
+        result = 31 * result + ranges.hashCode();
+        return result;
+    }
+
+    public String toString()
+    {
+        return "LocalRanges{" +
+               "table=" + cfs.getKeyspaceName() + "." + cfs.getTableName() +
+               ", ring version=" + ringVersion +
+               ", num ranges=" + ranges.size() + '}';
+    }
+
+    public ColumnFamilyStore getCfs()
+    {
+        return cfs;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
index 6230237ca604..80770d702946 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionStrategy.java
@@ -17,76 +17,56 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.util.*;
-import java.util.function.Function;
-
-import javax.annotation.Nullable;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.index.Index;
-import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
-import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.SSTableMultiWriter;
-import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.lifecycle.Tracker;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
-import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.ScannerList;
+import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
-import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
-import org.apache.cassandra.schema.CompactionParams;
-import org.apache.cassandra.schema.TableMetadata;
-
-/**
- * Pluggable compaction strategy determines how SSTables get merged.
- *
- * There are two main goals:
- *  - perform background compaction constantly as needed; this typically makes a tradeoff between
- *    i/o done by compaction, and merging done at read time.
- *  - perform a full (maximum possible) compaction if requested by the user
- */
-public abstract class AbstractCompactionStrategy
-{
-    private static final Logger logger = LoggerFactory.getLogger(AbstractCompactionStrategy.class);
 
-    protected static final float DEFAULT_TOMBSTONE_THRESHOLD = 0.2f;
-    // minimum interval needed to perform tombstone removal compaction in seconds, default 86400 or 1 day.
-    protected static final long DEFAULT_TOMBSTONE_COMPACTION_INTERVAL = 86400;
-    protected static final boolean DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION = false;
-    protected static final boolean DEFAULT_LOG_ALL_OPTION = false;
+import static org.apache.cassandra.db.ColumnFamilyStore.nonSuspectAndNotInPredicate;
 
-    protected static final String TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
-    protected static final String TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
-    // disable range overlap check when deciding if an SSTable is candidate for tombstone compaction (CASSANDRA-6563)
-    protected static final String UNCHECKED_TOMBSTONE_COMPACTION_OPTION = "unchecked_tombstone_compaction";
-    protected static final String LOG_ALL_OPTION = "log_all";
-    protected static final String COMPACTION_ENABLED = "enabled";
-    public static final String ONLY_PURGE_REPAIRED_TOMBSTONES = "only_purge_repaired_tombstones";
+abstract class AbstractCompactionStrategy implements CompactionStrategy
+{
+    public static final Class<? extends CompactionStrategyContainer> CONTAINER_CLASS = CompactionStrategyManager.class;
 
-    protected Map<String, String> options;
+    protected static final Logger logger = LoggerFactory.getLogger(AbstractCompactionStrategy.class);
 
+    protected final CompactionStrategyOptions options;
     protected final ColumnFamilyStore cfs;
-    protected float tombstoneThreshold;
-    protected long tombstoneCompactionInterval;
-    protected boolean uncheckedTombstoneCompaction;
-    protected boolean disableTombstoneCompactions = false;
-    protected boolean logAll = true;
+    protected final Tracker dataTracker;
 
-    private final Directories directories;
+    protected final CompactionLogger compactionLogger;
+    protected final Directories directories;
+    /**
+     * This class groups all the compaction tasks that are pending, submitted, in progress and completed.
+     */
+    protected final BackgroundCompactions backgroundCompactions;
 
     /**
      * pause/resume/getNextBackgroundTask must synchronize.  This guarantees that after pause completes,
@@ -98,59 +78,54 @@ public abstract class AbstractCompactionStrategy
      *
      * See CASSANDRA-3430
      */
-    protected boolean isActive = false;
+    protected volatile boolean isActive = false;
 
-    /**
-     * This class groups all the compaction tasks that are pending, submitted, in progress and completed.
-     */
-    protected final BackgroundCompactions backgroundCompactions;
-
-    protected AbstractCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    protected AbstractCompactionStrategy(CompactionStrategyFactory factory, BackgroundCompactions backgroundCompactions, Map<String, String> options)
     {
-        assert cfs != null;
-        this.cfs = cfs;
-        this.options = ImmutableMap.copyOf(options);
-        this.backgroundCompactions = new BackgroundCompactions(this, cfs);
-
-        /* checks must be repeated here, as user supplied strategies might not call validateOptions directly */
+        assert factory != null;
+        this.cfs = factory.getCfs();
+        this.dataTracker = cfs.getTracker();
+        this.compactionLogger = factory.getCompactionLogger();
+        this.options = new CompactionStrategyOptions(getClass(), options, false);
+        this.directories = cfs.getDirectories();
+        this.backgroundCompactions = backgroundCompactions;
+    }
 
-        try
-        {
-            validateOptions(options);
-            String optionValue = options.get(TOMBSTONE_THRESHOLD_OPTION);
-            tombstoneThreshold = optionValue == null ? DEFAULT_TOMBSTONE_THRESHOLD : Float.parseFloat(optionValue);
-            optionValue = options.get(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-            tombstoneCompactionInterval = optionValue == null ? DEFAULT_TOMBSTONE_COMPACTION_INTERVAL : Long.parseLong(optionValue);
-            optionValue = options.get(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
-            uncheckedTombstoneCompaction = optionValue == null ? DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION : Boolean.parseBoolean(optionValue);
-            optionValue = options.get(LOG_ALL_OPTION);
-            logAll = optionValue == null ? DEFAULT_LOG_ALL_OPTION : Boolean.parseBoolean(optionValue);
-        }
-        catch (ConfigurationException e)
-        {
-            logger.warn("Error setting compaction strategy options ({}), defaults will be used", e.getMessage());
-            tombstoneThreshold = DEFAULT_TOMBSTONE_THRESHOLD;
-            tombstoneCompactionInterval = DEFAULT_TOMBSTONE_COMPACTION_INTERVAL;
-            uncheckedTombstoneCompaction = DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION;
-        }
+    CompactionStrategyOptions getOptions()
+    {
+        return options;
+    }
 
-        directories = cfs.getDirectories();
+    public CompactionLogger getCompactionLogger()
+    {
+        return compactionLogger;
     }
 
-    public BackgroundCompactions getBackgroundCompactions()
+    //
+    // Compaction Observer
+    //
+
+    @Override
+    public void onInProgress(CompactionProgress progress)
     {
-        return backgroundCompactions;
+        backgroundCompactions.onInProgress(progress);
     }
 
-    public Directories getDirectories()
+    @Override
+    public void onCompleted(UUID id)
     {
-        return directories;
+        backgroundCompactions.onCompleted(this, id);
     }
 
+    //
+    // CompactionStrategy
+    //
+
     /**
      * For internal, temporary suspension of background compactions so that we can do exceptional
      * things like truncate or major compaction
      */
+    @Override
     public synchronized void pause()
     {
         isActive = false;
@@ -160,6 +135,7 @@ public synchronized void pause()
      * For internal, temporary suspension of background compactions so that we can do exceptional
      * things like truncate or major compaction
      */
+    @Override
     public synchronized void resume()
     {
         isActive = true;
@@ -168,6 +144,7 @@ public synchronized void resume()
     /**
      * Performs any extra initialization required
      */
+    @Override
     public void startup()
     {
         isActive = true;
@@ -176,137 +153,12 @@ public void startup()
     /**
      * Releases any resources if this strategy is shutdown (when the CFS is reloaded after a schema change).
      */
+    @Override
     public void shutdown()
     {
         isActive = false;
     }
 
-    /**
-     * @param gcBefore throw away tombstones older than this
-     *
-     * @return the next background/minor compaction task to run; null if nothing to do.
-     *
-     * Is responsible for marking its sstables as compaction-pending.
-     */
-    public abstract AbstractCompactionTask getNextBackgroundTask(int gcBefore);
-
-    /**
-     * Helper base class for strategies that provide CompactionAggregates, implementing the typical
-     * getNextBackgroundTask logic based on a getNextBackgroundAggregate method.
-     */
-    protected static abstract class WithAggregates extends AbstractCompactionStrategy
-    {
-        protected WithAggregates(ColumnFamilyStore cfs, Map<String, String> options)
-        {
-            super(cfs, options);
-        }
-
-        @Override
-        @SuppressWarnings("resource")
-        public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
-        {
-            CompactionPick previous = null;
-            while (true)
-            {
-                CompactionAggregate compaction = getNextBackgroundAggregate(gcBefore);
-                if (compaction == null || compaction.isEmpty())
-                    return null;
-
-                // Already tried acquiring references without success. It means there is a race with
-                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-                if (compaction.getSelected().equals(previous))
-                {
-                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                                "unless it happens frequently, in which case it must be reported. Will retry later.",
-                                compaction.getSelected());
-                    return null;
-                }
-
-                LifecycleTransaction transaction = cfs.getTracker().tryModify(compaction.getSelected().sstables, OperationType.COMPACTION);
-                if (transaction != null)
-                {
-                    backgroundCompactions.setSubmitted(transaction.opId(), compaction);
-                    return createCompactionTask(gcBefore, transaction, compaction);
-                }
-
-                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
-                // no longer present (due to races in getting the notification), or because we still haven't
-                // received any replace notifications. Remove any non-live sstables we track and try again.
-                removeDeadSSTables();
-
-                previous = compaction.getSelected();
-            }
-        }
-
-        /**
-         * Select the next compaction to perform. This method is typically synchronized.
-         */
-        protected abstract CompactionAggregate getNextBackgroundAggregate(int gcBefore);
-
-        protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, CompactionAggregate compaction)
-        {
-            return CompactionTask.forCompaction(this, txn, gcBefore);
-        }
-
-        @Override
-        public int getEstimatedRemainingTasks()
-        {
-            return backgroundCompactions.getEstimatedRemainingTasks();
-        }
-    }
-
-    /**
-     * Helper base class for (older, deprecated) strategies that provide a list of tables to compact, implementing the
-     * typical getNextBackgroundTask logic based on a getNextBackgroundSSTables method.
-     */
-    protected static abstract class WithSSTableList extends AbstractCompactionStrategy
-    {
-        protected WithSSTableList(ColumnFamilyStore cfs, Map<String, String> options)
-        {
-            super(cfs, options);
-        }
-
-        @Override
-        @SuppressWarnings("resource")
-        public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
-        {
-            List<SSTableReader> previousCandidate = null;
-            while (true)
-            {
-                List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
-
-                if (latestBucket.isEmpty())
-                    return null;
-
-                // Already tried acquiring references without success. It means there is a race with
-                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
-                if (latestBucket.equals(previousCandidate))
-                {
-                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se," +
-                                "unless it happens frequently, in which case it must be reported. Will retry later.",
-                                latestBucket);
-                    return null;
-                }
-
-                LifecycleTransaction modifier = cfs.getTracker().tryModify(latestBucket, OperationType.COMPACTION);
-                if (modifier != null)
-                    return createCompactionTask(gcBefore, modifier, false, false);
-
-                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
-                // no longer present (due to races in getting the notification), or because we still haven't
-                // received any replace notifications. Remove any non-live sstables we track and try again.
-                removeDeadSSTables();
-
-                previousCandidate = latestBucket;
-            }
-        }
-
-        /**
-         * Select the next tables to compact. This method is typically synchronized.
-         */
-        protected abstract List<SSTableReader> getNextBackgroundSSTables(final int gcBefore);
-    }
-
     /**
      * @param gcBefore throw away tombstones older than this
      *
@@ -315,18 +167,17 @@ public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
+    @Override
     @SuppressWarnings("resource")
-    public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefore, boolean splitOutput)
+    public synchronized CompactionTasks getMaximalTasks(int gcBefore, boolean splitOutput)
     {
-        removeDeadSSTables();
-
-        Iterable<SSTableReader> filteredSSTables = filterSuspectSSTables(getSSTables());
+        Iterable<SSTableReader> filteredSSTables = Iterables.filter(getSSTables(), sstable -> !sstable.isMarkedSuspect());
         if (Iterables.isEmpty(filteredSSTables))
-            return null;
-        LifecycleTransaction txn = cfs.getTracker().tryModify(filteredSSTables, OperationType.COMPACTION);
+            return CompactionTasks.empty();
+        LifecycleTransaction txn = dataTracker.tryModify(filteredSSTables, OperationType.COMPACTION);
         if (txn == null)
-            return null;
-        return Collections.singleton(createCompactionTask(gcBefore, txn, true, splitOutput));
+            return CompactionTasks.empty();
+        return CompactionTasks.create(Collections.singleton(createCompactionTask(gcBefore, txn, true, splitOutput)));
     }
 
     /**
@@ -338,25 +189,26 @@ public synchronized Collection<AbstractCompactionTask> getMaximalTask(int gcBefo
      *
      * Is responsible for marking its sstables as compaction-pending.
      */
+    @Override
     @SuppressWarnings("resource")
-    public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTableReader> sstables, int gcBefore)
+    public synchronized CompactionTasks getUserDefinedTasks(Collection<SSTableReader> sstables, int gcBefore)
     {
         assert !sstables.isEmpty(); // checked for by CM.submitUserDefined
 
-        LifecycleTransaction modifier = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
+        LifecycleTransaction modifier = dataTracker.tryModify(sstables, OperationType.COMPACTION);
         if (modifier == null)
         {
             logger.trace("Unable to mark {} for compaction; probably a background compaction got to it first.  You can disable background compactions temporarily if this is a problem", sstables);
-            return null;
+            return CompactionTasks.empty();
         }
 
-        return createCompactionTask(gcBefore, modifier, false, false).setUserDefined(true);
+        return CompactionTasks.create(ImmutableList.of(createCompactionTask(gcBefore, modifier, false, false).setUserDefined(true)));
     }
 
     /**
      * Create a compaction task for a maximal, user defined or background compaction without aggregates (legacy strategies).
-     * Background compactions for strategies that extend {@link WithAggregates} will use
-     * {@link WithAggregates#createCompactionTask(int, LifecycleTransaction, boolean, boolean)} instead.
+     * Background compactions for strategies that extend {@link LegacyAbstractCompactionStrategy.WithAggregates} will use
+     * {@link LegacyAbstractCompactionStrategy.WithAggregates#createCompactionTask(int, LifecycleTransaction, boolean, boolean)} instead.
      *
      * @param gcBefore tombstone threshold, older tombstones can be discarded
      * @param txn the transaction containing the files to be compacted
@@ -368,7 +220,7 @@ public synchronized AbstractCompactionTask getUserDefinedTask(Collection<SSTable
      */
     protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
-        return CompactionTask.forCompaction(this, txn, gcBefore);
+        return new CompactionTask(cfs, txn, gcBefore, false, this);
     }
 
     /**
@@ -380,69 +232,57 @@ protected AbstractCompactionTask createCompactionTask(final int gcBefore, Lifecy
      *
      * @return a compaction task, see {@link AbstractCompactionTask} and sub-classes
      */
+    @Override
     public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, final int gcBefore, long maxSSTableBytes)
     {
-        return CompactionTask.forCompaction(this, txn, gcBefore);
+        return new CompactionTask(cfs, txn, gcBefore, false, this);
     }
 
     /**
-     * Get the estimated remaining compactions. Strategies that implement {@link WithAggregates} can delegate this
-     * to {@link BackgroundCompactions} because they set the pending aggregates as background compactions but legacy
-     * strategies that do not support aggregates must implement this method.
-     * <p/>
-     * @return the number of background tasks estimated to still be needed for this strategy
+     * @return a list of the compaction aggregates, e.g. the levels or buckets. Note that legacy strategies that derive from
+     * {@link LeveledCompactionStrategy.WithSSTableList} will return an empty list.
      */
-    public abstract int getEstimatedRemainingTasks();
+    public Collection<CompactionAggregate> getAggregates()
+    {
+        return backgroundCompactions.getAggregates();
+    }
 
     /**
      * @return the total number of background compactions, pending or in progress
      */
+    @Override
     public int getTotalCompactions()
     {
-        return getEstimatedRemainingTasks() + backgroundCompactions.getCompactionsInProgress();
+        return getEstimatedRemainingTasks() + backgroundCompactions.getCompactionsInProgress().size();
     }
 
     /**
-     * Return the statistics. Only strategies that implement {@link WithAggregates} will provide non-empty statistics,
+     * Return the statistics. Only strategies that implement {@link LegacyAbstractCompactionStrategy.WithAggregates} will provide non-empty statistics,
      * the legacy strategies will always have empty statistics.
      * <p/>
      * @return statistics about this compaction picks.
      */
-    public CompactionStrategyStatistics getStatistics()
+    @Override
+    public List<CompactionStrategyStatistics> getStatistics()
     {
-        return backgroundCompactions.getStatistics();
+        return ImmutableList.of(backgroundCompactions.getStatistics(this));
     }
 
-    /**
-     * @return size in bytes of the largest sstables for this strategy
-     */
-    public abstract long getMaxSSTableBytes();
-
-    /**
-     * Filters SSTables that are to be excluded from the given collection
-     *
-     * @param originalCandidates The collection to check for excluded SSTables
-     * @return list of the SSTables with excluded ones filtered out
-     */
-    public static List<SSTableReader> filterSuspectSSTables(Iterable<? extends SSTableReader> originalCandidates)
+    public static Iterable<SSTableReader> nonSuspectAndNotIn(Iterable<SSTableReader> sstables, Set<SSTableReader> compacting)
     {
-        List<SSTableReader> filtered = new ArrayList<>();
-        for (SSTableReader sstable : originalCandidates)
-        {
-            if (!sstable.isMarkedSuspect())
-                filtered.add(sstable);
-        }
-        return filtered;
+        return Iterables.filter(sstables, nonSuspectAndNotInPredicate(compacting));
     }
 
-    public static Iterable<SSTableReader> nonSuspectAndNotIn(Iterable<SSTableReader> tables, Set<SSTableReader> compacting)
+    @Override
+    public int[] getSSTableCountPerLevel()
     {
-        return Iterables.filter(tables, t -> !t.isMarkedSuspect() && !compacting.contains(t));
+        return new int[0];
     }
 
-    public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token> range)
+    @Override
+    public int getLevelFanoutSize()
     {
-        return range == null ? getScanners(sstables, (Collection<Range<Token>>)null) : getScanners(sstables, Collections.singleton(range));
+        return LeveledCompactionStrategy.DEFAULT_LEVEL_FANOUT_SIZE; // this makes no sense but it's the existing behaviour
     }
 
     /**
@@ -452,6 +292,7 @@ public ScannerList getScanners(Collection<SSTableReader> sstables, Range<Token>
      * LeveledCompactionStrategy for instance).
      */
     @SuppressWarnings("resource")
+    @Override
     public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
     {
         ArrayList<ISSTableScanner> scanners = new ArrayList<ISSTableScanner>();
@@ -467,308 +308,15 @@ public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Ra
         return new ScannerList(scanners);
     }
 
+    @Override
     public String getName()
     {
         return getClass().getSimpleName();
     }
 
-    public TableMetadata getMetadata()
-    {
-        return cfs.metadata();
-    }
-
-    /**
-     * Replaces sstables in the compaction strategy
-     *
-     * Note that implementations must be able to handle duplicate notifications here (that removed are already gone and
-     * added have already been added)
-     * */
-    public abstract void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added);
-
-    /**
-     * Adds sstable, note that implementations must handle duplicate notifications here (added already being in the compaction strategy)
-     */
-    public abstract void addSSTable(SSTableReader added);
-
-    /**
-     * Adds sstables, note that implementations must handle duplicate notifications here (added already being in the compaction strategy)
-     */
-    public synchronized void addSSTables(Iterable<SSTableReader> added)
-    {
-        for (SSTableReader sstable : added)
-            addSSTable(sstable);
-    }
-
-    /**
-     * Remove any tracked sstable that is no longer in the live set. Note that because we get notifications after the
-     * tracker is modified, anything we know of must be already in the live set -- if it is not, it has been removed
-     * from there, and we either haven't received the removal notification yet, or we did and we messed it up (i.e.
-     * we got it before the addition). The former is transient, but the latter can cause persistent problems, including
-     * fully stopping compaction. In any case, we should remove any such sstables.
-     * There are two special-case implementations of this in MemoryOnlyStrategy and LeveledManifest.
-     */
-    abstract void removeDeadSSTables();
-
-    void removeDeadSSTables(Iterable<SSTableReader> sstables)
-    {
-        synchronized (sstables)
-        {
-            int removed = 0;
-            Set<SSTableReader> liveSet = cfs.getLiveSSTables();
-            for (Iterator<SSTableReader> it = sstables.iterator(); it.hasNext(); )
-            {
-                SSTableReader sstable = it.next();
-                if (!liveSet.contains(sstable))
-                {
-                    it.remove();
-                    ++removed;
-                }
-            }
-
-            if (removed > 0)
-                logger.debug("Removed {} dead sstables from the compactions tracked list.", removed);
-        }
-    }
-
-    /**
-     * Removes sstable from the strategy, implementations must be able to handle the sstable having already been removed.
-     */
-    public abstract void removeSSTable(SSTableReader sstable);
-
-    /**
-     * Removes sstables from the strategy, implementations must be able to handle the sstables having already been removed.
-     */
-    public void removeSSTables(Iterable<SSTableReader> removed)
-    {
-        for (SSTableReader sstable : removed)
-            removeSSTable(sstable);
-    }
-
-    /**
-     * Returns the sstables managed by this strategy instance
-     */
-    @VisibleForTesting
-    protected abstract Set<SSTableReader> getSSTables();
-
-    /**
-     * Called when the metadata has changed for an sstable - for example if the level changed
-     *
-     * Not called when repair status changes (which is also metadata), because this results in the
-     * sstable getting removed from the compaction strategy instance.
-     *
-     * @param oldMetadata
-     * @param sstable
-     */
-    public void metadataChanged(StatsMetadata oldMetadata, SSTableReader sstable)
-    {
-    }
-
-    public static class ScannerList implements AutoCloseable
-    {
-        public final List<ISSTableScanner> scanners;
-        public ScannerList(List<ISSTableScanner> scanners)
-        {
-            this.scanners = scanners;
-        }
-
-        public long getTotalBytesScanned()
-        {
-            long bytesScanned = 0L;
-            for (int i=0, isize=scanners.size(); i<isize; i++)
-                bytesScanned += scanners.get(i).getBytesScanned();
-
-            return bytesScanned;
-        }
-
-        public long getTotalCompressedSize()
-        {
-            long compressedSize = 0;
-            for (int i=0, isize=scanners.size(); i<isize; i++)
-                compressedSize += scanners.get(i).getCompressedLengthInBytes();
-
-            return compressedSize;
-        }
-
-        public double getCompressionRatio()
-        {
-            double compressed = 0.0;
-            double uncompressed = 0.0;
-
-            for (int i=0, isize=scanners.size(); i<isize; i++)
-            {
-                @SuppressWarnings("resource")
-                ISSTableScanner scanner = scanners.get(i);
-                compressed += scanner.getCompressedLengthInBytes();
-                uncompressed += scanner.getLengthInBytes();
-            }
-
-            if (compressed == uncompressed || uncompressed == 0)
-                return MetadataCollector.NO_COMPRESSION_RATIO;
-
-            return compressed / uncompressed;
-        }
-
-        public void close()
-        {
-            ISSTableScanner.closeAllAndPropagate(scanners, null);
-        }
-    }
-
-    public ScannerList getScanners(Collection<SSTableReader> toCompact)
-    {
-        return getScanners(toCompact, (Collection<Range<Token>>)null);
-    }
-
-    /**
-     * Select a table for tombstone-removing compaction from the given set. Returns null if no table is suitable.
-     */
-    @Nullable
-    CompactionAggregate makeTombstoneCompaction(int gcBefore,
-                                                Iterable<SSTableReader> candidates,
-                                                Function<Collection<SSTableReader>, SSTableReader> selector)
-    {
-        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
-        for (SSTableReader sstable : candidates)
-        {
-            if (worthDroppingTombstones(sstable, gcBefore))
-                sstablesWithTombstones.add(sstable);
-        }
-        if (sstablesWithTombstones.isEmpty())
-            return null;
-
-        final SSTableReader sstable = selector.apply(sstablesWithTombstones);
-        return CompactionAggregate.createForTombstones(sstable);
-    }
-
-    /**
-     * Check if given sstable is worth dropping tombstones at gcBefore.
-     * Check is skipped if tombstone_compaction_interval time does not elapse since sstable creation and returns false.
-     *
-     * @param sstable SSTable to check
-     * @param gcBefore time to drop tombstones
-     * @return true if given sstable's tombstones are expected to be removed
-     */
-    protected boolean worthDroppingTombstones(SSTableReader sstable, int gcBefore)
-    {
-        if (disableTombstoneCompactions || CompactionController.NEVER_PURGE_TOMBSTONES || cfs.getNeverPurgeTombstones())
-            return false;
-        // since we use estimations to calculate, there is a chance that compaction will not drop tombstones actually.
-        // if that happens we will end up in infinite compaction loop, so first we check enough if enough time has
-        // elapsed since SSTable created.
-        if (System.currentTimeMillis() < sstable.getCreationTimeFor(Component.DATA) + tombstoneCompactionInterval * 1000)
-           return false;
-
-        double droppableRatio = sstable.getEstimatedDroppableTombstoneRatio(gcBefore);
-        if (droppableRatio <= tombstoneThreshold)
-            return false;
-
-        //sstable range overlap check is disabled. See CASSANDRA-6563.
-        if (uncheckedTombstoneCompaction)
-            return true;
-
-        Collection<SSTableReader> overlaps = cfs.getOverlappingLiveSSTables(Collections.singleton(sstable));
-        if (overlaps.isEmpty())
-        {
-            // there is no overlap, tombstones are safely droppable
-            return true;
-        }
-        else if (CompactionController.getFullyExpiredSSTables(cfs, Collections.singleton(sstable), overlaps, gcBefore).size() > 0)
-        {
-            return true;
-        }
-        else
-        {
-            // what percentage of columns do we expect to compact outside of overlap?
-            if (sstable.getIndexSummarySize() < 2)
-            {
-                // we have too few samples to estimate correct percentage
-                return false;
-            }
-            // first, calculate estimated keys that do not overlap
-            long keys = sstable.estimatedKeys();
-            Set<Range<Token>> ranges = new HashSet<Range<Token>>(overlaps.size());
-            for (SSTableReader overlap : overlaps)
-                ranges.add(new Range<>(overlap.first.getToken(), overlap.last.getToken()));
-            long remainingKeys = keys - sstable.estimatedKeysForRanges(ranges);
-            // next, calculate what percentage of columns we have within those keys
-            long columns = sstable.getEstimatedCellPerPartitionCount().mean() * remainingKeys;
-            double remainingColumnsRatio = ((double) columns) / (sstable.getEstimatedCellPerPartitionCount().count() * sstable.getEstimatedCellPerPartitionCount().mean());
-
-            // return if we still expect to have droppable tombstones in rest of columns
-            return remainingColumnsRatio * droppableRatio > tombstoneThreshold;
-        }
-    }
-
     public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
     {
-        String threshold = options.get(TOMBSTONE_THRESHOLD_OPTION);
-        if (threshold != null)
-        {
-            try
-            {
-                float thresholdValue = Float.parseFloat(threshold);
-                if (thresholdValue < 0)
-                {
-                    throw new ConfigurationException(String.format("%s must be greater than 0, but was %f", TOMBSTONE_THRESHOLD_OPTION, thresholdValue));
-                }
-            }
-            catch (NumberFormatException e)
-            {
-                throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", threshold, TOMBSTONE_THRESHOLD_OPTION), e);
-            }
-        }
-
-        String interval = options.get(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-        if (interval != null)
-        {
-            try
-            {
-                long tombstoneCompactionInterval = Long.parseLong(interval);
-                if (tombstoneCompactionInterval < 0)
-                {
-                    throw new ConfigurationException(String.format("%s must be greater than 0, but was %d", TOMBSTONE_COMPACTION_INTERVAL_OPTION, tombstoneCompactionInterval));
-                }
-            }
-            catch (NumberFormatException e)
-            {
-                throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", interval, TOMBSTONE_COMPACTION_INTERVAL_OPTION), e);
-            }
-        }
-
-        String unchecked = options.get(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
-        if (unchecked != null)
-        {
-            if (!unchecked.equalsIgnoreCase("true") && !unchecked.equalsIgnoreCase("false"))
-                throw new ConfigurationException(String.format("'%s' should be either 'true' or 'false', not '%s'", UNCHECKED_TOMBSTONE_COMPACTION_OPTION, unchecked));
-        }
-
-        String logAll = options.get(LOG_ALL_OPTION);
-        if (logAll != null)
-        {
-            if (!logAll.equalsIgnoreCase("true") && !logAll.equalsIgnoreCase("false"))
-            {
-                throw new ConfigurationException(String.format("'%s' should either be 'true' or 'false', not %s", LOG_ALL_OPTION, logAll));
-            }
-        }
-
-        String compactionEnabled = options.get(COMPACTION_ENABLED);
-        if (compactionEnabled != null)
-        {
-            if (!compactionEnabled.equalsIgnoreCase("true") && !compactionEnabled.equalsIgnoreCase("false"))
-            {
-                throw new ConfigurationException(String.format("enabled should either be 'true' or 'false', not %s", compactionEnabled));
-            }
-        }
-
-        Map<String, String> uncheckedOptions = new HashMap<String, String>(options);
-        uncheckedOptions.remove(TOMBSTONE_THRESHOLD_OPTION);
-        uncheckedOptions.remove(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
-        uncheckedOptions.remove(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
-        uncheckedOptions.remove(LOG_ALL_OPTION);
-        uncheckedOptions.remove(COMPACTION_ENABLED);
-        uncheckedOptions.remove(ONLY_PURGE_REPAIRED_TOMBSTONES);
-        uncheckedOptions.remove(CompactionParams.Option.PROVIDE_OVERLAPPING_TOMBSTONES.toString());
-        return uncheckedOptions;
+        return CompactionStrategyOptions.validateOptions(options);
     }
 
     /**
@@ -777,6 +325,7 @@ public static Map<String, String> validateOptions(Map<String, String> options) t
      * as a group. If a given compaction strategy creates sstables which
      * cannot be merged due to some constraint it must override this method.
      */
+    @Override
     public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
     {
         int groupSize = 2;
@@ -801,11 +350,6 @@ public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Coll
         return groupedSSTables;
     }
 
-    public CompactionLogger.Strategy strategyLogger()
-    {
-        return CompactionLogger.Strategy.none;
-    }
-
     public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        long keyCount,
                                                        long repairedAt,
@@ -819,6 +363,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
         return SimpleSSTableMultiWriter.create(descriptor, keyCount, repairedAt, pendingRepair, isTransient, cfs.metadata, meta, header, indexGroups, lifecycleNewTracker);
     }
 
+    @Override
     public boolean supportsEarlyOpen()
     {
         return true;
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
index 4d14b22fb87f..04b559501373 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractCompactionTask.java
@@ -17,10 +17,13 @@
  */
 package org.apache.cassandra.db.compaction;
 
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Set;
 import java.util.UUID;
 
+import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
@@ -42,7 +45,7 @@ public abstract class AbstractCompactionTask extends WrappedRunnable
     protected boolean isUserDefined;
     protected OperationType compactionType;
     protected TableOperationObserver opObserver;
-    protected CompactionObserver compObserver;
+    protected final List<CompactionObserver> compObservers;
 
     /**
      * @param cfs
@@ -55,7 +58,7 @@ public AbstractCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transa
         this.isUserDefined = false;
         this.compactionType = OperationType.COMPACTION;
         this.opObserver = TableOperationObserver.NOOP;
-        this.compObserver = CompactionObserver.NO_OP;
+        this.compObservers = new ArrayList<>();
 
         try
         {
@@ -122,7 +125,7 @@ public int execute()
         {
             return executeInternal();
         }
-        catch(FSDiskFullWriteError e)
+        catch (FSDiskFullWriteError e)
         {
             RuntimeException cause = new RuntimeException("Converted from FSDiskFullWriteError: " + e.getMessage());
             cause.setStackTrace(e.getStackTrace());
@@ -136,9 +139,10 @@ public int execute()
 
     private Throwable cleanup(Throwable err)
     {
-        return Throwables.perform(err,
-                                  () -> compObserver.setCompleted(transaction.opId()),
-                                  () -> transaction.close());
+        for (CompactionObserver compObserver : compObservers)
+            err = Throwables.perform(err, () -> compObserver.onCompleted(transaction.opId()));
+
+        return Throwables.perform(err, () -> transaction.close());
     }
 
     public abstract CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables);
@@ -168,6 +172,17 @@ AbstractCompactionTask setOpObserver(TableOperationObserver opObserver)
         return this;
     }
 
+    void addObserver(CompactionObserver compObserver)
+    {
+        compObservers.add(compObserver);
+    }
+
+    @VisibleForTesting
+    LifecycleTransaction transaction()
+    {
+        return transaction;
+    }
+
     public String toString()
     {
         return "CompactionTask(" + transaction + ")";
diff --git a/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java b/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
index ab8c0046dae2..89c212c7a831 100644
--- a/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/AbstractStrategyHolder.java
@@ -31,7 +31,6 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
@@ -50,23 +49,23 @@
  */
 public abstract class AbstractStrategyHolder
 {
-    public static class TaskSupplier implements Comparable<TaskSupplier>
+    public static class TasksSupplier implements Comparable<TasksSupplier>
     {
         private final int numRemaining;
-        private final Supplier<AbstractCompactionTask> supplier;
+        private final Supplier<Collection<AbstractCompactionTask>> supplier;
 
-        TaskSupplier(int numRemaining, Supplier<AbstractCompactionTask> supplier)
+        TasksSupplier(int numRemaining, Supplier<Collection<AbstractCompactionTask>> supplier)
         {
             this.numRemaining = numRemaining;
             this.supplier = supplier;
         }
 
-        public AbstractCompactionTask getTask()
+        public Collection<AbstractCompactionTask> getTasks()
         {
             return supplier.get();
         }
 
-        public int compareTo(TaskSupplier o)
+        public int compareTo(TasksSupplier o)
         {
             return o.numRemaining - numRemaining;
         }
@@ -130,12 +129,14 @@ boolean isEmpty()
     }
 
     protected final ColumnFamilyStore cfs;
+    protected final CompactionStrategyFactory strategyFactory;
     final DestinationRouter router;
     private int numTokenPartitions = -1;
 
-    AbstractStrategyHolder(ColumnFamilyStore cfs, DestinationRouter router)
+    AbstractStrategyHolder(ColumnFamilyStore cfs, CompactionStrategyFactory strategyFactory, DestinationRouter router)
     {
         this.cfs = cfs;
+        this.strategyFactory = strategyFactory;
         this.router = router;
     }
 
@@ -166,11 +167,11 @@ public boolean managesSSTable(SSTableReader sstable)
         return managesRepairedGroup(sstable.isRepaired(), sstable.isPendingRepair(), sstable.isTransient());
     }
 
-    public abstract AbstractCompactionStrategy getStrategyFor(SSTableReader sstable);
+    public abstract LegacyAbstractCompactionStrategy getStrategyFor(SSTableReader sstable);
 
-    public abstract Iterable<AbstractCompactionStrategy> allStrategies();
+    public abstract Iterable<LegacyAbstractCompactionStrategy> allStrategies();
 
-    public abstract Collection<TaskSupplier> getBackgroundTaskSuppliers(int gcBefore);
+    public abstract Collection<TasksSupplier> getBackgroundTaskSuppliers(int gcBefore);
 
     public abstract Collection<AbstractCompactionTask> getMaximalTasks(int gcBefore, boolean splitOutput);
 
@@ -200,11 +201,5 @@ public abstract SSTableMultiWriter createSSTableMultiWriter(Descriptor descripto
                                                                 Collection<Index.Group> indexGroups,
                                                                 LifecycleNewTracker lifecycleNewTracker);
 
-    /**
-     * Return the directory index the given compaction strategy belongs to, or -1
-     * if it's not held by this holder
-     */
-    public abstract int getStrategyIndex(AbstractCompactionStrategy strategy);
-
     public abstract boolean containsSSTable(SSTableReader sstable);
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/ArenaSelector.java b/src/java/org/apache/cassandra/db/compaction/ArenaSelector.java
new file mode 100644
index 000000000000..89157593f764
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/ArenaSelector.java
@@ -0,0 +1,233 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.Iterables;
+
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DiskBoundaries;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+/**
+ * Arena selector, used by UnifiedCompactionStrategy to distribute SSTables to separate compaction arenas.
+ *
+ * This is used to:
+ * - ensure that sstables that should not be compacted together (e.g. repaired with unrepaired) are separated
+ * - ensure that each disk's sstables are compacted separately
+ * - implement compaction shards, subsections of the token space which compact separately for improved parallelism
+ *   and compaction overheads.
+ */
+public class ArenaSelector implements Comparator<SSTableReader>
+{
+    private final EquivClassSplitter[] classSplitters;
+    final List<PartitionPosition> shardBoundaries;
+    final DiskBoundaries diskBoundaries;
+
+    public ArenaSelector(DiskBoundaries diskBoundaries, List<PartitionPosition> shardBoundaries)
+    {
+        this.shardBoundaries = shardBoundaries;
+        this.diskBoundaries = diskBoundaries;
+
+        ArrayList<EquivClassSplitter> ret = new ArrayList<>(2);
+
+        ret.add(RepairEquivClassSplitter.INSTANCE);
+
+        if (diskBoundaries.getPositions() != null)
+        {
+            // The shard boundaries must also split on disks. Verify it.
+            assert new HashSet<>(shardBoundaries).containsAll(diskBoundaries.getPositions());
+        }
+        else if (diskBoundaries.getNumBoundaries() > 1)
+        {
+            // We end up here if there are multiple disks, but not assigned according to token range.
+            ret.add(new DiskIndexEquivClassSplitter());
+        }
+
+        if (shardBoundaries.size() > 1)
+            ret.add(new ShardEquivClassSplitter());
+
+        classSplitters = ret.toArray(new EquivClassSplitter[0]);
+    }
+
+    @Override
+    public int compare(SSTableReader o1, SSTableReader o2)
+    {
+        int res = 0;
+        for (int i = 0; res == 0 && i < classSplitters.length; i++)
+            res = classSplitters[i].compare(o1, o2);
+        return res;
+    }
+
+    public String name(SSTableReader t)
+    {
+        return Arrays.stream(classSplitters)
+                     .map(e -> e.name(t))
+                     .collect(Collectors.joining("-"));
+    }
+
+    /**
+     * Returns the shard where this key belongs. Shards are given by their end boundaries (i.e. shard 0 covers the space
+     * between minimum and shardBoundaries[0], shard 1 is between shardBoundaries[0] and shardBoundaries[1]), thus
+     * finding the index of the first bigger boundary gives the index of the covering shard.
+     */
+    public int shardFor(DecoratedKey key)
+    {
+        return shardFor(key, shardBoundaries);
+    }
+
+    public static int shardFor(DecoratedKey key, List<PartitionPosition> shardBoundaries)
+    {
+        int pos = Collections.binarySearch(shardBoundaries, key);
+        assert pos < 0; // boundaries are .minkeybound and .maxkeybound so they should never be equal to a DecoratedKey
+        return -pos - 1;
+    }
+
+    public static int shardsSpanned(SSTableReader rdr, List<PartitionPosition> shardBoundaries)
+    {
+        if (shardBoundaries.size() <= 1)
+            return 1;
+        int startIdx = shardFor(rdr.getFirst(), shardBoundaries);
+        DecoratedKey last = rdr.getLast();
+        if (last.compareTo(shardBoundaries.get(startIdx)) < 0)
+            return 1;   // quick path, end boundary is in the same shard
+        return shardFor(last, shardBoundaries) - startIdx + 1;
+    }
+
+    public long shardAdjustedSize(SSTableReader rdr)
+    {
+        return shardAdjustedSize(rdr, shardBoundaries);
+    }
+
+    public static long shardAdjustedSize(SSTableReader rdr, List<PartitionPosition> shardBoundaries)
+    {
+        // This may need to duplicate the above to avoid the division in the happy path
+        return rdr.onDiskLength() / shardsSpanned(rdr, shardBoundaries);
+    }
+
+    public static Set<SSTableReader> sstablesFor(int boundaryIndex, List<PartitionPosition> shardBoundaries, Set<SSTableReader > sstables)
+    {
+        assert boundaryIndex < shardBoundaries.size();
+        return sstables.stream()
+                       .filter(sstable -> shardFor(sstable.getFirst(), shardBoundaries) <= boundaryIndex && shardFor(sstable.getLast(), shardBoundaries) >= boundaryIndex)
+                       .collect(Collectors.toSet());
+    }
+
+    public int compareByShardAdjustedSize(SSTableReader a, SSTableReader b)
+    {
+        return Long.compare(shardAdjustedSize(a), shardAdjustedSize(b));
+    }
+
+    /**
+     * An equivalence class splitter is a comparator that returns 0 when two sstables fall in the same class.
+     * For example, the repair status or disk index may define equivalence classes. See the concrete equivalence classes below.
+     */
+    private interface EquivClassSplitter extends Comparator<SSTableReader> {
+
+        @Override
+        int compare(SSTableReader a, SSTableReader b);
+
+        /** Return a name that describes the equivalence class */
+        String name(SSTableReader ssTableReader);
+    }
+
+    /**
+     * Split sstables by their repair state: repaired, unrepaired, pending repair with a specific UUID (one group per pending repair).
+     */
+    private static final class RepairEquivClassSplitter implements EquivClassSplitter
+    {
+        public static final EquivClassSplitter INSTANCE = new RepairEquivClassSplitter();
+
+        @Override
+        public int compare(SSTableReader a, SSTableReader b)
+        {
+            // This is the same as name(a).compareTo(name(b))
+            int af = a.isRepaired() ? 1 : !a.isPendingRepair() ? 2 : 0;
+            int bf = b.isRepaired() ? 1 : !b.isPendingRepair() ? 2 : 0;
+            if (af != 0 || bf != 0)
+                return Integer.compare(af, bf);
+            return a.getPendingRepair().compareTo(b.getPendingRepair());
+        }
+
+        @Override
+        public String name(SSTableReader ssTableReader)
+        {
+            if (ssTableReader.isRepaired())
+                return "repaired";
+            else if (!ssTableReader.isPendingRepair())
+                return "unrepaired";
+            else
+                return "pending_repair_" + ssTableReader.getPendingRepair();
+        }
+    }
+
+    /**
+     * Split sstables by their shard. If the data set size is larger than the shard size in the compaction options,
+     * then we create an equivalence class based on shard. Each sstable ends up in a shard based on its first
+     * key. Each shard is calculated by splitting the local token ranges into a number of shards, where the number
+     * of shards is calculated as ceil(data_size / shard size);
+     *
+     * Shard boundaries also split the sstables that reside on different disks.
+     */
+    private final class ShardEquivClassSplitter implements EquivClassSplitter
+    {
+        @Override
+        public int compare(SSTableReader a, SSTableReader b)
+        {
+            return Integer.compare(shardFor(a.getFirst()), shardFor(b.getFirst()));
+        }
+
+        @Override
+        public String name(SSTableReader ssTableReader)
+        {
+            return "shard_" + shardFor(ssTableReader.getFirst());
+        }
+    }
+
+    /**
+     * Group sstables by their disk index.
+     */
+    private final class DiskIndexEquivClassSplitter implements EquivClassSplitter
+    {
+        @Override
+        public int compare(SSTableReader a, SSTableReader b)
+        {
+            return Integer.compare(diskBoundaries.getDiskIndexFromKey(a), diskBoundaries.getDiskIndexFromKey(b));
+        }
+
+        @Override
+        public String name(SSTableReader ssTableReader)
+        {
+            return "disk_" + diskBoundaries.getDiskIndexFromKey(ssTableReader);
+        }
+    }
+
+
+    // TODO - missing equivalence classes:
+
+    // - by time window to emulate TWCS, in this case only the latest shard will use size based buckets, the older
+    //   shards will get major compactions
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java b/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
index d63229e1e594..ef760f85df66 100644
--- a/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
+++ b/src/java/org/apache/cassandra/db/compaction/BackgroundCompactions.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 import java.util.TreeMap;
 import java.util.UUID;
@@ -30,37 +31,53 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.ExpMovingAverage;
+import org.apache.cassandra.utils.MovingAverage;
 
 /**
  * A class for grouping the background compactions picked by a strategy, either pending or in progress.
+ *
+ * A compaction strategy has a {@link BackgroundCompactions} object as part of its state. Each
+ * {@link LegacyAbstractCompactionStrategy} instance has its own {@link BackgroundCompactions}, and their lifespans
+ * are the same. In the case of {@link UnifiedCompactionStrategy}, the new strategy instance inherits the
+ * {@link BackgroundCompactions} of its predecessor.
  */
-class BackgroundCompactions implements CompactionObserver
+public class BackgroundCompactions
 {
     private static final Logger logger = LoggerFactory.getLogger(BackgroundCompactions.class);
 
-    /** The parent strategy */
-    private final AbstractCompactionStrategy strategy;
-
     /** The table metadata */
     private final TableMetadata metadata;
 
-    /** The compaction logger */
-    private final CompactionLogger compactionLogger;
+    /** The compaction aggregates with either pending or ongoing compactions, or both. This is a private map
+     * whose access needs to be synchronized. */
+    private final TreeMap<CompactionAggregate.Key, CompactionAggregate> aggregatesMap;
 
-    /** The compaction aggregates with either pending or ongoing compactions, or both. */
-    private volatile TreeMap<Long, CompactionAggregate> aggregates = new TreeMap<>();
+    /**
+     * The current list of compaction aggregates, this list must be recreated every time the aggregates
+     * map is changed.
+     *
+     * We publish aggregates to a separate variable instead of calling {@code aggregatesMap.values()} so that reads
+     * that race with updates always observe a consistent snapshot.
+     */
+    private volatile List<CompactionAggregate> aggregates;
 
     /**  The ongoing compactions grouped by unique operation ID. */
-    private ConcurrentHashMap<UUID, CompactionPick> compactions = new ConcurrentHashMap<>();
+    private final ConcurrentHashMap<UUID, CompactionPick> compactions = new ConcurrentHashMap<>();
 
-    BackgroundCompactions(AbstractCompactionStrategy strategy, ColumnFamilyStore cfs)
-    {
-        if (cfs.getCompactionStrategyManager() == null)
-            throw new IllegalStateException("Compaction strategy manager should be set in the CFS first");
+    /**
+     * Rate of progress (per thread) of recent compactions for the CFS. Used by the UnifiedCompactionStrategy to
+     * limit the number of running compactions to no more than what is sufficient to saturate the throughput limit.
+     * This needs to be a longer-running average to ensure that the rate limiter stalling a new thread can't cause
+     * the compaction rate to temporarily drop to levels that permit an extra thread.
+     */
+    MovingAverage compactionRate = ExpMovingAverage.decayBy1000();
 
-        this.strategy = strategy;
+    BackgroundCompactions(ColumnFamilyStore cfs)
+    {
         this.metadata = cfs.metadata();
-        this.compactionLogger = cfs.getCompactionStrategyManager().compactionLogger();
+        this.aggregatesMap = new TreeMap<>();
+        this.aggregates = ImmutableList.of();
     }
 
     /**
@@ -72,7 +89,7 @@ class BackgroundCompactions implements CompactionObserver
      *
      * @param pending compaction aggregates with pending compactions
      */
-    synchronized void setPending(List<CompactionAggregate> pending)
+    synchronized void setPending(CompactionStrategy strategy, Collection<? extends CompactionAggregate> pending)
     {
         if (pending == null)
             throw new IllegalArgumentException("argument cannot be null");
@@ -81,11 +98,13 @@ synchronized void setPending(List<CompactionAggregate> pending)
             logger.trace("Resetting pending aggregates for strategy {}/{}, received {} new aggregates",
                          strategy.getName(), strategy.hashCode(), pending.size());
 
-        // First create a new map with all the pending aggregates
-        TreeMap<Long, CompactionAggregate> aggregates = new TreeMap();
+        // First remove the existing aggregates
+        aggregatesMap.clear();
+
+        // Then add all the pending aggregates
         for (CompactionAggregate aggregate : pending)
         {
-            CompactionAggregate prev = aggregates.put(aggregate.getKey(), aggregate);
+            CompactionAggregate prev = aggregatesMap.put(aggregate.getKey(), aggregate);
             if (logger.isTraceEnabled())
                 logger.trace("Adding new pending aggregate: {}", aggregate);
 
@@ -93,8 +112,8 @@ synchronized void setPending(List<CompactionAggregate> pending)
                 throw new IllegalArgumentException("Received pending aggregates with non unique keys: " + prev.getKey());
         }
 
-        // Then add the current aggregates with ongoing compactions
-        for (CompactionAggregate oldAggregate : this.aggregates.values())
+        // Then add the old aggregates but only if they have ongoing compactions
+        for (CompactionAggregate oldAggregate : this.aggregates)
         {
             Collection<CompactionPick> compacting = oldAggregate.getInProgress();
             if (compacting.isEmpty())
@@ -108,12 +127,12 @@ synchronized void setPending(List<CompactionAggregate> pending)
             // See if we have a matching aggregate in the pending aggregates, if so add all the existing compactions to it
             // otherwise strip the pending and selected compactions from the old one and keep it only with the compactions in progress
             CompactionAggregate newAggregate;
-            CompactionAggregate matchingAggregate = oldAggregate.getMatching(aggregates);
+            CompactionAggregate matchingAggregate = oldAggregate.getMatching(aggregatesMap);
             if (matchingAggregate != null)
             {
                 // add the old compactions to the new aggregate
                 // the key will change slightly for STCS so remove it before adding it again
-                aggregates.remove(matchingAggregate.getKey());
+                aggregatesMap.remove(matchingAggregate.getKey());
                 newAggregate = matchingAggregate.withAdditionalCompactions(compacting);
 
                 if (logger.isTraceEnabled())
@@ -131,21 +150,21 @@ synchronized void setPending(List<CompactionAggregate> pending)
             if (logger.isTraceEnabled())
                 logger.trace("Adding new aggregate with previous compactions {}", newAggregate);
 
-            aggregates.put(newAggregate.getKey(), newAggregate);
+            aggregatesMap.put(newAggregate.getKey(), newAggregate);
         }
 
-        // Finally publish the new aggregates
-        this.aggregates = aggregates;
+        // Publish the new aggregates
+        this.aggregates = ImmutableList.copyOf(aggregatesMap.values());
 
+        CompactionLogger compactionLogger = strategy.getCompactionLogger();
         if (compactionLogger != null && compactionLogger.enabled())
         {
+            // Only the pending task count is logged here; logging the full statistics as well was too much noise
             compactionLogger.pending(strategy, getEstimatedRemainingTasks());
-            compactionLogger.statistics(strategy, "pending", getStatistics());
         }
     }
 
-    @Override
-    public void setSubmitted(UUID id, CompactionAggregate aggregate)
+    void setSubmitted(CompactionStrategy strategy, UUID id, CompactionAggregate aggregate)
     {
         if (id == null || aggregate == null)
             throw new IllegalArgumentException("arguments cannot be null");
@@ -161,14 +180,16 @@ public void setSubmitted(UUID id, CompactionAggregate aggregate)
 
         synchronized (this)
         {
-            CompactionAggregate existingAggregate = aggregate.getMatching(aggregates);
+            CompactionAggregate existingAggregate = aggregate.getMatching(aggregatesMap);
+            boolean aggregatesMapChanged = false;
 
             if (existingAggregate == null)
             {
                 if (logger.isTraceEnabled())
                     logger.trace("Could not find aggregate for compaction using the one passed in: {}", aggregate);
 
-                aggregates.put(aggregate.getKey(), aggregate);
+                aggregatesMapChanged = true;
+                aggregatesMap.put(aggregate.getKey(), aggregate);
             }
             else
             {
@@ -179,9 +200,10 @@ public void setSubmitted(UUID id, CompactionAggregate aggregate)
                 {
                     // add the compaction just submitted to the aggregate that was found but because for STCS its
                     // key may change slightly, first remove it
-                    aggregates.remove(existingAggregate.getKey());
+                    aggregatesMapChanged = true;
+                    aggregatesMap.remove(existingAggregate.getKey());
                     CompactionAggregate newAggregate = existingAggregate.withAdditionalCompactions(ImmutableList.of(compaction));
-                    aggregates.put(newAggregate.getKey(), newAggregate);
+                    aggregatesMap.put(newAggregate.getKey(), newAggregate);
 
                     if (logger.isTraceEnabled())
                         logger.trace("Added compaction to existing aggregate: {} -> {}", existingAggregate, newAggregate);
@@ -192,18 +214,24 @@ public void setSubmitted(UUID id, CompactionAggregate aggregate)
                         logger.trace("Existing aggregate {} already had compaction", existingAggregate);
                 }
             }
+
+            // Publish the new aggregates if needed
+            if (aggregatesMapChanged)
+                this.aggregates = ImmutableList.copyOf(aggregatesMap.values());
         }
 
+        CompactionLogger compactionLogger = strategy.getCompactionLogger();
         if (compactionLogger != null && compactionLogger.enabled())
-            compactionLogger.statistics(strategy, "submitted", getStatistics());
+            compactionLogger.statistics(strategy, "submitted", getStatistics(strategy));
     }
 
-    @Override
-    public void setInProgress(CompactionProgress progress)
+    public void onInProgress(CompactionProgress progress)
     {
         if (progress == null)
             throw new IllegalArgumentException("argument cannot be null");
 
+        updateCompactionRate(progress);
+
         CompactionPick compaction = compactions.computeIfAbsent(progress.operationId(),
                                                                 uuid -> CompactionPick.create(-1, progress.inSSTables()));
 
@@ -211,8 +239,7 @@ public void setInProgress(CompactionProgress progress)
         compaction.setProgress(progress);
     }
 
-    @Override
-    public void setCompleted(UUID id)
+    public void onCompleted(CompactionStrategy strategy, UUID id)
     {
         if (id == null)
             throw new IllegalArgumentException("argument cannot be null");
@@ -221,10 +248,14 @@ public void setCompleted(UUID id)
 
         // log the statistics before completing the compaction so that we see the stats for the
         // compaction that just completed
+        CompactionLogger compactionLogger = strategy.getCompactionLogger();
         if (compactionLogger != null && compactionLogger.enabled())
-            compactionLogger.statistics(strategy, "completed", getStatistics());
+            compactionLogger.statistics(strategy, "completed", getStatistics(strategy));
 
         CompactionPick completed = compactions.remove(id);
+        // remove() returns null for an untracked id (hence the check below), so guard before reading the progress
+        updateCompactionRate(completed == null ? null : completed.progress);
+
         if (completed != null)
             completed.setCompleted();
 
@@ -232,6 +263,17 @@ public void setCompleted(UUID id)
         // called immediately (e.g. compactions disabled)
     }
 
+    private void updateCompactionRate(CompactionProgress progress)
+    {
+        if (progress != null && progress.durationInNanos() > 0 && progress.outputDiskSize() > 0) // skip samples with no progress, no elapsed time or no output
+            compactionRate.update(progress.outputDiskSize() * 1.e9 / progress.durationInNanos()); // 1.e9 ns per second: output size per second
+    }
+
+    public Collection<CompactionAggregate> getAggregates()
+    {
+        return aggregates; // the published list held in the volatile field, not the synchronized aggregatesMap
+    }
+
     /**
      * @return the number of background compactions estimated to still be needed
      */
@@ -241,11 +283,11 @@ public int getEstimatedRemainingTasks()
     }
 
     /**
-     * @return the number of compactions currently in progress
+     * @return an unmodifiable view of the compactions currently in progress, backed by the live map of ongoing compactions
      */
-    public int getCompactionsInProgress()
+    public Collection<CompactionPick> getCompactionsInProgress()
     {
-        return compactions.size();
+        return Collections.unmodifiableCollection(compactions.values());
     }
 
     /**
@@ -253,7 +295,7 @@ public int getCompactionsInProgress()
      */
     public int getTotalCompactions()
     {
-        return getCompactionsInProgress() + getEstimatedRemainingTasks();
+        return compactions.size() + getEstimatedRemainingTasks();
     }
 
     /**
@@ -261,7 +303,7 @@ public int getTotalCompactions()
      *
      * @return statistics about this compaction strategy.
      */
-    public CompactionStrategyStatistics getStatistics()
+    public CompactionStrategyStatistics getStatistics(CompactionStrategy strategy)
     {
         return CompactionAggregate.getStatistics(metadata, strategy, aggregates);
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/CleanupTask.java b/src/java/org/apache/cassandra/db/compaction/CleanupTask.java
new file mode 100644
index 000000000000..8daf43505035
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CleanupTask.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.repair.consistent.admin.CleanupSummary;
+import org.apache.cassandra.utils.Pair;
+
+/**
+ * Runs the {@link RepairFinishedCompactionTask}s collected for a set of sessions and reports, per session,
+ * whether its task ran to completion.
+ */
+public class CleanupTask
+{
+    private static final Logger logger = LoggerFactory.getLogger(CleanupTask.class);
+
+    private final ColumnFamilyStore cfs;
+    /** The tasks to run, each paired with its session id; the task of a pair may be {@code null}. */
+    private final List<Pair<UUID, RepairFinishedCompactionTask>> tasks;
+
+    public CleanupTask(ColumnFamilyStore cfs, List<Pair<UUID, RepairFinishedCompactionTask>> tasks)
+    {
+        this.cfs = cfs;
+        this.tasks = tasks;
+    }
+
+    /**
+     * Runs every task. A session is reported as unsuccessful when it has no task or when its task throws, in
+     * which case the task's transaction is aborted and the failure is logged rather than rethrown.
+     *
+     * @return a summary of the sessions whose task ran to completion and of those whose task did not
+     */
+    public CleanupSummary cleanup()
+    {
+        Set<UUID> successful = new HashSet<>();
+        Set<UUID> unsuccessful = new HashSet<>();
+        for (Pair<UUID, RepairFinishedCompactionTask> pair : tasks)
+        {
+            UUID session = pair.left;
+            RepairFinishedCompactionTask task = pair.right;
+
+            if (task != null)
+            {
+                try
+                {
+                    task.run();
+                    successful.add(session);
+                }
+                catch (Throwable t)
+                {
+                    t = task.transaction.abort(t);
+                    logger.error("Failed cleaning up " + session, t);
+                    unsuccessful.add(session);
+                }
+            }
+            else
+            {
+                unsuccessful.add(session);
+            }
+        }
+        return new CleanupSummary(cfs, successful, unsuccessful);
+    }
+
+    /**
+     * Aborts the transaction of every task without running it; pairs without a task are skipped.
+     *
+     * @param accumulate the throwable handed to the first transaction abort
+     * @return the throwable returned by the last transaction abort, or {@code accumulate} if there was none
+     */
+    public Throwable abort(Throwable accumulate)
+    {
+        for (Pair<UUID, RepairFinishedCompactionTask> pair : tasks)
+        {
+            // cleanup() accepts pairs without a task, so abort() must not dereference them blindly
+            if (pair.right != null)
+                accumulate = pair.right.transaction.abort(accumulate);
+        }
+        return accumulate;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java b/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
index 125bf68b26fc..6c7bb9f73e1d 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionAggregate.java
@@ -23,14 +23,11 @@
 import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
-import java.util.Map;
+import java.util.NavigableMap;
 import java.util.Objects;
 import java.util.Set;
 import java.util.SortedMap;
-import java.util.TreeMap;
-import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
-
 import javax.annotation.Nullable;
 
 import com.google.common.collect.ImmutableList;
@@ -41,7 +38,6 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.FBUtilities;
-import org.apache.cassandra.utils.Pair;
 
 /**
  * A compaction aggregate is either a level in {@link LeveledCompactionStrategy} or a tier (bucket) in other
@@ -56,6 +52,9 @@ public abstract class CompactionAggregate
 {
     private static final Logger logger = LoggerFactory.getLogger(CompactionAggregate.class);
 
+    /** The unique key that identifies this aggregate. */
+    final Key key;
+
     /** The sstables in this aggregate, whether they are compaction candidates or not */
     final Set<SSTableReader> sstables;
 
@@ -65,11 +64,12 @@ public abstract class CompactionAggregate
     /** The compactions that are part of this aggregate, they could be pending or in progress. */
     final LinkedHashSet<CompactionPick> compactions;
 
-    CompactionAggregate(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
+    CompactionAggregate(Key key, Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
     {
         if (sstables == null || selected == null || pending == null)
             throw new IllegalArgumentException("Arguments cannot be null");
 
+        this.key = key;
         this.sstables = new HashSet<>(); sstables.forEach(this.sstables::add);
         this.selected = selected;
 
@@ -93,6 +93,20 @@ public CompactionPick getSelected()
         return selected;
     }
 
+    /**
+     * @return the total size in bytes of all the compaction picks that have not been submitted yet (no compaction id)
+     */
+    public long getPendingBytes()
+    {
+        long ret = 0;
+        for (CompactionPick comp : compactions)
+        {
+            if (comp.id == null) // NOTE(review): picks already in progress (id set) are not counted - confirm this is intended
+                ret += comp.totSizeInBytes;
+        }
+        return ret;
+    }
+
     /**
      * @return compactions that have not yet been submitted (no compaction id).
      */
@@ -146,6 +160,72 @@ public boolean isEmpty()
      */
     public abstract CompactionAggregateStatistics getStatistics();
 
+    /**
+     * Computes the statistics common to all {@link CompactionAggregate} types from the picks not yet completed.
+     *
+     * @param trackHotness whether the hotness of the aggregate (tier/bucket) is relevant and must be summed up;
+     *                     when {@code false} the hotness is reported as {@link Double#NaN}, meaning that it was
+     *                     not calculated.
+     *
+     * @return a new {@link CompactionAggregateStatistics} instance with the statistics shared by all the types of
+     *         {@link CompactionAggregate}s (subject to the hotness caveat above).
+     */
+    CompactionAggregateStatistics getCommonStatistics(boolean trackHotness)
+    {
+        int pickCount = 0;
+        int inProgressCount = 0;
+        int candidateSSTableCount = 0;
+        int compactingSSTableCount = 0;
+        int expiredSSTableCount = 0;
+        long candidateBytes = 0;
+        long expiredBytes = 0;
+        double totalHotness = trackHotness ? 0.0 : Double.NaN;
+        long bytesRead = 0;
+        long bytesWritten = 0;
+        long elapsedNanos = 0;
+
+        for (CompactionPick pick : compactions)
+        {
+            if (pick.completed)
+                continue;
+
+            pickCount++;
+            candidateSSTableCount += pick.sstables.size();
+            expiredSSTableCount += pick.expired.size();
+            candidateBytes += pick.sstables.stream().mapToLong(SSTableReader::uncompressedLength).sum();
+            expiredBytes += pick.expired.stream().mapToLong(SSTableReader::uncompressedLength).sum();
+            if (trackHotness)
+                totalHotness += pick.hotness;
+
+            if (pick.id != null)
+            {
+                inProgressCount++;
+                compactingSSTableCount += pick.sstables.size();
+            }
+
+            if (pick.progress != null)
+            {
+                bytesRead += pick.progress.uncompressedBytesRead();
+                bytesWritten += pick.progress.uncompressedBytesWritten();
+                elapsedNanos += pick.progress.durationInNanos();
+            }
+        }
+
+        return new CompactionAggregateStatistics(pickCount,
+                                                 inProgressCount,
+                                                 sstables.size(),
+                                                 expiredSSTableCount,
+                                                 candidateSSTableCount,
+                                                 compactingSSTableCount,
+                                                 getTotSizeBytes(sstables),
+                                                 candidateBytes,
+                                                 expiredBytes,
+                                                 bytesRead,
+                                                 bytesWritten,
+                                                 elapsedNanos,
+                                                 totalHotness);
+    }
+
     /**
      * @return the number of estimated compactions that are still pending.
      */
@@ -155,9 +235,14 @@ public int numEstimatedCompactions()
     }
 
     /**
-     * @return a key that is specific to the concrete implementation, used for grouping compacting aggregates
+     * @return a key that ensures the uniqueness of an aggregate but also that allows identify future identical aggregates,
+     *         e.g. when an aggregate is merged with an older aggregate that has still ongoing compactions like a level
+     *         in LCS or a bucket in the unified strategy or STCS or a time window in TWCS
      */
-    abstract long getKey();
+    public Key getKey()
+    {
+        return key;
+    }
 
     /**
      * Return a matching aggregate from the map passed in or null. Normally this is just a matter of finding
@@ -168,7 +253,7 @@ public int numEstimatedCompactions()
      *
      * @return an aggregate with the same key or null
      */
-    @Nullable CompactionAggregate getMatching(TreeMap<Long, CompactionAggregate> others)
+    @Nullable CompactionAggregate getMatching(NavigableMap<Key, CompactionAggregate> others)
     {
         return others.get(getKey());
     }
@@ -180,13 +265,12 @@ public int numEstimatedCompactions()
      */
     protected abstract CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions);
 
-
     /**
      * Add expired sstables to the selected compaction pick and return a new compaction aggregate.
      */
     CompactionAggregate withExpired(Collection<SSTableReader> expired)
     {
-        return clone(Iterables.concat(sstables, expired), selected.withAddedSSTables(expired), compactions);
+        return clone(Iterables.concat(sstables, expired), selected.withExpiredSSTables(expired), compactions);
     }
 
     /**
@@ -245,11 +329,15 @@ public static final class Leveled extends CompactionAggregate
         /** The maximum size of each output sstable that will be produced by compaction, Long.MAX_VALUE if no maximum exists */
         final long maxSSTableBytes;
 
-        /** How many more compactions this level is expected to perform. This is required because for LCS we cannot easily identify candidate
-         * sstables to put into the pending picks.
+        /**
+         * How many more compactions this level is expected to perform. This is required because for LCS we cannot
+         * easily identify candidate sstables to put into the pending picks.
          */
         final int pendingCompactions;
 
+        /** The fanout size */
+        final int fanout;
+
         Leveled(Iterable<SSTableReader> sstables,
                 CompactionPick selected,
                 Iterable<CompactionPick> compactions,
@@ -257,77 +345,37 @@ public static final class Leveled extends CompactionAggregate
                 int nextLevel,
                 double score,
                 long maxSSTableBytes,
-                int pendingCompactions)
+                int pendingCompactions,
+                int fanout)
         {
-            super(sstables, selected, compactions);
+            super(new Key(level), sstables, selected, compactions);
 
             this.level = level;
             this.nextLevel = nextLevel;
             this.score = score;
             this.maxSSTableBytes = maxSSTableBytes;
             this.pendingCompactions = pendingCompactions;
+            this.fanout = fanout;
         }
 
         @Override
         protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
         {
-            return new Leveled(sstables, selected, compactions, level, nextLevel, score, maxSSTableBytes, pendingCompactions);
+            return new Leveled(sstables, selected, compactions, level, nextLevel, score, maxSSTableBytes, pendingCompactions, fanout);
         }
 
         @Override
         public CompactionAggregateStatistics getStatistics()
         {
-            int numCompactions = pendingCompactions;
-            int numCompactionsInProgress = 0;
-            int numCandidateSSTables = 0;
-            int numCompactingSSTables = 0;
-            long tot = 0;
-            long read = 0;
-            long readLevel = 0;
-            long written = 0;
-            long durationNanos = 0;
-
-            for (CompactionPick compaction : compactions)
-            {
-                if (compaction.completed)
-                    continue;
+            CompactionAggregateStatistics stats = getCommonStatistics(false);
 
-                numCompactions++;
-                numCandidateSSTables += compaction.sstables.size();
-                tot += compaction.sstables.stream().mapToLong(SSTableReader::uncompressedLength).reduce(0L, Long::sum);
+            long readLevel = 0L;
 
-                if (compaction.id != null)
-                {
-                    numCompactionsInProgress++;
-                    numCompactingSSTables += compaction.sstables.size();
-                }
-
-                if (compaction.progress != null)
-                {
-                    read += compaction.progress.uncompressedBytesRead();
+            for (CompactionPick compaction : compactions)
+                if (!compaction.completed && compaction.progress != null)
                     readLevel += compaction.progress.uncompressedBytesRead(level);
-                    written += compaction.progress.uncompressedBytesWritten();
-                    durationNanos += compaction.progress.durationInNanos();
-                }
-            }
 
-            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-
-            return new LeveledCompactionStatistics(level,
-                                                   score,
-                                                   numCompactions,
-                                                   numCompactionsInProgress,
-                                                   sstables.size(),
-                                                   numCandidateSSTables,
-                                                   numCompactingSSTables,
-                                                   getTotSizeBytes(sstables),
-                                                   readThroughput,
-                                                   writeThroughput,
-                                                   tot,
-                                                   read,
-                                                   readLevel,
-                                                   written);
+            return new LeveledCompactionStatistics(stats, level, score, pendingCompactions, readLevel);
         }
 
         @Override
@@ -342,12 +390,6 @@ public boolean isEmpty()
             return super.isEmpty() && pendingCompactions == 0;
         }
 
-        @Override
-        long getKey()
-        {
-            return level;
-        }
-
         @Override
         public String toString()
         {
@@ -364,7 +406,8 @@ static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
                                                      long maxSSTableBytes,
                                                      int level,
                                                      int nextLevel,
-                                                     double score)
+                                                     double score,
+                                                     int fanout)
     {
         return new Leveled(all,
                            CompactionPick.create(level, candidates),
@@ -373,7 +416,8 @@ static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
                            nextLevel,
                            score,
                            maxSSTableBytes,
-                           pendingCompactions);
+                           pendingCompactions,
+                           fanout);
     }
 
     /**
@@ -383,7 +427,8 @@ static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
                                                      int pendingCompactions,
                                                      long maxSSTableBytes,
                                                      int level,
-                                                     double score)
+                                                     double score,
+                                                     int fanout)
     {
         return new Leveled(all,
                            CompactionPick.EMPTY,
@@ -392,7 +437,8 @@ static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
                            level + 1,
                            score,
                            maxSSTableBytes,
-                           pendingCompactions);
+                           pendingCompactions,
+                           fanout);
     }
 
     /**
@@ -401,7 +447,8 @@ static CompactionAggregate.Leveled createLeveled(Collection<SSTableReader> all,
     static CompactionAggregate.Leveled createLeveledForSTCS(Collection<SSTableReader> all,
                                                             CompactionPick pick,
                                                             int pendingCompactions,
-                                                            double score)
+                                                            double score,
+                                                            int fanout)
     {
         return new Leveled(all,
                            pick,
@@ -410,7 +457,8 @@ static CompactionAggregate.Leveled createLeveledForSTCS(Collection<SSTableReader
                            0,
                            score,
                            Long.MAX_VALUE,
-                           pendingCompactions);
+                           pendingCompactions,
+                           fanout);
     }
 
     /**
@@ -440,7 +488,7 @@ public static final class SizeTiered extends CompactionAggregate
                    long minSizeBytes,
                    long maxSizeBytes)
         {
-            super(sstables, selected, pending);
+            super(new Key(avgSizeBytes), sstables, selected, pending);
 
             this.hotness = hotness;
             this.avgSizeBytes = avgSizeBytes;
@@ -457,68 +505,15 @@ protected CompactionAggregate clone(Iterable<SSTableReader> sstables, Compaction
         @Override
         public CompactionAggregateStatistics getStatistics()
         {
-            int numCompactions = 0;
-            int numCompactionsInProgress = 0;
-            int numCandidateSSTables = 0;
-            int numCompactingSSTables = 0;
-            long tot = 0;
-            long read = 0;
-            long written = 0;
-            double hotness = 0;
-            long durationNanos = 0;
-
-            for (CompactionPick compaction : compactions)
-            {
-                if (compaction.completed)
-                    continue;
-
-                numCompactions++;
-                numCandidateSSTables += compaction.sstables.size();
-                tot += compaction.sstables.stream().mapToLong(SSTableReader::uncompressedLength).reduce(0L, Long::sum);
-                hotness += compaction.hotness;
-
-                if (compaction.id != null)
-                {
-                    numCompactionsInProgress++;
-                    numCompactingSSTables += compaction.sstables.size();
-                }
-
-                if (compaction.progress != null)
-                {
-                    read += compaction.progress.uncompressedBytesRead();
-                    written += compaction.progress.uncompressedBytesWritten();
-                    durationNanos += compaction.progress.durationInNanos();
-                }
-            }
-
-            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+            CompactionAggregateStatistics stats = getCommonStatistics(true);
 
-            return new SizeTieredCompactionStatistics(avgSizeBytes,
-                                                      hotness,
-                                                      numCompactions,
-                                                      numCompactionsInProgress,
-                                                      sstables.size(),
-                                                      numCandidateSSTables,
-                                                      numCompactingSSTables,
-                                                      getTotSizeBytes(sstables),
-                                                      readThroughput,
-                                                      writeThroughput,
-                                                      tot,
-                                                      read,
-                                                      written);
+            return new SizeTieredCompactionStatistics(stats, avgSizeBytes);
         }
 
         @Override
-        long getKey()
+        @Nullable CompactionAggregate getMatching(NavigableMap<Key, CompactionAggregate> others)
         {
-            return avgSizeBytes;
-        }
-
-        @Override
-        @Nullable CompactionAggregate getMatching(TreeMap<Long, CompactionAggregate> others)
-        {
-            SortedMap<Long, CompactionAggregate> subMap = others.subMap(minSizeBytes, maxSizeBytes);
+            SortedMap<Key, CompactionAggregate> subMap = others.subMap(new Key(minSizeBytes), new Key(maxSizeBytes));
             if (subMap.isEmpty())
             {
                 if (logger.isTraceEnabled())
@@ -533,11 +528,11 @@ long getKey()
                              subMap.size(),
                              FBUtilities.prettyPrintMemory(avgSizeBytes));
 
-            Long closest = null;
+            Key closest = null;
             long minDiff = 0;
-            for (Long m : subMap.keySet())
+            for (Key m : subMap.keySet())
             {
-                long diff = Math.abs(m - avgSizeBytes);
+                long diff = Math.abs(m.index - avgSizeBytes);
                 if (closest == null || diff < minDiff)
                 {
                     closest = m;
@@ -548,7 +543,7 @@ long getKey()
             if (logger.isTraceEnabled())
                 logger.trace("Using closest matching aggregate for {}: {}",
                              FBUtilities.prettyPrintMemory(avgSizeBytes),
-                             FBUtilities.prettyPrintMemory(closest));
+                             FBUtilities.prettyPrintMemory(closest != null ? closest.index : -1));
 
             return others.get(closest);
         }
@@ -586,7 +581,7 @@ public static final class TimeTiered extends CompactionAggregate
 
         TimeTiered(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending, long timestamp)
         {
-            super(sstables, selected, pending);
+            super(new Key(timestamp), sstables, selected, pending);
             this.timestamp = timestamp;
         }
 
@@ -599,62 +594,8 @@ protected CompactionAggregate clone(Iterable<SSTableReader> sstables, Compaction
         @Override
         public CompactionAggregateStatistics getStatistics()
         {
-            int numCompactions = 0;
-            int numCompactionsInProgress = 0;
-            int numCandidateSSTables = 0;
-            int numCompactingSSTables = 0;
-            long tot = 0;
-            long read = 0;
-            long written = 0;
-            double hotness = 0;
-            long durationNanos = 0;
-
-            for (CompactionPick compaction : compactions)
-            {
-                if (compaction.completed)
-                    continue;
-
-                numCompactions++;
-                numCandidateSSTables += compaction.sstables.size();
-                tot += compaction.sstables.stream().mapToLong(SSTableReader::uncompressedLength).reduce(0L, Long::sum);
-                hotness += compaction.hotness;
-
-                if (compaction.id != null)
-                {
-                    numCompactionsInProgress++;
-                    numCompactingSSTables += compaction.sstables.size();
-                }
-
-                if (compaction.progress != null)
-                {
-                    read += compaction.progress.uncompressedBytesRead();
-                    written += compaction.progress.uncompressedBytesWritten();
-                    durationNanos += compaction.progress.durationInNanos();
-                }
-            }
-
-            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-
-            return new TimeTieredCompactionStatistics(timestamp,
-                                                      hotness,
-                                                      numCompactions,
-                                                      numCompactionsInProgress,
-                                                      sstables.size(),
-                                                      numCandidateSSTables,
-                                                      numCompactingSSTables,
-                                                      getTotSizeBytes(sstables),
-                                                      readThroughput,
-                                                      writeThroughput,
-                                                      tot,
-                                                      read,
-                                                      written);
-        }
-
-        @Override
-        long getKey()
-        {
-            return timestamp;
+            CompactionAggregateStatistics stats = getCommonStatistics(true);
+            return new TimeTieredCompactionStatistics(stats, timestamp);
         }
 
         @Override
@@ -674,70 +615,120 @@ static CompactionAggregate createTimeTiered(Collection<SSTableReader> sstables,
         return new TimeTiered(sstables, selected, pending, timestamp);
     }
 
-    /** An aggregate that is created for a compaction issued only to drop tombstones */
-    public static final class TombstoneAggregate extends CompactionAggregate
+    public static final class UnifiedAggregate extends CompactionAggregate
     {
-        TombstoneAggregate(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
+        /** The shard to which this bucket belongs */
+        private final UnifiedCompactionStrategy.Shard shard;
+
+        /** The bucket generated by the compaction strategy */
+        private final UnifiedCompactionStrategy.Bucket bucket;
+
+        UnifiedAggregate(Iterable<SSTableReader> sstables,
+                         CompactionPick selected,
+                         Iterable<CompactionPick> pending,
+                         UnifiedCompactionStrategy.Shard shard,
+                         UnifiedCompactionStrategy.Bucket bucket)
+        {
+            super(new ShardedKey(shard, bucket.index), sstables, selected, pending);
+            this.shard = shard;
+            this.bucket = bucket;
+        }
+
+        public UnifiedCompactionStrategy.Shard getShard()
         {
-            super(sstables, selected, pending);
+            return shard;
+        }
+
+        @Override
+        public CompactionAggregateStatistics getStatistics()
+        {
+            CompactionAggregateStatistics stats = getCommonStatistics(false);
+
+            return new UnifiedCompactionStatistics(stats,
+                                                   bucket.index,
+                                                   bucket.survivalFactor,
+                                                   bucket.scalingParameter,
+                                                   bucket.threshold,
+                                                   bucket.fanout,
+                                                   bucket.min,
+                                                   bucket.max,
+                                                   shard.name());
         }
 
         @Override
         protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
         {
-            return new TombstoneAggregate(sstables, selected, compactions);
+            return new UnifiedAggregate(sstables, selected, compactions, shard, bucket);
+        }
+
+        int bucketIndex()
+        {
+            return bucket.index;
         }
 
         @Override
-        public CompactionAggregateStatistics getStatistics()
+        public String toString()
         {
-            int numCompactions = 0;
-            int numCompactionsInProgress = 0;
-            int numCandidateSSTables = 0;
-            int numCompactingSSTables = 0;
-            long read = 0;
-            long written = 0;
-            long durationNanos = 0;
+            return String.format("Unified shard %s bucket %d with %d sstables and %d compactions",
+                                 shard.name(),
+                                 bucket.index,
+                                 sstables.size(),
+                                 compactions.size());
+        }
 
-            for (CompactionPick compaction : compactions)
-            {
-                if (compaction.completed)
-                    continue;
+        @Override
+        public boolean equals(Object obj)
+        {
+            if (obj == this)
+                return true;
+
+            if (!(obj instanceof UnifiedAggregate))
+                return false;
+
+            UnifiedAggregate that = (UnifiedAggregate) obj;
+            return sstables.equals(that.sstables) &&
+                   selected.equals(that.selected) &&
+                   compactions.equals(that.compactions) &&
+                   bucket.equals(that.bucket) &&
+                   shard.equals(that.shard);
+        }
 
-                numCompactions++;
-                numCandidateSSTables += compaction.sstables.size();
+        @Override
+        public int hashCode()
+        {
+            return Objects.hash(sstables, selected, compactions, bucket, shard);
+        }
+    }
 
-                if (compaction.id  != null)
-                {
-                    numCompactionsInProgress++;
-                    numCompactingSSTables += compaction.sstables.size();
-                }
+    static UnifiedAggregate createUnified(Collection<SSTableReader> sstables,
+                                          CompactionPick selected,
+                                          Iterable<CompactionPick> pending,
+                                          UnifiedCompactionStrategy.Shard shard,
+                                          UnifiedCompactionStrategy.Bucket bucket)
+    {
+        return new UnifiedAggregate(sstables, selected, pending, shard, bucket);
+    }
 
-                if (compaction.progress != null)
-                {
-                    read += compaction.progress.uncompressedBytesRead();
-                    written += compaction.progress.uncompressedBytesWritten();
-                    durationNanos += compaction.progress.durationInNanos();
-                }
-            }
 
-            double readThroughput = durationNanos == 0 ? 0 : ((double) read / durationNanos) * TimeUnit.SECONDS.toNanos(1);
-            double writeThroughput = durationNanos == 0 ? 0 : ((double) written / durationNanos) * TimeUnit.SECONDS.toNanos(1);
 
-            return new CompactionAggregateStatistics(numCompactions,
-                                                     numCompactionsInProgress,
-                                                     sstables.size(),
-                                                     numCandidateSSTables,
-                                                     numCompactingSSTables,
-                                                     getTotSizeBytes(sstables),
-                                                     readThroughput,
-                                                     writeThroughput);
+    /** An aggregate that is created for a compaction issued only to drop tombstones */
+    public static final class TombstoneAggregate extends CompactionAggregate
+    {
+        TombstoneAggregate(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> pending)
+        {
+            super(new Key(-1), sstables, selected, pending);
+        }
+
+        @Override
+        protected CompactionAggregate clone(Iterable<SSTableReader> sstables, CompactionPick selected, Iterable<CompactionPick> compactions)
+        {
+            return new TombstoneAggregate(sstables, selected, compactions);
         }
 
         @Override
-        long getKey()
+        public CompactionAggregateStatistics getStatistics()
         {
-            return -1; // Tombstone compactions are the only ones with negative keys so they will be matched by a unique aggregate
+            return getCommonStatistics(false);
         }
 
         @Override
@@ -754,6 +745,56 @@ static CompactionAggregate createForTombstones(SSTableReader sstable)
         return new TombstoneAggregate(sstables, comp, ImmutableList.of());
     }
 
+    /**
+     * A key suitable for a strategy that has no shards, i.e. a legacy strategy that is
+     * managed by CompactionStrategyManager. Such keys are ordered by their index alone.
+     */
+    public static class Key implements Comparable<Key>
+    {
+        protected final long index;
+
+        Key(long index)
+        {
+            this.index = index;
+        }
+
+        @Override
+        public int compareTo(Key key)
+        {
+            return Long.compare(index, key.index);
+        }
+    }
+
+    /**
+     * A key suitable for a strategy using shards: it compares first by shard and then by bucket index.
+     */
+    private static final class ShardedKey extends Key
+    {
+        private final UnifiedCompactionStrategy.Shard shard;
+
+        ShardedKey(UnifiedCompactionStrategy.Shard shard, long index)
+        {
+            super(index);
+            this.shard = shard;
+        }
+
+        @Override
+        public int compareTo(Key key)
+        {
+            if (key instanceof ShardedKey)
+            {
+                ShardedKey shardedKey = (ShardedKey) key;
+
+                int ret = shard.compareTo(shardedKey.shard);
+                if (ret != 0)
+                    return ret;
+            }
+
+            // either not sharded or same shard
+            return Long.compare(index, key.index);
+        }
+    }
+
     /**
      * Return the compaction statistics for this strategy and list of compactions that are either pending or in progress.
      *
@@ -762,17 +803,15 @@ static CompactionAggregate createForTombstones(SSTableReader sstable)
      * @return the statistics about this compactions
      */
     static CompactionStrategyStatistics getStatistics(TableMetadata metadata,
-                                                      AbstractCompactionStrategy strategy,
-                                                      Map<Long, CompactionAggregate> aggregates)
+                                                      CompactionStrategy strategy,
+                                                      Collection<CompactionAggregate> aggregates)
     {
-        List<Pair<Long, CompactionAggregateStatistics>> statistcs = new ArrayList<>(aggregates.size());
+        List<CompactionAggregateStatistics> statistics = new ArrayList<>(aggregates.size());
 
-        for (CompactionAggregate comp : aggregates.values())
-            statistcs.add(Pair.create(comp.getKey(), comp.getStatistics()));
+        for (CompactionAggregate aggregate : aggregates)
+            statistics.add(aggregate.getStatistics());
 
-        return new CompactionStrategyStatistics(metadata,
-                                                strategy.getClass().getSimpleName(),
-                                                statistcs.stream().map(p -> p.right).collect(Collectors.toList()));
+        return new CompactionStrategyStatistics(metadata, strategy.getClass().getSimpleName(), statistics);
     }
 
     /**
@@ -781,10 +820,10 @@ static CompactionStrategyStatistics getStatistics(TableMetadata metadata,
      *
      * @return the number of compactions that are still pending (net yet submitted)
      */
-    static int numEstimatedCompactions(Map<Long, CompactionAggregate> aggregates)
+    static int numEstimatedCompactions(Collection<CompactionAggregate> aggregates)
     {
         int ret = 0;
-        for (CompactionAggregate aggregate : aggregates.values())
+        for (CompactionAggregate aggregate : aggregates)
             ret += aggregate.numEstimatedCompactions();
 
         return ret;
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
index 60b3439e99c8..95acf6e0cdc8 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionAggregateStatistics.java
@@ -20,6 +20,7 @@
 
 import java.io.Serializable;
 import java.util.Collection;
+import java.util.concurrent.TimeUnit;
 
 import com.google.common.collect.ImmutableList;
 
@@ -38,49 +39,103 @@
  */
 public class CompactionAggregateStatistics implements Serializable
 {
-    protected static final Collection<String> HEADER = ImmutableList.of("Tot. sstables", "Size (bytes)", "Compactions", "Comp. Sstables", "Read (bytes/sec)", "Write (bytes/sec)");
+    public static final String NO_SHARD = "";
 
+    protected static final Collection<String> HEADER = ImmutableList.of("Tot. SSTables",
+                                                                        "Tot. size (bytes)",
+                                                                        "Compactions",
+                                                                        "Comp. SSTables",
+                                                                        "Read (bytes/sec)",
+                                                                        "Write (bytes/sec)",
+                                                                        "Tot. comp. size/Read/Written (bytes)");
     /** The number of compactions that are either pending or in progress */
-    private final int numCompactions;
+    protected final int numCompactions;
 
     /** The number of compactions that are in progress */
-    private final int numCompactionsInProgress;
+    protected final int numCompactionsInProgress;
 
     /** The total number of sstables, whether they need compacting or not */
-    private final int numSSTables;
+    protected final int numSSTables;
+
+    /** The total number of expired sstables */
+    protected final int numExpiredSSTables;
 
     /** The number of sstables that are compaction candidates */
-    private final int numCandidateSSTables;
+    protected final int numCandidateSSTables;
 
     /** The number of sstables that are currently compacting */
-    private final int numCompactingSSTables;
+    protected final int numCompactingSSTables;
 
     /** The size in bytes (on disk) of the total sstables */
-    private final long sizeInBytes;
+    protected final long sizeInBytes;
+
+    /** The total uncompressed size of the sstables selected for compaction */
+    protected final long totBytesToCompact;
+
+    /** The total uncompressed size of the expired sstables that are going to be dropped during compaction */
+    protected final long totalBytesToDrop;
+
+    /** The number of bytes read so far for the compactions here - read throughput is calculated based on this */
+    protected final long readBytes;
+
+    /** The number of bytes written so far for the compactions here - write throughput is calculated based on this */
+    protected final long writtenBytes;
 
     /** The read throughput in bytes per second */
-    private final double readThroughput;
+    protected final double readThroughput;
 
     /** The write throughput in bytes per second */
-    private final double writeThroughput;
+    protected final double writeThroughput;
+
+    /** The hotness of this aggregate (where applicable) */
+    protected final double hotness;
 
     CompactionAggregateStatistics(int numCompactions,
                                   int numCompactionsInProgress,
                                   int numSSTables,
+                                  int numExpiredSSTables,
                                   int numCandidateSSTables,
                                   int numCompactingSSTables,
                                   long sizeInBytes,
-                                  double readThroughput,
-                                  double writeThroughput)
+                                  long totBytesToCompact,
+                                  long totBytesToDrop,
+                                  long readBytes,
+                                  long writtenBytes,
+                                  long durationNanos,
+                                  double hotness)
     {
         this.numCompactions = numCompactions;
         this.numCompactionsInProgress = numCompactionsInProgress;
         this.numCandidateSSTables = numCandidateSSTables;
         this.numCompactingSSTables = numCompactingSSTables;
         this.numSSTables = numSSTables;
+        this.numExpiredSSTables = numExpiredSSTables;
         this.sizeInBytes = sizeInBytes;
-        this.readThroughput = readThroughput;
-        this.writeThroughput = writeThroughput;
+        this.totBytesToCompact = totBytesToCompact;
+        this.totalBytesToDrop = totBytesToDrop;
+        this.readBytes = readBytes;
+        this.writtenBytes = writtenBytes;
+        this.readThroughput = durationNanos == 0 ? 0 : ((double) readBytes / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+        this.writeThroughput = durationNanos == 0 ? 0 : ((double) writtenBytes / durationNanos) * TimeUnit.SECONDS.toNanos(1);
+        this.hotness = hotness;
+    }
+
+    CompactionAggregateStatistics(CompactionAggregateStatistics base)
+    {
+        this.numCompactions = base.numCompactions;
+        this.numCompactionsInProgress = base.numCompactionsInProgress;
+        this.numCandidateSSTables = base.numCandidateSSTables;
+        this.numCompactingSSTables = base.numCompactingSSTables;
+        this.numExpiredSSTables = base.numExpiredSSTables;
+        this.numSSTables = base.numSSTables;
+        this.sizeInBytes = base.sizeInBytes;
+        this.totBytesToCompact = base.totBytesToCompact;
+        this.totalBytesToDrop = base.totalBytesToDrop;
+        this.readBytes = base.readBytes;
+        this.writtenBytes = base.writtenBytes;
+        this.readThroughput = base.readThroughput;
+        this.writeThroughput = base.writeThroughput;
+        this.hotness = base.hotness;
     }
 
     /** The number of compactions that are either pending or in progress */
@@ -138,6 +193,41 @@ public double writeThroughput()
         return writeThroughput;
     }
 
+    /** The total uncompressed size of the sstables selected for compaction */
+    @JsonProperty
+    public long tot()
+    {
+        return totBytesToCompact;
+    }
+
+    /** The number of bytes read so far for the compactions here - read throughput is calculated based on this */
+    @JsonProperty
+    public long read()
+    {
+        return readBytes;
+    }
+
+    /** The number of bytes written so far for the compactions here - write throughput is calculated based on this */
+    @JsonProperty
+    public long written()
+    {
+        return writtenBytes;
+    }
+
+    /** The hotness of this aggregate (where applicable) */
+    @JsonProperty
+    public double hotness()
+    {
+        return hotness;
+    }
+
+    /** The name of the shard, empty if the compaction is not sharded (the default). */
+    @JsonProperty
+    public String shard()
+    {
+        return NO_SHARD;
+    }
+
     @Override
     public String toString()
     {
@@ -156,7 +246,8 @@ protected Collection<String> data()
                                         Integer.toString(numCompactions()) + '/' + numCompactionsInProgress(),
                                         Integer.toString(numCandidateSSTables()) + '/' + numCompactingSSTables(),
                                 prettyPrintMemoryPerSecond((long) readThroughput()),
-                                prettyPrintMemoryPerSecond((long) writeThroughput()));
+                                prettyPrintMemoryPerSecond((long) writeThroughput()),
+                                prettyPrintMemory(totBytesToCompact) + '/' + prettyPrintMemory(readBytes) + '/' + prettyPrintMemory(writtenBytes));
     }
 
     protected String toString(long value)
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
index 6d98c1b542f6..2c3b4a9972a9 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java
@@ -68,8 +68,7 @@ protected CompactionController(ColumnFamilyStore cfs, int maxValue)
 
     public CompactionController(ColumnFamilyStore cfs, Set<SSTableReader> compacting, int gcBefore)
     {
-        this(cfs, compacting, gcBefore, null,
-             cfs.getCompactionStrategyManager().getCompactionParams().tombstoneOption());
+        this(cfs, compacting, gcBefore, null, cfs.getCompactionParams().tombstoneOption());
     }
 
     public CompactionController(ColumnFamilyStore cfs, Set<SSTableReader> compacting, int gcBefore, RateLimiter limiter, TombstoneOption tombstoneOption)
@@ -165,7 +164,7 @@ public static Set<SSTableReader> getFullyExpiredSSTables(ColumnFamilyStore cfSto
         if (NEVER_PURGE_TOMBSTONES || compacting == null || cfStore.getNeverPurgeTombstones())
             return Collections.<SSTableReader>emptySet();
 
-        if (cfStore.getCompactionStrategyManager().onlyPurgeRepairedTombstones() && !Iterables.all(compacting, SSTableReader::isRepaired))
+        if (cfStore.onlyPurgeRepairedTombstones() && !Iterables.all(compacting, SSTableReader::isRepaired))
             return Collections.emptySet();
 
         if (ignoreOverlaps)
@@ -173,11 +172,11 @@ public static Set<SSTableReader> getFullyExpiredSSTables(ColumnFamilyStore cfSto
             Set<SSTableReader> fullyExpired = new HashSet<>();
             for (SSTableReader candidate : compacting)
             {
-                if (candidate.getSSTableMetadata().maxLocalDeletionTime < gcBefore)
+                if (candidate.getMaxLocalDeletionTime() < gcBefore)
                 {
                     fullyExpired.add(candidate);
                     logger.trace("Dropping overlap ignored expired SSTable {} (maxLocalDeletionTime={}, gcBefore={})",
-                                 candidate, candidate.getSSTableMetadata().maxLocalDeletionTime, gcBefore);
+                                 candidate, candidate.getMaxLocalDeletionTime(), gcBefore);
                 }
             }
             return fullyExpired;
@@ -190,13 +189,13 @@ public static Set<SSTableReader> getFullyExpiredSSTables(ColumnFamilyStore cfSto
         {
             // Overlapping might include fully expired sstables. What we care about here is
             // the min timestamp of the overlapping sstables that actually contain live data.
-            if (sstable.getSSTableMetadata().maxLocalDeletionTime >= gcBefore)
+            if (sstable.getMaxLocalDeletionTime() >= gcBefore)
                 minTimestamp = Math.min(minTimestamp, sstable.getMinTimestamp());
         }
 
         for (SSTableReader candidate : compacting)
         {
-            if (candidate.getSSTableMetadata().maxLocalDeletionTime < gcBefore)
+            if (candidate.getMaxLocalDeletionTime() < gcBefore)
                 candidates.add(candidate);
             else
                 minTimestamp = Math.min(minTimestamp, candidate.getMinTimestamp());
@@ -221,7 +220,7 @@ public static Set<SSTableReader> getFullyExpiredSSTables(ColumnFamilyStore cfSto
             else
             {
                logger.trace("Dropping expired SSTable {} (maxLocalDeletionTime={}, gcBefore={})",
-                        candidate, candidate.getSSTableMetadata().maxLocalDeletionTime, gcBefore);
+                        candidate, candidate.getMaxLocalDeletionTime(), gcBefore);
             }
         }
         return new HashSet<>(candidates);
@@ -296,7 +295,7 @@ public void close()
 
     public boolean compactingRepaired()
     {
-        return !cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones() || compactingRepaired;
+        return !cfs.onlyPurgeRepairedTombstones() || compactingRepaired;
     }
 
     boolean provideTombstoneSources()
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
index 377f770c6fc1..b247480bcdce 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java
@@ -334,7 +334,7 @@ private class Purger extends PurgeFunction
         private Purger(AbstractCompactionController controller, int nowInSec)
         {
             super(nowInSec, controller.gcBefore, controller.compactingRepaired() ? Integer.MAX_VALUE : Integer.MIN_VALUE,
-                  controller.cfs.getCompactionStrategyManager().onlyPurgeRepairedTombstones(),
+                  controller.cfs.onlyPurgeRepairedTombstones(),
                   controller.cfs.metadata.get().enforceStrictLiveness());
             this.controller = controller;
         }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java b/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
index 1455d8b3b93e..310e6148842d 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionLogger.java
@@ -21,17 +21,20 @@
 import java.io.Closeable;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
-import java.lang.ref.WeakReference;
-import java.nio.file.*;
-import java.text.DateFormat;
-import java.text.SimpleDateFormat;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.time.Instant;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
 import java.util.Collection;
-import java.util.Date;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.function.Consumer;
@@ -47,8 +50,9 @@
 import com.fasterxml.jackson.databind.node.ArrayNode;
 import com.fasterxml.jackson.databind.node.JsonNodeFactory;
 import com.fasterxml.jackson.databind.node.ObjectNode;
-import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.JVMStabilityInspector;
 import org.apache.cassandra.utils.NoSpamLogger;
 import org.apache.cassandra.utils.Throwables;
@@ -59,27 +63,9 @@
  */
 public class CompactionLogger
 {
-    private static final DateFormat dateFormatter = new SimpleDateFormat("HH:mm:ss.SSS");
-
-    public interface Strategy
-    {
-        JsonNode sstable(SSTableReader sstable);
-
-        JsonNode options();
-
-        static Strategy none = new Strategy()
-        {
-            public JsonNode sstable(SSTableReader sstable)
-            {
-                return null;
-            }
-
-            public JsonNode options()
-            {
-                return null;
-            }
-        };
-    }
+    private static final DateTimeFormatter dateFormatter = DateTimeFormatter
+                                                           .ofPattern("yyyy-MM-dd' 'HH:mm:ss.SSS")
+                                                           .withZone(ZoneId.systemDefault() );
 
     /**
      * This will produce the compaction strategy's starting information.
@@ -121,147 +107,124 @@ public interface Writer extends Closeable
         void close();
     }
 
-    private interface CompactionStrategyAndTableFunction
-    {
-        JsonNode apply(AbstractCompactionStrategy strategy, SSTableReader sstable);
-    }
-
     private static final JsonNodeFactory json = JsonNodeFactory.instance;
     private static final Logger logger = LoggerFactory.getLogger(CompactionLogger.class);
 
-    private static final ExecutorService loggerService = Executors.newFixedThreadPool(1);
+    private static final ExecutorService loggerService = Executors.newFixedThreadPool(1, new NamedThreadFactory("compaction-logger"));
     private static final Writer jsonWriter = new CompactionLogSerializer("compaction", "log", loggerService);
 
-    private final WeakReference<ColumnFamilyStore> cfsRef;
-    private final WeakReference<CompactionStrategyManager> csmRef;
+    private final String keyspace;
+    private final String table;
     private final AtomicInteger identifier = new AtomicInteger(0);
-    private final Map<AbstractCompactionStrategy, String> compactionStrategyMapping = new MapMaker().weakKeys().makeMap();
-    private final Map<AbstractCompactionStrategy, Writer> csvWriters = new MapMaker().makeMap();
+    private final Map<CompactionStrategy, String> compactionStrategyMapping = new MapMaker().weakKeys().makeMap();
+    private final Map<CompactionStrategy, Map<String, Writer>> csvWriters = new MapMaker().makeMap();
     private final AtomicBoolean enabled = new AtomicBoolean(false);
 
-    CompactionLogger(ColumnFamilyStore cfs, CompactionStrategyManager csm)
+    CompactionLogger(TableMetadata metadata)
     {
-        csmRef = new WeakReference<>(csm);
-        cfsRef = new WeakReference<>(cfs);
+        this.keyspace = metadata.keyspace;
+        this.table = metadata.name;
+    }
+
+    void strategyCreated(CompactionStrategy strategy)
+    {
+        compactionStrategyMapping.computeIfAbsent(strategy, s -> String.valueOf(identifier.getAndIncrement()));
     }
 
     /**
-     * Visit all the strategies in the {@link CompactionStrategyManager} reference, if available.
+     * Visit all the strategies.
      *
      * @param consumer a consumer function that receives all the strategies one by one
      */
-    private void visitStrategies(Consumer<AbstractCompactionStrategy> consumer)
+    private void visitStrategies(Consumer<CompactionStrategy> consumer)
     {
-        CompactionStrategyManager csm = csmRef.get();
-        if (csm == null)
-            return;
-        csm.getStrategies()
-           .forEach(l -> l.forEach(consumer));
+        compactionStrategyMapping.keySet().forEach(consumer);
     }
 
     /**
-     * Rely on {@link this#visitStrategies(Consumer)} to visit all the strategies in the {@link CompactionStrategyManager}
-     * reference and add the properties extracted by the function passed in to a json node that is returned.
+     * Rely on {@link #visitStrategies(Consumer)} to visit all the strategies
+     * and add the properties extracted by the function passed in to a json node that is returned.
      *
      * @param select a function that given a strategy returns a json node
      *
      * @return a json node containing information on all the strategies returned by the strategy manager and the function passed in.
      */
-    private ArrayNode getStrategiesJsonNode(Function<AbstractCompactionStrategy, JsonNode> select)
+    private ArrayNode getStrategiesJsonNode(Function<CompactionStrategy, JsonNode> select)
     {
         ArrayNode node = json.arrayNode();
         visitStrategies(acs -> node.add(select.apply(acs)));
         return node;
     }
 
-    private ArrayNode sstableMap(Collection<SSTableReader> sstables, CompactionStrategyAndTableFunction csatf)
+    private ArrayNode sstableMap(Collection<SSTableReader> sstables)
     {
-        CompactionStrategyManager csm = csmRef.get();
         ArrayNode node = json.arrayNode();
-        if (csm == null)
-            return node;
-        sstables.forEach(t -> node.add(csatf.apply(csm.getCompactionStrategyFor(t), t)));
+        sstables.forEach(t -> node.add(describeSSTable(t)));
         return node;
     }
 
-    private String getId(AbstractCompactionStrategy strategy)
+    private String getId(CompactionStrategy strategy)
     {
-        return compactionStrategyMapping.computeIfAbsent(strategy, s -> String.valueOf(identifier.getAndIncrement()));
+        return compactionStrategyMapping.getOrDefault(strategy, "-1"); // there should always be a strategy because of strategyCreated()
     }
 
-    private JsonNode formatSSTables(AbstractCompactionStrategy strategy)
+    private JsonNode formatSSTables(CompactionStrategy strategy)
     {
         ArrayNode node = json.arrayNode();
-        CompactionStrategyManager csm = csmRef.get();
-        ColumnFamilyStore cfs = cfsRef.get();
-        if (csm == null || cfs == null)
-            return node;
-        for (SSTableReader sstable : cfs.getLiveSSTables())
-        {
-            if (csm.getCompactionStrategyFor(sstable) == strategy)
-                node.add(formatSSTable(strategy, sstable));
-        }
+        for (SSTableReader sstable : strategy.getSSTables())
+            node.add(formatSSTable(sstable));
+
         return node;
     }
 
-    private JsonNode formatSSTable(AbstractCompactionStrategy strategy, SSTableReader sstable)
+    private JsonNode formatSSTable(SSTableReader sstable)
     {
         ObjectNode node = json.objectNode();
         node.put("generation", sstable.descriptor.generation);
         node.put("version", sstable.descriptor.version.getVersion());
         node.put("size", sstable.onDiskLength());
-        JsonNode logResult = strategy.strategyLogger().sstable(sstable);
-        if (logResult != null)
-            node.set("details", logResult);
+
+        // The details are only relevant or available for some strategies, e.g. LCS or Date tiered, but
+        // it doesn't hurt to log them all the time in order to simplify things
+        ObjectNode details = json.objectNode();
+        details.put("level", sstable.getSSTableLevel());
+        details.put("min_token", sstable.first.getToken().toString());
+        details.put("max_token", sstable.last.getToken().toString());
+        details.put("min_timestamp", sstable.getMinTimestamp());
+        details.put("max_timestamp", sstable.getMaxTimestamp());
+
+        node.set("details", details);
+
         return node;
     }
 
-    private JsonNode getStrategyDetails(AbstractCompactionStrategy strategy)
+    private JsonNode getStrategyDetails(CompactionStrategy strategy)
     {
         ObjectNode node = json.objectNode();
-        CompactionStrategyManager csm = csmRef.get();
-        if (csm == null)
-            return node;
         node.put("strategyId", getId(strategy));
         node.put("type", strategy.getName());
         node.set("tables", formatSSTables(strategy));
-        node.put("repaired", csm.isRepaired(strategy));
-        List<String> folders = csm.getStrategyFolders(strategy);
-        ArrayNode folderNode = json.arrayNode();
-        for (String folder : folders)
-        {
-            folderNode.add(folder);
-        }
-        node.set("folders", folderNode);
-
-        JsonNode logResult = strategy.strategyLogger().options();
-        if (logResult != null)
-            node.set("options", logResult);
         return node;
     }
 
-    private JsonNode getStrategyId(AbstractCompactionStrategy strategy)
+    private JsonNode getStrategyId(CompactionStrategy strategy)
     {
         ObjectNode node = json.objectNode();
         node.put("strategyId", getId(strategy));
         return node;
     }
 
-    private JsonNode describeSSTable(AbstractCompactionStrategy strategy, SSTableReader sstable)
+    private JsonNode describeSSTable(SSTableReader sstable)
     {
         ObjectNode node = json.objectNode();
-        node.put("strategyId", getId(strategy));
-        node.set("table", formatSSTable(strategy, sstable));
+        node.set("table", formatSSTable(sstable));
         return node;
     }
 
     private void maybeAddSchemaAndTimeInfo(ObjectNode node)
     {
-        ColumnFamilyStore cfs = cfsRef.get();
-        if (cfs == null)
-            return;
-        node.put("keyspace", cfs.getKeyspaceName());
-        node.put("table", cfs.getTableName());
+        node.put("keyspace", keyspace);
+        node.put("table", table);
         node.put("time", System.currentTimeMillis());
     }
 
@@ -292,7 +255,7 @@ public void disable()
             node.set("strategies", getStrategiesJsonNode(this::getStrategyId));
             jsonWriter.write(node, this::getEventJsonNode, this);
 
-            visitStrategies(strategy -> csvWriters.computeIfPresent(strategy, (s, w) -> { w.close(); return null; }));
+            visitStrategies(strategy -> csvWriters.computeIfPresent(strategy, (s, writers) -> { writers.values().forEach(Writer::close); return null; }));
         }
     }
 
@@ -308,7 +271,7 @@ public void flush(Collection<SSTableReader> sstables)
             ObjectNode node = json.objectNode();
             node.put("type", "flush");
             maybeAddSchemaAndTimeInfo(node);
-            node.set("tables", sstableMap(sstables, this::describeSSTable));
+            node.set("tables", sstableMap(sstables));
             jsonWriter.write(node, this::getEventJsonNode, this);
         }
     }
@@ -322,13 +285,13 @@ public void compaction(long startTime, Collection<SSTableReader> input, long end
             maybeAddSchemaAndTimeInfo(node);
             node.put("start", String.valueOf(startTime));
             node.put("end", String.valueOf(endTime));
-            node.set("input", sstableMap(input, this::describeSSTable));
-            node.set("output", sstableMap(output, this::describeSSTable));
+            node.set("input", sstableMap(input));
+            node.set("output", sstableMap(output));
             jsonWriter.write(node, this::getEventJsonNode, this);
         }
     }
 
-    public void pending(AbstractCompactionStrategy strategy, int remaining)
+    public void pending(CompactionStrategy strategy, int remaining)
     {
         if (remaining != 0 && enabled.get())
         {
@@ -344,7 +307,7 @@ public void pending(AbstractCompactionStrategy strategy, int remaining)
     /**
      * Write the strategy statistics formatted as CSV.
      **/
-    public void statistics(AbstractCompactionStrategy strategy, String event, CompactionStrategyStatistics statistics)
+    public void statistics(CompactionStrategy strategy, String event, CompactionStrategyStatistics statistics)
     {
         if (logger.isTraceEnabled())
             logger.trace("Compaction statistics for strategy {} and event {}: {}", strategy, event, statistics);
@@ -352,26 +315,41 @@ public void statistics(AbstractCompactionStrategy strategy, String event, Compac
         if (!enabled.get())
             return;
 
-        Writer writer = getCsvWriter(strategy, statistics.getHeader());
-        for (Collection<String> data : statistics.getData())
-            writer.write(String.join(",", Iterables.concat(ImmutableList.of(currentTime(), event), data)) + System.lineSeparator());
+        for (CompactionAggregateStatistics aggregateStatistics : statistics.aggregates())
+        {
+            Writer writer = getCsvWriter(strategy, statistics.getHeader(), aggregateStatistics);
+            writer.write(String.join(",", Iterables.concat(ImmutableList.of(currentTime(), event), aggregateStatistics.data())) + System.lineSeparator());
+        }
     }
 
-    private Writer getCsvWriter(AbstractCompactionStrategy strategy, Collection<String> header)
+    private Writer getCsvWriter(CompactionStrategy strategy, Collection<String> header, CompactionAggregateStatistics statistics)
     {
-        Writer writer = csvWriters.get(strategy);
+        Map<String, Writer> writers = csvWriters.get(strategy);
+        if (writers == null)
+        {
+            writers = new MapMaker().makeMap();
+            if (csvWriters.putIfAbsent(strategy, writers) != null)
+            {
+                writers = csvWriters.get(strategy);
+            }
+        }
+
+        String shard = statistics.shard();
+        Writer writer = writers.get(shard);
         if (writer != null)
             return writer;
 
-        // TODO - should we add the repair status?
         String fileName = String.format("compaction-%s-%s-%s-%s",
                                         strategy.getName(),
-                                        strategy.getMetadata().keyspace,
-                                        strategy.getMetadata().name,
+                                        keyspace,
+                                        table,
                                         getId(strategy));
 
+        if (!shard.isEmpty())
+            fileName += '-' + shard;
+
         writer = new CompactionLogSerializer(fileName, "csv", loggerService);
-        if (csvWriters.putIfAbsent(strategy, writer) == null)
+        if (writers.putIfAbsent(shard, writer) == null)
         {
             writer.write(String.join(",", Iterables.concat(ImmutableList.of("Timestamp", "Event"), header)) + System.lineSeparator());
             return writer;
@@ -379,13 +357,13 @@ private Writer getCsvWriter(AbstractCompactionStrategy strategy, Collection<Stri
         else
         {
             writer.close();
-            return csvWriters.get(strategy);
+            return writers.get(shard);
         }
     }
 
     private String currentTime()
     {
-        return dateFormatter.format(new Date(System.currentTimeMillis()));
+        return dateFormatter.format(Instant.ofEpochMilli(System.currentTimeMillis()));
     }
 
     private static class CompactionLogSerializer implements Writer
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
index e590c894ffdb..e76eb2739497 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java
@@ -19,6 +19,7 @@
 
 import java.io.Closeable;
 import java.io.File;
+import java.io.IOError;
 import java.io.IOException;
 import java.util.*;
 import java.util.concurrent.*;
@@ -33,10 +34,16 @@
 import com.google.common.base.Preconditions;
 import com.google.common.base.Predicates;
 import com.google.common.collect.*;
+import com.google.common.primitives.Longs;
 import com.google.common.util.concurrent.*;
 
 import org.apache.cassandra.config.Config;
 import org.apache.cassandra.dht.AbstractBounds;
+import org.apache.cassandra.io.FSDiskFullWriteError;
+import org.apache.cassandra.io.FSError;
+import org.apache.cassandra.io.FSWriteError;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.locator.RangesAtEndpoint;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -176,16 +183,21 @@ public void setRate(final double throughPutMbPerSec)
     }
 
     /**
-     * Call this whenever a compaction might be needed on the given columnfamily.
+     * Call this whenever a compaction might be needed on the given column family store.
      * It's okay to over-call (within reason) if a call is unnecessary, it will
      * turn into a no-op in the bucketing/candidate-scan phase.
      */
-    public List<Future<?>> submitBackground(final ColumnFamilyStore cfs)
+    public Future<?> submitBackground(final ColumnFamilyStore cfs)
     {
+        if (!cfs.isValid())
+        {
+            logger.trace("Aborting compaction for dropped CF {}.{}", cfs.keyspace.getName(), cfs.name);
+            return CompletableFuture.completedFuture(null);
+        }
         if (cfs.isAutoCompactionDisabled())
         {
             logger.trace("Autocompaction is disabled");
-            return Collections.emptyList();
+            return CompletableFuture.completedFuture(null);
         }
 
         /**
@@ -193,27 +205,53 @@ public List<Future<?>> submitBackground(final ColumnFamilyStore cfs)
          * we can wait for the current compaction to finish and re-submit when more information is available.
          * Otherwise, we should submit at least one task to prevent starvation by busier CFs, and more if there
          * are idle threads stil. (CASSANDRA-4310)
+         *
+         * We will check again when the scheduled task is executed.
          */
+        if (hasEnoughCompactionsRunning(cfs))
+            return CompletableFuture.completedFuture(null);
+
+        logger.trace("Scheduling a background task check for {}.{} with {}",
+                     cfs.keyspace.getName(),
+                     cfs.name,
+                     cfs.getCompactionStrategy().getName());
+
+        ListenableFuture<?> fut = executor.submitIfRunning(new BackgroundCompactionCandidate(cfs), "background task");
+        if (fut.isCancelled())
+            compactingCF.remove(cfs);
+        return fut;
+    }
+
+    private boolean hasEnoughCompactionsRunning(ColumnFamilyStore cfs)
+    {
         int count = compactingCF.count(cfs);
         if (count > 0 && executor.getActiveCount() >= executor.getMaximumPoolSize())
         {
             logger.trace("Background compaction is still running for {}.{} ({} remaining). Skipping",
                          cfs.keyspace.getName(), cfs.name, count);
-            return Collections.emptyList();
+            return true;
         }
+        return false;
+    }
 
-        logger.trace("Scheduling a background task check for {}.{} with {}",
-                     cfs.keyspace.getName(),
-                     cfs.name,
-                     cfs.getCompactionStrategyManager().getName());
-
-        List<Future<?>> futures = new ArrayList<>(1);
-        Future<?> fut = executor.submitIfRunning(new BackgroundCompactionCandidate(cfs), "background task");
-        if (!fut.isCancelled())
-            futures.add(fut);
+    private static void handleCompactionError(Throwable t, ColumnFamilyStore cfs)
+    {
+        t = Throwables.unwrapped(t);
+        // FSDiskFullWriteErrors caught during compaction are expected to be recoverable, so we don't explicitly
+        // trigger the disk failure policy because of them (see CASSANDRA-12385).
+        if (t instanceof IOError && !(t instanceof FSDiskFullWriteError))
+        {
+            logger.error("Potentially unrecoverable error during background compaction of table {}", cfs, t);
+            // Strictly speaking it's also possible to hit a read-related IOError during compaction, although the
+            // chances for that are much lower than the chances for write-related IOError. If we want to handle that,
+            // we might have to rely on error message parsing...
+            t = t instanceof FSError ? t : new FSWriteError(t);
+            JVMStabilityInspector.inspectThrowable(t);
+        }
         else
-            compactingCF.remove(cfs);
-        return futures;
+        {
+            logger.error("Exception during background compaction of table {}", cfs, t);
+        }
     }
 
     public boolean isCompacting(Iterable<ColumnFamilyStore> cfses, Predicate<SSTableReader> sstablePredicate)
@@ -280,58 +318,129 @@ class BackgroundCompactionCandidate implements Runnable
 
         public void run()
         {
-            boolean ranCompaction = false;
-            try
+            logger.trace("Checking {}.{}", cfs.keyspace.getName(), cfs.name);
+            if (!cfs.isValid())
             {
-                logger.trace("Checking {}.{}", cfs.keyspace.getName(), cfs.name);
-                if (!cfs.isValid())
+                compactingCF.remove(cfs);
+                logger.trace("Aborting compaction for dropped CF");
+                return;
+            }
+
+            Collection<AbstractCompactionTask> tasks;
+            synchronized (CompactionManager.instance)
+            {
+                if (hasEnoughCompactionsRunning(cfs))
                 {
-                    logger.trace("Aborting compaction for dropped CF");
+                    compactingCF.remove(cfs);
                     return;
                 }
 
-                CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
-                AbstractCompactionTask task = strategy.getNextBackgroundTask(getDefaultGcBefore(cfs, FBUtilities.nowInSeconds()));
-                if (task == null)
+                tasks = cfs.getCompactionStrategy()
+                           .getNextBackgroundTasks(getDefaultGcBefore(cfs, FBUtilities.nowInSeconds()));
+            }
+
+            CompletableFuture<?> fut = null;
+            if (tasks.isEmpty())
+            {
+                if (DatabaseDescriptor.automaticSSTableUpgrade())
+                    fut = maybeRunUpgradeTask();
+            }
+            else
+            {
+                CompletableFuture<?>[] futures = new CompletableFuture[tasks.size()];
+                int i = 0;
+                for (AbstractCompactionTask task : tasks)
+                {
+                    futures[i++] = CompletableFuture.runAsync(() -> task.execute(active), executor);
+                }
+                fut = CompletableFuture.allOf(futures);
+            }
+
+            if (fut != null)
+            {
+                try
                 {
-                    if (DatabaseDescriptor.automaticSSTableUpgrade())
-                        ranCompaction = maybeRunUpgradeTask(strategy);
+                    fut.get();
                 }
-                else
+                catch (InterruptedException | ExecutionException e)
+                {
+                    handleCompactionError(e, cfs);
+                    logger.warn("Aborting compaction due to ", e);
+                    if (e instanceof InterruptedException)
+                        Thread.currentThread().interrupt();
+                }
+                finally
                 {
-                    task.execute(active);
-                    ranCompaction = true;
+                    compactingCF.remove(cfs);
                 }
+                // Since we have run at least one task and thus the set of sstables has changed, check
+                // for new compaction possibilities. We will still do this if they all errored out or
+                // the cfs is no longer valid, which is not helpful but not a problem.
+                submitBackground(cfs);
             }
-            finally
+            else
             {
                 compactingCF.remove(cfs);
             }
-            if (ranCompaction) // only submit background if we actually ran a compaction - otherwise we end up in an infinite loop submitting noop background tasks
-                submitBackground(cfs);
         }
 
-        boolean maybeRunUpgradeTask(CompactionStrategyManager strategy)
+        CompletableFuture<?> maybeRunUpgradeTask()
         {
             logger.debug("Checking for upgrade tasks {}.{}", cfs.keyspace.getName(), cfs.getTableName());
-            try
+            if (currentlyBackgroundUpgrading.incrementAndGet() <= DatabaseDescriptor.maxConcurrentAutoUpgradeTasks())
             {
-                if (currentlyBackgroundUpgrading.incrementAndGet() <= DatabaseDescriptor.maxConcurrentAutoUpgradeTasks())
+                AbstractCompactionTask upgradeTask = findUpgradeSSTableTask();
+                if (upgradeTask != null)
                 {
-                    AbstractCompactionTask upgradeTask = strategy.findUpgradeSSTableTask();
-                    if (upgradeTask != null)
-                    {
-                        upgradeTask.execute(active);
-                        return true;
-                    }
+                    return CompletableFuture.runAsync(() -> upgradeTask.execute(active), executor)
+                                            .handle((ignored1, ignored2) -> {
+                                                currentlyBackgroundUpgrading.decrementAndGet();
+                                                return null;
+                                            });
+                }
+                else
+                {
+                    logger.trace("No tasks available");
+                    currentlyBackgroundUpgrading.decrementAndGet();
                 }
             }
-            finally
+            else
             {
                 currentlyBackgroundUpgrading.decrementAndGet();
             }
-            logger.trace("No tasks available");
-            return false;
+            return null;
+        }
+
+        /**
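+         * Finds the oldest (by modification date) non-latest-version sstable on disk and creates an upgrade task for it.
+         * @return the upgrade task, or null if auto-compaction or automatic sstable upgrade is disabled, or if no candidate sstable could be marked for modification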
+         */
+        @VisibleForTesting
+        @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
+        public AbstractCompactionTask findUpgradeSSTableTask()
+        {
+            if (cfs.isAutoCompactionDisabled() || !DatabaseDescriptor.automaticSSTableUpgrade())
+                return null;
+
+            Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
+            List<SSTableReader> potentialUpgrade = cfs.getLiveSSTables()
+                                                      .stream()
+                                                      .filter(s -> !compacting.contains(s) && !s.descriptor.version.isLatestVersion())
+                                                      .sorted((o1, o2) -> {
+                                                          File f1 = new File(o1.descriptor.filenameFor(Component.DATA));
+                                                          File f2 = new File(o2.descriptor.filenameFor(Component.DATA));
+                                                          return Longs.compare(f1.lastModified(), f2.lastModified());
+                                                      }).collect(Collectors.toList());
+            for (SSTableReader sstable : potentialUpgrade)
+            {
+                LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.UPGRADE_SSTABLES);
+                if (txn != null)
+                {
+                    logger.debug("Running automatic sstable upgrade for {}", sstable);
+                    return cfs.getCompactionStrategy().createCompactionTask(txn, Integer.MIN_VALUE, Long.MAX_VALUE);
+                }
+            }
+            return null;
         }
     }
 
@@ -507,7 +616,7 @@ public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
             @Override
             public void execute(LifecycleTransaction txn)
             {
-                AbstractCompactionTask task = cfs.getCompactionStrategyManager().getCompactionTask(txn, NO_GC, Long.MAX_VALUE);
+                AbstractCompactionTask task = cfs.getCompactionStrategy().createCompactionTask(txn, NO_GC, Long.MAX_VALUE);
                 task.setUserDefined(true);
                 task.setCompactionType(OperationType.UPGRADE_SSTABLES);
                 task.execute(active);
@@ -587,7 +696,7 @@ public AllSSTableOpStatus performGarbageCollection(final ColumnFamilyStore cfSto
             public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
             {
                 Iterable<SSTableReader> originals = transaction.originals();
-                if (cfStore.getCompactionStrategyManager().onlyPurgeRepairedTombstones())
+                if (cfStore.onlyPurgeRepairedTombstones())
                     originals = Iterables.filter(originals, SSTableReader::isRepaired);
                 List<SSTableReader> sortedSSTables = Lists.newArrayList(originals);
                 Collections.sort(sortedSSTables, SSTableReader.maxTimestampAscending);
@@ -650,7 +759,7 @@ public Iterable<SSTableReader> filterSSTables(LifecycleTransaction transaction)
 
             public Map<Integer, List<SSTableReader>> groupByDiskIndex(Set<SSTableReader> needsRelocation)
             {
-                return needsRelocation.stream().collect(Collectors.groupingBy((s) -> diskBoundaries.getDiskIndex(s)));
+                return needsRelocation.stream().collect(Collectors.groupingBy((s) -> diskBoundaries.getDiskIndexFromKey(s)));
             }
 
             private boolean inCorrectLocation(SSTableReader sstable)
@@ -667,7 +776,7 @@ private boolean inCorrectLocation(SSTableReader sstable)
             public void execute(LifecycleTransaction txn)
             {
                 logger.debug("Relocating {}", txn.originals());
-                AbstractCompactionTask task = cfs.getCompactionStrategyManager().getCompactionTask(txn, NO_GC, Long.MAX_VALUE);
+                AbstractCompactionTask task = cfs.getCompactionStrategy().createCompactionTask(txn, NO_GC, Long.MAX_VALUE);
                 task.setUserDefined(true);
                 task.setCompactionType(OperationType.RELOCATE);
                 task.execute(active);
@@ -732,7 +841,7 @@ private static void mutateFullyContainedSSTables(ColumnFamilyStore cfs,
         Set<SSTableReader> fullyContainedSSTables = findSSTablesToAnticompact(sstableIterator, normalizedRanges, sessionID);
 
         cfs.metric.bytesMutatedAnticompaction.inc(SSTableReader.getTotalBytes(fullyContainedSSTables));
-        cfs.getCompactionStrategyManager().mutateRepaired(fullyContainedSSTables, UNREPAIRED_SSTABLE, sessionID, isTransient);
+        cfs.mutateRepaired(fullyContainedSSTables, UNREPAIRED_SSTABLE, sessionID, isTransient);
         // since we're just re-writing the sstable metdata for the fully contained sstables, we don't want
         // them obsoleted when the anti-compaction is complete. So they're removed from the transaction here
         txn.cancel(fullyContainedSSTables);
@@ -856,7 +965,7 @@ public List<Future<?>> submitMaximal(final ColumnFamilyStore cfStore, final int
         // here we compute the task off the compaction executor, so having that present doesn't
         // confuse runWithCompactionsDisabled -- i.e., we don't want to deadlock ourselves, waiting
         // for ourselves to finish/acknowledge cancellation before continuing.
-        CompactionTasks tasks = cfStore.getCompactionStrategyManager().getMaximalTasks(gcBefore, splitOutput);
+        CompactionTasks tasks = cfStore.getCompactionStrategy().getMaximalTasks(gcBefore, splitOutput);
 
         if (tasks.isEmpty())
             return Collections.emptyList();
@@ -896,7 +1005,7 @@ public void forceCompactionForTokenRange(ColumnFamilyStore cfStore, Collection<R
                 logger.debug("No sstables found for the provided token range");
                 return CompactionTasks.empty();
             }
-            return cfStore.getCompactionStrategyManager().getUserDefinedTasks(sstables, getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()));
+            return cfStore.getCompactionStrategy().getUserDefinedTasks(sstables, getDefaultGcBefore(cfStore, FBUtilities.nowInSeconds()));
         };
 
         try (CompactionTasks tasks = cfStore.runWithCompactionsDisabled(taskCreator,
@@ -1062,7 +1171,7 @@ protected void runMayThrow() throws Exception
                 }
                 else
                 {
-                    try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstables, gcBefore))
+                    try (CompactionTasks tasks = cfs.getCompactionStrategy().getUserDefinedTasks(sstables, gcBefore))
                     {
                         for (AbstractCompactionTask task : tasks)
                         {
@@ -1493,7 +1602,7 @@ private void doAntiCompaction(ColumnFamilyStore cfs,
         // make use of any actual repairedAt value and splitting up sstables just for that is not worth it at this point.
         Set<SSTableReader> unrepairedSSTables = sstables.stream().filter((s) -> !s.isRepaired()).collect(Collectors.toSet());
         cfs.metric.bytesAnticompacted.inc(SSTableReader.getTotalBytes(unrepairedSSTables));
-        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(unrepairedSSTables);
+        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategy().groupSSTablesForAntiCompaction(unrepairedSSTables);
 
         // iterate over sstables to check if the full / transient / unrepaired ranges intersect them.
         int antiCompactedSSTableCount = 0;
@@ -1573,13 +1682,13 @@ public void obsoleteOriginals() {}
             public void close() {}
         }
 
-        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
+        CompactionStrategy strategy = cfs.getCompactionStrategy();
         try (SharedTxn sharedTxn = new SharedTxn(txn);
              SSTableRewriter fullWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
              SSTableRewriter transWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
              SSTableRewriter unrepairedWriter = SSTableRewriter.constructWithoutEarlyOpening(sharedTxn, false, groupMaxDataAge);
 
-             AbstractCompactionStrategy.ScannerList scanners = strategy.getScanners(txn.originals());
+             ScannerList scanners = strategy.getScanners(txn.originals());
              CompactionController controller = new CompactionController(cfs, sstableAsSet, getDefaultGcBefore(cfs, nowInSec));
              CompactionIterator ci = getAntiCompactionIterator(scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID(), isCancelled))
         {
@@ -1961,6 +2070,16 @@ public void incrementSstablesDropppedFromCompactions(long num)
         metrics.sstablesDropppedFromCompactions.inc(num);
     }
 
+    public void incrementRemovedExpiredSSTables(long num)
+    {
+        metrics.removedExpiredSSTables.mark(num);
+    }
+
+    public void incrementDeleteOnlyCompactions()
+    {
+        metrics.deleteOnlyCompactions.mark();
+    }
+
 
     public List<Map<String, String>> getCompactions()
     {
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java b/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
index 0b0fec5e887b..286e3fb9e3a0 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionObserver.java
@@ -29,37 +29,17 @@
  */
 public interface CompactionObserver
 {
-    CompactionObserver NO_OP = new CompactionObserver()
-    {
-        @Override
-        public void setSubmitted(UUID id, CompactionAggregate compaction) { }
-
-        @Override
-        public void setInProgress(CompactionProgress progress) { }
-
-        @Override
-        public void setCompleted(UUID id) { }
-    };
-
-    /**
-     * Indicates that a compaction with the given id has been submitted for the given aggregate.
-     * <p/>
-     * @param id the id of the compaction
-     * @param compaction the compaction aggregate the compaction is part of
-     */
-    void setSubmitted(UUID id, CompactionAggregate compaction);
-
     /**
      * Indicates that a compaction has started.
      * <p/>
      * @param progress the compaction progress, it contains the unique id and real-time progress information
      */
-    void setInProgress(CompactionProgress progress);
+    void onInProgress(CompactionProgress progress);
 
     /**
      * Indicates that a compaction with the given id has completed.
      * <p/>
      * @param id  the id of the compaction
      */
-    void setCompleted(UUID id);
+    void onCompleted(UUID id);
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionPick.java b/src/java/org/apache/cassandra/db/compaction/CompactionPick.java
index 3200ccc0b4b8..4b8881167747 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionPick.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionPick.java
@@ -19,14 +19,14 @@
 package org.apache.cassandra.db.compaction;
 
 import java.util.Collection;
+import java.util.Collections;
 import java.util.Objects;
 import java.util.UUID;
-import java.util.concurrent.CopyOnWriteArraySet;
+import java.util.stream.Collectors;
 
 import javax.annotation.Nullable;
 
-import com.google.common.collect.ImmutableList;
-
+import com.google.common.collect.ImmutableSet;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
 /**
@@ -39,13 +39,16 @@
  **/
 class CompactionPick
 {
-    final static CompactionPick EMPTY = create(-1, new CopyOnWriteArraySet<>(), 0);
+    final static CompactionPick EMPTY = create(-1, Collections.emptyList(), 0);
 
     /** The key to the parent compaction aggregate, e.g. a level number or tier avg size, -1 if no parent */
     final long parent;
 
     /** The sstables to be compacted */
-    final CopyOnWriteArraySet<SSTableReader> sstables;
+    final ImmutableSet<SSTableReader> sstables;
+
+    /** Only expired sstables */
+    final ImmutableSet<SSTableReader> expired;
 
     /** The sum of all the sstable hotness scores */
     final double hotness;
@@ -53,6 +56,9 @@ class CompactionPick
     /** The average size in bytes for the sstables in this compaction */
     final long avgSizeInBytes;
 
+    /** The total size on disk for the sstables in this compaction */
+    final long totSizeInBytes;
+
     /** The unique compaction id, this is only available when a compaction is submitted */
     @Nullable
     volatile UUID id;
@@ -65,20 +71,38 @@ class CompactionPick
     /** Set to true when the compaction has completed */
     volatile boolean completed;
 
-    private CompactionPick(long parent, Collection<SSTableReader> sstables, double hotness, long avgSizeInBytes)
+    private CompactionPick(long parent,
+                           Collection<SSTableReader> compacting,
+                           Collection<SSTableReader> expired,
+                           double hotness,
+                           long avgSizeInBytes,
+                           long totSizeInBytes)
     {
         this.parent = parent;
-        this.sstables = new CopyOnWriteArraySet<>(sstables);
+        this.sstables = ImmutableSet.copyOf(compacting);
+        this.expired = ImmutableSet.copyOf(expired);
         this.hotness = hotness;
         this.avgSizeInBytes = avgSizeInBytes;
+        this.totSizeInBytes = totSizeInBytes;
     }
 
     /**
      * Create a pending compaction candidate calculating hotness and avg size.
      */
+    static CompactionPick create(long parent, Collection<SSTableReader> sstables, Collection<SSTableReader> expired)
+    {
+        Collection<SSTableReader> nonExpiring = sstables.stream().filter(sstable -> !expired.contains(sstable)).collect(Collectors.toList());
+        return create(parent,
+                      sstables,
+                      expired,
+                      CompactionAggregate.getTotHotness(nonExpiring),
+                      CompactionAggregate.getAvgSizeBytes(nonExpiring),
+                      CompactionAggregate.getTotSizeBytes(nonExpiring));
+    }
+
     static CompactionPick create(long parent, Collection<SSTableReader> sstables)
     {
-        return create(parent, sstables, CompactionAggregate.getTotHotness(sstables), CompactionAggregate.getAvgSizeBytes(sstables));
+        return create(parent, sstables, Collections.emptyList());
     }
 
     /**
@@ -86,15 +110,15 @@ static CompactionPick create(long parent, Collection<SSTableReader> sstables)
      */
     static CompactionPick create(long parent, Collection<SSTableReader> sstables, double hotness)
     {
-        return create(parent, sstables, hotness, CompactionAggregate.getAvgSizeBytes(sstables));
+        return create(parent, sstables, Collections.emptyList(), hotness, CompactionAggregate.getAvgSizeBytes(sstables), CompactionAggregate.getTotSizeBytes(sstables));
     }
 
     /**
      * Create a pending compaction candidate with the given parameters.
      */
-    static CompactionPick create(long parent, Collection<SSTableReader> sstables, double hotness, long avgSizeInBytes)
+    static CompactionPick create(long parent, Collection<SSTableReader> sstables, Collection<SSTableReader> expired, double hotness, long avgSizeInBytes, long totSizeInBytes)
     {
-        return new CompactionPick(parent, sstables, hotness, avgSizeInBytes);
+        return new CompactionPick(parent, sstables, expired, hotness, avgSizeInBytes, totSizeInBytes);
     }
 
     /**
@@ -102,7 +126,7 @@ static CompactionPick create(long parent, Collection<SSTableReader> sstables, do
      */
     static CompactionPick create(long parent, CompactionPick pick)
     {
-        return new CompactionPick(parent, pick.sstables, pick.hotness, pick.avgSizeInBytes);
+        return new CompactionPick(parent, pick.sstables, pick.expired, pick.hotness, pick.avgSizeInBytes, pick.totSizeInBytes);
     }
 
     public double hotness()
@@ -157,15 +181,24 @@ void setCompleted()
      * <p/>
      * This is currently used by {@link TimeWindowCompactionStrategy} to add expired sstables.
      *
-     * @param sstables the sstables to add
+     * @param expired the sstables to add
      */
-    CompactionPick withAddedSSTables(Collection<SSTableReader> sstables)
+    CompactionPick withExpiredSSTables(Collection<SSTableReader> expired)
     {
-        ImmutableList.Builder builder = ImmutableList.builder();
-        builder.addAll(this.sstables);
-        builder.addAll(sstables);
-
-        return new CompactionPick(parent, builder.build(), CompactionAggregate.getTotHotness(sstables), CompactionAggregate.getAvgSizeBytes(sstables));
+        ImmutableSet<SSTableReader> newSSTables = ImmutableSet.<SSTableReader>builder()
+                                                              .addAll(this.sstables)
+                                                              .addAll(expired)
+                                                              .build();
+        ImmutableSet<SSTableReader> newExpired = ImmutableSet.<SSTableReader>builder()
+                                                             .addAll(this.expired)
+                                                             .addAll(expired)
+                                                             .build();
+        return new CompactionPick(parent,
+                                  newSSTables,
+                                  newExpired,
+                                  hotness,
+                                  avgSizeInBytes,
+                                  totSizeInBytes);
     }
 
     /**
@@ -176,10 +209,15 @@ boolean isEmpty()
         return sstables.isEmpty();
     }
 
+    boolean hasExpiredOnly()
+    {
+        return sstables.size() == expired.size();
+    }
+
     @Override
     public int hashCode()
     {
-        return Objects.hash(parent, sstables);
+        return Objects.hash(parent, sstables, expired);
     }
 
     @Override
@@ -195,12 +233,12 @@ public boolean equals(Object obj)
 
         // a pick is the same if the sstables are the same given that the other properties are derived from sstables and two
         // picks are the same whether compaction has started or not so the progress and completed properties should not determine equality
-        return parent == that.parent && sstables.equals(that.sstables);
+        return parent == that.parent && sstables.equals(that.sstables) && expired.equals(that.expired);
     }
 
     @Override
     public String toString()
     {
-        return String.format("Parent: %d, Hotness: %f, Avg size in bytes: %d, id: %s, sstables: %s", parent, hotness, avgSizeInBytes, id, sstables);
+        return String.format("Parent: %d, Hotness: %f, Avg size in bytes: %d, id: %s, sstables: %s, expired: %s", parent, hotness, avgSizeInBytes, id, sstables, expired);
     }
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java b/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
index f6b38511bb3f..08099ef785aa 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionProgress.java
@@ -38,7 +38,8 @@ public interface CompactionProgress extends TableOperation.Progress
      *
      * @return the compaction strategy when available or null.
      */
-    @Nullable AbstractCompactionStrategy strategy();
+    @Nullable
+    CompactionStrategy strategy();
 
     /**
      * @return true if the compaction was requested to interrupt
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
deleted file mode 100644
index fa6883096530..000000000000
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStatistics.java
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db.compaction;
-
-import java.util.Collection;
-import java.util.UUID;
-
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.schema.TableMetadata;
-
-/**
- * A container for the statistics of a single compaction event:
- * input and output size, duration, before and after partition and row counters, etc.
- */
-public class CompactionStatistics
-{
-    private final CompactionStrategyManager strategyManager;
-
-    public final TableMetadata metadata;
-    public final OperationType tasktype;
-    public final UUID compactionId;
-
-    public final Collection<SSTableReader> startSstables;
-    public final long startSizeBytes;
-
-    public long estInputSizeBytes;
-    public long totalSourceRows;
-    public long totalSourcePartitions;
-
-    public long[] mergedPartitionCounts;
-    public long[] mergedRowsCounts;
-
-    public Collection<SSTableReader> endSstables;
-    public long endSizeBytes;
-
-    public long bytesRead;
-    public long bytesWritten;
-
-    public long durationInNanos;
-    public boolean stopRequested;
-
-    CompactionStatistics(ColumnFamilyStore cfs, OperationType tasktype, UUID compactionId, Collection<SSTableReader> startSstables)
-    {
-        this.strategyManager = cfs.getCompactionStrategyManager();
-
-        this.metadata = cfs.metadata();
-        this.tasktype = tasktype;
-        this.compactionId = compactionId;
-        this.startSstables = startSstables;
-        this.startSizeBytes = SSTableReader.getTotalBytes(startSstables);
-        this.estInputSizeBytes = this.startSizeBytes;
-
-        this.totalSourceRows = 0;
-        this.totalSourcePartitions = 0;
-        this.mergedPartitionCounts = new long[0];
-        this.mergedRowsCounts = new long[0];
-        this.endSizeBytes = 0;
-        this.bytesRead = 0;
-        this.bytesWritten = 0;
-        this.durationInNanos = 0;
-        this.stopRequested = false;
-    }
-
-    double sizeRatio()
-    {
-        if (estInputSizeBytes > 0)
-            return endSizeBytes / (double) estInputSizeBytes;
-
-        // this is a valid case, when there are no sstables to actually compact
-        // the previous code would return a NaN that would be logged as zero
-        return 0;
-    }
-
-    void setEndSstables(Collection<SSTableReader> endSstables)
-    {
-        this.endSstables = endSstables;
-        this.endSizeBytes = SSTableReader.getTotalBytes(endSstables);
-    }
-
-    public AbstractCompactionStrategy getStrategyFor(SSTableReader ssTableReader)
-    {
-        return strategyManager.getCompactionStrategyFor(ssTableReader);
-    }
-}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategy.java
new file mode 100644
index 000000000000..d687b125ac6c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategy.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.ScannerList;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+
+/**
+ * The common interface between legacy compaction strategies (those that extend {@link LegacyAbstractCompactionStrategy})
+ * and the new compaction strategy, {@link UnifiedCompactionStrategy}.
+ */
+public interface CompactionStrategy extends CompactionObserver
+{
+    /**
+     * @return the compaction logger, which optionally logs events in a csv file.
+     */
+    CompactionLogger getCompactionLogger();
+
+    /**
+     * For internal, temporary suspension of background compactions so that we can do exceptional
+     * things like truncate or major compaction
+     */
+    void pause();
+
+    /**
+     * For internal use: ends the temporary suspension of background compactions started by {@link #pause()}
+     * (the suspension is used so that we can do exceptional things like truncate or major compaction)
+     */
+    void resume();
+
+    /**
+     * Performs any extra initialization required
+     */
+    void startup();
+
+    /**
+     * Releases any resources if this strategy is shutdown (when the CFS is reloaded after a schema change).
+     */
+    void shutdown();
+
+    /**
+     * @param gcBefore throw away tombstones older than this
+     *
+     * @return the next background/minor compaction tasks to run; empty if nothing to do.
+     *
+     * Is responsible for marking its sstables as compaction-pending.
+     */
+    Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore);
+
+    /**
+     * @param gcBefore throw away tombstones older than this
+     *
+     * @return a compaction task that should be run to compact this columnfamilystore
+     * as much as possible.  Null if nothing to do.
+     *
+     * Is responsible for marking its sstables as compaction-pending.
+     */
+    @SuppressWarnings("resource")
+    CompactionTasks getMaximalTasks(int gcBefore, boolean splitOutput);
+
+    /**
+     * @param sstables SSTables to compact. Must be marked as compacting.
+     * @param gcBefore throw away tombstones older than this
+     *
+     * @return a compaction task corresponding to the requested sstables.
+     * Will not be null. (Will throw if user requests an invalid compaction.)
+     *
+     * Is responsible for marking its sstables as compaction-pending.
+     */
+    @SuppressWarnings("resource")
+    CompactionTasks getUserDefinedTasks(Collection<SSTableReader> sstables, int gcBefore);
+
+    /**
+     * Get the estimated remaining compactions.
+     *
+     * @return the number of background tasks estimated to still be needed for this strategy
+     */
+    int getEstimatedRemainingTasks();
+
+    /**
+     * Create a compaction task for the sstables in the transaction.
+     *
+     * @return a valid compaction task that can be executed.
+     */
+    AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes);
+
+    /**
+     * @return the total number of background compactions, pending or in progress
+     */
+    int getTotalCompactions();
+
+    /**
+     * Return the statistics. Not all strategies will provide non-empty statistics,
+     * the legacy strategies that do not support aggregates will return empty statistics.
+     * <p/>
+     * @return statistics about this strategy's compaction picks.
+     */
+    List<CompactionStrategyStatistics> getStatistics();
+
+    /**
+     * @return size in bytes of the largest sstables for this strategy
+     */
+    long getMaxSSTableBytes();
+
+    /**
+     * @return the number of sstables for each level, if this strategy supports levels. Otherwise return an empty array.
+     */
+    int[] getSSTableCountPerLevel();
+
+    /**
+     * @return the level fanout size if applicable to this strategy. Otherwise return the default LCS fanout size.
+     */
+    int getLevelFanoutSize();
+
+    /**
+     * Returns a list of KeyScanners given sstables and a range on which to scan.
+     * The default implementation simply grabs one SSTableScanner per sstable, but overriding this method
+     * allows for a more memory efficient solution if we know the sstables don't overlap (see
+     * LeveledCompactionStrategy for instance).
+     */
+    ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges);
+
+    default ScannerList getScanners(Collection<SSTableReader> toCompact)
+    {
+        return getScanners(toCompact, null);
+    }
+
+    /**
+     * @return the name of the strategy
+     */
+    String getName();
+
+    /**
+     * Returns the sstables managed by the strategy
+     */
+    Set<SSTableReader> getSSTables();
+
+    /**
+     * Group sstables that can be anti-compacted together.
+     */
+    Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup);
+
+    /**
+     * Create an sstable writer that is suitable for the strategy.
+     */
+    SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
+                                                long keyCount,
+                                                long repairedAt,
+                                                UUID pendingRepair,
+                                                boolean isTransient,
+                                                MetadataCollector collector,
+                                                SerializationHeader header,
+                                                Collection<Index.Group> indexGroups,
+                                                LifecycleNewTracker lifecycleNewTracker);
+
+    /**
+     * @return true if the strategy supports early open
+     */
+    boolean supportsEarlyOpen();
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyContainer.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyContainer.java
new file mode 100644
index 000000000000..1ce0c37cb080
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyContainer.java
@@ -0,0 +1,191 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.List;
+import java.util.UUID;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.notifications.INotificationConsumer;
+import org.apache.cassandra.schema.CompactionParams;
+
+/**
+ * A strategy container manages compaction strategies for a {@link ColumnFamilyStore}.
+ *
+ * This class is responsible for:
+ * - providing a single interface for possibly multiple active strategy instances - e.g. due to having
+ * multiple arenas for repaired, unrepaired, pending, transient SSTables.
+ * - updating or recreating the strategies when the configuration changes - e.g. compaction parameters
+ * or disk boundaries
+ */
+public interface CompactionStrategyContainer extends CompactionStrategy, INotificationConsumer
+{
+    /**
+     * Enable compaction.
+     */
+    void enable();
+
+    /**
+     * Disable compaction.
+     */
+    void disable();
+
+    /**
+     * @return {@code true} if compaction is enabled and running; e.g. if autocompaction has been disabled via nodetool
+     *         or JMX, this should return {@code false}, even if the underlying compaction strategy hasn't been paused.
+     */
+    boolean isEnabled();
+
+    /**
+     * @return {@code true} if compaction is running, i.e. if the underlying compaction strategy is not currently
+     *         paused or being shut down.
+     */
+    boolean isActive();
+
+    /**
+     * The reason for reloading
+     */
+    enum ReloadReason
+    {
+        /** A new strategy container has been created.  */
+        FULL,
+
+        /** A new strategy container has been reloaded due to table metadata changes, e.g. a schema change. */
+        METADATA_CHANGE,
+
+        /** A request over JMX to update the compaction parameters only locally, without changing the schema permanently. */
+        JMX_REQUEST,
+
+        /** The disk boundaries were updated, in this case the strategies may need to be recreated even if the params haven't changed */
+        DISK_BOUNDARIES_UPDATED
+    }
+
+    /**
+     * Reload the strategy container taking into account the state of the previous strategy container instance
+     * ({@code this}, in case we're not reloading after switching between containers), the new compaction parameters,
+     * and the reason for reloading.
+     * <p/>
+     * Depending on the reason, different actions are taken, for example the schema parameters are not updated over
+     * JMX and the decision on whether to enable or disable compaction depends only on the parameters over JMX, but
+     * also on the previous JMX directive in case of a full reload. Also, the disk boundaries are not updated over JMX.
+     * <p/>
+     * See the implementations of this method for more details.
+     *
+     * @param previous the strategy container instance whose state needs to be inherited/taken into account, in many
+     *                 cases the same as {@code this}, but never {@code null}.
+     * @param compactionParams the new compaction parameters
+     * @param reason the reason for reloading
+     *
+     * @return existing or new container with updated parameters
+     */
+    CompactionStrategyContainer reload(@Nonnull CompactionStrategyContainer previous,
+                                       CompactionParams compactionParams,
+                                       ReloadReason reason);
+
+    /**
+     * @param params new compaction parameters
+     * @param reason the reason for reloading
+     * @return {@code true} if the compaction parameters should be updated on reload
+     */
+    default boolean shouldReload(CompactionParams params, ReloadReason reason)
+    {
+        return reason != CompactionStrategyContainer.ReloadReason.METADATA_CHANGE || !params.equals(getMetadataCompactionParams());
+    }
+
+    /**
+     * Creates new {@link CompactionStrategyContainer} and loads its parameters
+     *
+     * This method is used by {@link CompactionStrategyFactory} to create
+     * {@link CompactionStrategyContainer}s via reflection.
+     *
+     * @param previous the strategy container instance whose state needs to be inherited/taken into account
+     *                 or {@code null} if there was no container to inherit from.
+     * @param strategyFactory the factory instance responsible for creating the CSM
+     * @param compactionParams the new compaction parameters
+     * @param reason the reason for creating a new container
+     *
+     * @return a new {@link CompactionStrategyContainer} with newly loaded parameters
+     */
+    static CompactionStrategyContainer create(@Nullable CompactionStrategyContainer previous,
+                                              CompactionStrategyFactory strategyFactory,
+                                              CompactionParams compactionParams,
+                                              CompactionStrategyContainer.ReloadReason reason)
+    {
+        throw new UnsupportedOperationException("Implementations of CompactionStrategyContainer must implement static create method");
+    }
+
+    /**
+     * Return the compaction parameters. These are not necessarily the same as the ones specified in the schema, they
+     * may have been overwritten over JMX.
+     *
+     * @return the compaction params currently active
+     */
+    CompactionParams getCompactionParams();
+
+    /**
+     * Returns the compaction parameters set via metadata.
+     *
+     * This method is useful to decide if we should update the compaction strategy due to a
+     * metadata change such as a schema change caused by an ALTER TABLE.
+     *
+     * If a user changes the local compaction strategy via JMX and then later ALTERs a compaction parameter,
+     * we will use the new compaction parameters but we will not override the JMX parameters if compaction
+     * was not changed by the ALTER.
+     *
+     * @return the compaction parameters set via metadata changes
+     */
+    CompactionParams getMetadataCompactionParams();
+
+    /**
+     * This method is to keep compatibility with strategies backed by {@link CompactionStrategyManager} where
+     * there are multiple inner strategies handling sstables by repair status.
+     *
+     * @return all inner compaction strategies
+     */
+    List<CompactionStrategy> getStrategies();
+
+    /**
+     * This method is to keep compatibility with strategies backed by {@link CompactionStrategyManager} where
+     * there are multiple inner strategies handling sstables by repair status.
+     *
+     * Note that if {@code isRepaired} is true, {@code pendingRepair} must be null.
+     *
+     * @param isRepaired will return strategies for repaired SSTables; must be {@code false} if
+     *                   {@code pendingRepair} is specified
+     * @param pendingRepair will return strategies for the given pending repair; must be {@code null}
+     *                      if {@code isRepaired} is true
+     *
+     * @return a list of inner strategies that match given parameters
+     */
+    List<CompactionStrategy> getStrategies(boolean isRepaired, @Nullable UUID pendingRepair);
+
+    /**
+     * Called to clean up state when a repair session completes.
+     *
+     * @param sessionID repair session id.
+     */
+    void repairSessionCompleted(UUID sessionID);
+
+    /**
+     * The method is for CompactionStrategyManager to use with {@link org.apache.cassandra.db.ColumnFamilyStore#mutateRepaired}.
+     * UnifiedCompactionContainer does not need it.
+     */
+    ReentrantReadWriteLock.WriteLock getWriteLock();
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyFactory.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyFactory.java
new file mode 100644
index 000000000000..847e70c0d066
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyFactory.java
@@ -0,0 +1,188 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.lang.reflect.Constructor;
+import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.Map;
+
+import javax.annotation.Nullable;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.schema.CompactionParams;
+
+/**
+ * The factory for compaction strategies and their containers.
+ */
+public class CompactionStrategyFactory
+{
+    private final ColumnFamilyStore cfs;
+    private final CompactionLogger compactionLogger;
+
+    public CompactionStrategyFactory(ColumnFamilyStore cfs)
+    {
+        this.cfs = cfs;
+        this.compactionLogger = new CompactionLogger(cfs.metadata());
+    }
+
+    /**
+     * Reload the existing strategy container, possibly creating a new one if required.
+     *
+     * @param current the current strategy container, or {@code null} if this is the first time we're loading a
+     *                compaction strategy
+     * @param compactionParams the new compaction parameters
+     * @param reason the reason for reloading
+     *
+     * @return Either a new strategy container or the current one, but reloaded with the given compaction parameters.
+     */
+    public CompactionStrategyContainer reload(@Nullable CompactionStrategyContainer current,
+                                              CompactionParams compactionParams,
+                                              CompactionStrategyContainer.ReloadReason reason)
+    {
+        // If we were called due to a metadata change but the compaction parameters are the same then
+        // don't reload since we risk overriding parameters set via JMX
+        if (current != null && !current.shouldReload(compactionParams, reason))
+            return current;
+
+        Class<? extends CompactionStrategyContainer> containerClass = containerForStrategy(compactionParams.klass());
+        CompactionStrategyContainer ret;
+
+        // if the strategy belongs to the same container, we can just reload
+        if (current != null && current.getClass().equals(containerClass))
+            ret = current.reload(current, compactionParams, reason);
+        else
+        {
+            // otherwise we need to re-create the container
+            ret = createStrategyContainer(containerClass, current, compactionParams, reason);
+        }
+        
+        if (ret != current)
+            cfs.getTracker().subscribe(ret);
+
+        return ret;
+    }
+
+    static boolean enableCompactionOnReload(@Nullable CompactionStrategyContainer previous,
+                                            CompactionParams compactionParams,
+                                            CompactionStrategyContainer.ReloadReason reason)
+    {
+        // If this is a JMX request, we only consider the params passed by it
+        if (reason == CompactionStrategyContainer.ReloadReason.JMX_REQUEST)
+            return compactionParams.isEnabled();
+        // If the enabled state flag and the params of the previous container differ, compaction was forcefully
+        // enabled/disabled by JMX/nodetool, and we should inherit that setting through the enabled state flag
+        if (previous != null && previous.isEnabled() != previous.getCompactionParams().isEnabled())
+            return previous.isEnabled();
+
+        return compactionParams.isEnabled();
+    }
+
+    /**
+     * Returns a {@link CompactionStrategyContainer} class for the given strategy class.
+     *
+     * We need this method to create the correct container for the strategy, but also to distinguish
+     * between situations when a container should be reloaded or recreated.
+     */
+    private Class<? extends CompactionStrategyContainer> containerForStrategy(Class<? extends CompactionStrategy> strategyClass)
+    {
+        Class<? extends CompactionStrategyContainer> containerClass;
+        try
+        {
+            Field containerClassField = strategyClass.getField("CONTAINER_CLASS");
+            containerClass = (Class<? extends CompactionStrategyContainer>) containerClassField.get(null);
+        }
+        catch (IllegalAccessException | NoSuchFieldException e)
+        {
+            containerClass = CompactionStrategyManager.class;
+        }
+
+        return containerClass;
+    }
+
+    private CompactionStrategyContainer createStrategyContainer(Class<? extends CompactionStrategyContainer> containerClass,
+                                                                CompactionStrategyContainer previous,
+                                                                CompactionParams compactionParams,
+                                                                CompactionStrategyContainer.ReloadReason reason)
+    {
+        try
+        {
+            Method createMethod = containerClass.getMethod("create",
+                                                           CompactionStrategyContainer.class,
+                                                           CompactionStrategyFactory.class,
+                                                           CompactionParams.class,
+                                                           CompactionStrategyContainer.ReloadReason.class);
+            return (CompactionStrategyContainer) createMethod.invoke(null,
+                                                                     previous,
+                                                                     this,
+                                                                     compactionParams,
+                                                                     reason);
+        }
+        catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e)
+        {
+            // Static create(...) is missing or failed: fall back to CompactionStrategyManager. previous may be null
+            // on first load, but reload() requires a non-null previous, so the new container stands in for it.
+            CompactionStrategyManager fallback = new CompactionStrategyManager(this);
+            return fallback.reload(previous != null ? previous : fallback, compactionParams, reason);
+        }
+    }
+
+    public CompactionLogger getCompactionLogger()
+    {
+        return compactionLogger;
+    }
+
+    ColumnFamilyStore getCfs()
+    {
+        return cfs;
+    }
+
+    /**
+     * Creates a compaction strategy that is managed by {@link CompactionStrategyManager} and its strategy holders.
+     * These strategies must extend {@link LegacyAbstractCompactionStrategy}.
+     *
+     * @return an instance of the compaction strategy specified in the parameters so long as it extends {@link LegacyAbstractCompactionStrategy}
+     * @throws IllegalArgumentException if the params do not contain a strategy that extends  {@link LegacyAbstractCompactionStrategy}
+     */
+    LegacyAbstractCompactionStrategy createLegacyStrategy(CompactionParams compactionParams)
+    {
+        try
+        {
+            if (!LegacyAbstractCompactionStrategy.class.isAssignableFrom(compactionParams.klass()))
+                throw new IllegalArgumentException("Expected compaction params for legacy strategy: " + compactionParams);
+
+            Constructor<? extends CompactionStrategy> constructor =
+            compactionParams.klass().getConstructor(CompactionStrategyFactory.class, Map.class);
+            LegacyAbstractCompactionStrategy ret = (LegacyAbstractCompactionStrategy) constructor.newInstance(this, compactionParams.options());
+            compactionLogger.strategyCreated(ret);
+            return ret;
+        }
+        catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException | InstantiationException e)
+        {
+            throw org.apache.cassandra.utils.Throwables.cleaned(e);
+        }
+    }
+
+    /**
+     * Create a compaction strategy. This is only called by tiered storage so we forward to the legacy strategy.
+     */
+    public CompactionStrategy createStrategy(CompactionParams compactionParams)
+    {
+        return createLegacyStrategy(compactionParams);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
index bd2ac772657c..e1a88798989d 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyHolder.java
@@ -29,7 +29,6 @@
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
@@ -43,25 +42,25 @@
 
 public class CompactionStrategyHolder extends AbstractStrategyHolder
 {
-    private final List<AbstractCompactionStrategy> strategies = new ArrayList<>();
+    private final List<LegacyAbstractCompactionStrategy> strategies = new ArrayList<>();
     private final boolean isRepaired;
 
-    public CompactionStrategyHolder(ColumnFamilyStore cfs, DestinationRouter router, boolean isRepaired)
+    public CompactionStrategyHolder(ColumnFamilyStore cfs, CompactionStrategyFactory strategyFactory, DestinationRouter router, boolean isRepaired)
     {
-        super(cfs, router);
+        super(cfs, strategyFactory, router);
         this.isRepaired = isRepaired;
     }
 
     @Override
     public void startup()
     {
-        strategies.forEach(AbstractCompactionStrategy::startup);
+        strategies.forEach(CompactionStrategy::startup);
     }
 
     @Override
     public void shutdown()
     {
-        strategies.forEach(AbstractCompactionStrategy::shutdown);
+        strategies.forEach(CompactionStrategy::shutdown);
     }
 
     @Override
@@ -69,7 +68,7 @@ public void setStrategyInternal(CompactionParams params, int numTokenPartitions)
     {
         strategies.clear();
         for (int i = 0; i < numTokenPartitions; i++)
-            strategies.add(cfs.createCompactionStrategyInstance(params));
+            strategies.add(strategyFactory.createLegacyStrategy(params));
     }
 
     @Override
@@ -89,24 +88,24 @@ public boolean managesRepairedGroup(boolean isRepaired, boolean isPendingRepair,
     }
 
     @Override
-    public AbstractCompactionStrategy getStrategyFor(SSTableReader sstable)
+    public LegacyAbstractCompactionStrategy getStrategyFor(SSTableReader sstable)
     {
         Preconditions.checkArgument(managesSSTable(sstable), "Attempting to get compaction strategy from wrong holder");
         return strategies.get(router.getIndexForSSTable(sstable));
     }
 
     @Override
-    public Iterable<AbstractCompactionStrategy> allStrategies()
+    public Iterable<LegacyAbstractCompactionStrategy> allStrategies()
     {
         return strategies;
     }
 
     @Override
-    public Collection<TaskSupplier> getBackgroundTaskSuppliers(int gcBefore)
+    public Collection<TasksSupplier> getBackgroundTaskSuppliers(int gcBefore)
     {
-        List<TaskSupplier> suppliers = new ArrayList<>(strategies.size());
-        for (AbstractCompactionStrategy strategy : strategies)
-            suppliers.add(new TaskSupplier(strategy.getEstimatedRemainingTasks(), () -> strategy.getNextBackgroundTask(gcBefore)));
+        List<TasksSupplier> suppliers = new ArrayList<>(strategies.size());
+        for (CompactionStrategy strategy : strategies)
+            suppliers.add(new TasksSupplier(strategy.getEstimatedRemainingTasks(), () -> strategy.getNextBackgroundTasks(gcBefore)));
 
         return suppliers;
     }
@@ -115,11 +114,9 @@ public Collection<TaskSupplier> getBackgroundTaskSuppliers(int gcBefore)
     public Collection<AbstractCompactionTask> getMaximalTasks(int gcBefore, boolean splitOutput)
     {
         List<AbstractCompactionTask> tasks = new ArrayList<>(strategies.size());
-        for (AbstractCompactionStrategy strategy : strategies)
+        for (CompactionStrategy strategy : strategies)
         {
-            Collection<AbstractCompactionTask> task = strategy.getMaximalTask(gcBefore, splitOutput);
-            if (task != null)
-                tasks.addAll(task);
+           tasks.addAll(strategy.getMaximalTasks(gcBefore, splitOutput));
         }
         return tasks;
     }
@@ -133,7 +130,7 @@ public Collection<AbstractCompactionTask> getUserDefinedTasks(GroupedSSTableCont
             if (sstables.isGroupEmpty(i))
                 continue;
 
-            tasks.add(strategies.get(i).getUserDefinedTask(sstables.getGroup(i), gcBefore));
+            tasks.addAll(strategies.get(i).getUserDefinedTasks(sstables.getGroup(i), gcBefore));
         }
         return tasks;
     }
@@ -239,7 +236,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
         Preconditions.checkArgument(pendingRepair == null,
                                     "CompactionStrategyHolder can't create sstable writer with pendingRepair id");
         // to avoid creating a compaction strategy for the wrong pending repair manager, we get the index based on where the sstable is to be written
-        AbstractCompactionStrategy strategy = strategies.get(router.getIndexForSSTableDirectory(descriptor));
+        CompactionStrategy strategy = strategies.get(router.getIndexForSSTableDirectory(descriptor));
         return strategy.createSSTableMultiWriter(descriptor,
                                                  keyCount,
                                                  repairedAt,
@@ -251,12 +248,6 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                  lifecycleNewTracker);
     }
 
-    @Override
-    public int getStrategyIndex(AbstractCompactionStrategy strategy)
-    {
-        return strategies.indexOf(strategy);
-    }
-
     @Override
     public boolean containsSSTable(SSTableReader sstable)
     {
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
index b9e382fc631f..e91224842de3 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java
@@ -18,8 +18,6 @@
 package org.apache.cassandra.db.compaction;
 
 
-import java.io.File;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -27,52 +25,46 @@
 import java.util.ConcurrentModificationException;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Objects;
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.function.Supplier;
 import java.util.stream.Collectors;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
-import com.google.common.primitives.Longs;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.DiskBoundaries;
 import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.db.compaction.AbstractStrategyHolder.TaskSupplier;
-import org.apache.cassandra.db.compaction.PendingRepairManager.CleanupTask;
+import org.apache.cassandra.db.compaction.AbstractStrategyHolder.TasksSupplier;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
-import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
 import org.apache.cassandra.notifications.INotification;
-import org.apache.cassandra.notifications.INotificationConsumer;
 import org.apache.cassandra.notifications.SSTableAddedNotification;
 import org.apache.cassandra.notifications.SSTableDeletingNotification;
 import org.apache.cassandra.notifications.SSTableListChangedNotification;
 import org.apache.cassandra.notifications.SSTableMetadataChanged;
 import org.apache.cassandra.notifications.SSTableRepairStatusChanged;
-import org.apache.cassandra.repair.consistent.admin.CleanupSummary;
 import org.apache.cassandra.schema.CompactionParams;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
 
 import static org.apache.cassandra.db.compaction.AbstractStrategyHolder.GroupedSSTableContainer;
@@ -94,11 +86,11 @@
  *
  * Whenever the {@link DiskBoundaries} change, the compaction strategies must be reloaded, so in order to ensure
  * the compaction strategy placement reflect most up-to-date disk boundaries, call {@link this#maybeReloadDiskBoundaries()}
- * before acquiring the read lock to acess the strategies.
+ * before acquiring the read lock to access the strategies.
  *
  */
 
-public class CompactionStrategyManager implements INotificationConsumer
+public class CompactionStrategyManager implements CompactionStrategyContainer
 {
     private static final Logger logger = LoggerFactory.getLogger(CompactionStrategyManager.class);
     public final CompactionLogger compactionLogger;
@@ -129,25 +121,31 @@ public class CompactionStrategyManager implements INotificationConsumer
     private volatile boolean isActive = true;
 
     /*
-        We keep a copy of the schema compaction parameters here to be able to decide if we
-        should update the compaction strategy in maybeReload() due to an ALTER.
+        We keep a copy of the table metadata compaction parameters here to be able to decide if we
+        should update the compaction strategy due to a metadata change such as a schema change
+        caused by an ALTER TABLE.
 
-        If a user changes the local compaction strategy and then later ALTERs a compaction parameter,
-        we will use the new compaction parameters.
+        If a user changes the local compaction strategy via JMX and then later ALTERs a compaction parameter,
+        we will use the new compaction parameters but we will not override the JMX parameters if compaction
+        was not changed by the ALTER.
      */
-    private volatile CompactionParams schemaCompactionParams;
+    @SuppressWarnings("thread-safe")
+    private volatile CompactionParams metadataParams;
     private volatile boolean supportsEarlyOpen;
     private volatile int fanout;
     private volatile long maxSSTableSizeBytes;
     private volatile String name;
 
-    public CompactionStrategyManager(ColumnFamilyStore cfs)
+    public CompactionStrategyManager(CompactionStrategyFactory strategyFactory)
     {
-        this(cfs, cfs::getDiskBoundaries, cfs.getPartitioner().splitter().isPresent());
+        this(strategyFactory,
+             () -> strategyFactory.getCfs().getDiskBoundaries(),
+             strategyFactory.getCfs().getPartitioner().splitter().isPresent());
     }
 
     @VisibleForTesting
-    public CompactionStrategyManager(ColumnFamilyStore cfs, Supplier<DiskBoundaries> boundariesSupplier,
+    public CompactionStrategyManager(CompactionStrategyFactory strategyFactory,
+                                     Supplier<DiskBoundaries> boundariesSupplier,
                                      boolean partitionSSTablesByTokenRange)
     {
         AbstractStrategyHolder.DestinationRouter router = new AbstractStrategyHolder.DestinationRouter()
@@ -162,69 +160,78 @@ public int getIndexForSSTableDirectory(Descriptor descriptor)
                 return compactionStrategyIndexForDirectory(descriptor);
             }
         };
-        transientRepairs = new PendingRepairHolder(cfs, router, true);
-        pendingRepairs = new PendingRepairHolder(cfs, router, false);
-        repaired = new CompactionStrategyHolder(cfs, router, true);
-        unrepaired = new CompactionStrategyHolder(cfs, router, false);
+
+        cfs = strategyFactory.getCfs();
+
+        transientRepairs = new PendingRepairHolder(cfs, strategyFactory, router, true);
+        pendingRepairs = new PendingRepairHolder(cfs, strategyFactory, router, false);
+        repaired = new CompactionStrategyHolder(cfs, strategyFactory, router, true);
+        unrepaired = new CompactionStrategyHolder(cfs, strategyFactory, router, false);
         holders = ImmutableList.of(transientRepairs, pendingRepairs, repaired, unrepaired);
 
-        cfs.getTracker().subscribe(this);
-        logger.trace("{} subscribed to the data tracker.", this);
-        this.cfs = cfs;
-        this.compactionLogger = new CompactionLogger(cfs, this);
+        compactionLogger = strategyFactory.getCompactionLogger();
         this.boundariesSupplier = boundariesSupplier;
         this.partitionSSTablesByTokenRange = partitionSSTablesByTokenRange;
         params = cfs.metadata().params.compaction;
         enabled = params.isEnabled();
     }
 
-    CompactionLogger compactionLogger()
+    public static CompactionStrategyContainer create(@Nullable CompactionStrategyContainer previous,
+                                                     CompactionStrategyFactory strategyFactory,
+                                                     CompactionParams compactionParams,
+                                                     CompactionStrategyContainer.ReloadReason reason)
     {
-        return compactionLogger;
+        CompactionStrategyManager csm = new CompactionStrategyManager(strategyFactory);
+        csm.reload(previous != null ? previous : csm, compactionParams, reason);
+        return csm;
     }
 
     /**
      * Return the next background task
      *
-     * Returns a task for the compaction strategy that needs it the most (most estimated remaining tasks)
-     */
-    public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+     * Legacy strategies will always return one task but we wrap this in a collection because new strategies
+     * might return multiple tasks.
+     *
+     * @return the tasks of the compaction strategy that needs them the most (most estimated remaining tasks) */
+    @Override
+    public Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
     {
         maybeReloadDiskBoundaries();
         readLock.lock();
         try
         {
             if (!isEnabled())
-                return null;
+                return ImmutableList.of();
 
             int numPartitions = getNumTokenPartitions();
 
             // first try to promote/demote sstables from completed repairs
-            AbstractCompactionTask repairFinishedTask;
-            repairFinishedTask = pendingRepairs.getNextRepairFinishedTask();
-            if (repairFinishedTask != null)
-                return repairFinishedTask;
+            Collection<AbstractCompactionTask> repairFinishedTasks;
+            repairFinishedTasks = pendingRepairs.getNextRepairFinishedTasks();
+            if (!repairFinishedTasks.isEmpty())
+                return repairFinishedTasks;
 
-            repairFinishedTask = transientRepairs.getNextRepairFinishedTask();
-            if (repairFinishedTask != null)
-                return repairFinishedTask;
+            repairFinishedTasks = transientRepairs.getNextRepairFinishedTasks();
+            if (!repairFinishedTasks.isEmpty())
+                return repairFinishedTasks;
 
             // sort compaction task suppliers by remaining tasks descending
-            List<TaskSupplier> suppliers = new ArrayList<>(numPartitions * holders.size());
+            List<TasksSupplier> suppliers = new ArrayList<>(numPartitions * holders.size());
             for (AbstractStrategyHolder holder : holders)
                 suppliers.addAll(holder.getBackgroundTaskSuppliers(gcBefore));
 
             Collections.sort(suppliers);
 
-            // return the first non-null task
-            for (TaskSupplier supplier : suppliers)
+            // return the first non-empty list; this could be enhanced to return all tasks of all
+            // suppliers, but that would change existing behavior
+            for (TasksSupplier supplier : suppliers)
             {
-                AbstractCompactionTask task = supplier.getTask();
-                if (task != null)
-                    return task;
+                Collection<AbstractCompactionTask> tasks = supplier.getTasks();
+                if (!tasks.isEmpty())
+                    return tasks;
             }
 
-            return null;
+            return ImmutableList.of();
         }
         finally
         {
@@ -232,47 +239,25 @@ public AbstractCompactionTask getNextBackgroundTask(int gcBefore)
         }
     }
 
-    /**
-     * finds the oldest (by modification date) non-latest-version sstable on disk and creates an upgrade task for it
-     * @return
-     */
-    @VisibleForTesting
-    @SuppressWarnings("resource") // transaction is closed by AbstractCompactionTask::execute
-    AbstractCompactionTask findUpgradeSSTableTask()
-    {
-        if (!isEnabled() || !DatabaseDescriptor.automaticSSTableUpgrade())
-            return null;
-        Set<SSTableReader> compacting = cfs.getTracker().getCompacting();
-        List<SSTableReader> potentialUpgrade = cfs.getLiveSSTables()
-                                                  .stream()
-                                                  .filter(s -> !compacting.contains(s) && !s.descriptor.version.isLatestVersion())
-                                                  .sorted((o1, o2) -> {
-                                                      File f1 = new File(o1.descriptor.filenameFor(Component.DATA));
-                                                      File f2 = new File(o2.descriptor.filenameFor(Component.DATA));
-                                                      return Longs.compare(f1.lastModified(), f2.lastModified());
-                                                  }).collect(Collectors.toList());
-        for (SSTableReader sstable : potentialUpgrade)
-        {
-            LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.UPGRADE_SSTABLES);
-            if (txn != null)
-            {
-                logger.debug("Running automatic sstable upgrade for {}", sstable);
-                return getCompactionStrategyFor(sstable).createCompactionTask(txn, Integer.MIN_VALUE, Long.MAX_VALUE);
-            }
-        }
-        return null;
+    @Override
+    public CompactionLogger getCompactionLogger()
+    {
+        return compactionLogger;
     }
 
+    @Override
     public boolean isEnabled()
     {
         return enabled && isActive;
     }
 
+    @Override
     public boolean isActive()
     {
         return isActive;
     }
 
+    @Override
     public void resume()
     {
         writeLock.lock();
@@ -291,6 +276,7 @@ public void resume()
      *
      * Separate call from enable/disable to not have to save the enabled-state externally
       */
+    @Override
     public void pause()
     {
         writeLock.lock();
@@ -305,7 +291,8 @@ public void pause()
 
     }
 
-    private void startup()
+    @Override
+    public void startup()
     {
         writeLock.lock();
         try
@@ -317,7 +304,6 @@ private void startup()
             }
             holders.forEach(AbstractStrategyHolder::startup);
             supportsEarlyOpen = repaired.first().supportsEarlyOpen();
-            fanout = (repaired.first() instanceof LeveledCompactionStrategy) ? ((LeveledCompactionStrategy) repaired.first()).getLevelFanoutSize() : LeveledCompactionStrategy.DEFAULT_LEVEL_FANOUT_SIZE;
             maxSSTableSizeBytes = repaired.first().getMaxSSTableBytes();
             name = repaired.first().getName();
         }
@@ -326,25 +312,23 @@ private void startup()
             writeLock.unlock();
         }
 
-        if (repaired.first().logAll)
+        if (repaired.first().getOptions().isLogAll())
             compactionLogger.enable();
     }
 
     /**
-     * return the compaction strategy for the given sstable
-     *
      * returns differently based on the repaired status and which vnode the compaction strategy belongs to
      * @param sstable
-     * @return
+     * @return the compaction strategy for the given sstable
      */
-    public AbstractCompactionStrategy getCompactionStrategyFor(SSTableReader sstable)
+    LegacyAbstractCompactionStrategy getCompactionStrategyFor(SSTableReader sstable)
     {
         maybeReloadDiskBoundaries();
         return compactionStrategyFor(sstable);
     }
 
     @VisibleForTesting
-    AbstractCompactionStrategy compactionStrategyFor(SSTableReader sstable)
+    LegacyAbstractCompactionStrategy compactionStrategyFor(SSTableReader sstable)
     {
         // should not call maybeReloadDiskBoundaries because it may be called from within lock
         readLock.lock();
@@ -380,7 +364,7 @@ int compactionStrategyIndexFor(SSTableReader sstable)
             if (!partitionSSTablesByTokenRange)
                 return 0;
 
-            return currentBoundaries.getDiskIndex(sstable);
+            return currentBoundaries.getDiskIndexFromKey(sstable);
         }
         finally
         {
@@ -425,19 +409,7 @@ PendingRepairHolder getTransientRepairsUnsafe()
         return transientRepairs;
     }
 
-    public boolean hasDataForPendingRepair(UUID sessionID)
-    {
-        readLock.lock();
-        try
-        {
-            return pendingRepairs.hasDataForSession(sessionID) || transientRepairs.hasDataForSession(sessionID);
-        }
-        finally
-        {
-            readLock.unlock();
-        }
-    }
-
+    @Override
     public void shutdown()
     {
         writeLock.lock();
@@ -453,42 +425,6 @@ public void shutdown()
         }
     }
 
-    public void maybeReload(TableMetadata metadata)
-    {
-        // compare the old schema configuration to the new one, ignore any locally set changes.
-        if (metadata.params.compaction.equals(schemaCompactionParams))
-            return;
-
-        writeLock.lock();
-        try
-        {
-            // compare the old schema configuration to the new one, ignore any locally set changes.
-            if (metadata.params.compaction.equals(schemaCompactionParams))
-                return;
-            reload(metadata.params.compaction);
-        }
-        finally
-        {
-            writeLock.unlock();
-        }
-    }
-
-    /**
-     * Version of the above forcing the strategy to always be reloaded. Used by tests that need to clear the state.
-     */
-    public void forceReload()
-    {
-        writeLock.lock();
-        try
-        {
-            reload(schemaCompactionParams);
-        }
-        finally
-        {
-            writeLock.unlock();
-        }
-    }
-
     /**
      * Checks if the disk boundaries changed and reloads the compaction strategies
      * to reflect the most up-to-date disk boundaries.
@@ -512,7 +448,7 @@ protected void maybeReloadDiskBoundaries()
         {
             if (!currentBoundaries.isOutOfDate())
                 return;
-            reload(params);
+            doReload(this, params, ReloadReason.DISK_BOUNDARIES_UPDATED);
         }
         finally
         {
@@ -520,39 +456,67 @@ protected void maybeReloadDiskBoundaries()
         }
     }
 
-    /**
-     * Reload the compaction strategies
-     *
-     * Called after changing configuration and at startup.
-     * @param newCompactionParams
-     */
-    public void reload(CompactionParams newCompactionParams)
+    @Override
+    public CompactionStrategyContainer reload(@Nonnull CompactionStrategyContainer previous, CompactionParams newCompactionParams, ReloadReason reason)
     {
-        boolean enabledWithJMX = enabled && !shouldBeEnabled();
-        boolean disabledWithJMX = !enabled && shouldBeEnabled();
-
-        if (currentBoundaries != null)
+        writeLock.lock();
+        try
+        {
+            doReload(previous, newCompactionParams, reason);
+        }
+        finally
         {
-            if (!newCompactionParams.equals(schemaCompactionParams))
-                logger.debug("Recreating compaction strategy - compaction parameters changed for {}.{}", cfs.keyspace.getName(), cfs.getTableName());
-            else if (currentBoundaries.isOutOfDate())
-                logger.debug("Recreating compaction strategy - disk boundaries are out of date for {}.{}.", cfs.keyspace.getName(), cfs.getTableName());
+            writeLock.unlock();
         }
+        if (previous != this)
+            previous.shutdown();
+
+        return this;
+    }
+
+    private void doReload(CompactionStrategyContainer previous, CompactionParams compactionParams, ReloadReason reason)
+    {
+        boolean updateDiskBoundaries = currentBoundaries == null || currentBoundaries.isOutOfDate();
+        boolean enabledOnReload = CompactionStrategyFactory.enableCompactionOnReload(previous, compactionParams, reason);
+
+        logger.debug("Recreating compaction strategy for {}.{}, reason: {}, params updated: {}, disk boundaries updated: {}, enabled: {}, params: {} -> {}, metadataParams: {}",
+                     cfs.getKeyspaceName(), cfs.getTableName(), reason, !compactionParams.equals(params), updateDiskBoundaries, enabledOnReload, params, compactionParams, metadataParams);
 
-        if (currentBoundaries == null || currentBoundaries.isOutOfDate())
+        if (updateDiskBoundaries)
             currentBoundaries = boundariesSupplier.get();
 
-        setStrategy(newCompactionParams);
-        schemaCompactionParams = cfs.metadata().params.compaction;
+        int numPartitions = getNumTokenPartitions();
+        for (AbstractStrategyHolder holder : holders)
+            holder.setStrategy(compactionParams, numPartitions);
+
+        params = compactionParams;
+
+        // full reload or switch from a strategy not managed by CompactionStrategyManager
+        if (metadataParams == null || reason == ReloadReason.FULL)
+            metadataParams = cfs.metadata().params.compaction;
+        else if (reason == ReloadReason.METADATA_CHANGE)
+            // metadataParams are aligned with compactionParams. We do not access TableParams.COMPACTION to avoid racing with
+            // a concurrent ALTER TABLE metadata change.
+            metadataParams = compactionParams;
+
+        // no-op for DISK_BOUNDARIES_UPDATED and JMX_REQUEST. DISK_BOUNDARIES_UPDATED does not change compaction params
+        // and JMX changes do not affect table metadata
+
 
-        if (disabledWithJMX || !shouldBeEnabled() && !enabledWithJMX)
+        if (params.maxCompactionThreshold() <= 0 || params.minCompactionThreshold() <= 0)
+        {
+            logger.warn("Disabling compaction strategy by setting compaction thresholds to 0 is deprecated, set the compaction option 'enabled' to 'false' instead.");
+            disable();
+        }
+        else if (!enabledOnReload)
             disable();
         else
             enable();
+
         startup();
     }
 
-    private Iterable<AbstractCompactionStrategy> getAllStrategies()
+    private Iterable<CompactionStrategy> getAllStrategies()
     {
         return Iterables.concat(Iterables.transform(holders, AbstractStrategyHolder::allStrategies));
     }
@@ -566,7 +530,7 @@ public int getUnleveledSSTables()
             if (repaired.first() instanceof LeveledCompactionStrategy)
             {
                 int count = 0;
-                for (AbstractCompactionStrategy strategy : getAllStrategies())
+                for (CompactionStrategy strategy : getAllStrategies())
                     count += ((LeveledCompactionStrategy) strategy).getLevelSize(0);
                 return count;
             }
@@ -578,11 +542,13 @@ public int getUnleveledSSTables()
         return 0;
     }
 
+    @Override
     public int getLevelFanoutSize()
     {
-        return fanout;
+        return repaired.first().getLevelFanoutSize();
     }
 
+    @Override
     public int[] getSSTableCountPerLevel()
     {
         maybeReloadDiskBoundaries();
@@ -592,19 +558,22 @@ public int[] getSSTableCountPerLevel()
             if (repaired.first() instanceof LeveledCompactionStrategy)
             {
                 int[] res = new int[LeveledGenerations.MAX_LEVEL_COUNT];
-                for (AbstractCompactionStrategy strategy : getAllStrategies())
+                for (CompactionStrategy strategy : getAllStrategies())
                 {
                     int[] repairedCountPerLevel = ((LeveledCompactionStrategy) strategy).getAllLevelSize();
                     res = sumArrays(res, repairedCountPerLevel);
                 }
                 return res;
             }
+            else
+            {
+                return new int[0];
+            }
         }
         finally
         {
             readLock.unlock();
         }
-        return null;
     }
 
     static int[] sumArrays(int[] a, int[] b)
@@ -744,7 +713,7 @@ private void handleRepairStatusChangedNotification(Iterable<SSTableReader> sstab
      */
     private void handleMetadataChangedNotification(SSTableReader sstable, StatsMetadata oldMetadata)
     {
-        AbstractCompactionStrategy acs = getCompactionStrategyFor(sstable);
+        LegacyAbstractCompactionStrategy acs = getCompactionStrategyFor(sstable);
         acs.metadataChanged(oldMetadata, sstable);
     }
 
@@ -795,6 +764,7 @@ else if (notification instanceof SSTableMetadataChanged)
         }
     }
 
+    @Override
     public void enable()
     {
         writeLock.lock();
@@ -809,6 +779,7 @@ public void enable()
         }
     }
 
+    @Override
     public void disable()
     {
         writeLock.lock();
@@ -831,7 +802,7 @@ public void disable()
      * @return
      */
     @SuppressWarnings("resource")
-    public AbstractCompactionStrategy.ScannerList maybeGetScanners(Collection<SSTableReader> sstables,  Collection<Range<Token>> ranges)
+    private ScannerList maybeGetScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
     {
         maybeReloadDiskBoundaries();
         List<ISSTableScanner> scanners = new ArrayList<>(sstables.size());
@@ -855,10 +826,11 @@ public AbstractCompactionStrategy.ScannerList maybeGetScanners(Collection<SSTabl
         {
             readLock.unlock();
         }
-        return new AbstractCompactionStrategy.ScannerList(scanners);
+        return new ScannerList(scanners);
     }
 
-    public AbstractCompactionStrategy.ScannerList getScanners(Collection<SSTableReader> sstables,  Collection<Range<Token>> ranges)
+    @Override
+    public ScannerList getScanners(Collection<SSTableReader> sstables,  Collection<Range<Token>> ranges)
     {
         while (true)
         {
@@ -873,11 +845,21 @@ public AbstractCompactionStrategy.ScannerList getScanners(Collection<SSTableRead
         }
     }
 
-    public AbstractCompactionStrategy.ScannerList getScanners(Collection<SSTableReader> sstables)
+    @Override
+    public ScannerList getScanners(Collection<SSTableReader> sstables)
     {
         return getScanners(sstables, null);
     }
 
+    @Override
+    public Set<SSTableReader> getSSTables()
+    {
+        return getStrategies().stream()
+                              .flatMap(strategy -> strategy.getSSTables().stream())
+                              .collect(Collectors.toSet());
+    }
+
+    @Override
     public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
     {
         maybeReloadDiskBoundaries();
@@ -892,12 +874,14 @@ public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Coll
         }
     }
 
+    @Override
     public long getMaxSSTableBytes()
     {
         return maxSSTableSizeBytes;
     }
 
-    public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
+    @Override
+    public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
     {
         maybeReloadDiskBoundaries();
         readLock.lock();
@@ -910,7 +894,6 @@ public AbstractCompactionTask getCompactionTask(LifecycleTransaction txn, int gc
         {
             readLock.unlock();
         }
-
     }
 
     private void validateForCompaction(Iterable<SSTableReader> input)
@@ -940,6 +923,7 @@ private void validateForCompaction(Iterable<SSTableReader> input)
         }
     }
 
+    @Override
     public CompactionTasks getMaximalTasks(final int gcBefore, final boolean splitOutput)
     {
         maybeReloadDiskBoundaries();
@@ -973,6 +957,7 @@ public CompactionTasks getMaximalTasks(final int gcBefore, final boolean splitOu
      * @param gcBefore gc grace period, throw away tombstones older than this
      * @return a list of compaction tasks corresponding to the sstables requested
      */
+    @Override
     public CompactionTasks getUserDefinedTasks(Collection<SSTableReader> sstables, int gcBefore)
     {
         maybeReloadDiskBoundaries();
@@ -993,39 +978,37 @@ public CompactionTasks getUserDefinedTasks(Collection<SSTableReader> sstables, i
         }
     }
 
+    @Override
     public int getEstimatedRemainingTasks()
     {
-        maybeReloadDiskBoundaries();
-        int tasks = 0;
-        readLock.lock();
-        try
-        {
-            for (AbstractCompactionStrategy strategy : getAllStrategies())
-                tasks += strategy.getEstimatedRemainingTasks();
-        }
-        finally
-        {
-            readLock.unlock();
-        }
-        return tasks;
+        return getStrategies(false).stream()
+                                   .flatMap(list -> list.stream())
+                                   .mapToInt(CompactionStrategy::getEstimatedRemainingTasks)
+                                   .sum();
     }
 
-    public boolean shouldBeEnabled()
+    @Override
+    public int getTotalCompactions()
     {
-        return params.isEnabled();
+        return getStrategies(false).stream()
+                                   .flatMap(list -> list.stream())
+                                   .mapToInt(CompactionStrategy::getTotalCompactions)
+                                   .sum();
     }
 
+    @Override
     public String getName()
     {
         return name;
     }
 
-    public List<List<AbstractCompactionStrategy>> getStrategies()
+    @Override
+    public List<CompactionStrategy> getStrategies()
     {
-        return getStrategies(true);
+        return getStrategies(true).stream().flatMap(List::stream).collect(Collectors.toList());
     }
 
-    private List<List<AbstractCompactionStrategy>> getStrategies(boolean checkBoundaries)
+    private List<List<CompactionStrategy>> getStrategies(boolean checkBoundaries)
     {
         if (checkBoundaries)
             maybeReloadDiskBoundaries();
@@ -1043,60 +1026,57 @@ private List<List<AbstractCompactionStrategy>> getStrategies(boolean checkBounda
         }
     }
 
-    /**
-     * @return the statistics for the compaction strategies that have compactions in progress or pending
-     */
-    public List<CompactionStrategyStatistics> getStrategyStatistics()
-    {
-        return getStrategies(false).stream()
-                                   .flatMap(list -> list.stream())
-                                   .filter(strategy -> strategy.getTotalCompactions() > 0)
-                                   .map(AbstractCompactionStrategy::getStatistics)
-                                   .collect(Collectors.toList());
-    }
-
-    public void setNewLocalCompactionStrategy(CompactionParams params)
+    @Override
+    public List<CompactionStrategy> getStrategies(boolean isRepaired, @Nullable UUID pendingRepair)
     {
-        logger.info("Switching local compaction strategy from {} to {}}", this.params, params);
-        writeLock.lock();
+        readLock.lock();
         try
         {
-            setStrategy(params);
-            if (shouldBeEnabled())
-                enable();
+            if (isRepaired)
+                return Lists.newArrayList(repaired.allStrategies());
+            else if (pendingRepair != null)
+                return Lists.newArrayList(pendingRepairs.getStrategiesFor(pendingRepair));
             else
-                disable();
-            startup();
+                return Lists.newArrayList(unrepaired.allStrategies());
         }
         finally
         {
-            writeLock.unlock();
+            readLock.unlock();
         }
     }
 
-    private int getNumTokenPartitions()
+    /**
+     * @return the statistics for the compaction strategies that have compactions in progress or pending
+     */
+    @Override
+    public List<CompactionStrategyStatistics> getStatistics()
     {
-        return partitionSSTablesByTokenRange ? currentBoundaries.directories.size() : 1;
+        return getStrategies(false).stream()
+                                   .flatMap(list -> list.stream())
+                                   .filter(strategy -> strategy.getTotalCompactions() > 0)
+                                   .map(CompactionStrategy::getStatistics)
+                                   .flatMap(List::stream)
+                                   .collect(Collectors.toList());
     }
 
-    private void setStrategy(CompactionParams params)
+    private int getNumTokenPartitions()
     {
-        int numPartitions = getNumTokenPartitions();
-        for (AbstractStrategyHolder holder : holders)
-            holder.setStrategy(params, numPartitions);
-        this.params = params;
+        return partitionSSTablesByTokenRange && currentBoundaries != null ? currentBoundaries.directories.size() : 1;
     }
 
+    @Override
     public CompactionParams getCompactionParams()
     {
         return params;
     }
 
-    public boolean onlyPurgeRepairedTombstones()
+    @Override
+    public CompactionParams getMetadataCompactionParams()
     {
-        return Boolean.parseBoolean(params.options().get(AbstractCompactionStrategy.ONLY_PURGE_REPAIRED_TOMBSTONES));
+        return metadataParams;
     }
 
+    @Override
     public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                        long keyCount,
                                                        long repairedAt,
@@ -1128,121 +1108,49 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
         }
     }
 
-    public boolean isRepaired(AbstractCompactionStrategy strategy)
-    {
-        return repaired.getStrategyIndex(strategy) >= 0;
-    }
-
-    public List<String> getStrategyFolders(AbstractCompactionStrategy strategy)
-    {
-        readLock.lock();
-        try
-        {
-            Directories.DataDirectory[] locations = cfs.getDirectories().getWriteableLocations();
-            if (partitionSSTablesByTokenRange)
-            {
-                for (AbstractStrategyHolder holder : holders)
-                {
-                    int idx = holder.getStrategyIndex(strategy);
-                    if (idx >= 0)
-                        return Collections.singletonList(locations[idx].location.getAbsolutePath());
-                }
-            }
-            List<String> folders = new ArrayList<>(locations.length);
-            for (Directories.DataDirectory location : locations)
-            {
-                folders.add(location.location.getAbsolutePath());
-            }
-            return folders;
-        }
-        finally
-        {
-            readLock.unlock();
-        }
-    }
-
+    @Override
     public boolean supportsEarlyOpen()
     {
         return supportsEarlyOpen;
     }
 
-    @VisibleForTesting
-    List<PendingRepairManager> getPendingRepairManagers()
+    public ReentrantReadWriteLock.WriteLock getWriteLock()
     {
-        maybeReloadDiskBoundaries();
-        readLock.lock();
-        try
-        {
-            return Lists.newArrayList(pendingRepairs.getManagers());
-        }
-        finally
-        {
-            readLock.unlock();
-        }
+        return this.writeLock;
     }
 
     /**
-     * Mutates sstable repairedAt times and notifies listeners of the change with the writeLock held. Prevents races
-     * with other processes between when the metadata is changed and when sstables are moved between strategies.
-      */
-    public void mutateRepaired(Collection<SSTableReader> sstables, long repairedAt, UUID pendingRepair, boolean isTransient) throws IOException
+     * This method is exposed for testing only
+     * @return the LocalSession sessionIDs of any pending repairs
+     */
+    @VisibleForTesting
+    public Set<UUID> pendingRepairs()
     {
-        Set<SSTableReader> changed = new HashSet<>();
-
-        writeLock.lock();
-        try
-        {
-            for (SSTableReader sstable: sstables)
-            {
-                sstable.mutateRepairedAndReload(repairedAt, pendingRepair, isTransient);
-                verifyMetadata(sstable, repairedAt, pendingRepair, isTransient);
-                changed.add(sstable);
-            }
-        }
-        finally
-        {
-            try
-            {
-                // if there was an exception mutating repairedAt, we should still notify for the
-                // sstables that we were able to modify successfully before releasing the lock
-                cfs.getTracker().notifySSTableRepairedStatusChanged(changed);
-            }
-            finally
-            {
-                writeLock.unlock();
-            }
-        }
+        Set<UUID> ids = new HashSet<>();
+        pendingRepairs.getManagers().forEach(p -> ids.addAll(p.getSessions()));
+        return ids;
     }
 
-    private static void verifyMetadata(SSTableReader sstable, long repairedAt, UUID pendingRepair, boolean isTransient)
+    @Override
+    public void repairSessionCompleted(UUID sessionID)
     {
-        if (!Objects.equals(pendingRepair, sstable.getPendingRepair()))
-            throw new IllegalStateException(String.format("Failed setting pending repair to %s on %s (pending repair is %s)", pendingRepair, sstable, sstable.getPendingRepair()));
-        if (repairedAt != sstable.getRepairedAt())
-            throw new IllegalStateException(String.format("Failed setting repairedAt to %d on %s (repairedAt is %d)", repairedAt, sstable, sstable.getRepairedAt()));
-        if (isTransient != sstable.isTransient())
-            throw new IllegalStateException(String.format("Failed setting isTransient to %b on %s (isTransient is %b)", isTransient, sstable, sstable.isTransient()));
+        for (PendingRepairManager manager : pendingRepairs.getManagers())
+            manager.removeSessionIfEmpty(sessionID);
     }
 
-    public CleanupSummary releaseRepairData(Collection<UUID> sessions)
+    //
+    // CompactionObserver - because the strategies observe compactions, for CSM this is currently a no-op
+    //
+
+    @Override
+    public void onInProgress(CompactionProgress progress)
     {
-        List<CleanupTask> cleanupTasks = new ArrayList<>();
-        readLock.lock();
-        try
-        {
-            for (PendingRepairManager prm : Iterables.concat(pendingRepairs.getManagers(), transientRepairs.getManagers()))
-                cleanupTasks.add(prm.releaseSessionData(sessions));
-        }
-        finally
-        {
-            readLock.unlock();
-        }
 
-        CleanupSummary summary = new CleanupSummary(cfs, Collections.emptySet(), Collections.emptySet());
+    }
 
-        for (CleanupTask task : cleanupTasks)
-            summary = CleanupSummary.add(summary, task.cleanup());
+    @Override
+    public void onCompleted(UUID id)
+    {
 
-        return summary;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyOptions.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyOptions.java
new file mode 100644
index 000000000000..b4f3871edcf6
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyOptions.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.lang.reflect.InvocationTargetException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Objects;
+
+import com.google.common.base.MoreObjects;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.utils.Throwables;
+
+import static java.lang.String.format;
+
+/**
+ * This class contains all compaction options that are shared by all strategies.
+ */
+public class CompactionStrategyOptions
+{
+    public static final int DEFAULT_MIN_THRESHOLD = 4;
+    public static final int DEFAULT_MAX_THRESHOLD = 32;
+    private static final Logger logger = LoggerFactory.getLogger(CompactionStrategyOptions.class);
+
+    public static final Map<String, String> DEFAULT_THRESHOLDS =
+    ImmutableMap.of(CompactionParams.Option.MIN_THRESHOLD.toString(), Integer.toString(DEFAULT_MIN_THRESHOLD),
+                    CompactionParams.Option.MAX_THRESHOLD.toString(), Integer.toString(DEFAULT_MAX_THRESHOLD));
+
+    public static final String ONLY_PURGE_REPAIRED_TOMBSTONES = "only_purge_repaired_tombstones";
+
+    public static final String DEFAULT_TOMBSTONE_THRESHOLD = "0.2";
+    // minimum interval needed to perform tombstone removal compaction in seconds, default 86400 or 1 day.
+    public static final String DEFAULT_TOMBSTONE_COMPACTION_INTERVAL = "86400";
+    public static final String DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION = "false";
+    public static final String DEFAULT_LOG_ALL_OPTION = "false";
+
+    public static final String TOMBSTONE_THRESHOLD_OPTION = "tombstone_threshold";
+    public static final String TOMBSTONE_COMPACTION_INTERVAL_OPTION = "tombstone_compaction_interval";
+    // disable range overlap check when deciding if an SSTable is candidate for tombstone compaction (CASSANDRA-6563)
+    public static final String UNCHECKED_TOMBSTONE_COMPACTION_OPTION = "unchecked_tombstone_compaction";
+    public static final String LOG_ALL_OPTION = "log_all";
+    public static final String COMPACTION_ENABLED = "enabled";
+
+    private final Class<? extends CompactionStrategy> klass;
+    private final Map<String, String> options;
+    private final float tombstoneThreshold;
+    private final long tombstoneCompactionInterval;
+    private final boolean uncheckedTombstoneCompaction;
+    private boolean disableTombstoneCompactions = false;
+    private final boolean logAll;
+
+    /**
+     * Copies and validates the options of the given strategy, then parses the tombstone and logging settings.
+     *
+     * @param throwOnInvalidOption whether invalid options are rethrown, or logged so that the parsed settings take their defaults
+     */
+    public CompactionStrategyOptions(Class<? extends CompactionStrategy> klass, Map<String, String> options, boolean throwOnInvalidOption)
+    {
+        this.klass = klass;
+        this.options = copyOptions(klass, options);
+
+        boolean fallBackToDefaults = false;
+        try
+        {
+            validate();
+        }
+        catch (ConfigurationException invalid)
+        {
+            // CompactionParams wants the exception, AbstractCompactionStrategy wants the defaults
+            if (throwOnInvalidOption)
+                throw invalid;
+
+            logger.warn("Error setting compaction strategy options ({}), defaults will be used", invalid.getMessage());
+            fallBackToDefaults = true;
+        }
+
+        tombstoneThreshold = Float.parseFloat(getOption(TOMBSTONE_THRESHOLD_OPTION, fallBackToDefaults, DEFAULT_TOMBSTONE_THRESHOLD));
+        tombstoneCompactionInterval = Long.parseLong(getOption(TOMBSTONE_COMPACTION_INTERVAL_OPTION, fallBackToDefaults, DEFAULT_TOMBSTONE_COMPACTION_INTERVAL));
+        uncheckedTombstoneCompaction = Boolean.parseBoolean(getOption(UNCHECKED_TOMBSTONE_COMPACTION_OPTION, fallBackToDefaults, DEFAULT_UNCHECKED_TOMBSTONE_COMPACTION_OPTION));
+        logAll = Boolean.parseBoolean(getOption(LOG_ALL_OPTION, fallBackToDefaults, DEFAULT_LOG_ALL_OPTION));
+    }
+
+    /**
+     * Returns a mutable copy of the given options, completed with the default thresholds where applicable.
+     */
+    private Map<String, String> copyOptions(Class<? extends CompactionStrategy> klass, Map<String, String> options)
+    {
+        Map<String, String> copy = new HashMap<>(options);
+        if (!supportsThresholdParams(klass))
+            return copy;
+
+        // Legacy compatibility: strategies that support the min and max thresholds expose them in the compaction
+        // parameters visible through CQL in the schema tables, so add the defaults the user has not specified
+        DEFAULT_THRESHOLDS.forEach(copy::putIfAbsent);
+
+        return copy;
+    }
+
+    /**
+     * Tells whether a strategy understands the minimum and maximum thresholds. This is established by
+     * handing {@link #DEFAULT_THRESHOLDS} to the strategy's static {@code validateOptions(Map)} and
+     * checking that none of them comes back as unrecognized.
+     * All strategies except {@link UnifiedCompactionStrategy} support the minimum and maximum thresholds.
+     */
+    @SuppressWarnings("unchecked")
+    public static boolean supportsThresholdParams(Class<? extends CompactionStrategy> klass)
+    {
+        try
+        {
+            Object unrecognized = klass.getMethod("validateOptions", Map.class).invoke(null, DEFAULT_THRESHOLDS);
+            return ((Map<String, String>) unrecognized).isEmpty();
+        }
+        catch (Exception e)
+        {
+            throw Throwables.cleaned(e);
+        }
+    }
+
+    /**
+     * Looks up an option, answering the given default when defaults are forced
+     * or when the option has no value in the map.
+     */
+    private String getOption(String optionName, boolean useDefault, String defaultValue)
+    {
+        String configured = useDefault ? null : options.get(optionName);
+        return configured == null
+               ? defaultValue
+               : configured;
+    }
+
+    /** Renders the strategy class name followed by the option map. */
+    @Override
+    public String toString()
+    {
+        MoreObjects.ToStringHelper description = MoreObjects.toStringHelper(this);
+        description.add("class", klass.getName()).add("options", options);
+        return description.toString();
+    }
+
+    /**
+     * Equality is based on the strategy class and the option map only.
+     */
+    @Override
+    public boolean equals(Object o)
+    {
+        if (!(o instanceof CompactionStrategyOptions))
+            return false;
+
+        CompactionStrategyOptions other = (CompactionStrategyOptions) o;
+        return this == other
+               || (klass.equals(other.klass) && options.equals(other.options));
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(klass, options); // the same two fields that equals() compares
+    }
+
+    /**
+     * Validates the options through the strategy's static {@code validateOptions(Map)}, then checks the thresholds.
+     * @return the options, once validated
+     * @throws ConfigurationException if an option is not understood or invalid, or a threshold is out of range
+     */
+    private Map<String, String> validate()
+    {
+        try
+        {
+            // Each strategy currently implements a static validateOptions() method for custom validation, the default behavior
+            // is to simply call validateOptions() below, through AbstractCompactionStrategy.validateOptions(), we could simplify
+            // all this assuming we don't need to support any user-defined compaction strategy
+            Map<String, String> unknownOptions = (Map<String, String>) klass.getMethod("validateOptions", Map.class).invoke(null, options);
+            if (!unknownOptions.isEmpty())
+            {
+                throw new ConfigurationException(format("Properties specified %s are not understood by %s",
+                                                        unknownOptions.keySet(),
+                                                        klass.getSimpleName()));
+            }
+            // deliberately no return here: the threshold checks below apply to every strategy
+        }
+        catch (NoSuchMethodException e)
+        {
+            logger.warn("Compaction strategy {} does not have a static validateOptions method. Validation ignored", klass.getName());
+        }
+        catch (InvocationTargetException e)
+        {
+            if (e.getTargetException() instanceof ConfigurationException)
+                throw (ConfigurationException) e.getTargetException();
+
+            Throwable cause = e.getCause() == null ? e : e.getCause();
+            throw new ConfigurationException(format("%s.validateOptions() threw an error: %s %s",
+                                                    klass.getName(),
+                                                    cause.getClass().getName(),
+                                                    cause.getMessage()),
+                                             e);
+        }
+        catch (IllegalAccessException e)
+        {
+            throw new ConfigurationException("Cannot access method validateOptions in " + klass.getName(), e);
+        }
+
+        if (minCompactionThreshold() <= 0 || maxCompactionThreshold() <= 0)
+        {
+            throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been removed,"
+                                             + " set the compaction option 'enabled' to false instead.");
+        }
+
+        if (minCompactionThreshold() <= 1)
+        {
+            throw new ConfigurationException(format("Min compaction threshold cannot be less than 2 (got %d)", minCompactionThreshold()));
+        }
+
+        if (minCompactionThreshold() > maxCompactionThreshold())
+        {
+            throw new ConfigurationException(format("Min compaction threshold (got %d) cannot be greater than max compaction threshold (got %d)",
+                                                    minCompactionThreshold(),
+                                                    maxCompactionThreshold()));
+        }
+
+        return options;
+    }
+
+    /**
+     * Validates the values of the options that are shared by all strategies.
+     *
+     * @param options the raw compaction options
+     * @return a copy of {@code options} without the tombstone, logging and enabled options; the thresholds are left in
+     * @throws ConfigurationException if one of the shared options has an invalid value
+     */
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        for (CompactionParams.Option option : new CompactionParams.Option[]{ CompactionParams.Option.MIN_THRESHOLD, CompactionParams.Option.MAX_THRESHOLD })
+        {
+            String value = options.get(option.toString());
+            if (value == null)
+                continue;
+
+            try
+            {
+                // digits only, and small enough for the Integer.parseInt() in minCompactionThreshold() / maxCompactionThreshold()
+                if (!StringUtils.isNumeric(value))
+                    throw new NumberFormatException(value);
+                Integer.parseInt(value);
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(format("Invalid value %s for '%s' compaction sub-option - must be an integer", value, option), e);
+            }
+        }
+
+        String threshold = options.get(TOMBSTONE_THRESHOLD_OPTION);
+        if (threshold != null)
+        {
+            try
+            {
+                float thresholdValue = Float.parseFloat(threshold);
+                if (thresholdValue < 0)
+                    throw new ConfigurationException(String.format("%s must be greater than 0, but was %f", TOMBSTONE_THRESHOLD_OPTION, thresholdValue));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", threshold, TOMBSTONE_THRESHOLD_OPTION), e);
+            }
+        }
+
+        String interval = options.get(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
+        if (interval != null)
+        {
+            try
+            {
+                long tombstoneCompactionInterval = Long.parseLong(interval);
+                if (tombstoneCompactionInterval < 0)
+                    throw new ConfigurationException(String.format("%s must be greater than 0, but was %d", TOMBSTONE_COMPACTION_INTERVAL_OPTION, tombstoneCompactionInterval));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format("%s is not a parsable int (base10) for %s", interval, TOMBSTONE_COMPACTION_INTERVAL_OPTION), e);
+            }
+        }
+
+        String unchecked = options.get(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
+        if (unchecked != null && !unchecked.equalsIgnoreCase("true") && !unchecked.equalsIgnoreCase("false"))
+            throw new ConfigurationException(String.format("'%s' should be either 'true' or 'false', not '%s'", UNCHECKED_TOMBSTONE_COMPACTION_OPTION, unchecked));
+
+        String logAll = options.get(LOG_ALL_OPTION);
+        if (logAll != null && !logAll.equalsIgnoreCase("true") && !logAll.equalsIgnoreCase("false"))
+            throw new ConfigurationException(String.format("'%s' should either be 'true' or 'false', not %s", LOG_ALL_OPTION, logAll));
+
+        String compactionEnabled = options.get(COMPACTION_ENABLED);
+        if (compactionEnabled != null && !compactionEnabled.equalsIgnoreCase("true") && !compactionEnabled.equalsIgnoreCase("false"))
+            throw new ConfigurationException(String.format("enabled should either be 'true' or 'false', not %s", compactionEnabled));
+
+        Map<String, String> uncheckedOptions = new HashMap<>(options);
+        uncheckedOptions.remove(TOMBSTONE_THRESHOLD_OPTION);
+        uncheckedOptions.remove(TOMBSTONE_COMPACTION_INTERVAL_OPTION);
+        uncheckedOptions.remove(UNCHECKED_TOMBSTONE_COMPACTION_OPTION);
+        uncheckedOptions.remove(LOG_ALL_OPTION);
+        uncheckedOptions.remove(COMPACTION_ENABLED);
+        uncheckedOptions.remove(ONLY_PURGE_REPAIRED_TOMBSTONES);
+        uncheckedOptions.remove(CompactionParams.Option.PROVIDE_OVERLAPPING_TOMBSTONES.toString());
+        return uncheckedOptions;
+    }
+
+    /** Returns the configured minimum threshold, or {@link #DEFAULT_MIN_THRESHOLD} when the option is not set. */
+    public int minCompactionThreshold()
+    {
+        String key = CompactionParams.Option.MIN_THRESHOLD.toString();
+        String configured = options.get(key);
+        return configured != null ? Integer.parseInt(configured) : DEFAULT_MIN_THRESHOLD;
+    }
+
+    /** Returns the configured maximum threshold, or {@link #DEFAULT_MAX_THRESHOLD} when the option is not set. */
+    public int maxCompactionThreshold()
+    {
+        String key = CompactionParams.Option.MAX_THRESHOLD.toString();
+        String configured = options.get(key);
+        return configured != null ? Integer.parseInt(configured) : DEFAULT_MAX_THRESHOLD;
+    }
+
+    public Class<? extends CompactionStrategy> klass()
+    {
+        return klass; // the strategy class given to the constructor
+    }
+
+    public Map<String, String> getOptions()
+    {
+        return options; // the internal HashMap built by copyOptions(), not a defensive copy
+    }
+
+    public float getTombstoneThreshold()
+    {
+        return tombstoneThreshold; // parsed in the constructor from tombstone_threshold, or from DEFAULT_TOMBSTONE_THRESHOLD
+    }
+
+    public long getTombstoneCompactionInterval()
+    {
+        return tombstoneCompactionInterval; // in seconds; parsed from tombstone_compaction_interval, or from its default
+    }
+
+    public boolean isUncheckedTombstoneCompaction()
+    {
+        return uncheckedTombstoneCompaction; // parsed from unchecked_tombstone_compaction, or from its default
+    }
+
+    public boolean isDisableTombstoneCompactions()
+    {
+        return disableTombstoneCompactions; // false unless changed through setDisableTombstoneCompactions()
+    }
+
+    /**
+     * {@link DateTieredCompactionStrategy} and {@link TimeWindowCompactionStrategy} disable this
+     * parameter if other parameters aren't available.
+     */
+    public void setDisableTombstoneCompactions(boolean disableTombstoneCompactions)
+    {
+        this.disableTombstoneCompactions = disableTombstoneCompactions; // the only non-final field of this class
+    }
+
+    public boolean isLogAll()
+    {
+        return logAll; // parsed in the constructor from log_all, or from DEFAULT_LOG_ALL_OPTION
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
index e1d24d044f11..307810b912ee 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyStatistics.java
@@ -22,7 +22,9 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Iterator;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.ImmutableList;
@@ -92,31 +94,51 @@ public String toString()
         if (!aggregates.isEmpty())
         {
             Collection<String> header = aggregates.get(0).header(); // all headers are identical
-            String[][] rows = new String[1 + aggregates.size()][header.size()]; // rows including the header
             int[] lengths = new int[header.size()]; // the max lengths of each column
-
             Iterator<String> it = header.iterator();
+
             for (int i = 0; i < lengths.length; i++)
-            {
-                rows[0][i] = it.next();
-                lengths[i] = rows[0][i].length();
-            }
+                lengths[i] = it.next().length();
 
-            for (int idx = 1; idx <= aggregates.size(); idx++)
+            Map<String, List<String[]>> rowsByShard = new LinkedHashMap<>();
+            for (CompactionAggregateStatistics aggregate : aggregates)
             {
-                it = aggregates.get(idx-1).data().iterator();
+                String shard = aggregate.shard();
+                List<String[]> rows = rowsByShard.computeIfAbsent(shard, key -> new ArrayList<>(aggregates.size()));
+                String[] data = new String[header.size()];
+
+                it = aggregate.data().iterator();
                 for (int i = 0; i < lengths.length; i++)
                 {
-                    rows[idx][i] = it.next();
-                    if (rows[idx][i].length() > lengths[i])
-                        lengths[i] = rows[idx][i].length();
+                    data[i] = it.next();
+                    if (data[i].length() > lengths[i])
+                        lengths[i] = data[i].length();
                 }
+
+                rows.add(data);
             }
 
-            for (String[] row : rows)
+            for (Map.Entry<String, List<String[]>> entry : rowsByShard.entrySet())
             {
-                for (int i = 0; i < row.length; i++)
-                    ret.append(String.format("%-" + lengths[i] + "s\t", row[i]));
+                // optional shard
+                if (!entry.getKey().isEmpty())
+                    ret.append("Shard/").append(entry.getKey()).append('\n');
+
+                // header
+                it = header.iterator();
+                for (int i = 0; i < header.size(); i++)
+                    ret.append(String.format("%-" + lengths[i] + "s\t", it.next()));
+
+                ret.append('\n');
+
+                // rows
+                for (String[] row : entry.getValue())
+                {
+                    for (int i = 0; i < row.length; i++)
+                        ret.append(String.format("%-" + lengths[i] + "s\t", row[i]));
+
+                    ret.append('\n');
+                }
 
                 ret.append('\n');
             }
diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
index b93fb301dfaf..51582fbf4615 100644
--- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java
@@ -46,6 +46,7 @@
 import org.apache.cassandra.db.compaction.writers.DefaultCompactionWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.schema.CompactionParams;
@@ -70,55 +71,23 @@ public class CompactionTask extends AbstractCompactionTask
 
     // The compaction strategy is not necessarily available for all compaction tasks (e.g. GC or sstable splitting)
     @Nullable
-    private final AbstractCompactionStrategy strategy;
+    private final CompactionStrategy strategy;
 
-    /**
-     * This constructs a compaction tasks that operations that do not normally have a compaction strategy, such as tombstone
-     * collection or table splitting, also tests.
-     */
-    protected CompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean keepOriginals)
-    {
-        this(cfs, txn, gcBefore, keepOriginals, CompactionObserver.NO_OP, null);
-    }
-
-    /**
-     * This constructs a compaction task that has been created by a compaction strategy.
-     */
-    protected CompactionTask(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore, boolean keepOriginals)
-    {
-        this(strategy.cfs, txn, gcBefore, keepOriginals, strategy == null ? CompactionObserver.NO_OP : strategy.getBackgroundCompactions(), strategy);
-    }
-
-    private CompactionTask(ColumnFamilyStore cfs,
-                           LifecycleTransaction txn,
-                           int gcBefore,
-                           boolean keepOriginals,
-                           CompactionObserver compObserver,
-                           @Nullable AbstractCompactionStrategy strategy)
+    public CompactionTask(ColumnFamilyStore cfs,
+                          LifecycleTransaction txn,
+                          int gcBefore,
+                          boolean keepOriginals,
+                          @Nullable CompactionStrategy strategy)
     {
         super(cfs, txn);
         this.gcBefore = gcBefore;
         this.keepOriginals = keepOriginals;
-        this.compObserver = compObserver;
         this.strategy = strategy;
 
-        logger.debug("Created compaction task with id {} and strategy {}", txn.opId(), strategy);
-    }
+        if (strategy != null)
+            addObserver(strategy);
 
-    /**
-     * Create a compaction task for a generic compaction strategy.
-     */
-    public static AbstractCompactionTask forCompaction(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
-    {
-        return new CompactionTask(strategy, txn, gcBefore, false);
-    }
-
-    /**
-     * Create a compaction task for {@link TimeWindowCompactionStrategy}.
-     */
-    static AbstractCompactionTask forTimeWindowCompaction(TimeWindowCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
-    {
-        return new TimeWindowCompactionTask(strategy, txn, gcBefore, strategy.ignoreOverlaps());
+        logger.debug("Created compaction task with id {} and strategy {}", txn.opId(), strategy);
     }
 
     /**
@@ -126,15 +95,7 @@ static AbstractCompactionTask forTimeWindowCompaction(TimeWindowCompactionStrate
      */
     static AbstractCompactionTask forTesting(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore)
     {
-        return new CompactionTask(cfs, txn, gcBefore, false);
-    }
-
-    /**
-     * Create a compaction task without a compaction strategy, currently only called by tests.
-     */
-    static AbstractCompactionTask forTesting(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, CompactionObserver compObserver)
-    {
-        return new CompactionTask(cfs, txn, gcBefore, false, compObserver, null);
+        return new CompactionTask(cfs, txn, gcBefore, false, null);
     }
 
     /**
@@ -142,7 +103,7 @@ static AbstractCompactionTask forTesting(ColumnFamilyStore cfs, LifecycleTransac
      */
     public static AbstractCompactionTask forGarbageCollection(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, CompactionParams.TombstoneOption tombstoneOption)
     {
-        AbstractCompactionTask task = new CompactionTask(cfs, txn, gcBefore, false)
+        AbstractCompactionTask task = new CompactionTask(cfs, txn, gcBefore, false, null)
         {
             @Override
             protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
@@ -225,11 +186,11 @@ protected void runMayThrow() throws Exception
     public final class CompactionOperation implements AutoCloseable
     {
         private final CompactionController controller;
-        private final CompactionStrategyManager strategyManager;
+        private final CompactionStrategy strategy;
         private final Set<SSTableReader> fullyExpiredSSTables;
         private final UUID taskId;
         private final RateLimiter limiter;
-        private final long start;
+        private final long startNanos;
         private final long startTime;
         private final Set<SSTableReader> actuallyCompact;
         private final CompactionProgress progress;
@@ -244,7 +205,7 @@ public final class CompactionOperation implements AutoCloseable
 
         // resources that need closing
         private Refs<SSTableReader> sstableRefs;
-        private AbstractCompactionStrategy.ScannerList scanners;
+        private ScannerList scanners;
         private CompactionIterator compactionIterator;
         private TableOperation op;
         private Closeable obsCloseable;
@@ -259,9 +220,7 @@ private CompactionOperation(CompactionController controller)
         {
             this.controller = controller;
 
-            // Note that the current compaction strategy, is not necessarily the one this task was created under.
-            // This should be harmless; see comments to CFS.maybeReloadCompactionStrategy.
-            this.strategyManager = cfs.getCompactionStrategyManager();
+            this.strategy = cfs.getCompactionStrategy();
             this.fullyExpiredSSTables = controller.getFullyExpiredSSTables();
             this.taskId = transaction.opId();
 
@@ -272,7 +231,7 @@ private CompactionOperation(CompactionController controller)
             assert !Iterables.any(transaction.originals(), sstable -> !sstable.descriptor.cfname.equals(cfs.name));
 
             this.limiter = CompactionManager.instance.getRateLimiter();
-            this.start = System.nanoTime();
+            this.startNanos = System.nanoTime();
             this.startTime = System.currentTimeMillis();
             this.actuallyCompact = Sets.difference(transaction.originals(), fullyExpiredSSTables);
             this.progress = new Progress();
@@ -287,13 +246,13 @@ private CompactionOperation(CompactionController controller)
             {
                 // resources that need closing, must be created last in case of exceptions and released if there is an exception in the c.tor
                 this.sstableRefs = Refs.ref(actuallyCompact);
-                this.scanners = strategyManager.getScanners(actuallyCompact);
+                this.scanners = strategy.getScanners(actuallyCompact);
                 this.compactionIterator = new CompactionIterator(compactionType, scanners.scanners, controller, FBUtilities.nowInSeconds(), taskId);
                 this.op = compactionIterator.getOperation();
                 this.writer = getCompactionAwareWriter(cfs, dirs, transaction, actuallyCompact);
                 this.obsCloseable = opObserver.onOperationStart(op);
 
-                compObserver.setInProgress(progress);
+                compObservers.forEach(obs -> obs.onInProgress(progress));
             }
             catch (Throwable t)
             {
@@ -325,14 +284,14 @@ private void execute0()
                 debugLogCompactingMessage(taskId);
             }
 
-            long lastCheckObsoletion = start;
+            long lastCheckObsoletion = startNanos;
             double compressionRatio = scanners.getCompressionRatio();
             if (compressionRatio == MetadataCollector.NO_COMPRESSION_RATIO)
                 compressionRatio = 1.0;
 
             long lastBytesScanned = 0;
 
-            if (!controller.cfs.getCompactionStrategyManager().isActive())
+            if (!controller.cfs.getCompactionStrategyContainer().isActive())
                 throw new CompactionInterruptedException(op.getProgress());
 
             estimatedKeys = writer.estimatedKeys();
@@ -403,19 +362,25 @@ else if (completed)
             {
                 // This code used to execute only if the compaction was successful so we preserve the existing behavior
                 updateCompactionHistory(taskId, cfs.keyspace.getName(), cfs.getTableName(), progress);
+                CompactionManager.instance.incrementRemovedExpiredSSTables(fullyExpiredSSTables.size());
+                if (transaction.originals().size() > 0 && actuallyCompact.size() == 0)
+                    // this CompactionOperation only deleted fully expired SSTables without compacting anything
+                    CompactionManager.instance.incrementDeleteOnlyCompactions();
 
                 if (logger.isDebugEnabled())
                 {
-                    debugLogCompactionSummaryInfo(taskId, start, totalKeysWritten, newSStables, progress);
+                    debugLogCompactionSummaryInfo(taskId, System.nanoTime() - startNanos, totalKeysWritten, newSStables, progress);
                 }
                 if (logger.isTraceEnabled())
                 {
                     traceLogCompactionSummaryInfo(totalKeysWritten, estimatedKeys, progress);
                 }
-                cfs.getCompactionLogger().compaction(startTime, transaction.originals(),  System.currentTimeMillis(), newSStables);
+                strategy.getCompactionLogger().compaction(startTime, transaction.originals(),  System.currentTimeMillis(), newSStables);
 
                 // update the metrics
-                cfs.metric.compactionBytesWritten.inc(progress.outputDiskSize());
+                cfs.metric.incBytesCompacted(progress.adjustedInputDiskSize(),
+                                             progress.outputDiskSize(),
+                                             System.nanoTime() - startNanos);
             }
 
             Throwables.maybeFail(err);
@@ -499,9 +464,9 @@ public Set<SSTableReader> sstables()
 
             @Override
             @Nullable
-            public AbstractCompactionStrategy strategy()
+            public CompactionStrategy strategy()
             {
-                return strategy;
+                return CompactionTask.this.strategy;
             }
 
             @Override
@@ -569,7 +534,7 @@ public long uncompressedBytesWritten()
             @Override
             public long durationInNanos()
             {
-                return System.nanoTime() - start;
+                return System.nanoTime() - startNanos;
             }
 
             @Override
@@ -682,7 +647,7 @@ protected void buildCompactionCandidatesForAvailableDiskSpace(final Set<SSTableR
         }
 
         final Set<SSTableReader> nonExpiredSSTables = Sets.difference(transaction.originals(), fullyExpiredSSTables);
-        CompactionStrategyManager strategy = cfs.getCompactionStrategyManager();
+        CompactionStrategy strategy = cfs.getCompactionStrategy();
         int sstablesRemoved = 0;
 
         while(!nonExpiredSSTables.isEmpty())
@@ -753,13 +718,12 @@ public static long getMaxDataAge(Collection<SSTableReader> sstables)
     }
 
     private void debugLogCompactionSummaryInfo(UUID taskId,
-                                               long start,
+                                               long durationInNano,
                                                long totalKeysWritten,
                                                Collection<SSTableReader> newSStables,
                                                CompactionProgress progress)
     {
         // log a bunch of statistics about the result and save to system table compaction_history
-        long durationInNano = System.nanoTime() - start;
         long dTime = TimeUnit.NANOSECONDS.toMillis(durationInNano);
 
         long totalMergedPartitions = 0;
diff --git a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
index 5dc4b54c02ce..dcfc657e0f37 100644
--- a/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/DateTieredCompactionStrategy.java
@@ -17,20 +17,27 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.util.*;
-import java.util.concurrent.TimeUnit;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Predicate;
-import com.google.common.collect.*;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+import com.google.common.collect.PeekingIterator;
+import com.google.common.collect.Sets;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.node.JsonNodeFactory;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
@@ -43,24 +50,24 @@
  * @deprecated in favour of {@link TimeWindowCompactionStrategy}
  */
 @Deprecated
-public class DateTieredCompactionStrategy extends AbstractCompactionStrategy.WithSSTableList
+public class DateTieredCompactionStrategy extends LegacyAbstractCompactionStrategy.WithSSTableList
 {
     private static final Logger logger = LoggerFactory.getLogger(DateTieredCompactionStrategy.class);
 
-    private final DateTieredCompactionStrategyOptions options;
+    private final DateTieredCompactionStrategyOptions dtOptions;
     protected volatile int estimatedRemainingTasks;
     private final Set<SSTableReader> sstables = new HashSet<>();
     private long lastExpiredCheck;
     private final SizeTieredCompactionStrategyOptions stcsOptions;
 
-    public DateTieredCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    public DateTieredCompactionStrategy(CompactionStrategyFactory factory, Map<String, String> options)
     {
-        super(cfs, options);
+        super(factory, options);
         this.estimatedRemainingTasks = 0;
-        this.options = new DateTieredCompactionStrategyOptions(options);
-        if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
+        this.dtOptions = new DateTieredCompactionStrategyOptions(options);
+        if (!options.containsKey(CompactionStrategyOptions.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(CompactionStrategyOptions.TOMBSTONE_THRESHOLD_OPTION))
         {
-            disableTombstoneCompactions = true;
+            super.options.setDisableTombstoneCompactions(true);
             logger.trace("Disabling tombstone compactions for DTCS");
         }
         else
@@ -87,13 +94,13 @@ protected synchronized List<SSTableReader> getNextBackgroundSSTables(final int g
 
         Set<SSTableReader> expired = Collections.emptySet();
         // we only check for expired sstables every 10 minutes (by default) due to it being an expensive operation
-        if (System.currentTimeMillis() - lastExpiredCheck > options.expiredSSTableCheckFrequency)
+        if (System.currentTimeMillis() - lastExpiredCheck > dtOptions.expiredSSTableCheckFrequency)
         {
             // Find fully expired SSTables. Those will be included no matter what.
             expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, cfs.getOverlappingLiveSSTables(uncompacting), gcBefore);
             lastExpiredCheck = System.currentTimeMillis();
         }
-        Set<SSTableReader> candidates = Sets.newHashSet(filterSuspectSSTables(uncompacting));
+        Set<SSTableReader> candidates = Sets.newHashSet(Iterables.filter(uncompacting, sstable -> !sstable.isMarkedSuspect()));
 
         List<SSTableReader> compactionCandidates = new ArrayList<>(getNextNonExpiredSSTables(Sets.difference(candidates, expired), gcBefore));
         if (!expired.isEmpty())
@@ -130,17 +137,17 @@ private List<SSTableReader> getNextNonExpiredSSTables(Iterable<SSTableReader> no
 
     private List<SSTableReader> getCompactionCandidates(Iterable<SSTableReader> candidateSSTables, long now, int base)
     {
-        Iterable<SSTableReader> candidates = filterOldSSTables(Lists.newArrayList(candidateSSTables), options.maxSSTableAge, now);
+        Iterable<SSTableReader> candidates = filterOldSSTables(Lists.newArrayList(candidateSSTables), dtOptions.maxSSTableAge, now);
 
-        List<List<SSTableReader>> buckets = getBuckets(createSSTableAndMinTimestampPairs(candidates), options.baseTime, base, now, options.maxWindowSize);
+        List<List<SSTableReader>> buckets = getBuckets(createSSTableAndMinTimestampPairs(candidates), dtOptions.baseTime, base, now, dtOptions.maxWindowSize);
         logger.debug("Compaction buckets are {}", buckets);
         updateEstimatedCompactionsByTasks(buckets);
         List<SSTableReader> mostInteresting = newestBucket(buckets,
                                                            cfs.getMinimumCompactionThreshold(),
                                                            cfs.getMaximumCompactionThreshold(),
                                                            now,
-                                                           options.baseTime,
-                                                           options.maxWindowSize,
+                                                           dtOptions.baseTime,
+                                                           dtOptions.maxWindowSize,
                                                            stcsOptions);
         if (!mostInteresting.isEmpty())
             return mostInteresting;
@@ -229,7 +236,7 @@ void removeDeadSSTables()
     }
 
     @Override
-    protected Set<SSTableReader> getSSTables()
+    public Set<SSTableReader> getSSTables()
     {
         synchronized (sstables)
         {
@@ -366,7 +373,7 @@ private void updateEstimatedCompactionsByTasks(List<List<SSTableReader>> tasks)
                     n += Math.ceil((double)stcsBucket.size() / cfs.getMaximumCompactionThreshold());
         }
         estimatedRemainingTasks = n;
-        cfs.getCompactionStrategyManager().compactionLogger.pending(this, n);
+        getCompactionLogger().pending(this, n);
     }
 
 
@@ -444,7 +451,7 @@ public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Coll
 
     public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
     {
-        Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
+        Map<String, String> uncheckedOptions = CompactionStrategyOptions.validateOptions(options);
         uncheckedOptions = DateTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 
         uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
@@ -455,33 +462,6 @@ public static Map<String, String> validateOptions(Map<String, String> options) t
         return uncheckedOptions;
     }
 
-    public CompactionLogger.Strategy strategyLogger()
-    {
-        return new CompactionLogger.Strategy()
-        {
-            public JsonNode sstable(SSTableReader sstable)
-            {
-                ObjectNode node = JsonNodeFactory.instance.objectNode();
-                node.put("min_timestamp", sstable.getMinTimestamp());
-                node.put("max_timestamp", sstable.getMaxTimestamp());
-                return node;
-            }
-
-            public JsonNode options()
-            {
-                ObjectNode node = JsonNodeFactory.instance.objectNode();
-                TimeUnit resolution = DateTieredCompactionStrategy.this.options.timestampResolution;
-                node.put(DateTieredCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY,
-                         resolution.toString());
-                node.put(DateTieredCompactionStrategyOptions.BASE_TIME_KEY,
-                         resolution.toSeconds(DateTieredCompactionStrategy.this.options.baseTime));
-                node.put(DateTieredCompactionStrategyOptions.MAX_WINDOW_SIZE_KEY,
-                         resolution.toSeconds(DateTieredCompactionStrategy.this.options.maxWindowSize));
-                return node;
-            }
-        };
-    }
-
     public String toString()
     {
         return String.format("DateTieredCompactionStrategy[%s/%s]",
diff --git a/src/java/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategy.java
new file mode 100644
index 000000000000..2c307a5e71d5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategy.java
@@ -0,0 +1,344 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.Component;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+
+/**
+ * Pluggable compaction strategy determines how SSTables get merged.
+ *
+ * There are two main goals:
+ *  - perform background compaction constantly as needed; this typically makes a tradeoff between
+ *    i/o done by compaction, and merging done at read time.
+ *  - perform a full (maximum possible) compaction if requested by the user
+ */
+abstract class LegacyAbstractCompactionStrategy extends AbstractCompactionStrategy
+{
+    protected LegacyAbstractCompactionStrategy(CompactionStrategyFactory factory, Map<String, String> options)
+    {
+        super(factory, new BackgroundCompactions(factory.getCfs()), options);
+        assert factory != null;
+    }
+
+    /**
+     * Helper base class for strategies that provide CompactionAggregates, implementing the typical
+     * getNextBackgroundTasks logic based on a getNextBackgroundAggregate method.
+     */
+    protected static abstract class WithAggregates extends LegacyAbstractCompactionStrategy
+    {
+        protected WithAggregates(CompactionStrategyFactory factory, Map<String, String> options)
+        {
+            super(factory, options);
+        }
+
+        @Override
+        @SuppressWarnings("resource")
+        public Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
+        {
+            CompactionPick previous = null; // last pick for which tryModify returned null, if any
+            while (true)
+            {
+                CompactionAggregate compaction = getNextBackgroundAggregate(gcBefore);
+                if (compaction == null || compaction.isEmpty())
+                    return ImmutableList.of();
+
+                // Already tried acquiring references without success. It means there is a race with
+                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
+                if (compaction.getSelected().equals(previous))
+                {
+                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se, " +
+                                "unless it happens frequently, in which case it must be reported. Will retry later.",
+                                compaction.getSelected());
+                    return ImmutableList.of();
+                }
+
+                LifecycleTransaction transaction = dataTracker.tryModify(compaction.getSelected().sstables, OperationType.COMPACTION);
+                if (transaction != null)
+                {
+                    backgroundCompactions.setSubmitted(this, transaction.opId(), compaction);
+                    return ImmutableList.of(createCompactionTask(gcBefore, transaction, compaction));
+                }
+
+                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
+                // no longer present (due to races in getting the notification), or because we still haven't
+                // received any replace notifications. Remove any non-live sstables we track and try again.
+                removeDeadSSTables();
+
+                previous = compaction.getSelected(); // selecting the same pick again ends the loop above
+            }
+        }
+
+        /**
+         * Select the next compaction to perform. This method is typically synchronized.
+         */
+        protected abstract CompactionAggregate getNextBackgroundAggregate(int gcBefore);
+
+        protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, CompactionAggregate compaction)
+        {
+            return new CompactionTask(cfs, txn, gcBefore, false, this);
+        }
+
+        /**
+         * Get the estimated remaining compactions. Strategies that implement {@link WithAggregates} can delegate this
+         * to {@link BackgroundCompactions} because they set the pending aggregates as background compactions but legacy
+         * strategies that do not support aggregates must implement this method.
+         * <p/>
+         * @return the number of background tasks estimated to still be needed for this strategy
+         */
+        @Override
+        public int getEstimatedRemainingTasks()
+        {
+            return backgroundCompactions.getEstimatedRemainingTasks();
+        }
+    }
+
+    /**
+     * Helper base class for (older, deprecated) strategies that provide a list of sstables to compact, implementing the
+     * typical getNextBackgroundTasks logic based on a getNextBackgroundSSTables method.
+     */
+    protected static abstract class WithSSTableList extends LegacyAbstractCompactionStrategy
+    {
+        protected WithSSTableList(CompactionStrategyFactory factory, Map<String, String> options)
+        {
+            super(factory, options);
+        }
+
+        @Override
+        @SuppressWarnings("resource")
+        public Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
+        {
+            List<SSTableReader> previousCandidate = null; // last bucket for which tryModify returned null, if any
+            while (true)
+            {
+                List<SSTableReader> latestBucket = getNextBackgroundSSTables(gcBefore);
+
+                if (latestBucket.isEmpty())
+                    return ImmutableList.of();
+
+                // Already tried acquiring references without success. It means there is a race with
+                // the tracker but candidate SSTables were not yet replaced in the compaction strategy manager
+                if (latestBucket.equals(previousCandidate))
+                {
+                    logger.warn("Could not acquire references for compacting SSTables {} which is not a problem per se, " +
+                                "unless it happens frequently, in which case it must be reported. Will retry later.",
+                                latestBucket);
+                    return ImmutableList.of();
+                }
+
+                LifecycleTransaction modifier = dataTracker.tryModify(latestBucket, OperationType.COMPACTION);
+                if (modifier != null)
+                    return ImmutableList.of(createCompactionTask(gcBefore, modifier, false, false));
+
+                // Getting references to the sstables failed. This may be because we tried to compact sstables that are
+                // no longer present (due to races in getting the notification), or because we still haven't
+                // received any replace notifications. Remove any non-live sstables we track and try again.
+                removeDeadSSTables();
+
+                previousCandidate = latestBucket; // selecting the same bucket again ends the loop above
+            }
+        }
+
+        /**
+         * Select the next tables to compact. This method is typically synchronized.
+         */
+        protected abstract List<SSTableReader> getNextBackgroundSSTables(final int gcBefore);
+    }
+
+    /**
+     * Replaces sstables in the compaction strategy
+     *
+     * Note that implementations must be able to handle duplicate notifications here (that removed are already gone and
+     * added have already been added)
+     * */
+    public abstract void replaceSSTables(Collection<SSTableReader> removed, Collection<SSTableReader> added);
+
+    /**
+     * Adds sstable, note that implementations must handle duplicate notifications here (added already being in the compaction strategy)
+     */
+    abstract void addSSTable(SSTableReader added);
+
+    /**
+     * Adds sstables, note that implementations must handle duplicate notifications here (added already being in the compaction strategy)
+     */
+    public synchronized void addSSTables(Iterable<SSTableReader> added)
+    {
+        for (SSTableReader sstable : added)
+            addSSTable(sstable);
+    }
+
+    /**
+     * Removes sstable from the strategy, implementations must be able to handle the sstable having already been removed.
+     */
+    abstract void removeSSTable(SSTableReader sstable);
+
+    /**
+     * Removes sstables from the strategy, implementations must be able to handle the sstables having already been removed.
+     */
+    public void removeSSTables(Iterable<SSTableReader> removed)
+    {
+        for (SSTableReader sstable : removed)
+            removeSSTable(sstable);
+    }
+
+    /**
+     * Remove any tracked sstable that is no longer in the live set. Note that because we get notifications after the
+     * tracker is modified, anything we know of must be already in the live set. If it is not, it has been removed
+     * from there, and we either haven't received the removal notification yet, or we did and we messed it up (i.e.
+     * we got it before the addition). The former is transient, but the latter can cause persistent problems, including
+     * fully stopping compaction. In any case, we should remove any such sstables.
+     * There is a special-case implementation of this in LeveledManifest.
+     */
+    abstract void removeDeadSSTables();
+
+    void removeDeadSSTables(Iterable<SSTableReader> sstables)
+    {
+        synchronized (sstables)
+        {
+            int removed = 0;
+            Set<SSTableReader> liveSet = cfs.getLiveSSTables();
+            for (Iterator<SSTableReader> it = sstables.iterator(); it.hasNext(); )
+            {
+                SSTableReader sstable = it.next();
+                if (!liveSet.contains(sstable))
+                {
+                    it.remove();
+                    ++removed;
+                }
+            }
+
+            if (removed > 0)
+                logger.debug("Removed {} dead sstables from the compactions tracked list.", removed);
+        }
+    }
+
+    public synchronized CompactionTasks getMaximalTasks(int gcBefore, boolean splitOutput)
+    {
+        removeDeadSSTables();
+        return super.getMaximalTasks(gcBefore, splitOutput);
+    }
+
+    /**
+     * Called when the metadata has changed for an sstable - for example if the level changed
+     *
+     * Not called when repair status changes (which is also metadata), because this results in the
+     * sstable getting removed from the compaction strategy instance.
+     *
+     * This is only needed by the LCS manifest from what I could see.
+     */
+    void metadataChanged(StatsMetadata oldMetadata, SSTableReader sstable)
+    {
+    }
+
+    /**
+     * Select a table for tombstone-removing compaction from the given set. Returns null if no table is suitable.
+     */
+    @Nullable
+    CompactionAggregate makeTombstoneCompaction(int gcBefore,
+                                                Iterable<SSTableReader> candidates,
+                                                Function<Collection<SSTableReader>, SSTableReader> selector)
+    {
+        List<SSTableReader> sstablesWithTombstones = new ArrayList<>();
+        for (SSTableReader sstable : candidates)
+        {
+            if (worthDroppingTombstones(sstable, gcBefore))
+                sstablesWithTombstones.add(sstable);
+        }
+        if (sstablesWithTombstones.isEmpty())
+            return null;
+
+        final SSTableReader sstable = selector.apply(sstablesWithTombstones);
+        return CompactionAggregate.createForTombstones(sstable);
+    }
+
+    /**
+     * Check if given sstable is worth dropping tombstones at gcBefore.
+     * The check is skipped, and false is returned, if tombstone_compaction_interval has not yet elapsed since the sstable was created.
+     *
+     * @param sstable SSTable to check
+     * @param gcBefore time to drop tombstones
+     * @return true if given sstable's tombstones are expected to be removed
+     */
+    protected boolean worthDroppingTombstones(SSTableReader sstable, int gcBefore)
+    {
+        if (options.isDisableTombstoneCompactions() || CompactionController.NEVER_PURGE_TOMBSTONES || cfs.getNeverPurgeTombstones())
+            return false;
+        // Since we rely on estimates, there is a chance that the compaction will not actually drop any tombstones.
+        // If that happens we would end up in an infinite compaction loop, so first we check whether enough time
+        // has elapsed since the SSTable was created.
+        if (System.currentTimeMillis() < sstable.getCreationTimeFor(Component.DATA) + options.getTombstoneCompactionInterval() * 1000)
+            return false;
+
+        double droppableRatio = sstable.getEstimatedDroppableTombstoneRatio(gcBefore);
+        if (droppableRatio <= options.getTombstoneThreshold())
+            return false;
+
+        //sstable range overlap check is disabled. See CASSANDRA-6563.
+        if (options.isUncheckedTombstoneCompaction())
+            return true;
+
+        Collection<SSTableReader> overlaps = cfs.getOverlappingLiveSSTables(Collections.singleton(sstable));
+        if (overlaps.isEmpty())
+        {
+            // there is no overlap, tombstones are safely droppable
+            return true;
+        }
+        else if (CompactionController.getFullyExpiredSSTables(cfs, Collections.singleton(sstable), overlaps, gcBefore).size() > 0)
+        {
+            return true;
+        }
+        else
+        {
+            // what percentage of columns do we expect to compact outside of overlap?
+            if (sstable.getIndexSummarySize() < 2)
+            {
+                // we have too few samples to estimate correct percentage
+                return false;
+            }
+            // first, calculate estimated keys that do not overlap
+            long keys = sstable.estimatedKeys();
+            Set<Range<Token>> ranges = new HashSet<Range<Token>>(overlaps.size());
+            for (SSTableReader overlap : overlaps)
+                ranges.add(new Range<>(overlap.first.getToken(), overlap.last.getToken()));
+            long remainingKeys = keys - sstable.estimatedKeysForRanges(ranges);
+            // next, calculate what percentage of columns we have within those keys
+            long columns = sstable.getEstimatedCellPerPartitionCount().mean() * remainingKeys;
+            double remainingColumnsRatio = ((double) columns) / (sstable.getEstimatedCellPerPartitionCount().count() * sstable.getEstimatedCellPerPartitionCount().mean());
+
+            // return if we still expect to have droppable tombstones in rest of columns
+            return remainingColumnsRatio * droppableRatio > options.getTombstoneThreshold();
+        }
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
index 6f6cdad0e3eb..15fd1b725470 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStatistics.java
@@ -28,7 +28,7 @@
 import com.google.common.collect.Iterables;
 
 /**
- * The statistics for levelled compaction.
+ * The statistics for leveled compaction.
  * <p/>
  * Implements serializable to allow structured info to be returned via JMX.
  */
@@ -36,10 +36,9 @@ public class LeveledCompactionStatistics extends CompactionAggregateStatistics
 {
     private static final Collection<String> HEADER = ImmutableList.copyOf(Iterables.concat(ImmutableList.of("Level", "Score"),
                                                                                            CompactionAggregateStatistics.HEADER,
-                                                                                           ImmutableList.of("Tot/Read/Written",
-                                                                                                                            "Read: Tot/Prev/Next",
-                                                                                                                            "Written: Tot/New",
-                                                                                                                            "WA (tot_written/read_prev)")));
+                                                                                           ImmutableList.of("Read: Tot/Prev/Next",
+                                                                                                            "Written: Tot/New",
+                                                                                                            "WA (tot_written/read_prev)")));
 
     private static final long serialVersionUID = 3695927592357744816L;
 
@@ -49,18 +48,18 @@ public class LeveledCompactionStatistics extends CompactionAggregateStatistics
     /** The score of this level */
     private final double score;
 
-    /** Total bytes of the sstables selected for compaction */
-    private final long tot;
-
-    /** Total bytes read during compaction between levels N and N+1. This includes bytes read from this level (N) and from the next level (N+1) */
-    private final long totRead;
+    /**
+     * How many more compactions this level is expected to perform. This is required because for LCS we cannot
+     * easily identify candidate sstables to put into the pending picks.
+     */
+    private final int pendingCompactions;
 
-    /** Bytes read from the current level (N) during compaction between levels N and N+1 */
+    /**
+     * Bytes read from the current level (N) during compaction between levels N and N+1. Note that {@link #readBytes}
+     * includes bytes read from both the current level (N) and the target level (N+1).
+     */
     private final long readLevel;
 
-    /** Total bytes written during compaction between levels N and N+1 */
-    private final long totWritten;
-
     /**
      * Additional RocksDB metrics we may want to consider:
      * Moved(GB): Bytes moved to level N+1 during compaction. In this case there is no IO other than updating the manifest to indicate that a file which used to be in level X is now in level Y
@@ -80,29 +79,25 @@ public class LeveledCompactionStatistics extends CompactionAggregateStatistics
      * KeyDrop: number of records dropped (not written out) during compaction
      */
 
-    public LeveledCompactionStatistics(int level,
+    public LeveledCompactionStatistics(CompactionAggregateStatistics base,
+                                       int level,
                                        double score,
-                                       int numCompactions,
-                                       int numCompactionsInProgress,
-                                       int numSSTables,
-                                       int numCandidateSSTables,
-                                       int numCompactingSSTables,
-                                       long sizeInBytes,
-                                       double readThroughput,
-                                       double writeThroughput,
-                                       long tot,
-                                       long totRead,
-                                       long readLevel,
-                                       long totWritten)
+                                       int pendingCompactions,
+                                       long readLevel)
     {
-        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput);
-
+        super(base);
         this.level = level;
         this.score = score;
-        this.tot = tot;
-        this.totRead = totRead;
+        this.pendingCompactions = pendingCompactions;
         this.readLevel = readLevel;
-        this.totWritten = totWritten;
+    }
+
+    /** The number of compactions that are either pending or in progress */
+    @Override
+    @JsonProperty
+    public int numCompactions()
+    {
+        return numCompactions + pendingCompactions;
     }
 
     /** The current level */
@@ -120,22 +115,10 @@ public double score()
         return score;
     }
 
-    /** Total bytes of the sstables selected for compaction */
-    @JsonProperty
-    public long tot()
-    {
-        return tot;
-    }
-
-    /** Total uncompressed bytes read during compaction between this level and the next. This includes bytes read from this level (N) and from the next level (N+1) */
-    @JsonProperty
-    public long read()
-    {
-        return totRead;
-    }
-
-    /** Uncompressed bytes read from the previous level (N) during compaction between levels N and N+1*/
-    @JsonProperty
+    /**
+     * Bytes read from the current level (N) during compaction between levels N and N+1; {@link #read()} covers both N and N+1.
+     */
+    @JsonProperty
     public long readLevel()
     {
         return readLevel;
@@ -145,28 +128,21 @@ public long readLevel()
     @JsonProperty
     public long readNext()
     {
-        return totRead - readLevel;
-    }
-
-    /** Uncompressed  bytes written during compaction between levels N and N+1 */
-    @JsonProperty
-    public long written()
-    {
-        return totWritten;
+        return readBytes - readLevel;
     }
 
     /** Uncompressed  bytes written to level N+1, calculated as total bytes written - bytes read from N+1 */
     @JsonProperty
     public long writtenNew()
     {
-        return totWritten - readNext();
+        return writtenBytes - readNext();
     }
 
     /** W-Amp: total bytes written divided by the bytes read from level N. */
     @JsonProperty
     public double writeAmpl()
     {
-        return readLevel() > 0 ? (double)totWritten / readLevel() : Double.NaN;
+        return readLevel() > 0 ? (double) writtenBytes / readLevel() : Double.NaN;
     }
 
     @Override
@@ -184,10 +160,10 @@ protected Collection<String> data()
 
         data.addAll(super.data());
 
-        data.add(toString(tot()) + '/' + toString(read()) + '/' + toString(written()));
         data.add(toString(read()) + '/' + toString(readLevel()) + '/' + toString(readNext()));
         data.add(toString(written()) + '/' + toString(writtenNew()));
         data.add(String.format("%.3f", writeAmpl()));
+
         return data;
     }
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
index 13765fc7f200..1c6339353f25 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionStrategy.java
@@ -17,34 +17,43 @@
  */
 package org.apache.cassandra.db.compaction;
 
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
 import java.util.function.Function;
 
-
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.collect.*;
+import com.google.common.collect.AbstractIterator;
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multimap;
 import com.google.common.primitives.Doubles;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.node.JsonNodeFactory;
-import com.fasterxml.jackson.databind.node.ObjectNode;
-import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
-import org.apache.cassandra.schema.CompactionParams;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.config.Config;
-import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.StatsMetadata;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.TableMetadata;
 
-public class LeveledCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
+public class LeveledCompactionStrategy extends LegacyAbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(LeveledCompactionStrategy.class);
     static final String SSTABLE_SIZE_OPTION = "sstable_size_in_mb";
@@ -59,9 +68,9 @@ public class LeveledCompactionStrategy extends AbstractCompactionStrategy.WithAg
     private final int levelFanoutSize;
     private final boolean singleSSTableUplevel;
 
-    public LeveledCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    public LeveledCompactionStrategy(CompactionStrategyFactory factory, Map<String, String> options)
     {
-        super(cfs, options);
+        super(factory, options);
         int configuredMaxSSTableSize = 160;
         int configuredLevelFanoutSize = DEFAULT_LEVEL_FANOUT_SIZE;
         boolean configuredSingleSSTableUplevel = false;
@@ -75,10 +84,10 @@ public LeveledCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> opti
                 {
                     if (configuredMaxSSTableSize >= 1000)
                         logger.warn("Max sstable size of {}MB is configured for {}.{}; having a unit of compaction this large is probably a bad idea",
-                                configuredMaxSSTableSize, cfs.name, cfs.getTableName());
+                                configuredMaxSSTableSize, cfs.getKeyspaceName(), cfs.getTableName());
                     if (configuredMaxSSTableSize < 50)
                         logger.warn("Max sstable size of {}MB is configured for {}.{}.  Testing done for CASSANDRA-5727 indicates that performance improves up to 160MB",
-                                configuredMaxSSTableSize, cfs.name, cfs.getTableName());
+                                configuredMaxSSTableSize, cfs.getKeyspaceName(), cfs.getTableName());
                 }
             }
 
@@ -100,7 +109,7 @@ public LeveledCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> opti
         logger.trace("Created {}", manifest);
     }
 
-    public int getLevelSize(int i)
+    int getLevelSize(int i)
     {
         return manifest.getLevelSize(i);
     }
@@ -110,6 +119,12 @@ public int[] getAllLevelSize()
         return manifest.getAllLevelSize();
     }
 
+    @Override
+    public int[] getSSTableCountPerLevel()
+    {
+        return manifest.getSSTableCountPerLevel();
+    }
+
     @Override
     public void startup()
     {
@@ -121,7 +136,7 @@ public void startup()
     protected CompactionAggregate getNextBackgroundAggregate(int gcBefore)
     {
         CompactionAggregate.Leveled candidate = manifest.getCompactionCandidate();
-        backgroundCompactions.setPending(manifest.getEstimatedTasks(candidate));
+        backgroundCompactions.setPending(this, manifest.getEstimatedTasks(candidate));
 
         if (candidate != null)
             return candidate;
@@ -153,9 +168,9 @@ protected AbstractCompactionTask createCompactionTask(final int gcBefore, Lifecy
 
         AbstractCompactionTask newTask;
         if (!singleSSTableUplevel || op == OperationType.TOMBSTONE_COMPACTION || txn.originals().size() > 1)
-            newTask = LeveledCompactionTask.forCompaction(this, txn, nextLevel, gcBefore, maxxSSTableBytes, false);
+            newTask = new LeveledCompactionTask(this, txn, nextLevel, gcBefore, maxxSSTableBytes, false);
         else
-            newTask = SingleSSTableLCSTask.forCompaction(this, txn, nextLevel);
+            newTask = new SingleSSTableLCSTask(this, txn, nextLevel);
 
         newTask.setCompactionType(op);
         return newTask;
@@ -168,7 +183,7 @@ protected AbstractCompactionTask createCompactionTask(final int gcBefore, Lifecy
         Collection<SSTableReader> sstables = txn.originals();
         int level = sstables.size() > 1 ? 0 : sstables.iterator().next().getSSTableLevel();
         long maxSSTableBytes = (level == 0 && !isMaximal) ? Long.MAX_VALUE : getMaxSSTableBytes();
-        return LeveledCompactionTask.forCompaction(this, txn, level, gcBefore, maxSSTableBytes, isMaximal);
+        return new LeveledCompactionTask(this, txn, level, gcBefore, maxSSTableBytes, isMaximal);
     }
 
     @Override
@@ -184,7 +199,7 @@ public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, int
             if (level != sstable.getSSTableLevel())
                 level = 0;
         }
-        return LeveledCompactionTask.forCompaction(this, txn, level, gcBefore, maxSSTableBytes, false);
+        return new LeveledCompactionTask(this, txn, level, gcBefore, maxSSTableBytes, false);
     }
 
     /**
@@ -336,7 +351,7 @@ public void removeSSTable(SSTableReader sstable)
     }
 
     @Override
-    protected Set<SSTableReader> getSSTables()
+    public Set<SSTableReader> getSSTables()
     {
         return manifest.getSSTables();
     }
@@ -478,7 +493,7 @@ public int level()
     @Override
     public String toString()
     {
-        return String.format("LCS@%d(%s)", hashCode(), cfs.name);
+        return String.format("LCS@%d(%s)", hashCode(), cfs.getTableName());
     }
 
     private CompactionAggregate findDroppableSSTable(final int gcBefore)
@@ -489,7 +504,7 @@ private CompactionAggregate findDroppableSSTable(final int gcBefore)
             return -1 * Doubles.compare(r1, r2);
         };
         Function<Collection<SSTableReader>, SSTableReader> selector = list -> Collections.max(list, comparator);
-        Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        Set<SSTableReader> compacting = dataTracker.getCompacting();
 
         for (int i = manifest.getLevelCount(); i >= 0; i--)
         {
@@ -502,29 +517,9 @@ private CompactionAggregate findDroppableSSTable(final int gcBefore)
         return null;
     }
 
-    public CompactionLogger.Strategy strategyLogger()
-    {
-        return new CompactionLogger.Strategy()
-        {
-            public JsonNode sstable(SSTableReader sstable)
-            {
-                ObjectNode node = JsonNodeFactory.instance.objectNode();
-                node.put("level", sstable.getSSTableLevel());
-                node.put("min_token", sstable.first.getToken().toString());
-                node.put("max_token", sstable.last.getToken().toString());
-                return node;
-            }
-
-            public JsonNode options()
-            {
-                return null;
-            }
-        };
-    }
-
     public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
     {
-        Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
+        Map<String, String> uncheckedOptions = CompactionStrategyOptions.validateOptions(options);
 
         String size = options.containsKey(SSTABLE_SIZE_OPTION) ? options.get(SSTABLE_SIZE_OPTION) : "1";
         try
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
index c4ee57fa3212..cdf72f5847ba 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledCompactionTask.java
@@ -35,17 +35,12 @@ public class LeveledCompactionTask extends CompactionTask
 
     public LeveledCompactionTask(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
     {
-        super(strategy, txn, gcBefore, false);
+        super(strategy.cfs, txn, gcBefore, false, strategy);
         this.level = level;
         this.maxSSTableBytes = maxSSTableBytes;
         this.majorCompaction = majorCompaction;
     }
 
-    static AbstractCompactionTask forCompaction(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level, int gcBefore, long maxSSTableBytes, boolean majorCompaction)
-    {
-        return new LeveledCompactionTask(strategy, txn, level, gcBefore, maxSSTableBytes, majorCompaction);
-    }
-
     @Override
     public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
                                                           Directories directories,
diff --git a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
index 7a9f69d41896..cf4ca4c020fa 100644
--- a/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
+++ b/src/java/org/apache/cassandra/db/compaction/LeveledManifest.java
@@ -27,6 +27,7 @@
 import com.google.common.collect.Sets;
 
 import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.lifecycle.Tracker;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
@@ -66,6 +67,7 @@ public class LeveledManifest
     private static final int NO_COMPACTION_LIMIT = 25;
 
     private final ColumnFamilyStore cfs;
+    private final Tracker dataTracker;
 
     private final LeveledGenerations generations;
 
@@ -78,6 +80,7 @@ public class LeveledManifest
     LeveledManifest(ColumnFamilyStore cfs, int maxSSTableSizeInMB, int fanoutSize, SizeTieredCompactionStrategyOptions options)
     {
         this.cfs = cfs;
+        this.dataTracker = cfs.getTracker();
         this.maxSSTableSizeInBytes = maxSSTableSizeInMB * 1024L * 1024L;
         this.options = options;
         this.levelFanoutSize = fanoutSize;
@@ -190,9 +193,9 @@ private String toString(Collection<SSTableReader> sstables)
         StringBuilder builder = new StringBuilder();
         for (SSTableReader sstable : sstables)
         {
-            builder.append(sstable.descriptor.cfname)
+            builder.append(sstable.getColumnFamilyName())
                    .append('-')
-                   .append(sstable.descriptor.generation)
+                   .append(sstable.getGeneration())
                    .append("(L")
                    .append(sstable.getSSTableLevel())
                    .append("), ");
@@ -272,7 +275,7 @@ synchronized CompactionAggregate.Leveled getCompactionCandidate()
                 continue; // mostly this just avoids polluting the debug log with zero scores
             // we want to calculate score excluding compacting ones
             Set<SSTableReader> sstablesInLevel = Sets.newHashSet(sstables);
-            Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, cfs.getCompactingSSTables());
+            Set<SSTableReader> remaining = Sets.difference(sstablesInLevel, dataTracker.getCompacting());
             long remainingBytesForLevel = SSTableReader.getTotalBytes(remaining);
             long maxBytesForLevel = maxBytesForLevel(i, maxSSTableSizeInBytes);
             double score = (double) remainingBytesForLevel / (double) maxBytesForLevel;
@@ -303,7 +306,7 @@ synchronized CompactionAggregate.Leveled getCompactionCandidate()
                     candidates = getOverlappingStarvedSSTables(nextLevel, candidates);
                     if (logger.isTraceEnabled())
                         logger.trace("Compaction candidates for L{} are {}", i, toString(candidates));
-                    return CompactionAggregate.createLeveled(sstablesInLevel, candidates, pendingCompactions, maxSSTableSizeInBytes, i, nextLevel, score);
+                    return CompactionAggregate.createLeveled(sstablesInLevel, candidates, pendingCompactions, maxSSTableSizeInBytes, i, nextLevel, score, levelFanoutSize);
                 }
                 else
                 {
@@ -327,7 +330,7 @@ synchronized CompactionAggregate.Leveled getCompactionCandidate()
         }
         double l0Score = (double) SSTableReader.getTotalBytes(sstables) / (double) maxBytesForLevel(0, maxSSTableSizeInBytes);
         int l0PendingCompactions = Math.max(0, getEstimatedPendingTasks(0) - 1);
-        return CompactionAggregate.createLeveled(sstables, candidates, l0PendingCompactions, maxSSTableSizeInBytes, 0, getNextLevel(candidates), l0Score);
+        return CompactionAggregate.createLeveled(sstables, candidates, l0PendingCompactions, maxSSTableSizeInBytes, 0, getNextLevel(candidates), l0Score, levelFanoutSize);
     }
 
     private CompactionAggregate.Leveled getSTCSInL0CompactionCandidate()
@@ -353,12 +356,12 @@ private CompactionAggregate.Leveled getSTCSAggregate(CompactionPick compaction)
         int pendingTasks = remainingSSTables > cfs.getMinimumCompactionThreshold()
                            ? (int) Math.ceil(remainingSSTables / cfs.getMaximumCompactionThreshold())
                            : 0;
-        return CompactionAggregate.createLeveledForSTCS(sstables, compaction, pendingTasks, score);
+        return CompactionAggregate.createLeveledForSTCS(sstables, compaction, pendingTasks, score, levelFanoutSize);
     }
 
     private CompactionPick getSSTablesForSTCS(Collection<SSTableReader> sstables)
     {
-        Iterable<? extends SSTableReader> candidates = cfs.getNoncompactingSSTables(sstables);
+        Iterable<? extends SSTableReader> candidates = dataTracker.getNoncompacting(sstables);
 
         SizeTieredCompactionStrategy.SizeTieredBuckets sizeTieredBuckets;
         sizeTieredBuckets = new SizeTieredCompactionStrategy.SizeTieredBuckets(candidates,
@@ -440,6 +443,14 @@ public synchronized int getLevelSize(int i)
         return generations.get(i).size();
     }
 
+    public synchronized int[] getSSTableCountPerLevel()
+    {
+        int[] counts = new int[getLevelCount()];
+        for (int i = 0; i < counts.length; i++)
+            counts[i] = getLevel(i).size();
+        return counts;
+    }
+
     public synchronized int[] getAllLevelSize()
     {
         return generations.getAllLevelSize();
@@ -541,7 +552,7 @@ private Collection<SSTableReader> getCandidatesFor(int level)
         assert !generations.get(level).isEmpty();
         logger.trace("Choosing candidates for L{}", level);
 
-        final Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        final Set<SSTableReader> compacting = dataTracker.getCompacting();
 
         if (level == 0)
         {
@@ -639,7 +650,7 @@ private Set<SSTableReader> getCompactingL0()
     {
         Set<SSTableReader> sstables = new HashSet<>();
         Set<SSTableReader> levelSSTables = new HashSet<>(generations.get(0));
-        for (SSTableReader sstable : cfs.getCompactingSSTables())
+        for (SSTableReader sstable : dataTracker.getCompacting())
         {
             if (levelSSTables.contains(sstable))
                 sstables.add(sstable);
@@ -704,7 +715,7 @@ public synchronized List<CompactionAggregate> getEstimatedTasks(CompactionAggreg
 
             int pendingTasks = getEstimatedPendingTasks(i);
             double score = (double) SSTableReader.getTotalBytes(sstables) / (double) maxBytesForLevel(i, maxSSTableSizeInBytes);
-            ret.add(CompactionAggregate.createLeveled(sstables, pendingTasks, maxSSTableSizeInBytes, i, score));
+            ret.add(CompactionAggregate.createLeveled(sstables, pendingTasks, maxSSTableSizeInBytes, i, score, levelFanoutSize));
         }
 
         logger.trace("Estimating {} compactions to do for {}", ret.size(), cfs.metadata());
@@ -724,7 +735,7 @@ private int getEstimatedPendingTasks(int level)
         if (sstables.isEmpty())
             return 0;
 
-        final Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        final Set<SSTableReader> compacting = dataTracker.getCompacting();
         final Set<SSTableReader> remaining = Sets.difference(Sets.newHashSet(sstables), compacting);
 
         if (level == 0 && !DatabaseDescriptor.getDisableSTCSInL0() && remaining.size() > MAX_COMPACTING_L0)
diff --git a/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java b/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
index 000f5c48be56..07c1983f203c 100644
--- a/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
+++ b/src/java/org/apache/cassandra/db/compaction/PendingRepairHolder.java
@@ -25,12 +25,12 @@
 import java.util.UUID;
 
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.index.Index;
@@ -47,9 +47,9 @@ public class PendingRepairHolder extends AbstractStrategyHolder
     private final List<PendingRepairManager> managers = new ArrayList<>();
     private final boolean isTransient;
 
-    public PendingRepairHolder(ColumnFamilyStore cfs, DestinationRouter router, boolean isTransient)
+    public PendingRepairHolder(ColumnFamilyStore cfs, CompactionStrategyFactory strategyFactory, DestinationRouter router, boolean isTransient)
     {
-        super(cfs, router);
+        super(cfs, strategyFactory, router);
         this.isTransient = isTransient;
     }
 
@@ -70,7 +70,7 @@ public void setStrategyInternal(CompactionParams params, int numTokenPartitions)
     {
         managers.clear();
         for (int i = 0; i < numTokenPartitions; i++)
-            managers.add(new PendingRepairManager(cfs, params, isTransient));
+            managers.add(new PendingRepairManager(cfs, strategyFactory, params, isTransient));
     }
 
     @Override
@@ -82,24 +82,24 @@ public boolean managesRepairedGroup(boolean isRepaired, boolean isPendingRepair,
     }
 
     @Override
-    public AbstractCompactionStrategy getStrategyFor(SSTableReader sstable)
+    public LegacyAbstractCompactionStrategy getStrategyFor(SSTableReader sstable)
     {
         Preconditions.checkArgument(managesSSTable(sstable), "Attempting to get compaction strategy from wrong holder");
         return managers.get(router.getIndexForSSTable(sstable)).getOrCreate(sstable);
     }
 
     @Override
-    public Iterable<AbstractCompactionStrategy> allStrategies()
+    public Iterable<LegacyAbstractCompactionStrategy> allStrategies()
     {
         return Iterables.concat(Iterables.transform(managers, PendingRepairManager::getStrategies));
     }
 
-    Iterable<AbstractCompactionStrategy> getStrategiesFor(UUID session)
+    Iterable<LegacyAbstractCompactionStrategy> getStrategiesFor(UUID session)
     {
-        List<AbstractCompactionStrategy> strategies = new ArrayList<>(managers.size());
+        List<LegacyAbstractCompactionStrategy> strategies = new ArrayList<>(managers.size());
         for (PendingRepairManager manager : managers)
         {
-            AbstractCompactionStrategy strategy = manager.get(session);
+            LegacyAbstractCompactionStrategy strategy = manager.get(session);
             if (strategy != null)
                 strategies.add(strategy);
         }
@@ -112,11 +112,11 @@ public Iterable<PendingRepairManager> getManagers()
     }
 
     @Override
-    public Collection<TaskSupplier> getBackgroundTaskSuppliers(int gcBefore)
+    public Collection<TasksSupplier> getBackgroundTaskSuppliers(int gcBefore)
     {
-        List<TaskSupplier> suppliers = new ArrayList<>(managers.size());
+        List<TasksSupplier> suppliers = new ArrayList<>(managers.size());
         for (PendingRepairManager manager : managers)
-            suppliers.add(new TaskSupplier(manager.getMaxEstimatedRemainingTasks(), () -> manager.getNextBackgroundTask(gcBefore)));
+            suppliers.add(new TasksSupplier(manager.getMaxEstimatedRemainingTasks(), () -> manager.getNextBackgroundTasks(gcBefore)));
 
         return suppliers;
     }
@@ -127,9 +127,7 @@ public Collection<AbstractCompactionTask> getMaximalTasks(int gcBefore, boolean
         List<AbstractCompactionTask> tasks = new ArrayList<>(managers.size());
         for (PendingRepairManager manager : managers)
         {
-            Collection<AbstractCompactionTask> task = manager.getMaximalTasks(gcBefore, splitOutput);
-            if (task != null)
-                tasks.addAll(task);
+            tasks.addAll(manager.getMaximalTasks(gcBefore, splitOutput));
         }
         return tasks;
     }
@@ -149,31 +147,31 @@ public Collection<AbstractCompactionTask> getUserDefinedTasks(GroupedSSTableCont
         return tasks;
     }
 
-    AbstractCompactionTask getNextRepairFinishedTask()
+    Collection<AbstractCompactionTask> getNextRepairFinishedTasks()
     {
-        List<TaskSupplier> repairFinishedSuppliers = getRepairFinishedTaskSuppliers();
+        List<TasksSupplier> repairFinishedSuppliers = getRepairFinishedTaskSuppliers();
         if (!repairFinishedSuppliers.isEmpty())
         {
             Collections.sort(repairFinishedSuppliers);
-            for (TaskSupplier supplier : repairFinishedSuppliers)
+            for (TasksSupplier supplier : repairFinishedSuppliers)
             {
-                AbstractCompactionTask task = supplier.getTask();
-                if (task != null)
-                    return task;
+                Collection<AbstractCompactionTask> tasks = supplier.getTasks();
+                if (!tasks.isEmpty())
+                    return tasks;
             }
         }
-        return null;
+        return ImmutableList.of();
     }
 
-    private ArrayList<TaskSupplier> getRepairFinishedTaskSuppliers()
+    private ArrayList<TasksSupplier> getRepairFinishedTaskSuppliers()
     {
-        ArrayList<TaskSupplier> suppliers = new ArrayList<>(managers.size());
+        ArrayList<TasksSupplier> suppliers = new ArrayList<>(managers.size());
         for (PendingRepairManager manager : managers)
         {
             int numPending = manager.getNumPendingRepairFinishedTasks();
             if (numPending > 0)
             {
-                suppliers.add(new TaskSupplier(numPending, manager::getNextRepairFinishedTask));
+                suppliers.add(new TasksSupplier(numPending, manager::getNextRepairFinishedTasks));
             }
         }
 
@@ -249,7 +247,7 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
         Preconditions.checkArgument(pendingRepair != null,
                                     "PendingRepairHolder can't create sstable writer without pendingRepair id");
         // to avoid creating a compaction strategy for the wrong pending repair manager, we get the index based on where the sstable is to be written
-        AbstractCompactionStrategy strategy = managers.get(router.getIndexForSSTableDirectory(descriptor)).getOrCreate(pendingRepair);
+        CompactionStrategy strategy = managers.get(router.getIndexForSSTableDirectory(descriptor)).getOrCreate(pendingRepair);
         return strategy.createSSTableMultiWriter(descriptor,
                                                  keyCount,
                                                  repairedAt,
@@ -261,17 +259,6 @@ public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
                                                  lifecycleNewTracker);
     }
 
-    @Override
-    public int getStrategyIndex(AbstractCompactionStrategy strategy)
-    {
-        for (int i = 0; i < managers.size(); i++)
-        {
-            if (managers.get(i).hasStrategy(strategy))
-                return i;
-        }
-        return -1;
-    }
-
     public boolean hasDataForSession(UUID sessionID)
     {
         return Iterables.any(managers, prm -> prm.hasDataForSession(sessionID));
@@ -282,4 +269,9 @@ public boolean containsSSTable(SSTableReader sstable)
     {
         return Iterables.any(managers, prm -> prm.containsSSTable(sstable));
     }
+
+    public int size()
+    {
+        return managers.size();
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java b/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
index 5c40be1fc1ec..7d49a581b439 100644
--- a/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
+++ b/src/java/org/apache/cassandra/db/compaction/PendingRepairManager.java
@@ -29,30 +29,27 @@
 import java.util.UUID;
 import java.util.stream.Collectors;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Iterables;
 import com.google.common.collect.Maps;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.repair.consistent.admin.CleanupSummary;
 import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.Pair;
 
 /**
- * Companion to CompactionStrategyManager which manages the sstables marked pending repair.
+ * This class manages the sstables marked pending repair so that they can be assigned to legacy compaction
+ * strategies via the legacy strategy container or manager.
  *
  * SSTables are classified as pending repair by the anti-compaction performed at the beginning
  * of an incremental repair, or when they're streamed in with a pending repair id. This prevents
@@ -64,9 +61,10 @@ class PendingRepairManager
     private static final Logger logger = LoggerFactory.getLogger(PendingRepairManager.class);
 
     private final ColumnFamilyStore cfs;
+    private final CompactionStrategyFactory strategyFactory;
     private final CompactionParams params;
     private final boolean isTransient;
-    private volatile ImmutableMap<UUID, AbstractCompactionStrategy> strategies = ImmutableMap.of();
+    private volatile ImmutableMap<UUID, LegacyAbstractCompactionStrategy> strategies = ImmutableMap.of();
 
     /**
      * Indicates we're being asked to do something with an sstable that isn't marked pending repair
@@ -79,34 +77,35 @@ public IllegalSSTableArgumentException(String s)
         }
     }
 
-    PendingRepairManager(ColumnFamilyStore cfs, CompactionParams params, boolean isTransient)
+    PendingRepairManager(ColumnFamilyStore cfs, CompactionStrategyFactory strategyFactory, CompactionParams params, boolean isTransient)
     {
         this.cfs = cfs;
+        this.strategyFactory = strategyFactory;
         this.params = params;
         this.isTransient = isTransient;
     }
 
-    private ImmutableMap.Builder<UUID, AbstractCompactionStrategy> mapBuilder()
+    private ImmutableMap.Builder<UUID, LegacyAbstractCompactionStrategy> mapBuilder()
     {
         return ImmutableMap.builder();
     }
 
-    AbstractCompactionStrategy get(UUID id)
+    LegacyAbstractCompactionStrategy get(UUID id)
     {
         return strategies.get(id);
     }
 
-    AbstractCompactionStrategy get(SSTableReader sstable)
+    LegacyAbstractCompactionStrategy get(SSTableReader sstable)
     {
         assert sstable.isPendingRepair();
-        return get(sstable.getSSTableMetadata().pendingRepair);
+        return get(sstable.getPendingRepair());
     }
 
-    AbstractCompactionStrategy getOrCreate(UUID id)
+    LegacyAbstractCompactionStrategy getOrCreate(UUID id)
     {
         checkPendingID(id);
         assert id != null;
-        AbstractCompactionStrategy strategy = get(id);
+        LegacyAbstractCompactionStrategy strategy = get(id);
         if (strategy == null)
         {
             synchronized (this)
@@ -116,7 +115,7 @@ AbstractCompactionStrategy getOrCreate(UUID id)
                 if (strategy == null)
                 {
                     logger.debug("Creating {}.{} compaction strategy for pending repair: {}", cfs.metadata.keyspace, cfs.metadata.name, id);
-                    strategy = cfs.createCompactionStrategyInstance(params);
+                    strategy = strategyFactory.createLegacyStrategy(params);
                     strategies = mapBuilder().putAll(strategies).put(id, strategy).build();
                 }
             }
@@ -132,12 +131,12 @@ private static void checkPendingID(UUID pendingID)
         }
     }
 
-    AbstractCompactionStrategy getOrCreate(SSTableReader sstable)
+    LegacyAbstractCompactionStrategy getOrCreate(SSTableReader sstable)
     {
-        return getOrCreate(sstable.getSSTableMetadata().pendingRepair);
+        return getOrCreate(sstable.getPendingRepair());
     }
 
-    private synchronized void removeSessionIfEmpty(UUID sessionID)
+    synchronized void removeSessionIfEmpty(UUID sessionID)
     {
         if (!strategies.containsKey(sessionID) || !strategies.get(sessionID).getSSTables().isEmpty())
             return;
@@ -148,14 +147,13 @@ private synchronized void removeSessionIfEmpty(UUID sessionID)
 
     synchronized void removeSSTable(SSTableReader sstable)
     {
-        for (Map.Entry<UUID, AbstractCompactionStrategy> entry : strategies.entrySet())
+        for (Map.Entry<UUID, LegacyAbstractCompactionStrategy> entry : strategies.entrySet())
         {
             entry.getValue().removeSSTable(sstable);
             removeSessionIfEmpty(entry.getKey());
         }
     }
 
-
     void removeSSTables(Iterable<SSTableReader> removed)
     {
         for (SSTableReader sstable : removed)
@@ -183,7 +181,7 @@ synchronized void replaceSSTables(Set<SSTableReader> removed, Set<SSTableReader>
         Map<UUID, Pair<Set<SSTableReader>, Set<SSTableReader>>> groups = new HashMap<>();
         for (SSTableReader sstable : removed)
         {
-            UUID sessionID = sstable.getSSTableMetadata().pendingRepair;
+            UUID sessionID = sstable.getPendingRepair();
             if (!groups.containsKey(sessionID))
             {
                 groups.put(sessionID, Pair.create(new HashSet<>(), new HashSet<>()));
@@ -193,7 +191,7 @@ synchronized void replaceSSTables(Set<SSTableReader> removed, Set<SSTableReader>
 
         for (SSTableReader sstable : added)
         {
-            UUID sessionID = sstable.getSSTableMetadata().pendingRepair;
+            UUID sessionID = sstable.getPendingRepair();
             if (!groups.containsKey(sessionID))
             {
                 groups.put(sessionID, Pair.create(new HashSet<>(), new HashSet<>()));
@@ -203,7 +201,7 @@ synchronized void replaceSSTables(Set<SSTableReader> removed, Set<SSTableReader>
 
         for (Map.Entry<UUID, Pair<Set<SSTableReader>, Set<SSTableReader>>> entry : groups.entrySet())
         {
-            AbstractCompactionStrategy strategy = getOrCreate(entry.getKey());
+            LegacyAbstractCompactionStrategy strategy = getOrCreate(entry.getKey());
             Set<SSTableReader> groupRemoved = entry.getValue().left;
             Set<SSTableReader> groupAdded = entry.getValue().right;
 
@@ -218,15 +216,15 @@ synchronized void replaceSSTables(Set<SSTableReader> removed, Set<SSTableReader>
 
     synchronized void startup()
     {
-        strategies.values().forEach(AbstractCompactionStrategy::startup);
+        strategies.values().forEach(CompactionStrategy::startup);
     }
 
     synchronized void shutdown()
     {
-        strategies.values().forEach(AbstractCompactionStrategy::shutdown);
+        strategies.values().forEach(CompactionStrategy::shutdown);
     }
 
-    private int getEstimatedRemainingTasks(UUID sessionID, AbstractCompactionStrategy strategy)
+    private int getEstimatedRemainingTasks(UUID sessionID, CompactionStrategy strategy)
     {
         if (canCleanup(sessionID))
         {
@@ -241,7 +239,7 @@ private int getEstimatedRemainingTasks(UUID sessionID, AbstractCompactionStrateg
     int getEstimatedRemainingTasks()
     {
         int tasks = 0;
-        for (Map.Entry<UUID, AbstractCompactionStrategy> entry : strategies.entrySet())
+        for (Map.Entry<UUID, LegacyAbstractCompactionStrategy> entry : strategies.entrySet())
         {
             tasks += getEstimatedRemainingTasks(entry.getKey(), entry.getValue());
         }
@@ -254,7 +252,7 @@ int getEstimatedRemainingTasks()
     int getMaxEstimatedRemainingTasks()
     {
         int tasks = 0;
-        for (Map.Entry<UUID, AbstractCompactionStrategy> entry : strategies.entrySet())
+        for (Map.Entry<UUID, LegacyAbstractCompactionStrategy> entry : strategies.entrySet())
         {
             tasks = Math.max(tasks, getEstimatedRemainingTasks(entry.getKey(), entry.getValue()));
         }
@@ -265,63 +263,13 @@ int getMaxEstimatedRemainingTasks()
     private RepairFinishedCompactionTask getRepairFinishedCompactionTask(UUID sessionID)
     {
         Preconditions.checkState(canCleanup(sessionID));
-        AbstractCompactionStrategy compactionStrategy = get(sessionID);
+        LegacyAbstractCompactionStrategy compactionStrategy = get(sessionID);
         if (compactionStrategy == null)
             return null;
         Set<SSTableReader> sstables = compactionStrategy.getSSTables();
         long repairedAt = ActiveRepairService.instance.consistent.local.getFinalSessionRepairedAt(sessionID);
         LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
-        return txn == null ? null : new RepairFinishedCompactionTask(cfs, txn, sessionID, repairedAt);
-    }
-
-    public static class CleanupTask
-    {
-        private final ColumnFamilyStore cfs;
-        private final List<Pair<UUID, RepairFinishedCompactionTask>> tasks;
-
-        public CleanupTask(ColumnFamilyStore cfs, List<Pair<UUID, RepairFinishedCompactionTask>> tasks)
-        {
-            this.cfs = cfs;
-            this.tasks = tasks;
-        }
-
-        public CleanupSummary cleanup()
-        {
-            Set<UUID> successful = new HashSet<>();
-            Set<UUID> unsuccessful = new HashSet<>();
-            for (Pair<UUID, RepairFinishedCompactionTask> pair : tasks)
-            {
-                UUID session = pair.left;
-                RepairFinishedCompactionTask task = pair.right;
-
-                if (task != null)
-                {
-                    try
-                    {
-                        task.run();
-                        successful.add(session);
-                    }
-                    catch (Throwable t)
-                    {
-                        t = task.transaction.abort(t);
-                        logger.error("Failed cleaning up " + session, t);
-                        unsuccessful.add(session);
-                    }
-                }
-                else
-                {
-                    unsuccessful.add(session);
-                }
-            }
-            return new CleanupSummary(cfs, successful, unsuccessful);
-        }
-
-        public Throwable abort(Throwable accumulate)
-        {
-            for (Pair<UUID, RepairFinishedCompactionTask> pair : tasks)
-                accumulate = pair.right.transaction.abort(accumulate);
-            return accumulate;
-        }
+        return txn == null ? null : new RepairFinishedCompactionTask(cfs, txn, sessionID, repairedAt, isTransient);
     }
 
     public CleanupTask releaseSessionData(Collection<UUID> sessionIDs)
@@ -350,26 +298,30 @@ synchronized int getNumPendingRepairFinishedTasks()
         return count;
     }
 
-    synchronized AbstractCompactionTask getNextRepairFinishedTask()
+    synchronized Collection<AbstractCompactionTask> getNextRepairFinishedTasks()
     {
         for (UUID sessionID : strategies.keySet())
         {
             if (canCleanup(sessionID))
             {
-                return getRepairFinishedCompactionTask(sessionID);
+                RepairFinishedCompactionTask task = getRepairFinishedCompactionTask(sessionID);
+                if (task != null)
+                    return ImmutableList.of(task);
+                else
+                    return ImmutableList.of();
             }
         }
-        return null;
+        return ImmutableList.of();
     }
 
-    synchronized AbstractCompactionTask getNextBackgroundTask(int gcBefore)
+    synchronized Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
     {
         if (strategies.isEmpty())
-            return null;
+            return ImmutableList.of();
 
         Map<UUID, Integer> numTasks = new HashMap<>(strategies.size());
         ArrayList<UUID> sessions = new ArrayList<>(strategies.size());
-        for (Map.Entry<UUID, AbstractCompactionStrategy> entry : strategies.entrySet())
+        for (Map.Entry<UUID, LegacyAbstractCompactionStrategy> entry : strategies.entrySet())
         {
             if (canCleanup(entry.getKey()))
             {
@@ -380,22 +332,22 @@ synchronized AbstractCompactionTask getNextBackgroundTask(int gcBefore)
         }
 
         if (sessions.isEmpty())
-            return null;
+            return ImmutableList.of();
 
         // we want the session with the most compactions at the head of the list
         sessions.sort((o1, o2) -> numTasks.get(o2) - numTasks.get(o1));
 
         UUID sessionID = sessions.get(0);
-        return get(sessionID).getNextBackgroundTask(gcBefore);
+        return get(sessionID).getNextBackgroundTasks(gcBefore);
     }
 
     synchronized Collection<AbstractCompactionTask> getMaximalTasks(int gcBefore, boolean splitOutput)
     {
         if (strategies.isEmpty())
-            return null;
+            return ImmutableList.of();
 
         List<AbstractCompactionTask> maximalTasks = new ArrayList<>(strategies.size());
-        for (Map.Entry<UUID, AbstractCompactionStrategy> entry : strategies.entrySet())
+        for (Map.Entry<UUID, LegacyAbstractCompactionStrategy> entry : strategies.entrySet())
         {
             if (canCleanup(entry.getKey()))
             {
@@ -403,15 +355,13 @@ synchronized Collection<AbstractCompactionTask> getMaximalTasks(int gcBefore, bo
             }
             else
             {
-                Collection<AbstractCompactionTask> tasks = entry.getValue().getMaximalTask(gcBefore, splitOutput);
-                if (tasks != null)
-                    maximalTasks.addAll(tasks);
+                maximalTasks.addAll(entry.getValue().getMaximalTasks(gcBefore, splitOutput));
             }
         }
-        return !maximalTasks.isEmpty() ? maximalTasks : null;
+        return maximalTasks;
     }
 
-    Collection<AbstractCompactionStrategy> getStrategies()
+    Collection<LegacyAbstractCompactionStrategy> getStrategies()
     {
         return strategies.values();
     }
@@ -437,7 +387,7 @@ synchronized Set<ISSTableScanner> getScanners(Collection<SSTableReader> sstables
         Map<UUID, Set<SSTableReader>> sessionSSTables = new HashMap<>();
         for (SSTableReader sstable : sstables)
         {
-            UUID sessionID = sstable.getSSTableMetadata().pendingRepair;
+            UUID sessionID = sstable.getPendingRepair();
             checkPendingID(sessionID);
             sessionSSTables.computeIfAbsent(sessionID, k -> new HashSet<>()).add(sstable);
         }
@@ -457,14 +407,14 @@ synchronized Set<ISSTableScanner> getScanners(Collection<SSTableReader> sstables
         return scanners;
     }
 
-    public boolean hasStrategy(AbstractCompactionStrategy strategy)
+    public boolean hasStrategy(CompactionStrategy strategy)
     {
         return strategies.values().contains(strategy);
     }
 
     public synchronized boolean hasDataForSession(UUID sessionID)
     {
-        return strategies.keySet().contains(sessionID);
+        return strategies.containsKey(sessionID);
     }
 
     boolean containsSSTable(SSTableReader sstable)
@@ -479,79 +429,6 @@ boolean containsSSTable(SSTableReader sstable)
     public Collection<AbstractCompactionTask> createUserDefinedTasks(Collection<SSTableReader> sstables, int gcBefore)
     {
         Map<UUID, List<SSTableReader>> group = sstables.stream().collect(Collectors.groupingBy(s -> s.getSSTableMetadata().pendingRepair));
-        return group.entrySet().stream().map(g -> strategies.get(g.getKey()).getUserDefinedTask(g.getValue(), gcBefore)).collect(Collectors.toList());
-    }
-
-    /**
-     * promotes/demotes sstables involved in a consistent repair that has been finalized, or failed
-     */
-    class RepairFinishedCompactionTask extends AbstractCompactionTask
-    {
-        private final UUID sessionID;
-        private final long repairedAt;
-
-        RepairFinishedCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, UUID sessionID, long repairedAt)
-        {
-            super(cfs, transaction);
-            this.sessionID = sessionID;
-            this.repairedAt = repairedAt;
-        }
-
-        @VisibleForTesting
-        UUID getSessionID()
-        {
-            return sessionID;
-        }
-
-        protected void runMayThrow() throws Exception
-        {
-            boolean completed = false;
-            boolean obsoleteSSTables = isTransient && repairedAt > 0;
-            try
-            {
-                if (obsoleteSSTables)
-                {
-                    logger.info("Obsoleting transient repaired sstables for {}", sessionID);
-                    Preconditions.checkState(Iterables.all(transaction.originals(), SSTableReader::isTransient));
-                    transaction.obsoleteOriginals();
-                }
-                else
-                {
-                    logger.info("Moving {} from pending to repaired with repaired at = {} and session id = {}", transaction.originals(), repairedAt, sessionID);
-                    cfs.getCompactionStrategyManager().mutateRepaired(transaction.originals(), repairedAt, ActiveRepairService.NO_PENDING_REPAIR, false);
-                }
-                completed = true;
-            }
-            finally
-            {
-                if (obsoleteSSTables)
-                {
-                    transaction.finish();
-                }
-                else
-                {
-                    // we abort here because mutating metadata isn't guarded by LifecycleTransaction, so this won't roll
-                    // anything back. Also, we don't want to obsolete the originals. We're only using it to prevent other
-                    // compactions from marking these sstables compacting, and unmarking them when we're done
-                    transaction.abort();
-                }
-                if (completed)
-                {
-                    removeSessionIfEmpty(sessionID);
-                }
-            }
-        }
-
-        public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
-        {
-            throw new UnsupportedOperationException();
-        }
-
-        protected int executeInternal()
-        {
-            run();
-            return transaction.originals().size();
-        }
+        return group.entrySet().stream().map(g -> strategies.get(g.getKey()).getUserDefinedTasks(g.getValue(), gcBefore)).flatMap(Collection::stream).collect(Collectors.toList());
     }
-
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/RepairFinishedCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/RepairFinishedCompactionTask.java
new file mode 100644
index 000000000000..24769206b782
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/RepairFinishedCompactionTask.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Set;
+import java.util.UUID;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterables;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.service.ActiveRepairService;
+
+/**
+ * Promotes/demotes sstables involved in a consistent repair that has been finalized, or failed.
+ * <p>
+ * If {@code isTransient} is set and {@code repairedAt > 0} the transaction's original sstables are
+ * obsoleted; otherwise they are handed to {@code cfs.mutateRepaired} with {@code repairedAt} and
+ * {@link ActiveRepairService#NO_PENDING_REPAIR}. After a successful run the compaction strategy
+ * container is notified exactly once that the session has completed.
+ */
+public class RepairFinishedCompactionTask extends AbstractCompactionTask
+{
+    private static final Logger logger = LoggerFactory.getLogger(RepairFinishedCompactionTask.class);
+
+    private final UUID sessionID;
+    private final long repairedAt;
+    private final boolean isTransient;
+
+    /**
+     * @param cfs         the column family store passed on to {@link AbstractCompactionTask}
+     * @param transaction the transaction whose originals are the sstables to obsolete or mutate
+     * @param sessionID   id of the repair session this task cleans up after
+     * @param repairedAt  repaired-at value passed to {@code mutateRepaired}
+     * @param isTransient when set, and {@code repairedAt > 0}, the sstables are obsoleted instead of mutated
+     */
+    public RepairFinishedCompactionTask(ColumnFamilyStore cfs,
+                                        LifecycleTransaction transaction,
+                                        UUID sessionID,
+                                        long repairedAt,
+                                        boolean isTransient)
+    {
+        super(cfs, transaction);
+        this.sessionID = sessionID;
+        this.repairedAt = repairedAt;
+        this.isTransient = isTransient;
+    }
+
+    @VisibleForTesting
+    UUID getSessionID()
+    {
+        return sessionID;
+    }
+
+    /**
+     * Obsoletes or mutates the transaction's original sstables (see class comment), then always releases the
+     * transaction and, only if the work succeeded, reports the session as completed.
+     */
+    protected void runMayThrow() throws Exception
+    {
+        boolean completed = false;
+        boolean obsoleteSSTables = isTransient && repairedAt > 0;
+        try
+        {
+            if (obsoleteSSTables)
+            {
+                logger.info("Obsoleting transient repaired sstables for {}", sessionID);
+                Preconditions.checkState(Iterables.all(transaction.originals(), SSTableReader::isTransient));
+                transaction.obsoleteOriginals();
+            }
+            else
+            {
+                logger.info("Moving {} from pending to repaired with repaired at = {} and session id = {}", transaction.originals(), repairedAt, sessionID);
+                CompactionStrategyContainer compactionStrategyContainer = cfs.getCompactionStrategyContainer();
+                cfs.mutateRepaired(compactionStrategyContainer.getWriteLock(),
+                                   transaction.originals(),
+                                   repairedAt,
+                                   ActiveRepairService.NO_PENDING_REPAIR,
+                                   false);
+            }
+            completed = true;
+        }
+        finally
+        {
+            if (obsoleteSSTables)
+            {
+                transaction.finish();
+            }
+            else
+            {
+                // we abort here because mutating metadata isn't guarded by LifecycleTransaction, so this won't roll
+                // anything back. Also, we don't want to obsolete the originals. We're only using it to prevent other
+                // compactions from marking these sstables compacting, and unmarking them when we're done
+                transaction.abort();
+            }
+            if (completed)
+            {
+                // single notification point for both branches; it runs once the transaction above has been
+                // finished or aborted, so a successful run reports the session as completed exactly once
+                cfs.getCompactionStrategyContainer().repairSessionCompleted(sessionID);
+            }
+        }
+    }
+
+    @Override
+    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
+    {
+        throw new UnsupportedOperationException();
+    }
+
+    @Override
+    protected int executeInternal()
+    {
+        run();
+        return transaction.originals().size();
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
index 6b7568571c7b..c4b7332da879 100644
--- a/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
+++ b/src/java/org/apache/cassandra/db/compaction/SSTableSplitter.java
@@ -32,7 +32,7 @@ public class SSTableSplitter
 
     public SSTableSplitter(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
     {
-        this.task = SplittingCompactionTask.forSSTableSplitting(cfs, transaction, sstableSizeInMB);
+        this.task = new SplittingCompactionTask(cfs, transaction, sstableSizeInMB);
     }
 
     public void split()
@@ -44,20 +44,15 @@ private static class SplittingCompactionTask extends CompactionTask
     {
         private final int sstableSizeInMB;
 
-        private SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
+        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
         {
-            super(cfs, transaction, CompactionManager.NO_GC, false);
+            super(cfs, transaction, CompactionManager.NO_GC, false, null);
             this.sstableSizeInMB = sstableSizeInMB;
 
             if (sstableSizeInMB <= 0)
                 throw new IllegalArgumentException("Invalid target size for SSTables, must be > 0 (got: " + sstableSizeInMB + ")");
         }
 
-        static AbstractCompactionTask forSSTableSplitting(ColumnFamilyStore cfs, LifecycleTransaction transaction, int sstableSizeInMB)
-        {
-            return new SplittingCompactionTask(cfs, transaction, sstableSizeInMB);
-        }
-
         @Override
         protected CompactionController getCompactionController(Set<SSTableReader> toCompact)
         {
diff --git a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
index 166b2ae1af6a..3958fba09d75 100644
--- a/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/SingleSSTableLCSTask.java
@@ -43,18 +43,12 @@ public class SingleSSTableLCSTask extends AbstractCompactionTask
 
     private final int level;
 
-    public SingleSSTableLCSTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int level)
+    public SingleSSTableLCSTask(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level)
     {
-        super(cfs, txn);
+        super(strategy.cfs, txn);
         assert txn.originals().size() == 1;
         this.level = level;
-    }
-
-    public static AbstractCompactionTask forCompaction(LeveledCompactionStrategy strategy, LifecycleTransaction txn, int level)
-    {
-        SingleSSTableLCSTask ret = new SingleSSTableLCSTask(strategy.cfs, txn, level);
-        ret.compObserver = strategy.getBackgroundCompactions();
-        return ret;
+        addObserver(strategy);
     }
 
     @Override
@@ -63,6 +57,11 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs, Dir
         throw new UnsupportedOperationException("This method should never be called on SingleSSTableLCSTask");
     }
 
+    int getLevel()
+    {
+        return level;
+    }
+
     @Override
     protected int executeInternal()
     {
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
index 642aa55af7c5..e5592ed48c9b 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStatistics.java
@@ -30,21 +30,9 @@ public class SizeTieredCompactionStatistics extends TieredCompactionStatistics
     /** The average sstable size in this tier */
     private final long avgSSTableSize;
 
-    SizeTieredCompactionStatistics(long avgSSTableSize,
-                                   double hotness,
-                                   int numCompactions,
-                                   int numCompactionsInProgress,
-                                   int numSSTables,
-                                   int numCandidateSSTables,
-                                   int numCompactingSSTables,
-                                   long sizeInBytes,
-                                   double readThroughput,
-                                   double writeThroughput,
-                                   long tot,
-                                   long read,
-                                   long written)
+    SizeTieredCompactionStatistics(CompactionAggregateStatistics base, long avgSSTableSize)
     {
-        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput, hotness, tot, read, written);
+        super(base);
         this.avgSSTableSize = avgSSTableSize;
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
index 5e8c2f8c6eca..6a90e2889dc4 100644
--- a/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/SizeTieredCompactionStrategy.java
@@ -41,9 +41,7 @@
 import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.utils.Pair;
 
-import static com.google.common.collect.Iterables.filter;
-
-public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
+public class SizeTieredCompactionStrategy extends LegacyAbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(SizeTieredCompactionStrategy.class);
 
@@ -61,9 +59,9 @@ public class SizeTieredCompactionStrategy extends AbstractCompactionStrategy.Wit
     @VisibleForTesting
     protected final Set<SSTableReader> sstables = new HashSet<>();
 
-    public SizeTieredCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    public SizeTieredCompactionStrategy(CompactionStrategyFactory factory, Map<String, String> options)
     {
-        super(cfs, options);
+        super(factory, options);
         this.sizeTieredOptions = new SizeTieredCompactionStrategyOptions(options);
     }
 
@@ -77,13 +75,13 @@ protected synchronized CompactionAggregate getNextBackgroundAggregate(final int
         List<SSTableReader> candidates = new ArrayList<>();
         synchronized (sstables)
         {
-            Iterables.addAll(candidates, nonSuspectAndNotIn(sstables, cfs.getCompactingSSTables()));
+            Iterables.addAll(candidates, nonSuspectAndNotIn(sstables, dataTracker.getCompacting()));
         }
 
         SizeTieredBuckets sizeTieredBuckets = new SizeTieredBuckets(candidates, sizeTieredOptions, minThreshold, maxThreshold);
         sizeTieredBuckets.aggregate();
 
-        backgroundCompactions.setPending(sizeTieredBuckets.getAggregates());
+        backgroundCompactions.setPending(this, sizeTieredBuckets.getAggregates());
 
         CompactionAggregate ret = sizeTieredBuckets.getAggregates().isEmpty() ? null : sizeTieredBuckets.getAggregates().get(0);
 
@@ -336,8 +334,8 @@ private static double totHotness(Iterable<SSTableReader> sstables, @Nullable fin
     protected AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
     {
         return isMaximal && splitOutput
-               ? SplittingCompactionTask.forSplitting(this, txn, gcBefore)
-               : CompactionTask.forCompaction(this, txn, gcBefore);
+               ? new SplittingCompactionTask(cfs, txn, gcBefore, this)
+               : new CompactionTask(cfs, txn, gcBefore, false, this);
     }
 
     public long getMaxSSTableBytes()
@@ -347,7 +345,7 @@ public long getMaxSSTableBytes()
 
     public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
     {
-        Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
+        Map<String, String> uncheckedOptions = CompactionStrategyOptions.validateOptions(options);
         uncheckedOptions = SizeTieredCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 
         uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
@@ -392,7 +390,7 @@ public void removeSSTable(SSTableReader sstable)
     }
 
     @Override
-    protected Set<SSTableReader> getSSTables()
+    public Set<SSTableReader> getSSTables()
     {
         synchronized (sstables)
         {
@@ -409,14 +407,9 @@ public String toString()
 
     private static class SplittingCompactionTask extends CompactionTask
     {
-        public SplittingCompactionTask(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
-        {
-            super(strategy, txn, gcBefore, false);
-        }
-
-        static AbstractCompactionTask forSplitting(AbstractCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore)
+        public SplittingCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, CompactionStrategy strategy)
         {
-            return new SplittingCompactionTask(strategy, txn, gcBefore);
+            super(cfs, txn, gcBefore, false, strategy);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
index 9d6803c0a682..f785a180efdf 100644
--- a/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/TieredCompactionStatistics.java
@@ -27,41 +27,16 @@
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 
-public abstract class TieredCompactionStatistics extends CompactionAggregateStatistics
+abstract class TieredCompactionStatistics extends CompactionAggregateStatistics
 {
     private static final Collection<String> HEADER = ImmutableList.copyOf(Iterables.concat(ImmutableList.of("Bucket", "Hotness"),
-                                                                                           CompactionAggregateStatistics.HEADER,
-                                                                                           ImmutableList.of("Tot/Read/Written")));
+                                                                                           CompactionAggregateStatistics.HEADER));
 
     private static final long serialVersionUID = 3695927592357987916L;
-    /** The total read hotness of the sstables */
-    protected final double hotness;
-    /** Total uncompressed bytes of the sstables */
-    protected final long tot;
-    /** Total bytes read by ongoing compactions */
-    protected final long read;
-    /** Total bytes written by ongoing compactions */
-    protected final long written;
 
-    public TieredCompactionStatistics(int numCompactions,
-                                      int numCompactionsInProgress,
-                                      int numSSTables,
-                                      int numCandidateSSTables,
-                                      int numCompactingSSTables,
-                                      long sizeInBytes,
-                                      double readThroughput,
-                                      double writeThroughput,
-                                      double hotness,
-                                      long tot,
-                                      long read,
-                                      long written)
+    public TieredCompactionStatistics(CompactionAggregateStatistics base)
     {
-        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput);
-
-        this.hotness = hotness;
-        this.tot = tot;
-        this.read = read;
-        this.written = written;
+        super(base);
     }
 
     /** The total read hotness of the sstables */
@@ -71,27 +46,6 @@ public double hotness()
         return hotness;
     }
 
-    /** Total uncompressed bytes of the sstables */
-    @JsonProperty
-    public long tot()
-    {
-        return tot;
-    }
-
-    /** Uncompressed bytes read by compactions so far. */
-    @JsonProperty
-    public long read()
-    {
-        return read;
-    }
-
-    /** Uncompressed  bytes written by compactions so far. */
-    @JsonProperty
-    public long written()
-    {
-        return written;
-    }
-
     @Override
     protected Collection<String> header()
     {
@@ -107,7 +61,6 @@ protected Collection<String> data()
 
         data.addAll(super.data());
 
-        data.add(toString(tot()) + '/' + toString(read()) + '/' + toString(written()));
         return data;
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
index c106f4073ca2..ce935e058932 100644
--- a/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
+++ b/src/java/org/apache/cassandra/db/compaction/TimeTieredCompactionStatistics.java
@@ -35,21 +35,9 @@ public class TimeTieredCompactionStatistics extends TieredCompactionStatistics
     /** The timestamp in this tier */
     private final long timestamp;
 
-    TimeTieredCompactionStatistics(long timestamp,
-                                   double hotness,
-                                   int numCompactions,
-                                   int numCompactionsInProgress,
-                                   int numSSTables,
-                                   int numCandidateSSTables,
-                                   int numCompactingSSTables,
-                                   long sizeInBytes,
-                                   double readThroughput,
-                                   double writeThroughput,
-                                   long tot,
-                                   long read,
-                                   long written)
+    TimeTieredCompactionStatistics(CompactionAggregateStatistics base, long timestamp)
     {
-        super(numCompactions, numCompactionsInProgress, numSSTables, numCandidateSSTables, numCompactingSSTables, sizeInBytes, readThroughput, writeThroughput, hotness, tot, read, written);
+        super(base);
 
         this.timestamp = timestamp;
     }
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
index 7f3765ed464a..f6ec22cb18d2 100644
--- a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java
@@ -35,7 +35,6 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.exceptions.ConfigurationException;
@@ -44,22 +43,22 @@
 
 import static com.google.common.collect.Iterables.filter;
 
-public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy.WithAggregates
+public class TimeWindowCompactionStrategy extends LegacyAbstractCompactionStrategy.WithAggregates
 {
     private static final Logger logger = LoggerFactory.getLogger(TimeWindowCompactionStrategy.class);
 
-    private final TimeWindowCompactionStrategyOptions options;
+    private final TimeWindowCompactionStrategyOptions twcsOptions;
     private final Set<SSTableReader> sstables = new HashSet<>();
     private long lastExpiredCheck;
     private long highestWindowSeen;
 
-    public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    public TimeWindowCompactionStrategy(CompactionStrategyFactory factory, Map<String, String> options)
     {
-        super(cfs, options);
-        this.options = new TimeWindowCompactionStrategyOptions(options);
-        if (!options.containsKey(AbstractCompactionStrategy.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(AbstractCompactionStrategy.TOMBSTONE_THRESHOLD_OPTION))
+        super(factory, options);
+        this.twcsOptions = new TimeWindowCompactionStrategyOptions(options);
+        if (!options.containsKey(CompactionStrategyOptions.TOMBSTONE_COMPACTION_INTERVAL_OPTION) && !options.containsKey(CompactionStrategyOptions.TOMBSTONE_THRESHOLD_OPTION))
         {
-            disableTombstoneCompactions = true;
+            super.options.setDisableTombstoneCompactions(true);
             logger.debug("Disabling tombstone compactions for TWCS");
         }
         else
@@ -67,9 +66,12 @@ public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> o
     }
 
     @Override
-    public AbstractCompactionTask createCompactionTask(final int gcBefore, LifecycleTransaction txn, boolean isMaximal, boolean splitOutput)
+    public AbstractCompactionTask createCompactionTask(final int gcBefore,
+                                                       LifecycleTransaction txn,
+                                                       boolean isMaximal,
+                                                       boolean splitOutput)
     {
-        return CompactionTask.forTimeWindowCompaction(this, txn, gcBefore);
+        return new TimeWindowCompactionTask(cfs, txn, gcBefore, ignoreOverlaps(), this);
     }
 
     /**
@@ -80,10 +82,10 @@ public AbstractCompactionTask createCompactionTask(final int gcBefore, Lifecycle
     @Override
     protected synchronized CompactionAggregate getNextBackgroundAggregate(final int gcBefore)
     {
-        if (Iterables.isEmpty(cfs.getSSTables(SSTableSet.LIVE)))
+        if (Iterables.isEmpty(dataTracker.getView().select(SSTableSet.LIVE)))
             return null;
 
-        Set<SSTableReader> compacting = cfs.getCompactingSSTables();
+        Set<SSTableReader> compacting = dataTracker.getCompacting();
         Set<SSTableReader> uncompacting;
         synchronized (sstables)
         {
@@ -93,11 +95,11 @@ protected synchronized CompactionAggregate getNextBackgroundAggregate(final int
         // Find fully expired SSTables. Those will be included no matter what.
         Set<SSTableReader> expired = Collections.emptySet();
 
-        if (System.currentTimeMillis() - lastExpiredCheck > options.expiredSSTableCheckFrequency)
+        if (System.currentTimeMillis() - lastExpiredCheck > twcsOptions.expiredSSTableCheckFrequency)
         {
             logger.debug("TWCS expired check sufficiently far in the past, checking for fully expired SSTables");
-            expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, options.ignoreOverlaps ? Collections.emptySet() : cfs.getOverlappingLiveSSTables(uncompacting),
-                                                                   gcBefore, options.ignoreOverlaps);
+            expired = CompactionController.getFullyExpiredSSTables(cfs, uncompacting, twcsOptions.ignoreOverlaps ? Collections.emptySet() : cfs.getOverlappingLiveSSTables(uncompacting),
+                                                                   gcBefore, twcsOptions.ignoreOverlaps);
             lastExpiredCheck = System.currentTimeMillis();
         }
         else
@@ -105,7 +107,7 @@ protected synchronized CompactionAggregate getNextBackgroundAggregate(final int
             logger.debug("TWCS skipping check for fully expired SSTables");
         }
 
-        Set<SSTableReader> candidates = Sets.newHashSet(filterSuspectSSTables(uncompacting));
+        Set<SSTableReader> candidates = Sets.newHashSet(Iterables.filter(uncompacting, sstable -> !sstable.isMarkedSuspect()));
 
         CompactionAggregate compactionCandidate = getNextNonExpiredSSTables(Sets.difference(candidates, expired), gcBefore);
         if (expired.isEmpty())
@@ -114,7 +116,7 @@ protected synchronized CompactionAggregate getNextBackgroundAggregate(final int
         logger.debug("Including expired sstables: {}", expired);
         if (compactionCandidate == null)
         {
-            long timestamp = getWindowBoundsInMillis(options.sstableWindowUnit, options.sstableWindowSize,
+            long timestamp = getWindowBoundsInMillis(twcsOptions.sstableWindowUnit, twcsOptions.sstableWindowSize,
                                                      Collections.max(expired, Comparator.comparing(SSTableReader::getMaxTimestamp)).getMaxTimestamp());
             return CompactionAggregate.createTimeTiered(expired, timestamp);
         }
@@ -125,7 +127,7 @@ protected synchronized CompactionAggregate getNextBackgroundAggregate(final int
     private CompactionAggregate getNextNonExpiredSSTables(Iterable<SSTableReader> nonExpiringSSTables, final int gcBefore)
     {
         List<CompactionAggregate> candidates = getCompactionCandidates(nonExpiringSSTables);
-        backgroundCompactions.setPending(candidates);
+        backgroundCompactions.setPending(this, candidates);
 
         CompactionAggregate ret = candidates.isEmpty() ? null : candidates.get(0);
 
@@ -139,7 +141,7 @@ private CompactionAggregate getNextNonExpiredSSTables(Iterable<SSTableReader> no
 
     private List<CompactionAggregate> getCompactionCandidates(Iterable<SSTableReader> candidateSSTables)
     {
-        NavigableMap<Long, List<SSTableReader>> buckets = getBuckets(candidateSSTables, options.sstableWindowUnit, options.sstableWindowSize, options.timestampResolution);
+        NavigableMap<Long, List<SSTableReader>> buckets = getBuckets(candidateSSTables, twcsOptions.sstableWindowUnit, twcsOptions.sstableWindowSize, twcsOptions.timestampResolution);
         // Update the highest window seen, if necessary
         if (!buckets.isEmpty())
         {
@@ -151,7 +153,7 @@ private List<CompactionAggregate> getCompactionCandidates(Iterable<SSTableReader
         return getBucketAggregates(buckets,
                                    cfs.getMinimumCompactionThreshold(),
                                    cfs.getMaximumCompactionThreshold(),
-                                   options.stcsOptions,
+                                   twcsOptions.stcsOptions,
                                    this.highestWindowSeen);
     }
 
@@ -192,7 +194,7 @@ public void removeSSTable(SSTableReader sstable)
     }
 
     @Override
-    protected Set<SSTableReader> getSSTables()
+    public Set<SSTableReader> getSSTables()
     {
         synchronized (sstables)
         {
@@ -370,7 +372,7 @@ else if (bucket.size() >= 2 && key < now)
 
     boolean ignoreOverlaps()
     {
-        return options.ignoreOverlaps;
+        return twcsOptions.ignoreOverlaps;
     }
 
     public long getMaxSSTableBytes()
@@ -381,7 +383,7 @@ public long getMaxSSTableBytes()
 
     public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
     {
-        Map<String, String> uncheckedOptions = AbstractCompactionStrategy.validateOptions(options);
+        Map<String, String> uncheckedOptions = CompactionStrategyOptions.validateOptions(options);
         uncheckedOptions = TimeWindowCompactionStrategyOptions.validateOptions(options, uncheckedOptions);
 
         uncheckedOptions.remove(CompactionParams.Option.MIN_THRESHOLD.toString());
diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
index 57221ab03d53..a2c10d9088a2 100644
--- a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
+++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionTask.java
@@ -28,9 +28,9 @@ public class TimeWindowCompactionTask extends CompactionTask
 {
     private final boolean ignoreOverlaps;
 
-    public TimeWindowCompactionTask(TimeWindowCompactionStrategy strategy, LifecycleTransaction txn, int gcBefore, boolean ignoreOverlaps)
+    public TimeWindowCompactionTask(ColumnFamilyStore cfs, LifecycleTransaction txn, int gcBefore, boolean ignoreOverlaps, TimeWindowCompactionStrategy strategy)
     {
-        super(strategy, txn, gcBefore, false);
+        super(cfs, txn, gcBefore, false, strategy);
         this.ignoreOverlaps = ignoreOverlaps;
     }
 
diff --git a/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionContainer.java b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionContainer.java
new file mode 100644
index 000000000000..def929cb3ae3
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionContainer.java
@@ -0,0 +1,361 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+
+import com.google.common.collect.ImmutableList;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.ScannerList;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.notifications.INotification;
+import org.apache.cassandra.schema.CompactionParams;
+
+public class UnifiedCompactionContainer implements CompactionStrategyContainer
+{
+    private final CompactionStrategyFactory factory;
+    private final CompactionParams params;
+    private final CompactionParams metadataParams;
+    private final UnifiedCompactionStrategy strategy;
+
+    AtomicBoolean enabled;
+
+    UnifiedCompactionContainer(CompactionStrategyFactory factory,
+                               BackgroundCompactions backgroundCompactions,
+                               CompactionParams params,
+                               CompactionParams metadataParams,
+                               boolean enabled)
+    {
+        this.factory = factory;
+        this.params = params;
+        this.metadataParams = metadataParams;
+        this.strategy = new UnifiedCompactionStrategy(factory, backgroundCompactions, params.options());
+        this.enabled = new AtomicBoolean(enabled);
+
+        factory.getCompactionLogger().strategyCreated(this.strategy);
+
+        if (this.strategy.getOptions().isLogAll())
+            factory.getCompactionLogger().enable();
+        else
+            factory.getCompactionLogger().disable();
+
+        startup();
+    }
+
+    @Override
+    public void enable()
+    {
+        this.enabled.set(true);
+    }
+
+    @Override
+    public void disable()
+    {
+        this.enabled.set(false);
+    }
+
+    @Override
+    public boolean isEnabled()
+    {
+        return enabled.get() && strategy.isActive;
+    }
+
+    @Override
+    public boolean isActive()
+    {
+        return strategy.isActive;
+    }
+
+    public static CompactionStrategyContainer create(@Nullable CompactionStrategyContainer previous,
+                                                     CompactionStrategyFactory strategyFactory,
+                                                     CompactionParams compactionParams,
+                                                     CompactionStrategyContainer.ReloadReason reason)
+    {
+        boolean enabled = CompactionStrategyFactory.enableCompactionOnReload(previous, compactionParams, reason);
+        BackgroundCompactions backgroundCompactions;
+        // inherit compactions history from previous UCS container
+        if (previous instanceof UnifiedCompactionContainer)
+            backgroundCompactions = ((UnifiedCompactionContainer) previous).getBackgroundCompactions();
+
+        // for other cases start from scratch
+        // We don't inherit from legacy compactions right now because there are multiple strategies and we'd need
+        // to merge their BackgroundCompactions to support that. Merging per se is not tricky, but the bigger problem
+        // is aggregate cleanup. We'd need to unsubscribe from compaction tasks by legacy strategies and subscribe
+        // by the new UCS to remove inherited ongoing compactions when they complete.
+        // We might want to revisit this issue later to improve UX.
+        else
+            backgroundCompactions = new BackgroundCompactions(strategyFactory.getCfs());
+        CompactionParams metadataParams = createMetadataParams(previous, compactionParams, reason);
+
+        if (previous != null)
+            previous.shutdown();
+
+        return new UnifiedCompactionContainer(strategyFactory,
+                                              backgroundCompactions,
+                                              compactionParams,
+                                              metadataParams,
+                                              enabled);
+    }
+
+    @Override
+    public CompactionStrategyContainer reload(@Nonnull CompactionStrategyContainer previous,
+                                              CompactionParams compactionParams,
+                                              ReloadReason reason)
+    {
+        return create(previous, factory, compactionParams, reason);
+    }
+
+    private static CompactionParams createMetadataParams(@Nullable CompactionStrategyContainer previous,
+                                                         CompactionParams compactionParams,
+                                                         ReloadReason reason)
+    {
+        CompactionParams metadataParams;
+        if (reason == CompactionStrategyContainer.ReloadReason.METADATA_CHANGE)
+            // metadataParams are aligned with compactionParams. We do not access TableParams.compaction to avoid racing with
+            // concurrent ALTER TABLE metadata change.
+            metadataParams = compactionParams;
+        else if (previous != null)
+            metadataParams = previous.getMetadataCompactionParams();
+        else
+            metadataParams = null;
+
+        return metadataParams;
+    }
+
+    @Override
+    public CompactionParams getCompactionParams()
+    {
+        return params;
+    }
+
+    @Override
+    public CompactionParams getMetadataCompactionParams()
+    {
+        return metadataParams;
+    }
+
+    @Override
+    public List<CompactionStrategy> getStrategies()
+    {
+        return ImmutableList.of(strategy);
+    }
+
+    @Override
+    public List<CompactionStrategy> getStrategies(boolean isRepaired, @Nullable UUID pendingRepair)
+    {
+        return getStrategies();
+    }
+
+    @Override
+    public void repairSessionCompleted(UUID sessionID)
+    {
+        // We are not tracking SSTables, so nothing to do here.
+    }
+
+    /**
+     * UCC does not need to use this method with {@link ColumnFamilyStore#mutateRepaired}
+     * @return null
+     */
+    @Override
+    public ReentrantReadWriteLock.WriteLock getWriteLock()
+    {
+        return null;
+    }
+
+    @Override
+    public CompactionLogger getCompactionLogger()
+    {
+        return strategy.compactionLogger;
+    }
+
+    @Override
+    public void pause()
+    {
+        strategy.pause();
+    }
+
+    @Override
+    public void resume()
+    {
+        strategy.resume();
+    }
+
+    @Override
+    public void startup()
+    {
+        strategy.startup();
+    }
+
+    @Override
+    public void shutdown()
+    {
+        strategy.shutdown();
+    }
+
+    @Override
+    public Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
+    {
+        return strategy.getNextBackgroundTasks(gcBefore);
+    }
+
+    @Override
+    public CompactionTasks getMaximalTasks(int gcBefore, boolean splitOutput)
+    {
+        return strategy.getMaximalTasks(gcBefore, splitOutput);
+    }
+
+    @Override
+    public CompactionTasks getUserDefinedTasks(Collection<SSTableReader> sstables, int gcBefore)
+    {
+        return strategy.getUserDefinedTasks(sstables, gcBefore);
+    }
+
+    @Override
+    public int getEstimatedRemainingTasks()
+    {
+        return strategy.getEstimatedRemainingTasks();
+    }
+
+    @Override
+    public AbstractCompactionTask createCompactionTask(LifecycleTransaction txn, int gcBefore, long maxSSTableBytes)
+    {
+        return strategy.createCompactionTask(txn, gcBefore, maxSSTableBytes);
+    }
+
+    @Override
+    public int getTotalCompactions()
+    {
+        return strategy.getTotalCompactions();
+    }
+
+    @Override
+    public List<CompactionStrategyStatistics> getStatistics()
+    {
+        return strategy.getStatistics();
+    }
+
+    @Override
+    public long getMaxSSTableBytes()
+    {
+        return strategy.getMaxSSTableBytes();
+    }
+
+    @Override
+    public int[] getSSTableCountPerLevel()
+    {
+        return strategy.getSSTableCountPerLevel();
+    }
+
+    @Override
+    public int getLevelFanoutSize()
+    {
+        return strategy.getLevelFanoutSize();
+    }
+
+    @Override
+    public ScannerList getScanners(Collection<SSTableReader> sstables, Collection<Range<Token>> ranges)
+    {
+        return strategy.getScanners(sstables, ranges);
+    }
+
+    @Override
+    public String getName()
+    {
+        return strategy.getName();
+    }
+
+    @Override
+    public Set<SSTableReader> getSSTables()
+    {
+        return strategy.getSSTables();
+    }
+
+    @Override
+    public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
+    {
+        return strategy.groupSSTablesForAntiCompaction(sstablesToGroup);
+    }
+
+    @Override
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
+                                                       long keyCount,
+                                                       long repairedAt,
+                                                       UUID pendingRepair,
+                                                       boolean isTransient,
+                                                       MetadataCollector collector,
+                                                       SerializationHeader header,
+                                                       Collection<Index.Group> indexGroups,
+                                                       LifecycleNewTracker lifecycleNewTracker)
+    {
+        return strategy.createSSTableMultiWriter(descriptor,
+                                                 keyCount,
+                                                 repairedAt,
+                                                 pendingRepair,
+                                                 isTransient,
+                                                 collector,
+                                                 header,
+                                                 indexGroups,
+                                                 lifecycleNewTracker);
+    }
+
+    @Override
+    public boolean supportsEarlyOpen()
+    {
+        return strategy.supportsEarlyOpen();
+    }
+
+    BackgroundCompactions getBackgroundCompactions()
+    {
+        return strategy.backgroundCompactions;
+    }
+
+    @Override
+    public void onInProgress(CompactionProgress progress)
+    {
+        strategy.onInProgress(progress);
+    }
+
+    @Override
+    public void onCompleted(UUID id)
+    {
+        strategy.onCompleted(id);
+    }
+
+    @Override
+    public void handleNotification(INotification notification, Object sender)
+    {
+        // TODO - this is a no-op because the strategy is stateless but we could detect here
+        // sstables that are added either because of streaming or because of nodetool refresh
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStatistics.java b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStatistics.java
new file mode 100644
index 000000000000..e4f2dde3f607
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStatistics.java
@@ -0,0 +1,165 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * The statistics for unified compaction.
+ * <p/>
+ * Implements serializable to allow structured info to be returned via JMX.
+ */
+public class UnifiedCompactionStatistics extends CompactionAggregateStatistics
+{
+    private static final Collection<String> HEADER = ImmutableList.copyOf(Iterables.concat(ImmutableList.of("Bucket", "W", "T", "F", "min size", "max size"),
+                                                                                           CompactionAggregateStatistics.HEADER));
+
+    private static final long serialVersionUID = 3695927592357345266L;
+
+    /** The bucket number */
+    private final int bucket;
+
+    /** The survival factor o */
+    private final double survivalFactor;
+
+    /** The scaling parameter W */
+    private final int scalingParameter;
+
+    /** The number of SSTables T that trigger a compaction */
+    private final int threshold;
+
+    /** The fanout size F */
+    private final int fanout;
+
+    /** The minimum size for an SSTable that belongs to this bucket */
+    private final long minSizeBytes;
+
+    /** The maximum size for an SSTable run that belongs to this bucket */
+    private final long maxSizeBytes;
+
+    /** The name of the shard */
+    private final String shard;
+
+    UnifiedCompactionStatistics(CompactionAggregateStatistics base,
+                                int bucketIndex,
+                                double survivalFactor,
+                                int scalingParameter,
+                                int threshold,
+                                int fanout,
+                                long minSizeBytes,
+                                long maxSizeBytes,
+                                String shard)
+    {
+        super(base);
+
+        this.bucket = bucketIndex;
+        this.survivalFactor = survivalFactor;
+        this.scalingParameter = scalingParameter;
+        this.threshold = threshold;
+        this.fanout = fanout;
+        this.minSizeBytes = minSizeBytes;
+        this.maxSizeBytes = maxSizeBytes;
+        this.shard = shard;
+    }
+
+    /** The bucket number */
+    @JsonProperty
+    public int bucket()
+    {
+        return bucket;
+    }
+
+    /** The survival factor o, currently always one */
+    @JsonProperty
+    public double survivalFactor()
+    {
+        return survivalFactor;
+    }
+
+    /** The scaling parameter W */
+    @JsonProperty
+    public int scalingParameter()
+    {
+        return scalingParameter;
+    }
+
+    /** The number of SSTables T that trigger a compaction */
+    @JsonProperty
+    public int threshold()
+    {
+        return threshold;
+    }
+
+    /** The fanout size F */
+    @JsonProperty
+    public int fanout()
+    {
+        return fanout;
+    }
+
+    /** The minimum size for an SSTable that belongs to this bucket */
+    @JsonProperty
+    public long minSizeBytes()
+    {
+        return minSizeBytes;
+    }
+
+    /** The maximum size for an SSTable that belongs to this bucket */
+    @JsonProperty
+    public long maxSizeBytes()
+    {
+        return maxSizeBytes;
+    }
+
+    /** The name of the shard, empty if the compaction is not sharded (the default). */
+    @JsonProperty
+    @Override
+    public String shard()
+    {
+        return shard;
+    }
+
+    @Override
+    protected Collection<String> header()
+    {
+        return HEADER;
+    }
+
+    @Override
+    protected Collection<String> data()
+    {
+        List<String> data = new ArrayList<>(HEADER.size());
+        data.add(Integer.toString(bucket()));
+        data.add(Integer.toString(scalingParameter));
+        data.add(Integer.toString(threshold));
+        data.add(Integer.toString(fanout));
+        data.add(FBUtilities.prettyPrintMemory(minSizeBytes));
+        data.add(FBUtilities.prettyPrintMemory(maxSizeBytes));
+
+        data.addAll(super.data());
+
+        return data;
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java
index 98a2bbda0577..d090b27dc474 100644
--- a/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java
+++ b/src/java/org/apache/cassandra/db/compaction/UnifiedCompactionStrategy.java
@@ -1,13 +1,11 @@
 /*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
+ * Copyright DataStax, Inc.
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -18,17 +16,1061 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
 import java.util.Map;
+import java.util.Optional;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.UUID;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Ordering;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import org.agrona.collections.IntArrayList;
+import org.apache.cassandra.concurrent.ScheduledExecutors;
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DiskBoundaries;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.SortedLocalRanges;
+import org.apache.cassandra.db.compaction.unified.Controller;
+import org.apache.cassandra.db.compaction.unified.ShardedMultiWriter;
+import org.apache.cassandra.db.compaction.unified.UnifiedCompactionTask;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.apache.cassandra.utils.Throwables.perform;
 
 /**
- * TODO: STAR-13 will introduce true UCS, please remove me.
+ * The unified compaction strategy is described in this design document:
+ *
+ * TODO: link to design doc or SEP
  */
-public class UnifiedCompactionStrategy extends SizeTieredCompactionStrategy
+public class UnifiedCompactionStrategy extends AbstractCompactionStrategy
 {
-    public UnifiedCompactionStrategy(ColumnFamilyStore cfs, Map<String, String> options)
+    public static final Class<? extends CompactionStrategyContainer> CONTAINER_CLASS = UnifiedCompactionContainer.class;
+
+    private static final Logger logger = LoggerFactory.getLogger(UnifiedCompactionStrategy.class);
+
+    static final int MAX_LEVELS = 32;   // This is enough for a few petabytes of data (with the worst case fan factor
+                                        // at W=0 this leaves room for 2^32 sstables, presumably of at least 1MB each).
+
+    private final Controller controller;
+
+    private volatile ArenaSelector arenaSelector;
+
+    private long lastExpiredCheck;
+
+    public UnifiedCompactionStrategy(CompactionStrategyFactory factory, BackgroundCompactions backgroundCompactions, Map<String, String> options)
+    {
+        this(factory, backgroundCompactions, options, Controller.fromOptions(factory.getCfs(), options));
+    }
+
+    public UnifiedCompactionStrategy(CompactionStrategyFactory factory, BackgroundCompactions backgroundCompactions, Controller controller)
+    {
+        this(factory, backgroundCompactions, new HashMap<>(), controller);
+    }
+
+    public UnifiedCompactionStrategy(CompactionStrategyFactory factory, BackgroundCompactions backgroundCompactions, Map<String, String> options, Controller controller)
+    {
+        super(factory, backgroundCompactions, options);
+        this.controller = controller;
+    }
+
+    @VisibleForTesting
+    public UnifiedCompactionStrategy(CompactionStrategyFactory factory, Controller controller)
+    {
+        this(factory, new BackgroundCompactions(factory.getCfs()), new HashMap<>(), controller);
+    }
+
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        return Controller.validateOptions(CompactionStrategyOptions.validateOptions(options));
+    }
+
+    @Override
+    public Collection<Collection<SSTableReader>> groupSSTablesForAntiCompaction(Collection<SSTableReader> sstablesToGroup)
+    {
+        // The candidates are first partitioned into compaction shards; the parent's grouping is then applied to
+        // each shard's sstables on its own and the per-shard groups are concatenated in shard order.
+        Collection<Collection<SSTableReader>> grouped = new ArrayList<>();
+        getCompactionShards(sstablesToGroup).forEach(
+            shard -> grouped.addAll(super.groupSSTablesForAntiCompaction(shard.sstables)));
+
+        return grouped;
+    }
+
+    @Override
+    public void startup()
+    {
+        perform(super::startup,
+                () -> controller.startup(this, ScheduledExecutors.scheduledTasks));
+    }
+
+    @Override
+    public void shutdown()
+    {
+        perform(super::shutdown,
+                controller::shutdown);
+    }
+
+    /**
+     * Returns a collection of compaction tasks.
+     *
+     * This method is synchronized because task creation is significantly more expensive in UCS; the strategy is
+     * stateless, therefore it has to compute the shard/bucket structure on each call.
+     *
+     * @param gcBefore throw away tombstones older than this
+     * @return collection of AbstractCompactionTask, which could be either a CompactionTask or an UnifiedCompactionTask
+     */
+    @Override
+    public synchronized Collection<AbstractCompactionTask> getNextBackgroundTasks(int gcBefore)
+    {
+        controller.onStrategyBackgroundTaskRequest();
+
+        Collection<CompactionAggregate> compactionAggregates = getNextCompactionAggregates(gcBefore);
+
+        Collection<AbstractCompactionTask> tasks = new ArrayList<>(compactionAggregates.size());
+        for (CompactionAggregate aggregate : compactionAggregates)
+        {
+            LifecycleTransaction transaction = dataTracker.tryModify(aggregate.getSelected().sstables, OperationType.COMPACTION);
+            if (transaction != null)
+            {
+                backgroundCompactions.setSubmitted(this, transaction.opId(), aggregate);
+                tasks.add(createCompactionTask(transaction, gcBefore));
+            }
+            else
+            {
+                // This can happen e.g. due to a race with upgrade tasks
+                logger.error("Failed to submit compaction {} because a transaction could not be created. If this happens frequently, it should be reported", aggregate);
+            }
+        }
+
+        return tasks;
+    }
+
+    /**
+     * Create the sstable writer used for flushing.
+     *
+     * @return either a normal sstable writer, if there are no shards, or a sharded sstable writer that will
+     *         create multiple sstables if a shard has a sufficiently large sstable.
+     */
+    @Override
+    public SSTableMultiWriter createSSTableMultiWriter(Descriptor descriptor,
+                                                       long keyCount,
+                                                       long repairedAt,
+                                                       UUID pendingRepair,
+                                                       boolean isTransient,
+                                                       MetadataCollector meta,
+                                                       SerializationHeader header,
+                                                       Collection<Index.Group> indexGroups,
+                                                       LifecycleNewTracker lifecycleNewTracker)
+    {
+        if (controller.getNumShards() <= 1)
+            return super.createSSTableMultiWriter(descriptor,
+                                                  keyCount,
+                                                  repairedAt,
+                                                  pendingRepair,
+                                                  isTransient,
+                                                  meta,
+                                                  header,
+                                                  indexGroups,
+                                                  lifecycleNewTracker);
+
+        return new ShardedMultiWriter(cfs,
+                                      descriptor,
+                                      keyCount,
+                                      repairedAt,
+                                      pendingRepair,
+                                      isTransient,
+                                      meta,
+                                      header,
+                                      indexGroups,
+                                      lifecycleNewTracker,
+                                      controller.getMinSstableSizeBytes(),
+                                      getShardBoundaries());
+    }
+
+    /**
+     * Create the task that in turn creates the sstable writer used for compaction.
+     *
+     * @return either a normal compaction task, if there are no shards, or a sharded compaction task that in turn will
+     * create a sharded compaction writer.
+     */
+    private CompactionTask createCompactionTask(LifecycleTransaction transaction, int gcBefore)
+    {
+        // More than one shard means the output has to be split, which only the unified task does.
+        boolean sharded = controller.getNumShards() > 1;
+        return sharded ? new UnifiedCompactionTask(cfs, this, transaction, gcBefore, controller.getMinSstableSizeBytes(), getShardBoundaries())
+                       : new CompactionTask(cfs, transaction, gcBefore, false, this);
+    }
+
+    private void maybeUpdateSelector()
+    {
+        if (arenaSelector != null && !arenaSelector.diskBoundaries.isOutOfDate())
+            return; // the disk boundaries (and thus the local ranges too) have not changed since the last time we calculated
+
+        synchronized (this)
+        {
+            if (arenaSelector != null && !arenaSelector.diskBoundaries.isOutOfDate())
+                return; // another thread beat us to the update
+
+            DiskBoundaries currentBoundaries = cfs.getDiskBoundaries();
+            List<PartitionPosition> shardBoundaries = computeShardBoundaries(currentBoundaries.getLocalRanges(),
+                                                                             currentBoundaries.getPositions(),
+                                                                             controller.getNumShards(),
+                                                                             cfs.getPartitioner());
+            arenaSelector = new ArenaSelector(currentBoundaries, shardBoundaries);
+            // Note: this can just as well be done without the synchronization (races would be benign, just doing some
+            // redundant work). For the current usages of this blocking is fine and expected to perform no worse.
+        }
+    }
+
+    /**
+     * We want to split the local token range in shards, aiming for close to equal share for each shard.
+     * If there are no disk boundaries, we just split the token space equally, but if multiple disks have been defined
+     * (each with its own share of the local range), we can't have shards spanning disk boundaries. This means that
+     * shards need to be selected within the disk's portion of the local ranges.
+     *
+     * As an example of what this means, consider a 3-disk node and 10 shards. The range is split equally between
+     * disks, but we can only split shards within a disk range, thus we end up with 6 shards taking 1/3*1/3=1/9 of the
+     * token range, and 4 smaller shards taking 1/3*1/4=1/12 of the token range.
+     */
+    @VisibleForTesting
+    static List<PartitionPosition> computeShardBoundaries(SortedLocalRanges localRanges,
+                                                          List<PartitionPosition> diskBoundaries,
+                                                          int numShards,
+                                                          IPartitioner partitioner)
+    {
+        Optional<Splitter> splitter = partitioner.splitter();
+        if (diskBoundaries != null && !splitter.isPresent())
+            return diskBoundaries;
+        else if (!splitter.isPresent()) // C* 2i case, just return 1 boundary at min token
+            return ImmutableList.of(partitioner.getMinimumToken().minKeyBound());
+
+        // this should only happen in tests that change partitioners, but we don't want UCS to throw
+        // where other strategies work even if the situations are unrealistic.
+        if (localRanges.getRanges().isEmpty() || !localRanges.getRanges()
+                                                             .get(0)
+                                                             .range()
+                                                             .left
+                                                             .getPartitioner()
+                                                             .equals(partitioner))
+            localRanges = new SortedLocalRanges(StorageService.instance,
+                                                localRanges.getCfs(),
+                                                localRanges.getRingVersion(),
+                                                ImmutableList.of(new Splitter.WeightedRange(1.0,
+                                                                                            new Range<>(partitioner.getMinimumToken(),
+                                                                                                        partitioner.getMaximumToken()))));
+
+        if (diskBoundaries == null || diskBoundaries.size() <= 1)
+            return localRanges.split(numShards);
+
+        if (numShards <= diskBoundaries.size())
+            return diskBoundaries;
+
+        return splitPerDiskRanges(localRanges,
+                                  diskBoundaries,
+                                  getRangesTotalSize(localRanges.getRanges()),
+                                  numShards,
+                                  splitter.get());
+    }
+
+    /**
+     * Split the per-disk ranges and generate the required number of shard boundaries.
+     * This works by accumulating the size after each disk's share, multiplying by numShards/totalSize and rounding to
+     * produce an integer number of total shards needed by the disk boundary, which in turn defines how many need to be
+     * added for this disk.
+     *
+     * For example, for a total size of 1, 2 disks (each of 0.5 share) and 3 shards, this will:
+     * -process disk 1:
+     * -- calculate 1/2 as the accumulated size
+     * -- map this to 3/2 and round to 2 shards
+     * -- split the disk's ranges into two equally-sized shards
+     * -process disk 2:
+     * -- calculate 1 as the accumulated size
+     * -- map it to 3 and round to 3 shards
+     * -- assign the disk's ranges to one shard
+     *
+     * The resulting shards will not be of equal size and this works best if the disk shares are distributed evenly
+     * (which the current code always ensures).
+     */
+    private static List<PartitionPosition> splitPerDiskRanges(SortedLocalRanges localRanges,
+                                                              List<PartitionPosition> diskBoundaries,
+                                                              double totalSize,
+                                                              int numShards,
+                                                              Splitter splitter)
+    {
+        double perShard = totalSize / numShards;
+        List<PartitionPosition> shardBoundaries = new ArrayList<>(numShards);
+        double processedSize = 0;
+        Token left = diskBoundaries.get(0).getToken().getPartitioner().getMinimumToken();
+        for (PartitionPosition boundary : diskBoundaries)
+        {
+            Token right = boundary.getToken();
+            List<Splitter.WeightedRange> disk = localRanges.subrange(new Range<>(left, right));
+
+            processedSize += getRangesTotalSize(disk);
+            int targetCount = (int) Math.round(processedSize / perShard);
+            List<Token> splits = splitter.splitOwnedRanges(Math.max(targetCount - shardBoundaries.size(), 1), disk, Splitter.SplitType.ALWAYS_SPLIT).boundaries;
+            shardBoundaries.addAll(Collections2.transform(splits, Token::maxKeyBound));
+            // The splitting always results in maxToken as the last boundary. Replace it with the disk's upper bound.
+            shardBoundaries.set(shardBoundaries.size() - 1, boundary);
+
+            left = right;
+        }
+        assert shardBoundaries.size() == numShards;
+        return shardBoundaries;
+    }
+
+    private static double getRangesTotalSize(List<Splitter.WeightedRange> ranges)
+    {
+        double totalSize = 0;
+        for (Splitter.WeightedRange range : ranges)
+            totalSize += range.left().size(range.right());
+        return totalSize;
+    }
+
+    @VisibleForTesting
+    List<PartitionPosition> getShardBoundaries()
+    {
+        maybeUpdateSelector();
+        return arenaSelector.shardBoundaries;
+    }
+
+    private Collection<CompactionAggregate> getNextCompactionAggregates(int gcBefore)
+    {
+        // Calculate the running compaction limits, i.e. the overall number of compactions permitted, which is either
+        // the compaction thread count, or the compaction throughput divided by the compaction rate (to prevent slowing
+        // down individual compaction progress).
+        String rateLimitLog = "";
+
+        // identify parallel compactions limit
+        int maxConcurrentCompactions = controller.maxConcurrentCompactions();
+        long spaceOverheadLimit = controller.maxCompactionSpaceBytes();
+
+        // identify throughput limit
+        double throughputLimit = controller.maxThroughput();
+        int maxCompactions;
+        if (throughputLimit < Double.MAX_VALUE)
+        {
+            int maxCompactionsForThroughput;
+
+            double compactionRate = backgroundCompactions.compactionRate.get();
+            if (compactionRate > 0)
+            {
+                // Start as many as can saturate the limit, making sure to also account for compactions that have
+                // already been started but don't have progress yet.
+
+                // Note: the throughput limit is adjusted here because the limiter won't let compaction proceed at more
+                // than the given rate, and small hiccups or rounding errors could cause this to go above the current
+                // running count when we are already at capacity.
+                // Allow up to 5% variability, or if we are permitted more than 20 concurrent compactions, one/maxcount
+                // so that we don't issue less tasks than we should.
+                double adjustment = Math.min(0.05, 1.0 / maxConcurrentCompactions);
+                maxCompactionsForThroughput = (int) Math.ceil(throughputLimit * (1 - adjustment) / compactionRate);
+            }
+            else
+            {
+                // If we don't have running compactions we don't know the effective rate.
+                // Allow only one compaction; this will be called again soon enough to recheck.
+                maxCompactionsForThroughput = 1;
+            }
+
+            rateLimitLog = String.format(" rate-based limit %d (rate %s/%s)",
+                                         maxCompactionsForThroughput,
+                                         FBUtilities.prettyPrintMemoryPerSecond((long) compactionRate),
+                                         FBUtilities.prettyPrintMemoryPerSecond((long) throughputLimit));
+            maxCompactions = Math.min(maxConcurrentCompactions, maxCompactionsForThroughput);
+        }
+        else
+            maxCompactions = maxConcurrentCompactions;
+
+        // Now that we have a count, make sure it is spread close to equally among levels. In other words, reserve
+        // floor(permitted / levels) compactions for each level and don't permit more than ceil(permitted / levels) on
+        // any, to make sure that no level hogs all threads and thus lowest-level ops (which need to run more often but
+        // complete quickest) have a chance to run frequently. Also, running compactions can't go above the specified
+        // space overhead limit.
+        // To do this we count the number and size of already running compactions on each level and make sure any new
+        // ones we select satisfy these constraints.
+        int[] perLevel = new int[MAX_LEVELS];
+        int levelCount = 1; // Start at 1 to avoid division by zero if the aggregates list is empty.
+        int runningCompactions = 0;
+        long spaceAvailable = spaceOverheadLimit;
+        for (CompactionPick compaction : backgroundCompactions.getCompactionsInProgress())
+        {
+            final int level = levelOf(compaction);
+            ++perLevel[level];
+            ++runningCompactions;
+            levelCount = Math.max(levelCount, level + 1);
+            spaceAvailable -= compaction.totSizeInBytes;
+        }
+
+        logger.debug("Selecting up to {} new compactions of up to {}, concurrency limit {}{}",
+                     maxCompactions - runningCompactions,
+                     FBUtilities.prettyPrintMemory(spaceAvailable),
+                     maxConcurrentCompactions,
+                     rateLimitLog);
+
+        List<CompactionAggregate.UnifiedAggregate> pending = new ArrayList<>();
+        long ts = System.currentTimeMillis();
+        boolean expiredCheck = ts - lastExpiredCheck > controller.getExpiredSSTableCheckFrequency();
+        if (expiredCheck)
+            lastExpiredCheck = ts;
+
+        for (Map.Entry<Shard, List<Bucket>> entry : getShardsWithBuckets().entrySet())
+        {
+            Shard shard = entry.getKey();
+            Set<SSTableReader> expired;
+            if (expiredCheck)
+            {
+                expired = shard.getExpiredSSTables(gcBefore, controller.getIgnoreOverlapsInExpirationCheck());
+                if (logger.isTraceEnabled() && expired.size() > 0)
+                    logger.trace("Expiration check for shard {} found {} fully expired SSTables", shard.name(), expired.size());
+            }
+            else
+                expired = Collections.emptySet();
+
+            for (Bucket bucket : entry.getValue())
+            {
+                CompactionAggregate.UnifiedAggregate aggregate = bucket.getCompactionAggregate(shard, expired, controller, spaceAvailable);
+                // Note: We allow empty aggregates into the list of pending compactions. The pending compactions list
+                // is for progress tracking only, and it is helpful to see empty levels there.
+                pending.add(aggregate);
+
+                // Make sure the level count includes all levels for which we have sstables (to be ready to compact
+                // as soon as the threshold is crossed)...
+                levelCount = Math.max(levelCount, aggregate.bucketIndex() + 1);
+                if (aggregate.selected != null)
+                {
+                    // ... and also the levels that a layout-preserving selection would create.
+                    levelCount = Math.max(levelCount, levelOf(aggregate.selected) + 1);
+                }
+
+                // The space overhead limit also applies when a single compaction is above that limit. This should
+                // prevent running out of space at the expense of several highest-level tables extra, i.e. slightly
+                // higher read amplification, which I think is a sensible tradeoff; however, operators must be warned
+                // if this happens.
+                warnIfSizeAbove(aggregate, spaceOverheadLimit);
+            }
+        }
+
+        // Update the tracked background tasks.
+        backgroundCompactions.setPending(this, pending);
+
+        final List<CompactionAggregate> selection = getSelection(pending, maxCompactions, levelCount, perLevel, spaceAvailable);
+        logger.debug("Starting {} compactions.", selection.size());
+        return selection;
+    }
+
+    private void warnIfSizeAbove(CompactionAggregate.UnifiedAggregate aggregate, long spaceOverheadLimit)
+    {
+        if (aggregate.selected.totSizeInBytes > spaceOverheadLimit)
+            logger.warn("Compaction needs to perform an operation that is bigger than the current space overhead " +
+                        "limit - size {} (compacting {} sstables in shard {}/bucket {}); limit {} = {}% of dataset size {}. " +
+                        "To honor the limit, this operation will not be performed, which may result in degraded performance.\n" +
+                        "Please verify the compaction parameters, specifically {} and {}.",
+                        FBUtilities.prettyPrintMemory(aggregate.selected.totSizeInBytes),
+                        aggregate.selected.sstables.size(),
+                        aggregate.getShard().name(),
+                        aggregate.bucketIndex(),
+                        FBUtilities.prettyPrintMemory(spaceOverheadLimit),
+                        controller.getMaxSpaceOverhead() * 100,
+                        FBUtilities.prettyPrintMemory(controller.getDataSetSizeBytes()),
+                        Controller.DATASET_SIZE_OPTION_GB,
+                        Controller.MAX_SPACE_OVERHEAD_OPTION);
+    }
+
+    /**
+     * Returns a random selection of the compactions to be submitted. The selection will be chosen so that the total
+     * number of compactions is at most totalCount, where each level gets a share that is the whole part of the ratio
+     * between the total permitted number of compactions and the number of levels, and the remainder gets distributed
+     * randomly among the levels. Note that if a level does not have tasks to fill its share, its quota will remain
+     * unused in this allocation.
+     *
+     * The selection also limits the size of the newly scheduled compactions to be below spaceAvailable by not
+     * scheduling compactions if they would push the combined size above that limit.
+     *
+     * @param pending list of all current aggregates with possible selection for each bucket
+     * @param totalCount maximum number of compactions permitted to run
+     * @param levelCount number of levels in use
+     * @param perLevel int array with the number of in-progress compactions per level
+     * @param spaceAvailable amount of space in bytes available for the new compactions
+     */
+    List<CompactionAggregate> getSelection(List<CompactionAggregate.UnifiedAggregate> pending,
+                                           int totalCount,
+                                           int levelCount,
+                                           int[] perLevel,
+                                           long spaceAvailable)
     {
-        super(cfs, options);
+        int perLevelCount = totalCount / levelCount;   // each level has this number of tasks reserved for it
+        int remainder = totalCount % levelCount;       // and the remainder is distributed randomly, up to 1 per level
+
+        // List the indexes of all compaction picks, adding several entries for compactions that span multiple shards.
+        IntArrayList list = new IntArrayList(pending.size(), -1);
+        IntArrayList expired = new IntArrayList(pending.size(), -1);
+        for (int aggregateIndex = 0; aggregateIndex < pending.size(); ++aggregateIndex)
+        {
+            CompactionAggregate.UnifiedAggregate aggregate = pending.get(aggregateIndex);
+            final CompactionPick pick = aggregate.selected;
+            if (pick.isEmpty())
+                continue;
+            if (pick.hasExpiredOnly())
+            {
+                expired.add(aggregateIndex);
+                continue;
+            }
+            if (pick.totSizeInBytes > spaceAvailable)
+                continue;
+            if (perLevel[levelOf(pick)] > perLevelCount)
+                continue;  // this level is already using up all its share + one, we can ignore candidate altogether
+
+            int shardsSpanned = shardsSpanned(pick);
+            for (int i = 0; i < shardsSpanned; ++i)  // put an entry for each spanned shard
+                list.addInt(aggregateIndex);
+        }
+        if (list.isEmpty() && expired.isEmpty())
+            return ImmutableList.of();
+
+        BitSet selection = new BitSet(pending.size());
+
+        // Always include expire-only aggregates
+        for (int i = 0; i < expired.size(); i++)
+            selection.set(expired.get(i));
+
+        int selectedSize = 0;
+        if (!list.isEmpty())
+        {
+            // Randomize the list.
+            Collections.shuffle(list, controller.random());
+
+            // Calculate how many new ones we can add in each level, and how many we can assign randomly.
+            int remaining = totalCount;
+            for (int i = 0; i < levelCount; ++i)
+            {
+                remaining -= perLevel[i];
+                if (perLevel[i] > perLevelCount)
+                    remainder -= perLevel[i] - perLevelCount;
+            }
+            int toAdd = remaining;
+            // Note: if we are in the middle of changes in the parameters or level count, remainder might become negative.
+            // This is okay, some buckets will temporarily not get their rightful share until these tasks complete.
+
+            // Select the first ones, skipping over duplicates and permitting only the specified number per level.
+            for (int i = 0; remaining > 0 && i < list.size(); ++i)
+            {
+                final int aggregateIndex = list.getInt(i);
+                if (selection.get(aggregateIndex))
+                    continue; // this is a repeat
+                CompactionAggregate.UnifiedAggregate aggregate = pending.get(aggregateIndex);
+                if (aggregate.selected.totSizeInBytes > spaceAvailable)
+                    continue; // compaction is too large for current cycle
+                int level = levelOf(aggregate.selected);
+
+                if (perLevel[level] > perLevelCount)
+                    continue;   // share + one already used
+                else if (perLevel[level] == perLevelCount)
+                {
+                    if (remainder <= 0)
+                        continue;   // share used up, no remainder to distribute
+                    --remainder;
+                }
+
+                --remaining;
+                ++perLevel[level];
+                spaceAvailable -= aggregate.selected.totSizeInBytes;
+                selection.set(aggregateIndex);
+            }
+
+            selectedSize = toAdd - remaining;
+        }
+
+        // Return in the order of the pending aggregates to satisfy tests.
+        List<CompactionAggregate> aggregates = new ArrayList<>(selectedSize + expired.size());
+        for (int i = selection.nextSetBit(0); i >= 0; i = selection.nextSetBit(i+1))
+            aggregates.add(pending.get(i));
+
+        return aggregates;
+    }
+
+    private int shardsSpanned(CompactionPick pick)
+    {
+        DecoratedKey min = pick.sstables.stream().map(SSTableReader::getFirst).min(Ordering.natural()).get();
+        DecoratedKey max = pick.sstables.stream().map(SSTableReader::getLast).max(Ordering.natural()).get();
+        return arenaSelector.shardFor(max) - arenaSelector.shardFor(min) + 1;
+    }
+
+    @Override
+    public int getEstimatedRemainingTasks()
+    {
+        return backgroundCompactions.getEstimatedRemainingTasks();
+    }
+
+    @Override
+    public long getMaxSSTableBytes()
+    {
+        return Long.MAX_VALUE;
+    }
+
+    @Override
+    public Set<SSTableReader> getSSTables()
+    {
+        return dataTracker.getLiveSSTables(); // the live sstables, as reported by the data tracker
+    }
+
+    @VisibleForTesting
+    public int getW(int index)
+    {
+        return controller.getScalingParameter(index); // W is the controller's scaling parameter for this index
+    }
+
+    @VisibleForTesting
+    public Controller getController()
+    {
+        return controller; // exposed for tests (see @VisibleForTesting)
+    }
+
+    /**
+     * Groups the live candidate sstables (not suspect, not already compacting, and not an early-opened compaction
+     * result) into one or more compaction shards. Sstables that the arena selector compares as equal fall into
+     * the same shard.
+     *
+     * @return a collection of shards, where each shard contains sstables that are eligible for being compacted together
+     */
+    @VisibleForTesting
+    Collection<Shard> getCompactionShards()
+    {
+        return getCompactionShards(dataTracker.getLiveSSTables());
+    }
+
+    Collection<Shard> getCompactionShards(Collection<SSTableReader> sstables)
+    {
+        final ArenaSelector selector = this.arenaSelector;
+        Map<SSTableReader, Shard> shardsByKey = new TreeMap<>(selector);
+        // The selector is the map's comparator, so sstables it compares as equal end up in the same shard.
+        sstables.stream()
+                .filter(this::isSuitableForCompaction)
+                .forEach(sstable -> shardsByKey.computeIfAbsent(sstable, key -> new Shard(selector, cfs)).add(sstable));
+
+        return shardsByKey.values();
+    }
+
+    private boolean isSuitableForCompaction(SSTableReader r)
+    {
+        if (r.isMarkedSuspect() || r.openReason == SSTableReader.OpenReason.EARLY)
+            return false; // suspect, or opened early
+        return !dataTracker.getCompacting().contains(r); // must not already be compacting
+    }
+
+    /** Assigns the sstables of every compaction shard to buckets according to their shard-adjusted size.
+     * @return a LinkedHashMap from each shard to its buckets, in the iteration order of getCompactionShards()
+     */
+    @VisibleForTesting
+    Map<Shard, List<Bucket>> getShardsWithBuckets()
+    {
+        maybeUpdateSelector();
+        Collection<Shard> shards = getCompactionShards();
+        Map<Shard, List<Bucket>> ret = new LinkedHashMap<>(); // should preserve the order of shards
+
+        for (Shard shard : shards)
+        {
+            List<Bucket> buckets = new ArrayList<>(MAX_LEVELS);
+            shard.sstables.sort(arenaSelector::compareByShardAdjustedSize); // sorts the shard's own list in place
+
+            int index = 0;
+            Bucket bucket = new Bucket(controller, index, 0);
+            for (SSTableReader candidate : shard.sstables)
+            {
+                final long size = arenaSelector.shardAdjustedSize(candidate);
+                if (size < bucket.max) // the candidate fits in the current bucket
+                {
+                    bucket.add(candidate);
+                    continue;
+                }
+                // The candidate is too large for the current bucket: close it and open buckets until one fits.
+                bucket.sort();
+                buckets.add(bucket); // add even if empty
+
+                while (true)
+                {
+                    bucket = new Bucket(controller, ++index, bucket.max); // next bucket starts at the previous max
+                    if (size < bucket.max)
+                    {
+                        bucket.add(candidate);
+                        break;
+                    }
+                    else
+                    {
+                        buckets.add(bucket); // add the empty bucket
+                    }
+                }
+            }
+
+            if (!bucket.sstables.isEmpty()) // the bucket that was still open when the sstables ran out
+            {
+                bucket.sort();
+                buckets.add(bucket);
+            }
+
+            if (!buckets.isEmpty()) // shards that produced no bucket are left out of the result
+                ret.put(shard, buckets);
+
+            if (logger.isTraceEnabled())
+                logger.trace("Shard {} has {} buckets", shard, buckets.size());
+        }
+
+        logger.debug("Found {} shards with buckets for {}.{}", ret.size(), cfs.getKeyspaceName(), cfs.getTableName());
+        return ret;
+    }
+
+    private static int levelOf(CompactionPick pick)
+    {
+        return (int) pick.parent; // the pick's parent, narrowed to int, is used as its level
+    }
+
+    public TableMetadata getMetadata()
+    {
+        return cfs.metadata(); // table metadata, as provided by cfs
+    }
+
+    /**
+     * A compaction shard contains the list of sstables that belong to this shard as well as the arena
+     * selector used for comparison. The first sstable in the list represents the shard in name() and compareTo().
+     */
+    final static class Shard implements Comparable<Shard>
+    {
+        final List<SSTableReader> sstables; // members of the shard; the strategy sorts this list in place
+        final ArenaSelector selector;
+        private final ColumnFamilyStore cfs;
+
+        Shard(ArenaSelector selector, ColumnFamilyStore cfs)
+        {
+            this.cfs = cfs;
+            this.sstables = new ArrayList<>();
+            this.selector = selector;
+        }
+
+        void add(SSTableReader ssTableReader)
+        {
+            sstables.add(ssTableReader);
+        }
+
+        public String name()
+        {
+            SSTableReader t = sstables.get(0); // throws IndexOutOfBoundsException if the shard is empty
+            return selector.name(t);
+        }
+
+        @Override
+        public int compareTo(Shard o)
+        {
+            return selector.compare(this.sstables.get(0), o.sstables.get(0)); // both shards must be non-empty
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s, %d sstables", name(), sstables.size());
+        }
+
+        /**
+         * Find fully expired SSTables. Those will be included in the aggregate no matter what.
+         * @param gcBefore forwarded unchanged to {@code CompactionController.getFullyExpiredSSTables}
+         * @param ignoreOverlaps forwarded unchanged to {@code CompactionController.getFullyExpiredSSTables}
+         * @return expired SSTables
+         */
+        Set<SSTableReader> getExpiredSSTables(int gcBefore, boolean ignoreOverlaps)
+        {
+            return CompactionController.getFullyExpiredSSTables(cfs,
+                                                                sstables,
+                                                                cfs.getOverlappingLiveSSTables(sstables),
+                                                                gcBefore,
+                                                                ignoreOverlaps);
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Unified strategy %s", getMetadata()); // identified by the table metadata
+    }
+
+    /**
+     * A bucket: index, sstables and some properties.
+     */
+    static class Bucket
+    {
+        final List<SSTableReader> sstables;
+        final int index;
+        final double survivalFactor;
+        final int scalingParameter; // scaling parameter used to calculate fanout and threshold
+        final int fanout; // fanout factor between buckets
+        final int threshold; // number of SSTables that trigger a compaction
+        final long min; // min size of sstables for this bucket
+        final long max; // max size of sstables for this bucket
+        double avg = 0; // avg size of sstables in this bucket
+
+        Bucket(Controller controller, int index, long minSize)
+        {
+            this.index = index;
+            this.survivalFactor = controller.getSurvivalFactor();
+            this.scalingParameter = controller.getScalingParameter(index);
+            this.fanout = controller.getFanout(index);
+            this.threshold = controller.getThreshold(index);
+            this.sstables = new ArrayList<>(threshold);
+            this.min = minSize;
+
+            double baseSize = minSize;
+            if (minSize == 0)
+                baseSize = controller.getBaseSstableSize(fanout);
+
+            this.max = (long) Math.floor(baseSize * fanout * controller.getSurvivalFactor());
+        }
+
+        void add(SSTableReader sstable)
+        {
+            this.sstables.add(sstable);
+            this.avg += (sstable.onDiskLength() - avg) / sstables.size(); // incremental mean of the on-disk lengths
+        }
+
+        void sort()
+        {
+            // Always sort by max timestamp, older sstables first. If only a subset of the tables is compacted, let
+            // it be from a contiguous time span to aid whole-sstable expiration. comparingLong avoids boxing.
+            sstables.sort(Comparator.comparingLong(SSTableReader::getMaxTimestamp));
+
+            if (logger.isTraceEnabled())
+                logger.trace("Bucket: {}", this);
+        }
+
+        /**
+         * Builds the compaction aggregate of this bucket: the pick to run now, any further pending picks, and the
+         * expired sstables of the bucket, which are attached to the selected pick.
+         * @param shard the shard this bucket belongs to, passed on to the aggregate
+         * @param allExpiredSSTables expired sstables of all shards/buckets; the ones in this bucket are split off from the live sstables
+         * @param controller supplies the next level's fanout, the cap on sstables per compaction and the random generator
+         * @param spaceAvailable space budget; together with the average sstable size it caps the size of layout picks
+         * @return the aggregate; its selected pick is {@code CompactionPick.EMPTY} when there is nothing to compact or drop
+         */
+        CompactionAggregate.UnifiedAggregate getCompactionAggregate(Shard shard,
+                                                                    Set<SSTableReader> allExpiredSSTables,
+                                                                    Controller controller,
+                                                                    long spaceAvailable)
+        {
+            List<SSTableReader> expiredSet = Collections.emptyList();
+            List<SSTableReader> liveSet = sstables;
+            if (!allExpiredSSTables.isEmpty())
+            {
+                liveSet = new ArrayList<>();
+                expiredSet = new ArrayList<>();
+                bipartitionSSTables(sstables, allExpiredSSTables, liveSet, expiredSet);
+            }
+
+            List<CompactionPick> pending = ImmutableList.of();
+            CompactionPick selected;
+            int count = liveSet.size();
+            int maxSSTablesToCompact = Math.max(fanout, controller.maxSSTablesToCompact());
+
+            if (count < threshold)
+            {
+                selected = CompactionPick.EMPTY; // we do not have enough sstables for a compaction
+            }
+            else if (count <= fanout)
+            {
+                // Happy path. We are not late or (for levelled) we are only so late that a compaction now will have
+                // the same effect as doing levelled compactions one by one. Compact all. We do not cap this pick at
+                // maxSSTablesToCompact due to an assumption that maxSSTablesToCompact is much greater than F. See
+                // Controller#MAX_SSTABLES_TO_COMPACT_OPTION for more details.
+                selected = CompactionPick.create(index, liveSet);
+            }
+            else if (count <= fanout * controller.getFanout(index + 1))
+            {
+                // Compaction is a bit late, but not enough to jump levels via layout compactions. We need a special
+                // case to cap compaction pick at maxSSTablesToCompact.
+                selected = CompactionPick.create(index, liveSet.subList(0, Math.min(maxSSTablesToCompact, count)));
+                if (count - maxSSTablesToCompact >= threshold)
+                {
+                    pending = new ArrayList<>();
+                    int start = maxSSTablesToCompact;
+                    int end = Math.min(2 * maxSSTablesToCompact, count);
+                    // A remainder of exactly `threshold` sstables is compactable too; start < end ensures termination.
+                    while (start < end && end - start >= threshold)
+                    {
+                        pending.add(CompactionPick.create(index, liveSet.subList(start, end)));
+                        start = end;
+                        end = Math.min(end + maxSSTablesToCompact, count);
+                    }
+                }
+            }
+            else
+            {
+                // Very late compaction, or a set of small tables was dumped on us (e.g. when converting from legacy LCS
+                // or for tests). Pick compactions so that doing them all spreads the data as if compaction had kept up:
+                // - tiered (W >= 0): sets of as many as required to get to a level, e.g. for W=2 and 55 sstables, 3
+                //   compactions of 16 sstables, 1 of 4, and the other 3 left alone;
+                // - levelled (W < 0): all that would reach a level, e.g. for W=-2 and 55: one of 48, one of 4, one of 3.
+                pending = layoutCompactions(controller, liveSet, (int) Math.min(spaceAvailable / avg, maxSSTablesToCompact));
+                // Choose the compaction to run randomly. This gives a better distribution among levels and should
+                // result in more compactions running in parallel in a big data dump.
+                assert !pending.isEmpty();  // we only enter this if count > F: layoutCompactions must have selected something to run
+                int chosen = controller.random().nextInt(pending.size()); // named so as not to shadow the index field
+                selected = pending.remove(chosen);
+            }
+
+            boolean hasExpiredSSTables = !expiredSet.isEmpty();
+            if (hasExpiredSSTables && selected.equals(CompactionPick.EMPTY))
+                // overrides default CompactionPick.EMPTY with parent equal to -1
+                selected = CompactionPick.create(index, expiredSet, expiredSet);
+            else if (hasExpiredSSTables)
+                selected = selected.withExpiredSSTables(expiredSet);
+
+            return CompactionAggregate.createUnified(sstables, selected, pending, shard, this);
+        }
+
+        /**
+         * Bipartitions SSTables into liveSet and expiredSet, depending on whether they are present in allExpiredSSTables.
+         *
+         * @param sstables list of SSTables in a bucket
+         * @param allExpiredSSTables set of expired SSTables for all shards/buckets
+         * @param liveSet empty list that is going to be filled up with SSTables that are not present in {@param allExpiredSSTables}
+         * @param expiredSet empty list that is going to be filled up with SSTables that are present in {@param allExpiredSSTables}
+         */
+        private static void bipartitionSSTables(List<SSTableReader> sstables,
+                                                Set<SSTableReader> allExpiredSSTables,
+                                                List<SSTableReader> liveSet,
+                                                List<SSTableReader> expiredSet)
+        {
+            for (SSTableReader sstable : sstables)
+            {
+                if (allExpiredSSTables.contains(sstable))
+                    expiredSet.add(sstable);
+                else
+                    liveSet.add(sstable);
+            }
+        }
+
+        private List<CompactionPick> layoutCompactions(Controller controller, List<SSTableReader> liveSet, int maxSSTablesToCompact)
+        {
+            List<CompactionPick> pending = new ArrayList<>();
+            int pos = layoutCompactions(controller, liveSet, index + 1, fanout, maxSSTablesToCompact, pending);
+            int size = liveSet.size();
+            if (size - pos >= threshold) // can only happen in the levelled case.
+            {
+                assert size - pos < maxSSTablesToCompact; // otherwise it should have already been picked
+                pending.add(CompactionPick.create(index, liveSet.subList(pos, size)));
+            }
+            return pending;
+        }
+
+        /**
+         * Collects in {@code list} compactions of {@code sstables} such that they land in {@code level} and higher.
+         *
+         * Recursively combines SSTables into {@link CompactionPick}s in a way that up to {@code maxSSTablesToCompact}
+         * SSTables are combined to reach the highest possible level, then the rest is combined for the level before,
+         * etc up to {@code level}.
+         *
+         * To agree with what compaction normally does, the first sstables from the list are placed in the picks that
+         * combine to reach the highest levels.
+         *
+         * @param controller supplies the scaling parameter and the fanout of each level
+         * @param sstables SSTables to compact, sorted by age from old to new
+         * @param level minimum target level for compactions to land
+         * @param step - number of source SSTables required to reach level
+         * @param maxSSTablesToCompact limit on the number of sstables per compaction
+         * @param list - result list of layout-preserving compaction picks
+         * @return index just past the last used SSTable from {@code sstables}; the number of remaining sstables will be
+         *         lower than step
+         */
+        private int layoutCompactions(Controller controller,
+                                      List<SSTableReader> sstables,
+                                      int level,
+                                      int step,
+                                      int maxSSTablesToCompact,
+                                      List<CompactionPick> list)
+        {
+            if (step > sstables.size() || step > maxSSTablesToCompact)
+                return 0;
+
+            int W = controller.getScalingParameter(level);
+            int F = controller.getFanout(level);
+            int pos = layoutCompactions(controller,
+                                        sstables,
+                                        level + 1,
+                                        step * F,
+                                        maxSSTablesToCompact,
+                                        list);
+
+            int total = sstables.size();
+            // step defines the number of source sstables that are needed to reach this level (ignoring overwrites
+            // and deletions).
+            // For tiered compaction we will select batches of this many.
+            int pickSize = step;
+            if (W < 0)
+            {
+                // For levelled compaction all the sstables that would reach this level need to be compacted to one,
+                // so select the highest multiple of step that is available, but make sure we don't do a compaction
+                // bigger than the limit.
+                pickSize *= Math.min(total - pos, maxSSTablesToCompact) / pickSize;
+
+                if (pickSize == 0)  // Not enough sstables to reach this level, we can skip the processing below.
+                    return pos;     // Note: this cannot happen on the top level, but can on lower ones.
+            }
+
+            while (pos + pickSize <= total)
+            {
+                // Note that we assign these compactions to the level that would normally produce them, which means that
+                // they won't be taking up threads dedicated to the busy level.
+                // Normally sstables end up on a level when a compaction on the previous brings their size to the
+                // threshold (which corresponds to pickSize == step, always the case for tiered); in the case of
+                // levelled compaction, when we compact more than 1 but less than F sstables on a level (which
+                // corresponds to pickSize > step), it is an operation that is triggered on the same level.
+                list.add(CompactionPick.create(pickSize > step ? level : level - 1,
+                                               sstables.subList(pos, pos + pickSize)));
+                pos += pickSize;
+            }
+
+            // In the levelled case, if we had to adjust pickSize due to maxSSTablesToCompact, there may
+            // still be enough sstables to reach this level (e.g. if max was enough for 2*step, but we had 3*step).
+            if (pos + step <= total)
+            {
+                pickSize = ((total - pos) / step) * step;
+                list.add(CompactionPick.create(pickSize > step ? level : level - 1,
+                                               sstables.subList(pos, pos + pickSize)));
+                pos += pickSize;
+            }
+            return pos;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("W: %d, T: %d, F: %d, index: %d, min: %s, max %s, %d sstables", // W = scaling parameter, T = threshold, F = fanout
+                                 scalingParameter, threshold, fanout, index, FBUtilities.prettyPrintMemory(min), FBUtilities.prettyPrintMemory(max), sstables.size());
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
index 3972579efc88..e35a8d3b886c 100644
--- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java
+++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java
@@ -20,7 +20,6 @@
 import java.io.File;
 import java.util.*;
 import java.util.function.LongPredicate;
-import java.util.function.Predicate;
 
 import com.google.common.base.Throwables;
 import com.google.common.collect.Sets;
@@ -46,7 +45,7 @@ public class Upgrader
     private final File directory;
 
     private final CompactionController controller;
-    private final CompactionStrategyManager strategyManager;
+    private final CompactionStrategyContainer strategyContainer;
     private final long estimatedRows;
 
     private final OutputHandler outputHandler;
@@ -62,9 +61,9 @@ public Upgrader(ColumnFamilyStore cfs, LifecycleTransaction txn, OutputHandler o
 
         this.controller = new UpgradeController(cfs);
 
-        this.strategyManager = cfs.getCompactionStrategyManager();
+        this.strategyContainer = cfs.getCompactionStrategyContainer();
         long estimatedTotalKeys = Math.max(cfs.metadata().params.minIndexInterval, SSTableReader.getApproximateKeyCount(Arrays.asList(this.sstable)));
-        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(Arrays.asList(this.sstable)) / strategyManager.getMaxSSTableBytes());
+        long estimatedSSTables = Math.max(1, SSTableReader.getTotalBytes(Arrays.asList(this.sstable)) / strategyContainer.getMaxSSTableBytes());
         this.estimatedRows = (long) Math.ceil((double) estimatedTotalKeys / estimatedSSTables);
     }
 
@@ -89,7 +88,7 @@ public void upgrade(boolean keepOriginals)
         outputHandler.output("Upgrading " + sstable);
         int nowInSec = FBUtilities.nowInSeconds();
         try (SSTableRewriter writer = SSTableRewriter.construct(cfs, transaction, keepOriginals, CompactionTask.getMaxDataAge(transaction.originals()));
-             AbstractCompactionStrategy.ScannerList scanners = strategyManager.getScanners(transaction.originals());
+             ScannerList scanners = strategyContainer.getScanners(transaction.originals());
              CompactionIterator iter = new CompactionIterator(transaction.opType(), scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
             writer.switchWriter(createCompactionWriter(sstable.getSSTableMetadata()));
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/AdaptiveController.java b/src/java/org/apache/cassandra/db/compaction/unified/AdaptiveController.java
new file mode 100644
index 000000000000..43cedcb9ba33
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/AdaptiveController.java
@@ -0,0 +1,344 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.Arrays;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MonotonicClock;
+
+/**
+ * The adaptive compaction controller dynamically calculates the optimal scaling parameter W.
+ * <p/>
+ * Generally it tries to find a local minimum for the total IO cost that is projected
+ * by the strategy. The projected IO cost is composed of two parts: the read amplification,
+ * which is weighted by the number of partitions read by the user, and the write amplification, which
+ * is weighted by the number of bytes inserted into memtables. Other parameters are also considered, such
+ * as the cache miss rate and the time it takes to read and write from disk. See also the comments in
+ * {@link CostsCalculator}.
+ *
+ * Design doc: TODO: link to design doc or SEP
+ */
+public class AdaptiveController extends Controller
+{
+    private static final Logger logger = LoggerFactory.getLogger(AdaptiveController.class);
+
+    /** The starting value for the scaling parameter */
+    static final String STARTING_SCALING_PARAMETER = "adaptive_starting_scaling_parameter";
+    private static final int DEFAULT_STARTING_SCALING_PARAMETER = Integer.getInteger(PREFIX + STARTING_SCALING_PARAMETER, 0);
+
+    /** The minimum valid value for the scaling parameter */
+    static final String MIN_SCALING_PARAMETER = "adaptive_min_scaling_parameter";
+    static private final int DEFAULT_MIN_SCALING_PARAMETER = Integer.getInteger(PREFIX + MIN_SCALING_PARAMETER, -10);
+
+    /** The maximum valid value for the scaling parameter; its default comes from the system property PREFIX + this name, else 36 */
+    static final String MAX_SCALING_PARAMETER = "adaptive_max_scaling_parameter";
+    static private final int DEFAULT_MAX_SCALING_PARAMETER = Integer.getInteger(PREFIX + MAX_SCALING_PARAMETER, 36);
+
+    /** The interval for periodically checking the optimal value for W */
+    static final String INTERVAL_SEC = "adaptive_interval_sec";
+    static private final int DEFAULT_INTERVAL_SEC = Integer.getInteger(PREFIX + INTERVAL_SEC, 300);
+
+    /** The gain is a number between 0 and 1 used to determine if a new choice of W is better than the current one */
+    static final String THRESHOLD = "adaptive_threshold";
+    private static final double DEFAULT_THRESHOLD = Double.parseDouble(System.getProperty(PREFIX + THRESHOLD, "0.15"));
+
+    /** Below the minimum cost we don't try to optimize W, we consider the current W good enough. This is necessary because the cost
+     * can vanish to zero when there are neither reads nor writes and right now we don't know how to handle this case.  */
+    static final String MIN_COST = "adaptive_min_cost";
+    static private final int DEFAULT_MIN_COST = Integer.getInteger(PREFIX + MIN_COST, 1000);
+
+    private final int intervalSec;
+    private final int minW;
+    private final int maxW;
+    private final double threshold;
+    private final int minCost;
+
+    private volatile int W;
+    private volatile long lastChecked;
+
+    @VisibleForTesting
+    public AdaptiveController(MonotonicClock clock,
+                              Environment env,
+                              int W,
+                              double survivalFactor,
+                              long dataSetSizeMB,
+                              int numShards,
+                              long minSstableSizeMB,
+                              long flushSizeOverrideMB,
+                              double maxSpaceOverhead,
+                              int maxSSTablesToCompact,
+                              long expiredSSTableCheckFrequency,
+                              boolean ignoreOverlapsInExpirationCheck,
+                              int intervalSec,
+                              int minW,
+                              int maxW,
+                              double threshold,
+                              int minCost)
+    {
+        super(clock, env, survivalFactor, dataSetSizeMB, numShards, minSstableSizeMB, flushSizeOverrideMB, maxSpaceOverhead, maxSSTablesToCompact, expiredSSTableCheckFrequency, ignoreOverlapsInExpirationCheck);
+        // Adaptive state: W starts at the given value (the field is volatile); the other fields are final settings.
+        this.W = W;
+        this.intervalSec = intervalSec;
+        this.minW = minW;
+        this.maxW = maxW;
+        this.threshold = threshold;
+        this.minCost = minCost;
+    }
+
+    static Controller fromOptions(Environment env,
+                                  double survivalFactor,
+                                  long dataSetSizeMB,
+                                  int numShards,
+                                  long minSstableSizeMB,
+                                  long flushSizeOverrideMB,
+                                  double maxSpaceOverhead,
+                                  int maxSSTablesToCompact,
+                                  long expiredSSTableCheckFrequency,
+                                  boolean ignoreOverlapsInExpirationCheck,
+                                  Map<String, String> options)
+    {
+        int W = options.containsKey(STARTING_SCALING_PARAMETER) ? Integer.parseInt(options.get(STARTING_SCALING_PARAMETER)) : DEFAULT_STARTING_SCALING_PARAMETER;
+        int minW = options.containsKey(MIN_SCALING_PARAMETER) ? Integer.parseInt(options.get(MIN_SCALING_PARAMETER)) : DEFAULT_MIN_SCALING_PARAMETER;
+        int maxW = options.containsKey(MAX_SCALING_PARAMETER) ? Integer.parseInt(options.get(MAX_SCALING_PARAMETER)) : DEFAULT_MAX_SCALING_PARAMETER;
+        int intervalSec = options.containsKey(INTERVAL_SEC) ? Integer.parseInt(options.get(INTERVAL_SEC)) : DEFAULT_INTERVAL_SEC;
+        double threshold = options.containsKey(THRESHOLD) ? Double.parseDouble(options.get(THRESHOLD)) : DEFAULT_THRESHOLD;
+        int minCost = options.containsKey(MIN_COST) ? Integer.parseInt(options.get(MIN_COST)) : DEFAULT_MIN_COST;
+        // Each option falls back to its default when absent; no range checks here, validateOptions does those.
+        return new AdaptiveController(MonotonicClock.preciseTime, env, W, survivalFactor, dataSetSizeMB, numShards, minSstableSizeMB, flushSizeOverrideMB, maxSpaceOverhead, maxSSTablesToCompact, expiredSSTableCheckFrequency, ignoreOverlapsInExpirationCheck, intervalSec, minW, maxW, threshold, minCost);
+    }
+
+    /**
+     * Validates the adaptive controller options and removes them from {@code options}. A value that is not a
+     * valid number is reported as a {@link ConfigurationException}, the same way as a value outside its range.
+     *
+     * @param options compaction options; the adaptive keys are removed from this map
+     * @return {@code options}, without the adaptive keys
+     * @throws ConfigurationException if a value is not a valid number or is outside its valid range
+     */
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        int W = removeAndParse(options, STARTING_SCALING_PARAMETER, Integer::parseInt, DEFAULT_STARTING_SCALING_PARAMETER);
+        int minW = removeAndParse(options, MIN_SCALING_PARAMETER, Integer::parseInt, DEFAULT_MIN_SCALING_PARAMETER);
+        int maxW = removeAndParse(options, MAX_SCALING_PARAMETER, Integer::parseInt, DEFAULT_MAX_SCALING_PARAMETER);
+        if (minW >= maxW || W < minW || W > maxW)
+            throw new ConfigurationException(String.format("Invalid configuration for W: %d, min: %d, max: %d", W, minW, maxW));
+
+        // Only explicitly given values are range-checked, hence the arbitrary valid fallbacks below.
+        int intervalSec = removeAndParse(options, INTERVAL_SEC, Integer::parseInt, 1);
+        if (intervalSec <= 0)
+            throw new ConfigurationException(String.format("Invalid configuration for interval, it should be positive: %d", intervalSec));
+
+        double threshold = removeAndParse(options, THRESHOLD, Double::parseDouble, 1.0);
+        if (threshold <= 0 || threshold > 1)
+            throw new ConfigurationException(String.format("Invalid configuration for threshold, it should be within (0,1]: %f", threshold));
+
+        int minCost = removeAndParse(options, MIN_COST, Integer::parseInt, 1);
+        if (minCost <= 0)
+            throw new ConfigurationException(String.format("Invalid configuration for minCost, it should be positive: %d", minCost));
+
+        return options;
+    }
+
+    /** Removes {@code name} from {@code options} and parses its value; returns {@code fallback} if it is absent. */
+    private static <T> T removeAndParse(Map<String, String> options, String name, java.util.function.Function<String, T> parser, T fallback)
+    {
+        String value = options.remove(name);
+        try
+        {
+            return value == null ? fallback : parser.apply(value);
+        }
+        catch (NumberFormatException e)
+        {
+            throw new ConfigurationException(String.format("Invalid value for %s, not a valid number: %s", name, value), e);
+        }
+    }
+
+    @Override
+    void startup(UnifiedCompactionStrategy strategy, CostsCalculator calculator)
+    {
+        super.startup(strategy, calculator);
+        this.lastChecked = clock.now(); // the first check interval is measured from startup
+    }
+
+    @Override
+    public int getScalingParameter(int index)
+    {
+        return W; // the same W is returned for every index
+    }
+
+    @Override
+    public double getSurvivalFactor()
+    {
+        return survivalFactor; // not declared in this class; presumably the value given to the super constructor — confirm
+    }
+
+    @Override
+    @Nullable
+    public CostsCalculator getCalculator()
+    {
+        return calculator; // may be null (see @Nullable); presumably set by Controller.startup — confirm
+    }
+
+    public int getInterval()
+    {
+        return intervalSec; // in seconds
+    }
+
+    public int getMinW()
+    {
+        return minW; // lower bound for W
+    }
+
+    public int getMaxW()
+    {
+        return maxW;
+    }
+
+    public double getThreshold()
+    {
+        return threshold;
+    }
+
+    public int getMinCost()
+    {
+        return minCost;
+    }
+
+    @Override
+    public void onStrategyBackgroundTaskRequest()
+    {
+        if (!isRunning())
+            return;
+        // Throttle: the scaling parameter is re-evaluated at most once every intervalSec seconds.
+        long now = clock.now();
+        if (now - lastChecked < TimeUnit.SECONDS.toNanos(intervalSec))
+            return;
+        // lastChecked is advanced in the finally block, so a throwing update is not retried on every request.
+        try
+        {
+            maybeUpdate(now);
+        }
+        finally
+        {
+            lastChecked = now;
+        }
+    }
+
+    /**
+     * Maybe updates the scaling parameter according to the data size, read, and write costs.
+     *
+     * The scaling parameter calculation is based on current read and write query costs for the entire data size.
+     * We use the entire data size instead of shard size here because query cost calculations do not take
+     * sharding into account. Also, the same scaling parameter is going to be used across all shards.
+     *
+     * @param now current timestamp only used for debug logging
+     */
+    private void maybeUpdate(long now)
+    {
+        final long targetSize = Math.max(getDataSetSizeBytes(), (long) Math.ceil(calculator.spaceUsed()));
+
+        final int RA = readAmplification(targetSize, W);
+        final int WA = writeAmplification(targetSize, W);
+
+        final double readCost = calculator.getReadCostForQueries(RA);
+        final double writeCost = calculator.getWriteCostForQueries(WA);
+        final double cost = readCost + writeCost;
+
+        if (cost <= minCost)
+        {
+            logger.debug("Adaptive compaction controller not updated, cost for current W {} is below minimum cost {}: read cost: {}, write cost: {}\nAverages: {}", W, minCost, readCost, writeCost, calculator);
+            return;
+        }
+
+        final double[] totCosts = new double[maxW - minW + 1];
+        final double[] readCosts = new double[maxW - minW + 1];
+        final double[] writeCosts = new double[maxW - minW + 1];
+        int candW = W;
+        double candCost = cost;
+
+        for (int i = minW; i <= maxW; i++)
+        {
+            final int idx = i - minW;
+            if (i == W)
+            {
+                readCosts[idx] = readCost;
+                writeCosts[idx] = writeCost;
+            }
+            else
+            {
+                final int ra = readAmplification(targetSize, i);
+                final int wa = writeAmplification(targetSize, i);
+
+                readCosts[idx] = calculator.getReadCostForQueries(ra);
+                writeCosts[idx] = calculator.getWriteCostForQueries(wa);
+            }
+            totCosts[idx] = readCosts[idx] + writeCosts[idx];
+            // in case of a tie, for negative Ws we prefer higher Ws (smaller WA), while for non-negative Ws we prefer lower Ws (more parallelism)
+            if (totCosts[idx] < candCost || (i < 0 && totCosts[idx] == candCost))
+            {
+                candW = i;
+                candCost = totCosts[idx];
+            }
+        }
+
+        logger.debug("Min cost: {}, min W: {}, min sstable size: {}\nread costs: {}\nwrite costs: {}\ntot costs: {}\nAverages: {}",
+                     candCost,
+                     candW,
+                     FBUtilities.prettyPrintMemory(getMinSstableSizeBytes()),
+                     Arrays.toString(readCosts),
+                     Arrays.toString(writeCosts),
+                     Arrays.toString(totCosts),
+                     calculator);
+
+        StringBuilder str = new StringBuilder(100);
+        str.append("Adaptive compaction controller ");
+
+        if (W != candW && (cost - candCost) >= threshold * cost)
+        {
+            str.append("updated ").append(W).append(" -> ").append(candW);
+            this.W = candW;
+        }
+        else
+        {
+            str.append("unchanged");
+        }
+
+        str.append(", data size: ").append(FBUtilities.prettyPrintMemory(targetSize));
+        str.append(", query cost: ").append(cost);
+        str.append(", new query cost: ").append(candCost);
+        str.append(", took ").append(TimeUnit.NANOSECONDS.toMicros(clock.now() - now)).append(" us");
+
+        logger.debug(str.toString());
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("m: %d, o: %f, W: %d - %s", minSstableSizeMB, survivalFactor, W, calculator);
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/Controller.java b/src/java/org/apache/cassandra/db/compaction/unified/Controller.java
new file mode 100644
index 000000000000..8cb060e182a6
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/Controller.java
@@ -0,0 +1,832 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.cassandra.config.Config;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.codahale.metrics.Gauge;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.compaction.CompactionStrategy;
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.metrics.DefaultNameFactory;
+import org.apache.cassandra.metrics.MetricNameFactory;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.MonotonicClock;
+
+import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics;
+
+/**
+* The controller provides compaction parameters to the unified compaction strategy
+*/
+public abstract class Controller
+{
+    protected static final Logger logger = LoggerFactory.getLogger(Controller.class);
+    private static final ConcurrentMap<TableMetadata, Controller.Metrics> allMetrics = new ConcurrentHashMap<>();
+
+    static final String PREFIX = "unified_compaction.";
+
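+    /** The data size in GB, it will be assumed that the node will have on disk roughly this size of data when it reaches equilibrium.
+     * Defaults to the system property of the same name or, if unset, to DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(). */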
+    public static final String DATASET_SIZE_OPTION_GB = "dataset_size_in_gb";
+    static final long DEFAULT_DATASET_SIZE_GB = Long.getLong(PREFIX + DATASET_SIZE_OPTION_GB,
+                                                             DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB());
+
+    /** The number of shards. The shard size will be calculated by dividing the data size by this number.
+     * By default, 10 would be used for single disk. If the data size is 1 TB, then the shard size becomes 100 GB.
+     * If JBOD / multi-drive, it would be 10 * disks. For example, if there are 5 disks, there would be 50 shards.
+     * With data size 10 TB, the shard size would be 200 GB.
+     * */
+    static final String NUM_SHARDS_OPTION = "num_shards";
+    static final int DEFAULT_NUM_SHARDS = Integer.getInteger(PREFIX + NUM_SHARDS_OPTION,
+                                                             10 * DatabaseDescriptor.getAllDataFileLocations().length);
+
+    /**
+     * The minimum sstable size. Sharded writers split sstables over shard only if they are at least as large
+     * as the minimum size.
+     *
+     * When the minimum sstable size is zero in the compaction options, then it is calculated by the controller by
+     * looking at the initial flush size.
+     */
+    static final String MIN_SSTABLE_SIZE_OPTION_MB = "min_sstable_size_in_mb";
+    static final int DEFAULT_MIN_SSTABLE_SIZE_MB = Integer.getInteger(PREFIX + MIN_SSTABLE_SIZE_OPTION_MB, 100);
+
+    /**
+     * Override for the flush size in MB. The database should be able to calculate this from executing flushes, this
+     * should only be necessary in rare cases.
+     */
+    static final String FLUSH_SIZE_OVERRIDE_OPTION_MB = "flush_size_override_mb";
+
+    /**
+     * The maximum tolerable compaction-induced space amplification, as fraction of the dataset size. The idea behind
+     * this property is to be able to tune how much to limit concurrent "oversized" compactions in different shards.
+     * On one hand allowing such compactions concurrently running in all shards allows for STCS-like space
+     * amplification, where at some point you might need free space double the size of your working set to do a (top
+     * tier) compaction, while on the other hand limiting such compactions too much might lead to compaction lagging
+     * behind, higher read amplification, and other problems of that nature.
+     */
+    static public final String MAX_SPACE_OVERHEAD_OPTION = "max_space_overhead";
+    static final double DEFAULT_MAX_SPACE_OVERHEAD = Double.parseDouble(System.getProperty(PREFIX + MAX_SPACE_OVERHEAD_OPTION, "0.2"));
+    static final double MAX_SPACE_OVERHEAD_LOWER_BOUND = 0.01;
+    static final double MAX_SPACE_OVERHEAD_UPPER_BOUND = 1.0;
+
+    /**
+     * This parameter is intended to modify the shape of the LSM by taking into account the survival ratio of data, for now it is fixed to one.
+     */
+    static final double DEFAULT_SURVIVAL_FACTOR = Double.parseDouble(System.getProperty(PREFIX + "survival_factor", "1"));
+
+    /**
+     * Either true or false. This parameter determines which controller will be used.
+     */
+    static final String ADAPTIVE_OPTION = "adaptive";
+    static final boolean DEFAULT_ADAPTIVE = Boolean.parseBoolean(System.getProperty(PREFIX + ADAPTIVE_OPTION, "false"));
+
+    /**
+     * The maximum number of sstables to compact in one operation.
+     *
+     * This is expected to be large and never be reached, but compaction going very very late may cause the accumulation
+     * of thousands and even tens of thousands of sstables which may cause problems if compacted in one long operation.
+     * The default is chosen to be half of the maximum permitted space overhead when the source sstables are of the
+     * minimum sstable size.
+     *
+     * If the fanout factor is larger than the maximum number of sstables, the strategy will ignore the latter.
+     */
+    static final String MAX_SSTABLES_TO_COMPACT_OPTION = "max_sstables_to_compact";
+
+    static final String ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION = "unsafe_aggressive_sstable_expiration";
+    static final String ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_PROPERTY = Config.PROPERTY_PREFIX + "allow_unsafe_aggressive_sstable_expiration";
+    static final boolean ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION = Boolean.parseBoolean(System.getProperty(ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_PROPERTY));
+    static final boolean DEFAULT_ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION = false;
+
+    static final int DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS = 60 * 10;
+    static final String EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION = "expired_sstable_check_frequency_seconds";
+
+    protected final MonotonicClock clock;
+    protected final Environment env;
+    protected final double survivalFactor;
+    protected final long dataSetSizeMB;
+    protected final int numShards;
+    protected final long shardSizeMB;
+    protected volatile long minSstableSizeMB;
+    protected final double maxSpaceOverhead;
+    protected final long flushSizeOverrideMB;
+    protected volatile long currentFlushSize;
+    protected final int maxSSTablesToCompact;
+    protected final long expiredSSTableCheckFrequency;
+    protected final boolean ignoreOverlapsInExpirationCheck;
+    @Nullable protected volatile CostsCalculator calculator;
+    @Nullable private volatile Metrics metrics;
+
+    Controller(MonotonicClock clock,
+               Environment env,
+               double survivalFactor,
+               long dataSetSizeMB,
+               int numShards,
+               long minSstableSizeMB,
+               long flushSizeOverrideMB,
+               double maxSpaceOverhead,
+               int maxSSTablesToCompact,
+               long expiredSSTableCheckFrequency,
+               boolean ignoreOverlapsInExpirationCheck)
+    {
+        this.clock = clock;
+        this.env = env;
+        this.survivalFactor = survivalFactor;
+        this.dataSetSizeMB = dataSetSizeMB;
+        this.numShards = numShards;
+        this.shardSizeMB = (long) Math.ceil((double) dataSetSizeMB / numShards); // long cast: the field is a long, an int cast would clamp it
+        this.minSstableSizeMB = minSstableSizeMB;
+        this.flushSizeOverrideMB = flushSizeOverrideMB;
+        this.currentFlushSize = flushSizeOverrideMB << 20;
+        this.expiredSSTableCheckFrequency = TimeUnit.MILLISECONDS.convert(expiredSSTableCheckFrequency, TimeUnit.SECONDS);
+        // A requested overhead below 1/numShards is raised to that bound, with a warning.
+        double maxSpaceOverheadLowerBound = 1.0d / numShards;
+        if (maxSpaceOverhead < maxSpaceOverheadLowerBound)
+        {
+            logger.warn("{} shards are not enough to maintain the required maximum space overhead of {}!\n" +
+                        "Falling back to {}={} instead. If this limit needs to be satisfied, please increase the number" +
+                        " of shards.",
+                        numShards,
+                        maxSpaceOverhead,
+                        MAX_SPACE_OVERHEAD_OPTION,
+                        String.format("%.3f", maxSpaceOverheadLowerBound));
+            this.maxSpaceOverhead = maxSpaceOverheadLowerBound;
+        }
+        else
+            this.maxSpaceOverhead = maxSpaceOverhead;
+
+        if (maxSSTablesToCompact <= 0)  // use half the maximum permitted compaction size as upper bound by default
+            maxSSTablesToCompact = (int) (dataSetSizeMB * this.maxSpaceOverhead * 0.5 / minSstableSizeMB);
+
+        this.maxSSTablesToCompact = maxSSTablesToCompact;
+        // The per-table option only takes effect when the system property also allows it.
+        if (ignoreOverlapsInExpirationCheck && !ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION)
+        {
+            logger.warn("Not enabling aggressive SSTable expiration, as the system property '" + ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_PROPERTY + "' is set to 'false'. " +
+                    "Set it to 'true' to enable aggressive SSTable expiration.");
+        }
+        this.ignoreOverlapsInExpirationCheck = ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION && ignoreOverlapsInExpirationCheck;
+    }
+
+    @VisibleForTesting
+    public Environment getEnv()
+    {
+        return env;
+    }
+
+    /**
+     * @param index selects which scaling parameter is requested (NOTE(review): exact meaning is not visible here, confirm)
+     * @return the scaling parameter W
+     */
+    public abstract int getScalingParameter(int index);
+
+    public int getFanout(int index) {
+        int W = getScalingParameter(index);
+        return W < 0 ? 2 - W : 2 + W; // see formula in design doc
+    }
+
+    public int getThreshold(int index) {
+        int W = getScalingParameter(index);
+        return W < 0 ? 2 : getFanout(index); // see formula in design doc
+    }
+
+    /**
+     * @return the number of shards according to the dataset and shard sizes set by the user
+     */
+    public int getNumShards()
+    {
+        return numShards;
+    }
+
+    /**
+     * @return the survival factor o
+     */
+    public double getSurvivalFactor()
+    {
+        return survivalFactor;
+    }
+
+    /**
+     * The user specified dataset size.
+     *
+     * @return the target size of the entire data set, in bytes.
+     */
+    public long getDataSetSizeBytes()
+    {
+        return dataSetSizeMB << 20;
+    }
+
+    /**
+     * The user specified shard, or compaction arena, size.
+     *
+     * @return the desired size of each shard, or compaction arena, in bytes.
+     */
+    public long getShardSizeBytes()
+    {
+        return shardSizeMB << 20;
+    }
+
+    /**
+     * Return the minimum sstable size in bytes.
+     *
+     * This is either set by the user in the options or calculated by rounding the flush size up to a multiple of 50 MB.
+     *
+     * @return the minimum sstable size in bytes.
+     */
+    public long getMinSstableSizeBytes()
+    {
+        if (minSstableSizeMB > 0)
+            return minSstableSizeMB << 20;
+
+        synchronized (this)
+        {
+            if (minSstableSizeMB > 0)
+                return minSstableSizeMB << 20;
+
+            // round the avg flush size to the nearest byte
+            long envFlushSize = Math.round(env.flushSize());
+            long fiftyMB = 50 << 20;
+
+            // round up to a multiple of 50 MB; Math.max(1, ...) makes the result at least 50 MB
+            long flushSize = ((Math.max(1, envFlushSize) + fiftyMB - 1) / fiftyMB) * fiftyMB;
+
+            // If the env flush size is positive, then we've flushed at least once and we use this value permanently
+            if (envFlushSize > 0)
+                minSstableSizeMB = flushSize >> 20;
+
+            return flushSize;
+        }
+    }
+
+    /**
+     * Return the flush sstable size in bytes.
+     *
+     * This is usually obtained from the observed sstable flush sizes, refreshed when it differs significantly
+     * from the current values.
+     * It can also be set by the user in the options.
+     *
+     * @return the flush size in bytes.
+     */
+    public long getFlushSizeBytes()
+    {
+        if (flushSizeOverrideMB > 0)
+            return flushSizeOverrideMB << 20;
+
+        double envFlushSize = env.flushSize();
+        if (currentFlushSize == 0 || Math.abs(1 - (currentFlushSize / envFlushSize)) > 0.5)
+        {
+            // The current size is not initialized, or it differs by over 50% from the observed.
+            // Use the observed size rounded up to a whole megabyte.
+            currentFlushSize = ((long) (Math.ceil(Math.scalb(envFlushSize, -20)))) << 20;
+        }
+        return currentFlushSize;
+    }
+
+    /**
+     * Returns the maximum tolerable compaction-induced space amplification, as a fraction of the dataset size.
+     * Currently this is not a strict limit for which compaction gives an ironclad guarantee never to exceed it, but
+     * the main input in a simple heuristic that is designed to limit UCS' space amplification in exchange of some
+     * delay in top bucket compactions.
+     *
+     * @return a {@code double} value between 0.01 and 1.0, representing the fraction of the expected uncompacted
+     * dataset size that should be additionally available for compaction's space amplification overhead.
+     */
+    public double getMaxSpaceOverhead()
+    {
+        return maxSpaceOverhead;
+    }
+
+    /**
+     * @return whether is allowed to drop expired SSTables without checking if partition keys appear in other SSTables.
+     * Same behavior as in TWCS.
+     */
+    public boolean getIgnoreOverlapsInExpirationCheck()
+    {
+        return ignoreOverlapsInExpirationCheck;
+    }
+
+    public long getExpiredSSTableCheckFrequency()
+    {
+        return expiredSSTableCheckFrequency;
+    }
+
+    /**
+     * Perform any initialization that requires the strategy.
+     */
+    public void startup(UnifiedCompactionStrategy strategy, ScheduledExecutorService executorService)
+    {
+        if (calculator != null)
+            throw new IllegalStateException("Already started");
+
+        startup(strategy, new CostsCalculator(env, strategy, executorService, survivalFactor));
+    }
+
+    @VisibleForTesting
+    void startup(UnifiedCompactionStrategy strategy, CostsCalculator calculator)
+    {
+        this.calculator = calculator;
+        metrics = allMetrics.computeIfAbsent(strategy.getMetadata(), Controller.Metrics::new);
+        metrics.setController(this);
+        logger.debug("Started compaction controller {}", this);
+    }
+
+    /**
+     * Signals that the strategy is about to be deleted or stopped.
+     */
+    public void shutdown()
+    {
+        if (calculator == null)
+            return;
+
+        calculator.close();
+        calculator = null;
+
+        if (metrics != null)
+        {
+            metrics.release();
+            metrics.removeController();
+            metrics = null;
+        }
+
+        logger.debug("Stopped compaction controller {}", this);
+    }
+
+    /**
+     * @return true if the controller is running
+     */
+    public boolean isRunning()
+    {
+        return calculator != null;
+    }
+
+    /**
+     * @return the cost calculator, null until {@link #startup(UnifiedCompactionStrategy, ScheduledExecutorService)} is called and again after {@link #shutdown()}.
+     */
+    @Nullable
+    @VisibleForTesting
+    public CostsCalculator getCalculator()
+    {
+        return calculator;
+    }
+
+    /**
+     * The strategy will call this method each time {@link CompactionStrategy#getNextBackgroundTasks(int)} is called.
+     */
+    public void onStrategyBackgroundTaskRequest()
+    {
+    }
+
+    /**
+     * Calculate the read amplification assuming a single scaling parameter W and a given total
+     * length of data on disk.
+     *
+     * @param length the total length on disk
+     * @param scalingParameter the scaling parameter to use for the calculation
+     *
+     * @return the read amplification of all the buckets needed to cover the total length
+     */
+    public int readAmplification(long length, int scalingParameter)
+    {
+        double o = getSurvivalFactor();
+        long m = getFlushSizeBytes();
+
+        int F = scalingParameter < 0 ? 2 - scalingParameter : 2 + scalingParameter;
+        int T = scalingParameter < 0 ? 2 : F;
+        int maxIndex = maxBucketIndex(length, F);
+
+        // Closed form of adding (T - 1) once per bucket below maxIndex. Int overflow wraps exactly as repeated
+        // addition would, without iterating maxIndex times (maxIndex can be huge if the flush size is still 0).
+        int ret = maxIndex * (T - 1);
+
+        if (scalingParameter >= 0)
+            ret += Math.max(0, Math.ceil(length / (m * Math.pow(o * F, maxIndex))) - 1);
+        else
+            ret += 1;
+
+        return ret;
+    }
+
+    /**
+     * Calculate the write amplification assuming a single scaling parameter W and a given total
+     * length of data on disk.
+     *
+     * @param length the total length on disk
+     * @param scalingParameter the scaling parameter to use for the calculation
+     *
+     * @return the write amplification of all the buckets needed to cover the total length
+     */
+    public int writeAmplification(long length, int scalingParameter)
+    {
+        double o = getSurvivalFactor();
+        long m = getFlushSizeBytes();
+
+        int F = scalingParameter < 0 ? 2 - scalingParameter : 2 + scalingParameter;
+        int maxIndex = maxBucketIndex(length, F);
+
+        int ret = 0;
+
+        if (scalingParameter >= 0)
+        {   // for tiered, at each level the WA is 1. Levels 0 to maxIndex inclusive give a WA of maxIndex + 1.
+            ret += maxIndex + 1;
+        }
+        else
+        {   // for leveled, at each level below the last the WA is F - 1; the last level adds ceil(size / (m*(o*F)^maxIndex)).
+            // NOTE(review): this comment used to describe the last term as "... - 1", but the code never subtracted 1; confirm.
+            // Closed form of the former per-level loop; int overflow wraps identically to repeated addition.
+            ret += maxIndex * (F - 1);
+
+            ret += Math.max(0, Math.ceil(length / (m * Math.pow(o * F, maxIndex))));
+        }
+
+        return ret;
+    }
+
+    /**
+     * Returns a maximum bucket index for the given data size and fanout.
+     */
+    private int maxBucketIndex(long totalLength, int fanout)
+    {
+        double o = getSurvivalFactor();
+        long m = getFlushSizeBytes();
+        return Math.max(0, (int) Math.floor((Math.log(totalLength) - Math.log(m)) / (Math.log(fanout) - Math.log(o))));
+    }
+
+    private double getReadIOCost()
+    {
+        if (calculator == null)
+            return 0;
+
+        int W = getScalingParameter(0);
+        long length = (long) Math.ceil(calculator.spaceUsed());
+        return calculator.getReadCostForQueries(readAmplification(length, W));
+    }
+
+    private double getWriteIOCost()
+    {
+        if (calculator == null)
+            return 0;
+
+        int W = getScalingParameter(0);
+        long length = (long) Math.ceil(calculator.spaceUsed());
+        return calculator.getWriteCostForQueries(writeAmplification(length, W));
+    }
+
+    public static Controller fromOptions(ColumnFamilyStore cfs, Map<String, String> options)
+    {
+        boolean adaptive = options.containsKey(ADAPTIVE_OPTION) ? Boolean.parseBoolean(options.get(ADAPTIVE_OPTION)) : DEFAULT_ADAPTIVE;
+        long dataSetSizeMb = (options.containsKey(DATASET_SIZE_OPTION_GB) ? Long.parseLong(options.get(DATASET_SIZE_OPTION_GB)) : DEFAULT_DATASET_SIZE_GB) << 10;
+        int numShards = options.containsKey(NUM_SHARDS_OPTION) ? Integer.parseInt(options.get(NUM_SHARDS_OPTION)) : DEFAULT_NUM_SHARDS;
+        long sstableSizeMb = options.containsKey(MIN_SSTABLE_SIZE_OPTION_MB) ? Long.parseLong(options.get(MIN_SSTABLE_SIZE_OPTION_MB)) : DEFAULT_MIN_SSTABLE_SIZE_MB;
+        long flushSizeOverrideMb = Long.parseLong(options.getOrDefault(FLUSH_SIZE_OVERRIDE_OPTION_MB, "0"));
+        double maxSpaceOverhead = options.containsKey(MAX_SPACE_OVERHEAD_OPTION)
+                ? Double.parseDouble(options.get(MAX_SPACE_OVERHEAD_OPTION))
+                : DEFAULT_MAX_SPACE_OVERHEAD;
+        int maxSSTablesToCompact = Integer.parseInt(options.getOrDefault(MAX_SSTABLES_TO_COMPACT_OPTION, "0"));
+        long expiredSSTableCheckFrequency = options.containsKey(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION)
+                ? Long.parseLong(options.get(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION))
+                : DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS;
+        boolean ignoreOverlapsInExpirationCheck = options.containsKey(ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION)
+                ? Boolean.parseBoolean(options.get(ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION))
+                : DEFAULT_ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION;
+
+        Environment env = new RealEnvironment(cfs);
+
+        return adaptive
+               ? AdaptiveController.fromOptions(env,
+                                                DEFAULT_SURVIVAL_FACTOR,
+                                                dataSetSizeMb,
+                                                numShards,
+                                                sstableSizeMb,
+                                                flushSizeOverrideMb,
+                                                maxSpaceOverhead,
+                                                maxSSTablesToCompact,
+                                                expiredSSTableCheckFrequency,
+                                                ignoreOverlapsInExpirationCheck,
+                                                options)
+               : StaticController.fromOptions(env,
+                                              DEFAULT_SURVIVAL_FACTOR,
+                                              dataSetSizeMb,
+                                              numShards,
+                                              sstableSizeMb,
+                                              flushSizeOverrideMb,
+                                              maxSpaceOverhead,
+                                              maxSSTablesToCompact,
+                                              expiredSSTableCheckFrequency,
+                                              ignoreOverlapsInExpirationCheck,
+                                              options);
+    }
+
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        String nonPositiveErr = "Invalid configuration, %s should be positive: %d";
+        String booleanParseErr = "%s should either be 'true' or 'false', not %s";
+        String intParseErr = "%s is not a parsable int (base10) for %s";
+        String longParseErr = "%s is not a parsable long (base10) for %s";
+        String floatParseErr = "%s is not a parsable float for %s";
+        options = new HashMap<>(options);
+        String s;
+        boolean adaptive = DEFAULT_ADAPTIVE;
+
+        s = options.remove(ADAPTIVE_OPTION);
+        if (s != null)
+        {
+            if (!s.equalsIgnoreCase("true") && !s.equalsIgnoreCase("false"))
+            {
+                throw new ConfigurationException(String.format(booleanParseErr, ADAPTIVE_OPTION, s));
+            }
+            adaptive = Boolean.parseBoolean(s);
+        }
+
+        s = options.remove(MIN_SSTABLE_SIZE_OPTION_MB);
+        if (s != null)
+        {
+            try
+            {
+                long minSStableSize = Long.parseLong(s);
+                if (minSStableSize <= 0)
+                    throw new ConfigurationException(String.format(nonPositiveErr,
+                                                                   MIN_SSTABLE_SIZE_OPTION_MB,
+                                                                   minSStableSize));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(longParseErr,
+                                                               s,
+                                                               MIN_SSTABLE_SIZE_OPTION_MB),
+                                                 e);
+            }
+        }
+
+        s = options.remove(FLUSH_SIZE_OVERRIDE_OPTION_MB);
+        if (s != null)
+        {
+            try
+            {
+                long flushSize = Long.parseLong(s);
+                if (flushSize <= 0)
+                    throw new ConfigurationException(String.format(nonPositiveErr,
+                                                                   FLUSH_SIZE_OVERRIDE_OPTION_MB,
+                                                                   flushSize));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(longParseErr,
+                                                               s,
+                                                               FLUSH_SIZE_OVERRIDE_OPTION_MB),
+                                                 e);
+            }
+        }
+
+        s = options.remove(DATASET_SIZE_OPTION_GB);
+        if (s != null)
+        {
+            try
+            {
+                long dataSetSizeMb = Long.parseLong(s);
+                if (dataSetSizeMb <= 0)
+                    throw new ConfigurationException(String.format(nonPositiveErr,
+                                                                   DATASET_SIZE_OPTION_GB,
+                                                                   dataSetSizeMb));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(longParseErr, s, DATASET_SIZE_OPTION_GB), e);
+            }
+        }
+
+        s = options.remove(NUM_SHARDS_OPTION);
+        if (s != null)
+        {
+            try
+            {
+                int numShards = Integer.parseInt(s);
+                if (numShards <= 0)
+                    throw new ConfigurationException(String.format(nonPositiveErr,
+                                                                   NUM_SHARDS_OPTION,
+                                                                   numShards));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(intParseErr, s, NUM_SHARDS_OPTION), e);
+            }
+        }
+        s = options.remove(MAX_SSTABLES_TO_COMPACT_OPTION);
+        if (s != null)
+        {
+             try
+             {
+                 Integer.parseInt(s); // values less than or equal to 0 enable the default
+             }
+             catch (NumberFormatException e)
+             {
+                 throw new ConfigurationException(String.format(intParseErr,
+                                                                s,
+                                                                MAX_SSTABLES_TO_COMPACT_OPTION),
+                                                  e);
+             }
+        }
+        s = options.remove(EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION);
+        if (s != null)
+        {
+            try
+            {
+                long expiredSSTableCheckFrequency = Long.parseLong(s);
+                if (expiredSSTableCheckFrequency <= 0)
+                    throw new ConfigurationException(String.format(nonPositiveErr,
+                                                                   EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION,
+                                                                   expiredSSTableCheckFrequency));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(longParseErr,
+                                                               s,
+                                                               EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION),
+                                                 e);
+            }
+        }
+
+        s = options.remove(MAX_SPACE_OVERHEAD_OPTION);
+        if (s != null)
+        {
+            try
+            {
+                double maxSpaceOverhead = Double.parseDouble(s);
+                if (maxSpaceOverhead < MAX_SPACE_OVERHEAD_LOWER_BOUND || maxSpaceOverhead > MAX_SPACE_OVERHEAD_UPPER_BOUND)
+                    throw new ConfigurationException(String.format("Invalid configuration, %s must be between %f and %f: %s",
+                                                                   MAX_SPACE_OVERHEAD_OPTION,
+                                                                   MAX_SPACE_OVERHEAD_LOWER_BOUND,
+                                                                   MAX_SPACE_OVERHEAD_UPPER_BOUND,
+                                                                   s));
+            }
+            catch (NumberFormatException e)
+            {
+                throw new ConfigurationException(String.format(floatParseErr,
+                                                               s,
+                                                               MAX_SPACE_OVERHEAD_OPTION),
+                                                 e);
+            }
+        }
+
+        s = options.remove(ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION);
+        if (s != null && !s.equalsIgnoreCase("true") && !s.equalsIgnoreCase("false"))
+        {
+            throw new ConfigurationException(String.format(booleanParseErr,
+                                                           ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION, s));
+        }
+
+        return adaptive ? AdaptiveController.validateOptions(options) : StaticController.validateOptions(options);
+    }
+
+    // The methods below are implemented here (rather than directly in UCS) to aid testability.
+
+    public double getBaseSstableSize(int F)
+    {
+        // The compaction hierarchy should start at a minimum size which is close to the typical flush size, with
+        // some leeway to make sure we don't overcompact when flushes end up a little smaller.
+        // The leeway should be less than 1/F, though, to make sure we don't overshoot the boundary combining F-1
+        // sources instead of F.
+        // Note that while we have not had flushes, the size will be 0 and we will use 1MB as the flush size. With
+        // fixed and positive W this should not hurt us, as the hierarchy will be in multiples of F and will still
+        // result in the same buckets, but for negative W or hybrid strategies this may cause temporary overcompaction.
+        // If this is a concern, the flush size override should be used to avoid it until DB-4401.
+        return Math.max(1 << 20, getFlushSizeBytes()) * (1.0 - 0.9 / F) / getNumShards();
+    }
+
+    public double maxThroughput()
+    {
+        final int compactionThroughputMbPerSec = DatabaseDescriptor.getCompactionThroughputMbPerSec();
+        if (compactionThroughputMbPerSec <= 0)
+            return Double.MAX_VALUE;
+        return compactionThroughputMbPerSec * 1024.0 * 1024.0;
+    }
+
+    public int maxConcurrentCompactions()
+    {
+        return DatabaseDescriptor.getConcurrentCompactors();
+    }
+
+    public long maxCompactionSpaceBytes()
+    {
+        // Note: Compaction will not proceed with operations larger than this size (i.e. it will compact on the lower
+        // levels but will accumulate sstables on the top until the space on the drive fills up). This sounds risky but
+        // is less of a problem than running out of space during compaction.
+        return (long) (getDataSetSizeBytes() * getMaxSpaceOverhead());
+    }
+
+    public int maxSSTablesToCompact()
+    {
+        return maxSSTablesToCompact;
+    }
+
+    /**
+     * Random number generator to be used for the selection of tasks.
+     * Replaced by some tests.
+     */
+    public Random random()
+    {
+        return ThreadLocalRandom.current();
+    }
+
+    static final class Metrics
+    {
+        private final MetricNameFactory factory;
+        private final AtomicReference<Controller> controllerRef;
+        private final Gauge<Double> totWAGauge;
+        private final Gauge<Double> readIOCostGauge;
+        private final Gauge<Double> writeIOCostGauge;
+        private final Gauge<Double> totIOCostGauge;
+
+        Metrics(TableMetadata metadata)
+        {
+            this.factory = new DefaultNameFactory("CompactionCosts",
+                                                  String.format("%s.%s", metadata.keyspace, metadata.name));
+            this.controllerRef = new AtomicReference<>();
+            this.totWAGauge = Metrics.register(factory.createMetricName("WA"), this::getMeasuredWA);
+            this.readIOCostGauge = Metrics.register(factory.createMetricName("ReadIOCost"), this::getReadIOCost);
+            this.writeIOCostGauge = Metrics.register(factory.createMetricName("WriteIOCost"), this::getWriteIOCost);
+            this.totIOCostGauge = Metrics.register(factory.createMetricName("TotIOCost"), this::getTotalIOCost);
+        }
+
+        void setController(Controller controller)
+        {
+            this.controllerRef.set(controller);
+        }
+
+        void removeController()
+        {
+           this.controllerRef.set(null);
+        }
+
+        void release()
+        {
+            Metrics.remove(factory.createMetricName("WA"));
+            Metrics.remove(factory.createMetricName("ReadIOCost"));
+            Metrics.remove(factory.createMetricName("WriteIOCost"));
+            Metrics.remove(factory.createMetricName("TotIOCost"));
+        }
+
+        double getMeasuredWA()
+        {
+            double ret = 0;
+            Controller controller = controllerRef.get();
+            if (controller != null)
+                ret = controller.env.WA();
+
+            return ret;
+        }
+
+        double getReadIOCost()
+        {
+            double ret = 0;
+            Controller controller = controllerRef.get();
+            if (controller != null)
+                ret = controller.getReadIOCost();
+
+            return ret;
+        }
+
+        double getWriteIOCost()
+        {
+            double ret = 0;
+            Controller controller = controllerRef.get();
+            if (controller != null)
+                ret = controller.getWriteIOCost();
+
+            return ret;
+        }
+
+        double getTotalIOCost()
+        {
+            return getReadIOCost() + getWriteIOCost();
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/CostsCalculator.java b/src/java/org/apache/cassandra/db/compaction/unified/CostsCalculator.java
new file mode 100644
index 000000000000..673d4b77dd17
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/CostsCalculator.java
@@ -0,0 +1,277 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+import javax.annotation.concurrent.NotThreadSafe;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.metrics.CompactionMetrics;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.MovingAverage;
+
+/**
+ * This class periodically retrieves delta values from the environment and stores them into exponentially weighted averages.
+ * It then uses these values to calculate IO costs that are exported to {@link CompactionMetrics} and used by {@link AdaptiveController}
+ * to choose the optimal configuration for compaction.
+ */
+public class CostsCalculator
+{
+    private final static Logger logger = LoggerFactory.getLogger(CostsCalculator.class);
+
+    /** How often values are sampled. Sampling for periods that are too short (<= 1 second) may not give good results since
+     * we may not collect sufficient data. */
+    final static int samplingPeriodMs = Integer.getInteger(Controller.PREFIX + "sample_time_ms", 5000);
+
+    /** The multipliers can be used by users if they wish to adjust the costs. We reduce the read costs because writes are batch processes (flush and compaction)
+     * and therefore the costs tend to be lower than for reads, so by reducing read costs we make the costs more comparable.
+     */
+    final static double defaultWriteMultiplier = Double.parseDouble(System.getProperty(Controller.PREFIX + "costs_write_multiplier", "1"));
+    final static double defaultReadMultiplier = Double.parseDouble(System.getProperty(Controller.PREFIX + "costs_read_multiplier", "0.1"));
+
+    private final Environment env;
+    private final double readMultiplier;
+    private final double writeMultiplier;
+    private final double survivalFactor;
+    private final MovingAverageOfDelta partitionsReadPerPeriod;
+    private final MovingAverageOfDelta bytesInsertedPerPeriod;
+    private final MovingAverage numSSTables;
+    private final MovingAverage spaceUsed;
+    private final UnifiedCompactionStrategy strategy;
+
+    private final ReentrantReadWriteLock lock;
+    private final ReentrantReadWriteLock.ReadLock readLock;
+    private final ReentrantReadWriteLock.WriteLock writeLock;
+    private final ScheduledFuture<?> future;
+
+    CostsCalculator(Environment env,
+                    UnifiedCompactionStrategy strategy,
+                    ScheduledExecutorService executorService,
+                    double survivalFactor)
+    {
+        this(env, strategy, executorService, survivalFactor, defaultReadMultiplier, defaultWriteMultiplier);
+    }
+
+    CostsCalculator(Environment env,
+                    UnifiedCompactionStrategy strategy,
+                    ScheduledExecutorService executorService,
+                    double survivalFactor,
+                    double readMultiplier,
+                    double writeMultiplier)
+    {
+        this.env = env;
+        this.readMultiplier = readMultiplier;
+        this.writeMultiplier = writeMultiplier;
+        this.survivalFactor = survivalFactor;
+        this.partitionsReadPerPeriod = new MovingAverageOfDelta(env.makeExpMovAverage());
+        this.bytesInsertedPerPeriod = new MovingAverageOfDelta(env.makeExpMovAverage());
+        this.numSSTables = env.makeExpMovAverage();
+        this.spaceUsed = env.makeExpMovAverage();
+        this.strategy = strategy;
+        this.lock = new ReentrantReadWriteLock();
+        this.readLock = lock.readLock();
+        this.writeLock = lock.writeLock();
+        this.future = executorService.scheduleAtFixedRate(this::sampleValues, samplingPeriodMs, samplingPeriodMs, TimeUnit.MILLISECONDS);
+    }
+
+    public void close()
+    {
+        writeLock.lock();
+
+        try
+        {
+            logger.debug("Stopping cost calculations for {}", strategy.getMetadata());
+            future.cancel(false);
+            logger.debug("Stopped cost calculations for {}", strategy.getMetadata());
+        }
+        finally
+        {
+            writeLock.unlock();
+        }
+    }
+
+    @VisibleForTesting
+    void sampleValues()
+    {
+        writeLock.lock();
+        // Sample under the write lock so the cost getters, which take the read lock, do not see a partial update.
+        try
+        {
+            partitionsReadPerPeriod.update(env.partitionsRead());
+            bytesInsertedPerPeriod.update(env.bytesInserted());
+
+            numSSTables.update(strategy.getSSTables().size());
+            spaceUsed.update(strategy.getSSTables().stream().mapToLong(SSTableReader::onDiskLength).sum());
+        }
+        catch (Throwable err)
+        {
+            JVMStabilityInspector.inspectThrowable(err);
+            logger.error("Failed to update values: {}/{}", err.getClass().getName(), err.getMessage(), err);
+        }
+        finally
+        {
+            writeLock.unlock();
+        }
+    }
+
+    /**
+     * @return the estimated read cost for the given number of partitions, in milliseconds
+     */
+    private double getReadCost(double partitionsRead)
+    {
+        return (env.sstablePartitionReadLatencyNanos() * partitionsRead) / TimeUnit.MILLISECONDS.toNanos(1);
+    }
+
+    /**
+     * Calculate the projected read cost for user queries.
+     *
+     * The projected read cost is given by the number of partitions read, times the mean partition latency and is calculated
+     * by {@link #getReadCost(double)}. This value is then multiplied by the number of sstables we're likely to hit
+     * per partition read and the read multiplier.
+     * <p>
+     * The number of sstables is calculated as Math.min(1 + env.bloomFilterFpRatio() * RA / survivalFactor, RA). Here we
+     * assume there is going to be at least one sstable accessed, possibly more in case of:
+     *
+     * - bloom filter's false positives;
+     * - partitions not surviving a compaction (1/survivalFactor is the limit of the sum of (1-survivalFactor)^n), that
+     *   is partitions that would not exist if compaction was done; Note that the survival factor is currently fixed to 1.
+     *
+     * The RA is then a cap since we cannot read more than RA sstables, which are the sstables that exist because
+     * compaction allows them to exist.
+     * <p>
+     * The read multiplier is a factor that operators can use to tweak the algorithm.
+     *
+     * @param RA the expected read amplification due to the current choice of compaction strategy
+     *
+     * @return the projected read cost for user queries
+     */
+    public double getReadCostForQueries(int RA)
+    {
+        readLock.lock();
+
+        try
+        {
+            return getReadCost(partitionsReadPerPeriod.avg.get()) * Math.min(1 + env.bloomFilterFpRatio() * RA / survivalFactor, RA) * readMultiplier;
+        }
+        finally
+        {
+            readLock.unlock();
+        }
+    }
+
+    private double getFlushCost(double bytesWritten)
+    {
+        return ((bytesWritten / (1 << 10)) * env.flushLatencyPerKbInNanos()) / (double) TimeUnit.MILLISECONDS.toNanos(1);
+    }
+
+    private double getCompactionCost(double bytesWritten)
+    {
+        // So, the compaction latency will depend on the size of the sstables, so in the correct solution each level
+        // should pass its output size and we should measure latency in MB or something like that
+        return ((bytesWritten / (1 << 10)) * env.compactionLatencyPerKbInNanos()) / (double)  TimeUnit.MILLISECONDS.toNanos(1);
+    }
+
+    /**
+     * Calculate the projected write cost for user insertions.
+     *
+     * The projected write cost is given by the number of bytes that were inserted times the flush cost
+     * plus the same number of bytes times the compaction cost and the compaction WA. We also multiply by
+     * a write multiplier to let users change the weights if needed.
+     *
+     * @param WA the expected write amplification due to compaction
+     *
+     * @return the projected flush and write cost.
+     */
+    public double getWriteCostForQueries(int WA)
+    {
+        readLock.lock();
+
+        try
+        {
+            double bytesInserted = this.bytesInsertedPerPeriod.avg.get();
+            // using bytesInserted for the compaction cost doesn't take into account overwrites but for now it's good enough
+            return (getFlushCost(bytesInserted) + getCompactionCost(bytesInserted) * WA) * writeMultiplier;
+        }
+        finally
+        {
+            readLock.unlock();
+        }
+    }
+
+    public double partitionsRead()
+    {
+        return partitionsReadPerPeriod.avg.get();
+    }
+
+    public double numSSTables()
+    {
+        return numSSTables.get();
+    }
+
+    public double spaceUsed()
+    {
+        return spaceUsed.get();
+    }
+
+    public Environment getEnv()
+    {
+        return env;
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("num partitions read %s, bytes inserted: %s, num sstables %s; Environment: %s",
+                             partitionsReadPerPeriod, bytesInsertedPerPeriod, numSSTables, env);
+    }
+
+    @NotThreadSafe
+    private static final class MovingAverageOfDelta
+    {
+        private final MovingAverage avg;
+        private volatile double prev;
+
+        MovingAverageOfDelta(MovingAverage avg)
+        {
+            this.avg = avg;
+            this.prev = Double.MIN_VALUE;
+        }
+
+        void update(double val)
+        {
+            if (prev != Double.MIN_VALUE)
+                avg.update(val - prev);
+
+            prev = val;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%s/%d sec", FBUtilities.prettyPrintMemory((long) avg.get()), TimeUnit.MILLISECONDS.toSeconds(samplingPeriodMs));
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/Environment.java b/src/java/org/apache/cassandra/db/compaction/unified/Environment.java
new file mode 100644
index 000000000000..f94141222fa5
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/Environment.java
@@ -0,0 +1,82 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import org.apache.cassandra.utils.MovingAverage;
+
+/**
+ * This interface supplies to the cost calculator the required parameters for the calculations.
+ * There are two implementations, one used in real life and one for the simulation.
+ */
+public interface Environment
+{
+    /**
+     * @return an exponential moving average. New values have greater representation in the average, and older samples'
+     * effect exponentially decays with new data.
+     */
+    MovingAverage makeExpMovAverage();
+
+    /**
+     * @return the cache miss ratio in the last 5 minutes
+     */
+    double cacheMissRatio();
+
+    /**
+     * @return the bloom filter false positive ratio for all sstables
+     */
+    double bloomFilterFpRatio();
+
+    /**
+     * @return the size of the chunk that is read from disk.
+     */
+    int chunkSize();
+
+    /**
+     * @return the total bytes inserted into the memtables so far
+     */
+    long bytesInserted();
+
+    /**
+     * @return the total number of partitions read so far
+     */
+    long partitionsRead();
+
+    /**
+     * @return the mean read latency in nano seconds to read a partition from an sstable
+     */
+    double sstablePartitionReadLatencyNanos();
+
+    /**
+     * @return the mean compaction time per 1 Kb of input, in nano seconds
+     */
+    double compactionLatencyPerKbInNanos();
+
+    /**
+     * @return the mean flush latency per 1 Kb of input, in nano seconds
+     */
+    double flushLatencyPerKbInNanos();
+
+    /**
+     * @return the write amplification, i.e. (bytes flushed + bytes compacted) / bytes flushed.
+     */
+    double WA();
+
+    /**
+     * @return the average size of sstables when they are flushed, averaged over the last 5 minutes.
+     */
+    double flushSize();
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/RealEnvironment.java b/src/java/org/apache/cassandra/db/compaction/unified/RealEnvironment.java
new file mode 100644
index 000000000000..ed98bfd23616
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/RealEnvironment.java
@@ -0,0 +1,127 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.cassandra.cache.ChunkCache;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.schema.CompressionParams;
+import org.apache.cassandra.utils.ExpMovingAverage;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MovingAverage;
+import org.apache.cassandra.utils.PageAware;
+
+/**
+ * An implementation of {@link Environment} that returns
+ * real values.
+ */
+class RealEnvironment implements Environment
+{
+    private final ColumnFamilyStore cfs;
+
+    RealEnvironment(ColumnFamilyStore cfs)
+    {
+        this.cfs = cfs;
+    }
+
+    @Override
+    public MovingAverage makeExpMovAverage()
+    {
+        return ExpMovingAverage.decayBy100();
+    }
+
+    @Override
+    public double cacheMissRatio()
+    {
+        double hitRate = ChunkCache.instance.metrics.hitRate.getValue();
+        if (Double.isNaN(hitRate))
+            return 1; // if the cache is not yet initialized then assume all requests are a cache miss
+
+        return 1 - Math.min(1, hitRate); // hit rate should never be > 1 but just in case put a check
+    }
+
+    @Override
+    public double bloomFilterFpRatio()
+    {
+        return cfs.bloomFilterFpRatio();
+    }
+
+    @Override
+    public int chunkSize()
+    {
+        CompressionParams compressionParams = cfs.metadata().params.compression;
+        if (compressionParams.isEnabled())
+            return compressionParams.chunkLength();
+
+        return PageAware.PAGE_SIZE;
+    }
+
+    @Override
+    public long partitionsRead()
+    {
+        return cfs.getReadRequests();
+    }
+
+    @Override
+    public double sstablePartitionReadLatencyNanos()
+    {
+        return cfs.sstablePartitionReadLatency();
+    }
+
+    @Override
+    public double compactionLatencyPerKbInNanos()
+    {
+        return cfs.getCompactionTimePerKb();
+    }
+
+    @Override
+    public double flushLatencyPerKbInNanos()
+    {
+        return cfs.getFlushTimePerKb();
+    }
+
+    @Override
+    public long bytesInserted()
+    {
+        return cfs.getBytesInserted();
+    }
+
+    @Override
+    public double WA()
+    {
+        return cfs.getWA();
+    }
+
+    @Override
+    public double flushSize()
+    {
+        return cfs.getFlushSizeOnDisk();
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Read latency: %d us / partition, flush latency: %d us / KiB, compaction latency: %d us / KiB, bfpr: %f, measured WA: %.2f, flush size %s",
+                             TimeUnit.NANOSECONDS.toMicros((long) sstablePartitionReadLatencyNanos()),
+                             TimeUnit.NANOSECONDS.toMicros((long) flushLatencyPerKbInNanos()),
+                             TimeUnit.NANOSECONDS.toMicros((long) compactionLatencyPerKbInNanos()),
+                             bloomFilterFpRatio(),
+                             WA(),
+                             FBUtilities.prettyPrintMemory((long)flushSize()));
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriter.java
new file mode 100644
index 000000000000..01a6be45d18c
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriter.java
@@ -0,0 +1,142 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.List;
+import java.util.Set;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.compaction.ArenaSelector;
+import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A {@link CompactionAwareWriter} that splits the output sstable at the partition boundaries of the compaction
+ * shards used by {@link org.apache.cassandra.db.compaction.UnifiedCompactionStrategy} as long as the size of
+ * the sstable so far is sufficiently large.
+ */
+public class ShardedCompactionWriter extends CompactionAwareWriter
+{
+    protected final static Logger logger = LoggerFactory.getLogger(ShardedCompactionWriter.class);
+
+    private final long minSstableSizeInBytes;
+    private final List<PartitionPosition> boundaries;
+    private final double overwriteRatio;
+
+    private int currentIndex;
+
+    public ShardedCompactionWriter(ColumnFamilyStore cfs,
+                                   Directories directories,
+                                   LifecycleTransaction txn,
+                                   Set<SSTableReader> nonExpiredSSTables,
+                                   boolean keepOriginals,
+                                   long minSstableSizeInBytes,
+                                   List<PartitionPosition> boundaries)
+    {
+        super(cfs, directories, txn, nonExpiredSSTables, keepOriginals);
+
+        this.minSstableSizeInBytes = minSstableSizeInBytes;
+        this.boundaries = boundaries;
+        this.currentIndex = 0;
+        long totalKeyCount = nonExpiredSSTables.stream()
+                                               .mapToLong(SSTableReader::estimatedKeys)
+                                               .sum();
+        this.overwriteRatio = 1.0 * SSTableReader.getApproximateKeyCount(nonExpiredSSTables) / totalKeyCount;
+    }
+
+    @Override
+    protected boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key)
+    {
+        boolean boundaryCrossed = false;
+        /*
+        The comparison to detect a boundary is costly, but if we only do this when the size is above the threshold,
+        we may detect a boundary change in the middle of a shard and split sstables at the wrong place.
+         */
+        while (currentIndex < boundaries.size() && key.compareTo(boundaries.get(currentIndex)) >= 0)
+        {
+            currentIndex++;
+            boundaryCrossed = true;
+        }
+
+        if (boundaryCrossed && sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() >= minSstableSizeInBytes)
+        {
+            logger.debug("Switching writer at boundary {}/{} index {}, with size {} for {}.{}",
+                         key.getToken(), boundaries.get(currentIndex-1), currentIndex-1,
+                         FBUtilities.prettyPrintMemory(sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten()),
+                         cfs.getKeyspaceName(), cfs.getTableName());
+            return true;
+        }
+
+        return false;
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
+    {
+        while (diskBoundary != null && currentIndex < boundaries.size() && diskBoundary.compareTo(boundaries.get(currentIndex)) < 0)
+            currentIndex++;
+
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    shardAdjustedKeyCount(currentIndex, boundaries, minSstableSizeInBytes, nonExpiredSSTables, overwriteRatio),
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator, 0),
+                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
+    }
+
+    private long shardAdjustedKeyCount(int shardIdx,
+                                       List<PartitionPosition> boundaries,
+                                       long minSstableSizeInBytes,
+                                       Set<SSTableReader> sstables,
+                                       double overwriteRatio)
+    {
+        long shardAdjustedSize = 0;
+        long shardAdjustedKeyCount = 0;
+        for (int i = shardIdx; i < boundaries.size(); i++)
+        {
+            Set<SSTableReader> sstablesForShard = ArenaSelector.sstablesFor(i, boundaries, sstables);
+            for (SSTableReader sstable : sstablesForShard)
+            {
+                int shardsSpanned = ArenaSelector.shardsSpanned(sstable, boundaries);
+                // calculating manually instead of calling ArenaSelector.shardAdjustedSize to save 1 call to ArenaSelector.shardsSpanned
+                shardAdjustedSize += sstable.onDiskLength() / shardsSpanned;
+                shardAdjustedKeyCount += sstable.estimatedKeys() / shardsSpanned;
+            }
+
+            if (shardAdjustedSize > minSstableSizeInBytes)
+                break;
+        }
+
+        return Math.round(shardAdjustedKeyCount * overwriteRatio);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/ShardedMultiWriter.java b/src/java/org/apache/cassandra/db/compaction/unified/ShardedMultiWriter.java
new file mode 100644
index 000000000000..fe129b9cf0b2
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/ShardedMultiWriter.java
@@ -0,0 +1,294 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.UUID;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SerializationHeader;
+import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.index.Index;
+import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.io.sstable.SSTableMultiWriter;
+import org.apache.cassandra.io.sstable.SimpleSSTableMultiWriter;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.schema.TableId;
+import org.apache.cassandra.utils.FBUtilities;
+
+/**
+ * A {@link SSTableMultiWriter} that splits the output sstable at the partition boundaries of the compaction
+ * shards used by {@link org.apache.cassandra.db.compaction.UnifiedCompactionStrategy} as long as the size of
+ * the sstable so far is sufficiently large.
+ * <p/>
+ * This class is similar to {@link ShardedCompactionWriter} but for flushing. Unfortunately
+ * we currently have two separate writer hierarchies that are not compatible and so we must
+ * duplicate the functionality of splitting sstables over compaction shards if they have
+ * reached a minimum size.
+ */
+public class ShardedMultiWriter implements SSTableMultiWriter
+{
+    protected final static Logger logger = LoggerFactory.getLogger(ShardedMultiWriter.class);
+
+    private final ColumnFamilyStore cfs;
+    private final Descriptor descriptor;
+    private final long keyCount;
+    private final long repairedAt;
+    private final UUID pendingRepair;
+    private final boolean isTransient;
+    private final MetadataCollector meta;
+    private final SerializationHeader header;
+    private final Collection<Index.Group> indexGroups;
+    private final LifecycleNewTracker lifecycleNewTracker;
+    private final long minSstableSizeInBytes;
+    private final List<PartitionPosition> boundaries;
+    // Slots are filled in order: only indexes 0..currentWriter hold a writer, the rest stay null.
+    private final SSTableMultiWriter[] writers;
+    // Expected number of output sstables, used to divide keyCount between the writers.
+    private final int estimatedSSTables;
+    // Index of the first boundary that the last appended key has not reached yet.
+    private int currentBoundary;
+    // Index in writers of the writer that currently receives partitions.
+    private int currentWriter;
+
+    /**
+     * Creates the multi-writer and immediately opens the first sstable writer on {@code descriptor}.
+     *
+     * @param descriptor descriptor of the first sstable; its directory is reused for any further sstables
+     * @param keyCount estimated total number of keys, divided between the writers when they are created
+     * @param minSstableSizeInBytes on-disk size the current sstable must reach before it is split at a boundary
+     * @param boundaries shard boundaries at which the output may be split; also sizes the writer array
+     */
+    public ShardedMultiWriter(ColumnFamilyStore cfs,
+                              Descriptor descriptor,
+                              long keyCount,
+                              long repairedAt,
+                              UUID pendingRepair,
+                              boolean isTransient,
+                              MetadataCollector meta,
+                              SerializationHeader header,
+                              Collection<Index.Group> indexGroups,
+                              LifecycleNewTracker lifecycleNewTracker,
+                              long minSstableSizeInBytes,
+                              List<PartitionPosition> boundaries)
+    {
+        this.cfs = cfs;
+        this.descriptor = descriptor;
+        this.keyCount = keyCount;
+        this.repairedAt = repairedAt;
+        this.pendingRepair = pendingRepair;
+        this.isTransient = isTransient;
+        this.meta = meta;
+        this.header = header;
+        this.indexGroups = indexGroups;
+        this.lifecycleNewTracker = lifecycleNewTracker;
+        this.minSstableSizeInBytes = minSstableSizeInBytes;
+        this.boundaries = boundaries;
+        this.writers = new SSTableMultiWriter[boundaries.size()];
+        // Divide in floating point: an integral division would truncate before Math.ceil could round up.
+        this.estimatedSSTables = (int) Math.max(1, Math.ceil(cfs.getFlushSizeOnDisk() / (double) minSstableSizeInBytes));
+
+        this.currentBoundary = 0;
+        this.currentWriter = 0;
+        this.writers[currentWriter] = createWriter(descriptor);
+    }
+
+    /** Creates a writer for a new sstable located in the same directory as the first one. */
+    private SSTableMultiWriter createWriter()
+    {
+        Descriptor newDesc = cfs.newSSTableDescriptor(descriptor.directory);
+        return createWriter(newDesc);
+    }
+
+    /** Creates a writer on the given descriptor, sized for one sstable's share of the estimated keys. */
+    private SSTableMultiWriter createWriter(Descriptor desc)
+    {
+        return SimpleSSTableMultiWriter.create(desc,
+                                               forSplittingKeysBy(estimatedSSTables),
+                                               repairedAt,
+                                               pendingRepair,
+                                               isTransient,
+                                               cfs.metadata,
+                                               meta,
+                                               header,
+                                               indexGroups,
+                                               lifecycleNewTracker);
+    }
+
+    /** Returns the share of {@link #keyCount} expected per sstable when the output is split {@code splits} ways. */
+    private long forSplittingKeysBy(long splits)
+    {
+        return splits <= 1 ? keyCount : keyCount / splits;
+    }
+
+    /**
+     * Appends the partition to the current sstable, first switching to a new sstable if the partition key
+     * has moved past one or more shard boundaries and the current sstable has reached the minimum size.
+     */
+    @Override
+    public boolean append(UnfilteredRowIterator partition)
+    {
+        DecoratedKey key = partition.partitionKey();
+
+        boolean boundaryCrossed = false;
+        /*
+        The comparison to detect a boundary is costly, but if we only do this when the size is above the threshold,
+        we may detect a boundary change in the middle of a shard and split sstables at the wrong place.
+         */
+        while (currentBoundary < boundaries.size() && key.compareTo(boundaries.get(currentBoundary)) >= 0)
+        {
+            currentBoundary++;
+            boundaryCrossed = true;
+        }
+
+        if (boundaryCrossed && writers[currentWriter].getOnDiskBytesWritten() >= minSstableSizeInBytes)
+        {
+            logger.debug("Switching writer at boundary {}/{} index {}/{}, with size {} for {}.{}",
+                         key.getToken(), boundaries.get(currentBoundary-1), currentBoundary-1, currentWriter,
+                         FBUtilities.prettyPrintMemory(writers[currentWriter].getBytesWritten()),
+                         cfs.getKeyspaceName(), cfs.getTableName());
+
+            writers[++currentWriter] = createWriter();
+        }
+
+        return writers[currentWriter].append(partition);
+    }
+
+    @Override
+    public Collection<SSTableReader> finish(long repairedAt, long maxDataAge, boolean openResult)
+    {
+        List<SSTableReader> sstables = new ArrayList<>(writers.length);
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                sstables.addAll(writer.finish(repairedAt, maxDataAge, openResult));
+        return sstables;
+    }
+
+    @Override
+    public Collection<SSTableReader> finish(boolean openResult)
+    {
+        List<SSTableReader> sstables = new ArrayList<>(writers.length);
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                sstables.addAll(writer.finish(openResult));
+        return sstables;
+    }
+
+    @Override
+    public Collection<SSTableReader> finished()
+    {
+        List<SSTableReader> sstables = new ArrayList<>(writers.length);
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                sstables.addAll(writer.finished());
+        return sstables;
+    }
+
+    @Override
+    public SSTableMultiWriter setOpenResult(boolean openResult)
+    {
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                writer.setOpenResult(openResult);
+        return this;
+    }
+
+    /** Returns the file name of the first sstable writer, or an empty string if there is none. */
+    @Override
+    public String getFilename()
+    {
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                return writer.getFilename();
+        return "";
+    }
+
+    @Override
+    public long getBytesWritten()
+    {
+        long bytesWritten = 0;
+        for (int i = 0; i <= currentWriter; ++i)
+            bytesWritten += writers[i].getBytesWritten();
+        return bytesWritten;
+    }
+
+    @Override
+    public long getOnDiskBytesWritten()
+    {
+        long bytesWritten = 0;
+        for (int i = 0; i <= currentWriter; ++i)
+            bytesWritten += writers[i].getOnDiskBytesWritten();
+        return bytesWritten;
+    }
+
+    /** Returns the number of sstable writers opened so far. */
+    public int getSegmentCount()
+    {
+        return currentWriter + 1;
+    }
+
+    @Override
+    public TableId getTableId()
+    {
+        return cfs.metadata.id;
+    }
+
+    @Override
+    public Throwable commit(Throwable accumulate)
+    {
+        Throwable t = accumulate;
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                t = writer.commit(t);
+        return t;
+    }
+
+    @Override
+    public Throwable abort(Throwable accumulate)
+    {
+        Throwable t = accumulate;
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                t = writer.abort(t);
+        return t;
+    }
+
+    @Override
+    public void prepareToCommit()
+    {
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                writer.prepareToCommit();
+    }
+
+    @Override
+    public void close()
+    {
+        for (SSTableMultiWriter writer : writers)
+            if (writer != null)
+                writer.close();
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/StaticController.java b/src/java/org/apache/cassandra/db/compaction/unified/StaticController.java
new file mode 100644
index 000000000000..1d5578127927
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/StaticController.java
@@ -0,0 +1,165 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.Arrays;
+import java.util.Map;
+
+import com.google.common.annotations.VisibleForTesting;
+
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.utils.MonotonicClock;
+
+/**
+ * The static compaction controller periodically checks the IO costs
+ * that result from the current configuration of the {@link UnifiedCompactionStrategy}.
+ * <p/>
+ * Its scaling parameters are fixed when the controller is created and are not adjusted afterwards.
+ */
+public class StaticController extends Controller
+{
+    /**
+     * The scaling parameters W, one per bucket index and separated by a comma.
+     * Higher indexes will use the value of the last index with a W specified.
+     */
+    final static String STATIC_SCALING_PARAMETERS_OPTION = "static_scaling_parameters";
+    private final static String DEFAULT_STATIC_SCALING_PARAMETERS = System.getProperty(PREFIX + STATIC_SCALING_PARAMETERS_OPTION, "2");
+
+    private final int[] scalingParameters;
+
+    @VisibleForTesting // comp. simulation
+    public StaticController(Environment env,
+                            int[] scalingParameters,
+                            double survivalFactor,
+                            long dataSetSizeMB,
+                            int numShards,
+                            long minSSTableSizeMB,
+                            long flushSizeOverrideMB,
+                            double maxSpaceOverhead,
+                            int maxSSTablesToCompact,
+                            long expiredSSTableCheckFrequency,
+                            boolean ignoreOverlapsInExpirationCheck)
+    {
+        super(MonotonicClock.preciseTime,
+              env,
+              survivalFactor,
+              dataSetSizeMB,
+              numShards,
+              minSSTableSizeMB,
+              flushSizeOverrideMB,
+              maxSpaceOverhead,
+              maxSSTablesToCompact,
+              expiredSSTableCheckFrequency,
+              ignoreOverlapsInExpirationCheck);
+        this.scalingParameters = scalingParameters;
+    }
+
+    /**
+     * Builds a static controller whose scaling parameters come from the
+     * {@link #STATIC_SCALING_PARAMETERS_OPTION} option, or from the default when the option is absent.
+     * All other arguments are handed to the constructor unchanged.
+     */
+    static Controller fromOptions(Environment env,
+                                  double survivalFactor,
+                                  long dataSetSizeMB,
+                                  int numShards,
+                                  long minSSTableSizeMB,
+                                  long flushSizeOverrideMB,
+                                  double maxSpaceOverhead,
+                                  int maxSSTablesToCompact,
+                                  long expiredSSTableCheckFrequency,
+                                  boolean ignoreOverlapsInExpirationCheck,
+                                  Map<String, String> options)
+    {
+        int[] Ws = parseScalingParameters(options.getOrDefault(STATIC_SCALING_PARAMETERS_OPTION, DEFAULT_STATIC_SCALING_PARAMETERS));
+        return new StaticController(env,
+                                    Ws,
+                                    survivalFactor,
+                                    dataSetSizeMB,
+                                    numShards,
+                                    minSSTableSizeMB,
+                                    flushSizeOverrideMB,
+                                    maxSpaceOverhead,
+                                    maxSSTablesToCompact,
+                                    expiredSSTableCheckFrequency,
+                                    ignoreOverlapsInExpirationCheck);
+    }
+
+    /**
+     * Parses a comma-separated list of integers, ignoring whitespace around each value.
+     *
+     * @throws NumberFormatException if any of the values is not a valid integer
+     */
+    @VisibleForTesting
+    static int[] parseScalingParameters(String str)
+    {
+        String[] vals = str.split(",");
+        int[] ret = new int[vals.length];
+        for (int i = 0; i < vals.length; i++)
+            ret[i] = Integer.parseInt(vals[i].trim());
+
+        return ret;
+    }
+
+    /**
+     * Removes the {@link #STATIC_SCALING_PARAMETERS_OPTION} option from the given map and checks that its value,
+     * if present, can be parsed.
+     *
+     * @return the same map, without the option handled here
+     * @throws ConfigurationException if the option is present but is not a comma-separated list of integers
+     */
+    public static Map<String, String> validateOptions(Map<String, String> options) throws ConfigurationException
+    {
+        String s = options.remove(STATIC_SCALING_PARAMETERS_OPTION);
+        if (s == null)
+            return options;
+
+        try
+        {
+            parseScalingParameters(s);
+        }
+        catch (NumberFormatException e)
+        {
+            // Surface malformed input as the validation failure this method declares, keeping the cause.
+            throw new ConfigurationException(String.format("Invalid value '%s' for %s: expected a comma-separated list of integers",
+                                                           s, STATIC_SCALING_PARAMETERS_OPTION), e);
+        }
+        return options;
+    }
+
+    /**
+     * Returns the scaling parameter for the given bucket index; indexes beyond the configured values
+     * use the last configured value.
+     *
+     * @throws IllegalArgumentException if the index is negative
+     */
+    @Override
+    public int getScalingParameter(int index)
+    {
+        if (index < 0)
+            throw new IllegalArgumentException("Index should be >= 0: " + index);
+
+        return index < scalingParameters.length ? scalingParameters[index] : scalingParameters[scalingParameters.length - 1];
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("Static controller, m: %d, o: %f, Ws: %s, cost: %s", minSstableSizeMB, survivalFactor, Arrays.toString(scalingParameters), calculator);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/unified/UnifiedCompactionTask.java b/src/java/org/apache/cassandra/db/compaction/unified/UnifiedCompactionTask.java
new file mode 100644
index 000000000000..7df03fe970f2
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/compaction/unified/UnifiedCompactionTask.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.List;
+import java.util.Set;
+
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.compaction.CompactionTask;
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+
+/**
+ * A {@link CompactionTask} whose sole purpose is currently to create a {@link ShardedCompactionWriter} for its output.
+ */
+public class UnifiedCompactionTask extends CompactionTask
+{
+    private final long minSstableSizeInBytes; // handed unchanged to the ShardedCompactionWriter
+    private final List<PartitionPosition> boundaries; // handed unchanged to the ShardedCompactionWriter
+
+    public UnifiedCompactionTask(ColumnFamilyStore cfs,
+                                 UnifiedCompactionStrategy strategy,
+                                 LifecycleTransaction txn,
+                                 int gcBefore,
+                                 long minSstableSizeInBytes,
+                                 List<PartitionPosition> boundaries)
+    {
+        super(cfs, txn, gcBefore, strategy.getController().getIgnoreOverlapsInExpirationCheck(), strategy);
+        this.minSstableSizeInBytes = minSstableSizeInBytes;
+        this.boundaries = boundaries;
+    }
+
+    @Override
+    public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
+                                                          Directories directories,
+                                                          LifecycleTransaction txn,
+                                                          Set<SSTableReader> nonExpiredSSTables)
+    {
+        return new ShardedCompactionWriter(cfs, directories, txn, nonExpiredSSTables, keepOriginals, minSstableSizeInBytes, boundaries);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
index 55ccb3260380..61e97ac09717 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java
@@ -18,7 +18,6 @@
 
 package org.apache.cassandra.db.compaction.writers;
 
-import java.io.File;
 import java.util.Collection;
 import java.util.List;
 import java.util.Set;
@@ -32,15 +31,18 @@
 import org.apache.cassandra.db.Directories;
 import org.apache.cassandra.db.DiskBoundaries;
 import org.apache.cassandra.db.PartitionPosition;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
+import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.CompactionTask;
+import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.Descriptor;
 import org.apache.cassandra.io.sstable.SSTableRewriter;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.format.SSTableWriter;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.concurrent.Transactional;
-import org.apache.cassandra.db.compaction.OperationType;
 
 
 /**
@@ -65,6 +67,7 @@ public abstract class CompactionAwareWriter extends Transactional.AbstractTransa
     private final List<Directories.DataDirectory> locations;
     private final List<PartitionPosition> diskBoundaries;
     private int locationIndex;
+    protected Directories.DataDirectory currentDirectory;
 
     public CompactionAwareWriter(ColumnFamilyStore cfs,
                                  Directories directories,
@@ -84,7 +87,7 @@ public CompactionAwareWriter(ColumnFamilyStore cfs,
         pendingRepair = CompactionTask.getPendingRepair(nonExpiredSSTables);
         isTransient = CompactionTask.getIsTransient(nonExpiredSSTables);
         DiskBoundaries db = cfs.getDiskBoundaries();
-        diskBoundaries = db.positions;
+        diskBoundaries = db.getPositions();
         locations = db.directories;
         locationIndex = -1;
     }
@@ -131,10 +134,10 @@ public long estimatedKeys()
      * @param partition the partition to append
      * @return true if the partition was written, false otherwise
      */
-    public final boolean append(UnfilteredRowIterator partition)
+    public boolean append(UnfilteredRowIterator partition)
     {
         maybeSwitchWriter(partition.partitionKey());
-        return realAppend(partition);
+        return sstableWriter.append(partition) != null;
     }
 
     @Override
@@ -144,43 +147,87 @@ protected Throwable doPostCleanup(Throwable accumulate)
         return super.doPostCleanup(accumulate);
     }
 
-    protected abstract boolean realAppend(UnfilteredRowIterator partition);
-
     /**
+     * Switches the writer if necessary, i.e. if the new key should be placed in a different data directory, or if the
+     * specific strategy has decided a new sstable is needed.
+     *
      * Guaranteed to be called before the first call to realAppend.
-     * @param key
      */
     protected void maybeSwitchWriter(DecoratedKey key)
+    {
+        if (maybeSwitchLocation(key))
+            return;
+
+        if (shouldSwitchWriterInCurrentLocation(key))
+            switchCompactionWriter(currentDirectory);
+    }
+
+    /**
+     * Switches the file location and writer and returns true if the new key should be placed in a different data
+     * directory.
+     */
+    protected boolean maybeSwitchLocation(DecoratedKey key)
     {
         if (diskBoundaries == null)
         {
             if (locationIndex < 0)
             {
                 Directories.DataDirectory defaultLocation = getWriteDirectory(nonExpiredSSTables, cfs.getExpectedCompactedFileSize(nonExpiredSSTables, OperationType.UNKNOWN));
-                switchCompactionLocation(defaultLocation);
+                switchCompactionWriter(defaultLocation);
                 locationIndex = 0;
+                return true;
             }
-            return;
+            return false;
         }
 
         if (locationIndex > -1 && key.compareTo(diskBoundaries.get(locationIndex)) < 0)
-            return;
+            return false;
 
         int prevIdx = locationIndex;
         while (locationIndex == -1 || key.compareTo(diskBoundaries.get(locationIndex)) > 0)
             locationIndex++;
+        Directories.DataDirectory newLocation = locations.get(locationIndex);
         if (prevIdx >= 0)
-            logger.debug("Switching write location from {} to {}", locations.get(prevIdx), locations.get(locationIndex));
-        switchCompactionLocation(locations.get(locationIndex));
+            logger.debug("Switching write location from {} to {}", locations.get(prevIdx), newLocation);
+        switchCompactionWriter(newLocation);
+        return true;
     }
 
+    /**
+     * Returns true if the writer should be switched for reasons other than switching to a new data directory
+     * (e.g. because an sstable size limit has been reached).
+     */
+    protected abstract boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key);
+
     /**
      * Implementations of this method should finish the current sstable writer and start writing to this directory.
      *
      * Called once before starting to append and then whenever we see a need to start writing to another directory.
      * @param directory
      */
-    protected abstract void switchCompactionLocation(Directories.DataDirectory directory);
+    protected void switchCompactionWriter(Directories.DataDirectory directory)
+    {
+        currentDirectory = directory;
+        PartitionPosition diskBoundary = diskBoundaries != null && locationIndex > -1
+                                         ? diskBoundaries.get(locationIndex)
+                                         : null;
+        sstableWriter.switchWriter(sstableWriter(directory, diskBoundary));
+    }
+
+    @SuppressWarnings("resource")
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
+    {
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    estimatedTotalKeys,
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator),
+                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
+    }
 
     /**
      * The directories we can write to
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
index f74c7532383c..d51184f336b6 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java
@@ -24,9 +24,10 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
@@ -59,26 +60,25 @@ public DefaultCompactionWriter(ColumnFamilyStore cfs, Directories directories, L
     }
 
     @Override
-    public boolean realAppend(UnfilteredRowIterator partition)
+    protected boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key)
     {
-        return sstableWriter.append(partition) != null;
+        return false;
     }
 
+    @SuppressWarnings("resource")
     @Override
-    public void switchCompactionLocation(Directories.DataDirectory directory)
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
     {
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
-                                                    estimatedTotalKeys,
-                                                    minRepairedAt,
-                                                    pendingRepair,
-                                                    isTransient,
-                                                    cfs.metadata,
-                                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator, sstableLevel),
-                                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexGroups(),
-                                                    txn);
-        sstableWriter.switchWriter(writer);
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    estimatedTotalKeys,
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator, sstableLevel),
+                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
index ac4dd64f1f04..240ccc504a1c 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java
@@ -20,9 +20,10 @@
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.io.sstable.format.RowIndexEntry;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.compaction.LeveledManifest;
@@ -39,7 +40,6 @@ public class MajorLeveledCompactionWriter extends CompactionAwareWriter
     private long totalWrittenInLevel = 0;
     private int sstablesWritten = 0;
     private final long keysPerSSTable;
-    private Directories.DataDirectory sstableDirectory;
     private final int levelFanoutSize;
 
     public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
@@ -67,11 +67,15 @@ public MajorLeveledCompactionWriter(ColumnFamilyStore cfs,
     }
 
     @Override
-    @SuppressWarnings("resource")
-    public boolean realAppend(UnfilteredRowIterator partition)
+    public boolean append(UnfilteredRowIterator partition)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
         partitionsWritten++;
+        return super.append(partition);
+    }
+
+    @Override
+    protected boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key)
+    {
         long totalWrittenInCurrentWriter = sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten();
         if (totalWrittenInCurrentWriter > maxSSTableSize)
         {
@@ -81,28 +85,34 @@ public boolean realAppend(UnfilteredRowIterator partition)
                 totalWrittenInLevel = 0;
                 currentLevel++;
             }
-            switchCompactionLocation(sstableDirectory);
+            return true;
         }
-        return rie != null;
+        return false;
 
     }
 
     @Override
-    public void switchCompactionLocation(Directories.DataDirectory location)
+    public void switchCompactionWriter(Directories.DataDirectory location)
     {
-        this.sstableDirectory = location;
         averageEstimatedKeysPerSSTable = Math.round(((double) averageEstimatedKeysPerSSTable * sstablesWritten + partitionsWritten) / (sstablesWritten + 1));
-        sstableWriter.switchWriter(SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(sstableDirectory)),
-                keysPerSSTable,
-                minRepairedAt,
-                pendingRepair,
-                isTransient,
-                cfs.metadata,
-                new MetadataCollector(txn.originals(), cfs.metadata().comparator, currentLevel),
-                SerializationHeader.make(cfs.metadata(), txn.originals()),
-                cfs.indexManager.listIndexGroups(),
-                txn));
         partitionsWritten = 0;
         sstablesWritten = 0;
+        super.switchCompactionWriter(location);
+    }
+
+    @Override
+    @SuppressWarnings("resource")
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
+    {
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    keysPerSSTable,
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator, currentLevel),
+                                    SerializationHeader.make(cfs.metadata(), txn.originals()),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
index 9fd7531127db..a094d4253102 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java
@@ -20,11 +20,11 @@
 import java.util.Set;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.SerializationHeader;
 import org.apache.cassandra.db.compaction.OperationType;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
@@ -35,8 +35,6 @@ public class MaxSSTableSizeWriter extends CompactionAwareWriter
     private final long maxSSTableSize;
     private final int level;
     private final long estimatedSSTables;
-    private final Set<SSTableReader> allSSTables;
-    private Directories.DataDirectory sstableDirectory;
 
     public MaxSSTableSizeWriter(ColumnFamilyStore cfs,
                                 Directories directories,
@@ -57,7 +55,6 @@ public MaxSSTableSizeWriter(ColumnFamilyStore cfs,
                                 boolean keepOriginals)
     {
         super(cfs, directories, txn, nonExpiredSSTables, keepOriginals);
-        this.allSSTables = txn.originals();
         this.level = level;
         this.maxSSTableSize = maxSSTableSize;
 
@@ -79,32 +76,24 @@ private static long getTotalWriteSize(Iterable<SSTableReader> nonExpiredSSTables
         return Math.round(estimatedCompactionRatio * cfs.getExpectedCompactedFileSize(nonExpiredSSTables, compactionType));
     }
 
-    protected boolean realAppend(UnfilteredRowIterator partition)
+    @Override
+    protected boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
-        if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > maxSSTableSize)
-        {
-            switchCompactionLocation(sstableDirectory);
-        }
-        return rie != null;
+        return sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > maxSSTableSize;
     }
 
     @Override
-    public void switchCompactionLocation(Directories.DataDirectory location)
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
     {
-        sstableDirectory = location;
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(sstableDirectory)),
-                                                    estimatedTotalKeys / estimatedSSTables,
-                                                    minRepairedAt,
-                                                    pendingRepair,
-                                                    isTransient,
-                                                    cfs.metadata,
-                                                    new MetadataCollector(allSSTables, cfs.metadata().comparator, level),
-                                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexGroups(),
-                                                    txn);
-
-        sstableWriter.switchWriter(writer);
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    estimatedTotalKeys / estimatedSSTables,
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(txn.originals(), cfs.metadata().comparator, level),
+                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
index 0199bc03b7f7..c389d81186f7 100644
--- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
+++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java
@@ -24,10 +24,10 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Directories;
-import org.apache.cassandra.io.sstable.format.RowIndexEntry;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.io.sstable.format.SSTableWriter;
@@ -49,7 +49,6 @@ public class SplittingSizeTieredCompactionWriter extends CompactionAwareWriter
     private final Set<SSTableReader> allSSTables;
     private long currentBytesToWrite;
     private int currentRatioIndex = 0;
-    private Directories.DataDirectory location;
 
     public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set<SSTableReader> nonExpiredSSTables)
     {
@@ -84,36 +83,33 @@ public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories di
     }
 
     @Override
-    public boolean realAppend(UnfilteredRowIterator partition)
+    protected boolean shouldSwitchWriterInCurrentLocation(DecoratedKey key)
     {
-        RowIndexEntry rie = sstableWriter.append(partition);
         if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect
         {
             currentRatioIndex++;
             currentBytesToWrite = Math.round(totalSize * ratios[currentRatioIndex]);
-            switchCompactionLocation(location);
             logger.debug("Switching writer, currentBytesToWrite = {}", currentBytesToWrite);
+            return true;
         }
-        return rie != null;
+        return false;
     }
 
     @Override
-    public void switchCompactionLocation(Directories.DataDirectory location)
+    protected SSTableWriter sstableWriter(Directories.DataDirectory directory, PartitionPosition diskBoundary)
     {
-        this.location = location;
         long currentPartitionsToWrite = Math.round(ratios[currentRatioIndex] * estimatedTotalKeys);
-        @SuppressWarnings("resource")
-        SSTableWriter writer = SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(location)),
-                                                    currentPartitionsToWrite,
-                                                    minRepairedAt,
-                                                    pendingRepair,
-                                                    isTransient,
-                                                    cfs.metadata,
-                                                    new MetadataCollector(allSSTables, cfs.metadata().comparator, 0),
-                                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
-                                                    cfs.indexManager.listIndexGroups(),
-                                                    txn);
         logger.trace("Switching writer, currentPartitionsToWrite = {}", currentPartitionsToWrite);
-        sstableWriter.switchWriter(writer);
+
+        return SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)),
+                                    currentPartitionsToWrite,
+                                    minRepairedAt,
+                                    pendingRepair,
+                                    isTransient,
+                                    cfs.metadata,
+                                    new MetadataCollector(allSSTables, cfs.metadata().comparator, 0),
+                                    SerializationHeader.make(cfs.metadata(), nonExpiredSSTables),
+                                    cfs.indexManager.listIndexGroups(),
+                                    txn);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
index 78f5453db2cc..336a5774de9f 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Helpers.java
@@ -22,6 +22,7 @@
 import com.google.common.base.Predicate;
 import com.google.common.collect.*;
 
+import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.io.sstable.SSTable;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.Throwables;
@@ -54,7 +55,7 @@ static <T> Map<T, T> replace(Map<T, T> original, Set<? extends T> remove, Iterab
     {
         // ensure the ones being removed are the exact same ones present
         for (T reader : remove)
-            assert original.get(reader) == reader;
+            assert original.get(reader) == reader : String.format("%s not found in original set: %s", reader, original);
 
         // ensure we don't already contain any we're adding, that we aren't also removing
         assert !any(add, and(not(in(remove)), in(original.keySet()))) : String.format("original:%s remove:%s add:%s", original.keySet(), remove, add);
@@ -72,10 +73,10 @@ static <T> Map<T, T> replace(Map<T, T> original, Set<? extends T> remove, Iterab
      * A convenience method for encapsulating this action over multiple SSTableReader with exception-safety
      * @return accumulate if not null (with any thrown exception attached), or any thrown exception otherwise
      */
-    static void setupOnline(Iterable<SSTableReader> readers)
+    static void setupOnline(ColumnFamilyStore cfs, Iterable<SSTableReader> readers)
     {
         for (SSTableReader reader : readers)
-            reader.setupOnline();
+            reader.setupOnline(cfs);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
index 99da6f9d95b0..720f1ebf93eb 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/LifecycleTransaction.java
@@ -321,6 +321,12 @@ public boolean isOffline()
         return tracker.isDummy();
     }
 
+    @VisibleForTesting
+    public void unsafeClose()
+    {
+        log.close();
+    }
+
     /**
      * call when a consistent batch of changes is ready to be made atomically visible
      * these will be exposed in the Tracker atomically, or an exception will be thrown; in this case
diff --git a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java
index 61fab98a0082..9315e8b35176 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/SSTableIntervalTree.java
@@ -54,7 +54,7 @@ public static List<Interval<PartitionPosition, SSTableReader>> buildIntervals(It
     {
         List<Interval<PartitionPosition, SSTableReader>> intervals = new ArrayList<>(Iterables.size(sstables));
         for (SSTableReader sstable : sstables)
-            intervals.add(Interval.<PartitionPosition, SSTableReader>create(sstable.first, sstable.last, sstable));
+            intervals.add(Interval.create(sstable.getFirst(), sstable.getLast(), sstable));
         return intervals;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
index 26f2b4ccec8f..3c2b4b586434 100644
--- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
+++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java
@@ -222,7 +222,7 @@ private void addSSTablesInternal(Iterable<SSTableReader> sstables,
                                      boolean updateSize)
     {
         if (!isDummy())
-            setupOnline(sstables);
+            setupOnline(cfstore, sstables);
         apply(updateLiveSet(emptySet(), sstables));
         if(updateSize)
             maybeFail(updateSizeTracking(emptySet(), sstables, null));
@@ -373,7 +373,7 @@ public void replaceFlushed(Memtable memtable, Iterable<SSTableReader> sstables)
             return;
         }
 
-        sstables.forEach(SSTableReader::setupOnline);
+        setupOnline(cfstore, sstables);
         // back up before creating a new Snapshot (which makes the new one eligible for compaction)
         maybeIncrementallyBackup(sstables);
 
@@ -412,6 +412,11 @@ public Iterable<? extends SSTableReader> getNoncompacting(Iterable<? extends SST
         return view.get().getNoncompacting(candidates);
     }
 
+    public Set<SSTableReader> getLiveSSTables()
+    {
+        return view.get().liveSSTables();
+    }
+
     public void maybeIncrementallyBackup(final Iterable<SSTableReader> sstables)
     {
         if (!DatabaseDescriptor.isIncrementalBackupsEnabled())
@@ -528,6 +533,8 @@ public boolean isDummy()
     public void subscribe(INotificationConsumer consumer)
     {
         subscribers.add(consumer);
+        if (logger.isTraceEnabled())
+            logger.trace("{} subscribed to the data tracker.", consumer);
     }
 
     public void unsubscribe(INotificationConsumer consumer)
@@ -548,6 +555,12 @@ public View getView()
     @VisibleForTesting
     public void removeUnsafe(Set<SSTableReader> toRemove)
     {
-        Pair<View, View> result = apply(view -> updateLiveSet(toRemove, emptySet()).apply(view));
+        apply(view -> updateLiveSet(toRemove, emptySet()).apply(view));
+    }
+
+    @VisibleForTesting
+    public void removeCompactingUnsafe(Set<SSTableReader> toRemove)
+    {
+        apply(view -> updateCompacting(toRemove, emptySet()).apply(view));
     }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/Flushing.java b/src/java/org/apache/cassandra/db/memtable/Flushing.java
index 8332a1bc9d1e..753349fa0d47 100644
--- a/src/java/org/apache/cassandra/db/memtable/Flushing.java
+++ b/src/java/org/apache/cassandra/db/memtable/Flushing.java
@@ -66,7 +66,7 @@ public static List<FlushRunnable> flushRunnables(ColumnFamilyStore cfs,
                                  cfs.name);
 
         DiskBoundaries diskBoundaries = cfs.getDiskBoundaries();
-        List<PartitionPosition> boundaries = diskBoundaries.positions;
+        List<PartitionPosition> boundaries = diskBoundaries.getPositions();
         List<Directories.DataDirectory> locations = diskBoundaries.directories;
         return flushRunnables(cfs, memtable, boundaries, locations, txn);
     }
@@ -185,6 +185,7 @@ private void writeSortedContents()
                 return;
             }
 
+            long before = System.nanoTime();
             logger.debug("Writing {}, flushed range = ({}, {}], state: {}",
                          toFlush.memtable().toString(), toFlush.from(), toFlush.to(), state);
 
@@ -222,13 +223,19 @@ private void writeSortedContents()
                     {
                         if (logCompletion)
                         {
-                            long bytesFlushed = writer.getFilePointer();
-                            logger.info("Completed flushing {} ({}) for commitlog position {}",
+                            long bytesFlushed = writer.getBytesWritten();
+                            long onDiskBytesWritten = writer.getOnDiskBytesWritten();
+                            long segmentCount = writer.getSegmentCount();
+                            logger.info("Completed flushing {} ({}/{} on disk/{} files) for commitlog position {}",
                                         writer.getFilename(),
                                         FBUtilities.prettyPrintMemory(bytesFlushed),
+                                        FBUtilities.prettyPrintMemory(onDiskBytesWritten),
+                                        segmentCount,
                                         toFlush.memtable().getFinalCommitLogUpperBound());
                             // Update the metrics
-                            metrics.bytesFlushed.inc(bytesFlushed);
+                            metrics.incBytesFlushed(toFlush.memtable().getLiveDataSize(), bytesFlushed, System.nanoTime() - before);
+                            metrics.flushSizeOnDisk.update(onDiskBytesWritten);
+                            metrics.flushSegmentCount.update(segmentCount);
                         }
 
                         break;
diff --git a/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java b/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
index 87d283527f97..720dae33d94f 100644
--- a/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
+++ b/src/java/org/apache/cassandra/db/memtable/ShardBoundaries.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.dht.Token;
 
 /**
@@ -37,7 +38,7 @@
  */
 public class ShardBoundaries
 {
-    private static final Token[] EMPTY_TOKEN_ARRAY = new Token[0];
+    private static final PartitionPosition[] EMPTY_TOKEN_ARRAY = new PartitionPosition[0];
 
     // Special boundaries that map all tokens to one shard.
     // These boundaries will be used in either of these cases:
@@ -46,17 +47,17 @@ public class ShardBoundaries
     // - the keyspace is local system keyspace
     public static final ShardBoundaries NONE = new ShardBoundaries(EMPTY_TOKEN_ARRAY, -1);
 
-    private final Token[] boundaries;
+    private final PartitionPosition[] boundaries;
     public final long ringVersion;
 
     @VisibleForTesting
-    public ShardBoundaries(Token[] boundaries, long ringVersion)
+    public ShardBoundaries(PartitionPosition[] boundaries, long ringVersion)
     {
         this.boundaries = boundaries;
         this.ringVersion = ringVersion;
     }
 
-    public ShardBoundaries(List<Token> boundaries, long ringVersion)
+    public ShardBoundaries(List<PartitionPosition> boundaries, long ringVersion)
     {
         this(boundaries.toArray(EMPTY_TOKEN_ARRAY), ringVersion);
     }
@@ -68,7 +69,7 @@ public int getShardForToken(Token tk)
     {
         for (int i = 0; i < boundaries.length; i++)
         {
-            if (tk.compareTo(boundaries[i]) < 0)
+            if (tk.compareTo(boundaries[i].getToken()) < 0)
                 return i;
         }
         return boundaries.length;
diff --git a/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java b/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java
index 983e30f20728..dc975417cf72 100644
--- a/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java
+++ b/src/java/org/apache/cassandra/db/repair/CassandraTableRepairManager.java
@@ -20,6 +20,7 @@
 
 import java.io.IOException;
 import java.util.Collection;
+import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Future;
@@ -28,13 +29,17 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.RepairFinishedCompactionTask;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
 import org.apache.cassandra.dht.Bounds;
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.repair.TableRepairManager;
 import org.apache.cassandra.repair.ValidationPartitionIterator;
-import org.apache.cassandra.repair.Validator;
+import org.apache.cassandra.repair.consistent.LocalSessions;
+import org.apache.cassandra.service.ActiveRepairService;
 
 public class CassandraTableRepairManager implements TableRepairManager
 {
@@ -58,9 +63,37 @@ public Future<?> submitValidation(Callable<Object> validation)
     }
 
     @Override
-    public void incrementalSessionCompleted(UUID sessionID)
+    public synchronized void incrementalSessionCompleted(UUID sessionID)
     {
-        CompactionManager.instance.submitBackground(cfs);
+        LocalSessions sessions = ActiveRepairService.instance.consistent.local;
+        if (sessions.isSessionInProgress(sessionID))
+            return;
+
+        Set<SSTableReader> pendingRepairSSTables = cfs.getPendingRepairSSTables(sessionID);
+        if (pendingRepairSSTables.isEmpty())
+            return;
+
+        LifecycleTransaction txn = cfs.getTracker().tryModify(pendingRepairSSTables, OperationType.COMPACTION);
+        if (txn == null)
+            return;
+
+        boolean isTransient = false;
+        for (SSTableReader sstable : pendingRepairSSTables)
+        {
+            if (sstable.isTransient())
+            {
+                isTransient = true;
+                break;
+            }
+        }
+
+        long repairedAt = sessions.getFinalSessionRepairedAt(sessionID);
+        RepairFinishedCompactionTask task = new RepairFinishedCompactionTask(cfs,
+                                                                             txn,
+                                                                             sessionID,
+                                                                             repairedAt,
+                                                                             isTransient);
+        task.run();
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
index add2c9a80792..81067ef7bbcc 100644
--- a/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
+++ b/src/java/org/apache/cassandra/db/repair/CassandraValidationIterator.java
@@ -38,7 +38,6 @@
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionController;
 import org.apache.cassandra.db.compaction.CompactionIterator;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -49,6 +48,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.repair.ValidationPartitionIterator;
 import org.apache.cassandra.schema.TableMetadata;
@@ -164,7 +164,7 @@ else if (isIncremental)
     private final boolean isGlobalSnapshotValidation;
 
     private final boolean isSnapshotValidation;
-    private final AbstractCompactionStrategy.ScannerList scanners;
+    private final ScannerList scanners;
     private final ValidationCompactionController controller;
 
     private final CompactionIterator ci;
@@ -221,7 +221,7 @@ public CassandraValidationIterator(ColumnFamilyStore cfs, Collection<Range<Token
         }
 
         controller = new ValidationCompactionController(cfs, getDefaultGcBefore(cfs, nowInSec));
-        scanners = cfs.getCompactionStrategyManager().getScanners(sstables, ranges);
+        scanners = cfs.getCompactionStrategyContainer().getScanners(sstables, ranges);
         ci = new ValidationCompactionIterator(scanners.scanners, controller, nowInSec);
 
         long allPartitions = 0;
diff --git a/src/java/org/apache/cassandra/dht/Range.java b/src/java/org/apache/cassandra/dht/Range.java
index 5b2f3d9fbf16..50cf4bc7ff2a 100644
--- a/src/java/org/apache/cassandra/dht/Range.java
+++ b/src/java/org/apache/cassandra/dht/Range.java
@@ -201,6 +201,38 @@ public Set<Range<T>> intersectionWith(Range<T> that)
         return intersectionOneWrapping(that, this);
     }
 
+    /**
+     * Returns the intersection of this range with the provided one, assuming neither is wrapping.
+     *
+     * @param that the other range to return the intersection with. It must not be wrapping.
+     * @return the intersection of {@code this} and {@code that}, or {@code null} if the two ranges don't intersect.
+     */
+    public Range<T> intersectionNonWrapping(Range<T> that)
+    {
+        assert !isTrulyWrapAround() && !that.isTrulyWrapAround() : this + " and " + that;
+
+        if (left.compareTo(that.left) < 0)
+        {
+            if (right.isMinimum() || (!that.right.isMinimum() && right.compareTo(that.right) >= 0))
+                return that;  // this contains that.
+
+            if (right.compareTo(that.left) <= 0)
+                return null;  // this is fully before that.
+
+            return new Range<>(that.left, right);
+        }
+        else
+        {
+            if (that.right.isMinimum() || (!right.isMinimum() && that.right.compareTo(right) >= 0))
+                return this;  // that contains this.
+
+            if (that.right.compareTo(left) <= 0)
+                return null;  // that is fully before this.
+
+            return new Range<>(left, that.right);
+        }
+    }
+
     private static <T extends RingPosition<T>> Set<Range<T>> intersectionBothWrapping(Range<T> first, Range<T> that)
     {
         Set<Range<T>> intersection = new HashSet<Range<T>>(2);
@@ -456,6 +488,32 @@ public boolean isWrapAround()
         return isWrapAround(left, right);
     }
 
+    /**
+     * Checks if the range truly wraps around.
+     *
+     * This exists only because {@link #isWrapAround()} is a tad dumb and returns true if right is the minimum token,
+     * no matter what left is, but for most intents and purposes, such a range doesn't truly wrap around (unwrap produces
+     * the identity in this case).
+     * <p>
+     * Also note that it could be that the remaining uses of {@link #isWrapAround()} could be replaced by this method,
+     * but that is to be checked carefully at some other time (Sylvain).
+     * <p>
+     * The one thing this method guarantees is that if it's true, then {@link #unwrap()} will return a list with
+     * exactly 2 ranges, never one.
+     *
+     * @return whether the range truly wraps around.
+     */
+    public boolean isTrulyWrapAround()
+    {
+        return isTrulyWrapAround(left, right);
+    }
+
+    public static <T extends RingPosition<T>> boolean isTrulyWrapAround(T left, T right)
+    {
+        T minValue = right.minValue();
+        return isWrapAround(left, right) && !right.equals(minValue);
+    }
+
     /**
      * @return A copy of the given list of with all ranges unwrapped, sorted by left bound and with overlapping bounds merged.
      */
diff --git a/src/java/org/apache/cassandra/dht/Splitter.java b/src/java/org/apache/cassandra/dht/Splitter.java
index 857844843461..07709514ab09 100644
--- a/src/java/org/apache/cassandra/dht/Splitter.java
+++ b/src/java/org/apache/cassandra/dht/Splitter.java
@@ -118,10 +118,75 @@ public double positionInRange(Token token, Range<Token> range)
         return new BigDecimal(elapsedTokens(token, range)).divide(new BigDecimal(tokensInRange(range)), 3, BigDecimal.ROUND_HALF_EVEN).doubleValue();
     }
 
-    public List<Token> splitOwnedRanges(int parts, List<WeightedRange> weightedRanges, boolean dontSplitRanges)
+    /**
+     * How local ranges should be split
+     */
+    public enum SplitType
+    {
+        /** Local ranges should always be split, without attempting to keep them whole */
+        ALWAYS_SPLIT,
+        /** A first pass will try to avoid splitting ranges, but if there aren't enough parts,
+         * then ranges will be split in a second pass.
+         */
+        PREFER_WHOLE,
+        /** Ranges should never be split */
+        ONLY_WHOLE
+    }
+
+    /**
+     * The result of a split operation, this is just a wrapper of the boundaries and the type
+     * of split that was done, i.e. if the local ranges were split or not. This is just so that
+     * we can test the algorithm.
+     */
+    public final static class SplitResult
+    {
+        public final List<Token> boundaries;
+        public final boolean rangesWereSplit;
+
+        SplitResult(List<Token> boundaries, boolean rangesWereSplit)
+        {
+            this.boundaries = boundaries;
+            this.rangesWereSplit = rangesWereSplit;
+        }
+
+        @Override
+        public boolean equals(Object o)
+        {
+            if (this == o)
+                return true;
+
+            if (!(o instanceof SplitResult))
+                return false;
+
+            SplitResult splitResult = (SplitResult) o;
+            return Objects.equals(boundaries, splitResult.boundaries)
+                   && Objects.equals(rangesWereSplit, splitResult.rangesWereSplit);
+        }
+
+        @Override
+        public int hashCode()
+        {
+            return Objects.hash(boundaries, rangesWereSplit);
+        }
+    }
+
+    /**
+     * Split the local ranges into the specified number of parts.
+     *
+     * Depending on the parameter {@code splitType}, it may attempt to only merge the local ranges first,
+     * to see if this is sufficient to cover the requested number of parts. If it's not, it will then split
+     * existing ranges.
+     *
+     * @param parts the number of parts
+     * @param weightedRanges the local ranges owned by this node
+     * @param splitType how local ranges should be split, see {@link SplitType}
+     *
+     * @return the split result, which contains a list of tokens, one per part, and if the ranges were split or not
+     */
+    public SplitResult splitOwnedRanges(int parts, List<WeightedRange> weightedRanges, SplitType splitType)
     {
-        if (weightedRanges.isEmpty() || parts == 1)
-            return Collections.singletonList(partitioner.getMaximumToken());
+        if (weightedRanges.isEmpty() || parts <= 1)
+            return new SplitResult(Collections.singletonList(partitioner.getMaximumToken()), false);
 
         BigInteger totalTokens = BigInteger.ZERO;
         for (WeightedRange weightedRange : weightedRanges)
@@ -132,12 +197,23 @@ public List<Token> splitOwnedRanges(int parts, List<WeightedRange> weightedRange
         BigInteger perPart = totalTokens.divide(BigInteger.valueOf(parts));
         // the range owned is so tiny we can't split it:
         if (perPart.equals(BigInteger.ZERO))
-            return Collections.singletonList(partitioner.getMaximumToken());
-
-        if (dontSplitRanges)
-            return splitOwnedRangesNoPartialRanges(weightedRanges, perPart, parts);
+            return new SplitResult(Collections.singletonList(partitioner.getMaximumToken()), false);
 
         List<Token> boundaries = new ArrayList<>();
+
+        if (splitType != SplitType.ALWAYS_SPLIT)
+        {
+            // see if we can obtain a sufficient number of parts by only merging local ranges
+            boundaries = splitOwnedRangesNoPartialRanges(weightedRanges, perPart, parts);
+            // we were either able to obtain sufficient parts without splitting ranges or we should never split ranges
+            if (splitType == SplitType.ONLY_WHOLE || boundaries.size() == parts)
+                return new SplitResult(boundaries, false);
+            else
+                boundaries.clear();
+        }
+
+        // otherwise continue by splitting ranges
+
         BigInteger sum = BigInteger.ZERO;
         for (WeightedRange weightedRange : weightedRanges)
         {
@@ -156,7 +232,7 @@ public List<Token> splitOwnedRanges(int parts, List<WeightedRange> weightedRange
         boundaries.set(boundaries.size() - 1, partitioner.getMaximumToken());
 
         assert boundaries.size() == parts : boundaries.size() + "!=" + parts + " " + boundaries + ":" + weightedRanges;
-        return boundaries;
+        return new SplitResult(boundaries, true);
     }
 
     private List<Token> splitOwnedRangesNoPartialRanges(List<WeightedRange> weightedRanges, BigInteger perPart, int parts)
@@ -290,6 +366,11 @@ public Range<Token> range()
             return range;
         }
 
+        public double weight()
+        {
+            return weight;
+        }
+
         public String toString()
         {
             return "WeightedRange{" +
diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
index 7bd4d10f360e..c07a395416dd 100644
--- a/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
+++ b/src/java/org/apache/cassandra/index/sasi/SASIIndex.java
@@ -119,7 +119,7 @@ public SASIIndex(ColumnFamilyStore baseCfs, IndexMetadata config)
         SortedMap<SSTableReader, Map<ColumnMetadata, ColumnIndex>> toRebuild = new TreeMap<>((a, b)
                                                 -> Integer.compare(a.descriptor.generation, b.descriptor.generation));
 
-        for (SSTableReader sstable : index.init(tracker.getView().liveSSTables()))
+        for (SSTableReader sstable : index.init(tracker.getLiveSSTables()))
         {
             Map<ColumnMetadata, ColumnIndex> perSSTable = toRebuild.get(sstable);
             if (perSSTable == null)
diff --git a/src/java/org/apache/cassandra/io/sstable/BloomFilterTracker.java b/src/java/org/apache/cassandra/io/sstable/BloomFilterTracker.java
index 07523a001a92..801d7ed8d4c4 100644
--- a/src/java/org/apache/cassandra/io/sstable/BloomFilterTracker.java
+++ b/src/java/org/apache/cassandra/io/sstable/BloomFilterTracker.java
@@ -17,83 +17,147 @@
  */
 package org.apache.cassandra.io.sstable;
 
-import java.util.concurrent.atomic.AtomicLong;
+import com.codahale.metrics.Meter;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 
-public class BloomFilterTracker
+public abstract class BloomFilterTracker
 {
-    private final AtomicLong falsePositiveCount = new AtomicLong(0);
-    private final AtomicLong truePositiveCount = new AtomicLong(0);
-    private final AtomicLong trueNegativeCount = new AtomicLong(0);
-    private long lastFalsePositiveCount = 0L;
-    private long lastTruePositiveCount = 0L;
-    private long lastTrueNegativeCount = 0L;
-
-    public void addFalsePositive()
-    {
-        falsePositiveCount.incrementAndGet();
-    }
+    public abstract void addFalsePositive();
+    public abstract void addTruePositive();
+    public abstract void addTrueNegative();
+    public abstract long getFalsePositiveCount();
+    public abstract double getRecentFalsePositiveRate();
+    public abstract long getTruePositiveCount();
+    public abstract double getRecentTruePositiveRate();
+    public abstract long getTrueNegativeCount();
+    public abstract double getRecentTrueNegativeRate();
 
-    public void addTruePositive()
+    public static BloomFilterTracker createNoopTracker()
     {
-        truePositiveCount.incrementAndGet();
+        return NoopBloomFilterTracker.instance;
     }
 
-    public void addTrueNegative()
+    public static BloomFilterTracker createMeterTracker()
     {
-        trueNegativeCount.incrementAndGet();
+        return new MeterBloomFilterTracker();
     }
 
-    public long getFalsePositiveCount()
+    private static class MeterBloomFilterTracker extends BloomFilterTracker
     {
-        return falsePositiveCount.get();
-    }
+        private final Meter falsePositiveCount = new Meter();
+        private final Meter truePositiveCount = new Meter();
+        private final Meter trueNegativeCount = new Meter();
 
-    public long getRecentFalsePositiveCount()
-    {
-        long fpc = getFalsePositiveCount();
-        try
+        @Override
+        public void addFalsePositive()
         {
-            return (fpc - lastFalsePositiveCount);
+            falsePositiveCount.mark();
         }
-        finally
+
+        @Override
+        public void addTruePositive()
         {
-            lastFalsePositiveCount = fpc;
+            truePositiveCount.mark();
         }
-    }
 
-    public long getTruePositiveCount()
-    {
-        return truePositiveCount.get();
-    }
+        @Override
+        public void addTrueNegative()
+        {
+            trueNegativeCount.mark();
+        }
 
-    public long getRecentTruePositiveCount()
-    {
-        long tpc = getTruePositiveCount();
-        try
+        @Override
+        public long getFalsePositiveCount()
         {
-            return (tpc - lastTruePositiveCount);
+            return falsePositiveCount.getCount();
         }
-        finally
+
+        @Override
+        public double getRecentFalsePositiveRate()
         {
-            lastTruePositiveCount = tpc;
+            return falsePositiveCount.getFifteenMinuteRate();
         }
-    }
 
-    public long getTrueNegativeCount()
-    {
-        return trueNegativeCount.get();
+        @Override
+        public long getTruePositiveCount()
+        {
+            return truePositiveCount.getCount();
+        }
+
+        @Override
+        public double getRecentTruePositiveRate()
+        {
+            return truePositiveCount.getFifteenMinuteRate();
+        }
+
+        @Override
+        public long getTrueNegativeCount()
+        {
+            return trueNegativeCount.getCount();
+        }
+
+        @Override
+        public double getRecentTrueNegativeRate()
+        {
+            return trueNegativeCount.getFifteenMinuteRate();
+        }
     }
 
-    public long getRecentTrueNegativeCount()
+    /**
+     * Bloom filter tracker that does nothing and always returns 0 for all counters.
+     *
+     * Bloom Filter tracking is managed on the CFS level, so there is no reason to count anything if an SSTable does not
+     * belong (yet) to a CFS. This tracker is used initially on SSTableReaders and is overwritten during setup
+     * in {@link SSTableReader#setupOnline()} or {@link SSTableReader#setupOnline(ColumnFamilyStore)}.
+     */
+    private static class NoopBloomFilterTracker extends BloomFilterTracker
     {
-        long tnc = getTrueNegativeCount();
-        try
+        static final NoopBloomFilterTracker instance = new NoopBloomFilterTracker();
+
+        @Override
+        public void addFalsePositive() {}
+
+        @Override
+        public void addTruePositive() {}
+
+        @Override
+        public void addTrueNegative() {}
+
+        @Override
+        public long getFalsePositiveCount()
+        {
+            return 0;
+        }
+
+        @Override
+        public double getRecentFalsePositiveRate()
         {
-            return (tnc - lastTrueNegativeCount);
+            return 0;
         }
-        finally
+
+        @Override
+        public long getTruePositiveCount()
+        {
+            return 0;
+        }
+
+        @Override
+        public double getRecentTruePositiveRate()
+        {
+            return 0;
+        }
+
+        @Override
+        public long getTrueNegativeCount()
+        {
+            return 0;
+        }
+
+        @Override
+        public double getRecentTrueNegativeRate()
         {
-            lastTrueNegativeCount = tnc;
+            return 0;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java
index 1be79abf8ba8..0dee13259e1f 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableMultiWriter.java
@@ -43,7 +43,9 @@ public interface SSTableMultiWriter extends Transactional
     SSTableMultiWriter setOpenResult(boolean openResult);
 
     String getFilename();
-    long getFilePointer();
+    long getBytesWritten();
+    long getOnDiskBytesWritten();
+    int getSegmentCount();
     TableId getTableId();
 
     static void abortOrDie(SSTableMultiWriter writer)
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
index a10b9fc4dbcc..213520dc022e 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTableTxnWriter.java
@@ -62,11 +62,6 @@ public String getFilename()
         return writer.getFilename();
     }
 
-    public long getFilePointer()
-    {
-        return writer.getFilePointer();
-    }
-
     protected Throwable doCommit(Throwable accumulate)
     {
         return writer.commit(txn.commit(accumulate));
diff --git a/src/java/org/apache/cassandra/io/sstable/ScannerList.java b/src/java/org/apache/cassandra/io/sstable/ScannerList.java
new file mode 100644
index 000000000000..38ee3251769b
--- /dev/null
+++ b/src/java/org/apache/cassandra/io/sstable/ScannerList.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.io.sstable;
+
+import java.util.List;
+
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+
+public class ScannerList implements AutoCloseable
+{
+    public final List<ISSTableScanner> scanners;
+    public ScannerList(List<ISSTableScanner> scanners)
+    {
+        this.scanners = scanners;
+    }
+
+    public long getTotalBytesScanned()
+    {
+        long bytesScanned = 0L;
+        for (ISSTableScanner scanner : scanners)
+            bytesScanned += scanner.getBytesScanned();
+
+        return bytesScanned;
+    }
+
+    public long getTotalCompressedSize()
+    {
+        long compressedSize = 0;
+        for (int i=0, isize=scanners.size(); i<isize; i++)
+            compressedSize += scanners.get(i).getCompressedLengthInBytes();
+
+        return compressedSize;
+    }
+
+    public double getCompressionRatio()
+    {
+        double compressed = 0.0;
+        double uncompressed = 0.0;
+
+        for (int i=0, isize=scanners.size(); i<isize; i++)
+        {
+            @SuppressWarnings("resource")
+            ISSTableScanner scanner = scanners.get(i);
+            compressed += scanner.getCompressedLengthInBytes();
+            uncompressed += scanner.getLengthInBytes();
+        }
+
+        if (compressed == uncompressed || uncompressed == 0)
+            return MetadataCollector.NO_COMPRESSION_RATIO;
+
+        return compressed / uncompressed;
+    }
+
+    public void close()
+    {
+        ISSTableScanner.closeAllAndPropagate(scanners, null);
+    }
+}
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
index 93d9aa4695e2..6eaa1661ee8e 100644
--- a/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/SimpleSSTableMultiWriter.java
@@ -75,11 +75,21 @@ public String getFilename()
         return writer.getFilename();
     }
 
-    public long getFilePointer()
+    public long getBytesWritten()
     {
         return writer.getFilePointer();
     }
 
+    public long getOnDiskBytesWritten()
+    {
+        return writer.getEstimatedOnDiskBytesWritten();
+    }
+
+    public int getSegmentCount()
+    {
+        return 1;
+    }
+
     public TableId getTableId()
     {
         return writer.metadata().id;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/RangeAwareSSTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/RangeAwareSSTableWriter.java
index ef4deb719331..3720b86a592d 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/RangeAwareSSTableWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/RangeAwareSSTableWriter.java
@@ -67,7 +67,7 @@ public RangeAwareSSTableWriter(ColumnFamilyStore cfs, long estimatedKeys, long r
         this.format = format;
         this.lifecycleNewTracker = lifecycleNewTracker;
         this.header = header;
-        boundaries = db.positions;
+        boundaries = db.getPositions();
         if (boundaries == null)
         {
             Directories.DataDirectory localDir = cfs.getDirectories().getWriteableLocation(totalSize);
@@ -115,7 +115,7 @@ public Collection<SSTableReader> finish(long repairedAt, long maxDataAge, boolea
         currentWriter = null;
         for (SSTableMultiWriter writer : finishedWriters)
         {
-            if (writer.getFilePointer() > 0)
+            if (writer.getBytesWritten() > 0)
                 finishedReaders.addAll(writer.finish(repairedAt, maxDataAge, openResult));
             else
                 SSTableMultiWriter.abortOrDie(writer);
@@ -131,7 +131,7 @@ public Collection<SSTableReader> finish(boolean openResult)
         currentWriter = null;
         for (SSTableMultiWriter writer : finishedWriters)
         {
-            if (writer.getFilePointer() > 0)
+            if (writer.getBytesWritten() > 0)
                 finishedReaders.addAll(writer.finish(openResult));
             else
                 SSTableMultiWriter.abortOrDie(writer);
@@ -159,9 +159,27 @@ public String getFilename()
     }
 
     @Override
-    public long getFilePointer()
+    public long getBytesWritten()
     {
-        return currentWriter.getFilePointer();
+        long bytesWritten = currentWriter.getBytesWritten();
+        for (SSTableMultiWriter writer : finishedWriters)
+            bytesWritten += writer.getBytesWritten();
+        return bytesWritten;
+    }
+
+    @Override
+    public long getOnDiskBytesWritten()
+    {
+        long bytesWritten = currentWriter.getOnDiskBytesWritten();
+        for (SSTableMultiWriter writer : finishedWriters)
+            bytesWritten += writer.getOnDiskBytesWritten();
+        return bytesWritten;
+    }
+
+    @Override
+    public int getSegmentCount()
+    {
+        return finishedWriters.size() + 1;
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
index ada8d6e16d36..04f93eb20abd 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java
@@ -273,7 +273,7 @@ public enum OpenReason
 
     protected InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> keyCache;
 
-    protected final BloomFilterTracker bloomFilterTracker = new BloomFilterTracker();
+    private volatile BloomFilterTracker bloomFilterTracker = BloomFilterTracker.createNoopTracker();
 
     // technically isCompacted is not necessary since it should never be unreferenced unless it is also compacted,
     // but it seems like a good extra layer of protection against reference counting bugs to not delete data based on that alone
@@ -370,6 +370,31 @@ public static long getApproximateKeyCount(Iterable<? extends SSTableReader> ssta
         return count;
     }
 
+    /**
+     * The key cardinality estimator for the sstable, if it can be loaded.
+     *
+     * @return the sstable key cardinality estimator created during flush/compaction, or {@code null} if that estimator
+     * cannot be loaded for any reason.
+     */
+    @VisibleForTesting
+    public ICardinality keyCardinalityEstimator()
+    {
+        if (openReason == OpenReason.EARLY)
+            return null;
+
+        try
+        {
+            CompactionMetadata metadata = (CompactionMetadata) descriptor.getMetadataSerializer()
+                                                                         .deserialize(descriptor, MetadataType.COMPACTION);
+            return metadata == null ? null : metadata.cardinalityEstimator;
+        }
+        catch (IOException e)
+        {
+            logger.warn("Reading cardinality from Statistics.db failed for {}.", this, e);
+            return null;
+        }
+    }
+
     /**
      * Estimates how much of the keys we would keep if the sstables were compacted together
      */
@@ -646,6 +671,21 @@ public void run()
 
     }
 
+
+    /**
+     * Set the Bloom Filter tracker. The argument supplied is obtained
+     * from the property of the owning CFS.
+     **/
+    public void setBloomFilterTracker(BloomFilterTracker bloomFilterTracker)
+    {
+        this.bloomFilterTracker = bloomFilterTracker;
+    }
+
+    protected BloomFilterTracker getBloomFilterTracker()
+    {
+        return this.bloomFilterTracker;
+    }
+
     /**
      * Open a RowIndexedReader which already has its state initialized (by SSTableWriter).
      */
@@ -781,6 +821,12 @@ public String getFilename()
     }
 
     public void setupOnline()
+    {
+        final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata().id);
+        setupOnline(cfs);
+    }
+
+    public void setupOnline(ColumnFamilyStore cfs)
     {
         // under normal operation we can do this at any time, but SSTR is also used outside C* proper,
         // e.g. by BulkLoader, which does not initialize the cache.  As a kludge, we set up the cache
@@ -789,9 +835,11 @@ public void setupOnline()
         if (maybeKeyCache.getCapacity() > 0)
             keyCache = maybeKeyCache;
 
-        final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata().id);
         if (cfs != null)
+        {
             setCrcCheckChance(cfs.getCrcCheckChance());
+            setBloomFilterTracker(cfs.getBloomFilterTracker());
+        }
     }
 
     /**
@@ -1852,36 +1900,6 @@ final static class GreaterThan extends Operator
         }
     }
 
-    public long getBloomFilterFalsePositiveCount()
-    {
-        return bloomFilterTracker.getFalsePositiveCount();
-    }
-
-    public long getRecentBloomFilterFalsePositiveCount()
-    {
-        return bloomFilterTracker.getRecentFalsePositiveCount();
-    }
-
-    public long getBloomFilterTruePositiveCount()
-    {
-        return bloomFilterTracker.getTruePositiveCount();
-    }
-
-    public long getRecentBloomFilterTruePositiveCount()
-    {
-        return bloomFilterTracker.getRecentTruePositiveCount();
-    }
-
-    public long getBloomFilterTrueNegativeCount()
-    {
-        return bloomFilterTracker.getTrueNegativeCount();
-    }
-
-    public long getRecentBloomFilterTrueNegativeCount()
-    {
-        return bloomFilterTracker.getRecentTrueNegativeCount();
-    }
-
     public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
     {
         return keyCache;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
index 90f788db0646..ee72094fcd7e 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableZeroCopyWriter.java
@@ -144,11 +144,24 @@ public SSTableMultiWriter setOpenResult(boolean openResult)
     }
 
     @Override
-    public long getFilePointer()
+    public long getBytesWritten()
     {
+        // TODO: these two may need fixing.
         return 0;
     }
 
+    @Override
+    public long getOnDiskBytesWritten()
+    {
+        return 0;
+    }
+
+    @Override
+    public int getSegmentCount()
+    {
+        return 1;
+    }
+
     @Override
     public TableId getTableId()
     {
diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
index 11c9b1a90424..3a8b0626640e 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableReader.java
@@ -181,7 +181,7 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
             {
                 listener.onSSTableSkipped(this, SkippingReason.BLOOM_FILTER);
                 Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
-                bloomFilterTracker.addTrueNegative();
+                getBloomFilterTracker().addTrueNegative();
                 return null;
             }
         }
@@ -221,7 +221,7 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
         if (skip)
         {
             if (op == Operator.EQ && updateCacheAndStats)
-                bloomFilterTracker.addFalsePositive();
+                getBloomFilterTracker().addFalsePositive();
             listener.onSSTableSkipped(this, SkippingReason.MIN_MAX_KEYS);
             Tracing.trace("Check against min and max keys allows skipping sstable {}", descriptor.generation);
             return null;
@@ -300,7 +300,7 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
                         cacheKey(decoratedKey, indexEntry);
                     }
                     if (op == Operator.EQ && updateCacheAndStats)
-                        bloomFilterTracker.addTruePositive();
+                        getBloomFilterTracker().addTruePositive();
                     listener.onSSTableSelected(this, indexEntry, SelectionReason.INDEX_ENTRY_FOUND);
                     Tracing.trace("Partition index with {} entries found for sstable {}", indexEntry.columnsIndexCount(), descriptor.generation);
                     return indexEntry;
@@ -316,7 +316,7 @@ protected BigTableRowIndexEntry getPosition(PartitionPosition key,
         }
 
         if (op == SSTableReader.Operator.EQ && updateCacheAndStats)
-            bloomFilterTracker.addFalsePositive();
+            getBloomFilterTracker().addFalsePositive();
         listener.onSSTableSkipped(this, SkippingReason.INDEX_ENTRY_NOT_FOUND);
         Tracing.trace("Partition index lookup complete (bloom filter false positive) for sstable {}", descriptor.generation);
         return null;
diff --git a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
index a1405dabff7d..273e9897684e 100644
--- a/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
+++ b/src/java/org/apache/cassandra/io/sstable/format/trieindex/TrieIndexSSTableReader.java
@@ -336,14 +336,14 @@ public RowIndexEntry getExactPosition(DecoratedKey dk,
             listener.onSSTableSkipped(this, SkippingReason.BLOOM_FILTER);
             Tracing.trace("Bloom filter allows skipping sstable {}", descriptor.generation);
             if (updateStats)
-                bloomFilterTracker.addTrueNegative();
+                getBloomFilterTracker().addTrueNegative();
             return null;
         }
 
         if ((filterFirst() && first.compareTo(dk) > 0) || (filterLast() && last.compareTo(dk) < 0))
         {
             if (updateStats)
-                bloomFilterTracker.addFalsePositive();
+                getBloomFilterTracker().addFalsePositive();
             listener.onSSTableSkipped(this, SkippingReason.MIN_MAX_KEYS);
             return null;
         }
@@ -354,7 +354,7 @@ public RowIndexEntry getExactPosition(DecoratedKey dk,
             if (indexPos == PartitionIndex.NOT_FOUND)
             {
                 if (updateStats)
-                    bloomFilterTracker.addFalsePositive();
+                    getBloomFilterTracker().addFalsePositive();
                 listener.onSSTableSkipped(this, SkippingReason.PARTITION_INDEX_LOOKUP);
                 return null;
             }
@@ -389,7 +389,7 @@ public RowIndexEntry getExactPosition(DecoratedKey dk,
     private RowIndexEntry handleKeyNotFound(boolean updateStats, SSTableReadsListener listener)
     {
         if (updateStats)
-            bloomFilterTracker.addFalsePositive();
+            getBloomFilterTracker().addFalsePositive();
         listener.onSSTableSkipped(this, SkippingReason.INDEX_ENTRY_NOT_FOUND);
         return null;
     }
@@ -397,7 +397,7 @@ private RowIndexEntry handleKeyNotFound(boolean updateStats, SSTableReadsListene
     private RowIndexEntry handleKeyFound(boolean updateStats, SSTableReadsListener listener, FileDataInput in, long indexPos) throws IOException
     {
         if (updateStats)
-            bloomFilterTracker.addTruePositive();
+            getBloomFilterTracker().addTruePositive();
         RowIndexEntry entry = indexPos >= 0 ? TrieIndexEntry.deserialize(in, in.getFilePointer())
                                             : new RowIndexEntry(~indexPos);
 
@@ -688,8 +688,17 @@ public boolean hasIndex()
     public void setupOnline()
     {
         final ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(metadata().id);
+        setupOnline(cfs);
+    }
+
+    @Override
+    public void setupOnline(ColumnFamilyStore cfs)
+    {
         if (cfs != null)
+        {
             setCrcCheckChance(cfs.getCrcCheckChance());
+            setBloomFilterTracker(cfs.getBloomFilterTracker());
+        }
     }
 
     @Override
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index 2ece9a6ceaa1..b15a9d667394 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -152,7 +152,7 @@ public MetadataCollector(ClusteringComparator comparator, UUID originatingHostId
         this.originatingHostId = originatingHostId;
     }
 
-    public MetadataCollector(Iterable<SSTableReader> sstables, ClusteringComparator comparator, int level)
+    public MetadataCollector(Iterable<SSTableReader> sstables, ClusteringComparator comparator)
     {
         this(comparator);
 
@@ -166,6 +166,11 @@ public MetadataCollector(Iterable<SSTableReader> sstables, ClusteringComparator
             }
         }
         commitLogIntervals(intervals.build());
+    }
+
+    public MetadataCollector(Iterable<SSTableReader> sstables, ClusteringComparator comparator, int level)
+    {
+        this(sstables, comparator);
         sstableLevel(level);
     }
 
diff --git a/src/java/org/apache/cassandra/io/util/FileUtils.java b/src/java/org/apache/cassandra/io/util/FileUtils.java
index 0711b911a050..47d244cc1e39 100644
--- a/src/java/org/apache/cassandra/io/util/FileUtils.java
+++ b/src/java/org/apache/cassandra/io/util/FileUtils.java
@@ -954,7 +954,7 @@ public static FileStore getFileStore(Path path) throws IOException
      * @param size returned by the Java's FileStore methods
      * @return the size or {@code Long.MAX_VALUE} if the size was bigger than {@code Long.MAX_VALUE}
      */
-    private static long handleLargeFileSystem(long size)
+    public static long handleLargeFileSystem(long size)
     {
         return size < 0 ? Long.MAX_VALUE : size;
     }
diff --git a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
index bc5d01c28e70..4299f436acfb 100644
--- a/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/CompactionMetrics.java
@@ -19,14 +19,19 @@
 
 import java.util.*;
 import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
 
+import com.codahale.metrics.CachedGauge;
 import com.codahale.metrics.Counter;
+import com.codahale.metrics.DerivativeGauge;
 import com.codahale.metrics.Gauge;
 import com.codahale.metrics.Meter;
 
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.compaction.AbstractTableOperation;
+import org.apache.cassandra.db.compaction.CompactionAggregateStatistics;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.CompactionStrategyStatistics;
 import org.apache.cassandra.db.compaction.TableOperation;
@@ -50,6 +55,9 @@ public class CompactionMetrics
     /** Estimated number of compactions remaining to perform, group by keyspace and then table name */
     public final Gauge<Map<String, Map<String, Integer>>> pendingTasksByTableName;
 
+    /** Write amplification of compactions (bytes compacted / bytes flushed), group by keyspace and then table name */
+    public final Gauge<Map<String, Map<String, Double>>> writeAmplificationByTableName;
+
     /** Number of completed operations since server [re]start */
     public final Gauge<Long> completedTasks;
     /** Total number of operations since server [re]start */
@@ -57,6 +65,26 @@ public class CompactionMetrics
     /** Total number of bytes processed by operations since server [re]start */
     public final Counter bytesCompacted;
 
+    /**
+     * The compaction strategy information for each table. Cached, because its computation might be fairly expensive.
+     */
+    public final CachedGauge<List<CompactionStrategyStatistics>> aggregateCompactions;
+
+    /*
+     * The compaction metrics below are derivatives of the complex compaction statistics metric aggregateCompactions.
+     */
+
+    /** Number of currently running compactions for all tables */
+    public final DerivativeGauge<List<CompactionStrategyStatistics>, Integer> runningCompactions;
+    /** Mean read throughput of currently running compactions in bytes per second */
+    public final DerivativeGauge<List<CompactionStrategyStatistics>, Double> meanCompactionReadThroughput;
+    /** Mean write throughput of currently running compactions in bytes per second */
+    public final DerivativeGauge<List<CompactionStrategyStatistics>, Double> meanCompactionWriteThroughput;
+    /** Total bytes to compact from currently running compactions */
+    public final DerivativeGauge<List<CompactionStrategyStatistics>, Long> runningCompactionsTotalBytes;
+    /** Remaining bytes to compact from currently running compactions */
+    public final DerivativeGauge<List<CompactionStrategyStatistics>, Long> runningCompactionsRemainingBytes;
+
     /** Total number of compactions that have had sstables drop out of them */
     public final Counter compactionsReduced;
 
@@ -66,8 +94,10 @@ public class CompactionMetrics
     /** Total number of compactions which have outright failed due to lack of disk space */
     public final Counter compactionsAborted;
 
-    /** The compaction strategy information for each table. */
-    public final Gauge<List<CompactionStrategyStatistics>> aggregateCompactions;
+    /** Total number of deleted expired SSTables */
+    public final Meter removedExpiredSSTables;
+    /** Total number of compactions that consisted of only expired SSTables */
+    public final Meter deleteOnlyCompactions;
 
     public CompactionMetrics(final ThreadPoolExecutor... collectors)
     {
@@ -77,7 +107,7 @@ public CompactionMetrics(final ThreadPoolExecutor... collectors)
             for (String keyspaceName : Schema.instance.getKeyspaces())
             {
                 for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
-                    n += cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
+                    n += cfs.getCompactionStrategy().getEstimatedRemainingTasks();
             }
             // add number of currently running compactions
             return n + CompactionManager.instance.active.getTableOperations().size();
@@ -90,7 +120,7 @@ public CompactionMetrics(final ThreadPoolExecutor... collectors)
             {
                 for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
                 {
-                    int taskNumber = cfs.getCompactionStrategyManager().getEstimatedRemainingTasks();
+                    int taskNumber = cfs.getCompactionStrategy().getEstimatedRemainingTasks();
                     if (taskNumber > 0)
                     {
                         if (!resultMap.containsKey(keyspaceName))
@@ -131,6 +161,20 @@ public CompactionMetrics(final ThreadPoolExecutor... collectors)
             return resultMap;
         });
 
+        writeAmplificationByTableName = Metrics.register(factory.createMetricName("WriteAmplificationByTableName"), () -> {
+            Map<String, Map<String, Double>> resultMap = new HashMap<>();
+
+            for (String keyspaceName : Schema.instance.getKeyspaces())
+            {
+                Map<String, Double> ksMap = new HashMap<>();
+                resultMap.put(keyspaceName, ksMap);
+                for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+                    ksMap.put(cfs.getTableName(), cfs.getWA());
+            }
+
+            return resultMap;
+        });
+
         completedTasks = Metrics.register(factory.createMetricName("CompletedTasks"), new Gauge<Long>()
         {
             public Long getValue()
@@ -149,23 +193,106 @@ public Long getValue()
         sstablesDropppedFromCompactions = Metrics.counter(factory.createMetricName("SSTablesDroppedFromCompaction"));
         compactionsAborted = Metrics.counter(factory.createMetricName("CompactionsAborted"));
 
-        aggregateCompactions = Metrics.register(factory.createMetricName("AggregateCompactions"), this::getAggregateCompactions);
+        removedExpiredSSTables = Metrics.meter(factory.createMetricName("ExpiredSSTablesDropped"));
+        deleteOnlyCompactions = Metrics.meter(factory.createMetricName("DeleteOnlyCompactions"));
+
+        aggregateCompactions = Metrics.register(factory.createMetricName("AggregateCompactions"),
+                                                // TODO 50 ms is 100x less than the default report interval of our distributed test harness (Fallout) at
+                                                //  the moment of writing this. This implies that even a bigger timeout might be OK.
+                                                new CachedGauge<List<CompactionStrategyStatistics>>(50, TimeUnit.MILLISECONDS)
+                                                {
+                                                    @Override
+                                                    protected List<CompactionStrategyStatistics> loadValue()
+                                                    {
+                                                        List<CompactionStrategyStatistics> ret = new ArrayList<>();
+                                                        for (String keyspaceName : Schema.instance.getKeyspaces())
+                                                        {
+                                                            // Scan all the compaction strategies of all tables and find those that have compactions in progress.
+                                                            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
+                                                                // For those return the statistics.
+                                                                ret.addAll(cfs.getCompactionStrategy().getStatistics());
+                                                        }
+
+                                                        return ret;
+                                                    }
+                                                });
+
+        runningCompactions = Metrics.register(factory.createMetricName("RunningCompactions"),
+                                              new DerivativeGauge<List<CompactionStrategyStatistics>, Integer>(aggregateCompactions)
+        {
+            @Override
+            protected Integer transform(List<CompactionStrategyStatistics> value)
+            {
+                return deriveSafeAggregateStatisticsStream(value)
+                       .mapToInt(CompactionAggregateStatistics::numCompactionsInProgress)
+                       .sum();
+            }
+        });
+        meanCompactionReadThroughput = Metrics.register(factory.createMetricName("MeanCompactionReadThroughput"),
+                                                        new DerivativeGauge<List<CompactionStrategyStatistics>, Double>(aggregateCompactions)
+        {
+            @Override
+            protected Double transform(List<CompactionStrategyStatistics> value)
+            {
+                return deriveSafeAggregateStatisticsStream(value)
+                       // Don't take into account aggregates for which there are no running compactions
+                       .filter(s -> s.numCompactionsInProgress() > 0)
+                       .mapToDouble(CompactionAggregateStatistics::readThroughput)
+                       .average()
+                       .orElse(0.0);
+            }
+        });
+        meanCompactionWriteThroughput = Metrics.register(factory.createMetricName("MeanCompactionWriteThroughput"),
+                                                         new DerivativeGauge<List<CompactionStrategyStatistics>, Double>(aggregateCompactions)
+        {
+            @Override
+            protected Double transform(List<CompactionStrategyStatistics> value)
+            {
+                return deriveSafeAggregateStatisticsStream(value)
+                       // Don't take into account aggregates for which there are no running compactions
+                       .filter(s -> s.numCompactionsInProgress() > 0)
+                       .mapToDouble(CompactionAggregateStatistics::writeThroughput)
+                       .average()
+                       .orElse(0.0);
+            }
+        });
+        runningCompactionsTotalBytes = Metrics.register(factory.createMetricName("RunningCompactionsTotalBytes"),
+                                                        new DerivativeGauge<List<CompactionStrategyStatistics>, Long>(aggregateCompactions)
+        {
+            @Override
+            protected Long transform(List<CompactionStrategyStatistics> value)
+            {
+                return deriveSafeAggregateStatisticsStream(value)
+                       .mapToLong(CompactionAggregateStatistics::tot)
+                       .sum();
+            }
+        });
+        runningCompactionsRemainingBytes = Metrics.register(factory.createMetricName("RunningCompactionsRemainingBytes"),
+                                                            new DerivativeGauge<List<CompactionStrategyStatistics>, Long>(aggregateCompactions)
+        {
+            @Override
+            protected Long transform(List<CompactionStrategyStatistics> value)
+            {
+                return deriveSafeAggregateStatisticsStream(value)
+                       .mapToLong(s -> s.tot() - s.read())
+                       .sum();
+            }
+        });
     }
 
     /**
-     * Scan all the compactions strategies of all tables and find those that have compactions in progress.
-     * For those return the statistics.
+     * Needed because deriving from a CachedGauge might hit a NullPointerException until we move to a version of
+     * dropwizard's metrics-core where https://github.com/dropwizard/metrics/pull/711 /
+     * https://github.com/dropwizard/metrics/pull/1566 are fixed (currently targeting metrics-core 4.1.7).
      *
-     * @return a list of statistics for the compaction strategies that have compactions in progress
+     * @param aggregateCompactions The cached compaction strategy statistics to derive from.
+     *
+     * @return A stream (potentially empty) of the aggregate statistics corresponding to the given strategy statistics.
      */
-    List<CompactionStrategyStatistics> getAggregateCompactions()
+    private static Stream<CompactionAggregateStatistics> deriveSafeAggregateStatisticsStream(List<CompactionStrategyStatistics> aggregateCompactions)
     {
-        List<CompactionStrategyStatistics> ret = new ArrayList<>();
-        for (String keyspaceName : Schema.instance.getKeyspaces())
-        {
-            for (ColumnFamilyStore cfs : Keyspace.open(keyspaceName).getColumnFamilyStores())
-                ret.addAll(cfs.getCompactionStrategyManager().getStrategyStatistics());
-        }
-        return ret;
+        if (aggregateCompactions == null)
+            return Stream.empty();
+        return aggregateCompactions.stream().flatMap(s -> s.aggregates().stream());
     }
 }
diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java
index 7473d9c1f50d..764ff59dac7d 100644
--- a/src/java/org/apache/cassandra/metrics/TableMetrics.java
+++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java
@@ -49,6 +49,8 @@
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.ExpMovingAverage;
+import org.apache.cassandra.utils.MovingAverage;
 import org.apache.cassandra.utils.Pair;
 
 import com.codahale.metrics.Counter;
@@ -101,19 +103,43 @@ public class TableMetrics
     public final Gauge<long[]> estimatedColumnCountHistogram;
     /** Histogram of the number of sstable data files accessed per read */
     public final TableHistogram sstablesPerReadHistogram;
+    /** An approximate measure of how long it takes to read a partition from an sstable, in nanoseconds. This is
+     * a moving average of a very rough approximation: the total latency for a single partition
+     * read command divided by the number of sstables that were accessed for that command.
+     * Therefore it currently includes other costs, which is not ideal, but it does give a rough estimate
+     * since disk costs would dominate computing costs. */
+    public final MovingAverage sstablePartitionReadLatency;
     /** (Local) read metrics */
     public final LatencyMetrics readLatency;
     /** (Local) range slice metrics */
     public final LatencyMetrics rangeLatency;
     /** (Local) write metrics */
     public final LatencyMetrics writeLatency;
+    /** The number of single partition read requests, including those dropped due to timeouts */
+    public final Counter readRequests;
+    /** The number of range read requests, including those dropped due to timeouts */
+    public final Counter rangeRequests;
     /** Estimated number of tasks pending for this table */
     public final Counter pendingFlushes;
     /** Total number of bytes flushed since server [re]start */
     public final Counter bytesFlushed;
+    /** The average flushed size for sstables, updated together with {@link #bytesFlushed}. */
+    public final MovingAverage flushSize;
+    /** The average on-disk flushed size for sstables. */
+    public final MovingAverage flushSizeOnDisk;
+    /** The average number of sstables created on flush. */
+    public final MovingAverage flushSegmentCount;
+    /** The average duration per 1Kb of data flushed, in nanoseconds. */
+    public final MovingAverage flushTimePerKb;
+    /** Total number of bytes inserted into memtables since server [re]start. */
+    public final Counter bytesInserted;
     /** Total number of bytes written by compaction since server [re]start */
     public final Counter compactionBytesWritten;
-    /** Estimate of number of pending compactios for this table */
+    /** Total number of bytes read by compaction since server [re]start */
+    public final Counter compactionBytesRead;
+    /** The average duration per 1Kb of data compacted, in nanoseconds. */
+    public final MovingAverage compactionTimePerKb;
+    /** Estimate of number of pending compactions for this table */
     public final Gauge<Integer> pendingCompactions;
     /** Number of SSTables on disk for this CF */
     public final Gauge<Integer> liveSSTableCount;
@@ -132,7 +158,7 @@ public class TableMetrics
     /** Number of false positives in bloom filter */
     public final Gauge<Long> bloomFilterFalsePositives;
     /** Number of false positives in bloom filter from last read */
-    public final Gauge<Long> recentBloomFilterFalsePositives;
+    public final Gauge<Double> recentBloomFilterFalsePositives;
     /** False positive ratio of bloom filter */
     public final Gauge<Double> bloomFilterFalseRatio;
     /** False positive ratio of bloom filter from last read */
@@ -496,6 +522,7 @@ public Long getValue()
                                                                                  SSTableReader::getEstimatedCellPerPartitionCount), null);
         
         sstablesPerReadHistogram = createTableHistogram("SSTablesPerReadHistogram", cfs.keyspace.metric.sstablesPerReadHistogram, true);
+        sstablePartitionReadLatency = ExpMovingAverage.decayBy100();
         compressionRatio = createTableGauge("CompressionRatio", new Gauge<Double>()
         {
             public Double getValue()
@@ -571,12 +598,23 @@ public Long getValue()
         readLatency = createLatencyMetrics("Read", cfs.keyspace.metric.readLatency, GLOBAL_READ_LATENCY);
         writeLatency = createLatencyMetrics("Write", cfs.keyspace.metric.writeLatency, GLOBAL_WRITE_LATENCY);
         rangeLatency = createLatencyMetrics("Range", cfs.keyspace.metric.rangeLatency, GLOBAL_RANGE_LATENCY);
+
+        readRequests = createTableCounter("ReadRequests");
+        rangeRequests = createTableCounter("RangeRequests");
+
         pendingFlushes = createTableCounter("PendingFlushes");
         bytesFlushed = createTableCounter("BytesFlushed");
+        flushSize = ExpMovingAverage.decayBy100();
+        flushSizeOnDisk = ExpMovingAverage.decayBy1000();
+        flushSegmentCount = ExpMovingAverage.decayBy1000();
+        flushTimePerKb = ExpMovingAverage.decayBy100();
+        bytesInserted = createTableCounter("BytesInserted");
 
         compactionBytesWritten = createTableCounter("CompactionBytesWritten");
-        pendingCompactions = createTableGauge("PendingCompactions", () -> cfs.getCompactionStrategyManager().getEstimatedRemainingTasks());
-        liveSSTableCount = createTableGauge("LiveSSTableCount", () -> cfs.getTracker().getView().liveSSTables().size());
+        compactionBytesRead = createTableCounter("CompactionBytesRead");
+        compactionTimePerKb = ExpMovingAverage.decayBy100();
+        pendingCompactions = createTableGauge("PendingCompactions", () -> cfs.getCompactionStrategy().getEstimatedRemainingTasks());
+        liveSSTableCount = createTableGauge("LiveSSTableCount", () -> cfs.getLiveSSTables().size());
         oldVersionSSTableCount = createTableGauge("OldVersionSSTableCount", new Gauge<Integer>()
         {
             public Integer getValue()
@@ -674,35 +712,24 @@ public Long getValue()
         {
             public Long getValue()
             {
-                long count = 0L;
-                for (SSTableReader sstable: cfs.getSSTables(SSTableSet.LIVE))
-                    count += sstable.getBloomFilterFalsePositiveCount();
-                return count;
+                return cfs.getBloomFilterFalsePositiveCount();
             }
         });
-        recentBloomFilterFalsePositives = createTableGauge("RecentBloomFilterFalsePositives", new Gauge<Long>()
+        recentBloomFilterFalsePositives = createTableGauge("RecentBloomFilterFalsePositives", new Gauge<Double>()
         {
-            public Long getValue()
+            public Double getValue()
             {
-                long count = 0L;
-                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
-                    count += sstable.getRecentBloomFilterFalsePositiveCount();
-                return count;
+                return cfs.getRecentBloomFilterFalsePositiveRate();
             }
         });
         bloomFilterFalseRatio = createTableGauge("BloomFilterFalseRatio", new Gauge<Double>()
         {
             public Double getValue()
             {
-                long falsePositiveCount = 0L;
-                long truePositiveCount = 0L;
-                long trueNegativeCount = 0L;
-                for (SSTableReader sstable : cfs.getSSTables(SSTableSet.LIVE))
-                {
-                    falsePositiveCount += sstable.getBloomFilterFalsePositiveCount();
-                    truePositiveCount += sstable.getBloomFilterTruePositiveCount();
-                    trueNegativeCount += sstable.getBloomFilterTrueNegativeCount();
-                }
+                long falsePositiveCount = cfs.getBloomFilterFalsePositiveCount();
+                long truePositiveCount = cfs.getBloomFilterTruePositiveCount();
+                long trueNegativeCount = cfs.getBloomFilterTrueNegativeCount();
+
                 if (falsePositiveCount == 0L && truePositiveCount == 0L)
                     return 0d;
                 return (double) falsePositiveCount / (truePositiveCount + falsePositiveCount + trueNegativeCount);
@@ -716,11 +743,11 @@ public Double getValue()
                 long trueNegativeCount = 0L;
                 for (Keyspace keyspace : Keyspace.all())
                 {
-                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.LIVE))
+                    for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
                     {
-                        falsePositiveCount += sstable.getBloomFilterFalsePositiveCount();
-                        truePositiveCount += sstable.getBloomFilterTruePositiveCount();
-                        trueNegativeCount += sstable.getBloomFilterTrueNegativeCount();
+                        falsePositiveCount += cfs.getBloomFilterFalsePositiveCount();
+                        truePositiveCount += cfs.getBloomFilterTruePositiveCount();
+                        trueNegativeCount += cfs.getBloomFilterTrueNegativeCount();
                     }
                 }
                 if (falsePositiveCount == 0L && truePositiveCount == 0L)
@@ -732,38 +759,33 @@ public Double getValue()
         {
             public Double getValue()
             {
-                long falsePositiveCount = 0L;
-                long truePositiveCount = 0L;
-                long trueNegativeCount = 0L;
-                for (SSTableReader sstable: cfs.getSSTables(SSTableSet.LIVE))
-                {
-                    falsePositiveCount += sstable.getRecentBloomFilterFalsePositiveCount();
-                    truePositiveCount += sstable.getRecentBloomFilterTruePositiveCount();
-                    trueNegativeCount += sstable.getRecentBloomFilterTrueNegativeCount();
-                }
-                if (falsePositiveCount == 0L && truePositiveCount == 0L)
+                double falsePositiveRate = cfs.getRecentBloomFilterFalsePositiveRate();
+                double truePositiveRate = cfs.getRecentBloomFilterTruePositiveRate();
+                double trueNegativeRate = cfs.getRecentBloomFilterTrueNegativeRate();
+
+                if (falsePositiveRate == 0d && truePositiveRate == 0d)
                     return 0d;
-                return (double) falsePositiveCount / (truePositiveCount + falsePositiveCount + trueNegativeCount);
+                return falsePositiveRate / (truePositiveRate + falsePositiveRate + trueNegativeRate);
             }
         }, new Gauge<Double>() // global gauge
         {
             public Double getValue()
             {
-                long falsePositiveCount = 0L;
-                long truePositiveCount = 0L;
-                long trueNegativeCount = 0L;
+                double falsePositiveRate = 0d;
+                double truePositiveRate = 0d;
+                double trueNegativeRate = 0d;
                 for (Keyspace keyspace : Keyspace.all())
                 {
-                    for (SSTableReader sstable : keyspace.getAllSSTables(SSTableSet.LIVE))
+                    for (ColumnFamilyStore cfs : keyspace.getColumnFamilyStores())
                     {
-                        falsePositiveCount += sstable.getRecentBloomFilterFalsePositiveCount();
-                        truePositiveCount += sstable.getRecentBloomFilterTruePositiveCount();
-                        trueNegativeCount += sstable.getRecentBloomFilterTrueNegativeCount();
+                        falsePositiveRate += cfs.getRecentBloomFilterFalsePositiveRate();
+                        truePositiveRate += cfs.getRecentBloomFilterTruePositiveRate();
+                        trueNegativeRate += cfs.getRecentBloomFilterTrueNegativeRate();
                     }
                 }
-                if (falsePositiveCount == 0L && truePositiveCount == 0L)
+                if (falsePositiveRate == 0d && truePositiveRate == 0d)
                     return 0d;
-                return (double) falsePositiveCount / (truePositiveCount + falsePositiveCount + trueNegativeCount);
+                return falsePositiveRate / (truePositiveRate + falsePositiveRate + trueNegativeRate);
             }
         });
         bloomFilterDiskSpaceUsed = createTableGauge("BloomFilterDiskSpaceUsed", new Gauge<Long>()
@@ -924,9 +946,28 @@ private Memtable.MemoryUsage getMemoryUsageWithIndexes(ColumnFamilyStore cfs)
         return usage;
     }
 
-    public void updateSSTableIterated(int count)
+    public void incBytesFlushed(long inputSize, long outputSize, long elapsedNanos)
+    {
+        bytesFlushed.inc(outputSize);
+        flushSize.update(outputSize);
+        // this assumes that at least 1 Kb was flushed, which should always be the case, then rounds down
+        flushTimePerKb.update(elapsedNanos / (double) Math.max(1, inputSize / 1024L));
+    }
+
+    public void incBytesCompacted(long inputDiskSize, long outputDiskSize, long elapsedNanos)
+    {
+        compactionBytesRead.inc(inputDiskSize);
+        compactionBytesWritten.inc(outputDiskSize);
+        // this assumes that at least 1 Kb was compacted, which should always be the case, then rounds down
+        compactionTimePerKb.update(elapsedNanos / (double) Math.max(1, inputDiskSize / 1024L));
+    }
+
+    public void updateSSTableIterated(int count, long elapsedNanos)
     {
         sstablesPerReadHistogram.update(count);
+
+        if (count > 0)
+            sstablePartitionReadLatency.update(elapsedNanos / (double) count);
     }
 
     /**
@@ -1014,17 +1055,17 @@ protected Counter createTableCounter(final String name, final String alias)
             Metrics.register(GLOBAL_FACTORY.createMetricName(name),
                              GLOBAL_ALIAS_FACTORY.createMetricName(alias),
                              new Gauge<Long>()
-            {
-                public Long getValue()
-                {
-                    long total = 0;
-                    for (Metric cfGauge : ALL_TABLE_METRICS.get(name))
-                    {
-                        total += ((Counter) cfGauge).getCount();
-                    }
-                    return total;
-                }
-            });
+                             {
+                                 public Long getValue()
+                                 {
+                                     long total = 0;
+                                     for (Metric cfGauge : ALL_TABLE_METRICS.get(name))
+                                     {
+                                         total += ((Counter) cfGauge).getCount();
+                                     }
+                                     return total;
+                                 }
+                             });
         }
         return cfCounter;
     }
diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
index 35e69e29988c..fa668cbea362 100644
--- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
+++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
@@ -22,6 +22,7 @@
 import java.net.UnknownHostException;
 import java.nio.ByteBuffer;
 import java.time.Instant;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
 import java.util.HashMap;
@@ -35,12 +36,12 @@
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 import java.util.function.BooleanSupplier;
-import java.util.function.Predicate;
 import java.util.stream.Collectors;
 import javax.annotation.Nullable;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import com.google.common.base.Predicate;
 import com.google.common.base.Verify;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableSet;
@@ -53,6 +54,11 @@
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.MoreExecutors;
 
+import org.apache.cassandra.db.compaction.CleanupTask;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.RepairFinishedCompactionTask;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.locator.RangesAtEndpoint;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -94,6 +100,7 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
 
 import static org.apache.cassandra.net.Verb.FAILED_SESSION_MSG;
 import static org.apache.cassandra.net.Verb.FINALIZE_PROMISE_MSG;
@@ -296,6 +303,12 @@ public PendingStats getPendingStats(TableId tid, Collection<Range<Token>> ranges
         return new PendingStats(cfs.keyspace.getName(), cfs.name, pending.build(), finalized.build(), failed.build());
     }
 
+    /**
+     * Promotes (or demotes) data attached to an incremental repair session that has either completed successfully
+     * or failed.
+     *
+     * @return session ids whose data could not be released
+     */
     public CleanupSummary cleanup(TableId tid, Collection<Range<Token>> ranges, boolean force)
     {
         Iterable<LocalSession> candidates = Iterables.filter(sessions.values(),
@@ -304,10 +317,56 @@ public CleanupSummary cleanup(TableId tid, Collection<Range<Token>> ranges, bool
                                                                    && Range.intersects(ls.ranges, ranges));
 
         ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(tid);
+        Preconditions.checkNotNull(cfs);
+
         Set<UUID> sessionIds = Sets.newHashSet(Iterables.transform(candidates, s -> s.sessionID));
+        return releaseRepairData(cfs, sessionIds, force);
+    }
+
+    private CleanupSummary releaseRepairData(ColumnFamilyStore cfs, Collection<UUID> sessions, boolean force)
+    {
+        if (force)
+        {
+            Predicate<SSTableReader> predicate = sst -> {
+                UUID session = sst.getPendingRepair();
+                return session != null && sessions.contains(session);
+            };
+            return cfs.runWithCompactionsDisabled(() -> doReleaseRepairData(cfs, sessions),
+                                                  predicate, false, true, true);
+        }
+        else
+        {
+            return doReleaseRepairData(cfs, sessions);
+        }
+    }
+
+    private CleanupSummary doReleaseRepairData(ColumnFamilyStore cfs, Collection<UUID> sessions)
+    {
+        List<Pair<UUID, RepairFinishedCompactionTask>> tasks = new ArrayList<>(sessions.size());
+        for (UUID session : sessions)
+        {
+            if (canCleanup(session))
+                tasks.add(Pair.create(session, getRepairFinishedCompactionTask(cfs, session)));
+        }
+
+        return new CleanupTask(cfs, tasks).cleanup();
+    }
+
+    private RepairFinishedCompactionTask getRepairFinishedCompactionTask(ColumnFamilyStore cfs, UUID session)
+    {
+        Set<SSTableReader> sstables = cfs.getPendingRepairSSTables(session);
+        if (sstables.isEmpty())
+            return null;
 
+        long repairedAt = getFinalSessionRepairedAt(session);
+        boolean isTransient = sstables.iterator().next().isTransient();
+        LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
+        return txn == null ? null : new RepairFinishedCompactionTask(cfs, txn, session, repairedAt, isTransient);
+    }
 
-        return cfs.releaseRepairData(sessionIds, force);
+    public boolean canCleanup(UUID sessionID)
+    {
+        return !isSessionInProgress(sessionID);
     }
 
     /**
@@ -893,7 +952,7 @@ public void handleFinalizeProposeMessage(InetAddressAndPort from, FinalizePropos
     }
 
     @VisibleForTesting
-    protected void sessionCompleted(LocalSession session)
+    public void sessionCompleted(LocalSession session)
     {
         for (TableId tid: session.tableIds)
         {
@@ -1019,7 +1078,7 @@ protected boolean sessionHasData(LocalSession session)
     {
         Predicate<TableId> predicate = tid -> {
             ColumnFamilyStore cfs = Schema.instance.getColumnFamilyStoreInstance(tid);
-            return cfs != null && cfs.getCompactionStrategyManager().hasDataForPendingRepair(session.sessionID);
+            return cfs != null && cfs.hasPendingRepairSSTables(session.sessionID);
 
         };
         return Iterables.any(session.tableIds, predicate::test);
diff --git a/src/java/org/apache/cassandra/schema/CompactionParams.java b/src/java/org/apache/cassandra/schema/CompactionParams.java
index 485946820ef2..0ec61fe253af 100644
--- a/src/java/org/apache/cassandra/schema/CompactionParams.java
+++ b/src/java/org/apache/cassandra/schema/CompactionParams.java
@@ -17,23 +17,22 @@
  */
 package org.apache.cassandra.schema;
 
-import java.lang.reflect.InvocationTargetException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
 
-import com.google.common.base.MoreObjects;
 import com.google.common.collect.ImmutableMap;
-import org.apache.commons.lang3.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
+import org.apache.commons.lang.StringUtils;
+
+import org.apache.cassandra.db.compaction.CompactionStrategy;
+import org.apache.cassandra.db.compaction.CompactionStrategyOptions;
 import org.apache.cassandra.db.compaction.LeveledCompactionStrategy;
 import org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy;
 import org.apache.cassandra.db.compaction.TimeWindowCompactionStrategy;
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.utils.FBUtilities;
 
@@ -41,8 +40,6 @@
 
 public final class CompactionParams
 {
-    private static final Logger logger = LoggerFactory.getLogger(CompactionParams.class);
-
     public enum Option
     {
         CLASS,
@@ -72,34 +69,29 @@ public static Optional<TombstoneOption> forName(String name)
         }
     }
 
-    public static final int DEFAULT_MIN_THRESHOLD = 4;
-    public static final int DEFAULT_MAX_THRESHOLD = 32;
-
     public static final boolean DEFAULT_ENABLED = true;
     public static final TombstoneOption DEFAULT_PROVIDE_OVERLAPPING_TOMBSTONES =
             TombstoneOption.valueOf(System.getProperty("default.provide.overlapping.tombstones", TombstoneOption.NONE.toString()).toUpperCase());
 
     public static final Map<String, String> DEFAULT_THRESHOLDS =
-        ImmutableMap.of(Option.MIN_THRESHOLD.toString(), Integer.toString(DEFAULT_MIN_THRESHOLD),
-                        Option.MAX_THRESHOLD.toString(), Integer.toString(DEFAULT_MAX_THRESHOLD));
+        ImmutableMap.of(Option.MIN_THRESHOLD.toString(), Integer.toString(CompactionStrategyOptions.DEFAULT_MIN_THRESHOLD),
+                        Option.MAX_THRESHOLD.toString(), Integer.toString(CompactionStrategyOptions.DEFAULT_MAX_THRESHOLD));
 
     public static final CompactionParams DEFAULT =
         new CompactionParams(SizeTieredCompactionStrategy.class, DEFAULT_THRESHOLDS, DEFAULT_ENABLED, DEFAULT_PROVIDE_OVERLAPPING_TOMBSTONES);
 
-    private final Class<? extends AbstractCompactionStrategy> klass;
-    private final ImmutableMap<String, String> options;
+    private final CompactionStrategyOptions strategyOptions;
     private final boolean isEnabled;
     private final TombstoneOption tombstoneOption;
 
-    private CompactionParams(Class<? extends AbstractCompactionStrategy> klass, Map<String, String> options, boolean isEnabled, TombstoneOption tombstoneOption)
+    private CompactionParams(Class<? extends CompactionStrategy> klass, Map<String, String> options, boolean isEnabled, TombstoneOption tombstoneOption)
     {
-        this.klass = klass;
-        this.options = ImmutableMap.copyOf(options);
+        this.strategyOptions = new CompactionStrategyOptions(klass, options, true);
         this.isEnabled = isEnabled;
         this.tombstoneOption = tombstoneOption;
     }
 
-    public static CompactionParams create(Class<? extends AbstractCompactionStrategy> klass, Map<String, String> options)
+    public static CompactionParams create(Class<? extends CompactionStrategy> klass, Map<String, String> options)
     {
         boolean isEnabled = options.containsKey(Option.ENABLED.toString())
                           ? Boolean.parseBoolean(options.get(Option.ENABLED.toString()))
@@ -115,14 +107,7 @@ public static CompactionParams create(Class<? extends AbstractCompactionStrategy
         }
         TombstoneOption tombstoneOption = tombstoneOptional.get();
 
-        Map<String, String> allOptions = new HashMap<>(options);
-        if (supportsThresholdParams(klass))
-        {
-            allOptions.putIfAbsent(Option.MIN_THRESHOLD.toString(), Integer.toString(DEFAULT_MIN_THRESHOLD));
-            allOptions.putIfAbsent(Option.MAX_THRESHOLD.toString(), Integer.toString(DEFAULT_MAX_THRESHOLD));
-        }
-
-        return new CompactionParams(klass, allOptions, isEnabled, tombstoneOption);
+        return new CompactionParams(klass, new HashMap<>(options), isEnabled, tombstoneOption);
     }
 
     public static CompactionParams stcs(Map<String, String> options)
@@ -140,20 +125,19 @@ public static CompactionParams twcs(Map<String, String> options)
         return create(TimeWindowCompactionStrategy.class, options);
     }
 
+    public static CompactionParams ucs(Map<String, String> options)
+    {
+        return create(UnifiedCompactionStrategy.class, options);
+    }
+
     public int minCompactionThreshold()
     {
-        String threshold = options.get(Option.MIN_THRESHOLD.toString());
-        return threshold == null
-             ? DEFAULT_MIN_THRESHOLD
-             : Integer.parseInt(threshold);
+        return strategyOptions.minCompactionThreshold();
     }
 
     public int maxCompactionThreshold()
     {
-        String threshold = options.get(Option.MAX_THRESHOLD.toString());
-        return threshold == null
-             ? DEFAULT_MAX_THRESHOLD
-             : Integer.parseInt(threshold);
+        return strategyOptions.maxCompactionThreshold();
     }
 
     public TombstoneOption tombstoneOption()
@@ -161,87 +145,14 @@ public TombstoneOption tombstoneOption()
         return tombstoneOption;
     }
 
-    public void validate()
-    {
-        try
-        {
-            Map<?, ?> unknownOptions = (Map) klass.getMethod("validateOptions", Map.class).invoke(null, options);
-            if (!unknownOptions.isEmpty())
-            {
-                throw new ConfigurationException(format("Properties specified %s are not understood by %s",
-                                                        unknownOptions.keySet(),
-                                                        klass.getSimpleName()));
-            }
-        }
-        catch (NoSuchMethodException e)
-        {
-            logger.warn("Compaction strategy {} does not have a static validateOptions method. Validation ignored",
-                        klass.getName());
-        }
-        catch (InvocationTargetException e)
-        {
-            if (e.getTargetException() instanceof ConfigurationException)
-                throw (ConfigurationException) e.getTargetException();
-
-            Throwable cause = e.getCause() == null
-                            ? e
-                            : e.getCause();
-
-            throw new ConfigurationException(format("%s.validateOptions() threw an error: %s %s",
-                                                    klass.getName(),
-                                                    cause.getClass().getName(),
-                                                    cause.getMessage()),
-                                             e);
-        }
-        catch (IllegalAccessException e)
-        {
-            throw new ConfigurationException("Cannot access method validateOptions in " + klass.getName(), e);
-        }
-
-        String minThreshold = options.get(Option.MIN_THRESHOLD.toString());
-        if (minThreshold != null && !StringUtils.isNumeric(minThreshold))
-        {
-            throw new ConfigurationException(format("Invalid value %s for '%s' compaction sub-option - must be an integer",
-                                                    minThreshold,
-                                                    Option.MIN_THRESHOLD));
-        }
-
-        String maxThreshold = options.get(Option.MAX_THRESHOLD.toString());
-        if (maxThreshold != null && !StringUtils.isNumeric(maxThreshold))
-        {
-            throw new ConfigurationException(format("Invalid value %s for '%s' compaction sub-option - must be an integer",
-                                                    maxThreshold,
-                                                    Option.MAX_THRESHOLD));
-        }
-
-        if (minCompactionThreshold() <= 0 || maxCompactionThreshold() <= 0)
-        {
-            throw new ConfigurationException("Disabling compaction by setting compaction thresholds to 0 has been removed,"
-                                             + " set the compaction option 'enabled' to false instead.");
-        }
-
-        if (minCompactionThreshold() <= 1)
-        {
-            throw new ConfigurationException(format("Min compaction threshold cannot be less than 2 (got %d)",
-                                                    minCompactionThreshold()));
-        }
-
-        if (minCompactionThreshold() > maxCompactionThreshold())
-        {
-            throw new ConfigurationException(format("Min compaction threshold (got %d) cannot be greater than max compaction threshold (got %d)",
-                                                    minCompactionThreshold(),
-                                                    maxCompactionThreshold()));
-        }
-    }
-
     double defaultBloomFilterFbChance()
     {
-        return klass.equals(LeveledCompactionStrategy.class) ? 0.1 : 0.01;
+        return klass().equals(LeveledCompactionStrategy.class) ? 0.1 : 0.01;
     }
 
-    public Class<? extends AbstractCompactionStrategy> klass()
+    public Class<? extends CompactionStrategy> klass()
     {
-        return klass;
+        return strategyOptions.klass();
     }
 
     /**
@@ -249,7 +160,7 @@ public Class<? extends AbstractCompactionStrategy> klass()
      */
     public Map<String, String> options()
     {
-        return options;
+        return strategyOptions.getOptions();
     }
 
     public boolean isEnabled()
@@ -272,14 +183,14 @@ public static CompactionParams fromMap(Map<String, String> map)
         return create(classFromName(className), options);
     }
 
-    public static Class<? extends AbstractCompactionStrategy> classFromName(String name)
+    public static Class<? extends CompactionStrategy> classFromName(String name)
     {
         String className = name.contains(".")
                          ? name
                          : "org.apache.cassandra.db.compaction." + name;
-        Class<AbstractCompactionStrategy> strategyClass = FBUtilities.classForName(className, "compaction strategy");
+        Class<CompactionStrategy> strategyClass = FBUtilities.classForName(className, "compaction strategy");
 
-        if (!AbstractCompactionStrategy.class.isAssignableFrom(strategyClass))
+        if (!CompactionStrategy.class.isAssignableFrom(strategyClass))
         {
             throw new ConfigurationException(format("Compaction strategy class %s is not derived from AbstractReplicationStrategy",
                                                     className));
@@ -288,40 +199,17 @@ public static Class<? extends AbstractCompactionStrategy> classFromName(String n
         return strategyClass;
     }
 
-    /*
-     * LCS doesn't, STCS and DTCS do
-     */
-    @SuppressWarnings("unchecked")
-    public static boolean supportsThresholdParams(Class<? extends AbstractCompactionStrategy> klass)
-    {
-        try
-        {
-            Map<String, String> unrecognizedOptions =
-                (Map<String, String>) klass.getMethod("validateOptions", Map.class)
-                                           .invoke(null, DEFAULT_THRESHOLDS);
-
-            return unrecognizedOptions.isEmpty();
-        }
-        catch (Exception e)
-        {
-            throw new RuntimeException(e);
-        }
-    }
-
     public Map<String, String> asMap()
     {
         Map<String, String> map = new HashMap<>(options());
-        map.put(Option.CLASS.toString(), klass.getName());
+        map.put(Option.CLASS.toString(), klass().getName());
         return map;
     }
 
     @Override
     public String toString()
     {
-        return MoreObjects.toStringHelper(this)
-                          .add("class", klass.getName())
-                          .add("options", options)
-                          .toString();
+        return strategyOptions.toString();
     }
 
     @Override
@@ -335,12 +223,12 @@ public boolean equals(Object o)
 
         CompactionParams cp = (CompactionParams) o;
 
-        return klass.equals(cp.klass) && options.equals(cp.options);
+        return strategyOptions.equals(cp.strategyOptions);
     }
 
     @Override
     public int hashCode()
     {
-        return Objects.hash(klass, options);
+        return Objects.hash(strategyOptions);
     }
 }
diff --git a/src/java/org/apache/cassandra/schema/TableParams.java b/src/java/org/apache/cassandra/schema/TableParams.java
index 3ca6e26eafd2..2e8358da3d1a 100644
--- a/src/java/org/apache/cassandra/schema/TableParams.java
+++ b/src/java/org/apache/cassandra/schema/TableParams.java
@@ -140,7 +140,7 @@ public Builder unbuild()
 
     public void validate()
     {
-        compaction.validate();
+        // compaction parameters are validated during CompactionParams construction
         compression.validate();
 
         double minBloomFilterFpChanceValue = BloomCalculations.minSupportedBloomFilterFpChance();
diff --git a/src/java/org/apache/cassandra/service/CassandraDaemon.java b/src/java/org/apache/cassandra/service/CassandraDaemon.java
index 2cb12540c6cf..4b43fefcc35f 100644
--- a/src/java/org/apache/cassandra/service/CassandraDaemon.java
+++ b/src/java/org/apache/cassandra/service/CassandraDaemon.java
@@ -448,7 +448,7 @@ protected void setup()
                 for (final ColumnFamilyStore store : cfs.concatWithIndexes())
                 {
                     store.reload(); //reload CFs in case there was a change of disk boundaries
-                    if (store.getCompactionStrategyManager().shouldBeEnabled())
+                    if (store.compactionShouldBeEnabled())
                     {
                         if (DatabaseDescriptor.getAutocompactionOnStartupEnabled())
                         {
diff --git a/src/java/org/apache/cassandra/service/DefaultFSErrorHandler.java b/src/java/org/apache/cassandra/service/DefaultFSErrorHandler.java
index d5e3e531c1c5..aae77c8a2aa7 100644
--- a/src/java/org/apache/cassandra/service/DefaultFSErrorHandler.java
+++ b/src/java/org/apache/cassandra/service/DefaultFSErrorHandler.java
@@ -86,6 +86,9 @@ public void handleFSError(FSError e)
                         Keyspace.removeUnreadableSSTables(directory);
                 }
                 break;
+            case die:
+                JVMStabilityInspector.killCurrentJVM(e, false);
+                break;
             case ignore:
                 // already logged, so left nothing to do
                 break;
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index e8943fe09c4b..b3f6c9436956 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -1796,7 +1796,7 @@ private void invalidateDiskBoundaries()
             {
                 for (final ColumnFamilyStore store : cfs.concatWithIndexes())
                 {
-                    store.invalidateDiskBoundaries();
+                    store.invalidateLocalRangesAndDiskBoundaries();
                 }
             }
         }
diff --git a/src/java/org/apache/cassandra/tools/CompactionLogAnalyzer.java b/src/java/org/apache/cassandra/tools/CompactionLogAnalyzer.java
new file mode 100644
index 000000000000..8a164e918c0a
--- /dev/null
+++ b/src/java/org/apache/cassandra/tools/CompactionLogAnalyzer.java
@@ -0,0 +1,554 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.text.SimpleDateFormat;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import javax.annotation.Nullable;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.base.Throwables;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.Table;
+import com.google.common.io.ByteStreams;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.utils.EstimatedHistogram;
+import org.apache.cassandra.utils.FBUtilities;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+
+
+/**
+ * Analyzes a collection of CSV logs from the unified compaction strategy. Run with
+ *
+ *   tools/bin/analyzecompactionlog <path-to-directory-with-csvs>
+ *
+ * It will process the CSVs and create a compaction_report.html file in the target directory. The file is similar to our
+ * performance reports.
+ */
+public class CompactionLogAnalyzer
+{
+
+    private static final Options options = new Options();
+    private static CommandLine cmd;
+
+    public static final String OPTION_LIMIT = "l";
+    public static final String OPTION_RESOLUTION = "r";
+
+    static
+    {
+        DatabaseDescriptor.toolInitialization();
+        // -l <count>: cap on the number of events read from the first file (see readDataPoints).
+        Option optLimit = new Option(OPTION_LIMIT, true, "If specified, will only read this number of events " +
+                                                         "from the first file, and up to that time from the others.");
+        optLimit.setArgs(1);
+        options.addOption(optLimit);
+        // -r <millis>: report resolution. The two halves of the description must be joined by a space.
+        Option optResolution = new Option(OPTION_RESOLUTION, true, "The resolution of the produced " +
+                                                                   "report in milliseconds, 100 by default.");
+        optResolution.setArgs(1);
+        options.addOption(optResolution);
+    }
+
+    /**
+     * A data point represents both an input data point as well as aggregated data for a level or total.
+     */
+    static class DataPoint
+    {
+        String shardId;     // shard name taken from the CSV file name, or a label ("Total", "Level n") for aggregates
+        long timestamp;     // milliseconds since the epoch, parsed from the "Timestamp" column
+        int bucket;         // bucket index from the "Bucket" column; aggregates store their level index here
+        // number of sstables
+        int sstables;
+        // total size of the sstables, in bytes
+        long size;
+        // number of running compactions
+        int compactionsInProgress;
+        // number of compactions to do
+        int compactionsPending;
+        // bytes read per second
+        long readBytesPerSecond;
+        // bytes written per second
+        long writeBytesPerSecond;
+        // total bytes to compact
+        long totalBytes;
+        // remaining bytes to compact (total bytes minus the bytes already read)
+        long remainingReadBytes;
+        // 0/1 for an input point (more than T non-compacting sstables); number of such buckets in an aggregate
+        int bucketsAboveT;
+        // 0/1 for an input point (more than T*T non-compacting sstables); number of such buckets in an aggregate
+        int bucketsAboveT2;
+
+        /**
+         * Folds a new data point for a bucket into this aggregate: every counter grows by the value in
+         * {@code toAdd} and shrinks by the value in {@code toRemove} (the bucket's previous state, or an all-zero
+         * point when the bucket has not been seen before). The aggregate's timestamp becomes that of {@code toAdd}.
+         */
+        private void updateTotals(DataPoint toAdd, DataPoint toRemove)
+        {
+            timestamp = toAdd.timestamp;
+            compactionsInProgress += toAdd.compactionsInProgress - toRemove.compactionsInProgress;
+            compactionsPending += toAdd.compactionsPending - toRemove.compactionsPending;
+            sstables += toAdd.sstables - toRemove.sstables;
+            size += toAdd.size - toRemove.size;
+            readBytesPerSecond += toAdd.readBytesPerSecond - toRemove.readBytesPerSecond;
+            writeBytesPerSecond += toAdd.writeBytesPerSecond - toRemove.writeBytesPerSecond;
+            totalBytes += toAdd.totalBytes - toRemove.totalBytes;
+            remainingReadBytes += toAdd.remainingReadBytes - toRemove.remainingReadBytes;
+            bucketsAboveT += toAdd.bucketsAboveT - toRemove.bucketsAboveT;
+            bucketsAboveT2 += toAdd.bucketsAboveT2 - toRemove.bucketsAboveT2;
+        }
+    }
+
+
+    final static Pattern CSVNamePattern = Pattern.compile("compaction-(\\w+)-(.*?)-(.*?)-(.*)\\.csv");
+    final static Pattern HumanReadablePattern = Pattern.compile("(\\d+(\\.\\d+)?)([ KMGTP])iB(/s)?");
+    final static String HumanReadablePowers = " KMGTP";
+    private static final String fullDateFormatter = "yyyy-MM-dd' 'HH:mm:ss.SSS";
+
+    static int reportResolutionInMs;
+
+    // Indexes of the relevant columns in the source CSV, set by initializeIndexes below.
+    static int timestampIndex = -1;
+    static int eventIndex;
+    static int bucketIndex;
+    static int sstablesIndex;
+    static int compactingSstablesIndex;
+    static int sizeIndex;
+    static int compactionsIndex;
+    static int readPerSecIndex;
+    static int writePerSecIndex;
+    static int sizesIndex;
+    static int Tindex;
+
+    /** Resolves the CSV column positions from the header line; only the first successful call does any work. */
+    private static synchronized void initializeIndexes(String header)
+    {
+        // The method lock publishes all indexes to the parallel file readers. timestampIndex doubles as the
+        // "initialized" marker, so it is assigned last: a failed lookup before it leaves the marker unset.
+        if (timestampIndex >= 0)
+            return;
+
+        Map<String, Integer> indexMap = new HashMap<>();
+        String[] headers = header.split(",");
+        for (int i = 0; i < headers.length; ++i)
+            indexMap.put(headers[i], i);
+
+        eventIndex = indexMap.get("Event");
+        bucketIndex = indexMap.get("Bucket");
+        sstablesIndex = indexMap.get("Tot. SSTables");
+        compactingSstablesIndex = indexMap.get("Comp. SSTables");
+        // Prefer the "Tot." column names; fall back to the alternative spelling (presumably an older log format).
+        sizeIndex = indexMap.getOrDefault("Tot. size (bytes)", indexMap.get("Size (bytes)"));
+        compactionsIndex = indexMap.get("Compactions");
+        readPerSecIndex = indexMap.get("Read (bytes/sec)");
+        writePerSecIndex = indexMap.get("Write (bytes/sec)");
+        sizesIndex = indexMap.getOrDefault("Tot. comp. size/Read/Written (bytes)",
+                                           indexMap.get("Tot/Read/Written"));
+        Tindex = indexMap.get("T");
+        timestampIndex = indexMap.get("Timestamp");
+    }
+
+    static DataPoint parse(String shardId, String dataLine) throws java.text.ParseException
+    {
+        String[] data = dataLine.split(",");
+        // Column positions come from the static indexes, so initializeIndexes must have processed a header first.
+        DataPoint dp = new DataPoint();
+        dp.shardId = shardId;
+        dp.timestamp = getTimestamp(data[timestampIndex]);
+        dp.bucket = Integer.parseInt(data[bucketIndex]);
+        dp.sstables = Integer.parseInt(data[sstablesIndex]);
+        dp.size = parseHumanReadable(data[sizeIndex]);
+        final String[] compactions = data[compactionsIndex].split("/");    // read here as "pending/in progress"
+        dp.compactionsInProgress = Integer.parseInt(compactions[1]);
+        dp.compactionsPending = Integer.parseInt(compactions[0]);
+        dp.readBytesPerSecond = parseHumanReadable(data[readPerSecIndex]);
+        dp.writeBytesPerSecond = parseHumanReadable(data[writePerSecIndex]);
+        String[] sizes = data[sizesIndex].split("/");                       // "total/read/written" per the column header
+        dp.totalBytes = parseHumanReadable(sizes[0]);
+        dp.remainingReadBytes = dp.totalBytes - parseHumanReadable(sizes[1]);   // total minus the "read" component
+        int T = Integer.parseInt(data[Tindex]);
+        int compactingSSTables = Integer.parseInt(data[compactingSstablesIndex].split("/")[1]);    // NOTE(review): second "/" component; confirm the column layout
+        int nonCompacting = dp.sstables - compactingSSTables;
+        dp.bucketsAboveT = nonCompacting > T ? 1 : 0;          // 0/1 flags, summed into counts by updateTotals
+        dp.bucketsAboveT2 = nonCompacting > T*T ? 1 : 0;
+        return dp;
+    }
+
+    private static long getTimestamp(String datum) throws java.text.ParseException
+    {
+        Date date = new SimpleDateFormat(fullDateFormatter).parse(datum);  // a new formatter per call: nothing is shared between the parallel readers
+        return date.getTime();  // milliseconds since the epoch
+    }
+
+    /** Converts a value such as "512 iB", "1.5KiB" or "3.2MiB/s" to a number of bytes (any fraction is truncated). */
+    private static long parseHumanReadable(String datum)
+    {
+        Matcher m = HumanReadablePattern.matcher(datum);
+        if (!m.matches())
+            throw new AssertionError("Not a human-readable size: '" + datum + "'");
+        double v = Double.parseDouble(m.group(1));
+        int power = HumanReadablePowers.indexOf(m.group(3).charAt(0));   // ' ' -> 0, K -> 1, M -> 2, ...
+        return (long) Math.scalb(v, 10 * power);                          // v * 2^(10 * power)
+    }
+
+    /** Writes graph.html to htmlFile as UTF-8 with its stats placeholder replaced by the JSON (quoted, so '\' and '$' stay literal). */
+    public static void generateGraph(File htmlFile, JSONObject stats)
+    {
+        try (PrintWriter out = new PrintWriter(htmlFile, StandardCharsets.UTF_8.name()))
+        {
+            String statsBlock = "/* stats start */\nstats = " + stats.toJSONString() + ";\n/* stats end */\n";
+            out.write(getGraphHTML().replaceFirst("/\\* stats start \\*/\n\n/\\* stats end \\*/\n", Matcher.quoteReplacement(statsBlock)));
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException("Couldn't write stats html.", e);
+        }
+    }
+
+    private static String getGraphHTML()
+    {
+        try (InputStream graphHTMLRes = CompactionLogAnalyzer.class.getClassLoader().getResourceAsStream("org/apache/cassandra/graph/graph.html"))
+        {
+            return new String(ByteStreams.toByteArray(graphHTMLRes), StandardCharsets.UTF_8);  // bundled template, decoded as UTF-8 rather than the platform charset
+        }
+        catch (IOException e)
+        {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Entry point: parses the options, reads the CSV logs in the given directory and writes compaction_report.html there. */
+    public static void main(String[] args) throws Exception
+    {
+        CommandLineParser parser = new PosixParser();
+        try
+        {
+            cmd = parser.parse(options, args);
+        }
+        catch (ParseException e1)
+        {
+            System.err.println(e1.getMessage());
+            printUsage();
+            System.exit(1);
+        }
+        if (cmd.getArgs().length != 1)
+        {
+            System.err.println("You must supply exactly one log csv path.");
+            printUsage();
+            System.exit(1);
+        }
+
+        File logPath = new File(cmd.getArgs()[0]);
+        File[] files = logPath.listFiles(f -> CSVNamePattern.matcher(f.getName()).matches());
+        if (files == null || files.length == 0)    // null: the path is not a readable directory
+        {
+            System.err.println("No compaction CSV logs found in directory " + logPath);
+            System.exit(1);
+        }
+        Arrays.sort(files);
+
+        reportResolutionInMs = Integer.parseInt(cmd.getOptionValue(OPTION_RESOLUTION, "100"));
+        final String limitOption = cmd.getOptionValue(OPTION_LIMIT);
+        Integer lineCountLimit = limitOption == null ? null : Integer.parseInt(limitOption);
+        List<DataPoint> dataPoints = readDataPoints(files, lineCountLimit);
+        dataPoints.sort((a, b) -> Long.compare(a.timestamp, b.timestamp));
+        JSONArray marr = processData(dataPoints);
+        JSONObject main = new JSONObject();
+        main.put("title", "Compaction report");
+        main.put("stats", marr);
+        generateGraph(new File(logPath.getPath() + File.separator + "compaction_report.html"), main);
+        System.exit(0);
+    }
+
+    @VisibleForTesting
+    static List<DataPoint> readDataPoints(File[] files, @Nullable Integer lineCountLimit) throws IOException, java.text.ParseException
+    {
+        List<DataPoint> dataPoints;
+        // Reads every file into one list; a null lineCountLimit means "no limit". The result is not sorted by time.
+        if (lineCountLimit != null)
+        {
+            long timestampLimit = Long.MAX_VALUE;
+            dataPoints = new ArrayList<>();
+            // Sequential on purpose: the timestamp cut-off returned for one file is applied to the files after it.
+            for (File file : files)
+                timestampLimit = readDataPoints(dataPoints, lineCountLimit, timestampLimit, file);
+        }
+        else
+        {
+            // Reading the files can take a long time. Do it in parallel.
+            dataPoints = Arrays.stream(files)
+                               .parallel()
+                               .flatMap(file ->
+                                    {
+                                        List<DataPoint> pts = new ArrayList<>();
+                                        try
+                                        {
+                                            readDataPoints(pts, Integer.MAX_VALUE, Long.MAX_VALUE, file);
+                                            return pts.stream();
+                                        }
+                                        catch (Exception e)
+                                        {
+                                            throw Throwables.propagate(e);  // the lambda cannot throw checked exceptions
+                                        }
+                                    })
+                               .collect(Collectors.toList());
+        }
+
+        return dataPoints;
+    }
+
+    /** Appends the data points of one shard's CSV file to dataPoints; returns the (possibly lowered) timestamp limit. */
+    private static long readDataPoints(List<DataPoint> dataPoints, int lineCountLimit, long timestampLimit, File file) throws IOException, java.text.ParseException
+    {
+        Matcher m = CSVNamePattern.matcher(file.getName());
+        if (!m.matches())
+            throw new AssertionError("Unexpected compaction log file name: " + file.getName());
+
+        String shardId = m.group(4);
+        try (BufferedReader rdr = Files.newBufferedReader(file.toPath(), StandardCharsets.UTF_8))
+        {
+            String header = rdr.readLine();
+            if (header != null)     // an empty file has neither a header nor data
+                initializeIndexes(header);
+
+            DataPoint curr = null;
+            int lineCount = 0;
+            String line;
+            // A null from readLine() is the end-of-file signal; ready() only says whether a read would block.
+            while ((line = rdr.readLine()) != null)
+            {
+                if (++lineCount > lineCountLimit && curr != null)
+                {
+                    timestampLimit = curr.timestamp;
+                    break;
+                }
+                if (line.isEmpty())
+                    continue;
+                curr = parse(shardId, line);
+                if (curr.timestamp > timestampLimit)
+                    break;
+                dataPoints.add(curr);
+            }
+            System.out.format("%d data points processed for shard %s.\n", lineCount, shardId);
+        }
+        return timestampLimit;
+    }
+
+    @VisibleForTesting
+    static JSONArray processData(List<DataPoint> dataPoints)
+    {
+        int levels = dataPoints.stream().mapToInt(dp -> dp.bucket).max().getAsInt() + 1;
+        // One level per bucket index. NOTE: max().getAsInt() throws NoSuchElementException when dataPoints is empty.
+        // Prepare the JSON objects representing the data in the report
+        JSONArray marr = new JSONArray();
+        // Slot i < levels aggregates bucket i across shards; the extra slot at index levels holds the grand total.
+        JSONArray[] intervalsPerLevel = new JSONArray[levels + 1];
+        Table<String, Integer, DataPoint> progressMap = HashBasedTable.create();
+        DataPoint totals = new DataPoint();
+        DataPoint[] perLevel = new DataPoint[levels + 1];
+        perLevel[levels] = totals;
+        DataPoint zero = new DataPoint();
+        totals.shardId = "Total";
+        totals.bucket = levels;
+
+        JSONArray metricsHeader = makeMetricsHeader();
+        for (int i = 0; i < levels; ++i)
+        {
+            perLevel[i] = new DataPoint();
+            perLevel[i].shardId = "Level " + i;
+            perLevel[i].bucket = i;
+        }
+
+        // One JSON series per aggregate; its "intervals" array is the one later handed to report().
+        for (int i = 0; i <= levels; ++i)
+        {
+            intervalsPerLevel[i] = new JSONArray();
+
+            JSONObject stats = new JSONObject();
+            stats.put("revision", perLevel[i].shardId);
+            stats.put("test", "Compaction");
+            stats.put("metrics", metricsHeader);
+            stats.put("intervals", intervalsPerLevel[i]);
+            marr.add(stats);
+        }
+
+        System.out.println("Totals");
+        System.out.format("%25s %8s %9s %15s %15s %15s %15s\n", "Timestamp", "SSTables", "Run/Pendg", "Read tput", "Write tput", "TotalCompBytes", "RemCompBytes");
+
+        // Process the data points to compile aggregate state and report it with the specified resolution.
+        long startTimestamp = -1;
+        int count = 0;
+        for (DataPoint dp : dataPoints)
+        {
+            // Data points replace previous data for the given bucket. This map is used to find what is replaced.
+            DataPoint prev = progressMap.get(dp.shardId, dp.bucket);
+            if (prev == null)
+                prev = zero;
+            // Report the state so far (before folding dp in) once dp is reportResolutionInMs or more past the last aggregated point.
+            if (startTimestamp == -1)
+                startTimestamp = dp.timestamp;
+            else if (dp.timestamp >= totals.timestamp + reportResolutionInMs)
+            {
+                report(intervalsPerLevel, progressMap, perLevel, startTimestamp);
+                ++count;
+            }
+
+            totals.updateTotals(dp, prev);
+            perLevel[dp.bucket].updateTotals(dp, prev);
+            progressMap.put(dp.shardId, dp.bucket, dp);
+        }
+        report(intervalsPerLevel, progressMap, perLevel, startTimestamp);
+        ++count;
+        // The call above is the final report, covering the state after the last data point.
+        System.out.format("Wrote %d datapoints, spanning %.1f seconds\n", count, (totals.timestamp - startTimestamp) / 1000.0);
+        return marr;
+    }
+
+    private static void report(JSONArray[] intervalsPerLevel,
+                               Table<String, Integer, DataPoint> progressMap,
+                               DataPoint[] perLevel,
+                               long startTimestamp)
+    {
+        // Build one histogram of per-bucket sstable counts for each level, plus a final one for the totals.
+        final int slots = perLevel.length;
+        final int totalsIndex = slots - 1;
+        EstimatedHistogram[] histograms = new EstimatedHistogram[slots];
+        for (int slot = 0; slot < slots; slot++)
+            histograms[slot] = new EstimatedHistogram();
+        EstimatedHistogram overall = histograms[totalsIndex];
+        for (DataPoint current : progressMap.values())
+        {
+            overall.add(current.sstables);
+            histograms[current.bucket].add(current.sstables);
+        }
+
+        print(perLevel[totalsIndex]); // only the totals are echoed to the console
+        for (int slot = 0; slot < slots; slot++)
+            addMetrics(perLevel[slot], intervalsPerLevel[slot], startTimestamp, histograms[slot]);
+    }
+
+    private static JSONArray makeMetricsHeader()
+    {
+        // Column names, in the order in which addMetrics appends the corresponding values.
+        String[] columns = {
+            "SSTables",
+            "Size MB",
+            "Running compactions", "Pending compactions",
+            "Read throughput MB/s", "Write throughput MB/s",
+            "Read throughput per thread MB/s", "Write throughput per thread MB/s",
+            "Total GB to compact", "Remaining GB to compact",
+            "Number of buckets above T sstables",
+            "Number of buckets above T^2 sstables",
+            "Max SSTables in bucket",
+            "90th percentile SSTables in bucket",
+            "50th percentile SSTables in bucket",
+            "time"
+        };
+
+        JSONArray header = new JSONArray();
+        for (String column : columns)
+            header.add(column);
+        return header;
+    }
+
+    /** Appends one row of values for {@code totals} to {@code intervals}, in the column order of makeMetricsHeader. */
+    private static void addMetrics(DataPoint totals, JSONArray intervals, long startTimestamp, EstimatedHistogram hist)
+    {
+        if (totals.timestamp < startTimestamp)
+            return; // nothing to add yet
+
+        // scalb(x, -20) and scalb(x, -30) divide by 2^20 and 2^30. The operands are widened to double explicitly
+        // because an integral argument would otherwise select Math.scalb(float, int) and lose precision.
+        JSONArray metrics = new JSONArray();
+        metrics.add(totals.sstables);
+        metrics.add(Math.scalb((double) totals.size, -20));
+        metrics.add(totals.compactionsInProgress);
+        metrics.add(totals.compactionsPending);
+        metrics.add(Math.scalb((double) totals.readBytesPerSecond, -20));
+        metrics.add(Math.scalb((double) totals.writeBytesPerSecond, -20));
+        if (totals.compactionsInProgress > 0)
+        {
+            long readThroughput = totals.readBytesPerSecond / totals.compactionsInProgress;
+            long writeThroughput = totals.writeBytesPerSecond / totals.compactionsInProgress;
+            metrics.add(Math.scalb((double) readThroughput, -20));
+            metrics.add(Math.scalb((double) writeThroughput, -20));
+        }
+        else
+        {
+            metrics.add(null); // no compaction is running, so there is no per-thread figure
+            metrics.add(null);
+        }
+        metrics.add(Math.scalb((double) totals.totalBytes, -30));
+        metrics.add(Math.scalb((double) totals.remainingReadBytes, -30));
+        metrics.add(totals.bucketsAboveT);
+        metrics.add(totals.bucketsAboveT2);
+        metrics.add(hist.max());
+        metrics.add(hist.percentile(0.90));
+        metrics.add(hist.percentile(0.50));
+        metrics.add((totals.timestamp - startTimestamp) / 1000.0);
+        intervals.add(metrics);
+    }
+
+    static void print(DataPoint dp)
+    {
+        // One console row per call; the values are formatted up front so the format call stays readable.
+        String when = new SimpleDateFormat(fullDateFormatter).format(new Date(dp.timestamp));
+        String readRate = FBUtilities.prettyPrintMemory(dp.readBytesPerSecond);
+        String writeRate = FBUtilities.prettyPrintMemory(dp.writeBytesPerSecond);
+        String total = FBUtilities.prettyPrintMemory(dp.totalBytes);
+        String remaining = FBUtilities.prettyPrintMemory(dp.remainingReadBytes);
+        System.out.format("%25s %8s %3d/%5d %13s/s %13s/s %15s %15s\n",
+                          when, dp.sstables, dp.compactionsInProgress, dp.compactionsPending,
+                          readRate, writeRate, total, remaining);
+    }
+
+    private static void printUsage()
+    {
+        // The usage line is passed first; the description of the expected input and the output is the help header.
+        String help = "Perform an analysis of the UCS compaction log.\n\n"
+                      + "The input is a directory that contains the per-shard CSV files generated using the "
+                      + "'logAll: true' flag by the unified compaction strategy.\n"
+                      + "Constructs a compaction_report.html in the target directory with summarized metrics.";
+        new HelpFormatter().printHelp(String.format("analyzecompactionlog <options> <log csvs path>%n"), help, options, "");
+    }
+}
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java
index 558816725b69..457233bb1f38 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -1601,6 +1601,7 @@ public Object getCompactionMetric(String metricName)
                 case "CompletedTasks":
                 case "PendingTasks":
                 case "PendingTasksByTableName":
+                case "WriteAmplificationByTableName":
                 case "AggregateCompactions":
                     return JMX.newMBeanProxy(mbeanServerConn,
                             new ObjectName("org.apache.cassandra.metrics:type=Compaction,name=" + metricName),
diff --git a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
index 1eda9af15198..6d5efe88f73f 100644
--- a/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
+++ b/src/java/org/apache/cassandra/tools/StandaloneScrubber.java
@@ -242,7 +242,7 @@ public static void main(String args[])
             }
 
             // Check (and repair) manifests
-            checkManifest(cfs.getCompactionStrategyManager(), cfs, sstables);
+            checkManifest(cfs, sstables);
             CompactionManager.instance.finishCompactionsAndShutdown(5, TimeUnit.MINUTES);
             LifecycleTransaction.waitForDeletions();
             System.exit(0); // We need that to stop non daemonized threads
@@ -256,13 +256,14 @@ public static void main(String args[])
         }
     }
 
-    private static void checkManifest(CompactionStrategyManager strategyManager, ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
+    private static void checkManifest(ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
     {
-        if (strategyManager.getCompactionParams().klass().equals(LeveledCompactionStrategy.class))
+        if (cfs.getCompactionParams().klass().equals(LeveledCompactionStrategy.class))
         {
-            int maxSizeInMB = (int)((cfs.getCompactionStrategyManager().getMaxSSTableBytes()) / (1024L * 1024L));
-            int fanOut = cfs.getCompactionStrategyManager().getLevelFanoutSize();
-            for (AbstractStrategyHolder.GroupedSSTableContainer sstableGroup : strategyManager.groupSSTables(sstables))
+            int maxSizeInMB = (int)((cfs.getCompactionStrategy().getMaxSSTableBytes()) / (1024L * 1024L));
+            int fanOut = cfs.getCompactionStrategy().getLevelFanoutSize();
+            CompactionStrategyManager csm = (CompactionStrategyManager) cfs.getCompactionStrategyContainer();
+            for (AbstractStrategyHolder.GroupedSSTableContainer sstableGroup : csm.groupSSTables(sstables))
             {
                 for (int i = 0; i < sstableGroup.numGroups(); i++)
                 {
diff --git a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
index a3abeea5de65..0fa612dd7819 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java
@@ -57,23 +57,35 @@ public void execute(NodeProbe probe)
         CompactionManagerMBean cm = probe.getCompactionManagerProxy();
         Map<String, Map<String, Integer>> pendingTaskNumberByTable =
             (Map<String, Map<String, Integer>>) probe.getCompactionMetric("PendingTasksByTableName");
+        Map<String, Map<String, Double>> writeAmplificationByTableName =
+            (Map<String, Map<String, Double>>) probe.getCompactionMetric("WriteAmplificationByTableName");
         int numTotalPendingTask = 0;
+        double totWriteAmplification = 0;
         for (Entry<String, Map<String, Integer>> ksEntry : pendingTaskNumberByTable.entrySet())
         {
+            Map<String, Double> ksWriteAmplification = writeAmplificationByTableName.get(ksEntry.getKey());
             for (Entry<String, Integer> tableEntry : ksEntry.getValue().entrySet())
+            {
                 numTotalPendingTask += tableEntry.getValue();
+                if (ksWriteAmplification != null) // a table absent from the map counts as 0 instead of failing on unboxing
+                    totWriteAmplification += ksWriteAmplification.getOrDefault(tableEntry.getKey(), 0.0);
+            }
         }
 
         out.println("pending tasks: " + numTotalPendingTask);
+        out.println(String.format("write amplification: %.2f", totWriteAmplification)); // same stream as the lines around it
         for (Entry<String, Map<String, Integer>> ksEntry : pendingTaskNumberByTable.entrySet())
         {
             String ksName = ksEntry.getKey();
+            Map<String, Double> ksWriteAmplification = writeAmplificationByTableName.get(ksName);
             for (Entry<String, Integer> tableEntry : ksEntry.getValue().entrySet())
             {
                 String tableName = tableEntry.getKey();
                 int pendingTaskCount = tableEntry.getValue();
 
-                out.println("- " + ksName + '.' + tableName + ": " + pendingTaskCount);
+                double wa = ksWriteAmplification == null ? 0 : ksWriteAmplification.getOrDefault(tableName, 0.0);
+                out.println(String.format("- %s.%s: %d", ksName, tableName, pendingTaskCount));
+                out.println(String.format("- %s.%s write amplification.: %.2f", ksName, tableName, wa));
             }
         }
         out.println();
diff --git a/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java b/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java
index 8b9b7222415c..1f11a16925c1 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java
@@ -217,7 +217,7 @@ private void initializeKeyspaces(NodeProbe probe, boolean ignore, List<String> t
                 statsTable.oldSSTableCount = probe.getColumnFamilyMetric(keyspaceName, tableName, "OldVersionSSTableCount");
 
                 int[] leveledSStables = table.getSSTableCountPerLevel();
-                if (leveledSStables != null)
+                if (leveledSStables.length > 0)
                 {
                     statsTable.isLeveledSstable = true;
 
diff --git a/src/java/org/apache/cassandra/utils/ExpMovingAverage.java b/src/java/org/apache/cassandra/utils/ExpMovingAverage.java
new file mode 100644
index 000000000000..2dc726a2e42d
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/ExpMovingAverage.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import com.google.common.util.concurrent.AtomicDouble;
+
+/**
+ * Sample-based exponential moving average. On every update a fraction of the current average is replaced by the new
+ * sample. New values have greater representation in the average, and older samples' effect exponentially decays with
+ * new data.
+ */
+public class ExpMovingAverage implements MovingAverage
+{
+    /** The ratio of decay, between 0 and 1, where smaller alpha means values are averaged over more samples */
+    private final double alpha;
+
+    /** The long term average with exponential decay; NaN until the first sample arrives */
+    private final AtomicDouble average = new AtomicDouble(Double.NaN);
+
+    /**
+     * Create a {@link ExpMovingAverage} where older values have less than 1% effect after 1000 samples.
+     */
+    public static ExpMovingAverage decayBy1000()
+    {
+        return new ExpMovingAverage(0.0046);
+    }
+
+    /**
+     * Create a {@link ExpMovingAverage} where older values have about 1% effect (0.955^100) after 100 samples.
+     */
+    public static ExpMovingAverage decayBy100()
+    {
+        return new ExpMovingAverage(0.045);
+    }
+
+    /**
+     * Create a {@link ExpMovingAverage} where older values have less than 1% effect after 10 samples.
+     */
+    public static ExpMovingAverage decayBy10()
+    {
+        return new ExpMovingAverage(0.37);
+    }
+
+    /**
+     * Create a {@link ExpMovingAverage} where older values have less effect than the given ratio after the given
+     * number of samples.
+     */
+    public static ExpMovingAverage withDecay(double ratio, int samples)
+    {
+        assert ratio > 0.0 && ratio < 1.0;
+        assert samples > 0;
+        return new ExpMovingAverage(1 - Math.pow(ratio, 1.0 / samples));
+    }
+
+    ExpMovingAverage(double alpha)
+    {
+        assert alpha > 0.0 && alpha <= 1.0;
+        this.alpha = alpha;
+    }
+
+    @Override
+    public MovingAverage update(double val)
+    {
+        double current, update;
+        do
+        {
+            current = average.get();
+            // compareAndSet below fails if the average changed since this read; the loop then starts over
+            if (!Double.isNaN(current))
+                update = current + alpha * (val - current);
+            else
+                update = val;   // Not initialized yet. Incidentally, passing NaN will cause reinitialization on the
+                                // next update.
+        }
+        while (!average.compareAndSet(current, update));
+
+        return this;
+    }
+
+    @Override
+    public double get()
+    {
+        return average.get(); // NaN if no sample has been recorded yet
+    }
+
+    @Override
+    public String toString()
+    {
+        return String.format("%.2f", get());
+    }
+}
diff --git a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
index 6ef6310f06c6..e4da30db4752 100644
--- a/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
+++ b/src/java/org/apache/cassandra/utils/JVMStabilityInspector.java
@@ -205,6 +205,11 @@ public static Killer replaceKiller(Killer newKiller)
         return oldKiller;
     }
 
+    public static Killer killer()
+    {
+        return killer; // the Killer currently held in the static field of the same name
+    }
+
     @VisibleForTesting
     public static class Killer
     {
diff --git a/src/java/org/apache/cassandra/utils/MovingAverage.java b/src/java/org/apache/cassandra/utils/MovingAverage.java
new file mode 100644
index 000000000000..11f97934eeb8
--- /dev/null
+++ b/src/java/org/apache/cassandra/utils/MovingAverage.java
@@ -0,0 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+public interface MovingAverage
+{
+    MovingAverage update(double val); // folds a new sample into the average; ExpMovingAverage returns itself
+    // NOTE(review): ExpMovingAverage.get() yields NaN before its first update; presumably callers must allow for that
+    double get();
+}
diff --git a/src/resources/org/apache/cassandra/graph/graph.html b/src/resources/org/apache/cassandra/graph/graph.html
new file mode 100644
index 000000000000..bf6dd9fd7ace
--- /dev/null
+++ b/src/resources/org/apache/cassandra/graph/graph.html
@@ -0,0 +1,568 @@
+<!DOCTYPE html>
+<!--
+  Copyright DataStax, Inc.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<!-- cstar_perf (https://github.com/datastax/cstar_perf) graphing
+utility adapted for use directly with command line cassandra-stress -->
+
+<head>
+  <meta charset="utf-8">
+  <script language="javascript" type="text/javascript">
+    <!--
+
+/* stats start */
+
+/* stats end */
+
+/*! jQuery v1.11.1 | (c) 2005, 2014 jQuery Foundation, Inc. | jquery.org/license */
+!function(a,b){"object"==typeof module&&"object"==typeof module.exports?module.exports=a.document?b(a,!0):function(a){if(!a.document)throw new Error("jQuery requires a window with a document");return b(a)}:b(a)}("undefined"!=typeof window?window:this,function(a,b){var c=[],d=c.slice,e=c.concat,f=c.push,g=c.indexOf,h={},i=h.toString,j=h.hasOwnProperty,k={},l="1.11.1",m=function(a,b){return new m.fn.init(a,b)},n=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,o=/^-ms-/,p=/-([\da-z])/gi,q=function(a,b){return b.toUpperCase()};m.fn=m.prototype={jquery:l,constructor:m,selector:"",length:0,toArray:function(){return d.call(this)},get:function(a){return null!=a?0>a?this[a+this.length]:this[a]:d.call(this)},pushStack:function(a){var b=m.merge(this.constructor(),a);return b.prevObject=this,b.context=this.context,b},each:function(a,b){return m.each(this,a,b)},map:function(a){return this.pushStack(m.map(this,function(b,c){return a.call(b,c,b)}))},slice:function(){return this.pushStack(d.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},eq:function(a){var b=this.length,c=+a+(0>a?b:0);return this.pushStack(c>=0&&b>c?[this[c]]:[])},end:function(){return this.prevObject||this.constructor(null)},push:f,sort:c.sort,splice:c.splice},m.extend=m.fn.extend=function(){var a,b,c,d,e,f,g=arguments[0]||{},h=1,i=arguments.length,j=!1;for("boolean"==typeof g&&(j=g,g=arguments[h]||{},h++),"object"==typeof g||m.isFunction(g)||(g={}),h===i&&(g=this,h--);i>h;h++)if(null!=(e=arguments[h]))for(d in e)a=g[d],c=e[d],g!==c&&(j&&c&&(m.isPlainObject(c)||(b=m.isArray(c)))?(b?(b=!1,f=a&&m.isArray(a)?a:[]):f=a&&m.isPlainObject(a)?a:{},g[d]=m.extend(j,f,c)):void 0!==c&&(g[d]=c));return g},m.extend({expando:"jQuery"+(l+Math.random()).replace(/\D/g,""),isReady:!0,error:function(a){throw new Error(a)},noop:function(){},isFunction:function(a){return"function"===m.type(a)},isArray:Array.isArray||function(a){return"array"===m.type(a)},isWindow:function(a){return 
null!=a&&a==a.window},isNumeric:function(a){return!m.isArray(a)&&a-parseFloat(a)>=0},isEmptyObject:function(a){var b;for(b in a)return!1;return!0},isPlainObject:function(a){var b;if(!a||"object"!==m.type(a)||a.nodeType||m.isWindow(a))return!1;try{if(a.constructor&&!j.call(a,"constructor")&&!j.call(a.constructor.prototype,"isPrototypeOf"))return!1}catch(c){return!1}if(k.ownLast)for(b in a)return j.call(a,b);for(b in a);return void 0===b||j.call(a,b)},type:function(a){return null==a?a+"":"object"==typeof a||"function"==typeof a?h[i.call(a)]||"object":typeof a},globalEval:function(b){b&&m.trim(b)&&(a.execScript||function(b){a.eval.call(a,b)})(b)},camelCase:function(a){return a.replace(o,"ms-").replace(p,q)},nodeName:function(a,b){return a.nodeName&&a.nodeName.toLowerCase()===b.toLowerCase()},each:function(a,b,c){var d,e=0,f=a.length,g=r(a);if(c){if(g){for(;f>e;e++)if(d=b.apply(a[e],c),d===!1)break}else for(e in a)if(d=b.apply(a[e],c),d===!1)break}else if(g){for(;f>e;e++)if(d=b.call(a[e],e,a[e]),d===!1)break}else for(e in a)if(d=b.call(a[e],e,a[e]),d===!1)break;return a},trim:function(a){return null==a?"":(a+"").replace(n,"")},makeArray:function(a,b){var c=b||[];return null!=a&&(r(Object(a))?m.merge(c,"string"==typeof a?[a]:a):f.call(c,a)),c},inArray:function(a,b,c){var d;if(b){if(g)return g.call(b,a,c);for(d=b.length,c=c?0>c?Math.max(0,d+c):c:0;d>c;c++)if(c in b&&b[c]===a)return c}return-1},merge:function(a,b){var c=+b.length,d=0,e=a.length;while(c>d)a[e++]=b[d++];if(c!==c)while(void 0!==b[d])a[e++]=b[d++];return a.length=e,a},grep:function(a,b,c){for(var d,e=[],f=0,g=a.length,h=!c;g>f;f++)d=!b(a[f],f),d!==h&&e.push(a[f]);return e},map:function(a,b,c){var d,f=0,g=a.length,h=r(a),i=[];if(h)for(;g>f;f++)d=b(a[f],f,c),null!=d&&i.push(d);else for(f in a)d=b(a[f],f,c),null!=d&&i.push(d);return e.apply([],i)},guid:1,proxy:function(a,b){var c,e,f;return"string"==typeof b&&(f=a[b],b=a,a=f),m.isFunction(a)?(c=d.call(arguments,2),e=function(){return 
a.apply(b||this,c.concat(d.call(arguments)))},e.guid=a.guid=a.guid||m.guid++,e):void 0},now:function(){return+new Date},support:k}),m.each("Boolean Number String Function Array Date RegExp Object Error".split(" "),function(a,b){h["[object "+b+"]"]=b.toLowerCase()});function r(a){var b=a.length,c=m.type(a);return"function"===c||m.isWindow(a)?!1:1===a.nodeType&&b?!0:"array"===c||0===b||"number"==typeof b&&b>0&&b-1 in a}var s=function(a){var b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u="sizzle"+-new Date,v=a.document,w=0,x=0,y=gb(),z=gb(),A=gb(),B=function(a,b){return a===b&&(l=!0),0},C="undefined",D=1<<31,E={}.hasOwnProperty,F=[],G=F.pop,H=F.push,I=F.push,J=F.slice,K=F.indexOf||function(a){for(var b=0,c=this.length;c>b;b++)if(this[b]===a)return b;return-1},L="checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped",M="[\\x20\\t\\r\\n\\f]",N="(?:\\\\.|[\\w-]|[^\\x00-\\xa0])+",O=N.replace("w","w#"),P="\\["+M+"*("+N+")(?:"+M+"*([*^$|!~]?=)"+M+"*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|("+O+"))|)"+M+"*\\]",Q=":("+N+")(?:\\((('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|((?:\\\\.|[^\\\\()[\\]]|"+P+")*)|.*)\\)|)",R=new RegExp("^"+M+"+|((?:^|[^\\\\])(?:\\\\.)*)"+M+"+$","g"),S=new RegExp("^"+M+"*,"+M+"*"),T=new RegExp("^"+M+"*([>+~]|"+M+")"+M+"*"),U=new RegExp("="+M+"*([^\\]'\"]*?)"+M+"*\\]","g"),V=new RegExp(Q),W=new RegExp("^"+O+"$"),X={ID:new RegExp("^#("+N+")"),CLASS:new RegExp("^\\.("+N+")"),TAG:new RegExp("^("+N.replace("w","w*")+")"),ATTR:new RegExp("^"+P),PSEUDO:new RegExp("^"+Q),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+L+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/^(?:input|select|textarea|button)$/i,Z=/^h\d$/i,$=/^[^{]+\{\s*\[native 
\w/,_=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ab=/[+~]/,bb=/'|\\/g,cb=new RegExp("\\\\([\\da-f]{1,6}"+M+"?|("+M+")|.)","ig"),db=function(a,b,c){var d="0x"+b-65536;return d!==d||c?b:0>d?String.fromCharCode(d+65536):String.fromCharCode(d>>10|55296,1023&d|56320)};try{I.apply(F=J.call(v.childNodes),v.childNodes),F[v.childNodes.length].nodeType}catch(eb){I={apply:F.length?function(a,b){H.apply(a,J.call(b))}:function(a,b){var c=a.length,d=0;while(a[c++]=b[d++]);a.length=c-1}}}function fb(a,b,d,e){var f,h,j,k,l,o,r,s,w,x;if((b?b.ownerDocument||b:v)!==n&&m(b),b=b||n,d=d||[],!a||"string"!=typeof a)return d;if(1!==(k=b.nodeType)&&9!==k)return[];if(p&&!e){if(f=_.exec(a))if(j=f[1]){if(9===k){if(h=b.getElementById(j),!h||!h.parentNode)return d;if(h.id===j)return d.push(h),d}else if(b.ownerDocument&&(h=b.ownerDocument.getElementById(j))&&t(b,h)&&h.id===j)return d.push(h),d}else{if(f[2])return I.apply(d,b.getElementsByTagName(a)),d;if((j=f[3])&&c.getElementsByClassName&&b.getElementsByClassName)return I.apply(d,b.getElementsByClassName(j)),d}if(c.qsa&&(!q||!q.test(a))){if(s=r=u,w=b,x=9===k&&a,1===k&&"object"!==b.nodeName.toLowerCase()){o=g(a),(r=b.getAttribute("id"))?s=r.replace(bb,"\\$&"):b.setAttribute("id",s),s="[id='"+s+"'] ",l=o.length;while(l--)o[l]=s+qb(o[l]);w=ab.test(a)&&ob(b.parentNode)||b,x=o.join(",")}if(x)try{return I.apply(d,w.querySelectorAll(x)),d}catch(y){}finally{r||b.removeAttribute("id")}}}return i(a.replace(R,"$1"),b,d,e)}function gb(){var a=[];function b(c,e){return a.push(c+" ")>d.cacheLength&&delete b[a.shift()],b[c+" "]=e}return b}function hb(a){return a[u]=!0,a}function ib(a){var b=n.createElement("div");try{return!!a(b)}catch(c){return!1}finally{b.parentNode&&b.parentNode.removeChild(b),b=null}}function jb(a,b){var c=a.split("|"),e=a.length;while(e--)d.attrHandle[c[e]]=b}function kb(a,b){var c=b&&a,d=c&&1===a.nodeType&&1===b.nodeType&&(~b.sourceIndex||D)-(~a.sourceIndex||D);if(d)return d;if(c)while(c=c.nextSibling)if(c===b)return-1;return a?1:-1}function 
lb(a){return function(b){var c=b.nodeName.toLowerCase();return"input"===c&&b.type===a}}function mb(a){return function(b){var c=b.nodeName.toLowerCase();return("input"===c||"button"===c)&&b.type===a}}function nb(a){return hb(function(b){return b=+b,hb(function(c,d){var e,f=a([],c.length,b),g=f.length;while(g--)c[e=f[g]]&&(c[e]=!(d[e]=c[e]))})})}function ob(a){return a&&typeof a.getElementsByTagName!==C&&a}c=fb.support={},f=fb.isXML=function(a){var b=a&&(a.ownerDocument||a).documentElement;return b?"HTML"!==b.nodeName:!1},m=fb.setDocument=function(a){var b,e=a?a.ownerDocument||a:v,g=e.defaultView;return e!==n&&9===e.nodeType&&e.documentElement?(n=e,o=e.documentElement,p=!f(e),g&&g!==g.top&&(g.addEventListener?g.addEventListener("unload",function(){m()},!1):g.attachEvent&&g.attachEvent("onunload",function(){m()})),c.attributes=ib(function(a){return a.className="i",!a.getAttribute("className")}),c.getElementsByTagName=ib(function(a){return a.appendChild(e.createComment("")),!a.getElementsByTagName("*").length}),c.getElementsByClassName=$.test(e.getElementsByClassName)&&ib(function(a){return a.innerHTML="<div class='a'></div><div class='a i'></div>",a.firstChild.className="i",2===a.getElementsByClassName("i").length}),c.getById=ib(function(a){return o.appendChild(a).id=u,!e.getElementsByName||!e.getElementsByName(u).length}),c.getById?(d.find.ID=function(a,b){if(typeof b.getElementById!==C&&p){var c=b.getElementById(a);return c&&c.parentNode?[c]:[]}},d.filter.ID=function(a){var b=a.replace(cb,db);return function(a){return a.getAttribute("id")===b}}):(delete d.find.ID,d.filter.ID=function(a){var b=a.replace(cb,db);return function(a){var c=typeof a.getAttributeNode!==C&&a.getAttributeNode("id");return c&&c.value===b}}),d.find.TAG=c.getElementsByTagName?function(a,b){return typeof b.getElementsByTagName!==C?b.getElementsByTagName(a):void 0}:function(a,b){var c,d=[],e=0,f=b.getElementsByTagName(a);if("*"===a){while(c=f[e++])1===c.nodeType&&d.push(c);return d}return 
f},d.find.CLASS=c.getElementsByClassName&&function(a,b){return typeof b.getElementsByClassName!==C&&p?b.getElementsByClassName(a):void 0},r=[],q=[],(c.qsa=$.test(e.querySelectorAll))&&(ib(function(a){a.innerHTML="<select msallowclip=''><option selected=''></option></select>",a.querySelectorAll("[msallowclip^='']").length&&q.push("[*^$]="+M+"*(?:''|\"\")"),a.querySelectorAll("[selected]").length||q.push("\\["+M+"*(?:value|"+L+")"),a.querySelectorAll(":checked").length||q.push(":checked")}),ib(function(a){var b=e.createElement("input");b.setAttribute("type","hidden"),a.appendChild(b).setAttribute("name","D"),a.querySelectorAll("[name=d]").length&&q.push("name"+M+"*[*^$|!~]?="),a.querySelectorAll(":enabled").length||q.push(":enabled",":disabled"),a.querySelectorAll("*,:x"),q.push(",.*:")})),(c.matchesSelector=$.test(s=o.matches||o.webkitMatchesSelector||o.mozMatchesSelector||o.oMatchesSelector||o.msMatchesSelector))&&ib(function(a){c.disconnectedMatch=s.call(a,"div"),s.call(a,"[s!='']:x"),r.push("!=",Q)}),q=q.length&&new RegExp(q.join("|")),r=r.length&&new RegExp(r.join("|")),b=$.test(o.compareDocumentPosition),t=b||$.test(o.contains)?function(a,b){var c=9===a.nodeType?a.documentElement:a,d=b&&b.parentNode;return a===d||!(!d||1!==d.nodeType||!(c.contains?c.contains(d):a.compareDocumentPosition&&16&a.compareDocumentPosition(d)))}:function(a,b){if(b)while(b=b.parentNode)if(b===a)return!0;return!1},B=b?function(a,b){if(a===b)return l=!0,0;var d=!a.compareDocumentPosition-!b.compareDocumentPosition;return d?d:(d=(a.ownerDocument||a)===(b.ownerDocument||b)?a.compareDocumentPosition(b):1,1&d||!c.sortDetached&&b.compareDocumentPosition(a)===d?a===e||a.ownerDocument===v&&t(v,a)?-1:b===e||b.ownerDocument===v&&t(v,b)?1:k?K.call(k,a)-K.call(k,b):0:4&d?-1:1)}:function(a,b){if(a===b)return l=!0,0;var c,d=0,f=a.parentNode,g=b.parentNode,h=[a],i=[b];if(!f||!g)return a===e?-1:b===e?1:f?-1:g?1:k?K.call(k,a)-K.call(k,b):0;if(f===g)return 
kb(a,b);c=a;while(c=c.parentNode)h.unshift(c);c=b;while(c=c.parentNode)i.unshift(c);while(h[d]===i[d])d++;return d?kb(h[d],i[d]):h[d]===v?-1:i[d]===v?1:0},e):n},fb.matches=function(a,b){return fb(a,null,null,b)},fb.matchesSelector=function(a,b){if((a.ownerDocument||a)!==n&&m(a),b=b.replace(U,"='$1']"),!(!c.matchesSelector||!p||r&&r.test(b)||q&&q.test(b)))try{var d=s.call(a,b);if(d||c.disconnectedMatch||a.document&&11!==a.document.nodeType)return d}catch(e){}return fb(b,n,null,[a]).length>0},fb.contains=function(a,b){return(a.ownerDocument||a)!==n&&m(a),t(a,b)},fb.attr=function(a,b){(a.ownerDocument||a)!==n&&m(a);var e=d.attrHandle[b.toLowerCase()],f=e&&E.call(d.attrHandle,b.toLowerCase())?e(a,b,!p):void 0;return void 0!==f?f:c.attributes||!p?a.getAttribute(b):(f=a.getAttributeNode(b))&&f.specified?f.value:null},fb.error=function(a){throw new Error("Syntax error, unrecognized expression: "+a)},fb.uniqueSort=function(a){var b,d=[],e=0,f=0;if(l=!c.detectDuplicates,k=!c.sortStable&&a.slice(0),a.sort(B),l){while(b=a[f++])b===a[f]&&(e=d.push(f));while(e--)a.splice(d[e],1)}return k=null,a},e=fb.getText=function(a){var b,c="",d=0,f=a.nodeType;if(f){if(1===f||9===f||11===f){if("string"==typeof a.textContent)return a.textContent;for(a=a.firstChild;a;a=a.nextSibling)c+=e(a)}else if(3===f||4===f)return a.nodeValue}else while(b=a[d++])c+=e(b);return c},d=fb.selectors={cacheLength:50,createPseudo:hb,match:X,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(a){return a[1]=a[1].replace(cb,db),a[3]=(a[3]||a[4]||a[5]||"").replace(cb,db),"~="===a[2]&&(a[3]=" "+a[3]+" "),a.slice(0,4)},CHILD:function(a){return a[1]=a[1].toLowerCase(),"nth"===a[1].slice(0,3)?(a[3]||fb.error(a[0]),a[4]=+(a[4]?a[5]+(a[6]||1):2*("even"===a[3]||"odd"===a[3])),a[5]=+(a[7]+a[8]||"odd"===a[3])):a[3]&&fb.error(a[0]),a},PSEUDO:function(a){var b,c=!a[6]&&a[2];return 
X.CHILD.test(a[0])?null:(a[3]?a[2]=a[4]||a[5]||"":c&&V.test(c)&&(b=g(c,!0))&&(b=c.indexOf(")",c.length-b)-c.length)&&(a[0]=a[0].slice(0,b),a[2]=c.slice(0,b)),a.slice(0,3))}},filter:{TAG:function(a){var b=a.replace(cb,db).toLowerCase();return"*"===a?function(){return!0}:function(a){return a.nodeName&&a.nodeName.toLowerCase()===b}},CLASS:function(a){var b=y[a+" "];return b||(b=new RegExp("(^|"+M+")"+a+"("+M+"|$)"))&&y(a,function(a){return b.test("string"==typeof a.className&&a.className||typeof a.getAttribute!==C&&a.getAttribute("class")||"")})},ATTR:function(a,b,c){return function(d){var e=fb.attr(d,a);return null==e?"!="===b:b?(e+="","="===b?e===c:"!="===b?e!==c:"^="===b?c&&0===e.indexOf(c):"*="===b?c&&e.indexOf(c)>-1:"$="===b?c&&e.slice(-c.length)===c:"~="===b?(" "+e+" ").indexOf(c)>-1:"|="===b?e===c||e.slice(0,c.length+1)===c+"-":!1):!0}},CHILD:function(a,b,c,d,e){var f="nth"!==a.slice(0,3),g="last"!==a.slice(-4),h="of-type"===b;return 1===d&&0===e?function(a){return!!a.parentNode}:function(b,c,i){var j,k,l,m,n,o,p=f!==g?"nextSibling":"previousSibling",q=b.parentNode,r=h&&b.nodeName.toLowerCase(),s=!i&&!h;if(q){if(f){while(p){l=b;while(l=l[p])if(h?l.nodeName.toLowerCase()===r:1===l.nodeType)return!1;o=p="only"===a&&!o&&"nextSibling"}return!0}if(o=[g?q.firstChild:q.lastChild],g&&s){k=q[u]||(q[u]={}),j=k[a]||[],n=j[0]===w&&j[1],m=j[0]===w&&j[2],l=n&&q.childNodes[n];while(l=++n&&l&&l[p]||(m=n=0)||o.pop())if(1===l.nodeType&&++m&&l===b){k[a]=[w,n,m];break}}else if(s&&(j=(b[u]||(b[u]={}))[a])&&j[0]===w)m=j[1];else while(l=++n&&l&&l[p]||(m=n=0)||o.pop())if((h?l.nodeName.toLowerCase()===r:1===l.nodeType)&&++m&&(s&&((l[u]||(l[u]={}))[a]=[w,m]),l===b))break;return m-=e,m===d||m%d===0&&m/d>=0}}},PSEUDO:function(a,b){var c,e=d.pseudos[a]||d.setFilters[a.toLowerCase()]||fb.error("unsupported pseudo: "+a);return e[u]?e(b):e.length>1?(c=[a,a,"",b],d.setFilters.hasOwnProperty(a.toLowerCase())?hb(function(a,c){var 
d,f=e(a,b),g=f.length;while(g--)d=K.call(a,f[g]),a[d]=!(c[d]=f[g])}):function(a){return e(a,0,c)}):e}},pseudos:{not:hb(function(a){var b=[],c=[],d=h(a.replace(R,"$1"));return d[u]?hb(function(a,b,c,e){var f,g=d(a,null,e,[]),h=a.length;while(h--)(f=g[h])&&(a[h]=!(b[h]=f))}):function(a,e,f){return b[0]=a,d(b,null,f,c),!c.pop()}}),has:hb(function(a){return function(b){return fb(a,b).length>0}}),contains:hb(function(a){return function(b){return(b.textContent||b.innerText||e(b)).indexOf(a)>-1}}),lang:hb(function(a){return W.test(a||"")||fb.error("unsupported lang: "+a),a=a.replace(cb,db).toLowerCase(),function(b){var c;do if(c=p?b.lang:b.getAttribute("xml:lang")||b.getAttribute("lang"))return c=c.toLowerCase(),c===a||0===c.indexOf(a+"-");while((b=b.parentNode)&&1===b.nodeType);return!1}}),target:function(b){var c=a.location&&a.location.hash;return c&&c.slice(1)===b.id},root:function(a){return a===o},focus:function(a){return a===n.activeElement&&(!n.hasFocus||n.hasFocus())&&!!(a.type||a.href||~a.tabIndex)},enabled:function(a){return a.disabled===!1},disabled:function(a){return a.disabled===!0},checked:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&!!a.checked||"option"===b&&!!a.selected},selected:function(a){return a.parentNode&&a.parentNode.selectedIndex,a.selected===!0},empty:function(a){for(a=a.firstChild;a;a=a.nextSibling)if(a.nodeType<6)return!1;return!0},parent:function(a){return!d.pseudos.empty(a)},header:function(a){return Z.test(a.nodeName)},input:function(a){return Y.test(a.nodeName)},button:function(a){var b=a.nodeName.toLowerCase();return"input"===b&&"button"===a.type||"button"===b},text:function(a){var b;return"input"===a.nodeName.toLowerCase()&&"text"===a.type&&(null==(b=a.getAttribute("type"))||"text"===b.toLowerCase())},first:nb(function(){return[0]}),last:nb(function(a,b){return[b-1]}),eq:nb(function(a,b,c){return[0>c?c+b:c]}),even:nb(function(a,b){for(var c=0;b>c;c+=2)a.push(c);return a}),odd:nb(function(a,b){for(var 
c=1;b>c;c+=2)a.push(c);return a}),lt:nb(function(a,b,c){for(var d=0>c?c+b:c;--d>=0;)a.push(d);return a}),gt:nb(function(a,b,c){for(var d=0>c?c+b:c;++d<b;)a.push(d);return a})}},d.pseudos.nth=d.pseudos.eq;for(b in{radio:!0,checkbox:!0,file:!0,password:!0,image:!0})d.pseudos[b]=lb(b);for(b in{submit:!0,reset:!0})d.pseudos[b]=mb(b);function pb(){}pb.prototype=d.filters=d.pseudos,d.setFilters=new pb,g=fb.tokenize=function(a,b){var c,e,f,g,h,i,j,k=z[a+" "];if(k)return b?0:k.slice(0);h=a,i=[],j=d.preFilter;while(h){(!c||(e=S.exec(h)))&&(e&&(h=h.slice(e[0].length)||h),i.push(f=[])),c=!1,(e=T.exec(h))&&(c=e.shift(),f.push({value:c,type:e[0].replace(R," ")}),h=h.slice(c.length));for(g in d.filter)!(e=X[g].exec(h))||j[g]&&!(e=j[g](e))||(c=e.shift(),f.push({value:c,type:g,matches:e}),h=h.slice(c.length));if(!c)break}return b?h.length:h?fb.error(a):z(a,i).slice(0)};function qb(a){for(var b=0,c=a.length,d="";c>b;b++)d+=a[b].value;return d}function rb(a,b,c){var d=b.dir,e=c&&"parentNode"===d,f=x++;return b.first?function(b,c,f){while(b=b[d])if(1===b.nodeType||e)return a(b,c,f)}:function(b,c,g){var h,i,j=[w,f];if(g){while(b=b[d])if((1===b.nodeType||e)&&a(b,c,g))return!0}else while(b=b[d])if(1===b.nodeType||e){if(i=b[u]||(b[u]={}),(h=i[d])&&h[0]===w&&h[1]===f)return j[2]=h[2];if(i[d]=j,j[2]=a(b,c,g))return!0}}}function sb(a){return a.length>1?function(b,c,d){var e=a.length;while(e--)if(!a[e](b,c,d))return!1;return!0}:a[0]}function tb(a,b,c){for(var d=0,e=b.length;e>d;d++)fb(a,b[d],c);return c}function ub(a,b,c,d,e){for(var f,g=[],h=0,i=a.length,j=null!=b;i>h;h++)(f=a[h])&&(!c||c(f,d,e))&&(g.push(f),j&&b.push(h));return g}function vb(a,b,c,d,e,f){return d&&!d[u]&&(d=vb(d)),e&&!e[u]&&(e=vb(e,f)),hb(function(f,g,h,i){var 
j,k,l,m=[],n=[],o=g.length,p=f||tb(b||"*",h.nodeType?[h]:h,[]),q=!a||!f&&b?p:ub(p,m,a,h,i),r=c?e||(f?a:o||d)?[]:g:q;if(c&&c(q,r,h,i),d){j=ub(r,n),d(j,[],h,i),k=j.length;while(k--)(l=j[k])&&(r[n[k]]=!(q[n[k]]=l))}if(f){if(e||a){if(e){j=[],k=r.length;while(k--)(l=r[k])&&j.push(q[k]=l);e(null,r=[],j,i)}k=r.length;while(k--)(l=r[k])&&(j=e?K.call(f,l):m[k])>-1&&(f[j]=!(g[j]=l))}}else r=ub(r===g?r.splice(o,r.length):r),e?e(null,g,r,i):I.apply(g,r)})}function wb(a){for(var b,c,e,f=a.length,g=d.relative[a[0].type],h=g||d.relative[" "],i=g?1:0,k=rb(function(a){return a===b},h,!0),l=rb(function(a){return K.call(b,a)>-1},h,!0),m=[function(a,c,d){return!g&&(d||c!==j)||((b=c).nodeType?k(a,c,d):l(a,c,d))}];f>i;i++)if(c=d.relative[a[i].type])m=[rb(sb(m),c)];else{if(c=d.filter[a[i].type].apply(null,a[i].matches),c[u]){for(e=++i;f>e;e++)if(d.relative[a[e].type])break;return vb(i>1&&sb(m),i>1&&qb(a.slice(0,i-1).concat({value:" "===a[i-2].type?"*":""})).replace(R,"$1"),c,e>i&&wb(a.slice(i,e)),f>e&&wb(a=a.slice(e)),f>e&&qb(a))}m.push(c)}return sb(m)}function xb(a,b){var c=b.length>0,e=a.length>0,f=function(f,g,h,i,k){var l,m,o,p=0,q="0",r=f&&[],s=[],t=j,u=f||e&&d.find.TAG("*",k),v=w+=null==t?1:Math.random()||.1,x=u.length;for(k&&(j=g!==n&&g);q!==x&&null!=(l=u[q]);q++){if(e&&l){m=0;while(o=a[m++])if(o(l,g,h)){i.push(l);break}k&&(w=v)}c&&((l=!o&&l)&&p--,f&&r.push(l))}if(p+=q,c&&q!==p){m=0;while(o=b[m++])o(r,s,g,h);if(f){if(p>0)while(q--)r[q]||s[q]||(s[q]=G.call(i));s=ub(s)}I.apply(i,s),k&&!f&&s.length>0&&p+b.length>1&&fb.uniqueSort(i)}return k&&(w=v,j=t),r};return c?hb(f):f}return h=fb.compile=function(a,b){var c,d=[],e=[],f=A[a+" "];if(!f){b||(b=g(a)),c=b.length;while(c--)f=wb(b[c]),f[u]?d.push(f):e.push(f);f=A(a,xb(e,d)),f.selector=a}return f},i=fb.select=function(a,b,e,f){var i,j,k,l,m,n="function"==typeof 
a&&a,o=!f&&g(a=n.selector||a);if(e=e||[],1===o.length){if(j=o[0]=o[0].slice(0),j.length>2&&"ID"===(k=j[0]).type&&c.getById&&9===b.nodeType&&p&&d.relative[j[1].type]){if(b=(d.find.ID(k.matches[0].replace(cb,db),b)||[])[0],!b)return e;n&&(b=b.parentNode),a=a.slice(j.shift().value.length)}i=X.needsContext.test(a)?0:j.length;while(i--){if(k=j[i],d.relative[l=k.type])break;if((m=d.find[l])&&(f=m(k.matches[0].replace(cb,db),ab.test(j[0].type)&&ob(b.parentNode)||b))){if(j.splice(i,1),a=f.length&&qb(j),!a)return I.apply(e,f),e;break}}}return(n||h(a,o))(f,b,!p,e,ab.test(a)&&ob(b.parentNode)||b),e},c.sortStable=u.split("").sort(B).join("")===u,c.detectDuplicates=!!l,m(),c.sortDetached=ib(function(a){return 1&a.compareDocumentPosition(n.createElement("div"))}),ib(function(a){return a.innerHTML="<a href='#'></a>","#"===a.firstChild.getAttribute("href")})||jb("type|href|height|width",function(a,b,c){return c?void 0:a.getAttribute(b,"type"===b.toLowerCase()?1:2)}),c.attributes&&ib(function(a){return a.innerHTML="<input/>",a.firstChild.setAttribute("value",""),""===a.firstChild.getAttribute("value")})||jb("value",function(a,b,c){return c||"input"!==a.nodeName.toLowerCase()?void 0:a.defaultValue}),ib(function(a){return null==a.getAttribute("disabled")})||jb(L,function(a,b,c){var d;return c?void 0:a[b]===!0?b.toLowerCase():(d=a.getAttributeNode(b))&&d.specified?d.value:null}),fb}(a);m.find=s,m.expr=s.selectors,m.expr[":"]=m.expr.pseudos,m.unique=s.uniqueSort,m.text=s.getText,m.isXMLDoc=s.isXML,m.contains=s.contains;var t=m.expr.match.needsContext,u=/^<(\w+)\s*\/?>(?:<\/\1>|)$/,v=/^.[^:#\[\.,]*$/;function w(a,b,c){if(m.isFunction(b))return m.grep(a,function(a,d){return!!b.call(a,d,a)!==c});if(b.nodeType)return m.grep(a,function(a){return a===b!==c});if("string"==typeof b){if(v.test(b))return m.filter(b,a,c);b=m.filter(b,a)}return m.grep(a,function(a){return m.inArray(a,b)>=0!==c})}m.filter=function(a,b,c){var d=b[0];return 
c&&(a=":not("+a+")"),1===b.length&&1===d.nodeType?m.find.matchesSelector(d,a)?[d]:[]:m.find.matches(a,m.grep(b,function(a){return 1===a.nodeType}))},m.fn.extend({find:function(a){var b,c=[],d=this,e=d.length;if("string"!=typeof a)return this.pushStack(m(a).filter(function(){for(b=0;e>b;b++)if(m.contains(d[b],this))return!0}));for(b=0;e>b;b++)m.find(a,d[b],c);return c=this.pushStack(e>1?m.unique(c):c),c.selector=this.selector?this.selector+" "+a:a,c},filter:function(a){return this.pushStack(w(this,a||[],!1))},not:function(a){return this.pushStack(w(this,a||[],!0))},is:function(a){return!!w(this,"string"==typeof a&&t.test(a)?m(a):a||[],!1).length}});var x,y=a.document,z=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]*))$/,A=m.fn.init=function(a,b){var c,d;if(!a)return this;if("string"==typeof a){if(c="<"===a.charAt(0)&&">"===a.charAt(a.length-1)&&a.length>=3?[null,a,null]:z.exec(a),!c||!c[1]&&b)return!b||b.jquery?(b||x).find(a):this.constructor(b).find(a);if(c[1]){if(b=b instanceof m?b[0]:b,m.merge(this,m.parseHTML(c[1],b&&b.nodeType?b.ownerDocument||b:y,!0)),u.test(c[1])&&m.isPlainObject(b))for(c in b)m.isFunction(this[c])?this[c](b[c]):this.attr(c,b[c]);return this}if(d=y.getElementById(c[2]),d&&d.parentNode){if(d.id!==c[2])return x.find(a);this.length=1,this[0]=d}return this.context=y,this.selector=a,this}return a.nodeType?(this.context=this[0]=a,this.length=1,this):m.isFunction(a)?"undefined"!=typeof x.ready?x.ready(a):a(m):(void 0!==a.selector&&(this.selector=a.selector,this.context=a.context),m.makeArray(a,this))};A.prototype=m.fn,x=m(y);var B=/^(?:parents|prev(?:Until|All))/,C={children:!0,contents:!0,next:!0,prev:!0};m.extend({dir:function(a,b,c){var d=[],e=a[b];while(e&&9!==e.nodeType&&(void 0===c||1!==e.nodeType||!m(e).is(c)))1===e.nodeType&&d.push(e),e=e[b];return d},sibling:function(a,b){for(var c=[];a;a=a.nextSibling)1===a.nodeType&&a!==b&&c.push(a);return c}}),m.fn.extend({has:function(a){var b,c=m(a,this),d=c.length;return 
this.filter(function(){for(b=0;d>b;b++)if(m.contains(this,c[b]))return!0})},closest:function(a,b){for(var c,d=0,e=this.length,f=[],g=t.test(a)||"string"!=typeof a?m(a,b||this.context):0;e>d;d++)for(c=this[d];c&&c!==b;c=c.parentNode)if(c.nodeType<11&&(g?g.index(c)>-1:1===c.nodeType&&m.find.matchesSelector(c,a))){f.push(c);break}return this.pushStack(f.length>1?m.unique(f):f)},index:function(a){return a?"string"==typeof a?m.inArray(this[0],m(a)):m.inArray(a.jquery?a[0]:a,this):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(a,b){return this.pushStack(m.unique(m.merge(this.get(),m(a,b))))},addBack:function(a){return this.add(null==a?this.prevObject:this.prevObject.filter(a))}});function D(a,b){do a=a[b];while(a&&1!==a.nodeType);return a}m.each({parent:function(a){var b=a.parentNode;return b&&11!==b.nodeType?b:null},parents:function(a){return m.dir(a,"parentNode")},parentsUntil:function(a,b,c){return m.dir(a,"parentNode",c)},next:function(a){return D(a,"nextSibling")},prev:function(a){return D(a,"previousSibling")},nextAll:function(a){return m.dir(a,"nextSibling")},prevAll:function(a){return m.dir(a,"previousSibling")},nextUntil:function(a,b,c){return m.dir(a,"nextSibling",c)},prevUntil:function(a,b,c){return m.dir(a,"previousSibling",c)},siblings:function(a){return m.sibling((a.parentNode||{}).firstChild,a)},children:function(a){return m.sibling(a.firstChild)},contents:function(a){return m.nodeName(a,"iframe")?a.contentDocument||a.contentWindow.document:m.merge([],a.childNodes)}},function(a,b){m.fn[a]=function(c,d){var e=m.map(this,b,c);return"Until"!==a.slice(-5)&&(d=c),d&&"string"==typeof d&&(e=m.filter(d,e)),this.length>1&&(C[a]||(e=m.unique(e)),B.test(a)&&(e=e.reverse())),this.pushStack(e)}});var E=/\S+/g,F={};function G(a){var b=F[a]={};return m.each(a.match(E)||[],function(a,c){b[c]=!0}),b}m.Callbacks=function(a){a="string"==typeof a?F[a]||G(a):m.extend({},a);var 
b,c,d,e,f,g,h=[],i=!a.once&&[],j=function(l){for(c=a.memory&&l,d=!0,f=g||0,g=0,e=h.length,b=!0;h&&e>f;f++)if(h[f].apply(l[0],l[1])===!1&&a.stopOnFalse){c=!1;break}b=!1,h&&(i?i.length&&j(i.shift()):c?h=[]:k.disable())},k={add:function(){if(h){var d=h.length;!function f(b){m.each(b,function(b,c){var d=m.type(c);"function"===d?a.unique&&k.has(c)||h.push(c):c&&c.length&&"string"!==d&&f(c)})}(arguments),b?e=h.length:c&&(g=d,j(c))}return this},remove:function(){return h&&m.each(arguments,function(a,c){var d;while((d=m.inArray(c,h,d))>-1)h.splice(d,1),b&&(e>=d&&e--,f>=d&&f--)}),this},has:function(a){return a?m.inArray(a,h)>-1:!(!h||!h.length)},empty:function(){return h=[],e=0,this},disable:function(){return h=i=c=void 0,this},disabled:function(){return!h},lock:function(){return i=void 0,c||k.disable(),this},locked:function(){return!i},fireWith:function(a,c){return!h||d&&!i||(c=c||[],c=[a,c.slice?c.slice():c],b?i.push(c):j(c)),this},fire:function(){return k.fireWith(this,arguments),this},fired:function(){return!!d}};return k},m.extend({Deferred:function(a){var b=[["resolve","done",m.Callbacks("once memory"),"resolved"],["reject","fail",m.Callbacks("once memory"),"rejected"],["notify","progress",m.Callbacks("memory")]],c="pending",d={state:function(){return c},always:function(){return e.done(arguments).fail(arguments),this},then:function(){var a=arguments;return m.Deferred(function(c){m.each(b,function(b,f){var g=m.isFunction(a[b])&&a[b];e[f[1]](function(){var a=g&&g.apply(this,arguments);a&&m.isFunction(a.promise)?a.promise().done(c.resolve).fail(c.reject).progress(c.notify):c[f[0]+"With"](this===d?c.promise():this,g?[a]:arguments)})}),a=null}).promise()},promise:function(a){return null!=a?m.extend(a,d):d}},e={};return d.pipe=d.then,m.each(b,function(a,f){var g=f[2],h=f[3];d[f[1]]=g.add,h&&g.add(function(){c=h},b[1^a][2].disable,b[2][2].lock),e[f[0]]=function(){return 
e[f[0]+"With"](this===e?d:this,arguments),this},e[f[0]+"With"]=g.fireWith}),d.promise(e),a&&a.call(e,e),e},when:function(a){var b=0,c=d.call(arguments),e=c.length,f=1!==e||a&&m.isFunction(a.promise)?e:0,g=1===f?a:m.Deferred(),h=function(a,b,c){return function(e){b[a]=this,c[a]=arguments.length>1?d.call(arguments):e,c===i?g.notifyWith(b,c):--f||g.resolveWith(b,c)}},i,j,k;if(e>1)for(i=new Array(e),j=new Array(e),k=new Array(e);e>b;b++)c[b]&&m.isFunction(c[b].promise)?c[b].promise().done(h(b,k,c)).fail(g.reject).progress(h(b,j,i)):--f;return f||g.resolveWith(k,c),g.promise()}});var H;m.fn.ready=function(a){return m.ready.promise().done(a),this},m.extend({isReady:!1,readyWait:1,holdReady:function(a){a?m.readyWait++:m.ready(!0)},ready:function(a){if(a===!0?!--m.readyWait:!m.isReady){if(!y.body)return setTimeout(m.ready);m.isReady=!0,a!==!0&&--m.readyWait>0||(H.resolveWith(y,[m]),m.fn.triggerHandler&&(m(y).triggerHandler("ready"),m(y).off("ready")))}}});function I(){y.addEventListener?(y.removeEventListener("DOMContentLoaded",J,!1),a.removeEventListener("load",J,!1)):(y.detachEvent("onreadystatechange",J),a.detachEvent("onload",J))}function J(){(y.addEventListener||"load"===event.type||"complete"===y.readyState)&&(I(),m.ready())}m.ready.promise=function(b){if(!H)if(H=m.Deferred(),"complete"===y.readyState)setTimeout(m.ready);else if(y.addEventListener)y.addEventListener("DOMContentLoaded",J,!1),a.addEventListener("load",J,!1);else{y.attachEvent("onreadystatechange",J),a.attachEvent("onload",J);var c=!1;try{c=null==a.frameElement&&y.documentElement}catch(d){}c&&c.doScroll&&!function e(){if(!m.isReady){try{c.doScroll("left")}catch(a){return setTimeout(e,50)}I(),m.ready()}}()}return H.promise(b)};var K="undefined",L;for(L in m(k))break;k.ownLast="0"!==L,k.inlineBlockNeedsLayout=!1,m(function(){var 
a,b,c,d;c=y.getElementsByTagName("body")[0],c&&c.style&&(b=y.createElement("div"),d=y.createElement("div"),d.style.cssText="position:absolute;border:0;width:0;height:0;top:0;left:-9999px",c.appendChild(d).appendChild(b),typeof b.style.zoom!==K&&(b.style.cssText="display:inline;margin:0;border:0;padding:1px;width:1px;zoom:1",k.inlineBlockNeedsLayout=a=3===b.offsetWidth,a&&(c.style.zoom=1)),c.removeChild(d))}),function(){var a=y.createElement("div");if(null==k.deleteExpando){k.deleteExpando=!0;try{delete a.test}catch(b){k.deleteExpando=!1}}a=null}(),m.acceptData=function(a){var b=m.noData[(a.nodeName+" ").toLowerCase()],c=+a.nodeType||1;return 1!==c&&9!==c?!1:!b||b!==!0&&a.getAttribute("classid")===b};var M=/^(?:\{[\w\W]*\}|\[[\w\W]*\])$/,N=/([A-Z])/g;function O(a,b,c){if(void 0===c&&1===a.nodeType){var d="data-"+b.replace(N,"-$1").toLowerCase();if(c=a.getAttribute(d),"string"==typeof c){try{c="true"===c?!0:"false"===c?!1:"null"===c?null:+c+""===c?+c:M.test(c)?m.parseJSON(c):c}catch(e){}m.data(a,b,c)}else c=void 0}return c}function P(a){var b;for(b in a)if(("data"!==b||!m.isEmptyObject(a[b]))&&"toJSON"!==b)return!1;return!0}function Q(a,b,d,e){if(m.acceptData(a)){var f,g,h=m.expando,i=a.nodeType,j=i?m.cache:a,k=i?a[h]:a[h]&&h;
+if(k&&j[k]&&(e||j[k].data)||void 0!==d||"string"!=typeof b)return k||(k=i?a[h]=c.pop()||m.guid++:h),j[k]||(j[k]=i?{}:{toJSON:m.noop}),("object"==typeof b||"function"==typeof b)&&(e?j[k]=m.extend(j[k],b):j[k].data=m.extend(j[k].data,b)),g=j[k],e||(g.data||(g.data={}),g=g.data),void 0!==d&&(g[m.camelCase(b)]=d),"string"==typeof b?(f=g[b],null==f&&(f=g[m.camelCase(b)])):f=g,f}}function R(a,b,c){if(m.acceptData(a)){var d,e,f=a.nodeType,g=f?m.cache:a,h=f?a[m.expando]:m.expando;if(g[h]){if(b&&(d=c?g[h]:g[h].data)){m.isArray(b)?b=b.concat(m.map(b,m.camelCase)):b in d?b=[b]:(b=m.camelCase(b),b=b in d?[b]:b.split(" ")),e=b.length;while(e--)delete d[b[e]];if(c?!P(d):!m.isEmptyObject(d))return}(c||(delete g[h].data,P(g[h])))&&(f?m.cleanData([a],!0):k.deleteExpando||g!=g.window?delete g[h]:g[h]=null)}}}m.extend({cache:{},noData:{"applet ":!0,"embed ":!0,"object ":"clsid:D27CDB6E-AE6D-11cf-96B8-444553540000"},hasData:function(a){return a=a.nodeType?m.cache[a[m.expando]]:a[m.expando],!!a&&!P(a)},data:function(a,b,c){return Q(a,b,c)},removeData:function(a,b){return R(a,b)},_data:function(a,b,c){return Q(a,b,c,!0)},_removeData:function(a,b){return R(a,b,!0)}}),m.fn.extend({data:function(a,b){var c,d,e,f=this[0],g=f&&f.attributes;if(void 0===a){if(this.length&&(e=m.data(f),1===f.nodeType&&!m._data(f,"parsedAttrs"))){c=g.length;while(c--)g[c]&&(d=g[c].name,0===d.indexOf("data-")&&(d=m.camelCase(d.slice(5)),O(f,d,e[d])));m._data(f,"parsedAttrs",!0)}return e}return"object"==typeof a?this.each(function(){m.data(this,a)}):arguments.length>1?this.each(function(){m.data(this,a,b)}):f?O(f,a,m.data(f,a)):void 0},removeData:function(a){return this.each(function(){m.removeData(this,a)})}}),m.extend({queue:function(a,b,c){var d;return a?(b=(b||"fx")+"queue",d=m._data(a,b),c&&(!d||m.isArray(c)?d=m._data(a,b,m.makeArray(c)):d.push(c)),d||[]):void 0},dequeue:function(a,b){b=b||"fx";var 
c=m.queue(a,b),d=c.length,e=c.shift(),f=m._queueHooks(a,b),g=function(){m.dequeue(a,b)};"inprogress"===e&&(e=c.shift(),d--),e&&("fx"===b&&c.unshift("inprogress"),delete f.stop,e.call(a,g,f)),!d&&f&&f.empty.fire()},_queueHooks:function(a,b){var c=b+"queueHooks";return m._data(a,c)||m._data(a,c,{empty:m.Callbacks("once memory").add(function(){m._removeData(a,b+"queue"),m._removeData(a,c)})})}}),m.fn.extend({queue:function(a,b){var c=2;return"string"!=typeof a&&(b=a,a="fx",c--),arguments.length<c?m.queue(this[0],a):void 0===b?this:this.each(function(){var c=m.queue(this,a,b);m._queueHooks(this,a),"fx"===a&&"inprogress"!==c[0]&&m.dequeue(this,a)})},dequeue:function(a){return this.each(function(){m.dequeue(this,a)})},clearQueue:function(a){return this.queue(a||"fx",[])},promise:function(a,b){var c,d=1,e=m.Deferred(),f=this,g=this.length,h=function(){--d||e.resolveWith(f,[f])};"string"!=typeof a&&(b=a,a=void 0),a=a||"fx";while(g--)c=m._data(f[g],a+"queueHooks"),c&&c.empty&&(d++,c.empty.add(h));return h(),e.promise(b)}});var S=/[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/.source,T=["Top","Right","Bottom","Left"],U=function(a,b){return a=b||a,"none"===m.css(a,"display")||!m.contains(a.ownerDocument,a)},V=m.access=function(a,b,c,d,e,f,g){var h=0,i=a.length,j=null==c;if("object"===m.type(c)){e=!0;for(h in c)m.access(a,b,h,c[h],!0,f,g)}else if(void 0!==d&&(e=!0,m.isFunction(d)||(g=!0),j&&(g?(b.call(a,d),b=null):(j=b,b=function(a,b,c){return j.call(m(a),c)})),b))for(;i>h;h++)b(a[h],c,g?d:d.call(a[h],h,b(a[h],c)));return e?a:j?b.call(a):i?b(a[0],c):f},W=/^(?:checkbox|radio)$/i;!function(){var a=y.createElement("input"),b=y.createElement("div"),c=y.createDocumentFragment();if(b.innerHTML="  <link/><table></table><a href='/a'>a</a><input 
type='checkbox'/>",k.leadingWhitespace=3===b.firstChild.nodeType,k.tbody=!b.getElementsByTagName("tbody").length,k.htmlSerialize=!!b.getElementsByTagName("link").length,k.html5Clone="<:nav></:nav>"!==y.createElement("nav").cloneNode(!0).outerHTML,a.type="checkbox",a.checked=!0,c.appendChild(a),k.appendChecked=a.checked,b.innerHTML="<textarea>x</textarea>",k.noCloneChecked=!!b.cloneNode(!0).lastChild.defaultValue,c.appendChild(b),b.innerHTML="<input type='radio' checked='checked' name='t'/>",k.checkClone=b.cloneNode(!0).cloneNode(!0).lastChild.checked,k.noCloneEvent=!0,b.attachEvent&&(b.attachEvent("onclick",function(){k.noCloneEvent=!1}),b.cloneNode(!0).click()),null==k.deleteExpando){k.deleteExpando=!0;try{delete b.test}catch(d){k.deleteExpando=!1}}}(),function(){var b,c,d=y.createElement("div");for(b in{submit:!0,change:!0,focusin:!0})c="on"+b,(k[b+"Bubbles"]=c in a)||(d.setAttribute(c,"t"),k[b+"Bubbles"]=d.attributes[c].expando===!1);d=null}();var X=/^(?:input|select|textarea)$/i,Y=/^key/,Z=/^(?:mouse|pointer|contextmenu)|click/,$=/^(?:focusinfocus|focusoutblur)$/,_=/^([^.]*)(?:\.(.+)|)$/;function ab(){return!0}function bb(){return!1}function cb(){try{return y.activeElement}catch(a){}}m.event={global:{},add:function(a,b,c,d,e){var f,g,h,i,j,k,l,n,o,p,q,r=m._data(a);if(r){c.handler&&(i=c,c=i.handler,e=i.selector),c.guid||(c.guid=m.guid++),(g=r.events)||(g=r.events={}),(k=r.handle)||(k=r.handle=function(a){return typeof m===K||a&&m.event.triggered===a.type?void 
0:m.event.dispatch.apply(k.elem,arguments)},k.elem=a),b=(b||"").match(E)||[""],h=b.length;while(h--)f=_.exec(b[h])||[],o=q=f[1],p=(f[2]||"").split(".").sort(),o&&(j=m.event.special[o]||{},o=(e?j.delegateType:j.bindType)||o,j=m.event.special[o]||{},l=m.extend({type:o,origType:q,data:d,handler:c,guid:c.guid,selector:e,needsContext:e&&m.expr.match.needsContext.test(e),namespace:p.join(".")},i),(n=g[o])||(n=g[o]=[],n.delegateCount=0,j.setup&&j.setup.call(a,d,p,k)!==!1||(a.addEventListener?a.addEventListener(o,k,!1):a.attachEvent&&a.attachEvent("on"+o,k))),j.add&&(j.add.call(a,l),l.handler.guid||(l.handler.guid=c.guid)),e?n.splice(n.delegateCount++,0,l):n.push(l),m.event.global[o]=!0);a=null}},remove:function(a,b,c,d,e){var f,g,h,i,j,k,l,n,o,p,q,r=m.hasData(a)&&m._data(a);if(r&&(k=r.events)){b=(b||"").match(E)||[""],j=b.length;while(j--)if(h=_.exec(b[j])||[],o=q=h[1],p=(h[2]||"").split(".").sort(),o){l=m.event.special[o]||{},o=(d?l.delegateType:l.bindType)||o,n=k[o]||[],h=h[2]&&new RegExp("(^|\\.)"+p.join("\\.(?:.*\\.|)")+"(\\.|$)"),i=f=n.length;while(f--)g=n[f],!e&&q!==g.origType||c&&c.guid!==g.guid||h&&!h.test(g.namespace)||d&&d!==g.selector&&("**"!==d||!g.selector)||(n.splice(f,1),g.selector&&n.delegateCount--,l.remove&&l.remove.call(a,g));i&&!n.length&&(l.teardown&&l.teardown.call(a,p,r.handle)!==!1||m.removeEvent(a,o,r.handle),delete k[o])}else for(o in k)m.event.remove(a,o+b[j],c,d,!0);m.isEmptyObject(k)&&(delete r.handle,m._removeData(a,"events"))}},trigger:function(b,c,d,e){var f,g,h,i,k,l,n,o=[d||y],p=j.call(b,"type")?b.type:b,q=j.call(b,"namespace")?b.namespace.split("."):[];if(h=l=d=d||y,3!==d.nodeType&&8!==d.nodeType&&!$.test(p+m.event.triggered)&&(p.indexOf(".")>=0&&(q=p.split("."),p=q.shift(),q.sort()),g=p.indexOf(":")<0&&"on"+p,b=b[m.expando]?b:new m.Event(p,"object"==typeof b&&b),b.isTrigger=e?2:3,b.namespace=q.join("."),b.namespace_re=b.namespace?new RegExp("(^|\\.)"+q.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,b.result=void 
0,b.target||(b.target=d),c=null==c?[b]:m.makeArray(c,[b]),k=m.event.special[p]||{},e||!k.trigger||k.trigger.apply(d,c)!==!1)){if(!e&&!k.noBubble&&!m.isWindow(d)){for(i=k.delegateType||p,$.test(i+p)||(h=h.parentNode);h;h=h.parentNode)o.push(h),l=h;l===(d.ownerDocument||y)&&o.push(l.defaultView||l.parentWindow||a)}n=0;while((h=o[n++])&&!b.isPropagationStopped())b.type=n>1?i:k.bindType||p,f=(m._data(h,"events")||{})[b.type]&&m._data(h,"handle"),f&&f.apply(h,c),f=g&&h[g],f&&f.apply&&m.acceptData(h)&&(b.result=f.apply(h,c),b.result===!1&&b.preventDefault());if(b.type=p,!e&&!b.isDefaultPrevented()&&(!k._default||k._default.apply(o.pop(),c)===!1)&&m.acceptData(d)&&g&&d[p]&&!m.isWindow(d)){l=d[g],l&&(d[g]=null),m.event.triggered=p;try{d[p]()}catch(r){}m.event.triggered=void 0,l&&(d[g]=l)}return b.result}},dispatch:function(a){a=m.event.fix(a);var b,c,e,f,g,h=[],i=d.call(arguments),j=(m._data(this,"events")||{})[a.type]||[],k=m.event.special[a.type]||{};if(i[0]=a,a.delegateTarget=this,!k.preDispatch||k.preDispatch.call(this,a)!==!1){h=m.event.handlers.call(this,a,j),b=0;while((f=h[b++])&&!a.isPropagationStopped()){a.currentTarget=f.elem,g=0;while((e=f.handlers[g++])&&!a.isImmediatePropagationStopped())(!a.namespace_re||a.namespace_re.test(e.namespace))&&(a.handleObj=e,a.data=e.data,c=((m.event.special[e.origType]||{}).handle||e.handler).apply(f.elem,i),void 0!==c&&(a.result=c)===!1&&(a.preventDefault(),a.stopPropagation()))}return k.postDispatch&&k.postDispatch.call(this,a),a.result}},handlers:function(a,b){var c,d,e,f,g=[],h=b.delegateCount,i=a.target;if(h&&i.nodeType&&(!a.button||"click"!==a.type))for(;i!=this;i=i.parentNode||this)if(1===i.nodeType&&(i.disabled!==!0||"click"!==a.type)){for(e=[],f=0;h>f;f++)d=b[f],c=d.selector+" ",void 0===e[c]&&(e[c]=d.needsContext?m(c,this).index(i)>=0:m.find(c,this,null,[i]).length),e[c]&&e.push(d);e.length&&g.push({elem:i,handlers:e})}return h<b.length&&g.push({elem:this,handlers:b.slice(h)}),g},fix:function(a){if(a[m.expando])return 
a;var b,c,d,e=a.type,f=a,g=this.fixHooks[e];g||(this.fixHooks[e]=g=Z.test(e)?this.mouseHooks:Y.test(e)?this.keyHooks:{}),d=g.props?this.props.concat(g.props):this.props,a=new m.Event(f),b=d.length;while(b--)c=d[b],a[c]=f[c];return a.target||(a.target=f.srcElement||y),3===a.target.nodeType&&(a.target=a.target.parentNode),a.metaKey=!!a.metaKey,g.filter?g.filter(a,f):a},props:"altKey bubbles cancelable ctrlKey currentTarget eventPhase metaKey relatedTarget shiftKey target timeStamp view which".split(" "),fixHooks:{},keyHooks:{props:"char charCode key keyCode".split(" "),filter:function(a,b){return null==a.which&&(a.which=null!=b.charCode?b.charCode:b.keyCode),a}},mouseHooks:{props:"button buttons clientX clientY fromElement offsetX offsetY pageX pageY screenX screenY toElement".split(" "),filter:function(a,b){var c,d,e,f=b.button,g=b.fromElement;return null==a.pageX&&null!=b.clientX&&(d=a.target.ownerDocument||y,e=d.documentElement,c=d.body,a.pageX=b.clientX+(e&&e.scrollLeft||c&&c.scrollLeft||0)-(e&&e.clientLeft||c&&c.clientLeft||0),a.pageY=b.clientY+(e&&e.scrollTop||c&&c.scrollTop||0)-(e&&e.clientTop||c&&c.clientTop||0)),!a.relatedTarget&&g&&(a.relatedTarget=g===a.target?b.toElement:g),a.which||void 0===f||(a.which=1&f?1:2&f?3:4&f?2:0),a}},special:{load:{noBubble:!0},focus:{trigger:function(){if(this!==cb()&&this.focus)try{return this.focus(),!1}catch(a){}},delegateType:"focusin"},blur:{trigger:function(){return this===cb()&&this.blur?(this.blur(),!1):void 0},delegateType:"focusout"},click:{trigger:function(){return m.nodeName(this,"input")&&"checkbox"===this.type&&this.click?(this.click(),!1):void 0},_default:function(a){return m.nodeName(a.target,"a")}},beforeunload:{postDispatch:function(a){void 0!==a.result&&a.originalEvent&&(a.originalEvent.returnValue=a.result)}}},simulate:function(a,b,c,d){var e=m.extend(new 
m.Event,c,{type:a,isSimulated:!0,originalEvent:{}});d?m.event.trigger(e,null,b):m.event.dispatch.call(b,e),e.isDefaultPrevented()&&c.preventDefault()}},m.removeEvent=y.removeEventListener?function(a,b,c){a.removeEventListener&&a.removeEventListener(b,c,!1)}:function(a,b,c){var d="on"+b;a.detachEvent&&(typeof a[d]===K&&(a[d]=null),a.detachEvent(d,c))},m.Event=function(a,b){return this instanceof m.Event?(a&&a.type?(this.originalEvent=a,this.type=a.type,this.isDefaultPrevented=a.defaultPrevented||void 0===a.defaultPrevented&&a.returnValue===!1?ab:bb):this.type=a,b&&m.extend(this,b),this.timeStamp=a&&a.timeStamp||m.now(),void(this[m.expando]=!0)):new m.Event(a,b)},m.Event.prototype={isDefaultPrevented:bb,isPropagationStopped:bb,isImmediatePropagationStopped:bb,preventDefault:function(){var a=this.originalEvent;this.isDefaultPrevented=ab,a&&(a.preventDefault?a.preventDefault():a.returnValue=!1)},stopPropagation:function(){var a=this.originalEvent;this.isPropagationStopped=ab,a&&(a.stopPropagation&&a.stopPropagation(),a.cancelBubble=!0)},stopImmediatePropagation:function(){var a=this.originalEvent;this.isImmediatePropagationStopped=ab,a&&a.stopImmediatePropagation&&a.stopImmediatePropagation(),this.stopPropagation()}},m.each({mouseenter:"mouseover",mouseleave:"mouseout",pointerenter:"pointerover",pointerleave:"pointerout"},function(a,b){m.event.special[a]={delegateType:b,bindType:b,handle:function(a){var c,d=this,e=a.relatedTarget,f=a.handleObj;return(!e||e!==d&&!m.contains(d,e))&&(a.type=f.origType,c=f.handler.apply(this,arguments),a.type=b),c}}}),k.submitBubbles||(m.event.special.submit={setup:function(){return m.nodeName(this,"form")?!1:void m.event.add(this,"click._submit keypress._submit",function(a){var b=a.target,c=m.nodeName(b,"input")||m.nodeName(b,"button")?b.form:void 0;c&&!m._data(c,"submitBubbles")&&(m.event.add(c,"submit._submit",function(a){a._submit_bubble=!0}),m._data(c,"submitBubbles",!0))})},postDispatch:function(a){a._submit_bubble&&(delete 
a._submit_bubble,this.parentNode&&!a.isTrigger&&m.event.simulate("submit",this.parentNode,a,!0))},teardown:function(){return m.nodeName(this,"form")?!1:void m.event.remove(this,"._submit")}}),k.changeBubbles||(m.event.special.change={setup:function(){return X.test(this.nodeName)?(("checkbox"===this.type||"radio"===this.type)&&(m.event.add(this,"propertychange._change",function(a){"checked"===a.originalEvent.propertyName&&(this._just_changed=!0)}),m.event.add(this,"click._change",function(a){this._just_changed&&!a.isTrigger&&(this._just_changed=!1),m.event.simulate("change",this,a,!0)})),!1):void m.event.add(this,"beforeactivate._change",function(a){var b=a.target;X.test(b.nodeName)&&!m._data(b,"changeBubbles")&&(m.event.add(b,"change._change",function(a){!this.parentNode||a.isSimulated||a.isTrigger||m.event.simulate("change",this.parentNode,a,!0)}),m._data(b,"changeBubbles",!0))})},handle:function(a){var b=a.target;return this!==b||a.isSimulated||a.isTrigger||"radio"!==b.type&&"checkbox"!==b.type?a.handleObj.handler.apply(this,arguments):void 0},teardown:function(){return m.event.remove(this,"._change"),!X.test(this.nodeName)}}),k.focusinBubbles||m.each({focus:"focusin",blur:"focusout"},function(a,b){var c=function(a){m.event.simulate(b,a.target,m.event.fix(a),!0)};m.event.special[b]={setup:function(){var d=this.ownerDocument||this,e=m._data(d,b);e||d.addEventListener(a,c,!0),m._data(d,b,(e||0)+1)},teardown:function(){var d=this.ownerDocument||this,e=m._data(d,b)-1;e?m._data(d,b,e):(d.removeEventListener(a,c,!0),m._removeData(d,b))}}}),m.fn.extend({on:function(a,b,c,d,e){var f,g;if("object"==typeof a){"string"!=typeof b&&(c=c||b,b=void 0);for(f in a)this.on(f,b,c,a[f],e);return this}if(null==c&&null==d?(d=b,c=b=void 0):null==d&&("string"==typeof b?(d=c,c=void 0):(d=c,c=b,b=void 0)),d===!1)d=bb;else if(!d)return this;return 1===e&&(g=d,d=function(a){return 
m().off(a),g.apply(this,arguments)},d.guid=g.guid||(g.guid=m.guid++)),this.each(function(){m.event.add(this,a,d,c,b)})},one:function(a,b,c,d){return this.on(a,b,c,d,1)},off:function(a,b,c){var d,e;if(a&&a.preventDefault&&a.handleObj)return d=a.handleObj,m(a.delegateTarget).off(d.namespace?d.origType+"."+d.namespace:d.origType,d.selector,d.handler),this;if("object"==typeof a){for(e in a)this.off(e,b,a[e]);return this}return(b===!1||"function"==typeof b)&&(c=b,b=void 0),c===!1&&(c=bb),this.each(function(){m.event.remove(this,a,c,b)})},trigger:function(a,b){return this.each(function(){m.event.trigger(a,b,this)})},triggerHandler:function(a,b){var c=this[0];return c?m.event.trigger(a,b,c,!0):void 0}});function db(a){var b=eb.split("|"),c=a.createDocumentFragment();if(c.createElement)while(b.length)c.createElement(b.pop());return c}var eb="abbr|article|aside|audio|bdi|canvas|data|datalist|details|figcaption|figure|footer|header|hgroup|mark|meter|nav|output|progress|section|summary|time|video",fb=/ jQuery\d+="(?:null|\d+)"/g,gb=new RegExp("<(?:"+eb+")[\\s/>]","i"),hb=/^\s+/,ib=/<(?!area|br|col|embed|hr|img|input|link|meta|param)(([\w:]+)[^>]*)\/>/gi,jb=/<([\w:]+)/,kb=/<tbody/i,lb=/<|&#?\w+;/,mb=/<(?:script|style|link)/i,nb=/checked\s*(?:[^=]|=\s*.checked.)/i,ob=/^$|\/(?:java|ecma)script/i,pb=/^true\/(.*)/,qb=/^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g,rb={option:[1,"<select multiple='multiple'>","</select>"],legend:[1,"<fieldset>","</fieldset>"],area:[1,"<map>","</map>"],param:[1,"<object>","</object>"],thead:[1,"<table>","</table>"],tr:[2,"<table><tbody>","</tbody></table>"],col:[2,"<table><tbody></tbody><colgroup>","</colgroup></table>"],td:[3,"<table><tbody><tr>","</tr></tbody></table>"],_default:k.htmlSerialize?[0,"",""]:[1,"X<div>","</div>"]},sb=db(y),tb=sb.appendChild(y.createElement("div"));rb.optgroup=rb.option,rb.tbody=rb.tfoot=rb.colgroup=rb.caption=rb.thead,rb.th=rb.td;function ub(a,b){var c,d,e=0,f=typeof 
a.getElementsByTagName!==K?a.getElementsByTagName(b||"*"):typeof a.querySelectorAll!==K?a.querySelectorAll(b||"*"):void 0;if(!f)for(f=[],c=a.childNodes||a;null!=(d=c[e]);e++)!b||m.nodeName(d,b)?f.push(d):m.merge(f,ub(d,b));return void 0===b||b&&m.nodeName(a,b)?m.merge([a],f):f}function vb(a){W.test(a.type)&&(a.defaultChecked=a.checked)}function wb(a,b){return m.nodeName(a,"table")&&m.nodeName(11!==b.nodeType?b:b.firstChild,"tr")?a.getElementsByTagName("tbody")[0]||a.appendChild(a.ownerDocument.createElement("tbody")):a}function xb(a){return a.type=(null!==m.find.attr(a,"type"))+"/"+a.type,a}function yb(a){var b=pb.exec(a.type);return b?a.type=b[1]:a.removeAttribute("type"),a}function zb(a,b){for(var c,d=0;null!=(c=a[d]);d++)m._data(c,"globalEval",!b||m._data(b[d],"globalEval"))}function Ab(a,b){if(1===b.nodeType&&m.hasData(a)){var c,d,e,f=m._data(a),g=m._data(b,f),h=f.events;if(h){delete g.handle,g.events={};for(c in h)for(d=0,e=h[c].length;e>d;d++)m.event.add(b,c,h[c][d])}g.data&&(g.data=m.extend({},g.data))}}function Bb(a,b){var c,d,e;if(1===b.nodeType){if(c=b.nodeName.toLowerCase(),!k.noCloneEvent&&b[m.expando]){e=m._data(b);for(d in e.events)m.removeEvent(b,d,e.handle);b.removeAttribute(m.expando)}"script"===c&&b.text!==a.text?(xb(b).text=a.text,yb(b)):"object"===c?(b.parentNode&&(b.outerHTML=a.outerHTML),k.html5Clone&&a.innerHTML&&!m.trim(b.innerHTML)&&(b.innerHTML=a.innerHTML)):"input"===c&&W.test(a.type)?(b.defaultChecked=b.checked=a.checked,b.value!==a.value&&(b.value=a.value)):"option"===c?b.defaultSelected=b.selected=a.defaultSelected:("input"===c||"textarea"===c)&&(b.defaultValue=a.defaultValue)}}m.extend({clone:function(a,b,c){var 
d,e,f,g,h,i=m.contains(a.ownerDocument,a);if(k.html5Clone||m.isXMLDoc(a)||!gb.test("<"+a.nodeName+">")?f=a.cloneNode(!0):(tb.innerHTML=a.outerHTML,tb.removeChild(f=tb.firstChild)),!(k.noCloneEvent&&k.noCloneChecked||1!==a.nodeType&&11!==a.nodeType||m.isXMLDoc(a)))for(d=ub(f),h=ub(a),g=0;null!=(e=h[g]);++g)d[g]&&Bb(e,d[g]);if(b)if(c)for(h=h||ub(a),d=d||ub(f),g=0;null!=(e=h[g]);g++)Ab(e,d[g]);else Ab(a,f);return d=ub(f,"script"),d.length>0&&zb(d,!i&&ub(a,"script")),d=h=e=null,f},buildFragment:function(a,b,c,d){for(var e,f,g,h,i,j,l,n=a.length,o=db(b),p=[],q=0;n>q;q++)if(f=a[q],f||0===f)if("object"===m.type(f))m.merge(p,f.nodeType?[f]:f);else if(lb.test(f)){h=h||o.appendChild(b.createElement("div")),i=(jb.exec(f)||["",""])[1].toLowerCase(),l=rb[i]||rb._default,h.innerHTML=l[1]+f.replace(ib,"<$1></$2>")+l[2],e=l[0];while(e--)h=h.lastChild;if(!k.leadingWhitespace&&hb.test(f)&&p.push(b.createTextNode(hb.exec(f)[0])),!k.tbody){f="table"!==i||kb.test(f)?"<table>"!==l[1]||kb.test(f)?0:h:h.firstChild,e=f&&f.childNodes.length;while(e--)m.nodeName(j=f.childNodes[e],"tbody")&&!j.childNodes.length&&f.removeChild(j)}m.merge(p,h.childNodes),h.textContent="";while(h.firstChild)h.removeChild(h.firstChild);h=o.lastChild}else p.push(b.createTextNode(f));h&&o.removeChild(h),k.appendChecked||m.grep(ub(p,"input"),vb),q=0;while(f=p[q++])if((!d||-1===m.inArray(f,d))&&(g=m.contains(f.ownerDocument,f),h=ub(o.appendChild(f),"script"),g&&zb(h),c)){e=0;while(f=h[e++])ob.test(f.type||"")&&c.push(f)}return h=null,o},cleanData:function(a,b){for(var d,e,f,g,h=0,i=m.expando,j=m.cache,l=k.deleteExpando,n=m.event.special;null!=(d=a[h]);h++)if((b||m.acceptData(d))&&(f=d[i],g=f&&j[f])){if(g.events)for(e in g.events)n[e]?m.event.remove(d,e):m.removeEvent(d,e,g.handle);j[f]&&(delete j[f],l?delete d[i]:typeof d.removeAttribute!==K?d.removeAttribute(i):d[i]=null,c.push(f))}}}),m.fn.extend({text:function(a){return V(this,function(a){return void 
0===a?m.text(this):this.empty().append((this[0]&&this[0].ownerDocument||y).createTextNode(a))},null,a,arguments.length)},append:function(){return this.domManip(arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=wb(this,a);b.appendChild(a)}})},prepend:function(){return this.domManip(arguments,function(a){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var b=wb(this,a);b.insertBefore(a,b.firstChild)}})},before:function(){return this.domManip(arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this)})},after:function(){return this.domManip(arguments,function(a){this.parentNode&&this.parentNode.insertBefore(a,this.nextSibling)})},remove:function(a,b){for(var c,d=a?m.filter(a,this):this,e=0;null!=(c=d[e]);e++)b||1!==c.nodeType||m.cleanData(ub(c)),c.parentNode&&(b&&m.contains(c.ownerDocument,c)&&zb(ub(c,"script")),c.parentNode.removeChild(c));return this},empty:function(){for(var a,b=0;null!=(a=this[b]);b++){1===a.nodeType&&m.cleanData(ub(a,!1));while(a.firstChild)a.removeChild(a.firstChild);a.options&&m.nodeName(a,"select")&&(a.options.length=0)}return this},clone:function(a,b){return a=null==a?!1:a,b=null==b?a:b,this.map(function(){return m.clone(this,a,b)})},html:function(a){return V(this,function(a){var b=this[0]||{},c=0,d=this.length;if(void 0===a)return 1===b.nodeType?b.innerHTML.replace(fb,""):void 0;if(!("string"!=typeof a||mb.test(a)||!k.htmlSerialize&&gb.test(a)||!k.leadingWhitespace&&hb.test(a)||rb[(jb.exec(a)||["",""])[1].toLowerCase()])){a=a.replace(ib,"<$1></$2>");try{for(;d>c;c++)b=this[c]||{},1===b.nodeType&&(m.cleanData(ub(b,!1)),b.innerHTML=a);b=0}catch(e){}}b&&this.empty().append(a)},null,a,arguments.length)},replaceWith:function(){var a=arguments[0];return this.domManip(arguments,function(b){a=this.parentNode,m.cleanData(ub(this)),a&&a.replaceChild(b,this)}),a&&(a.length||a.nodeType)?this:this.remove()},detach:function(a){return 
this.remove(a,!0)},domManip:function(a,b){a=e.apply([],a);var c,d,f,g,h,i,j=0,l=this.length,n=this,o=l-1,p=a[0],q=m.isFunction(p);if(q||l>1&&"string"==typeof p&&!k.checkClone&&nb.test(p))return this.each(function(c){var d=n.eq(c);q&&(a[0]=p.call(this,c,d.html())),d.domManip(a,b)});if(l&&(i=m.buildFragment(a,this[0].ownerDocument,!1,this),c=i.firstChild,1===i.childNodes.length&&(i=c),c)){for(g=m.map(ub(i,"script"),xb),f=g.length;l>j;j++)d=i,j!==o&&(d=m.clone(d,!0,!0),f&&m.merge(g,ub(d,"script"))),b.call(this[j],d,j);if(f)for(h=g[g.length-1].ownerDocument,m.map(g,yb),j=0;f>j;j++)d=g[j],ob.test(d.type||"")&&!m._data(d,"globalEval")&&m.contains(h,d)&&(d.src?m._evalUrl&&m._evalUrl(d.src):m.globalEval((d.text||d.textContent||d.innerHTML||"").replace(qb,"")));i=c=null}return this}}),m.each({appendTo:"append",prependTo:"prepend",insertBefore:"before",insertAfter:"after",replaceAll:"replaceWith"},function(a,b){m.fn[a]=function(a){for(var c,d=0,e=[],g=m(a),h=g.length-1;h>=d;d++)c=d===h?this:this.clone(!0),m(g[d])[b](c),f.apply(e,c.get());return this.pushStack(e)}});var Cb,Db={};function Eb(b,c){var d,e=m(c.createElement(b)).appendTo(c.body),f=a.getDefaultComputedStyle&&(d=a.getDefaultComputedStyle(e[0]))?d.display:m.css(e[0],"display");return e.detach(),f}function Fb(a){var b=y,c=Db[a];return c||(c=Eb(a,b),"none"!==c&&c||(Cb=(Cb||m("<iframe frameborder='0' width='0' height='0'/>")).appendTo(b.documentElement),b=(Cb[0].contentWindow||Cb[0].contentDocument).document,b.write(),b.close(),c=Eb(a,b),Cb.detach()),Db[a]=c),c}!function(){var a;k.shrinkWrapBlocks=function(){if(null!=a)return a;a=!1;var b,c,d;return c=y.getElementsByTagName("body")[0],c&&c.style?(b=y.createElement("div"),d=y.createElement("div"),d.style.cssText="position:absolute;border:0;width:0;height:0;top:0;left:-9999px",c.appendChild(d).appendChild(b),typeof 
b.style.zoom!==K&&(b.style.cssText="-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;display:block;margin:0;border:0;padding:1px;width:1px;zoom:1",b.appendChild(y.createElement("div")).style.width="5px",a=3!==b.offsetWidth),c.removeChild(d),a):void 0}}();var Gb=/^margin/,Hb=new RegExp("^("+S+")(?!px)[a-z%]+$","i"),Ib,Jb,Kb=/^(top|right|bottom|left)$/;a.getComputedStyle?(Ib=function(a){return a.ownerDocument.defaultView.getComputedStyle(a,null)},Jb=function(a,b,c){var d,e,f,g,h=a.style;return c=c||Ib(a),g=c?c.getPropertyValue(b)||c[b]:void 0,c&&(""!==g||m.contains(a.ownerDocument,a)||(g=m.style(a,b)),Hb.test(g)&&Gb.test(b)&&(d=h.width,e=h.minWidth,f=h.maxWidth,h.minWidth=h.maxWidth=h.width=g,g=c.width,h.width=d,h.minWidth=e,h.maxWidth=f)),void 0===g?g:g+""}):y.documentElement.currentStyle&&(Ib=function(a){return a.currentStyle},Jb=function(a,b,c){var d,e,f,g,h=a.style;return c=c||Ib(a),g=c?c[b]:void 0,null==g&&h&&h[b]&&(g=h[b]),Hb.test(g)&&!Kb.test(b)&&(d=h.left,e=a.runtimeStyle,f=e&&e.left,f&&(e.left=a.currentStyle.left),h.left="fontSize"===b?"1em":g,g=h.pixelLeft+"px",h.left=d,f&&(e.left=f)),void 0===g?g:g+""||"auto"});function Lb(a,b){return{get:function(){var c=a();if(null!=c)return c?void delete this.get:(this.get=b).apply(this,arguments)}}}!function(){var b,c,d,e,f,g,h;if(b=y.createElement("div"),b.innerHTML="  <link/><table></table><a href='/a'>a</a><input type='checkbox'/>",d=b.getElementsByTagName("a")[0],c=d&&d.style){c.cssText="float:left;opacity:.5",k.opacity="0.5"===c.opacity,k.cssFloat=!!c.cssFloat,b.style.backgroundClip="content-box",b.cloneNode(!0).style.backgroundClip="",k.clearCloneStyle="content-box"===b.style.backgroundClip,k.boxSizing=""===c.boxSizing||""===c.MozBoxSizing||""===c.WebkitBoxSizing,m.extend(k,{reliableHiddenOffsets:function(){return null==g&&i(),g},boxSizingReliable:function(){return null==f&&i(),f},pixelPosition:function(){return null==e&&i(),e},reliableMarginRight:function(){return 
null==h&&i(),h}});function i(){var b,c,d,i;c=y.getElementsByTagName("body")[0],c&&c.style&&(b=y.createElement("div"),d=y.createElement("div"),d.style.cssText="position:absolute;border:0;width:0;height:0;top:0;left:-9999px",c.appendChild(d).appendChild(b),b.style.cssText="-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box;display:block;margin-top:1%;top:1%;border:1px;padding:1px;width:4px;position:absolute",e=f=!1,h=!0,a.getComputedStyle&&(e="1%"!==(a.getComputedStyle(b,null)||{}).top,f="4px"===(a.getComputedStyle(b,null)||{width:"4px"}).width,i=b.appendChild(y.createElement("div")),i.style.cssText=b.style.cssText="-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;display:block;margin:0;border:0;padding:0",i.style.marginRight=i.style.width="0",b.style.width="1px",h=!parseFloat((a.getComputedStyle(i,null)||{}).marginRight)),b.innerHTML="<table><tr><td></td><td>t</td></tr></table>",i=b.getElementsByTagName("td"),i[0].style.cssText="margin:0;border:0;padding:0;display:none",g=0===i[0].offsetHeight,g&&(i[0].style.display="",i[1].style.display="none",g=0===i[0].offsetHeight),c.removeChild(d))}}}(),m.swap=function(a,b,c,d){var e,f,g={};for(f in b)g[f]=a.style[f],a.style[f]=b[f];e=c.apply(a,d||[]);for(f in b)a.style[f]=g[f];return e};var Mb=/alpha\([^)]*\)/i,Nb=/opacity\s*=\s*([^)]*)/,Ob=/^(none|table(?!-c[ea]).+)/,Pb=new RegExp("^("+S+")(.*)$","i"),Qb=new RegExp("^([+-])=("+S+")","i"),Rb={position:"absolute",visibility:"hidden",display:"block"},Sb={letterSpacing:"0",fontWeight:"400"},Tb=["Webkit","O","Moz","ms"];function Ub(a,b){if(b in a)return b;var c=b.charAt(0).toUpperCase()+b.slice(1),d=b,e=Tb.length;while(e--)if(b=Tb[e]+c,b in a)return b;return d}function Vb(a,b){for(var 
c,d,e,f=[],g=0,h=a.length;h>g;g++)d=a[g],d.style&&(f[g]=m._data(d,"olddisplay"),c=d.style.display,b?(f[g]||"none"!==c||(d.style.display=""),""===d.style.display&&U(d)&&(f[g]=m._data(d,"olddisplay",Fb(d.nodeName)))):(e=U(d),(c&&"none"!==c||!e)&&m._data(d,"olddisplay",e?c:m.css(d,"display"))));for(g=0;h>g;g++)d=a[g],d.style&&(b&&"none"!==d.style.display&&""!==d.style.display||(d.style.display=b?f[g]||"":"none"));return a}function Wb(a,b,c){var d=Pb.exec(b);return d?Math.max(0,d[1]-(c||0))+(d[2]||"px"):b}function Xb(a,b,c,d,e){for(var f=c===(d?"border":"content")?4:"width"===b?1:0,g=0;4>f;f+=2)"margin"===c&&(g+=m.css(a,c+T[f],!0,e)),d?("content"===c&&(g-=m.css(a,"padding"+T[f],!0,e)),"margin"!==c&&(g-=m.css(a,"border"+T[f]+"Width",!0,e))):(g+=m.css(a,"padding"+T[f],!0,e),"padding"!==c&&(g+=m.css(a,"border"+T[f]+"Width",!0,e)));return g}function Yb(a,b,c){var d=!0,e="width"===b?a.offsetWidth:a.offsetHeight,f=Ib(a),g=k.boxSizing&&"border-box"===m.css(a,"boxSizing",!1,f);if(0>=e||null==e){if(e=Jb(a,b,f),(0>e||null==e)&&(e=a.style[b]),Hb.test(e))return e;d=g&&(k.boxSizingReliable()||e===a.style[b]),e=parseFloat(e)||0}return e+Xb(a,b,c||(g?"border":"content"),d,f)+"px"}m.extend({cssHooks:{opacity:{get:function(a,b){if(b){var c=Jb(a,"opacity");return""===c?"1":c}}}},cssNumber:{columnCount:!0,fillOpacity:!0,flexGrow:!0,flexShrink:!0,fontWeight:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,widows:!0,zIndex:!0,zoom:!0},cssProps:{"float":k.cssFloat?"cssFloat":"styleFloat"},style:function(a,b,c,d){if(a&&3!==a.nodeType&&8!==a.nodeType&&a.style){var e,f,g,h=m.camelCase(b),i=a.style;if(b=m.cssProps[h]||(m.cssProps[h]=Ub(i,h)),g=m.cssHooks[b]||m.cssHooks[h],void 0===c)return g&&"get"in g&&void 0!==(e=g.get(a,!1,d))?e:i[b];if(f=typeof c,"string"===f&&(e=Qb.exec(c))&&(c=(e[1]+1)*e[2]+parseFloat(m.css(a,b)),f="number"),null!=c&&c===c&&("number"!==f||m.cssNumber[h]||(c+="px"),k.clearCloneStyle||""!==c||0!==b.indexOf("background")||(i[b]="inherit"),!(g&&"set"in g&&void 
0===(c=g.set(a,c,d)))))try{i[b]=c}catch(j){}}},css:function(a,b,c,d){var e,f,g,h=m.camelCase(b);return b=m.cssProps[h]||(m.cssProps[h]=Ub(a.style,h)),g=m.cssHooks[b]||m.cssHooks[h],g&&"get"in g&&(f=g.get(a,!0,c)),void 0===f&&(f=Jb(a,b,d)),"normal"===f&&b in Sb&&(f=Sb[b]),""===c||c?(e=parseFloat(f),c===!0||m.isNumeric(e)?e||0:f):f}}),m.each(["height","width"],function(a,b){m.cssHooks[b]={get:function(a,c,d){return c?Ob.test(m.css(a,"display"))&&0===a.offsetWidth?m.swap(a,Rb,function(){return Yb(a,b,d)}):Yb(a,b,d):void 0},set:function(a,c,d){var e=d&&Ib(a);return Wb(a,c,d?Xb(a,b,d,k.boxSizing&&"border-box"===m.css(a,"boxSizing",!1,e),e):0)}}}),k.opacity||(m.cssHooks.opacity={get:function(a,b){return Nb.test((b&&a.currentStyle?a.currentStyle.filter:a.style.filter)||"")?.01*parseFloat(RegExp.$1)+"":b?"1":""},set:function(a,b){var c=a.style,d=a.currentStyle,e=m.isNumeric(b)?"alpha(opacity="+100*b+")":"",f=d&&d.filter||c.filter||"";c.zoom=1,(b>=1||""===b)&&""===m.trim(f.replace(Mb,""))&&c.removeAttribute&&(c.removeAttribute("filter"),""===b||d&&!d.filter)||(c.filter=Mb.test(f)?f.replace(Mb,e):f+" "+e)}}),m.cssHooks.marginRight=Lb(k.reliableMarginRight,function(a,b){return b?m.swap(a,{display:"inline-block"},Jb,[a,"marginRight"]):void 0}),m.each({margin:"",padding:"",border:"Width"},function(a,b){m.cssHooks[a+b]={expand:function(c){for(var d=0,e={},f="string"==typeof c?c.split(" "):[c];4>d;d++)e[a+T[d]+b]=f[d]||f[d-2]||f[0];return e}},Gb.test(a)||(m.cssHooks[a+b].set=Wb)}),m.fn.extend({css:function(a,b){return V(this,function(a,b,c){var d,e,f={},g=0;if(m.isArray(b)){for(d=Ib(a),e=b.length;e>g;g++)f[b[g]]=m.css(a,b[g],!1,d);return f}return void 0!==c?m.style(a,b,c):m.css(a,b)},a,b,arguments.length>1)},show:function(){return Vb(this,!0)},hide:function(){return Vb(this)},toggle:function(a){return"boolean"==typeof a?a?this.show():this.hide():this.each(function(){U(this)?m(this).show():m(this).hide()})}});function Zb(a,b,c,d,e){return new 
Zb.prototype.init(a,b,c,d,e)}m.Tween=Zb,Zb.prototype={constructor:Zb,init:function(a,b,c,d,e,f){this.elem=a,this.prop=c,this.easing=e||"swing",this.options=b,this.start=this.now=this.cur(),this.end=d,this.unit=f||(m.cssNumber[c]?"":"px")
+},cur:function(){var a=Zb.propHooks[this.prop];return a&&a.get?a.get(this):Zb.propHooks._default.get(this)},run:function(a){var b,c=Zb.propHooks[this.prop];return this.pos=b=this.options.duration?m.easing[this.easing](a,this.options.duration*a,0,1,this.options.duration):a,this.now=(this.end-this.start)*b+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),c&&c.set?c.set(this):Zb.propHooks._default.set(this),this}},Zb.prototype.init.prototype=Zb.prototype,Zb.propHooks={_default:{get:function(a){var b;return null==a.elem[a.prop]||a.elem.style&&null!=a.elem.style[a.prop]?(b=m.css(a.elem,a.prop,""),b&&"auto"!==b?b:0):a.elem[a.prop]},set:function(a){m.fx.step[a.prop]?m.fx.step[a.prop](a):a.elem.style&&(null!=a.elem.style[m.cssProps[a.prop]]||m.cssHooks[a.prop])?m.style(a.elem,a.prop,a.now+a.unit):a.elem[a.prop]=a.now}}},Zb.propHooks.scrollTop=Zb.propHooks.scrollLeft={set:function(a){a.elem.nodeType&&a.elem.parentNode&&(a.elem[a.prop]=a.now)}},m.easing={linear:function(a){return a},swing:function(a){return.5-Math.cos(a*Math.PI)/2}},m.fx=Zb.prototype.init,m.fx.step={};var $b,_b,ac=/^(?:toggle|show|hide)$/,bc=new RegExp("^(?:([+-])=|)("+S+")([a-z%]*)$","i"),cc=/queueHooks$/,dc=[ic],ec={"*":[function(a,b){var c=this.createTween(a,b),d=c.cur(),e=bc.exec(b),f=e&&e[3]||(m.cssNumber[a]?"":"px"),g=(m.cssNumber[a]||"px"!==f&&+d)&&bc.exec(m.css(c.elem,a)),h=1,i=20;if(g&&g[3]!==f){f=f||g[3],e=e||[],g=+d||1;do h=h||".5",g/=h,m.style(c.elem,a,g+f);while(h!==(h=c.cur()/d)&&1!==h&&--i)}return e&&(g=c.start=+g||+d||0,c.unit=f,c.end=e[1]?g+(e[1]+1)*e[2]:+e[2]),c}]};function fc(){return setTimeout(function(){$b=void 0}),$b=m.now()}function gc(a,b){var c,d={height:a},e=0;for(b=b?1:0;4>e;e+=2-b)c=T[e],d["margin"+c]=d["padding"+c]=a;return b&&(d.opacity=d.width=a),d}function hc(a,b,c){for(var d,e=(ec[b]||[]).concat(ec["*"]),f=0,g=e.length;g>f;f++)if(d=e[f].call(c,b,a))return d}function ic(a,b,c){var 
d,e,f,g,h,i,j,l,n=this,o={},p=a.style,q=a.nodeType&&U(a),r=m._data(a,"fxshow");c.queue||(h=m._queueHooks(a,"fx"),null==h.unqueued&&(h.unqueued=0,i=h.empty.fire,h.empty.fire=function(){h.unqueued||i()}),h.unqueued++,n.always(function(){n.always(function(){h.unqueued--,m.queue(a,"fx").length||h.empty.fire()})})),1===a.nodeType&&("height"in b||"width"in b)&&(c.overflow=[p.overflow,p.overflowX,p.overflowY],j=m.css(a,"display"),l="none"===j?m._data(a,"olddisplay")||Fb(a.nodeName):j,"inline"===l&&"none"===m.css(a,"float")&&(k.inlineBlockNeedsLayout&&"inline"!==Fb(a.nodeName)?p.zoom=1:p.display="inline-block")),c.overflow&&(p.overflow="hidden",k.shrinkWrapBlocks()||n.always(function(){p.overflow=c.overflow[0],p.overflowX=c.overflow[1],p.overflowY=c.overflow[2]}));for(d in b)if(e=b[d],ac.exec(e)){if(delete b[d],f=f||"toggle"===e,e===(q?"hide":"show")){if("show"!==e||!r||void 0===r[d])continue;q=!0}o[d]=r&&r[d]||m.style(a,d)}else j=void 0;if(m.isEmptyObject(o))"inline"===("none"===j?Fb(a.nodeName):j)&&(p.display=j);else{r?"hidden"in r&&(q=r.hidden):r=m._data(a,"fxshow",{}),f&&(r.hidden=!q),q?m(a).show():n.done(function(){m(a).hide()}),n.done(function(){var b;m._removeData(a,"fxshow");for(b in o)m.style(a,b,o[b])});for(d in o)g=hc(q?r[d]:0,d,n),d in r||(r[d]=g.start,q&&(g.end=g.start,g.start="width"===d||"height"===d?1:0))}}function jc(a,b){var c,d,e,f,g;for(c in a)if(d=m.camelCase(c),e=b[d],f=a[c],m.isArray(f)&&(e=f[1],f=a[c]=f[0]),c!==d&&(a[d]=f,delete a[c]),g=m.cssHooks[d],g&&"expand"in g){f=g.expand(f),delete a[d];for(c in f)c in a||(a[c]=f[c],b[c]=e)}else b[d]=e}function kc(a,b,c){var d,e,f=0,g=dc.length,h=m.Deferred().always(function(){delete i.elem}),i=function(){if(e)return!1;for(var b=$b||fc(),c=Math.max(0,j.startTime+j.duration-b),d=c/j.duration||0,f=1-d,g=0,i=j.tweens.length;i>g;g++)j.tweens[g].run(f);return 
h.notifyWith(a,[j,f,c]),1>f&&i?c:(h.resolveWith(a,[j]),!1)},j=h.promise({elem:a,props:m.extend({},b),opts:m.extend(!0,{specialEasing:{}},c),originalProperties:b,originalOptions:c,startTime:$b||fc(),duration:c.duration,tweens:[],createTween:function(b,c){var d=m.Tween(a,j.opts,b,c,j.opts.specialEasing[b]||j.opts.easing);return j.tweens.push(d),d},stop:function(b){var c=0,d=b?j.tweens.length:0;if(e)return this;for(e=!0;d>c;c++)j.tweens[c].run(1);return b?h.resolveWith(a,[j,b]):h.rejectWith(a,[j,b]),this}}),k=j.props;for(jc(k,j.opts.specialEasing);g>f;f++)if(d=dc[f].call(j,a,k,j.opts))return d;return m.map(k,hc,j),m.isFunction(j.opts.start)&&j.opts.start.call(a,j),m.fx.timer(m.extend(i,{elem:a,anim:j,queue:j.opts.queue})),j.progress(j.opts.progress).done(j.opts.done,j.opts.complete).fail(j.opts.fail).always(j.opts.always)}m.Animation=m.extend(kc,{tweener:function(a,b){m.isFunction(a)?(b=a,a=["*"]):a=a.split(" ");for(var c,d=0,e=a.length;e>d;d++)c=a[d],ec[c]=ec[c]||[],ec[c].unshift(b)},prefilter:function(a,b){b?dc.unshift(a):dc.push(a)}}),m.speed=function(a,b,c){var d=a&&"object"==typeof a?m.extend({},a):{complete:c||!c&&b||m.isFunction(a)&&a,duration:a,easing:c&&b||b&&!m.isFunction(b)&&b};return d.duration=m.fx.off?0:"number"==typeof d.duration?d.duration:d.duration in m.fx.speeds?m.fx.speeds[d.duration]:m.fx.speeds._default,(null==d.queue||d.queue===!0)&&(d.queue="fx"),d.old=d.complete,d.complete=function(){m.isFunction(d.old)&&d.old.call(this),d.queue&&m.dequeue(this,d.queue)},d},m.fn.extend({fadeTo:function(a,b,c,d){return this.filter(U).css("opacity",0).show().end().animate({opacity:b},a,c,d)},animate:function(a,b,c,d){var e=m.isEmptyObject(a),f=m.speed(b,c,d),g=function(){var b=kc(this,m.extend({},a),f);(e||m._data(this,"finish"))&&b.stop(!0)};return g.finish=g,e||f.queue===!1?this.each(g):this.queue(f.queue,g)},stop:function(a,b,c){var d=function(a){var b=a.stop;delete a.stop,b(c)};return"string"!=typeof a&&(c=b,b=a,a=void 
0),b&&a!==!1&&this.queue(a||"fx",[]),this.each(function(){var b=!0,e=null!=a&&a+"queueHooks",f=m.timers,g=m._data(this);if(e)g[e]&&g[e].stop&&d(g[e]);else for(e in g)g[e]&&g[e].stop&&cc.test(e)&&d(g[e]);for(e=f.length;e--;)f[e].elem!==this||null!=a&&f[e].queue!==a||(f[e].anim.stop(c),b=!1,f.splice(e,1));(b||!c)&&m.dequeue(this,a)})},finish:function(a){return a!==!1&&(a=a||"fx"),this.each(function(){var b,c=m._data(this),d=c[a+"queue"],e=c[a+"queueHooks"],f=m.timers,g=d?d.length:0;for(c.finish=!0,m.queue(this,a,[]),e&&e.stop&&e.stop.call(this,!0),b=f.length;b--;)f[b].elem===this&&f[b].queue===a&&(f[b].anim.stop(!0),f.splice(b,1));for(b=0;g>b;b++)d[b]&&d[b].finish&&d[b].finish.call(this);delete c.finish})}}),m.each(["toggle","show","hide"],function(a,b){var c=m.fn[b];m.fn[b]=function(a,d,e){return null==a||"boolean"==typeof a?c.apply(this,arguments):this.animate(gc(b,!0),a,d,e)}}),m.each({slideDown:gc("show"),slideUp:gc("hide"),slideToggle:gc("toggle"),fadeIn:{opacity:"show"},fadeOut:{opacity:"hide"},fadeToggle:{opacity:"toggle"}},function(a,b){m.fn[a]=function(a,c,d){return this.animate(b,a,c,d)}}),m.timers=[],m.fx.tick=function(){var a,b=m.timers,c=0;for($b=m.now();c<b.length;c++)a=b[c],a()||b[c]!==a||b.splice(c--,1);b.length||m.fx.stop(),$b=void 0},m.fx.timer=function(a){m.timers.push(a),a()?m.fx.start():m.timers.pop()},m.fx.interval=13,m.fx.start=function(){_b||(_b=setInterval(m.fx.tick,m.fx.interval))},m.fx.stop=function(){clearInterval(_b),_b=null},m.fx.speeds={slow:600,fast:200,_default:400},m.fn.delay=function(a,b){return a=m.fx?m.fx.speeds[a]||a:a,b=b||"fx",this.queue(b,function(b,c){var d=setTimeout(b,a);c.stop=function(){clearTimeout(d)}})},function(){var a,b,c,d,e;b=y.createElement("div"),b.setAttribute("className","t"),b.innerHTML="  <link/><table></table><a href='/a'>a</a><input 
type='checkbox'/>",d=b.getElementsByTagName("a")[0],c=y.createElement("select"),e=c.appendChild(y.createElement("option")),a=b.getElementsByTagName("input")[0],d.style.cssText="top:1px",k.getSetAttribute="t"!==b.className,k.style=/top/.test(d.getAttribute("style")),k.hrefNormalized="/a"===d.getAttribute("href"),k.checkOn=!!a.value,k.optSelected=e.selected,k.enctype=!!y.createElement("form").enctype,c.disabled=!0,k.optDisabled=!e.disabled,a=y.createElement("input"),a.setAttribute("value",""),k.input=""===a.getAttribute("value"),a.value="t",a.setAttribute("type","radio"),k.radioValue="t"===a.value}();var lc=/\r/g;m.fn.extend({val:function(a){var b,c,d,e=this[0];{if(arguments.length)return d=m.isFunction(a),this.each(function(c){var e;1===this.nodeType&&(e=d?a.call(this,c,m(this).val()):a,null==e?e="":"number"==typeof e?e+="":m.isArray(e)&&(e=m.map(e,function(a){return null==a?"":a+""})),b=m.valHooks[this.type]||m.valHooks[this.nodeName.toLowerCase()],b&&"set"in b&&void 0!==b.set(this,e,"value")||(this.value=e))});if(e)return b=m.valHooks[e.type]||m.valHooks[e.nodeName.toLowerCase()],b&&"get"in b&&void 0!==(c=b.get(e,"value"))?c:(c=e.value,"string"==typeof c?c.replace(lc,""):null==c?"":c)}}}),m.extend({valHooks:{option:{get:function(a){var b=m.find.attr(a,"value");return null!=b?b:m.trim(m.text(a))}},select:{get:function(a){for(var b,c,d=a.options,e=a.selectedIndex,f="select-one"===a.type||0>e,g=f?null:[],h=f?e+1:d.length,i=0>e?h:f?e:0;h>i;i++)if(c=d[i],!(!c.selected&&i!==e||(k.optDisabled?c.disabled:null!==c.getAttribute("disabled"))||c.parentNode.disabled&&m.nodeName(c.parentNode,"optgroup"))){if(b=m(c).val(),f)return b;g.push(b)}return g},set:function(a,b){var c,d,e=a.options,f=m.makeArray(b),g=e.length;while(g--)if(d=e[g],m.inArray(m.valHooks.option.get(d),f)>=0)try{d.selected=c=!0}catch(h){d.scrollHeight}else d.selected=!1;return c||(a.selectedIndex=-1),e}}}}),m.each(["radio","checkbox"],function(){m.valHooks[this]={set:function(a,b){return 
m.isArray(b)?a.checked=m.inArray(m(a).val(),b)>=0:void 0}},k.checkOn||(m.valHooks[this].get=function(a){return null===a.getAttribute("value")?"on":a.value})});var mc,nc,oc=m.expr.attrHandle,pc=/^(?:checked|selected)$/i,qc=k.getSetAttribute,rc=k.input;m.fn.extend({attr:function(a,b){return V(this,m.attr,a,b,arguments.length>1)},removeAttr:function(a){return this.each(function(){m.removeAttr(this,a)})}}),m.extend({attr:function(a,b,c){var d,e,f=a.nodeType;if(a&&3!==f&&8!==f&&2!==f)return typeof a.getAttribute===K?m.prop(a,b,c):(1===f&&m.isXMLDoc(a)||(b=b.toLowerCase(),d=m.attrHooks[b]||(m.expr.match.bool.test(b)?nc:mc)),void 0===c?d&&"get"in d&&null!==(e=d.get(a,b))?e:(e=m.find.attr(a,b),null==e?void 0:e):null!==c?d&&"set"in d&&void 0!==(e=d.set(a,c,b))?e:(a.setAttribute(b,c+""),c):void m.removeAttr(a,b))},removeAttr:function(a,b){var c,d,e=0,f=b&&b.match(E);if(f&&1===a.nodeType)while(c=f[e++])d=m.propFix[c]||c,m.expr.match.bool.test(c)?rc&&qc||!pc.test(c)?a[d]=!1:a[m.camelCase("default-"+c)]=a[d]=!1:m.attr(a,c,""),a.removeAttribute(qc?c:d)},attrHooks:{type:{set:function(a,b){if(!k.radioValue&&"radio"===b&&m.nodeName(a,"input")){var c=a.value;return a.setAttribute("type",b),c&&(a.value=c),b}}}}}),nc={set:function(a,b,c){return b===!1?m.removeAttr(a,c):rc&&qc||!pc.test(c)?a.setAttribute(!qc&&m.propFix[c]||c,c):a[m.camelCase("default-"+c)]=a[c]=!0,c}},m.each(m.expr.match.bool.source.match(/\w+/g),function(a,b){var c=oc[b]||m.find.attr;oc[b]=rc&&qc||!pc.test(b)?function(a,b,d){var e,f;return d||(f=oc[b],oc[b]=e,e=null!=c(a,b,d)?b.toLowerCase():null,oc[b]=f),e}:function(a,b,c){return c?void 0:a[m.camelCase("default-"+b)]?b.toLowerCase():null}}),rc&&qc||(m.attrHooks.value={set:function(a,b,c){return m.nodeName(a,"input")?void(a.defaultValue=b):mc&&mc.set(a,b,c)}}),qc||(mc={set:function(a,b,c){var d=a.getAttributeNode(c);return d||a.setAttributeNode(d=a.ownerDocument.createAttribute(c)),d.value=b+="","value"===c||b===a.getAttribute(c)?b:void 
0}},oc.id=oc.name=oc.coords=function(a,b,c){var d;return c?void 0:(d=a.getAttributeNode(b))&&""!==d.value?d.value:null},m.valHooks.button={get:function(a,b){var c=a.getAttributeNode(b);return c&&c.specified?c.value:void 0},set:mc.set},m.attrHooks.contenteditable={set:function(a,b,c){mc.set(a,""===b?!1:b,c)}},m.each(["width","height"],function(a,b){m.attrHooks[b]={set:function(a,c){return""===c?(a.setAttribute(b,"auto"),c):void 0}}})),k.style||(m.attrHooks.style={get:function(a){return a.style.cssText||void 0},set:function(a,b){return a.style.cssText=b+""}});var sc=/^(?:input|select|textarea|button|object)$/i,tc=/^(?:a|area)$/i;m.fn.extend({prop:function(a,b){return V(this,m.prop,a,b,arguments.length>1)},removeProp:function(a){return a=m.propFix[a]||a,this.each(function(){try{this[a]=void 0,delete this[a]}catch(b){}})}}),m.extend({propFix:{"for":"htmlFor","class":"className"},prop:function(a,b,c){var d,e,f,g=a.nodeType;if(a&&3!==g&&8!==g&&2!==g)return f=1!==g||!m.isXMLDoc(a),f&&(b=m.propFix[b]||b,e=m.propHooks[b]),void 0!==c?e&&"set"in e&&void 0!==(d=e.set(a,c,b))?d:a[b]=c:e&&"get"in e&&null!==(d=e.get(a,b))?d:a[b]},propHooks:{tabIndex:{get:function(a){var b=m.find.attr(a,"tabindex");return b?parseInt(b,10):sc.test(a.nodeName)||tc.test(a.nodeName)&&a.href?0:-1}}}}),k.hrefNormalized||m.each(["href","src"],function(a,b){m.propHooks[b]={get:function(a){return a.getAttribute(b,4)}}}),k.optSelected||(m.propHooks.selected={get:function(a){var b=a.parentNode;return b&&(b.selectedIndex,b.parentNode&&b.parentNode.selectedIndex),null}}),m.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){m.propFix[this.toLowerCase()]=this}),k.enctype||(m.propFix.enctype="encoding");var uc=/[\t\r\n\f]/g;m.fn.extend({addClass:function(a){var b,c,d,e,f,g,h=0,i=this.length,j="string"==typeof a&&a;if(m.isFunction(a))return 
this.each(function(b){m(this).addClass(a.call(this,b,this.className))});if(j)for(b=(a||"").match(E)||[];i>h;h++)if(c=this[h],d=1===c.nodeType&&(c.className?(" "+c.className+" ").replace(uc," "):" ")){f=0;while(e=b[f++])d.indexOf(" "+e+" ")<0&&(d+=e+" ");g=m.trim(d),c.className!==g&&(c.className=g)}return this},removeClass:function(a){var b,c,d,e,f,g,h=0,i=this.length,j=0===arguments.length||"string"==typeof a&&a;if(m.isFunction(a))return this.each(function(b){m(this).removeClass(a.call(this,b,this.className))});if(j)for(b=(a||"").match(E)||[];i>h;h++)if(c=this[h],d=1===c.nodeType&&(c.className?(" "+c.className+" ").replace(uc," "):"")){f=0;while(e=b[f++])while(d.indexOf(" "+e+" ")>=0)d=d.replace(" "+e+" "," ");g=a?m.trim(d):"",c.className!==g&&(c.className=g)}return this},toggleClass:function(a,b){var c=typeof a;return"boolean"==typeof b&&"string"===c?b?this.addClass(a):this.removeClass(a):this.each(m.isFunction(a)?function(c){m(this).toggleClass(a.call(this,c,this.className,b),b)}:function(){if("string"===c){var b,d=0,e=m(this),f=a.match(E)||[];while(b=f[d++])e.hasClass(b)?e.removeClass(b):e.addClass(b)}else(c===K||"boolean"===c)&&(this.className&&m._data(this,"__className__",this.className),this.className=this.className||a===!1?"":m._data(this,"__className__")||"")})},hasClass:function(a){for(var b=" "+a+" ",c=0,d=this.length;d>c;c++)if(1===this[c].nodeType&&(" "+this[c].className+" ").replace(uc," ").indexOf(b)>=0)return!0;return!1}}),m.each("blur focus focusin focusout load resize scroll unload click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup error contextmenu".split(" "),function(a,b){m.fn[b]=function(a,c){return arguments.length>0?this.on(b,null,a,c):this.trigger(b)}}),m.fn.extend({hover:function(a,b){return this.mouseenter(a).mouseleave(b||a)},bind:function(a,b,c){return this.on(a,null,b,c)},unbind:function(a,b){return this.off(a,null,b)},delegate:function(a,b,c,d){return 
this.on(b,a,c,d)},undelegate:function(a,b,c){return 1===arguments.length?this.off(a,"**"):this.off(b,a||"**",c)}});var vc=m.now(),wc=/\?/,xc=/(,)|(\[|{)|(}|])|"(?:[^"\\\r\n]|\\["\\\/bfnrt]|\\u[\da-fA-F]{4})*"\s*:?|true|false|null|-?(?!0\d)\d+(?:\.\d+|)(?:[eE][+-]?\d+|)/g;m.parseJSON=function(b){if(a.JSON&&a.JSON.parse)return a.JSON.parse(b+"");var c,d=null,e=m.trim(b+"");return e&&!m.trim(e.replace(xc,function(a,b,e,f){return c&&b&&(d=0),0===d?a:(c=e||b,d+=!f-!e,"")}))?Function("return "+e)():m.error("Invalid JSON: "+b)},m.parseXML=function(b){var c,d;if(!b||"string"!=typeof b)return null;try{a.DOMParser?(d=new DOMParser,c=d.parseFromString(b,"text/xml")):(c=new ActiveXObject("Microsoft.XMLDOM"),c.async="false",c.loadXML(b))}catch(e){c=void 0}return c&&c.documentElement&&!c.getElementsByTagName("parsererror").length||m.error("Invalid XML: "+b),c};var yc,zc,Ac=/#.*$/,Bc=/([?&])_=[^&]*/,Cc=/^(.*?):[ \t]*([^\r\n]*)\r?$/gm,Dc=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,Ec=/^(?:GET|HEAD)$/,Fc=/^\/\//,Gc=/^([\w.+-]+:)(?:\/\/(?:[^\/?#]*@|)([^\/?#:]*)(?::(\d+)|)|)/,Hc={},Ic={},Jc="*/".concat("*");try{zc=location.href}catch(Kc){zc=y.createElement("a"),zc.href="",zc=zc.href}yc=Gc.exec(zc.toLowerCase())||[];function Lc(a){return function(b,c){"string"!=typeof b&&(c=b,b="*");var d,e=0,f=b.toLowerCase().match(E)||[];if(m.isFunction(c))while(d=f[e++])"+"===d.charAt(0)?(d=d.slice(1)||"*",(a[d]=a[d]||[]).unshift(c)):(a[d]=a[d]||[]).push(c)}}function Mc(a,b,c,d){var e={},f=a===Ic;function g(h){var i;return e[h]=!0,m.each(a[h]||[],function(a,h){var j=h(b,c,d);return"string"!=typeof j||f||e[j]?f?!(i=j):void 0:(b.dataTypes.unshift(j),g(j),!1)}),i}return g(b.dataTypes[0])||!e["*"]&&g("*")}function Nc(a,b){var c,d,e=m.ajaxSettings.flatOptions||{};for(d in b)void 0!==b[d]&&((e[d]?a:c||(c={}))[d]=b[d]);return c&&m.extend(!0,a,c),a}function Oc(a,b,c){var d,e,f,g,h=a.contents,i=a.dataTypes;while("*"===i[0])i.shift(),void 
0===e&&(e=a.mimeType||b.getResponseHeader("Content-Type"));if(e)for(g in h)if(h[g]&&h[g].test(e)){i.unshift(g);break}if(i[0]in c)f=i[0];else{for(g in c){if(!i[0]||a.converters[g+" "+i[0]]){f=g;break}d||(d=g)}f=f||d}return f?(f!==i[0]&&i.unshift(f),c[f]):void 0}function Pc(a,b,c,d){var e,f,g,h,i,j={},k=a.dataTypes.slice();if(k[1])for(g in a.converters)j[g.toLowerCase()]=a.converters[g];f=k.shift();while(f)if(a.responseFields[f]&&(c[a.responseFields[f]]=b),!i&&d&&a.dataFilter&&(b=a.dataFilter(b,a.dataType)),i=f,f=k.shift())if("*"===f)f=i;else if("*"!==i&&i!==f){if(g=j[i+" "+f]||j["* "+f],!g)for(e in j)if(h=e.split(" "),h[1]===f&&(g=j[i+" "+h[0]]||j["* "+h[0]])){g===!0?g=j[e]:j[e]!==!0&&(f=h[0],k.unshift(h[1]));break}if(g!==!0)if(g&&a["throws"])b=g(b);else try{b=g(b)}catch(l){return{state:"parsererror",error:g?l:"No conversion from "+i+" to "+f}}}return{state:"success",data:b}}m.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:zc,type:"GET",isLocal:Dc.test(yc[1]),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":Jc,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/xml/,html:/html/,json:/json/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":m.parseJSON,"text xml":m.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(a,b){return b?Nc(Nc(a,m.ajaxSettings),b):Nc(m.ajaxSettings,a)},ajaxPrefilter:Lc(Hc),ajaxTransport:Lc(Ic),ajax:function(a,b){"object"==typeof a&&(b=a,a=void 0),b=b||{};var c,d,e,f,g,h,i,j,k=m.ajaxSetup({},b),l=k.context||k,n=k.context&&(l.nodeType||l.jquery)?m(l):m.event,o=m.Deferred(),p=m.Callbacks("once memory"),q=k.statusCode||{},r={},s={},t=0,u="canceled",v={readyState:0,getResponseHeader:function(a){var b;if(2===t){if(!j){j={};while(b=Cc.exec(f))j[b[1].toLowerCase()]=b[2]}b=j[a.toLowerCase()]}return 
null==b?null:b},getAllResponseHeaders:function(){return 2===t?f:null},setRequestHeader:function(a,b){var c=a.toLowerCase();return t||(a=s[c]=s[c]||a,r[a]=b),this},overrideMimeType:function(a){return t||(k.mimeType=a),this},statusCode:function(a){var b;if(a)if(2>t)for(b in a)q[b]=[q[b],a[b]];else v.always(a[v.status]);return this},abort:function(a){var b=a||u;return i&&i.abort(b),x(0,b),this}};if(o.promise(v).complete=p.add,v.success=v.done,v.error=v.fail,k.url=((a||k.url||zc)+"").replace(Ac,"").replace(Fc,yc[1]+"//"),k.type=b.method||b.type||k.method||k.type,k.dataTypes=m.trim(k.dataType||"*").toLowerCase().match(E)||[""],null==k.crossDomain&&(c=Gc.exec(k.url.toLowerCase()),k.crossDomain=!(!c||c[1]===yc[1]&&c[2]===yc[2]&&(c[3]||("http:"===c[1]?"80":"443"))===(yc[3]||("http:"===yc[1]?"80":"443")))),k.data&&k.processData&&"string"!=typeof k.data&&(k.data=m.param(k.data,k.traditional)),Mc(Hc,k,b,v),2===t)return v;h=k.global,h&&0===m.active++&&m.event.trigger("ajaxStart"),k.type=k.type.toUpperCase(),k.hasContent=!Ec.test(k.type),e=k.url,k.hasContent||(k.data&&(e=k.url+=(wc.test(e)?"&":"?")+k.data,delete k.data),k.cache===!1&&(k.url=Bc.test(e)?e.replace(Bc,"$1_="+vc++):e+(wc.test(e)?"&":"?")+"_="+vc++)),k.ifModified&&(m.lastModified[e]&&v.setRequestHeader("If-Modified-Since",m.lastModified[e]),m.etag[e]&&v.setRequestHeader("If-None-Match",m.etag[e])),(k.data&&k.hasContent&&k.contentType!==!1||b.contentType)&&v.setRequestHeader("Content-Type",k.contentType),v.setRequestHeader("Accept",k.dataTypes[0]&&k.accepts[k.dataTypes[0]]?k.accepts[k.dataTypes[0]]+("*"!==k.dataTypes[0]?", "+Jc+"; q=0.01":""):k.accepts["*"]);for(d in k.headers)v.setRequestHeader(d,k.headers[d]);if(k.beforeSend&&(k.beforeSend.call(l,v,k)===!1||2===t))return v.abort();u="abort";for(d 
in{success:1,error:1,complete:1})v[d](k[d]);if(i=Mc(Ic,k,b,v)){v.readyState=1,h&&n.trigger("ajaxSend",[v,k]),k.async&&k.timeout>0&&(g=setTimeout(function(){v.abort("timeout")},k.timeout));try{t=1,i.send(r,x)}catch(w){if(!(2>t))throw w;x(-1,w)}}else x(-1,"No Transport");function x(a,b,c,d){var j,r,s,u,w,x=b;2!==t&&(t=2,g&&clearTimeout(g),i=void 0,f=d||"",v.readyState=a>0?4:0,j=a>=200&&300>a||304===a,c&&(u=Oc(k,v,c)),u=Pc(k,u,v,j),j?(k.ifModified&&(w=v.getResponseHeader("Last-Modified"),w&&(m.lastModified[e]=w),w=v.getResponseHeader("etag"),w&&(m.etag[e]=w)),204===a||"HEAD"===k.type?x="nocontent":304===a?x="notmodified":(x=u.state,r=u.data,s=u.error,j=!s)):(s=x,(a||!x)&&(x="error",0>a&&(a=0))),v.status=a,v.statusText=(b||x)+"",j?o.resolveWith(l,[r,x,v]):o.rejectWith(l,[v,x,s]),v.statusCode(q),q=void 0,h&&n.trigger(j?"ajaxSuccess":"ajaxError",[v,k,j?r:s]),p.fireWith(l,[v,x]),h&&(n.trigger("ajaxComplete",[v,k]),--m.active||m.event.trigger("ajaxStop")))}return v},getJSON:function(a,b,c){return m.get(a,b,c,"json")},getScript:function(a,b){return m.get(a,void 0,b,"script")}}),m.each(["get","post"],function(a,b){m[b]=function(a,c,d,e){return m.isFunction(c)&&(e=e||d,d=c,c=void 0),m.ajax({url:a,type:b,dataType:e,data:c,success:d})}}),m.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(a,b){m.fn[b]=function(a){return this.on(b,a)}}),m._evalUrl=function(a){return m.ajax({url:a,type:"GET",dataType:"script",async:!1,global:!1,"throws":!0})},m.fn.extend({wrapAll:function(a){if(m.isFunction(a))return this.each(function(b){m(this).wrapAll(a.call(this,b))});if(this[0]){var b=m(a,this[0].ownerDocument).eq(0).clone(!0);this[0].parentNode&&b.insertBefore(this[0]),b.map(function(){var a=this;while(a.firstChild&&1===a.firstChild.nodeType)a=a.firstChild;return a}).append(this)}return this},wrapInner:function(a){return this.each(m.isFunction(a)?function(b){m(this).wrapInner(a.call(this,b))}:function(){var 
b=m(this),c=b.contents();c.length?c.wrapAll(a):b.append(a)})},wrap:function(a){var b=m.isFunction(a);return this.each(function(c){m(this).wrapAll(b?a.call(this,c):a)})},unwrap:function(){return this.parent().each(function(){m.nodeName(this,"body")||m(this).replaceWith(this.childNodes)}).end()}}),m.expr.filters.hidden=function(a){return a.offsetWidth<=0&&a.offsetHeight<=0||!k.reliableHiddenOffsets()&&"none"===(a.style&&a.style.display||m.css(a,"display"))},m.expr.filters.visible=function(a){return!m.expr.filters.hidden(a)};var Qc=/%20/g,Rc=/\[\]$/,Sc=/\r?\n/g,Tc=/^(?:submit|button|image|reset|file)$/i,Uc=/^(?:input|select|textarea|keygen)/i;function Vc(a,b,c,d){var e;if(m.isArray(b))m.each(b,function(b,e){c||Rc.test(a)?d(a,e):Vc(a+"["+("object"==typeof e?b:"")+"]",e,c,d)});else if(c||"object"!==m.type(b))d(a,b);else for(e in b)Vc(a+"["+e+"]",b[e],c,d)}m.param=function(a,b){var c,d=[],e=function(a,b){b=m.isFunction(b)?b():null==b?"":b,d[d.length]=encodeURIComponent(a)+"="+encodeURIComponent(b)};if(void 0===b&&(b=m.ajaxSettings&&m.ajaxSettings.traditional),m.isArray(a)||a.jquery&&!m.isPlainObject(a))m.each(a,function(){e(this.name,this.value)});else for(c in a)Vc(c,a[c],b,e);return d.join("&").replace(Qc,"+")},m.fn.extend({serialize:function(){return m.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var a=m.prop(this,"elements");return a?m.makeArray(a):this}).filter(function(){var a=this.type;return this.name&&!m(this).is(":disabled")&&Uc.test(this.nodeName)&&!Tc.test(a)&&(this.checked||!W.test(a))}).map(function(a,b){var c=m(this).val();return null==c?null:m.isArray(c)?m.map(c,function(a){return{name:b.name,value:a.replace(Sc,"\r\n")}}):{name:b.name,value:c.replace(Sc,"\r\n")}}).get()}}),m.ajaxSettings.xhr=void 0!==a.ActiveXObject?function(){return!this.isLocal&&/^(get|post|head|put|delete|options)$/i.test(this.type)&&Zc()||$c()}:Zc;var Wc=0,Xc={},Yc=m.ajaxSettings.xhr();a.ActiveXObject&&m(a).on("unload",function(){for(var a in 
Xc)Xc[a](void 0,!0)}),k.cors=!!Yc&&"withCredentials"in Yc,Yc=k.ajax=!!Yc,Yc&&m.ajaxTransport(function(a){if(!a.crossDomain||k.cors){var b;return{send:function(c,d){var e,f=a.xhr(),g=++Wc;if(f.open(a.type,a.url,a.async,a.username,a.password),a.xhrFields)for(e in a.xhrFields)f[e]=a.xhrFields[e];a.mimeType&&f.overrideMimeType&&f.overrideMimeType(a.mimeType),a.crossDomain||c["X-Requested-With"]||(c["X-Requested-With"]="XMLHttpRequest");for(e in c)void 0!==c[e]&&f.setRequestHeader(e,c[e]+"");f.send(a.hasContent&&a.data||null),b=function(c,e){var h,i,j;if(b&&(e||4===f.readyState))if(delete Xc[g],b=void 0,f.onreadystatechange=m.noop,e)4!==f.readyState&&f.abort();else{j={},h=f.status,"string"==typeof f.responseText&&(j.text=f.responseText);try{i=f.statusText}catch(k){i=""}h||!a.isLocal||a.crossDomain?1223===h&&(h=204):h=j.text?200:404}j&&d(h,i,j,f.getAllResponseHeaders())},a.async?4===f.readyState?setTimeout(b):f.onreadystatechange=Xc[g]=b:b()},abort:function(){b&&b(void 0,!0)}}}});function Zc(){try{return new a.XMLHttpRequest}catch(b){}}function $c(){try{return new a.ActiveXObject("Microsoft.XMLHTTP")}catch(b){}}m.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/(?:java|ecma)script/},converters:{"text script":function(a){return m.globalEval(a),a}}}),m.ajaxPrefilter("script",function(a){void 0===a.cache&&(a.cache=!1),a.crossDomain&&(a.type="GET",a.global=!1)}),m.ajaxTransport("script",function(a){if(a.crossDomain){var b,c=y.head||m("head")[0]||y.documentElement;return{send:function(d,e){b=y.createElement("script"),b.async=!0,a.scriptCharset&&(b.charset=a.scriptCharset),b.src=a.url,b.onload=b.onreadystatechange=function(a,c){(c||!b.readyState||/loaded|complete/.test(b.readyState))&&(b.onload=b.onreadystatechange=null,b.parentNode&&b.parentNode.removeChild(b),b=null,c||e(200,"success"))},c.insertBefore(b,c.firstChild)},abort:function(){b&&b.onload(void 0,!0)}}}});var 
_c=[],ad=/(=)\?(?=&|$)|\?\?/;m.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var a=_c.pop()||m.expando+"_"+vc++;return this[a]=!0,a}}),m.ajaxPrefilter("json jsonp",function(b,c,d){var e,f,g,h=b.jsonp!==!1&&(ad.test(b.url)?"url":"string"==typeof b.data&&!(b.contentType||"").indexOf("application/x-www-form-urlencoded")&&ad.test(b.data)&&"data");return h||"jsonp"===b.dataTypes[0]?(e=b.jsonpCallback=m.isFunction(b.jsonpCallback)?b.jsonpCallback():b.jsonpCallback,h?b[h]=b[h].replace(ad,"$1"+e):b.jsonp!==!1&&(b.url+=(wc.test(b.url)?"&":"?")+b.jsonp+"="+e),b.converters["script json"]=function(){return g||m.error(e+" was not called"),g[0]},b.dataTypes[0]="json",f=a[e],a[e]=function(){g=arguments},d.always(function(){a[e]=f,b[e]&&(b.jsonpCallback=c.jsonpCallback,_c.push(e)),g&&m.isFunction(f)&&f(g[0]),g=f=void 0}),"script"):void 0}),m.parseHTML=function(a,b,c){if(!a||"string"!=typeof a)return null;"boolean"==typeof b&&(c=b,b=!1),b=b||y;var d=u.exec(a),e=!c&&[];return d?[b.createElement(d[1])]:(d=m.buildFragment([a],b,e),e&&e.length&&m(e).remove(),m.merge([],d.childNodes))};var bd=m.fn.load;m.fn.load=function(a,b,c){if("string"!=typeof a&&bd)return bd.apply(this,arguments);var d,e,f,g=this,h=a.indexOf(" ");return h>=0&&(d=m.trim(a.slice(h,a.length)),a=a.slice(0,h)),m.isFunction(b)?(c=b,b=void 0):b&&"object"==typeof b&&(f="POST"),g.length>0&&m.ajax({url:a,type:f,dataType:"html",data:b}).done(function(a){e=arguments,g.html(d?m("<div>").append(m.parseHTML(a)).find(d):a)}).complete(c&&function(a,b){g.each(c,e||[a.responseText,b,a])}),this},m.expr.filters.animated=function(a){return m.grep(m.timers,function(b){return a===b.elem}).length};var cd=a.document.documentElement;function dd(a){return m.isWindow(a)?a:9===a.nodeType?a.defaultView||a.parentWindow:!1}m.offset={setOffset:function(a,b,c){var 
d,e,f,g,h,i,j,k=m.css(a,"position"),l=m(a),n={};"static"===k&&(a.style.position="relative"),h=l.offset(),f=m.css(a,"top"),i=m.css(a,"left"),j=("absolute"===k||"fixed"===k)&&m.inArray("auto",[f,i])>-1,j?(d=l.position(),g=d.top,e=d.left):(g=parseFloat(f)||0,e=parseFloat(i)||0),m.isFunction(b)&&(b=b.call(a,c,h)),null!=b.top&&(n.top=b.top-h.top+g),null!=b.left&&(n.left=b.left-h.left+e),"using"in b?b.using.call(a,n):l.css(n)}},m.fn.extend({offset:function(a){if(arguments.length)return void 0===a?this:this.each(function(b){m.offset.setOffset(this,a,b)});var b,c,d={top:0,left:0},e=this[0],f=e&&e.ownerDocument;if(f)return b=f.documentElement,m.contains(b,e)?(typeof e.getBoundingClientRect!==K&&(d=e.getBoundingClientRect()),c=dd(f),{top:d.top+(c.pageYOffset||b.scrollTop)-(b.clientTop||0),left:d.left+(c.pageXOffset||b.scrollLeft)-(b.clientLeft||0)}):d},position:function(){if(this[0]){var a,b,c={top:0,left:0},d=this[0];return"fixed"===m.css(d,"position")?b=d.getBoundingClientRect():(a=this.offsetParent(),b=this.offset(),m.nodeName(a[0],"html")||(c=a.offset()),c.top+=m.css(a[0],"borderTopWidth",!0),c.left+=m.css(a[0],"borderLeftWidth",!0)),{top:b.top-c.top-m.css(d,"marginTop",!0),left:b.left-c.left-m.css(d,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var a=this.offsetParent||cd;while(a&&!m.nodeName(a,"html")&&"static"===m.css(a,"position"))a=a.offsetParent;return a||cd})}}),m.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(a,b){var c=/Y/.test(b);m.fn[a]=function(d){return V(this,function(a,d,e){var f=dd(a);return void 0===e?f?b in f?f[b]:f.document.documentElement[d]:a[d]:void(f?f.scrollTo(c?m(f).scrollLeft():e,c?e:m(f).scrollTop()):a[d]=e)},a,d,arguments.length,null)}}),m.each(["top","left"],function(a,b){m.cssHooks[b]=Lb(k.pixelPosition,function(a,c){return c?(c=Jb(a,b),Hb.test(c)?m(a).position()[b]+"px":c):void 
0})}),m.each({Height:"height",Width:"width"},function(a,b){m.each({padding:"inner"+a,content:b,"":"outer"+a},function(c,d){m.fn[d]=function(d,e){var f=arguments.length&&(c||"boolean"!=typeof d),g=c||(d===!0||e===!0?"margin":"border");return V(this,function(b,c,d){var e;return m.isWindow(b)?b.document.documentElement["client"+a]:9===b.nodeType?(e=b.documentElement,Math.max(b.body["scroll"+a],e["scroll"+a],b.body["offset"+a],e["offset"+a],e["client"+a])):void 0===d?m.css(b,c,g):m.style(b,c,d,g)},b,f?d:void 0,f,null)}})}),m.fn.size=function(){return this.length},m.fn.andSelf=m.fn.addBack,"function"==typeof define&&define.amd&&define("jquery",[],function(){return m});var ed=a.jQuery,fd=a.$;return m.noConflict=function(b){return a.$===m&&(a.$=fd),b&&a.jQuery===m&&(a.jQuery=ed),m},typeof b===K&&(a.jQuery=a.$=m),m});
+
+
+
+/** d3js **/
+!function(){function n(n,t){return t>n?-1:n>t?1:n>=t?0:0/0}function t(n){return null!=n&&!isNaN(n)}function e(n){return{left:function(t,e,r,u){for(arguments.length<3&&(r=0),arguments.length<4&&(u=t.length);u>r;){var i=r+u>>>1;n(t[i],e)<0?r=i+1:u=i}return r},right:function(t,e,r,u){for(arguments.length<3&&(r=0),arguments.length<4&&(u=t.length);u>r;){var i=r+u>>>1;n(t[i],e)>0?u=i:r=i+1}return r}}}function r(n){return n.length}function u(n){for(var t=1;n*t%1;)t*=10;return t}function i(n,t){try{for(var e in t)Object.defineProperty(n.prototype,e,{value:t[e],enumerable:!1})}catch(r){n.prototype=t}}function o(){}function a(n){return ia+n in this}function c(n){return n=ia+n,n in this&&delete this[n]}function s(){var n=[];return this.forEach(function(t){n.push(t)}),n}function l(){var n=0;for(var t in this)t.charCodeAt(0)===oa&&++n;return n}function f(){for(var n in this)if(n.charCodeAt(0)===oa)return!1;return!0}function h(){}function g(n,t,e){return function(){var r=e.apply(t,arguments);return r===t?n:r}}function p(n,t){if(t in n)return t;t=t.charAt(0).toUpperCase()+t.substring(1);for(var e=0,r=aa.length;r>e;++e){var u=aa[e]+t;if(u in n)return u}}function v(){}function d(){}function m(n){function t(){for(var t,r=e,u=-1,i=r.length;++u<i;)(t=r[u].on)&&t.apply(this,arguments);return n}var e=[],r=new o;return t.on=function(t,u){var i,o=r.get(t);return arguments.length<2?o&&o.on:(o&&(o.on=null,e=e.slice(0,i=e.indexOf(o)).concat(e.slice(i+1)),r.remove(t)),u&&e.push(r.set(t,{on:u})),n)},t}function y(){Zo.event.preventDefault()}function x(){for(var n,t=Zo.event;n=t.sourceEvent;)t=n;return t}function M(n){for(var t=new d,e=0,r=arguments.length;++e<r;)t[arguments[e]]=m(t);return t.of=function(e,r){return function(u){try{var i=u.sourceEvent=Zo.event;u.target=n,Zo.event=u,t[u.type].apply(e,r)}finally{Zo.event=i}}},t}function _(n){return sa(n,pa),n}function b(n){return"function"==typeof n?n:function(){return la(n,this)}}function w(n){return"function"==typeof n?n:function(){return 
fa(n,this)}}function S(n,t){function e(){this.removeAttribute(n)}function r(){this.removeAttributeNS(n.space,n.local)}function u(){this.setAttribute(n,t)}function i(){this.setAttributeNS(n.space,n.local,t)}function o(){var e=t.apply(this,arguments);null==e?this.removeAttribute(n):this.setAttribute(n,e)}function a(){var e=t.apply(this,arguments);null==e?this.removeAttributeNS(n.space,n.local):this.setAttributeNS(n.space,n.local,e)}return n=Zo.ns.qualify(n),null==t?n.local?r:e:"function"==typeof t?n.local?a:o:n.local?i:u}function k(n){return n.trim().replace(/\s+/g," ")}function E(n){return new RegExp("(?:^|\\s+)"+Zo.requote(n)+"(?:\\s+|$)","g")}function A(n){return(n+"").trim().split(/^|\s+/)}function C(n,t){function e(){for(var e=-1;++e<u;)n[e](this,t)}function r(){for(var e=-1,r=t.apply(this,arguments);++e<u;)n[e](this,r)}n=A(n).map(N);var u=n.length;return"function"==typeof t?r:e}function N(n){var t=E(n);return function(e,r){if(u=e.classList)return r?u.add(n):u.remove(n);var u=e.getAttribute("class")||"";r?(t.lastIndex=0,t.test(u)||e.setAttribute("class",k(u+" "+n))):e.setAttribute("class",k(u.replace(t," ")))}}function z(n,t,e){function r(){this.style.removeProperty(n)}function u(){this.style.setProperty(n,t,e)}function i(){var r=t.apply(this,arguments);null==r?this.style.removeProperty(n):this.style.setProperty(n,r,e)}return null==t?r:"function"==typeof t?i:u}function L(n,t){function e(){delete this[n]}function r(){this[n]=t}function u(){var e=t.apply(this,arguments);null==e?delete this[n]:this[n]=e}return null==t?e:"function"==typeof t?u:r}function T(n){return"function"==typeof n?n:(n=Zo.ns.qualify(n)).local?function(){return this.ownerDocument.createElementNS(n.space,n.local)}:function(){return this.ownerDocument.createElementNS(this.namespaceURI,n)}}function q(n){return{__data__:n}}function R(n){return function(){return ga(this,n)}}function D(t){return arguments.length||(t=n),function(n,e){return n&&e?t(n.__data__,e.__data__):!n-!e}}function P(n,t){for(var 
e=0,r=n.length;r>e;e++)for(var u,i=n[e],o=0,a=i.length;a>o;o++)(u=i[o])&&t(u,o,e);return n}function U(n){return sa(n,da),n}function j(n){var t,e;return function(r,u,i){var o,a=n[i].update,c=a.length;for(i!=e&&(e=i,t=0),u>=t&&(t=u+1);!(o=a[t])&&++t<c;);return o}}function H(){var n=this.__transition__;n&&++n.active}function F(n,t,e){function r(){var t=this[o];t&&(this.removeEventListener(n,t,t.$),delete this[o])}function u(){var u=c(t,Xo(arguments));r.call(this),this.addEventListener(n,this[o]=u,u.$=e),u._=t}function i(){var t,e=new RegExp("^__on([^.]+)"+Zo.requote(n)+"$");for(var r in this)if(t=r.match(e)){var u=this[r];this.removeEventListener(t[1],u,u.$),delete this[r]}}var o="__on"+n,a=n.indexOf("."),c=O;a>0&&(n=n.substring(0,a));var s=ya.get(n);return s&&(n=s,c=Y),a?t?u:r:t?v:i}function O(n,t){return function(e){var r=Zo.event;Zo.event=e,t[0]=this.__data__;try{n.apply(this,t)}finally{Zo.event=r}}}function Y(n,t){var e=O(n,t);return function(n){var t=this,r=n.relatedTarget;r&&(r===t||8&r.compareDocumentPosition(t))||e.call(t,n)}}function I(){var n=".dragsuppress-"+ ++Ma,t="click"+n,e=Zo.select(Wo).on("touchmove"+n,y).on("dragstart"+n,y).on("selectstart"+n,y);if(xa){var r=Bo.style,u=r[xa];r[xa]="none"}return function(i){function o(){e.on(t,null)}e.on(n,null),xa&&(r[xa]=u),i&&(e.on(t,function(){y(),o()},!0),setTimeout(o,0))}}function Z(n,t){t.changedTouches&&(t=t.changedTouches[0]);var e=n.ownerSVGElement||n;if(e.createSVGPoint){var r=e.createSVGPoint();if(0>_a&&(Wo.scrollX||Wo.scrollY)){e=Zo.select("body").append("svg").style({position:"absolute",top:0,left:0,margin:0,padding:0,border:"none"},"important");var u=e[0][0].getScreenCTM();_a=!(u.f||u.e),e.remove()}return _a?(r.x=t.pageX,r.y=t.pageY):(r.x=t.clientX,r.y=t.clientY),r=r.matrixTransform(n.getScreenCTM().inverse()),[r.x,r.y]}var i=n.getBoundingClientRect();return[t.clientX-i.left-n.clientLeft,t.clientY-i.top-n.clientTop]}function V(){return Zo.event.changedTouches[0].identifier}function X(){return 
Zo.event.target}function $(){return Wo}function B(n){return n>0?1:0>n?-1:0}function W(n,t,e){return(t[0]-n[0])*(e[1]-n[1])-(t[1]-n[1])*(e[0]-n[0])}function J(n){return n>1?0:-1>n?ba:Math.acos(n)}function G(n){return n>1?Sa:-1>n?-Sa:Math.asin(n)}function K(n){return((n=Math.exp(n))-1/n)/2}function Q(n){return((n=Math.exp(n))+1/n)/2}function nt(n){return((n=Math.exp(2*n))-1)/(n+1)}function tt(n){return(n=Math.sin(n/2))*n}function et(){}function rt(n,t,e){return this instanceof rt?(this.h=+n,this.s=+t,void(this.l=+e)):arguments.length<2?n instanceof rt?new rt(n.h,n.s,n.l):mt(""+n,yt,rt):new rt(n,t,e)}function ut(n,t,e){function r(n){return n>360?n-=360:0>n&&(n+=360),60>n?i+(o-i)*n/60:180>n?o:240>n?i+(o-i)*(240-n)/60:i}function u(n){return Math.round(255*r(n))}var i,o;return n=isNaN(n)?0:(n%=360)<0?n+360:n,t=isNaN(t)?0:0>t?0:t>1?1:t,e=0>e?0:e>1?1:e,o=.5>=e?e*(1+t):e+t-e*t,i=2*e-o,new gt(u(n+120),u(n),u(n-120))}function it(n,t,e){return this instanceof it?(this.h=+n,this.c=+t,void(this.l=+e)):arguments.length<2?n instanceof it?new it(n.h,n.c,n.l):n instanceof at?st(n.l,n.a,n.b):st((n=xt((n=Zo.rgb(n)).r,n.g,n.b)).l,n.a,n.b):new it(n,t,e)}function ot(n,t,e){return isNaN(n)&&(n=0),isNaN(t)&&(t=0),new at(e,Math.cos(n*=Aa)*t,Math.sin(n)*t)}function at(n,t,e){return this instanceof at?(this.l=+n,this.a=+t,void(this.b=+e)):arguments.length<2?n instanceof at?new at(n.l,n.a,n.b):n instanceof it?ot(n.l,n.c,n.h):xt((n=gt(n)).r,n.g,n.b):new at(n,t,e)}function ct(n,t,e){var r=(n+16)/116,u=r+t/500,i=r-e/200;return u=lt(u)*ja,r=lt(r)*Ha,i=lt(i)*Fa,new gt(ht(3.2404542*u-1.5371385*r-.4985314*i),ht(-.969266*u+1.8760108*r+.041556*i),ht(.0556434*u-.2040259*r+1.0572252*i))}function st(n,t,e){return n>0?new it(Math.atan2(e,t)*Ca,Math.sqrt(t*t+e*e),n):new it(0/0,0/0,n)}function lt(n){return n>.206893034?n*n*n:(n-4/29)/7.787037}function ft(n){return n>.008856?Math.pow(n,1/3):7.787037*n+4/29}function ht(n){return Math.round(255*(.00304>=n?12.92*n:1.055*Math.pow(n,1/2.4)-.055))}function 
gt(n,t,e){return this instanceof gt?(this.r=~~n,this.g=~~t,void(this.b=~~e)):arguments.length<2?n instanceof gt?new gt(n.r,n.g,n.b):mt(""+n,gt,ut):new gt(n,t,e)}function pt(n){return new gt(n>>16,255&n>>8,255&n)}function vt(n){return pt(n)+""}function dt(n){return 16>n?"0"+Math.max(0,n).toString(16):Math.min(255,n).toString(16)}function mt(n,t,e){var r,u,i,o=0,a=0,c=0;if(r=/([a-z]+)\((.*)\)/i.exec(n))switch(u=r[2].split(","),r[1]){case"hsl":return e(parseFloat(u[0]),parseFloat(u[1])/100,parseFloat(u[2])/100);case"rgb":return t(_t(u[0]),_t(u[1]),_t(u[2]))}return(i=Ia.get(n))?t(i.r,i.g,i.b):(null==n||"#"!==n.charAt(0)||isNaN(i=parseInt(n.substring(1),16))||(4===n.length?(o=(3840&i)>>4,o=o>>4|o,a=240&i,a=a>>4|a,c=15&i,c=c<<4|c):7===n.length&&(o=(16711680&i)>>16,a=(65280&i)>>8,c=255&i)),t(o,a,c))}function yt(n,t,e){var r,u,i=Math.min(n/=255,t/=255,e/=255),o=Math.max(n,t,e),a=o-i,c=(o+i)/2;return a?(u=.5>c?a/(o+i):a/(2-o-i),r=n==o?(t-e)/a+(e>t?6:0):t==o?(e-n)/a+2:(n-t)/a+4,r*=60):(r=0/0,u=c>0&&1>c?0:r),new rt(r,u,c)}function xt(n,t,e){n=Mt(n),t=Mt(t),e=Mt(e);var r=ft((.4124564*n+.3575761*t+.1804375*e)/ja),u=ft((.2126729*n+.7151522*t+.072175*e)/Ha),i=ft((.0193339*n+.119192*t+.9503041*e)/Fa);return at(116*u-16,500*(r-u),200*(u-i))}function Mt(n){return(n/=255)<=.04045?n/12.92:Math.pow((n+.055)/1.055,2.4)}function _t(n){var t=parseFloat(n);return"%"===n.charAt(n.length-1)?Math.round(2.55*t):t}function bt(n){return"function"==typeof n?n:function(){return n}}function wt(n){return n}function St(n){return function(t,e,r){return 2===arguments.length&&"function"==typeof e&&(r=e,e=null),kt(t,e,n,r)}}function kt(n,t,e,r){function u(){var n,t=c.status;if(!t&&c.responseText||t>=200&&300>t||304===t){try{n=e.call(i,c)}catch(r){return o.error.call(i,r),void 0}o.load.call(i,n)}else o.error.call(i,c)}var i={},o=Zo.dispatch("beforesend","progress","load","error"),a={},c=new XMLHttpRequest,s=null;return!Wo.XDomainRequest||"withCredentials"in c||!/^(http(s)?:)?\/\//.test(n)||(c=new 
XDomainRequest),"onload"in c?c.onload=c.onerror=u:c.onreadystatechange=function(){c.readyState>3&&u()},c.onprogress=function(n){var t=Zo.event;Zo.event=n;try{o.progress.call(i,c)}finally{Zo.event=t}},i.header=function(n,t){return n=(n+"").toLowerCase(),arguments.length<2?a[n]:(null==t?delete a[n]:a[n]=t+"",i)},i.mimeType=function(n){return arguments.length?(t=null==n?null:n+"",i):t},i.responseType=function(n){return arguments.length?(s=n,i):s},i.response=function(n){return e=n,i},["get","post"].forEach(function(n){i[n]=function(){return i.send.apply(i,[n].concat(Xo(arguments)))}}),i.send=function(e,r,u){if(2===arguments.length&&"function"==typeof r&&(u=r,r=null),c.open(e,n,!0),null==t||"accept"in a||(a.accept=t+",*/*"),c.setRequestHeader)for(var l in a)c.setRequestHeader(l,a[l]);return null!=t&&c.overrideMimeType&&c.overrideMimeType(t),null!=s&&(c.responseType=s),null!=u&&i.on("error",u).on("load",function(n){u(null,n)}),o.beforesend.call(i,c),c.send(null==r?null:r),i},i.abort=function(){return c.abort(),i},Zo.rebind(i,o,"on"),null==r?i:i.get(Et(r))}function Et(n){return 1===n.length?function(t,e){n(null==t?e:null)}:n}function At(){var n=Ct(),t=Nt()-n;t>24?(isFinite(t)&&(clearTimeout($a),$a=setTimeout(At,t)),Xa=0):(Xa=1,Wa(At))}function Ct(){var n=Date.now();for(Ba=Za;Ba;)n>=Ba.t&&(Ba.f=Ba.c(n-Ba.t)),Ba=Ba.n;return n}function Nt(){for(var n,t=Za,e=1/0;t;)t.f?t=n?n.n=t.n:Za=t.n:(t.t<e&&(e=t.t),t=(n=t).n);return Va=n,e}function zt(n,t){return t-(n?Math.ceil(Math.log(n)/Math.LN10):1)}function Lt(n,t){var e=Math.pow(10,3*ua(8-t));return{scale:t>8?function(n){return n/e}:function(n){return n*e},symbol:n}}function Tt(n){var t=n.decimal,e=n.thousands,r=n.grouping,u=n.currency,i=r?function(n){for(var t=n.length,u=[],i=0,o=r[0];t>0&&o>0;)u.push(n.substring(t-=o,t+o)),o=r[i=(i+1)%r.length];return u.reverse().join(e)}:wt;return function(n){var e=Ga.exec(n),r=e[1]||" 
",o=e[2]||">",a=e[3]||"",c=e[4]||"",s=e[5],l=+e[6],f=e[7],h=e[8],g=e[9],p=1,v="",d="",m=!1;switch(h&&(h=+h.substring(1)),(s||"0"===r&&"="===o)&&(s=r="0",o="=",f&&(l-=Math.floor((l-1)/4))),g){case"n":f=!0,g="g";break;case"%":p=100,d="%",g="f";break;case"p":p=100,d="%",g="r";break;case"b":case"o":case"x":case"X":"#"===c&&(v="0"+g.toLowerCase());case"c":case"d":m=!0,h=0;break;case"s":p=-1,g="r"}"$"===c&&(v=u[0],d=u[1]),"r"!=g||h||(g="g"),null!=h&&("g"==g?h=Math.max(1,Math.min(21,h)):("e"==g||"f"==g)&&(h=Math.max(0,Math.min(20,h)))),g=Ka.get(g)||qt;var y=s&&f;return function(n){var e=d;if(m&&n%1)return"";var u=0>n||0===n&&0>1/n?(n=-n,"-"):a;if(0>p){var c=Zo.formatPrefix(n,h);n=c.scale(n),e=c.symbol+d}else n*=p;n=g(n,h);var x=n.lastIndexOf("."),M=0>x?n:n.substring(0,x),_=0>x?"":t+n.substring(x+1);!s&&f&&(M=i(M));var b=v.length+M.length+_.length+(y?0:u.length),w=l>b?new Array(b=l-b+1).join(r):"";return y&&(M=i(w+M)),u+=v,n=M+_,("<"===o?u+n+w:">"===o?w+u+n:"^"===o?w.substring(0,b>>=1)+u+n+w.substring(b):u+(y?n:w+n))+e}}}function qt(n){return n+""}function Rt(){this._=new Date(arguments.length>1?Date.UTC.apply(this,arguments):arguments[0])}function Dt(n,t,e){function r(t){var e=n(t),r=i(e,1);return r-t>t-e?e:r}function u(e){return t(e=n(new nc(e-1)),1),e}function i(n,e){return t(n=new nc(+n),e),n}function o(n,r,i){var o=u(n),a=[];if(i>1)for(;r>o;)e(o)%i||a.push(new Date(+o)),t(o,1);else for(;r>o;)a.push(new Date(+o)),t(o,1);return a}function a(n,t,e){try{nc=Rt;var r=new Rt;return r._=n,o(r,t,e)}finally{nc=Date}}n.floor=n,n.round=r,n.ceil=u,n.offset=i,n.range=o;var c=n.utc=Pt(n);return c.floor=c,c.round=Pt(r),c.ceil=Pt(u),c.offset=Pt(i),c.range=a,n}function Pt(n){return function(t,e){try{nc=Rt;var r=new Rt;return r._=t,n(r,e)._}finally{nc=Date}}}function Ut(n){function t(n){function t(t){for(var e,u,i,o=[],a=-1,c=0;++a<r;)37===n.charCodeAt(a)&&(o.push(n.substring(c,a)),null!=(u=ec[e=n.charAt(++a)])&&(e=n.charAt(++a)),(i=C[e])&&(e=i(t,null==u?"e"===e?" 
":"0":u)),o.push(e),c=a+1);return o.push(n.substring(c,a)),o.join("")}var r=n.length;return t.parse=function(t){var r={y:1900,m:0,d:1,H:0,M:0,S:0,L:0,Z:null},u=e(r,n,t,0);if(u!=t.length)return null;"p"in r&&(r.H=r.H%12+12*r.p);var i=null!=r.Z&&nc!==Rt,o=new(i?Rt:nc);return"j"in r?o.setFullYear(r.y,0,r.j):"w"in r&&("W"in r||"U"in r)?(o.setFullYear(r.y,0,1),o.setFullYear(r.y,0,"W"in r?(r.w+6)%7+7*r.W-(o.getDay()+5)%7:r.w+7*r.U-(o.getDay()+6)%7)):o.setFullYear(r.y,r.m,r.d),o.setHours(r.H+Math.floor(r.Z/100),r.M+r.Z%100,r.S,r.L),i?o._:o},t.toString=function(){return n},t}function e(n,t,e,r){for(var u,i,o,a=0,c=t.length,s=e.length;c>a;){if(r>=s)return-1;if(u=t.charCodeAt(a++),37===u){if(o=t.charAt(a++),i=N[o in ec?t.charAt(a++):o],!i||(r=i(n,e,r))<0)return-1}else if(u!=e.charCodeAt(r++))return-1}return r}function r(n,t,e){b.lastIndex=0;var r=b.exec(t.substring(e));return r?(n.w=w.get(r[0].toLowerCase()),e+r[0].length):-1}function u(n,t,e){M.lastIndex=0;var r=M.exec(t.substring(e));return r?(n.w=_.get(r[0].toLowerCase()),e+r[0].length):-1}function i(n,t,e){E.lastIndex=0;var r=E.exec(t.substring(e));return r?(n.m=A.get(r[0].toLowerCase()),e+r[0].length):-1}function o(n,t,e){S.lastIndex=0;var r=S.exec(t.substring(e));return r?(n.m=k.get(r[0].toLowerCase()),e+r[0].length):-1}function a(n,t,r){return e(n,C.c.toString(),t,r)}function c(n,t,r){return e(n,C.x.toString(),t,r)}function s(n,t,r){return e(n,C.X.toString(),t,r)}function l(n,t,e){var r=x.get(t.substring(e,e+=2).toLowerCase());return null==r?-1:(n.p=r,e)}var f=n.dateTime,h=n.date,g=n.time,p=n.periods,v=n.days,d=n.shortDays,m=n.months,y=n.shortMonths;t.utc=function(n){function e(n){try{nc=Rt;var t=new nc;return t._=n,r(t)}finally{nc=Date}}var r=t(n);return e.parse=function(n){try{nc=Rt;var t=r.parse(n);return t&&t._}finally{nc=Date}},e.toString=r.toString,e},t.multi=t.utc.multi=re;var x=Zo.map(),M=Ht(v),_=Ft(v),b=Ht(d),w=Ft(d),S=Ht(m),k=Ft(m),E=Ht(y),A=Ft(y);p.forEach(function(n,t){x.set(n.toLowerCase(),t)});var 
C={a:function(n){return d[n.getDay()]},A:function(n){return v[n.getDay()]},b:function(n){return y[n.getMonth()]},B:function(n){return m[n.getMonth()]},c:t(f),d:function(n,t){return jt(n.getDate(),t,2)},e:function(n,t){return jt(n.getDate(),t,2)},H:function(n,t){return jt(n.getHours(),t,2)},I:function(n,t){return jt(n.getHours()%12||12,t,2)},j:function(n,t){return jt(1+Qa.dayOfYear(n),t,3)},L:function(n,t){return jt(n.getMilliseconds(),t,3)},m:function(n,t){return jt(n.getMonth()+1,t,2)},M:function(n,t){return jt(n.getMinutes(),t,2)},p:function(n){return p[+(n.getHours()>=12)]},S:function(n,t){return jt(n.getSeconds(),t,2)},U:function(n,t){return jt(Qa.sundayOfYear(n),t,2)},w:function(n){return n.getDay()},W:function(n,t){return jt(Qa.mondayOfYear(n),t,2)},x:t(h),X:t(g),y:function(n,t){return jt(n.getFullYear()%100,t,2)},Y:function(n,t){return jt(n.getFullYear()%1e4,t,4)},Z:te,"%":function(){return"%"}},N={a:r,A:u,b:i,B:o,c:a,d:Wt,e:Wt,H:Gt,I:Gt,j:Jt,L:ne,m:Bt,M:Kt,p:l,S:Qt,U:Yt,w:Ot,W:It,x:c,X:s,y:Vt,Y:Zt,Z:Xt,"%":ee};return t}function jt(n,t,e){var r=0>n?"-":"",u=(r?-n:n)+"",i=u.length;return r+(e>i?new Array(e-i+1).join(t)+u:u)}function Ht(n){return new RegExp("^(?:"+n.map(Zo.requote).join("|")+")","i")}function Ft(n){for(var t=new o,e=-1,r=n.length;++e<r;)t.set(n[e].toLowerCase(),e);return t}function Ot(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+1));return r?(n.w=+r[0],e+r[0].length):-1}function Yt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e));return r?(n.U=+r[0],e+r[0].length):-1}function It(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e));return r?(n.W=+r[0],e+r[0].length):-1}function Zt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+4));return r?(n.y=+r[0],e+r[0].length):-1}function Vt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.y=$t(+r[0]),e+r[0].length):-1}function Xt(n,t,e){return/^[+-]\d{4}$/.test(t=t.substring(e,e+5))?(n.Z=-t,e+5):-1}function $t(n){return n+(n>68?1900:2e3)}function 
Bt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.m=r[0]-1,e+r[0].length):-1}function Wt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.d=+r[0],e+r[0].length):-1}function Jt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+3));return r?(n.j=+r[0],e+r[0].length):-1}function Gt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.H=+r[0],e+r[0].length):-1}function Kt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.M=+r[0],e+r[0].length):-1}function Qt(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+2));return r?(n.S=+r[0],e+r[0].length):-1}function ne(n,t,e){rc.lastIndex=0;var r=rc.exec(t.substring(e,e+3));return r?(n.L=+r[0],e+r[0].length):-1}function te(n){var t=n.getTimezoneOffset(),e=t>0?"-":"+",r=~~(ua(t)/60),u=ua(t)%60;return e+jt(r,"0",2)+jt(u,"0",2)}function ee(n,t,e){uc.lastIndex=0;var r=uc.exec(t.substring(e,e+1));return r?e+r[0].length:-1}function re(n){for(var t=n.length,e=-1;++e<t;)n[e][0]=this(n[e][0]);return function(t){for(var e=0,r=n[e];!r[1](t);)r=n[++e];return r[0](t)}}function ue(){}function ie(n,t,e){var r=e.s=n+t,u=r-n,i=r-u;e.t=n-i+(t-u)}function oe(n,t){n&&cc.hasOwnProperty(n.type)&&cc[n.type](n,t)}function ae(n,t,e){var r,u=-1,i=n.length-e;for(t.lineStart();++u<i;)r=n[u],t.point(r[0],r[1],r[2]);t.lineEnd()}function ce(n,t){var e=-1,r=n.length;for(t.polygonStart();++e<r;)ae(n[e],t,1);t.polygonEnd()}function se(){function n(n,t){n*=Aa,t=t*Aa/2+ba/4;var e=n-r,o=e>=0?1:-1,a=o*e,c=Math.cos(t),s=Math.sin(t),l=i*s,f=u*c+l*Math.cos(a),h=l*o*Math.sin(a);lc.add(Math.atan2(h,f)),r=n,u=c,i=s}var t,e,r,u,i;fc.point=function(o,a){fc.point=n,r=(t=o)*Aa,u=Math.cos(a=(e=a)*Aa/2+ba/4),i=Math.sin(a)},fc.lineEnd=function(){n(t,e)}}function le(n){var t=n[0],e=n[1],r=Math.cos(e);return[r*Math.cos(t),r*Math.sin(t),Math.sin(e)]}function fe(n,t){return n[0]*t[0]+n[1]*t[1]+n[2]*t[2]}function he(n,t){return[n[1]*t[2]-n[2]*t[1],n[2]*t[0]-n[0]*t[2],n[0]*t[1]-n[1]*t[0]]}function 
ge(n,t){n[0]+=t[0],n[1]+=t[1],n[2]+=t[2]}function pe(n,t){return[n[0]*t,n[1]*t,n[2]*t]}function ve(n){var t=Math.sqrt(n[0]*n[0]+n[1]*n[1]+n[2]*n[2]);n[0]/=t,n[1]/=t,n[2]/=t}function de(n){return[Math.atan2(n[1],n[0]),G(n[2])]}function me(n,t){return ua(n[0]-t[0])<ka&&ua(n[1]-t[1])<ka}function ye(n,t){n*=Aa;var e=Math.cos(t*=Aa);xe(e*Math.cos(n),e*Math.sin(n),Math.sin(t))}function xe(n,t,e){++hc,pc+=(n-pc)/hc,vc+=(t-vc)/hc,dc+=(e-dc)/hc}function Me(){function n(n,u){n*=Aa;var i=Math.cos(u*=Aa),o=i*Math.cos(n),a=i*Math.sin(n),c=Math.sin(u),s=Math.atan2(Math.sqrt((s=e*c-r*a)*s+(s=r*o-t*c)*s+(s=t*a-e*o)*s),t*o+e*a+r*c);gc+=s,mc+=s*(t+(t=o)),yc+=s*(e+(e=a)),xc+=s*(r+(r=c)),xe(t,e,r)}var t,e,r;wc.point=function(u,i){u*=Aa;var o=Math.cos(i*=Aa);t=o*Math.cos(u),e=o*Math.sin(u),r=Math.sin(i),wc.point=n,xe(t,e,r)}}function _e(){wc.point=ye}function be(){function n(n,t){n*=Aa;var e=Math.cos(t*=Aa),o=e*Math.cos(n),a=e*Math.sin(n),c=Math.sin(t),s=u*c-i*a,l=i*o-r*c,f=r*a-u*o,h=Math.sqrt(s*s+l*l+f*f),g=r*o+u*a+i*c,p=h&&-J(g)/h,v=Math.atan2(h,g);Mc+=p*s,_c+=p*l,bc+=p*f,gc+=v,mc+=v*(r+(r=o)),yc+=v*(u+(u=a)),xc+=v*(i+(i=c)),xe(r,u,i)}var t,e,r,u,i;wc.point=function(o,a){t=o,e=a,wc.point=n,o*=Aa;var c=Math.cos(a*=Aa);r=c*Math.cos(o),u=c*Math.sin(o),i=Math.sin(a),xe(r,u,i)},wc.lineEnd=function(){n(t,e),wc.lineEnd=_e,wc.point=ye}}function we(){return!0}function Se(n,t,e,r,u){var i=[],o=[];if(n.forEach(function(n){if(!((t=n.length-1)<=0)){var t,e=n[0],r=n[t];if(me(e,r)){u.lineStart();for(var a=0;t>a;++a)u.point((e=n[a])[0],e[1]);return u.lineEnd(),void 0}var c=new Ee(e,n,null,!0),s=new Ee(e,null,c,!1);c.o=s,i.push(c),o.push(s),c=new Ee(r,n,null,!1),s=new Ee(r,null,c,!0),c.o=s,i.push(c),o.push(s)}}),o.sort(t),ke(i),ke(o),i.length){for(var a=0,c=e,s=o.length;s>a;++a)o[a].e=c=!c;for(var l,f,h=i[0];;){for(var g=h,p=!0;g.v;)if((g=g.n)===h)return;l=g.z,u.lineStart();do{if(g.v=g.o.v=!0,g.e){if(p)for(var a=0,s=l.length;s>a;++a)u.point((f=l[a])[0],f[1]);else 
r(g.x,g.n.x,1,u);g=g.n}else{if(p){l=g.p.z;for(var a=l.length-1;a>=0;--a)u.point((f=l[a])[0],f[1])}else r(g.x,g.p.x,-1,u);g=g.p}g=g.o,l=g.z,p=!p}while(!g.v);u.lineEnd()}}}function ke(n){if(t=n.length){for(var t,e,r=0,u=n[0];++r<t;)u.n=e=n[r],e.p=u,u=e;u.n=e=n[0],e.p=u}}function Ee(n,t,e,r){this.x=n,this.z=t,this.o=e,this.e=r,this.v=!1,this.n=this.p=null}function Ae(n,t,e,r){return function(u,i){function o(t,e){var r=u(t,e);n(t=r[0],e=r[1])&&i.point(t,e)}function a(n,t){var e=u(n,t);d.point(e[0],e[1])}function c(){y.point=a,d.lineStart()}function s(){y.point=o,d.lineEnd()}function l(n,t){v.push([n,t]);var e=u(n,t);M.point(e[0],e[1])}function f(){M.lineStart(),v=[]}function h(){l(v[0][0],v[0][1]),M.lineEnd();var n,t=M.clean(),e=x.buffer(),r=e.length;if(v.pop(),p.push(v),v=null,r)if(1&t){n=e[0];var u,r=n.length-1,o=-1;if(r>0){for(_||(i.polygonStart(),_=!0),i.lineStart();++o<r;)i.point((u=n[o])[0],u[1]);i.lineEnd()}}else r>1&&2&t&&e.push(e.pop().concat(e.shift())),g.push(e.filter(Ce))}var g,p,v,d=t(i),m=u.invert(r[0],r[1]),y={point:o,lineStart:c,lineEnd:s,polygonStart:function(){y.point=l,y.lineStart=f,y.lineEnd=h,g=[],p=[]},polygonEnd:function(){y.point=o,y.lineStart=c,y.lineEnd=s,g=Zo.merge(g);var n=Le(m,p);g.length?(_||(i.polygonStart(),_=!0),Se(g,ze,n,e,i)):n&&(_||(i.polygonStart(),_=!0),i.lineStart(),e(null,null,1,i),i.lineEnd()),_&&(i.polygonEnd(),_=!1),g=p=null},sphere:function(){i.polygonStart(),i.lineStart(),e(null,null,1,i),i.lineEnd(),i.polygonEnd()}},x=Ne(),M=t(x),_=!1;return y}}function Ce(n){return n.length>1}function Ne(){var n,t=[];return{lineStart:function(){t.push(n=[])},point:function(t,e){n.push([t,e])},lineEnd:v,buffer:function(){var e=t;return t=[],n=null,e},rejoin:function(){t.length>1&&t.push(t.pop().concat(t.shift()))}}}function ze(n,t){return((n=n.x)[0]<0?n[1]-Sa-ka:Sa-n[1])-((t=t.x)[0]<0?t[1]-Sa-ka:Sa-t[1])}function Le(n,t){var e=n[0],r=n[1],u=[Math.sin(e),-Math.cos(e),0],i=0,o=0;lc.reset();for(var a=0,c=t.length;c>a;++a){var 
s=t[a],l=s.length;if(l)for(var f=s[0],h=f[0],g=f[1]/2+ba/4,p=Math.sin(g),v=Math.cos(g),d=1;;){d===l&&(d=0),n=s[d];var m=n[0],y=n[1]/2+ba/4,x=Math.sin(y),M=Math.cos(y),_=m-h,b=_>=0?1:-1,w=b*_,S=w>ba,k=p*x;if(lc.add(Math.atan2(k*b*Math.sin(w),v*M+k*Math.cos(w))),i+=S?_+b*wa:_,S^h>=e^m>=e){var E=he(le(f),le(n));ve(E);var A=he(u,E);ve(A);var C=(S^_>=0?-1:1)*G(A[2]);(r>C||r===C&&(E[0]||E[1]))&&(o+=S^_>=0?1:-1)}if(!d++)break;h=m,p=x,v=M,f=n}}return(-ka>i||ka>i&&0>lc)^1&o}function Te(n){var t,e=0/0,r=0/0,u=0/0;return{lineStart:function(){n.lineStart(),t=1},point:function(i,o){var a=i>0?ba:-ba,c=ua(i-e);ua(c-ba)<ka?(n.point(e,r=(r+o)/2>0?Sa:-Sa),n.point(u,r),n.lineEnd(),n.lineStart(),n.point(a,r),n.point(i,r),t=0):u!==a&&c>=ba&&(ua(e-u)<ka&&(e-=u*ka),ua(i-a)<ka&&(i-=a*ka),r=qe(e,r,i,o),n.point(u,r),n.lineEnd(),n.lineStart(),n.point(a,r),t=0),n.point(e=i,r=o),u=a},lineEnd:function(){n.lineEnd(),e=r=0/0},clean:function(){return 2-t}}}function qe(n,t,e,r){var u,i,o=Math.sin(n-e);return ua(o)>ka?Math.atan((Math.sin(t)*(i=Math.cos(r))*Math.sin(e)-Math.sin(r)*(u=Math.cos(t))*Math.sin(n))/(u*i*o)):(t+r)/2}function Re(n,t,e,r){var u;if(null==n)u=e*Sa,r.point(-ba,u),r.point(0,u),r.point(ba,u),r.point(ba,0),r.point(ba,-u),r.point(0,-u),r.point(-ba,-u),r.point(-ba,0),r.point(-ba,u);else if(ua(n[0]-t[0])>ka){var i=n[0]<t[0]?ba:-ba;u=e*i/2,r.point(-i,u),r.point(0,u),r.point(i,u)}else r.point(t[0],t[1])}function De(n){function t(n,t){return Math.cos(n)*Math.cos(t)>i}function e(n){var e,i,c,s,l;return{lineStart:function(){s=c=!1,l=1},point:function(f,h){var g,p=[f,h],v=t(f,h),d=o?v?0:u(f,h):v?u(f+(0>f?ba:-ba),h):0;if(!e&&(s=c=v)&&n.lineStart(),v!==c&&(g=r(e,p),(me(e,g)||me(p,g))&&(p[0]+=ka,p[1]+=ka,v=t(p[0],p[1]))),v!==c)l=0,v?(n.lineStart(),g=r(p,e),n.point(g[0],g[1])):(g=r(e,p),n.point(g[0],g[1]),n.lineEnd()),e=g;else if(a&&e&&o^v){var 
m;d&i||!(m=r(p,e,!0))||(l=0,o?(n.lineStart(),n.point(m[0][0],m[0][1]),n.point(m[1][0],m[1][1]),n.lineEnd()):(n.point(m[1][0],m[1][1]),n.lineEnd(),n.lineStart(),n.point(m[0][0],m[0][1])))}!v||e&&me(e,p)||n.point(p[0],p[1]),e=p,c=v,i=d},lineEnd:function(){c&&n.lineEnd(),e=null},clean:function(){return l|(s&&c)<<1}}}function r(n,t,e){var r=le(n),u=le(t),o=[1,0,0],a=he(r,u),c=fe(a,a),s=a[0],l=c-s*s;if(!l)return!e&&n;var f=i*c/l,h=-i*s/l,g=he(o,a),p=pe(o,f),v=pe(a,h);ge(p,v);var d=g,m=fe(p,d),y=fe(d,d),x=m*m-y*(fe(p,p)-1);if(!(0>x)){var M=Math.sqrt(x),_=pe(d,(-m-M)/y);if(ge(_,p),_=de(_),!e)return _;var b,w=n[0],S=t[0],k=n[1],E=t[1];w>S&&(b=w,w=S,S=b);var A=S-w,C=ua(A-ba)<ka,N=C||ka>A;if(!C&&k>E&&(b=k,k=E,E=b),N?C?k+E>0^_[1]<(ua(_[0]-w)<ka?k:E):k<=_[1]&&_[1]<=E:A>ba^(w<=_[0]&&_[0]<=S)){var z=pe(d,(-m+M)/y);return ge(z,p),[_,de(z)]}}}function u(t,e){var r=o?n:ba-n,u=0;return-r>t?u|=1:t>r&&(u|=2),-r>e?u|=4:e>r&&(u|=8),u}var i=Math.cos(n),o=i>0,a=ua(i)>ka,c=sr(n,6*Aa);return Ae(t,e,c,o?[0,-n]:[-ba,n-ba])}function Pe(n,t,e,r){return function(u){var i,o=u.a,a=u.b,c=o.x,s=o.y,l=a.x,f=a.y,h=0,g=1,p=l-c,v=f-s;if(i=n-c,p||!(i>0)){if(i/=p,0>p){if(h>i)return;g>i&&(g=i)}else if(p>0){if(i>g)return;i>h&&(h=i)}if(i=e-c,p||!(0>i)){if(i/=p,0>p){if(i>g)return;i>h&&(h=i)}else if(p>0){if(h>i)return;g>i&&(g=i)}if(i=t-s,v||!(i>0)){if(i/=v,0>v){if(h>i)return;g>i&&(g=i)}else if(v>0){if(i>g)return;i>h&&(h=i)}if(i=r-s,v||!(0>i)){if(i/=v,0>v){if(i>g)return;i>h&&(h=i)}else if(v>0){if(h>i)return;g>i&&(g=i)}return h>0&&(u.a={x:c+h*p,y:s+h*v}),1>g&&(u.b={x:c+g*p,y:s+g*v}),u}}}}}}function Ue(n,t,e,r){function u(r,u){return ua(r[0]-n)<ka?u>0?0:3:ua(r[0]-e)<ka?u>0?2:1:ua(r[1]-t)<ka?u>0?1:0:u>0?3:2}function i(n,t){return o(n.x,t.x)}function o(n,t){var e=u(n,1),r=u(t,1);return e!==r?e-r:0===e?t[1]-n[1]:1===e?n[0]-t[0]:2===e?n[1]-t[1]:t[0]-n[0]}return function(a){function c(n){for(var t=0,e=d.length,r=n[1],u=0;e>u;++u)for(var 
i,o=1,a=d[u],c=a.length,s=a[0];c>o;++o)i=a[o],s[1]<=r?i[1]>r&&W(s,i,n)>0&&++t:i[1]<=r&&W(s,i,n)<0&&--t,s=i;return 0!==t}function s(i,a,c,s){var l=0,f=0;if(null==i||(l=u(i,c))!==(f=u(a,c))||o(i,a)<0^c>0){do s.point(0===l||3===l?n:e,l>1?r:t);while((l=(l+c+4)%4)!==f)}else s.point(a[0],a[1])}function l(u,i){return u>=n&&e>=u&&i>=t&&r>=i}function f(n,t){l(n,t)&&a.point(n,t)}function h(){N.point=p,d&&d.push(m=[]),S=!0,w=!1,_=b=0/0}function g(){v&&(p(y,x),M&&w&&A.rejoin(),v.push(A.buffer())),N.point=f,w&&a.lineEnd()}function p(n,t){n=Math.max(-kc,Math.min(kc,n)),t=Math.max(-kc,Math.min(kc,t));var e=l(n,t);if(d&&m.push([n,t]),S)y=n,x=t,M=e,S=!1,e&&(a.lineStart(),a.point(n,t));else if(e&&w)a.point(n,t);else{var r={a:{x:_,y:b},b:{x:n,y:t}};C(r)?(w||(a.lineStart(),a.point(r.a.x,r.a.y)),a.point(r.b.x,r.b.y),e||a.lineEnd(),k=!1):e&&(a.lineStart(),a.point(n,t),k=!1)}_=n,b=t,w=e}var v,d,m,y,x,M,_,b,w,S,k,E=a,A=Ne(),C=Pe(n,t,e,r),N={point:f,lineStart:h,lineEnd:g,polygonStart:function(){a=A,v=[],d=[],k=!0},polygonEnd:function(){a=E,v=Zo.merge(v);var t=c([n,r]),e=k&&t,u=v.length;(e||u)&&(a.polygonStart(),e&&(a.lineStart(),s(null,null,1,a),a.lineEnd()),u&&Se(v,i,t,s,a),a.polygonEnd()),v=d=m=null}};return N}}function je(n,t){function e(e,r){return e=n(e,r),t(e[0],e[1])}return n.invert&&t.invert&&(e.invert=function(e,r){return e=t.invert(e,r),e&&n.invert(e[0],e[1])}),e}function He(n){var t=0,e=ba/3,r=tr(n),u=r(t,e);return u.parallels=function(n){return arguments.length?r(t=n[0]*ba/180,e=n[1]*ba/180):[180*(t/ba),180*(e/ba)]},u}function Fe(n,t){function e(n,t){var e=Math.sqrt(i-2*u*Math.sin(t))/u;return[e*Math.sin(n*=u),o-e*Math.cos(n)]}var r=Math.sin(n),u=(r+Math.sin(t))/2,i=1+r*(2*u-r),o=Math.sqrt(i)/u;return e.invert=function(n,t){var e=o-t;return[Math.atan2(n,e)/u,G((i-(n*n+e*e)*u*u)/(2*u))]},e}function Oe(){function n(n,t){Ac+=u*n-r*t,r=n,u=t}var t,e,r,u;Tc.point=function(i,o){Tc.point=n,t=r=i,e=u=o},Tc.lineEnd=function(){n(t,e)}}function 
Ye(n,t){Cc>n&&(Cc=n),n>zc&&(zc=n),Nc>t&&(Nc=t),t>Lc&&(Lc=t)}function Ie(){function n(n,t){o.push("M",n,",",t,i)}function t(n,t){o.push("M",n,",",t),a.point=e}function e(n,t){o.push("L",n,",",t)}function r(){a.point=n}function u(){o.push("Z")}var i=Ze(4.5),o=[],a={point:n,lineStart:function(){a.point=t},lineEnd:r,polygonStart:function(){a.lineEnd=u},polygonEnd:function(){a.lineEnd=r,a.point=n},pointRadius:function(n){return i=Ze(n),a},result:function(){if(o.length){var n=o.join("");return o=[],n}}};return a}function Ze(n){return"m0,"+n+"a"+n+","+n+" 0 1,1 0,"+-2*n+"a"+n+","+n+" 0 1,1 0,"+2*n+"z"}function Ve(n,t){pc+=n,vc+=t,++dc}function Xe(){function n(n,r){var u=n-t,i=r-e,o=Math.sqrt(u*u+i*i);mc+=o*(t+n)/2,yc+=o*(e+r)/2,xc+=o,Ve(t=n,e=r)}var t,e;Rc.point=function(r,u){Rc.point=n,Ve(t=r,e=u)}}function $e(){Rc.point=Ve}function Be(){function n(n,t){var e=n-r,i=t-u,o=Math.sqrt(e*e+i*i);mc+=o*(r+n)/2,yc+=o*(u+t)/2,xc+=o,o=u*n-r*t,Mc+=o*(r+n),_c+=o*(u+t),bc+=3*o,Ve(r=n,u=t)}var t,e,r,u;Rc.point=function(i,o){Rc.point=n,Ve(t=r=i,e=u=o)},Rc.lineEnd=function(){n(t,e)}}function We(n){function t(t,e){n.moveTo(t,e),n.arc(t,e,o,0,wa)}function e(t,e){n.moveTo(t,e),a.point=r}function r(t,e){n.lineTo(t,e)}function u(){a.point=t}function i(){n.closePath()}var o=4.5,a={point:t,lineStart:function(){a.point=e},lineEnd:u,polygonStart:function(){a.lineEnd=i},polygonEnd:function(){a.lineEnd=u,a.point=t},pointRadius:function(n){return o=n,a},result:v};return a}function Je(n){function t(n){return(a?r:e)(n)}function e(t){return Qe(t,function(e,r){e=n(e,r),t.point(e[0],e[1])})}function r(t){function e(e,r){e=n(e,r),t.point(e[0],e[1])}function r(){x=0/0,S.point=i,t.lineStart()}function i(e,r){var i=le([e,r]),o=n(e,r);u(x,M,y,_,b,w,x=o[0],M=o[1],y=e,_=i[0],b=i[1],w=i[2],a,t),t.point(x,M)}function o(){S.point=e,t.lineEnd()}function c(){r(),S.point=s,S.lineEnd=l}function s(n,t){i(f=n,h=t),g=x,p=M,v=_,d=b,m=w,S.point=i}function l(){u(x,M,y,_,b,w,g,p,f,v,d,m,a,t),S.lineEnd=o,o()}var 
f,h,g,p,v,d,m,y,x,M,_,b,w,S={point:e,lineStart:r,lineEnd:o,polygonStart:function(){t.polygonStart(),S.lineStart=c},polygonEnd:function(){t.polygonEnd(),S.lineStart=r}};return S}function u(t,e,r,a,c,s,l,f,h,g,p,v,d,m){var y=l-t,x=f-e,M=y*y+x*x;if(M>4*i&&d--){var _=a+g,b=c+p,w=s+v,S=Math.sqrt(_*_+b*b+w*w),k=Math.asin(w/=S),E=ua(ua(w)-1)<ka||ua(r-h)<ka?(r+h)/2:Math.atan2(b,_),A=n(E,k),C=A[0],N=A[1],z=C-t,L=N-e,T=x*z-y*L;(T*T/M>i||ua((y*z+x*L)/M-.5)>.3||o>a*g+c*p+s*v)&&(u(t,e,r,a,c,s,C,N,E,_/=S,b/=S,w,d,m),m.point(C,N),u(C,N,E,_,b,w,l,f,h,g,p,v,d,m))}}var i=.5,o=Math.cos(30*Aa),a=16;
+return t.precision=function(n){return arguments.length?(a=(i=n*n)>0&&16,t):Math.sqrt(i)},t}function Ge(n){var t=Je(function(t,e){return n([t*Ca,e*Ca])});return function(n){return er(t(n))}}function Ke(n){this.stream=n}function Qe(n,t){return{point:t,sphere:function(){n.sphere()},lineStart:function(){n.lineStart()},lineEnd:function(){n.lineEnd()},polygonStart:function(){n.polygonStart()},polygonEnd:function(){n.polygonEnd()}}}function nr(n){return tr(function(){return n})()}function tr(n){function t(n){return n=a(n[0]*Aa,n[1]*Aa),[n[0]*h+c,s-n[1]*h]}function e(n){return n=a.invert((n[0]-c)/h,(s-n[1])/h),n&&[n[0]*Ca,n[1]*Ca]}function r(){a=je(o=ir(m,y,x),i);var n=i(v,d);return c=g-n[0]*h,s=p+n[1]*h,u()}function u(){return l&&(l.valid=!1,l=null),t}var i,o,a,c,s,l,f=Je(function(n,t){return n=i(n,t),[n[0]*h+c,s-n[1]*h]}),h=150,g=480,p=250,v=0,d=0,m=0,y=0,x=0,M=Sc,_=wt,b=null,w=null;return t.stream=function(n){return l&&(l.valid=!1),l=er(M(o,f(_(n)))),l.valid=!0,l},t.clipAngle=function(n){return arguments.length?(M=null==n?(b=n,Sc):De((b=+n)*Aa),u()):b},t.clipExtent=function(n){return arguments.length?(w=n,_=n?Ue(n[0][0],n[0][1],n[1][0],n[1][1]):wt,u()):w},t.scale=function(n){return arguments.length?(h=+n,r()):h},t.translate=function(n){return arguments.length?(g=+n[0],p=+n[1],r()):[g,p]},t.center=function(n){return arguments.length?(v=n[0]%360*Aa,d=n[1]%360*Aa,r()):[v*Ca,d*Ca]},t.rotate=function(n){return arguments.length?(m=n[0]%360*Aa,y=n[1]%360*Aa,x=n.length>2?n[2]%360*Aa:0,r()):[m*Ca,y*Ca,x*Ca]},Zo.rebind(t,f,"precision"),function(){return i=n.apply(this,arguments),t.invert=i.invert&&e,r()}}function er(n){return Qe(n,function(t,e){n.point(t*Aa,e*Aa)})}function rr(n,t){return[n,t]}function ur(n,t){return[n>ba?n-wa:-ba>n?n+wa:n,t]}function ir(n,t,e){return n?t||e?je(ar(n),cr(t,e)):ar(n):t||e?cr(t,e):ur}function or(n){return function(t,e){return t+=n,[t>ba?t-wa:-ba>t?t+wa:t,e]}}function ar(n){var t=or(n);return t.invert=or(-n),t}function cr(n,t){function e(n,t){var 
e=Math.cos(t),a=Math.cos(n)*e,c=Math.sin(n)*e,s=Math.sin(t),l=s*r+a*u;return[Math.atan2(c*i-l*o,a*r-s*u),G(l*i+c*o)]}var r=Math.cos(n),u=Math.sin(n),i=Math.cos(t),o=Math.sin(t);return e.invert=function(n,t){var e=Math.cos(t),a=Math.cos(n)*e,c=Math.sin(n)*e,s=Math.sin(t),l=s*i-c*o;return[Math.atan2(c*i+s*o,a*r+l*u),G(l*r-a*u)]},e}function sr(n,t){var e=Math.cos(n),r=Math.sin(n);return function(u,i,o,a){var c=o*t;null!=u?(u=lr(e,u),i=lr(e,i),(o>0?i>u:u>i)&&(u+=o*wa)):(u=n+o*wa,i=n-.5*c);for(var s,l=u;o>0?l>i:i>l;l-=c)a.point((s=de([e,-r*Math.cos(l),-r*Math.sin(l)]))[0],s[1])}}function lr(n,t){var e=le(t);e[0]-=n,ve(e);var r=J(-e[1]);return((-e[2]<0?-r:r)+2*Math.PI-ka)%(2*Math.PI)}function fr(n,t,e){var r=Zo.range(n,t-ka,e).concat(t);return function(n){return r.map(function(t){return[n,t]})}}function hr(n,t,e){var r=Zo.range(n,t-ka,e).concat(t);return function(n){return r.map(function(t){return[t,n]})}}function gr(n){return n.source}function pr(n){return n.target}function vr(n,t,e,r){var u=Math.cos(t),i=Math.sin(t),o=Math.cos(r),a=Math.sin(r),c=u*Math.cos(n),s=u*Math.sin(n),l=o*Math.cos(e),f=o*Math.sin(e),h=2*Math.asin(Math.sqrt(tt(r-t)+u*o*tt(e-n))),g=1/Math.sin(h),p=h?function(n){var t=Math.sin(n*=h)*g,e=Math.sin(h-n)*g,r=e*c+t*l,u=e*s+t*f,o=e*i+t*a;return[Math.atan2(u,r)*Ca,Math.atan2(o,Math.sqrt(r*r+u*u))*Ca]}:function(){return[n*Ca,t*Ca]};return p.distance=h,p}function dr(){function n(n,u){var i=Math.sin(u*=Aa),o=Math.cos(u),a=ua((n*=Aa)-t),c=Math.cos(a);Dc+=Math.atan2(Math.sqrt((a=o*Math.sin(a))*a+(a=r*i-e*o*c)*a),e*i+r*o*c),t=n,e=i,r=o}var t,e,r;Pc.point=function(u,i){t=u*Aa,e=Math.sin(i*=Aa),r=Math.cos(i),Pc.point=n},Pc.lineEnd=function(){Pc.point=Pc.lineEnd=v}}function mr(n,t){function e(t,e){var r=Math.cos(t),u=Math.cos(e),i=n(r*u);return[i*u*Math.sin(t),i*Math.sin(e)]}return e.invert=function(n,e){var r=Math.sqrt(n*n+e*e),u=t(r),i=Math.sin(u),o=Math.cos(u);return[Math.atan2(n*i,r*o),Math.asin(r&&e*i/r)]},e}function yr(n,t){function 
e(n,t){o>0?-Sa+ka>t&&(t=-Sa+ka):t>Sa-ka&&(t=Sa-ka);var e=o/Math.pow(u(t),i);return[e*Math.sin(i*n),o-e*Math.cos(i*n)]}var r=Math.cos(n),u=function(n){return Math.tan(ba/4+n/2)},i=n===t?Math.sin(n):Math.log(r/Math.cos(t))/Math.log(u(t)/u(n)),o=r*Math.pow(u(n),i)/i;return i?(e.invert=function(n,t){var e=o-t,r=B(i)*Math.sqrt(n*n+e*e);return[Math.atan2(n,e)/i,2*Math.atan(Math.pow(o/r,1/i))-Sa]},e):Mr}function xr(n,t){function e(n,t){var e=i-t;return[e*Math.sin(u*n),i-e*Math.cos(u*n)]}var r=Math.cos(n),u=n===t?Math.sin(n):(r-Math.cos(t))/(t-n),i=r/u+n;return ua(u)<ka?rr:(e.invert=function(n,t){var e=i-t;return[Math.atan2(n,e)/u,i-B(u)*Math.sqrt(n*n+e*e)]},e)}function Mr(n,t){return[n,Math.log(Math.tan(ba/4+t/2))]}function _r(n){var t,e=nr(n),r=e.scale,u=e.translate,i=e.clipExtent;return e.scale=function(){var n=r.apply(e,arguments);return n===e?t?e.clipExtent(null):e:n},e.translate=function(){var n=u.apply(e,arguments);return n===e?t?e.clipExtent(null):e:n},e.clipExtent=function(n){var o=i.apply(e,arguments);if(o===e){if(t=null==n){var a=ba*r(),c=u();i([[c[0]-a,c[1]-a],[c[0]+a,c[1]+a]])}}else t&&(o=null);return o},e.clipExtent(null)}function br(n,t){return[Math.log(Math.tan(ba/4+t/2)),-n]}function wr(n){return n[0]}function Sr(n){return n[1]}function kr(n){for(var t=n.length,e=[0,1],r=2,u=2;t>u;u++){for(;r>1&&W(n[e[r-2]],n[e[r-1]],n[u])<=0;)--r;e[r++]=u}return e.slice(0,r)}function Er(n,t){return n[0]-t[0]||n[1]-t[1]}function Ar(n,t,e){return(e[0]-t[0])*(n[1]-t[1])<(e[1]-t[1])*(n[0]-t[0])}function Cr(n,t,e,r){var u=n[0],i=e[0],o=t[0]-u,a=r[0]-i,c=n[1],s=e[1],l=t[1]-c,f=r[1]-s,h=(a*(c-s)-f*(u-i))/(f*o-a*l);return[u+h*o,c+h*l]}function Nr(n){var t=n[0],e=n[n.length-1];return!(t[0]-e[0]||t[1]-e[1])}function zr(){Gr(this),this.edge=this.site=this.circle=null}function Lr(n){var t=Bc.pop()||new zr;return t.site=n,t}function Tr(n){Yr(n),Vc.remove(n),Bc.push(n),Gr(n)}function qr(n){var t=n.circle,e=t.x,r=t.cy,u={x:e,y:r},i=n.P,o=n.N,a=[n];Tr(n);for(var 
c=i;c.circle&&ua(e-c.circle.x)<ka&&ua(r-c.circle.cy)<ka;)i=c.P,a.unshift(c),Tr(c),c=i;a.unshift(c),Yr(c);for(var s=o;s.circle&&ua(e-s.circle.x)<ka&&ua(r-s.circle.cy)<ka;)o=s.N,a.push(s),Tr(s),s=o;a.push(s),Yr(s);var l,f=a.length;for(l=1;f>l;++l)s=a[l],c=a[l-1],Br(s.edge,c.site,s.site,u);c=a[0],s=a[f-1],s.edge=Xr(c.site,s.site,null,u),Or(c),Or(s)}function Rr(n){for(var t,e,r,u,i=n.x,o=n.y,a=Vc._;a;)if(r=Dr(a,o)-i,r>ka)a=a.L;else{if(u=i-Pr(a,o),!(u>ka)){r>-ka?(t=a.P,e=a):u>-ka?(t=a,e=a.N):t=e=a;break}if(!a.R){t=a;break}a=a.R}var c=Lr(n);if(Vc.insert(t,c),t||e){if(t===e)return Yr(t),e=Lr(t.site),Vc.insert(c,e),c.edge=e.edge=Xr(t.site,c.site),Or(t),Or(e),void 0;if(!e)return c.edge=Xr(t.site,c.site),void 0;Yr(t),Yr(e);var s=t.site,l=s.x,f=s.y,h=n.x-l,g=n.y-f,p=e.site,v=p.x-l,d=p.y-f,m=2*(h*d-g*v),y=h*h+g*g,x=v*v+d*d,M={x:(d*y-g*x)/m+l,y:(h*x-v*y)/m+f};Br(e.edge,s,p,M),c.edge=Xr(s,n,null,M),e.edge=Xr(n,p,null,M),Or(t),Or(e)}}function Dr(n,t){var e=n.site,r=e.x,u=e.y,i=u-t;if(!i)return r;var o=n.P;if(!o)return-1/0;e=o.site;var a=e.x,c=e.y,s=c-t;if(!s)return a;var l=a-r,f=1/i-1/s,h=l/s;return f?(-h+Math.sqrt(h*h-2*f*(l*l/(-2*s)-c+s/2+u-i/2)))/f+r:(r+a)/2}function Pr(n,t){var e=n.N;if(e)return Dr(e,t);var r=n.site;return r.y===t?r.x:1/0}function Ur(n){this.site=n,this.edges=[]}function jr(n){for(var t,e,r,u,i,o,a,c,s,l,f=n[0][0],h=n[1][0],g=n[0][1],p=n[1][1],v=Zc,d=v.length;d--;)if(i=v[d],i&&i.prepare())for(a=i.edges,c=a.length,o=0;c>o;)l=a[o].end(),r=l.x,u=l.y,s=a[++o%c].start(),t=s.x,e=s.y,(ua(r-t)>ka||ua(u-e)>ka)&&(a.splice(o,0,new Wr($r(i.site,l,ua(r-f)<ka&&p-u>ka?{x:f,y:ua(t-f)<ka?e:p}:ua(u-p)<ka&&h-r>ka?{x:ua(e-p)<ka?t:h,y:p}:ua(r-h)<ka&&u-g>ka?{x:h,y:ua(t-h)<ka?e:g}:ua(u-g)<ka&&r-f>ka?{x:ua(e-g)<ka?t:f,y:g}:null),i.site,null)),++c)}function Hr(n,t){return t.angle-n.angle}function Fr(){Gr(this),this.x=this.y=this.arc=this.site=this.cy=null}function Or(n){var t=n.P,e=n.N;if(t&&e){var r=t.site,u=n.site,i=e.site;if(r!==i){var 
o=u.x,a=u.y,c=r.x-o,s=r.y-a,l=i.x-o,f=i.y-a,h=2*(c*f-s*l);if(!(h>=-Ea)){var g=c*c+s*s,p=l*l+f*f,v=(f*g-s*p)/h,d=(c*p-l*g)/h,f=d+a,m=Wc.pop()||new Fr;m.arc=n,m.site=u,m.x=v+o,m.y=f+Math.sqrt(v*v+d*d),m.cy=f,n.circle=m;for(var y=null,x=$c._;x;)if(m.y<x.y||m.y===x.y&&m.x<=x.x){if(!x.L){y=x.P;break}x=x.L}else{if(!x.R){y=x;break}x=x.R}$c.insert(y,m),y||(Xc=m)}}}}function Yr(n){var t=n.circle;t&&(t.P||(Xc=t.N),$c.remove(t),Wc.push(t),Gr(t),n.circle=null)}function Ir(n){for(var t,e=Ic,r=Pe(n[0][0],n[0][1],n[1][0],n[1][1]),u=e.length;u--;)t=e[u],(!Zr(t,n)||!r(t)||ua(t.a.x-t.b.x)<ka&&ua(t.a.y-t.b.y)<ka)&&(t.a=t.b=null,e.splice(u,1))}function Zr(n,t){var e=n.b;if(e)return!0;var r,u,i=n.a,o=t[0][0],a=t[1][0],c=t[0][1],s=t[1][1],l=n.l,f=n.r,h=l.x,g=l.y,p=f.x,v=f.y,d=(h+p)/2,m=(g+v)/2;if(v===g){if(o>d||d>=a)return;if(h>p){if(i){if(i.y>=s)return}else i={x:d,y:c};e={x:d,y:s}}else{if(i){if(i.y<c)return}else i={x:d,y:s};e={x:d,y:c}}}else if(r=(h-p)/(v-g),u=m-r*d,-1>r||r>1)if(h>p){if(i){if(i.y>=s)return}else i={x:(c-u)/r,y:c};e={x:(s-u)/r,y:s}}else{if(i){if(i.y<c)return}else i={x:(s-u)/r,y:s};e={x:(c-u)/r,y:c}}else if(v>g){if(i){if(i.x>=a)return}else i={x:o,y:r*o+u};e={x:a,y:r*a+u}}else{if(i){if(i.x<o)return}else i={x:a,y:r*a+u};e={x:o,y:r*o+u}}return n.a=i,n.b=e,!0}function Vr(n,t){this.l=n,this.r=t,this.a=this.b=null}function Xr(n,t,e,r){var u=new Vr(n,t);return Ic.push(u),e&&Br(u,n,t,e),r&&Br(u,t,n,r),Zc[n.i].edges.push(new Wr(u,n,t)),Zc[t.i].edges.push(new Wr(u,t,n)),u}function $r(n,t,e){var r=new Vr(n,null);return r.a=t,r.b=e,Ic.push(r),r}function Br(n,t,e,r){n.a||n.b?n.l===e?n.b=r:n.a=r:(n.a=r,n.l=t,n.r=e)}function Wr(n,t,e){var r=n.a,u=n.b;this.edge=n,this.site=t,this.angle=e?Math.atan2(e.y-t.y,e.x-t.x):n.l===t?Math.atan2(u.x-r.x,r.y-u.y):Math.atan2(r.x-u.x,u.y-r.y)}function Jr(){this._=null}function Gr(n){n.U=n.C=n.L=n.R=n.P=n.N=null}function Kr(n,t){var e=t,r=t.R,u=e.U;u?u.L===e?u.L=r:u.R=r:n._=r,r.U=u,e.U=r,e.R=r.L,e.R&&(e.R.U=e),r.L=e}function Qr(n,t){var 
e=t,r=t.L,u=e.U;u?u.L===e?u.L=r:u.R=r:n._=r,r.U=u,e.U=r,e.L=r.R,e.L&&(e.L.U=e),r.R=e}function nu(n){for(;n.L;)n=n.L;return n}function tu(n,t){var e,r,u,i=n.sort(eu).pop();for(Ic=[],Zc=new Array(n.length),Vc=new Jr,$c=new Jr;;)if(u=Xc,i&&(!u||i.y<u.y||i.y===u.y&&i.x<u.x))(i.x!==e||i.y!==r)&&(Zc[i.i]=new Ur(i),Rr(i),e=i.x,r=i.y),i=n.pop();else{if(!u)break;qr(u.arc)}t&&(Ir(t),jr(t));var o={cells:Zc,edges:Ic};return Vc=$c=Ic=Zc=null,o}function eu(n,t){return t.y-n.y||t.x-n.x}function ru(n,t,e){return(n.x-e.x)*(t.y-n.y)-(n.x-t.x)*(e.y-n.y)}function uu(n){return n.x}function iu(n){return n.y}function ou(){return{leaf:!0,nodes:[],point:null,x:null,y:null}}function au(n,t,e,r,u,i){if(!n(t,e,r,u,i)){var o=.5*(e+u),a=.5*(r+i),c=t.nodes;c[0]&&au(n,c[0],e,r,o,a),c[1]&&au(n,c[1],o,r,u,a),c[2]&&au(n,c[2],e,a,o,i),c[3]&&au(n,c[3],o,a,u,i)}}function cu(n,t){n=Zo.rgb(n),t=Zo.rgb(t);var e=n.r,r=n.g,u=n.b,i=t.r-e,o=t.g-r,a=t.b-u;return function(n){return"#"+dt(Math.round(e+i*n))+dt(Math.round(r+o*n))+dt(Math.round(u+a*n))}}function su(n,t){var e,r={},u={};for(e in n)e in t?r[e]=hu(n[e],t[e]):u[e]=n[e];for(e in t)e in n||(u[e]=t[e]);return function(n){for(e in r)u[e]=r[e](n);return u}}function lu(n,t){return t-=n=+n,function(e){return n+t*e}}function fu(n,t){var e,r,u,i=Gc.lastIndex=Kc.lastIndex=0,o=-1,a=[],c=[];for(n+="",t+="";(e=Gc.exec(n))&&(r=Kc.exec(t));)(u=r.index)>i&&(u=t.substring(i,u),a[o]?a[o]+=u:a[++o]=u),(e=e[0])===(r=r[0])?a[o]?a[o]+=r:a[++o]=r:(a[++o]=null,c.push({i:o,x:lu(e,r)})),i=Kc.lastIndex;return i<t.length&&(u=t.substring(i),a[o]?a[o]+=u:a[++o]=u),a.length<2?c[0]?(t=c[0].x,function(n){return t(n)+""}):function(){return t}:(t=c.length,function(n){for(var e,r=0;t>r;++r)a[(e=c[r]).i]=e.x(n);return a.join("")})}function hu(n,t){for(var e,r=Zo.interpolators.length;--r>=0&&!(e=Zo.interpolators[r](n,t)););return e}function gu(n,t){var 
e,r=[],u=[],i=n.length,o=t.length,a=Math.min(n.length,t.length);for(e=0;a>e;++e)r.push(hu(n[e],t[e]));for(;i>e;++e)u[e]=n[e];for(;o>e;++e)u[e]=t[e];return function(n){for(e=0;a>e;++e)u[e]=r[e](n);return u}}function pu(n){return function(t){return 0>=t?0:t>=1?1:n(t)}}function vu(n){return function(t){return 1-n(1-t)}}function du(n){return function(t){return.5*(.5>t?n(2*t):2-n(2-2*t))}}function mu(n){return n*n}function yu(n){return n*n*n}function xu(n){if(0>=n)return 0;if(n>=1)return 1;var t=n*n,e=t*n;return 4*(.5>n?e:3*(n-t)+e-.75)}function Mu(n){return function(t){return Math.pow(t,n)}}function _u(n){return 1-Math.cos(n*Sa)}function bu(n){return Math.pow(2,10*(n-1))}function wu(n){return 1-Math.sqrt(1-n*n)}function Su(n,t){var e;return arguments.length<2&&(t=.45),arguments.length?e=t/wa*Math.asin(1/n):(n=1,e=t/4),function(r){return 1+n*Math.pow(2,-10*r)*Math.sin((r-e)*wa/t)}}function ku(n){return n||(n=1.70158),function(t){return t*t*((n+1)*t-n)}}function Eu(n){return 1/2.75>n?7.5625*n*n:2/2.75>n?7.5625*(n-=1.5/2.75)*n+.75:2.5/2.75>n?7.5625*(n-=2.25/2.75)*n+.9375:7.5625*(n-=2.625/2.75)*n+.984375}function Au(n,t){n=Zo.hcl(n),t=Zo.hcl(t);var e=n.h,r=n.c,u=n.l,i=t.h-e,o=t.c-r,a=t.l-u;return isNaN(o)&&(o=0,r=isNaN(r)?t.c:r),isNaN(i)?(i=0,e=isNaN(e)?t.h:e):i>180?i-=360:-180>i&&(i+=360),function(n){return ot(e+i*n,r+o*n,u+a*n)+""}}function Cu(n,t){n=Zo.hsl(n),t=Zo.hsl(t);var e=n.h,r=n.s,u=n.l,i=t.h-e,o=t.s-r,a=t.l-u;return isNaN(o)&&(o=0,r=isNaN(r)?t.s:r),isNaN(i)?(i=0,e=isNaN(e)?t.h:e):i>180?i-=360:-180>i&&(i+=360),function(n){return ut(e+i*n,r+o*n,u+a*n)+""}}function Nu(n,t){n=Zo.lab(n),t=Zo.lab(t);var e=n.l,r=n.a,u=n.b,i=t.l-e,o=t.a-r,a=t.b-u;return function(n){return ct(e+i*n,r+o*n,u+a*n)+""}}function zu(n,t){return t-=n,function(e){return Math.round(n+t*e)}}function Lu(n){var 
t=[n.a,n.b],e=[n.c,n.d],r=qu(t),u=Tu(t,e),i=qu(Ru(e,t,-u))||0;t[0]*e[1]<e[0]*t[1]&&(t[0]*=-1,t[1]*=-1,r*=-1,u*=-1),this.rotate=(r?Math.atan2(t[1],t[0]):Math.atan2(-e[0],e[1]))*Ca,this.translate=[n.e,n.f],this.scale=[r,i],this.skew=i?Math.atan2(u,i)*Ca:0}function Tu(n,t){return n[0]*t[0]+n[1]*t[1]}function qu(n){var t=Math.sqrt(Tu(n,n));return t&&(n[0]/=t,n[1]/=t),t}function Ru(n,t,e){return n[0]+=e*t[0],n[1]+=e*t[1],n}function Du(n,t){var e,r=[],u=[],i=Zo.transform(n),o=Zo.transform(t),a=i.translate,c=o.translate,s=i.rotate,l=o.rotate,f=i.skew,h=o.skew,g=i.scale,p=o.scale;return a[0]!=c[0]||a[1]!=c[1]?(r.push("translate(",null,",",null,")"),u.push({i:1,x:lu(a[0],c[0])},{i:3,x:lu(a[1],c[1])})):c[0]||c[1]?r.push("translate("+c+")"):r.push(""),s!=l?(s-l>180?l+=360:l-s>180&&(s+=360),u.push({i:r.push(r.pop()+"rotate(",null,")")-2,x:lu(s,l)})):l&&r.push(r.pop()+"rotate("+l+")"),f!=h?u.push({i:r.push(r.pop()+"skewX(",null,")")-2,x:lu(f,h)}):h&&r.push(r.pop()+"skewX("+h+")"),g[0]!=p[0]||g[1]!=p[1]?(e=r.push(r.pop()+"scale(",null,",",null,")"),u.push({i:e-4,x:lu(g[0],p[0])},{i:e-2,x:lu(g[1],p[1])})):(1!=p[0]||1!=p[1])&&r.push(r.pop()+"scale("+p+")"),e=u.length,function(n){for(var t,i=-1;++i<e;)r[(t=u[i]).i]=t.x(n);return r.join("")}}function Pu(n,t){return t=t-(n=+n)?1/(t-n):0,function(e){return(e-n)*t}}function Uu(n,t){return t=t-(n=+n)?1/(t-n):0,function(e){return Math.max(0,Math.min(1,(e-n)*t))}}function ju(n){for(var t=n.source,e=n.target,r=Fu(t,e),u=[t];t!==r;)t=t.parent,u.push(t);for(var i=u.length;e!==r;)u.splice(i,0,e),e=e.parent;return u}function Hu(n){for(var t=[],e=n.parent;null!=e;)t.push(n),n=e,e=e.parent;return t.push(n),t}function Fu(n,t){if(n===t)return n;for(var e=Hu(n),r=Hu(t),u=e.pop(),i=r.pop(),o=null;u===i;)o=u,u=e.pop(),i=r.pop();return o}function Ou(n){n.fixed|=2}function Yu(n){n.fixed&=-7}function Iu(n){n.fixed|=4,n.px=n.x,n.py=n.y}function Zu(n){n.fixed&=-5}function Vu(n,t,e){var r=0,u=0;if(n.charge=0,!n.leaf)for(var 
i,o=n.nodes,a=o.length,c=-1;++c<a;)i=o[c],null!=i&&(Vu(i,t,e),n.charge+=i.charge,r+=i.charge*i.cx,u+=i.charge*i.cy);if(n.point){n.leaf||(n.point.x+=Math.random()-.5,n.point.y+=Math.random()-.5);var s=t*e[n.point.index];n.charge+=n.pointCharge=s,r+=s*n.point.x,u+=s*n.point.y}n.cx=r/n.charge,n.cy=u/n.charge}function Xu(n,t){return Zo.rebind(n,t,"sort","children","value"),n.nodes=n,n.links=Ku,n}function $u(n,t){for(var e=[n];null!=(n=e.pop());)if(t(n),(u=n.children)&&(r=u.length))for(var r,u;--r>=0;)e.push(u[r])}function Bu(n,t){for(var e=[n],r=[];null!=(n=e.pop());)if(r.push(n),(i=n.children)&&(u=i.length))for(var u,i,o=-1;++o<u;)e.push(i[o]);for(;null!=(n=r.pop());)t(n)}function Wu(n){return n.children}function Ju(n){return n.value}function Gu(n,t){return t.value-n.value}function Ku(n){return Zo.merge(n.map(function(n){return(n.children||[]).map(function(t){return{source:n,target:t}})}))}function Qu(n){return n.x}function ni(n){return n.y}function ti(n,t,e){n.y0=t,n.y=e}function ei(n){return Zo.range(n.length)}function ri(n){for(var t=-1,e=n[0].length,r=[];++t<e;)r[t]=0;return r}function ui(n){for(var t,e=1,r=0,u=n[0][1],i=n.length;i>e;++e)(t=n[e][1])>u&&(r=e,u=t);return r}function ii(n){return n.reduce(oi,0)}function oi(n,t){return n+t[1]}function ai(n,t){return ci(n,Math.ceil(Math.log(t.length)/Math.LN2+1))}function ci(n,t){for(var e=-1,r=+n[0],u=(n[1]-r)/t,i=[];++e<=t;)i[e]=u*e+r;return i}function si(n){return[Zo.min(n),Zo.max(n)]}function li(n,t){return n.value-t.value}function fi(n,t){var e=n._pack_next;n._pack_next=t,t._pack_prev=n,t._pack_next=e,e._pack_prev=t}function hi(n,t){n._pack_next=t,t._pack_prev=n}function gi(n,t){var e=t.x-n.x,r=t.y-n.y,u=n.r+t.r;return.999*u*u>e*e+r*r}function pi(n){function t(n){l=Math.min(n.x-n.r,l),f=Math.max(n.x+n.r,f),h=Math.min(n.y-n.r,h),g=Math.max(n.y+n.r,g)}if((e=n.children)&&(s=e.length)){var 
e,r,u,i,o,a,c,s,l=1/0,f=-1/0,h=1/0,g=-1/0;if(e.forEach(vi),r=e[0],r.x=-r.r,r.y=0,t(r),s>1&&(u=e[1],u.x=u.r,u.y=0,t(u),s>2))for(i=e[2],yi(r,u,i),t(i),fi(r,i),r._pack_prev=i,fi(i,u),u=r._pack_next,o=3;s>o;o++){yi(r,u,i=e[o]);var p=0,v=1,d=1;for(a=u._pack_next;a!==u;a=a._pack_next,v++)if(gi(a,i)){p=1;break}if(1==p)for(c=r._pack_prev;c!==a._pack_prev&&!gi(c,i);c=c._pack_prev,d++);p?(d>v||v==d&&u.r<r.r?hi(r,u=a):hi(r=c,u),o--):(fi(r,i),u=i,t(i))}var m=(l+f)/2,y=(h+g)/2,x=0;for(o=0;s>o;o++)i=e[o],i.x-=m,i.y-=y,x=Math.max(x,i.r+Math.sqrt(i.x*i.x+i.y*i.y));n.r=x,e.forEach(di)}}function vi(n){n._pack_next=n._pack_prev=n}function di(n){delete n._pack_next,delete n._pack_prev}function mi(n,t,e,r){var u=n.children;if(n.x=t+=r*n.x,n.y=e+=r*n.y,n.r*=r,u)for(var i=-1,o=u.length;++i<o;)mi(u[i],t,e,r)}function yi(n,t,e){var r=n.r+e.r,u=t.x-n.x,i=t.y-n.y;if(r&&(u||i)){var o=t.r+e.r,a=u*u+i*i;o*=o,r*=r;var c=.5+(r-o)/(2*a),s=Math.sqrt(Math.max(0,2*o*(r+a)-(r-=a)*r-o*o))/(2*a);e.x=n.x+c*u+s*i,e.y=n.y+c*i-s*u}else e.x=n.x+r,e.y=n.y}function xi(n,t){return n.parent==t.parent?1:2}function Mi(n){var t=n.children;return t.length?t[0]:n.t}function _i(n){var t,e=n.children;return(t=e.length)?e[t-1]:n.t}function bi(n,t,e){var r=e/(t.i-n.i);t.c-=r,t.s+=e,n.c+=r,t.z+=e,t.m+=e}function wi(n){for(var t,e=0,r=0,u=n.children,i=u.length;--i>=0;)t=u[i],t.z+=e,t.m+=e,e+=t.s+(r+=t.c)}function Si(n,t,e){return n.a.parent===t.parent?n.a:e}function ki(n){return 1+Zo.max(n,function(n){return n.y})}function Ei(n){return n.reduce(function(n,t){return n+t.x},0)/n.length}function Ai(n){var t=n.children;return t&&t.length?Ai(t[0]):n}function Ci(n){var t,e=n.children;return e&&(t=e.length)?Ci(e[t-1]):n}function Ni(n){return{x:n.x,y:n.y,dx:n.dx,dy:n.dy}}function zi(n,t){var e=n.x+t[3],r=n.y+t[0],u=n.dx-t[1]-t[3],i=n.dy-t[0]-t[2];return 0>u&&(e+=u/2,u=0),0>i&&(r+=i/2,i=0),{x:e,y:r,dx:u,dy:i}}function Li(n){var t=n[0],e=n[n.length-1];return e>t?[t,e]:[e,t]}function Ti(n){return 
n.rangeExtent?n.rangeExtent():Li(n.range())}function qi(n,t,e,r){var u=e(n[0],n[1]),i=r(t[0],t[1]);return function(n){return i(u(n))}}function Ri(n,t){var e,r=0,u=n.length-1,i=n[r],o=n[u];return i>o&&(e=r,r=u,u=e,e=i,i=o,o=e),n[r]=t.floor(i),n[u]=t.ceil(o),n}function Di(n){return n?{floor:function(t){return Math.floor(t/n)*n},ceil:function(t){return Math.ceil(t/n)*n}}:ss}function Pi(n,t,e,r){var u=[],i=[],o=0,a=Math.min(n.length,t.length)-1;for(n[a]<n[0]&&(n=n.slice().reverse(),t=t.slice().reverse());++o<=a;)u.push(e(n[o-1],n[o])),i.push(r(t[o-1],t[o]));return function(t){var e=Zo.bisect(n,t,1,a)-1;return i[e](u[e](t))}}function Ui(n,t,e,r){function u(){var u=Math.min(n.length,t.length)>2?Pi:qi,c=r?Uu:Pu;return o=u(n,t,c,e),a=u(t,n,c,hu),i}function i(n){return o(n)}var o,a;return i.invert=function(n){return a(n)},i.domain=function(t){return arguments.length?(n=t.map(Number),u()):n},i.range=function(n){return arguments.length?(t=n,u()):t},i.rangeRound=function(n){return i.range(n).interpolate(zu)},i.clamp=function(n){return arguments.length?(r=n,u()):r},i.interpolate=function(n){return arguments.length?(e=n,u()):e},i.ticks=function(t){return Oi(n,t)},i.tickFormat=function(t,e){return Yi(n,t,e)},i.nice=function(t){return Hi(n,t),u()},i.copy=function(){return Ui(n,t,e,r)},u()}function ji(n,t){return Zo.rebind(n,t,"range","rangeRound","interpolate","clamp")}function Hi(n,t){return Ri(n,Di(Fi(n,t)[2]))}function Fi(n,t){null==t&&(t=10);var e=Li(n),r=e[1]-e[0],u=Math.pow(10,Math.floor(Math.log(r/t)/Math.LN10)),i=t/r*u;return.15>=i?u*=10:.35>=i?u*=5:.75>=i&&(u*=2),e[0]=Math.ceil(e[0]/u)*u,e[1]=Math.floor(e[1]/u)*u+.5*u,e[2]=u,e}function Oi(n,t){return Zo.range.apply(Zo,Fi(n,t))}function Yi(n,t,e){var r=Fi(n,t);if(e){var u=Ga.exec(e);if(u.shift(),"s"===u[8]){var i=Zo.formatPrefix(Math.max(ua(r[0]),ua(r[1])));return u[7]||(u[7]="."+Ii(i.scale(r[2]))),u[8]="f",e=Zo.format(u.join("")),function(n){return e(i.scale(n))+i.symbol}}u[7]||(u[7]="."+Zi(u[8],r)),e=u.join("")}else 
e=",."+Ii(r[2])+"f";return Zo.format(e)}function Ii(n){return-Math.floor(Math.log(n)/Math.LN10+.01)}function Zi(n,t){var e=Ii(t[2]);return n in ls?Math.abs(e-Ii(Math.max(ua(t[0]),ua(t[1]))))+ +("e"!==n):e-2*("%"===n)}function Vi(n,t,e,r){function u(n){return(e?Math.log(0>n?0:n):-Math.log(n>0?0:-n))/Math.log(t)}function i(n){return e?Math.pow(t,n):-Math.pow(t,-n)}function o(t){return n(u(t))}return o.invert=function(t){return i(n.invert(t))},o.domain=function(t){return arguments.length?(e=t[0]>=0,n.domain((r=t.map(Number)).map(u)),o):r},o.base=function(e){return arguments.length?(t=+e,n.domain(r.map(u)),o):t},o.nice=function(){var t=Ri(r.map(u),e?Math:hs);return n.domain(t),r=t.map(i),o},o.ticks=function(){var n=Li(r),o=[],a=n[0],c=n[1],s=Math.floor(u(a)),l=Math.ceil(u(c)),f=t%1?2:t;if(isFinite(l-s)){if(e){for(;l>s;s++)for(var h=1;f>h;h++)o.push(i(s)*h);o.push(i(s))}else for(o.push(i(s));s++<l;)for(var h=f-1;h>0;h--)o.push(i(s)*h);for(s=0;o[s]<a;s++);for(l=o.length;o[l-1]>c;l--);o=o.slice(s,l)}return o},o.tickFormat=function(n,t){if(!arguments.length)return fs;arguments.length<2?t=fs:"function"!=typeof t&&(t=Zo.format(t));var r,a=Math.max(.1,n/o.ticks().length),c=e?(r=1e-12,Math.ceil):(r=-1e-12,Math.floor);return function(n){return n/i(c(u(n)+r))<=a?t(n):""}},o.copy=function(){return Vi(n.copy(),t,e,r)},ji(o,n)}function Xi(n,t,e){function r(t){return n(u(t))}var u=$i(t),i=$i(1/t);return r.invert=function(t){return i(n.invert(t))},r.domain=function(t){return arguments.length?(n.domain((e=t.map(Number)).map(u)),r):e},r.ticks=function(n){return Oi(e,n)},r.tickFormat=function(n,t){return Yi(e,n,t)},r.nice=function(n){return r.domain(Hi(e,n))},r.exponent=function(o){return arguments.length?(u=$i(t=o),i=$i(1/t),n.domain(e.map(u)),r):t},r.copy=function(){return Xi(n.copy(),t,e)},ji(r,n)}function $i(n){return function(t){return 0>t?-Math.pow(-t,n):Math.pow(t,n)}}function Bi(n,t){function e(e){return i[((u.get(e)||("range"===t.t?u.set(e,n.push(e)):0/0))-1)%i.length]}function 
r(t,e){return Zo.range(n.length).map(function(n){return t+e*n})}var u,i,a;return e.domain=function(r){if(!arguments.length)return n;n=[],u=new o;for(var i,a=-1,c=r.length;++a<c;)u.has(i=r[a])||u.set(i,n.push(i));return e[t.t].apply(e,t.a)},e.range=function(n){return arguments.length?(i=n,a=0,t={t:"range",a:arguments},e):i},e.rangePoints=function(u,o){arguments.length<2&&(o=0);var c=u[0],s=u[1],l=(s-c)/(Math.max(1,n.length-1)+o);return i=r(n.length<2?(c+s)/2:c+l*o/2,l),a=0,t={t:"rangePoints",a:arguments},e},e.rangeBands=function(u,o,c){arguments.length<2&&(o=0),arguments.length<3&&(c=o);var s=u[1]<u[0],l=u[s-0],f=u[1-s],h=(f-l)/(n.length-o+2*c);return i=r(l+h*c,h),s&&i.reverse(),a=h*(1-o),t={t:"rangeBands",a:arguments},e},e.rangeRoundBands=function(u,o,c){arguments.length<2&&(o=0),arguments.length<3&&(c=o);var s=u[1]<u[0],l=u[s-0],f=u[1-s],h=Math.floor((f-l)/(n.length-o+2*c)),g=f-l-(n.length-o)*h;return i=r(l+Math.round(g/2),h),s&&i.reverse(),a=Math.round(h*(1-o)),t={t:"rangeRoundBands",a:arguments},e},e.rangeBand=function(){return a},e.rangeExtent=function(){return Li(t.a[0])},e.copy=function(){return Bi(n,t)},e.domain(n)}function Wi(e,r){function u(){var n=0,t=r.length;for(o=[];++n<t;)o[n-1]=Zo.quantile(e,n/t);return i}function i(n){return isNaN(n=+n)?void 0:r[Zo.bisect(o,n)]}var o;return i.domain=function(r){return arguments.length?(e=r.filter(t).sort(n),u()):e},i.range=function(n){return arguments.length?(r=n,u()):r},i.quantiles=function(){return o},i.invertExtent=function(n){return n=r.indexOf(n),0>n?[0/0,0/0]:[n>0?o[n-1]:e[0],n<o.length?o[n]:e[e.length-1]]},i.copy=function(){return Wi(e,r)},u()}function Ji(n,t,e){function r(t){return e[Math.max(0,Math.min(o,Math.floor(i*(t-n))))]}function u(){return i=e.length/(t-n),o=e.length-1,r}var i,o;return r.domain=function(e){return arguments.length?(n=+e[0],t=+e[e.length-1],u()):[n,t]},r.range=function(n){return arguments.length?(e=n,u()):e},r.invertExtent=function(t){return 
t=e.indexOf(t),t=0>t?0/0:t/i+n,[t,t+1/i]},r.copy=function(){return Ji(n,t,e)},u()}function Gi(n,t){function e(e){return e>=e?t[Zo.bisect(n,e)]:void 0}return e.domain=function(t){return arguments.length?(n=t,e):n},e.range=function(n){return arguments.length?(t=n,e):t},e.invertExtent=function(e){return e=t.indexOf(e),[n[e-1],n[e]]},e.copy=function(){return Gi(n,t)},e}function Ki(n){function t(n){return+n}return t.invert=t,t.domain=t.range=function(e){return arguments.length?(n=e.map(t),t):n},t.ticks=function(t){return Oi(n,t)},t.tickFormat=function(t,e){return Yi(n,t,e)},t.copy=function(){return Ki(n)},t}function Qi(n){return n.innerRadius}function no(n){return n.outerRadius}function to(n){return n.startAngle}function eo(n){return n.endAngle}function ro(n){function t(t){function o(){s.push("M",i(n(l),a))}for(var c,s=[],l=[],f=-1,h=t.length,g=bt(e),p=bt(r);++f<h;)u.call(this,c=t[f],f)?l.push([+g.call(this,c,f),+p.call(this,c,f)]):l.length&&(o(),l=[]);return l.length&&o(),s.length?s.join(""):null}var e=wr,r=Sr,u=we,i=uo,o=i.key,a=.7;return t.x=function(n){return arguments.length?(e=n,t):e},t.y=function(n){return arguments.length?(r=n,t):r},t.defined=function(n){return arguments.length?(u=n,t):u},t.interpolate=function(n){return arguments.length?(o="function"==typeof n?i=n:(i=xs.get(n)||uo).key,t):o},t.tension=function(n){return arguments.length?(a=n,t):a},t}function uo(n){return n.join("L")}function io(n){return uo(n)+"Z"}function oo(n){for(var t=0,e=n.length,r=n[0],u=[r[0],",",r[1]];++t<e;)u.push("H",(r[0]+(r=n[t])[0])/2,"V",r[1]);return e>1&&u.push("H",r[0]),u.join("")}function ao(n){for(var t=0,e=n.length,r=n[0],u=[r[0],",",r[1]];++t<e;)u.push("V",(r=n[t])[1],"H",r[0]);return u.join("")}function co(n){for(var t=0,e=n.length,r=n[0],u=[r[0],",",r[1]];++t<e;)u.push("H",(r=n[t])[0],"V",r[1]);return u.join("")}function so(n,t){return n.length<4?uo(n):n[1]+ho(n.slice(1,n.length-1),go(n,t))}function lo(n,t){return 
n.length<3?uo(n):n[0]+ho((n.push(n[0]),n),go([n[n.length-2]].concat(n,[n[1]]),t))}function fo(n,t){return n.length<3?uo(n):n[0]+ho(n,go(n,t))}function ho(n,t){if(t.length<1||n.length!=t.length&&n.length!=t.length+2)return uo(n);var e=n.length!=t.length,r="",u=n[0],i=n[1],o=t[0],a=o,c=1;if(e&&(r+="Q"+(i[0]-2*o[0]/3)+","+(i[1]-2*o[1]/3)+","+i[0]+","+i[1],u=n[1],c=2),t.length>1){a=t[1],i=n[c],c++,r+="C"+(u[0]+o[0])+","+(u[1]+o[1])+","+(i[0]-a[0])+","+(i[1]-a[1])+","+i[0]+","+i[1];for(var s=2;s<t.length;s++,c++)i=n[c],a=t[s],r+="S"+(i[0]-a[0])+","+(i[1]-a[1])+","+i[0]+","+i[1]}if(e){var l=n[c];r+="Q"+(i[0]+2*a[0]/3)+","+(i[1]+2*a[1]/3)+","+l[0]+","+l[1]}return r}function go(n,t){for(var e,r=[],u=(1-t)/2,i=n[0],o=n[1],a=1,c=n.length;++a<c;)e=i,i=o,o=n[a],r.push([u*(o[0]-e[0]),u*(o[1]-e[1])]);return r}function po(n){if(n.length<3)return uo(n);var t=1,e=n.length,r=n[0],u=r[0],i=r[1],o=[u,u,u,(r=n[1])[0]],a=[i,i,i,r[1]],c=[u,",",i,"L",xo(bs,o),",",xo(bs,a)];for(n.push(n[e-1]);++t<=e;)r=n[t],o.shift(),o.push(r[0]),a.shift(),a.push(r[1]),Mo(c,o,a);return n.pop(),c.push("L",r),c.join("")}function vo(n){if(n.length<4)return uo(n);for(var t,e=[],r=-1,u=n.length,i=[0],o=[0];++r<3;)t=n[r],i.push(t[0]),o.push(t[1]);for(e.push(xo(bs,i)+","+xo(bs,o)),--r;++r<u;)t=n[r],i.shift(),i.push(t[0]),o.shift(),o.push(t[1]),Mo(e,i,o);return e.join("")}function mo(n){for(var t,e,r=-1,u=n.length,i=u+4,o=[],a=[];++r<4;)e=n[r%u],o.push(e[0]),a.push(e[1]);for(t=[xo(bs,o),",",xo(bs,a)],--r;++r<i;)e=n[r%u],o.shift(),o.push(e[0]),a.shift(),a.push(e[1]),Mo(t,o,a);return t.join("")}function yo(n,t){var e=n.length-1;if(e)for(var r,u,i=n[0][0],o=n[0][1],a=n[e][0]-i,c=n[e][1]-o,s=-1;++s<=e;)r=n[s],u=s/e,r[0]=t*r[0]+(1-t)*(i+u*a),r[1]=t*r[1]+(1-t)*(o+u*c);return po(n)}function xo(n,t){return n[0]*t[0]+n[1]*t[1]+n[2]*t[2]+n[3]*t[3]}function Mo(n,t,e){n.push("C",xo(Ms,t),",",xo(Ms,e),",",xo(_s,t),",",xo(_s,e),",",xo(bs,t),",",xo(bs,e))}function _o(n,t){return(t[1]-n[1])/(t[0]-n[0])}function bo(n){for(var 
t=0,e=n.length-1,r=[],u=n[0],i=n[1],o=r[0]=_o(u,i);++t<e;)r[t]=(o+(o=_o(u=i,i=n[t+1])))/2;return r[t]=o,r}function wo(n){for(var t,e,r,u,i=[],o=bo(n),a=-1,c=n.length-1;++a<c;)t=_o(n[a],n[a+1]),ua(t)<ka?o[a]=o[a+1]=0:(e=o[a]/t,r=o[a+1]/t,u=e*e+r*r,u>9&&(u=3*t/Math.sqrt(u),o[a]=u*e,o[a+1]=u*r));for(a=-1;++a<=c;)u=(n[Math.min(c,a+1)][0]-n[Math.max(0,a-1)][0])/(6*(1+o[a]*o[a])),i.push([u||0,o[a]*u||0]);return i}function So(n){return n.length<3?uo(n):n[0]+ho(n,wo(n))}function ko(n){for(var t,e,r,u=-1,i=n.length;++u<i;)t=n[u],e=t[0],r=t[1]+ms,t[0]=e*Math.cos(r),t[1]=e*Math.sin(r);return n}function Eo(n){function t(t){function c(){v.push("M",a(n(m),f),l,s(n(d.reverse()),f),"Z")}for(var h,g,p,v=[],d=[],m=[],y=-1,x=t.length,M=bt(e),_=bt(u),b=e===r?function(){return g}:bt(r),w=u===i?function(){return p}:bt(i);++y<x;)o.call(this,h=t[y],y)?(d.push([g=+M.call(this,h,y),p=+_.call(this,h,y)]),m.push([+b.call(this,h,y),+w.call(this,h,y)])):d.length&&(c(),d=[],m=[]);return d.length&&c(),v.length?v.join(""):null}var e=wr,r=wr,u=0,i=Sr,o=we,a=uo,c=a.key,s=a,l="L",f=.7;return t.x=function(n){return arguments.length?(e=r=n,t):r},t.x0=function(n){return arguments.length?(e=n,t):e},t.x1=function(n){return arguments.length?(r=n,t):r},t.y=function(n){return arguments.length?(u=i=n,t):i},t.y0=function(n){return arguments.length?(u=n,t):u},t.y1=function(n){return arguments.length?(i=n,t):i},t.defined=function(n){return arguments.length?(o=n,t):o},t.interpolate=function(n){return arguments.length?(c="function"==typeof n?a=n:(a=xs.get(n)||uo).key,s=a.reverse||a,l=a.closed?"M":"L",t):c},t.tension=function(n){return arguments.length?(f=n,t):f},t}function Ao(n){return n.radius}function Co(n){return[n.x,n.y]}function No(n){return function(){var t=n.apply(this,arguments),e=t[0],r=t[1]+ms;return[e*Math.cos(r),e*Math.sin(r)]}}function zo(){return 64}function Lo(){return"circle"}function To(n){var t=Math.sqrt(n/ba);return"M0,"+t+"A"+t+","+t+" 0 1,1 0,"+-t+"A"+t+","+t+" 0 1,1 0,"+t+"Z"}function 
qo(n,t){return sa(n,Cs),n.id=t,n}function Ro(n,t,e,r){var u=n.id;return P(n,"function"==typeof e?function(n,i,o){n.__transition__[u].tween.set(t,r(e.call(n,n.__data__,i,o)))}:(e=r(e),function(n){n.__transition__[u].tween.set(t,e)}))}function Do(n){return null==n&&(n=""),function(){this.textContent=n}}function Po(n,t,e,r){var u=n.__transition__||(n.__transition__={active:0,count:0}),i=u[e];if(!i){var a=r.time;i=u[e]={tween:new o,time:a,ease:r.ease,delay:r.delay,duration:r.duration},++u.count,Zo.timer(function(r){function o(r){return u.active>e?s():(u.active=e,i.event&&i.event.start.call(n,l,t),i.tween.forEach(function(e,r){(r=r.call(n,l,t))&&v.push(r)}),Zo.timer(function(){return p.c=c(r||1)?we:c,1},0,a),void 0)}function c(r){if(u.active!==e)return s();for(var o=r/g,a=f(o),c=v.length;c>0;)v[--c].call(n,a);
+return o>=1?(i.event&&i.event.end.call(n,l,t),s()):void 0}function s(){return--u.count?delete u[e]:delete n.__transition__,1}var l=n.__data__,f=i.ease,h=i.delay,g=i.duration,p=Ba,v=[];return p.t=h+a,r>=h?o(r-h):(p.c=o,void 0)},0,a)}}function Uo(n,t){n.attr("transform",function(n){return"translate("+t(n)+",0)"})}function jo(n,t){n.attr("transform",function(n){return"translate(0,"+t(n)+")"})}function Ho(n){return n.toISOString()}function Fo(n,t,e){function r(t){return n(t)}function u(n,e){var r=n[1]-n[0],u=r/e,i=Zo.bisect(Us,u);return i==Us.length?[t.year,Fi(n.map(function(n){return n/31536e6}),e)[2]]:i?t[u/Us[i-1]<Us[i]/u?i-1:i]:[Fs,Fi(n,e)[2]]}return r.invert=function(t){return Oo(n.invert(t))},r.domain=function(t){return arguments.length?(n.domain(t),r):n.domain().map(Oo)},r.nice=function(n,t){function e(e){return!isNaN(e)&&!n.range(e,Oo(+e+1),t).length}var i=r.domain(),o=Li(i),a=null==n?u(o,10):"number"==typeof n&&u(o,n);return a&&(n=a[0],t=a[1]),r.domain(Ri(i,t>1?{floor:function(t){for(;e(t=n.floor(t));)t=Oo(t-1);return t},ceil:function(t){for(;e(t=n.ceil(t));)t=Oo(+t+1);return t}}:n))},r.ticks=function(n,t){var e=Li(r.domain()),i=null==n?u(e,10):"number"==typeof n?u(e,n):!n.range&&[{range:n},t];return i&&(n=i[0],t=i[1]),n.range(e[0],Oo(+e[1]+1),1>t?1:t)},r.tickFormat=function(){return e},r.copy=function(){return Fo(n.copy(),t,e)},ji(r,n)}function Oo(n){return new Date(n)}function Yo(n){return JSON.parse(n.responseText)}function Io(n){var t=$o.createRange();return t.selectNode($o.body),t.createContextualFragment(n.responseText)}var Zo={version:"3.4.11"};Date.now||(Date.now=function(){return+new Date});var Vo=[].slice,Xo=function(n){return Vo.call(n)},$o=document,Bo=$o.documentElement,Wo=window;try{Xo(Bo.childNodes)[0].nodeType}catch(Jo){Xo=function(n){for(var t=n.length,e=new Array(t);t--;)e[t]=n[t];return e}}try{$o.createElement("div").style.setProperty("opacity",0,"")}catch(Go){var 
Ko=Wo.Element.prototype,Qo=Ko.setAttribute,na=Ko.setAttributeNS,ta=Wo.CSSStyleDeclaration.prototype,ea=ta.setProperty;Ko.setAttribute=function(n,t){Qo.call(this,n,t+"")},Ko.setAttributeNS=function(n,t,e){na.call(this,n,t,e+"")},ta.setProperty=function(n,t,e){ea.call(this,n,t+"",e)}}Zo.ascending=n,Zo.descending=function(n,t){return n>t?-1:t>n?1:t>=n?0:0/0},Zo.min=function(n,t){var e,r,u=-1,i=n.length;if(1===arguments.length){for(;++u<i&&!(null!=(e=n[u])&&e>=e);)e=void 0;for(;++u<i;)null!=(r=n[u])&&e>r&&(e=r)}else{for(;++u<i&&!(null!=(e=t.call(n,n[u],u))&&e>=e);)e=void 0;for(;++u<i;)null!=(r=t.call(n,n[u],u))&&e>r&&(e=r)}return e},Zo.max=function(n,t){var e,r,u=-1,i=n.length;if(1===arguments.length){for(;++u<i&&!(null!=(e=n[u])&&e>=e);)e=void 0;for(;++u<i;)null!=(r=n[u])&&r>e&&(e=r)}else{for(;++u<i&&!(null!=(e=t.call(n,n[u],u))&&e>=e);)e=void 0;for(;++u<i;)null!=(r=t.call(n,n[u],u))&&r>e&&(e=r)}return e},Zo.extent=function(n,t){var e,r,u,i=-1,o=n.length;if(1===arguments.length){for(;++i<o&&!(null!=(e=u=n[i])&&e>=e);)e=u=void 0;for(;++i<o;)null!=(r=n[i])&&(e>r&&(e=r),r>u&&(u=r))}else{for(;++i<o&&!(null!=(e=u=t.call(n,n[i],i))&&e>=e);)e=void 0;for(;++i<o;)null!=(r=t.call(n,n[i],i))&&(e>r&&(e=r),r>u&&(u=r))}return[e,u]},Zo.sum=function(n,t){var e,r=0,u=n.length,i=-1;if(1===arguments.length)for(;++i<u;)isNaN(e=+n[i])||(r+=e);else for(;++i<u;)isNaN(e=+t.call(n,n[i],i))||(r+=e);return r},Zo.mean=function(n,e){var r,u=0,i=n.length,o=-1,a=i;if(1===arguments.length)for(;++o<i;)t(r=n[o])?u+=r:--a;else for(;++o<i;)t(r=e.call(n,n[o],o))?u+=r:--a;return a?u/a:void 0},Zo.quantile=function(n,t){var e=(n.length-1)*t+1,r=Math.floor(e),u=+n[r-1],i=e-r;return i?u+i*(n[r]-u):u},Zo.median=function(e,r){return arguments.length>1&&(e=e.map(r)),e=e.filter(t),e.length?Zo.quantile(e.sort(n),.5):void 0};var ra=e(n);Zo.bisectLeft=ra.left,Zo.bisect=Zo.bisectRight=ra.right,Zo.bisector=function(t){return e(1===t.length?function(e,r){return n(t(e),r)}:t)},Zo.shuffle=function(n){for(var 
t,e,r=n.length;r;)e=0|Math.random()*r--,t=n[r],n[r]=n[e],n[e]=t;return n},Zo.permute=function(n,t){for(var e=t.length,r=new Array(e);e--;)r[e]=n[t[e]];return r},Zo.pairs=function(n){for(var t,e=0,r=n.length-1,u=n[0],i=new Array(0>r?0:r);r>e;)i[e]=[t=u,u=n[++e]];return i},Zo.zip=function(){if(!(u=arguments.length))return[];for(var n=-1,t=Zo.min(arguments,r),e=new Array(t);++n<t;)for(var u,i=-1,o=e[n]=new Array(u);++i<u;)o[i]=arguments[i][n];return e},Zo.transpose=function(n){return Zo.zip.apply(Zo,n)},Zo.keys=function(n){var t=[];for(var e in n)t.push(e);return t},Zo.values=function(n){var t=[];for(var e in n)t.push(n[e]);return t},Zo.entries=function(n){var t=[];for(var e in n)t.push({key:e,value:n[e]});return t},Zo.merge=function(n){for(var t,e,r,u=n.length,i=-1,o=0;++i<u;)o+=n[i].length;for(e=new Array(o);--u>=0;)for(r=n[u],t=r.length;--t>=0;)e[--o]=r[t];return e};var ua=Math.abs;Zo.range=function(n,t,e){if(arguments.length<3&&(e=1,arguments.length<2&&(t=n,n=0)),1/0===(t-n)/e)throw new Error("infinite range");var r,i=[],o=u(ua(e)),a=-1;if(n*=o,t*=o,e*=o,0>e)for(;(r=n+e*++a)>t;)i.push(r/o);else for(;(r=n+e*++a)<t;)i.push(r/o);return i},Zo.map=function(n){var t=new o;if(n instanceof o)n.forEach(function(n,e){t.set(n,e)});else for(var e in n)t.set(e,n[e]);return t},i(o,{has:a,get:function(n){return this[ia+n]},set:function(n,t){return this[ia+n]=t},remove:c,keys:s,values:function(){var n=[];return this.forEach(function(t,e){n.push(e)}),n},entries:function(){var n=[];return this.forEach(function(t,e){n.push({key:t,value:e})}),n},size:l,empty:f,forEach:function(n){for(var t in this)t.charCodeAt(0)===oa&&n.call(this,t.substring(1),this[t])}});var ia="\x00",oa=ia.charCodeAt(0);Zo.nest=function(){function n(t,a,c){if(c>=i.length)return r?r.call(u,a):e?a.sort(e):a;for(var s,l,f,h,g=-1,p=a.length,v=i[c++],d=new o;++g<p;)(h=d.get(s=v(l=a[g])))?h.push(l):d.set(s,[l]);return 
t?(l=t(),f=function(e,r){l.set(e,n(t,r,c))}):(l={},f=function(e,r){l[e]=n(t,r,c)}),d.forEach(f),l}function t(n,e){if(e>=i.length)return n;var r=[],u=a[e++];return n.forEach(function(n,u){r.push({key:n,values:t(u,e)})}),u?r.sort(function(n,t){return u(n.key,t.key)}):r}var e,r,u={},i=[],a=[];return u.map=function(t,e){return n(e,t,0)},u.entries=function(e){return t(n(Zo.map,e,0),0)},u.key=function(n){return i.push(n),u},u.sortKeys=function(n){return a[i.length-1]=n,u},u.sortValues=function(n){return e=n,u},u.rollup=function(n){return r=n,u},u},Zo.set=function(n){var t=new h;if(n)for(var e=0,r=n.length;r>e;++e)t.add(n[e]);return t},i(h,{has:a,add:function(n){return this[ia+n]=!0,n},remove:function(n){return n=ia+n,n in this&&delete this[n]},values:s,size:l,empty:f,forEach:function(n){for(var t in this)t.charCodeAt(0)===oa&&n.call(this,t.substring(1))}}),Zo.behavior={},Zo.rebind=function(n,t){for(var e,r=1,u=arguments.length;++r<u;)n[e=arguments[r]]=g(n,t,t[e]);return n};var aa=["webkit","ms","moz","Moz","o","O"];Zo.dispatch=function(){for(var n=new d,t=-1,e=arguments.length;++t<e;)n[arguments[t]]=m(n);return n},d.prototype.on=function(n,t){var e=n.indexOf("."),r="";if(e>=0&&(r=n.substring(e+1),n=n.substring(0,e)),n)return arguments.length<2?this[n].on(r):this[n].on(r,t);if(2===arguments.length){if(null==t)for(n in this)this.hasOwnProperty(n)&&this[n].on(r,null);return this}},Zo.event=null,Zo.requote=function(n){return n.replace(ca,"\\$&")};var ca=/[\\\^\$\*\+\?\|\[\]\(\)\.\{\}]/g,sa={}.__proto__?function(n,t){n.__proto__=t}:function(n,t){for(var e in t)n[e]=t[e]},la=function(n,t){return t.querySelector(n)},fa=function(n,t){return t.querySelectorAll(n)},ha=Bo.matches||Bo[p(Bo,"matchesSelector")],ga=function(n,t){return ha.call(n,t)};"function"==typeof Sizzle&&(la=function(n,t){return Sizzle(n,t)[0]||null},fa=Sizzle,ga=Sizzle.matchesSelector),Zo.selection=function(){return ma};var pa=Zo.selection.prototype=[];pa.select=function(n){var t,e,r,u,i=[];n=b(n);for(var 
o=-1,a=this.length;++o<a;){i.push(t=[]),t.parentNode=(r=this[o]).parentNode;for(var c=-1,s=r.length;++c<s;)(u=r[c])?(t.push(e=n.call(u,u.__data__,c,o)),e&&"__data__"in u&&(e.__data__=u.__data__)):t.push(null)}return _(i)},pa.selectAll=function(n){var t,e,r=[];n=w(n);for(var u=-1,i=this.length;++u<i;)for(var o=this[u],a=-1,c=o.length;++a<c;)(e=o[a])&&(r.push(t=Xo(n.call(e,e.__data__,a,u))),t.parentNode=e);return _(r)};var va={svg:"http://www.w3.org/2000/svg",xhtml:"http://www.w3.org/1999/xhtml",xlink:"http://www.w3.org/1999/xlink",xml:"http://www.w3.org/XML/1998/namespace",xmlns:"http://www.w3.org/2000/xmlns/"};Zo.ns={prefix:va,qualify:function(n){var t=n.indexOf(":"),e=n;return t>=0&&(e=n.substring(0,t),n=n.substring(t+1)),va.hasOwnProperty(e)?{space:va[e],local:n}:n}},pa.attr=function(n,t){if(arguments.length<2){if("string"==typeof n){var e=this.node();return n=Zo.ns.qualify(n),n.local?e.getAttributeNS(n.space,n.local):e.getAttribute(n)}for(t in n)this.each(S(t,n[t]));return this}return this.each(S(n,t))},pa.classed=function(n,t){if(arguments.length<2){if("string"==typeof n){var e=this.node(),r=(n=A(n)).length,u=-1;if(t=e.classList){for(;++u<r;)if(!t.contains(n[u]))return!1}else for(t=e.getAttribute("class");++u<r;)if(!E(n[u]).test(t))return!1;return!0}for(t in n)this.each(C(t,n[t]));return this}return this.each(C(n,t))},pa.style=function(n,t,e){var r=arguments.length;if(3>r){if("string"!=typeof n){2>r&&(t="");for(e in n)this.each(z(e,n[e],t));return this}if(2>r)return Wo.getComputedStyle(this.node(),null).getPropertyValue(n);e=""}return this.each(z(n,t,e))},pa.property=function(n,t){if(arguments.length<2){if("string"==typeof n)return this.node()[n];for(t in n)this.each(L(t,n[t]));return this}return this.each(L(n,t))},pa.text=function(n){return arguments.length?this.each("function"==typeof n?function(){var 
t=n.apply(this,arguments);this.textContent=null==t?"":t}:null==n?function(){this.textContent=""}:function(){this.textContent=n}):this.node().textContent},pa.html=function(n){return arguments.length?this.each("function"==typeof n?function(){var t=n.apply(this,arguments);this.innerHTML=null==t?"":t}:null==n?function(){this.innerHTML=""}:function(){this.innerHTML=n}):this.node().innerHTML},pa.append=function(n){return n=T(n),this.select(function(){return this.appendChild(n.apply(this,arguments))})},pa.insert=function(n,t){return n=T(n),t=b(t),this.select(function(){return this.insertBefore(n.apply(this,arguments),t.apply(this,arguments)||null)})},pa.remove=function(){return this.each(function(){var n=this.parentNode;n&&n.removeChild(this)})},pa.data=function(n,t){function e(n,e){var r,u,i,a=n.length,f=e.length,h=Math.min(a,f),g=new Array(f),p=new Array(f),v=new Array(a);if(t){var d,m=new o,y=new o,x=[];for(r=-1;++r<a;)d=t.call(u=n[r],u.__data__,r),m.has(d)?v[r]=u:m.set(d,u),x.push(d);for(r=-1;++r<f;)d=t.call(e,i=e[r],r),(u=m.get(d))?(g[r]=u,u.__data__=i):y.has(d)||(p[r]=q(i)),y.set(d,i),m.remove(d);for(r=-1;++r<a;)m.has(x[r])&&(v[r]=n[r])}else{for(r=-1;++r<h;)u=n[r],i=e[r],u?(u.__data__=i,g[r]=u):p[r]=q(i);for(;f>r;++r)p[r]=q(e[r]);for(;a>r;++r)v[r]=n[r]}p.update=g,p.parentNode=g.parentNode=v.parentNode=n.parentNode,c.push(p),s.push(g),l.push(v)}var r,u,i=-1,a=this.length;if(!arguments.length){for(n=new Array(a=(r=this[0]).length);++i<a;)(u=r[i])&&(n[i]=u.__data__);return n}var c=U([]),s=_([]),l=_([]);if("function"==typeof n)for(;++i<a;)e(r=this[i],n.call(r,r.parentNode.__data__,i));else for(;++i<a;)e(r=this[i],n);return s.enter=function(){return c},s.exit=function(){return l},s},pa.datum=function(n){return arguments.length?this.property("__data__",n):this.property("__data__")},pa.filter=function(n){var t,e,r,u=[];"function"!=typeof n&&(n=R(n));for(var i=0,o=this.length;o>i;i++){u.push(t=[]),t.parentNode=(e=this[i]).parentNode;for(var 
a=0,c=e.length;c>a;a++)(r=e[a])&&n.call(r,r.__data__,a,i)&&t.push(r)}return _(u)},pa.order=function(){for(var n=-1,t=this.length;++n<t;)for(var e,r=this[n],u=r.length-1,i=r[u];--u>=0;)(e=r[u])&&(i&&i!==e.nextSibling&&i.parentNode.insertBefore(e,i),i=e);return this},pa.sort=function(n){n=D.apply(this,arguments);for(var t=-1,e=this.length;++t<e;)this[t].sort(n);return this.order()},pa.each=function(n){return P(this,function(t,e,r){n.call(t,t.__data__,e,r)})},pa.call=function(n){var t=Xo(arguments);return n.apply(t[0]=this,t),this},pa.empty=function(){return!this.node()},pa.node=function(){for(var n=0,t=this.length;t>n;n++)for(var e=this[n],r=0,u=e.length;u>r;r++){var i=e[r];if(i)return i}return null},pa.size=function(){var n=0;return this.each(function(){++n}),n};var da=[];Zo.selection.enter=U,Zo.selection.enter.prototype=da,da.append=pa.append,da.empty=pa.empty,da.node=pa.node,da.call=pa.call,da.size=pa.size,da.select=function(n){for(var t,e,r,u,i,o=[],a=-1,c=this.length;++a<c;){r=(u=this[a]).update,o.push(t=[]),t.parentNode=u.parentNode;for(var s=-1,l=u.length;++s<l;)(i=u[s])?(t.push(r[s]=e=n.call(u.parentNode,i.__data__,s,a)),e.__data__=i.__data__):t.push(null)}return _(o)},da.insert=function(n,t){return arguments.length<2&&(t=j(this)),pa.insert.call(this,n,t)},pa.transition=function(){for(var n,t,e=Ss||++Ns,r=[],u=ks||{time:Date.now(),ease:xu,delay:0,duration:250},i=-1,o=this.length;++i<o;){r.push(n=[]);for(var a=this[i],c=-1,s=a.length;++c<s;)(t=a[c])&&Po(t,c,e,u),n.push(t)}return qo(r,e)},pa.interrupt=function(){return this.each(H)},Zo.select=function(n){var t=["string"==typeof n?la(n,$o):n];return t.parentNode=Bo,_([t])},Zo.selectAll=function(n){var t=Xo("string"==typeof n?fa(n,$o):n);return t.parentNode=Bo,_([t])};var ma=Zo.select(Bo);pa.on=function(n,t,e){var r=arguments.length;if(3>r){if("string"!=typeof n){2>r&&(t=!1);for(e in n)this.each(F(e,n[e],t));return this}if(2>r)return(r=this.node()["__on"+n])&&r._;e=!1}return this.each(F(n,t,e))};var 
ya=Zo.map({mouseenter:"mouseover",mouseleave:"mouseout"});ya.forEach(function(n){"on"+n in $o&&ya.remove(n)});var xa="onselectstart"in $o?null:p(Bo.style,"userSelect"),Ma=0;Zo.mouse=function(n){return Z(n,x())};var _a=/WebKit/.test(Wo.navigator.userAgent)?-1:0;Zo.touches=function(n,t){return arguments.length<2&&(t=x().touches),t?Xo(t).map(function(t){var e=Z(n,t);return e.identifier=t.identifier,e}):[]},Zo.behavior.drag=function(){function n(){this.on("mousedown.drag",u).on("touchstart.drag",i)}function t(n,t,u,i,o){return function(){function a(){var n,e,r=t(h,v);r&&(n=r[0]-x[0],e=r[1]-x[1],p|=n|e,x=r,g({type:"drag",x:r[0]+s[0],y:r[1]+s[1],dx:n,dy:e}))}function c(){t(h,v)&&(m.on(i+d,null).on(o+d,null),y(p&&Zo.event.target===f),g({type:"dragend"}))}var s,l=this,f=Zo.event.target,h=l.parentNode,g=e.of(l,arguments),p=0,v=n(),d=".drag"+(null==v?"":"-"+v),m=Zo.select(u()).on(i+d,a).on(o+d,c),y=I(),x=t(h,v);r?(s=r.apply(l,arguments),s=[s.x-x[0],s.y-x[1]]):s=[0,0],g({type:"dragstart"})}}var e=M(n,"drag","dragstart","dragend"),r=null,u=t(v,Zo.mouse,$,"mousemove","mouseup"),i=t(V,Zo.touch,X,"touchmove","touchend");return n.origin=function(t){return arguments.length?(r=t,n):r},Zo.rebind(n,e,"on")};var ba=Math.PI,wa=2*ba,Sa=ba/2,ka=1e-6,Ea=ka*ka,Aa=ba/180,Ca=180/ba,Na=Math.SQRT2,za=2,La=4;Zo.interpolateZoom=function(n,t){function e(n){var t=n*y;if(m){var e=Q(v),o=i/(za*h)*(e*nt(Na*t+v)-K(v));return[r+o*s,u+o*l,i*e/Q(Na*t+v)]}return[r+n*s,u+n*l,i*Math.exp(Na*t)]}var r=n[0],u=n[1],i=n[2],o=t[0],a=t[1],c=t[2],s=o-r,l=a-u,f=s*s+l*l,h=Math.sqrt(f),g=(c*c-i*i+La*f)/(2*i*za*h),p=(c*c-i*i-La*f)/(2*c*za*h),v=Math.log(Math.sqrt(g*g+1)-g),d=Math.log(Math.sqrt(p*p+1)-p),m=d-v,y=(m||Math.log(c/i))/Na;return e.duration=1e3*y,e},Zo.behavior.zoom=function(){function n(n){n.on(A,s).on(Ra+".zoom",f).on("dblclick.zoom",h).on(z,l)}function t(n){return[(n[0]-S.x)/S.k,(n[1]-S.y)/S.k]}function e(n){return[n[0]*S.k+S.x,n[1]*S.k+S.y]}function r(n){S.k=Math.max(E[0],Math.min(E[1],n))}function 
u(n,t){t=e(t),S.x+=n[0]-t[0],S.y+=n[1]-t[1]}function i(){_&&_.domain(x.range().map(function(n){return(n-S.x)/S.k}).map(x.invert)),w&&w.domain(b.range().map(function(n){return(n-S.y)/S.k}).map(b.invert))}function o(n){n({type:"zoomstart"})}function a(n){i(),n({type:"zoom",scale:S.k,translate:[S.x,S.y]})}function c(n){n({type:"zoomend"})}function s(){function n(){l=1,u(Zo.mouse(r),h),a(s)}function e(){f.on(C,null).on(N,null),g(l&&Zo.event.target===i),c(s)}var r=this,i=Zo.event.target,s=L.of(r,arguments),l=0,f=Zo.select(Wo).on(C,n).on(N,e),h=t(Zo.mouse(r)),g=I();H.call(r),o(s)}function l(){function n(){var n=Zo.touches(g);return h=S.k,n.forEach(function(n){n.identifier in v&&(v[n.identifier]=t(n))}),n}function e(){var t=Zo.event.target;Zo.select(t).on(M,i).on(_,f),b.push(t);for(var e=Zo.event.changedTouches,o=0,c=e.length;c>o;++o)v[e[o].identifier]=null;var s=n(),l=Date.now();if(1===s.length){if(500>l-m){var h=s[0],g=v[h.identifier];r(2*S.k),u(h,g),y(),a(p)}m=l}else if(s.length>1){var h=s[0],x=s[1],w=h[0]-x[0],k=h[1]-x[1];d=w*w+k*k}}function i(){for(var n,t,e,i,o=Zo.touches(g),c=0,s=o.length;s>c;++c,i=null)if(e=o[c],i=v[e.identifier]){if(t)break;n=e,t=i}if(i){var l=(l=e[0]-n[0])*l+(l=e[1]-n[1])*l,f=d&&Math.sqrt(l/d);n=[(n[0]+e[0])/2,(n[1]+e[1])/2],t=[(t[0]+i[0])/2,(t[1]+i[1])/2],r(f*h)}m=null,u(n,t),a(p)}function f(){if(Zo.event.touches.length){for(var t=Zo.event.changedTouches,e=0,r=t.length;r>e;++e)delete v[t[e].identifier];for(var u in v)return void n()}Zo.selectAll(b).on(x,null),w.on(A,s).on(z,l),k(),c(p)}var h,g=this,p=L.of(g,arguments),v={},d=0,x=".zoom-"+Zo.event.changedTouches[0].identifier,M="touchmove"+x,_="touchend"+x,b=[],w=Zo.select(g).on(A,null).on(z,e),k=I();H.call(g),e(),o(p)}function f(){var n=L.of(this,arguments);d?clearTimeout(d):(g=t(p=v||Zo.mouse(this)),H.call(this),o(n)),d=setTimeout(function(){d=null,c(n)},50),y(),r(Math.pow(2,.002*Ta())*S.k),u(p,g),a(n)}function h(){var 
n=L.of(this,arguments),e=Zo.mouse(this),i=t(e),s=Math.log(S.k)/Math.LN2;o(n),r(Math.pow(2,Zo.event.shiftKey?Math.ceil(s)-1:Math.floor(s)+1)),u(e,i),a(n),c(n)}var g,p,v,d,m,x,_,b,w,S={x:0,y:0,k:1},k=[960,500],E=qa,A="mousedown.zoom",C="mousemove.zoom",N="mouseup.zoom",z="touchstart.zoom",L=M(n,"zoomstart","zoom","zoomend");return n.event=function(n){n.each(function(){var n=L.of(this,arguments),t=S;Ss?Zo.select(this).transition().each("start.zoom",function(){S=this.__chart__||{x:0,y:0,k:1},o(n)}).tween("zoom:zoom",function(){var e=k[0],r=k[1],u=e/2,i=r/2,o=Zo.interpolateZoom([(u-S.x)/S.k,(i-S.y)/S.k,e/S.k],[(u-t.x)/t.k,(i-t.y)/t.k,e/t.k]);return function(t){var r=o(t),c=e/r[2];this.__chart__=S={x:u-r[0]*c,y:i-r[1]*c,k:c},a(n)}}).each("end.zoom",function(){c(n)}):(this.__chart__=S,o(n),a(n),c(n))})},n.translate=function(t){return arguments.length?(S={x:+t[0],y:+t[1],k:S.k},i(),n):[S.x,S.y]},n.scale=function(t){return arguments.length?(S={x:S.x,y:S.y,k:+t},i(),n):S.k},n.scaleExtent=function(t){return arguments.length?(E=null==t?qa:[+t[0],+t[1]],n):E},n.center=function(t){return arguments.length?(v=t&&[+t[0],+t[1]],n):v},n.size=function(t){return arguments.length?(k=t&&[+t[0],+t[1]],n):k},n.x=function(t){return arguments.length?(_=t,x=t.copy(),S={x:0,y:0,k:1},n):_},n.y=function(t){return arguments.length?(w=t,b=t.copy(),S={x:0,y:0,k:1},n):w},Zo.rebind(n,L,"on")};var Ta,qa=[0,1/0],Ra="onwheel"in $o?(Ta=function(){return-Zo.event.deltaY*(Zo.event.deltaMode?120:1)},"wheel"):"onmousewheel"in $o?(Ta=function(){return Zo.event.wheelDelta},"mousewheel"):(Ta=function(){return-Zo.event.detail},"MozMousePixelScroll");Zo.color=et,et.prototype.toString=function(){return this.rgb()+""},Zo.hsl=rt;var Da=rt.prototype=new et;Da.brighter=function(n){return n=Math.pow(.7,arguments.length?n:1),new rt(this.h,this.s,this.l/n)},Da.darker=function(n){return n=Math.pow(.7,arguments.length?n:1),new rt(this.h,this.s,n*this.l)},Da.rgb=function(){return ut(this.h,this.s,this.l)},Zo.hcl=it;var 
Pa=it.prototype=new et;Pa.brighter=function(n){return new it(this.h,this.c,Math.min(100,this.l+Ua*(arguments.length?n:1)))},Pa.darker=function(n){return new it(this.h,this.c,Math.max(0,this.l-Ua*(arguments.length?n:1)))},Pa.rgb=function(){return ot(this.h,this.c,this.l).rgb()},Zo.lab=at;var Ua=18,ja=.95047,Ha=1,Fa=1.08883,Oa=at.prototype=new et;Oa.brighter=function(n){return new at(Math.min(100,this.l+Ua*(arguments.length?n:1)),this.a,this.b)},Oa.darker=function(n){return new at(Math.max(0,this.l-Ua*(arguments.length?n:1)),this.a,this.b)},Oa.rgb=function(){return ct(this.l,this.a,this.b)},Zo.rgb=gt;var Ya=gt.prototype=new et;Ya.brighter=function(n){n=Math.pow(.7,arguments.length?n:1);var t=this.r,e=this.g,r=this.b,u=30;return t||e||r?(t&&u>t&&(t=u),e&&u>e&&(e=u),r&&u>r&&(r=u),new gt(Math.min(255,t/n),Math.min(255,e/n),Math.min(255,r/n))):new gt(u,u,u)},Ya.darker=function(n){return n=Math.pow(.7,arguments.length?n:1),new gt(n*this.r,n*this.g,n*this.b)},Ya.hsl=function(){return yt(this.r,this.g,this.b)},Ya.toString=function(){return"#"+dt(this.r)+dt(this.g)+dt(this.b)};var 
Ia=Zo.map({aliceblue:15792383,antiquewhite:16444375,aqua:65535,aquamarine:8388564,azure:15794175,beige:16119260,bisque:16770244,black:0,blanchedalmond:16772045,blue:255,blueviolet:9055202,brown:10824234,burlywood:14596231,cadetblue:6266528,chartreuse:8388352,chocolate:13789470,coral:16744272,cornflowerblue:6591981,cornsilk:16775388,crimson:14423100,cyan:65535,darkblue:139,darkcyan:35723,darkgoldenrod:12092939,darkgray:11119017,darkgreen:25600,darkgrey:11119017,darkkhaki:12433259,darkmagenta:9109643,darkolivegreen:5597999,darkorange:16747520,darkorchid:10040012,darkred:9109504,darksalmon:15308410,darkseagreen:9419919,darkslateblue:4734347,darkslategray:3100495,darkslategrey:3100495,darkturquoise:52945,darkviolet:9699539,deeppink:16716947,deepskyblue:49151,dimgray:6908265,dimgrey:6908265,dodgerblue:2003199,firebrick:11674146,floralwhite:16775920,forestgreen:2263842,fuchsia:16711935,gainsboro:14474460,ghostwhite:16316671,gold:16766720,goldenrod:14329120,gray:8421504,green:32768,greenyellow:11403055,grey:8421504,honeydew:15794160,hotpink:16738740,indianred:13458524,indigo:4915330,ivory:16777200,khaki:15787660,lavender:15132410,lavenderblush:16773365,lawngreen:8190976,lemonchiffon:16775885,lightblue:11393254,lightcoral:15761536,lightcyan:14745599,lightgoldenrodyellow:16448210,lightgray:13882323,lightgreen:9498256,lightgrey:13882323,lightpink:16758465,lightsalmon:16752762,lightseagreen:2142890,lightskyblue:8900346,lightslategray:7833753,lightslategrey:7833753,lightsteelblue:11584734,lightyellow:16777184,lime:65280,limegreen:3329330,linen:16445670,magenta:16711935,maroon:8388608,mediumaquamarine:6737322,mediumblue:205,mediumorchid:12211667,mediumpurple:9662683,mediumseagreen:3978097,mediumslateblue:8087790,mediumspringgreen:64154,mediumturquoise:4772300,mediumvioletred:13047173,midnightblue:1644912,mintcream:16121850,mistyrose:16770273,moccasin:16770229,navajowhite:16768685,navy:128,oldlace:16643558,olive:8421376,olivedrab:7048739,orange:16753920,orangered:16729344,orchid:
14315734,palegoldenrod:15657130,palegreen:10025880,paleturquoise:11529966,palevioletred:14381203,papayawhip:16773077,peachpuff:16767673,peru:13468991,pink:16761035,plum:14524637,powderblue:11591910,purple:8388736,red:16711680,rosybrown:12357519,royalblue:4286945,saddlebrown:9127187,salmon:16416882,sandybrown:16032864,seagreen:3050327,seashell:16774638,sienna:10506797,silver:12632256,skyblue:8900331,slateblue:6970061,slategray:7372944,slategrey:7372944,snow:16775930,springgreen:65407,steelblue:4620980,tan:13808780,teal:32896,thistle:14204888,tomato:16737095,turquoise:4251856,violet:15631086,wheat:16113331,white:16777215,whitesmoke:16119285,yellow:16776960,yellowgreen:10145074});Ia.forEach(function(n,t){Ia.set(n,pt(t))}),Zo.functor=bt,Zo.xhr=St(wt),Zo.dsv=function(n,t){function e(n,e,i){arguments.length<3&&(i=e,e=null);var o=kt(n,t,null==e?r:u(e),i);return o.row=function(n){return arguments.length?o.response(null==(e=n)?r:u(n)):e},o}function r(n){return e.parse(n.responseText)}function u(n){return function(t){return e.parse(t.responseText,n)}}function i(t){return t.map(o).join(n)}function o(n){return a.test(n)?'"'+n.replace(/\"/g,'""')+'"':n}var a=new RegExp('["'+n+"\n]"),c=n.charCodeAt(0);return e.parse=function(n,t){var r;return e.parseRows(n,function(n,e){if(r)return r(n,e-1);var u=new Function("d","return {"+n.map(function(n,t){return JSON.stringify(n)+": d["+t+"]"}).join(",")+"}");r=t?function(n,e){return t(u(n),e)}:u})},e.parseRows=function(n,t){function e(){if(l>=s)return o;if(u)return u=!1,i;var t=l;if(34===n.charCodeAt(t)){for(var e=t;e++<s;)if(34===n.charCodeAt(e)){if(34!==n.charCodeAt(e+1))break;++e}l=e+2;var r=n.charCodeAt(e+1);return 13===r?(u=!0,10===n.charCodeAt(e+2)&&++l):10===r&&(u=!0),n.substring(t+1,e).replace(/""/g,'"')}for(;s>l;){var r=n.charCodeAt(l++),a=1;if(10===r)u=!0;else if(13===r)u=!0,10===n.charCodeAt(l)&&(++l,++a);else if(r!==c)continue;return n.substring(t,l-a)}return n.substring(t)}for(var 
r,u,i={},o={},a=[],s=n.length,l=0,f=0;(r=e())!==o;){for(var h=[];r!==i&&r!==o;)h.push(r),r=e();(!t||(h=t(h,f++)))&&a.push(h)}return a},e.format=function(t){if(Array.isArray(t[0]))return e.formatRows(t);var r=new h,u=[];return t.forEach(function(n){for(var t in n)r.has(t)||u.push(r.add(t))}),[u.map(o).join(n)].concat(t.map(function(t){return u.map(function(n){return o(t[n])}).join(n)})).join("\n")},e.formatRows=function(n){return n.map(i).join("\n")},e},Zo.csv=Zo.dsv(",","text/csv"),Zo.tsv=Zo.dsv("	","text/tab-separated-values"),Zo.touch=function(n,t,e){if(arguments.length<3&&(e=t,t=x().changedTouches),t)for(var r,u=0,i=t.length;i>u;++u)if((r=t[u]).identifier===e)return Z(n,r)};var Za,Va,Xa,$a,Ba,Wa=Wo[p(Wo,"requestAnimationFrame")]||function(n){setTimeout(n,17)};Zo.timer=function(n,t,e){var r=arguments.length;2>r&&(t=0),3>r&&(e=Date.now());var u=e+t,i={c:n,t:u,f:!1,n:null};Va?Va.n=i:Za=i,Va=i,Xa||($a=clearTimeout($a),Xa=1,Wa(At))},Zo.timer.flush=function(){Ct(),Nt()},Zo.round=function(n,t){return t?Math.round(n*(t=Math.pow(10,t)))/t:Math.round(n)};var Ja=["y","z","a","f","p","n","\xb5","m","","k","M","G","T","P","E","Z","Y"].map(Lt);Zo.formatPrefix=function(n,t){var e=0;return n&&(0>n&&(n*=-1),t&&(n=Zo.round(n,zt(n,t))),e=1+Math.floor(1e-12+Math.log(n)/Math.LN10),e=Math.max(-24,Math.min(24,3*Math.floor((e-1)/3)))),Ja[8+e/3]};var Ga=/(?:([^{])?([<>=^]))?([+\- ])?([$#])?(0)?(\d+)?(,)?(\.-?\d+)?([a-z%])?/i,Ka=Zo.map({b:function(n){return n.toString(2)},c:function(n){return String.fromCharCode(n)},o:function(n){return n.toString(8)},x:function(n){return n.toString(16)},X:function(n){return n.toString(16).toUpperCase()},g:function(n,t){return n.toPrecision(t)},e:function(n,t){return n.toExponential(t)},f:function(n,t){return n.toFixed(t)},r:function(n,t){return(n=Zo.round(n,zt(n,t))).toFixed(Math.max(0,Math.min(20,zt(n*(1+1e-15),t))))}}),Qa=Zo.time={},nc=Date;Rt.prototype={getDate:function(){return this._.getUTCDate()},getDay:function(){return 
this._.getUTCDay()},getFullYear:function(){return this._.getUTCFullYear()},getHours:function(){return this._.getUTCHours()},getMilliseconds:function(){return this._.getUTCMilliseconds()},getMinutes:function(){return this._.getUTCMinutes()},getMonth:function(){return this._.getUTCMonth()},getSeconds:function(){return this._.getUTCSeconds()},getTime:function(){return this._.getTime()},getTimezoneOffset:function(){return 0},valueOf:function(){return this._.valueOf()},setDate:function(){tc.setUTCDate.apply(this._,arguments)},setDay:function(){tc.setUTCDay.apply(this._,arguments)},setFullYear:function(){tc.setUTCFullYear.apply(this._,arguments)},setHours:function(){tc.setUTCHours.apply(this._,arguments)},setMilliseconds:function(){tc.setUTCMilliseconds.apply(this._,arguments)},setMinutes:function(){tc.setUTCMinutes.apply(this._,arguments)},setMonth:function(){tc.setUTCMonth.apply(this._,arguments)},setSeconds:function(){tc.setUTCSeconds.apply(this._,arguments)},setTime:function(){tc.setTime.apply(this._,arguments)}};var tc=Date.prototype;Qa.year=Dt(function(n){return n=Qa.day(n),n.setMonth(0,1),n},function(n,t){n.setFullYear(n.getFullYear()+t)},function(n){return n.getFullYear()}),Qa.years=Qa.year.range,Qa.years.utc=Qa.year.utc.range,Qa.day=Dt(function(n){var t=new nc(2e3,0);return t.setFullYear(n.getFullYear(),n.getMonth(),n.getDate()),t},function(n,t){n.setDate(n.getDate()+t)},function(n){return n.getDate()-1}),Qa.days=Qa.day.range,Qa.days.utc=Qa.day.utc.range,Qa.dayOfYear=function(n){var t=Qa.year(n);return Math.floor((n-t-6e4*(n.getTimezoneOffset()-t.getTimezoneOffset()))/864e5)},["sunday","monday","tuesday","wednesday","thursday","friday","saturday"].forEach(function(n,t){t=7-t;var e=Qa[n]=Dt(function(n){return(n=Qa.day(n)).setDate(n.getDate()-(n.getDay()+t)%7),n},function(n,t){n.setDate(n.getDate()+7*Math.floor(t))},function(n){var e=Qa.year(n).getDay();return 
Math.floor((Qa.dayOfYear(n)+(e+t)%7)/7)-(e!==t)});Qa[n+"s"]=e.range,Qa[n+"s"].utc=e.utc.range,Qa[n+"OfYear"]=function(n){var e=Qa.year(n).getDay();return Math.floor((Qa.dayOfYear(n)+(e+t)%7)/7)}}),Qa.week=Qa.sunday,Qa.weeks=Qa.sunday.range,Qa.weeks.utc=Qa.sunday.utc.range,Qa.weekOfYear=Qa.sundayOfYear;var ec={"-":"",_:" ",0:"0"},rc=/^\s*\d+/,uc=/^%/;Zo.locale=function(n){return{numberFormat:Tt(n),timeFormat:Ut(n)}};var ic=Zo.locale({decimal:".",thousands:",",grouping:[3],currency:["$",""],dateTime:"%a %b %e %X %Y",date:"%m/%d/%Y",time:"%H:%M:%S",periods:["AM","PM"],days:["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"],shortDays:["Sun","Mon","Tue","Wed","Thu","Fri","Sat"],months:["January","February","March","April","May","June","July","August","September","October","November","December"],shortMonths:["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]});Zo.format=ic.numberFormat,Zo.geo={},ue.prototype={s:0,t:0,add:function(n){ie(n,this.t,oc),ie(oc.s,this.s,this),this.s?this.t+=oc.t:this.s=oc.t},reset:function(){this.s=this.t=0},valueOf:function(){return this.s}};var oc=new ue;Zo.geo.stream=function(n,t){n&&ac.hasOwnProperty(n.type)?ac[n.type](n,t):oe(n,t)};var ac={Feature:function(n,t){oe(n.geometry,t)},FeatureCollection:function(n,t){for(var e=n.features,r=-1,u=e.length;++r<u;)oe(e[r].geometry,t)}},cc={Sphere:function(n,t){t.sphere()},Point:function(n,t){n=n.coordinates,t.point(n[0],n[1],n[2])},MultiPoint:function(n,t){for(var e=n.coordinates,r=-1,u=e.length;++r<u;)n=e[r],t.point(n[0],n[1],n[2])},LineString:function(n,t){ae(n.coordinates,t,0)},MultiLineString:function(n,t){for(var e=n.coordinates,r=-1,u=e.length;++r<u;)ae(e[r],t,0)},Polygon:function(n,t){ce(n.coordinates,t)},MultiPolygon:function(n,t){for(var e=n.coordinates,r=-1,u=e.length;++r<u;)ce(e[r],t)},GeometryCollection:function(n,t){for(var e=n.geometries,r=-1,u=e.length;++r<u;)oe(e[r],t)}};Zo.geo.area=function(n){return sc=0,Zo.geo.stream(n,fc),sc};var 
sc,lc=new ue,fc={sphere:function(){sc+=4*ba},point:v,lineStart:v,lineEnd:v,polygonStart:function(){lc.reset(),fc.lineStart=se},polygonEnd:function(){var n=2*lc;sc+=0>n?4*ba+n:n,fc.lineStart=fc.lineEnd=fc.point=v}};Zo.geo.bounds=function(){function n(n,t){x.push(M=[l=n,h=n]),f>t&&(f=t),t>g&&(g=t)}function t(t,e){var r=le([t*Aa,e*Aa]);if(m){var u=he(m,r),i=[u[1],-u[0],0],o=he(i,u);ve(o),o=de(o);var c=t-p,s=c>0?1:-1,v=o[0]*Ca*s,d=ua(c)>180;if(d^(v>s*p&&s*t>v)){var y=o[1]*Ca;y>g&&(g=y)}else if(v=(v+360)%360-180,d^(v>s*p&&s*t>v)){var y=-o[1]*Ca;f>y&&(f=y)}else f>e&&(f=e),e>g&&(g=e);d?p>t?a(l,t)>a(l,h)&&(h=t):a(t,h)>a(l,h)&&(l=t):h>=l?(l>t&&(l=t),t>h&&(h=t)):t>p?a(l,t)>a(l,h)&&(h=t):a(t,h)>a(l,h)&&(l=t)}else n(t,e);m=r,p=t}function e(){_.point=t}function r(){M[0]=l,M[1]=h,_.point=n,m=null}function u(n,e){if(m){var r=n-p;y+=ua(r)>180?r+(r>0?360:-360):r}else v=n,d=e;fc.point(n,e),t(n,e)}function i(){fc.lineStart()}function o(){u(v,d),fc.lineEnd(),ua(y)>ka&&(l=-(h=180)),M[0]=l,M[1]=h,m=null}function a(n,t){return(t-=n)<0?t+360:t}function c(n,t){return n[0]-t[0]}function s(n,t){return t[0]<=t[1]?t[0]<=n&&n<=t[1]:n<t[0]||t[1]<n}var l,f,h,g,p,v,d,m,y,x,M,_={point:n,lineStart:e,lineEnd:r,polygonStart:function(){_.point=u,_.lineStart=i,_.lineEnd=o,y=0,fc.polygonStart()},polygonEnd:function(){fc.polygonEnd(),_.point=n,_.lineStart=e,_.lineEnd=r,0>lc?(l=-(h=180),f=-(g=90)):y>ka?g=90:-ka>y&&(f=-90),M[0]=l,M[1]=h}};return function(n){g=h=-(l=f=1/0),x=[],Zo.geo.stream(n,_);var t=x.length;if(t){x.sort(c);for(var e,r=1,u=x[0],i=[u];t>r;++r)e=x[r],s(e[0],u)||s(e[1],u)?(a(u[0],e[1])>a(u[0],u[1])&&(u[1]=e[1]),a(e[0],u[1])>a(u[0],u[1])&&(u[0]=e[0])):i.push(u=e);
+for(var o,e,p=-1/0,t=i.length-1,r=0,u=i[t];t>=r;u=e,++r)e=i[r],(o=a(u[1],e[0]))>p&&(p=o,l=e[0],h=u[1])}return x=M=null,1/0===l||1/0===f?[[0/0,0/0],[0/0,0/0]]:[[l,f],[h,g]]}}(),Zo.geo.centroid=function(n){hc=gc=pc=vc=dc=mc=yc=xc=Mc=_c=bc=0,Zo.geo.stream(n,wc);var t=Mc,e=_c,r=bc,u=t*t+e*e+r*r;return Ea>u&&(t=mc,e=yc,r=xc,ka>gc&&(t=pc,e=vc,r=dc),u=t*t+e*e+r*r,Ea>u)?[0/0,0/0]:[Math.atan2(e,t)*Ca,G(r/Math.sqrt(u))*Ca]};var hc,gc,pc,vc,dc,mc,yc,xc,Mc,_c,bc,wc={sphere:v,point:ye,lineStart:Me,lineEnd:_e,polygonStart:function(){wc.lineStart=be},polygonEnd:function(){wc.lineStart=Me}},Sc=Ae(we,Te,Re,[-ba,-ba/2]),kc=1e9;Zo.geo.clipExtent=function(){var n,t,e,r,u,i,o={stream:function(n){return u&&(u.valid=!1),u=i(n),u.valid=!0,u},extent:function(a){return arguments.length?(i=Ue(n=+a[0][0],t=+a[0][1],e=+a[1][0],r=+a[1][1]),u&&(u.valid=!1,u=null),o):[[n,t],[e,r]]}};return o.extent([[0,0],[960,500]])},(Zo.geo.conicEqualArea=function(){return He(Fe)}).raw=Fe,Zo.geo.albers=function(){return Zo.geo.conicEqualArea().rotate([96,0]).center([-.6,38.7]).parallels([29.5,45.5]).scale(1070)},Zo.geo.albersUsa=function(){function n(n){var i=n[0],o=n[1];return t=null,e(i,o),t||(r(i,o),t)||u(i,o),t}var t,e,r,u,i=Zo.geo.albers(),o=Zo.geo.conicEqualArea().rotate([154,0]).center([-2,58.5]).parallels([55,65]),a=Zo.geo.conicEqualArea().rotate([157,0]).center([-3,19.9]).parallels([8,18]),c={point:function(n,e){t=[n,e]}};return n.invert=function(n){var t=i.scale(),e=i.translate(),r=(n[0]-e[0])/t,u=(n[1]-e[1])/t;return(u>=.12&&.234>u&&r>=-.425&&-.214>r?o:u>=.166&&.234>u&&r>=-.214&&-.115>r?a:i).invert(n)},n.stream=function(n){var 
t=i.stream(n),e=o.stream(n),r=a.stream(n);return{point:function(n,u){t.point(n,u),e.point(n,u),r.point(n,u)},sphere:function(){t.sphere(),e.sphere(),r.sphere()},lineStart:function(){t.lineStart(),e.lineStart(),r.lineStart()},lineEnd:function(){t.lineEnd(),e.lineEnd(),r.lineEnd()},polygonStart:function(){t.polygonStart(),e.polygonStart(),r.polygonStart()},polygonEnd:function(){t.polygonEnd(),e.polygonEnd(),r.polygonEnd()}}},n.precision=function(t){return arguments.length?(i.precision(t),o.precision(t),a.precision(t),n):i.precision()},n.scale=function(t){return arguments.length?(i.scale(t),o.scale(.35*t),a.scale(t),n.translate(i.translate())):i.scale()},n.translate=function(t){if(!arguments.length)return i.translate();var s=i.scale(),l=+t[0],f=+t[1];return e=i.translate(t).clipExtent([[l-.455*s,f-.238*s],[l+.455*s,f+.238*s]]).stream(c).point,r=o.translate([l-.307*s,f+.201*s]).clipExtent([[l-.425*s+ka,f+.12*s+ka],[l-.214*s-ka,f+.234*s-ka]]).stream(c).point,u=a.translate([l-.205*s,f+.212*s]).clipExtent([[l-.214*s+ka,f+.166*s+ka],[l-.115*s-ka,f+.234*s-ka]]).stream(c).point,n},n.scale(1070)};var Ec,Ac,Cc,Nc,zc,Lc,Tc={point:v,lineStart:v,lineEnd:v,polygonStart:function(){Ac=0,Tc.lineStart=Oe},polygonEnd:function(){Tc.lineStart=Tc.lineEnd=Tc.point=v,Ec+=ua(Ac/2)}},qc={point:Ye,lineStart:v,lineEnd:v,polygonStart:v,polygonEnd:v},Rc={point:Ve,lineStart:Xe,lineEnd:$e,polygonStart:function(){Rc.lineStart=Be},polygonEnd:function(){Rc.point=Ve,Rc.lineStart=Xe,Rc.lineEnd=$e}};Zo.geo.path=function(){function n(n){return n&&("function"==typeof a&&i.pointRadius(+a.apply(this,arguments)),o&&o.valid||(o=u(i)),Zo.geo.stream(n,o)),i.result()}function t(){return o=null,n}var e,r,u,i,o,a=4.5;return n.area=function(n){return Ec=0,Zo.geo.stream(n,u(Tc)),Ec},n.centroid=function(n){return pc=vc=dc=mc=yc=xc=Mc=_c=bc=0,Zo.geo.stream(n,u(Rc)),bc?[Mc/bc,_c/bc]:xc?[mc/xc,yc/xc]:dc?[pc/dc,vc/dc]:[0/0,0/0]},n.bounds=function(n){return 
zc=Lc=-(Cc=Nc=1/0),Zo.geo.stream(n,u(qc)),[[Cc,Nc],[zc,Lc]]},n.projection=function(n){return arguments.length?(u=(e=n)?n.stream||Ge(n):wt,t()):e},n.context=function(n){return arguments.length?(i=null==(r=n)?new Ie:new We(n),"function"!=typeof a&&i.pointRadius(a),t()):r},n.pointRadius=function(t){return arguments.length?(a="function"==typeof t?t:(i.pointRadius(+t),+t),n):a},n.projection(Zo.geo.albersUsa()).context(null)},Zo.geo.transform=function(n){return{stream:function(t){var e=new Ke(t);for(var r in n)e[r]=n[r];return e}}},Ke.prototype={point:function(n,t){this.stream.point(n,t)},sphere:function(){this.stream.sphere()},lineStart:function(){this.stream.lineStart()},lineEnd:function(){this.stream.lineEnd()},polygonStart:function(){this.stream.polygonStart()},polygonEnd:function(){this.stream.polygonEnd()}},Zo.geo.projection=nr,Zo.geo.projectionMutator=tr,(Zo.geo.equirectangular=function(){return nr(rr)}).raw=rr.invert=rr,Zo.geo.rotation=function(n){function t(t){return t=n(t[0]*Aa,t[1]*Aa),t[0]*=Ca,t[1]*=Ca,t}return n=ir(n[0]%360*Aa,n[1]*Aa,n.length>2?n[2]*Aa:0),t.invert=function(t){return t=n.invert(t[0]*Aa,t[1]*Aa),t[0]*=Ca,t[1]*=Ca,t},t},ur.invert=rr,Zo.geo.circle=function(){function n(){var n="function"==typeof r?r.apply(this,arguments):r,t=ir(-n[0]*Aa,-n[1]*Aa,0).invert,u=[];return e(null,null,1,{point:function(n,e){u.push(n=t(n,e)),n[0]*=Ca,n[1]*=Ca}}),{type:"Polygon",coordinates:[u]}}var t,e,r=[0,0],u=6;return n.origin=function(t){return arguments.length?(r=t,n):r},n.angle=function(r){return arguments.length?(e=sr((t=+r)*Aa,u*Aa),n):t},n.precision=function(r){return arguments.length?(e=sr(t*Aa,(u=+r)*Aa),n):u},n.angle(90)},Zo.geo.distance=function(n,t){var e,r=(t[0]-n[0])*Aa,u=n[1]*Aa,i=t[1]*Aa,o=Math.sin(r),a=Math.cos(r),c=Math.sin(u),s=Math.cos(u),l=Math.sin(i),f=Math.cos(i);return Math.atan2(Math.sqrt((e=f*o)*e+(e=s*l-c*f*a)*e),c*l+s*f*a)},Zo.geo.graticule=function(){function n(){return{type:"MultiLineString",coordinates:t()}}function t(){return 
Zo.range(Math.ceil(i/d)*d,u,d).map(h).concat(Zo.range(Math.ceil(s/m)*m,c,m).map(g)).concat(Zo.range(Math.ceil(r/p)*p,e,p).filter(function(n){return ua(n%d)>ka}).map(l)).concat(Zo.range(Math.ceil(a/v)*v,o,v).filter(function(n){return ua(n%m)>ka}).map(f))}var e,r,u,i,o,a,c,s,l,f,h,g,p=10,v=p,d=90,m=360,y=2.5;return n.lines=function(){return t().map(function(n){return{type:"LineString",coordinates:n}})},n.outline=function(){return{type:"Polygon",coordinates:[h(i).concat(g(c).slice(1),h(u).reverse().slice(1),g(s).reverse().slice(1))]}},n.extent=function(t){return arguments.length?n.majorExtent(t).minorExtent(t):n.minorExtent()},n.majorExtent=function(t){return arguments.length?(i=+t[0][0],u=+t[1][0],s=+t[0][1],c=+t[1][1],i>u&&(t=i,i=u,u=t),s>c&&(t=s,s=c,c=t),n.precision(y)):[[i,s],[u,c]]},n.minorExtent=function(t){return arguments.length?(r=+t[0][0],e=+t[1][0],a=+t[0][1],o=+t[1][1],r>e&&(t=r,r=e,e=t),a>o&&(t=a,a=o,o=t),n.precision(y)):[[r,a],[e,o]]},n.step=function(t){return arguments.length?n.majorStep(t).minorStep(t):n.minorStep()},n.majorStep=function(t){return arguments.length?(d=+t[0],m=+t[1],n):[d,m]},n.minorStep=function(t){return arguments.length?(p=+t[0],v=+t[1],n):[p,v]},n.precision=function(t){return arguments.length?(y=+t,l=fr(a,o,90),f=hr(r,e,y),h=fr(s,c,90),g=hr(i,u,y),n):y},n.majorExtent([[-180,-90+ka],[180,90-ka]]).minorExtent([[-180,-80-ka],[180,80+ka]])},Zo.geo.greatArc=function(){function n(){return{type:"LineString",coordinates:[t||r.apply(this,arguments),e||u.apply(this,arguments)]}}var t,e,r=gr,u=pr;return n.distance=function(){return Zo.geo.distance(t||r.apply(this,arguments),e||u.apply(this,arguments))},n.source=function(e){return arguments.length?(r=e,t="function"==typeof e?null:e,n):r},n.target=function(t){return arguments.length?(u=t,e="function"==typeof t?null:t,n):u},n.precision=function(){return arguments.length?n:0},n},Zo.geo.interpolate=function(n,t){return vr(n[0]*Aa,n[1]*Aa,t[0]*Aa,t[1]*Aa)},Zo.geo.length=function(n){return 
Dc=0,Zo.geo.stream(n,Pc),Dc};var Dc,Pc={sphere:v,point:v,lineStart:dr,lineEnd:v,polygonStart:v,polygonEnd:v},Uc=mr(function(n){return Math.sqrt(2/(1+n))},function(n){return 2*Math.asin(n/2)});(Zo.geo.azimuthalEqualArea=function(){return nr(Uc)}).raw=Uc;var jc=mr(function(n){var t=Math.acos(n);return t&&t/Math.sin(t)},wt);(Zo.geo.azimuthalEquidistant=function(){return nr(jc)}).raw=jc,(Zo.geo.conicConformal=function(){return He(yr)}).raw=yr,(Zo.geo.conicEquidistant=function(){return He(xr)}).raw=xr;var Hc=mr(function(n){return 1/n},Math.atan);(Zo.geo.gnomonic=function(){return nr(Hc)}).raw=Hc,Mr.invert=function(n,t){return[n,2*Math.atan(Math.exp(t))-Sa]},(Zo.geo.mercator=function(){return _r(Mr)}).raw=Mr;var Fc=mr(function(){return 1},Math.asin);(Zo.geo.orthographic=function(){return nr(Fc)}).raw=Fc;var Oc=mr(function(n){return 1/(1+n)},function(n){return 2*Math.atan(n)});(Zo.geo.stereographic=function(){return nr(Oc)}).raw=Oc,br.invert=function(n,t){return[-t,2*Math.atan(Math.exp(n))-Sa]},(Zo.geo.transverseMercator=function(){var n=_r(br),t=n.center,e=n.rotate;return n.center=function(n){return n?t([-n[1],n[0]]):(n=t(),[n[1],-n[0]])},n.rotate=function(n){return n?e([n[0],n[1],n.length>2?n[2]+90:90]):(n=e(),[n[0],n[1],n[2]-90])},e([0,0,90])}).raw=br,Zo.geom={},Zo.geom.hull=function(n){function t(n){if(n.length<3)return[];var t,u=bt(e),i=bt(r),o=n.length,a=[],c=[];for(t=0;o>t;t++)a.push([+u.call(this,n[t],t),+i.call(this,n[t],t),t]);for(a.sort(Er),t=0;o>t;t++)c.push([a[t][0],-a[t][1]]);var s=kr(a),l=kr(c),f=l[0]===s[0],h=l[l.length-1]===s[s.length-1],g=[];for(t=s.length-1;t>=0;--t)g.push(n[a[s[t]][2]]);for(t=+f;t<l.length-h;++t)g.push(n[a[l[t]][2]]);return g}var e=wr,r=Sr;return arguments.length?t(n):(t.x=function(n){return arguments.length?(e=n,t):e},t.y=function(n){return arguments.length?(r=n,t):r},t)},Zo.geom.polygon=function(n){return sa(n,Yc),n};var Yc=Zo.geom.polygon.prototype=[];Yc.area=function(){for(var 
n,t=-1,e=this.length,r=this[e-1],u=0;++t<e;)n=r,r=this[t],u+=n[1]*r[0]-n[0]*r[1];return.5*u},Yc.centroid=function(n){var t,e,r=-1,u=this.length,i=0,o=0,a=this[u-1];for(arguments.length||(n=-1/(6*this.area()));++r<u;)t=a,a=this[r],e=t[0]*a[1]-a[0]*t[1],i+=(t[0]+a[0])*e,o+=(t[1]+a[1])*e;return[i*n,o*n]},Yc.clip=function(n){for(var t,e,r,u,i,o,a=Nr(n),c=-1,s=this.length-Nr(this),l=this[s-1];++c<s;){for(t=n.slice(),n.length=0,u=this[c],i=t[(r=t.length-a)-1],e=-1;++e<r;)o=t[e],Ar(o,l,u)?(Ar(i,l,u)||n.push(Cr(i,o,l,u)),n.push(o)):Ar(i,l,u)&&n.push(Cr(i,o,l,u)),i=o;a&&n.push(n[0]),l=u}return n};var Ic,Zc,Vc,Xc,$c,Bc=[],Wc=[];Ur.prototype.prepare=function(){for(var n,t=this.edges,e=t.length;e--;)n=t[e].edge,n.b&&n.a||t.splice(e,1);return t.sort(Hr),t.length},Wr.prototype={start:function(){return this.edge.l===this.site?this.edge.a:this.edge.b},end:function(){return this.edge.l===this.site?this.edge.b:this.edge.a}},Jr.prototype={insert:function(n,t){var e,r,u;if(n){if(t.P=n,t.N=n.N,n.N&&(n.N.P=t),n.N=t,n.R){for(n=n.R;n.L;)n=n.L;n.L=t}else n.R=t;e=n}else this._?(n=nu(this._),t.P=null,t.N=n,n.P=n.L=t,e=n):(t.P=t.N=null,this._=t,e=null);for(t.L=t.R=null,t.U=e,t.C=!0,n=t;e&&e.C;)r=e.U,e===r.L?(u=r.R,u&&u.C?(e.C=u.C=!1,r.C=!0,n=r):(n===e.R&&(Kr(this,e),n=e,e=n.U),e.C=!1,r.C=!0,Qr(this,r))):(u=r.L,u&&u.C?(e.C=u.C=!1,r.C=!0,n=r):(n===e.L&&(Qr(this,e),n=e,e=n.U),e.C=!1,r.C=!0,Kr(this,r))),e=n.U;this._.C=!1},remove:function(n){n.N&&(n.N.P=n.P),n.P&&(n.P.N=n.N),n.N=n.P=null;var t,e,r,u=n.U,i=n.L,o=n.R;if(e=i?o?nu(o):i:o,u?u.L===n?u.L=e:u.R=e:this._=e,i&&o?(r=e.C,e.C=n.C,e.L=i,i.U=e,e!==o?(u=e.U,e.U=n.U,n=e.R,u.L=n,e.R=o,o.U=e):(e.U=u,u=e,n=e.R)):(r=n.C,n=e),n&&(n.U=u),!r){if(n&&n.C)return n.C=!1,void 0;do{if(n===this._)break;if(n===u.L){if(t=u.R,t.C&&(t.C=!1,u.C=!0,Kr(this,u),t=u.R),t.L&&t.L.C||t.R&&t.R.C){t.R&&t.R.C||(t.L.C=!1,t.C=!0,Qr(this,t),t=u.R),t.C=u.C,u.C=t.R.C=!1,Kr(this,u),n=this._;break}}else 
if(t=u.L,t.C&&(t.C=!1,u.C=!0,Qr(this,u),t=u.L),t.L&&t.L.C||t.R&&t.R.C){t.L&&t.L.C||(t.R.C=!1,t.C=!0,Kr(this,t),t=u.L),t.C=u.C,u.C=t.L.C=!1,Qr(this,u),n=this._;break}t.C=!0,n=u,u=u.U}while(!n.C);n&&(n.C=!1)}}},Zo.geom.voronoi=function(n){function t(n){var t=new Array(n.length),r=a[0][0],u=a[0][1],i=a[1][0],o=a[1][1];return tu(e(n),a).cells.forEach(function(e,a){var c=e.edges,s=e.site,l=t[a]=c.length?c.map(function(n){var t=n.start();return[t.x,t.y]}):s.x>=r&&s.x<=i&&s.y>=u&&s.y<=o?[[r,o],[i,o],[i,u],[r,u]]:[];l.point=n[a]}),t}function e(n){return n.map(function(n,t){return{x:Math.round(i(n,t)/ka)*ka,y:Math.round(o(n,t)/ka)*ka,i:t}})}var r=wr,u=Sr,i=r,o=u,a=Jc;return n?t(n):(t.links=function(n){return tu(e(n)).edges.filter(function(n){return n.l&&n.r}).map(function(t){return{source:n[t.l.i],target:n[t.r.i]}})},t.triangles=function(n){var t=[];return tu(e(n)).cells.forEach(function(e,r){for(var u,i,o=e.site,a=e.edges.sort(Hr),c=-1,s=a.length,l=a[s-1].edge,f=l.l===o?l.r:l.l;++c<s;)u=l,i=f,l=a[c].edge,f=l.l===o?l.r:l.l,r<i.i&&r<f.i&&ru(o,i,f)<0&&t.push([n[r],n[i.i],n[f.i]])}),t},t.x=function(n){return arguments.length?(i=bt(r=n),t):r},t.y=function(n){return arguments.length?(o=bt(u=n),t):u},t.clipExtent=function(n){return arguments.length?(a=null==n?Jc:n,t):a===Jc?null:a},t.size=function(n){return arguments.length?t.clipExtent(n&&[[0,0],n]):a===Jc?null:a&&a[1]},t)};var Jc=[[-1e6,-1e6],[1e6,1e6]];Zo.geom.delaunay=function(n){return Zo.geom.voronoi().triangles(n)},Zo.geom.quadtree=function(n,t,e,r,u){function i(n){function i(n,t,e,r,u,i,o,a){if(!isNaN(e)&&!isNaN(r))if(n.leaf){var c=n.x,l=n.y;if(null!=c)if(ua(c-e)+ua(l-r)<.01)s(n,t,e,r,u,i,o,a);else{var f=n.point;n.x=n.y=n.point=null,s(n,f,c,l,u,i,o,a),s(n,t,e,r,u,i,o,a)}else n.x=e,n.y=r,n.point=t}else s(n,t,e,r,u,i,o,a)}function s(n,t,e,r,u,o,a,c){var s=.5*(u+a),l=.5*(o+c),f=e>=s,h=r>=l,g=(h<<1)+f;n.leaf=!1,n=n.nodes[g]||(n.nodes[g]=ou()),f?u=s:a=s,h?o=l:c=l,i(n,t,e,r,u,o,a,c)}var 
l,f,h,g,p,v,d,m,y,x=bt(a),M=bt(c);if(null!=t)v=t,d=e,m=r,y=u;else if(m=y=-(v=d=1/0),f=[],h=[],p=n.length,o)for(g=0;p>g;++g)l=n[g],l.x<v&&(v=l.x),l.y<d&&(d=l.y),l.x>m&&(m=l.x),l.y>y&&(y=l.y),f.push(l.x),h.push(l.y);else for(g=0;p>g;++g){var _=+x(l=n[g],g),b=+M(l,g);v>_&&(v=_),d>b&&(d=b),_>m&&(m=_),b>y&&(y=b),f.push(_),h.push(b)}var w=m-v,S=y-d;w>S?y=d+w:m=v+S;var k=ou();if(k.add=function(n){i(k,n,+x(n,++g),+M(n,g),v,d,m,y)},k.visit=function(n){au(n,k,v,d,m,y)},g=-1,null==t){for(;++g<p;)i(k,n[g],f[g],h[g],v,d,m,y);--g}else n.forEach(k.add);return f=h=n=l=null,k}var o,a=wr,c=Sr;return(o=arguments.length)?(a=uu,c=iu,3===o&&(u=e,r=t,e=t=0),i(n)):(i.x=function(n){return arguments.length?(a=n,i):a},i.y=function(n){return arguments.length?(c=n,i):c},i.extent=function(n){return arguments.length?(null==n?t=e=r=u=null:(t=+n[0][0],e=+n[0][1],r=+n[1][0],u=+n[1][1]),i):null==t?null:[[t,e],[r,u]]},i.size=function(n){return arguments.length?(null==n?t=e=r=u=null:(t=e=0,r=+n[0],u=+n[1]),i):null==t?null:[r-t,u-e]},i)},Zo.interpolateRgb=cu,Zo.interpolateObject=su,Zo.interpolateNumber=lu,Zo.interpolateString=fu;var Gc=/[-+]?(?:\d+\.?\d*|\.?\d+)(?:[eE][-+]?\d+)?/g,Kc=new RegExp(Gc.source,"g");Zo.interpolate=hu,Zo.interpolators=[function(n,t){var e=typeof t;return("string"===e?Ia.has(t)||/^(#|rgb\(|hsl\()/.test(t)?cu:fu:t instanceof et?cu:Array.isArray(t)?gu:"object"===e&&isNaN(t)?su:lu)(n,t)}],Zo.interpolateArray=gu;var Qc=function(){return wt},ns=Zo.map({linear:Qc,poly:Mu,quad:function(){return mu},cubic:function(){return yu},sin:function(){return _u},exp:function(){return bu},circle:function(){return wu},elastic:Su,back:ku,bounce:function(){return Eu}}),ts=Zo.map({"in":wt,out:vu,"in-out":du,"out-in":function(n){return du(vu(n))}});Zo.ease=function(n){var t=n.indexOf("-"),e=t>=0?n.substring(0,t):n,r=t>=0?n.substring(t+1):"in";return 
e=ns.get(e)||Qc,r=ts.get(r)||wt,pu(r(e.apply(null,Vo.call(arguments,1))))},Zo.interpolateHcl=Au,Zo.interpolateHsl=Cu,Zo.interpolateLab=Nu,Zo.interpolateRound=zu,Zo.transform=function(n){var t=$o.createElementNS(Zo.ns.prefix.svg,"g");return(Zo.transform=function(n){if(null!=n){t.setAttribute("transform",n);var e=t.transform.baseVal.consolidate()}return new Lu(e?e.matrix:es)})(n)},Lu.prototype.toString=function(){return"translate("+this.translate+")rotate("+this.rotate+")skewX("+this.skew+")scale("+this.scale+")"};var es={a:1,b:0,c:0,d:1,e:0,f:0};Zo.interpolateTransform=Du,Zo.layout={},Zo.layout.bundle=function(){return function(n){for(var t=[],e=-1,r=n.length;++e<r;)t.push(ju(n[e]));return t}},Zo.layout.chord=function(){function n(){var n,s,f,h,g,p={},v=[],d=Zo.range(i),m=[];for(e=[],r=[],n=0,h=-1;++h<i;){for(s=0,g=-1;++g<i;)s+=u[h][g];v.push(s),m.push(Zo.range(i)),n+=s}for(o&&d.sort(function(n,t){return o(v[n],v[t])}),a&&m.forEach(function(n,t){n.sort(function(n,e){return a(u[t][n],u[t][e])})}),n=(wa-l*i)/n,s=0,h=-1;++h<i;){for(f=s,g=-1;++g<i;){var y=d[h],x=m[y][g],M=u[y][x],_=s,b=s+=M*n;p[y+"-"+x]={index:y,subindex:x,startAngle:_,endAngle:b,value:M}}r[y]={index:y,startAngle:f,endAngle:s,value:(s-f)/n},s+=l}for(h=-1;++h<i;)for(g=h-1;++g<i;){var w=p[h+"-"+g],S=p[g+"-"+h];(w.value||S.value)&&e.push(w.value<S.value?{source:S,target:w}:{source:w,target:S})}c&&t()}function t(){e.sort(function(n,t){return c((n.source.value+n.target.value)/2,(t.source.value+t.target.value)/2)})}var e,r,u,i,o,a,c,s={},l=0;return s.matrix=function(n){return arguments.length?(i=(u=n)&&u.length,e=r=null,s):u},s.padding=function(n){return arguments.length?(l=n,e=r=null,s):l},s.sortGroups=function(n){return arguments.length?(o=n,e=r=null,s):o},s.sortSubgroups=function(n){return arguments.length?(a=n,e=null,s):a},s.sortChords=function(n){return arguments.length?(c=n,e&&t(),s):c},s.chords=function(){return e||n(),e},s.groups=function(){return r||n(),r},s},Zo.layout.force=function(){function 
n(n){return function(t,e,r,u){if(t.point!==n){var i=t.cx-n.x,o=t.cy-n.y,a=u-e,c=i*i+o*o;if(c>a*a/d){if(p>c){var s=t.charge/c;n.px-=i*s,n.py-=o*s}return!0}if(t.point&&c&&p>c){var s=t.pointCharge/c;n.px-=i*s,n.py-=o*s}}return!t.charge}}function t(n){n.px=Zo.event.x,n.py=Zo.event.y,a.resume()}var e,r,u,i,o,a={},c=Zo.dispatch("start","tick","end"),s=[1,1],l=.9,f=rs,h=us,g=-30,p=is,v=.1,d=.64,m=[],y=[];return a.tick=function(){if((r*=.99)<.005)return c.end({type:"end",alpha:r=0}),!0;var t,e,a,f,h,p,d,x,M,_=m.length,b=y.length;for(e=0;b>e;++e)a=y[e],f=a.source,h=a.target,x=h.x-f.x,M=h.y-f.y,(p=x*x+M*M)&&(p=r*i[e]*((p=Math.sqrt(p))-u[e])/p,x*=p,M*=p,h.x-=x*(d=f.weight/(h.weight+f.weight)),h.y-=M*d,f.x+=x*(d=1-d),f.y+=M*d);if((d=r*v)&&(x=s[0]/2,M=s[1]/2,e=-1,d))for(;++e<_;)a=m[e],a.x+=(x-a.x)*d,a.y+=(M-a.y)*d;if(g)for(Vu(t=Zo.geom.quadtree(m),r,o),e=-1;++e<_;)(a=m[e]).fixed||t.visit(n(a));for(e=-1;++e<_;)a=m[e],a.fixed?(a.x=a.px,a.y=a.py):(a.x-=(a.px-(a.px=a.x))*l,a.y-=(a.py-(a.py=a.y))*l);c.tick({type:"tick",alpha:r})},a.nodes=function(n){return arguments.length?(m=n,a):m},a.links=function(n){return arguments.length?(y=n,a):y},a.size=function(n){return arguments.length?(s=n,a):s},a.linkDistance=function(n){return arguments.length?(f="function"==typeof n?n:+n,a):f},a.distance=a.linkDistance,a.linkStrength=function(n){return arguments.length?(h="function"==typeof n?n:+n,a):h},a.friction=function(n){return arguments.length?(l=+n,a):l},a.charge=function(n){return arguments.length?(g="function"==typeof n?n:+n,a):g},a.chargeDistance=function(n){return arguments.length?(p=n*n,a):Math.sqrt(p)},a.gravity=function(n){return arguments.length?(v=+n,a):v},a.theta=function(n){return arguments.length?(d=n*n,a):Math.sqrt(d)},a.alpha=function(n){return arguments.length?(n=+n,r?r=n>0?n:0:n>0&&(c.start({type:"start",alpha:r=n}),Zo.timer(a.tick)),a):r},a.start=function(){function n(n,r){if(!e){for(e=new Array(c),a=0;c>a;++a)e[a]=[];for(a=0;s>a;++a){var 
u=y[a];e[u.source.index].push(u.target),e[u.target.index].push(u.source)}}for(var i,o=e[t],a=-1,s=o.length;++a<s;)if(!isNaN(i=o[a][n]))return i;return Math.random()*r}var t,e,r,c=m.length,l=y.length,p=s[0],v=s[1];for(t=0;c>t;++t)(r=m[t]).index=t,r.weight=0;for(t=0;l>t;++t)r=y[t],"number"==typeof r.source&&(r.source=m[r.source]),"number"==typeof r.target&&(r.target=m[r.target]),++r.source.weight,++r.target.weight;for(t=0;c>t;++t)r=m[t],isNaN(r.x)&&(r.x=n("x",p)),isNaN(r.y)&&(r.y=n("y",v)),isNaN(r.px)&&(r.px=r.x),isNaN(r.py)&&(r.py=r.y);if(u=[],"function"==typeof f)for(t=0;l>t;++t)u[t]=+f.call(this,y[t],t);else for(t=0;l>t;++t)u[t]=f;if(i=[],"function"==typeof h)for(t=0;l>t;++t)i[t]=+h.call(this,y[t],t);else for(t=0;l>t;++t)i[t]=h;if(o=[],"function"==typeof g)for(t=0;c>t;++t)o[t]=+g.call(this,m[t],t);else for(t=0;c>t;++t)o[t]=g;return a.resume()},a.resume=function(){return a.alpha(.1)},a.stop=function(){return a.alpha(0)},a.drag=function(){return e||(e=Zo.behavior.drag().origin(wt).on("dragstart.force",Ou).on("drag.force",t).on("dragend.force",Yu)),arguments.length?(this.on("mouseover.force",Iu).on("mouseout.force",Zu).call(e),void 0):e},Zo.rebind(a,c,"on")};var rs=20,us=1,is=1/0;Zo.layout.hierarchy=function(){function n(u){var i,o=[u],a=[];for(u.depth=0;null!=(i=o.pop());)if(a.push(i),(s=e.call(n,i,i.depth))&&(c=s.length)){for(var c,s,l;--c>=0;)o.push(l=s[c]),l.parent=i,l.depth=i.depth+1;r&&(i.value=0),i.children=s}else r&&(i.value=+r.call(n,i,i.depth)||0),delete i.children;return Bu(u,function(n){var e,u;t&&(e=n.children)&&e.sort(t),r&&(u=n.parent)&&(u.value+=n.value)}),a}var t=Gu,e=Wu,r=Ju;return n.sort=function(e){return arguments.length?(t=e,n):t},n.children=function(t){return arguments.length?(e=t,n):e},n.value=function(t){return arguments.length?(r=t,n):r},n.revalue=function(t){return r&&($u(t,function(n){n.children&&(n.value=0)}),Bu(t,function(t){var 
e;t.children||(t.value=+r.call(n,t,t.depth)||0),(e=t.parent)&&(e.value+=t.value)})),t},n},Zo.layout.partition=function(){function n(t,e,r,u){var i=t.children;if(t.x=e,t.y=t.depth*u,t.dx=r,t.dy=u,i&&(o=i.length)){var o,a,c,s=-1;for(r=t.value?r/t.value:0;++s<o;)n(a=i[s],e,c=a.value*r,u),e+=c}}function t(n){var e=n.children,r=0;if(e&&(u=e.length))for(var u,i=-1;++i<u;)r=Math.max(r,t(e[i]));return 1+r}function e(e,i){var o=r.call(this,e,i);return n(o[0],0,u[0],u[1]/t(o[0])),o}var r=Zo.layout.hierarchy(),u=[1,1];return e.size=function(n){return arguments.length?(u=n,e):u},Xu(e,r)},Zo.layout.pie=function(){function n(i){var o=i.map(function(e,r){return+t.call(n,e,r)}),a=+("function"==typeof r?r.apply(this,arguments):r),c=(("function"==typeof u?u.apply(this,arguments):u)-a)/Zo.sum(o),s=Zo.range(i.length);null!=e&&s.sort(e===os?function(n,t){return o[t]-o[n]}:function(n,t){return e(i[n],i[t])});var l=[];return s.forEach(function(n){var t;l[n]={data:i[n],value:t=o[n],startAngle:a,endAngle:a+=t*c}}),l}var t=Number,e=os,r=0,u=wa;return n.value=function(e){return arguments.length?(t=e,n):t},n.sort=function(t){return arguments.length?(e=t,n):e},n.startAngle=function(t){return arguments.length?(r=t,n):r},n.endAngle=function(t){return arguments.length?(u=t,n):u},n};var os={};Zo.layout.stack=function(){function n(a,c){var s=a.map(function(e,r){return t.call(n,e,r)}),l=s.map(function(t){return t.map(function(t,e){return[i.call(n,t,e),o.call(n,t,e)]})}),f=e.call(n,l,c);s=Zo.permute(s,f),l=Zo.permute(l,f);var h,g,p,v=r.call(n,l,c),d=s.length,m=s[0].length;for(g=0;m>g;++g)for(u.call(n,s[0][g],p=v[g],l[0][g][1]),h=1;d>h;++h)u.call(n,s[h][g],p+=l[h-1][g][1],l[h][g][1]);return a}var t=wt,e=ei,r=ri,u=ti,i=Qu,o=ni;return n.values=function(e){return arguments.length?(t=e,n):t},n.order=function(t){return arguments.length?(e="function"==typeof t?t:as.get(t)||ei,n):e},n.offset=function(t){return arguments.length?(r="function"==typeof t?t:cs.get(t)||ri,n):r},n.x=function(t){return 
arguments.length?(i=t,n):i},n.y=function(t){return arguments.length?(o=t,n):o},n.out=function(t){return arguments.length?(u=t,n):u},n};var as=Zo.map({"inside-out":function(n){var t,e,r=n.length,u=n.map(ui),i=n.map(ii),o=Zo.range(r).sort(function(n,t){return u[n]-u[t]}),a=0,c=0,s=[],l=[];for(t=0;r>t;++t)e=o[t],c>a?(a+=i[e],s.push(e)):(c+=i[e],l.push(e));return l.reverse().concat(s)},reverse:function(n){return Zo.range(n.length).reverse()},"default":ei}),cs=Zo.map({silhouette:function(n){var t,e,r,u=n.length,i=n[0].length,o=[],a=0,c=[];for(e=0;i>e;++e){for(t=0,r=0;u>t;t++)r+=n[t][e][1];r>a&&(a=r),o.push(r)}for(e=0;i>e;++e)c[e]=(a-o[e])/2;return c},wiggle:function(n){var t,e,r,u,i,o,a,c,s,l=n.length,f=n[0],h=f.length,g=[];for(g[0]=c=s=0,e=1;h>e;++e){for(t=0,u=0;l>t;++t)u+=n[t][e][1];for(t=0,i=0,a=f[e][0]-f[e-1][0];l>t;++t){for(r=0,o=(n[t][e][1]-n[t][e-1][1])/(2*a);t>r;++r)o+=(n[r][e][1]-n[r][e-1][1])/a;i+=o*n[t][e][1]}g[e]=c-=u?i/u*a:0,s>c&&(s=c)}for(e=0;h>e;++e)g[e]-=s;return g},expand:function(n){var t,e,r,u=n.length,i=n[0].length,o=1/u,a=[];for(e=0;i>e;++e){for(t=0,r=0;u>t;t++)r+=n[t][e][1];if(r)for(t=0;u>t;t++)n[t][e][1]/=r;else for(t=0;u>t;t++)n[t][e][1]=o}for(e=0;i>e;++e)a[e]=0;return a},zero:ri});Zo.layout.histogram=function(){function n(n,i){for(var o,a,c=[],s=n.map(e,this),l=r.call(this,s,i),f=u.call(this,l,s,i),i=-1,h=s.length,g=f.length-1,p=t?1:1/h;++i<g;)o=c[i]=[],o.dx=f[i+1]-(o.x=f[i]),o.y=0;if(g>0)for(i=-1;++i<h;)a=s[i],a>=l[0]&&a<=l[1]&&(o=c[Zo.bisect(f,a,1,g)-1],o.y+=p,o.push(n[i]));return c}var t=!0,e=Number,r=si,u=ai;return n.value=function(t){return arguments.length?(e=t,n):e},n.range=function(t){return arguments.length?(r=bt(t),n):r},n.bins=function(t){return arguments.length?(u="number"==typeof t?function(n){return ci(n,t)}:bt(t),n):u},n.frequency=function(e){return arguments.length?(t=!!e,n):t},n},Zo.layout.pack=function(){function n(n,i){var o=e.call(this,n,i),a=o[0],c=u[0],s=u[1],l=null==t?Math.sqrt:"function"==typeof t?t:function(){return 
t};if(a.x=a.y=0,Bu(a,function(n){n.r=+l(n.value)}),Bu(a,pi),r){var f=r*(t?1:Math.max(2*a.r/c,2*a.r/s))/2;Bu(a,function(n){n.r+=f}),Bu(a,pi),Bu(a,function(n){n.r-=f})}return mi(a,c/2,s/2,t?1:1/Math.max(2*a.r/c,2*a.r/s)),o}var t,e=Zo.layout.hierarchy().sort(li),r=0,u=[1,1];return n.size=function(t){return arguments.length?(u=t,n):u},n.radius=function(e){return arguments.length?(t=null==e||"function"==typeof e?e:+e,n):t},n.padding=function(t){return arguments.length?(r=+t,n):r},Xu(n,e)},Zo.layout.tree=function(){function n(n,u){var l=o.call(this,n,u),f=l[0],h=t(f);if(Bu(h,e),h.parent.m=-h.z,$u(h,r),s)$u(f,i);else{var g=f,p=f,v=f;$u(f,function(n){n.x<g.x&&(g=n),n.x>p.x&&(p=n),n.depth>v.depth&&(v=n)});var d=a(g,p)/2-g.x,m=c[0]/(p.x+a(p,g)/2+d),y=c[1]/(v.depth||1);$u(f,function(n){n.x=(n.x+d)*m,n.y=n.depth*y})}return l}function t(n){for(var t,e={A:null,children:[n]},r=[e];null!=(t=r.pop());)for(var u,i=t.children,o=0,a=i.length;a>o;++o)r.push((i[o]=u={_:i[o],parent:t,children:(u=i[o].children)&&u.slice()||[],A:null,a:null,z:0,m:0,c:0,s:0,t:null,i:o}).a=u);return e.children[0]}function e(n){var t=n.children,e=n.parent.children,r=n.i?e[n.i-1]:null;if(t.length){wi(n);var i=(t[0].z+t[t.length-1].z)/2;r?(n.z=r.z+a(n._,r._),n.m=n.z-i):n.z=i}else r&&(n.z=r.z+a(n._,r._));n.parent.A=u(n,r,n.parent.A||e[0])}function r(n){n._.x=n.z+n.parent.m,n.m+=n.parent.m}function u(n,t,e){if(t){for(var r,u=n,i=n,o=t,c=u.parent.children[0],s=u.m,l=i.m,f=o.m,h=c.m;o=_i(o),u=Mi(u),o&&u;)c=Mi(c),i=_i(i),i.a=n,r=o.z+f-u.z-s+a(o._,u._),r>0&&(bi(Si(o,n,e),n,r),s+=r,l+=r),f+=o.m,s+=u.m,h+=c.m,l+=i.m;o&&!_i(i)&&(i.t=o,i.m+=f-l),u&&!Mi(c)&&(c.t=u,c.m+=s-h,e=n)}return e}function i(n){n.x*=c[0],n.y=n.depth*c[1]}var o=Zo.layout.hierarchy().sort(null).value(null),a=xi,c=[1,1],s=null;return n.separation=function(t){return arguments.length?(a=t,n):a},n.size=function(t){return arguments.length?(s=null==(c=t)?i:null,n):s?null:c},n.nodeSize=function(t){return 
arguments.length?(s=null==(c=t)?null:i,n):s?c:null},Xu(n,o)},Zo.layout.cluster=function(){function n(n,i){var o,a=t.call(this,n,i),c=a[0],s=0;Bu(c,function(n){var t=n.children;t&&t.length?(n.x=Ei(t),n.y=ki(t)):(n.x=o?s+=e(n,o):0,n.y=0,o=n)});var l=Ai(c),f=Ci(c),h=l.x-e(l,f)/2,g=f.x+e(f,l)/2;return Bu(c,u?function(n){n.x=(n.x-c.x)*r[0],n.y=(c.y-n.y)*r[1]}:function(n){n.x=(n.x-h)/(g-h)*r[0],n.y=(1-(c.y?n.y/c.y:1))*r[1]}),a}var t=Zo.layout.hierarchy().sort(null).value(null),e=xi,r=[1,1],u=!1;return n.separation=function(t){return arguments.length?(e=t,n):e},n.size=function(t){return arguments.length?(u=null==(r=t),n):u?null:r},n.nodeSize=function(t){return arguments.length?(u=null!=(r=t),n):u?r:null},Xu(n,t)},Zo.layout.treemap=function(){function n(n,t){for(var e,r,u=-1,i=n.length;++u<i;)r=(e=n[u]).value*(0>t?0:t),e.area=isNaN(r)||0>=r?0:r}function t(e){var i=e.children;if(i&&i.length){var o,a,c,s=f(e),l=[],h=i.slice(),p=1/0,v="slice"===g?s.dx:"dice"===g?s.dy:"slice-dice"===g?1&e.depth?s.dy:s.dx:Math.min(s.dx,s.dy);for(n(h,s.dx*s.dy/e.value),l.area=0;(c=h.length)>0;)l.push(o=h[c-1]),l.area+=o.area,"squarify"!==g||(a=r(l,v))<=p?(h.pop(),p=a):(l.area-=l.pop().area,u(l,v,s,!1),v=Math.min(s.dx,s.dy),l.length=l.area=0,p=1/0);l.length&&(u(l,v,s,!0),l.length=l.area=0),i.forEach(t)}}function e(t){var r=t.children;if(r&&r.length){var i,o=f(t),a=r.slice(),c=[];for(n(a,o.dx*o.dy/t.value),c.area=0;i=a.pop();)c.push(i),c.area+=i.area,null!=i.z&&(u(c,i.z?o.dx:o.dy,o,!a.length),c.length=c.area=0);r.forEach(e)}}function r(n,t){for(var e,r=n.area,u=0,i=1/0,o=-1,a=n.length;++o<a;)(e=n[o].area)&&(i>e&&(i=e),e>u&&(u=e));return r*=r,t*=t,r?Math.max(t*u*p/r,r/(t*i*p)):1/0}function u(n,t,e,r){var 
u,i=-1,o=n.length,a=e.x,s=e.y,l=t?c(n.area/t):0;if(t==e.dx){for((r||l>e.dy)&&(l=e.dy);++i<o;)u=n[i],u.x=a,u.y=s,u.dy=l,a+=u.dx=Math.min(e.x+e.dx-a,l?c(u.area/l):0);u.z=!0,u.dx+=e.x+e.dx-a,e.y+=l,e.dy-=l}else{for((r||l>e.dx)&&(l=e.dx);++i<o;)u=n[i],u.x=a,u.y=s,u.dx=l,s+=u.dy=Math.min(e.y+e.dy-s,l?c(u.area/l):0);u.z=!1,u.dy+=e.y+e.dy-s,e.x+=l,e.dx-=l}}function i(r){var u=o||a(r),i=u[0];return i.x=0,i.y=0,i.dx=s[0],i.dy=s[1],o&&a.revalue(i),n([i],i.dx*i.dy/i.value),(o?e:t)(i),h&&(o=u),u}var o,a=Zo.layout.hierarchy(),c=Math.round,s=[1,1],l=null,f=Ni,h=!1,g="squarify",p=.5*(1+Math.sqrt(5));return i.size=function(n){return arguments.length?(s=n,i):s},i.padding=function(n){function t(t){var e=n.call(i,t,t.depth);return null==e?Ni(t):zi(t,"number"==typeof e?[e,e,e,e]:e)}function e(t){return zi(t,n)}if(!arguments.length)return l;var r;return f=null==(l=n)?Ni:"function"==(r=typeof n)?t:"number"===r?(n=[n,n,n,n],e):e,i},i.round=function(n){return arguments.length?(c=n?Math.round:Number,i):c!=Number},i.sticky=function(n){return arguments.length?(h=n,o=null,i):h},i.ratio=function(n){return arguments.length?(p=n,i):p},i.mode=function(n){return arguments.length?(g=n+"",i):g},Xu(i,a)},Zo.random={normal:function(n,t){var e=arguments.length;return 2>e&&(t=1),1>e&&(n=0),function(){var e,r,u;do e=2*Math.random()-1,r=2*Math.random()-1,u=e*e+r*r;while(!u||u>1);return n+t*e*Math.sqrt(-2*Math.log(u)/u)}},logNormal:function(){var n=Zo.random.normal.apply(Zo,arguments);return function(){return Math.exp(n())}},bates:function(n){var t=Zo.random.irwinHall(n);return function(){return t()/n}},irwinHall:function(n){return function(){for(var t=0,e=0;n>e;e++)t+=Math.random();return t}}},Zo.scale={};var ss={floor:wt,ceil:wt};Zo.scale.linear=function(){return Ui([0,1],[0,1],hu,!1)};var ls={s:1,g:1,p:1,r:1,e:1};Zo.scale.log=function(){return Vi(Zo.scale.linear().domain([0,1]),10,!0,[1,10])};var 
fs=Zo.format(".0e"),hs={floor:function(n){return-Math.ceil(-n)},ceil:function(n){return-Math.floor(-n)}};Zo.scale.pow=function(){return Xi(Zo.scale.linear(),1,[0,1])},Zo.scale.sqrt=function(){return Zo.scale.pow().exponent(.5)},Zo.scale.ordinal=function(){return Bi([],{t:"range",a:[[]]})},Zo.scale.category10=function(){return Zo.scale.ordinal().range(gs)},Zo.scale.category20=function(){return Zo.scale.ordinal().range(ps)},Zo.scale.category20b=function(){return Zo.scale.ordinal().range(vs)},Zo.scale.category20c=function(){return Zo.scale.ordinal().range(ds)};var gs=[2062260,16744206,2924588,14034728,9725885,9197131,14907330,8355711,12369186,1556175].map(vt),ps=[2062260,11454440,16744206,16759672,2924588,10018698,14034728,16750742,9725885,12955861,9197131,12885140,14907330,16234194,8355711,13092807,12369186,14408589,1556175,10410725].map(vt),vs=[3750777,5395619,7040719,10264286,6519097,9216594,11915115,13556636,9202993,12426809,15186514,15190932,8666169,11356490,14049643,15177372,8077683,10834324,13528509,14589654].map(vt),ds=[3244733,7057110,10406625,13032431,15095053,16616764,16625259,16634018,3253076,7652470,10607003,13101504,7695281,10394312,12369372,14342891,6513507,9868950,12434877,14277081].map(vt);Zo.scale.quantile=function(){return Wi([],[])},Zo.scale.quantize=function(){return Ji(0,1,[0,1])},Zo.scale.threshold=function(){return Gi([.5],[0,1])},Zo.scale.identity=function(){return Ki([0,1])},Zo.svg={},Zo.svg.arc=function(){function n(){var n=t.apply(this,arguments),i=e.apply(this,arguments),o=r.apply(this,arguments)+ms,a=u.apply(this,arguments)+ms,c=(o>a&&(c=o,o=a,a=c),a-o),s=ba>c?"0":"1",l=Math.cos(o),f=Math.sin(o),h=Math.cos(a),g=Math.sin(a);
+return c>=ys?n?"M0,"+i+"A"+i+","+i+" 0 1,1 0,"+-i+"A"+i+","+i+" 0 1,1 0,"+i+"M0,"+n+"A"+n+","+n+" 0 1,0 0,"+-n+"A"+n+","+n+" 0 1,0 0,"+n+"Z":"M0,"+i+"A"+i+","+i+" 0 1,1 0,"+-i+"A"+i+","+i+" 0 1,1 0,"+i+"Z":n?"M"+i*l+","+i*f+"A"+i+","+i+" 0 "+s+",1 "+i*h+","+i*g+"L"+n*h+","+n*g+"A"+n+","+n+" 0 "+s+",0 "+n*l+","+n*f+"Z":"M"+i*l+","+i*f+"A"+i+","+i+" 0 "+s+",1 "+i*h+","+i*g+"L0,0"+"Z"}var t=Qi,e=no,r=to,u=eo;return n.innerRadius=function(e){return arguments.length?(t=bt(e),n):t},n.outerRadius=function(t){return arguments.length?(e=bt(t),n):e},n.startAngle=function(t){return arguments.length?(r=bt(t),n):r},n.endAngle=function(t){return arguments.length?(u=bt(t),n):u},n.centroid=function(){var n=(t.apply(this,arguments)+e.apply(this,arguments))/2,i=(r.apply(this,arguments)+u.apply(this,arguments))/2+ms;return[Math.cos(i)*n,Math.sin(i)*n]},n};var ms=-Sa,ys=wa-ka;Zo.svg.line=function(){return ro(wt)};var xs=Zo.map({linear:uo,"linear-closed":io,step:oo,"step-before":ao,"step-after":co,basis:po,"basis-open":vo,"basis-closed":mo,bundle:yo,cardinal:fo,"cardinal-open":so,"cardinal-closed":lo,monotone:So});xs.forEach(function(n,t){t.key=n,t.closed=/-closed$/.test(n)});var Ms=[0,2/3,1/3,0],_s=[0,1/3,2/3,0],bs=[0,1/6,2/3,1/6];Zo.svg.line.radial=function(){var n=ro(ko);return n.radius=n.x,delete n.x,n.angle=n.y,delete n.y,n},ao.reverse=co,co.reverse=ao,Zo.svg.area=function(){return Eo(wt)},Zo.svg.area.radial=function(){var n=Eo(ko);return n.radius=n.x,delete n.x,n.innerRadius=n.x0,delete n.x0,n.outerRadius=n.x1,delete n.x1,n.angle=n.y,delete n.y,n.startAngle=n.y0,delete n.y0,n.endAngle=n.y1,delete n.y1,n},Zo.svg.chord=function(){function n(n,a){var c=t(this,i,n,a),s=t(this,o,n,a);return"M"+c.p0+r(c.r,c.p1,c.a1-c.a0)+(e(c,s)?u(c.r,c.p1,c.r,c.p0):u(c.r,c.p1,s.r,s.p0)+r(s.r,s.p1,s.a1-s.a0)+u(s.r,s.p1,c.r,c.p0))+"Z"}function t(n,t,e,r){var 
u=t.call(n,e,r),i=a.call(n,u,r),o=c.call(n,u,r)+ms,l=s.call(n,u,r)+ms;return{r:i,a0:o,a1:l,p0:[i*Math.cos(o),i*Math.sin(o)],p1:[i*Math.cos(l),i*Math.sin(l)]}}function e(n,t){return n.a0==t.a0&&n.a1==t.a1}function r(n,t,e){return"A"+n+","+n+" 0 "+ +(e>ba)+",1 "+t}function u(n,t,e,r){return"Q 0,0 "+r}var i=gr,o=pr,a=Ao,c=to,s=eo;return n.radius=function(t){return arguments.length?(a=bt(t),n):a},n.source=function(t){return arguments.length?(i=bt(t),n):i},n.target=function(t){return arguments.length?(o=bt(t),n):o},n.startAngle=function(t){return arguments.length?(c=bt(t),n):c},n.endAngle=function(t){return arguments.length?(s=bt(t),n):s},n},Zo.svg.diagonal=function(){function n(n,u){var i=t.call(this,n,u),o=e.call(this,n,u),a=(i.y+o.y)/2,c=[i,{x:i.x,y:a},{x:o.x,y:a},o];return c=c.map(r),"M"+c[0]+"C"+c[1]+" "+c[2]+" "+c[3]}var t=gr,e=pr,r=Co;return n.source=function(e){return arguments.length?(t=bt(e),n):t},n.target=function(t){return arguments.length?(e=bt(t),n):e},n.projection=function(t){return arguments.length?(r=t,n):r},n},Zo.svg.diagonal.radial=function(){var n=Zo.svg.diagonal(),t=Co,e=n.projection;return n.projection=function(n){return arguments.length?e(No(t=n)):t},n},Zo.svg.symbol=function(){function n(n,r){return(ws.get(t.call(this,n,r))||To)(e.call(this,n,r))}var t=Lo,e=zo;return n.type=function(e){return arguments.length?(t=bt(e),n):t},n.size=function(t){return arguments.length?(e=bt(t),n):e},n};var ws=Zo.map({circle:To,cross:function(n){var t=Math.sqrt(n/5)/2;return"M"+-3*t+","+-t+"H"+-t+"V"+-3*t+"H"+t+"V"+-t+"H"+3*t+"V"+t+"H"+t+"V"+3*t+"H"+-t+"V"+t+"H"+-3*t+"Z"},diamond:function(n){var t=Math.sqrt(n/(2*As)),e=t*As;return"M0,"+-t+"L"+e+",0"+" 0,"+t+" "+-e+",0"+"Z"},square:function(n){var t=Math.sqrt(n)/2;return"M"+-t+","+-t+"L"+t+","+-t+" "+t+","+t+" "+-t+","+t+"Z"},"triangle-down":function(n){var t=Math.sqrt(n/Es),e=t*Es/2;return"M0,"+e+"L"+t+","+-e+" "+-t+","+-e+"Z"},"triangle-up":function(n){var t=Math.sqrt(n/Es),e=t*Es/2;return"M0,"+-e+"L"+t+","+e+" 
"+-t+","+e+"Z"}});Zo.svg.symbolTypes=ws.keys();var Ss,ks,Es=Math.sqrt(3),As=Math.tan(30*Aa),Cs=[],Ns=0;Cs.call=pa.call,Cs.empty=pa.empty,Cs.node=pa.node,Cs.size=pa.size,Zo.transition=function(n){return arguments.length?Ss?n.transition():n:ma.transition()},Zo.transition.prototype=Cs,Cs.select=function(n){var t,e,r,u=this.id,i=[];n=b(n);for(var o=-1,a=this.length;++o<a;){i.push(t=[]);for(var c=this[o],s=-1,l=c.length;++s<l;)(r=c[s])&&(e=n.call(r,r.__data__,s,o))?("__data__"in r&&(e.__data__=r.__data__),Po(e,s,u,r.__transition__[u]),t.push(e)):t.push(null)}return qo(i,u)},Cs.selectAll=function(n){var t,e,r,u,i,o=this.id,a=[];n=w(n);for(var c=-1,s=this.length;++c<s;)for(var l=this[c],f=-1,h=l.length;++f<h;)if(r=l[f]){i=r.__transition__[o],e=n.call(r,r.__data__,f,c),a.push(t=[]);for(var g=-1,p=e.length;++g<p;)(u=e[g])&&Po(u,g,o,i),t.push(u)}return qo(a,o)},Cs.filter=function(n){var t,e,r,u=[];"function"!=typeof n&&(n=R(n));for(var i=0,o=this.length;o>i;i++){u.push(t=[]);for(var e=this[i],a=0,c=e.length;c>a;a++)(r=e[a])&&n.call(r,r.__data__,a,i)&&t.push(r)}return qo(u,this.id)},Cs.tween=function(n,t){var e=this.id;return arguments.length<2?this.node().__transition__[e].tween.get(n):P(this,null==t?function(t){t.__transition__[e].tween.remove(n)}:function(r){r.__transition__[e].tween.set(n,t)})},Cs.attr=function(n,t){function e(){this.removeAttribute(a)}function r(){this.removeAttributeNS(a.space,a.local)}function u(n){return null==n?e:(n+="",function(){var t,e=this.getAttribute(a);return e!==n&&(t=o(e,n),function(n){this.setAttribute(a,t(n))})})}function i(n){return null==n?r:(n+="",function(){var t,e=this.getAttributeNS(a.space,a.local);return e!==n&&(t=o(e,n),function(n){this.setAttributeNS(a.space,a.local,t(n))})})}if(arguments.length<2){for(t in n)this.attr(t,n[t]);return this}var o="transform"==n?Du:hu,a=Zo.ns.qualify(n);return Ro(this,"attr."+n,t,a.local?i:u)},Cs.attrTween=function(n,t){function e(n,e){var r=t.call(this,n,e,this.getAttribute(u));return 
r&&function(n){this.setAttribute(u,r(n))}}function r(n,e){var r=t.call(this,n,e,this.getAttributeNS(u.space,u.local));return r&&function(n){this.setAttributeNS(u.space,u.local,r(n))}}var u=Zo.ns.qualify(n);return this.tween("attr."+n,u.local?r:e)},Cs.style=function(n,t,e){function r(){this.style.removeProperty(n)}function u(t){return null==t?r:(t+="",function(){var r,u=Wo.getComputedStyle(this,null).getPropertyValue(n);return u!==t&&(r=hu(u,t),function(t){this.style.setProperty(n,r(t),e)})})}var i=arguments.length;if(3>i){if("string"!=typeof n){2>i&&(t="");for(e in n)this.style(e,n[e],t);return this}e=""}return Ro(this,"style."+n,t,u)},Cs.styleTween=function(n,t,e){function r(r,u){var i=t.call(this,r,u,Wo.getComputedStyle(this,null).getPropertyValue(n));return i&&function(t){this.style.setProperty(n,i(t),e)}}return arguments.length<3&&(e=""),this.tween("style."+n,r)},Cs.text=function(n){return Ro(this,"text",n,Do)},Cs.remove=function(){return this.each("end.transition",function(){var n;this.__transition__.count<2&&(n=this.parentNode)&&n.removeChild(this)})},Cs.ease=function(n){var t=this.id;return arguments.length<1?this.node().__transition__[t].ease:("function"!=typeof n&&(n=Zo.ease.apply(Zo,arguments)),P(this,function(e){e.__transition__[t].ease=n}))},Cs.delay=function(n){var t=this.id;return arguments.length<1?this.node().__transition__[t].delay:P(this,"function"==typeof n?function(e,r,u){e.__transition__[t].delay=+n.call(e,e.__data__,r,u)}:(n=+n,function(e){e.__transition__[t].delay=n}))},Cs.duration=function(n){var t=this.id;return arguments.length<1?this.node().__transition__[t].duration:P(this,"function"==typeof n?function(e,r,u){e.__transition__[t].duration=Math.max(1,n.call(e,e.__data__,r,u))}:(n=Math.max(1,n),function(e){e.__transition__[t].duration=n}))},Cs.each=function(n,t){var e=this.id;if(arguments.length<2){var r=ks,u=Ss;Ss=e,P(this,function(t,r,u){ks=t.__transition__[e],n.call(t,t.__data__,r,u)}),ks=r,Ss=u}else P(this,function(r){var 
u=r.__transition__[e];(u.event||(u.event=Zo.dispatch("start","end"))).on(n,t)});return this},Cs.transition=function(){for(var n,t,e,r,u=this.id,i=++Ns,o=[],a=0,c=this.length;c>a;a++){o.push(n=[]);for(var t=this[a],s=0,l=t.length;l>s;s++)(e=t[s])&&(r=Object.create(e.__transition__[u]),r.delay+=r.duration,Po(e,s,i,r)),n.push(e)}return qo(o,i)},Zo.svg.axis=function(){function n(n){n.each(function(){var n,s=Zo.select(this),l=this.__chart__||e,f=this.__chart__=e.copy(),h=null==c?f.ticks?f.ticks.apply(f,a):f.domain():c,g=null==t?f.tickFormat?f.tickFormat.apply(f,a):wt:t,p=s.selectAll(".tick").data(h,f),v=p.enter().insert("g",".domain").attr("class","tick").style("opacity",ka),d=Zo.transition(p.exit()).style("opacity",ka).remove(),m=Zo.transition(p.order()).style("opacity",1),y=Ti(f),x=s.selectAll(".domain").data([0]),M=(x.enter().append("path").attr("class","domain"),Zo.transition(x));v.append("line"),v.append("text");var _=v.select("line"),b=m.select("line"),w=p.select("text").text(g),S=v.select("text"),k=m.select("text");switch(r){case"bottom":n=Uo,_.attr("y2",u),S.attr("y",Math.max(u,0)+o),b.attr("x2",0).attr("y2",u),k.attr("x",0).attr("y",Math.max(u,0)+o),w.attr("dy",".71em").style("text-anchor","middle"),M.attr("d","M"+y[0]+","+i+"V0H"+y[1]+"V"+i);break;case"top":n=Uo,_.attr("y2",-u),S.attr("y",-(Math.max(u,0)+o)),b.attr("x2",0).attr("y2",-u),k.attr("x",0).attr("y",-(Math.max(u,0)+o)),w.attr("dy","0em").style("text-anchor","middle"),M.attr("d","M"+y[0]+","+-i+"V0H"+y[1]+"V"+-i);break;case"left":n=jo,_.attr("x2",-u),S.attr("x",-(Math.max(u,0)+o)),b.attr("x2",-u).attr("y2",0),k.attr("x",-(Math.max(u,0)+o)).attr("y",0),w.attr("dy",".32em").style("text-anchor","end"),M.attr("d","M"+-i+","+y[0]+"H0V"+y[1]+"H"+-i);break;case"right":n=jo,_.attr("x2",u),S.attr("x",Math.max(u,0)+o),b.attr("x2",u).attr("y2",0),k.attr("x",Math.max(u,0)+o).attr("y",0),w.attr("dy",".32em").style("text-anchor","start"),M.attr("d","M"+i+","+y[0]+"H0V"+y[1]+"H"+i)}if(f.rangeBand){var 
E=f,A=E.rangeBand()/2;l=f=function(n){return E(n)+A}}else l.rangeBand?l=f:d.call(n,f);v.call(n,l),m.call(n,f)})}var t,e=Zo.scale.linear(),r=zs,u=6,i=6,o=3,a=[10],c=null;return n.scale=function(t){return arguments.length?(e=t,n):e},n.orient=function(t){return arguments.length?(r=t in Ls?t+"":zs,n):r},n.ticks=function(){return arguments.length?(a=arguments,n):a},n.tickValues=function(t){return arguments.length?(c=t,n):c},n.tickFormat=function(e){return arguments.length?(t=e,n):t},n.tickSize=function(t){var e=arguments.length;return e?(u=+t,i=+arguments[e-1],n):u},n.innerTickSize=function(t){return arguments.length?(u=+t,n):u},n.outerTickSize=function(t){return arguments.length?(i=+t,n):i},n.tickPadding=function(t){return arguments.length?(o=+t,n):o},n.tickSubdivide=function(){return arguments.length&&n},n};var zs="bottom",Ls={top:1,right:1,bottom:1,left:1};Zo.svg.brush=function(){function n(i){i.each(function(){var i=Zo.select(this).style("pointer-events","all").style("-webkit-tap-highlight-color","rgba(0,0,0,0)").on("mousedown.brush",u).on("touchstart.brush",u),o=i.selectAll(".background").data([0]);o.enter().append("rect").attr("class","background").style("visibility","hidden").style("cursor","crosshair"),i.selectAll(".extent").data([0]).enter().append("rect").attr("class","extent").style("cursor","move");var a=i.selectAll(".resize").data(p,wt);a.exit().remove(),a.enter().append("g").attr("class",function(n){return"resize "+n}).style("cursor",function(n){return Ts[n]}).append("rect").attr("x",function(n){return/[ew]$/.test(n)?-3:null}).attr("y",function(n){return/^[ns]/.test(n)?-3:null}).attr("width",6).attr("height",6).style("visibility","hidden"),a.style("display",n.empty()?"none":null);var l,f=Zo.transition(i),h=Zo.transition(o);c&&(l=Ti(c),h.attr("x",l[0]).attr("width",l[1]-l[0]),e(f)),s&&(l=Ti(s),h.attr("y",l[0]).attr("height",l[1]-l[0]),r(f)),t(f)})}function 
t(n){n.selectAll(".resize").attr("transform",function(n){return"translate("+l[+/e$/.test(n)]+","+f[+/^s/.test(n)]+")"})}function e(n){n.select(".extent").attr("x",l[0]),n.selectAll(".extent,.n>rect,.s>rect").attr("width",l[1]-l[0])}function r(n){n.select(".extent").attr("y",f[0]),n.selectAll(".extent,.e>rect,.w>rect").attr("height",f[1]-f[0])}function u(){function u(){32==Zo.event.keyCode&&(C||(x=null,z[0]-=l[1],z[1]-=f[1],C=2),y())}function p(){32==Zo.event.keyCode&&2==C&&(z[0]+=l[1],z[1]+=f[1],C=0,y())}function v(){var n=Zo.mouse(_),u=!1;M&&(n[0]+=M[0],n[1]+=M[1]),C||(Zo.event.altKey?(x||(x=[(l[0]+l[1])/2,(f[0]+f[1])/2]),z[0]=l[+(n[0]<x[0])],z[1]=f[+(n[1]<x[1])]):x=null),E&&d(n,c,0)&&(e(S),u=!0),A&&d(n,s,1)&&(r(S),u=!0),u&&(t(S),w({type:"brush",mode:C?"move":"resize"}))}function d(n,t,e){var r,u,a=Ti(t),c=a[0],s=a[1],p=z[e],v=e?f:l,d=v[1]-v[0];return C&&(c-=p,s-=d+p),r=(e?g:h)?Math.max(c,Math.min(s,n[e])):n[e],C?u=(r+=p)+d:(x&&(p=Math.max(c,Math.min(s,2*x[e]-r))),r>p?(u=r,r=p):u=p),v[0]!=r||v[1]!=u?(e?o=null:i=null,v[0]=r,v[1]=u,!0):void 0}function m(){v(),S.style("pointer-events","all").selectAll(".resize").style("display",n.empty()?"none":null),Zo.select("body").style("cursor",null),L.on("mousemove.brush",null).on("mouseup.brush",null).on("touchmove.brush",null).on("touchend.brush",null).on("keydown.brush",null).on("keyup.brush",null),N(),w({type:"brushend"})}var x,M,_=this,b=Zo.select(Zo.event.target),w=a.of(_,arguments),S=Zo.select(_),k=b.datum(),E=!/^(n|s)$/.test(k)&&c,A=!/^(e|w)$/.test(k)&&s,C=b.classed("extent"),N=I(),z=Zo.mouse(_),L=Zo.select(Wo).on("keydown.brush",u).on("keyup.brush",p);if(Zo.event.changedTouches?L.on("touchmove.brush",v).on("touchend.brush",m):L.on("mousemove.brush",v).on("mouseup.brush",m),S.interrupt().selectAll("*").interrupt(),C)z[0]=l[0]-z[0],z[1]=f[0]-z[1];else if(k){var T=+/w$/.test(k),q=+/^n/.test(k);M=[l[1-T]-z[0],f[1-q]-z[1]],z[0]=l[T],z[1]=f[q]}else 
Zo.event.altKey&&(x=z.slice());S.style("pointer-events","none").selectAll(".resize").style("display",null),Zo.select("body").style("cursor",b.style("cursor")),w({type:"brushstart"}),v()}var i,o,a=M(n,"brushstart","brush","brushend"),c=null,s=null,l=[0,0],f=[0,0],h=!0,g=!0,p=qs[0];return n.event=function(n){n.each(function(){var n=a.of(this,arguments),t={x:l,y:f,i:i,j:o},e=this.__chart__||t;this.__chart__=t,Ss?Zo.select(this).transition().each("start.brush",function(){i=e.i,o=e.j,l=e.x,f=e.y,n({type:"brushstart"})}).tween("brush:brush",function(){var e=gu(l,t.x),r=gu(f,t.y);return i=o=null,function(u){l=t.x=e(u),f=t.y=r(u),n({type:"brush",mode:"resize"})}}).each("end.brush",function(){i=t.i,o=t.j,n({type:"brush",mode:"resize"}),n({type:"brushend"})}):(n({type:"brushstart"}),n({type:"brush",mode:"resize"}),n({type:"brushend"}))})},n.x=function(t){return arguments.length?(c=t,p=qs[!c<<1|!s],n):c},n.y=function(t){return arguments.length?(s=t,p=qs[!c<<1|!s],n):s},n.clamp=function(t){return arguments.length?(c&&s?(h=!!t[0],g=!!t[1]):c?h=!!t:s&&(g=!!t),n):c&&s?[h,g]:c?h:s?g:null},n.extent=function(t){var e,r,u,a,h;return arguments.length?(c&&(e=t[0],r=t[1],s&&(e=e[0],r=r[0]),i=[e,r],c.invert&&(e=c(e),r=c(r)),e>r&&(h=e,e=r,r=h),(e!=l[0]||r!=l[1])&&(l=[e,r])),s&&(u=t[0],a=t[1],c&&(u=u[1],a=a[1]),o=[u,a],s.invert&&(u=s(u),a=s(a)),u>a&&(h=u,u=a,a=h),(u!=f[0]||a!=f[1])&&(f=[u,a])),n):(c&&(i?(e=i[0],r=i[1]):(e=l[0],r=l[1],c.invert&&(e=c.invert(e),r=c.invert(r)),e>r&&(h=e,e=r,r=h))),s&&(o?(u=o[0],a=o[1]):(u=f[0],a=f[1],s.invert&&(u=s.invert(u),a=s.invert(a)),u>a&&(h=u,u=a,a=h))),c&&s?[[e,u],[r,a]]:c?[e,r]:s&&[u,a])},n.clear=function(){return n.empty()||(l=[0,0],f=[0,0],i=o=null),n},n.empty=function(){return!!c&&l[0]==l[1]||!!s&&f[0]==f[1]},Zo.rebind(n,a,"on")};var 
Ts={n:"ns-resize",e:"ew-resize",s:"ns-resize",w:"ew-resize",nw:"nwse-resize",ne:"nesw-resize",se:"nwse-resize",sw:"nesw-resize"},qs=[["n","e","s","w","nw","ne","se","sw"],["e","w"],["n","s"],[]],Rs=Qa.format=ic.timeFormat,Ds=Rs.utc,Ps=Ds("%Y-%m-%dT%H:%M:%S.%LZ");Rs.iso=Date.prototype.toISOString&&+new Date("2000-01-01T00:00:00.000Z")?Ho:Ps,Ho.parse=function(n){var t=new Date(n);return isNaN(t)?null:t},Ho.toString=Ps.toString,Qa.second=Dt(function(n){return new nc(1e3*Math.floor(n/1e3))},function(n,t){n.setTime(n.getTime()+1e3*Math.floor(t))},function(n){return n.getSeconds()}),Qa.seconds=Qa.second.range,Qa.seconds.utc=Qa.second.utc.range,Qa.minute=Dt(function(n){return new nc(6e4*Math.floor(n/6e4))},function(n,t){n.setTime(n.getTime()+6e4*Math.floor(t))},function(n){return n.getMinutes()}),Qa.minutes=Qa.minute.range,Qa.minutes.utc=Qa.minute.utc.range,Qa.hour=Dt(function(n){var t=n.getTimezoneOffset()/60;return new nc(36e5*(Math.floor(n/36e5-t)+t))},function(n,t){n.setTime(n.getTime()+36e5*Math.floor(t))},function(n){return n.getHours()}),Qa.hours=Qa.hour.range,Qa.hours.utc=Qa.hour.utc.range,Qa.month=Dt(function(n){return n=Qa.day(n),n.setDate(1),n},function(n,t){n.setMonth(n.getMonth()+t)},function(n){return n.getMonth()}),Qa.months=Qa.month.range,Qa.months.utc=Qa.month.utc.range;var Us=[1e3,5e3,15e3,3e4,6e4,3e5,9e5,18e5,36e5,108e5,216e5,432e5,864e5,1728e5,6048e5,2592e6,7776e6,31536e6],js=[[Qa.second,1],[Qa.second,5],[Qa.second,15],[Qa.second,30],[Qa.minute,1],[Qa.minute,5],[Qa.minute,15],[Qa.minute,30],[Qa.hour,1],[Qa.hour,3],[Qa.hour,6],[Qa.hour,12],[Qa.day,1],[Qa.day,2],[Qa.week,1],[Qa.month,1],[Qa.month,3],[Qa.year,1]],Hs=Rs.multi([[".%L",function(n){return n.getMilliseconds()}],[":%S",function(n){return n.getSeconds()}],["%I:%M",function(n){return n.getMinutes()}],["%I %p",function(n){return n.getHours()}],["%a %d",function(n){return n.getDay()&&1!=n.getDate()}],["%b %d",function(n){return 1!=n.getDate()}],["%B",function(n){return 
n.getMonth()}],["%Y",we]]),Fs={range:function(n,t,e){return Zo.range(Math.ceil(n/e)*e,+t,e).map(Oo)},floor:wt,ceil:wt};js.year=Qa.year,Qa.scale=function(){return Fo(Zo.scale.linear(),js,Hs)};var Os=js.map(function(n){return[n[0].utc,n[1]]}),Ys=Ds.multi([[".%L",function(n){return n.getUTCMilliseconds()}],[":%S",function(n){return n.getUTCSeconds()}],["%I:%M",function(n){return n.getUTCMinutes()}],["%I %p",function(n){return n.getUTCHours()}],["%a %d",function(n){return n.getUTCDay()&&1!=n.getUTCDate()}],["%b %d",function(n){return 1!=n.getUTCDate()}],["%B",function(n){return n.getUTCMonth()}],["%Y",we]]);Os.year=Qa.year.utc,Qa.scale.utc=function(){return Fo(Zo.scale.linear(),Os,Ys)},Zo.text=St(function(n){return n.responseText}),Zo.json=function(n,t){return kt(n,"application/json",Yo,t)},Zo.html=function(n,t){return kt(n,"text/html",Io,t)},Zo.xml=St(function(n){return n.responseXML}),"function"==typeof define&&define.amd?define(Zo):"object"==typeof module&&module.exports&&(module.exports=Zo),this.d3=Zo}();
+
+// parseUri 1.2.2
+// (c) Steven Levithan <stevenlevithan.com>
+// MIT License
+
+function parseUri (str) {
+	var	o   = parseUri.options,
+		m   = o.parser[o.strictMode ? "strict" : "loose"].exec(str),
+		uri = {},
+		i   = 14;
+
+	while (i--) uri[o.key[i]] = m[i] || "";
+
+	uri[o.q.name] = {};
+	uri[o.key[12]].replace(o.q.parser, function ($0, $1, $2) {
+		if ($1) uri[o.q.name][$1] = $2;
+	});
+
+	return uri;
+};
+
+parseUri.options = {
+	strictMode: false, // false makes parseUri use parser.loose; true selects parser.strict
+	key: ["source","protocol","authority","userInfo","user","password","host","port","relative","path","directory","file","query","anchor"], // result field names for match indexes 0-13, in order
+	q:   {
+		name:   "queryKey", // property of the result that receives the parsed query parameters
+		parser: /(?:^|&)([^&=]*)=?([^&]*)/g // applied to the "query" field: group 1 = parameter name, group 2 = value
+	},
+	parser: {
+		strict: /^(?:([^:\/?#]+):)?(?:\/\/((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?))?((((?:[^?#\/]*\/)*)([^?#]*))(?:\?([^#]*))?(?:#(.*))?)/,
+		loose:  /^(?:(?![^:@]+:[^:@\/]*@)([^:\/?#.]+):)?(?:\/\/)?((?:(([^:@]*)(?::([^:@]*))?)?@)?([^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/
+	}
+};
+
+
+// graph.js
+var drawGraph = function() {
+    //Rebuilds the chart controls and the SVG chart from the page-level stats object and the URL query string.
+    $("svg").remove();
+
+    //The dataset and the metric to draw are passed via the query string:
+    query = parseUri(location).queryKey; //NOTE(review): assigned without var here, as are stats_db, xmin..ymax, stress_metrics, stress_metric_names and defaultZoom below
+    query.stats = unescape(query.stats);
+    stats_db = '/tests/artifacts/' + query.stats + '/stats';
+    var metric = query.metric;
+    var operation = query.operation;
+    var smoothing = query.smoothing;
+    //Zoom bounds also come from the query string; a missing parameter leaves the bound undefined:
+    xmin = query.xmin;
+    xmax = query.xmax;
+    ymin = query.ymin;
+    ymax = query.ymax;
+
+    //Pull the metric names from the stats json and derive keys in which every non-word character becomes "_":
+    stress_metrics = $.extend([], stats['stats'][0]['metrics']);
+    $.each(stress_metrics, function(i,v) {
+        stress_metrics[i] = v.replace(/\W/g,"_");
+    });
+    stress_metric_names = {}; //maps each sanitized key back to the original metric name
+    $.each(stress_metrics, function(i,v) {
+        stress_metric_names[v] = stats['stats'][0]['metrics'][i];
+    });
+
+    var updateURLBar = function() {
+        //Update the URL bar with the current parameters:
+        window.history.replaceState(null,null,parseUri(location).path + "?" + $.param(query));
+    };
+
+    //Fill in defaults for missing query parameters:
+    if (metric == undefined) {
+        metric = query.metric = 'SSTables';
+    }
+    if (operation == undefined) {
+        operation = query.operation = stats['stats'][0]['test'];
+    }
+    if (smoothing == undefined) {
+        smoothing = query.smoothing = 1;
+    }
+    updateURLBar();
+
+    var metric_index = stress_metrics.indexOf(metric); //-1 when the metric is not one of the reported columns
+    var time_index = stress_metrics.indexOf('time');
+
+    /// Add dropdown controls to select chart criteria / options:
+    var chart_controls = $('<div id="chart_controls"/>');
+    var chart_controls_tbl = $('<table/>');
+    chart_controls.append(chart_controls_tbl);
+    $('body').append(chart_controls);
+    var metric_selector = $('<select id="metric_selector"/>');
+    $.each(stress_metric_names, function(k,v) {
+        if (k == 'time') {
+            return; //Elapsed time makes no sense to graph, skip it.
+        }
+        var option = $('<option/>').attr('value', k).text(v);
+        if (metric == k) {
+            option.attr('selected','selected');
+        }
+        metric_selector.append(option);
+
+    });
+    chart_controls_tbl.append('<tr><td><label for="metric_selector"/>Choose metric:</label></td><td id="metric_selector_td"></td></tr>')
+    $('#metric_selector_td').append(metric_selector);
+
+    var operation_selector = $('<select id="operation_selector"/>') //options are filled in by graph_callback
+    chart_controls_tbl.append('<tr><td><label for="operation_selector"/>Choose operation:</label></td><td id="operation_selector_td"></td></tr>')
+    $('#operation_selector_td').append(operation_selector);
+
+
+    var smoothing_selector = $('<select id="smoothing_selector"/>')
+    $.each([1,2,3,4,5,6,7,8], function(i, v) {
+        var option = $('<option/>').attr('value', v).text(v);
+        if (smoothing == v) {
+            option.attr('selected','selected');
+        }
+        smoothing_selector.append(option);
+    });
+    chart_controls_tbl.append('<tr><td style="width:150px"><label for="smoothing_selector"/>Data smoothing:</label></td><td id="smoothing_selector_td"></td></tr>')
+    $("#smoothing_selector_td").append(smoothing_selector);
+
+    chart_controls_tbl.append('<tr><td colspan="100%">Zoom: <a href="#" id="reset_zoom">reset</a><table id="zoom"><tr><td><label for="xmin"/>x min</label></td><td><input id="xmin"/></td><td><label for="xmax"/>x max</label></td><td><input id="xmax"/></td></tr><tr><td><label for="ymin"/>y min</label></td><td><input id="ymin"/></td><td><label for="ymax"/>y max</label></td><td><input id="ymax"/></td></tr></table></td></tr>');
+
+    chart_controls_tbl.append('<tr><td style="padding-top:10px" colspan="100%">To hide/show a dataset click on the associated colored box</td></tr>');
+
+    var raw_data;
+
+    //Callback that (re)draws the graph from raw_data; the selector change handlers below call it again.
+    var graph_callback = function() {
+        var data = [];
+        var trials = {};
+        var data_by_title = {};
+        //Keep track of what operations are available from the test:
+        var operations = {};
+
+        raw_data.stats.forEach(function(d) {
+            // Make a copy of d so we never modify raw_data
+            d = $.extend({}, d);
+            operations[d.test] = true;
+            if (d.test!=operation) {
+                return;
+            }
+            d.title = d['label'] != undefined ? d['label'] : d['revision'];
+            data_by_title[d.title] = d;
+            data.push(d);
+            trials[d.title] = d;
+            //Clean up the intervals:
+            //Keep only every smoothing-th item (indexes 0, smoothing, 2*smoothing, ...), so as to smooth the line:
+            var new_intervals = [];
+            d.intervals.forEach(function(i, x) {
+                if (x % smoothing == 0) {
+                    new_intervals.push(i);
+                }
+            });
+            d.intervals = new_intervals;
+        });
+
+        //Fill operations available from test:
+        operation_selector.children().remove();
+        $.each(operations, function(k) {
+            var option = $('<option/>').attr('value', k).text(k);
+            if (operation == k) {
+                option.attr('selected','selected');
+            }
+            operation_selector.append(option);
+        });
+
+        var getMetricValue = function(d) {
+            if (metric_index >= 0) {
+                //This is one of the metrics directly reported by stress:
+                return d[metric_index];
+            } else {
+                //This metric is not reported by stress, so compute it ourselves:
+                if (metric == 'num_timeouts') {
+                    return d[stress_metrics.indexOf('interval_op_rate')] - d[stress_metrics.indexOf('interval_key_rate')];
+                }
+            } //any other unreported metric falls through and yields undefined
+        };
+
+        //Parse the dates:
+        data.forEach(function(d) {
+            d.date = new Date(Date.parse(d.date));
+        });
+
+
+        $("svg").remove();
+        //Setup initial zoom level:
+        defaultZoom = function(initialize) {
+            if (!initialize) {
+                //Reset zoom query params:
+                query.xmin = xmin = undefined;
+                query.xmax = xmax = undefined;
+                query.ymin = ymin = undefined;
+                query.ymax = ymax = undefined;
+            }
+            query.xmin = xmin = query.xmin ? query.xmin : 0;
+            query.xmax = xmax = query.xmax ? query.xmax : Math.round(d3.max(data, function(d) {
+                if (d.intervals.length > 0) {
+                    return d.intervals[d.intervals.length-1][time_index];
+                }
+            }) * 1.1 * 100) / 100; //default: largest final time value plus 10%, rounded to 2 decimals
+            query.ymin = ymin = query.ymin ? query.ymin : 0;
+            query.ymax = ymax = query.ymax ? query.ymax : Math.round(d3.max(data, function(d) {
+                return d3.max(d.intervals, function(i) {
+                    return getMetricValue(i);
+                });
+            }) * 1.1 * 100) / 100; //default: largest metric value plus 10%, rounded to 2 decimals
+            $("#xmin").val(xmin);
+            $("#xmax").val(xmax);
+            $("#ymin").val(ymin);
+            $("#ymax").val(ymax);
+            var updateX = function() {
+                query.xmin = xmin = $("#xmin").val();
+                query.xmax = xmax = $("#xmax").val();
+                x.domain([xmin,xmax]);
+                updateURLBar();
+            };
+            var updateY = function() {
+                query.ymin = ymin = $("#ymin").val();
+                query.ymax = ymax = $("#ymax").val();
+                y.domain([ymin, ymax]);
+                updateURLBar();
+            };
+            $("#xmin,#xmax").unbind().change(function(e) {
+                updateX();
+                redrawLines();
+            });
+            $("#ymin,#ymax").unbind().change(function(e) {
+                updateY();
+                redrawLines();
+            });
+            // The first time defaultZoom is called, we pass initialize=true and skip
+            // updateX/updateY/redrawLines, because x, y and svg are only assigned
+            // further down in graph_callback. On subsequent calls, without initialize, we run them.
+            if (!initialize) {
+                updateX();
+                updateY();
+                redrawLines();
+            }
+        }
+        defaultZoom(true);
+
+        $("#reset_zoom").click(function(e) { //NOTE(review): bound on every graph_callback run without unbind()
+            defaultZoom();
+            e.preventDefault();
+        });
+
+        //Setup chart:
+        var margin = {top: 20, right: 1180, bottom: 4240, left: 60};
+        var width = 2060 - margin.left - margin.right; //820
+        var height = 4700 - margin.top - margin.bottom; //440
+
+        var x = d3.scale.linear()
+            .domain([xmin, xmax])
+            .range([0, width]);
+
+        var y = d3.scale.linear()
+            .domain([ymin, ymax])
+            .range([height, 0]);
+
+        var color = d3.scale.category10();
+        color.domain(data.map(function(d){return d.title}));
+
+        var xAxis = d3.svg.axis()
+            .scale(x)
+            .orient("bottom");
+
+        var yAxis = d3.svg.axis()
+            .scale(y)
+            .orient("left");
+
+        var line = d3.svg.line()
+            .interpolate("basis")
+            .x(function(d) {
+                return x(d[time_index]); //time in seconds
+            })
+            .y(function(d) {
+                return y(getMetricValue(d));
+            });
+
+        $("body").append("<div id='svg_container'>"); //NOTE(review): appended on every redraw; only the svg elements are removed above
+
+        var redrawLines = function() {
+            svg.select(".x.axis").call(xAxis);
+            svg.select(".y.axis").call(yAxis);
+            svg.selectAll(".line")
+                .attr("class","line")
+                .attr("d", function(d) {
+                    return line(d.intervals);
+                })
+            $("#xmin").val(x.domain()[0]);
+            $("#xmax").val(x.domain()[1]);
+            $("#ymin").val(y.domain()[0]);
+            $("#ymax").val(y.domain()[1]);
+        }
+        //NOTE(review): the zoom behavior below is created but no .call(zoom) appears in this function - confirm whether it is meant to be attached.
+        var zoom = d3.behavior.zoom()
+            .x(x)
+            .y(y)
+            .on("zoom", redrawLines);
+
+        var svg = d3.select("div#svg_container").append("svg")
+            .attr("width", width + margin.left + margin.right + 250)
+            .attr("height", height + margin.top + margin.bottom)
+            .append("g")
+            .attr("transform", "translate(" + margin.left + "," + margin.top + ")")
+
+        // Clip Path
+        svg.append("svg:clipPath")
+            .attr("id", "chart_clip")
+            .append("svg:rect")
+            .attr("width", width)
+            .attr("height", height);
+
+        // Chart title
+        svg.append("text")
+            .attr("x", width / 2 )
+            .attr("y", 0 )
+            .style('font-size', '2em')
+            .style("text-anchor", "middle")
+            .text(raw_data.title + ' - ' + operation);
+
+        // Chart subtitle
+        svg.append("text")
+            .attr("x", width / 2 )
+            .attr("y", 15 )
+            .style('font-size', '1.2em')
+            .style("text-anchor", "middle")
+            .text((raw_data.subtitle ? raw_data.subtitle : ''));
+
+        // x-axis - time
+        svg.append("g")
+            .attr("class", "x axis")
+            .attr("transform", "translate(0," + height + ")")
+            .call(xAxis);
+
+        // x-axis label
+        svg.append("text")
+            .attr("x", width / 2 )
+            .attr("y", height + 30 )
+            .style("text-anchor", "middle")
+            .style("font-size", "1.2em")
+            .text(stress_metric_names['time']);
+
+        // y-axis
+        svg.append("g")
+            .attr("class", "y axis")
+            .call(yAxis)
+            .append("text")
+            .attr("transform", "rotate(-90)")
+            .attr("y", -60)
+            .attr("dy", ".91em")
+            .style("font-size", "1.2em")
+            .style("text-anchor", "end")
+            .text(stress_metric_names[metric]);
+
+        var trial = svg.selectAll(".trial")
+            .data(data)
+            .enter().append("g")
+            .attr("class", "trial")
+            .attr("title", function(d) {
+                return d.title;
+            });
+
+        // Draw benchmarked data:
+        trial.append("path")
+            .attr("class", "line")
+            .attr("clip-path", "url(#chart_clip)")
+            .attr("d", function(d) {
+                return line(d.intervals);
+            })
+            .style("stroke", function(d) { return color(d.title); });
+
+        var legend = svg.selectAll(".legend")
+            .data(color.domain())
+            .enter().append("g")
+            .attr("class", "legend")
+            .attr("transform", function(d, i) {
+                var y_offset = 425 + (i*25) + 70;
+                var x_offset = -550;
+                return "translate(" + x_offset + "," + y_offset + ")";
+            });
+
+        var renderLegendText = function(linenum, getTextCallback) {
+            legend.append("text")
+                .attr("x", width - 24 - 250)
+                .attr("y", 12*linenum)
+                .attr("dy", ".35em")
+                .style("font-family", "monospace")
+                .style("font-size", "1.2em")
+                .style("text-anchor", "start")
+                .text(function(d) {
+                    return getTextCallback(d);
+                });
+        };
+
+        var padTextEnd = function(text, length) { //pads with non-breaking spaces; not called anywhere in this function
+            for(var x=text.length; x<length; x++) {
+                text = text + '\u00A0';
+            }
+            return text;
+        };
+        var padTextStart = function(text, length) {
+            for(var x=text.length; x<length; x++) {
+                text = '\u00A0' + text;
+            }
+            return text;
+        };
+
+        renderLegendText(1, function(title) {
+            return padTextStart(title, title.length + 5);
+        });
+
+        legend.append("rect")
+            .attr("x", width - 270)
+            .attr("width", 18)
+            .attr("height", 18)
+            .attr("class", "legend-rect")
+            .attr("title", function(title) {
+                return title;
+            })
+            .style("fill", color);
+
+        //Make trials hideable by clicking on the colored legend box
+        $("rect.legend-rect").click(function() {
+            $("g.trial[title='" + $(this).attr('title') + "']").toggle();
+        });
+
+
+        // Chart control callbacks:
+        metric_selector.unbind().change(function(e) {
+            // record the new metric in the query and redraw the graph:
+            metric = query.metric = this.value;
+            metric_index = stress_metrics.indexOf(metric);
+            graph_callback();
+            defaultZoom();
+        });
+        operation_selector.unbind().change(function(e) {
+            // record the new operation in the query and redraw the graph:
+            operation = query.operation = this.value;
+            graph_callback();
+            defaultZoom();
+        });
+        smoothing_selector.unbind().change(function(e) {
+            // record the new smoothing factor in the query and redraw the graph:
+            smoothing = query.smoothing = this.value;
+            graph_callback();
+            defaultZoom();
+        });
+        updateURLBar();
+
+        $("#dl-test-data").attr("href",stats_db);
+
+        // Chart zoom/drag surface
+        // This should always be last, so it's on top of everything else
+        svg.append("svg:rect")
+            .attr("id", "zoom_drag_surface")
+            .attr("width", width)
+            .attr("height", height);
+    }
+
+    raw_data = stats; //stats is not defined in this function; it must already exist when drawGraph runs
+    graph_callback();
+
+}
+
+$(document).ready(function(){
+
+    drawGraph();
+
+});
+      -->
+  </script>
+  <style type="text/css">
+div#chart_controls { /* container that graph.js appends to body for the selectors and zoom inputs */
+    margin-left: 900px;
+    position: absolute;
+}
+
+#chart_controls > table {
+    width: 640px;
+}
+
+#chart_controls td {
+    padding: 2px;
+}
+/* table#zoom holds the x min / x max / y min / y max inputs */
+#chart_controls #zoom input {
+    width: 50px;
+}
+
+#chart_controls table#zoom {
+    padding-left: 20px;
+}
+
+#chart_controls table#zoom td {
+    padding-left: 20px;
+}
+/* fully transparent rect that graph.js appends last, over the plot area */
+#zoom_drag_surface {
+    fill: rgba(250, 250, 255, 0.0);
+    z-index: 100; /* NOTE(review): confirm this has any effect on an SVG rect; stacking there normally follows document order */
+}
+
+svg {
+  font: 10px sans-serif;
+}
+/* applies to the groups graph.js creates with class "x axis" and "y axis" */
+.axis path,
+.axis line {
+  fill: none;
+  stroke: #000;
+  shape-rendering: crispEdges;
+}
+
+.x.axis path {
+  display: none;
+}
+/* trial paths; graph.js also sets an inline stroke colour on each path */
+.line {
+  fill: none;
+  stroke: steelblue;
+  stroke-width: 1.5px;
+}
+
+  </style>
+</head>
+<body>
+</body>
\ No newline at end of file
diff --git a/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java
index 1ad1ba62def9..4cda37fb7ef5 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java
@@ -103,7 +103,7 @@ public void testWithMismatchingPending() throws Throwable
             // also disables autocompaction on the nodes
             cluster.forEach((node) -> node.runOnInstance(() -> {
                 ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl");
-                FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
+                FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(cfs));
                 cfs.disableAutoCompaction();
             }));
             cluster.get(1).callOnInstance(repair(options(false, false)));
@@ -111,7 +111,7 @@ public void testWithMismatchingPending() throws Throwable
             cluster.get(1).runOnInstance(() -> {
                 ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl");
                 cfs.enableAutoCompaction();
-                FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
+                FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(cfs));
             });
 
             //IR and Preview repair can't run concurrently. In case the test is flaky, please check CASSANDRA-15685
@@ -402,7 +402,7 @@ private void unmarkRepaired(IInvokableInstance instance, String table)
             ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore(table);
             try
             {
-                cfs.getCompactionStrategyManager().mutateRepaired(cfs.getLiveSSTables(), ActiveRepairService.UNREPAIRED_SSTABLE, null, false);
+                cfs.mutateRepaired(cfs.getLiveSSTables(), ActiveRepairService.UNREPAIRED_SSTABLE, null, false);
             }
             catch (IOException e)
             {
diff --git a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
index a95cc3c94a6d..196ec51a766c 100644
--- a/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
+++ b/test/distributed/org/apache/cassandra/io/sstable/format/ForwardingSSTableReader.java
@@ -450,30 +450,6 @@ public boolean intersects(Collection<Range<Token>> ranges)
         return delegate.intersects(ranges);
     }
 
-    @Override
-    public long getBloomFilterFalsePositiveCount()
-    {
-        return delegate.getBloomFilterFalsePositiveCount();
-    }
-
-    @Override
-    public long getRecentBloomFilterFalsePositiveCount()
-    {
-        return delegate.getRecentBloomFilterFalsePositiveCount();
-    }
-
-    @Override
-    public long getBloomFilterTruePositiveCount()
-    {
-        return delegate.getBloomFilterTruePositiveCount();
-    }
-
-    @Override
-    public long getRecentBloomFilterTruePositiveCount()
-    {
-        return delegate.getRecentBloomFilterTruePositiveCount();
-    }
-
     @Override
     public InstrumentingCache<KeyCacheKey, BigTableRowIndexEntry> getKeyCache()
     {
diff --git a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
index 60dffb12418a..34f263dbeecd 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongCompactionsTest.java
@@ -189,7 +189,7 @@ private void forceCompactions(ColumnFamilyStore cfs)
         {
             ArrayList<Future<?>> compactions = new ArrayList<Future<?>>();
             for (int i = 0; i < 10; i++)
-                compactions.addAll(CompactionManager.instance.submitBackground(cfs));
+                compactions.add(CompactionManager.instance.submitBackground(cfs));
             // another compaction attempt will be launched in the background by
             // each completing compaction: not much we can do to control them here
             FBUtilities.waitOnFutures(compactions);
diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
index d142eda289c3..075988f7b0ad 100644
--- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
+++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java
@@ -27,6 +27,7 @@
 import org.apache.cassandra.db.lifecycle.SSTableSet;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.io.sstable.ISSTableScanner;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -72,8 +73,10 @@ public void testParallelLeveledCompaction() throws Exception
         Keyspace keyspace = Keyspace.open(ksname);
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(cfname);
         store.disableAutoCompaction();
-        CompactionStrategyManager mgr = store.getCompactionStrategyManager();
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) mgr.getStrategies().get(1).get(0);
+        CompactionStrategyContainer strategyContainer = store.getCompactionStrategyContainer();
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) strategyContainer
+                                                                    .getStrategies(false, null)
+                                                                    .get(0);
 
         ByteBuffer value = ByteBuffer.wrap(new byte[100 * 1024]); // 100 KB value, make it easy to have multiple files
 
@@ -88,17 +91,12 @@ public void testParallelLeveledCompaction() throws Exception
         {
             while (true)
             {
-                final AbstractCompactionTask nextTask = lcs.getNextBackgroundTask(Integer.MIN_VALUE);
-                if (nextTask == null)
+                final Collection<AbstractCompactionTask> nextTasks = lcs.getNextBackgroundTasks(Integer.MIN_VALUE);
+                if (nextTasks.isEmpty())
                     break;
-                tasks.add(new Runnable()
-                {
-                    public void run()
-                    {
-                        nextTask.execute();
-                    }
-                });
+                tasks.addAll(nextTasks.stream().map(t -> (Runnable) () -> t.execute()).collect(Collectors.toList()));
             }
+
             if (tasks.isEmpty())
                 break;
 
@@ -144,8 +142,10 @@ public void testLeveledScanner() throws Exception
 
         LeveledCompactionStrategyTest.waitForLeveling(store);
         store.disableAutoCompaction();
-        CompactionStrategyManager mgr = store.getCompactionStrategyManager();
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) mgr.getStrategies().get(1).get(0);
+        CompactionStrategyContainer strategyContainer  = store.getCompactionStrategyContainer();
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) strategyContainer
+                                                                    .getStrategies(false, null)
+                                                                    .get(0);
 
         value = ByteBuffer.wrap(new byte[10 * 1024]); // 10 KB value
 
@@ -179,7 +179,7 @@ public Void call() throws Exception
                     }
                 }
 
-                try (AbstractCompactionStrategy.ScannerList scannerList = lcs.getScanners(Lists.newArrayList(allSSTables)))
+                try (ScannerList scannerList = lcs.getScanners(Lists.newArrayList(allSSTables)))
                 {
                     //Verify that leveled scanners will always iterate in ascending order (CASSANDRA-9935)
                     for (ISSTableScanner scanner : scannerList.scanners)
@@ -212,15 +212,19 @@ public void testRepairStatusChanges() throws Exception
         ColumnFamilyStore store = keyspace.getColumnFamilyStore(cfname);
         store.disableAutoCompaction();
 
-        CompactionStrategyManager mgr = store.getCompactionStrategyManager();
-        LeveledCompactionStrategy repaired = (LeveledCompactionStrategy) mgr.getStrategies().get(0).get(0);
-        LeveledCompactionStrategy unrepaired = (LeveledCompactionStrategy) mgr.getStrategies().get(1).get(0);
+        CompactionStrategyContainer strategyContainer = store.getCompactionStrategyContainer();
+        LeveledCompactionStrategy repaired = (LeveledCompactionStrategy) strategyContainer
+                                                                         .getStrategies(true, null)
+                                                                         .get(0);
+        LeveledCompactionStrategy unrepaired = (LeveledCompactionStrategy) strategyContainer
+                                                                           .getStrategies(false, null)
+                                                                           .get(0);
 
         // populate repaired sstables
         populateSSTables(store);
         assertTrue(repaired.getSSTables().isEmpty());
         assertFalse(unrepaired.getSSTables().isEmpty());
-        mgr.mutateRepaired(store.getLiveSSTables(), FBUtilities.nowInSeconds(), null, false);
+        store.mutateRepaired(store.getLiveSSTables(), FBUtilities.nowInSeconds(), null, false);
         assertFalse(repaired.getSSTables().isEmpty());
         assertTrue(unrepaired.getSSTables().isEmpty());
 
@@ -235,7 +239,7 @@ public void testRepairStatusChanges() throws Exception
         assertFalse(unrepaired.getSSTables().isEmpty());
 
         // mark unrepair
-        mgr.mutateRepaired(store.getLiveSSTables().stream().filter(s -> s.isRepaired()).collect(Collectors.toList()),
+        store.mutateRepaired(store.getLiveSSTables().stream().filter(s -> s.isRepaired()).collect(Collectors.toList()),
                            ActiveRepairService.UNREPAIRED_SSTABLE,
                            null,
                            false);
diff --git a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
index ec3271139c84..0445f47f6364 100644
--- a/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
+++ b/test/memory/org/apache/cassandra/db/compaction/CompactionAllocationTest.java
@@ -390,7 +390,7 @@ private static void measure(Workload workload) throws Throwable
         ActiveOperations active = new ActiveOperations();
         Set<SSTableReader> sstables = cfs.getLiveSSTables();
 
-        CompactionTasks tasks = cfs.getCompactionStrategyManager()
+        CompactionTasks tasks = cfs.getCompactionStrategyContainer()
                                    .getUserDefinedTasks(sstables, FBUtilities.nowInSeconds());
         Assert.assertFalse(tasks.isEmpty());
 
diff --git a/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Repaired-shard_1.csv b/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Repaired-shard_1.csv
new file mode 100644
index 000000000000..e5d8a8935258
--- /dev/null
+++ b/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Repaired-shard_1.csv
@@ -0,0 +1,292 @@
+Timestamp,Event,Bucket,W,T,F,min size,max size,Tot. SSTables,Tot. size (bytes),Compactions,Comp. SSTables,Read (bytes/sec),Write (bytes/sec),Tot. comp. size/Read/Written (bytes)
+2020-11-10 10:59:51.854,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.857,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.858,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.859,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:30.208,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:30.758,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:33.836,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.918,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.918,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:05.538,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:05.538,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:36.752,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:36.752,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:04:38.001,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:04:38.001,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:07.808,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:07.808,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.202,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.202,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.203,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.203,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.204,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.204,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.206,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.206,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:17.764,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:17.764,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:07:58.120,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:07:58.120,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:09.817,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:09.817,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:49.567,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:49.567,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.475,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.475,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.476,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.476,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.477,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.477,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.478,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.478,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:09:53.012,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:09:53.012,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:10:49.931,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:10:49.931,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:05.116,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:05.116,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.362,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.362,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.369,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.369,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.370,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.370,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.371,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.371,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.372,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.372,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.373,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.373,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.374,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.374,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.375,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.375,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.376,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.376,submitted,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:36.481,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:36.481,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:12.694,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:12.694,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:14.952,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:14.952,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:21.342,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:21.342,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:07.301,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:07.301,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:15.871,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:15.872,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:18.864,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:18.864,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:32.666,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:32.666,completed,1,2,4,4,114.700MiB,458.800MiB,1,184.298MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:16:28.557,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:16:28.557,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:16:28.557,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.396,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.396,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.396,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.526,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.526,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.526,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:53.108,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:53.109,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:53.109,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.156,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.156,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.156,completed,2,2,4,4,458.800MiB,1.792GiB,1,839.291MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.166,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.166,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.166,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.171,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.171,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.171,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:29.096,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:29.096,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:29.096,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.503,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.503,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.503,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.639GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.515,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.515,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.515,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.520,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.520,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.520,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.521,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.521,submitted,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.522,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:08.500,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:08.500,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:08.500,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,1,2,4,4,114.700MiB,458.800MiB,2,404.842MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.604,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.604,submitted,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.604,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:32.197,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:32.197,completed,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:32.197,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:38.467,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:38.467,completed,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:38.467,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,1,2,4,4,114.700MiB,458.800MiB,3,625.714MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/0,4/0,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/0,4/0,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.780,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.785,submitted,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.785,submitted,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.785,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:20.540,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:20.540,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,874.611MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:20.540,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:09.546,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:09.546,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,17.847MiB/s,15.329MiB/s,874.611MiB/874.581MiB/751.182MiB
+2020-11-10 11:46:09.546,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.172,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.172,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,16.308MiB/s,14.007MiB/s,874.611MiB/874.581MiB/751.182MiB
+2020-11-10 11:46:14.172,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.256,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.256,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,1/1,4/4,16.283MiB/s,13.985MiB/s,874.611MiB/874.611MiB/751.182MiB
+2020-11-10 11:46:14.256,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,0,2,4,4,0.000KiB,114.700MiB,1,110.893MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,1,2,4,4,114.700MiB,458.800MiB,4,846.810MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.424GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
diff --git a/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Unrepaired-shard_0.csv b/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Unrepaired-shard_0.csv
new file mode 100644
index 000000000000..4eea16c288e4
--- /dev/null
+++ b/test/resources/org/apache/cassandra/tools/compaction_logs/compaction-UnifiedCompactionStrategy-density-blobs-0-Unrepaired-shard_0.csv
@@ -0,0 +1,304 @@
+Timestamp,Event,Bucket,W,T,F,min size,max size,Tot. SSTables,Tot. size (bytes),Compactions,Comp. SSTables,Read (bytes/sec),Write (bytes/sec),Tot. comp. size/Read/Written (bytes)
+2020-11-10 10:58:23.606,submitted,0,2,4,4,0.000KiB,114.700MiB,4,444.179MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,461.347MiB/0.000KiB/0.000KiB
+2020-11-10 10:58:23.751,submitted,0,2,4,4,0.000KiB,114.700MiB,4,444.179MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,461.347MiB/0.000KiB/0.000KiB
+2020-11-10 10:58:44.270,completed,0,2,4,4,0.000KiB,114.700MiB,4,444.179MiB,1/1,4/4,22.334MiB/s,11.150MiB/s,461.347MiB/461.347MiB/230.326MiB
+2020-11-10 10:58:44.270,completed,0,2,4,4,0.000KiB,114.700MiB,4,444.179MiB,1/1,4/4,22.334MiB/s,11.150MiB/s,461.347MiB/461.347MiB/230.326MiB
+2020-11-10 10:58:44.274,submitted,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,0.000KiB/s,0.000KiB/s,728.674MiB/0.000KiB/0.000KiB
+2020-11-10 10:58:44.275,submitted,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,0.000KiB/s,0.000KiB/s,728.674MiB/0.000KiB/0.000KiB
+2020-11-10 10:58:44.276,submitted,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,0.000KiB/s,0.000KiB/s,728.674MiB/0.000KiB/0.000KiB
+2020-11-10 10:58:44.277,submitted,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,0.000KiB/s,0.000KiB/s,728.674MiB/0.000KiB/0.000KiB
+2020-11-10 10:59:32.778,completed,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,15.023MiB/s,6.316MiB/s,728.674MiB/728.620MiB/306.319MiB
+2020-11-10 10:59:37.253,completed,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,1/1,6/6,13.755MiB/s,5.782MiB/s,728.674MiB/728.674MiB/306.319MiB
+2020-11-10 10:59:48.348,completed,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.852,completed,0,2,4,4,0.000KiB,114.700MiB,6,701.783MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.854,submitted,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,1/1,16/16,0.000KiB/s,0.000KiB/s,1.798GiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.856,submitted,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,1/1,16/16,0.000KiB/s,0.000KiB/s,1.798GiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.858,submitted,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,1/1,16/16,0.000KiB/s,0.000KiB/s,1.798GiB/0.000KiB/0.000KiB
+2020-11-10 10:59:51.859,submitted,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,1/1,16/16,0.000KiB/s,0.000KiB/s,1.798GiB/0.000KiB/0.000KiB
+2020-11-10 11:02:30.208,completed,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,1/1,16/16,11.628MiB/s,4.326MiB/s,1.798GiB/1.798GiB/684.947MiB
+2020-11-10 11:02:30.758,completed,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:33.836,completed,0,2,4,4,0.000KiB,114.700MiB,16,1.732GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.918,submitted,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,0.000KiB/s,0.000KiB/s,4.044GiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.918,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,0.000KiB/s,0.000KiB/s,4.044GiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,0.000KiB/s,0.000KiB/s,4.044GiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,0.000KiB/s,0.000KiB/s,4.044GiB/0.000KiB/0.000KiB
+2020-11-10 11:02:38.919,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:05.538,completed,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,4.670MiB/s,2.333MiB/s,4.044GiB/124.293MiB/62.091MiB
+2020-11-10 11:03:05.538,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:03:36.752,completed,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,3/1,36/4,7.954MiB/s,3.974MiB/s,4.044GiB/459.960MiB/229.800MiB
+2020-11-10 11:03:36.752,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:04:38.000,completed,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,2/0,32/0,0.000KiB/s,0.000KiB/s,3.595GiB/0.000KiB/0.000KiB
+2020-11-10 11:04:38.000,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:07.808,completed,0,2,4,4,0.000KiB,114.700MiB,37,4.002GiB,2/0,32/0,0.000KiB/s,0.000KiB/s,3.595GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:07.808,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.202,submitted,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.202,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.203,submitted,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.203,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.204,submitted,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.204,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.206,submitted,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:08.206,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:05:17.764,completed,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:05:17.764,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:07:58.120,completed,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:07:58.120,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:09.816,completed,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:09.817,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:49.567,completed,0,2,4,4,0.000KiB,114.700MiB,68,7.394GiB,2/1,68/64,0.000KiB/s,0.000KiB/s,7.679GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:49.567,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.475,submitted,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,5/2,116/68,0.000KiB/s,0.000KiB/s,13.071GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.475,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.476,submitted,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,5/2,116/68,0.000KiB/s,0.000KiB/s,13.071GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.476,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.477,submitted,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,5/2,116/68,0.000KiB/s,0.000KiB/s,13.071GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.477,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.478,submitted,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,5/2,116/68,0.000KiB/s,0.000KiB/s,13.071GiB/0.000KiB/0.000KiB
+2020-11-10 11:08:58.478,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:09:53.012,completed,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,5/2,116/68,1.354MiB/s,692.084KiB/s,13.071GiB/459.473MiB/229.346MiB
+2020-11-10 11:09:53.012,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:10:49.931,completed,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,4/1,112/64,0.000KiB/s,0.000KiB/s,12.622GiB/0.000KiB/0.000KiB
+2020-11-10 11:10:49.931,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:05.116,completed,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,4/1,112/64,51.189KiB/s,17.668KiB/s,12.622GiB/17.842MiB/6.158MiB
+2020-11-10 11:11:05.116,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.362,completed,0,2,4,4,0.000KiB,114.700MiB,118,12.802GiB,4/1,112/64,5.042MiB/s,1.736MiB/s,12.622GiB/1.872GiB/659.976MiB
+2020-11-10 11:11:28.362,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.369,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.044MiB/s,1.737MiB/s,16.704GiB/1.872GiB/660.188MiB
+2020-11-10 11:11:28.369,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.370,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.044MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.288MiB
+2020-11-10 11:11:28.370,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.371,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.044MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.313MiB
+2020-11-10 11:11:28.371,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.372,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.044MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.340MiB
+2020-11-10 11:11:28.372,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.373,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.045MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.381MiB
+2020-11-10 11:11:28.373,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.374,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.045MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.424MiB
+2020-11-10 11:11:28.374,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.375,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.045MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.468MiB
+2020-11-10 11:11:28.375,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:28.376,submitted,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,5.046MiB/s,1.737MiB/s,16.704GiB/1.873GiB/660.505MiB
+2020-11-10 11:11:28.376,submitted,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:11:36.480,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,6.429MiB/s,2.214MiB/s,16.704GiB/2.488GiB/877.383MiB
+2020-11-10 11:11:36.481,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:12.694,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,4/2,148/80,13.042MiB/s,4.548MiB/s,16.704GiB/9.027GiB/3.148GiB
+2020-11-10 11:14:12.694,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:14.952,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,3/1,132/64,13.541MiB/s,4.644MiB/s,14.906GiB/7.230GiB/2.480GiB
+2020-11-10 11:14:14.952,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:14:21.342,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,2/0,68/0,0.000KiB/s,0.000KiB/s,7.676GiB/0.000KiB/0.000KiB
+2020-11-10 11:14:21.342,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:07.301,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,2/0,68/0,0.000KiB/s,0.000KiB/s,7.676GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:07.301,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:15.871,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,2/0,68/0,0.000KiB/s,0.000KiB/s,7.676GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:15.871,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:18.864,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,2/0,68/0,0.000KiB/s,0.000KiB/s,7.676GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:18.864,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:32.665,completed,0,2,4,4,0.000KiB,114.700MiB,150,16.300GiB,2/0,68/0,0.000KiB/s,0.000KiB/s,7.676GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:32.666,completed,1,2,4,4,114.700MiB,458.800MiB,1,294.917MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,7/1,124/4,0.000KiB/s,0.000KiB/s,13.969GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.225,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,7/1,124/4,0.000KiB/s,0.000KiB/s,13.969GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.232,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,7/1,124/4,0.000KiB/s,0.000KiB/s,13.969GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.234,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,7/1,124/4,0.000KiB/s,0.000KiB/s,13.969GiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:15:42.236,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:16:28.556,completed,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,7/1,124/4,9.930MiB/s,4.959MiB/s,13.969GiB/459.953MiB/229.669MiB
+2020-11-10 11:16:28.557,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:16:28.557,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,6/0,120/0,0.000KiB/s,0.000KiB/s,13.520GiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:22.572,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,6/0,120/0,0.000KiB/s,0.000KiB/s,13.520GiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:17:25.432,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,0,2,4,4,0.000KiB,114.700MiB,125,13.558GiB,6/0,120/0,0.000KiB/s,0.000KiB/s,13.520GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.288,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.395,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.395,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.396,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.397,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.398,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.399,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.400,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.401,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:20:34.402,submitted,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:26.046,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:23:43.065,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.523,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.526,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:24:57.526,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:25:46.775,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:28.292,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,0.000KiB/s,0.000KiB/s,21.196GiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:26:38.417,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:53.108,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,14.839MiB/s,5.163MiB/s,21.196GiB/7.227GiB/2.514GiB
+2020-11-10 11:28:53.108,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:53.108,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.156,completed,0,2,4,4,0.000KiB,114.700MiB,190,20.626GiB,8/1,188/64,14.749MiB/s,5.131MiB/s,21.196GiB/7.227GiB/2.514GiB
+2020-11-10 11:28:56.156,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.156,completed,2,2,4,4,458.800MiB,1.792GiB,1,840.890MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.165,submitted,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.166,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.166,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.167,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.169,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.170,submitted,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.171,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:28:56.171,submitted,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:39.447,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,0.000KiB/s,0.000KiB/s,20.712GiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:29:59.855,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:29.095,completed,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,16.253MiB/s,5.442MiB/s,20.712GiB/7.189GiB/2.407GiB
+2020-11-10 11:36:29.096,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:29.096,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.503,completed,0,2,4,4,0.000KiB,114.700MiB,186,20.158GiB,7/1,184/64,16.203MiB/s,5.425MiB/s,20.712GiB/7.189GiB/2.407GiB
+2020-11-10 11:36:30.503,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.503,completed,2,2,4,4,458.800MiB,1.792GiB,2,1.642GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.514,submitted,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,0.000KiB/s,0.000KiB/s,18.462GiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.514,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.514,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,0.000KiB/s,0.000KiB/s,18.462GiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.516,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,0.000KiB/s,0.000KiB/s,18.462GiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.518,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.519,submitted,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,0.000KiB/s,0.000KiB/s,18.462GiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.519,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.520,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.521,submitted,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,0.000KiB/s,0.000KiB/s,18.462GiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.521,submitted,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:36:30.521,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:08.500,completed,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,5/1,164/16,11.647MiB/s,4.333MiB/s,18.462GiB/1.797GiB/684.564MiB
+2020-11-10 11:39:08.500,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:08.500,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,4/0,148/0,0.000KiB/s,0.000KiB/s,16.665GiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:09.848,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,4/0,148/0,0.000KiB/s,0.000KiB/s,16.665GiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:39:31.878,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,0,2,4,4,0.000KiB,114.700MiB,167,18.102GiB,4/0,148/0,0.000KiB/s,0.000KiB/s,16.665GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,1,2,4,4,114.700MiB,458.800MiB,2,515.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:18.248,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.603,submitted,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.604,submitted,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.604,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.605,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.607,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.608,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:42:24.609,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,0.000KiB/s,0.000KiB/s,20.709GiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:43:09.723,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:32.197,completed,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,14.419MiB/s,5.369MiB/s,20.709GiB/1.797GiB/685.047MiB
+2020-11-10 11:44:32.197,completed,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:32.197,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:38.466,completed,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,7/1,184/16,13.744MiB/s,5.118MiB/s,20.709GiB/1.797GiB/685.047MiB
+2020-11-10 11:44:38.466,completed,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:38.466,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,6/0,168/0,0.000KiB/s,0.000KiB/s,18.912GiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:44:56.676,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,0,2,4,4,0.000KiB,114.700MiB,185,20.048GiB,6/0,168/0,0.000KiB/s,0.000KiB/s,18.912GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,1,2,4,4,114.700MiB,458.800MiB,3,736.950MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.042,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/0,4/0,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.775,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.776,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.777,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.778,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.779,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.780,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.780,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.780,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.781,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.782,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.783,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.784,submitted,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.785,submitted,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,0.000KiB/s,0.000KiB/s,989.038MiB/0.000KiB/0.000KiB
+2020-11-10 11:45:19.785,submitted,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:45:20.540,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,0.000KiB/s,0.000KiB/s,20.707GiB/0.000KiB/0.000KiB
+2020-11-10 11:45:20.540,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,75.074MiB/s,65.493MiB/s,989.038MiB/56.558MiB/49.339MiB
+2020-11-10 11:45:20.540,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:09.546,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,7/1,184/4,9.248MiB/s,4.617MiB/s,20.707GiB/460.169MiB/229.723MiB
+2020-11-10 11:46:09.546,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,19.876MiB/s,17.310MiB/s,989.038MiB/989.006MiB/861.355MiB
+2020-11-10 11:46:09.546,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.171,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.172,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,1/1,4/4,18.186MiB/s,15.838MiB/s,989.038MiB/989.038MiB/861.355MiB
+2020-11-10 11:46:14.172,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.256,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.256,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:46:14.256,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:10.453,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:29.530,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:49:51.313,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,0,2,4,4,0.000KiB,114.700MiB,184,19.939GiB,6/0,180/0,0.000KiB/s,0.000KiB/s,20.258GiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,1,2,4,4,114.700MiB,458.800MiB,4,958.353MiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
+2020-11-10 11:51:38.206,completed,2,2,4,4,458.800MiB,1.792GiB,3,2.427GiB,0/0,0/0,0.000KiB/s,0.000KiB/s,0.000KiB/0.000KiB/0.000KiB
diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java
index a41b04098d4c..7701a10b8b3b 100644
--- a/test/unit/org/apache/cassandra/Util.java
+++ b/test/unit/org/apache/cassandra/Util.java
@@ -258,7 +258,7 @@ public static Future<?> compactAll(ColumnFamilyStore cfs, int gcBefore)
     public static void compact(ColumnFamilyStore cfs, Collection<SSTableReader> sstables)
     {
         int gcBefore = cfs.gcBefore(FBUtilities.nowInSeconds());
-        try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstables, gcBefore))
+        try (CompactionTasks tasks = cfs.getCompactionStrategy().getUserDefinedTasks(sstables, gcBefore))
         {
             for (AbstractCompactionTask task : tasks)
                 task.execute();
@@ -725,6 +725,15 @@ public static Closeable markDirectoriesUnwriteable(ColumnFamilyStore cfs)
         return () -> DisallowedDirectories.clearUnwritableUnsafe();
     }
 
+    public static boolean getDirectoriesWriteable(ColumnFamilyStore cfs) // true only if no directory of cfs is flagged unwritable
+    {
+        boolean ret = true; // stays true when the directory list is empty
+        for (File dir : cfs.getDirectories().getCFDirectories())
+            ret &= !DisallowedDirectories.isUnwritable(dir); // a single unwritable directory makes the result false
+
+        return ret;
+    }
+
     public static PagingState makeSomePagingState(ProtocolVersion protocolVersion)
     {
         return makeSomePagingState(protocolVersion, Integer.MAX_VALUE);
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
index a33b4375de9f..5cf68de14710 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
@@ -141,6 +141,7 @@ public class DatabaseDescriptorRefTest
     "org.apache.cassandra.io.util.DataOutputPlus",
     "org.apache.cassandra.io.util.DiskOptimizationStrategy",
     "org.apache.cassandra.io.util.SpinningDiskOptimizationStrategy",
+    "org.apache.cassandra.io.util.FileUtils$DiskAccessType",
     "org.apache.cassandra.locator.Replica",
     "org.apache.cassandra.locator.SimpleSeedProvider",
     "org.apache.cassandra.locator.SeedProvider",
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
index cccc7cc1af95..1e8768c9f418 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorTest.java
@@ -18,33 +18,43 @@
 */
 package org.apache.cassandra.config;
 
+import java.io.IOException;
 import java.net.Inet4Address;
 import java.net.Inet6Address;
 import java.net.InetAddress;
 import java.net.NetworkInterface;
+import java.nio.file.FileStore;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Enumeration;
 
-
 import com.google.common.base.Throwables;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Multiset;
 import org.junit.Assert;
 import org.junit.BeforeClass;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
 import org.junit.runner.RunWith;
 
 import org.apache.cassandra.OrderedJUnit4ClassRunner;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.mockito.Mockito;
 
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.when;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class DatabaseDescriptorTest
 {
+    @Rule
+    public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
     @BeforeClass
     public static void setupDatabaseDescriptor()
     {
@@ -593,4 +603,44 @@ public void testApplyTokensConfigInitialTokensOneNumTokensNotSet()
         Assert.assertEquals(Integer.valueOf(1), config.num_tokens);
         Assert.assertEquals(1, DatabaseDescriptor.tokensFromString(config.initial_token).size());
     }
+
+    @Test
+    public void testDataFileDirectoriesMinTotalSpaceInGB() throws IOException
+    {
+        DatabaseDescriptor.getRawConfig().data_file_directories = new String[]{}; // no data directories configured
+        assertEquals(0L, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB());
+
+        DatabaseDescriptor.getRawConfig().data_file_directories = new String[] { temporaryFolder.newFolder("data").toString() }; // one real directory
+        assertTrue(DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB() > 0);
+
+        Multiset<FileStore> fileStoreMultiset = HashMultiset.create(); // one entry per data directory; duplicates model directories sharing a disk
+
+        // single disk (i.e. mockFileStore1)
+        FileStore mockFileStore1 = Mockito.mock(FileStore.class);
+        when(mockFileStore1.getTotalSpace()).thenReturn(1L << 43); // 8 TB
+        fileStoreMultiset.add(mockFileStore1);
+        assertEquals(8192L, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(fileStoreMultiset)); // 2^43 bytes >> 30 = 8192
+
+        // two different disks (i.e. mockFileStore1, mockFileStore2)
+        FileStore mockFileStore2 = Mockito.mock(FileStore.class);
+        when(mockFileStore2.getTotalSpace()).thenReturn(1L << 41); // 2 TB
+        fileStoreMultiset.add(mockFileStore2);
+        assertEquals(4096L, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(fileStoreMultiset)); // presumably 2 dirs x smallest per-dir share (2048) -- TODO confirm
+
+        // two different disks with three directories. Two directories are on disk 1 (i.e. mockFileStore1)
+        fileStoreMultiset.add(mockFileStore1);
+        assertEquals(6144L, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(fileStoreMultiset)); // presumably 3 dirs x 2048 -- TODO confirm
+
+        fileStoreMultiset.clear();
+
+        FileStore mockLargeFileStore = Mockito.mock(FileStore.class);
+        when(mockLargeFileStore.getTotalSpace()).thenReturn(-1L); // negative size; the assertion below expects it to count as Long.MAX_VALUE bytes
+        fileStoreMultiset.add(mockLargeFileStore);
+        assertEquals(Long.MAX_VALUE >> 30, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(fileStoreMultiset));
+
+        FileStore mockSmallFileStore = Mockito.mock(FileStore.class);
+        when(mockSmallFileStore.getTotalSpace()).thenReturn(1L << 29); // 512 MB
+        fileStoreMultiset.add(mockSmallFileStore);
+        assertEquals(0L, DatabaseDescriptor.getDataFileDirectoriesMinTotalSpaceInGB(fileStoreMultiset)); // expected 0: the 512 MB store is below 1 GB
+    }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 26fab6c4acea..37beff53e1fc 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -48,6 +48,7 @@
 
 import com.google.common.base.Objects;
 import com.google.common.base.Strings;
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableSet;
 import org.junit.*;
 import org.junit.rules.TestWatcher;
@@ -682,6 +683,14 @@ protected String currentKeyspace()
         return keyspaces.get(keyspaces.size() - 1);
     }
 
+    protected Collection<String> currentTables() // snapshot of the table names tracked by this tester
+    {
+        if (tables == null || tables.isEmpty())
+            return ImmutableList.of(); // never null: an empty immutable list is returned instead
+
+        return new ArrayList<>(tables); // copy, so the caller does not share the tester's own collection
+    }
+
     protected ByteBuffer unset()
     {
         return ByteBufferUtil.UNSET_BYTE_BUFFER;
diff --git a/test/unit/org/apache/cassandra/cql3/CompactionOutOfSpaceTest.java b/test/unit/org/apache/cassandra/cql3/CompactionOutOfSpaceTest.java
new file mode 100644
index 000000000000..bcc2b08e7aae
--- /dev/null
+++ b/test/unit/org/apache/cassandra/cql3/CompactionOutOfSpaceTest.java
@@ -0,0 +1,243 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import org.apache.cassandra.Util;
+import org.apache.cassandra.config.Config;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.gms.Gossiper;
+import org.apache.cassandra.service.CassandraDaemon;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.JVMStabilityInspector;
+import org.apache.cassandra.utils.KillerForTests;
+import org.jboss.byteman.contrib.bmunit.BMRule;
+import org.jboss.byteman.contrib.bmunit.BMUnitRunner;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.fail;
+
+@RunWith(BMUnitRunner.class)
+public class CompactionOutOfSpaceTest extends CQLTester
+{
+    @BeforeClass
+    public static void setupClass()
+    {
+        CQLTester.setUpClass();
+        CQLTester.requireNetwork();
+        CassandraDaemon d = new CassandraDaemon();
+        d.activate();
+        StorageService.instance.registerDaemon(d);
+    }
+
+    @Before
+    public void setup()
+    {
+        // restart the services in case a previous test has stopped them
+
+        if (!StorageService.instance.isNativeTransportRunning())
+            StorageService.instance.startNativeTransport();
+
+        if (!StorageService.instance.isGossipActive())
+            StorageService.instance.startGossiping();
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testUcsBackgroundCompactionNoDiskSpaceIgnore() throws Throwable
+    {
+        String ucsCqlCompactionParams = "{'class':'UnifiedCompactionStrategy', 'num_shards':'1'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.ignore, ucsCqlCompactionParams);
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testUcsBackgroundCompactionNoDiskSpaceStop() throws Throwable
+    {
+        String ucsCqlCompactionParams = "{'class':'UnifiedCompactionStrategy', 'num_shards':'1'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.stop, ucsCqlCompactionParams);
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testUcsBackgroundCompactionNoDiskSpaceDie() throws Throwable
+    {
+        String ucsCqlCompactionParams = "{'class':'UnifiedCompactionStrategy', 'num_shards':'1'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.die, ucsCqlCompactionParams);
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testStcsBackgroundCompactionNoDiskSpaceIgnore() throws Throwable
+    {
+        String stcsCqlCompactionParams = "{'class':'SizeTieredCompactionStrategy', 'max_threshold':'4'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.ignore, stcsCqlCompactionParams);
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testStcsBackgroundCompactionNoDiskSpaceStop() throws Throwable
+    {
+        String stcsCqlCompactionParams = "{'class':'SizeTieredCompactionStrategy', 'max_threshold':'4'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.stop, stcsCqlCompactionParams);
+    }
+
+    @Test
+    @BMRule(name = "Simulate disk full during background compaction",
+    targetClass = "CompactionTask",
+    targetMethod = "runMayThrow",
+    targetLocation = "AT ENTRY",
+    action = "throw new java.io.IOError(new java.io.IOException(\"No space left on device\"))")
+    public void testStcsBackgroundCompactionNoDiskSpaceDie() throws Throwable
+    {
+        String stcsCqlCompactionParams = "{'class':'SizeTieredCompactionStrategy', 'max_threshold':'4'}";
+        flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy.die, stcsCqlCompactionParams);
+    }
+
+    private void flush4SstablesAndEnableAutoCompaction(Config.DiskFailurePolicy policy, String cqlCompactionParams) throws Throwable
+    {
+        createTable("CREATE TABLE %s (k INT, c INT, v INT, PRIMARY KEY (k, c)) WITH compaction = " + cqlCompactionParams);
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        cfs.disableAutoCompaction(); // keep compaction off until all four flushes below are done
+
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1); // each insert is followed by its own blocking flush
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 2, 2);
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 3, 3);
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 4, 4);
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        JVMStabilityInspector.Killer originalKiller = JVMStabilityInspector.replaceKiller(new KillerForTests()); // verifyJVMWasKilled casts the installed killer to KillerForTests
+        Config.DiskFailurePolicy originalPolicy = DatabaseDescriptor.getDiskFailurePolicy(); // remembered so the finally block can restore it
+        try
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(policy);
+            cfs.enableAutoCompaction(true); // presumably blocks until the triggered compaction has run -- TODO confirm
+            verifyDiskFailurePolicy(policy);
+        }
+        finally
+        {
+            DatabaseDescriptor.setDiskFailurePolicy(originalPolicy); // undo the global changes even when an assertion fails
+            JVMStabilityInspector.replaceKiller(originalKiller);
+        }
+    }
+
+    private void verifyDiskFailurePolicy(Config.DiskFailurePolicy policy)
+    {
+        switch (policy)
+        {
+            case stop:
+            case stop_paranoid:
+                verifyDiskFailurePolicyStop();
+                break;
+            case die:
+                verifyDiskFailurePolicyDie();
+                break;
+            case best_effort:
+                verifyDiskFailurePolicyBestEffort();
+                break;
+            case ignore:
+                verifyDiskFailurePolicyIgnore();
+                break;
+            default:
+                fail("Unsupported disk failure policy: " + policy);
+                break;
+        }
+    }
+
+    private void verifyDiskFailurePolicyStop()
+    {
+        verifyGossip(false);
+        verifyNativeTransports(false);
+        verifyJVMWasKilled(false);
+    }
+
+    private void verifyDiskFailurePolicyDie()
+    {
+        verifyJVMWasKilled(true);
+    }
+
+    private void verifyDiskFailurePolicyBestEffort() // best_effort: directories become unwritable, services stay up, JVM survives
+    {
+        assertFalse(Util.getDirectoriesWriteable(getCurrentColumnFamilyStore())); // same lookup flush4SstablesAndEnableAutoCompaction uses for the table it created
+        FBUtilities.sleepQuietly(10); // give them a chance to stop before verifying they were not stopped
+        verifyGossip(true);
+        verifyNativeTransports(true);
+        verifyJVMWasKilled(false);
+    }
+
+    private void verifyDiskFailurePolicyIgnore()
+    {
+        FBUtilities.sleepQuietly(10); // give them a chance to stop before verifying they were not stopped
+        verifyGossip(true);
+        verifyNativeTransports(true);
+        verifyJVMWasKilled(false);
+    }
+
+    private void verifyJVMWasKilled(boolean killed)
+    {
+        KillerForTests killer = (KillerForTests) JVMStabilityInspector.killer();
+        assertEquals(killed, killer.wasKilled());
+        if (killed)
+            assertFalse(killer.wasKilledQuietly()); // true only on startup
+    }
+
+    private void verifyGossip(boolean isEnabled)
+    {
+        assertEquals(isEnabled, Gossiper.instance.isEnabled());
+    }
+
+    private void verifyNativeTransports(boolean isRunning)
+    {
+        // Native transports are also stopped asynchronously, but isRunning is set synchronously
+        assertEquals(isRunning, StorageService.instance.isNativeTransportRunning());
+
+        // if the transport has been stopped, we wait for it to be fully stopped so that restarting it for
+        // the next test will not fail due to the port being already in use
+        if (!isRunning)
+            StorageService.instance.stopNativeTransport();
+    }
+}
diff --git a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
index 62762d36904f..c35821c295a1 100644
--- a/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
+++ b/test/unit/org/apache/cassandra/cql3/GcCompactionTest.java
@@ -162,7 +162,7 @@ public void testGarbageCollectRetainsLCSLevel() throws Throwable
                   "  PRIMARY KEY ((key), column)" +
                   ") WITH compaction = { 'class' : 'LeveledCompactionStrategy' };");
 
-      assertEquals("LeveledCompactionStrategy", getCurrentColumnFamilyStore().getCompactionStrategyManager().getName());
+      assertEquals("LeveledCompactionStrategy", getCurrentColumnFamilyStore().getCompactionStrategyContainer().getName());
 
       for (int i = 0; i < KEY_COUNT; ++i)
           for (int j = 0; j < CLUSTERING_COUNT; ++j)
diff --git a/test/unit/org/apache/cassandra/cql3/ViewTest.java b/test/unit/org/apache/cassandra/cql3/ViewTest.java
index 23b9195e44ab..f0dbe5e6160d 100644
--- a/test/unit/org/apache/cassandra/cql3/ViewTest.java
+++ b/test/unit/org/apache/cassandra/cql3/ViewTest.java
@@ -1385,7 +1385,7 @@ private void testViewBuilderResume(int concurrentViewBuilders) throws Throwable
         createView(viewName1, "CREATE MATERIALIZED VIEW %s AS SELECT * FROM %%s WHERE val IS NOT NULL AND k IS NOT NULL AND c IS NOT NULL PRIMARY KEY (val,k,c)");
 
         cfs.enableAutoCompaction();
-        List<Future<?>> futures = CompactionManager.instance.submitBackground(cfs);
+        Future<?> future = CompactionManager.instance.submitBackground(cfs);
 
         String viewName2 = viewName1 + "_2";
         //Force a second MV on the same base table, which will restart the first MV builder...
@@ -1393,7 +1393,7 @@ private void testViewBuilderResume(int concurrentViewBuilders) throws Throwable
 
 
         //Compact the base table
-        FBUtilities.waitOnFutures(futures);
+        FBUtilities.waitOnFuture(future);
 
         while (!SystemKeyspace.isViewBuilt(keyspace(), viewName1))
             Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS);
diff --git a/test/unit/org/apache/cassandra/db/CleanupTest.java b/test/unit/org/apache/cassandra/db/CleanupTest.java
index 3285f6dde757..7883e469a2af 100644
--- a/test/unit/org/apache/cassandra/db/CleanupTest.java
+++ b/test/unit/org/apache/cassandra/db/CleanupTest.java
@@ -24,6 +24,7 @@
 import java.util.AbstractMap;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -41,6 +42,7 @@
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.schema.ColumnMetadata;
+import org.apache.cassandra.schema.CompactionParams;
 import org.apache.cassandra.schema.KeyspaceMetadata;
 import org.apache.cassandra.cql3.Operator;
 import org.apache.cassandra.db.compaction.CompactionManager;
@@ -65,6 +67,7 @@ public class CleanupTest
     public static final String KEYSPACE1 = "CleanupTest1";
     public static final String CF_INDEXED1 = "Indexed1";
     public static final String CF_STANDARD1 = "Standard1";
+    public static final String CF_STANDARD_UCS1 = "StandardUCS1";
 
     public static final String KEYSPACE2 = "CleanupTestMultiDc";
     public static final String CF_INDEXED2 = "Indexed2";
@@ -87,7 +90,8 @@ public static void defineSchema() throws ConfigurationException
         SchemaLoader.prepareServer();
         SchemaLoader.createKeyspace(KEYSPACE1,
                                     KeyspaceParams.simple(1),
-                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1),
+                                    SchemaLoader.standardCFMD(KEYSPACE1, CF_STANDARD1)
+                                                .compaction(CompactionParams.stcs(new HashMap<>())),
                                     SchemaLoader.compositeIndexCFMD(KEYSPACE1, CF_INDEXED1, true));
 
 
@@ -108,8 +112,11 @@ public String getDatacenter(InetAddressAndPort endpoint)
 
         SchemaLoader.createKeyspace(KEYSPACE2,
                                     KeyspaceParams.nts("DC1", 1),
-                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD2),
-                                    SchemaLoader.compositeIndexCFMD(KEYSPACE2, CF_INDEXED2, true));
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD2)
+                                                .compaction(CompactionParams.stcs(new HashMap<>())),
+                                    SchemaLoader.compositeIndexCFMD(KEYSPACE2, CF_INDEXED2, true),
+                                    SchemaLoader.standardCFMD(KEYSPACE2, CF_STANDARD_UCS1)
+                                                .compaction(CompactionParams.ucs(new HashMap<>())));
         SchemaLoader.createKeyspace(KEYSPACE3,
                                     KeyspaceParams.nts("DC1", 1),
                                     SchemaLoader.standardCFMD(KEYSPACE3, CF_STANDARD3));
@@ -209,18 +216,30 @@ public void testCleanupWithNewToken() throws ExecutionException, InterruptedExce
     }
 
     @Test
-    public void testCleanupWithNoTokenRange() throws Exception
+    public void testCleanupSTCSWithNoTokenRange() throws Exception
     {
-        testCleanupWithNoTokenRange(false);
+        testCleanupWithNoTokenRange(CF_STANDARD2, false);
     }
 
     @Test
-    public void testUserDefinedCleanupWithNoTokenRange() throws Exception
+    public void testUserDefinedCleanupSTCSWithNoTokenRange() throws Exception
     {
-        testCleanupWithNoTokenRange(true);
+        testCleanupWithNoTokenRange(CF_STANDARD2, true);
     }
 
-    private void testCleanupWithNoTokenRange(boolean isUserDefined) throws Exception
+    @Test
+    public void testCleanupUCSWithNoTokenRange() throws Exception
+    {
+        testCleanupWithNoTokenRange(CF_STANDARD_UCS1, false);
+    }
+
+    @Test
+    public void testUserDefinedCleanupUCSWithNoTokenRange() throws Exception
+    {
+        testCleanupWithNoTokenRange(CF_STANDARD_UCS1, true);
+    }
+
+    private void testCleanupWithNoTokenRange(String cfsName, boolean isUserDefined) throws Exception
     {
 
         TokenMetadata tmd = StorageService.instance.getTokenMetadata();
@@ -232,7 +251,7 @@ private void testCleanupWithNoTokenRange(boolean isUserDefined) throws Exception
 
         Keyspace keyspace = Keyspace.open(KEYSPACE2);
         keyspace.setMetadata(KeyspaceMetadata.create(KEYSPACE2, KeyspaceParams.nts("DC1", 1)));
-        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF_STANDARD2);
+        ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfsName);
 
         // insert data and verify we get it back w/ range query
         fillCF(cfs, "val", LOOPS);
diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
index 5e8ab729afef..9604e3571097 100644
--- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
+++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java
@@ -38,6 +38,7 @@
 import org.json.simple.parser.JSONParser;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
 import com.google.common.collect.Iterators;
@@ -49,7 +50,6 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.io.sstable.Component;
 import org.apache.cassandra.io.sstable.Descriptor;
-import org.apache.cassandra.io.sstable.format.SSTableFormat;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.metrics.ClearableHistogram;
 import org.apache.cassandra.schema.ColumnMetadata;
@@ -465,7 +465,7 @@ public void testDataDirectoriesOfColumnFamily() throws Exception
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
         List<String> dataPaths = cfs.getDataPaths();
-        Assert.assertFalse(dataPaths.isEmpty());
+        assertFalse(dataPaths.isEmpty());
 
         Path path = Paths.get(dataPaths.get(0));
 
@@ -502,5 +502,30 @@ public void testScrubDataDirectories() throws Throwable
         List<File> ssTableFiles = new Directories(cfs.metadata()).sstableLister(Directories.OnTxnErr.THROW).listFiles();
         assertNotNull(ssTableFiles);
         assertEquals(0, ssTableFiles.size());
+
+        // remove the sstable from the tracker again so that it does not affect later tests
+        cfs.getTracker().removeUnsafe(Collections.singleton(ssTable));
+    }
+
+    @Test
+    public void testMutateRepaired() throws IOException
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1);
+
+        new RowUpdateBuilder(cfs.metadata(), 0, "key1").clustering("Column1").add("val", "val1").build().applyUnsafe();
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+
+        Set<SSTableReader> sstables = cfs.getLiveSSTables();
+        assertEquals(1, sstables.size());
+
+        SSTableReader sstable = sstables.iterator().next();
+        assertFalse(sstable.isRepaired());
+
+        int repaired = cfs.mutateRepaired(sstables, 1, null, false);
+        assertEquals(1, repaired);
+
+        sstables = cfs.getLiveSSTables();
+        sstable = sstables.iterator().next();
+        assertTrue(sstable.isRepaired());
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java b/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java
index 3cd501e80fe8..8a6bdd066955 100644
--- a/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/DiskBoundaryManagerTest.java
@@ -64,7 +64,7 @@ public void setup()
     public void getBoundariesTest()
     {
         DiskBoundaries dbv = dbm.getDiskBoundaries(mock);
-        Assert.assertEquals(3, dbv.positions.size());
+        Assert.assertEquals(3, dbv.getPositions().size());
         assertEquals(dbv.directories, dirs.getWriteableLocations());
     }
 
@@ -72,11 +72,11 @@ public void getBoundariesTest()
     public void disallowedDirectoriesTest()
     {
         DiskBoundaries dbv = dbm.getDiskBoundaries(mock);
-        Assert.assertEquals(3, dbv.positions.size());
+        Assert.assertEquals(3, dbv.getPositions().size());
         assertEquals(dbv.directories, dirs.getWriteableLocations());
         DisallowedDirectories.maybeMarkUnwritable(new File("/tmp/3"));
         dbv = dbm.getDiskBoundaries(mock);
-        Assert.assertEquals(2, dbv.positions.size());
+        Assert.assertEquals(2, dbv.getPositions().size());
         Assert.assertEquals(Lists.newArrayList(new Directories.DataDirectory(new File("/tmp/1")),
                                         new Directories.DataDirectory(new File("/tmp/2"))),
                                  dbv.directories);
diff --git a/test/unit/org/apache/cassandra/db/SortedLocalRangesTest.java b/test/unit/org/apache/cassandra/db/SortedLocalRangesTest.java
new file mode 100644
index 000000000000..7fd67e377aff
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/SortedLocalRangesTest.java
@@ -0,0 +1,215 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.util.List;
+import java.util.Optional;
+import java.util.Random;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.RandomPartitioner;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.SplitterTest;
+import org.apache.cassandra.locator.TokenMetadata;
+import org.apache.cassandra.service.StorageService;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import static org.apache.cassandra.dht.SplitterTest.getSplitter;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.when;
+
+/**
+ * Unit tests for {@link SortedLocalRanges}, driven by mocked {@link ColumnFamilyStore},
+ * {@link StorageService} and {@link TokenMetadata} instances.
+ */
+public class SortedLocalRangesTest
+{
+    // fixed seed: every run draws the same sequence of token counts, replication factors and parts
+    private static final Random random = new Random(1047504572034957L);
+
+    @Mock
+    ColumnFamilyStore cfs;
+
+    @Mock
+    StorageService storageService;
+
+    @Mock
+    TokenMetadata tmd;
+
+    IPartitioner partitioner;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        DatabaseDescriptor.daemonInitialization();
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+    }
+
+    @Before
+    public void setUp()
+    {
+        MockitoAnnotations.initMocks(this);
+
+        partitioner = DatabaseDescriptor.getPartitioner();
+
+        when(cfs.getPartitioner()).thenReturn(partitioner);
+        when(cfs.getKeyspaceName()).thenReturn("keyspace");
+        when(cfs.getTableName()).thenReturn("table");
+
+        when(storageService.getTokenMetadata()).thenReturn(tmd);
+    }
+
+    /**
+     * Creates the ranges under test and makes the mocked token metadata report the same ring version.
+     */
+    SortedLocalRanges makeRanges(long ringVersion, List<Splitter.WeightedRange> ranges)
+    {
+        when(tmd.getRingVersion()).thenReturn(ringVersion);
+        return new SortedLocalRanges(storageService, cfs, ringVersion, ranges);
+    }
+
+    /**
+     * Generates local ranges for the partitioner selected in {@link #setUp()}, drawing from the shared
+     * seeded {@code random}.
+     */
+    private List<Splitter.WeightedRange> generateRanges(int numTokens, int rf)
+    {
+        return SplitterTest.generateLocalRanges(numTokens,
+                                                rf,
+                                                getSplitter(partitioner),
+                                                random,
+                                                partitioner instanceof RandomPartitioner);
+    }
+
+    /** Checks the accessors and the split results of an instance built from an empty list of ranges. */
+    @Test
+    public void testNoRanges()
+    {
+        long ringVersion = 1;
+        List<Splitter.WeightedRange> ranges = ImmutableList.of();
+        SortedLocalRanges sortedRanges = makeRanges(ringVersion, ranges);
+
+        assertNotNull(sortedRanges);
+        assertNotNull(sortedRanges.toString());
+
+        assertEquals(sortedRanges, sortedRanges);
+        assertEquals(sortedRanges.hashCode(), sortedRanges.hashCode());
+
+        assertFalse(sortedRanges.isOutOfDate());
+        assertTrue(sortedRanges.getRanges().isEmpty());
+        assertEquals(ringVersion, sortedRanges.getRingVersion());
+
+        // if there are no ranges, the splitter will return an array with the maximum token regardless of how many splits
+        assertEquals(1, sortedRanges.split(0).size());
+        assertEquals(1, sortedRanges.split(1).size());
+        assertEquals(1, sortedRanges.split(2).size());
+    }
+
+    /** Checks that splitting generated ranges returns exactly as many boundaries as parts requested. */
+    @Test
+    public void testSplit()
+    {
+        long ringVersion = 1;
+
+        for (int i = 1; i <= 100; i++)
+        {
+            int numTokens = 172 + random.nextInt(128);
+            int rf = random.nextInt(4) + 2;
+            int parts = random.nextInt(5) + 1;
+            List<Splitter.WeightedRange> ranges = generateRanges(numTokens, rf);
+            SortedLocalRanges sortedRanges = makeRanges(ringVersion, ranges);
+
+            List<PartitionPosition> boundaries = sortedRanges.split(parts);
+            assertNotNull(boundaries);
+            assertEquals(parts, boundaries.size());
+        }
+    }
+
+    /** Checks that, when the partitioner offers no splitter, one boundary per local range is returned. */
+    @Test
+    public void testSplitNoSplitter()
+    {
+        long ringVersion = 1;
+        int numTokens = 172 + random.nextInt(128);
+        int rf = random.nextInt(4) + 2;
+        int parts = random.nextInt(5) + 1;
+        List<Splitter.WeightedRange> ranges = generateRanges(numTokens, rf);
+
+        // mock a partitioner without the splitter and verify split ranges are the same as the local ranges
+        IPartitioner partitionerWithoutSplitter = Mockito.mock(IPartitioner.class);
+        when(cfs.getPartitioner()).thenReturn(partitionerWithoutSplitter);
+        when(partitionerWithoutSplitter.splitter()).thenReturn(Optional.empty());
+
+        SortedLocalRanges sortedRanges = makeRanges(ringVersion, ranges);
+
+        List<PartitionPosition> boundaries = sortedRanges.split(parts);
+        assertNotNull(boundaries);
+        assertEquals(ranges.size(), boundaries.size()); // it ignores the parts and just returns the ranges
+    }
+
+    /** Checks equals, hashCode and toString across invalidation, a new ring version and new ranges. */
+    @Test
+    public void testEquals()
+    {
+        long ringVersion = 1;
+        int numTokens = 172 + random.nextInt(128);
+        int rf = random.nextInt(4) + 2;
+        List<Splitter.WeightedRange> ranges = generateRanges(numTokens, rf);
+
+        SortedLocalRanges sortedRanges1 = makeRanges(ringVersion, ranges);
+        SortedLocalRanges sortedRanges2 = makeRanges(ringVersion, ranges);
+
+        assertEquals(sortedRanges1, sortedRanges2);
+        assertEquals(sortedRanges1.hashCode(), sortedRanges2.hashCode());
+        assertEquals(sortedRanges1.toString(), sortedRanges2.toString());
+
+        sortedRanges1.invalidate();
+        assertEquals(sortedRanges1, sortedRanges2);
+
+        sortedRanges2.invalidate();
+        assertEquals(sortedRanges1, sortedRanges2);
+
+        ringVersion++;
+
+        // different ring version
+        SortedLocalRanges sortedRanges3 = makeRanges(ringVersion, ranges);
+        assertNotEquals(sortedRanges1, sortedRanges3);
+        assertNotEquals(sortedRanges1.hashCode(), sortedRanges3.hashCode());
+        assertNotEquals(sortedRanges1.toString(), sortedRanges3.toString());
+
+        // different ranges
+        // NOTE(review): ringVersion was bumped above, so sortedRanges4 differs from sortedRanges1 in both inputs - TODO confirm intent
+        ranges = generateRanges(numTokens, rf);
+        SortedLocalRanges sortedRanges4 = makeRanges(ringVersion, ranges);
+        assertNotEquals(sortedRanges1, sortedRanges4);
+        assertNotEquals(sortedRanges1.hashCode(), sortedRanges4.hashCode());
+        assertNotEquals(sortedRanges1.toString(), sortedRanges4.toString());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
index 915abbed1597..84ca2763aa53 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/AbstractPendingRepairTest.java
@@ -19,6 +19,7 @@
 package org.apache.cassandra.db.compaction;
 
 import java.io.IOException;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.UUID;
@@ -39,18 +40,21 @@
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.service.ActiveRepairService;
+import org.apache.cassandra.service.StorageService;
 
 @Ignore
-public class AbstractPendingRepairTest extends AbstractRepairTest
+public abstract class AbstractPendingRepairTest extends AbstractRepairTest
 {
     protected String ks;
     protected final String tbl = "tbl";
     protected TableMetadata cfm;
     protected ColumnFamilyStore cfs;
-    protected CompactionStrategyManager csm;
+    protected CompactionStrategyFactory strategyFactory;
+    protected CompactionStrategyContainer compactionStrategyContainer;
     protected static ActiveRepairService ARS;
 
-    private int nextSSTableKey = 0;
+    protected int nextSSTableKey = 0;
+    public abstract String createTableCql();
 
     @BeforeClass
     public static void setupClass()
@@ -62,20 +66,28 @@ public static void setupClass()
         // cutoff messaging service
         MessagingService.instance().outboundSink.add((message, to) -> false);
         MessagingService.instance().inboundSink.add((message) -> false);
+        StorageService.instance.initServer();
     }
 
     @Before
     public void setup()
     {
         ks = "ks_" + System.currentTimeMillis();
-        cfm = CreateTableStatement.parse(String.format("CREATE TABLE %s.%s (k INT PRIMARY KEY, v INT)", ks, tbl), ks).build();
+        cfm = CreateTableStatement.parse(createTableCql(), ks).build();
         SchemaLoader.createKeyspace(ks, KeyspaceParams.simple(1), cfm);
         cfs = Schema.instance.getColumnFamilyStoreInstance(cfm.id);
-        csm = cfs.getCompactionStrategyManager();
+        strategyFactory = cfs.getCompactionFactory();
+        compactionStrategyContainer = cfs.getCompactionStrategyContainer();
         nextSSTableKey = 0;
         cfs.disableAutoCompaction();
     }
 
+    void handleOrphan(SSTableReader sstable)
+    {
+        compactionStrategyContainer.getStrategies(false, null)
+                                   .forEach(acs -> ((LegacyAbstractCompactionStrategy) acs).removeSSTable(sstable));
+    }
+
     /**
      * creates and returns an sstable
      *
@@ -94,7 +106,7 @@ SSTableReader makeSSTable(boolean orphan)
         SSTableReader sstable = diff.iterator().next();
         if (orphan)
         {
-            csm.getUnrepairedUnsafe().allStrategies().forEach(acs -> acs.removeSSTable(sstable));
+            handleOrphan(sstable);
         }
         return sstable;
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
index d6b51b4a5f00..7c08816804ec 100644
--- a/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/BackgroundCompactionsTest.java
@@ -40,6 +40,7 @@
 import static org.junit.Assert.*;
 import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.never;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.when;
 
@@ -52,10 +53,7 @@ public class BackgroundCompactionsTest
     private ColumnFamilyStore cfs;
 
     @Mock
-    private AbstractCompactionStrategy strategy;
-
-    @Mock
-    private CompactionStrategyManager strategyManager;
+    private CompactionStrategyContainer strategyContainer;
 
     @Mock
     private CompactionLogger compactionLogger;
@@ -78,9 +76,8 @@ public void setUp()
         when(cfs.metadata()).thenReturn(metadata);
         when(cfs.getKeyspaceName()).thenReturn(keyspace);
         when(cfs.getTableName()).thenReturn(table);
-        when(cfs.getCompactionStrategyManager()).thenReturn(strategyManager);
-        when(strategyManager.compactionLogger()).thenReturn(compactionLogger);
         when(compactionLogger.enabled()).thenReturn(true);
+        when(strategyContainer.getCompactionLogger()).thenReturn(compactionLogger);
     }
 
     private CompactionAggregate mockAggregate(long key, int numCompactions, int numCompacting)
@@ -89,7 +86,7 @@ private CompactionAggregate mockAggregate(long key, int numCompactions, int numC
             throw new IllegalArgumentException("Cannot have more compactions in progress than total compactions");
 
         CompactionAggregate ret = Mockito.mock(CompactionAggregate.class);
-        when(ret.getKey()).thenReturn(key);
+        when(ret.getKey()).thenReturn(new CompactionAggregate.Key(key));
 
         List<CompactionPick> compactions = new ArrayList<>(numCompactions);
         for (int i = 0; i < numCompactions; i++)
@@ -106,57 +103,57 @@ private CompactionAggregate mockAggregate(long key, int numCompactions, int numC
     @Test
     public void testNoCompaction()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
 
-        CompactionStrategyStatistics statistics = backgroundCompactions.getStatistics();
+        CompactionStrategyStatistics statistics = backgroundCompactions.getStatistics(strategyContainer);
         assertNotNull(statistics);
         assertTrue(statistics.aggregates().isEmpty());
         assertEquals(keyspace, statistics.keyspace());
         assertEquals(table, statistics.table());
-        assertEquals(strategy.getClass().getSimpleName(), statistics.strategy());
+        assertEquals(strategyContainer.getClass().getSimpleName(), statistics.strategy());
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testNullPendingCompactions()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setPending(null);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.setPending(strategyContainer, null);
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testDuplicatePendingCompactions()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
         List<CompactionAggregate> pending = new ArrayList<>(0);
         for (int i = 0; i < 5; i++)
             pending.add(mockAggregate(1, 1, 0));
 
         // Two compactions with the same key are invalid
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
     }
 
     @Test
     public void testPendingCompactions()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
         List<CompactionAggregate> pending = new ArrayList<>(0);
         for (int i = 0; i < 5; i++)
             pending.add(mockAggregate(i, 1, 0));
 
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
 
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
-        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+        Mockito.verify(compactionLogger, never()).statistics(eq(strategyContainer), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategyContainer), eq(pending.size()));
 
         assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
 
         // Remove the previous pending compactions, none should be kept since they don't have in progress compactions
-        backgroundCompactions.setPending(ImmutableList.of());
+        backgroundCompactions.setPending(strategyContainer, ImmutableList.of());
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
     }
@@ -166,16 +163,16 @@ public void testCompactionFromPending()
     {
         // Add some pending compactions, and then submit one of them, the most common case
 
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
         List<CompactionAggregate> pending = new ArrayList<>(0);
         for (int i = 0; i < 5; i++)
             pending.add(mockAggregate(i, 1, 0));
 
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
 
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
-        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+        Mockito.verify(compactionLogger, never()).statistics(eq(strategyContainer), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategyContainer), eq(pending.size()));
 
         assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
@@ -187,10 +184,10 @@ public void testCompactionFromPending()
         when(aggregate.getMatching(any(TreeMap.class))).thenReturn(aggregate);
         when(aggregate.getActive()).thenReturn(ImmutableList.of(compaction)); // ensure the aggregate already has the compaction
 
-        backgroundCompactions.setSubmitted(uuid, aggregate);
+        backgroundCompactions.setSubmitted(strategyContainer, uuid, aggregate);
 
         Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("submitted"), any(CompactionStrategyStatistics.class));
 
         when(pending.get(0).numEstimatedCompactions()).thenReturn(0);
         assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
@@ -199,21 +196,21 @@ public void testCompactionFromPending()
         CompactionProgress progress = Mockito.mock(CompactionProgress.class);
         when(progress.operationId()).thenReturn(uuid);
 
-        backgroundCompactions.setInProgress(progress);
+        backgroundCompactions.onInProgress(progress);
         Mockito.verify(compaction, times(1)).setProgress(eq(progress));
 
         assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
 
         // Remove the previous pending compactions, the one submitted should be kept
-        backgroundCompactions.setPending(ImmutableList.of());
+        backgroundCompactions.setPending(strategyContainer, ImmutableList.of());
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(1, backgroundCompactions.getTotalCompactions());
 
-        backgroundCompactions.setCompleted(uuid);
+        backgroundCompactions.onCompleted(strategyContainer, uuid);
 
         Mockito.verify(compaction, times(1)).setCompleted();
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("completed"), any(CompactionStrategyStatistics.class));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
@@ -226,16 +223,16 @@ public void testCompactionWithMatchingPending()
         // but for which there is a matching aggregate, this would happen if two threads raced and created equivalent
         // but not identical pending aggregates
 
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
         List<CompactionAggregate> pending = new ArrayList<>(0);
         for (int i = 0; i < 5; i++)
             pending.add(mockAggregate(i, 1, 0));
 
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
 
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
-        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(pending.size()));
+        Mockito.verify(compactionLogger, never()).statistics(eq(strategyContainer), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategyContainer), eq(pending.size()));
 
         assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
@@ -248,10 +245,10 @@ public void testCompactionWithMatchingPending()
         when(pending.get(0).getActive()).thenReturn(ImmutableList.of()); // ensure the matching aggregate does not have the compaction
         when(pending.get(0).withAdditionalCompactions(any(Collection.class))).thenReturn(pending.get(0));
 
-        backgroundCompactions.setSubmitted(uuid, aggregate);
+        backgroundCompactions.setSubmitted(strategyContainer, uuid, aggregate);
 
         Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("submitted"), any(CompactionStrategyStatistics.class));
 
         when(pending.get(0).numEstimatedCompactions()).thenReturn(0);
         assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
@@ -260,21 +257,21 @@ public void testCompactionWithMatchingPending()
         CompactionProgress progress = Mockito.mock(CompactionProgress.class);
         when(progress.operationId()).thenReturn(uuid);
 
-        backgroundCompactions.setInProgress(progress);
+        backgroundCompactions.onInProgress(progress);
         Mockito.verify(compaction, times(1)).setProgress(eq(progress));
 
         assertEquals(pending.size() - 1, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
 
         // Remove the previous pending compactions, the one submitted should be kept
-        backgroundCompactions.setPending(ImmutableList.of());
+        backgroundCompactions.setPending(strategyContainer, ImmutableList.of());
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(1, backgroundCompactions.getTotalCompactions());
 
-        backgroundCompactions.setCompleted(uuid);
+        backgroundCompactions.onCompleted(strategyContainer, uuid);
 
         Mockito.verify(compaction, times(1)).setCompleted();
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("completed"), any(CompactionStrategyStatistics.class));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
@@ -286,12 +283,12 @@ public void testCompactionNotInPending()
         // Submit a compaction that is not part of a pending aggregate, this normally happens for tombstone compactions,
         // in this case the pending aggregates are empty but a tombstone compaction is submitted
 
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
-        backgroundCompactions.setPending(ImmutableList.of());
+        backgroundCompactions.setPending(strategyContainer, ImmutableList.of());
 
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("pending"), any(CompactionStrategyStatistics.class));
-        Mockito.verify(compactionLogger, times(1)).pending(eq(strategy), eq(0));
+        Mockito.verify(compactionLogger, never()).statistics(eq(strategyContainer), eq("pending"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).pending(eq(strategyContainer), eq(0));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
@@ -302,10 +299,10 @@ public void testCompactionNotInPending()
         when(aggregate.getSelected()).thenReturn(compaction);
         when(aggregate.getMatching(any(TreeMap.class))).thenReturn(null);
 
-        backgroundCompactions.setSubmitted(uuid, aggregate);
+        backgroundCompactions.setSubmitted(strategyContainer, uuid, aggregate);
 
         Mockito.verify(compaction, times(1)).setSubmitted(eq(uuid));
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("submitted"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("submitted"), any(CompactionStrategyStatistics.class));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(1, backgroundCompactions.getTotalCompactions());
@@ -313,21 +310,21 @@ public void testCompactionNotInPending()
         CompactionProgress progress = Mockito.mock(CompactionProgress.class);
         when(progress.operationId()).thenReturn(uuid);
 
-        backgroundCompactions.setInProgress(progress);
+        backgroundCompactions.onInProgress(progress);
         Mockito.verify(compaction, times(1)).setProgress(eq(progress));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(1, backgroundCompactions.getTotalCompactions());
 
         // Remove the previous pending compactions, the one submitted should be kept
-        backgroundCompactions.setPending(ImmutableList.of());
+        backgroundCompactions.setPending(strategyContainer, ImmutableList.of());
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(1, backgroundCompactions.getTotalCompactions());
 
-        backgroundCompactions.setCompleted(uuid);
+        backgroundCompactions.onCompleted(strategyContainer, uuid);
 
         Mockito.verify(compaction, times(1)).setCompleted();
-        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategy), eq("completed"), any(CompactionStrategyStatistics.class));
+        Mockito.verify(compactionLogger, times(1)).statistics(eq(strategyContainer), eq("completed"), any(CompactionStrategyStatistics.class));
 
         assertEquals(0, backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(0, backgroundCompactions.getTotalCompactions());
@@ -339,7 +336,7 @@ public void testReplacePending()
         // Add som pending aggregates, then replace them with aggregates with different keys, verify that only
         // those with compactions are kept, partially overlap the keys between the old and new aggregates
 
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
 
         List<CompactionAggregate> pending = new ArrayList<>(0);
         int key = 0;
@@ -357,7 +354,7 @@ public void testReplacePending()
             pending.add(aggregateWithComps);
         }
 
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
 
         assertEquals(pending.size(), backgroundCompactions.getEstimatedRemainingTasks());
         assertEquals(pending.size(), backgroundCompactions.getTotalCompactions());
@@ -375,7 +372,7 @@ public void testReplacePending()
             pending.add(aggregate);
         }
 
-        backgroundCompactions.setPending(pending);
+        backgroundCompactions.setPending(strategyContainer, pending);
 
         // the extra compactions are those from the old aggregates with a compaction regardless of whether
         // the keys overlapped or not (when the keys overlap the new one has a compaction added, when they do
@@ -387,15 +384,15 @@ public void testReplacePending()
     @Test(expected = IllegalArgumentException.class)
     public void testSetSubmittedNoId()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setSubmitted(null, Mockito.mock(CompactionAggregate.class));
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.setSubmitted(strategyContainer, null, Mockito.mock(CompactionAggregate.class));
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testSetSubmittedNoAggregate()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setSubmitted(UUID.randomUUID(), null);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.setSubmitted(strategyContainer, UUID.randomUUID(), null);
     }
 
     @Test(expected = IllegalArgumentException.class)
@@ -405,22 +402,22 @@ public void testSetSubmittedDuplicateId()
         CompactionAggregate aggregate = mockAggregate(1, 1, 0);
         when(aggregate.getSelected()).thenReturn(CompactionPick.EMPTY);
 
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setSubmitted(uuid, aggregate);
-        backgroundCompactions.setSubmitted(uuid, aggregate);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.setSubmitted(strategyContainer, uuid, aggregate);
+        backgroundCompactions.setSubmitted(strategyContainer, uuid, aggregate);
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testSetInProgressNoProgress()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setInProgress(null);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.onInProgress(null);
     }
 
     @Test(expected = IllegalArgumentException.class)
     public void testSetCompletedNoId()
     {
-        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(strategy, cfs);
-        backgroundCompactions.setCompleted(null);
+        BackgroundCompactions backgroundCompactions = new BackgroundCompactions(cfs);
+        backgroundCompactions.onCompleted(strategyContainer, null);
     }
 }
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/BaseCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/BaseCompactionStrategyTest.java
new file mode 100644
index 000000000000..44d389d302fc
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/BaseCompactionStrategyTest.java
@@ -0,0 +1,376 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.ImmutableList;
+import org.apache.commons.math3.random.JDKRandomGenerator;
+
+import org.junit.Ignore;
+
+import com.clearspring.analytics.stream.cardinality.ICardinality;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.DiskBoundaries;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SortedLocalRanges;
+import org.apache.cassandra.db.lifecycle.Tracker;
+import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FBUtilities;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import static org.junit.Assert.assertNotNull;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.when;
+import static org.mockito.Mockito.withSettings;
+
+/**
+ * A class that contains common mocks and test utilities for unit tests of compaction strategies
+ * that involve mocking compactions and sstables.
+ */
+@Ignore
+public class BaseCompactionStrategyTest
+{
+    static final double epsilon = 0.00000001;
+    static final JDKRandomGenerator random = new JDKRandomGenerator();
+
+    final String keyspace = "ks";
+    final String table = "tbl";
+
+    @Mock
+    ColumnFamilyStore cfs;
+
+    @Mock
+    CompactionStrategyFactory strategyFactory;
+
+    @Mock
+    DiskBoundaries diskBoundaries;
+
+    // Returned by diskBoundaries.getPositions() and modified by UnifiedCompactionStrategyTest
+    protected List<PartitionPosition> diskBoundaryPositions = null;
+
+    SortedLocalRanges localRanges;
+
+    Map<SSTableReader, Integer> diskIndexes;
+
+    Tracker dataTracker;
+
+    long repairedAt;
+
+    CompactionLogger compactionLogger;
+
+    IPartitioner partitioner;
+
+    Splitter splitter;
+
+    protected static void setUpClass()
+    {
+        long seed = System.currentTimeMillis();
+        random.setSeed(seed);
+        System.out.println("Random seed: " + seed);
+
+        DatabaseDescriptor.daemonInitialization(); // because of all the static initialization in CFS
+        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+    }
+
+    protected void setUp()
+    {
+        setUp(1);
+    }
+
+    protected void setUp(int numShards)
+    {
+        MockitoAnnotations.initMocks(this);
+
+        TableMetadata metadata = TableMetadata.builder(keyspace, table)
+                                              .addPartitionKeyColumn("pk", AsciiType.instance)
+                                              .build();
+
+        dataTracker = Tracker.newDummyTracker();
+        repairedAt = System.currentTimeMillis();
+        partitioner = DatabaseDescriptor.getPartitioner();
+        splitter = partitioner.splitter().orElse(null);
+        if (numShards > 1)
+            assertNotNull("Splitter is required with multiple compaction shards", splitter);
+
+        diskIndexes = new HashMap<>();
+        localRanges = SortedLocalRanges.forTesting(cfs, ImmutableList.of(new Splitter.WeightedRange(1.0, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()))));
+
+        when(cfs.metadata()).thenReturn(metadata);
+        when(cfs.getKeyspaceName()).thenReturn(keyspace);
+        when(cfs.getTableName()).thenReturn(table);
+        when(cfs.getDiskBoundaries()).thenReturn(diskBoundaries);
+        when(cfs.getLocalRanges()).thenReturn(localRanges);
+        when(diskBoundaries.getLocalRanges()).thenReturn(localRanges);
+        when(cfs.getTracker()).thenReturn(dataTracker);
+        when(cfs.getPartitioner()).thenReturn(partitioner);
+
+        // use a real compaction logger to execute that code too, even though we don't really check
+        // the content of the files, at least we cover the code. The files will be overwritten next
+        // time the test is run or by a gradle clean task, so they will not grow indefinitely
+        compactionLogger = new CompactionLogger(cfs.metadata());
+        compactionLogger.enable();
+
+        when(strategyFactory.getCfs()).thenReturn(cfs);
+        when(strategyFactory.getCompactionLogger()).thenReturn(compactionLogger);
+
+        when(diskBoundaries.getNumBoundaries()).thenAnswer(invocation -> diskIndexes.size());
+        when(diskBoundaries.getDiskIndexFromKey(any(SSTableReader.class))).thenAnswer(invocation -> diskIndexes.getOrDefault(invocation.getArgument(0), 0));
+        when(diskBoundaries.getPositions()).thenAnswer(invocationOnMock -> diskBoundaryPositions);
+    }
+
+    /**
+     * Add sstables to the tracker, which is enough for {@link UnifiedCompactionStrategy}, but for
+     * {@link LegacyAbstractCompactionStrategy} we also need to add the sstables directly to the strategy.
+     */
+    void addSSTablesToStrategy(AbstractCompactionStrategy strategy, Iterable<SSTableReader> sstables)
+    {
+        dataTracker.addInitialSSTables(sstables);
+
+        if (strategy instanceof LegacyAbstractCompactionStrategy)
+        {
+            LegacyAbstractCompactionStrategy legacyStrategy = (LegacyAbstractCompactionStrategy) strategy;
+            for (SSTableReader sstable : sstables)
+                legacyStrategy.addSSTable(sstable);
+        }
+    }
+
+    /**
+     * Remove sstables from the tracker, which should be enough for {@link UnifiedCompactionStrategy}, but for
+     * {@link LegacyAbstractCompactionStrategy} we also need to remove the sstables directly from the strategy.
+     */
+    void removeSSTablesFromStrategy(AbstractCompactionStrategy strategy, Set<SSTableReader> sstables)
+    {
+        dataTracker.removeCompactingUnsafe(sstables);
+
+        if (strategy instanceof LegacyAbstractCompactionStrategy)
+        {
+            LegacyAbstractCompactionStrategy legacyStrategy = (LegacyAbstractCompactionStrategy) strategy;
+            for (SSTableReader sstable : sstables)
+                legacyStrategy.removeSSTable(sstable);
+        }
+    }
+
+    SSTableReader mockSSTable(int level, long bytesOnDisk, long timestamp, double hotness, DecoratedKey first, DecoratedKey last)
+    {
+        return mockSSTable(level, bytesOnDisk, timestamp, hotness, first, last,  0, true, null, 0);
+    }
+
+    SSTableReader mockSSTable(long bytesOnDisk, long timestamp, DecoratedKey first, DecoratedKey last)
+    {
+        return mockSSTable(0, bytesOnDisk, timestamp, 0, first, last,  0, true, null, 0);
+    }
+
+    SSTableReader mockSSTable(ICardinality cardinality, long timestamp, int valueSize)
+    {
+        long keyCount = cardinality.cardinality();
+        long bytesOnDisk = valueSize * keyCount;
+        DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+        DecoratedKey last = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+
+        SSTableReader ret = mockSSTable(0, bytesOnDisk, timestamp, 0, first, last, 0, true, null, 0);
+        when(ret.keyCardinalityEstimator()).thenReturn(cardinality);
+        return ret;
+    }
+
+    SSTableReader mockSSTable(int level,
+                              long bytesOnDisk,
+                              long timestamp,
+                              double hotness,
+                              DecoratedKey first,
+                              DecoratedKey last,
+                              int diskIndex,
+                              boolean repaired,
+                              UUID pendingRepair,
+                              int ttl)
+    {
+        // We create a ton of mock SSTables that mockito is going to keep until the end of the test suite without stubOnly.
+        // Mockito keeps them alive to preserve the history of invocations which is not available for stubs. If we ever
+        // need history of invocations and remove stubOnly, we should also manually reset mocked SSTables in tearDown.
+        SSTableReader ret = Mockito.mock(SSTableReader.class, withSettings().stubOnly());
+
+        when(ret.getSSTableLevel()).thenReturn(level);
+        when(ret.bytesOnDisk()).thenReturn(bytesOnDisk);
+        when(ret.onDiskLength()).thenReturn(bytesOnDisk);
+        when(ret.uncompressedLength()).thenReturn(bytesOnDisk); // let's assume no compression
+        when(ret.hotness()).thenReturn(hotness);
+        when(ret.getMaxTimestamp()).thenReturn(timestamp);
+        when(ret.getMinTimestamp()).thenReturn(timestamp);
+        when(ret.getFirst()).thenReturn(first);
+        when(ret.getLast()).thenReturn(last);
+        when(ret.isMarkedSuspect()).thenReturn(false);
+        when(ret.isRepaired()).thenReturn(repaired);
+        when(ret.getRepairedAt()).thenReturn(repairedAt);
+        when(ret.getPendingRepair()).thenReturn(pendingRepair);
+        when(ret.isPendingRepair()).thenReturn(pendingRepair != null);
+        when(ret.getColumnFamilyName()).thenReturn(table);
+        when(ret.getGeneration()).thenReturn(level);
+        when(ret.toString()).thenReturn(String.format("Bytes on disk: %s, level %d, hotness %f, timestamp %d, first %s, last %s, disk index: %d, repaired: %b, pend. repair: %b",
+                                                      FBUtilities.prettyPrintMemory(bytesOnDisk), level, hotness, timestamp, first, last, diskIndex, repaired, pendingRepair));
+        int deletionTime;
+        if (ttl > 0)
+            deletionTime = (int) TimeUnit.MILLISECONDS.toSeconds(timestamp) + ttl;
+        else
+            deletionTime = Integer.MAX_VALUE;
+
+        when(ret.getMinLocalDeletionTime()).thenReturn(deletionTime);
+        when(ret.getMaxLocalDeletionTime()).thenReturn(deletionTime);
+        when(ret.getMinTTL()).thenReturn(ttl);
+        when(ret.getMaxTTL()).thenReturn(ttl);
+
+        diskIndexes.put(ret, diskIndex);
+        return ret;
+    }
+
+    List<SSTableReader> mockSSTables(int numSSTables, long bytesOnDisk, double hotness, long timestamp)
+    {
+        return mockSSTables(numSSTables, bytesOnDisk, hotness, timestamp, 0, true,null);
+    }
+
+    List<SSTableReader> mockSSTables(int numSSTables, long bytesOnDisk, double hotness, long timestamp, int diskIndex, boolean repaired, UUID pendingRepair)
+    {
+        DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+        DecoratedKey last = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+
+        List<SSTableReader> sstables = new ArrayList<>();
+        for (int i = 0; i < numSSTables; i++)
+        {
+            long b = (long)(bytesOnDisk * 0.95 + bytesOnDisk * 0.05 * random.nextDouble()); // leave 5% variability
+            double h = hotness * 0.95 + hotness * 0.05 * random.nextDouble(); // leave 5% variability
+            sstables.add(mockSSTable(0, b, timestamp, h, first, last, diskIndex, repaired, pendingRepair, 0));
+        }
+
+        return sstables;
+    }
+
+    List<SSTableReader> mockNonOverlappingSSTables(int numSSTables, int level, long bytesOnDisk)
+    {
+        Splitter rangeSplitter = partitioner.splitter()
+                                            .orElseThrow(() -> new IllegalStateException(String.format("Cannot split ranges with current partitioner %s", partitioner)));
+
+        Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken());
+        Splitter.WeightedRange weightedRange = new Splitter.WeightedRange(1.0, range);
+        // boundaries.get(i) .. boundaries.get(i + 1) delimit the i-th sstable; the minimum token is prepended as the first lower bound
+        List<Token> boundaries = rangeSplitter.splitOwnedRanges(numSSTables,
+                                                                ImmutableList.of(weightedRange),
+                                                                Splitter.SplitType.ALWAYS_SPLIT)
+                                 .boundaries;
+        boundaries.add(0, partitioner.getMinimumToken());
+        ByteBuffer emptyBuffer = ByteBuffer.allocate(0);
+
+        long timestamp = System.currentTimeMillis();
+        List<SSTableReader> sstables = new ArrayList<>(numSSTables);
+        for (int i = 0; i < numSSTables; i++)
+        {
+            DecoratedKey first = new BufferDecoratedKey(boundaries.get(i).increaseSlightly(), emptyBuffer);
+            DecoratedKey last = new BufferDecoratedKey(boundaries.get(i + 1), emptyBuffer);
+            sstables.add(mockSSTable(level, bytesOnDisk, timestamp, 0., first, last));
+
+            timestamp += 10; // give every sstable a distinct, increasing timestamp
+        }
+
+        return sstables;
+    }
+
+    CompactionProgress mockCompletedCompactionProgress(Set<SSTableReader> compacting, UUID id)
+    {
+        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
+
+        long compactingLen = totUncompressedLength(compacting);
+        when(progress.operationId()).thenReturn(id);
+        when(progress.inSSTables()).thenReturn(compacting);
+        when(progress.uncompressedBytesRead()).thenReturn(compactingLen);
+        when(progress.uncompressedBytesWritten()).thenReturn(compactingLen);
+        when(progress.durationInNanos()).thenReturn(TimeUnit.SECONDS.toNanos(30));
+
+        return progress;
+    }
+
+    void addSizeTieredOptions(Map<String, String> options)
+    {
+        addSizeTieredOptions(options, SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE);
+    }
+
+    void addSizeTieredOptions(Map<String, String> options, long minSSTableSize)
+    {
+        options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, Long.toString(minSSTableSize));
+        options.put(SizeTieredCompactionStrategyOptions.BUCKET_LOW_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_LOW));
+        options.put(SizeTieredCompactionStrategyOptions.BUCKET_HIGH_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_HIGH));
+    }
+
+    void addTimeTieredOptions(Map<String, String> options)
+    {
+        addSizeTieredOptions(options, SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE);
+
+        options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, TimeUnit.MILLISECONDS.toString());
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "30");
+        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MINUTES");
+        options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, Long.toString(Long.MAX_VALUE)); // disable check for expired sstables
+    }
+
+    void addLeveledOptions(Map<String, String> options, long maxSSTableSizeBytes)
+    {
+        addLeveledOptions(options, SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE, maxSSTableSizeBytes, 10);
+    }
+
+    void addLeveledOptions(Map<String, String> options, long minSSTableSizeBytes, long maxSSTableSizeBytes, int fanout)
+    {
+        addSizeTieredOptions(options, minSSTableSizeBytes);
+
+        options.put(LeveledCompactionStrategy.SSTABLE_SIZE_OPTION, Long.toString(maxSSTableSizeBytes >> 20)); // Bytes to MB
+        options.put(LeveledCompactionStrategy.LEVEL_FANOUT_SIZE_OPTION, Integer.toString(fanout));
+    }
+
+    long totUncompressedLength(Collection<SSTableReader> sstables)
+    {
+        long ret = 0;
+        for (SSTableReader sstable : sstables)
+            ret += sstable.uncompressedLength();
+
+        return ret;
+    }
+
+    double totHotness(Collection<SSTableReader> sstables)
+    {
+        double ret = 0;
+        for (SSTableReader sstable : sstables)
+            ret += sstable.hotness();
+
+        return ret;
+    }
+
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/CQLUnifiedCompactionTest.java b/test/unit/org/apache/cassandra/db/compaction/CQLUnifiedCompactionTest.java
new file mode 100644
index 000000000000..4fe62eacb4b7
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CQLUnifiedCompactionTest.java
@@ -0,0 +1,341 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Random;
+import java.util.stream.Collectors;
+
+import org.junit.After;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.commitlog.CommitLog;
+import org.apache.cassandra.db.compaction.unified.AdaptiveController;
+import org.apache.cassandra.db.compaction.unified.Controller;
+import org.apache.cassandra.db.compaction.unified.StaticController;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * CQL tests on a table configured with Unified Compaction.
+ *
+ * The unified compaction strategy is described in this design document:
+ *
+ * TODO: link to design doc or SEP
+ *
+ * It has properties of both tiered and leveled compactions and it adapts to the workload
+ * by switching between strategies or increasing / decreasing the fanout factor.
+ *
+ * The essential formulae are the calculations of buckets:
+ *
+ * S = ⌊log_oF(size / m)⌋ = ⌊(ln size - ln m) / (ln F + ln o)⌋
+ *
+ * where log_oF is the log with oF as the base
+ * o is the survival factor, currently fixed to 1
+ * F is the fanout factor calculated below
+ * m is the minimal size, fixed in the strategy options
+ * size is the sorted run size (sum of all the sizes of the sstables in the sorted run)
+ *
+ * Also, T is the number of sorted runs that trigger compaction.
+ *
+ * Given a parameter W, which is fixed in these tests, then T and F are calculated as follows:
+ *
+ * - W < 0 then T = 2 and F = 2 - W (leveled merge policy)
+ * - W > 0 then T = F and F = 2 + W (tiered merge policy)
+ * - W = 0 then T = F = 2 (middle ground)
+ */
+public class CQLUnifiedCompactionTest extends CQLTester
+{
+    @BeforeClass
+    public static void beforeClass()
+    {
+        CQLTester.setUpClass();
+        StorageService.instance.initServer();
+    }
+
+    @After
+    public void tearDown()
+    {
+        // This prevents unwanted flushing in future tests
+        // Dirty CL segments cause memtables to be flushed after a schema change and we don't want this
+        // to happen asynchronously in CQLTester.afterTest() because it would interfere with the tests
+        // that rely on an exact number of sstables
+
+        for (String table: currentTables())
+        {
+            logger.debug("Dropping {} synchronously to prevent unwanted flushing due to CL dirty", table);
+            schemaChange(String.format("DROP TABLE IF EXISTS %s.%s", KEYSPACE, table));
+        }
+
+        CommitLog.instance.forceRecycleAllSegments();
+    }
+
+    @Test
+    public void testCreateTable()
+    {
+        createTable("create table %s (id int primary key, val text) with compaction = {'class':'UnifiedCompactionStrategy'}");
+        assertTrue(getCurrentCompactionStrategy() instanceof UnifiedCompactionStrategy);
+    }
+
+    @Test
+    public void testStaticOptions()
+    {
+        testStaticOptions(512, 2, 50, -2);
+        testStaticOptions(1024, 4, 150, 0);
+        testStaticOptions(2048, 10, 250, 2);
+    }
+
+    private void testStaticOptions(int dataSetSizeGB, int numShards, int minSstableSizeMB, int ... Ws)
+    {
+        String scalingParametersStr = Arrays.stream(Ws)
+                                            .mapToObj(Integer::toString)
+                                            .collect(Collectors.joining(",")); // comma-separated list, one W per level
+
+        createTable("create table %s (id int primary key, val text) with compaction = " +
+                    "{'class':'UnifiedCompactionStrategy', 'adaptive' : 'false', " +
+                    String.format("'dataset_size_in_gb' : '%d', ", dataSetSizeGB) +
+                    String.format("'num_shards' : '%d', ", numShards) +
+                    String.format("'min_sstable_size_in_mb' : '%d', ", minSstableSizeMB) +
+                    String.format("'static_scaling_parameters' : '%s'}", scalingParametersStr));
+
+        CompactionStrategy strategy = getCurrentCompactionStrategy();
+        assertTrue(strategy instanceof UnifiedCompactionStrategy);
+
+        UnifiedCompactionStrategy unifiedCompactionStrategy = (UnifiedCompactionStrategy) strategy;
+        Controller controller = unifiedCompactionStrategy.getController();
+        assertEquals((long) dataSetSizeGB << 30, controller.getDataSetSizeBytes());
+        assertEquals(numShards, controller.getNumShards());
+        assertEquals((long) minSstableSizeMB << 20, controller.getMinSstableSizeBytes());
+
+        assertTrue(controller instanceof StaticController);
+        for (int i = 0; i < Ws.length; i++)
+            assertEquals(Ws[i], unifiedCompactionStrategy.getW(i));
+    }
+
+    @Test
+    public void testAdaptiveOptions()
+    {
+        testAdaptiveOptions(512, 2, 50, -2);
+        testAdaptiveOptions(1024, 4, 150, 0);
+        testAdaptiveOptions(2048, 10, 250, 2);
+    }
+
+    private void testAdaptiveOptions(int dataSetSizeGB, int numShards, int sstableSizeMB, int w)
+    {
+        createTable("create table %s (id int primary key, val text) with compaction = " +
+                    "{'class':'UnifiedCompactionStrategy', 'adaptive' : 'true', " +
+                    String.format("'dataset_size_in_gb' : '%d', ", dataSetSizeGB) +
+                    String.format("'num_shards' : '%d', ", numShards) +
+                    String.format("'min_sstable_size_in_mb' : '%d', ", sstableSizeMB) +
+                    String.format("'adaptive_starting_scaling_parameter' : '%s', ", w) +
+                    String.format("'adaptive_min_scaling_parameter' : '%s', ", -6) +
+                    String.format("'adaptive_max_scaling_parameter' : '%s', ", 16) +
+                    String.format("'adaptive_interval_sec': '%d', ", 300) +
+                    String.format("'adaptive_threshold': '%s', ", 0.25) + // %s, not %f: %f follows the default locale and may print "0,250000"
+                    String.format("'adaptive_min_cost': '%d'}", 1));
+
+        CompactionStrategy strategy = getCurrentCompactionStrategy();
+        assertTrue(strategy instanceof UnifiedCompactionStrategy);
+
+        UnifiedCompactionStrategy unifiedCompactionStrategy = (UnifiedCompactionStrategy) strategy;
+        assertEquals((long) sstableSizeMB << 20, unifiedCompactionStrategy.getController().getMinSstableSizeBytes()); // widen to long before shifting so large sizes cannot overflow int
+
+        assertTrue(unifiedCompactionStrategy.getController() instanceof AdaptiveController);
+        for (int i = 0; i < 10; i++)
+            assertEquals(w, unifiedCompactionStrategy.getW(i));
+
+        AdaptiveController controller = (AdaptiveController) unifiedCompactionStrategy.getController();
+        assertEquals((long) dataSetSizeGB << 30, controller.getDataSetSizeBytes());
+        assertEquals(numShards, controller.getNumShards());
+        assertEquals((long) sstableSizeMB << 20, controller.getMinSstableSizeBytes());
+        assertEquals(-6, controller.getMinW());
+        assertEquals(16, controller.getMaxW());
+        assertEquals(300, controller.getInterval());
+        assertEquals(0.25, controller.getThreshold(), 0.000001);
+        assertEquals(1, controller.getMinCost());
+    }
+
+    @Test
+    public void testAlterTable()
+    {
+        createTable("create table %s (id int primary key, val text) with compaction = {'class' : 'SizeTieredCompactionStrategy'}");
+        assertFalse(getCurrentCompactionStrategy() instanceof UnifiedCompactionStrategy);
+        // altering to UCS with 'adaptive' : 'true' must install the unified strategy backed by an AdaptiveController
+        alterTable("alter table %s with compaction = {'class' : 'UnifiedCompactionStrategy', 'adaptive' : 'true'}");
+        assertTrue(getCurrentCompactionStrategy() instanceof UnifiedCompactionStrategy);
+        assertTrue(((UnifiedCompactionStrategy) getCurrentCompactionStrategy()).getController() instanceof AdaptiveController);
+        // altering again with 'adaptive' : 'false' must keep the unified strategy but switch to a StaticController
+        alterTable("alter table %s with compaction = {'class' : 'UnifiedCompactionStrategy', 'adaptive' : 'false'}");
+        assertTrue(getCurrentCompactionStrategy() instanceof UnifiedCompactionStrategy);
+        assertTrue(((UnifiedCompactionStrategy) getCurrentCompactionStrategy()).getController() instanceof StaticController);
+    }
+
+    @Test
+    public void testSingleCompaction() throws Throwable
+    {
+        // Each row is {W, T}: with scaling parameter W, T sstables are required to trigger a compaction,
+        // see doc for formula. The rows are run in the order listed.
+        int[][] scalingAndThreshold = { { 4, 6 }, { 2, 4 }, { 0, 2 }, { -2, 2 }, { -4, 2 } };
+        for (int[] row : scalingAndThreshold)
+            testSingleCompaction(row[0], row[1]);
+    }
+
+    private void testSingleCompaction(int W, int T) throws Throwable
+    {
+        // Each flush writes 1024 rows of 1024-byte values, roughly min_sstable_size_in_mb (1 MB); ids never repeat, so sstables do not overlap
+        int numInserts = 1024;
+        int valSize = 1024;
+
+        createTable("create table %s (id int primary key, val blob) with compaction = {'class':'UnifiedCompactionStrategy', 'adaptive' : 'false', " +
+                    String.format("'static_scaling_parameters' : '%d', 'min_sstable_size_in_mb' : '1', 'num_shards': '1', 'log_all' : 'true'}", W));
+        // keep automatic compaction off while flushing, so that exactly T sstables accumulate (asserted below)
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        cfs.disableAutoCompaction();
+
+        assertEquals(0, cfs.getLiveSSTables().size());
+        // one flush per iteration, i.e. T sstables of numInserts rows each
+        int key = 0;
+        ByteBuffer val = ByteBuffer.wrap(new byte[valSize]);
+        for (int i = 0; i < T; i++)
+            key = insertAndFlush(numInserts, key, val);
+
+        int expectedInserts = numInserts * T;
+
+        assertEquals(T, cfs.getLiveSSTables().size());
+        assertEquals(expectedInserts, getRows(execute("SELECT * FROM %s")).length);
+        // re-enable compaction; the assertions that follow expect all T sstables merged into one, with no row lost
+        cfs.enableAutoCompaction(true);
+
+        assertEquals(1, cfs.getLiveSSTables().size());
+        assertEquals(expectedInserts, getRows(execute("SELECT * FROM %s")).length);
+    }
+
+    @Test
+    public void testMultipleCompactionsSingleW_Static() throws Throwable
+    {
+        // tiered tests with W = 2 and T = F = 4
+        testMultipleCompactions(4, 1, 1, new int[] {2});  //  4 sstables should be compacted into 1
+        testMultipleCompactions(8, 1, 1, new int[] {2});  //  8 sstables should be compacted into 1
+        testMultipleCompactions(16, 1, 1, new int[] {2}); // 16 sstables should be compacted into 1
+
+        // middle-point tests between tiered and leveled with W = 0, T = F = 2
+        testMultipleCompactions(2, 1, 1, new int[] {0});   // 2 sstables should be compacted into 1
+        testMultipleCompactions(4, 1, 1, new int[] {0});   // 4 sstables should be compacted into 1
+        testMultipleCompactions(8, 1, 1, new int[] {0});   // 8 sstables should be compacted into 1
+        testMultipleCompactions(16, 1, 1, new int[] {0});  // 16 sstables should be compacted into 1
+
+        // leveled tests with W = -2 and T = 2, F = 4
+        testMultipleCompactions(2, 1, 1, new int[] {-2});  //  2 sstables should be compacted into 1
+        testMultipleCompactions(4, 1, 1, new int[] {-2});  //  4 sstables should be compacted into 1
+        testMultipleCompactions(8, 1, 1, new int[] {-2});  //  8 sstables should be compacted into 1
+        testMultipleCompactions(9, 1, 1, new int[] {-2});  //  9 sstables should be compacted into 1 (numFinalSSTables is 1)
+        testMultipleCompactions(16, 1, 1, new int[] {-2}); // 16 sstables should be compacted into 1
+    }
+
+    @Test
+    public void testMultipleCompactionsDifferentWs_Static() throws Throwable
+    {
+        // tiered tests with W = [4, -6] and T = [6, 2], F = [6, 8]
+        testMultipleCompactions(12, 1, 1, new int[] {4, -6});  //  sstables: 12 -> (6, 6) => 2 => 1
+
+        // tiered tests with W = [30, 2, -6] and T = [32, 4, 2], F = [32, 4, 8]
+        testMultipleCompactions(128, 1, 1, new int[] {30, 2, -6});  //  sstables: 128 -> (32, 32, 32, 32) => 4 => (4) => 1
+    }
+
+    @Test
+    public void testMultipleCompactionsSingleW_TwoShards() throws Throwable
+    {
+        testMultipleCompactions(4, 1, 2, new int[]{2});  //  2 shards: 4 * 2 sstables should be compacted into 1 * 2
+        testMultipleCompactions(8, 1, 2, new int[]{2});  //  2 shards: 8 * 2 sstables should be compacted into 1 * 2
+    }
+
+    private void testMultipleCompactions(int numInitialSSTables, int numFinalSSTables, int numShards, int[] Ws) throws Throwable
+    {
+        int numInserts = 1024 * numShards;
+        int valSize = 2048;
+        // Ws become the comma separated value of the 'static_scaling_parameters' option
+        String scalingParamsStr = Arrays.stream(Ws)
+                                        .mapToObj(Integer::toString)
+                                        .collect(Collectors.joining(","));
+
+        createTable("create table %s (id int primary key, val blob) with compression = { 'enabled' : false } AND " +
+                    "compaction = {'class':'UnifiedCompactionStrategy', 'adaptive' : 'false', " +
+                    String.format("'static_scaling_parameters' : '%s', 'min_sstable_size_in_mb' : '1', 'num_shards': '%d', 'log_all' : 'true'}",
+                                  scalingParamsStr, numShards));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        cfs.disableAutoCompaction();
+
+        int key = 0;
+        byte[] bytes = new byte[valSize];
+        (new Random(87652)).nextBytes(bytes); // fixed seed, so the value is the same on every run
+        ByteBuffer val = ByteBuffer.wrap(bytes);
+        // one flush per iteration; the assertion below expects every flush to leave numShards sstables
+        for (int i = 0; i < numInitialSSTables; i++)
+            key = insertAndFlush(numInserts, key, val);
+
+        int expectedInserts = numInserts * numInitialSSTables;
+
+        assertEquals(expectedInserts, getRows(execute("SELECT * FROM %s")).length);
+        assertEquals(numInitialSSTables * numShards, cfs.getLiveSSTables().size());
+
+        // trigger a compaction, wait for the future because otherwise the check below
+        // may be called before the strategy has executed getNextBackgroundTask()
+        cfs.enableAutoCompaction(true);
+        // poll every 10 ms until 10 polls (not necessarily consecutive) saw no compactions, or 1500 polls were made
+        int numChecks = 0;
+        int numTimesWithNoCompactions = 0;
+        while(numTimesWithNoCompactions < 10  && numChecks < 1500) // at most 1500 polls of 10 ms, i.e. about 15 seconds
+        {
+            // check multiple times because we don't look ahead to future buckets at the moment so there is a brief
+            // window without pending compactions and without compactions in progress, this may make the test flaky on slow J2
+            if (cfs.getCompactionStrategy().getTotalCompactions() == 0)
+                numTimesWithNoCompactions++;
+
+            FBUtilities.sleepQuietly(10);
+            numChecks++;
+        }
+
+        assertEquals(expectedInserts, getRows(execute("SELECT * FROM %s")).length);
+        assertEquals(numFinalSSTables * numShards, cfs.getLiveSSTables().size());
+    }
+
+    private int insertAndFlush(int numInserts, int key, ByteBuffer val) throws Throwable
+    {
+        // writes numInserts rows with consecutive ids starting at key, flushes, and returns the next unused id
+        for (int remaining = numInserts; remaining > 0; remaining--, key++)
+            execute("INSERT INTO %s (id, val) VALUES(?,?)", key, val);
+        flush();
+        return key;
+    }
+
+    private CompactionStrategy getCurrentCompactionStrategy()
+    {
+        // The tests in this class only look at the first strategy
+        // exposed by the strategy container of the current table.
+        return getCurrentColumnFamilyStore().getCompactionStrategyContainer()
+                                            .getStrategies().get(0);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
index 8426be121d63..f1856ee4bb36 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java
@@ -19,7 +19,9 @@
 package org.apache.cassandra.db.compaction;
 
 import java.io.Closeable;
+import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
@@ -231,7 +233,7 @@ public void testSubrangeCompaction() throws InterruptedException
     }
 
     @Test
-    public void testAnticompaction() throws InterruptedException, ExecutionException
+    public void testAnticompaction() throws InterruptedException, ExecutionException, IOException
     {
         ColumnFamilyStore cfs = MockSchema.newCFS();
         List<SSTableReader> sstables = createSSTables(cfs, 10, 0);
@@ -470,15 +472,15 @@ public void testStandardCompactionTaskCancellation() throws Throwable
         }
         AbstractCompactionTask ct = null;
 
-        for (List<AbstractCompactionStrategy> css : getCurrentColumnFamilyStore().getCompactionStrategyManager().getStrategies())
+        for (CompactionStrategy cs : getCurrentColumnFamilyStore().getCompactionStrategyContainer().getStrategies())
         {
-            for (AbstractCompactionStrategy cs : css)
+            Collection<AbstractCompactionTask> tasks = cs.getNextBackgroundTasks(0);
+            if (!tasks.isEmpty())
             {
-                ct = cs.getNextBackgroundTask(0);
+                ct = tasks.iterator().next();
                 if (ct != null)
                     break;
             }
-            if (ct != null) break;
         }
         assertNotNull(ct);
 
@@ -492,11 +494,11 @@ public void testStandardCompactionTaskCancellation() throws Throwable
          */
         Thread t = new Thread(() -> {
             Uninterruptibles.awaitUninterruptibly(waitForBeginCompaction);
-            getCurrentColumnFamilyStore().getCompactionStrategyManager().pause();
+            getCurrentColumnFamilyStore().getCompactionStrategyContainer().pause();
             CompactionManager.instance.interruptCompactionFor(metadatas, (s) -> true, false);
             waitForStart.countDown();
             CompactionManager.instance.waitForCessation(Collections.singleton(getCurrentColumnFamilyStore()), (s) -> true);
-            getCurrentColumnFamilyStore().getCompactionStrategyManager().resume();
+            getCurrentColumnFamilyStore().getCompactionStrategyContainer().resume();
         });
         t.start();
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
index d7d281aaaed1..b43fb4beee2f 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionAwareWriterTest.java
@@ -32,6 +32,7 @@
 import org.apache.cassandra.db.compaction.writers.MaxSSTableSizeWriter;
 import org.apache.cassandra.db.compaction.writers.SplittingSizeTieredCompactionWriter;
 import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.UUIDGen;
@@ -170,7 +171,7 @@ private int compact(ColumnFamilyStore cfs, LifecycleTransaction txn, CompactionA
         assert txn.originals().size() == 1;
         int rowsWritten = 0;
         int nowInSec = FBUtilities.nowInSeconds();
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(txn.originals());
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(txn.originals());
              CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(nowInSec));
              CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
         {
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionManagerUpgradeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionManagerUpgradeTest.java
new file mode 100644
index 000000000000..334c76d7bcd4
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionManagerUpgradeTest.java
@@ -0,0 +1,163 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.util.Collections;
+import java.util.Set;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.schema.KeyspaceParams;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+public class CompactionManagerUpgradeTest
+{
+    private static final String KS_PREFIX = "Keyspace1";
+    private static final String TABLE_PREFIX = "CF_STANDARD";
+
+    @BeforeClass
+    public static void beforeClass()
+    {
+        SchemaLoader.prepareServer();
+        SchemaLoader.createKeyspace(KS_PREFIX, // despite their names, the *_PREFIX constants are used as the full keyspace and table names
+                                    KeyspaceParams.simple(1),
+                                    SchemaLoader.standardCFMD(KS_PREFIX, TABLE_PREFIX)
+                                                .compaction(CompactionParams.stcs(Collections.emptyMap()))); // STCS with an empty options map
+    }
+
+    @Before
+    public void setUp() throws Exception
+    {
+        // every test starts from an empty table
+        Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX).truncateBlocking();
+    }
+
+
+    @Test
+    public void testAutomaticUpgradeConcurrency() throws Exception
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
+        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(true);
+        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(1);
+
+        // latch to block CompactionManager.BackgroundCompactionCandidate#maybeRunUpgradeTask
+        // inside the currentlyBackgroundUpgrading check - with max_concurrent_auto_upgrade_tasks = 1 this will make
+        // sure that BackgroundCompactionCandidate#maybeRunUpgradeTask returns null until the latch has been counted down
+        CountDownLatch latch = new CountDownLatch(1);
+        AtomicInteger upgradeTaskCount = new AtomicInteger(0);
+        MockCFSForCSM mock = new MockCFSForCSM(cfs, latch, upgradeTaskCount);
+
+        CompactionManager.BackgroundCompactionCandidate r = CompactionManager.instance.getBackgroundCompactionCandidate(mock);
+
+        // basic idea is that we start a thread which will be able to get in to the currentlyBackgroundUpgrading-guarded
+        // code in CompactionManager, then we try to run a bunch more of the upgrade tasks which should return null
+        // due to the currentlyBackgroundUpgrading count being >= max_concurrent_auto_upgrade_tasks
+        Thread t = new Thread(r::maybeRunUpgradeTask);
+        t.start();
+        Thread.sleep(100); // let the thread start and grab the task
+        assertEquals(1, CompactionManager.instance.currentlyBackgroundUpgrading.get());
+        assertNull(r.maybeRunUpgradeTask());
+        assertNull(r.maybeRunUpgradeTask());
+        latch.countDown();
+        t.join();
+        assertEquals(1, upgradeTaskCount.get()); // we should only call findUpgradeSSTableTask once when concurrency = 1
+        assertEquals(0, CompactionManager.instance.currentlyBackgroundUpgrading.get());
+        // NOTE(review): not reached if an assertion above fails, which would leave automatic upgrades enabled
+        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(false);
+    }
+
+    @Test
+    public void testAutomaticUpgradeConcurrency2() throws Exception
+    {
+        ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
+        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(true);
+        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(2);
+        // latch to block CompactionManager.BackgroundCompactionCandidate#maybeRunUpgradeTask
+        // inside the currentlyBackgroundUpgrading check - with max_concurrent_auto_upgrade_tasks = 2 and two blocked threads this
+        // makes sure that BackgroundCompactionCandidate#maybeRunUpgradeTask returns null until the latch has been counted down
+        CountDownLatch latch = new CountDownLatch(1);
+        AtomicInteger upgradeTaskCount = new AtomicInteger();
+        MockCFSForCSM mock = new MockCFSForCSM(cfs, latch, upgradeTaskCount);
+
+        CompactionManager.BackgroundCompactionCandidate r = CompactionManager.instance.getBackgroundCompactionCandidate(mock);
+
+        // basic idea is that we start 2 threads who will be able to get in to the currentlyBackgroundUpgrading-guarded
+        // code in CompactionManager, then we try to run a bunch more of the upgrade task which should return null
+        // due to the currentlyBackgroundUpgrading count being >= max_concurrent_auto_upgrade_tasks
+        Thread t = new Thread(r::maybeRunUpgradeTask);
+        t.start();
+        Thread t2 = new Thread(r::maybeRunUpgradeTask);
+        t2.start();
+        Thread.sleep(100); // let the threads start and grab the task
+        assertEquals(2, CompactionManager.instance.currentlyBackgroundUpgrading.get());
+        assertNull(r.maybeRunUpgradeTask());
+        assertNull(r.maybeRunUpgradeTask());
+        assertNull(r.maybeRunUpgradeTask());
+        assertEquals(2, CompactionManager.instance.currentlyBackgroundUpgrading.get());
+        latch.countDown();
+        t.join();
+        t2.join();
+        assertEquals(2, upgradeTaskCount.get());
+        assertEquals(0, CompactionManager.instance.currentlyBackgroundUpgrading.get());
+        // NOTE(review): not reached if an assertion above fails, which would leave the limit at 2 and upgrades enabled
+        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(1);
+        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(false);
+    }
+
+    private static class MockCFSForCSM extends ColumnFamilyStore
+    {
+        private final CountDownLatch latch;
+        private final AtomicInteger upgradeTaskCount;
+        // latch: released by the test to unblock getLiveSSTables(); upgradeTaskCount: getLiveSSTables() calls that got past it
+        private MockCFSForCSM(ColumnFamilyStore cfs, CountDownLatch latch, AtomicInteger upgradeTaskCount)
+        {
+            super(cfs.keyspace, cfs.name, 10, cfs.metadata, cfs.getDirectories(), true, false, false);
+            this.latch = latch;
+            this.upgradeTaskCount = upgradeTaskCount;
+        }
+        // blocks until the latch is released, counts the call, then reports that there are no live sstables
+        @Override
+        public Set<SSTableReader> getLiveSSTables()
+        {
+            try
+            {
+                latch.await();
+                upgradeTaskCount.incrementAndGet();
+            }
+            catch (InterruptedException e)
+            {
+                Thread.currentThread().interrupt(); // keep the interrupt visible to the caller once it is wrapped
+                throw new RuntimeException(e);
+            }
+            return Collections.emptySet();
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionSimulationTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionSimulationTest.java
new file mode 100644
index 000000000000..1de4cf96537e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionSimulationTest.java
@@ -0,0 +1,1453 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import javax.inject.Inject;
+
+import com.google.common.collect.Iterables;
+import com.google.common.util.concurrent.RateLimiter;
+import org.apache.commons.math3.distribution.AbstractIntegerDistribution;
+import org.apache.commons.math3.distribution.UniformIntegerDistribution;
+import org.apache.commons.math3.distribution.ZipfDistribution;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.clearspring.analytics.hash.MurmurHash;
+import com.clearspring.analytics.stream.cardinality.CardinalityMergeException;
+import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus;
+import com.clearspring.analytics.stream.cardinality.ICardinality;
+import io.airlift.airline.Command;
+import io.airlift.airline.HelpOption;
+import io.airlift.airline.Option;
+import io.airlift.airline.SingleCommand;
+
+import org.apache.cassandra.concurrent.NamedThreadFactory;
+import org.apache.cassandra.concurrent.ScheduledExecutors;
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.compaction.unified.AdaptiveController;
+import org.apache.cassandra.db.compaction.unified.Controller;
+import org.apache.cassandra.db.compaction.unified.CostsCalculator;
+import org.apache.cassandra.db.compaction.unified.StaticController;
+import org.apache.cassandra.db.compaction.unified.Environment;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.sstable.metadata.MetadataCollector;
+import org.apache.cassandra.utils.ExpMovingAverage;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.MonotonicClock;
+import org.apache.cassandra.utils.MovingAverage;
+import org.apache.cassandra.utils.PageAware;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.when;
+
+/**
+ * A test that simulates compactions to see how strategies behave.
+ * <p/>
+ * SSTables are mocked with a specific cardinality {@link ICardinality} that provides
+ * an estimated number of keys in the sstable, e.g. {@link HyperLogLogPlus}.
+ * <p/>
+ * Integers are sampled from a probability distribution such as {@link UniformIntegerDistribution} or {@link ZipfDistribution}
+ * and offered to a cardinality object. When the estimated number of objects in the cardinality reaches a threshold,
+ * an sstable is mocked using this cardinality and the compaction strategy is given the sstable and asked to check for
+ * compactions. If there is a compaction task, then it is placed in a queue and the operation is repeated
+ * from the beginning until the desired number of sampled integers has been reached.
+ * <p/>
+ * Another thread waits for compaction tasks that are put on the queue. When there is a compaction task,
+ * the cardinalities of the sstables in the compaction task are merged and a new sstable is created with the
+ * merged cardinality and given to the strategy, which checks again for any compaction events so that they can
+ * be put on the queue as well. The process then continues until the queue is empty.
+ * <p/>
+ * The simulation completes when both threads have terminated.
+ * <p/>
+ * The size of the sstables is given by the estimated number of objects in the cardinality times a fixed size.
+ * <p/>
+ * The following values are calculated and reported at the end of the simulation:
+ *
+ * <li> Write Amplification (WA): number of entries written (by either flushing or compacting) / number of inserts </li>
+ * <li> Read or Space Amplification (RA): histogram of sorted runs (anything better?) </li>
+ * <li> Sorted runs existing at the end of the simulation </li>
+ * <li> Compaction strategy statistics for the entire simulation </li>
+ */
+@Command(name = "compactionSim", description = "Compaction Simulation Tests")
+@Ignore
+public class CompactionSimulationTest extends BaseCompactionStrategyTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(CompactionSimulationTest.class);
+
+    private static final String logDirectory = System.getProperty("cassandra.logdir", ".");
+
+    /**
+     * The average time for flushing 1kb of data, as measured on Fallout tests ran on ironic.
+     */
+    private long flushTimeMicros = 20;
+
+    /**
+     * The average time for compacting 1kb of data, as measured on Fallout tests ran on ironic.
+     */
+    private long compactionTimeMicros = 45;
+
+    /**
+     * The average time for reading an entire partition, as measured on Fallout tests ran on ironic.
+     */
+    private long partitionReadLatencyMicros = 150;
+
+    /** How often we append values to the csv file */
+    private static final int csvUpdatePeriodMs = 500;
+
+    /** Only collect values for final averages after this warmup period */
+    private static final int warmupPeriodSec = 15;
+
+    /** The minimum sstable size in bytes */
+    private static final long sstableSize = 50 << 20; // 50 MB
+
+    /** The number of unique keys that cause an sstable to be flushed, the value size is calculated by dividing
+     * {@link #sstableSize} by this value. The smaller this value is, the greater the number of sstables generated.
+     */
+    private static final int uniqueKeysPerSStable = 5000;
+
+    /** When calculating the read cost, we multiply by this factor to simulate a Bloom Filter false positive rate of 1%. So
+     * we estimate that we'll access 1% of the live sstables.
+     */
+    private static final double bfFactor = 0.01;
+
+    /**
+     * When calculating the read cost, we multiply by this factor to simulate a cache hit rate of 1 - cacheFactor.
+     */
+    private static final double cacheFactor = 0.05;
+
+    @Inject
+    public HelpOption helpOption;
+
+    @Option(name = { "-wl", "--workload" }, description = "Workload type specified as RXX_WXX, e.g. R50_W50")
+    String workload = "R50_W50";
+
+    @Option(name = { "-t", "--type" }, description = "The test type: either \"static\" or \"adaptive\"")
+    String type = "adaptive";
+
+    @Option(name= {"-min"}, description = "The minimum value of W")
+    int minW = -10;
+
+    @Option(name= {"-max"}, description = "The maximum value of W")
+    int maxW = 32;
+
+    @Option(name= {"--data-size"}, description = "The data set size in GB")
+    int datasetSizeGB = 128;
+
+    @Option(name= {"--num-shards"}, description = "The number of compaction shards")
+    int numShards = 4;
+
+    @Option(name= {"--min-cost"}, description = "The minimum cost for adaptive analysis")
+    int minCost = 5;
+
+    @Option(name= {"--gain"}, description = "The gain for adaptive analysis")
+    double gain = 0.15;
+
+    @Option(name= {"-step"}, description = "The step size for W for static analysis")
+    int stepW = 2;
+
+    @Option(name= {"-w"}, description = "The initial value of W for adaptive analysis")
+    int W = 0;
+
+    @Option(name= {"-update-time"}, description = "The update interval in seconds for adaptive analysis")
+    int updateTimeSec = 15;
+
+    @Option(name= {"-duration"}, description = "The duration in minutes for adaptive analysis or for each step in static analysis")
+    int durationMinutes = 1;
+
+    @Option(name= {"-expired-sstable-check-frequency"}, description = "How often to check for expired SSTables")
+    long expiredSSTableCheckFrequency = 600;
+
+    @Option(name= {"-unsafe-aggressive-sstable-expiration"}, description = "Whether to drop expired SSTables without checking if the partitions appear in other SSTables")
+    boolean ignoreOverlaps = false;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        BaseCompactionStrategyTest.setUpClass(); // plain delegation to the base class; main() calls this method as well
+    }
+
+    @Before
+    public void setUp()
+    {
+        setUp(numShards); // overload not declared in the visible part of this class, presumably inherited - TODO confirm
+        logger.info("Simulation set up for data size of {} GiB, {} shards", datasetSizeGB, numShards);
+    }
+
+    public static void main(String[] args) throws Exception
+    {
+        setUpClass();
+        // build a test instance from the command line arguments
+        CompactionSimulationTest test = SingleCommand.singleCommand(CompactionSimulationTest.class).parse(args);
+        // nothing else to do when help was requested
+        if (test.helpOption.showHelpIfRequested())
+            return;
+        // per-instance set up (the @Before method), then the simulation selected by the options
+        test.setUp();
+        test.run();
+    }
+
+    public void run() throws Exception
+    {
+        // the workload must look like RXX_WXX in any letter case; each XX is multiplied by 10000 to get rows per second
+        Pattern WL_REGEX = Pattern.compile("^R(\\d+)_W(\\d+)$");
+        Matcher matcher = WL_REGEX.matcher(workload.toUpperCase());
+        if (!matcher.matches())
+            throw new IllegalArgumentException(String.format("Invalid workload %s.", workload));
+        int readRowsSec = Integer.parseInt(matcher.group(1)) * 10000;
+        int writeRowsSec = Integer.parseInt(matcher.group(2)) * 10000;
+        System.out.println(String.format("Running %s with %d read rows / sec and %d write rows /sec", workload, readRowsSec, writeRowsSec));
+        logger.info("Running {} with {} read rows / sec and {} write rows /sec", workload, readRowsSec, writeRowsSec);
+
+        // equalsIgnoreCase does not depend on the default locale, unlike toLowerCase() (e.g. "STATIC" under a Turkish locale)
+        if (type.equalsIgnoreCase("static"))
+            testStaticAnalysis(workload, readRowsSec, writeRowsSec);
+        else if (type.equalsIgnoreCase("adaptive"))
+            testAdaptiveController(workload, readRowsSec, writeRowsSec);
+        else
+            throw new IllegalArgumentException("Invalid type: " + type);
+    }
+
+    @Test
+    public void testAdaptiveController_R50_W50() throws Exception
+    {
+        int readRowsSec = 50_000;
+        int writeRowsSec = 50_000;
+        // NOTE(review): run() turns "R50_W50" into 50 * 10000 = 500000 rows/sec, ten times these values - confirm which is intended
+        testAdaptiveController("R50_W50", readRowsSec, writeRowsSec);
+    }
+
+    @Test
+    public void testStaticAnalysis_R50_W50() throws Exception
+    {
+        int readRowsSec = 50_000;
+        int writeRowsSec = 50_000;
+        // NOTE(review): run() turns "R50_W50" into 50 * 10000 = 500000 rows/sec, ten times these values - confirm which is intended
+        testStaticAnalysis("R50_W50", readRowsSec, writeRowsSec);
+    }
+
+    @Test
+    public void testSingleW() throws Exception
+    {
+        int W = 2; // similar to tiered with 4 sorted runs per bucket; this local shadows the "-w" option field W
+        int writeRowsSec = 1_000_000;
+        int readRowsSec = 1_000_000;
+        int maxKey = 30_000_000;
+        // non-adaptive run (first argument is false) lasting one minute
+        CsvWriter csvWriter = CsvWriter.make("testUniform_UnifiedStrategy");
+        testUniform(false, csvWriter, W, sstableSize, TimeUnit.MINUTES.toMillis(1), maxKey, readRowsSec, writeRowsSec, NO_OP_OBSERVER);
+    }
+
+    /**
+     * Run a simulation using {@link UnifiedCompactionStrategy} with an initial value of W and let the adaptive
+     * controller choose the best value depending on the workloa
+     */
+    private void testAdaptiveController(String dataSetName, int readRowsSec, int writeRowsSec) throws Exception
+    {
+        int maxKey = 100_000_000;
+
+        String csvFileName = "testAdaptiveController_" + dataSetName;
+        CsvWriter csvWriter = CsvWriter.make(csvFileName);
+
+        testUniform(true, csvWriter, W, sstableSize, TimeUnit.MINUTES.toMillis(durationMinutes), maxKey, readRowsSec, writeRowsSec, NO_OP_OBSERVER);
+        clearSSTables();
+    }
+
+    /**
+     * Run a simulation using {@link UnifiedCompactionStrategy} with different values of W and for a different number of
+     * trials. Report the average IO cost over the period. This can then be plotted as a function of W to show the
+     * impact that W has on the IO cost depending on the workload type (see callers).
+     */
+    private void testStaticAnalysis(String dataSetName, int readRowsSec, int writeRowsSec) throws Exception
+    {
+        int maxKey = 50_000_000;
+
+        String csvFileName = "testStaticAnalysis_" + dataSetName;
+        CsvWriter csvWriter = CsvWriter.make(csvFileName);
+
+        for (int w = minW; w <= maxW; w += stepW)
+        {
+            testUniform(false, csvWriter, w, sstableSize, TimeUnit.MINUTES.toMillis(durationMinutes), maxKey, readRowsSec, writeRowsSec, NO_OP_OBSERVER);
+            clearSSTables();
+        }
+    }
+
+    /**
+     * Runs one simulation whose keys are drawn from a uniform integer distribution between 0 and {@code maxKey}.
+     *
+     * @param adaptive whether the strategy is created with an adaptive or with a static controller
+     * @param csvWriter receives the output of the simulation
+     * @param W the value of W passed to the controller
+     * @param sstableSize the sstable size in bytes, also used to derive the value size of each key
+     * @param durationMillis the duration of the simulation in milliseconds
+     * @param maxKey the maximum key, must be positive
+     * @param readRowsSec the simulated number of rows to be read every second
+     * @param writeRowsSec the simulated number of rows to be inserted every second
+     * @param observer notified of the simulation state changes
+     */
+    private void testUniform(boolean adaptive,
+                             CsvWriter csvWriter,
+                             int W,
+                             long sstableSize,
+                             long durationMillis,
+                             int maxKey,
+                             int readRowsSec,
+                             int writeRowsSec,
+                             SimulationObserver observer) throws Exception
+    {
+        if (maxKey <= 0)
+            fail("Maxkey should be positive");
+
+        // value length for each key: the sstable size spread over the unique keys of an sstable, rounded up
+        double bytesPerKey = sstableSize / (double) uniqueKeysPerSStable;
+        int valueSize = (int) Math.ceil(bytesPerKey);
+
+        logger.debug("Running simulation with uniform distribution, max key: {}, duration: {} ms, maxKey: {}, keys/sstable: {}, value size: {}, min sstable size: {}",
+                     maxKey, durationMillis, maxKey, uniqueKeysPerSStable, valueSize, FBUtilities.prettyPrintMemory(sstableSize));
+
+        AbstractIntegerDistribution keys = new UniformIntegerDistribution(random, 0, maxKey);
+        Counters counters = new Counters();
+        UnifiedCompactionStrategy strategy = createUnifiedCompactionStrategy(counters, adaptive, W, sstableSize, valueSize);
+
+        new Simulation(strategy,
+                       keys,
+                       csvWriter,
+                       counters,
+                       maxKey,
+                       uniqueKeysPerSStable,
+                       valueSize,
+                       durationMillis,
+                       readRowsSec, writeRowsSec,
+                       observer).run();
+    }
+
+    /**
+     * Resets the mocks of all the live and compacting sstables, removes them from the data tracker and
+     * updates {@code repairedAt} to the current time. Asserts that the tracker is empty afterwards.
+     */
+    private void clearSSTables()
+    {
+        for (SSTableReader tracked : Iterables.concat(dataTracker.getLiveSSTables(), dataTracker.getCompacting()))
+            Mockito.reset(tracked);
+
+        dataTracker.removeUnsafe(dataTracker.getLiveSSTables());
+        dataTracker.removeCompactingUnsafe(dataTracker.getCompacting());
+        repairedAt = System.currentTimeMillis();
+
+        // nothing must be left in the tracker, neither live nor compacting
+        assertTrue(dataTracker.getLiveSSTables().isEmpty());
+        assertTrue(dataTracker.getCompacting().isEmpty());
+    }
+
+    /**
+     * Creates the strategy under test, backed by a {@link SimulatedEnvironment} and by either an
+     * {@link AdaptiveController} or a {@link StaticController}.
+     *
+     * @param counters the simulation counters read by the simulated environment
+     * @param adaptive true to use the adaptive controller, false to use the static controller
+     * @param W the value of W for the static controller, or the initial value for the adaptive controller
+     * @param sstableSize the sstable size in bytes, passed to the controller in MB
+     * @param valueSize the fixed size of the value associated to each key
+     * @return a new strategy using the chosen controller
+     */
+    private UnifiedCompactionStrategy createUnifiedCompactionStrategy(Counters counters, boolean adaptive, int W, long sstableSize, int valueSize)
+    {
+        double o = 1.0;
+        double maxSpaceOverhead = 0.2;
+        SimulatedEnvironment env = new SimulatedEnvironment(counters, valueSize);
+
+        Controller controller;
+        if (adaptive)
+        {
+            controller = new AdaptiveController(MonotonicClock.preciseTime,
+                                                env,
+                                                W,
+                                                o,
+                                                datasetSizeGB << 10,  // MB
+                                                numShards,
+                                                sstableSize >> 20, // MB
+                                                0,
+                                                maxSpaceOverhead,
+                                                0,
+                                                expiredSSTableCheckFrequency,
+                                                ignoreOverlaps,
+                                                updateTimeSec,
+                                                minW,
+                                                maxW,
+                                                gain,
+                                                minCost);
+        }
+        else
+        {
+            controller = new StaticController(env,
+                                              new int[] { W },
+                                              o,
+                                              datasetSizeGB << 10,  // MB
+                                              numShards,
+                                              sstableSize >> 20, // MB
+                                              0,
+                                              maxSpaceOverhead,
+                                              0,
+                                              expiredSSTableCheckFrequency,
+                                              ignoreOverlaps);
+        }
+
+        return new UnifiedCompactionStrategy(strategyFactory, controller);
+    }
+
+    /**
+     * Writes the simulation output to two CSV files in the log directory: {@code <fileName>.csv} receives the
+     * periodic updates and {@code <fileName>-avg.csv} receives the averages.
+     * <p>
+     * Neither file may exist already. The files are encoded in UTF-8 regardless of the platform default charset
+     * and every write is flushed immediately. Write failures are logged and otherwise ignored.
+     */
+    private final static class CsvWriter
+    {
+        private final OutputStreamWriter updateWriter;
+        private final OutputStreamWriter averagesWriter;
+        private boolean headerWritten;
+
+        private CsvWriter(String fileName) throws IOException
+        {
+            OutputStreamWriter updates = open(fileName + ".csv");
+            OutputStreamWriter averages;
+            try
+            {
+                averages = open(fileName + "-avg.csv");
+            }
+            catch (IOException | RuntimeException ex)
+            {
+                // do not leak the first file when the second one cannot be created
+                try
+                {
+                    updates.close();
+                }
+                catch (IOException closeFailure)
+                {
+                    ex.addSuppressed(closeFailure);
+                }
+                throw ex;
+            }
+
+            this.updateWriter = updates;
+            this.averagesWriter = averages;
+            this.headerWritten = false;
+        }
+
+        /**
+         * Creates a new file in the log directory and returns a UTF-8 writer for it.
+         *
+         * @param name the file name, including the extension
+         * @throws IOException if the file already exists or cannot be created
+         */
+        private static OutputStreamWriter open(String name) throws IOException
+        {
+            return new OutputStreamWriter(Files.newOutputStream(Paths.get(logDirectory, name), StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE),
+                                          java.nio.charset.StandardCharsets.UTF_8);
+        }
+
+        /**
+         * @param fileName the name of the CSV files, without the extension
+         * @return a writer for the two CSV files
+         * @throws IOException if either file already exists or cannot be created
+         */
+        static CsvWriter make(String fileName) throws IOException
+        {
+            return new CsvWriter(fileName);
+        }
+
+        /**
+         * Writes the header to both files, but only the first time that this method is called.
+         * Synchronized so that the check and the update of the flag cannot interleave between two callers.
+         */
+        synchronized void writeHeader(String toWrite)
+        {
+            if (!headerWritten)
+            {
+                performWrite(toWrite, updateWriter);
+                performWrite(toWrite, averagesWriter);
+                headerWritten = true;
+            }
+        }
+
+        /** Writes the given text to the updates file. */
+        void write(String toWrite)
+        {
+            performWrite(toWrite, updateWriter);
+        }
+
+        /** Writes the given text to the averages file. */
+        void writeAverages(String toWrite)
+        {
+            performWrite(toWrite, averagesWriter);
+        }
+
+        /**
+         * Writes and flushes the given text. A failure is only logged, so it does not interrupt the caller.
+         */
+        private synchronized void performWrite(String toWrite, OutputStreamWriter writer)
+        {
+            try
+            {
+                writer.write(toWrite);
+                writer.flush();
+            }
+            catch (IOException ex)
+            {
+                logger.error("Failed to write to csv: ", ex);
+            }
+        }
+    }
+
+    /**
+     * The counters of a simulation, all kept in atomic variables.
+     */
+    private final static class Counters
+    {
+        /** The simulated number of rows inserted by the user. */
+        final AtomicLong numInserted = new AtomicLong(0L);
+
+        /** The simulated number of rows requested by the user. */
+        final AtomicLong numRequested = new AtomicLong(0L);
+
+        /** The simulated number of rows flushed */
+        final AtomicLong numFlushed = new AtomicLong(0L);
+
+        /** The simulated number of rows read during compaction */
+        final AtomicLong numReadForCompaction = new AtomicLong(0L);
+
+        /** The simulated number of rows written during compaction */
+        final AtomicLong numWrittenForCompaction = new AtomicLong(0L);
+
+        /** The simulated number of rows written to disk (by flushing or compactions). */
+        final AtomicLong numWritten = new AtomicLong(0L);
+
+        /** The number of compactions simulated */
+        final AtomicLong numCompactions = new AtomicLong(0L);
+
+        /** The number of compactions submitted but not yet executed */
+        final AtomicInteger numCompactionsPending = new AtomicInteger(0);
+
+        /** The number of sstables that have been compacted away */
+        final AtomicLong numCompactedSSTables = new AtomicLong(0L);
+
+        /** Sets every counter back to zero. */
+        void reset()
+        {
+            numInserted.set(0L);
+            numRequested.set(0L);
+            numFlushed.set(0L);
+            numReadForCompaction.set(0L);
+            numWrittenForCompaction.set(0L);
+            numWritten.set(0L);
+            numCompactions.set(0L);
+            numCompactionsPending.set(0);
+            numCompactedSSTables.set(0L);
+        }
+
+        @Override
+        public String toString()
+        {
+            long inserted = numInserted.get();
+            int insertedPercentage = percentageInserted();
+            long requested = numRequested.get();
+            int readPercentage = percentageRead();
+
+            return String.format("Ins: %d (%d%%), Req: %d (%d%%), Flushed: %d, Written: %d",
+                                 inserted,
+                                 insertedPercentage,
+                                 requested,
+                                 readPercentage,
+                                 numFlushed.get(),
+                                 numWritten.get());
+        }
+
+        /** @return the inserted rows as a percentage of the inserted plus requested rows, truncated to an integer */
+        int percentageInserted()
+        {
+            return percentageOfOperations(numInserted);
+        }
+
+        /** @return the requested rows as a percentage of the inserted plus requested rows, truncated to an integer */
+        int percentageRead()
+        {
+            return percentageOfOperations(numRequested);
+        }
+
+        /**
+         * @param counter either {@link #numInserted} or {@link #numRequested}
+         * @return the counter as a percentage of the inserted plus requested rows; the total is clamped to
+         *         a minimum of one in order to avoid dividing by zero
+         */
+        private int percentageOfOperations(AtomicLong counter)
+        {
+            double tot = Math.max(1, numInserted.get() + numRequested.get());
+            double ratio = counter.get() / tot;
+            return (int) (ratio * 100);
+        }
+    }
+
+    /**
+     * An implementation of {@link Environment} that uses simulated values: the latencies and ratios come from
+     * the parameters of the enclosing test, whilst the quantities of data come from the simulation counters.
+     */
+    private class SimulatedEnvironment implements Environment
+    {
+        /** The simulation counters from which the quantities of data are derived */
+        final Counters counters;
+
+        /** The fixed size of the value associated to each key, used to convert rows into bytes */
+        final int valueSize;
+
+        SimulatedEnvironment(Counters counters, int valueSize)
+        {
+            this.counters = counters;
+            this.valueSize = valueSize;
+        }
+
+        @Override
+        public MovingAverage makeExpMovAverage()
+        {
+            return ExpMovingAverage.decayBy100();
+        }
+
+        @Override
+        public double cacheMissRatio()
+        {
+            return cacheFactor;
+        }
+
+        @Override
+        public double bloomFilterFpRatio()
+        {
+            return bfFactor;
+        }
+
+        @Override
+        public int chunkSize()
+        {
+            return PageAware.PAGE_SIZE;
+        }
+
+        /** @return the number of rows inserted so far multiplied by the value size */
+        @Override
+        public long bytesInserted()
+        {
+            return counters.numInserted.get() * valueSize;
+        }
+
+        /** @return the number of rows requested so far */
+        @Override
+        public long partitionsRead()
+        {
+            return counters.numRequested.get();
+        }
+
+        @Override
+        public double sstablePartitionReadLatencyNanos()
+        {
+            return TimeUnit.MICROSECONDS.toNanos(partitionReadLatencyMicros);
+        }
+
+        @Override
+        public double compactionLatencyPerKbInNanos()
+        {
+            // this is slightly incorrect, we would need to measure the size of compacted sstables
+            return TimeUnit.MICROSECONDS.toNanos(compactionTimeMicros);
+        }
+
+        @Override
+        public double flushLatencyPerKbInNanos()
+        {
+            return TimeUnit.MICROSECONDS.toNanos(flushTimeMicros);
+        }
+
+        /**
+         * @return the measured write amplification, that is the bytes flushed plus the bytes written by
+         *         compactions, divided by the bytes flushed, or zero if nothing has been flushed yet
+         */
+        @Override
+        public double WA()
+        {
+            double bytesFlushed = counters.numFlushed.get() * valueSize;
+            double bytesCompacted = counters.numWrittenForCompaction.get() * valueSize;
+            return bytesFlushed <= 0 ? 0 : (bytesFlushed + bytesCompacted) / bytesFlushed;
+        }
+
+        @Override
+        public double flushSize()
+        {
+            // a rough estimation should be fine; the multiplication is done in floating point because the
+            // product of two ints would overflow for flush sizes of 2 GiB or more
+            return (double) uniqueKeysPerSStable * valueSize;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("Read latency: %d us / partition, flush latency: %d us / KiB, compaction latency: %d us / KiB, bfpr: %f, measured WA: %.2f, flush size %s",
+                                 TimeUnit.NANOSECONDS.toMicros((long) sstablePartitionReadLatencyNanos()),
+                                 TimeUnit.NANOSECONDS.toMicros((long) flushLatencyPerKbInNanos()),
+                                 TimeUnit.NANOSECONDS.toMicros((long) compactionLatencyPerKbInNanos()),
+                                 bloomFilterFpRatio(),
+                                 WA(),
+                                 FBUtilities.prettyPrintMemory((long)flushSize()));
+        }
+    }
+
+    /**
+     * The output of the simulation: it writes one CSV row for every update and, on request, one CSV row with
+     * the averages of the IO costs that were recorded after the warm-up period.
+     * <p>
+     * All the numbers are formatted with {@link java.util.Locale#ROOT}, so that the decimal separator is always
+     * a dot and cannot be confused with the comma that separates the CSV columns.
+     */
+    private final class SimulationOutput
+    {
+        /** The initial timestamp */
+        private final long start;
+
+        /** The compaction strategy */
+        private final UnifiedCompactionStrategy strategy;
+
+        /** The compaction cost calculator */
+        private final CostsCalculator calculator;
+
+        /** Save the read IO costs after the warm-up period for calculating the final average and stddev, TODO - can we do it without a list? */
+        private final List<Double> readIOCosts;
+
+        /** Save the write IO costs after the warm-up period for calculating the final average and stddev, TODO - can we do it without a list? */
+        private final List<Double> writeIOCosts;
+
+        /**
+         * Creates an initial empty status and writes the header to the CSV file.
+         *
+         * @param start the timestamp in milliseconds that the reported timestamps are relative to
+         * @param writer the CSV writer that receives the header
+         * @param strategy the strategy whose controller supplies the costs calculator
+         */
+        SimulationOutput(long start, CsvWriter writer, UnifiedCompactionStrategy strategy)
+        {
+            this.start = start;
+            this.strategy = strategy;
+            this.calculator = strategy.getController().getCalculator();
+            this.readIOCosts = new ArrayList<>();
+            this.writeIOCosts = new ArrayList<>();
+
+            writeCSVHeader(writer);
+        }
+
+        private void writeCSVHeader(CsvWriter writer)
+        {
+            writer.writeHeader(String.join(",",
+                                           "timestamp ms",
+                                           "W",
+                                           "num compactions",
+                                           "live sstables",
+                                           "space used bytes",
+                                           "Tot Num inserted",
+                                           "Tot Num read",
+                                           "% inserted",
+                                           "% read",
+                                           "Read IO",
+                                           "Read IO stddev",
+                                           "Write IO",
+                                           "Write IO stddev",
+                                           "Tot IO",
+                                           "Tot IO stddev",
+                                           "Num. pending",
+                                           "WA")
+                               + System.lineSeparator());
+        }
+
+        /**
+         * Writes one row with the current values and, once the warm-up period has elapsed, records the
+         * current IO costs for the final averages. The standard deviation columns are always zero here.
+         */
+        private void write(CsvWriter writer, Counters counters)
+        {
+            int W = strategy.getW(0);
+            long length = (long) Math.ceil(calculator.spaceUsed());
+            int RA = strategy.getController().readAmplification(length, W);
+            int WA = strategy.getController().writeAmplification(length, W);
+
+            double readIOCost = calculator.getReadCostForQueries(RA);
+            double writeIOCost = calculator.getWriteCostForQueries(WA);
+
+            // read the clock once so that the warm-up check and the reported timestamp agree
+            long elapsedMillis = System.currentTimeMillis() - start;
+            if (elapsedMillis >= TimeUnit.SECONDS.toMillis(warmupPeriodSec))
+            {
+                this.readIOCosts.add(readIOCost);
+                this.writeIOCosts.add(writeIOCost);
+            }
+
+            String toWrite = String.join(",",
+                                         toString(elapsedMillis),
+                                         toString(W),
+                                         toString(counters.numCompactions.get()),
+                                         toString(calculator.numSSTables()),
+                                         toString(length),
+                                         toString(counters.numInserted.get()),
+                                         toString(counters.numRequested.get()),
+                                         toString(counters.percentageInserted()),
+                                         toString(counters.percentageRead()),
+                                         toString(readIOCost),
+                                         "0",
+                                         toString(writeIOCost),
+                                         "0",
+                                         toString(readIOCost + writeIOCost),
+                                         "0",
+                                         toString(counters.numCompactionsPending.get() + strategy.getEstimatedRemainingTasks()),
+                                         toString(calculator.getEnv().WA()))
+                             + System.lineSeparator();
+
+            writer.write(toWrite);
+        }
+
+        /**
+         * Writes one row with the averages and the standard deviations of the IO costs recorded so far.
+         * The total IO standard deviation is reported as the sum of the read and write standard deviations.
+         */
+        private void writeAverages(CsvWriter writer, Counters counters)
+        {
+            double writeIOCostAvg = average(writeIOCosts);
+            double writeIOCostStd = stddev(writeIOCostAvg, writeIOCosts);
+
+            double readIOCostAvg = average(readIOCosts);
+            double readIOCostStd = stddev(readIOCostAvg, readIOCosts);
+
+
+            String toWrite = String.join(",",
+                                         toString(System.currentTimeMillis() - start),
+                                         toString(strategy.getW(0)),
+                                         toString(counters.numCompactions.get()),
+                                         toString(calculator.numSSTables()),
+                                         toString(calculator.spaceUsed()),
+                                         toString(counters.numInserted.get()),
+                                         toString(counters.numRequested.get()),
+                                         toString(counters.percentageInserted()),
+                                         toString(counters.percentageRead()),
+                                         toString(readIOCostAvg),
+                                         toString(readIOCostStd),
+                                         toString(writeIOCostAvg),
+                                         toString(writeIOCostStd),
+                                         toString(readIOCostAvg + writeIOCostAvg),
+                                         toString(readIOCostStd + writeIOCostStd),
+                                         toString(counters.numCompactionsPending.get() + strategy.getEstimatedRemainingTasks()),
+                                         toString(calculator.getEnv().WA()))
+                             + System.lineSeparator();
+
+            writer.writeAverages(toWrite);
+        }
+
+        /** @return the arithmetic mean of the values, or zero if there are no values */
+        public double average(List<Double> vals)
+        {
+            return vals.isEmpty() ? 0 : vals.stream().reduce(Double::sum).get() / vals.size();
+        }
+
+        /**
+         * @param avg the average of the values
+         * @param vals the values
+         * @return the population standard deviation of the values, or zero if there are no values
+         */
+        public double stddev(double avg, List<Double> vals)
+        {
+            if (vals.isEmpty())
+                return 0;
+
+            double sd = 0;
+            for (double v : vals)
+                sd += Math.pow(v - avg, 2);
+
+            return Math.sqrt(sd / vals.size());
+        }
+
+        private String toString(long val)
+        {
+            // the root locale keeps the ASCII digits whatever the default locale is
+            return String.format(java.util.Locale.ROOT, "%d", val);
+        }
+
+        private String toString(double val)
+        {
+            // the root locale guarantees a dot as the decimal separator, a comma would split the CSV column
+            return String.format(java.util.Locale.ROOT, "%.6f", val);
+        }
+
+        @Override
+        public String toString()
+        {
+            long elapsed = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - start);
+            return String.format("W: %d, num. sstables: %d, elapsed: %d s",
+                                 strategy.getW(0),
+                                 dataTracker.getLiveSSTables().size(),
+                                 elapsed);
+        }
+    }
+
+    /**
+     * A simple state machine for the simulation. The states are declared in the order in which a successful
+     * simulation goes through them, and the declaration order is relied upon when comparing ordinals.
+     */
+    private enum SimulationState
+    {
+        NONE, // the simulation hasn't yet started
+        SETTING_UP, // the simulation is setting up, e.g. pre-loading data and waiting for initial set of compactions
+        RUNNING, // the simulation is running: inserting data and/or reading data and reporting the output
+        TEARING_DOWN, // the simulation is tearing down (waiting for compactions to settle and for threads to complete), or a worker has failed
+        DONE // the simulation has finished, whether successfully or not
+    }
+
+    /**
+     * Implemented by tests that need to react to simulation progress.
+     * <p>
+     * The interface has a single method so that it can be implemented with a lambda.
+     */
+    @FunctionalInterface
+    private interface SimulationObserver
+    {
+        /**
+         * Called after the simulation has moved to a new state.
+         *
+         * @param state the state that the simulation has just moved to
+         */
+        void onChange(SimulationState state);
+    }
+
+    private static SimulationObserver NO_OP_OBSERVER = state -> {};
+
+    /**
+     * The implementation of the simulation
+     */
+    private final class Simulation
+    {
+        /** The strategy to test */
+        private final UnifiedCompactionStrategy strategy;
+
+        /** The distribution is used to generate the values to be inserted */
+        private final AbstractIntegerDistribution distribution;
+
+        /** The simulation output will be passed to this csv writer */
+        private final CsvWriter csvWriter;
+
+        /** The maximum key to insert when pre-loading data, this is also normally the maximum key value of the data distribution */
+        private final int maxKey;
+
+        /** The number of unique keys that trigger an sstable to be created */
+        private final int uniqueKeysPerSStable;
+
+        /** The fixed value size for each key inserted */
+        private final int valueSize;
+
+        /** The simulation duration in milliseconds, it will keep on reading and writing for this period of time
+         * according to the rate limiters below */
+        private final long durationMillis;
+
+        /** The insertion rate limiter is based on the insert rows / sec received in the c.tor, null if that rate is not positive */
+        private final RateLimiter writeRate;
+
+        /** The read rate limiter is based on the read rows / sec received in the c.tor, null if that rate is not positive */
+        private final RateLimiter readRate;
+
+        /** These are the compactions that have been submitted by the strategy */
+        private final BlockingQueue<AbstractCompactionTask> compactions;
+
+        /** The cardinalities for the sstables to be flushed */
+        private final BlockingQueue<ICardinality> flushing;
+
+        /** The simulation counters */
+        private final Counters counters;
+
+        /** This is set in case of error to fail the test */
+        private final AtomicReference<Throwable> error;
+
+        /** A simulation observer */
+        private final SimulationObserver observer;
+
+        /** The simulation state */
+        private final AtomicReference<SimulationState> state;
+
+        /** The start of the simulation */
+        private volatile long start;
+
+        /** Contains output parameters that the simulation should produce periodically. */
+        private volatile SimulationOutput output;
+
+        /**
+         * Creates a simulation in the {@link SimulationState#NONE} state, nothing is started until it is run.
+         *
+         * @param strategy the strategy to test
+         * @param distribution the distribution to generate random integer keys
+         * @param csvWriter writes statistics to a csv file
+         * @param counters the simulation counters
+         * @param maxKey the maximum value of the key
+         * @param uniqueKeysPerSStable the number of unique keys that trigger an sstable to be created
+         * @param valueSize the fixed size for the value associated to each key
+         * @param durationMillis the duration of the simulation read and write phases
+         * @param readRowsSec the simulated number of rows to be read every second, no read rate limiter is
+         *                    created if this is not positive
+         * @param writeRowsSec the simulated number of rows to be inserted every second, no write rate limiter is
+         *                     created if this is not positive
+         * @param observer notified of every change in the simulation state
+         */
+        Simulation(UnifiedCompactionStrategy strategy,
+                   AbstractIntegerDistribution distribution,
+                   CsvWriter csvWriter,
+                   Counters counters,
+                   int maxKey,
+                   int uniqueKeysPerSStable,
+                   int valueSize,
+                   long durationMillis,
+                   int readRowsSec, int writeRowsSec,
+                   SimulationObserver observer)
+        {
+            // what is being simulated and where the results go
+            this.strategy = strategy;
+            this.distribution = distribution;
+            this.csvWriter = csvWriter;
+            this.counters = counters;
+            this.observer = observer;
+
+            // the shape of the data and the length of the run
+            this.maxKey = maxKey;
+            this.uniqueKeysPerSStable = uniqueKeysPerSStable;
+            this.valueSize = valueSize;
+            this.durationMillis = durationMillis;
+
+            // a rate that is not positive disables the corresponding limiter
+            this.writeRate = writeRowsSec <= 0 ? null : RateLimiter.create(writeRowsSec);
+            this.readRate = readRowsSec <= 0 ? null : RateLimiter.create(readRowsSec);
+
+            // bounded queues: the producers are blocked when their queue is full
+            this.compactions =  new ArrayBlockingQueue<>(512); // flushing / compaction thread will be blocked when queue is full
+            this.flushing = new ArrayBlockingQueue<>(256); // insert thread will be blocked when queue is full
+
+            this.error = new AtomicReference<>(null);
+            this.state = new AtomicReference<>(SimulationState.NONE);
+        }
+
+        /**
+         * Runs the simulation from start to end. A simulation can only be run once.
+         * <ol>
+         *   <li>Setting up: the controller is started, the data is pre-loaded whilst the reporting and
+         *       compaction threads are running, then the compactions are left to settle.</li>
+         *   <li>Running: the flushing, insert and read threads are started and the insert and read threads
+         *       are awaited.</li>
+         *   <li>Tearing down: the compactions are left to settle, the remaining threads are awaited and
+         *       the results are summarized.</li>
+         * </ol>
+         * Whatever the outcome, the simulation ends in the {@link SimulationState#DONE} state and the
+         * controller is shut down, so a failure during an earlier phase is reported as it is rather than
+         * being masked by a failed state transition.
+         *
+         * @throws IllegalStateException if the simulation has already been run
+         * @throws RuntimeException if a worker thread failed during set-up or whilst running, the failure
+         *                          of the worker is the cause
+         */
+        void run() throws Exception
+        {
+            if (state.get() != SimulationState.NONE)
+                throw new IllegalStateException("Simulation already run!");
+
+            try
+            {
+                NamedThreadFactory threadFactory = new NamedThreadFactory("Simulation-worker");
+
+                setState(SimulationState.NONE, SimulationState.SETTING_UP);
+
+                this.start = System.currentTimeMillis();
+                strategy.getController().startup(strategy, ScheduledExecutors.scheduledTasks);
+                this.output = new SimulationOutput(start, csvWriter, strategy);
+
+                int numShards = strategy.getController().getNumShards();
+
+                CountDownLatch settingUpDone = new CountDownLatch(1);
+                CountDownLatch runningDone = new CountDownLatch(2);
+                CountDownLatch tearingDownDone = new CountDownLatch(3 + numShards); // 1 reporter, 2 flusher and num shards compacting threads
+
+                threadFactory.newThread(new RunAndCountDown(settingUpDone, "preload", this::preloadData)).start();
+                threadFactory.newThread(new RunAndCountDown(tearingDownDone, "report", this::reportOutput)).start();
+
+                for (int i = 0; i < numShards; i++)
+                    threadFactory.newThread(new RunAndCountDown(tearingDownDone, "compact " + i, this::compactData)).start();
+
+                settingUpDone.await();
+
+                Throwable setUpFailure = error.get();
+                if (setUpFailure != null)
+                    throw new RuntimeException("Simulation has failed", setUpFailure);
+
+                waitForCompactionsToSettle();
+
+                setState(SimulationState.SETTING_UP, SimulationState.RUNNING);
+                this.start = System.currentTimeMillis();
+                //this.counters.reset();
+
+                threadFactory.newThread(new RunAndCountDown(tearingDownDone, "flush 1", this::flushData)).start();
+                threadFactory.newThread(new RunAndCountDown(tearingDownDone, "flush 2", this::flushData)).start();
+                threadFactory.newThread(new RunAndCountDown(runningDone, "insert", () -> runOrWait(this::insertData, writeRate))).start();
+                threadFactory.newThread(new RunAndCountDown(runningDone, "read", () -> runOrWait(this::readData, readRate))).start();
+
+                runningDone.await();
+
+                Throwable runFailure = error.get();
+                if (runFailure != null)
+                    throw new RuntimeException("Simulation has failed", runFailure);
+
+                setState(SimulationState.RUNNING, SimulationState.TEARING_DOWN);
+
+                waitForCompactionsToSettle();
+
+                tearingDownDone.await();
+
+                summarize();
+            }
+            finally
+            {
+                try
+                {
+                    // Move to DONE from whatever state the simulation is in. A compare-and-set from
+                    // TEARING_DOWN would throw if the failure happened earlier, which would hide the
+                    // original exception and skip the shutdown of the controller.
+                    SimulationState previous = state.getAndSet(SimulationState.DONE);
+                    logger.debug("Updating simulation state from {} to {}", previous, SimulationState.DONE);
+                    observer.onChange(SimulationState.DONE);
+                }
+                finally
+                {
+                    if (strategy.getController().isRunning())
+                        strategy.getController().shutdown();
+                }
+            }
+        }
+
+        private class RunAndCountDown implements Runnable
+        {
+            CountDownLatch done;
+            String what;
+            Runnable task;
+
+            RunAndCountDown(CountDownLatch done, String what, Runnable task)
+            {
+                this.done = done;
+                this.what = what;
+                this.task = task;
+            }
+
+            @Override
+            public void run()
+            {
+                try
+                {
+                    logger.debug("Running \"{}\"", what);
+                    task.run();
+                }
+                catch (Throwable t)
+                {
+                    SimulationState currentState = state.get();
+                    logger.error("Unexpected error during \"{}\" with state {}:", what, currentState, t);
+
+                    error.compareAndSet(null, t);
+
+                    if (currentState.ordinal() < SimulationState.TEARING_DOWN.ordinal())
+                        setState(currentState, SimulationState.TEARING_DOWN);
+                }
+                finally
+                {
+                    logger.debug("Finished \"{}\"", what);
+                    done.countDown();
+                }
+            }
+        }
+
+
+        /**
+         * Atomically moves the simulation from one state to another and notifies the observer.
+         *
+         * @param from the state that the simulation is expected to be in
+         * @param to the new state
+         * @throws IllegalStateException if the simulation was not in the expected state
+         */
+        private void setState(SimulationState from, SimulationState to)
+        {
+            logger.debug("Updating simulation state from {} to {}", from, to);
+
+            boolean updated = state.compareAndSet(from, to);
+            if (!updated)
+                throw new IllegalStateException(String.format("Failed to update simulation state from %s to %s", from, to));
+
+            observer.onChange(to);
+        }
+
+        /**
+         * Blocks until the queue of submitted compactions has been found empty three times, logging the
+         * progress every second for as long as the queue is not empty.
+         */
+        void waitForCompactionsToSettle()
+        {
+            logger.debug("Waiting for compactions to settle...");
+
+            // 3 attempts in case the queue is temporarily empty before submitting a new compaction
+            int attemptsLeft = 3;
+            while (attemptsLeft > 0)
+            {
+                while (!compactions.isEmpty())
+                {
+                    FBUtilities.sleepQuietly(1000);
+                    logger.debug("{}, live sstables: {}, compacting: {}, pending compactions: {}, pending flushing: {}, elapsed: {} s",
+                                 counters,
+                                 dataTracker.getLiveSSTables().size(),
+                                 dataTracker.getCompacting().size(),
+                                 compactions.size() + strategy.getEstimatedRemainingTasks(),
+                                 flushing.size(),
+                                 TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - start));
+                }
+
+                attemptsLeft--;
+            }
+
+            logger.debug("Compactions settled, live sstables: {}", dataTracker.getLiveSSTables().size());
+        }
+
+        /** Fails with the first recorded simulation error, if any; otherwise logs timing, outputs and strategy statistics. */
+        void summarize()
+        {
+            Throwable err = error.get(); // read once so the check and the report use the same value
+            if (err != null)
+            {
+                logger.error("Simulation failed:", err);
+                // still an AssertionError (the type JUnit's fail() raises), but with the original failure chained as the cause
+                throw new AssertionError("Simulation failed with exception: " + err.getClass().getCanonicalName() + '/' + err.getMessage(), err);
+            }
+
+            long elapsedMs = System.currentTimeMillis() - start;
+            logger.info("Total time: {}s  WA : {}", TimeUnit.SECONDS.convert(elapsedMs, TimeUnit.MILLISECONDS), strategy.getController().getEnv().WA());
+            logger.info("Final outputs: {} {}", counters, output);
+
+            logger.info("Strategy aggregated statistics:");
+            logger.info(strategy.getStatistics().toString());
+        }
+
+        /**
+         * Runs {@code task} when a rate limiter is supplied; with no limiter, sleeps for {@code durationMillis} instead.
+         */
+        private void runOrWait(Runnable task, RateLimiter rateLimiter)
+        {
+            if (rateLimiter == null)
+                FBUtilities.sleepQuietly(durationMillis);
+            else
+                task.run();
+        }
+
+        /**
+         * Insert sequential keys, wrapping back to 0 at maxKey, until datasetSizeGB GiB of values have been inserted.
+         */
+        private void preloadData()
+        {
+            ICardinality cardinality = newCardinality();
+
+            byte[] scratchBytes = new byte[8];
+            ByteBuffer scratch = ByteBuffer.wrap(scratchBytes);
+            long numToFlush;
+            long sinceLastFlush = 0; // insertions since the last flush, unaffected by the key index wrapping around
+            long lastLogged = System.currentTimeMillis();
+            long maxBytesToInsert = (long) datasetSizeGB << 30;
+            long bytesInserted = 0;
+            int i = 0;
+
+            logger.info("Inserting up to {}", FBUtilities.prettyPrintMemory(maxBytesToInsert));
+
+            try
+            {
+                while(bytesInserted < maxBytesToInsert)
+                {
+                    scratch.clear();
+                    scratch.putLong(0, i);
+                    long hash = MurmurHash.hash64(scratchBytes, scratchBytes.length);
+                    cardinality.offerHashed(hash);
+
+                    counters.numInserted.incrementAndGet();
+                    bytesInserted += valueSize;
+
+                    sinceLastFlush++;
+                    if (++i == maxKey)
+                        i = 0;
+
+                    if (System.currentTimeMillis()- lastLogged >= TimeUnit.SECONDS.toMillis(1))
+                    {
+                        lastLogged = System.currentTimeMillis();
+                        logger.debug("Ins: {}, keys: {}, live sstables: {}, compacting: {}, pending compactions: {}",
+                                     FBUtilities.prettyPrintMemory(bytesInserted),
+                                     i,
+                                     dataTracker.getLiveSSTables().size(),
+                                     dataTracker.getCompacting().size(),
+                                     compactions.size() + strategy.getEstimatedRemainingTasks());
+                    }
+
+                    if (sinceLastFlush >= uniqueKeysPerSStable && // no point in checking the cardinality until we've inserted uniqueKeysPerSStable more entries
+                        (numToFlush = cardinality.cardinality()) >= uniqueKeysPerSStable)
+                    {
+                        counters.numFlushed.addAndGet(numToFlush);
+                        sinceLastFlush = 0;
+                        generateSSTables(cardinality, numToFlush, "preload", true);
+
+                        cardinality = newCardinality();
+                    }
+
+                    if (i % 1000 == 0 && state.get() == SimulationState.TEARING_DOWN)
+                    { // this happens if the compaction threads fail
+                        logger.debug("Interrupting preload, simulation is tearing down");
+                        break;
+                    }
+                }
+
+                if ((numToFlush = cardinality.cardinality()) > 0)
+                {
+                    counters.numFlushed.addAndGet(numToFlush);
+                    generateSSTables(cardinality, numToFlush, "preload", true);
+                }
+            }
+            catch (Exception e)
+            {
+                logger.error("Exception happen during preloading", e);
+            }
+        }
+
+        /**
+         * Simulate inserting data, handing the cardinality to the flushing queue once it has reached {@code uniqueKeysPerSStable} unique entries.
+         */
+        private void insertData()
+        {
+            ICardinality cardinality = newCardinality();
+
+            int numSSTables = 0;
+            long numFlushed = 0;
+
+            byte[] scratchBytes = new byte[8];
+            ByteBuffer scratch = ByteBuffer.wrap(scratchBytes);
+
+            long now;
+            long lastLogged = start;
+            try
+            {
+                while((now = System.currentTimeMillis()) - start <= durationMillis)
+                {
+                    scratch.clear();
+                    scratch.putLong(0, distribution.sample());
+                    long hash = MurmurHash.hash64(scratchBytes, scratchBytes.length);
+                    cardinality.offerHashed(hash);
+
+                    counters.numInserted.incrementAndGet();
+                    writeRate.acquire();
+
+                    if (now - lastLogged >= TimeUnit.SECONDS.toMillis(1))
+                    {
+                        lastLogged = now;
+                        logger.debug("{}, live sstables: {}, compacting: {}, pending compactions: {}, pending flushing: {}, elapsed: {} s",
+                                     counters,
+                                     dataTracker.getLiveSSTables().size(),
+                                     dataTracker.getCompacting().size(),
+                                     compactions.size() + strategy.getEstimatedRemainingTasks(),
+                                     flushing.size(),
+                                     TimeUnit.MILLISECONDS.toSeconds(now - start));
+
+                        if (state.get() == SimulationState.TEARING_DOWN)
+                            break;
+                    }
+
+                    if (counters.numInserted.get() >= (numFlushed + uniqueKeysPerSStable) && // no point in checking the cardinality until we've inserted uniqueKeysPerSStable more entries
+                        cardinality.cardinality() >= uniqueKeysPerSStable)
+                    {
+                        numFlushed = counters.numInserted.get();
+                        numSSTables++;
+                        flushing.put(cardinality);
+                        cardinality = newCardinality();
+                    }
+                }
+
+                // generate one final sstable
+                if (cardinality.cardinality() > 0)
+                {
+                    numSSTables++;
+                    flushing.put(cardinality);
+                }
+
+                logger.debug("Status: {} {}, sstables: {}, completed inserting data", counters, output, numSSTables);
+            }
+            catch (InterruptedException e)
+            {
+                logger.error("Exception happen during insertion", e);
+                Thread.currentThread().interrupt(); // restore the flag cleared when the exception was thrown
+            }
+        }
+
+        /** Counts simulated read requests, acquiring from {@code readRate} for each, until the duration elapses or tear-down starts. */
+        private void readData()
+        {
+            boolean tearingDown = false;
+            while (!tearingDown && System.currentTimeMillis() - start <= durationMillis)
+            {
+                counters.numRequested.incrementAndGet();
+                readRate.acquire();
+
+                tearingDown = state.get() == SimulationState.TEARING_DOWN;
+            }
+        }
+
+        /**
+         * Emits a regular report every {@code csvUpdatePeriodMs} until the state reaches TEARING_DOWN or later, then a final one.
+         */
+        void reportOutput()
+        {
+            final int tearDownOrdinal = SimulationState.TEARING_DOWN.ordinal();
+            while (state.get().ordinal() < tearDownOrdinal)
+            {
+                FBUtilities.sleepQuietly(csvUpdatePeriodMs);
+                doReportOutput(false);
+            }
+            doReportOutput(true);
+        }
+
+        /** Writes the averages for the last report, or a regular entry otherwise, then traces the counters and the output. */
+        private void doReportOutput(boolean isLast)
+        {
+            if (!isLast)
+                output.write(csvWriter, counters);
+            else
+                output.writeAverages(csvWriter, counters);
+            logger.trace("{} {}", counters, output);
+        }
+
+        /**
+         * Take the cardinalities from the flushing queue and generate sstables, until the simulation is tearing down
+         * and the queue has been drained. An interruption is logged and ends the loop, leaving the interrupt flag set.
+         */
+        private void flushData()
+        {
+            try
+            {
+                while(state.get().ordinal() < SimulationState.TEARING_DOWN.ordinal() || !flushing.isEmpty())
+                {
+                    ICardinality cardinality = flushing.poll(1, TimeUnit.MILLISECONDS);
+                    if (cardinality == null)
+                        continue;
+                    long numToFlush = cardinality.cardinality();
+                    counters.numFlushed.addAndGet(numToFlush);
+                    generateSSTables(cardinality, numToFlush, "flushing", true);
+                }
+            }
+            catch (InterruptedException e)
+            {
+                logger.error("Exception happen during flushing", e);
+                Thread.currentThread().interrupt(); // restore the flag cleared when the exception was thrown
+            }
+        }
+
+        /**
+         * Perform the following until the simulation is tearing down and the compaction queue is empty:
+         *
+         * <li>Take compaction tasks from {@code compactions}</li>
+         * <li>Merge the cardinality of the txn sstables</li>
+         * <li>Generate a new merged sstable</li>
+         * <li>Pass it to the strategy and live sstables</li>
+         * <li>Check with the strategy if there is a new compaction task</li>
+         */
+        private void compactData()
+        {
+            try
+            {
+                while(state.get().ordinal() < SimulationState.TEARING_DOWN.ordinal() || !compactions.isEmpty())
+                {
+                    AbstractCompactionTask task = compactions.poll(1, TimeUnit.SECONDS);
+                    if (task == null)
+                        continue;
+
+                    LifecycleTransaction txn = task.transaction();
+                    Set<SSTableReader> candidates = txn.originals();
+                    for (SSTableReader candidate : candidates)
+                        counters.numReadForCompaction.addAndGet(candidate.keyCardinalityEstimator().cardinality());
+
+                    UUID id = txn.opId();
+
+                    ICardinality merged = getMerged(candidates);
+
+                    counters.numWrittenForCompaction.addAndGet(merged.cardinality());
+
+                    // first remove the input sstables to avoid overlaps when adding the new one for LCS
+                    dataTracker.removeUnsafe(candidates);
+                    dataTracker.removeCompactingUnsafe(candidates);
+
+                    // then create the new merged sstable
+                    generateSSTables(merged, merged.cardinality(), "compacting", false);
+
+                    // finally report the completed operation to the strategy and update the counters
+                    strategy.onCompleted(id);
+                    counters.numCompactions.incrementAndGet();
+                    counters.numCompactionsPending.decrementAndGet();
+                    counters.numCompactedSSTables.addAndGet(candidates.size());
+
+                    logger.debug("Executed {} compactions, live sstables: {}, compacting sstables: {}, compacted sstables: {}",
+                                 counters.numCompactions, dataTracker.getLiveSSTables().size(), dataTracker.getCompacting().size(), counters.numCompactedSSTables);
+
+                    maybeSubmitCompaction();
+
+                    txn.unsafeClose();
+                }
+                logger.debug("...completed monitoring compactions");
+            }
+            catch (InterruptedException | CardinalityMergeException e)
+            {
+                logger.error("Exception happen during compaction", e);
+                if (e instanceof InterruptedException)
+                    Thread.currentThread().interrupt(); // restore the flag cleared when the exception was thrown
+            }
+        }
+
+        /**
+         * Merge the cardinalities of the input sstables.
+         *
+         * @return the merged cardinality
+         * @throws CardinalityMergeException propagated from {@code ICardinality.merge}
+         * @throws IllegalArgumentException if {@code candidates} is empty
+         */
+        private ICardinality getMerged(Set<SSTableReader> candidates) throws CardinalityMergeException
+        {
+            if (candidates.isEmpty()) // otherwise the array size below would be negative
+                throw new IllegalArgumentException("Cannot merge the cardinalities of an empty set of sstables");
+            ICardinality[] others = new ICardinality[candidates.size() - 1];
+            ICardinality first = null;
+            int i = 0;
+            for (SSTableReader sstable : candidates)
+            {
+                if (first == null)
+                    first = sstable.keyCardinalityEstimator();
+                else
+                    others[i++] = sstable.keyCardinalityEstimator();
+            }
+            return first.merge(others);
+        }
+
+        /**
+         * Create a new cardinality with similar parameters as those used in {@link MetadataCollector}.
+         * See CASSANDRA-5906 for error and size details. Instead of using 13, 25 we use 12, 24 since that
+         * halves the memory used (2.7k instead of 5.5k for 10k entries) and we can tolerate a slightly larger error.
+         *
+         * @return a newly constructed cardinality
+         */
+        private ICardinality newCardinality()
+        {
+            return new HyperLogLogPlus(12, 24); // for real sstables in MetadataCollector we use 13, 25
+        }
+
+        /**
+         * Create one or more mocked sstables based on the cardinality received and the value size. The theoretical sstable size
+         * (numEntries * valueSize) will be split across multiple compaction shards.
+         *
+         * @param cardinality - the cardinality used to simulate the sstable
+         * @param numEntries - the total number of entries to write to disk
+         * @param reason - the reason (flushing, compacting, etc)
+         * @param checkForCompaction - if true we check if a compaction needs to be submitted
+         * @throws InterruptedException propagated from {@code maybeSubmitCompaction} when checking for a compaction
+         */
+        private void generateSSTables(ICardinality cardinality, long numEntries, String reason, boolean checkForCompaction) throws InterruptedException
+        {
+            // The theoretical sstable size that is being mocked
+            long sstableSize = numEntries * valueSize;
+
+            // The minimum sstable size and the compaction boundaries as dictated by the strategy
+            long minSStableSize = strategy.getController().getMinSstableSizeBytes();
+            List<PartitionPosition> boundaries = strategy.getShardBoundaries();
+
+            // If we didn't have a minimum sstable size, each shard would get an sstable segment of this size and the number of
+            // sstables would be the number of shards, but because we have a minimum sstable size, we need to put a floor of minSStableSize
+            long sizeAssignableToEachShard = Math.max(minSStableSize, (long) Math.ceil((double) sstableSize / boundaries.size())); // long: an int cast saturates at 2 GiB
+
+            // How many sstables to compose the original sstable size, by rounding down we disregard the final segment which would be < minSStableSize
+            int numSStables = Math.min(boundaries.size(), Math.max(1, (int) (sstableSize / sizeAssignableToEachShard)));
+
+            // spread the sstables over the boundaries
+            int numShardsCoveredByEachSStable = boundaries.size() / numSStables;
+
+            List<SSTableReader> sstables = new ArrayList<>(numSStables);
+            long keyCount = (long) Math.ceil(numEntries / (double) numSStables);
+            long bytesOnDisk = valueSize * keyCount;
+            long timestamp = System.currentTimeMillis();
+
+            Token min = partitioner.getMinimumToken();
+            int max = numShardsCoveredByEachSStable - 1;
+            for (int i = 0; i < numSStables; i++)
+            {
+                // use the max token for the last sstable, this is because we rounded down when calculating numShardsCoveredByEachSStable
+                if (i == numSStables -1)
+                    max = boundaries.size() - 1;
+                else
+                    assertTrue(max < boundaries.size());
+
+                DecoratedKey first = new BufferDecoratedKey(min, ByteBuffer.allocate(0));
+                DecoratedKey last = new BufferDecoratedKey(boundaries.get(max).getToken(), ByteBuffer.allocate(0));
+                SSTableReader sstable = mockSSTable(0, bytesOnDisk, timestamp, 0, first, last, 0, true, null, 0);
+                when(sstable.keyCardinalityEstimator()).thenReturn(cardinality);
+                sstables.add(sstable);
+
+                min = boundaries.get(max).getToken().increaseSlightly();
+                max += numShardsCoveredByEachSStable;
+            }
+
+            counters.numWritten.addAndGet(numEntries);
+            dataTracker.addInitialSSTablesWithoutUpdatingSize(sstables);
+            logger.debug("Generated {} new sstables for {}, live: {}, compacting: {}, tot sstable size {}, min sstable size {}, sizeAssignableToEachShard {}",
+                         sstables.size(), reason, dataTracker.getLiveSSTables().size(), dataTracker.getCompacting().size(),
+                         sstableSize, minSStableSize, sizeAssignableToEachShard);
+
+            if (checkForCompaction)
+                maybeSubmitCompaction();
+        }
+
+        /** Queues every background task the strategy returns right now and counts each one as a pending compaction. */
+        private void maybeSubmitCompaction() throws InterruptedException
+        {
+            for (AbstractCompactionTask next : strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()))
+            {
+                compactions.put(next);
+                counters.numCompactionsPending.incrementAndGet();
+                logger.debug("Submitted new compaction, live sstables: {}, compacting sstables: {}, compacted sstables: {}",
+                             dataTracker.getLiveSSTables().size(), dataTracker.getCompacting().size(), counters.numCompactedSSTables);
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyContainerPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyContainerPendingRepairTest.java
new file mode 100644
index 000000000000..86ec4f928af8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyContainerPendingRepairTest.java
@@ -0,0 +1,60 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.IOException;
+
+import org.junit.Ignore;
+
+/** Test cases for a CompactionStrategyContainer's handling of pending repair sstables, implemented by concrete test classes. */
+@Ignore
+public interface CompactionStrategyContainerPendingRepairTest
+{
+    /**
+     * Pending repair strategy should be created when we encounter a new pending id
+     */
+    void testSstableAdded() throws IOException;
+
+    void testSstableDeleted() throws IOException;
+
+    void testSstableListChangedAddAndRemove() throws IOException;
+
+    void testSstableRepairStatusChanged() throws IOException;
+
+    /**
+     * {@link CompactionStrategyContainer} should include
+     * pending repair strategies when appropriate
+     */
+    void testStrategiesContainsPendingRepair() throws IOException;
+
+    /**
+     * Tests that finalized repairs result in cleanup compaction tasks
+     * which reclassify the sstables as repaired
+     */
+    void testCleanupCompactionFinalized() throws IOException;
+
+    void testFinalizedSessionTransientCleanup() throws IOException;
+
+    void testFailedSessionTransientCleanup() throws IOException;
+
+    /** Tests that failed repairs result in cleanup compaction tasks which reclassify the sstables as unrepaired */
+    void testCleanupCompactionFailed() throws IOException;
+
+    void testSessionCompleted() throws IOException;
+
+    void testSessionCompletedWithDifferentSSTables() throws IOException;
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
index 3a804b63bfbe..692c1861544a 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerPendingRepairTest.java
@@ -18,10 +18,13 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.io.IOException;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
 import java.util.UUID;
 
+import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Iterables;
 import org.junit.Assert;
 import org.junit.Test;
@@ -35,51 +38,80 @@
 import org.apache.cassandra.service.ActiveRepairService;
 import org.apache.cassandra.utils.FBUtilities;
 
+import static org.junit.Assert.assertEquals;
+
 /**
- * Tests CompactionStrategyManager's handling of pending repair sstables
+ * Tests CompactionStrategyContainer's handling of pending repair sstables
  */
-public class CompactionStrategyManagerPendingRepairTest extends AbstractPendingRepairTest
+public class CompactionStrategyManagerPendingRepairTest extends AbstractPendingRepairTest implements CompactionStrategyContainerPendingRepairTest
 {
+    @Override
+    public String createTableCql()
+    {
+        return String.format("CREATE TABLE %s.%s (k INT PRIMARY KEY, v INT) ",
+                             ks, tbl);
+    }
+
 
     private boolean transientContains(SSTableReader sstable)
     {
-        return csm.getTransientRepairsUnsafe().containsSSTable(sstable);
+        return ((CompactionStrategyManager) compactionStrategyContainer)
+               .getTransientRepairsUnsafe()
+               .containsSSTable(sstable);
     }
 
     private boolean pendingContains(SSTableReader sstable)
     {
-        return csm.getPendingRepairsUnsafe().containsSSTable(sstable);
+        return ((CompactionStrategyManager) compactionStrategyContainer)
+               .getPendingRepairsUnsafe()
+               .containsSSTable(sstable);
     }
 
     private boolean repairedContains(SSTableReader sstable)
     {
-        return csm.getRepairedUnsafe().containsSSTable(sstable);
+        return ((CompactionStrategyManager) compactionStrategyContainer)
+               .getRepairedUnsafe()
+               .containsSSTable(sstable);
     }
 
     private boolean unrepairedContains(SSTableReader sstable)
     {
-        return csm.getUnrepairedUnsafe().containsSSTable(sstable);
+        return ((CompactionStrategyManager) compactionStrategyContainer)
+               .getUnrepairedUnsafe()
+               .containsSSTable(sstable);
     }
 
     private boolean hasPendingStrategiesFor(UUID sessionID)
     {
-        return !Iterables.isEmpty(csm.getPendingRepairsUnsafe().getStrategiesFor(sessionID));
+        return !Iterables.isEmpty(((CompactionStrategyManager) compactionStrategyContainer)
+                                  .getPendingRepairsUnsafe()
+                                  .getStrategiesFor(sessionID));
     }
 
     private boolean hasTransientStrategiesFor(UUID sessionID)
     {
-        return !Iterables.isEmpty(csm.getTransientRepairsUnsafe().getStrategiesFor(sessionID));
+        return !Iterables.isEmpty(((CompactionStrategyManager) compactionStrategyContainer)
+                                  .getTransientRepairsUnsafe()
+                                  .getStrategiesFor(sessionID));
+    }
+
+    private void assertCompactionStrategyManagerPendingRepairs(boolean expectedEmpty)
+    {
+        assertEquals(expectedEmpty, ((CompactionStrategyManager) cfs.getCompactionStrategy()).pendingRepairs().isEmpty());
     }
 
     /**
      * Pending repair strategy should be created when we encounter a new pending id
      */
+    @Override
     @Test
-    public void sstableAdded()
+    public void testSstableAdded() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
-        Assert.assertTrue(Iterables.isEmpty(csm.getPendingRepairsUnsafe().allStrategies()));
+        Assert.assertTrue(Iterables.isEmpty(((CompactionStrategyManager) compactionStrategyContainer)
+                                            .getPendingRepairsUnsafe()
+                                            .allStrategies()));
 
         SSTableReader sstable = makeSSTable(true);
         Assert.assertFalse(sstable.isRepaired());
@@ -92,7 +124,7 @@ public void sstableAdded()
         Assert.assertFalse(hasTransientStrategiesFor(repairID));
 
         // add the sstable
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         Assert.assertFalse(repairedContains(sstable));
         Assert.assertFalse(unrepairedContains(sstable));
         Assert.assertTrue(pendingContains(sstable));
@@ -100,8 +132,9 @@ public void sstableAdded()
         Assert.assertFalse(hasTransientStrategiesFor(repairID));
     }
 
+    @Override
     @Test
-    public void sstableListChangedAddAndRemove()
+    public void testSstableListChangedAddAndRemove() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -124,7 +157,7 @@ public void sstableListChangedAddAndRemove()
         notification = new SSTableListChangedNotification(Collections.singleton(sstable1),
                                                           Collections.emptyList(),
                                                           OperationType.COMPACTION);
-        csm.handleNotification(notification, cfs.getTracker());
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
 
         Assert.assertFalse(repairedContains(sstable1));
         Assert.assertFalse(unrepairedContains(sstable1));
@@ -139,7 +172,7 @@ public void sstableListChangedAddAndRemove()
         notification = new SSTableListChangedNotification(Collections.singleton(sstable2),
                                                           Collections.singleton(sstable1),
                                                           OperationType.COMPACTION);
-        csm.handleNotification(notification, cfs.getTracker());
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
 
         Assert.assertFalse(repairedContains(sstable1));
         Assert.assertFalse(unrepairedContains(sstable1));
@@ -149,8 +182,9 @@ public void sstableListChangedAddAndRemove()
         Assert.assertTrue(pendingContains(sstable2));
     }
 
+    @Override
     @Test
-    public void sstableRepairStatusChanged()
+    public void testSstableRepairStatusChanged() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -167,7 +201,7 @@ public void sstableRepairStatusChanged()
         // change to pending repaired
         mutateRepaired(sstable, repairID, false);
         notification = new SSTableRepairStatusChanged(Collections.singleton(sstable));
-        csm.handleNotification(notification, cfs.getTracker());
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
         Assert.assertFalse(unrepairedContains(sstable));
         Assert.assertFalse(repairedContains(sstable));
         Assert.assertTrue(hasPendingStrategiesFor(repairID));
@@ -177,26 +211,27 @@ public void sstableRepairStatusChanged()
         // change to repaired
         mutateRepaired(sstable, System.currentTimeMillis());
         notification = new SSTableRepairStatusChanged(Collections.singleton(sstable));
-        csm.handleNotification(notification, cfs.getTracker());
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
         Assert.assertFalse(unrepairedContains(sstable));
         Assert.assertTrue(repairedContains(sstable));
         Assert.assertFalse(pendingContains(sstable));
     }
 
+    @Override
     @Test
-    public void sstableDeleted()
+    public void testSstableDeleted() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
 
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, false);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         Assert.assertTrue(pendingContains(sstable));
 
         // delete sstable
         SSTableDeletingNotification notification = new SSTableDeletingNotification(sstable);
-        csm.handleNotification(notification, cfs.getTracker());
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
         Assert.assertFalse(pendingContains(sstable));
         Assert.assertFalse(unrepairedContains(sstable));
         Assert.assertFalse(repairedContains(sstable));
@@ -206,39 +241,35 @@ public void sstableDeleted()
      * CompactionStrategyManager.getStrategies should include
      * pending repair strategies when appropriate
      */
+    @Override
     @Test
-    public void getStrategies()
+    public void testStrategiesContainsPendingRepair() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
 
-        List<List<AbstractCompactionStrategy>> strategies;
-
-        strategies = csm.getStrategies();
-        Assert.assertEquals(3, strategies.size());
-        Assert.assertTrue(strategies.get(2).isEmpty());
+        Assert.assertTrue(compactionStrategyContainer.getStrategies(false, repairID).isEmpty());
 
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, false);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
 
-        strategies = csm.getStrategies();
-        Assert.assertEquals(3, strategies.size());
-        Assert.assertFalse(strategies.get(2).isEmpty());
+        Assert.assertFalse(compactionStrategyContainer.getStrategies(false, repairID).isEmpty());
     }
 
     /**
      * Tests that finalized repairs result in cleanup compaction tasks
      * which reclassify the sstables as repaired
      */
+    @Override
     @Test
-    public void cleanupCompactionFinalized()
+    public void testCleanupCompactionFinalized() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, false);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         LocalSessionAccessor.finalizeUnsafe(repairID);
         Assert.assertTrue(hasPendingStrategiesFor(repairID));
         Assert.assertFalse(hasTransientStrategiesFor(repairID));
@@ -246,10 +277,12 @@ public void cleanupCompactionFinalized()
         Assert.assertTrue(sstable.isPendingRepair());
         Assert.assertFalse(sstable.isRepaired());
 
-        cfs.getCompactionStrategyManager().enable(); // enable compaction to fetch next background task
-        AbstractCompactionTask compactionTask = csm.getNextBackgroundTask(FBUtilities.nowInSeconds());
+        cfs.getCompactionStrategyContainer().enable(); // enable compaction to fetch next background task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
         Assert.assertNotNull(compactionTask);
-        Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
+        Assert.assertSame(RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
         compactionTask.execute();
@@ -264,21 +297,22 @@ public void cleanupCompactionFinalized()
         long expectedRepairedAt = ActiveRepairService.instance.getParentRepairSession(repairID).repairedAt;
         Assert.assertFalse(sstable.isPendingRepair());
         Assert.assertTrue(sstable.isRepaired());
-        Assert.assertEquals(expectedRepairedAt, sstable.getSSTableMetadata().repairedAt);
+        assertEquals(expectedRepairedAt, sstable.getSSTableMetadata().repairedAt);
     }
 
     /**
      * Tests that failed repairs result in cleanup compaction tasks
      * which reclassify the sstables as unrepaired
      */
+    @Override
     @Test
-    public void cleanupCompactionFailed()
+    public void testCleanupCompactionFailed() throws IOException
     {
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, false);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         LocalSessionAccessor.failUnsafe(repairID);
 
         Assert.assertTrue(hasPendingStrategiesFor(repairID));
@@ -287,10 +321,12 @@ public void cleanupCompactionFailed()
         Assert.assertTrue(sstable.isPendingRepair());
         Assert.assertFalse(sstable.isRepaired());
 
-        cfs.getCompactionStrategyManager().enable(); // enable compaction to fetch next background task
-        AbstractCompactionTask compactionTask = csm.getNextBackgroundTask(FBUtilities.nowInSeconds());
+        cfs.getCompactionStrategyContainer().enable(); // enable compaction to fetch next background task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
         Assert.assertNotNull(compactionTask);
-        Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
+        Assert.assertSame(RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
         compactionTask.execute();
@@ -303,18 +339,117 @@ public void cleanupCompactionFailed()
         // sstable should have pendingRepair cleared, and repairedAt set correctly
         Assert.assertFalse(sstable.isPendingRepair());
         Assert.assertFalse(sstable.isRepaired());
-        Assert.assertEquals(ActiveRepairService.UNREPAIRED_SSTABLE, sstable.getSSTableMetadata().repairedAt);
+        assertEquals(ActiveRepairService.UNREPAIRED_SSTABLE, sstable.getSSTableMetadata().repairedAt);
+    }
+
+    @Override
+    @Test
+    public void testSessionCompleted() throws IOException
+    { // completing a finalized session must move its pending sstable into the repaired set
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+        assertCompactionStrategyManagerPendingRepairs(true);
+
+        // add sstable as unrepaired (makeSSTable is defined outside this hunk; presumably it also registers the sstable - TODO confirm)
+        final boolean isOrphan = false;
+        SSTableReader sstable = makeSSTable(isOrphan);
+
+        // change to pending repair for this session (last argument false: the asserts below expect a pending, not a transient, strategy)
+        mutateRepaired(sstable, repairID, false);
+        SSTableRepairStatusChanged notification = new SSTableRepairStatusChanged(Collections.singleton(sstable));
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker()); // tell the container about the status change
+        Assert.assertFalse(unrepairedContains(sstable));
+        Assert.assertFalse(repairedContains(sstable));
+        Assert.assertTrue(hasPendingStrategiesFor(repairID));
+        Assert.assertFalse(hasTransientStrategiesFor(repairID));
+        Assert.assertTrue(pendingContains(sstable));
+
+        // finalize the session
+        LocalSessionAccessor.finalizeUnsafe(repairID);
+
+        // complete the session; this is the call under test
+        ARS.consistent.local.sessionCompleted(ARS.consistent.local.getSession(repairID));
+
+        // sstable is now repaired: it is in neither the unrepaired nor the pending set
+        Assert.assertFalse(unrepairedContains(sstable));
+        Assert.assertTrue(repairedContains(sstable));
+        Assert.assertFalse(pendingContains(sstable));
+    }
+
+    @Override
+    @Test
+    public void testSessionCompletedWithDifferentSSTables() throws IOException
+    { // completing session 1 must mark its sstable repaired while session 2's sstable stays pending
+        UUID repairID1 = registerSession(cfs, true, true);
+        UUID repairID2 = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID1, COORDINATOR, PARTICIPANTS);
+        LocalSessionAccessor.prepareUnsafe(repairID2, COORDINATOR, PARTICIPANTS);
+        assertCompactionStrategyManagerPendingRepairs(true);
+
+        // add three sstables; each one is checked to start out as unrepaired
+        final boolean isOrphan = false;
+        SSTableReader sstable1 = makeSSTable(isOrphan);
+        Assert.assertTrue(unrepairedContains(sstable1));
+
+        SSTableReader sstable2 = makeSSTable(isOrphan);
+        Assert.assertTrue(unrepairedContains(sstable2));
+
+        SSTableReader sstable3 = makeSSTable(isOrphan);
+        Assert.assertTrue(unrepairedContains(sstable3));
+
+        // change sstable1 to pending repair for session 1 and notify the container
+        mutateRepaired(sstable1, repairID1, false);
+        SSTableRepairStatusChanged notification = new SSTableRepairStatusChanged(ImmutableList.of(sstable1));
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertTrue(sstable1.isPendingRepair());
+        Assert.assertTrue(hasPendingStrategiesFor(repairID1));
+        Assert.assertFalse(hasTransientStrategiesFor(repairID1));
+
+        // change sstable2 to pending repair for session 2 and notify the container
+        mutateRepaired(sstable2, repairID2, false);
+        notification = new SSTableRepairStatusChanged(ImmutableList.of(sstable2));
+        compactionStrategyContainer.handleNotification(notification, cfs.getTracker());
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertTrue(sstable2.isPendingRepair());
+        Assert.assertTrue(hasPendingStrategiesFor(repairID2));
+        Assert.assertFalse(hasTransientStrategiesFor(repairID2));
+
+        // change sstable3 to repaired (no SSTableRepairStatusChanged notification is sent for it in this test)
+        mutateRepaired(sstable3, System.currentTimeMillis());
+        Assert.assertTrue(sstable3.isRepaired());
+        Assert.assertFalse(sstable3.isPendingRepair());
+
+        // finalize session 1
+        LocalSessionAccessor.finalizeUnsafe(repairID1);
+
+        // simulate compaction on repaired sstable3; NOTE(review): the tryModify result is discarded, presumably leaving sstable3 marked compacting - confirm
+        cfs.getTracker().tryModify(sstable3, OperationType.COMPACTION);
+
+        // completing session 1 should not require disabling compactions because:
+        // * sstable2 belongs to a different session
+        // * sstable3 is repaired
+        ARS.consistent.local.sessionCompleted(ARS.consistent.local.getSession(repairID1));
+
+        // now sstable1 and sstable3 are repaired, while sstable2 is still pending repair
+        Assert.assertTrue(sstable1.isRepaired());
+        Assert.assertTrue(sstable3.isRepaired());
+        Assert.assertTrue(sstable2.isPendingRepair());
+
+        assertEquals(Collections.singleton(repairID2),
+                     ((CompactionStrategyManager) compactionStrategyContainer).pendingRepairs()); // only session 2 is still pending
     }
 
+    @Override
     @Test
-    public void finalizedSessionTransientCleanup()
+    public void testFinalizedSessionTransientCleanup() throws IOException
     {
         Assert.assertTrue(cfs.getLiveSSTables().isEmpty());
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, true);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         LocalSessionAccessor.finalizeUnsafe(repairID);
 
         Assert.assertFalse(hasPendingStrategiesFor(repairID));
@@ -324,10 +459,12 @@ public void finalizedSessionTransientCleanup()
         Assert.assertFalse(repairedContains(sstable));
         Assert.assertFalse(unrepairedContains(sstable));
 
-        cfs.getCompactionStrategyManager().enable(); // enable compaction to fetch next background task
-        AbstractCompactionTask compactionTask = csm.getNextBackgroundTask(FBUtilities.nowInSeconds());
+        cfs.getCompactionStrategyContainer().enable(); // enable compaction to fetch next background task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
         Assert.assertNotNull(compactionTask);
-        Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
+        Assert.assertSame(RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
         compactionTask.execute();
@@ -337,15 +474,16 @@ public void finalizedSessionTransientCleanup()
         Assert.assertFalse(hasTransientStrategiesFor(repairID));
     }
 
+    @Override
     @Test
-    public void failedSessionTransientCleanup()
+    public void testFailedSessionTransientCleanup() throws IOException
     {
         Assert.assertTrue(cfs.getLiveSSTables().isEmpty());
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairID, true);
-        csm.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
+        compactionStrategyContainer.handleNotification(new SSTableAddedNotification(Collections.singleton(sstable), null), cfs.getTracker());
         LocalSessionAccessor.failUnsafe(repairID);
 
         Assert.assertFalse(hasPendingStrategiesFor(repairID));
@@ -355,10 +493,12 @@ public void failedSessionTransientCleanup()
         Assert.assertFalse(repairedContains(sstable));
         Assert.assertFalse(unrepairedContains(sstable));
 
-        cfs.getCompactionStrategyManager().enable(); // enable compaction to fetch next background task
-        AbstractCompactionTask compactionTask = csm.getNextBackgroundTask(FBUtilities.nowInSeconds());
+        cfs.getCompactionStrategyContainer().enable(); // enable compaction to fetch next background task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
         Assert.assertNotNull(compactionTask);
-        Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
+        Assert.assertSame(RepairFinishedCompactionTask.class, compactionTask.getClass());
 
         // run the compaction
         compactionTask.execute();
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
index 3ef18b3f69dc..8c501bce1466 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java
@@ -26,8 +26,6 @@
 import java.util.List;
 import java.util.Set;
 import java.util.UUID;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.Iterables;
@@ -38,7 +36,6 @@
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
-
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -52,6 +49,7 @@
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.PartitionPosition;
 import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.SortedLocalRanges;
 import org.apache.cassandra.db.compaction.AbstractStrategyHolder.GroupedSSTableContainer;
 import org.apache.cassandra.dht.ByteOrderedPartitioner;
 import org.apache.cassandra.dht.IPartitioner;
@@ -63,6 +61,7 @@
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.UUIDGen;
+import org.mockito.Mockito;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -70,6 +69,7 @@
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.when;
 
 public class CompactionStrategyManagerTest
 {
@@ -129,12 +129,12 @@ public void testSSTablesAssignedToCorrectCompactionStrategy() throws IOException
             if (i % 3 == 0)
             {
                 //make 1 third of sstables repaired
-                cfs.getCompactionStrategyManager().mutateRepaired(newSSTables, System.currentTimeMillis(), null, false);
+                cfs.mutateRepaired(newSSTables, System.currentTimeMillis(), null, false);
             }
             else if (i % 3 == 1)
             {
                 //make 1 third of sstables pending repair
-                cfs.getCompactionStrategyManager().mutateRepaired(newSSTables, 0, UUIDGen.getTimeUUID(), false);
+                cfs.mutateRepaired(newSSTables, 0, UUIDGen.getTimeUUID(), false);
             }
             previousSSTables = currentSSTables;
         }
@@ -151,6 +151,7 @@ public void testSSTablesAssignedToCorrectCompactionStrategy(int numSSTables, int
     {
         // Create a mock CFS with the given number of disks
         MockCFS cfs = createJBODMockCFS(numDisks);
+        CompactionStrategyFactory strategyFactory = new CompactionStrategyFactory(cfs);
         //Check that CFS will contain numSSTables
         assertEquals(numSSTables, cfs.getLiveSSTables().size());
 
@@ -159,9 +160,10 @@ public void testSSTablesAssignedToCorrectCompactionStrategy(int numSSTables, int
 
         MockBoundaryManager mockBoundaryManager = new MockBoundaryManager(cfs, boundaries);
         logger.debug("Boundaries for {} disks is {}", numDisks, Arrays.toString(boundaries));
-        CompactionStrategyManager csm = new CompactionStrategyManager(cfs, mockBoundaryManager::getBoundaries,
+        CompactionStrategyManager csm = new CompactionStrategyManager(strategyFactory,
+                                                                      mockBoundaryManager::getBoundaries,
                                                                       true);
-        csm.reload(cfs.metadata().params.compaction);
+        csm.reload(csm, cfs.metadata().params.compaction, CompactionStrategyContainer.ReloadReason.FULL);
 
         // Check that SSTables are assigned to the correct Compaction Strategy
         for (SSTableReader reader : cfs.getLiveSSTables())
@@ -201,82 +203,10 @@ public void testSSTablesAssignedToCorrectCompactionStrategy(int numSSTables, int
         }
     }
 
-    @Test
-    public void testAutomaticUpgradeConcurrency() throws Exception
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
-        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(true);
-        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(1);
-
-        // latch to block CompactionManager.BackgroundCompactionCandidate#maybeRunUpgradeTask
-        // inside the currentlyBackgroundUpgrading check - with max_concurrent_auto_upgrade_tasks = 1 this will make
-        // sure that BackgroundCompactionCandidate#maybeRunUpgradeTask returns false until the latch has been counted down
-        CountDownLatch latch = new CountDownLatch(1);
-        AtomicInteger upgradeTaskCount = new AtomicInteger(0);
-        MockCFSForCSM mock = new MockCFSForCSM(cfs, latch, upgradeTaskCount);
-
-        CompactionManager.BackgroundCompactionCandidate r = CompactionManager.instance.getBackgroundCompactionCandidate(mock);
-        CompactionStrategyManager mgr = mock.getCompactionStrategyManager();
-        // basic idea is that we start a thread which will be able to get in to the currentlyBackgroundUpgrading-guarded
-        // code in CompactionManager, then we try to run a bunch more of the upgrade tasks which should return false
-        // due to the currentlyBackgroundUpgrading count being >= max_concurrent_auto_upgrade_tasks
-        Thread t = new Thread(() -> r.maybeRunUpgradeTask(mgr));
-        t.start();
-        Thread.sleep(100); // let the thread start and grab the task
-        assertEquals(1, CompactionManager.instance.currentlyBackgroundUpgrading.get());
-        assertFalse(r.maybeRunUpgradeTask(mgr));
-        assertFalse(r.maybeRunUpgradeTask(mgr));
-        latch.countDown();
-        t.join();
-        assertEquals(1, upgradeTaskCount.get()); // we should only call findUpgradeSSTableTask once when concurrency = 1
-        assertEquals(0, CompactionManager.instance.currentlyBackgroundUpgrading.get());
-
-        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(false);
-    }
-
-    @Test
-    public void testAutomaticUpgradeConcurrency2() throws Exception
-    {
-        ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
-        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(true);
-        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(2);
-        // latch to block CompactionManager.BackgroundCompactionCandidate#maybeRunUpgradeTask
-        // inside the currentlyBackgroundUpgrading check - with max_concurrent_auto_upgrade_tasks = 1 this will make
-        // sure that BackgroundCompactionCandidate#maybeRunUpgradeTask returns false until the latch has been counted down
-        CountDownLatch latch = new CountDownLatch(1);
-        AtomicInteger upgradeTaskCount = new AtomicInteger();
-        MockCFSForCSM mock = new MockCFSForCSM(cfs, latch, upgradeTaskCount);
-
-        CompactionManager.BackgroundCompactionCandidate r = CompactionManager.instance.getBackgroundCompactionCandidate(mock);
-        CompactionStrategyManager mgr = mock.getCompactionStrategyManager();
-
-        // basic idea is that we start 2 threads who will be able to get in to the currentlyBackgroundUpgrading-guarded
-        // code in CompactionManager, then we try to run a bunch more of the upgrade task which should return false
-        // due to the currentlyBackgroundUpgrading count being >= max_concurrent_auto_upgrade_tasks
-        Thread t = new Thread(() -> r.maybeRunUpgradeTask(mgr));
-        t.start();
-        Thread t2 = new Thread(() -> r.maybeRunUpgradeTask(mgr));
-        t2.start();
-        Thread.sleep(100); // let the threads start and grab the task
-        assertEquals(2, CompactionManager.instance.currentlyBackgroundUpgrading.get());
-        assertFalse(r.maybeRunUpgradeTask(mgr));
-        assertFalse(r.maybeRunUpgradeTask(mgr));
-        assertFalse(r.maybeRunUpgradeTask(mgr));
-        assertEquals(2, CompactionManager.instance.currentlyBackgroundUpgrading.get());
-        latch.countDown();
-        t.join();
-        t2.join();
-        assertEquals(2, upgradeTaskCount.get());
-        assertEquals(0, CompactionManager.instance.currentlyBackgroundUpgrading.get());
-
-        DatabaseDescriptor.setMaxConcurrentAutoUpgradeTasks(1);
-        DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(false);
-    }
-
     private static void assertHolderExclusivity(boolean isRepaired, boolean isPendingRepair, boolean isTransient, Class<? extends AbstractStrategyHolder> expectedType)
     {
         ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
-        CompactionStrategyManager csm = cfs.getCompactionStrategyManager();
+        CompactionStrategyManager csm = (CompactionStrategyManager) cfs.getCompactionStrategy();
 
         AbstractStrategyHolder holder = csm.getHolder(isRepaired, isPendingRepair, isTransient);
         assertNotNull(holder);
@@ -297,7 +227,7 @@ private static void assertHolderExclusivity(boolean isRepaired, boolean isPendin
     private static void assertInvalieHolderConfig(boolean isRepaired, boolean isPendingRepair, boolean isTransient)
     {
         ColumnFamilyStore cfs = Keyspace.open(KS_PREFIX).getColumnFamilyStore(TABLE_PREFIX);
-        CompactionStrategyManager csm = cfs.getCompactionStrategyManager();
+        CompactionStrategyManager csm = (CompactionStrategyManager) cfs.getCompactionStrategy();
         try
         {
             csm.getHolder(isRepaired, isPendingRepair, isTransient);
@@ -340,6 +270,7 @@ public void groupSSTables() throws Exception
     {
         final int numDir = 4;
         ColumnFamilyStore cfs = createJBODMockCFS(numDir);
+        CompactionStrategyFactory strategyFactory = new CompactionStrategyFactory(cfs);
         Keyspace.open(cfs.keyspace.getName()).getColumnFamilyStore(cfs.name).disableAutoCompaction();
         assertTrue(cfs.getLiveSSTables().isEmpty());
         List<SSTableReader> transientRepairs = new ArrayList<>();
@@ -356,16 +287,20 @@ public void groupSSTables() throws Exception
             repaired.add(createSSTableWithKey(cfs.keyspace.getName(), cfs.name, key++));
         }
 
-        cfs.getCompactionStrategyManager().mutateRepaired(transientRepairs, 0, UUID.randomUUID(), true);
-        cfs.getCompactionStrategyManager().mutateRepaired(pendingRepair, 0, UUID.randomUUID(), false);
-        cfs.getCompactionStrategyManager().mutateRepaired(repaired, 1000, null, false);
+        cfs.mutateRepaired(transientRepairs, 0, UUID.randomUUID(), true);
+        cfs.mutateRepaired(pendingRepair, 0, UUID.randomUUID(), false);
+        cfs.mutateRepaired(repaired, 1000, null, false);
+
+
+        SortedLocalRanges localRanges = Mockito.mock(SortedLocalRanges.class);
+        when(localRanges.getRingVersion()).thenReturn(10L);
 
         DiskBoundaries boundaries = new DiskBoundaries(cfs, cfs.getDirectories().getWriteableLocations(),
                                                        Lists.newArrayList(forKey(100), forKey(200), forKey(300)),
-                                                       10, 10);
+                                                       localRanges, 10);
 
-        CompactionStrategyManager csm = new CompactionStrategyManager(cfs, () -> boundaries, true);
-        csm.reload(cfs.metadata().params.compaction);
+        CompactionStrategyManager csm = new CompactionStrategyManager(strategyFactory, () -> boundaries, true);
+        csm.reload(csm, cfs.metadata().params.compaction, CompactionStrategyContainer.ReloadReason.FULL);
 
         List<GroupedSSTableContainer> grouped = csm.groupSSTables(Iterables.concat( transientRepairs, pendingRepair, repaired, unrepaired));
 
@@ -438,7 +373,7 @@ private void verifySSTableIsAssignedToCorrectStrategy(Integer[] boundaries, Comp
         int index = getSSTableIndex(boundaries, reader);
         assertEquals(index, csm.compactionStrategyIndexFor(reader));
         // Check that compaction strategy actually contains SSTable
-        assertTrue(((SizeTieredCompactionStrategy)csm.compactionStrategyFor(reader)).sstables.contains(reader));
+        assertTrue(((SizeTieredCompactionStrategy) csm.compactionStrategyFor(reader)).sstables.contains(reader));
     }
 
     /**
@@ -468,9 +403,7 @@ private int getSSTableIndex(Integer[] boundaries, SSTableReader reader)
         return index;
     }
 
-
-
-    class MockBoundaryManager
+    private class MockBoundaryManager
     {
         private final ColumnFamilyStore cfs;
         private Integer[] positions;
@@ -498,7 +431,9 @@ public DiskBoundaries getBoundaries()
         private DiskBoundaries createDiskBoundaries(ColumnFamilyStore cfs, Integer[] boundaries)
         {
             List<PartitionPosition> positions = Arrays.stream(boundaries).map(b -> Util.token(String.format(String.format("%04d", b))).minKeyBound()).collect(Collectors.toList());
-            return new DiskBoundaries(cfs, cfs.getDirectories().getWriteableLocations(), positions, 0, 0);
+            SortedLocalRanges localRanges = Mockito.mock(SortedLocalRanges.class);
+            when(localRanges.getRingVersion()).thenReturn(0L);
+            return new DiskBoundaries(cfs, cfs.getDirectories().getWriteableLocations(), positions, localRanges, 0);
         }
     }
 
@@ -526,50 +461,4 @@ private static class MockCFS extends ColumnFamilyStore
             super(cfs.keyspace, cfs.getTableName(), 0, cfs.metadata, dirs, false, false, true);
         }
     }
-
-    private static class MockCFSForCSM extends ColumnFamilyStore
-    {
-        private final CountDownLatch latch;
-        private final AtomicInteger upgradeTaskCount;
-
-        private MockCFSForCSM(ColumnFamilyStore cfs, CountDownLatch latch, AtomicInteger upgradeTaskCount)
-        {
-            super(cfs.keyspace, cfs.name, 10, cfs.metadata, cfs.getDirectories(), true, false, false);
-            this.latch = latch;
-            this.upgradeTaskCount = upgradeTaskCount;
-        }
-        @Override
-        public CompactionStrategyManager getCompactionStrategyManager()
-        {
-            return new MockCSM(this, latch, upgradeTaskCount);
-        }
-    }
-
-    private static class MockCSM extends CompactionStrategyManager
-    {
-        private final CountDownLatch latch;
-        private final AtomicInteger upgradeTaskCount;
-
-        private MockCSM(ColumnFamilyStore cfs, CountDownLatch latch, AtomicInteger upgradeTaskCount)
-        {
-            super(cfs);
-            this.latch = latch;
-            this.upgradeTaskCount = upgradeTaskCount;
-        }
-
-        @Override
-        public AbstractCompactionTask findUpgradeSSTableTask()
-        {
-            try
-            {
-                latch.await();
-                upgradeTaskCount.incrementAndGet();
-            }
-            catch (InterruptedException e)
-            {
-                throw new RuntimeException(e);
-            }
-            return null;
-        }
-    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
index aad554d39e51..e6ad1b04dc3e 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyStatisticsTest.java
@@ -18,21 +18,19 @@
 
 package org.apache.cassandra.db.compaction;
 
-import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
-import java.util.Random;
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicReference;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.ImmutableList;
@@ -41,208 +39,149 @@
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import org.apache.cassandra.dht.Murmur3Partitioner;
-import org.mockito.Mock;
-import org.mockito.Mockito;
-import org.mockito.MockitoAnnotations;
-
-import org.apache.cassandra.config.DatabaseDescriptor;
-import org.apache.cassandra.db.BufferDecoratedKey;
-import org.apache.cassandra.db.ColumnFamilyStore;
-import org.apache.cassandra.db.DecoratedKey;
-import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
-import org.apache.cassandra.db.lifecycle.SSTableSet;
-import org.apache.cassandra.db.lifecycle.Tracker;
-import org.apache.cassandra.db.marshal.AsciiType;
+import org.apache.cassandra.db.compaction.unified.Controller;
 import org.apache.cassandra.dht.Bounds;
-import org.apache.cassandra.dht.IPartitioner;
-import org.apache.cassandra.dht.Range;
-import org.apache.cassandra.dht.Splitter;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
-import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.FBUtilities;
 import org.apache.cassandra.utils.Pair;
+import org.mockito.Mockito;
 
 import static org.apache.cassandra.db.compaction.LeveledManifest.MAX_COMPACTING_L0;
 import static org.junit.Assert.*;
-import static org.mockito.ArgumentMatchers.anyIterable;
-import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.Mockito.when;
 
 /**
  * Test for the compaction statistics for all strategies that support them.
  */
-public class CompactionStrategyStatisticsTest
+public class CompactionStrategyStatisticsTest extends BaseCompactionStrategyTest
 {
-    private static final double epsilon = 0.00000001;
-    private static final Random random = new Random(87689624525L);
-    private static final AtomicInteger generation = new AtomicInteger(1);
-
-    private final String keyspace = "ks";
-    private final String table = "table";
     private final int minCompactionThreshold = 4;
     private final int maxCompactionThreshold = 32;
-    private final long minSSTableSize = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE;
-
-    private long repairedAt;
-
-    @Mock
-    private ColumnFamilyStore cfs;
-
-    @Mock
-    private Tracker dataTracker;
-
-    @Mock
-    private CompactionStrategyManager strategyManager;
-
-    private CompactionLogger compactionLogger;
 
     @BeforeClass
     public static void setUpClass()
     {
-        DatabaseDescriptor.daemonInitialization(); // because of all the static initialization in CFS
-        DatabaseDescriptor.setPartitionerUnsafe(Murmur3Partitioner.instance);
+        BaseCompactionStrategyTest.setUpClass();
     }
 
     @Before
     public void setUp()
     {
-        MockitoAnnotations.initMocks(this);
-
-        TableMetadata metadata = TableMetadata.builder(keyspace, table)
-                                              .addPartitionKeyColumn("pk", AsciiType.instance)
-                                              .build();
-        repairedAt = System.currentTimeMillis();
+        super.setUp();
 
         when(cfs.getMinimumCompactionThreshold()).thenReturn(minCompactionThreshold);
         when(cfs.getMaximumCompactionThreshold()).thenReturn(maxCompactionThreshold);
-        when(cfs.metadata()).thenReturn(metadata);
-        when(cfs.getKeyspaceName()).thenReturn(keyspace);
-        when(cfs.getTableName()).thenReturn(table);
-        when(cfs.getTracker()).thenReturn(dataTracker);
-        when(cfs.getPartitioner()).thenReturn(DatabaseDescriptor.getPartitioner());
-        when(cfs.getCompactionStrategyManager()).thenReturn(strategyManager);
-
-        // use a real compaction logger to execute that code too even though we don't really check
-        // the content of the files, at least we cover the code. The files will be overwritten next
-        // time the test is run or by a gradle clean task, so they will not grow indefinitely
-        compactionLogger = new CompactionLogger(cfs, strategyManager);
-        compactionLogger.enable();
-        when(strategyManager.compactionLogger()).thenReturn(compactionLogger);
-    }
-
-    private void addSizeTieredOptions(Map<String, String> options)
-    {
-        options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, Long.toString(minSSTableSize));
-        options.put(SizeTieredCompactionStrategyOptions.BUCKET_LOW_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_LOW));
-        options.put(SizeTieredCompactionStrategyOptions.BUCKET_HIGH_KEY, Double.toString(SizeTieredCompactionStrategyOptions.DEFAULT_BUCKET_HIGH));
-    }
-
-    private void addTimeTieredOptions(Map<String, String> options)
-    {
-        addSizeTieredOptions(options);
-
-        options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, TimeUnit.MILLISECONDS.toString());
-        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_SIZE_KEY, "30");
-        options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "MINUTES");
-        options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, Long.toString(Long.MAX_VALUE)); // disable check for expired sstables
     }
 
-    private void addLeveledOptions(Map<String, String> options, long maxSSTableSizeBytes)
-    {
-        addSizeTieredOptions(options);
-
-        options.put(LeveledCompactionStrategy.SSTABLE_SIZE_OPTION, Long.toString(maxSSTableSizeBytes >> 20)); // Bytes to MB
-        options.put(LeveledCompactionStrategy.LEVEL_FANOUT_SIZE_OPTION, "10");
-    }
-
-    private SSTableReader mockSSTable(int level, long bytesOnDisk, long timestamp, double hotness, DecoratedKey first, DecoratedKey last)
-    {
-        SSTableReader ret = Mockito.mock(SSTableReader.class);
-
-        when(ret.bytesOnDisk()).thenReturn(bytesOnDisk);
-        when(ret.onDiskLength()).thenReturn(bytesOnDisk);
-        when(ret.uncompressedLength()).thenReturn(bytesOnDisk); // let's assume no compression
-        when(ret.hotness()).thenReturn(hotness);
-        when(ret.getSSTableLevel()).thenReturn(level);
-        when(ret.getMaxTimestamp()).thenReturn(timestamp);
-        when(ret.getMinTimestamp()).thenReturn(timestamp);
-        when(ret.getFirst()).thenReturn(first);
-        when(ret.getLast()).thenReturn(last);
-        when(ret.isMarkedSuspect()).thenReturn(false);
-        when(ret.isRepaired()).thenReturn(true);
-        when(ret.getRepairedAt()).thenReturn(repairedAt);
-        when(ret.getGeneration()).thenReturn(generation.getAndIncrement());
-        when(ret.toString()).thenReturn(String.format("Bytes on disk: %s, level %d, hotness %f, timestamp %d, first %s, last %s",
-                                                      FBUtilities.prettyPrintMemory(bytesOnDisk), level, hotness, timestamp, first, last));
-
-        return ret;
-    }
-
-    private List<SSTableReader> mockSSTables(int numSSTables, long bytesOnDisk, double hotness, long timestamp)
-    {
-        IPartitioner partitioner = cfs.getPartitioner();
-        DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
-        DecoratedKey last = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
-
-        List<SSTableReader> sstables = new ArrayList<>();
-        for (int i = 0; i < numSSTables; i++)
-        {
-            long b = (long)(bytesOnDisk * 0.8 + bytesOnDisk * 0.05 * random.nextDouble()); // leave 5% variability
-            double h = hotness * 0.8 + hotness * 0.05 * random.nextDouble(); // leave 5% variability
-            sstables.add(mockSSTable(0, b, timestamp, h, first, last));
-        }
-
-        return sstables;
-    }
-
-    private List<SSTableReader> mockNonOverlappingSSTables(int numSSTables, int level, long bytesOnDisk)
+    /**
+     * Creates 5 buckets with T sorted runs in each using W = 2 and o = 1 (the default)
+     */
+    @Test
+    public void testUnifiedCompactionStrategy_tiered_twoShards_fiveBuckets_W2()
     {
-        IPartitioner partitioner = cfs.getPartitioner(); // mocked same as DD.getPartitioner()
-        if (!partitioner.splitter().isPresent())
-            fail(String.format("Cannot split ranges with current partitioner %s", partitioner));
-
-        Range<Token> range = new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken());
-        Splitter.WeightedRange weightedRange = new Splitter.WeightedRange(1.0, range);
-        Splitter splitter = partitioner.splitter().get();
-        List<Token> boundaries = splitter.splitOwnedRanges(numSSTables,
-                                                           ImmutableList.of(weightedRange),
-                                                           false);
-        assertEquals(numSSTables, boundaries.size());
-        boundaries.add(0, partitioner.getMinimumToken());
-        ByteBuffer emptyBuffer = ByteBuffer.allocate(0);
-
-        long timestamp = System.currentTimeMillis();
-        List<SSTableReader> sstables = new ArrayList<>(numSSTables);
-        for (int i = 0; i < numSSTables; i++)
+        int W = 2; // W = 2 => T = F = 4
+        int T = 4;
+        int F = 4;
+        final long minSstableSizeBytes = 2L << 20; // 2 MB
+        final int numBuckets = 5;
+
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getScalingParameter(anyInt())).thenReturn(W);
+        when(controller.getFanout(anyInt())).thenReturn(F);
+        when(controller.getThreshold(anyInt())).thenReturn(T);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minSstableSizeBytes);
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minSstableSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.random()).thenCallRealMethod();
+        // Calculate the minimum shard size such that the top bucket compactions won't be considered "oversized" and
+        // all will be allowed to run. The calculation below assumes (1) that compactions are considered "oversized"
+        // if they are more than 1/2 of the max shard size; (2) that mockSSTables uses 15% less than the max SSTable
+        // size for that bucket.
+        long topBucketMaxSstableSize = (long) (minSstableSizeBytes * Math.pow(F, numBuckets));
+        long minShardSizeWithoutOversizedCompactions = T * topBucketMaxSstableSize * 2;
+        when(controller.getShardSizeBytes()).thenReturn(minShardSizeWithoutOversizedCompactions);
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numBuckets * 2);
+
+        // The order is repaired false, disk 0, then repaired true, disk 1, one bucket per shard, lowest to highest.
+        // Because the test picks from the end of the test buckets, we need to reverse this order
+        for (int i = numBuckets - 1; i >= 0; i--)
         {
-            DecoratedKey first = new BufferDecoratedKey(boundaries.get(i).increaseSlightly(), emptyBuffer);
-            DecoratedKey last =  new BufferDecoratedKey(boundaries.get(i+1), emptyBuffer);
-            sstables.add(mockSSTable(level, bytesOnDisk, timestamp, 0., first, last));
-
-            timestamp+=10;
+            for (boolean repaired : new boolean[] { false, true })
+            {
+                for (int diskIndex = 1; diskIndex >= 0; diskIndex--)
+                {
+                    // calculate the max size; mockSSTables will then take 15% off this value,
+                    // this assumes o = 1, which is the default
+                    long size = (long) (minSstableSizeBytes * Math.pow(F, i + 1));
+                    List<SSTableReader> sstables = mockSSTables(T,
+                                                                size,
+                                                                0,
+                                                                System.currentTimeMillis(),
+                                                                diskIndex,
+                                                                repaired,
+                                                                null);
+                    testBuckets.add(sstables);
+                }
+            }
         }
 
-        return sstables;
-    }
-
-    private long totUncompressedLength(Collection<SSTableReader> sstables)
-    {
-        long ret = 0;
-        for (SSTableReader sstable : sstables)
-            ret += sstable.uncompressedLength();
-
-        return ret;
+        testCompactionStatistics(testBuckets, strategy);
     }
 
-    private double totHotness(Collection<SSTableReader> sstables)
+    /**
+     * Creates a single bucket holding F sstables using W = -6 (leveled: T = 2, F = 8) and expects one compaction
+     */
+    @Test
+    public void testUnifiedCompactionStrategy_leveled_one_shard_oneBucket_F8()
     {
-        double ret = 0;
-        for (SSTableReader sstable : sstables)
-            ret += sstable.hotness();
-
-        return ret;
+        int W = -6; // W = -6 => T = 2, F = 8
+        int T = 2;
+        int F = 8;
+        int m = 2; // m = 2 MB
+        long minSize = m << 20; // MB to bytes
+
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getScalingParameter(anyInt())).thenReturn(W);
+        when(controller.getFanout(anyInt())).thenReturn(F);
+        when(controller.getThreshold(anyInt())).thenReturn(T);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minSize);
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minSize);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.random()).thenCallRealMethod();
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+
+        // put F sstables in the first bucket
+        List<SSTableReader> ssTablesList = new LinkedList<>();
+        for (int i = 0; i < F; i++)
+            ssTablesList.addAll(mockSSTables(1, minSize, 0, System.currentTimeMillis()));
+
+        Collections.sort(ssTablesList, Comparator.comparing(SSTableReader::onDiskLength).reversed());
+        Set<SSTableReader> sstables = new LinkedHashSet<>(F);
+        sstables.addAll(ssTablesList);
+
+        // sort by size and add 2 by 2 from largest to smallest, normally the sstable resulting from the 1 compaction
+        // would be added back to the same bucket and be selected for the next compaction but we don't simulate this
+        // so next time a compaction is invoked it will pick the next two largest sstables, that's why there will be
+        // F/2 compactions rather than F-1
+//        LinkedList<Collection<SSTableReader>> compactions = new LinkedList();
+//        for (int i = 0; (i + T) <= ssTablesList.size(); i += T)
+//        {
+//            List<SSTableReader> candidates = ssTablesList.subList(i, i + T);
+//            compactions.addFirst(candidates); // we want the first 2 sstables (the largest) to be the last in the list
+//        }
+
+        testCompactionStatistics(sstables, ImmutableList.of(sstables), 1, strategy);
     }
 
     /**
@@ -255,10 +194,10 @@ public void testSizeTieredCompactionStrategy_fiveBucketsOnePick()
         Map<String, String> options = new HashMap<>();
         addSizeTieredOptions(options);
 
-        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(strategyFactory, options);
 
         final int numCompactions = 5;
-        long minSize = minSSTableSize;
+        long minSize = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE;
         double hotness = 1000;
 
         List<Collection<SSTableReader>> testBuckets = new ArrayList<>(numCompactions);
@@ -286,7 +225,7 @@ public void testSizeTieredCompactionStrategy_oneBucketFivePicks()
         Map<String, String> options = new HashMap<>();
         addSizeTieredOptions(options);
 
-        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(strategyFactory, options);
 
         final int numCompactions = 5;
         long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 2;
@@ -316,7 +255,7 @@ public void testSizeTieredCompactionStrategy_threeBucketsTwoPicks()
         Map<String, String> options = new HashMap<>();
         addSizeTieredOptions(options);
 
-        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(cfs, options);
+        SizeTieredCompactionStrategy strategy = new SizeTieredCompactionStrategy(strategyFactory, options);
 
         long minSize = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE;
         double hotness = 1000;
@@ -351,7 +290,7 @@ public void testTimeWindowCompactionStrategy_fiveBucketsOnePick()
         Map<String, String> options = new HashMap<>();
         addTimeTieredOptions(options);
 
-        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(strategyFactory, options);
 
         final int numCompactions = 5;
         long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 5;
@@ -379,7 +318,7 @@ public void testTimeWindowCompactionStrategy_oneBucketFivePicks()
         Map<String, String> options = new HashMap<>();
         addTimeTieredOptions(options);
 
-        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(strategyFactory, options);
 
         final int numCompactions = 5;
         long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 5;
@@ -407,7 +346,7 @@ public void testTimeWindowCompactionStrategy_threeBucketsTwoPicks()
         Map<String, String> options = new HashMap<>();
         addTimeTieredOptions(options);
 
-        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(cfs, options);
+        TimeWindowCompactionStrategy strategy = new TimeWindowCompactionStrategy(strategyFactory, options);
 
         long size = SizeTieredCompactionStrategyOptions.DEFAULT_MIN_SSTABLE_SIZE * 10;
         double hotness = 1000;
@@ -458,7 +397,7 @@ public void testLeveledCompactionStrategy_threeLevels()
         long maxSSTableSize = 160 << 20; // 160 MB in bytes
         addLeveledOptions(options, maxSSTableSize);
 
-        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(cfs, options);
+        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(strategyFactory, options);
 
         final int numLevels = 3;
         List<List<SSTableReader>> ssTablesByLevel = new ArrayList<>(numLevels);
@@ -486,8 +425,21 @@ public void testLeveledCompactionStrategy_threeLevels()
         //L0 will compact all its sstables and the ones of L1 since they all overlap and the total is below the max threshold
         compactions.add(Sets.union(Sets.newLinkedHashSet(ssTablesByLevel.get(0)), Sets.newLinkedHashSet(ssTablesByLevel.get(1))));
 
-        // L1 will compact the first sstable because the score is > 1 plus the overlapping sstables from L2
-        compactions.add(overlapping(ssTablesByLevel.get(1).get(0), ssTablesByLevel.get(2)));
+        // L1 will compact the first sstable that it finds not overlapping with L2 sstables that are not suspect or already
+        // compacting. Because the next line will select the first sstable in L2 to compact, L1 will pick the first sstable
+        // that does not overlap with it
+        SSTableReader candidate = null;
+        for (SSTableReader c : ssTablesByLevel.get(1))
+        {
+            if (c.getFirst().compareTo(ssTablesByLevel.get(2).get(0).getLast()) > 0)
+            {
+                candidate = c;
+                break;
+            }
+        }
+        assertNotNull(candidate);
+        // compact the candidate with all the overlapping sstables of L2
+        compactions.add(overlapping(candidate, ssTablesByLevel.get(2)));
 
         // L2 will compact the first sstable because the score is > 1 but no other overlapping sstables since L3 is empty
         compactions.add(overlapping(ssTablesByLevel.get(2).get(0), ImmutableList.of()));
@@ -496,36 +448,44 @@ public void testLeveledCompactionStrategy_threeLevels()
         // already compacting, hence we can only test 2 compactions initially
         testCompactionStatistics(sstables, compactions, 2, strategy);
 
-        // Now check L0 compaction can proceed, the other levels won't compact since the score should be < 1
-        ssTablesByLevel.get(1).remove(0); // the first one must have been compacted
+        // Now check L0 compaction can proceed, the other levels won't compact since the score should be <= 1
+        ssTablesByLevel.get(1).remove(candidate); // remove the L1 sstable that was already compacted
         Set<SSTableReader> candidates = Sets.union(Sets.newLinkedHashSet(ssTablesByLevel.get(0)), Sets.newLinkedHashSet(ssTablesByLevel.get(1)));
         long totLength = totUncompressedLength(candidates);
-        UUID id = mockCompaction(strategy, sstables, candidates, Collections.emptySet());
-
-        verifyStatistics(strategy,
-                         1,
-                         1,
-                         candidates.size(),
-                         candidates.size(),
-                         totLength,
-                         0,
-                         0,
-                         0);
-
-        CompactionProgress progress = mockCompactionProgress(candidates, id);
-        strategy.getBackgroundCompactions().setInProgress(progress);
-
-        verifyStatistics(strategy,
-                         1,
-                         1,
-                         candidates.size(),
-                         candidates.size(),
-                         totLength,
-                         totLength,
-                         totLength,
-                         0);
-
-        strategy.backgroundCompactions.setCompleted(id);
+
+        Collection<AbstractCompactionTask> tasks = strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertFalse(tasks.isEmpty());
+
+        for (AbstractCompactionTask task : tasks)
+        {
+            assertNotNull(task);
+            UUID id = task.transaction().opId();
+
+            verifyStatistics(strategy,
+                             1,
+                             1,
+                             candidates.size(),
+                             candidates.size(),
+                             totLength,
+                             0,
+                             0,
+                             0);
+
+            CompactionProgress progress = mockCompletedCompactionProgress(candidates, id);
+            strategy.onInProgress(progress);
+
+            verifyStatistics(strategy,
+                             1,
+                             1,
+                             candidates.size(),
+                             candidates.size(),
+                             totLength,
+                             totLength,
+                             totLength,
+                             0);
+
+            strategy.backgroundCompactions.onCompleted(strategy, id);
+        }
 
         // Now we should have L1 again...
     }
@@ -540,7 +500,7 @@ public void testLeveledCompactionStrategy_stcsL0()
         long maxSSTableSize = 160 << 20; // 160 MB in bytes
         addLeveledOptions(options, maxSSTableSize);
 
-        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(cfs, options);
+        LeveledCompactionStrategy strategy = new LeveledCompactionStrategy(strategyFactory, options);
 
         int level = 1;
         long maxLevelSize = (long) (Math.pow(10, level) * maxSSTableSize);
@@ -570,7 +530,7 @@ public void testLeveledCompactionStrategy_stcsL0()
 
     private void testCompactionStatistics(List<Collection<SSTableReader>> compactions, AbstractCompactionStrategy strategy)
     {
-        Set<SSTableReader> sstables = compactions.stream().flatMap(bucket -> bucket.stream()).collect(Collectors.toSet());
+        Set<SSTableReader> sstables = compactions.stream().flatMap(Collection::stream).collect(Collectors.toSet());
         testCompactionStatistics(sstables, compactions, compactions.size(), strategy);
     }
 
@@ -588,9 +548,8 @@ private void testCompactionStatistics(Set<SSTableReader> sstables,
                                           int numExpectedCompactions,
                                           AbstractCompactionStrategy strategy)
     {
-        // Add the tables to the strategy
-        for (SSTableReader sstable : sstables)
-            strategy.addSSTable(sstable);
+        // Add the tables to the strategy and the data tracker
+        addSSTablesToStrategy(strategy, sstables);
 
         List<SSTableReader> sstablesForCompaction = compactions.stream().flatMap(Collection::stream).collect(Collectors.toList());
 
@@ -608,38 +567,30 @@ private void testCompactionStatistics(Set<SSTableReader> sstables,
         int numCompactionsInProgress = 0;
 
         // Create a compaction task and start the compaction for each bucket starting with the highest index
-        for (int i = 0; i < numExpectedCompactions; i++)
+        int i = 0;
+        while (i < numExpectedCompactions)
         {
-            int compactingLevel = compactions.size() - i - 1;
-            Set<SSTableReader> candidates = Sets.newHashSet(compactions.get(compactingLevel));
-
-            UUID id = mockCompaction(strategy, sstables, candidates, compacting);
+            List<Pair<Set<SSTableReader>, UUID>> tasksCompactions = new ArrayList<>(compactions.size());
+            Collection<AbstractCompactionTask> tasks = strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+            assertFalse(tasks.isEmpty());
 
-            numCompactionsInProgress++;
-            numSSTablesCompacting += candidates.size();
-            submittedCompactions.add(Pair.create(candidates, id));
+            // Keep track of all the tasks that were submitted (one per shard)
+            for (AbstractCompactionTask task : tasks)
+            {
+                Set<SSTableReader> candidates = Sets.newHashSet(task.transaction.originals());
 
-            // after mocking the compaction the list of pending compactions has been updated in the strategy
-            // and this will be reflected in the statistics but the compaction task has not started yet
-            verifyStatistics(strategy,
-                             numCompactions,
-                             numCompactionsInProgress,
-                             numSSTables,
-                             numSSTablesCompacting,
-                             totLength,
-                             totRead,
-                             totWritten,
-                             totHotness);
+                i++;
 
-            // Now we simulate starting the compaction task
-            CompactionProgress progress = mockCompactionProgress(candidates, id);
-            strategy.getBackgroundCompactions().setInProgress(progress);
+                assertNotNull(task);
+                UUID id = task.transaction().opId();
 
-            // The compaction has started and so we must updated the following expected values
-            totRead += progress.uncompressedBytesRead();
-            totWritten += progress.uncompressedBytesWritten();
+                numCompactionsInProgress++;
+                numSSTablesCompacting += candidates.size();
+                tasksCompactions.add(Pair.create(candidates, id));
+            }
 
-            // Now check that the statistics reflect the compaction in progress
+            // after mocking the compactions the list of pending compactions has been updated in the strategy
+            // and this will be reflected in the statistics but the compaction task has not started yet
             verifyStatistics(strategy,
                              numCompactions,
                              numCompactionsInProgress,
@@ -650,17 +601,47 @@ private void testCompactionStatistics(Set<SSTableReader> sstables,
                              totWritten,
                              totHotness);
 
-            // update compacting for the next iteration
-            compacting.addAll(candidates);
+            // Start the compactions and check the statistics are updated
+            for (Pair<Set<SSTableReader>, UUID> pair : tasksCompactions)
+            {
+                UUID id = pair.right;
+                Set<SSTableReader> candidates = pair.left;
+
+                // Now we simulate starting the compaction task
+                CompactionProgress progress = mockCompletedCompactionProgress(candidates, id);
+                strategy.onInProgress(progress);
+
+                // The compaction has started and so we must update the following expected values
+                totRead += progress.uncompressedBytesRead();
+                totWritten += progress.uncompressedBytesWritten();
+
+                // Now check that the statistics reflect the compaction in progress
+                verifyStatistics(strategy,
+                                 numCompactions,
+                                 numCompactionsInProgress,
+                                 numSSTables,
+                                 numSSTablesCompacting,
+                                 totLength,
+                                 totRead,
+                                 totWritten,
+                                 totHotness);
+
+                // update compacting for the next iteration
+                compacting.addAll(candidates);
+            }
+
+            submittedCompactions.addAll(tasksCompactions);
         }
 
+        assertEquals(numExpectedCompactions, submittedCompactions.size());
+
         // Terminate the compactions one by one by closing the AutoCloseable and check
         // that the statistics are updated
         for (Pair<Set<SSTableReader>, UUID> pair : submittedCompactions)
         {
             Set<SSTableReader> compSSTables = pair.left;
             long totSSTablesLen = totUncompressedLength(compSSTables);
-            strategy.getBackgroundCompactions().setCompleted(pair.right);
+            strategy.onCompleted(pair.right);
 
             numCompactions--;
             numCompactionsInProgress--;
@@ -672,9 +653,7 @@ private void testCompactionStatistics(Set<SSTableReader> sstables,
             totWritten -= totSSTablesLen;
             totHotness -= totHotness(compSSTables);
 
-            for (SSTableReader sstable : pair.left)
-                strategy.removeSSTable(sstable);
-
+            removeSSTablesFromStrategy(strategy, pair.left);
             sstables.removeAll(pair.left);
             compacting.removeAll(pair.left);
 
@@ -688,52 +667,12 @@ private void testCompactionStatistics(Set<SSTableReader> sstables,
                              totWritten,
                              totHotness);
         }
-    }
 
-    private UUID mockCompaction(AbstractCompactionStrategy strategy, Set<SSTableReader> live, Set<SSTableReader> candidates, Set<SSTableReader> compacting)
-    {
-        final UUID id = UUID.randomUUID();
-        final AtomicReference<LifecycleTransaction> txn = new AtomicReference<>();
-
-        when(dataTracker.tryModify(anyIterable(), eq(OperationType.COMPACTION))).thenAnswer(invocation -> {
-            assertNull(txn.get());
-
-            LifecycleTransaction ret = Mockito.mock(LifecycleTransaction.class);
-            when(ret.opId()).thenReturn(id);
-            when(ret.originals()).thenReturn(candidates);
-            when(ret.getCompacting()).thenReturn(Sets.union(compacting, candidates));
-
-            txn.set(ret);
-            return ret;
-        });
-
-        when(cfs.getSSTables(eq(SSTableSet.LIVE))).thenReturn(live);
-        when(cfs.getNoncompactingSSTables()).thenAnswer(invocation -> Sets.difference(live, txn.get() == null ? compacting : Sets.union(compacting, candidates)));
-        when(cfs.getNoncompactingSSTables(anyIterable())).thenAnswer(invocation -> Sets.difference(Sets.newHashSet((Iterable<SSTableReader>)invocation.getArguments()[0]),
-                                                                                                  txn.get() == null ? compacting : Sets.union(compacting, candidates)));
-        when(cfs.getCompactingSSTables()).thenAnswer(invocation -> txn.get() == null ? compacting : Sets.union(compacting, candidates));
-
-        // Ask for a background compaction
-        AbstractCompactionTask task = strategy.getNextBackgroundTask((int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()));
-        assertNotNull(task);
-        return id;
+        assertTrue(String.format("Data tracker still had compacting sstables: %s", dataTracker.getCompacting()),
+                   dataTracker.getCompacting().isEmpty());
     }
 
-    private CompactionProgress mockCompactionProgress(Set<SSTableReader> compacting, UUID id)
-    {
-        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
-
-        long compactingLen = totUncompressedLength(compacting);
-        when(progress.operationId()).thenReturn(id);
-        when(progress.inSSTables()).thenReturn(compacting);
-        when(progress.uncompressedBytesRead()).thenReturn(compactingLen);
-        when(progress.uncompressedBytesWritten()).thenReturn(compactingLen);
-        when(progress.durationInNanos()).thenReturn(TimeUnit.SECONDS.toNanos(30));
-
-        return progress;
-    }
-
-    private void verifyStatistics(AbstractCompactionStrategy strategy,
+    private void verifyStatistics(CompactionStrategy strategy,
                                   int expectedCompactions,
                                   int expectedCompacting,
                                   int expectedSSTables,
@@ -743,7 +682,7 @@ private void verifyStatistics(AbstractCompactionStrategy strategy,
                                   long expectedWrittenBytes,
                                   double expectedTotHotness)
     {
-        CompactionStrategyStatistics stats = strategy.getStatistics();
+        CompactionStrategyStatistics stats = strategy.getStatistics().get(0);
         System.out.println(stats.toString());
 
         assertEquals(keyspace, stats.keyspace());
@@ -777,7 +716,7 @@ private void verifyStatistics(AbstractCompactionStrategy strategy,
                 readBytes += tieredStatistics.read();
                 hotness += tieredStatistics.hotness;
             }
-            else
+            else if (compactionStatistics instanceof LeveledCompactionStatistics)
             {
                 LeveledCompactionStatistics leveledStatistics = (LeveledCompactionStatistics) compactionStatistics;
 
@@ -785,6 +724,14 @@ private void verifyStatistics(AbstractCompactionStrategy strategy,
                 writtenBytes += leveledStatistics.written();
                 readBytes += leveledStatistics.read();
             }
+            else
+            {
+                UnifiedCompactionStatistics tieredStatistics = (UnifiedCompactionStatistics) compactionStatistics;
+
+                totBytes += tieredStatistics.tot();
+                writtenBytes += tieredStatistics.written();
+                readBytes += tieredStatistics.read();
+            }
         }
 
         assertEquals(expectedCompactions, numCompactions);
@@ -800,8 +747,7 @@ private void verifyStatistics(AbstractCompactionStrategy strategy,
         assertEquals(expectedReadBytes, readBytes);
         assertEquals(expectedWrittenBytes, writtenBytes);
 
-        if (hotness > 0)
+        if (expectedTotHotness > 0)
             assertEquals(expectedTotHotness, hotness, epsilon);
-
     }
 }
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
index a5620ab1d457..140e76340e79 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionTaskTest.java
@@ -72,14 +72,14 @@ public static void setUpClass() throws Exception
     @Before
     public void setUp() throws Exception
     {
-        cfs.getCompactionStrategyManager().enable();
+        cfs.getCompactionStrategyContainer().enable();
         cfs.truncateBlocking();
     }
 
     @Test
     public void compactionDisabled() throws Exception
     {
-        cfs.getCompactionStrategyManager().disable();
+        cfs.getCompactionStrategyContainer().disable();
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (1, 1);");
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (2, 2);");
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
@@ -95,7 +95,7 @@ public void compactionDisabled() throws Exception
 
         AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0);
         Assert.assertNotNull(task);
-        cfs.getCompactionStrategyManager().pause();
+        cfs.getCompactionStrategyContainer().pause();
         try
         {
             task.execute(CompactionManager.instance.active);
@@ -111,7 +111,7 @@ public void compactionDisabled() throws Exception
     @Test
     public void compactionInterruption()
     {
-        cfs.getCompactionStrategyManager().disable();
+        cfs.getCompactionStrategyContainer().disable();
         Set<SSTableReader> sstables = generateData(2, 2);
 
         LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
@@ -156,7 +156,7 @@ private static void mutateRepaired(SSTableReader sstable, long repairedAt, UUID
     @Test
     public void mixedSSTableFailure() throws Exception
     {
-        cfs.getCompactionStrategyManager().disable();
+        cfs.getCompactionStrategyContainer().disable();
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (1, 1);");
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         QueryProcessor.executeInternal("INSERT INTO ks.tbl (k, v) VALUES (2, 2);");
@@ -205,7 +205,7 @@ public void mixedSSTableFailure() throws Exception
     @Test
     public void testCompactionReporting()
     {
-        cfs.getCompactionStrategyManager().disable();
+        cfs.getCompactionStrategyContainer().disable();
         Set<SSTableReader> sstables = generateData(2, 2);
         LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.COMPACTION);
         assertNotNull(txn);
@@ -213,13 +213,14 @@ public void testCompactionReporting()
         CompactionObserver compObserver = Mockito.mock(CompactionObserver.class);
         final ArgumentCaptor<TableOperation> tableOpCaptor = ArgumentCaptor.forClass(AbstractTableOperation.class);
         final ArgumentCaptor<CompactionProgress> compactionCaptor = ArgumentCaptor.forClass(CompactionProgress.class);
-        AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0, compObserver);
+        AbstractCompactionTask task = CompactionTask.forTesting(cfs, txn, 0);
+        task.addObserver(compObserver);
         assertNotNull(task);
         task.execute(operationObserver);
 
         verify(operationObserver, times(1)).onOperationStart(tableOpCaptor.capture());
-        verify(compObserver, times(1)).setInProgress(compactionCaptor.capture());
-        verify(compObserver, times(1)).setCompleted(eq(txn.opId()));
+        verify(compObserver, times(1)).onInProgress(compactionCaptor.capture());
+        verify(compObserver, times(1)).onCompleted(eq(txn.opId()));
     }
 
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
index d13e41f96514..843ccc582aa6 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsBytemanTest.java
@@ -18,12 +18,15 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.util.List;
+import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.function.Consumer;
 import java.util.stream.Collectors;
 
+import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 
@@ -33,6 +36,7 @@
 import org.apache.cassandra.dht.Range;
 import org.apache.cassandra.dht.Token;
 import org.apache.cassandra.io.sstable.Descriptor;
+import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.utils.FBUtilities;
 import org.jboss.byteman.contrib.bmunit.BMRule;
 import org.jboss.byteman.contrib.bmunit.BMRules;
@@ -45,6 +49,16 @@
 @RunWith(BMUnitRunner.class)
 public class CompactionsBytemanTest extends CQLTester
 {
+    @Before
+    public void setUp()
+    {
+        for (String ksname : Schema.instance.getKeyspaces())
+        {
+            for (ColumnFamilyStore cfs : Keyspace.open(ksname).getColumnFamilyStores())
+                cfs.disableAutoCompaction();
+        }
+    }
+
     /*
     Return false for the first time hasAvailableDiskSpace is called. i.e first SSTable is too big
     Create 5 SSTables. After compaction, there should be 2 left - 1 as the 9 SStables which were merged,
@@ -130,7 +144,7 @@ public void testCompactingCFCounting() throws Throwable
         assertEquals(0, CompactionManager.instance.compactingCF.count(cfs));
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
+        FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(cfs));
         assertEquals(0, CompactionManager.instance.compactingCF.count(cfs));
     }
 
@@ -194,7 +208,7 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
             }
             cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        cfs.getCompactionStrategyManager().mutateRepaired(cfs.getLiveSSTables(), System.currentTimeMillis(), null, false);
+        cfs.mutateRepaired(cfs.getLiveSSTables(), System.currentTimeMillis(), null, false);
         for (int i = 0; i < 5; i++)
         {
             for (int j = 0; j < 10; j++)
@@ -223,4 +237,111 @@ public void testStopCompactionRepaired(Consumer<ColumnFamilyStore> compactionRun
         assertTrue(CompactionManager.instance.active.getTableOperations().stream().noneMatch(h -> h.getProgress().metadata().equals(cfs.metadata)));
 
     }
+
+    static Semaphore STARTED;
+    static Semaphore PROCEED;
+
+    @Test
+    @BMRule(name = "Delay compaction task execution",
+            targetClass = "AbstractCompactionTask",
+            targetMethod = "execute()",
+            action = "org.apache.cassandra.db.compaction.CompactionsBytemanTest.STARTED.release();\n" +
+                     "org.apache.cassandra.db.compaction.CompactionsBytemanTest.PROCEED.acquireUninterruptibly();")
+    public void testCompactionReloadDoesNotLoseHistory() throws Throwable
+    {
+        STARTED = new Semaphore(0);
+        PROCEED = new Semaphore(0);
+
+        try
+        {
+            createTable("CREATE TABLE %s (k INT, c INT, v INT, PRIMARY KEY (k, c)) WITH COMPACTION={'class': 'UnifiedCompactionStrategy'}");
+            ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+
+            for (int i = 0; i < 4; ++i)
+            {
+                execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1);
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+            }
+
+            // A compaction has probably already started when we flushed the 4th sstable, but submit one to make sure.
+            CompactionManager.instance.submitBackground(cfs);
+            List<CompactionStrategyStatistics> statistics = cfs.getCompactionStrategy().getStatistics();
+            assertEquals(1, statistics.size());
+            assertEquals(1, statistics.get(0).aggregates().size());
+
+            execute("ALTER TABLE %s WITH COMPACTION={'class': 'UnifiedCompactionStrategy', 'num_shards': '2'}");
+            statistics = cfs.getCompactionStrategy().getStatistics();
+            assertEquals(1, statistics.size());
+            assertEquals(1, statistics.get(0).aggregates().size());
+        }
+        finally
+        {
+            // allow the task to continue
+            PROCEED.release();
+            dropTable("DROP TABLE %s");
+        }
+    }
+
+    @Test
+    @BMRule(name = "Delay compaction task execution",
+            targetClass = "AbstractCompactionTask",
+            targetMethod = "execute()",
+            action = "org.apache.cassandra.db.compaction.CompactionsBytemanTest.STARTED.release();\n" +
+                     "org.apache.cassandra.db.compaction.CompactionsBytemanTest.PROCEED.acquireUninterruptibly();")
+    public void testTotalCompactionsLCS() throws Throwable
+    {
+        testTotalCompactions("{'class': 'LeveledCompactionStrategy'}");
+    }
+
+    @BMRule(name = "Delay compaction task execution",
+            targetClass = "AbstractCompactionTask",
+            targetMethod = "execute()",
+            action = "org.apache.cassandra.db.compaction.CompactionsBytemanTest.STARTED.release();\n" +
+                     "org.apache.cassandra.db.compaction.CompactionsBytemanTest.PROCEED.acquireUninterruptibly();")
+    @Test
+    public void testTotalCompactionsSTCS() throws Throwable
+    {
+        testTotalCompactions("{'class': 'SizeTieredCompactionStrategy'}");
+    }
+
+    @Test
+    @BMRule(name = "Delay compaction task execution",
+            targetClass = "AbstractCompactionTask",
+            targetMethod = "execute()",
+            action = "org.apache.cassandra.db.compaction.CompactionsBytemanTest.STARTED.release();\n" +
+                     "org.apache.cassandra.db.compaction.CompactionsBytemanTest.PROCEED.acquireUninterruptibly();")
+    public void testTotalCompactionsUCS() throws Throwable
+    {
+        testTotalCompactions("{'class': 'UnifiedCompactionStrategy', 'static_scaling_parameters': 1}");
+    }
+
+    private void testTotalCompactions(String compactionOption) throws Throwable
+    {
+        STARTED = new Semaphore(0);
+        PROCEED = new Semaphore(0);
+
+        try
+        {
+            createTable("CREATE TABLE %s (k INT, c INT, v INT, PRIMARY KEY (k, c)) WITH COMPACTION=" + compactionOption);
+            ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+            cfs.disableAutoCompaction();
+            int numSSTables = 10;
+            for (int i = 0; i < numSSTables; i++)
+            {
+                execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", i, 1, 1);
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+            }
+            assertEquals(numSSTables, cfs.getLiveSSTables().size());
+
+            cfs.enableAutoCompaction(false);
+            STARTED.acquireUninterruptibly();
+            assertEquals(1, cfs.getCompactionStrategy().getTotalCompactions());
+        }
+        finally
+        {
+            // allow the task to continue
+            PROCEED.release();
+            dropTable("DROP TABLE %s");
+        }
+    }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
index 5c4576e09e03..63127dbe8afc 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsCQLTest.java
@@ -20,7 +20,6 @@
 import java.nio.ByteBuffer;
 import java.util.Collection;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.Random;
 import java.util.Set;
@@ -28,6 +27,7 @@
 import org.apache.commons.lang3.StringUtils;
 import org.junit.After;
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
@@ -54,6 +54,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 import org.apache.cassandra.serializers.MarshalException;
 import org.apache.cassandra.schema.CompactionParams;
+import org.apache.cassandra.service.StorageService;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -68,6 +69,13 @@ public class CompactionsCQLTest extends CQLTester
 
     private Config.CorruptedTombstoneStrategy strategy;
 
+    @BeforeClass
+    public static void beforeClass()
+    {
+        CQLTester.setUpClass();
+        StorageService.instance.initServer();
+    }
+
     @Before
     public void before()
     {
@@ -85,7 +93,7 @@ public void after()
     public void testTriggerMinorCompactionSTCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -97,7 +105,7 @@ public void testTriggerMinorCompactionSTCS() throws Throwable
     public void testTriggerMinorCompactionLCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'LeveledCompactionStrategy', 'sstable_size_in_mb':1, 'fanout_size':5};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -110,7 +118,7 @@ public void testTriggerMinorCompactionLCS() throws Throwable
     public void testTriggerMinorCompactionDTCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'DateTieredCompactionStrategy', 'min_threshold':2};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1') using timestamp 1000"); // same timestamp = same window = minor compaction triggered
         flush();
         execute("insert into %s (id) values ('1') using timestamp 1000");
@@ -122,7 +130,7 @@ public void testTriggerMinorCompactionDTCS() throws Throwable
     public void testTriggerMinorCompactionTWCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY) WITH compaction = {'class':'TimeWindowCompactionStrategy', 'min_threshold':2};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -135,7 +143,7 @@ public void testTriggerMinorCompactionTWCS() throws Throwable
     public void testTriggerNoMinorCompactionSTCSDisabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -147,14 +155,13 @@ public void testTriggerNoMinorCompactionSTCSDisabled() throws Throwable
     public void testTriggerMinorCompactionSTCSNodetoolEnabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         getCurrentColumnFamilyStore().enableAutoCompaction();
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
 
         // Alter keyspace replication settings to force compaction strategy reload and check strategy is still enabled
         execute("alter keyspace "+keyspace()+" with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 }");
-        getCurrentColumnFamilyStore().getCompactionStrategyManager().maybeReloadDiskBoundaries();
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
 
         execute("insert into %s (id) values ('1')");
         flush();
@@ -167,9 +174,9 @@ public void testTriggerMinorCompactionSTCSNodetoolEnabled() throws Throwable
     public void testTriggerNoMinorCompactionSTCSNodetoolDisabled() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':true};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         getCurrentColumnFamilyStore().disableAutoCompaction();
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -181,9 +188,9 @@ public void testTriggerNoMinorCompactionSTCSNodetoolDisabled() throws Throwable
     public void testTriggerNoMinorCompactionSTCSAlterTable() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':true};");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("ALTER TABLE %s WITH compaction = {'class': 'SizeTieredCompactionStrategy', 'enabled': false}");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -195,9 +202,9 @@ public void testTriggerNoMinorCompactionSTCSAlterTable() throws Throwable
     public void testTriggerMinorCompactionSTCSAlterTable() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)  WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':2, 'enabled':false};");
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("ALTER TABLE %s WITH compaction = {'class': 'SizeTieredCompactionStrategy', 'min_threshold': 2, 'enabled': true}");
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         execute("insert into %s (id) values ('1')");
         flush();
         execute("insert into %s (id) values ('1')");
@@ -206,27 +213,39 @@ public void testTriggerMinorCompactionSTCSAlterTable() throws Throwable
     }
 
     @Test
-    public void testSetLocalCompactionStrategy() throws Throwable
+    public void testSetLocalCompactionStrategySTCS() throws Throwable
     {
         createTable("CREATE TABLE %s (id text PRIMARY KEY)");
+        testSetLocalCompactionStrategy(SizeTieredCompactionStrategy.class);
+    }
+
+    @Test
+    public void testSetLocalCompactionStrategyUCS() throws Throwable
+    {
+        testSetLocalCompactionStrategy(UnifiedCompactionStrategy.class);
+    }
+
+    private void testSetLocalCompactionStrategy(Class<? extends CompactionStrategy> strategy) throws Throwable
+    {
+        createTable(String.format("CREATE TABLE %%s (id text PRIMARY KEY) with compaction = {'class': '%s'}", strategy.getSimpleName()));
         Map<String, String> localOptions = new HashMap<>();
         localOptions.put("class", "DateTieredCompactionStrategy");
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), DateTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyContainer(), DateTieredCompactionStrategy.class));
         // Invalidate disk boundaries to ensure that boundary invalidation will not cause the old strategy to be reloaded
-        getCurrentColumnFamilyStore().invalidateDiskBoundaries();
+        getCurrentColumnFamilyStore().invalidateLocalRangesAndDiskBoundaries();
         // altering something non-compaction related
         execute("ALTER TABLE %s WITH gc_grace_seconds = 1000");
         // should keep the local compaction strat
-        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), DateTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyContainer(), DateTieredCompactionStrategy.class));
         // Alter keyspace replication settings to force compaction strategy reload
         execute("alter keyspace "+keyspace()+" with replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 3 }");
         // should keep the local compaction strat
-        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), DateTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyContainer(), DateTieredCompactionStrategy.class));
         // altering a compaction option
         execute("ALTER TABLE %s WITH compaction = {'class':'SizeTieredCompactionStrategy', 'min_threshold':3}");
         // will use the new option
-        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyManager(), SizeTieredCompactionStrategy.class));
+        assertTrue(verifyStrategies(getCurrentColumnFamilyStore().getCompactionStrategyContainer(), SizeTieredCompactionStrategy.class));
     }
 
     @Test
@@ -237,12 +256,12 @@ public void testSetLocalCompactionStrategyDisable() throws Throwable
         localOptions.put("class", "DateTieredCompactionStrategy");
         localOptions.put("enabled", "false");
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
         localOptions.clear();
         localOptions.put("class", "DateTieredCompactionStrategy");
         // localOptions.put("enabled", "true"); - this is default!
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
     }
 
 
@@ -254,10 +273,10 @@ public void testSetLocalCompactionStrategyEnable() throws Throwable
         localOptions.put("class", "DateTieredCompactionStrategy");
 
         getCurrentColumnFamilyStore().disableAutoCompaction();
-        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertFalse(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
 
         getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyManager().isEnabled());
+        assertTrue(getCurrentColumnFamilyStore().getCompactionStrategyContainer().isEnabled());
     }
 
 
@@ -416,8 +435,11 @@ public void testLCSThresholdParams() throws Throwable
             cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         assertEquals(50, cfs.getLiveSSTables().size());
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
-        AbstractCompactionTask act = lcs.getNextBackgroundTask(0);
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) ((CompactionStrategyManager) cfs.getCompactionStrategyContainer())
+                                                                    .getUnrepairedUnsafe().first();
+        Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask act = tasks.iterator().next();
         // we should be compacting all 50 sstables:
         assertEquals(50, act.transaction.originals().size());
         act.execute();
@@ -450,8 +472,12 @@ public void testSTCSinL0() throws Throwable
 
         // mark the L1 sstable as compacting to make sure we trigger STCS in L0:
         LifecycleTransaction txn = cfs.getTracker().tryModify(l1sstable, OperationType.COMPACTION);
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
-        AbstractCompactionTask act = lcs.getNextBackgroundTask(0);
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) ((CompactionStrategyManager) cfs.getCompactionStrategyContainer())
+                                                                    .getUnrepairedUnsafe()
+                                                                    .first();
+        Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask act = tasks.iterator().next();
         // note that max_threshold is 60 (more than the amount of L0 sstables), but MAX_COMPACTING_L0 is 32, which means we will trigger STCS with at most max_threshold sstables
         assertEquals(50, act.transaction.originals().size());
         assertEquals(0, ((LeveledCompactionTask)act).getLevel());
@@ -460,6 +486,67 @@ public void testSTCSinL0() throws Throwable
         act.execute();
     }
 
+    @Test
+    public void testABAReloadUCS()
+    {
+        testABAReload(UnifiedCompactionStrategy.class);
+    }
+
+    @Test
+    public void testABAReloadSTCS()
+    {
+        testABAReload(SizeTieredCompactionStrategy.class);
+    }
+
+    @Test
+    public void testABAReloadLCS()
+    {
+        testABAReload(LeveledCompactionStrategy.class);
+    }
+
+    private void testABAReload(Class<? extends CompactionStrategy> strategyClass)
+    {
+        createTable(String.format("CREATE TABLE %%s (id text PRIMARY KEY) WITH compaction = {'class':'%s'};", strategyClass.getSimpleName()));
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        assertEquals(strategyClass, cfs.getCompactionStrategyContainer().getCompactionParams().klass()); // table starts on the requested strategy (A)
+        alterTable("ALTER TABLE %s WITH compaction = {'class': 'DateTieredCompactionStrategy'}"); // A -> B: switch to a different strategy
+        assertEquals(DateTieredCompactionStrategy.class, cfs.getCompactionStrategyContainer().getCompactionParams().klass());
+        alterTable(String.format("ALTER TABLE %%s WITH compaction = {'class': '%s'}", strategyClass.getSimpleName())); // B -> A: switch back to the original strategy
+        assertEquals(strategyClass, cfs.getCompactionStrategyContainer().getCompactionParams().klass()); // container must report A again after the round trip
+    }
+
+    @Test
+    public void testWithSecondaryIndexUCS() throws Throwable
+    {
+        testWithSecondaryIndex(UnifiedCompactionStrategy.class);
+    }
+
+    @Test
+    public void testWithSecondaryIndexSTCS() throws Throwable
+    {
+        testWithSecondaryIndex(SizeTieredCompactionStrategy.class);
+    }
+
+    @Test
+    public void testWithSecondaryIndexLCS() throws Throwable
+    {
+        testWithSecondaryIndex(LeveledCompactionStrategy.class);
+    }
+
+    public void testWithSecondaryIndex(Class<? extends CompactionStrategy> strategyClass) throws Throwable
+    {
+        createTable(String.format("CREATE TABLE %%s (pk int, c int, s int static, v int, PRIMARY KEY(pk, c)) WITH compaction = {'class':'%s'};", strategyClass.getSimpleName()));
+        createIndex("CREATE INDEX ON %s (v)");
+        // Four inserts across three partition keys (1, 3, 4), all setting static column s to 9, followed by flush().
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 1, 9, 1);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 1, 2, 9, 2);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 3, 1, 9, 1);
+        execute("INSERT INTO %s (pk, c, s, v) VALUES (?, ?, ?, ?)", 4, 1, 9, 1);
+        flush();
+        // No assertions follow: the test only requires compact() on the indexed table to return without throwing.
+        compact();
+    }
+
     @Test
     public void testAbortNotifications() throws Throwable
     {
@@ -480,13 +567,16 @@ public void testAbortNotifications() throws Throwable
         }
         getCurrentColumnFamilyStore().forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) getCurrentColumnFamilyStore().getCompactionStrategyManager().getUnrepairedUnsafe().first();
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) ((CompactionStrategyManager) getCurrentColumnFamilyStore().getCompactionStrategyContainer())
+                                                                    .getUnrepairedUnsafe()
+                                                                    .first();
         LeveledCompactionTask lcsTask;
         while (true)
         {
-            lcsTask = (LeveledCompactionTask) lcs.getNextBackgroundTask(0);
-            if (lcsTask != null)
+            Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+            if (tasks.size() > 0)
             {
+                lcsTask = (LeveledCompactionTask) tasks.iterator().next();
                 lcsTask.execute(CompactionManager.instance.active);
                 break;
             }
@@ -521,7 +611,9 @@ public void testAbortNotifications() throws Throwable
         // sstables have been removed.
         try
         {
-            AbstractCompactionTask task = new NotifyingCompactionTask(lcs, (LeveledCompactionTask) lcs.getNextBackgroundTask(0));
+            Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+            assertEquals(1, tasks.size());
+            AbstractCompactionTask task = new NotifyingCompactionTask(lcs, (LeveledCompactionTask) tasks.iterator().next());
             task.execute(CompactionManager.instance.active);
             fail("task should throw exception");
         }
@@ -530,16 +622,10 @@ public void testAbortNotifications() throws Throwable
             // ignored
         }
 
-        lcsTask = (LeveledCompactionTask) lcs.getNextBackgroundTask(0);
-        try
-        {
-            assertNotNull(lcsTask);
-        }
-        finally
-        {
-            if (lcsTask != null)
-                lcsTask.transaction.abort();
-        }
+        Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+        // Abort every returned task before asserting, so a failing assertion cannot leave a transaction open (the removed try/finally guaranteed this).
+        tasks.forEach(task -> task.transaction.abort());
+        assertEquals(1, tasks.size());
     }
 
     private static class NotifyingCompactionTask extends LeveledCompactionTask
@@ -558,12 +644,12 @@ public CompactionAwareWriter getCompactionAwareWriter(ColumnFamilyStore cfs,
             return new MaxSSTableSizeWriter(cfs, directories, txn, nonExpiredSSTables, 1 << 20, 1)
             {
                 int switchCount = 0;
-                public void switchCompactionLocation(Directories.DataDirectory directory)
+                public void switchCompactionWriter(Directories.DataDirectory directory)
                 {
                     switchCount++;
                     if (switchCount > 5)
                         throw new RuntimeException("Throw after a few sstables have had their starts moved");
-                    super.switchCompactionLocation(directory);
+                    super.switchCompactionWriter(directory);
                 }
             };
         }
@@ -732,16 +818,15 @@ public void testProvidesTombstoneOptionverifiation()
          localOptions.put("provide_overlapping_tombstones","row");
 
          getCurrentColumnFamilyStore().setCompactionParameters(localOptions);
-         assertEquals(CompactionParams.TombstoneOption.ROW, getCurrentColumnFamilyStore().getCompactionStrategyManager().getCompactionParams().tombstoneOption());
+         assertEquals(CompactionParams.TombstoneOption.ROW, getCurrentColumnFamilyStore().getCompactionParams().tombstoneOption());
      }
 
-
-    public boolean verifyStrategies(CompactionStrategyManager manager, Class<? extends AbstractCompactionStrategy> expected)
+    public boolean verifyStrategies(CompactionStrategyContainer strategyContainer, Class<? extends AbstractCompactionStrategy> expected)
     {
         boolean found = false;
-        for (List<AbstractCompactionStrategy> strategies : manager.getStrategies())
+        for (CompactionStrategy strategy : strategyContainer.getStrategies())
         {
-            if (!strategies.stream().allMatch((strategy) -> strategy.getClass().equals(expected)))
+            if (!strategy.getClass().equals(expected))
                 return false;
             found = true;
         }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
index 032ab97d1a9a..f7095b9aaf9c 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsPurgeTest.java
@@ -303,7 +303,7 @@ public void testMinorCompactionPurge()
                 .build().applyUnsafe();
 
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
-        try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
+        try (CompactionTasks tasks = cfs.getCompactionStrategyContainer().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
         {
             Iterables.getOnlyElement(tasks).execute();
         }
@@ -355,7 +355,7 @@ public void testMinTimestampPurge()
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
 
         // compact the sstables with the c1/c2 data and the c1 tombstone
-        try (CompactionTasks tasks = cfs.getCompactionStrategyManager().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
+        try (CompactionTasks tasks = cfs.getCompactionStrategyContainer().getUserDefinedTasks(sstablesIncomplete, Integer.MAX_VALUE))
         {
             Iterables.getOnlyElement(tasks).execute();
         }
diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
index a6a57d4a1ab0..bb7af6012cca 100644
--- a/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/CompactionsTest.java
@@ -160,7 +160,7 @@ public void testSingleSSTableCompaction() throws Exception
 
         // enable compaction, submit background and wait for it to complete
         store.enableAutoCompaction();
-        FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
+        FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(store));
         do
         {
             TimeUnit.SECONDS.sleep(1);
@@ -210,7 +210,7 @@ public void testUncheckedTombstoneSizeTieredCompaction() throws Exception
 
         // enable compaction, submit background and wait for it to complete
         store.enableAutoCompaction();
-        FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
+        FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(store));
         do
         {
             TimeUnit.SECONDS.sleep(1);
@@ -232,7 +232,7 @@ public void testUncheckedTombstoneSizeTieredCompaction() throws Exception
         MigrationManager.announceTableUpdate(store.metadata().unbuild().gcGraceSeconds(1).compaction(CompactionParams.stcs(compactionOptions)).build(), true);
 
         //submit background task again and wait for it to complete
-        FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(store));
+        FBUtilities.waitOnFuture(CompactionManager.instance.submitBackground(store));
         do
         {
             TimeUnit.SECONDS.sleep(1);
@@ -604,7 +604,7 @@ public void testCompactionsCanBeInterrupted() throws Exception
         assertTrue(store.getLiveSSTables().size() >= 2);
 
         // Enable compaction but do not submit any background compactions
-        store.getCompactionStrategyManager().enable();
+        store.getCompactionStrategyContainer().enable();
 
         CountDownLatch compactionRegistered = new CountDownLatch(1);
         CountDownLatch resumeCompaction = new CountDownLatch(1);
diff --git a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
index 2b2f6a0b118f..0cc4b7fafe69 100644
--- a/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/DateTieredCompactionStrategyTest.java
@@ -322,13 +322,16 @@ public void testDropExpiredSSTables() throws InterruptedException
         options.put(DateTieredCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, "MILLISECONDS");
         options.put(DateTieredCompactionStrategyOptions.MAX_SSTABLE_AGE_KEY, Double.toString((1d / (24 * 60 * 60))));
         options.put(DateTieredCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0");
-        DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(cfs, options);
+        CompactionStrategyFactory factory = new CompactionStrategyFactory(cfs);
+        DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(factory, options);
         for (SSTableReader sstable : cfs.getLiveSSTables())
             dtcs.addSSTable(sstable);
         dtcs.startup();
-        assertNull(dtcs.getNextBackgroundTask((int) (System.currentTimeMillis() / 1000)));
+        assertTrue(dtcs.getNextBackgroundTasks((int) (System.currentTimeMillis() / 1000)).isEmpty());
         Thread.sleep(2000);
-        AbstractCompactionTask t = dtcs.getNextBackgroundTask((int) (System.currentTimeMillis()/1000));
+        Collection<AbstractCompactionTask> tasks = dtcs.getNextBackgroundTasks((int) (System.currentTimeMillis() / 1000));
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask t = tasks.iterator().next();
         assertNotNull(t);
         assertEquals(1, Iterables.size(t.transaction.originals()));
         SSTableReader sstable = t.transaction.originals().iterator().next();
@@ -370,10 +373,15 @@ public void testSTCSBigWindow()
         }
         Map<String, String> options = new HashMap<>();
         options.put(SizeTieredCompactionStrategyOptions.MIN_SSTABLE_SIZE_KEY, "1");
-        DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(cfs, options);
+        CompactionStrategyFactory factory = new CompactionStrategyFactory(cfs);
+        DateTieredCompactionStrategy dtcs = new DateTieredCompactionStrategy(factory, options);
         for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
             dtcs.addSSTable(sstable);
-        AbstractCompactionTask task = dtcs.getNextBackgroundTask(0);
+
+        Collection<AbstractCompactionTask> tasks = dtcs.getNextBackgroundTasks(0);
+        assertEquals(1, tasks.size());
+
+        AbstractCompactionTask task = tasks.iterator().next();
         assertEquals(20, task.transaction.originals().size());
         task.transaction.abort();
         cfs.truncateBlocking();
diff --git a/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategyTest.java
similarity index 89%
rename from test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java
rename to test/unit/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategyTest.java
index 82544f379340..b296b010c258 100644
--- a/test/unit/org/apache/cassandra/db/compaction/AbstractCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LegacyAbstractCompactionStrategyTest.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
@@ -39,7 +40,7 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.FBUtilities;
 
-public class AbstractCompactionStrategyTest
+public class LegacyAbstractCompactionStrategyTest
 {
     private static final String KEYSPACE1 = "Keyspace1";
     private static final String LCS_TABLE = "LCS_TABLE";
@@ -107,7 +108,9 @@ public void testGetNextBackgroundTaskDoesNotBlockTWCS()
     public void testGetNextBackgroundTaskDoesNotBlock(String table)
     {
         ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(table);
-        AbstractCompactionStrategy strategy = cfs.getCompactionStrategyManager().getStrategies().get(1).get(0);
+        CompactionStrategy strategy = cfs.getCompactionStrategyContainer()
+                                         .getStrategies(false, null)
+                                         .get(0);
 
         // Add 4 sstables
         for (int i = 1; i <= 4; i++)
@@ -116,7 +119,11 @@ public void testGetNextBackgroundTaskDoesNotBlock(String table)
         }
 
         // Check they are returned on the next background task
-        try (LifecycleTransaction txn = strategy.getNextBackgroundTask(FBUtilities.nowInSeconds()).transaction)
+        Collection<AbstractCompactionTask> tasks = strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        Assert.assertEquals(1, tasks.size());
+        AbstractCompactionTask task = tasks.iterator().next();
+        Assert.assertNotNull(task);
+        try (LifecycleTransaction txn = task.transaction)
         {
             Assert.assertEquals(cfs.getLiveSSTables(), txn.originals());
         }
@@ -125,7 +132,7 @@ public void testGetNextBackgroundTaskDoesNotBlock(String table)
         cfs.getTracker().removeUnsafe(cfs.getLiveSSTables());
 
         // verify the compaction strategy will return null
-        Assert.assertNull(strategy.getNextBackgroundTask(FBUtilities.nowInSeconds()));
+        Assert.assertTrue(strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).isEmpty());
     }
 
 
diff --git a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
index dfdda303e16a..5065755ce0e7 100644
--- a/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/LeveledCompactionStrategyTest.java
@@ -138,11 +138,11 @@ public void testGrouperLevels() throws Exception {
         }
 
         waitForLeveling(cfs);
-        CompactionStrategyManager strategyManager = cfs.getCompactionStrategyManager();
+        CompactionStrategyContainer strategyContainer = cfs.getCompactionStrategyContainer();
         // Checking we're not completely bad at math
 
-        int l1Count = strategyManager.getSSTableCountPerLevel()[1];
-        int l2Count = strategyManager.getSSTableCountPerLevel()[2];
+        int l1Count = strategyContainer.getSSTableCountPerLevel()[1];
+        int l2Count = strategyContainer.getSSTableCountPerLevel()[2];
         if (l1Count == 0 || l2Count == 0)
         {
             logger.error("L1 or L2 has 0 sstables. Expected > 0 on both.");
@@ -151,7 +151,7 @@ public void testGrouperLevels() throws Exception {
             Assert.fail();
         }
 
-        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyManager().groupSSTablesForAntiCompaction(cfs.getLiveSSTables());
+        Collection<Collection<SSTableReader>> groupedSSTables = cfs.getCompactionStrategyContainer().groupSSTablesForAntiCompaction(cfs.getLiveSSTables());
         for (Collection<SSTableReader> sstableGroup : groupedSSTables)
         {
             int groupLevel = -1;
@@ -193,10 +193,10 @@ public void testValidationMultipleSSTablePerLevel() throws Exception
         }
 
         waitForLeveling(cfs);
-        CompactionStrategyManager strategyManager = cfs.getCompactionStrategyManager();
+        CompactionStrategyContainer strategyContainer = cfs.getCompactionStrategyContainer();
         // Checking we're not completely bad at math
-        assertTrue(strategyManager.getSSTableCountPerLevel()[1] > 0);
-        assertTrue(strategyManager.getSSTableCountPerLevel()[2] > 0);
+        assertTrue(strategyContainer.getSSTableCountPerLevel()[1] > 0);
+        assertTrue(strategyContainer.getSSTableCountPerLevel()[2] > 0);
 
         Range<Token> range = new Range<>(Util.token(""), Util.token(""));
         int gcBefore = keyspace.getColumnFamilyStore(CF_STANDARDDLEVELED).gcBefore(FBUtilities.nowInSeconds());
@@ -220,7 +220,7 @@ public void testValidationMultipleSSTablePerLevel() throws Exception
      */
     public static void waitForLeveling(ColumnFamilyStore cfs) throws InterruptedException
     {
-        CompactionStrategyManager strategyManager = cfs.getCompactionStrategyManager();
+        CompactionStrategyContainer strategyContainer = cfs.getCompactionStrategyContainer();
         while (true)
         {
             // since we run several compaction strategies we wait until L0 in all strategies is empty and
@@ -228,19 +228,16 @@ public static void waitForLeveling(ColumnFamilyStore cfs) throws InterruptedExce
             // so it should be good enough
             boolean allL0Empty = true;
             boolean anyL1NonEmpty = false;
-            for (List<AbstractCompactionStrategy> strategies : strategyManager.getStrategies())
+            for (CompactionStrategy strategy : strategyContainer.getStrategies())
             {
-                for (AbstractCompactionStrategy strategy : strategies)
-                {
-                    if (!(strategy instanceof LeveledCompactionStrategy))
-                        return;
-                    // note that we check > 1 here, if there is too little data in L0, we don't compact it up to L1
-                    if (((LeveledCompactionStrategy)strategy).getLevelSize(0) > 1)
-                        allL0Empty = false;
-                    for (int i = 1; i < 5; i++)
-                        if (((LeveledCompactionStrategy)strategy).getLevelSize(i) > 0)
-                            anyL1NonEmpty = true;
-                }
+                if (!(strategy instanceof LeveledCompactionStrategy))
+                    return;
+                // note that we check > 1 here, if there is too little data in L0, we don't compact it up to L1
+                if (((LeveledCompactionStrategy)strategy).getLevelSize(0) > 1)
+                    allL0Empty = false;
+                for (int i = 1; i < 5; i++)
+                    if (((LeveledCompactionStrategy)strategy).getLevelSize(i) > 0)
+                        anyL1NonEmpty = true;
             }
             if (allL0Empty && anyL1NonEmpty)
                 return;
@@ -267,7 +264,9 @@ public void testCompactionProgress() throws Exception
         }
 
         waitForLeveling(cfs);
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getStrategies().get(1).get(0);
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategyContainer()
+                                                                            .getStrategies(false, null)
+                                                                            .get(0);
         assert strategy.getLevelSize(1) > 0;
 
         // get LeveledScanner for level 1 sstables
@@ -303,7 +302,9 @@ public void testMutateLevel() throws Exception
             cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getStrategies().get(1).get(0);
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) cfs.getCompactionStrategyContainer()
+                                                                            .getStrategies(false, null)
+                                                                            .get(0);
         cfs.forceMajorCompaction();
 
         for (SSTableReader s : cfs.getLiveSSTables())
@@ -349,14 +350,17 @@ public void testNewRepairedSSTable() throws Exception
         while(CompactionManager.instance.isCompacting(Arrays.asList(cfs), (sstable) -> true))
             Thread.sleep(100);
 
-        CompactionStrategyManager manager = cfs.getCompactionStrategyManager();
-        List<List<AbstractCompactionStrategy>> strategies = manager.getStrategies();
-        LeveledCompactionStrategy repaired = (LeveledCompactionStrategy) strategies.get(0).get(0);
-        LeveledCompactionStrategy unrepaired = (LeveledCompactionStrategy) strategies.get(1).get(0);
+        CompactionStrategyContainer strategyContainer = cfs.getCompactionStrategyContainer();
+        LeveledCompactionStrategy repaired = (LeveledCompactionStrategy) strategyContainer
+                                                                         .getStrategies(true, null)
+                                                                         .get(0);
+        LeveledCompactionStrategy unrepaired = (LeveledCompactionStrategy) strategyContainer
+                                                                           .getStrategies(false, null)
+                                                                           .get(0);
         assertEquals(0, repaired.manifest.getLevelCount() );
         assertEquals(2, unrepaired.manifest.getLevelCount());
-        assertTrue(manager.getSSTableCountPerLevel()[1] > 0);
-        assertTrue(manager.getSSTableCountPerLevel()[2] > 0);
+        assertTrue(strategyContainer.getSSTableCountPerLevel()[1] > 0);
+        assertTrue(strategyContainer.getSSTableCountPerLevel()[2] > 0);
 
         for (SSTableReader sstable : cfs.getLiveSSTables())
             assertFalse(sstable.isRepaired());
@@ -372,7 +376,7 @@ public void testNewRepairedSSTable() throws Exception
         sstable1.reloadSSTableMetadata();
         assertTrue(sstable1.isRepaired());
 
-        manager.handleNotification(new SSTableRepairStatusChanged(Arrays.asList(sstable1)), this);
+        strategyContainer.handleNotification(new SSTableRepairStatusChanged(Arrays.asList(sstable1)), this);
 
         int repairedSSTableCount = repaired.manifest.getSSTables().size();
         assertEquals(1, repairedSSTableCount);
@@ -382,7 +386,7 @@ public void testNewRepairedSSTable() throws Exception
         assertFalse(unrepaired.manifest.getLevel(2).contains(sstable1));
 
         unrepaired.removeSSTable(sstable2);
-        manager.handleNotification(new SSTableAddedNotification(singleton(sstable2), null), this);
+        strategyContainer.handleNotification(new SSTableAddedNotification(singleton(sstable2), null), this);
         assertTrue(unrepaired.manifest.getLevel(1).contains(sstable2));
         assertFalse(repaired.manifest.getLevel(1).contains(sstable2));
     }
@@ -531,7 +535,9 @@ public void testCompactionCandidateOrdering()
             update.applyUnsafe();
             cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
-        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) (cfs.getCompactionStrategyManager()).getStrategies().get(1).get(0);
+        LeveledCompactionStrategy strategy = (LeveledCompactionStrategy) (cfs.getCompactionStrategyContainer())
+                                                                         .getStrategies(false, null)
+                                                                         .get(0);
         // get readers for level 0 sstables
         Collection<SSTableReader> sstables = strategy.manifest.getLevel(0);
         Collection<SSTableReader> sortedCandidates = strategy.manifest.ageSortedSSTables(sstables);
@@ -588,23 +594,22 @@ public void testDisableSTCSInL0() throws IOException
     private int getTaskLevel(ColumnFamilyStore cfs)
     {
         int level = -1;
-        for (List<AbstractCompactionStrategy> strategies : cfs.getCompactionStrategyManager().getStrategies())
+        for (CompactionStrategy strategy : cfs.getCompactionStrategyContainer().getStrategies())
         {
-            for (AbstractCompactionStrategy strategy : strategies)
+            Collection<AbstractCompactionTask> tasks = strategy.getNextBackgroundTasks(0);
+            if (!tasks.isEmpty())
             {
-                AbstractCompactionTask task = strategy.getNextBackgroundTask(0);
-                if (task != null)
+                assertEquals(1, tasks.size());
+                AbstractCompactionTask task = tasks.iterator().next();
+                try
                 {
-                    try
-                    {
-                        assertTrue(task instanceof LeveledCompactionTask);
-                        LeveledCompactionTask lcsTask = (LeveledCompactionTask) task;
-                        level = Math.max(level, lcsTask.getLevel());
-                    }
-                    finally
-                    {
-                        task.transaction.abort();
-                    }
+                    assertTrue(task instanceof LeveledCompactionTask);
+                    LeveledCompactionTask lcsTask = (LeveledCompactionTask) task;
+                    level = Math.max(level, lcsTask.getLevel());
+                }
+                finally
+                {
+                    task.transaction.abort();
                 }
             }
         }
diff --git a/test/unit/org/apache/cassandra/db/compaction/PendingRepairManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/PendingRepairManagerTest.java
index 9f4cf8de690c..748bbfd1f980 100644
--- a/test/unit/org/apache/cassandra/db/compaction/PendingRepairManagerTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/PendingRepairManagerTest.java
@@ -18,6 +18,7 @@
 
 package org.apache.cassandra.db.compaction;
 
+import java.io.IOException;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.UUID;
@@ -34,13 +35,20 @@
 
 public class PendingRepairManagerTest extends AbstractPendingRepairTest
 {
+    @Override
+    public String createTableCql()
+    {
+        return String.format("CREATE TABLE %s.%s (k INT PRIMARY KEY, v INT)",
+                             ks, tbl); // the CQL has no WITH clause, so no compaction options are set explicitly here
+    }
+
     /**
      * If a local session is ongoing, it should not be cleaned up
      */
     @Test
     public void needsCleanupInProgress()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -58,7 +66,7 @@ public void needsCleanupInProgress()
     @Test
     public void needsCleanupFinalized()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -77,7 +85,7 @@ public void needsCleanupFinalized()
     @Test
     public void needsCleanupFailed()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -94,14 +102,14 @@ public void needsCleanupFailed()
     public void needsCleanupNoSession()
     {
         UUID fakeID = UUIDGen.getTimeUUID();
-        PendingRepairManager prm = new PendingRepairManager(cfs, null, false);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, null, false);
         Assert.assertTrue(prm.canCleanup(fakeID));
     }
 
     @Test
     public void estimateRemainingTasksInProgress()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -117,7 +125,7 @@ public void estimateRemainingTasksInProgress()
     @Test
     public void estimateRemainingFinishedRepairTasks()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -135,7 +143,7 @@ public void estimateRemainingFinishedRepairTasks()
     @Test
     public void getNextBackgroundTask()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -151,13 +159,15 @@ public void getNextBackgroundTask()
         LocalSessionAccessor.finalizeUnsafe(repairID);
 
         Assert.assertEquals(2, prm.getSessions().size());
-        Assert.assertNull(prm.getNextBackgroundTask(FBUtilities.nowInSeconds()));
-        AbstractCompactionTask compactionTask = prm.getNextRepairFinishedTask();
+        Assert.assertTrue(prm.getNextBackgroundTasks(FBUtilities.nowInSeconds()).isEmpty());
+        Collection<AbstractCompactionTask> compactionTasks = prm.getNextRepairFinishedTasks();
+        Assert.assertEquals(1, compactionTasks.size());
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
         try
         {
             Assert.assertNotNull(compactionTask);
-            Assert.assertSame(PendingRepairManager.RepairFinishedCompactionTask.class, compactionTask.getClass());
-            PendingRepairManager.RepairFinishedCompactionTask cleanupTask = (PendingRepairManager.RepairFinishedCompactionTask) compactionTask;
+            Assert.assertSame(RepairFinishedCompactionTask.class, compactionTask.getClass());
+            RepairFinishedCompactionTask cleanupTask = (RepairFinishedCompactionTask) compactionTask;
             Assert.assertEquals(repairID, cleanupTask.getSessionID());
         }
         finally
@@ -169,17 +179,17 @@ public void getNextBackgroundTask()
     @Test
     public void getNextBackgroundTaskNoSessions()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
-        Assert.assertNull(prm.getNextBackgroundTask(FBUtilities.nowInSeconds()));
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
+        Assert.assertTrue(prm.getNextBackgroundTasks(FBUtilities.nowInSeconds()).isEmpty());
     }
 
     /**
      * If all sessions should be cleaned up, getNextBackgroundTask should return null
      */
     @Test
-    public void getNextBackgroundTaskAllCleanup() throws Exception
+    public void getNextBackgroundTaskAllCleanup()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
 
@@ -190,14 +200,14 @@ public void getNextBackgroundTaskAllCleanup() throws Exception
         Assert.assertNotNull(prm.get(repairID));
         LocalSessionAccessor.finalizeUnsafe(repairID);
 
-        Assert.assertNull(prm.getNextBackgroundTask(FBUtilities.nowInSeconds()));
+        Assert.assertTrue(prm.getNextBackgroundTasks(FBUtilities.nowInSeconds()).isEmpty());
 
     }
 
     @Test
     public void maximalTaskNeedsCleanup()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -222,22 +232,20 @@ public void maximalTaskNeedsCleanup()
     @Test
     public void userDefinedTaskTest()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         UUID repairId = registerSession(cfs, true, true);
         SSTableReader sstable = makeSSTable(true);
         mutateRepaired(sstable, repairId, false);
         prm.addSSTable(sstable);
 
-        try (CompactionTasks tasks = csm.getUserDefinedTasks(Collections.singleton(sstable), 100))
-        {
-            Assert.assertEquals(1, tasks.size());
-        }
+        Collection<AbstractCompactionTask> tasks = prm.createUserDefinedTasks(Collections.singleton(sstable), 100);
+        Assert.assertEquals(1, tasks.size());
     }
 
     @Test
     public void mixedPendingSessionsTest()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         UUID repairId = registerSession(cfs, true, true);
         UUID repairId2 = registerSession(cfs, true, true);
         SSTableReader sstable = makeSSTable(true);
@@ -247,10 +255,8 @@ public void mixedPendingSessionsTest()
         mutateRepaired(sstable2, repairId2, false);
         prm.addSSTable(sstable);
         prm.addSSTable(sstable2);
-        try (CompactionTasks tasks = csm.getUserDefinedTasks(Lists.newArrayList(sstable, sstable2), 100))
-        {
-            Assert.assertEquals(2, tasks.size());
-        }
+        Collection<AbstractCompactionTask> tasks = prm.createUserDefinedTasks(Lists.newArrayList(sstable, sstable2), 100);
+        Assert.assertEquals(2, tasks.size());
     }
 
     /**
@@ -260,7 +266,7 @@ public void mixedPendingSessionsTest()
     @Test(expected = PendingRepairManager.IllegalSSTableArgumentException.class)
     public void getScannersInvalidSSTable() throws Exception
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         SSTableReader sstable = makeSSTable(true);
         prm.getScanners(Collections.singleton(sstable), Collections.singleton(RANGE1));
     }
@@ -272,7 +278,7 @@ public void getScannersInvalidSSTable() throws Exception
     @Test(expected = PendingRepairManager.IllegalSSTableArgumentException.class)
     public void getOrCreateInvalidSSTable() throws Exception
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         SSTableReader sstable = makeSSTable(true);
         prm.getOrCreate(sstable);
     }
@@ -280,7 +286,7 @@ public void getOrCreateInvalidSSTable() throws Exception
     @Test
     public void sessionHasData()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
 
         UUID repairID = registerSession(cfs, true, true);
         LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
@@ -295,14 +301,14 @@ public void sessionHasData()
     @Test
     public void noEmptyCompactionTask()
     {
-        PendingRepairManager prm = csm.getPendingRepairManagers().get(0);
+        PendingRepairManager prm = new PendingRepairManager(cfs, strategyFactory, cfs.getCompactionParams(), false);
         SSTableReader sstable = makeSSTable(false);
         UUID id = UUID.randomUUID();
         mutateRepaired(sstable, id, false);
         prm.getOrCreate(sstable);
         cfs.truncateBlocking();
         Assert.assertFalse(cfs.getSSTables(SSTableSet.LIVE).iterator().hasNext());
-        Assert.assertNull(cfs.getCompactionStrategyManager().getNextBackgroundTask(0));
+        Assert.assertTrue(cfs.getCompactionStrategy().getNextBackgroundTasks(0).isEmpty());
 
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
index ba5c203cb98a..ec07bd99e34a 100644
--- a/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/SingleSSTableLCSTaskTest.java
@@ -20,6 +20,7 @@
 
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
+import java.util.Collection;
 import java.util.Random;
 
 import org.apache.commons.lang3.StringUtils;
@@ -32,6 +33,7 @@
 import org.apache.cassandra.io.sstable.format.SSTableReader;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 
 public class SingleSSTableLCSTaskTest extends CQLTester
@@ -44,19 +46,22 @@ public void basicTest() throws Throwable
         execute("insert into %s (id, t) values (1, 'meep')");
         cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         SSTableReader sstable = cfs.getLiveSSTables().iterator().next();
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyContainer()
+                                                                       .getStrategies(false, null)
+                                                                       .get(0);
 
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.COMPACTION))
         {
             if (txn != null)
             {
-                SingleSSTableLCSTask task = new SingleSSTableLCSTask(cfs, txn, 2);
+                SingleSSTableLCSTask task = new SingleSSTableLCSTask(lcs, txn, 2);
                 task.executeInternal();
             }
         }
         assertEquals(1, cfs.getLiveSSTables().size());
         cfs.getLiveSSTables().forEach(s -> assertEquals(2, s.getSSTableLevel()));
         // make sure compaction strategy is notified:
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
+
         for (int i = 0; i < lcs.manifest.getLevelCount(); i++)
         {
             if (i == 2)
@@ -98,17 +103,29 @@ private void compactionTestHelper(boolean singleSSTUplevel) throws Throwable
                 cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
         }
         // now we have a bunch of data in L0, first compaction will be a normal one, containing all sstables:
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
-        AbstractCompactionTask act = lcs.getNextBackgroundTask(0);
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) ((CompactionStrategyManager) cfs.getCompactionStrategyContainer())
+                                                                    .getUnrepairedUnsafe()
+                                                                    .first();
+        Collection<AbstractCompactionTask> tasks = lcs.getNextBackgroundTasks(0);
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask act = tasks.iterator().next();
+        assertNotNull(act);
         act.execute();
 
         // now all sstables are laid out non-overlapping in L1, this means that the rest of the compactions
         // will be single sstable ones, make sure that we use SingleSSTableLCSTask if singleSSTUplevel is true:
-        while ((act = lcs.getNextBackgroundTask(0)) != null)
+        tasks = lcs.getNextBackgroundTasks(0);
+        while (!tasks.isEmpty())
         {
+            assertEquals(1, tasks.size());
+            act = tasks.iterator().next();
+            assertNotNull(act);
+
             assertTrue(lcs.getTotalCompactions() > 0);
             assertEquals(singleSSTUplevel, act instanceof SingleSSTableLCSTask);
             act.execute();
+
+            tasks = lcs.getNextBackgroundTasks(0);
         }
 
         assertEquals(0, lcs.getTotalCompactions());
@@ -136,11 +153,15 @@ public void corruptMetadataTest() throws Throwable
         file.writeBytes(StringUtils.repeat('z', 2));
         file.close();
         boolean gotException = false;
+        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) ((CompactionStrategyManager) cfs.getCompactionStrategyContainer())
+                                                                    .getUnrepairedUnsafe()
+                                                                    .first();
+
         try (LifecycleTransaction txn = cfs.getTracker().tryModify(sstable, OperationType.COMPACTION))
         {
             if (txn != null)
             {
-                SingleSSTableLCSTask task = new SingleSSTableLCSTask(cfs, txn, 2);
+                SingleSSTableLCSTask task = new SingleSSTableLCSTask(lcs, txn, 2);
                 task.executeInternal();
             }
         }
@@ -152,7 +173,7 @@ public void corruptMetadataTest() throws Throwable
         assertEquals(1, cfs.getLiveSSTables().size());
         for (SSTableReader sst : cfs.getLiveSSTables())
             assertEquals(0, sst.getSSTableMetadata().sstableLevel);
-        LeveledCompactionStrategy lcs = (LeveledCompactionStrategy) cfs.getCompactionStrategyManager().getUnrepairedUnsafe().first();
+
         assertEquals(1, lcs.getLevelSize(0));
         assertTrue(cfs.getTracker().getCompacting().isEmpty());
     }
diff --git a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
index ec44e5b01ef2..f8184409c847 100644
--- a/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategyTest.java
@@ -19,6 +19,7 @@
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -37,7 +38,6 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -282,16 +282,18 @@ public void testDropExpiredSSTables() throws InterruptedException
         options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "SECONDS");
         options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, "MILLISECONDS");
         options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0");
-        TimeWindowCompactionStrategy twcs = new TimeWindowCompactionStrategy(cfs, options);
+        TimeWindowCompactionStrategy twcs = new TimeWindowCompactionStrategy(new CompactionStrategyFactory(cfs), options);
         for (SSTableReader sstable : cfs.getLiveSSTables())
             twcs.addSSTable(sstable);
 
         twcs.startup();
-        assertNull(twcs.getNextBackgroundTask(nowInSeconds()));
+        assertTrue(twcs.getNextBackgroundTasks(nowInSeconds()).isEmpty());
 
         // Wait for the expiration of the first sstable
         Thread.sleep(TimeUnit.SECONDS.toMillis(TTL_SECONDS + 1));
-        AbstractCompactionTask t = twcs.getNextBackgroundTask(nowInSeconds());
+        Collection<AbstractCompactionTask> tasks = twcs.getNextBackgroundTasks(nowInSeconds());
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask t = tasks.iterator().next();
         assertNotNull(t);
         assertEquals(1, Iterables.size(t.transaction.originals()));
         SSTableReader sstable = t.transaction.originals().iterator().next();
@@ -337,24 +339,26 @@ public void testDropOverlappingExpiredSSTables() throws InterruptedException
         options.put(TimeWindowCompactionStrategyOptions.COMPACTION_WINDOW_UNIT_KEY, "SECONDS");
         options.put(TimeWindowCompactionStrategyOptions.TIMESTAMP_RESOLUTION_KEY, "MILLISECONDS");
         options.put(TimeWindowCompactionStrategyOptions.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_KEY, "0");
-        TimeWindowCompactionStrategy twcs = new TimeWindowCompactionStrategy(cfs, options);
+        TimeWindowCompactionStrategy twcs = new TimeWindowCompactionStrategy(new CompactionStrategyFactory(cfs), options);
         for (SSTableReader sstable : cfs.getLiveSSTables())
             twcs.addSSTable(sstable);
 
         twcs.startup();
-        assertNull(twcs.getNextBackgroundTask(nowInSeconds()));
+        assertTrue(twcs.getNextBackgroundTasks(nowInSeconds()).isEmpty());
 
         // Wait for the expiration of the first sstable
         Thread.sleep(TimeUnit.SECONDS.toMillis(TTL_SECONDS + 1));
-        assertNull(twcs.getNextBackgroundTask(nowInSeconds()));
+        assertTrue(twcs.getNextBackgroundTasks(nowInSeconds()).isEmpty());
 
         options.put(TimeWindowCompactionStrategyOptions.UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_KEY, "true");
-        twcs = new TimeWindowCompactionStrategy(cfs, options);
+        twcs = new TimeWindowCompactionStrategy(new CompactionStrategyFactory(cfs), options);
         for (SSTableReader sstable : cfs.getLiveSSTables())
             twcs.addSSTable(sstable);
 
         twcs.startup();
-        AbstractCompactionTask t = twcs.getNextBackgroundTask(nowInSeconds());
+        Collection<AbstractCompactionTask> tasks = twcs.getNextBackgroundTasks(nowInSeconds());
+        assertEquals(1, tasks.size());
+        AbstractCompactionTask t = tasks.iterator().next();
         assertNotNull(t);
         assertEquals(1, Iterables.size(t.transaction.originals()));
         SSTableReader sstable = t.transaction.originals().iterator().next();
diff --git a/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionContainerPendingRepairTest.java b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionContainerPendingRepairTest.java
new file mode 100644
index 000000000000..b99e41720a15
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionContainerPendingRepairTest.java
@@ -0,0 +1,941 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy.Shard;
+import org.apache.cassandra.db.compaction.unified.UnifiedCompactionTask;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.repair.consistent.LocalSession;
+import org.apache.cassandra.repair.consistent.LocalSessionAccessor;
+import org.apache.cassandra.repair.consistent.LocalSessions;
+import org.apache.cassandra.utils.FBUtilities;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tests UnifiedCompactionContainer's handling of pending repair sstables
+ */
+public class UnifiedCompactionContainerPendingRepairTest extends AbstractPendingRepairTest implements CompactionStrategyContainerPendingRepairTest
+{
+    @Override
+    public String createTableCql()
+    {
+        return String.format("CREATE TABLE %s.%s (k INT PRIMARY KEY, v INT) " +
+                             "WITH COMPACTION={'class': 'UnifiedCompactionStrategy'} ",
+                             ks, tbl);
+    }
+
+    @Override
+    void handleOrphan(SSTableReader sstable)
+    {
+        // UCS is stateless, so nothing to do
+    }
+
+    @Override
+    @Test
+    public void testSstableAdded() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        SSTableReader sstable = makeSSTable(true);
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertFalse(sstable.isPendingRepair());
+        assertShardContainsSstable(sstable, false, false, false,null,true, true);
+
+        cfs.mutateRepaired(ImmutableList.of(sstable), 0, repairID, false);
+
+        Assert.assertFalse(sstable.isRepaired());
+        assertTrue(sstable.isPendingRepair());
+        assertEquals(repairID, sstable.getPendingRepair());
+        assertShardContainsSstable(sstable, false, true, false, repairID, true,true);
+    }
+
+    @Override
+    @Test
+    public void testSstableDeleted() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        SSTableReader sstable = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertFalse(sstable.isPendingRepair());
+        assertShardContainsSstable(sstable, false, false, false, null,true, true);
+
+        cfs.mutateRepaired(ImmutableList.of(sstable), 0, repairID, false);
+
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertTrue(sstable.isPendingRepair());
+        assertEquals(repairID, sstable.getPendingRepair());
+        assertShardContainsSstable(sstable, false, true, false, repairID, true,true);
+
+        // delete sstable
+        cfs.markObsolete(Collections.singletonList(sstable), OperationType.UNKNOWN);
+
+        assertShardContainsSstable(sstable, false, true, false, repairID, false,false);
+    }
+
+    @Override
+    @Test
+    public void testSstableListChangedAddAndRemove() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        SSTableReader sstable1 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertFalse(sstable1.isPendingRepair());
+        assertShardContainsSstable(sstable1, false, false, false, null,true, true);
+
+        SSTableReader sstable2 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertFalse(sstable2.isPendingRepair());
+        assertShardContainsSstable(sstable2, false, false, false, null,true, true);
+
+        cfs.mutateRepaired(ImmutableList.of(sstable1, sstable2), 0, repairID, false);
+
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertTrue(sstable1.isPendingRepair());
+        assertEquals(repairID, sstable1.getPendingRepair());
+        assertShardContainsSstable(sstable1, false, true, false, repairID, true,true);
+
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertTrue(sstable2.isPendingRepair());
+        assertEquals(repairID, sstable2.getPendingRepair());
+        assertShardContainsSstable(sstable2, false, true, false, repairID,true,true);
+
+        // remove sstable1
+        cfs.markObsolete(Collections.singletonList(sstable1), OperationType.UNKNOWN);
+
+        assertShardContainsSstable(sstable1, false, true, false, repairID,false,false);
+        assertShardContainsSstable(sstable2, false, true, false, repairID,true,true);
+    }
+
+    @Override
+    @Test
+    public void testSstableRepairStatusChanged() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        // add as unrepaired
+        final boolean isOrphan = false;
+        SSTableReader sstable = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertFalse(sstable.isPendingRepair());
+        assertShardContainsSstable(sstable, false, false, false, null,true, true);
+
+        // change to pending repair
+        cfs.mutateRepaired(Collections.singletonList(sstable), 0, repairID, false);
+
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertTrue(sstable.isPendingRepair());
+        assertEquals(repairID, sstable.getPendingRepair());
+        assertShardContainsSstable(sstable, false, true, false, repairID, true,true);
+
+        // change to repaired
+        long repairedAt = System.currentTimeMillis();
+        cfs.mutateRepaired(Collections.singletonList(sstable), repairedAt, null, false);
+
+        Assert.assertTrue(sstable.isRepaired());
+        Assert.assertFalse(sstable.isPendingRepair());
+        assertShardContainsSstable(sstable, true, false, false, null,true,true);
+    }
+
+    @Override
+    @Test
+    public void testStrategiesContainsPendingRepair() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        SSTableReader sstable = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertFalse(sstable.isPendingRepair());
+        assertShardContainsSstable(sstable, false, false, false, null,true, true);
+
+        assertFalse(cfs.hasPendingRepairSSTables(repairID));
+
+        cfs.mutateRepaired(Collections.singletonList(sstable), 0, repairID, false);
+
+        Assert.assertFalse(sstable.isRepaired());
+        Assert.assertTrue(sstable.isPendingRepair());
+        assertEquals(repairID, sstable.getPendingRepair());
+        assertShardContainsSstable(sstable, false, true, false, repairID,true,true);
+
+        assertTrue(cfs.hasPendingRepairSSTables(repairID));
+    }
+
+
+    /**
+     * Tests that finalized repairs result in {@link LocalSessions#sessionCompleted},
+     * which reclassifies the sstables as repaired
+     */
+    @Override
+    @Test
+    public void testCleanupCompactionFinalized() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        int numberOfSStables = 4; // this has to be >= T
+        List<SSTableReader> sstables = new ArrayList<>(numberOfSStables);
+        for (int i = 0; i < numberOfSStables; i++)
+        {
+            SSTableReader sstable = makeSSTable(isOrphan);
+            sstables.add(sstable);
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertFalse(sstable.isPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false,
+                                       false,
+                                       false,
+                                       null,
+                                       true,
+                                       true);
+        }
+
+        // change to pending repair
+        cfs.mutateRepaired(sstables, 0, repairID, false);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false,
+                                       true,
+                                       false,
+                                       repairID,
+                                       true,
+                                       true);
+        }
+
+        // finalize
+        LocalSessionAccessor.finalizeUnsafe(repairID);
+
+        for (SSTableReader sstable : sstables)
+        {
+
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            assertEquals(repairID, sstable.getPendingRepair());
+        }
+
+        // enable compaction to fetch next background task
+        compactionStrategyContainer.enable();
+
+        // pending repair sstables should be compacted
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
+        assertNotNull(compactionTask);
+        assertSame(UnifiedCompactionTask.class, compactionTask.getClass());
+
+        // run the compaction
+        compactionTask.execute();
+
+        // sstables should not be found in any shard after being compacted
+        for (SSTableReader sstable : sstables)
+        {
+            assertShardContainsSstable(sstable,
+                                       false,
+                                       true,
+                                       false,
+                                       repairID,
+                                       false,
+                                       false);
+            assertFalse(cfs.getLiveSSTables().contains(sstable));
+            assertFalse(cfs.getPendingRepairSSTables(repairID).contains(sstable));
+        }
+
+        // new sstable is created with the same repairID
+        assertEquals(1, cfs.getPendingRepairSSTables(repairID).size());
+        SSTableReader compactedSSTable = cfs.getPendingRepairSSTables(repairID).iterator().next();
+
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertTrue(compactedSSTable.isPendingRepair());
+        assertEquals(repairID, compactedSSTable.getPendingRepair());
+
+        // complete session
+        LocalSession session = ARS.consistent.local.getSession(repairID);
+        ARS.consistent.local.sessionCompleted(session);
+
+        Assert.assertTrue(compactedSSTable.isRepaired());
+        Assert.assertEquals(compactedSSTable.getRepairedAt(), session.repairedAt);
+        Assert.assertFalse(compactedSSTable.isPendingRepair());
+
+        assertEquals(0, cfs.getPendingRepairSSTables(repairID).size());
+    }
+
+    @Override
+    @Test // transient pending-repair sstables are compacted; nothing stays live once the finalized session completes
+    public void testFinalizedSessionTransientCleanup() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        int numberOfSStables = 4; // this has to be >= T (presumably the number of sstables that triggers a compaction - confirm)
+        List<SSTableReader> sstables = new ArrayList<>(numberOfSStables);
+        for (int i = 0; i < numberOfSStables; i++)
+        {
+            SSTableReader sstable = makeSSTable(isOrphan);
+            sstables.add(sstable);
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertFalse(sstable.isPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       false, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       null, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // change to pending repair; the trailing 'true' presumably marks the sstables as transient - confirm
+        cfs.mutateRepaired(sstables, 0, repairID, true);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertTrue(sstable.isTransient());
+            assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       true, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // finalize the session; the sstables must keep their pending/transient state
+        LocalSessionAccessor.finalizeUnsafe(repairID);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertTrue(sstable.isTransient());
+            assertEquals(repairID, sstable.getPendingRepair());
+        }
+
+        // enable compaction to fetch next background task
+        compactionStrategyContainer.enable();
+
+        // the pending repair sstables should be picked up by exactly one compaction task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
+        assertNotNull(compactionTask);
+        assertSame(UnifiedCompactionTask.class, compactionTask.getClass());
+
+        // run the compaction
+        compactionTask.execute();
+
+        // the original sstables should not be found in any shard once compacted
+        for (SSTableReader sstable : sstables)
+        {
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       true, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       false, // expectedRepairStatus
+                                       false); // expectedContainsSstable
+            assertFalse(cfs.getLiveSSTables().contains(sstable));
+            assertFalse(cfs.getPendingRepairSSTables(repairID).contains(sstable));
+        }
+
+        // a single new sstable is created with the same repairID
+        assertEquals(1, cfs.getPendingRepairSSTables(repairID).size());
+        SSTableReader compactedSSTable = cfs.getPendingRepairSSTables(repairID).iterator().next();
+
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertTrue(compactedSSTable.isPendingRepair());
+        assertEquals(repairID, compactedSSTable.getPendingRepair());
+
+        // complete session
+        LocalSession session = ARS.consistent.local.getSession(repairID);
+        ARS.consistent.local.sessionCompleted(session);
+
+        assertTrue(cfs.getLiveSSTables().isEmpty()); // no sstable is left live after completion
+        assertEquals(0, cfs.getPendingRepairSSTables(repairID).size());
+    }
+
+    @Override
+    @Test // transient pending-repair sstables of a failed session are compacted, then lose their pending/transient marks on completion
+    public void testFailedSessionTransientCleanup() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        int numberOfSStables = 4; // this has to be >= T (presumably the number of sstables that triggers a compaction - confirm)
+        List<SSTableReader> sstables = new ArrayList<>(numberOfSStables);
+        for (int i = 0; i < numberOfSStables; i++)
+        {
+            SSTableReader sstable = makeSSTable(isOrphan);
+            sstables.add(sstable);
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertFalse(sstable.isPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       false, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       null, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // change to pending repair; the trailing 'true' presumably marks the sstables as transient - confirm
+        cfs.mutateRepaired(sstables, 0, repairID, true);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       true, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+        // fail the session; the sstables must keep their pending state and stay in the same shard
+        LocalSessionAccessor.failUnsafe(repairID);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       true, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // enable compaction to fetch next background task
+        compactionStrategyContainer.enable();
+
+        // the pending repair sstables should be picked up by exactly one compaction task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
+        assertNotNull(compactionTask);
+        assertSame(UnifiedCompactionTask.class, compactionTask.getClass());
+
+        // run the compaction
+        compactionTask.execute();
+
+        // the original sstables should not be found in any shard once compacted
+        for (SSTableReader sstable : sstables)
+        {
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       true, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       false, // expectedRepairStatus
+                                       false); // expectedContainsSstable
+            assertFalse(cfs.getLiveSSTables().contains(sstable));
+            assertFalse(cfs.getPendingRepairSSTables(repairID).contains(sstable));
+        }
+
+        // a single new sstable is created with the same repairID, still pending and transient
+        assertEquals(1, cfs.getPendingRepairSSTables(repairID).size());
+        SSTableReader compactedSSTable = cfs.getPendingRepairSSTables(repairID).iterator().next();
+        Assert.assertEquals(repairID, compactedSSTable.getPendingRepair());
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertTrue(compactedSSTable.isPendingRepair());
+        Assert.assertTrue(compactedSSTable.isTransient());
+
+        // complete session; the compacted sstable must end up unrepaired, not pending and not transient
+        LocalSession session = ARS.consistent.local.getSession(repairID);
+        ARS.consistent.local.sessionCompleted(session);
+
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertFalse(compactedSSTable.isPendingRepair());
+        Assert.assertFalse(compactedSSTable.isTransient());
+
+        assertEquals(0, cfs.getPendingRepairSSTables(repairID).size());
+    }
+
+    /**
+     * Tests that completing a failed repair session via {@link LocalSessions#sessionCompleted}
+     * reclassifies the compacted pending-repair sstable as unrepaired.
+     */
+    @Override
+    @Test
+    public void testCleanupCompactionFailed() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        final boolean isOrphan = true;
+        int numberOfSStables = 4; // this has to be >= T (presumably the number of sstables that triggers a compaction - confirm)
+        List<SSTableReader> sstables = new ArrayList<>(numberOfSStables);
+        for (int i = 0; i < numberOfSStables; i++)
+        {
+            SSTableReader sstable = makeSSTable(isOrphan);
+            sstables.add(sstable);
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertFalse(sstable.isPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       false, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       null, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // change to pending repair (non-transient, as the shard assertions below expect)
+        cfs.mutateRepaired(sstables, 0, repairID, false);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // fail the session; the sstables must keep their pending state and stay in the same shard
+        LocalSessionAccessor.failUnsafe(repairID);
+
+        for (SSTableReader sstable : sstables)
+        {
+            Assert.assertFalse(sstable.isRepaired());
+            Assert.assertTrue(sstable.isPendingRepair());
+            Assert.assertEquals(repairID, sstable.getPendingRepair());
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       true, // expectedRepairStatus
+                                       true); // expectedContainsSstable
+        }
+
+        // enable compaction to fetch next background task
+        compactionStrategyContainer.enable();
+
+        // the pending repair sstables should be picked up by exactly one compaction task
+        Collection<AbstractCompactionTask> compactionTasks = compactionStrategyContainer.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals(1, compactionTasks.size());
+
+        AbstractCompactionTask compactionTask = compactionTasks.iterator().next();
+        assertNotNull(compactionTask);
+        assertSame(UnifiedCompactionTask.class, compactionTask.getClass());
+
+        // run the compaction
+        compactionTask.execute();
+
+        // the original sstables should not be found in any shard once compacted
+        for (SSTableReader sstable : sstables)
+        {
+            assertShardContainsSstable(sstable,
+                                       false, // expectedIsRepaired
+                                       true, // expectedIsPending
+                                       false, // expectedIsTransient
+                                       repairID, // expectedRepairId
+                                       false, // expectedRepairStatus
+                                       false); // expectedContainsSstable
+            assertFalse(cfs.getLiveSSTables().contains(sstable));
+            assertFalse(cfs.getPendingRepairSSTables(repairID).contains(sstable));
+        }
+
+        // a single new sstable is created with the same repairID
+        assertEquals(1, cfs.getPendingRepairSSTables(repairID).size());
+        SSTableReader compactedSSTable = cfs.getPendingRepairSSTables(repairID).iterator().next();
+        Assert.assertEquals(repairID, compactedSSTable.getPendingRepair());
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertTrue(compactedSSTable.isPendingRepair());
+
+        // complete session; the compacted sstable must end up unrepaired and no longer pending
+        LocalSession session = ARS.consistent.local.getSession(repairID);
+        ARS.consistent.local.sessionCompleted(session);
+
+        Assert.assertFalse(compactedSSTable.isRepaired());
+        Assert.assertFalse(compactedSSTable.isPendingRepair());
+
+        assertEquals(0, cfs.getPendingRepairSSTables(repairID).size());
+    }
+
+    @Override
+    @Test // completing a finalized session turns its pending-repair sstables into repaired ones
+    public void testSessionCompleted() throws IOException
+    {
+        UUID repairID = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID, COORDINATOR, PARTICIPANTS);
+
+        // add three sstables, all initially unrepaired
+        final boolean isOrphan = false;
+        SSTableReader sstable1 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertFalse(sstable1.isPendingRepair());
+        assertShardContainsSstable(sstable1,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        SSTableReader sstable2 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertFalse(sstable2.isPendingRepair());
+        assertShardContainsSstable(sstable2,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        SSTableReader sstable3 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable3.isRepaired());
+        Assert.assertFalse(sstable3.isPendingRepair());
+        assertShardContainsSstable(sstable3,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // change all three to pending repair (non-transient, as the shard assertions below expect)
+        cfs.mutateRepaired(ImmutableList.of(sstable1, sstable2, sstable3), 0, repairID, false);
+
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertTrue(sstable1.isPendingRepair());
+        Assert.assertEquals(repairID, sstable1.getPendingRepair());
+        assertShardContainsSstable(sstable1,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertTrue(sstable2.isPendingRepair());
+        Assert.assertEquals(repairID, sstable2.getPendingRepair());
+        assertShardContainsSstable(sstable2,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        Assert.assertFalse(sstable3.isRepaired());
+        Assert.assertTrue(sstable3.isPendingRepair());
+        Assert.assertEquals(repairID, sstable3.getPendingRepair());
+        assertShardContainsSstable(sstable3,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // finalize
+        LocalSessionAccessor.finalizeUnsafe(repairID);
+
+        // complete the (repair) session; the sstables should then be marked as repaired
+        LocalSession session = ARS.consistent.local.getSession(repairID);
+        ARS.consistent.local.sessionCompleted(session);
+
+        // sstables are repaired and sit in the shard of repaired sstables
+        assertShardContainsSstable(sstable1,
+                                   true, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+        assertShardContainsSstable(sstable2,
+                                   true, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+        assertShardContainsSstable(sstable3,
+                                   true, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+    }
+
+    @Override
+    @Test // completing one session leaves sstables of another session and already repaired sstables untouched
+    public void testSessionCompletedWithDifferentSSTables() throws IOException
+    {
+        UUID repairID1 = registerSession(cfs, true, true);
+        UUID repairID2 = registerSession(cfs, true, true);
+        LocalSessionAccessor.prepareUnsafe(repairID1, COORDINATOR, PARTICIPANTS);
+        LocalSessionAccessor.prepareUnsafe(repairID2, COORDINATOR, PARTICIPANTS);
+
+        // add three sstables, all initially unrepaired
+        final boolean isOrphan = false;
+        SSTableReader sstable1 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertFalse(sstable1.isPendingRepair());
+        assertShardContainsSstable(sstable1,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        SSTableReader sstable2 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertFalse(sstable2.isPendingRepair());
+        assertShardContainsSstable(sstable2,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        SSTableReader sstable3 = makeSSTable(isOrphan);
+        Assert.assertFalse(sstable3.isRepaired());
+        Assert.assertFalse(sstable3.isPendingRepair());
+        assertShardContainsSstable(sstable3,
+                                   false, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // change sstable1 to pending repair for session 1
+        cfs.mutateRepaired(Collections.singletonList(sstable1), 0, repairID1, false);
+
+        Assert.assertFalse(sstable1.isRepaired());
+        Assert.assertTrue(sstable1.isPendingRepair());
+        Assert.assertEquals(repairID1, sstable1.getPendingRepair());
+        assertShardContainsSstable(sstable1,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID1, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+        assertNumberOfShards(2); // presumably: unrepaired + pending for session 1 - confirm
+
+        // change sstable2 to pending repair for session 2
+        cfs.mutateRepaired(Collections.singletonList(sstable2), 0, repairID2, false);
+
+        Assert.assertFalse(sstable2.isRepaired());
+        Assert.assertTrue(sstable2.isPendingRepair());
+        Assert.assertEquals(repairID2, sstable2.getPendingRepair());
+        assertNumberOfShards(3); // presumably: unrepaired + one per pending session - confirm
+        assertShardContainsSstable(sstable2,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID2, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // change sstable3 to repaired
+        long repairedAt3 = System.currentTimeMillis();
+        cfs.mutateRepaired(Collections.singletonList(sstable3), repairedAt3, null, false);
+
+        Assert.assertTrue(sstable3.isRepaired());
+        Assert.assertFalse(sstable3.isPendingRepair());
+        assertNumberOfShards(3); // presumably: repaired + one per pending session - confirm
+        assertShardContainsSstable(sstable3,
+                                   true, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // finalize session 1
+        LocalSessionAccessor.finalizeUnsafe(repairID1);
+
+        // simulate index build on pending sstable for session 1
+        cfs.getTracker().tryModify(sstable1, OperationType.INDEX_BUILD);
+
+        // completing session 1 should not require disabling compactions because:
+        // * sstable1 is building an index (and so is considered compacting), hence it is not found in any shard
+        // * sstable2 belongs to a different session
+        // * sstable3 is repaired
+        LocalSession session1 = ARS.consistent.local.getSession(repairID1);
+        ARS.consistent.local.sessionCompleted(session1);
+
+        // expecting sstable1 is not found in any shard, although the strategy still lists it
+        assertShardContainsSstable(sstable1,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID1, // expectedRepairId
+                                   false, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // expecting sstable2 still exists in the pending repair shard of session 2
+        assertShardContainsSstable(sstable2,
+                                   false, // expectedIsRepaired
+                                   true, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   repairID2, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+
+        // expecting sstable3 still exists in the repaired shard
+        assertShardContainsSstable(sstable3,
+                                   true, // expectedIsRepaired
+                                   false, // expectedIsPending
+                                   false, // expectedIsTransient
+                                   null, // expectedRepairId
+                                   true, // expectedRepairStatus
+                                   true); // expectedContainsSstable
+    }
+
+    // Checks that the container wraps exactly one strategy, that it is a UnifiedCompactionStrategy,
+    // and that this strategy currently reports the expected number of shards.
+    private void assertNumberOfShards(int expectedNumberOfShards)
+    {
+        Collection<CompactionStrategy> strategies = compactionStrategyContainer.getStrategies();
+        assertEquals(1, strategies.size());
+        for (CompactionStrategy strategy : strategies)
+        {
+            assertTrue(strategy instanceof UnifiedCompactionStrategy);
+            int shardCount = ((UnifiedCompactionStrategy) strategy).getShardsWithBuckets().keySet().size();
+            assertEquals("Expecting number of shards in the strategy.", expectedNumberOfShards, shardCount);
+        }
+    }
+
+    /**
+     * Asserts how the container's single {@link UnifiedCompactionStrategy} tracks {@code sstable}: the strategy
+     * lists it iff {@code expectedContainsSstable}. With {@code expectedRepairStatus}, exactly one non-empty
+     * shard must match the expected repair flags (judged by its first sstable) and hold the sstable iff
+     * {@code expectedContainsSstable}; without it, no shard at all may hold the sstable.
+     */
+    private void assertShardContainsSstable(SSTableReader sstable,
+                                            boolean expectedIsRepaired,
+                                            boolean expectedIsPending,
+                                            boolean expectedIsTransient,
+                                            UUID expectedRepairId,
+                                            boolean expectedRepairStatus,
+                                            boolean expectedContainsSstable)
+    {
+        List<CompactionStrategy> strategies = compactionStrategyContainer.getStrategies();
+        // the container is expected to wrap exactly one UnifiedCompactionStrategy
+        assertEquals(1, strategies.size());
+        for (CompactionStrategy strategy : strategies)
+        {
+            assertTrue(strategy instanceof UnifiedCompactionStrategy);
+            UnifiedCompactionStrategy unified = (UnifiedCompactionStrategy) strategy;
+
+            Set<SSTableReader> sameSstables = unified.getSSTables()
+                                                     .stream()
+                                                     .filter(candidate -> candidate.equals(sstable))
+                                                     .collect(Collectors.toSet());
+            assertEquals("Expecting strategy contains sstable.", expectedContainsSstable, sameSstables.size() == 1);
+
+            Set<Shard> shards = unified.getShardsWithBuckets().keySet();
+            if (!expectedRepairStatus)
+            {
+                // no shard at all may hold the sstable
+                Set<Shard> holders = shards.stream()
+                                           .filter(shard -> shard.sstables.contains(sstable))
+                                           .collect(Collectors.toSet());
+                assertTrue(String.format("Expecting no shard should contain the sstable but found exists in %s",
+                                         holders),
+                           holders.isEmpty());
+                continue;
+            }
+
+            // pick the non-empty shards whose first sstable carries the expected repair flags
+            Set<Shard> candidates = shards.stream()
+                                          .filter(shard -> !shard.sstables.isEmpty())
+                                          .filter(shard -> {
+                                              SSTableReader first = shard.sstables.get(0);
+                                              return first.isRepaired() == expectedIsRepaired
+                                                     && first.isTransient() == expectedIsTransient
+                                                     && first.isPendingRepair() == expectedIsPending
+                                                     && (first.getPendingRepair() != null
+                                                         ? first.getPendingRepair().equals(expectedRepairId)
+                                                         : expectedRepairId == null);
+                                          })
+                                          .collect(Collectors.toSet());
+            int found = candidates.size();
+            assertEquals(String.format("Expecting a shard with repair status: pending=%s repaired=%s but found %s of it.",
+                                       expectedIsPending, expectedIsRepaired, found),
+                         1, found);
+
+            // that single shard must hold the sstable iff expectedContainsSstable
+            Shard onlyShard = candidates.iterator().next();
+            assertEquals(String.format("Expecting a shard with repair status: %s contains the sstable is %s.",
+                                       expectedRepairStatus, expectedContainsSstable),
+                         expectedContainsSstable,
+                         onlyShard.sstables.contains(sstable));
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java
new file mode 100644
index 000000000000..5f1e52f57ce0
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/UnifiedCompactionStrategyTest.java
@@ -0,0 +1,1361 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.UUID;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Iterables;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.SortedLocalRanges;
+import org.apache.cassandra.db.compaction.unified.Controller;
+import org.apache.cassandra.db.compaction.unified.UnifiedCompactionTask;
+import org.apache.cassandra.dht.IPartitioner;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.dht.Range;
+import org.apache.cassandra.dht.Splitter;
+import org.apache.cassandra.dht.Token;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertSame;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.Mockito.when;
+
+/**
+ * The unified compaction strategy is described in this design document:
+ *
+ * TODO: link to design doc or SEP
+ *
+ * It has properties of both tiered and leveled compactions and it adapts to the workload
+ * by switching between strategies or increasing / decreasing the fanout factor.
+ *
+ * The essential formulae are the calculations of buckets:
+ *
+ * S = ⌊log_oF(size / m)⌋ = ⌊(ln size - ln m) / (ln F + ln o)⌋
+ *
+ * where log_oF is the log with oF as the base
+ * o is the survival factor, currently fixed to 1
+ * F is the fanout factor calculated below
+ * m is the minimal size, fixed in the strategy options
+ * size is the sorted run size (sum of all the sizes of the sstables in the sorted run)
+ *
+ * Also, T is the number of sstables that trigger compaction.
+ *
+ * Given a parameter W, which is fixed in these tests, T and F are calculated as follows:
+ *
+ * - W < 0 then T = 2 and F = 2 - W (leveled merge policy)
+ * - W > 0 then T = F and F = 2 + W (tiered merge policy)
+ * - W = 0 then T = F = 2 (middle ground)
+ */
+@RunWith(Parameterized.class)
+public class UnifiedCompactionStrategyTest extends BaseCompactionStrategyTest
+{
+    private final static long ONE_MB = 1 << 20;
+
+    // Multiple disks can be used both with and without disk boundaries. We want to test both cases.
+
+    @Parameterized.Parameters(name = "useDiskBoundaries {0}")
+    public static Iterable<Object[]> params()
+    {
+        return Arrays.asList(new Object[] { false }, new Object[] { true }); // one parameter set per flag value
+    }
+
+    @Parameterized.Parameter
+    public boolean useDiskBoundaries = true;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        BaseCompactionStrategyTest.setUpClass(); // delegate to the static setup of the base test class
+    }
+
+    @Before
+    public void setUp()
+    {
+        super.setUp(); // delegate to the setup of the base test class
+    }
+
+    @Test
+    public void testNoSSTables()
+    {
+        // a strategy that was never given any sstable must offer no background task and no remaining work
+        long minSizeBytes = 2 << 20;
+        Controller mockedController = Mockito.mock(Controller.class);
+        when(mockedController.getNumShards()).thenReturn(1);
+        when(mockedController.getSurvivalFactor()).thenReturn(1.0);
+        when(mockedController.getScalingParameter(anyInt())).thenReturn(4);
+        when(mockedController.getMinSstableSizeBytes()).thenReturn(minSizeBytes);
+        when(mockedController.getBaseSstableSize(anyInt())).thenReturn((double) minSizeBytes);
+        when(mockedController.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(mockedController.maxSSTablesToCompact()).thenReturn(1000);
+        when(mockedController.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(mockedController.random()).thenCallRealMethod();
+        UnifiedCompactionStrategy emptyStrategy = new UnifiedCompactionStrategy(strategyFactory, mockedController);
+
+        assertTrue(emptyStrategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).isEmpty());
+        assertEquals(0, emptyStrategy.getEstimatedRemainingTasks());
+    }
+
+    @Test
+    public void testGetBucketsSameWUniqueArena()
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final Map<Integer, Integer> sstables = new TreeMap<>();
+        // key: sstable size in MB (0, 2, ..., 38); value: how many sstables of that size to mock (2 + random.nextInt(18))
+        for (int i = 0; i < 20; i++)
+        {
+            int numSSTables = 2 + random.nextInt(18);
+            sstables.put(m * i, numSSTables);
+        }
+
+        // W = 3, o = 1 => F = 5, T = 5 => expected T sstables and 2 buckets: 0-10m, 10-50m
+        testGetBucketsOneArena(sstables, new int[] { 3 }, m, new int[] { 5, 5});
+
+        // W = 2, o = 1 => F = 4, T = 4 => expected T sstables and 3 buckets: 0-8m, 8-32m, 32-128m
+        testGetBucketsOneArena(sstables, new int[] { 2 }, m, new int[] { 4, 4, 4});
+
+        // W = 0, o = 1 => F = 2, T = 2 => expected 2 sstables and 5 buckets: 0-4m, 4-8m, 8-16m, 16-32m, 32-64m
+        testGetBucketsOneArena(sstables, new int[] { 0 }, m, new int[] { 2, 2, 2, 2, 2});
+
+        // W = -2, o = 1 => F = 4, T = 2 => expected 2 sstables and 3 buckets: 0-8m, 8-32m, 32-128m
+        testGetBucketsOneArena(sstables, new int[] { -2 }, m, new int[] { 2, 2, 2});
+
+        // W = -3, o = 1 => F = 5, T = 2 => expected 2 sstables and 2 buckets: 0-10m, 10-50m
+        testGetBucketsOneArena(sstables, new int[] { -3 }, m, new int[] { 2, 2});
+
+        // remove sstables from 4m to 8m to create an empty bucket in the next call
+        sstables.remove(4); // 4m
+        sstables.remove(6); // 6m
+        sstables.remove(8); // 8m
+
+        // W = 0, o = 1 => F = 2, T = 2 => expected 2 sstables and 5 buckets: 0-4m, 4-8m, 8-16m, 16-32m, 32-64m
+        testGetBucketsOneArena(sstables, new int[] { 0 }, m, new int[] { 2, 2, 2, 2, 2});
+    }
+
+    @Test
+    public void testGetBucketsDifferentWsUniqueArena()
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final Map<Integer, Integer> sstables = new TreeMap<>();
+        // key: sstable size in MB; value: how many sstables of that size to mock (2 + random.nextInt(18))
+        for (int i : new int[] { 50, 100, 200, 400, 600, 800, 1000})
+        {
+            int numSSTables = 2 + random.nextInt(18);
+            sstables.put(i, numSSTables);
+        }
+
+        // W = [30, 2, -6], o = 1 => F = [32, 4, 8] , T = [32, 4, 2]  => expected 3 buckets: 0-64m, 64-256m, 256-2048m
+        testGetBucketsOneArena(sstables, new int[]{ 30, 2, -6 }, m, new int[] { 32, 4, 2});
+
+        // W = [30, 6, -8], o = 1 => F = [32, 8, 10] , T = [32, 8, 2]  => expected 3 buckets: 0-64m, 64-512m, 512-5120m
+        testGetBucketsOneArena(sstables, new int[]{ 30, 6, -8 }, m, new int[] { 32, 8, 2});
+
+        // W = [0, 0, 0, -2, -2], o = 1 => F = [2, 2, 2, 4, 4] , T = [2, 2, 2, 2, 2]  => expected 6 buckets: 0-4m, 4-8m, 8-16m, 16-64m, 64-256m, 256-1024m
+        testGetBucketsOneArena(sstables, new int[]{ 0, 0, 0, -2, -2 }, m, new int[] { 2, 2, 2, 2, 2, 2});
+    }
+
+    private void testGetBucketsOneArena(Map<Integer, Integer> sstableMap, int[] Ws, int m, int[] expectedTs)
+    {
+        long minimalSizeBytes = (long) m << 20; // MB to bytes; widened before shifting so the shift cannot overflow an int
+        // sstableMap: size in MB -> number of sstables to mock at that size; expectedTs: threshold to check for each bucket
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getNumShards()).thenReturn(1);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.maxSSTablesToCompact()).thenReturn(1000);
+        // indexes past the end of Ws reuse its last scaling parameter
+        when(controller.getScalingParameter(anyInt())).thenAnswer(answer -> {
+            int index = answer.getArgument(0);
+            return Ws[index < Ws.length ? index : Ws.length - 1];
+        });
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.random()).thenCallRealMethod();
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+
+        IPartitioner partitioner = cfs.getPartitioner();
+        DecoratedKey first = new BufferDecoratedKey(partitioner.getMinimumToken(), ByteBuffer.allocate(0));
+        DecoratedKey last = new BufferDecoratedKey(partitioner.getMaximumToken(), ByteBuffer.allocate(0));
+
+        List<SSTableReader> sstables = new ArrayList<>();
+        long dataSetSizeBytes = 0;
+        for (Map.Entry<Integer, Integer> entry : sstableMap.entrySet())
+        {
+            for (int i = 0; i < entry.getValue(); i++)
+            {
+                // we want a number > 0 and < 1 so that the sstable has always some size and never crosses the boundary to the next bucket
+                // so we leave a 1% margin, picking a number from 0.01 to 0.99
+                double rand = 0.01 + 0.98 * random.nextDouble();
+                long sizeOnDiskBytes = (entry.getKey().longValue() << 20) + (long) (minimalSizeBytes * rand);
+                dataSetSizeBytes += sizeOnDiskBytes;
+                sstables.add(mockSSTable(sizeOnDiskBytes, System.currentTimeMillis(), first, last));
+            }
+        }
+        dataTracker.addInitialSSTables(sstables);
+
+        Map<UnifiedCompactionStrategy.Shard, List<UnifiedCompactionStrategy.Bucket>> arenas = strategy.getShardsWithBuckets();
+        assertNotNull(arenas);
+        assertEquals(1, arenas.size());
+
+        for (Map.Entry<UnifiedCompactionStrategy.Shard, List<UnifiedCompactionStrategy.Bucket>> entry : arenas.entrySet())
+        {
+            List<UnifiedCompactionStrategy.Bucket> buckets = entry.getValue();
+            assertEquals(expectedTs.length, buckets.size());
+            // a bucket must offer a compaction aggregate exactly when it holds at least its expected threshold of sstables
+            for (int i = 0; i < expectedTs.length; i++)
+            {
+                UnifiedCompactionStrategy.Bucket bucket = buckets.get(i);
+                if (bucket.sstables.size() >= expectedTs[i])
+                    assertFalse(bucket.getCompactionAggregate(entry.getKey(), Collections.emptySet(), controller, dataSetSizeBytes).isEmpty());
+                else
+                    assertTrue(bucket.getCompactionAggregate(entry.getKey(), Collections.emptySet(), controller, dataSetSizeBytes).isEmpty());
+            }
+        }
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingTriggered_maxSpaceOverhead1pct()
+    {
+        testLimitOversizedCompactions(true, 0.01);
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingTriggered_maxSpaceOverhead10pct()
+    {
+        testLimitOversizedCompactions(true, 0.1);
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingTriggered_maxSpaceOverhead20pct()
+    {
+        testLimitOversizedCompactions(true, 0.2);
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingTriggered_maxSpaceOverhead50pct()
+    {
+        testLimitOversizedCompactions(true, 0.5);
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingTriggered_maxSpaceOverhead90pct()
+    {
+        testLimitOversizedCompactions(true, 0.9);
+    }
+
+    void testLimitOversizedCompactions(boolean triggerOversizedLimiting, double maxSpaceOverhead)
+    {
+        testLimitCompactions(1000, true, triggerOversizedLimiting, maxSpaceOverhead);
+    }
+
+    @Test
+    public void testLimitCompactions_noLimiting()
+    {
+        testLimitCompactionsCount(true, 1000);
+    }
+
+    @Test
+    public void testLimitCompactionsCount_1()
+    {
+        testLimitCompactionsCount(false, 1);
+    }
+
+    @Test
+    public void testLimitCompactionsCount_3()
+    {
+        testLimitCompactionsCount(false, 3);
+    }
+
+    @Test
+    public void testLimitCompactionsCount_PerLevel_1()
+    {
+        testLimitCompactionsCount(true, 1);
+    }
+
+    @Test
+    public void testLimitCompactionsCount_PerLevel_5()
+    {
+        testLimitCompactionsCount(true, 5);
+    }
+
+    @Test
+    public void testLimitCompactionsCount_PerLevel_11()
+    {
+        testLimitCompactionsCount(true, 11);
+    }
+
+    void testLimitCompactionsCount(boolean topLevelOnly, int count)
+    {
+        testLimitCompactions(count, topLevelOnly, false, 1.0);
+    }
+
+    public void testLimitCompactions(int maxCount, boolean topLevelOnly, boolean triggerOversizedLimiting, double maxSpaceOverhead)
+    {
+        final int numBuckets = 4;
+        UnifiedCompactionStrategy strategy = prepareStrategyWithLimits(maxCount,
+                                                                       topLevelOnly,
+                                                                       triggerOversizedLimiting,
+                                                                       maxSpaceOverhead,
+                                                                       Double.MAX_VALUE,
+                                                                       numBuckets);
+
+        int numShards = strategy.getController().getNumShards();
+        // Without limiting oversized compactions kicking in, we expect one compaction per shard, otherwise we expect
+        // a fraction of the number of all shards, proportional to the max allowed space amplification fraction.
+        int expectedCompactionTasks = triggerOversizedLimiting
+                                      ? (int) (Math.floor(numShards * maxSpaceOverhead))
+                                      : topLevelOnly
+                                        ? Math.min((maxCount + numBuckets - 1) / numBuckets, numShards)
+                                        : Math.min(maxCount, numShards);
+        // TODO: Check that a warning was issued if space overhead limit was too low.
+        assertEquals(expectedCompactionTasks, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+    }
+
+    @Test
+    public void testPreserveLayout_W2_947()
+    {
+        testPreserveLayout(2, 947);
+    }
+
+    @Test
+    public void testPreserveLayout_WM2_947()
+    {
+        testPreserveLayout(-2, 947);
+    }
+
+    @Test
+    public void testPreserveLayout_W2_251()
+    {
+        testPreserveLayout(2, 251);
+    }
+
+    @Test
+    public void testPreserveLayout_WM2_251()
+    {
+        testPreserveLayout(-2, 251);
+    }
+
+    @Test
+    public void testPreserveLayout_W2_320()
+    {
+        testPreserveLayout(2, 320);
+    }
+
+    @Test
+    public void testPreserveLayout_WM2_320()
+    {
+        testPreserveLayout(-2, 320);
+    }
+
+    @Test
+    public void testPreserveLayout_WM2_947_128()
+    {
+        testLayout(-2, 947, 128);
+    }
+
+    @Test
+    public void testPreserveLayout_WM2_947_64()
+    {
+        testLayout(-2, 947, 64);
+    }
+
+    public void testPreserveLayout(int W, int numSSTables)
+    {
+        testLayout(W, numSSTables, 10000);
+    }
+
+    @Test
+    public void testMaxSSTablesToCompact()
+    {
+        testLayout(2, 944,  60);
+        testLayout(2, 944, 1000);
+        testLayout(2, 944,  100);
+        testLayout(2, 803,  200);
+    }
+
+    public void testLayout(int W, int numSSTables, int maxSSTablesToCompact)
+    {
+        int F = 2 + Math.abs(W);
+        int T = W < 0 ? 2 : F;
+        final long minSstableSizeBytes = 2L << 20; // 2 MB
+        final int numShards = 1;
+        final int levels = (int) Math.floor(Math.log(numSSTables) / Math.log(F)) + 1;
+        // F and T above are derived from the scaling parameter W: F = 2 + |W|, and T = 2 when W < 0, otherwise T = F
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minSstableSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(W);
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(numShards);
+        when(controller.getMaxSpaceOverhead()).thenReturn(1.0);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minSstableSizeBytes);
+
+        if (maxSSTablesToCompact >= numSSTables)
+            when(controller.maxConcurrentCompactions()).thenReturn(levels * (W < 0 ? 1 : F)); // make sure the work is assigned to different levels
+        else
+            when(controller.maxConcurrentCompactions()).thenReturn(1000); // presumably high enough to never be the limiting factor in this branch - TODO confirm
+
+        when(controller.maxCompactionSpaceBytes()).thenCallRealMethod();
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.getDataSetSizeBytes()).thenReturn(minSstableSizeBytes * numSSTables * numShards);
+        when(controller.maxSSTablesToCompact()).thenReturn(maxSSTablesToCompact);
+        Random random = Mockito.mock(Random.class);
+        when(random.nextInt(anyInt())).thenReturn(0);
+        when(controller.random()).thenReturn(random);
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        List<SSTableReader> allSstables = new ArrayList<>();
+
+        List<SSTableReader> sstables = mockSSTables(numSSTables,
+                                                    minSstableSizeBytes,
+                                                    0,
+                                                    System.currentTimeMillis(),
+                                                    0,
+                                                    true,
+                                                    null);
+        allSstables.addAll(sstables);
+        dataTracker.addInitialSSTables(allSstables);
+        // keep requesting background tasks until none are returned, checking the number of sstables in every task
+        int num = numSSTables;
+        Collection<AbstractCompactionTask> tasks;
+        boolean headerPrinted = false;
+        while (true)
+        {
+            tasks = strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+            if (tasks.isEmpty())
+                break;
+
+            for (CompactionAggregate aggregate : strategy.getAggregates())
+            {
+                if (!headerPrinted)
+                    System.out.println(aggregate.getStatistics().header());
+                headerPrinted = true;
+                System.out.println(aggregate.getStatistics().data());
+            }
+            // the test expects layout-preserving task sizes only while min(num, maxSSTablesToCompact) exceeds F * F
+            boolean layout = Math.min(num, maxSSTablesToCompact) > F * F;
+            int limit;
+            if (layout)
+            {
+                int forLimitLevel = (int) (Math.pow(F, Math.floor(Math.log(maxSSTablesToCompact) / Math.log(F))));
+                // for clarification see W < 0 case in layoutCompactions method
+                limit = W < 0 ? maxSSTablesToCompact / forLimitLevel * forLimitLevel : forLimitLevel;
+            }
+            else
+                limit = maxSSTablesToCompact;
+
+            for (AbstractCompactionTask task : tasks)
+            {
+                int expected = num;
+                if (layout)
+                {
+                    int forTopLevel = (int) (Math.pow(F, Math.floor(Math.log(num) / Math.log(F))));
+                    expected = W > 0
+                               ? forTopLevel
+                               : num / forTopLevel * forTopLevel;
+
+                }
+                expected = Math.min(expected, limit);
+
+                int count = task.transaction.originals().size();
+                assertEquals(expected, count);
+                num -= count;
+            }
+        }
+        // Check that we issue all the compactions
+        assertTrue(num < T);
+    }
+
+    @Test
+    public void testLimitCompactionsThroughput_1()
+    {
+        testLimitCompactionsThroughput(1000, 1);
+    }
+
+    @Test
+    public void testLimitCompactionsThroughput_3()
+    {
+        testLimitCompactionsThroughput(1000, 3);
+    }
+
+    @Test
+    public void testOversizedCompactions_limitingNotTriggered()
+    {
+        testLimitOversizedCompactions(false, 1.0);
+    }
+
+    void testLimitCompactionsThroughput(int maxCount, int maxThroughput)
+    {
+        UnifiedCompactionStrategy strategy = prepareStrategyWithLimits(maxCount, false, false, 1.0, maxThroughput, 4);
+
+        // first call should return a pilot task
+        assertEquals(1, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+        // if task hasn't progressed, no new tasks should be produced
+        assertEquals(0, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+        for (CompactionPick pick : strategy.backgroundCompactions.getCompactionsInProgress())
+            strategy.backgroundCompactions.onInProgress(mockProgress(strategy, pick.id));
+
+        // now that we have a rate, make sure we produce tasks to fill up the limit
+        assertEquals(Math.min(maxThroughput, maxCount) - 1, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+        // and don't create any new ones when the limit is filled, before they make progress
+        assertEquals(0, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+        for (CompactionPick pick : strategy.backgroundCompactions.getCompactionsInProgress())
+            if (pick.progress == null)
+                strategy.backgroundCompactions.onInProgress(mockProgress(strategy, pick.id));
+
+        // and also when they do
+        assertEquals(0, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+        for (int remaining = strategy.getController().getNumShards() - Math.min(maxThroughput, maxCount);
+             remaining > 0;
+             --remaining)
+        {
+            // mark a task as completed
+            strategy.backgroundCompactions.onCompleted(strategy, Iterables.get(strategy.backgroundCompactions.getCompactionsInProgress(), 0).id);
+
+            // and check that we get a new one
+            assertEquals(1, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+        }
+    }
+
+    private UnifiedCompactionStrategy prepareStrategyWithLimits(int maxCount,
+                                                                boolean topBucketOnly,
+                                                                boolean triggerOversizedLimiting,
+                                                                double maxSpaceOverhead,
+                                                                double maxThroughput,
+                                                                int numBuckets)
+    {
+        int W = 2; // W = 2 => T = F = 4
+        int T = 4;
+        int F = 4;
+        final long minSstableSizeBytes = 2L << 20; // 2 MB
+        final int numShards = 5;
+        // NOTE(review): triggerOversizedLimiting is never read in this method; only maxSpaceOverhead is passed on to the controller
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minSstableSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(W);
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(numShards);
+        when(controller.getMaxSpaceOverhead()).thenReturn(maxSpaceOverhead);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minSstableSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(maxCount);
+        when(controller.maxCompactionSpaceBytes()).thenCallRealMethod();
+        when(controller.maxThroughput()).thenReturn(maxThroughput);
+        when(controller.maxSSTablesToCompact()).thenReturn(1000);
+        // Calculate the minimum shard size such that the top bucket compactions won't be considered "oversized" and
+        // all will be allowed to run. The calculation below assumes (1) that compactions are considered "oversized"
+        // if they are more than 1/2 of the max shard size; (2) that mockSSTables uses 15% less than the max SSTable
+        // size for that bucket.
+        long topBucketMaxSstableSize = (long) (minSstableSizeBytes * Math.pow(F, numBuckets));
+        long topBucketMaxCompactionSize = T * topBucketMaxSstableSize;
+        when(controller.getDataSetSizeBytes()).thenReturn(topBucketMaxCompactionSize * numShards);
+        when(controller.random()).thenCallRealMethod();
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        List<SSTableReader> allSstables = new ArrayList<>();
+        // buckets are populated from the top one (largest sstables, i == numBuckets) down to the lowest one
+        for (int i = numBuckets; i > 0; i--)
+        {
+            // With topBucketOnly, only the top bucket gets T sstables per shard; every other bucket gets T - 1
+            int numSstables = (!topBucketOnly || i == numBuckets) ? T : T - 1;
+            long size = (long) (minSstableSizeBytes * Math.pow(F, i));
+            // Simulate shards by using different disk indexes
+            for (int j = numShards; j > 0; j--)
+            {
+                List<SSTableReader> sstables = mockSSTables(numSstables,
+                                                            size,
+                                                            0,
+                                                            System.currentTimeMillis(),
+                                                            j - 1,
+                                                            true,
+                                                            null);
+                allSstables.addAll(sstables);
+            }
+        }
+        dataTracker.addInitialSSTables(allSstables);
+        return strategy;
+    }
+
+    private CompactionProgress mockProgress(UnifiedCompactionStrategy strategy, UUID id)
+    {
+        CompactionProgress progress = Mockito.mock(CompactionProgress.class);
+        when(progress.durationInNanos()).thenReturn(1000L*1000*1000);
+        when(progress.outputDiskSize()).thenReturn(1L);
+        when(progress.operationId()).thenReturn(id);
+        return progress;
+    }
+
+    private static final class ArenaSpecs
+    {
+        private final List<SSTableReader> sstables; // mock sstables belonging to this arena; starts empty and is filled by mockArena
+        private final int[] expectedBuckets; // expectation supplied by the test; presumably the sstable count per bucket - verify against the checker
+
+        ArenaSpecs(int[] expectedBuckets)
+        {
+            this.sstables = new ArrayList<>();
+            this.expectedBuckets = expectedBuckets;
+        }
+    }
+
+    private ArenaSpecs mockArena(Token min,
+                                 Token max,
+                                 Map<Long, Integer> sstables,
+                                 boolean repaired,
+                                 UUID pendingRepair,
+                                 int diskIndex,
+                                 int[] expectedBuckets)
+    {
+        ArenaSpecs arena = new ArenaSpecs(expectedBuckets);
+        ByteBuffer bb = ByteBuffer.allocate(0);
+        // for every (size, num) entry, add num mock sstables of that size to the arena
+        sstables.forEach((size, num) -> {
+            // Generate a key inside the shard, but make sure it's not too close to the boundaries to compensate for
+            // rounding differences between splitting directly and splitting first by disk and then by shard.
+            Token first = min.getPartitioner().split(min, max, 0.01 + random.nextDouble() * 0.98);
+            Token last = min.getPartitioner().split(min, max, 0.99);
+            // within one entry every sstable ends at the same last token but starts at a slightly larger first token
+            for (int i = 0; i < num; i++)
+            {
+                arena.sstables.add(mockSSTable(0,
+                                               size,
+                                               System.currentTimeMillis(),
+                                               0.0,
+                                               new BufferDecoratedKey(first, bb),
+                                               new BufferDecoratedKey(last, bb),
+                                               diskIndex,
+                                               repaired,
+                                               pendingRepair,
+                                               0));
+                first = first.increaseSlightly();
+            }
+        });
+
+        return arena;
+    }
+
+    private List<PartitionPosition> makeBoundaries(int numShards, int numDisks)
+    {
+        IPartitioner partitioner = cfs.getPartitioner();
+        assert numShards >= 1;
+        assert numDisks >= 1;
+        // one shard on one disk: the only boundary is the max key bound of the maximum token (diskBoundaryPositions is left untouched)
+        if (numShards * numDisks == 1)
+            return ImmutableList.of(partitioner.getMaximumToken().maxKeyBound());
+
+        Splitter splitter = partitioner.splitter().orElse(null);
+        assertNotNull("The partitioner must support a splitter", splitter);
+        // without disk boundaries numDisks does not affect the split: the full token range is cut into numShards parts only
+        int numBoundaries = useDiskBoundaries ? numDisks * numShards : numShards;
+        Splitter.WeightedRange range = new Splitter.WeightedRange(1.0, new Range<>(partitioner.getMinimumToken(), partitioner.getMaximumToken()));
+        final List<PartitionPosition> shards = splitter.splitOwnedRanges(numBoundaries, ImmutableList.of(range), Splitter.SplitType.ALWAYS_SPLIT)
+                                               .boundaries
+                                               .stream()
+                                               .map(Token::maxKeyBound)
+                                               .collect(Collectors.toList());
+        if (useDiskBoundaries)
+        {
+            diskBoundaryPositions = new ArrayList<>(numDisks);
+            for (int i = 0; i < numDisks; ++i)
+                diskBoundaryPositions.add(shards.get((i + 1) * numShards - 1)); // the last shard boundary of disk i is also that disk's boundary
+        }
+        return shards;
+    }
+
+    private List<ArenaSpecs> mockArenas(int diskIndex,
+                                        int diskCount,
+                                        boolean repaired,
+                                        UUID pendingRepair,
+                                        List<PartitionPosition> boundaries,
+                                        Map<Long, Integer> sstables,
+                                        int[] buckets)
+    {
+        List<ArenaSpecs> arenasList = new ArrayList<>();
+        // with disk boundaries each disk covers its own slice of numShards consecutive boundaries; otherwise all boundaries are used
+        int numShards = boundaries.size() / diskCount;
+        List<PartitionPosition> shardPositions = useDiskBoundaries
+                                                 ? boundaries.subList(diskIndex * numShards, (diskIndex + 1) * numShards)
+                                                 : boundaries;
+        Token min = useDiskBoundaries && diskIndex > 0
+                    ? boundaries.get(diskIndex * numShards - 1).getToken()
+                    : partitioner.getMinimumToken();
+        // build one arena per shard boundary, walking the token range from min upwards
+        for (PartitionPosition boundary : shardPositions)
+        {
+            Token max = boundary.getToken();
+
+            // what matters is the first key, which must be less than max
+            arenasList.add(mockArena(min, max, sstables, repaired, pendingRepair, diskIndex, buckets));
+
+            min = max; // the next shard starts where this one ends
+        }
+
+        return arenasList;
+    }
+
+    @SafeVarargs
+    private static Map<Long, Integer> mapFromPair(Pair<Long, Integer> ... pairs)
+    {
+        // copies each pair into a map (left -> right); the varargs array is only read, hence @SafeVarargs
+        Map<Long, Integer> ret = new HashMap<>();
+        for (Pair<Long, Integer> pair : pairs)
+            ret.put(pair.left, pair.right);
+
+        return ret;
+    }
+
+    @Test
+    public void testAllArenasOneBucket_NoShards()
+    {
+        testAllArenasOneBucket(1);
+    }
+
+    @Test
+    public void testAllArenasOneBucket_MultipleShards()
+    {
+        testAllArenasOneBucket(5);
+    }
+
+    private void testAllArenasOneBucket(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 2);
+        List<ArenaSpecs> arenasList = new ArrayList<>();
+
+        Map<Long, Integer> sstables = mapFromPair(Pair.create(4 * ONE_MB, 4));
+        int[] buckets = new int[]{4};
+
+        UUID pendingRepair = UUID.randomUUID();
+        arenasList.addAll(mockArenas(0, 2, false, pendingRepair, boundaries, sstables, buckets)); // pending repair
+
+        arenasList.addAll(mockArenas(0, 2, false, null, boundaries, sstables, buckets)); // unrepaired
+        arenasList.addAll(mockArenas(1, 2, false, null, boundaries, sstables, buckets)); // unrepaired, next disk
+
+        arenasList.addAll(mockArenas(0, 2, true, null, boundaries, sstables, buckets)); // repaired
+        arenasList.addAll(mockArenas(1, 2, true, null, boundaries, sstables, buckets)); // repaired, next disk
+
+        testGetBucketsMultipleArenas(arenasList, W, m, boundaries);
+    }
+
+    @Test
+    public void testRepairedOneDiskOneBucket_NoShards()
+    {
+        testRepairedOneDiskOneBucket(1);
+    }
+
+    @Test
+    public void testRepairedOneDiskOneBucket_MultipleShards()
+    {
+        testRepairedOneDiskOneBucket(5);
+    }
+
+    private void testRepairedOneDiskOneBucket(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        Map<Long, Integer> sstables = mapFromPair(Pair.create(4 * ONE_MB, 4));
+        int[] buckets = new int[]{4};
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 1);
+        List<ArenaSpecs> arenas = mockArenas(0, 1, true, null, boundaries, sstables, buckets);
+        testGetBucketsMultipleArenas(arenas, W, m, boundaries);
+    }
+
+    @Test
+    public void testRepairedTwoDisksOneBucket_NoShards()
+    {
+        testRepairedTwoDisksOneBucket(1);
+    }
+
+    @Test
+    public void testRepairedTwoDisksOneBucket_MultipleShards()
+    {
+        testRepairedTwoDisksOneBucket(5);
+    }
+
+    private void testRepairedTwoDisksOneBucket(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        Map<Long, Integer> sstables = mapFromPair(Pair.create(4 * ONE_MB, 4));
+        int[] buckets = new int[]{4};
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 2);
+        List<ArenaSpecs> arenas = new ArrayList<>();
+
+        arenas.addAll(mockArenas(0, 2, true, null, boundaries, sstables, buckets));
+        arenas.addAll(mockArenas(1, 2, true, null, boundaries, sstables, buckets));
+
+        testGetBucketsMultipleArenas(arenas, W, m, boundaries);
+    }
+
+    @Test
+    public void testRepairedMultipleDisksMultipleBuckets_NoShards()
+    {
+        testRepairedMultipleDisksMultipleBuckets(1);
+    }
+
+    @Test
+    public void testRepairedMultipleDisksMultipleBuckets_MultipleShards()
+    {
+        testRepairedMultipleDisksMultipleBuckets(15);
+    }
+
+    private void testRepairedMultipleDisksMultipleBuckets(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 6);
+        List<ArenaSpecs> arenasList = new ArrayList<>();
+
+        Map<Long, Integer> sstables1 = mapFromPair(Pair.create(2 * ONE_MB, 4), Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 4));
+        int[] buckets1 = new int[]{4,4,4};
+
+        Map<Long, Integer> sstables2 = mapFromPair(Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 8));
+        int[] buckets2 = new int[]{0,4,8};
+
+        for (int i = 0; i < 6; i++)
+        {
+            if (i % 2 == 0)
+                arenasList.addAll(mockArenas(i, 6, true, null, boundaries, sstables1, buckets1));
+            else
+                arenasList.addAll(mockArenas(i, 6, true, null, boundaries, sstables2, buckets2));
+
+        }
+
+        testGetBucketsMultipleArenas(arenasList, W, m, boundaries);
+    }
+
+    @Test
+    public void testRepairedUnrepairedOneDiskMultipleBuckets_NoShards()
+    {
+        testRepairedUnrepairedOneDiskMultipleBuckets(1);
+    }
+
+    @Test
+    public void testRepairedUnrepairedOneDiskMultipleBuckets_MultipleShards()
+    {
+        testRepairedUnrepairedOneDiskMultipleBuckets(10);
+    }
+
+    private void testRepairedUnrepairedOneDiskMultipleBuckets(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 1);
+        List<ArenaSpecs> arenasList = new ArrayList<>();
+
+        Map<Long, Integer> sstables1 = mapFromPair(Pair.create(2 * ONE_MB, 4), Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 4));
+        int[] buckets1 = new int[]{4,4,4};
+
+        Map<Long, Integer> sstables2 = mapFromPair(Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 8));
+        int[] buckets2 = new int[]{0,4,8};
+
+        arenasList.addAll(mockArenas(0, 1, true, null, boundaries, sstables2, buckets2)); // repaired
+        arenasList.addAll(mockArenas(0, 1, false, null, boundaries, sstables1, buckets1)); // unrepaired
+
+        testGetBucketsMultipleArenas(arenasList, W, m, boundaries);
+    }
+
+    @Test
+    public void testRepairedUnrepairedTwoDisksMultipleBuckets_NoShards()
+    {
+        testRepairedUnrepairedTwoDisksMultipleBuckets(1);
+    }
+
+    @Test
+    public void testRepairedUnrepairedTwoDisksMultipleBuckets_MultipleShards()
+    {
+        testRepairedUnrepairedTwoDisksMultipleBuckets(5);
+    }
+
+    private void testRepairedUnrepairedTwoDisksMultipleBuckets(int numShards)
+    {
+        final int m = 2; // minimal sorted run size in MB
+        final int W = 2; // => o = 1 => F = 4, T = 4: 0-8m, 8-32m, 32-128m
+
+        List<PartitionPosition> boundaries = makeBoundaries(numShards, 2);
+        List<ArenaSpecs> arenasList = new ArrayList<>();
+
+        Map<Long, Integer> sstables1 = mapFromPair(Pair.create(2 * ONE_MB, 4), Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 4));
+        int[] buckets1 = new int[]{4,4,4};
+
+        Map<Long, Integer> sstables2 = mapFromPair(Pair.create(8 * ONE_MB, 4), Pair.create(32 * ONE_MB, 8));
+        int[] buckets2 = new int[]{0,4,8};
+
+        arenasList.addAll(mockArenas(0, 2, true, null, boundaries, sstables2, buckets2));  // repaired, first disk
+        arenasList.addAll(mockArenas(1, 2, true, null, boundaries, sstables1, buckets1));  // repaired, second disk
+
+        arenasList.addAll(mockArenas(0, 2, false, null, boundaries, sstables1, buckets1));  // unrepaired, first disk
+        arenasList.addAll(mockArenas(1, 2, false, null, boundaries, sstables2, buckets2));  // unrepaired, second disk
+
+        testGetBucketsMultipleArenas(arenasList, W, m, boundaries);
+    }
+
+    private void testGetBucketsMultipleArenas(List<ArenaSpecs> arenaSpecs, int W, int m, List<PartitionPosition> shards)
+    { // W is the scaling parameter and m the minimal sstable size in MB that the mocked controller reports
+        long minimalSizeBytes = (long) m << 20; // widen to long before shifting so a large m cannot overflow int
+
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(W);
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.getNumShards()).thenReturn(shards.size());
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.maxSSTablesToCompact()).thenReturn(1000);
+        when(controller.random()).thenCallRealMethod();
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+
+        List<SSTableReader> sstables = arenaSpecs.stream().flatMap(a -> a.sstables.stream()).collect(Collectors.toList());
+        dataTracker.addInitialSSTables(sstables); // the strategy is expected to see every sstable of every arena spec
+
+        Map<UnifiedCompactionStrategy.Shard, List<UnifiedCompactionStrategy.Bucket>> arenas = strategy.getShardsWithBuckets();
+        assertNotNull(arenas);
+        assertEquals(arenaSpecs.size(), arenas.size());
+
+        int idx = 0; // NOTE(review): specs are matched by position, which assumes the map iterates in spec order - confirm
+        for (Map.Entry<UnifiedCompactionStrategy.Shard, List<UnifiedCompactionStrategy.Bucket>> entry : arenas.entrySet())
+        {
+            List<UnifiedCompactionStrategy.Bucket> buckets = entry.getValue();
+            ArenaSpecs currentArenaSpecs = arenaSpecs.get(idx++);
+
+            assertEquals(currentArenaSpecs.expectedBuckets.length, buckets.size());
+            for (int i = 0; i < currentArenaSpecs.expectedBuckets.length; i++) // bucket i must hold the expected sstable count
+                assertEquals(currentArenaSpecs.expectedBuckets[i], buckets.get(i).sstables.size());
+        }
+    }
+
+    @Test
+    public void testShardBoundaries()
+    {
+        // no shards
+        testShardBoundaries(ints(100), 1, 1, ints(10, 50));
+        // split on disks at minimum
+        testShardBoundaries(ints(30, 100), 1, 2, ints(10, 50));
+        testShardBoundaries(ints(20, 30, 40, 50, 100), 1, 5, ints(10, 51, 61, 70));
+
+        // no disks
+        testShardBoundaries(ints(30, 100), 2, 1, ints(10, 50));
+        testShardBoundaries(ints(20, 30, 40, 50, 100), 5, 1, ints(10, 51, 61, 70));
+
+        // split
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 60, 70, 80, 100), 9, 3, ints(0, 90));
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 70, 80, 90, 100), 9, 3, ints(0, 51, 61, 100));
+        testShardBoundaries(ints(10, 20, 30, 40, 60, 70, 80, 90, 100), 9, 3, ints(0, 49, 59, 100));
+        testShardBoundaries(ints(12, 23, 33, 45, 56, 70, 80, 90, 100), 9, 3, ints(0, 9, 11, 20, 21, 39, 41, 50, 51, 60, 64, 68, 68, 100));
+
+        // uneven
+        testShardBoundaries(ints(11, 22, 33, 42, 50, 58, 67, 78, 89, 100), 10, 3, ints(0, 100));
+        testShardBoundaries(ints(8, 17, 25, 38, 50, 58, 67, 75, 88, 100), 10, 4, ints(0, 100));
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 60, 70, 80, 90, 100), 10, 5, ints(0, 100));
+        testShardBoundaries(ints(8, 17, 33, 42, 50, 58, 67, 83, 92, 100), 10, 6, ints(0, 100));
+        testShardBoundaries(ints(14, 21, 29, 43, 50, 57, 71, 79, 86, 100), 10, 7, ints(0, 100));
+        testShardBoundaries(ints(13, 19, 25, 38, 50, 63, 69, 75, 88, 100), 10, 8, ints(0, 100));
+        testShardBoundaries(ints(11, 22, 33, 44, 50, 56, 67, 78, 89, 100), 10, 9, ints(0, 100));
+
+        // uneven again, where x0 are the disk boundaries and the others are inserted shard boundaries
+        testShardBoundaries(ints(3, 7, 10, 13, 15, 18, 20, 23, 27, 100), 10, 3, ints(0, 30));
+        testShardBoundaries(ints(3, 7, 10, 15, 20, 23, 27, 30, 35, 100), 10, 4, ints(0, 40));
+        testShardBoundaries(ints(5, 10, 15, 20, 25, 30, 35, 40, 45, 100), 10, 5, ints(0, 50));
+        testShardBoundaries(ints(5, 10, 20, 25, 30, 35, 40, 50, 55, 100), 10, 6, ints(0, 60));
+        testShardBoundaries(ints(10, 15, 20, 30, 35, 40, 50, 55, 60, 100), 10, 7, ints(0, 70));
+        testShardBoundaries(ints(10, 15, 20, 30, 40, 50, 55, 60, 70, 100), 10, 8, ints(0, 80));
+        testShardBoundaries(ints(10, 20, 30, 40, 45, 50, 60, 70, 80, 100), 10, 9, ints(0, 90));
+    }
+
+    @Test
+    public void testShardBoundariesWraparound()
+    {
+        // no shards
+        testShardBoundaries(ints(100), 1, 1, ints(50, 10));
+        // split on disks at minimum
+        testShardBoundaries(ints(70, 100), 1, 2, ints(50, 10));
+        testShardBoundaries(ints(10, 20, 30, 70, 100), 1, 5, ints(91, 31, 61, 71));
+        // no disks
+        testShardBoundaries(ints(70, 100), 2, 1, ints(50, 10));
+        testShardBoundaries(ints(10, 20, 30, 70, 100), 5, 1, ints(91, 31, 61, 71));
+        // split
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 60, 70, 90, 100), 9, 3, ints(81, 71));
+        testShardBoundaries(ints(10, 20, 30, 40, 60, 70, 80, 90, 100), 9, 3, ints(51, 41));
+        testShardBoundaries(ints(10, 30, 40, 50, 60, 70, 80, 90, 100), 9, 3, ints(21, 11));
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 60, 70, 90, 100), 9, 3, ints(89, 79));
+        testShardBoundaries(ints(10, 20, 30, 40, 60, 70, 80, 90, 100), 9, 3, ints(59, 49));
+        testShardBoundaries(ints(10, 30, 40, 50, 60, 70, 80, 90, 100), 9, 3, ints(29, 19));
+
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 70, 80, 90, 100), 9, 3, ints(91, 51, 61, 91));
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 70, 80, 90, 100), 9, 3, ints(21, 51, 61, 21));
+        testShardBoundaries(ints(10, 20, 30, 40, 50, 70, 80, 90, 100), 9, 3, ints(71, 51, 61, 71));
+    }
+
+    private int[] ints(int... values)
+    {
+        return values;
+    }
+
+    private void testShardBoundaries(int[] expected, int numShards, int numDisks, int[] rangeBounds)
+    { // expected and rangeBounds are percentages of the token range, converted through getToken() and fromToken()
+        IPartitioner partitioner = Murmur3Partitioner.instance;
+        List<Splitter.WeightedRange> ranges = new ArrayList<>();
+        for (int i = 0; i < rangeBounds.length; i += 2) // rangeBounds is read in pairs: [i] and [i + 1] bound one range
+            ranges.add(new Splitter.WeightedRange(1.0, new Range<>(getToken(rangeBounds[i + 0]), getToken(rangeBounds[i + 1]))));
+        SortedLocalRanges sortedRanges = SortedLocalRanges.forTesting(cfs, ranges);
+
+        List<PartitionPosition> diskBoundaries = sortedRanges.split(numDisks); // disk boundaries: local ranges split numDisks ways
+
+        int[] result = UnifiedCompactionStrategy.computeShardBoundaries(sortedRanges, diskBoundaries, numShards, partitioner)
+                                                .stream()
+                                                .map(PartitionPosition::getToken)
+                                                .mapToInt(this::fromToken)
+                                                .toArray();
+
+        Assert.assertArrayEquals("Disks " + numDisks + " shards " + numShards + " expected " + Arrays.toString(expected) + " was " + Arrays.toString(result), expected, result);
+    }
+
+    private Token getToken(int x)
+    { // x is a percentage: the token is found by splitting [minimum token, maximum token] at ratio x * 0.01
+        IPartitioner partitioner = Murmur3Partitioner.instance;
+        return partitioner.split(partitioner.getMinimumToken(), partitioner.getMaximumToken(), x * 0.01);
+    }
+
+    private int fromToken(Token t)
+    { // turns a token back into a rounded percentage; presumably size(t) is the fraction of the range from minimum to t
+        IPartitioner partitioner = Murmur3Partitioner.instance;
+        return (int) Math.round(partitioner.getMinimumToken().size(t) * 100.0);
+    }
+
+    @Test
+    public void testGetNextBackgroundTasks()
+    {
+        assertCompactionTask(1, 3, CompactionTask.class);
+        assertCompactionTask(3, 3, UnifiedCompactionTask.class);
+    }
+
+    private void assertCompactionTask(final int numShards, final int expectedNumOfTasks, Class<?> expectedClass)
+    { // checks that a strategy with numShards shards returns expectedNumOfTasks background tasks, each of expectedClass
+        Controller controller = Mockito.mock(Controller.class);
+        long minimalSizeBytes = 2 << 20;
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(0);
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(numShards);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.maxSSTablesToCompact()).thenReturn(1000);
+        when(controller.random()).thenCallRealMethod();
+
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+
+        IPartitioner partitioner = cfs.getPartitioner();
+
+        List<SSTableReader> sstables = createSStables(partitioner);
+
+        dataTracker.addInitialSSTables(sstables);
+
+        Collection<AbstractCompactionTask> tasks = strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds());
+        assertEquals("Expecting number of next background tasks:", expectedNumOfTasks, tasks.size());
+        for (AbstractCompactionTask task : tasks)
+        {
+            assertSame(expectedClass, task.getClass()); // exact class match, subclasses are not accepted
+        }
+    }
+
+    private List<SSTableReader> createSStables(IPartitioner partitioner)
+    {
+        return createSStables(partitioner, mapFromPair(Pair.create(4 * ONE_MB, 4)), 10000, UUID.randomUUID());
+    }
+
+    private List<SSTableReader> createSStables(IPartitioner partitioner, int ttl, UUID pendingRepair)
+    {
+        return createSStables(partitioner, mapFromPair(Pair.create(4 * ONE_MB, 4)), ttl, pendingRepair);
+    }
+
+    private List<SSTableReader> createSStables(IPartitioner partitioner, Map<Long, Integer> sstablesMap)
+    {
+        return createSStables(partitioner, sstablesMap, 10000, UUID.randomUUID());
+    }
+
+    private List<SSTableReader> createSStables(IPartitioner partitioner,
+                                               Map<Long, Integer> sstablesMap,
+                                               int ttl,
+                                               UUID pendingRepair)
+    { // for each (size -> num) entry, mocks num sstables in each of three groups: pending repair, unrepaired, repaired
+        List<SSTableReader> mockSSTables = new ArrayList<>();
+        Token min = partitioner.getMinimumToken();
+        Token max = partitioner.getMaximumToken();
+        ByteBuffer bb = ByteBuffer.allocate(0);
+        sstablesMap.forEach((size, num) -> {
+            Token first = min.getPartitioner().split(min, max, 0.01 + random.nextDouble() * 0.98); // random start per entry
+
+            for (int i = 0; i < num; i++)
+            {
+                // pending repair
+                mockSSTables.add(mockSSTable(0,
+                                             size,
+                                             System.currentTimeMillis(),
+                                             0.0,
+                                             new BufferDecoratedKey(first, bb),
+                                             new BufferDecoratedKey(max, bb),
+                                             0,
+                                             false,
+                                             pendingRepair,
+                                             ttl));
+                first = first.increaseSlightly(); // presumably moves on so consecutive sstables get distinct first keys
+            }
+
+            for (int i = 0; i < num; i++)
+            {
+                // unrepaired
+                mockSSTables.add(mockSSTable(0,
+                                             size,
+                                             System.currentTimeMillis(),
+                                             0.0,
+                                             new BufferDecoratedKey(first, bb),
+                                             new BufferDecoratedKey(max, bb),
+                                             0,
+                                             false,
+                                             null,
+                                             ttl));
+                first = first.increaseSlightly();
+            }
+
+            for (int i = 0; i < num; i++)
+            {
+                // repaired
+                mockSSTables.add(mockSSTable(0,
+                                             size,
+                                             System.currentTimeMillis(),
+                                             0.0,
+                                             new BufferDecoratedKey(first, bb),
+                                             new BufferDecoratedKey(max, bb),
+                                             0,
+                                             true,
+                                             null,
+                                             ttl));
+                first = first.increaseSlightly();
+            }
+        });
+        return mockSSTables; // holds 3 * num mocks for every entry of sstablesMap
+    }
+
+    @Test
+    public void testDropExpiredSSTables1Shard()
+    {
+        testDropExpiredFromBucket(1);
+        testDropExpiredAndCompactNonExpired();
+    }
+
+    @Test
+    public void testDropExpiredSSTables3Shards()
+    {
+        testDropExpiredFromBucket(3);
+    }
+
+    private void testDropExpiredFromBucket(int numShards)
+    {
+        Controller controller = Mockito.mock(Controller.class);
+        long minimalSizeBytes = 2 << 20;
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(3); // T=5
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(numShards);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.getIgnoreOverlapsInExpirationCheck()).thenReturn(false);
+        when(controller.random()).thenCallRealMethod();
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        strategy.startup();
+
+        List<SSTableReader> sstables = createSStables(cfs.getPartitioner());
+        // Tracker#addSSTables also tries to backup SSTables, so we use addInitialSSTables and notify explicitly
+        dataTracker.addInitialSSTables(sstables);
+
+        try
+        {
+            // nothing to compact yet
+            assertEquals(0, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size());
+
+            int timestamp = sstables.get(sstables.size() - 1).getMaxLocalDeletionTime();
+            int expirationPoint = timestamp + 1;
+
+            assertEquals(3, strategy.getNextBackgroundTasks(expirationPoint).size()); // repaired, unrepaired, pending
+            Collection<CompactionPick> picks = strategy.backgroundCompactions.getCompactionsInProgress();
+            for (CompactionPick pick : picks)
+            {
+                // expired SSTables don't contribute to total size
+                assertTrue(pick.hasExpiredOnly());
+                assertEquals(sstables.size() / 3, pick.expired.size());
+                assertEquals(0L, pick.totSizeInBytes);
+                assertEquals(0L, pick.avgSizeInBytes);
+                assertEquals(0, pick.parent);
+            }
+        }
+        finally
+        {
+            strategy.shutdown();
+            dataTracker.dropSSTables();
+        }
+    }
+
+    private void testDropExpiredAndCompactNonExpired()
+    {
+        Controller controller = Mockito.mock(Controller.class);
+        long minimalSizeBytes = 2 << 20;
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getScalingParameter(anyInt())).thenReturn(2);
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(1);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.getIgnoreOverlapsInExpirationCheck()).thenReturn(false);
+        when(controller.random()).thenCallRealMethod();
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        strategy.startup();
+
+        UUID pendingRepair = UUID.randomUUID();
+        List<SSTableReader> expiredSSTables = createSStables(cfs.getPartitioner(), 1000, pendingRepair);
+        List<SSTableReader> nonExpiredSSTables = createSStables(cfs.getPartitioner(), 0, pendingRepair);
+        List<SSTableReader> allSSTables = Stream.concat(expiredSSTables.stream(), nonExpiredSSTables.stream())
+                                                .collect(Collectors.toList());
+        dataTracker.addInitialSSTables(allSSTables);
+
+        int timestamp = expiredSSTables.get(expiredSSTables.size() - 1).getMaxLocalDeletionTime();
+        int expirationPoint = timestamp + 1;
+
+        try
+        {
+            strategy.getNextBackgroundTasks(expirationPoint);
+            Collection<CompactionPick> picks = strategy.backgroundCompactions.getCompactionsInProgress();
+
+            for (CompactionPick pick : picks)
+            {
+                assertFalse(pick.hasExpiredOnly());
+                assertEquals(pick.sstables.size() / 2, pick.expired.size());
+                Set<SSTableReader> nonExpired = pick.sstables.stream()
+                                                             .filter(sstable -> !pick.expired.contains(sstable))
+                                                             .collect(Collectors.toSet());
+                assertEquals(pick.sstables.size() / 2, nonExpired.size());
+                long expectedTotSize = nonExpired.stream()
+                                                 .mapToLong(SSTableReader::onDiskLength)
+                                                 .sum();
+                assertEquals(expectedTotSize, pick.totSizeInBytes);
+                assertEquals(expectedTotSize / nonExpired.size(), pick.avgSizeInBytes);
+                assertEquals(0, pick.parent);
+            }
+        }
+        finally
+        {
+            strategy.shutdown();
+            dataTracker.dropSSTables();
+        }
+    }
+
+    @Test
+    public void testPending()
+    {
+        Controller controller = Mockito.mock(Controller.class);
+        when(controller.getScalingParameter(anyInt())).thenReturn(-8); // F=10, T=2
+        when(controller.getFanout(anyInt())).thenCallRealMethod();
+        when(controller.getThreshold(anyInt())).thenCallRealMethod();
+        when(controller.maxSSTablesToCompact()).thenReturn(10); // same as fanout
+
+        long minimalSizeBytes = 2 << 20;
+        when(controller.getMinSstableSizeBytes()).thenReturn(minimalSizeBytes);
+        when(controller.getSurvivalFactor()).thenReturn(1.0);
+        when(controller.getNumShards()).thenReturn(1);
+        when(controller.getBaseSstableSize(anyInt())).thenReturn((double) minimalSizeBytes);
+        when(controller.maxConcurrentCompactions()).thenReturn(1000); // let it generate as many candidates as it can
+        when(controller.maxCompactionSpaceBytes()).thenReturn(Long.MAX_VALUE);
+        when(controller.maxThroughput()).thenReturn(Double.MAX_VALUE);
+        when(controller.getIgnoreOverlapsInExpirationCheck()).thenReturn(false);
+        when(controller.random()).thenCallRealMethod();
+        UnifiedCompactionStrategy strategy = new UnifiedCompactionStrategy(strategyFactory, controller);
+        strategy.startup();
+
+        List<SSTableReader> sstables = createSStables(cfs.getPartitioner(),
+                                                      mapFromPair(Pair.create(4 * ONE_MB, 91)));
+        dataTracker.addInitialSSTables(sstables);
+
+        assertEquals(3, strategy.getNextBackgroundTasks(FBUtilities.nowInSeconds()).size()); // repaired, unrepaired, pending
+        Collection<CompactionAggregate> aggregates = strategy.backgroundCompactions.getAggregates();
+        assertEquals(3, aggregates.size());
+        for (CompactionAggregate aggregate : aggregates)
+            assertEquals(8, aggregate.getPending().size());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java b/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
index 727afea02a84..4cc89fd9219c 100644
--- a/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
+++ b/test/unit/org/apache/cassandra/db/compaction/ZombieSSTablesTest.java
@@ -190,7 +190,7 @@ private void testZombieSSTablesMaximal(String tableName) throws Exception
         final ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(tableName + MAXIMAL);
 
         prepareZombieSSTables(cfs);
-        Collection<AbstractCompactionTask> maximalTasks = cfs.getCompactionStrategyManager().getMaximalTasks(0, false);
+        Collection<AbstractCompactionTask> maximalTasks = cfs.getCompactionStrategy().getMaximalTasks(0, false);
         assertNotNull(maximalTasks);
         assertFalse(maximalTasks.isEmpty());
         maximalTasks.stream().forEach(task -> task.transaction.abort());    // avoid leak
@@ -204,10 +204,10 @@ private void testZombieSSTables(String tableName) throws Exception
 
         prepareZombieSSTables(cfs);
 
-        CompactionStrategyManager compactionStrategyManager = cfs.getCompactionStrategyManager();
-        compactionStrategyManager.enable();
-        AbstractCompactionTask nextBackgroundTask = compactionStrategyManager.getNextBackgroundTask(0);
-        assertNotNull(nextBackgroundTask);
-        nextBackgroundTask.transaction.abort();    // avoid leak
+        cfs.getCompactionStrategyContainer().enable();
+        Collection<AbstractCompactionTask> nextBackgroundTasks = cfs.getCompactionStrategy().getNextBackgroundTasks(0);
+        assertNotNull(nextBackgroundTasks);
+        assertFalse(nextBackgroundTasks.isEmpty());
+        nextBackgroundTasks.stream().forEach(task -> task.transaction.abort());    // avoid leak
     }
 }
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/AdaptiveControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/AdaptiveControllerTest.java
new file mode 100644
index 000000000000..1b02e1e6c3d5
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/AdaptiveControllerTest.java
@@ -0,0 +1,313 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.UTF8Type;
+import org.apache.cassandra.dht.Murmur3Partitioner;
+import org.apache.cassandra.schema.CachingParams;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FixedMonotonicClock;
+import org.mockito.Mockito;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.anyInt;
+import static org.mockito.Mockito.when;
+
+public class AdaptiveControllerTest extends ControllerTest
+{
+    private CostsCalculator calculator;
+    private FixedMonotonicClock clock;
+
+    private final int minW = -10;
+    private final int maxW = 64;
+    private final int W = 0;
+    private final int interval = 60;
+    private final int minCost = 5;
+    private final double baseCost = minCost * 5;
+    private final double threshold = 0.15;
+
+    @Before
+    public void setup()
+    {
+        calculator = Mockito.mock(CostsCalculator.class);
+        clock = new FixedMonotonicClock();
+    }
+
+    private AdaptiveController makeController()
+    {
+        return makeController(dataSizeGB, numShards, sstableSizeMB);
+    }
+
+    private AdaptiveController makeController(int dataSizeGB, int numShards, int sstableSizeMB)
+    {
+        return new AdaptiveController(clock,
+                                      env,
+                                      W,
+                                      Controller.DEFAULT_SURVIVAL_FACTOR,
+                                      dataSizeGB << 10,
+                                      numShards,
+                                      sstableSizeMB,
+                                      0,
+                                      Controller.DEFAULT_MAX_SPACE_OVERHEAD,
+                                      0,
+                                      Controller.DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS,
+                                      Controller.DEFAULT_ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION,
+                                      interval,
+                                      minW,
+                                      maxW,
+                                      threshold,
+                                      minCost);
+    }
+
+    @Test
+    public void testFromOptions()
+    {
+        Map<String, String> options = new HashMap<>();
+        options.put(AdaptiveController.STARTING_SCALING_PARAMETER, "0");
+        options.put(AdaptiveController.MIN_SCALING_PARAMETER, "-10");
+        options.put(AdaptiveController.MAX_SCALING_PARAMETER, "32");
+        options.put(AdaptiveController.INTERVAL_SEC, "120");
+        options.put(AdaptiveController.THRESHOLD, "0.15");
+        options.put(AdaptiveController.MIN_COST, "5");
+
+        Controller controller = testFromOptions(true, options);
+        assertTrue(controller instanceof AdaptiveController);
+
+        for (int i = 0; i < 10; i++)
+            assertEquals(0, controller.getScalingParameter(i));
+    }
+
+    @Test
+    public void testValidateOptions()
+    {
+        Map<String, String> options = new HashMap<>();
+        options.put(AdaptiveController.STARTING_SCALING_PARAMETER, "0");
+        options.put(AdaptiveController.MIN_SCALING_PARAMETER, "-10");
+        options.put(AdaptiveController.MAX_SCALING_PARAMETER, "32");
+        options.put(AdaptiveController.INTERVAL_SEC, "120");
+        options.put(AdaptiveController.THRESHOLD, "0.15");
+        options.put(AdaptiveController.MIN_COST, "5");
+
+        super.testValidateOptions(options, true);
+    }
+
+    @Test
+    public void testStartShutdown()
+    {
+        AdaptiveController controller = makeController();
+        testStartShutdown(controller);
+    }
+
+    @Test
+    public void testShutdownNotStarted()
+    {
+        AdaptiveController controller = makeController();
+        testShutdownNotStarted(controller);
+    }
+
+    @Test(expected = IllegalStateException.class)
+    public void testStartAlreadyStarted()
+    {
+        AdaptiveController controller = makeController();
+        testStartAlreadyStarted(controller);
+    }
+
+    @Test
+    public void testMinSSTableSizeDynamic()
+    {
+        // <= 50 MB, round up to 50 MB
+        testMinSSTableSizeDynamic(1, 50);
+        testMinSSTableSizeDynamic((50 << 20) - 1, 50);
+        testMinSSTableSizeDynamic(50 << 20, 50);
+
+        // <= 100 MB, round up to 100 MB
+        testMinSSTableSizeDynamic((50 << 20) + 1, 100);
+        testMinSSTableSizeDynamic((100 << 20) - 1, 100);
+        testMinSSTableSizeDynamic(100 << 20, 100);
+
+        // no flush size, 50 MB, then flush size of 100 MB + 1 returns 150MB
+        testMinSSTableSizeDynamic(0, 50, (100 << 20) + 1, 150);
+    }
+
+    private void testMinSSTableSizeDynamic(long flushSizeBytes1, int minSSTableSizeMB1)
+    {
+        // The most common case, the second calculation is skipped so even if the env returns zero the second time, the result won't change
+        testMinSSTableSizeDynamic(flushSizeBytes1, minSSTableSizeMB1, 0, minSSTableSizeMB1);
+    }
+
+    private void testMinSSTableSizeDynamic(long flushSizeBytes1, int minSSTableSizeMB1, long flushSizeBytes2, int minSSTableSizeMB2)
+    {
+        // create a controller with minSSTableSizeMB set to zero so that it will calculate the min sstable size from the flush size
+        AdaptiveController controller = makeController(dataSizeGB, numShards, 0);
+
+        when(env.flushSize()).thenReturn(flushSizeBytes1 * 1.0);
+        assertEquals(minSSTableSizeMB1 << 20, controller.getMinSstableSizeBytes());
+
+        when(env.flushSize()).thenReturn(flushSizeBytes2 * 1.0);
+        assertEquals(minSSTableSizeMB2 << 20, controller.getMinSstableSizeBytes());
+    }
+
+
+    @Test
+    public void testUpdateNotEnoughTimeElapsed()
+    {
+        final AdaptiveController adaptive = makeController();
+        adaptive.startup(strategy, calculator);
+
+        // the clock is not advanced before the request, so the scaling parameter must still be W
+        adaptive.onStrategyBackgroundTaskRequest();
+        assertEquals(W, adaptive.getScalingParameter(0));
+    }
+
+    @Test
+    public void testUpdateBelowMinCost() throws InterruptedException
+    {
+        AdaptiveController controller = makeController();
+        controller.startup(strategy, calculator);
+
+        // no update expected: the read cost is exactly minCost and the write cost is zero, so the total is <= min cost
+        when(calculator.getReadCostForQueries(anyInt())).thenReturn((double) minCost);
+        when(calculator.getWriteCostForQueries(anyInt())).thenReturn(0.);
+        when(calculator.spaceUsed()).thenReturn(1.0);
+
+        clock.setNowInNanos(clock.now() + TimeUnit.SECONDS.toNanos(interval + 1)); // move past the update interval
+        controller.onStrategyBackgroundTaskRequest();
+        assertEquals(W, controller.getScalingParameter(0));
+    }
+
+    @Test
+    public void testUpdateWithSize_min() throws InterruptedException
+    {
+        final long minBytes = (long) sstableSizeMB << 20; // sstableSizeMB expressed in bytes
+        testUpdateWithSize(minBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ 0, 0, 0 });
+    }
+
+    @Test
+    public void testUpdateWithSize_1GB() throws InterruptedException
+    {
+        long totSize = 1L << 31; // NOTE(review): 1L << 31 is 2 GiB, not the 1 GB in the test name — confirm the intended size
+        testUpdateWithSize(totSize, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -9, 31, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_2GB() throws InterruptedException
+    {
+        long totSize = 2L << 31; // NOTE(review): 2L << 31 is 4 GiB, not the 2 GB in the test name — confirm the intended size
+        testUpdateWithSize(totSize, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -5, 44, 1 } );
+    }
+
+    @Test
+    public void testUpdateWithSize_128GB() throws InterruptedException
+    {
+        final long totalBytes = 1L << 37; // 2^37 bytes = 128 GiB
+        testUpdateWithSize(totalBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -8, 39, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_512GB() throws InterruptedException
+    {
+        final long totalBytes = 1L << 39; // 2^39 bytes = 512 GiB
+        testUpdateWithSize(totalBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -7, 63, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_1TB() throws InterruptedException
+    {
+        final long totalBytes = 1L << 40; // 2^40 bytes = 1 TiB
+        testUpdateWithSize(totalBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -7, 25, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_5TB() throws InterruptedException
+    {
+        final long totalBytes = 5 * (1L << 40); // five times 2^40 bytes = 5 TiB
+        testUpdateWithSize(totalBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -10, 39, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_10TB() throws InterruptedException
+    {
+        final long totalBytes = 10 * (1L << 40); // ten times 2^40 bytes = 10 TiB
+        testUpdateWithSize(totalBytes, new double[]{ baseCost, 0, baseCost }, new double[]{ 0, baseCost, baseCost }, new int[]{ -8, 46, 1 });
+    }
+
+    @Test
+    public void testUpdateWithSize_20TB() throws InterruptedException
+    {
+        long totSize = 20 * (1L << 40); // twenty times 2^40 bytes = 20 TiB, same pattern as the 5TB and 10TB cases
+        testUpdateWithSize(totSize, new double[] { baseCost, 0, baseCost}, new double[] { 0, baseCost, baseCost}, new int[] { -8, 40, 1});
+    }
+
+    private void testUpdateWithSize(long totSize, double[] readCosts, double[] writeCosts, int[] expectedWs) throws InterruptedException
+    {
+        final int sizeGB = (int) (totSize >> 30);
+        final AdaptiveController adaptive = makeController(sizeGB, 1, sstableSizeMB); // one unique shard
+        adaptive.startup(strategy, calculator);
+
+        // the three arrays are consumed in lock-step below, so their lengths must agree
+        assertEquals(readCosts.length, writeCosts.length);
+        assertEquals(writeCosts.length, expectedWs.length);
+
+        when(calculator.spaceUsed()).thenReturn((double) totSize);
+
+        for (int step = 0; step < readCosts.length; step++)
+        {
+            final double perQueryRead = readCosts[step];
+            final double perQueryWrite = writeCosts[step];
+
+            when(calculator.getReadCostForQueries(anyInt())).thenAnswer(invocation -> (int) invocation.getArgument(0) * perQueryRead);
+            when(calculator.getWriteCostForQueries(anyInt())).thenAnswer(invocation -> (int) invocation.getArgument(0) * perQueryWrite);
+            clock.setNowInNanos(clock.now() + TimeUnit.SECONDS.toNanos(interval + 1));
+
+            adaptive.onStrategyBackgroundTaskRequest();
+            assertEquals(expectedWs[step], adaptive.getScalingParameter(0));
+        }
+    }
+
+    @Test
+    public void testMetrics()
+    {
+        final TableMetadata tableMetadata = TableMetadata.builder("ks", "table")
+                                                         .partitioner(Murmur3Partitioner.instance)
+                                                         .addPartitionKeyColumn("key", UTF8Type.instance)
+                                                         .addClusteringColumn("col", UTF8Type.instance)
+                                                         .addRegularColumn("value", UTF8Type.instance)
+                                                         .caching(CachingParams.CACHE_NOTHING)
+                                                         .build();
+        final Controller.Metrics controllerMetrics = new Controller.Metrics(tableMetadata);
+        // the controller is never started in this test; every metric below is asserted to be zero
+        controllerMetrics.setController(makeController());
+
+        final double measuredWA = controllerMetrics.getMeasuredWA();
+        final double readCost = controllerMetrics.getReadIOCost();
+        final double writeCost = controllerMetrics.getWriteIOCost();
+        final double totalCost = controllerMetrics.getTotalIOCost();
+
+        assertEquals(0, measuredWA, 0);
+        assertEquals(0, readCost, 0);
+        assertEquals(0, writeCost, 0);
+        assertEquals(0, totalCost, 0);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java
new file mode 100644
index 000000000000..1f6352d5d3e2
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/ControllerTest.java
@@ -0,0 +1,178 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.Map;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.schema.TableMetadata;
+import org.mockito.Mock;
+import org.mockito.MockitoAnnotations;
+
+import static junit.framework.TestCase.assertNull;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyLong;
+import static org.mockito.Mockito.when;
+
+@Ignore
+public abstract class ControllerTest
+{
+    static final double epsilon = 0.00000001;
+    static final int dataSizeGB = 512;
+    static final int numShards = 4; // pick it so that dataSizeGB is exactly divisible or tests will break
+    static final int sstableSizeMB = 2;
+    static final double maxSpaceOverhead = 0.3d;
+    static final boolean allowOverlaps = false;
+    static final long checkFrequency= 600L;
+
+    @Mock
+    ColumnFamilyStore cfs;
+
+    @Mock
+    TableMetadata metadata;
+
+    @Mock
+    UnifiedCompactionStrategy strategy;
+
+    @Mock
+    ScheduledExecutorService executorService;
+
+    @Mock
+    ScheduledFuture fut;
+
+    @Mock
+    Environment env;
+
+    @BeforeClass
+    public static void setUpClass()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @Before
+    public void setUp()
+    {
+        MockitoAnnotations.initMocks(this);
+
+        when(strategy.getMetadata()).thenReturn(metadata);
+        when(strategy.getEstimatedRemainingTasks()).thenReturn(0);
+
+        when(metadata.toString()).thenReturn("");
+
+        when(executorService.scheduleAtFixedRate(any(Runnable.class), anyLong(), anyLong(), any(TimeUnit.class))).thenReturn(fut);
+
+        when(env.flushSize()).thenReturn((double) (sstableSizeMB << 20));
+    }
+
+    Controller testFromOptions(boolean adaptive, Map<String, String> options)
+    {
+        options.putIfAbsent(Controller.ADAPTIVE_OPTION, Boolean.toString(adaptive));
+        options.putIfAbsent(Controller.MIN_SSTABLE_SIZE_OPTION_MB, Integer.toString(sstableSizeMB));
+
+        options.putIfAbsent(Controller.DATASET_SIZE_OPTION_GB, Integer.toString(dataSizeGB));
+        options.putIfAbsent(Controller.NUM_SHARDS_OPTION, Integer.toString(numShards));
+        options.putIfAbsent(Controller.MAX_SPACE_OVERHEAD_OPTION, Double.toString(maxSpaceOverhead));
+        options.putIfAbsent(Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION, Boolean.toString(allowOverlaps));
+        options.putIfAbsent(Controller.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION, Long.toString(checkFrequency));
+
+        Controller.validateOptions(options);
+
+        Controller controller = Controller.fromOptions(cfs, options);
+        assertNotNull(controller);
+        assertNotNull(controller.toString());
+
+        assertEquals((long) sstableSizeMB << 20, controller.getMinSstableSizeBytes());
+        assertEquals((long) dataSizeGB << 30, controller.getDataSetSizeBytes());
+        assertEquals(numShards, controller.getNumShards());
+        assertEquals(((long) dataSizeGB << 30) / numShards, controller.getShardSizeBytes());
+        assertFalse(controller.isRunning());
+        assertEquals(Controller.DEFAULT_SURVIVAL_FACTOR, controller.getSurvivalFactor(), epsilon);
+        assertNull(controller.getCalculator());
+
+        return controller;
+    }
+
+    void testValidateOptions(Map<String, String> options, boolean adaptive)
+    {
+        options.putIfAbsent(Controller.ADAPTIVE_OPTION, Boolean.toString(adaptive));
+        options.putIfAbsent(Controller.MIN_SSTABLE_SIZE_OPTION_MB, Integer.toString(sstableSizeMB));
+
+        options.putIfAbsent(Controller.DATASET_SIZE_OPTION_GB, Integer.toString(dataSizeGB));
+        options.putIfAbsent(Controller.NUM_SHARDS_OPTION, Integer.toString(numShards));
+        options.putIfAbsent(Controller.MAX_SPACE_OVERHEAD_OPTION, Double.toString(maxSpaceOverhead));
+
+        options.putIfAbsent(Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION, Boolean.toString(allowOverlaps));
+        options.putIfAbsent(Controller.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION, Long.toString(checkFrequency));
+
+        options = Controller.validateOptions(options);
+        assertTrue(options.toString(), options.isEmpty());
+    }
+
+    void testStartShutdown(Controller controller)
+    {
+        assertNotNull(controller);
+
+        assertEquals((long) dataSizeGB << 30, controller.getDataSetSizeBytes());
+        assertEquals(numShards, controller.getNumShards());
+        assertEquals(((long) dataSizeGB << 30) / numShards, controller.getShardSizeBytes());
+        assertEquals((long) sstableSizeMB << 20, controller.getMinSstableSizeBytes());
+        assertFalse(controller.isRunning());
+        assertEquals(Controller.DEFAULT_SURVIVAL_FACTOR, controller.getSurvivalFactor(), epsilon);
+        assertNull(controller.getCalculator());
+
+        controller.startup(strategy, executorService);
+        assertTrue(controller.isRunning());
+        assertNotNull(controller.getCalculator());
+
+        controller.shutdown();
+        assertFalse(controller.isRunning());
+        assertNull(controller.getCalculator());
+
+        controller.shutdown(); // no op
+    }
+
+    void testShutdownNotStarted(Controller controller)
+    {
+        assertNotNull(controller);
+
+        controller.shutdown(); // no op.
+    }
+
+    void testStartAlreadyStarted(Controller controller)
+    {
+        assertNotNull(controller);
+
+        controller.startup(strategy, executorService);
+        assertTrue(controller.isRunning());
+        assertNotNull(controller.getCalculator());
+
+        controller.startup(strategy, executorService);
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/CostsCalculatorTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/CostsCalculatorTest.java
new file mode 100644
index 000000000000..18676964f1be
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/CostsCalculatorTest.java
@@ -0,0 +1,286 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.Random;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.cassandra.db.compaction.BackgroundCompactions;
+import org.apache.cassandra.db.compaction.UnifiedCompactionStrategy;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.utils.FixedMonotonicClock;
+import org.apache.cassandra.utils.MovingAverage;
+import org.apache.cassandra.utils.PageAware;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+
+import static org.junit.Assert.*;
+import static org.mockito.ArgumentMatchers.*;
+import static org.mockito.Mockito.times;
+import static org.mockito.Mockito.when;
+
+public class CostsCalculatorTest
+{
+    private static final double epsilon = 0.00000001;
+    private static final Random random = new Random(0L);
+    private static final double survivalFactor = 1;
+
+    @Mock
+    private Environment environment;
+
+    @Mock
+    private UnifiedCompactionStrategy strategy;
+
+    @Mock
+    private TableMetadata metadata;
+
+    @Mock
+    private BackgroundCompactions backgroundCompactions;
+
+    @Mock
+    private ScheduledExecutorService executorService;
+
+    @Mock
+    private ScheduledFuture fut;
+
+    @Mock
+    private SSTableReader sstable;
+
+    private FixedMonotonicClock clock;
+
+    @Before
+    public void setUp()
+    {
+        MockitoAnnotations.initMocks(this);
+
+        when(executorService.scheduleAtFixedRate(any(Runnable.class), anyLong(), anyLong(), any(TimeUnit.class))).thenReturn(fut);
+        when(strategy.getSSTables()).thenReturn(Sets.newHashSet(sstable));
+        when(strategy.getMetadata()).thenReturn(metadata);
+
+        when(sstable.onDiskLength()).thenReturn((long) PageAware.PAGE_SIZE);
+        when(backgroundCompactions.getAggregates()).thenReturn(ImmutableList.of());
+
+        clock = new FixedMonotonicClock();
+        when(environment.makeExpMovAverage()).thenAnswer(ts -> new MovingAverageMock());
+    }
+
+    @Test
+    public void testCreateAndClose()
+    {
+        CostsCalculator cost = new CostsCalculator(environment, strategy, executorService, survivalFactor);
+        assertNotNull(cost);
+        assertNotNull(cost.toString());
+
+        Mockito.verify(executorService, times(1)).scheduleAtFixedRate(any(Runnable.class), anyLong(), anyLong(), any(TimeUnit.class));
+
+        cost.close();
+        Mockito.verify(fut, times(1)).cancel(anyBoolean());
+    }
+
+    @Test
+    public void testUpdate() throws InterruptedException
+    {
+        testCosts(100, 100, PageAware.PAGE_SIZE, 1, 1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testDoubleReadTime() throws InterruptedException
+    {
+        testCosts(200, 100, PageAware.PAGE_SIZE, 1, 1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testDoubleWriteTime() throws InterruptedException
+    {
+        testCosts(100, 200, PageAware.PAGE_SIZE, 1, 1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testLargerChunkSize() throws InterruptedException
+    {
+        testCosts(100, 100, 64 << 10, 1, 1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testHalfCacheMissRatio() throws InterruptedException
+    {
+        testCosts(100, 100, PageAware.PAGE_SIZE, 0.5, 1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testReadMultiplier() throws InterruptedException
+    {
+        testCosts(1000, 100, PageAware.PAGE_SIZE, 1, 0.1, 1, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testWriteMultiplier() throws InterruptedException
+    {
+        testCosts(100, 100, PageAware.PAGE_SIZE, 1, 1, 10, 0.01, survivalFactor);
+    }
+
+    @Test
+    public void testSurvivalRatio() throws InterruptedException
+    {
+        testCosts(100, 100, PageAware.PAGE_SIZE, 1, 1, 1, 0.01, 0.5);
+    }
+
+    private void testCosts(long readTimeMicros,
+                           long writeTimeMicros,
+                           int chunkSize,
+                           double cacheMissRatio,
+                           double readMultiplier,
+                           double writeMultiplier,
+                           double bfprRatio,
+                           double survivalFactor) throws InterruptedException
+    {
+        int blockSize = PageAware.PAGE_SIZE;
+        long totPartitionsRead = 1 + random.nextInt(32);
+        long totBytesInserted = blockSize + random(blockSize);
+
+        when(environment.partitionsRead()).thenReturn(totPartitionsRead);
+        when(environment.bytesInserted()).thenReturn(totBytesInserted);
+        when(environment.chunkSize()).thenReturn(chunkSize);
+        when(environment.cacheMissRatio()).thenReturn(cacheMissRatio);
+        when(environment.bloomFilterFpRatio()).thenReturn(bfprRatio);
+        when(environment.sstablePartitionReadLatencyNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(readTimeMicros));
+        when(environment.flushLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(writeTimeMicros));
+        when(environment.compactionLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(writeTimeMicros));
+
+        CostsCalculator cost = new CostsCalculator(environment, strategy, executorService, survivalFactor, readMultiplier, writeMultiplier);
+        assertNotNull(cost);
+        assertNotNull(cost.toString());
+
+        cost.sampleValues();
+        assertNotNull(cost.toString());
+
+        for (int i = 0; i < 32; i++)
+        {
+            long bytesInserted = (i * blockSize + random(blockSize));
+            totPartitionsRead += (1 + i);
+            totBytesInserted += bytesInserted;
+
+            when(environment.partitionsRead()).thenReturn(totPartitionsRead);
+            when(environment.bytesInserted()).thenReturn(totBytesInserted);
+
+            clock.setNowInNanos(clock.now() + TimeUnit.MILLISECONDS.toNanos(CostsCalculator.samplingPeriodMs));
+            cost.sampleValues();
+            assertNotNull(cost.toString());
+
+            // the WA is 2 and the flush and compaction times for now are the same and equal to writeTimeMicros
+            double writeCost = ((bytesInserted / (double) (1 << 10)) * TimeUnit.MICROSECONDS.toNanos(writeTimeMicros)) / (double) TimeUnit.MILLISECONDS.toNanos(1);
+            assertEquals((writeCost + writeCost * 2) * writeMultiplier, cost.getWriteCostForQueries(2), epsilon);
+
+            // the RA is 2, the delta partitions read is i + 1
+            assertEquals((((i + 1) * readTimeMicros) / (double) TimeUnit.MILLISECONDS.toMicros(1)) * Math.min(1 + bfprRatio * 2 / survivalFactor, 2) * readMultiplier, cost.getReadCostForQueries(2), epsilon);
+        }
+    }
+
+    @Test
+    public void testNoBytesInserted()
+    {
+        int blockSize = PageAware.PAGE_SIZE;
+        long totPartitionsRead = 1 + random.nextInt(32);
+        long totBytesInserted = blockSize + random(blockSize);
+
+        when(environment.partitionsRead()).thenReturn(totPartitionsRead);
+        when(environment.bytesInserted()).thenReturn(totBytesInserted);
+        when(environment.chunkSize()).thenReturn(4096);
+        when(environment.cacheMissRatio()).thenReturn(0.05);
+        when(environment.bloomFilterFpRatio()).thenReturn(0.01);
+        when(environment.sstablePartitionReadLatencyNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+        when(environment.flushLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+        when(environment.compactionLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+
+        CostsCalculator cost = new CostsCalculator(environment, strategy, executorService, survivalFactor, 1, 1);
+        assertNotNull(cost);
+        assertNotNull(cost.toString());
+
+        cost.sampleValues();
+        assertNotNull(cost.toString());
+
+        when(environment.bytesInserted()).thenReturn(0L);
+        for (int i = 0; i < 10; i++)
+            assertEquals(0, cost.getWriteCostForQueries(i), epsilon);
+    }
+
+    @Test
+    public void testNoPartitionsRead()
+    {
+        int blockSize = PageAware.PAGE_SIZE;
+        long totPartitionsRead = 1 + random.nextInt(32);
+        long totBytesInserted = blockSize + random(blockSize);
+
+        when(environment.partitionsRead()).thenReturn(totPartitionsRead);
+        when(environment.bytesInserted()).thenReturn(totBytesInserted);
+        when(environment.chunkSize()).thenReturn(4096);
+        when(environment.cacheMissRatio()).thenReturn(0.05);
+        when(environment.bloomFilterFpRatio()).thenReturn(0.01);
+        when(environment.sstablePartitionReadLatencyNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+        when(environment.flushLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+        when(environment.compactionLatencyPerKbInNanos()).thenReturn((double) TimeUnit.MICROSECONDS.toNanos(20));
+
+        CostsCalculator cost = new CostsCalculator(environment, strategy, executorService, survivalFactor, 1, 1);
+        assertNotNull(cost);
+        assertNotNull(cost.toString());
+
+        cost.sampleValues();
+        assertNotNull(cost.toString());
+
+        when(environment.partitionsRead()).thenReturn(0L);
+        for (int i = 0; i < 10; i++)
+            assertEquals(0, cost.getReadCostForQueries(i), epsilon);
+    }
+
+    private static long random(int blockSize)
+    {
+        return 1 + random.nextInt(blockSize - 1);
+    }
+
+    private static class MovingAverageMock implements MovingAverage
+    {
+        private double val = 0;
+
+        @Override
+        public MovingAverage update(double val)
+        {
+            this.val = val;
+            return this;
+        }
+
+        @Override
+        public double get()
+        {
+            return val;
+        }
+
+        @Override
+        public String toString()
+        {
+            return String.format("%.02f", val);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriterTest.java
new file mode 100644
index 000000000000..a914c3a692be
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/ShardedCompactionWriterTest.java
@@ -0,0 +1,209 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.nio.ByteBuffer;
+import java.util.List;
+import java.util.Random;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionPosition;
+import org.apache.cassandra.db.compaction.CompactionController;
+import org.apache.cassandra.db.compaction.CompactionIterator;
+import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.db.compaction.writers.CompactionAwareWriter;
+import org.apache.cassandra.db.lifecycle.LifecycleTransaction;
+import org.apache.cassandra.io.sstable.ScannerList;
+import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.service.StorageService;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.UUIDGen;
+
+import static org.junit.Assert.assertEquals;
+
+public class ShardedCompactionWriterTest extends CQLTester
+{
+    private static final String KEYSPACE = "cawt_keyspace";
+    private static final String TABLE = "cawt_table";
+
+    private static final int ROW_PER_PARTITION = 10;
+
+    @BeforeClass
+    public static void beforeClass()
+    {
+        CQLTester.setUpClass();
+        CQLTester.prepareServer();
+        StorageService.instance.initServer();
+
+        // Disabling durable write since we don't care
+        schemaChange("CREATE KEYSPACE IF NOT EXISTS " + KEYSPACE + " WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'} AND durable_writes=false");
+        schemaChange(String.format("CREATE TABLE %s.%s (k int, t int, v blob, PRIMARY KEY (k, t))", KEYSPACE, TABLE));
+    }
+
+    @AfterClass
+    public static void tearDownClass()
+    {
+        QueryProcessor.executeInternal("DROP KEYSPACE IF EXISTS " + KEYSPACE);
+    }
+
+    private ColumnFamilyStore getColumnFamilyStore()
+    {
+        return Keyspace.open(KEYSPACE).getColumnFamilyStore(TABLE);
+    }
+
+    @Test
+    public void testOneSSTablePerShard() throws Throwable
+    {
+        // If we set the minSSTableSize ratio to 0.5, because this gets multiplied by the shard size to give the min sstable size,
+        // assuming evenly distributed data, it should split at each boundary and so we should end up with numShards sstables
+        int numShards = 5;
+        int rowCount = 5000;
+        double minSSTableSizeRatio = 0.5;
+        testShardedCompactionWriter(numShards, rowCount, minSSTableSizeRatio, numShards, true);
+    }
+
+    @Test
+    public void testOneSSTableOnly() throws Throwable
+    {
+        // If we set the minSSTableSize ratio to the number of shards 5, because this gets multiplied by the shard size to give
+        // the min sstable size, then it should ignore all boundaries because it won't reach the minimum sstable size until the
+        // end of the last shard and so we should end up with 1 sstable
+        int numShards = 5;
+        int rowCount = 5000;
+        double minSSTableSizeRatio = 5;
+        testShardedCompactionWriter(numShards, rowCount, minSSTableSizeRatio, 1, true);
+    }
+
+    @Test
+    public void testThreeSSTables() throws Throwable
+    {
+        // If we set the minSSTableSize ratio to 2, because this gets multiplied by the shard size to give
+        // the min sstable size, then it should merge 2 shards together assuming evenly distributed data
+        // and so we should end up with 3 sstables (numShards / 2)
+        int numShards = 6;
+        int rowCount = 5000;
+        double minSSTableSizeRatio = 2;
+        testShardedCompactionWriter(numShards, rowCount, minSSTableSizeRatio, 3, true);
+    }
+
+    @Test
+    public void testMultipleInputSSTables() throws Throwable
+    {
+        int numShards = 3;
+        int rowCount = 5000;
+        double minSSTableSizeRatio = 2;
+        testShardedCompactionWriter(numShards, rowCount, minSSTableSizeRatio, numShards, false);
+    }
+
+    private void testShardedCompactionWriter(int numShards, int rowCount, double minSSTableSizeRatio, int numOutputSSTables, boolean majorCompaction) throws Throwable
+    {
+        ColumnFamilyStore cfs = getColumnFamilyStore();
+        cfs.disableAutoCompaction();
+
+        populate(rowCount, majorCompaction);
+
+        LifecycleTransaction txn = cfs.getTracker().tryModify(cfs.getLiveSSTables(), OperationType.COMPACTION);
+        long inputSize = txn.originals().iterator().next().onDiskLength();
+        int minSSTableSize = (int) (((double) inputSize / numShards) * minSSTableSizeRatio);
+
+        List<PartitionPosition> boundaries = cfs.getLocalRanges().split(numShards);
+        ShardedCompactionWriter writer = new ShardedCompactionWriter(cfs, cfs.getDirectories(), txn, txn.originals(), false, minSSTableSize, boundaries);
+
+        int rows = compact(cfs, txn, writer);
+        assertEquals(numOutputSSTables, cfs.getLiveSSTables().size());
+        assertEquals(rowCount, rows);
+
+        long totalOnDiskLength = cfs.getLiveSSTables().stream().map(SSTableReader::onDiskLength).mapToLong(Long::longValue).sum();
+        long totalBFSize = cfs.getLiveSSTables().stream().map(SSTableReader::getBloomFilterSerializedSize).mapToLong(Long::longValue).sum();
+        assert totalBFSize > 16 * numOutputSSTables : "Bloom Filter is empty"; // 16 is the size of empty bloom filter
+        for (SSTableReader rdr : cfs.getLiveSSTables())
+            assertEquals((double) rdr.onDiskLength() / totalOnDiskLength,
+                         (double) rdr.getBloomFilterSerializedSize() / totalBFSize, 0.1);
+
+        validateData(cfs, rowCount);
+        cfs.truncateBlocking();
+    }
+
+    private int compact(ColumnFamilyStore cfs, LifecycleTransaction txn, CompactionAwareWriter writer)
+    {
+        //assert txn.originals().size() == 1;
+        int rowsWritten = 0;
+        int nowInSec = FBUtilities.nowInSeconds();
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(txn.originals());
+             CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(nowInSec));
+             CompactionIterator ci = new CompactionIterator(OperationType.COMPACTION, scanners.scanners, controller, nowInSec, UUIDGen.getTimeUUID()))
+        {
+            while (ci.hasNext())
+            {
+                if (writer.append(ci.next()))
+                    rowsWritten++;
+            }
+        }
+        writer.finish();
+        return rowsWritten;
+    }
+
+    private void populate(int count, boolean compact) throws Throwable
+    {
+        byte [] payload = new byte[5000];
+        new Random(42).nextBytes(payload);
+        ByteBuffer b = ByteBuffer.wrap(payload);
+
+        ColumnFamilyStore cfs = getColumnFamilyStore();
+        for (int i = 0; i < count; i++)
+        {
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+                execute(String.format("INSERT INTO %s.%s(k, t, v) VALUES (?, ?, ?)", KEYSPACE, TABLE), i, j, b);
+
+            if (i % (count / 4) == 0)
+                cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        }
+
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        if (compact && cfs.getLiveSSTables().size() > 1)
+        {
+            // we want just one big sstable to avoid doing actual compaction in compact() above
+            try
+            {
+                cfs.forceMajorCompaction();
+            }
+            catch (Throwable t)
+            {
+                throw new RuntimeException(t);
+            }
+            assert cfs.getLiveSSTables().size() == 1 : cfs.getLiveSSTables();
+        }
+    }
+
+    private void validateData(ColumnFamilyStore cfs, int rowCount) throws Throwable
+    {
+        for (int i = 0; i < rowCount; i++)
+        {
+            Object[][] expected = new Object[ROW_PER_PARTITION][];
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+                expected[j] = row(i, j);
+
+            assertRows(execute(String.format("SELECT k, t FROM %s.%s WHERE k = :i", KEYSPACE, TABLE), i), expected);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/ShardedMultiWriterTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/ShardedMultiWriterTest.java
new file mode 100644
index 000000000000..cee8b2d385a8
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/ShardedMultiWriterTest.java
@@ -0,0 +1,121 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.apache.cassandra.cql3.CQLTester;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.service.StorageService;
+
+import static org.junit.Assert.assertEquals;
+
+/** Checks how many live sstables one flush produces under UnifiedCompactionStrategy for various data sizes. */
+public class ShardedMultiWriterTest extends CQLTester
+{
+    private static final int ROW_PER_PARTITION = 10;
+
+    @BeforeClass
+    public static void beforeClass()
+    {
+        CQLTester.setUpClass();
+        StorageService.instance.initServer();
+    }
+
+    @Test
+    public void testShardedCompactionWriter_fiveToFiveShards() throws Throwable
+    {
+        int numShards = 5;
+        int minSSTableSizeMB = 2;
+        long totSizeBytes = (((long) minSSTableSizeMB << 20) * numShards) * 2; // long math: no int overflow
+        // We have double the data required for 5 shards so we should get 5 shards
+        testShardedCompactionWriter(numShards, minSSTableSizeMB, totSizeBytes, numShards);
+    }
+
+    @Test
+    public void testShardedCompactionWriter_fiveToOneShard() throws Throwable
+    {
+        int numShards = 5;
+        int minSSTableSizeMB = 2;
+        long totSizeBytes = ((long) minSSTableSizeMB << 20);
+        // there should be only 1 shard if there is <= minSSTableSize
+        testShardedCompactionWriter(numShards, minSSTableSizeMB, totSizeBytes, 1);
+    }
+
+    @Test
+    public void testShardedCompactionWriter_fiveToThreeShard() throws Throwable
+    {
+        int numShards = 5;
+        int minSSTableSizeMB = 2;
+        long totSizeBytes = ((long) minSSTableSizeMB << 20) * 3;
+        // there should be only 3 shards if there is minSSTableSize * 3 data
+        testShardedCompactionWriter(numShards, minSSTableSizeMB, totSizeBytes, 3);
+    }
+
+    /**
+     * Creates a table with the given shard count and minimum sstable size, inserts roughly {@code totSizeBytes}
+     * of payload with auto compaction disabled, flushes once and asserts the number of live sstables.
+     */
+    private void testShardedCompactionWriter(int numShards, int minSSTableSizeMB, long totSizeBytes, int numOutputSSTables) throws Throwable
+    {
+        createTable(String.format("CREATE TABLE %%s (k int, t int, v blob, PRIMARY KEY (k, t)) with compaction = " +
+                                  "{'class':'UnifiedCompactionStrategy', 'num_shards' : '%d', 'min_sstable_size_in_mb' : '%d'} ", numShards, minSSTableSizeMB));
+
+        ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
+        cfs.disableAutoCompaction();
+
+        int rowCount = insertData(totSizeBytes);
+        cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        assertEquals(numOutputSSTables, cfs.getLiveSSTables().size());
+
+        validateData(rowCount);
+        cfs.truncateBlocking();
+    }
+
+    /** Inserts enough partitions of ROW_PER_PARTITION 5000-byte blobs to reach {@code totSizeBytes}; returns the partition count. */
+    private int insertData(long totSizeBytes) throws Throwable
+    {
+        byte [] payload = new byte[5000];
+        ByteBuffer b = ByteBuffer.wrap(payload);
+        int rowCount = (int) Math.ceil((double) totSizeBytes / (8 + ROW_PER_PARTITION * payload.length));
+
+        for (int i = 0; i < rowCount; i++)
+        {
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+            {
+                new Random(42 + i * ROW_PER_PARTITION + j).nextBytes(payload); // write different data each time to make non-compressible
+                execute("INSERT INTO %s(k, t, v) VALUES (?, ?, ?)", i, j, b);
+            }
+        }
+        return rowCount;
+    }
+
+    /** Reads partitions 0..rowCount-1 back and checks their (k, t) pairs via assertRows. */
+    private void validateData(int rowCount) throws Throwable
+    {
+        for (int i = 0; i < rowCount; i++)
+        {
+            Object[][] expected = new Object[ROW_PER_PARTITION][];
+            for (int j = 0; j < ROW_PER_PARTITION; j++)
+                expected[j] = row(i, j);
+            assertRows(execute("SELECT k, t FROM %s WHERE k = :i", i), expected);
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/compaction/unified/StaticControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/unified/StaticControllerTest.java
new file mode 100644
index 000000000000..45114ffdd904
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/compaction/unified/StaticControllerTest.java
@@ -0,0 +1,189 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.compaction.unified;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+
+import org.junit.Test;
+
+import com.sun.tools.javac.util.List;
+import org.apache.cassandra.exceptions.ConfigurationException;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/** Tests for {@link StaticController}: option parsing and validation, start/shutdown, and the values it exposes. */
+public class StaticControllerTest extends ControllerTest
+{
+    static final int[] Ws = new int[] { 30, 2, -6};
+
+    @Test
+    public void testFromOptions()
+    {
+        Map<String, String> options = new HashMap<>();
+        String wStr = Arrays.stream(Ws).mapToObj(Integer::toString).collect(Collectors.joining(","));
+        options.put(StaticController.STATIC_SCALING_PARAMETERS_OPTION, wStr);
+
+        Controller controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+
+        for (int i = 0; i < Ws.length; i++)
+            assertEquals(Ws[i], controller.getScalingParameter(i));
+        // an index past the configured list is expected to return the last configured value
+        assertEquals(Ws[Ws.length-1], controller.getScalingParameter(Ws.length));
+    }
+
+    @Test
+    public void testValidateOptions()
+    {
+        Map<String, String> options = new HashMap<>();
+        String wStr = Arrays.stream(Ws).mapToObj(Integer::toString).collect(Collectors.joining(","));
+        options.put(StaticController.STATIC_SCALING_PARAMETERS_OPTION, wStr);
+
+        super.testValidateOptions(options, false);
+    }
+
+    @Test
+    public void testStartShutdown()
+    {
+        StaticController controller = new StaticController(env, Ws, Controller.DEFAULT_SURVIVAL_FACTOR, dataSizeGB << 10, numShards, sstableSizeMB, 0, Controller.DEFAULT_MAX_SPACE_OVERHEAD, 0, Controller.DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS, Controller.DEFAULT_ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION);
+        super.testStartShutdown(controller);
+    }
+
+    @Test
+    public void testShutdownNotStarted()
+    {
+        StaticController controller = new StaticController(env, Ws, Controller.DEFAULT_SURVIVAL_FACTOR, dataSizeGB << 10, numShards, sstableSizeMB, 0, Controller.DEFAULT_MAX_SPACE_OVERHEAD, 0, Controller.DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS, Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION);
+        super.testShutdownNotStarted(controller);
+    }
+
+    @Test(expected = IllegalStateException.class)
+    public void testStartAlreadyStarted()
+    {
+        StaticController controller = new StaticController(env, Ws, Controller.DEFAULT_SURVIVAL_FACTOR, dataSizeGB << 10, numShards, sstableSizeMB, 0, Controller.DEFAULT_MAX_SPACE_OVERHEAD, 0, Controller.DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS, Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION);
+        super.testStartAlreadyStarted(controller);
+    }
+
+    @Test
+    public void testMaxSpaceOverhead()
+    {
+        Map<String, String> options = new HashMap<>();
+        Controller controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+
+        assertEquals(maxSpaceOverhead, controller.getMaxSpaceOverhead(), 0.0d);
+
+        options.put(Controller.MAX_SPACE_OVERHEAD_OPTION, "0.5");
+        controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+
+        assertEquals(0.5d, controller.getMaxSpaceOverhead(), 0.0d);
+        // with 0.1 configured the controller is expected to report 1 / numShards instead
+        options.put(Controller.MAX_SPACE_OVERHEAD_OPTION, "0.1");
+        controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+
+        assertEquals(1.0d / ControllerTest.numShards, controller.getMaxSpaceOverhead(), 0.0d);
+        // each of these values is expected to be rejected; Arrays.asList avoids the JDK-internal javac List class
+        for (Double d : Arrays.asList(0.0, 10.0, -10.0))
+        {
+            String s = d.toString();
+            try
+            {
+                options.put(Controller.MAX_SPACE_OVERHEAD_OPTION, s);
+                testFromOptions(false, options);
+                fail(String.format("%s validation must have failed for the value %s", Controller.MAX_SPACE_OVERHEAD_OPTION, s));
+            }
+            catch (ConfigurationException ce)
+            {
+                // expected; JUnit takes (expected, actual), so the formatted message comes first
+                assertEquals(String.format("Invalid configuration, %s must be between %f and %f: %s",
+                                           Controller.MAX_SPACE_OVERHEAD_OPTION,
+                                           Controller.MAX_SPACE_OVERHEAD_LOWER_BOUND,
+                                           Controller.MAX_SPACE_OVERHEAD_UPPER_BOUND,
+                                           s), ce.getMessage());
+            }
+        }
+    }
+
+    @Test
+    public void testMaxSSTablesToCompact()
+    {
+        Map<String, String> options = new HashMap<>();
+        Controller controller = testFromOptions(false, options);
+        assertTrue(controller.maxSSTablesToCompact <= controller.dataSetSizeMB * controller.maxSpaceOverhead / controller.minSstableSizeMB);
+
+        options.put(Controller.MAX_SPACE_OVERHEAD_OPTION, "0.1");
+        controller = testFromOptions(false, options);
+        assertTrue(controller.maxSSTablesToCompact <= controller.dataSetSizeMB * controller.maxSpaceOverhead / controller.minSstableSizeMB);
+
+        options.put(Controller.MAX_SSTABLES_TO_COMPACT_OPTION, "100");
+        controller = testFromOptions(false, options);
+        assertEquals(100, controller.maxSSTablesToCompact);
+        // with 0 configured the limit is expected to be bounded by the space overhead again
+        options.put(Controller.MAX_SSTABLES_TO_COMPACT_OPTION, "0");
+        controller = testFromOptions(false, options);
+        assertTrue(controller.maxSSTablesToCompact <= controller.dataSetSizeMB * controller.maxSpaceOverhead / controller.minSstableSizeMB);
+    }
+
+    @Test
+    public void testExpiredSSTableCheckFrequency()
+    {
+        Map<String, String> options = new HashMap<>();
+
+        Controller controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+        assertEquals(TimeUnit.MILLISECONDS.convert(Controller.DEFAULT_EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS, TimeUnit.SECONDS),
+                     controller.getExpiredSSTableCheckFrequency());
+
+        options.put(Controller.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION, "5");
+        controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+        assertEquals(5000L, controller.getExpiredSSTableCheckFrequency());
+
+        try
+        {
+            options.put(Controller.EXPIRED_SSTABLE_CHECK_FREQUENCY_SECONDS_OPTION, "0");
+            testFromOptions(false, options);
+            fail("Exception should be thrown");
+        }
+        catch (ConfigurationException e)
+        {
+            // valid path
+        }
+    }
+
+    @Test
+    public void testAllowOverlaps()
+    {
+        Map<String, String> options = new HashMap<>();
+
+        Controller controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+        assertEquals(Controller.DEFAULT_ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION, controller.getIgnoreOverlapsInExpirationCheck());
+
+        options.put(Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION_OPTION, "true");
+        controller = testFromOptions(false, options);
+        assertTrue(controller instanceof StaticController);
+        assertEquals(Controller.ALLOW_UNSAFE_AGGRESSIVE_SSTABLE_EXPIRATION, controller.getIgnoreOverlapsInExpirationCheck());
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
index aef525740024..df72813802f6 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java
@@ -30,12 +30,12 @@
 
 import org.junit.Assert;
 import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.io.sstable.ScannerList;
 import org.apache.cassandra.schema.TableMetadataRef;
 import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
 import org.apache.cassandra.db.SerializationHeader;
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionController;
 import org.apache.cassandra.db.compaction.CompactionIterator;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -151,7 +151,7 @@ private SSTableReader replaceSSTable(ColumnFamilyStore cfs, LifecycleTransaction
         try (CompactionController controller = new CompactionController(cfs, txn.originals(), cfs.gcBefore(FBUtilities.nowInSeconds())))
         {
             try (SSTableRewriter rewriter = SSTableRewriter.constructKeepingOriginals(txn, false, 1000);
-                 AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(txn.originals());
+                 ScannerList scanners = cfs.getCompactionStrategy().getScanners(txn.originals());
                  CompactionIterator ci = new CompactionIterator(txn.opType(), scanners.scanners, controller, nowInSec, txn.opId())
             )
             {
diff --git a/test/unit/org/apache/cassandra/dht/SplitterTest.java b/test/unit/org/apache/cassandra/dht/SplitterTest.java
index c591499e26e8..012d0d54654d 100644
--- a/test/unit/org/apache/cassandra/dht/SplitterTest.java
+++ b/test/unit/org/apache/cassandra/dht/SplitterTest.java
@@ -32,6 +32,7 @@
 
 import static com.google.common.collect.Sets.newHashSet;
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
@@ -77,7 +78,7 @@ public void testWithWeight()
         IPartitioner partitioner = Murmur3Partitioner.instance;
         Splitter splitter = partitioner.splitter().get();
 
-        assertEquals(splitter.splitOwnedRanges(2, ranges, false), splitter.splitOwnedRanges(2, ranges2, false));
+        assertEquals(splitter.splitOwnedRanges(2, ranges, Splitter.SplitType.ALWAYS_SPLIT), splitter.splitOwnedRanges(2, ranges2, Splitter.SplitType.ALWAYS_SPLIT));
     }
 
     @Test
@@ -95,7 +96,7 @@ public void testWithWeight2()
         IPartitioner partitioner = Murmur3Partitioner.instance;
         Splitter splitter = partitioner.splitter().get();
 
-        assertEquals(splitter.splitOwnedRanges(2, ranges, false), splitter.splitOwnedRanges(2, ranges2, false));
+        assertEquals(splitter.splitOwnedRanges(2, ranges, Splitter.SplitType.ALWAYS_SPLIT), splitter.splitOwnedRanges(2, ranges2, Splitter.SplitType.ALWAYS_SPLIT));
     }
 
     private Range<Token> t(long left, long right)
@@ -110,8 +111,8 @@ private static void randomSplitTestNoVNodes(IPartitioner partitioner)
         for (int i = 0; i < 10000; i++)
         {
             List<Splitter.WeightedRange> localRanges = generateLocalRanges(1, r.nextInt(4) + 1, splitter, r, partitioner instanceof RandomPartitioner);
-            List<Token> boundaries = splitter.splitOwnedRanges(r.nextInt(9) + 1, localRanges, false);
-            assertTrue("boundaries = " + boundaries + " ranges = " + localRanges, assertRangeSizeEqual(localRanges, boundaries, partitioner, splitter, true));
+            Splitter.SplitResult result = splitter.splitOwnedRanges(r.nextInt(9) + 1, localRanges, Splitter.SplitType.ALWAYS_SPLIT);
+            assertTrue("boundaries = " + result.boundaries + " ranges = " + localRanges, assertRangeSizeEqual(localRanges, result, partitioner, splitter, Splitter.SplitType.ALWAYS_SPLIT));
         }
     }
 
@@ -119,27 +120,50 @@ private static void randomSplitTestVNodes(IPartitioner partitioner)
     {
         Splitter splitter = getSplitter(partitioner);
         Random r = new Random();
-        for (int i = 0; i < 10000; i++)
+        for (Splitter.SplitType splitType : Splitter.SplitType.values())
         {
-            // we need many tokens to be able to split evenly over the disks
-            int numTokens = 172 + r.nextInt(128);
-            int rf = r.nextInt(4) + 2;
-            int parts = r.nextInt(5) + 1;
-            List<Splitter.WeightedRange> localRanges = generateLocalRanges(numTokens, rf, splitter, r, partitioner instanceof RandomPartitioner);
-            List<Token> boundaries = splitter.splitOwnedRanges(parts, localRanges, true);
-            if (!assertRangeSizeEqual(localRanges, boundaries, partitioner, splitter, false))
-                fail(String.format("Could not split %d tokens with rf=%d into %d parts (localRanges=%s, boundaries=%s)", numTokens, rf, parts, localRanges, boundaries));
+            for (int i = 0; i < 10000; i++)
+            {
+                // we need many tokens to be able to split evenly over the disks
+                int numTokens = 172 + r.nextInt(128);
+                int rf = r.nextInt(4) + 2;
+                int parts = r.nextInt(5) + 1;
+                List<Splitter.WeightedRange> localRanges = generateLocalRanges(numTokens, rf, splitter, r, partitioner instanceof RandomPartitioner);
+
+                Splitter.SplitResult result = splitter.splitOwnedRanges(parts, localRanges, splitType);
+                if (!assertRangeSizeEqual(localRanges, result, partitioner, splitter, splitType))
+                    fail(String.format("Could not split %d tokens with rf=%d into %d parts (localRanges=%s, boundaries=%s, splitType=%s)",
+                                       numTokens, rf, parts, localRanges, result.boundaries, splitType));
+            }
         }
     }
 
-    private static boolean assertRangeSizeEqual(List<Splitter.WeightedRange> localRanges, List<Token> tokens, IPartitioner partitioner, Splitter splitter, boolean splitIndividualRanges)
+    private static boolean assertRangeSizeEqual(List<Splitter.WeightedRange> localRanges,
+                                                Splitter.SplitResult splitResult,
+                                                IPartitioner partitioner,
+                                                Splitter splitter,
+                                                Splitter.SplitType splitType)
     {
+        List<Token> boundaries = splitResult.boundaries;
+        boolean splitIndividualRanges = splitResult.rangesWereSplit;
+
+        // Check if the split type was respected. This is only relevant if there are two or more tokens because
+        // if the splitter cannot split at all, then the split result will indicate that no ranges were split regardless
+        // of the split type
+        if (boundaries.size() > 1)
+        {
+            if (splitType == Splitter.SplitType.ALWAYS_SPLIT)
+                assertTrue("Local ranges can only be split when SplitType forces it", splitIndividualRanges);
+            else if (splitType == Splitter.SplitType.ONLY_WHOLE)
+                assertFalse("Local ranges should not be split when SplitType doesn't force it", splitIndividualRanges);
+        }
+
         Token start = partitioner.getMinimumToken();
         List<BigInteger> splits = new ArrayList<>();
 
-        for (int i = 0; i < tokens.size(); i++)
+        for (int i = 0; i < boundaries.size(); i++)
         {
-            Token end = i == tokens.size() - 1 ? partitioner.getMaximumToken() : tokens.get(i);
+            Token end = i == boundaries.size() - 1 ? partitioner.getMaximumToken() : boundaries.get(i);
             splits.add(sumOwnedBetween(localRanges, start, end, splitter, splitIndividualRanges));
             start = end;
         }
@@ -180,7 +204,11 @@ private static BigInteger sumOwnedBetween(List<Splitter.WeightedRange> localRang
         return sum;
     }
 
-    private static List<Splitter.WeightedRange> generateLocalRanges(int numTokens, int rf, Splitter splitter, Random r, boolean randomPartitioner)
+    public static List<Splitter.WeightedRange> generateLocalRanges(int numTokens,
+                                                                   int rf,
+                                                                   Splitter splitter,
+                                                                   Random r,
+                                                                   boolean randomPartitioner)
     {
         int localTokens = numTokens * rf;
         List<Token> randomTokens = new ArrayList<>();
@@ -515,7 +543,7 @@ private static Token getWrappedToken(IPartitioner partitioner, BigInteger positi
         return splitter.tokenForValue(position);
     }
 
-    private static Splitter getSplitter(IPartitioner partitioner)
+    public static Splitter getSplitter(IPartitioner partitioner)
     {
         return partitioner.splitter().orElseThrow(() -> new AssertionError(partitioner.getClass() + " must have a splitter"));
     }
diff --git a/test/unit/org/apache/cassandra/io/BloomFilterTrackerTest.java b/test/unit/org/apache/cassandra/io/BloomFilterTrackerTest.java
index afcf2a5659d0..dd7ea7e6f8a3 100644
--- a/test/unit/org/apache/cassandra/io/BloomFilterTrackerTest.java
+++ b/test/unit/org/apache/cassandra/io/BloomFilterTrackerTest.java
@@ -21,51 +21,64 @@
  */
 
 
+import java.util.concurrent.TimeUnit;
+
+import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.sstable.BloomFilterTracker;
 
 import static org.junit.Assert.assertEquals;
 
 public class BloomFilterTrackerTest
 {
+    @BeforeClass
+    public static void setUp()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
     @Test
-    public void testAddingFalsePositives()
+    public void testAddingFalsePositives() throws InterruptedException
     {
-        BloomFilterTracker bft = new BloomFilterTracker();
+        BloomFilterTracker bft = BloomFilterTracker.createMeterTracker();
         assertEquals(0L, bft.getFalsePositiveCount());
-        assertEquals(0L, bft.getRecentFalsePositiveCount());
+        assertEquals(0d, bft.getRecentFalsePositiveRate(), 0.0);
         bft.addFalsePositive();
         bft.addFalsePositive();
+        Thread.sleep(TimeUnit.SECONDS.toMillis(5L)); // wait for tick that updates rates
         assertEquals(2L, bft.getFalsePositiveCount());
-        assertEquals(2L, bft.getRecentFalsePositiveCount());
-        assertEquals(0L, bft.getRecentFalsePositiveCount());
+        assertEquals(0.4d, bft.getRecentFalsePositiveRate(), 0.0);
+        assertEquals(0.4d, bft.getRecentFalsePositiveRate(), 0.0);
         assertEquals(2L, bft.getFalsePositiveCount()); // sanity check
     }
 
     @Test
-    public void testAddingTruePositives()
+    public void testAddingTruePositives()  throws InterruptedException
     {
-        BloomFilterTracker bft = new BloomFilterTracker();
+        BloomFilterTracker bft = BloomFilterTracker.createMeterTracker();
         assertEquals(0L, bft.getTruePositiveCount());
-        assertEquals(0L, bft.getRecentTruePositiveCount());
+        assertEquals(0d, bft.getRecentTruePositiveRate(), 0.0);
         bft.addTruePositive();
         bft.addTruePositive();
+        Thread.sleep(TimeUnit.SECONDS.toMillis(5L)); // wait for tick that updates rates
         assertEquals(2L, bft.getTruePositiveCount());
-        assertEquals(2L, bft.getRecentTruePositiveCount());
-        assertEquals(0L, bft.getRecentTruePositiveCount());
+        assertEquals(0.4d, bft.getRecentTruePositiveRate(), 0.0);
+        assertEquals(0.4d, bft.getRecentTruePositiveRate(), 0.0);
         assertEquals(2L, bft.getTruePositiveCount()); // sanity check
     }
 
     @Test
-    public void testAddingToOneLeavesTheOtherAlone()
+    public void testAddingToOneLeavesTheOtherAlone() throws InterruptedException
     {
-        BloomFilterTracker bft = new BloomFilterTracker();
+        BloomFilterTracker bft = BloomFilterTracker.createMeterTracker();
         bft.addFalsePositive();
         assertEquals(0L, bft.getTruePositiveCount());
-        assertEquals(0L, bft.getRecentTruePositiveCount());
+        assertEquals(0d, bft.getRecentTruePositiveRate(), 0.0);
         bft.addTruePositive();
+        Thread.sleep(TimeUnit.SECONDS.toMillis(5L)); // wait for tick that updates rates
         assertEquals(1L, bft.getFalsePositiveCount());
-        assertEquals(1L, bft.getRecentFalsePositiveCount());
+        assertEquals(0.2d, bft.getRecentFalsePositiveRate(), 0.0);
     }
 }
diff --git a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
index 6f02d1a05448..2e7f3987f43c 100644
--- a/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/LegacySSTableTest.java
@@ -222,7 +222,7 @@ public void testMutateMetadataCSM() throws Exception
                     UUID random = UUID.randomUUID();
                     try
                     {
-                        cfs.getCompactionStrategyManager().mutateRepaired(Collections.singleton(sstable), UNREPAIRED_SSTABLE, random, false);
+                        cfs.mutateRepaired(Collections.singleton(sstable), UNREPAIRED_SSTABLE, random, false);
                         if (!sstable.descriptor.version.hasPendingRepair())
                             fail("We should fail setting pending repair on unsupported sstables "+sstable);
                     }
@@ -237,7 +237,7 @@ public void testMutateMetadataCSM() throws Exception
                 {
                     try
                     {
-                        cfs.getCompactionStrategyManager().mutateRepaired(Collections.singleton(sstable), UNREPAIRED_SSTABLE, UUID.randomUUID(), true);
+                        cfs.mutateRepaired(Collections.singleton(sstable), UNREPAIRED_SSTABLE, UUID.randomUUID(), true);
                         if (!sstable.descriptor.version.hasIsTransient())
                             fail("We should fail setting pending repair on unsupported sstables "+sstable);
                     }
@@ -383,9 +383,8 @@ public void testAutomaticUpgrade() throws Exception
             truncateLegacyTables(legacyVersion);
             loadLegacyTables(legacyVersion);
             ColumnFamilyStore cfs = Keyspace.open("legacy_tables").getColumnFamilyStore(String.format("legacy_%s_simple", legacyVersion));
-            AbstractCompactionTask act = cfs.getCompactionStrategyManager().getNextBackgroundTask(0);
             // there should be no compactions to run with auto upgrades disabled:
-            assertEquals(null, act);
+            assertTrue(cfs.getCompactionStrategy().getNextBackgroundTasks(0).isEmpty());
         }
 
         DatabaseDescriptor.setAutomaticSSTableUpgradeEnabled(true);
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index 2210335592d4..3f50d2066301 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -442,13 +442,13 @@ public void testGetPositionsKeyCacheStats()
         SSTableReader sstable = store.getLiveSSTables().iterator().next();
         sstable.getPosition(k(2), SSTableReader.Operator.EQ);
         assertEquals(0, sstable.getKeyCacheHit());
-        assertEquals(1, sstable.getBloomFilterTruePositiveCount());
+        assertEquals(1, store.getBloomFilterTruePositiveCount());
         sstable.getPosition(k(2), SSTableReader.Operator.EQ);
         assertEquals(1, sstable.getKeyCacheHit());
-        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
+        assertEquals(2, store.getBloomFilterTruePositiveCount());
         sstable.getPosition(k(15), SSTableReader.Operator.EQ);
         assertEquals(1, sstable.getKeyCacheHit());
-        assertEquals(2, sstable.getBloomFilterTruePositiveCount());
+        assertEquals(2, store.getBloomFilterTruePositiveCount());
 
     }
 
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
index 24e5103884e3..3bdd2588758c 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java
@@ -45,7 +45,6 @@
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.rows.Rows;
 import org.apache.cassandra.db.rows.UnfilteredRowIterator;
-import org.apache.cassandra.db.compaction.AbstractCompactionStrategy;
 import org.apache.cassandra.db.compaction.CompactionController;
 import org.apache.cassandra.db.compaction.CompactionIterator;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -90,7 +89,7 @@ public void basicTest()
         assertEquals(1, sstables.size());
         assertEquals(sstables.iterator().next().bytesOnDisk(), cfs.metric.liveDiskSpaceUsed.getCount());
         int nowInSec = FBUtilities.nowInSeconds();
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
              SSTableRewriter writer = SSTableRewriter.constructKeepingOriginals(txn, false, 1000);
              CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
@@ -122,7 +121,7 @@ public void basicTest2()
         assertEquals(1, sstables.size());
 
         int nowInSec = FBUtilities.nowInSeconds();
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
              SSTableRewriter writer = new SSTableRewriter(txn, 1000, 10000000, false, true);
              CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
@@ -155,7 +154,7 @@ public void getPositionsTest()
 
         int nowInSec = FBUtilities.nowInSeconds();
         boolean checked = false;
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
              SSTableRewriter writer = new SSTableRewriter(txn, 1000, 10000000, false, true);
              CompactionController controller = new CompactionController(cfs, sstables, cfs.gcBefore(nowInSec));
@@ -792,7 +791,7 @@ public void testTwoWriters()
         Set<SSTableReader> sstables = Sets.newHashSet(s);
         assertEquals(1, sstables.size());
         int nowInSec = FBUtilities.nowInSeconds();
-        try (AbstractCompactionStrategy.ScannerList scanners = cfs.getCompactionStrategyManager().getScanners(sstables);
+        try (ScannerList scanners = cfs.getCompactionStrategy().getScanners(sstables);
              LifecycleTransaction txn = cfs.getTracker().tryModify(sstables, OperationType.UNKNOWN);
              SSTableRewriter writer = SSTableRewriter.constructWithoutEarlyOpening(txn, false, 1000);
              SSTableRewriter writer2 = SSTableRewriter.constructWithoutEarlyOpening(txn, false, 1000);
diff --git a/test/unit/org/apache/cassandra/repair/consistent/LocalSessionTest.java b/test/unit/org/apache/cassandra/repair/consistent/LocalSessionTest.java
index 80a12c0f30ff..2f8dae6a1dab 100644
--- a/test/unit/org/apache/cassandra/repair/consistent/LocalSessionTest.java
+++ b/test/unit/org/apache/cassandra/repair/consistent/LocalSessionTest.java
@@ -194,7 +194,7 @@ protected boolean isNodeInitialized()
 
         public Map<UUID, Integer> completedSessions = new HashMap<>();
 
-        protected void sessionCompleted(LocalSession session)
+        public void sessionCompleted(LocalSession session)
         {
             UUID sessionID = session.sessionID;
             int calls = completedSessions.getOrDefault(sessionID, 0);
diff --git a/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java b/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
index 9172a7ce462b..67ecc607c198 100644
--- a/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
+++ b/test/unit/org/apache/cassandra/repair/consistent/PendingRepairStatTest.java
@@ -122,7 +122,7 @@ private static void mutateRepaired(SSTableReader sstable, long repairedAt, UUID
     {
         try
         {
-            cfs.getCompactionStrategyManager().mutateRepaired(Collections.singleton(sstable), repairedAt, pendingRepair, false);
+            cfs.mutateRepaired(Collections.singleton(sstable), repairedAt, pendingRepair, false);
         }
         catch (IOException e)
         {
diff --git a/test/unit/org/apache/cassandra/tools/CompactionLogAnalyzerTest.java b/test/unit/org/apache/cassandra/tools/CompactionLogAnalyzerTest.java
new file mode 100644
index 000000000000..12b2e61d6bda
--- /dev/null
+++ b/test/unit/org/apache/cassandra/tools/CompactionLogAnalyzerTest.java
@@ -0,0 +1,136 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.cassandra.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.text.ParseException;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+
+public class CompactionLogAnalyzerTest
+{
+    private final String UNREPAIRED_SHARD_NAME = "0-Unrepaired-shard_0";
+    private final String REPAIRED_SHARD_NAME = "0-Repaired-shard_1";
+
+    private File[] logFiles;
+
+    @Before
+    public void before()
+    {
+        ClassLoader classLoader = getClass().getClassLoader();
+        File dir = new File(classLoader.getResource("org/apache/cassandra/tools/compaction_logs").getFile());
+        Assert.assertTrue(dir.isDirectory());
+
+        logFiles = dir.listFiles();
+        Assert.assertEquals(2, logFiles.length);
+        Arrays.sort(logFiles, Comparator.comparing(File::getName));
+    }
+
+    @Test
+    public void testReadDataPointsWithoutLimit() throws IOException, ParseException
+    {
+        List<CompactionLogAnalyzer.DataPoint> dataPoints = CompactionLogAnalyzer.readDataPoints(logFiles, null);
+        // Count lines with readAllLines: unlike an unclosed Files.lines stream it leaves no open file handle behind.
+        long lineCount0 = Files.readAllLines(logFiles[0].toPath()).size();
+        long lineCount1 = Files.readAllLines(logFiles[1].toPath()).size();
+        Assert.assertEquals(lineCount0 + lineCount1 - 2, dataPoints.size()); // without headers
+
+        int shard0Cnt = 0;
+        int shard1Cnt = 0;
+        for (CompactionLogAnalyzer.DataPoint dataPoint : dataPoints)
+        {
+            if (dataPoint.shardId.equals(REPAIRED_SHARD_NAME))
+            {
+                shard0Cnt++;
+            }
+            else if (dataPoint.shardId.equals(UNREPAIRED_SHARD_NAME))
+            {
+                shard1Cnt++;
+            }
+            else
+            {
+                throw new AssertionError("Unexpected shard id");
+            }
+        }
+        Assert.assertEquals(lineCount0 - 1, shard0Cnt);
+        Assert.assertEquals(lineCount1 - 1, shard1Cnt);
+    }
+
+    @Test
+    public void testReadDataPointsWithLimit() throws IOException, ParseException
+    {
+        int limit = 100;
+
+        List<CompactionLogAnalyzer.DataPoint> dataPoints = CompactionLogAnalyzer.readDataPoints(logFiles, limit);
+
+        Assert.assertEquals(2 * limit, dataPoints.size()); // without headers
+
+        int shard0Cnt = 0;
+        int shard1Cnt = 0;
+        for (CompactionLogAnalyzer.DataPoint dataPoint : dataPoints)
+        {
+            if (dataPoint.shardId.equals(REPAIRED_SHARD_NAME))
+            {
+                shard0Cnt++;
+            }
+            else if (dataPoint.shardId.equals(UNREPAIRED_SHARD_NAME))
+            {
+                shard1Cnt++;
+            }
+            else
+            {
+                throw new AssertionError("Unexpected shard id");
+            }
+        }
+        Assert.assertEquals(limit, shard0Cnt);
+        Assert.assertEquals(limit, shard1Cnt);
+    }
+
+    @Test
+    public void testProcessDataPoints() throws IOException, ParseException
+    {
+        List<CompactionLogAnalyzer.DataPoint> dataPoints = CompactionLogAnalyzer.readDataPoints(logFiles, null);
+        JSONArray jsonArray = CompactionLogAnalyzer.processData(dataPoints);
+        int expectedLevels = dataPoints.stream()
+                                       .mapToInt(dp -> dp.bucket)
+                                       .max()
+                                       .getAsInt() + 1; // + L0
+        Assert.assertEquals(expectedLevels + 1, jsonArray.size());  // + Total
+        // Collect the "intervals" array of every entry (index i, not always entry 0).
+        JSONArray[] intervals = new JSONArray[expectedLevels + 1];
+        for (int i = 0; i < intervals.length; i++)
+        {
+            intervals[i] = (JSONArray) ((JSONObject) jsonArray.get(i)).get("intervals");
+        }
+        // Every entry is expected to report the same number of intervals as the first one.
+        for (int i = 1; i < intervals.length; i++)
+        {
+            Assert.assertEquals(intervals[0].size(), intervals[i].size());
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/utils/ExpMovingAverageTest.java b/test/unit/org/apache/cassandra/utils/ExpMovingAverageTest.java
new file mode 100644
index 000000000000..0426f5993ebe
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/ExpMovingAverageTest.java
@@ -0,0 +1,85 @@
+/*
+ * Copyright DataStax, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+public class ExpMovingAverageTest
+{
+    private static final double epsilon = 0.0001;
+
+    @Test
+    public void testUpdates()
+    {
+        ExpMovingAverage average = new ExpMovingAverage(0.5);
+        assertNotNull(average.toString());
+
+        average.update(10);
+
+        assertEquals(10, average.get(), epsilon);
+
+        average.update(11);
+        assertEquals(10.5, average.get(), epsilon);
+
+        average.update(12);
+        assertEquals(11.25, average.get(), epsilon);
+
+        average.update(11.75);
+
+        assertEquals(11.5, average.get(), epsilon);
+    }
+
+    @Test
+    public void testDecay10()
+    {
+        testDecay(10, 0.01, ExpMovingAverage.decayBy10());
+    }
+
+    @Test
+    public void testDecay100()
+    {
+        testDecay(100, 0.01, ExpMovingAverage.decayBy100());
+    }
+
+    @Test
+    public void testDecay1000()
+    {
+        testDecay(1000, 0.01, ExpMovingAverage.decayBy1000());
+    }
+
+    @Test
+    public void testDecay()
+    {
+        double ratio = 0.1;
+        int count = 50;
+        testDecay(count, ratio, ExpMovingAverage.withDecay(ratio, count));
+    }
+
+    public void testDecay(int count, double expectedBelow, MovingAverage average)
+    {
+        average.update(1.0);    // on initialization average takes the exact value
+        for (int i = 0; i < count; ++i)
+            average.update(0.0);
+
+        assertTrue(average.get() <= expectedBelow + epsilon);
+        assertTrue(average.get() >= expectedBelow / 2 - epsilon);
+    }
+}
\ No newline at end of file
diff --git a/tools/analytics/plot_adaptive.gnu b/tools/analytics/plot_adaptive.gnu
new file mode 100644
index 000000000000..31a8088b293b
--- /dev/null
+++ b/tools/analytics/plot_adaptive.gnu
@@ -0,0 +1,71 @@
+# Run with: gnuplot <repo>/tools/analytics/plot_adaptive.gnu (from the directory holding the CSV files, see `path`)
+# Documentation: http://www.gnuplot.info/docs_5.2/Gnuplot_5.2.pdf
+
+path = "."
+outputPath = path."/"
+
+# define the output type and size
+set terminal png size 2000,800 linewidth 2
+
+# the first line makes it skip the first row and use it for the legend otherwise we must specify for every plot:
+# ... every ::1 using 1:n with lines title "blah"
+set key autotitle columnhead
+set datafile separator ","
+
+# timestamp ms,W,num compactions,live sstables,space used bytes,Num inserted,Num read,% inserted,% read,Read IO,Read IO stddev,Write IO,Write IO stddev,Tot IO,Tot IO stddev, Num pending, WA
+
+#datasets = "R100_W0 R80_W20 R50_W50 R20_W80 R0_W100"
+#datasets = "R80_W20 R50_W50 R20_W80"
+#datasets = "R50_W50 R20_W80"
+datasets = "R50_W50"
+N = words(datasets)
+
+dataset(i) = word(datasets, i)
+file(i) = sprintf("%s/testAdaptiveController_%s.csv", path, word(datasets, i))
+
+set xlabel 'Time (ms)'
+#set xrange [0:300000]
+
+set output outputPath.'live_sstables.png'
+set title "Live sstables"
+set ylabel 'Num sstables'
+plot for [i=1:N] file(i) using 1:4 with linespoints title dataset(i)
+
+set output outputPath.'space_used.png'
+set title "Space used"
+set ylabel 'Bytes Used'
+plot for [i=1:N] file(i) using 1:5 with linespoints title dataset(i)
+
+
+set ylabel 'IO cost (ms)'
+set y2label 'W'
+set y2range [-20:100]
+set y2tics -12, 4
+set ytics nomirror
+
+do for [i=1:N] {
+	set ylabel 'IO cost (ms)'  # reset on every pass: the WA plot at the end of the loop body overrides it
+	set output outputPath.'read_io_cost_'.dataset(i).'.png'
+	set title "Read IO cost"
+	plot file(i) using 1:10:11 with yerrorbars lt 1 title "", \
+	     file(i) using 1:10 smooth acsplines lt 1 title dataset(i), \
+	     file(i) using 1:2 with lines lt 2 axis x1y2 title "W"
+
+	set output outputPath.'write_io_cost_'.dataset(i).'.png'
+	set title "Write IO cost"
+	plot file(i) using 1:12:13 with yerrorbars lt 1 title "", \
+	     file(i) using 1:12 smooth acsplines lt 1 title dataset(i), \
+	     file(i) using 1:2 with lines lt 2 axis x1y2 title "W"
+
+	set output outputPath.'tot_io_cost_'.dataset(i).'.png'
+	set title "Tot. IO cost"
+	plot file(i) using 1:14:15 with yerrorbars lt 1 title "", \
+	     file(i) using 1:14 smooth acsplines lt 1 title dataset(i), \
+	     file(i) using 1:2 with lines lt 2 axis x1y2 title "W"
+
+	set output outputPath.'wa'.dataset(i).'.png'
+	set title "Write Amplification"
+	set ylabel 'WA'
+	plot file(i) using 1:17 with linespoints title 'WA', \
+	     file(i) using 1:2 with lines lt 3 axis x1y2 title "W"
+}
\ No newline at end of file
diff --git a/tools/analytics/plot_static.gnu b/tools/analytics/plot_static.gnu
new file mode 100644
index 000000000000..a1e571aab46a
--- /dev/null
+++ b/tools/analytics/plot_static.gnu
@@ -0,0 +1,59 @@
+# Run with: gnuplot <repo>/tools/analytics/plot_static.gnu (from the directory holding the CSV files, see `path`)
+# Documentation: http://www.gnuplot.info/docs_5.2/Gnuplot_5.2.pdf
+
+path = "."
+outputPath = path."/"
+
+# define the output type and size
+set terminal png size 2000,800 linewidth 2
+
+# the first line makes it skip the first row and use it for the legend otherwise we must specify for every plot:
+# ... every ::1 using 1:n with lines title "blah"
+set key autotitle columnhead
+set datafile separator ","
+
+# timestamp ms,W,num compactions,live sstables,space used bytes,Num inserted,Num read,% inserted,% read,Read IO,Read IO stddev,Write IO,Write IO stddev,Tot IO,Tot IO stddev, Num pending, WA
+
+#datasets = "R100_W0 R80_W20 R50_W50 R20_W80 R0_W100"
+#datasets = "R80_W20 R50_W50 R20_W80"
+datasets = "R50_W50"
+N = words(datasets)
+
+dataset(i) = word(datasets, i)
+file(i) = sprintf("%s/testStaticAnalysis_%s-avg.csv", path, word(datasets, i))
+
+set xlabel 'W'
+
+set output outputPath.'live_sstables.png'
+set title "Live sstables"
+set ylabel 'Num sstables'
+plot for [i=1:N] file(i) using 2:4 with linespoints title dataset(i)
+
+set output outputPath.'space_used.png'
+set title "Space used"
+set ylabel 'Bytes Used'
+plot for [i=1:N] file(i) using 2:5 with linespoints title dataset(i)
+
+
+set ylabel 'ms'
+
+set output outputPath.'read_io_cost.png'
+set title "Read IO cost"
+plot for [i=1:N] file(i) using 2:10:11 with yerrorbars lt i title "", \
+     for [i=1:N] file(i) using 2:10 smooth acsplines lt i title dataset(i)
+
+set output outputPath.'write_io_cost.png'
+set title "Write IO cost"
+plot for [i=1:N] file(i) using 2:12:13 with yerrorbars lt i title "", \
+     for [i=1:N] file(i) using 2:12 smooth acsplines lt i title dataset(i)
+
+set output outputPath.'tot_io_cost.png'
+set title "Tot. IO cost"
+plot for [i=1:N] file(i) using 2:14:15 with yerrorbars lt i title "", \
+     for [i=1:N] file(i) using 2:14 smooth acsplines lt i title dataset(i)
+
+
+set output outputPath.'wa.png'
+set title "Write Amplification"
+set ylabel 'WA'
+plot for [i=1:N] file(i) using 2:17 with linespoints title dataset(i)
\ No newline at end of file
diff --git a/tools/bin/analyzecompactionlog b/tools/bin/analyzecompactionlog
new file mode 100755
index 000000000000..3c3bbe5aabca
--- /dev/null
+++ b/tools/bin/analyzecompactionlog
@@ -0,0 +1,48 @@
+#!/bin/sh
+
+# Copyright DataStax, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ "x$CASSANDRA_INCLUDE" = "x" ]; then
+    # Locations (in order) to use when searching for an include file.
+    for include in "`dirname "$0"`/cassandra.in.sh" \
+                   "$HOME/.cassandra.in.sh" \
+                   /usr/share/cassandra/cassandra.in.sh \
+                   /usr/local/share/cassandra/cassandra.in.sh \
+                   /opt/cassandra/cassandra.in.sh; do
+        if [ -r "$include" ]; then
+            . "$include"
+            break
+        fi
+    done
+elif [ -r "$CASSANDRA_INCLUDE" ]; then
+    . "$CASSANDRA_INCLUDE"
+fi
+
+if [ -z "$CLASSPATH" ]; then
+    echo "You must set the CLASSPATH var" >&2
+    exit 1
+fi
+
+if [ "x$MAX_HEAP_SIZE" = "x" ]; then
+    MAX_HEAP_SIZE="512m"
+fi
+
+"$JAVA" $JAVA_AGENT -ea -cp "$CLASSPATH" $JVM_OPTS -Xmx$MAX_HEAP_SIZE \
+        -Dcassandra.storagedir="$cassandra_storagedir" \
+        -Dlogback.configurationFile=logback-tools.xml \
+        $JVM_ARGS \
+        org.apache.cassandra.tools.CompactionLogAnalyzer "$@"
+
+# vi:ai sw=4 ts=4 tw=0 et
\ No newline at end of file
diff --git a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
index 7afc4f123f03..1c4c9b43f535 100644
--- a/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
+++ b/tools/stress/src/org/apache/cassandra/stress/CompactionStress.java
@@ -221,7 +221,7 @@ public void run()
 
             StressProfile stressProfile = getStressProfile();
             ColumnFamilyStore cfs = initCf(stressProfile, true);
-            cfs.getCompactionStrategyManager().compactionLogger.enable();
+            cfs.getCompactionStrategy().getCompactionLogger().enable();
 
             List<Future<?>> futures = new ArrayList<>(threads);
             if (maximal)
@@ -231,20 +231,20 @@ public void run()
             else
             {
                 cfs.enableAutoCompaction();
-                cfs.getCompactionStrategyManager().enable();
+                cfs.getCompactionStrategyContainer().enable();
                 for (int i = 0; i < threads; i++)
-                    futures.addAll(CompactionManager.instance.submitBackground(cfs));
+                    futures.add(CompactionManager.instance.submitBackground(cfs));
             }
 
             long working;
             //Report compaction stats while working
-            while ((working = futures.stream().filter(f -> !f.isDone()).count()) > 0 || CompactionManager.instance.getActiveCompactions() > 0 || (!maximal && cfs.getCompactionStrategyManager().getEstimatedRemainingTasks() > 0))
+            while ((working = futures.stream().filter(f -> !f.isDone()).count()) > 0 || CompactionManager.instance.getActiveCompactions() > 0 || (!maximal && cfs.getCompactionStrategy().getEstimatedRemainingTasks() > 0))
             {
                 //Re-up any bg jobs
                 if (!maximal)
                 {
                     for (long i = working; i < threads; i++)
-                        futures.addAll(CompactionManager.instance.submitBackground(cfs));
+                        futures.add(CompactionManager.instance.submitBackground(cfs));
                 }
 
                 reportCompactionStats();

From 75ba24f4f71b2a1b98a38b0885d270702b64dc56 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 16 Jul 2021 12:24:54 +0300
Subject: [PATCH 110/151] Fix JMH libraries

---
 build.xml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/build.xml b/build.xml
index 537c87da39a7..45d6ddc8d191 100644
--- a/build.xml
+++ b/build.xml
@@ -1235,10 +1235,12 @@
 
   <target name="build-jmh" depends="build-test, jar" description="Create JMH uber jar">
       <jar jarfile="${build.test.dir}/deps.jar">
-          <zipgroupfileset dir="${build.dir.lib}/jars">
+          <zipgroupfileset dir="${build.test.dir}/lib/jars">
               <include name="*jmh*.jar"/>
+              <include name="junit*.jar"/>
               <include name="jopt*.jar"/>
               <include name="commons*.jar"/>
+              <include name="hamcrest*.jar"/>
           </zipgroupfileset>
           <zipgroupfileset dir="${build.lib}" includes="*.jar"/>
       </jar>

From e3e170f91846e2a076884f03bb070007e578d128 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Wed, 21 Jul 2021 23:20:15 +0200
Subject: [PATCH 111/151] STAR-839: Add the ability to choose the log file
 configuration for testclasslist-xxx targets (#225)

Also added logback-test-jenkins.xml, which disables logging to the console, sets the root log level to TRACE, and filters messages below DEBUG out of the log file.
---
 build.xml                          |  7 +++-
 test/conf/logback-test-jenkins.xml | 63 ++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 1 deletion(-)
 create mode 100644 test/conf/logback-test-jenkins.xml

diff --git a/build.xml b/build.xml
index 45d6ddc8d191..34a1b9a4d2da 100644
--- a/build.xml
+++ b/build.xml
@@ -66,7 +66,7 @@
     <property name="test.driver.read_timeout_ms" value="12000"/>
     <property name="dist.dir" value="${build.dir}/dist"/>
     <property name="tmp.dir" value="${java.io.tmpdir}"/>
-
+    <property name="test.logback.configurationFile" value="${test.conf}/logback-test.xml"/>
     <property name="doc.dir" value="${basedir}/doc"/>
 
     <condition property="version" value="${base.version}">
@@ -1443,6 +1443,7 @@
         <jvmarg value="-Dcassandra.ring_delay_ms=1000"/>
         <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
         <jvmarg value="-Dcassandra.skip_sync=true" />
+        <jvmarg value="-Dlogback.configurationFile=file://${test.logback.configurationFile}"/>
       </testmacrohelper>
     </sequential>
   </macrodef>
@@ -1466,6 +1467,7 @@
         <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
         <jvmarg value="-Dcassandra.config=file:///${compressed_yaml}"/>
         <jvmarg value="-Dcassandra.skip_sync=true" />
+        <jvmarg value="-Dlogback.configurationFile=file://${test.logback.configurationFile}"/>
       </testmacrohelper>
     </sequential>
   </macrodef>
@@ -1486,6 +1488,7 @@
         <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
         <jvmarg value="-Dcassandra.config=file:///${cdc_yaml}"/>
         <jvmarg value="-Dcassandra.skip_sync=true" />
+        <jvmarg value="-Dlogback.configurationFile=file://${test.logback.configurationFile}"/>
       </testmacrohelper>
     </sequential>
   </macrodef>
@@ -1503,6 +1506,7 @@
         <jvmarg value="-Dcassandra.config.loader=org.apache.cassandra.OffsetAwareConfigurationLoader"/>
         <jvmarg value="-Dcassandra.skip_sync=true" />
         <jvmarg value="-Dcassandra.sstable.format.default=big" />
+        <jvmarg value="-Dlogback.configurationFile=file://${test.logback.configurationFile}"/>
       </testmacrohelper>
     </sequential>
   </macrodef>
@@ -1523,6 +1527,7 @@
         <jvmarg value="-Dcassandra.tolerate_sstable_size=true"/>
         <jvmarg value="-Dcassandra.config=file:///${system_keyspaces_directory_yaml}"/>
         <jvmarg value="-Dcassandra.skip_sync=true" />
+        <jvmarg value="-Dlogback.configurationFile=file://${test.logback.configurationFile}"/>
       </testmacrohelper>
     </sequential>
   </macrodef>
diff --git a/test/conf/logback-test-jenkins.xml b/test/conf/logback-test-jenkins.xml
new file mode 100644
index 000000000000..781ec1be83b6
--- /dev/null
+++ b/test/conf/logback-test-jenkins.xml
@@ -0,0 +1,63 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<configuration debug="false" scan="true" scanPeriod="60 seconds">
+  <!-- Shutdown hook ensures that async appender flushes -->
+  <shutdownHook class="ch.qos.logback.core.hook.DelayingShutdownHook"/>
+
+  <!-- Status listener is used to wrap stdout/stderr and tee to log file -->
+  <statusListener class="org.apache.cassandra.LogbackStatusListener" />
+
+  <appender name="FILE" class="ch.qos.logback.core.rolling.RollingFileAppender">
+
+    <file>./build/test/logs/${cassandra.testtag}/TEST-${suitename}.log</file>
+    <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
+      <fileNamePattern>./build/test/logs/${cassandra.testtag}/TEST-${suitename}.log.%i.gz</fileNamePattern>
+      <minIndex>1</minIndex>
+      <maxIndex>20</maxIndex>
+    </rollingPolicy>
+
+    <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
+      <maxFileSize>20MB</maxFileSize>
+    </triggeringPolicy>
+
+    <encoder>
+      <pattern>%-5level [%thread] %date{ISO8601} %msg%n</pattern>
+    </encoder>
+    <immediateFlush>false</immediateFlush>
+
+    <filter class="ch.qos.logback.classic.filter.ThresholdFilter">
+      <level>DEBUG</level>
+    </filter>
+  </appender>
+
+  <!-- Do not change the name of this appender. LogbackStatusListener uses the thread name
+       tied to the appender name to know when to write to real stdout/stderr vs forwarding to logback -->
+  <appender name="ASYNC" class="ch.qos.logback.classic.AsyncAppender">
+      <discardingThreshold>0</discardingThreshold>
+      <maxFlushTime>0</maxFlushTime>
+      <queueSize>1024</queueSize>
+      <appender-ref ref="FILE"/>
+      <includeCallerData>true</includeCallerData>
+  </appender>
+
+  <root level="TRACE">
+    <appender-ref ref="ASYNC" />
+  </root>
+</configuration>

From e41091328a9281184ba22bd0bd48b78f72f165b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Wed, 14 Jul 2021 13:31:08 +0200
Subject: [PATCH 112/151] STAR-821 Add failing test to expose problem with OR
 and PK restrictions

Handling OR and primary key restriction in WHERE does not work properly.
To be fixed by subsequent commits.
---
 .../index/sai/cql/ComplexQueryTest.java       | 30 +++++++++++++++++--
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
index 1dc18601b843..cd022d05f836 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
@@ -136,6 +136,29 @@ public void complexQueryWithMultipleClusterings() throws Throwable
 
     @Test
     public void complexQueryWithPartitionKeyRestriction() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, a int, b int, PRIMARY KEY(pk, ck))");
+
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 1, 1, 1, 5);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 1, 2, 2, 6);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 1, 3, 7);
+        execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 2, 4, 8);
+
+        UntypedResultSet resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 AND (a = 2 OR b = 7)");
+
+        assertRowsIgnoringOrder(resultSet, row(1, 2));
+
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7"))
+                .isInstanceOf(InvalidRequestException.class)
+                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+
+        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR (a = 2 OR b = 7)");
+
+        assertRowsIgnoringOrder(resultSet, row(1, 1), row(1, 2), row(2, 1));
+    }
+
+    @Test
+    public void complexQueryWithPartitionKeyRestrictionAndIndexes() throws Throwable
     {
         createTable("CREATE TABLE %s (pk int, ck int, a int, b int, PRIMARY KEY(pk, ck))");
         createIndex("CREATE CUSTOM INDEX ON %s(a) USING 'StorageAttachedIndex'");
@@ -150,10 +173,11 @@ public void complexQueryWithPartitionKeyRestriction() throws Throwable
 
         assertRowsIgnoringOrder(resultSet, row(1, 2));
 
-        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7")).isInstanceOf(InvalidRequestException.class)
-                                                                                                 .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7"))
+                .isInstanceOf(InvalidRequestException.class)
+                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
 
-        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7 ALLOW FILTERING");
+        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR (a = 2 OR b = 7)");
 
         assertRowsIgnoringOrder(resultSet, row(1, 1), row(1, 2), row(2, 1));
     }

From f2ace24ba6f797049752e1e750ec2f548327ee0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Wed, 14 Jul 2021 13:35:10 +0200
Subject: [PATCH 113/151] STAR-821 Add a way to flatten WHERE expression tree

Collapsing levels with the same operator makes reasoning
about expression properties much simpler. AND and OR are
commutative and associative, so nesting is just a distraction.

Additionally refactored the expression tree container nodes
to be truly immutable and fixed a subtle error in rename() that
caused an OR node to be replaced by an AND node.
---
 .../apache/cassandra/cql3/WhereClause.java    | 98 +++++++++++++++----
 .../cql3/WhereClauseExpressionTreeTest.java   | 68 +++++++++++++
 2 files changed, 149 insertions(+), 17 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index 9c69d6d8ae68..9cb7a14853af 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -17,12 +17,7 @@
  */
 package org.apache.cassandra.cql3;
 
-import java.util.ArrayDeque;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Deque;
-import java.util.List;
-import java.util.Objects;
+import java.util.*;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.Lists;
@@ -257,7 +252,9 @@ boolean higherPrecedence(Operator operator)
 
         ContainerElement asContainer()
         {
-            return operator == Operator.OR ? new OrElement().add(expressionElements) : new AndElement().add(expressionElements);
+            return operator == Operator.OR
+                    ? new OrElement(expressionElements)
+                    : new AndElement(expressionElements);
         }
     }
 
@@ -314,6 +311,23 @@ public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
         {
             return this;
         }
+
+        /**
+         * Collapses expression tree levels of the same type.
+         * This is possible because OR and AND operations are associative.
+         *
+         * <p>
+         * Example:
+         * <ul>
+         *   <li>{@code AND(a, AND(b, c)) -> AND(a, b, c)}</li>
+         *   <li>{@code OR(OR(a, b), OR(c, d)) -> OR(a, b, c, d)}</li>
+         * </ul>
+         * This base implementation returns the element itself, unchanged.
+         */
+        public ExpressionElement flatten()
+        {
+            return this;
+        }
     }
 
     public static abstract class VariableElement extends ExpressionElement
@@ -392,7 +406,15 @@ public String toString()
 
     public static abstract class ContainerElement extends ExpressionElement
     {
-        protected final List<ExpressionElement> children = new ArrayList<>();
+        protected final List<ExpressionElement> children;
+
+        protected ContainerElement(Collection<ExpressionElement> children)
+        {
+            this.children = new ArrayList<>(children.size());
+            this.children.addAll(children);
+        }
+
+        protected abstract ContainerElement withChildren(Collection<ExpressionElement> children);
 
         @Override
         public List<ContainerElement> operations()
@@ -403,12 +425,6 @@ public List<ContainerElement> operations()
                            .collect(Collectors.toList());
         }
 
-        public ContainerElement add(Deque<ExpressionElement> children)
-        {
-            this.children.addAll(children);
-            return this;
-        }
-
         protected abstract Operator operator();
 
         @Override
@@ -438,9 +454,35 @@ public boolean containsCustomExpressions()
         @Override
         public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
         {
-            AndElement element = new AndElement();
-            children.stream().map(c -> c.rename(from, to)).forEach(c -> element.children.add(c));
-            return element;
+            List<ExpressionElement> newChildren = children
+                    .stream()
+                    .map(c -> c.rename(from, to))
+                    .collect(Collectors.toList());
+
+            return this.withChildren(newChildren);
+        }
+
+        @Override
+        public ExpressionElement flatten()
+        {
+            List<ExpressionElement> newChildren = new ArrayList<>();
+            for (ExpressionElement child: children)
+            {
+                ExpressionElement flattened = child.flatten();
+                newChildren.add(flattened);
+
+                if (flattened instanceof ContainerElement)
+                {
+                    ContainerElement ce = (ContainerElement) flattened;
+                    if (ce.operator() == this.operator())
+                    {
+                        newChildren.remove(newChildren.size() - 1);
+                        newChildren.addAll(ce.children);
+                    }
+                }
+            }
+
+            return this.withChildren(newChildren);
         }
 
         @Override
@@ -458,6 +500,17 @@ public String toEncapsulatedString()
 
     public static class AndElement extends ContainerElement
     {
+        public AndElement(Collection<ExpressionElement> children)
+        {
+            super(children);
+        }
+
+        @Override
+        protected AndElement withChildren(Collection<ExpressionElement> children)
+        {
+            return new AndElement(children);
+        }
+
         @Override
         protected Operator operator()
         {
@@ -467,6 +520,17 @@ protected Operator operator()
 
     public static class OrElement extends ContainerElement
     {
+        public OrElement(Collection<ExpressionElement> children)
+        {
+            super(children);
+        }
+
+        @Override
+        protected OrElement withChildren(Collection<ExpressionElement> children)
+        {
+            return new OrElement(children);
+        }
+
         @Override
         protected Operator operator()
         {
diff --git a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
index 8de2ce40ff1e..00dc0639a720 100644
--- a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
@@ -26,6 +26,7 @@
 import org.apache.cassandra.exceptions.SyntaxException;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 public class WhereClauseExpressionTreeTest
 {
@@ -101,6 +102,73 @@ public void disjunctionExpressionWithPrecedence() throws Throwable
         testExpression("a = 1 AND (b = 1 OR (c = 1 AND d = 1 AND e = 1))");
     }
 
+    @Test
+    public void flattenConjunction() throws Throwable
+    {
+        WhereClause clause = WhereClause.parse("a = 1 AND (b = 1 AND c = 1)");
+        WhereClause.ExpressionElement flattened = clause.root().flatten();
+        assertTrue(flattened instanceof WhereClause.AndElement);
+        assertEquals(3, ((WhereClause.ContainerElement) flattened).children.size());
+        assertEquals("a = 1 AND b = 1 AND c = 1", flattened.toString());
+    }
+
+    @Test
+    public void flattenDisjunction() throws Throwable
+    {
+        WhereClause clause = WhereClause.parse("a = 1 OR (b = 1 OR c = 1)");
+        WhereClause.ExpressionElement flattened = clause.root().flatten();
+        assertTrue(flattened instanceof WhereClause.OrElement);
+        assertEquals(3, ((WhereClause.ContainerElement) flattened).children.size());
+        assertEquals("a = 1 OR b = 1 OR c = 1", flattened.toString());
+    }
+
+    @Test
+    public void flattenDeeplyNested() throws Throwable
+    {
+        WhereClause.ExpressionElement flattened;
+
+        // deeper nesting, right
+        flattened = WhereClause.parse("a = 1 OR (b = 1 OR (c = 1 OR d = 1))").root().flatten();
+        assertEquals("a = 1 OR b = 1 OR c = 1 OR d = 1", flattened.toString());
+
+        // deeper nesting, left
+        flattened = WhereClause.parse("((a = 1 OR b = 1) OR c = 1) OR d = 1").root().flatten();
+        assertEquals("a = 1 OR b = 1 OR c = 1 OR d = 1", flattened.toString());
+    }
+
+
+    @Test
+    public void flattenMixed() throws Throwable
+    {
+        WhereClause.ExpressionElement flattened;
+
+        flattened = WhereClause.parse("a = 1 OR (b = 1 AND c = 1)").root().flatten();
+        assertEquals("a = 1 OR (b = 1 AND c = 1)", flattened.toString());
+
+        flattened = WhereClause.parse("(a = 1 OR (b = 1 OR c = 1)) AND (d = 1 AND (e = 1 OR f = 1))").root().flatten();
+        assertEquals("(a = 1 OR b = 1 OR c = 1) AND d = 1 AND (e = 1 OR f = 1)", flattened.toString());
+    }
+
+    @Test
+    public void rename() throws Throwable
+    {
+        WhereClause.ExpressionElement root = WhereClause.parse("a1 = 1 OR (b1 = 1 AND c1 = 1)").root();
+
+        WhereClause.ExpressionElement renamed1 =
+                root.rename(
+                        ColumnIdentifier.getInterned("a1", false),
+                        ColumnIdentifier.getInterned("a2", false));
+
+        assertEquals("a2 = 1 OR (b1 = 1 AND c1 = 1)", renamed1.toString());
+
+        WhereClause.ExpressionElement renamed2 =
+                root.rename(
+                        ColumnIdentifier.getInterned("b1", false),
+                        ColumnIdentifier.getInterned("b2", false));
+
+        assertEquals("a1 = 1 OR (b2 = 1 AND c1 = 1)", renamed2.toString());
+    }
+
     @Test
     public void randomTest() throws Throwable
     {

From d4ae7e8698f0cfb8cfaa23c48d359f232cc51a81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Wed, 14 Jul 2021 17:08:41 +0200
Subject: [PATCH 114/151] STAR-821 Add ExpressionElement#conjunctiveForm()

This new method should simplify transforming an expression tree
into a flat set of statement restrictions. A conjunctive, flat expression
form makes it possible to look only at the top level of the expression
tree to grab all restrictions that should be "ANDed", so there should
be no need for any further recursion in the StatementRestrictions builder.

This way we can also enforce that partition key restrictions
must be top-level by just inspecting the root element of
the where clause expression.
---
 .../apache/cassandra/cql3/WhereClause.java    | 111 +++++++++++++-----
 .../cql3/WhereClauseExpressionTreeTest.java   |  32 +++++
 2 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index 9cb7a14853af..ee35109f4bd1 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.cql3;
 
 import java.util.*;
+import java.util.function.Predicate;
 import java.util.stream.Collectors;
 
 import com.google.common.collect.Lists;
@@ -68,6 +69,15 @@ public WhereClause renameIdentifier(ColumnIdentifier from, ColumnIdentifier to)
         return new WhereClause(rootElement.rename(from, to));
     }
 
+    /**
+     * @return a new WhereClause with the expression tree transformed into conjunctive form
+     * @see ExpressionElement#conjunctiveForm()
+     */
+    public WhereClause conjunctiveForm()
+    {
+        return new WhereClause(rootElement.conjunctiveForm());
+    }
+
     public static WhereClause parse(String cql) throws RecognitionException
     {
         return CQLFragmentParser.parseAnyUnhandled(CqlParser::whereClause, cql).build();
@@ -300,12 +310,26 @@ public List<CustomIndexExpression> expressions()
             return Collections.emptyList();
         }
 
-        public boolean containsCustomExpressions()
+        /**
+         * Returns true if the given function f evaluates to true on any of the expression tree nodes.
+         */
+        public abstract boolean exists(Predicate<ExpressionElement> f);
+
+        /**
+         * Returns true if this expression tree contains more than one relation.
+         */
+        public final boolean isCompound()
         {
-            return false;
+            return exists(e -> e instanceof ContainerElement && ((ContainerElement) e).children.size() > 1);
         }
 
-        public abstract String toEncapsulatedString();
+        /**
+         * Returns true if this expression tree contains a CustomIndexExpressionElement node.
+         */
+        public final boolean containsCustomExpressions()
+        {
+            return exists(e -> e instanceof CustomIndexExpressionElement);
+        }
 
         public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
         {
@@ -313,29 +337,63 @@ public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)
         }
 
         /**
-         * Collapses expression tree levels of the same type.
-         * This is possible because OR and AND operations are commutative.
+         * Collapses expression tree levels of the same type to form a semantically equivalent,
+         * but simpler form of this tree.
+         *
+         * Collapsing is possible because OR and AND operations are associative.
          *
          * <p>
-         * Example:
-         * <ul>
-         *   <li>AND(a, AND(b, c)) -> AND(a, b, c)</li>
-         *   <li>OR(OR(a, b), OR(c, d)) -> OR(a, b, c, d)</li>
-         * </ul>
+         * Examples:
+         * <pre>
+         * AND(a, AND(b, c))      -> AND(a, b, c)
+         * OR(OR(a, b), OR(c, d)) -> OR(a, b, c, d)
+         * AND(a, OR(b, c))       -> AND(a, OR(b, c))
+         * </pre>
          * </p>
+         *
+         * @return a new tree; this tree is left unmodified
          */
         public ExpressionElement flatten()
         {
             return this;
         }
+
+        /**
+         * Creates a new tree that is a conjunctive form of this tree, semantically equivalent to this tree.
+         * The root of the conjunctive form is always an AndElement.
+         *
+         * The result tree is flattened so that nested conjunctions are lifted up to become the direct
+         * children of the root element. If the original tree does not have a top-level AndElement,
+         * an AndElement is inserted at the top, and a flattened original tree becomes its only child.
+         *
+         * <p>
+         * Examples:
+         * <pre>
+         * a = 1                                 -> AND(a = 1)
+         * AND(a = 1, b = 2)                     -> AND(a = 1, b = 2)
+         * AND(a = 1, AND(b = 2, c = 3))         -> AND(a = 1, b = 2, c = 3)
+         * OR(a = 1, b = 2)                      -> AND(OR(a = 1, b = 2))
+         * OR(a = 1, OR(b = 2, c = 3))           -> AND(OR(a = 1, b = 2, c = 3))
+         * </pre>
+         * </p>
+         *
+         * @return a new tree; this tree is left unmodified
+         */
+        public final AndElement conjunctiveForm()
+        {
+            ExpressionElement flattened = this.flatten();
+            return flattened instanceof AndElement
+                    ? (AndElement) flattened
+                    : new AndElement(Lists.newArrayList(flattened));
+        }
     }
 
     public static abstract class VariableElement extends ExpressionElement
     {
         @Override
-        public String toEncapsulatedString()
+        public boolean exists(Predicate<ExpressionElement> f)
         {
-            return toString();
+            return f.test(this);
         }
     }
 
@@ -391,12 +449,6 @@ public List<CustomIndexExpression> expressions()
             return Lists.newArrayList(customIndexExpression);
         }
 
-        @Override
-        public boolean containsCustomExpressions()
-        {
-            return true;
-        }
-
         @Override
         public String toString()
         {
@@ -410,12 +462,18 @@ public static abstract class ContainerElement extends ExpressionElement
 
         protected ContainerElement(Collection<ExpressionElement> children)
         {
+            assert children.size() >= 1: "ContainerElement cannot have 0 children";
             this.children = new ArrayList<>(children.size());
             this.children.addAll(children);
         }
 
+        /**
+         * Returns a new container of the same type with new children copied from the given collection
+         */
         protected abstract ContainerElement withChildren(Collection<ExpressionElement> children);
 
+        protected abstract Operator operator();
+
         @Override
         public List<ContainerElement> operations()
         {
@@ -425,8 +483,6 @@ public List<ContainerElement> operations()
                            .collect(Collectors.toList());
         }
 
-        protected abstract Operator operator();
-
         @Override
         public List<Relation> relations()
         {
@@ -446,9 +502,9 @@ public List<CustomIndexExpression> expressions()
         }
 
         @Override
-        public boolean containsCustomExpressions()
+        public boolean exists(Predicate<ExpressionElement> f)
         {
-            return children.stream().anyMatch(ExpressionElement::containsCustomExpressions);
+            return f.test(this) || children.stream().anyMatch(c -> c.exists(f)); // recurse into the whole subtree
         }
 
         @Override
@@ -488,13 +544,10 @@ public ExpressionElement flatten()
         @Override
         public String toString()
         {
-            return children.stream().map(ExpressionElement::toEncapsulatedString).collect(Collectors.joining(operator().joinValue()));
-        }
-
-        @Override
-        public String toEncapsulatedString()
-        {
-            return children.stream().map(ExpressionElement::toEncapsulatedString).collect(Collectors.joining(operator().joinValue(), "(", ")"));
+            return children
+                    .stream()
+                    .map(c -> children.size() > 1 && c.isCompound() ? '(' + c.toString() + ')': c.toString())
+                    .collect(Collectors.joining(operator().joinValue()));
         }
     }
 
diff --git a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
index 00dc0639a720..d986ea676275 100644
--- a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
@@ -149,6 +149,38 @@ public void flattenMixed() throws Throwable
         assertEquals("(a = 1 OR b = 1 OR c = 1) AND d = 1 AND (e = 1 OR f = 1)", flattened.toString());
     }
 
+    @Test
+    public void conjunctiveFormSimple() throws Throwable
+    {
+        WhereClause.AndElement conj = WhereClause.parse("a = 1").root().conjunctiveForm();
+        assertEquals(1, conj.children.size());
+        assertEquals("a = 1", conj.toString());
+    }
+
+    @Test
+    public void conjunctiveFormSingleAnd() throws Throwable
+    {
+        WhereClause.AndElement conj = WhereClause.parse("a = 1 AND b = 1").root().conjunctiveForm();
+        assertEquals(2, conj.children.size());
+        assertEquals("a = 1 AND b = 1", conj.toString());
+    }
+
+    @Test
+    public void conjunctiveFormSingleOr() throws Throwable
+    {
+        WhereClause.AndElement conj = WhereClause.parse("a = 1 OR b = 1").root().conjunctiveForm();
+        assertEquals(1, conj.children.size());
+        assertEquals("a = 1 OR b = 1", conj.toString());
+    }
+
+    @Test
+    public void conjunctiveFormNested() throws Throwable
+    {
+        WhereClause.AndElement conj = WhereClause.parse("a = 1 AND (b = 1 AND c = 1)").root().conjunctiveForm();
+        assertEquals(3, conj.children.size());
+        assertEquals("a = 1 AND b = 1 AND c = 1", conj.toString());
+    }
+
     @Test
     public void rename() throws Throwable
     {

From b28f8cda06b3382667fbf6d26a43655f74c77bf2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Fri, 16 Jul 2021 10:58:02 +0200
Subject: [PATCH 115/151] STAR-821 Reject queries with partition key
 restriction nested under OR

Examples of queries that are now rejected
(pk: partition key; ck: clustering column; a, b, c: regular columns):

    SELECT ... WHERE pk = 1 OR a = 1 AND b = 2 AND ... ALLOW FILTERING
    SELECT ... WHERE a = 1 OR pk = 2 AND b = 2 AND ... ALLOW FILTERING

This commit additionally fixes ORed clustering column
restrictions with ALLOW FILTERING and no indexes.
The following query was missing the rows with ck = 1:

    SELECT ... WHERE ck = 1 OR a = 2 ALLOW FILTERING
---
 .../restrictions/StatementRestrictions.java   | 38 +++++++++++++---
 .../index/sai/cql/ComplexQueryTest.java       | 44 ++++++++++++++-----
 2 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
index d506f3b26e00..61cdc2287173 100644
--- a/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
+++ b/src/java/org/apache/cassandra/cql3/restrictions/StatementRestrictions.java
@@ -68,6 +68,9 @@ public class StatementRestrictions
     public static final String INDEX_DOES_NOT_SUPPORT_DISJUNCTION =
     "An index involved in this query does not support disjunctive queries using the OR operator";
 
+    public static final String PARTITION_KEY_RESTRICTION_MUST_BE_TOP_LEVEL =
+    "Restriction on partition key column %s must not be nested under OR operator";
+
     /**
      * The Column Family meta data
      */
@@ -302,11 +305,24 @@ public StatementRestrictions build()
             if (allowUseOfSecondaryIndices && type.allowUseOfSecondaryIndices())
                 indexRegistry = IndexRegistry.obtain(table);
 
-            return doBuild(whereClause.root(), indexRegistry);
+
+            WhereClause.AndElement root = whereClause.root().conjunctiveForm();
+            return doBuild(root, indexRegistry, 0);
         }
 
-        StatementRestrictions doBuild(WhereClause.ExpressionElement element, IndexRegistry indexRegistry)
+        /**
+         * Processes the WHERE clause expression tree recursively and assigns the restrictions to different sets
+         * based on the columns they are applied to.
+         *
+         * @param element root of the tree
+         * @param nestingLevel recursion depth needed to reject the restrictions that
+         *                     are not allowed to be nested (e.g. partition key restrictions)
+         */
+        StatementRestrictions doBuild(WhereClause.ExpressionElement element, IndexRegistry indexRegistry, int nestingLevel)
         {
+            assert element instanceof WhereClause.AndElement || nestingLevel > 0:
+                    "Root of the WHERE clause expression tree must be a conjunction";
+
             PartitionKeySingleRestrictionSet.Builder partitionKeyRestrictionSet = PartitionKeySingleRestrictionSet.builder(table.partitionKeyAsClusteringComparator());
             ClusteringColumnRestrictions.Builder clusteringColumnsRestrictionSet = ClusteringColumnRestrictions.builder(table, allowFiltering, indexRegistry);
             RestrictionSet.Builder nonPrimaryKeyRestrictionSet = RestrictionSet.builder();
@@ -340,11 +356,23 @@ StatementRestrictions doBuild(WhereClause.ExpressionElement element, IndexRegist
                     ColumnMetadata def = restriction.getFirstColumn();
                     if (def.isPartitionKey())
                     {
+                        // All partition key restrictions must be a part of the top-level AND operation.
+                        // The read path filtering logic is currently unable to filter rows based on
+                        // partition key restriction that is a part of a complex expression involving disjunctions.
+                        // ALLOW FILTERING does not cut it, as RowFilter can't handle ORed partition
+                        // key restrictions properly.
+                        if (nestingLevel > 0)
+                            throw invalidRequest(StatementRestrictions.PARTITION_KEY_RESTRICTION_MUST_BE_TOP_LEVEL, def);
+
                         partitionKeyRestrictionSet.addRestriction(restriction);
                     }
-                    else if (def.isClusteringColumn())
+                    // If a clustering column restriction is nested (under OR operator),
+                    // we can't treat it as a real clustering column,
+                    // but instead we treat it as a regular column and use
+                    // index (if we have one) or use row filtering on it; hence we require nestingLevel == 0 check here
+                    else if (def.isClusteringColumn() && nestingLevel == 0)
                     {
-                        clusteringColumnsRestrictionSet.addRestriction(restriction, element.isDisjunction());
+                        clusteringColumnsRestrictionSet.addRestriction(restriction);
                     }
                     else
                     {
@@ -526,7 +554,7 @@ else if (!allowFiltering)
             ImmutableList.Builder<StatementRestrictions> children = ImmutableList.builder();
 
             for (WhereClause.ContainerElement container : element.operations())
-                children.add(doBuild(container, indexRegistry));
+                children.add(doBuild(container, indexRegistry, nestingLevel + 1));
 
             return new StatementRestrictions(table,
                                              partitionKeyRestrictions,
diff --git a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
index cd022d05f836..8632b96d4b3b 100644
--- a/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
+++ b/test/unit/org/apache/cassandra/index/sai/cql/ComplexQueryTest.java
@@ -87,6 +87,23 @@ public void complexQueryTest() throws Throwable
         assertRowsIgnoringOrder(resultSet, row(1), row(6), row(7) );
     }
 
+    @Test
+    public void disjunctionWithClusteringKey() throws Throwable
+    {
+        createTable("CREATE TABLE %s (pk int, ck int, a int, PRIMARY KEY(pk, ck))");
+
+        execute("INSERT INTO %s (pk, ck, a) VALUES (?, ?, ?)", 1, 1, 1);
+        execute("INSERT INTO %s (pk, ck, a) VALUES (?, ?, ?)", 2, 2, 2);
+
+        assertThatThrownBy(() -> execute("SELECT pk FROM %s WHERE a = 1 or ck = 2"))
+                .isInstanceOf(InvalidRequestException.class)
+                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
+
+        UntypedResultSet resultSet = execute("SELECT pk FROM %s WHERE a = 1 or ck = 2 ALLOW FILTERING");
+
+        assertRowsIgnoringOrder(resultSet, row(1), row(2));
+    }
+
     @Test
     public void disjunctionWithIndexOnClusteringKey() throws Throwable
     {
@@ -144,17 +161,24 @@ public void complexQueryWithPartitionKeyRestriction() throws Throwable
         execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 1, 3, 7);
         execute("INSERT INTO %s (pk, ck, a, b) VALUES (?, ?, ?, ?)", 2, 2, 4, 8);
 
-        UntypedResultSet resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 AND (a = 2 OR b = 7)");
-
-        assertRowsIgnoringOrder(resultSet, row(1, 2));
 
-        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7"))
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 AND (a = 2 OR b = 7)"))
                 .isInstanceOf(InvalidRequestException.class)
                 .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
 
-        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR (a = 2 OR b = 7)");
+        UntypedResultSet resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 AND (a = 2 OR b = 7) ALLOW FILTERING");
+
+        assertRowsIgnoringOrder(resultSet, row(1, 2));
+
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7 ALLOW FILTERING"))
+                .isInstanceOf(InvalidRequestException.class)
+                .hasMessage(String.format(StatementRestrictions.PARTITION_KEY_RESTRICTION_MUST_BE_TOP_LEVEL, "pk"));
 
-        assertRowsIgnoringOrder(resultSet, row(1, 1), row(1, 2), row(2, 1));
+        // Here pk = 1 is directly under AND operation, so a simple isDisjunction check on it would not be enough
+        // to reject it ;)
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE a = 2 OR (pk = 1 AND b = 7) ALLOW FILTERING"))
+                .isInstanceOf(InvalidRequestException.class)
+                .hasMessage(String.format(StatementRestrictions.PARTITION_KEY_RESTRICTION_MUST_BE_TOP_LEVEL, "pk"));
     }
 
     @Test
@@ -173,13 +197,9 @@ public void complexQueryWithPartitionKeyRestrictionAndIndexes() throws Throwable
 
         assertRowsIgnoringOrder(resultSet, row(1, 2));
 
-        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7"))
+        assertThatThrownBy(() -> execute("SELECT pk, ck FROM %s WHERE pk = 1 OR a = 2 OR b = 7 ALLOW FILTERING"))
                 .isInstanceOf(InvalidRequestException.class)
-                .hasMessage(StatementRestrictions.REQUIRES_ALLOW_FILTERING_MESSAGE);
-
-        resultSet = execute("SELECT pk, ck FROM %s WHERE pk = 1 OR (a = 2 OR b = 7)");
-
-        assertRowsIgnoringOrder(resultSet, row(1, 1), row(1, 2), row(2, 1));
+                .hasMessage(String.format(StatementRestrictions.PARTITION_KEY_RESTRICTION_MUST_BE_TOP_LEVEL, "pk"));
     }
 
     @Test

From 6f0f9a8b427b9363e385906b038a814a986e46f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Fri, 16 Jul 2021 18:04:08 +0200
Subject: [PATCH 116/151] STAR-821 Allow empty container elements

Looks like the CqlParser can construct empty container elements
when given invalid queries, before returning the error to the
user.
---
 src/java/org/apache/cassandra/cql3/WhereClause.java | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index ee35109f4bd1..95f22b0f0e82 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -462,7 +462,6 @@ public static abstract class ContainerElement extends ExpressionElement
 
         protected ContainerElement(Collection<ExpressionElement> children)
         {
-            assert children.size() >= 1: "ContainerElement cannot have 0 children";
             this.children = new ArrayList<>(children.size());
             this.children.addAll(children);
         }
@@ -544,10 +543,12 @@ public ExpressionElement flatten()
         @Override
         public String toString()
         {
-            return children
-                    .stream()
-                    .map(c -> children.size() > 1 && c.isCompound() ? '(' + c.toString() + ')': c.toString())
-                    .collect(Collectors.joining(operator().joinValue()));
+            return children.isEmpty()
+                    ? "()"
+                    : children
+                        .stream()
+                        .map(c -> children.size() > 1 && c.isCompound() ? '(' + c.toString() + ')':c.toString())
+                        .collect(Collectors.joining(operator().joinValue()));
         }
     }
 

From 9cc2050f4608057b41ee78f715faafc827faf526 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Thu, 22 Jul 2021 15:26:58 +0200
Subject: [PATCH 117/151] STAR-821 Replace EmptyElement with an empty
 AndElement

No need to special-case an empty where clause, when we can
already model it with an AndElement. And it has a nice property
that it is already in a conjunctive form.
---
 .../apache/cassandra/cql3/WhereClause.java    | 41 +++++++++++--------
 .../cql3/WhereClauseExpressionTreeTest.java   |  9 ++++
 2 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index 95f22b0f0e82..d883d32e0c64 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -33,7 +33,7 @@
  */
 public final class WhereClause
 {
-    private static final WhereClause EMPTY = new WhereClause(ExpressionElement.EMPTY);
+    private static final WhereClause EMPTY = new WhereClause(new AndElement(Collections.emptyList()));
 
     private final ExpressionElement rootElement;
 
@@ -288,8 +288,6 @@ enum PushState
 
     public static abstract class ExpressionElement
     {
-        private static final ExpressionElement EMPTY = new EmptyElement();
-
         public List<ContainerElement> operations()
         {
             return Collections.emptyList();
@@ -370,6 +368,7 @@ public ExpressionElement flatten()
          * Examples:
          * <pre>
          * a = 1                                 -> AND(a = 1)
+         * AND()                                 -> AND()
          * AND(a = 1, b = 2)                     -> AND(a = 1, b = 2)
          * AND(a = 1, AND(b = 2, c = 3))         -> AND(a = 1, b = 2, c = 3)
          * OR(a = 1, b = 2)                      -> AND(OR(a = 1, b = 2))
@@ -397,15 +396,6 @@ public boolean exists(Predicate<ExpressionElement> f)
         }
     }
 
-    public static class EmptyElement extends VariableElement
-    {
-        @Override
-        public String toString()
-        {
-            return "";
-        }
-    }
-
     public static class RelationElement extends VariableElement
     {
         private final Relation relation;
@@ -473,6 +463,8 @@ protected ContainerElement(Collection<ExpressionElement> children)
 
         protected abstract Operator operator();
 
+        protected abstract String emptyValue();
+
         @Override
         public List<ContainerElement> operations()
         {
@@ -543,12 +535,13 @@ public ExpressionElement flatten()
         @Override
         public String toString()
         {
-            return children.isEmpty()
-                    ? "()"
-                    : children
-                        .stream()
-                        .map(c -> children.size() > 1 && c.isCompound() ? '(' + c.toString() + ')':c.toString())
-                        .collect(Collectors.joining(operator().joinValue()));
+            if (children.isEmpty())
+                return emptyValue();
+
+            return children
+                .stream()
+                .map(c -> children.size() > 1 && c.isCompound() ? '(' + c.toString() + ')' : c.toString())
+                .collect(Collectors.joining(operator().joinValue()));
         }
     }
 
@@ -570,6 +563,12 @@ protected Operator operator()
         {
             return Operator.AND;
         }
+
+        @Override
+        protected String emptyValue()
+        {
+            return "TRUE";
+        }
     }
 
     public static class OrElement extends ContainerElement
@@ -591,6 +590,12 @@ protected Operator operator()
             return Operator.OR;
         }
 
+        @Override
+        protected String emptyValue()
+        {
+            return "FALSE";
+        }
+
         @Override
         public boolean isDisjunction()
         {
diff --git a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
index d986ea676275..8e6fd3c5e4d0 100644
--- a/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
+++ b/test/unit/org/apache/cassandra/cql3/WhereClauseExpressionTreeTest.java
@@ -149,6 +149,15 @@ public void flattenMixed() throws Throwable
         assertEquals("(a = 1 OR b = 1 OR c = 1) AND d = 1 AND (e = 1 OR f = 1)", flattened.toString());
     }
 
+    @Test
+    public void conjunctiveFormEmpty() throws Throwable
+    {
+        WhereClause conj = WhereClause.empty().conjunctiveForm();
+        assertEquals(WhereClause.empty(), conj);
+        assertEquals(0, conj.root().expressions().size());
+        assertEquals(0, conj.root().relations().size());
+    }
+
     @Test
     public void conjunctiveFormSimple() throws Throwable
     {

From d8345192d63b91873d0bdbe0ecbd8bb39264bdda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Piotr=20Ko=C5=82aczkowski?= <pkolaczk@datastax.com>
Date: Thu, 22 Jul 2021 15:30:01 +0200
Subject: [PATCH 118/151] STAR-821 Cleanup minor code-style issue reported by
 Sonar

---
 src/java/org/apache/cassandra/cql3/WhereClause.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/org/apache/cassandra/cql3/WhereClause.java b/src/java/org/apache/cassandra/cql3/WhereClause.java
index d883d32e0c64..1969283ec43c 100644
--- a/src/java/org/apache/cassandra/cql3/WhereClause.java
+++ b/src/java/org/apache/cassandra/cql3/WhereClause.java
@@ -326,7 +326,7 @@ public final boolean isCompound()
          */
         public final boolean containsCustomExpressions()
         {
-            return exists(e -> e instanceof CustomIndexExpressionElement);
+            return exists(CustomIndexExpressionElement.class::isInstance);
         }
 
         public ExpressionElement rename(ColumnIdentifier from, ColumnIdentifier to)

From d40f9e30625e5c0a74bf3f519ff37fc3e97648f3 Mon Sep 17 00:00:00 2001
From: Mike Adamson <madamson@datastax.com>
Date: Fri, 23 Jul 2021 13:05:19 +0100
Subject: [PATCH 119/151] STAR-815: MultiRangeReadCommand always used by SAI
 and remove info log in query path (#222)

---
 .../index/sai/plan/StorageAttachedIndexQueryPlan.java  |  3 +--
 .../service/reads/ShortReadPartitionsProtection.java   | 10 +++++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
index 50878558ae96..47e970e1ae5e 100644
--- a/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
+++ b/src/java/org/apache/cassandra/index/sai/plan/StorageAttachedIndexQueryPlan.java
@@ -37,7 +37,6 @@
 
 public class StorageAttachedIndexQueryPlan implements Index.QueryPlan
 {
-    private static final boolean USE_MULTI_RANGE_READ_COMMAND = Boolean.parseBoolean(System.getProperty("cassandra.sai.use_multi_range_read_command", "false"));
     private final ColumnFamilyStore cfs;
     private final TableQueryMetrics queryMetrics;
     private final RowFilter postIndexFilter;
@@ -137,6 +136,6 @@ public RowFilter postIndexQueryFilter()
     @Override
     public boolean supportsMultiRangeReadCommand()
     {
-        return USE_MULTI_RANGE_READ_COMMAND;
+        return true;
     }
 }
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
index 10a964d6fc31..c3ea9e7b535c 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
@@ -18,6 +18,8 @@
 
 package org.apache.cassandra.service.reads;
 
+import java.util.concurrent.TimeUnit;
+
 import org.apache.cassandra.locator.Endpoints;
 import org.apache.cassandra.locator.ReplicaPlan;
 import org.apache.cassandra.locator.ReplicaPlans;
@@ -46,10 +48,13 @@
 import org.apache.cassandra.service.reads.repair.NoopReadRepair;
 import org.apache.cassandra.service.StorageProxy;
 import org.apache.cassandra.tracing.Tracing;
+import org.apache.cassandra.utils.NoSpamLogger;
 
 public class ShortReadPartitionsProtection extends Transformation<UnfilteredRowIterator> implements MorePartitions<UnfilteredPartitionIterator>
 {
     private static final Logger logger = LoggerFactory.getLogger(ShortReadPartitionsProtection.class);
+    private static final NoSpamLogger oneMinuteLogger = NoSpamLogger.getLogger(logger, 1, TimeUnit.MINUTES);
+
     private final ReadCommand command;
     private final Replica source;
 
@@ -84,6 +89,7 @@ public ShortReadPartitionsProtection(ReadCommand command,
     public UnfilteredRowIterator applyToPartition(UnfilteredRowIterator partition)
     {
         partitionsFetched = true;
+        rangeFetched = true;
 
         lastPartitionKey = partition.partitionKey();
 
@@ -152,7 +158,9 @@ public UnfilteredPartitionIterator moreContents()
 
         ColumnFamilyStore.metricsFor(command.metadata().id).shortReadProtectionRequests.mark();
         Tracing.trace("Requesting {} extra rows from {} for short read protection", toQuery, source);
-        logger.info("Requesting {} extra rows from {} for short read protection", toQuery, source);
+        // This is a NoSpamLogger because, in the event of unrepaired data or missing data on nodes in
+        // a cluster, we can end up spamming the logs with this message
+        oneMinuteLogger.info("Requesting {} extra rows from {} for short read protection", toQuery, source);
 
         // If we've arrived here, all responses have been consumed, and we're about to request more.
         preFetchCallback.run();

From 09693322222669f4e1975e8066331441789da8b5 Mon Sep 17 00:00:00 2001
From: dan jatnieks <jatnieks@pobox.com>
Date: Wed, 28 Jul 2021 19:57:57 -0700
Subject: [PATCH 120/151] STAR-847 Close client to prevent driver queries after
 dropping user (#227)

---
 .../org/apache/cassandra/cql3/CQLTester.java  | 37 ++++++++++++++++++-
 .../cassandra/guardrails/GuardrailTester.java | 12 +-----
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java
index 37beff53e1fc..36b2b01eb350 100644
--- a/test/unit/org/apache/cassandra/cql3/CQLTester.java
+++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java
@@ -1129,7 +1129,7 @@ private Cluster getCluster(ProtocolVersion protocolVersion)
                                         userProto -> initClientCluster(user, protocolVersion));
     }
 
-    private static Cluster initClientCluster(User user, ProtocolVersion version)
+    private Cluster initClientCluster(User user, ProtocolVersion version)
     {
         Pair<User, ProtocolVersion> key = Pair.create(user, version);
         Cluster cluster = clusters.get(key);
@@ -1141,11 +1141,39 @@ private static Cluster initClientCluster(User user, ProtocolVersion version)
             builder.withCredentials(user.username, user.password);
         cluster = builder.build();
 
-        logger.info("Started Java Driver instance for protocol version {}", version);
+        logger.info("Started Java Driver session for {} with protocol version {}", user, version);
 
         return cluster;
     }
 
+    protected void closeClientCluster(String username, String password)
+    {
+        // Close driver cluster belonging to user
+        User user = new User(username, password);
+        for (ProtocolVersion protocolVersion : PROTOCOL_VERSIONS)
+        {
+            closeClientCluster(user, protocolVersion);
+        }
+    }
+
+    private void closeClientCluster(User user, ProtocolVersion protocolVersion)
+    {
+        Pair<User, ProtocolVersion> key = Pair.create(user, protocolVersion);
+        Session session = sessions.remove(key);
+        if (session != null)
+        {
+            session.close();
+        }
+        
+        Cluster cluster = clusters.remove(key);
+        if (cluster != null)
+        {
+            cluster.close();
+        }
+
+        logger.info("Closed Java Driver session for {} with protocol version {}", user, protocolVersion);
+    }
+
     public static Cluster.Builder clusterBuilder(ProtocolVersion version)
     {
         Cluster.Builder builder = clusterBuilder();
@@ -2279,5 +2307,10 @@ public boolean equals(Object o)
             return Objects.equal(username, u.username)
                    && Objects.equal(password, u.password);
         }
+
+        public String toString()
+        {
+            return username;
+        }
     }
 }
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
index 39309a112467..56235aba55bc 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailTester.java
@@ -21,7 +21,6 @@
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.CopyOnWriteArrayList;
-import java.util.concurrent.TimeUnit;
 import java.util.function.BiConsumer;
 import javax.annotation.Nullable;
 
@@ -32,7 +31,6 @@
 import org.junit.BeforeClass;
 
 import com.datastax.driver.core.Statement;
-import com.datastax.driver.core.SimpleStatement;
 import com.datastax.driver.core.exceptions.InvalidQueryException;
 import org.apache.cassandra.auth.AuthenticatedUser;
 import org.apache.cassandra.config.DatabaseDescriptor;
@@ -40,7 +38,6 @@
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.QueryState;
-import org.awaitility.Awaitility;
 
 import static java.lang.String.format;
 import static org.junit.Assert.assertEquals;
@@ -83,13 +80,6 @@ public void beforeGuardrailTest() throws Throwable
         executeNet(format("CREATE USER IF NOT EXISTS %s WITH PASSWORD '%s'", USERNAME, PASSWORD));
         executeNet(format("GRANT ALL ON KEYSPACE %s TO %s", KEYSPACE, USERNAME));
 
-        // Make sure keyspace permissions have been applied
-        Awaitility.await()
-                  .atMost(10, TimeUnit.SECONDS)
-                  .with()
-                  .pollInterval(500, TimeUnit.MILLISECONDS)
-                  .until(() -> !executeNet(new SimpleStatement("LIST ALL OF " + USERNAME)).all().isEmpty());
-        
         useUser(USERNAME, PASSWORD);
 
         listener = new TestListener(null);
@@ -104,6 +94,8 @@ public void afterGuardrailTest() throws Throwable
     {
         Guardrails.unregister(listener);
 
+        closeClientCluster(USERNAME, PASSWORD);
+
         useSuperUser();
         executeNet("DROP USER " + USERNAME);
     }

From e11d71626e5a8db0a957e708855ffc8f18e1c0f5 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski <6516951+jacek-lewandowski@users.noreply.github.com>
Date: Fri, 30 Jul 2021 16:04:24 +0200
Subject: [PATCH 121/151] STAR-762: Implement paging in bytes (#223)

Add page size in bytes flag to protocol
Introduce PageSize object
Protocol version changes
No support for describe statement yet
Simplify SecondaryIndexManager page calculation
Add page size in bytes to DataLimits
Refactor pagers
Add / pull some tests
Add some toString implementations
Add PageSize to expected classes in DatabaseDescriptorRefTest

Fix AggregationPartitionIterator
So far we were passing the main page size to the AggregationPartitionIterator, which:
- was pointless because there is no paging when we aggregate everything
- it was actually harmful because AggregationPartitionIterator is a subclass of GroupByPartitionIterator and the latter updates the subPager's limits with the minimum of the main page size and the number of remaining. That is correct if we use grouping-aware limits, where the count applies to whole groups. But when we aggregate everything, simple CQL limits are used and the count limit applies to rows. In conclusion, without this fix we would limit the number of aggregated rows to the main page size, which is not what we want
---
 conf/cassandra.yaml                           |    4 +
 doc/native_protocol_v4.1.spec                 | 1212 +++++++++++++++++
 .../cassandra/batchlog/BatchlogManager.java   |    3 +-
 .../org/apache/cassandra/config/Config.java   |    2 +
 .../cassandra/config/DatabaseDescriptor.java  |   18 +
 .../org/apache/cassandra/cql3/PageSize.java   |  153 +++
 .../apache/cassandra/cql3/QueryOptions.java   |   61 +-
 .../apache/cassandra/cql3/QueryProcessor.java |    2 +-
 .../cassandra/cql3/UntypedResultSet.java      |   13 +-
 .../cql3/statements/DescribeStatement.java    |   14 +-
 .../cql3/statements/SelectStatement.java      |  110 +-
 .../org/apache/cassandra/db/Mutation.java     |    6 +
 .../db/SystemKeyspaceMigrator40.java          |   10 +-
 .../cassandra/db/filter/DataLimits.java       |  452 ++++--
 .../cassandra/guardrails/Guardrails.java      |    1 -
 .../index/SecondaryIndexManager.java          |   36 +-
 .../internal/CollatedViewIndexBuilder.java    |    3 +-
 .../org/apache/cassandra/net/Message.java     |   11 +
 .../cassandra/net/MessagingService.java       |   10 +-
 .../repair/consistent/LocalSessions.java      |    3 +-
 .../service/pager/AbstractQueryPager.java     |  146 +-
 .../service/pager/AggregationQueryPager.java  |  179 ++-
 .../service/pager/MultiPartitionPager.java    |  133 +-
 .../cassandra/service/pager/PagingState.java  |   14 +-
 .../pager/PartitionRangeQueryPager.java       |   22 +-
 .../cassandra/service/pager/QueryPager.java   |   13 +-
 .../service/pager/SinglePartitionPager.java   |   31 +-
 .../reads/ShortReadPartitionsProtection.java  |    2 +-
 .../reads/ShortReadRowsProtection.java        |    2 +-
 .../apache/cassandra/transport/Client.java    |    5 +-
 .../transport/messages/ExecuteMessage.java    |    9 +-
 .../transport/messages/OptionsMessage.java    |    6 +
 .../transport/messages/QueryMessage.java      |   11 +-
 .../transport/messages/StartupMessage.java    |    1 +
 .../cassandra/transport/BurnTestUtil.java     |    3 +-
 .../distributed/impl/Coordinator.java         |   11 +-
 .../config/DatabaseDescriptorRefTest.java     |    1 +
 .../cql3/CustomNowInSecondsTest.java          |    2 +-
 .../cassandra/cql3/PagingQueryTest.java       |  498 ++++++-
 .../org/apache/cassandra/cql3/PagingTest.java |   10 +-
 .../db/AbstractReadCommandBuilder.java        |   21 +-
 .../apache/cassandra/db/ReadResponseTest.java |    8 +
 .../cassandra/db/filter/DataLimitsTest.java   |  139 ++
 .../db/lifecycle/LogReplicationSetTest.java   |    7 +-
 .../db/rows/UnfilteredRowIteratorsTest.java   |    7 +
 .../guardrails/GuardrailConsistencyTest.java  |    3 +-
 .../guardrails/GuardrailPagingTest.java       |  172 +++
 .../cassandra/index/CustomIndexTest.java      |   72 +-
 .../cassandra/service/QueryPagerTest.java     |  793 +++++++++--
 .../reads/range/RangeCommandsTest.java        |   29 +-
 .../cassandra/transport/SerDeserTest.java     |   10 +-
 51 files changed, 3932 insertions(+), 552 deletions(-)
 create mode 100644 doc/native_protocol_v4.1.spec
 create mode 100644 src/java/org/apache/cassandra/cql3/PageSize.java
 create mode 100644 test/unit/org/apache/cassandra/db/filter/DataLimitsTest.java
 create mode 100644 test/unit/org/apache/cassandra/guardrails/GuardrailPagingTest.java

diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index d70d025eb910..c117bd9993b0 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -1394,6 +1394,10 @@ report_unconfirmed_repaired_data_mismatches: false
 # table_count_warn_threshold: 150
 # keyspace_count_warn_threshold: 40
 
+# This is the page size used internally by aggregation queries. It aims to limit the memory used by aggregation
+# queries when there is a lot of data to aggregate.
+# aggregation_subpage_size_in_kb: 2048
+
 #########################
 # EXPERIMENTAL FEATURES #
 #########################
diff --git a/doc/native_protocol_v4.1.spec b/doc/native_protocol_v4.1.spec
new file mode 100644
index 000000000000..a10fd2404d8f
--- /dev/null
+++ b/doc/native_protocol_v4.1.spec
@@ -0,0 +1,1212 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+                             CQL BINARY PROTOCOL v4.1
+
+
+Table of Contents
+
+  1. Overview
+  2. Frame header
+    2.1. version
+    2.2. flags
+    2.3. stream
+    2.4. opcode
+    2.5. length
+  3. Notations
+  4. Messages
+    4.1. Requests
+      4.1.1. STARTUP
+      4.1.2. AUTH_RESPONSE
+      4.1.3. OPTIONS
+      4.1.4. QUERY
+      4.1.5. PREPARE
+      4.1.6. EXECUTE
+      4.1.7. BATCH
+      4.1.8. REGISTER
+    4.2. Responses
+      4.2.1. ERROR
+      4.2.2. READY
+      4.2.3. AUTHENTICATE
+      4.2.4. SUPPORTED
+      4.2.5. RESULT
+        4.2.5.1. Void
+        4.2.5.2. Rows
+        4.2.5.3. Set_keyspace
+        4.2.5.4. Prepared
+        4.2.5.5. Schema_change
+      4.2.6. EVENT
+      4.2.7. AUTH_CHALLENGE
+      4.2.8. AUTH_SUCCESS
+  5. Compression
+  6. Data Type Serialization Formats
+  7. User Defined Type Serialization
+  8. Result paging
+  9. Error codes
+  10. Changes from v4
+
+
+1. Overview
+
+  The CQL binary protocol is a frame based protocol. Frames are defined as:
+
+      0         8        16        24        32         40
+      +---------+---------+---------+---------+---------+
+      | version |  flags  |      stream       | opcode  |
+      +---------+---------+---------+---------+---------+
+      |                length                 |
+      +---------+---------+---------+---------+
+      |                                       |
+      .            ...  body ...              .
+      .                                       .
+      .                                       .
+      +----------------------------------------
+
+  The protocol is big-endian (network byte order).
+
+  Each frame contains a fixed size header (9 bytes) followed by a variable size
+  body. The header is described in Section 2. The content of the body depends
+  on the header opcode value (the body can in particular be empty for some
+  opcode values). The list of allowed opcodes is defined in Section 2.4 and the
+  details of each corresponding message are described in Section 4.
+
+  The protocol distinguishes two types of frames: requests and responses. Requests
+  are those frames sent by the client to the server. Responses are those frames sent
+  by the server to the client. Note, however, that the protocol supports server pushes
+  (events) so a response does not necessarily come right after a client request.
+
+  Note to client implementors: client libraries should always assume that the
+  body of a given frame may contain more data than what is described in this
+  document. It will however always be safe to ignore the remainder of the frame
+  body in such cases. The reason is that this may enable extending the protocol
+  with optional features without needing to change the protocol version.
+
+
+
+2. Frame header
+
+2.1. version
+
+  The version is a single byte that indicates both the direction of the message
+  (request or response) and the version of the protocol in use. The most
+  significant bit of version is used to define the direction of the message:
+  0 indicates a request, 1 indicates a response. This can be useful for protocol
+  analyzers to distinguish the nature of the packet from the direction in which
+  it is moving. The rest of that byte is the protocol version (4 for the protocol
+  defined in this document). In other words, for this version of the protocol,
+  version will be one of:
+    0x04    Request frame for this protocol version
+    0x84    Response frame for this protocol version
+
+  Please note that while every message ships with the version, only one version
+  of messages is accepted on a given connection. In other words, the first message
+  exchanged (STARTUP) sets the version for the connection for the lifetime of this
+  connection.
+
+  This document describes version 4.1 of the protocol. For the changes made since
+  version 4, see Section 10.
+
+
+2.2. flags
+
+  Flags applying to this frame. The flags have the following meaning (described
+  by the mask that allows selecting them):
+    0x01: Compression flag. If set, the frame body is compressed. The actual
+          compression to use should have been set up beforehand through the
+          Startup message (which thus cannot be compressed; Section 4.1.1).
+    0x02: Tracing flag. For a request frame, this indicates the client requires
+          tracing of the request. Note that only QUERY, PREPARE and EXECUTE queries
+          support tracing. Other requests will simply ignore the tracing flag if 
+          set. If a request supports tracing and the tracing flag is set, the response
+          to this request will have the tracing flag set and contain tracing
+          information.
+          If a response frame has the tracing flag set, its body contains
+          a tracing ID. The tracing ID is a [uuid] and is the first thing in
+          the frame body.
+    0x04: Custom payload flag. For a request or response frame, this indicates
+          that a generic key-value custom payload for a custom QueryHandler
+          implementation is present in the frame. Such a custom payload is simply
+          ignored by the default QueryHandler implementation.
+          Currently, only QUERY, PREPARE, EXECUTE and BATCH requests support
+          payload.
+          Type of custom payload is [bytes map] (see below). If either or both
+          of the tracing and warning flags are set, the custom payload will follow
+          those indicated elements in the frame body. If neither are set, the custom
+          payload will be the first value in the frame body.
+    0x08: Warning flag. The response contains warnings which were generated by the
+          server to go along with this response.
+          If a response frame has the warning flag set, its body will contain the
+          text of the warnings. The warnings are a [string list] and will be the
+          first value in the frame body if the tracing flag is not set, or directly
+          after the tracing ID if it is.
+
+  The rest of the flags are currently unused and ignored.
+
+2.3. stream
+
+  A frame has a stream id (a [short] value). When sending request messages, this
+  stream id must be set by the client to a non-negative value (negative stream ids
+  are reserved for streams initiated by the server; currently all EVENT messages
+  (section 4.2.6) have a streamId of -1). If a client sends a request message
+  with the stream id X, it is guaranteed that the stream id of the response to
+  that message will be X.
+
+  This helps to enable the asynchronous nature of the protocol. If a client
+  sends multiple messages simultaneously (without waiting for responses), there
+  is no guarantee on the order of the responses. For instance, if the client
+  writes REQ_1, REQ_2, REQ_3 on the wire (in that order), the server might
+  respond to REQ_3 (or REQ_2) first. Assigning different stream ids to these 3
+  requests allows the client to distinguish which request a received answer
+  responds to. As there can only be 32768 different simultaneous streams, it is up
+  to the client to reuse stream ids.
+
+  Note that clients are free to use the protocol synchronously (i.e. wait for
+  the response to REQ_N before sending REQ_N+1). In that case, the stream id
+  can be safely set to 0. Clients should also feel free to use only a subset of
+  the 32768 maximum possible stream ids if that is simpler for their implementation.
+
+2.4. opcode
+
+  An integer byte that distinguishes the actual message:
+    0x00    ERROR
+    0x01    STARTUP
+    0x02    READY
+    0x03    AUTHENTICATE
+    0x05    OPTIONS
+    0x06    SUPPORTED
+    0x07    QUERY
+    0x08    RESULT
+    0x09    PREPARE
+    0x0A    EXECUTE
+    0x0B    REGISTER
+    0x0C    EVENT
+    0x0D    BATCH
+    0x0E    AUTH_CHALLENGE
+    0x0F    AUTH_RESPONSE
+    0x10    AUTH_SUCCESS
+
+  Messages are described in Section 4.
+
+  (Note that there is no 0x04 message in this version of the protocol)
+
+
+2.5. length
+
+  A 4 byte integer representing the length of the body of the frame (note:
+  currently a frame is limited to 256MB in length).
+
+
+3. Notations
+
+  To describe the layout of the frame body for the messages in Section 4, we
+  define the following:
+
+    [int]          A 4 bytes integer
+    [long]         A 8 bytes integer
+    [short]        A 2 bytes unsigned integer
+    [string]       A [short] n, followed by n bytes representing an UTF-8
+                   string.
+    [long string]  An [int] n, followed by n bytes representing an UTF-8 string.
+    [uuid]         A 16 bytes long uuid.
+    [string list]  A [short] n, followed by n [string].
+    [bytes]        A [int] n, followed by n bytes if n >= 0. If n < 0,
+                   no byte should follow and the value represented is `null`.
+    [value]        A [int] n, followed by n bytes if n >= 0.
+                   If n == -1 no byte should follow and the value represented is `null`.
+                   If n == -2 no byte should follow and the value represented is
+                   `not set` not resulting in any change to the existing value.
+                   n < -2 is an invalid value and results in an error.
+    [short bytes]  A [short] n, followed by n bytes if n >= 0.
+
+    [option]       A pair of <id><value> where <id> is a [short] representing
+                   the option id and <value> depends on that option (and can be
+                   of size 0). The supported id (and the corresponding <value>)
+                   will be described when this is used.
+    [option list]  A [short] n, followed by n [option].
+    [inet]         An address (ip and port) to a node. It consists of one
+                   [byte] n, that represents the address size, followed by n
+                   [byte] representing the IP address (in practice n can only be
+                   either 4 (IPv4) or 16 (IPv6)), followed by one [int]
+                   representing the port.
+    [consistency]  A consistency level specification. This is a [short]
+                   representing a consistency level with the following
+                   correspondence:
+                     0x0000    ANY
+                     0x0001    ONE
+                     0x0002    TWO
+                     0x0003    THREE
+                     0x0004    QUORUM
+                     0x0005    ALL
+                     0x0006    LOCAL_QUORUM
+                     0x0007    EACH_QUORUM
+                     0x0008    SERIAL
+                     0x0009    LOCAL_SERIAL
+                     0x000A    LOCAL_ONE
+
+    [string map]      A [short] n, followed by n pair <k><v> where <k> and <v>
+                      are [string].
+    [string multimap] A [short] n, followed by n pair <k><v> where <k> is a
+                      [string] and <v> is a [string list].
+    [bytes map]       A [short] n, followed by n pair <k><v> where <k> is a
+                      [string] and <v> is a [bytes].
+
+
+4. Messages
+
+  Depending on the flags specified in the header, the layout of the message body must be:
+    [<tracing_id>][<warnings>][<custom_payload>]<message>
+  where:
+    - <tracing_id> is a UUID tracing ID, present if this is a response message and the Tracing flag is set.
+    - <warnings> is a string list of warnings, present if this is a response message and the Warning flag is set.
+    - <custom_payload> is bytes map for the serialised custom payload present if this is one of the message types
+      which support custom payloads (QUERY, PREPARE, EXECUTE and BATCH) and the Custom payload flag is set.
+    - <message> as defined below through sections 4 and 5.
+
+4.1. Requests
+
+  Note that outside of their normal responses (described below), all requests
+  can get an ERROR message (Section 4.2.1) as response.
+
+4.1.1. STARTUP
+
+  Initialize the connection. The server will respond by either a READY message
+  (in which case the connection is ready for queries) or an AUTHENTICATE message
+  (in which case credentials will need to be provided using AUTH_RESPONSE).
+
+  This must be the first message of the connection, except for OPTIONS that can
+  be sent before to find out the options supported by the server. Once the
+  connection has been initialized, a client should not send any more STARTUP
+  messages.
+
+  The body is a [string map] of options. Possible options are:
+    - "CQL_VERSION": the version of CQL to use. This option is mandatory and
+      currently the only version supported is "3.0.0". Note that this is
+      different from the protocol version.
+    - "COMPRESSION": the compression algorithm to use for frames (See section 5).
+      This is optional; if not specified no compression will be used.
+    - "NO_COMPACT": whether or not connection has to be established in compatibility
+      mode. This mode causes all Thrift and Compact Tables to be exposed as if
+      they were CQL Tables. This is optional; if not specified, the option will
+      not be used.
+    - "THROW_ON_OVERLOAD": When the server is overloaded with too many requests, by default it applies
+            back pressure to the client connection. If this option is set to true, the server instead sends an
+            OverloadedException error message back to the client.
+    - "PAGE_UNIT": a list of supported page units.
+
+
+4.1.2. AUTH_RESPONSE
+
+  Answers a server authentication challenge.
+
+  Authentication in the protocol is SASL based. The server sends authentication
+  challenges (a bytes token) to which the client answers with this message. Those
+  exchanges continue until the server accepts the authentication by sending an
+  AUTH_SUCCESS message after a client AUTH_RESPONSE. Note that the exchange
+  begins with the client sending an initial AUTH_RESPONSE in response to a
+  server AUTHENTICATE request.
+
+  The body of this message is a single [bytes] token. The details of what this
+  token contains (and when it can be null/empty, if ever) depends on the actual
+  authenticator used.
+
+  The response to an AUTH_RESPONSE is either a follow-up AUTH_CHALLENGE message,
+  an AUTH_SUCCESS message or an ERROR message.
+
+
+4.1.3. OPTIONS
+
+  Asks the server to return which STARTUP options are supported. The body of an
+  OPTIONS message should be empty and the server will respond with a SUPPORTED
+  message.
+
+
+4.1.4. QUERY
+
+  Performs a CQL query. The body of the message must be:
+    <query><query_parameters>
+  where <query> is a [long string] representing the query and
+  <query_parameters> must be
+    <consistency><flags>[<n>[name_1]<value_1>...[name_n]<value_n>][<result_page_size>][<paging_state>][<serial_consistency>][<timestamp>]
+  where:
+    - <consistency> is the [consistency] level for the operation.
+    - <flags> is an [int] whose bits define the options for this query and
+      in particular influence what the remainder of the message contains.
+      A flag is set if the bit corresponding to its `mask` is set. Supported
+      flags are, given their mask:
+        0x00000001: Values. If set, a [short] <n> followed by <n> [value]
+                    values are provided. Those values are used for bound variables in
+                    the query. Optionally, if the 0x40 flag is present, each value
+                    will be preceded by a [string] name, representing the name of
+                    the marker the value must be bound to.
+        0x00000002: Skip_metadata. If set, the Result Set returned as a response
+                    to the query (if any) will have the NO_METADATA flag (see
+                    Section 4.2.5.2).
+        0x00000004: Page_size. If set, <result_page_size> is an [int]
+                    controlling the desired page size of the result (in CQL3 rows or bytes).
+                    See the section on paging (Section 8) for more details.
+        0x00000008: With_paging_state. If set, <paging_state> should be present.
+                    <paging_state> is a [bytes] value that should have been returned
+                    in a result set (Section 4.2.5.2). The query will be
+                    executed but starting from a given paging state. This also allows
+                    paging to continue on a different node than the one where it
+                    started (See Section 8 for more details).
+        0x00000010: With serial consistency. If set, <serial_consistency> should be
+                    present. <serial_consistency> is the [consistency] level for the
+                    serial phase of conditional updates. That consistency can only be
+                    either SERIAL or LOCAL_SERIAL and if not present, it defaults to
+                    SERIAL. This option will be ignored for anything else other than a
+                    conditional update/insert.
+        0x00000020: With default timestamp. If set, <timestamp> should be present.
+                    <timestamp> is a [long] representing the default timestamp for the query
+                    in microseconds (negative values are forbidden). This will
+                    replace the server side assigned timestamp as default timestamp.
+                    Note that a timestamp in the query itself will still override
+                    this timestamp. This is entirely optional.
+        0x00000040: With names for values. This only makes sense if the 0x01 flag is set and
+                    is ignored otherwise. If present, the values from the 0x01 flag will
+                    be preceded by a name (see above). Note that this is only useful for
+                    QUERY requests where named bind markers are used; for EXECUTE statements,
+                    since the names for the expected values were returned during preparation,
+                    a client can always provide values in the right order without any names
+                    and using this flag, while supported, is almost surely inefficient.
+        0x40000000: When set, the <result_page_size> is provided in bytes rather than in rows.
+
+
+  Note that the consistency is ignored by some queries (USE, CREATE, ALTER,
+  TRUNCATE, ...).
+
+  The server will respond to a QUERY message with a RESULT message, the content
+  of which depends on the query.
+
+
+4.1.5. PREPARE
+
+  Prepare a query for later execution (through EXECUTE). The body consists of
+  the CQL query to prepare as a [long string].
+
+  The server will respond with a RESULT message with a `prepared` kind (0x0004,
+  see Section 4.2.5).
+
+
+4.1.6. EXECUTE
+
+  Executes a prepared query. The body of the message must be:
+    <id><query_parameters>
+  where <id> is the prepared query ID. It's the [short bytes] returned as a
+  response to a PREPARE message. As for <query_parameters>, it has the exact
+  same definition as in QUERY (see Section 4.1.4).
+
+  The response from the server will be a RESULT message.
+
+
+4.1.7. BATCH
+
+  Allows executing a list of queries (prepared or not) as a batch (note that
+  only DML statements are accepted in a batch). The body of the message must
+  be:
+    <type><n><query_1>...<query_n><consistency><flags>[<serial_consistency>][<timestamp>]
+  where:
+    - <type> is a [byte] indicating the type of batch to use:
+        - If <type> == 0, the batch will be "logged". This is equivalent to a
+          normal CQL3 batch statement.
+        - If <type> == 1, the batch will be "unlogged".
+        - If <type> == 2, the batch will be a "counter" batch (and non-counter
+          statements will be rejected).
+    - <flags> is a [byte] whose bits define the options for this query and
+      in particular influence what the remainder of the message contains. It is similar
+      to the <flags> from QUERY and EXECUTE methods, except that the 4 rightmost
+      bits must always be 0 as their corresponding options do not make sense for
+      Batch. A flag is set if the bit corresponding to its `mask` is set. Supported
+      flags are, given their mask:
+        0x10: With serial consistency. If set, <serial_consistency> should be
+              present. <serial_consistency> is the [consistency] level for the
+              serial phase of conditional updates. That consistency can only be
+              either SERIAL or LOCAL_SERIAL and if not present, it defaults to
+              SERIAL. This option will be ignored for anything else other than a
+              conditional update/insert.
+        0x20: With default timestamp. If set, <timestamp> should be present.
+              <timestamp> is a [long] representing the default timestamp for the query
+              in microseconds. This will replace the server side assigned
+              timestamp as default timestamp. Note that a timestamp in the query itself
+              will still override this timestamp. This is entirely optional.
+        0x40: With names for values. If set, then all values for all <query_i> must be
+              preceded by a [string] <name_i> that has the same meaning as in QUERY
+              requests [IMPORTANT NOTE: this feature does not work and should not be
+              used. It is specified in a way that makes it impossible for the server
+              to implement. This will be fixed in a future version of the native
+              protocol. See https://issues.apache.org/jira/browse/CASSANDRA-10246 for
+              more details].
+    - <n> is a [short] indicating the number of following queries.
+    - <query_1>...<query_n> are the queries to execute. A <query_i> must be of the
+      form:
+        <kind><string_or_id><n>[<name_1>]<value_1>...[<name_n>]<value_n>
+      where:
+       - <kind> is a [byte] indicating whether the following query is a prepared
+         one or not. <kind> value must be either 0 or 1.
+       - <string_or_id> depends on the value of <kind>. If <kind> == 0, it should be
+         a [long string] query string (as in QUERY, the query string might contain
+         bind markers). Otherwise (that is, if <kind> == 1), it should be a
+         [short bytes] representing a prepared query ID.
+       - <n> is a [short] indicating the number (possibly 0) of following values.
+       - <name_i> is the optional name of the following <value_i>. It must be present
+         if and only if the 0x40 flag is provided for the batch.
+       - <value_i> is the [value] to use for bound variable i (of bound variable <name_i>
+         if the 0x40 flag is used).
+    - <consistency> is the [consistency] level for the operation.
+    - <serial_consistency> is only present if the 0x10 flag is set. In that case,
+      <serial_consistency> is the [consistency] level for the serial phase of
+      conditional updates. That consistency can only be either SERIAL or
+      LOCAL_SERIAL and if not present will default to SERIAL. This option will
+      be ignored for anything else other than a conditional update/insert.
+
+  The server will respond with a RESULT message.
+
+
+4.1.8. REGISTER
+
+  Register this connection to receive some types of events. The body of the
+  message is a [string list] representing the event types to register for. See
+  section 4.2.6 for the list of valid event types.
+
+  The response to a REGISTER message will be a READY message.
+
+  Please note that if a client driver maintains multiple connections to a
+  Cassandra node and/or connections to multiple nodes, it is advised to
+  dedicate a handful of connections to receive events, but to *not* register
+  for events on all connections, as this would only result in receiving
+  multiple times the same event messages, wasting bandwidth.
+
+
+4.2. Responses
+
+  This section describes the content of the frame body for the different
+  responses. Please note that to make room for future evolution, clients should
+  support extra information (that they should simply discard) beyond what is
+  described in this document at the end of the frame body.
+
+4.2.1. ERROR
+
+  Indicates an error processing a request. The body of the message will be an
+  error code ([int]) followed by a [string] error message. Then, depending on
+  the exception, more content may follow. The error codes are defined in
+  Section 9, along with their additional content if any.
+
+
+4.2.2. READY
+
+  Indicates that the server is ready to process queries. This message will be
+  sent by the server after a STARTUP message if no authentication is
+  required (if authentication is required, the server indicates readiness by
+  sending an AUTH_SUCCESS message instead).
+
+  The body of a READY message is empty.
+
+
+4.2.3. AUTHENTICATE
+
+  Indicates that the server requires authentication, and which authentication
+  mechanism to use.
+
+  The authentication is SASL based and thus consists of a number of server
+  challenges (AUTH_CHALLENGE, Section 4.2.7) followed by client responses
+  (AUTH_RESPONSE, Section 4.1.2). The initial exchange is however bootstrapped
+  by an initial client response. The details of that exchange (including how
+  many challenge-response pairs are required) are specific to the authenticator
+  in use. The exchange ends when the server sends an AUTH_SUCCESS message or
+  an ERROR message.
+
+  This message will be sent following a STARTUP message if authentication is
+  required and must be answered by an AUTH_RESPONSE message from the client.
+
+  The body consists of a single [string] indicating the full class name of the
+  IAuthenticator in use.
+
+
+4.2.4. SUPPORTED
+
+  Indicates which startup options are supported by the server. This message
+  comes as a response to an OPTIONS message.
+
+  The body of a SUPPORTED message is a [string multimap]. This multimap gives
+  for each of the supported STARTUP options, the list of supported values.
+
+
+4.2.5. RESULT
+
+  The result to a query (QUERY, PREPARE, EXECUTE or BATCH messages).
+
+  The first element of the body of a RESULT message is an [int] representing the
+  `kind` of result. The rest of the body depends on the kind. The kind can be
+  one of:
+    0x0001    Void: for results carrying no information.
+    0x0002    Rows: for results to select queries, returning a set of rows.
+    0x0003    Set_keyspace: the result to a `use` query.
+    0x0004    Prepared: result to a PREPARE message.
+    0x0005    Schema_change: the result to a schema altering query.
+
+  The body for each kind (after the [int] kind) is defined below.
+
+
+4.2.5.1. Void
+
+  The rest of the body for a Void result is empty. It indicates that a query was
+  successful without providing more information.
+
+
+4.2.5.2. Rows
+
+  Indicates a set of rows. The rest of the body of a Rows result is:
+    <metadata><rows_count><rows_content>
+  where:
+    - <metadata> is composed of:
+        <flags><columns_count>[<paging_state>][<global_table_spec>?<col_spec_1>...<col_spec_n>]
+      where:
+        - <flags> is an [int]. The bits of <flags> provide information on the
+          formatting of the remaining information. A flag is set if the bit
+          corresponding to its `mask` is set. Supported flags are, given their
+          mask:
+            0x0001    Global_tables_spec: if set, only one table spec (keyspace
+                      and table name) is provided as <global_table_spec>. If not
+                      set, <global_table_spec> is not present.
+            0x0002    Has_more_pages: indicates whether this is not the last
+                      page of results and more should be retrieved. If set, the
+                      <paging_state> will be present. The <paging_state> is a
+                      [bytes] value that should be used in QUERY/EXECUTE to
+                      continue paging and retrieve the remainder of the result for
+                      this query (See Section 8 for more details).
+            0x0004    No_metadata: if set, the <metadata> is only composed of
+                      these <flags>, the <columns_count> and optionally the
+                      <paging_state> (depending on the Has_more_pages flag) but
+                      no other information (so no <global_table_spec> nor <col_spec_i>).
+                      This will only ever be the case if this was requested
+                      during the query (see QUERY and RESULT messages).
+        - <columns_count> is an [int] representing the number of columns selected
+          by the query that produced this result. It defines the number of <col_spec_i>
+          elements and the number of elements for each row in <rows_content>.
+        - <global_table_spec> is present if the Global_tables_spec is set in
+          <flags>. It is composed of two [string] representing the
+          (unique) keyspace name and table name the columns belong to.
+        - <col_spec_i> specifies the columns returned in the query. There are
+          <columns_count> such column specifications that are composed of:
+            (<ksname><tablename>)?<name><type>
+          The initial <ksname> and <tablename> are two [string] and are only present
+          if the Global_tables_spec flag is not set. The <name> is a
+          [string] and <type> is an [option] that corresponds to the description
+          (what this description is depends a bit on the context: in results to
+          selects, this will be either the user chosen alias or the selection used
+          (often a column name, but it can be a function call too). In results to
+          a PREPARE, this will be either the name of the corresponding bind variable
+          or the column name for the variable if it is "anonymous") and type of
+          the corresponding result. The option for <type> is either a native
+          type (see below), in which case the option has no value, or a
+          'custom' type, in which case the value is a [string] representing
+          the fully qualified class name of the type represented. Valid option
+          ids are:
+            0x0000    Custom: the value is a [string], see above.
+            0x0001    Ascii
+            0x0002    Bigint
+            0x0003    Blob
+            0x0004    Boolean
+            0x0005    Counter
+            0x0006    Decimal
+            0x0007    Double
+            0x0008    Float
+            0x0009    Int
+            0x000B    Timestamp
+            0x000C    Uuid
+            0x000D    Varchar
+            0x000E    Varint
+            0x000F    Timeuuid
+            0x0010    Inet
+            0x0011    Date
+            0x0012    Time
+            0x0013    Smallint
+            0x0014    Tinyint
+            0x0020    List: the value is an [option], representing the type
+                            of the elements of the list.
+            0x0021    Map: the value is two [option], representing the types of the
+                           keys and values of the map
+            0x0022    Set: the value is an [option], representing the type
+                            of the elements of the set
+            0x0030    UDT: the value is <ks><udt_name><n><name_1><type_1>...<name_n><type_n>
+                           where:
+                              - <ks> is a [string] representing the keyspace name this
+                                UDT is part of.
+                              - <udt_name> is a [string] representing the UDT name.
+                              - <n> is a [short] representing the number of fields of
+                                the UDT, and thus the number of <name_i><type_i> pairs
+                                following
+                              - <name_i> is a [string] representing the name of the
+                                i_th field of the UDT.
+                              - <type_i> is an [option] representing the type of the
+                                i_th field of the UDT.
+            0x0031    Tuple: the value is <n><type_1>...<type_n> where <n> is a [short]
+                             representing the number of values in the type, and <type_i>
+                             are [option] representing the type of the i_th component
+                             of the tuple
+
+    - <rows_count> is an [int] representing the number of rows present in this
+      result. Those rows are serialized in the <rows_content> part.
+    - <rows_content> is composed of <row_1>...<row_m> where m is <rows_count>.
+      Each <row_i> is composed of <value_1>...<value_n> where n is
+      <columns_count> and where <value_j> is a [bytes] representing the value
+      returned for the jth column of the ith row. In other words, <rows_content>
+      is composed of (<rows_count> * <columns_count>) [bytes].
+
+
+4.2.5.3. Set_keyspace
+
+  The result to a `use` query. The body (after the kind [int]) is a single
+  [string] indicating the name of the keyspace that has been set.
+
+
+4.2.5.4. Prepared
+
+  The result to a PREPARE message. The body of a Prepared result is:
+    <id><metadata><result_metadata>
+  where:
+    - <id> is [short bytes] representing the prepared query ID.
+    - <metadata> is composed of:
+        <flags><columns_count><pk_count>[<pk_index_1>...<pk_index_n>][<global_table_spec>?<col_spec_1>...<col_spec_n>]
+      where:
+        - <flags> is an [int]. The bits of <flags> provide information on the
+          formatting of the remaining information. A flag is set if the bit
+          corresponding to its `mask` is set. Supported masks and their flags
+          are:
+            0x0001    Global_tables_spec: if set, only one table spec (keyspace
+                      and table name) is provided as <global_table_spec>. If not
+                      set, <global_table_spec> is not present.
+        - <columns_count> is an [int] representing the number of bind markers
+          in the prepared statement.  It defines the number of <col_spec_i>
+          elements.
+        - <pk_count> is an [int] representing the number of <pk_index_i>
+          elements to follow. If this value is zero, at least one of the
+          partition key columns in the table that the statement acts on
+          did not have a corresponding bind marker (or the bind marker
+          was wrapped in a function call).
+        - <pk_index_i> is a short that represents the index of the bind marker
+          that corresponds to the partition key column in position i.
+          For example, a <pk_index> sequence of [2, 0, 1] indicates that the
+          table has three partition key columns; the full partition key
+          can be constructed by creating a composite of the values for
+          the bind markers at index 2, at index 0, and at index 1.
+          This allows implementations with token-aware routing to correctly
+          construct the partition key without needing to inspect table
+          metadata.
+        - <global_table_spec> is present if the Global_tables_spec is set in
+          <flags>. If present, it is composed of two [string]s. The first
+          [string] is the name of the keyspace that the statement acts on.
+          The second [string] is the name of the table that the columns
+          represented by the bind markers belong to.
+        - <col_spec_i> specifies the bind markers in the prepared statement.
+          There are <columns_count> such column specifications, each with the
+          following format:
+            (<ksname><tablename>)?<name><type>
+          The initial <ksname> and <tablename> are two [string] that are only
+          present if the Global_tables_spec flag is not set. The <name> field
+          is a [string] that holds the name of the bind marker (if named),
+          or the name of the column, field, or expression that the bind marker
+          corresponds to (if the bind marker is "anonymous").  The <type>
+          field is an [option] that represents the expected type of values for
+          the bind marker.  See the Rows documentation (section 4.2.5.2) for
+          full details on the <type> field.
+
+    - <result_metadata> is defined exactly the same as <metadata> in the Rows
+      documentation (section 4.2.5.2).  This describes the metadata for the
+      result set that will be returned when this prepared statement is executed.
+      Note that <result_metadata> may be empty (have the No_metadata flag and
+      0 columns, See section 4.2.5.2) and will be for any query that is not a
+      Select. In fact, there is never a guarantee that this will be non-empty, so
+      implementations should protect themselves accordingly. This result metadata
+      is an optimization that allows implementations to later execute the
+      prepared statement without requesting the metadata (see the Skip_metadata
+      flag in EXECUTE).  Clients can safely discard this metadata if they do not
+      want to take advantage of that optimization.
+
+  Note that the prepared query ID returned is global to the node on which the query
+  has been prepared. It can be used on any connection to that node
+  until the node is restarted (after which the query must be reprepared).
+
+4.2.5.5. Schema_change
+
+  The result to a schema altering query (creation/update/drop of a
+  keyspace/table/index). The body (after the kind [int]) is the same
+  as the body for a "SCHEMA_CHANGE" event, so 3 strings:
+    <change_type><target><options>
+  Please refer to section 4.2.6 below for the meaning of those fields.
+
+  Note that a query to create or drop an index is considered to be a change
+  to the table the index is on.
+
+
+4.2.6. EVENT
+
+  An event pushed by the server. A client will only receive events for the
+  types it has REGISTERed to. The body of an EVENT message will start with a
+  [string] representing the event type. The rest of the message depends on the
+  event type. The valid event types are:
+    - "TOPOLOGY_CHANGE": events related to change in the cluster topology.
+      Currently, events are sent when new nodes are added to the cluster, and
+      when nodes are removed. The body of the message (after the event type)
+      consists of a [string] and an [inet], corresponding respectively to the
+      type of change ("NEW_NODE" or "REMOVED_NODE") followed by the address of
+      the new/removed node.
+    - "STATUS_CHANGE": events related to change of node status. Currently,
+      up/down events are sent. The body of the message (after the event type)
+      consists of a [string] and an [inet], corresponding respectively to the
+      type of status change ("UP" or "DOWN") followed by the address of the
+      concerned node.
+    - "SCHEMA_CHANGE": events related to schema change. After the event type,
+      the rest of the message will be <change_type><target><options> where:
+        - <change_type> is a [string] representing the type of change involved.
+          It will be one of "CREATED", "UPDATED" or "DROPPED".
+        - <target> is a [string] that can be one of "KEYSPACE", "TABLE", "TYPE",
+          "FUNCTION" or "AGGREGATE" and describes what has been modified
+          ("TYPE" stands for modifications related to user types, "FUNCTION"
+          for modifications related to user defined functions, "AGGREGATE"
+          for modifications related to user defined aggregates).
+        - <options> depends on the preceding <target>:
+          - If <target> is "KEYSPACE", then <options> will be a single [string]
+            representing the keyspace changed.
+          - If <target> is "TABLE" or "TYPE", then
+            <options> will be 2 [string]: the first one will be the keyspace
+            containing the affected object, and the second one will be the name
+            of said affected object (either the table name or the user
+            type name).
+          - If <target> is "FUNCTION" or "AGGREGATE", multiple arguments follow:
+            - [string] keyspace containing the user defined function / aggregate
+            - [string] the function/aggregate name
+            - [string list] one string for each argument type (as CQL type)
+
+  All EVENT messages have a streamId of -1 (Section 2.3).
+
+  Please note that "NEW_NODE" and "UP" events are sent based on internal Gossip
+  communication and as such may be sent a short delay before the binary
+  protocol server on the newly up node is fully started. Clients are thus
+  advised to wait a short time before trying to connect to the node (1 second
+  should be enough), otherwise they may experience a connection refusal at
+  first.
+
+4.2.7. AUTH_CHALLENGE
+
+  A server authentication challenge (see AUTH_RESPONSE (Section 4.1.2) for more
+  details).
+
+  The body of this message is a single [bytes] token. The details of what this
+  token contains (and when it can be null/empty, if ever) depends on the actual
+  authenticator used.
+
+  Clients are expected to answer the server challenge with an AUTH_RESPONSE
+  message.
+
+4.2.8. AUTH_SUCCESS
+
+  Indicates the success of the authentication phase. See Section 4.2.3 for more
+  details.
+
+  The body of this message is a single [bytes] token holding final information
+  from the server that the client may require to finish the authentication
+  process. What that token contains and whether it can be null depends on the
+  actual authenticator used.
+
+
+5. Compression
+
+  Frame compression is supported by the protocol, but then only the frame body
+  is compressed (the frame header should never be compressed).
+
+  Before being used, client and server must agree on a compression algorithm to
+  use, which is done in the STARTUP message. As a consequence, a STARTUP message
+  must never be compressed.  However, once the STARTUP frame has been received
+  by the server, messages can be compressed (including the response to the STARTUP
+  request). Frames do not have to be compressed, however, even if compression has
+  been agreed upon (a server may only compress frames above a certain size at its
+  discretion). A frame body should be compressed if and only if the compressed
+  flag (see Section 2.2) is set.
+
+  As of version 2 of the protocol, the following compressions are available:
+    - lz4 (https://code.google.com/p/lz4/). In that case, note that the first four bytes
+      of the body will be the uncompressed length (followed by the compressed
+      bytes).
+    - snappy (https://code.google.com/p/snappy/). This compression might not be
+      available as it depends on a native lib (server-side) that might not be
+      available on some installations.
+
+
+6. Data Type Serialization Formats
+
+  This section describes the serialization formats for all CQL data types
+  supported by Cassandra through the native protocol.  These serialization
+  formats should be used by client drivers to encode values for EXECUTE
+  messages.  Cassandra will use these formats when returning values in
+  RESULT messages.
+
+  All values are represented as [bytes] in EXECUTE and RESULT messages.
+  The [bytes] format includes an int prefix denoting the length of the value.
+  For that reason, the serialization formats described here will not include
+  a length component.
+
+  For legacy compatibility reasons, note that most non-string types support
+  "empty" values (i.e. a value with zero length).  An empty value is distinct
+  from NULL, which is encoded with a negative length.
+
+  As with the rest of the native protocol, all encodings are big-endian.
+
+6.1. ascii
+
+  A sequence of bytes in the ASCII range [0, 127].  Bytes with values outside of
+  this range will result in a validation error.
+
+6.2 bigint
+
+  An eight-byte two's complement integer.
+
+6.3 blob
+
+  Any sequence of bytes.
+
+6.4 boolean
+
+  A single byte.  A value of 0 denotes "false"; any other value denotes "true".
+  (However, it is recommended that a value of 1 be used to represent "true".)
+
+6.5 date
+
+  An unsigned integer representing days with epoch centered at 2^31.
+  (unix epoch January 1st, 1970).
+  A few examples:
+    0:    -5877641-06-23
+    2^31: 1970-1-1
+    2^32: 5881580-07-11
+
+6.6 decimal
+
+  The decimal format represents an arbitrary-precision number.  It contains an
+  [int] "scale" component followed by a varint encoding (see section 6.23)
+  of the unscaled value.  The encoded value represents "<unscaled>E<-scale>".
+  In other words, "<unscaled> * 10 ^ (-1 * <scale>)".
+
+6.7 double
+
+  An 8 byte floating point number in the IEEE 754 binary64 format.
+
+6.8 float
+
+  A 4 byte floating point number in the IEEE 754 binary32 format.
+
+6.9 inet
+
+  A 4 byte or 16 byte sequence denoting an IPv4 or IPv6 address, respectively.
+
+6.10 int
+
+  A 4 byte two's complement integer.
+
+6.11 list
+
+  An [int] n indicating the number of elements in the list, followed by n
+  elements.  Each element is [bytes] representing the serialized value.
+
+6.12 map
+
+  An [int] n indicating the number of key/value pairs in the map, followed by
+  n entries.  Each entry is composed of two [bytes] representing the key
+  and value.
+
+6.13 set
+
+  An [int] n indicating the number of elements in the set, followed by n
+  elements.  Each element is [bytes] representing the serialized value.
+
+6.14 smallint
+
+  A 2 byte two's complement integer.
+
+6.15 text
+
+  A sequence of bytes conforming to the UTF-8 specifications.
+
+6.16 time
+
+  An 8 byte two's complement long representing nanoseconds since midnight.
+  Valid values are in the range 0 to 86399999999999.
+
+6.17 timestamp
+
+  An 8 byte two's complement integer representing a millisecond-precision
+  offset from the unix epoch (00:00:00, January 1st, 1970).  Negative values
+  represent a negative offset from the epoch.
+
+6.18 timeuuid
+
+  A 16 byte sequence representing a version 1 UUID as defined by RFC 4122.
+
+6.19 tinyint
+
+  A 1 byte two's complement integer.
+
+6.20 tuple
+
+  A sequence of [bytes] values representing the items in a tuple.  The encoding
+  of each element depends on the data type for that position in the tuple.
+  Null values may be represented by using length -1 for the [bytes]
+  representation of an element.
+
+6.21 uuid
+
+  A 16 byte sequence representing any valid UUID as defined by RFC 4122.
+
+6.22 varchar
+
+  An alias of the "text" type.
+
+6.23 varint
+
+  A variable-length two's complement encoding of a signed integer.
+
+  The following examples may help implementors of this spec:
+
+  Value | Encoding
+  ------|---------
+      0 |     0x00
+      1 |     0x01
+    127 |     0x7F
+    128 |   0x0080
+    129 |   0x0081
+     -1 |     0xFF
+   -128 |     0x80
+   -129 |   0xFF7F
+
+  Note that positive numbers must use a most-significant byte with a value
+  less than 0x80, because a most-significant bit of 1 indicates a negative
+  value.  Implementors should pad positive values that have a MSB >= 0x80
+  with a leading 0x00 byte.
+
+
+7. User Defined Types
+
+  This section describes the serialization format for User defined types (UDT),
+  as described in section 4.2.5.2.
+
+  A UDT value is composed of successive [bytes] values, one for each field of the UDT
+  value (in the order defined by the type). A UDT value will generally have one value
+  for each field of the type it represents, but it is allowed to have fewer values than
+  the type has fields.
+
+
+8. Result paging
+
+  The protocol allows for paging the result of queries. For that, the QUERY and
+  EXECUTE messages have a <result_page_size> value that indicates the desired
+  page size in CQL3 rows or bytes.
+
+  If a positive value is provided for <result_page_size>, the result set of the
+  RESULT message returned for the query will contain at most the
+  <result_page_size> first rows or bytes of the query result. If that first page of results
+  contains the full result set for the query, the RESULT message (of kind `Rows`)
+  will have the Has_more_pages flag *not* set. However, if some results are not
+  part of the first response, the Has_more_pages flag will be set and the result
+  will contain a <paging_state> value. In that case, the <paging_state> value
+  should be used in a QUERY or EXECUTE message (that has the *same* query as
+  the original one or the behavior is undefined) to retrieve the next page of
+  results.
+
+  Only CQL3 queries that return a result set (RESULT message with a Rows `kind`)
+  support paging. For other types of queries, the <result_page_size> value is
+  ignored.
+
+  In the previous protocol versions the page size was always provided in rows. Since 4.1
+  the page size can be provided in bytes as well. Whether the page size is specified in
+  rows or bytes is controlled by query flags (see section 4.1.4 for details).
+
+  Note to client implementors:
+  - While <result_page_size> can be as low as 1, it will likely be detrimental
+    to performance to pick a value too low. A value below 100 is probably too
+    low for most use cases.
+  - Clients should not rely on the actual size of the result set returned to
+    decide if there are more results to fetch or not. Instead, they should always
+    check the Has_more_pages flag (unless they did not enable paging for the query
+    obviously). Clients should also not assert that no result will have more than
+    <result_page_size> results. While the current implementation always respects
+    the exact value of <result_page_size>, we reserve the right to return
+    slightly smaller or bigger pages in the future for performance reasons.
+  - The <paging_state> is specific to a protocol version and drivers should not
+    send a <paging_state> returned by a node using the protocol v3 to query a node
+    using the protocol v4 for instance.
+
+
+9. Error codes
+
+  Let us recall that an ERROR message is composed of <code><message>[...]
+  (see 4.2.1 for details). The supported error codes, as well as any additional
+  information the message may contain after the <message> are described below:
+    0x0000    Server error: something unexpected happened. This indicates a
+              server-side bug.
+    0x000A    Protocol error: some client message triggered a protocol
+              violation (for instance a QUERY message is sent before a STARTUP
+              one has been sent)
+    0x0100    Authentication error: authentication was required and failed. The
+              possible reason for failing depends on the authenticator in use,
+              which may or may not include more detail in the accompanying
+              error message.
+    0x1000    Unavailable exception. The rest of the ERROR message body will be
+                <cl><required><alive>
+              where:
+                <cl> is the [consistency] level of the query that triggered
+                     the exception.
+                <required> is an [int] representing the number of nodes that
+                           should be alive to respect <cl>
+                <alive> is an [int] representing the number of replicas that
+                        were known to be alive when the request had been
+                        processed (since an unavailable exception has been
+                        triggered, there will be <alive> < <required>)
+    0x1001    Overloaded: the request cannot be processed because the
+              coordinator node is overloaded
+    0x1002    Is_bootstrapping: the request was a read request but the
+              coordinator node is bootstrapping
+    0x1003    Truncate_error: an error occurred during a truncation.
+    0x1100    Write_timeout: Timeout exception during a write request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><writeType>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           acknowledged the request.
+                <blockfor> is an [int] representing the number of replicas whose
+                           acknowledgement is required to achieve <cl>.
+                <writeType> is a [string] that describes the type of the write
+                            that timed out. The value of that string can be one
+                            of:
+                             - "SIMPLE": the write was a non-batched
+                               non-counter write.
+                             - "BATCH": the write was a (logged) batch write.
+                               If this type is received, it means the batch log
+                               has been successfully written (otherwise a
+                               "BATCH_LOG" type would have been sent instead).
+                             - "UNLOGGED_BATCH": the write was an unlogged
+                               batch. No batch log write has been attempted.
+                             - "COUNTER": the write was a counter write
+                               (batched or not).
+                             - "BATCH_LOG": the timeout occurred during the
+                               write to the batch log when a (logged) batch
+                               write was requested.
+                             - "CAS": the timeout occurred during the Compare And Set write/update.
+                             - "VIEW": the timeout occurred when a write involved a
+                                VIEW update and failed to acquire the local view (MV)
+                                lock for the key within the timeout.
+                             - "CDC": the timeout occurred when cdc_total_space_in_mb is
+                                exceeded when doing a write to data tracked by cdc.
+    0x1200    Read_timeout: Timeout exception during a read request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><data_present>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           answered the request.
+                <blockfor> is an [int] representing the number of replicas whose
+                           response is required to achieve <cl>. Please note that
+                           it is possible to have <received> >= <blockfor> if
+                           <data_present> is false. Also in the (unlikely)
+                           case where <cl> is achieved but the coordinator node
+                           times out while waiting for read-repair acknowledgement.
+                <data_present> is a single byte. If its value is 0, it means
+                               the replica that was asked for data has not
+                               responded. Otherwise, the value is != 0.
+    0x1300    Read_failure: A non-timeout exception during a read request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><numfailures><data_present>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           answered the request.
+                <blockfor> is an [int] representing the number of replicas whose
+                           acknowledgement is required to achieve <cl>.
+                <numfailures> is an [int] representing the number of nodes that
+                              experienced a failure while executing the request.
+                <data_present> is a single byte. If its value is 0, it means
+                               the replica that was asked for data had not
+                               responded. Otherwise, the value is != 0.
+    0x1400    Function_failure: A (user defined) function failed during execution.
+              The rest of the ERROR message body will be
+                <keyspace><function><arg_types>
+              where:
+                <keyspace> is the keyspace [string] of the failed function
+                <function> is the name [string] of the failed function
+                <arg_types> [string list] one string for each argument type (as CQL type) of the failed function
+    0x1500    Write_failure: A non-timeout exception during a write request. The rest
+              of the ERROR message body will be
+                <cl><received><blockfor><numfailures><writeType>
+              where:
+                <cl> is the [consistency] level of the query having triggered
+                     the exception.
+                <received> is an [int] representing the number of nodes having
+                           answered the request.
+                <blockfor> is an [int] representing the number of replicas whose
+                           acknowledgement is required to achieve <cl>.
+                <numfailures> is an [int] representing the number of nodes that
+                              experienced a failure while executing the request.
+                <writeType> is a [string] that describes the type of the write
+                            that failed. The value of that string can be one
+                            of:
+                             - "SIMPLE": the write was a non-batched
+                               non-counter write.
+                             - "BATCH": the write was a (logged) batch write.
+                               If this type is received, it means the batch log
+                               has been successfully written (otherwise a
+                               "BATCH_LOG" type would have been sent instead).
+                             - "UNLOGGED_BATCH": the write was an unlogged
+                               batch. No batch log write has been attempted.
+                             - "COUNTER": the write was a counter write
+                               (batched or not).
+                             - "BATCH_LOG": the failure occurred during the
+                               write to the batch log when a (logged) batch
+                               write was requested.
+                             - "CAS": the failure occurred during the Compare And Set write/update.
+                             - "VIEW": the failure occurred when a write involved a
+                                VIEW update and failed to acquire the local view (MV)
+                                lock for the key within the timeout.
+                             - "CDC": the failure occurred when cdc_total_space_in_mb is
+                                exceeded when doing a write to data tracked by cdc.
+
+    0x2000    Syntax_error: The submitted query has a syntax error.
+    0x2100    Unauthorized: The logged user doesn't have the right to perform
+              the query.
+    0x2200    Invalid: The query is syntactically correct but invalid.
+    0x2300    Config_error: The query is invalid because of some configuration issue
+    0x2400    Already_exists: The query attempted to create a keyspace or a
+              table that was already existing. The rest of the ERROR message
+              body will be <ks><table> where:
+                <ks> is a [string] representing either the keyspace that
+                     already exists, or the keyspace in which the table that
+                     already exists is.
+                <table> is a [string] representing the name of the table that
+                        already exists. If the query was attempting to create a
+                        keyspace, <table> will be present but will be the empty
+                        string.
+    0x2500    Unprepared: Can be thrown while a prepared statement tries to be
+              executed if the provided prepared statement ID is not known by
+              this host. The rest of the ERROR message body will be [short
+              bytes] representing the unknown ID.
+
+10. Changes from v4
+
+  * Query flags (Section 4.1.4) include a new flag 0x40000000 which denotes that
+    the page size is specified in bytes rather than in rows.
diff --git a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
index 65ed71ec6459..0ecf9e516312 100644
--- a/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
+++ b/src/java/org/apache/cassandra/batchlog/BatchlogManager.java
@@ -43,6 +43,7 @@
 
 import org.apache.cassandra.concurrent.DebuggableScheduledThreadPoolExecutor;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.ConsistencyLevel;
@@ -222,7 +223,7 @@ private void replayFailedBatches()
         String query = String.format("SELECT id, mutations, version FROM %s.%s WHERE token(id) > token(?) AND token(id) <= token(?)",
                                      SchemaConstants.SYSTEM_KEYSPACE_NAME,
                                      SystemKeyspace.BATCHES);
-        UntypedResultSet batches = executeInternalWithPaging(query, pageSize, lastReplayedUuid, limitUuid);
+        UntypedResultSet batches = executeInternalWithPaging(query, PageSize.inRows(pageSize), lastReplayedUuid, limitUuid);
         processBatchlogEntries(batches, pageSize, rateLimiter);
         lastReplayedUuid = limitUuid;
         logger.trace("Finished replayFailedBatches");
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 09cb48d2c863..4919f345b869 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -545,6 +545,8 @@ public class Config
 
     public StorageAttachedIndexOptions sai_options = new StorageAttachedIndexOptions();
 
+    public volatile int aggregation_subpage_size_in_kb = 2048;
+
     /**
      * @deprecated migrate to {@link DatabaseDescriptor#isClientInitialized()}
      */
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index ed844aea1e8b..9e5ffd4f1429 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -43,6 +43,7 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.audit.AuditLogOptions;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.fql.FullQueryLoggerOptions;
 import org.apache.cassandra.auth.AllowAllInternodeAuthenticator;
 import org.apache.cassandra.auth.AuthConfig;
@@ -895,6 +896,11 @@ else if (conf.max_value_size_in_mb >= 2048)
         }
 
         validateMaxConcurrentAutoUpgradeTasksConf(conf.max_concurrent_automatic_sstable_upgrades);
+
+        if (conf.aggregation_subpage_size_in_kb < 1)
+            throw new ConfigurationException("aggregation_subpage_size_in_kb must be greater than 0");
+
+        setAggregationSubPageSize(getAggregationSubPageSize());
     }
 
     @VisibleForTesting
@@ -3459,4 +3465,16 @@ public static boolean isEmulateDbaasDefaults()
     {
         return conf.emulate_dbaas_defaults;
     }
+
+    public static PageSize getAggregationSubPageSize() // exposes conf.aggregation_subpage_size_in_kb as a byte-based PageSize
+    {
+        return PageSize.inBytes(conf.aggregation_subpage_size_in_kb * 1024); // NOTE(review): int multiplication; a setting above Integer.MAX_VALUE / 1024 would overflow - confirm an upper bound is validated
+    }
+
+    public static void setAggregationSubPageSize(PageSize pageSize) // stores the given size back into conf as whole kilobytes
+    {
+        Preconditions.checkArgument(!pageSize.isDefined() || pageSize.getUnit() == PageSize.PageUnit.BYTES); // a defined size must be expressed in bytes
+        Preconditions.checkArgument(pageSize.bytes() >= 1024); // at least 1 KB, so the stored kb value is >= 1
+        conf.aggregation_subpage_size_in_kb = pageSize.bytes() / 1024; // integer division: any remainder below 1 KB is dropped
+    }
 }
diff --git a/src/java/org/apache/cassandra/cql3/PageSize.java b/src/java/org/apache/cassandra/cql3/PageSize.java
new file mode 100644
index 000000000000..2248610dec59
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/PageSize.java
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.cql3;
+
+import java.util.Objects;
+
+import com.google.common.base.Preconditions;
+
+public class PageSize // a (size, unit) pair held in final fields, describing a page limit in rows or in bytes
+{
+    public static final int NO_LIMIT = Integer.MAX_VALUE; // sentinel size: isDefined() is false and toString() prints "unlimited"
+
+    public static final PageSize NONE = new PageSize(NO_LIMIT, PageUnit.ROWS); // the undefined (unlimited) page size
+
+    public enum PageUnit
+    {
+        ROWS, BYTES
+    }
+
+    private final int size;
+    private final PageUnit unit;
+
+    public PageSize(int size, PageUnit unit)
+    {
+        Preconditions.checkArgument(size >= 0); // zero is accepted; only negative sizes are rejected
+        Preconditions.checkNotNull(unit);
+        this.size = size;
+        this.unit = unit;
+    }
+
+    public int getSize()
+    {
+        return size;
+    }
+
+    public PageUnit getUnit()
+    {
+        return unit;
+    }
+
+    public int bytes()
+    {
+        return unit == PageUnit.BYTES ? size : NO_LIMIT; // NO_LIMIT when this page size is expressed in rows
+    }
+
+    public int rows()
+    {
+        return unit == PageUnit.ROWS ? size : NO_LIMIT; // NO_LIMIT when this page size is expressed in bytes
+    }
+
+    /**
+     * Creates a page size representing {@code rowsCount} rows.
+     *
+     * @throws IllegalArgumentException if {@code rowsCount} is negative.
+     */
+    public static PageSize inRows(int rowsCount)
+    {
+        return new PageSize(rowsCount, PageUnit.ROWS);
+    }
+
+    /**
+     * Creates a page size representing {@code bytesCount} bytes.
+     *
+     * @throws IllegalArgumentException if {@code bytesCount} is negative.
+     */
+    public static PageSize inBytes(int bytesCount)
+    {
+        return new PageSize(bytesCount, PageUnit.BYTES);
+    }
+
+    /**
+     * Returns the minimum number of rows for the given number and the number of rows represented by this page size.
+     * If this page size is defined in bytes or undefined, it will just return the provided number of rows.
+     */
+    public int minRowsCount(int rowsCount)
+    {
+        return unit == PageUnit.ROWS ? Math.min(rowsCount, size) : rowsCount;
+    }
+
+    /**
+     * Returns the minimum number of bytes for the given number and the number of bytes represented by this page size.
+     * If this page size is defined in rows or undefined, it will just return the provided number of bytes.
+     */
+    public int minBytesCount(int bytesCount)
+    {
+        return unit == PageUnit.BYTES ? Math.min(bytesCount, size) : bytesCount;
+    }
+
+    public boolean isDefined()
+    {
+        return size < NO_LIMIT; // any size below the NO_LIMIT sentinel counts as defined, whatever the unit
+    }
+
+    public PageSize withDecreasedRows(int rowsCount) { // only a row-based size other than NO_LIMIT is reduced (floored at 0); otherwise returns this
+        return unit == PageUnit.ROWS && size != NO_LIMIT
+               ? inRows(Math.max(0, size - rowsCount))
+               : this;
+    }
+
+    public PageSize withDecreasedBytes(int bytesCount) { // only a byte-based size other than NO_LIMIT is reduced (floored at 0); otherwise returns this
+        return unit == PageUnit.BYTES && size != NO_LIMIT
+               ? inBytes(Math.max(0, size - bytesCount))
+               : this;
+    }
+
+    /**
+     * Assuming we went through the provided number of rows/bytes, it returns whether the page is completed,
+     * i.e. whether {@code unit} matches this page size's unit and {@code count} has reached its size.
+     */
+    public boolean isCompleted(int count, PageUnit unit)
+    {
+        return this.unit == unit && this.size <= count;
+    }
+
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        PageSize pageSize = (PageSize) o;
+        return size == pageSize.size && unit == pageSize.unit;
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(size, unit);
+    }
+
+    @Override
+    public String toString()
+    {
+        if (size == NO_LIMIT)
+            return "unlimited";
+        else
+            return size + " " + unit.name().toLowerCase(); // e.g. "100 rows" or "2048 bytes"
+    }
+}
diff --git a/src/java/org/apache/cassandra/cql3/QueryOptions.java b/src/java/org/apache/cassandra/cql3/QueryOptions.java
index 9d4bb48737d6..8f077e0d2536 100644
--- a/src/java/org/apache/cassandra/cql3/QueryOptions.java
+++ b/src/java/org/apache/cassandra/cql3/QueryOptions.java
@@ -19,15 +19,17 @@
 
 import java.nio.ByteBuffer;
 import java.util.*;
+import javax.annotation.Nullable;
 
 import com.google.common.collect.ImmutableList;
+import org.apache.commons.lang3.builder.ToStringBuilder;
+import org.apache.commons.lang3.builder.ToStringStyle;
 
 import io.netty.buffer.ByteBuf;
-
-import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.service.pager.PagingState;
 import org.apache.cassandra.transport.CBCodec;
@@ -35,10 +37,6 @@
 import org.apache.cassandra.transport.ProtocolException;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.Pair;
-import org.apache.commons.lang3.builder.ToStringBuilder;
-import org.apache.commons.lang3.builder.ToStringStyle;
-
-import javax.annotation.Nullable;
 
 /**
  * Options for a query.
@@ -74,7 +72,7 @@ public static QueryOptions forProtocolVersion(ProtocolVersion protocolVersion)
     public static QueryOptions create(ConsistencyLevel consistency,
                                       List<ByteBuffer> values,
                                       boolean skipMetadata,
-                                      int pageSize,
+                                      PageSize pageSize,
                                       PagingState pagingState,
                                       ConsistencyLevel serialConsistency,
                                       ProtocolVersion version,
@@ -86,7 +84,7 @@ public static QueryOptions create(ConsistencyLevel consistency,
     public static QueryOptions create(ConsistencyLevel consistency,
                                       List<ByteBuffer> values,
                                       boolean skipMetadata,
-                                      int pageSize,
+                                      PageSize pageSize,
                                       PagingState pagingState,
                                       ConsistencyLevel serialConsistency,
                                       ProtocolVersion version,
@@ -177,8 +175,8 @@ public ImmutableList<ColumnSpecification> getColumnSpecifications()
         throw new UnsupportedOperationException();
     }
 
-    /**  The pageSize for this query. Will be {@code <= 0} if not relevant for the query.  */
-    public int getPageSize()
+    /** The pageSize for this query. Will be {@link PageSize#NONE} if not relevant for the query. */
+    public PageSize getPageSize()
     {
         return getSpecificOptions().pageSize;
     }
@@ -380,16 +378,16 @@ public List<ByteBuffer> getValues()
     // Options that are likely to not be present in most queries
     static class SpecificOptions
     {
-        private static final SpecificOptions DEFAULT = new SpecificOptions(-1, null, null, Long.MIN_VALUE, null, Integer.MIN_VALUE);
+        private static final SpecificOptions DEFAULT = new SpecificOptions(PageSize.NONE, null, null, Long.MIN_VALUE, null, Integer.MIN_VALUE);
 
-        private final int pageSize;
+        private final PageSize pageSize;
         private final PagingState state;
         private final ConsistencyLevel serialConsistency;
         private final long timestamp;
         private final String keyspace;
         private final int nowInSeconds;
 
-        private SpecificOptions(int pageSize,
+        private SpecificOptions(PageSize pageSize,
                                 PagingState state,
                                 ConsistencyLevel serialConsistency,
                                 long timestamp,
@@ -418,7 +416,30 @@ private enum Flag
             TIMESTAMP,
             NAMES_FOR_VALUES,
             KEYSPACE,
-            NOW_IN_SECONDS;
+            NOW_IN_SECONDS,
+            UNUSED_9,
+            UNUSED_10,
+            UNUSED_11,
+            UNUSED_12,
+            UNUSED_13,
+            UNUSED_14,
+            UNUSED_15,
+            UNUSED_16,
+            UNUSED_17,
+            UNUSED_18,
+            UNUSED_19,
+            UNUSED_20,
+            UNUSED_21,
+            UNUSED_22,
+            UNUSED_23,
+            UNUSED_24,
+            UNUSED_25,
+            UNUSED_26,
+            UNUSED_27,
+            UNUSED_28,
+            UNUSED_29,
+            PAGE_SIZE_IN_BYTES,
+            UNUSED_31;
 
             private static final Flag[] ALL_VALUES = values();
 
@@ -472,7 +493,11 @@ public QueryOptions decode(ByteBuf body, ProtocolVersion version)
             SpecificOptions options = SpecificOptions.DEFAULT;
             if (!flags.isEmpty())
             {
-                int pageSize = flags.contains(Flag.PAGE_SIZE) ? body.readInt() : -1;
+                PageSize pageSize = flags.contains(Flag.PAGE_SIZE)
+                                          ? flags.contains(Flag.PAGE_SIZE_IN_BYTES)
+                                            ? PageSize.inBytes(body.readInt())
+                                            : PageSize.inRows(body.readInt())
+                                          : PageSize.NONE;
                 PagingState pagingState = flags.contains(Flag.PAGING_STATE) ? PagingState.deserialize(CBUtil.readValueNoCopy(body), version) : null;
                 ConsistencyLevel serialConsistency = flags.contains(Flag.SERIAL_CONSISTENCY) ? CBUtil.readConsistencyLevel(body) : null;
                 long timestamp = Long.MIN_VALUE;
@@ -505,7 +530,7 @@ public void encode(QueryOptions options, ByteBuf dest, ProtocolVersion version)
             if (flags.contains(Flag.VALUES))
                 CBUtil.writeValueList(options.getValues(), dest);
             if (flags.contains(Flag.PAGE_SIZE))
-                dest.writeInt(options.getPageSize());
+                dest.writeInt(options.getPageSize().getSize());
             if (flags.contains(Flag.PAGING_STATE))
                 CBUtil.writeValue(options.getPagingState().serialize(version), dest);
             if (flags.contains(Flag.SERIAL_CONSISTENCY))
@@ -556,7 +581,7 @@ private EnumSet<Flag> gatherFlags(QueryOptions options, ProtocolVersion version)
                 flags.add(Flag.VALUES);
             if (options.skipMetadata())
                 flags.add(Flag.SKIP_METADATA);
-            if (options.getPageSize() >= 0)
+            if (options.getPageSize().isDefined())
                 flags.add(Flag.PAGE_SIZE);
             if (options.getPagingState() != null)
                 flags.add(Flag.PAGING_STATE);
@@ -571,6 +596,8 @@ private EnumSet<Flag> gatherFlags(QueryOptions options, ProtocolVersion version)
                     flags.add(Flag.KEYSPACE);
                 if (options.getSpecificOptions().nowInSeconds != Integer.MIN_VALUE)
                     flags.add(Flag.NOW_IN_SECONDS);
+                if (options.getSpecificOptions().pageSize.getUnit() == PageSize.PageUnit.BYTES)
+                    flags.add(Flag.PAGE_SIZE_IN_BYTES);
             }
 
             return flags;
diff --git a/src/java/org/apache/cassandra/cql3/QueryProcessor.java b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
index b7836b9aeca2..2383ee85a2bd 100644
--- a/src/java/org/apache/cassandra/cql3/QueryProcessor.java
+++ b/src/java/org/apache/cassandra/cql3/QueryProcessor.java
@@ -351,7 +351,7 @@ public static UntypedResultSet execute(String query, ConsistencyLevel cl, QueryS
         }
     }
 
-    public static UntypedResultSet executeInternalWithPaging(String query, int pageSize, Object... values)
+    public static UntypedResultSet executeInternalWithPaging(String query, PageSize pageSize, Object... values)
     {
         Prepared prepared = prepareInternal(query);
         if (!(prepared.statement instanceof SelectStatement))
diff --git a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
index 767a13976dce..7e386f765d43 100644
--- a/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
+++ b/src/java/org/apache/cassandra/cql3/UntypedResultSet.java
@@ -50,7 +50,7 @@ public static UntypedResultSet create(List<Map<String, ByteBuffer>> results)
         return new FromResultList(results);
     }
 
-    public static UntypedResultSet create(SelectStatement select, QueryPager pager, int pageSize)
+    public static UntypedResultSet create(SelectStatement select, QueryPager pager, PageSize pageSize)
     {
         return new FromPager(select, pager, pageSize);
     }
@@ -64,7 +64,7 @@ public static UntypedResultSet create(SelectStatement select,
                                           ConsistencyLevel cl,
                                           QueryState queryState,
                                           QueryPager pager,
-                                          int pageSize)
+                                          PageSize pageSize)
     {
         return new FromDistributedPager(select, cl, queryState, pager, pageSize);
     }
@@ -168,10 +168,10 @@ private static class FromPager extends UntypedResultSet
     {
         private final SelectStatement select;
         private final QueryPager pager;
-        private final int pageSize;
+        private final PageSize pageSize;
         private final List<ColumnSpecification> metadata;
 
-        private FromPager(SelectStatement select, QueryPager pager, int pageSize)
+        private FromPager(SelectStatement select, QueryPager pager, PageSize pageSize)
         {
             this.select = select;
             this.pager = pager;
@@ -229,13 +229,14 @@ private static class FromDistributedPager extends UntypedResultSet
         private final ConsistencyLevel cl;
         private final QueryState queryState;
         private final QueryPager pager;
-        private final int pageSize;
+        private final PageSize pageSize;
         private final List<ColumnSpecification> metadata;
 
         private FromDistributedPager(SelectStatement select,
                                      ConsistencyLevel cl,
                                      QueryState queryState,
-                                     QueryPager pager, int pageSize)
+                                     QueryPager pager,
+                                     PageSize pageSize)
         {
             this.select = select;
             this.cl = cl;
diff --git a/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java b/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
index 6916c0f525a2..db78df50958d 100644
--- a/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/DescribeStatement.java
@@ -37,6 +37,7 @@
 import org.apache.cassandra.db.marshal.UTF8Type;
 import org.apache.cassandra.db.virtual.VirtualKeyspaceRegistry;
 import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.exceptions.OperationExecutionException;
 import org.apache.cassandra.exceptions.RequestExecutionException;
 import org.apache.cassandra.exceptions.RequestValidationException;
 import org.apache.cassandra.io.util.DataInputBuffer;
@@ -157,14 +158,17 @@ public ResultMessage executeLocally(QueryState state, QueryOptions options)
         //
 
         long offset = getOffset(pagingState, schemaVersion);
-        int pageSize = options.getPageSize();
+        PageSize pageSize = options.getPageSize();
+
+        if (pageSize.isDefined() && pageSize.getUnit() != PageSize.PageUnit.ROWS)
+            throw new OperationExecutionException("Paging in bytes is not supported for describe statement. Please specify the page size in rows.");
 
         Stream<? extends T> stream = describe(state.getClientState(), keyspaces);
 
         if (offset > 0L)
             stream = stream.skip(offset);
-        if (pageSize > 0)
-            stream = stream.limit(pageSize);
+        if (pageSize.isDefined())
+            stream = stream.limit(pageSize.getSize());
 
         List<List<ByteBuffer>> rows = stream.map(e -> toRow(e, includeInternalDetails))
                                             .collect(Collectors.toList());
@@ -172,9 +176,9 @@ public ResultMessage executeLocally(QueryState state, QueryOptions options)
         ResultSet.ResultMetadata resultMetadata = new ResultSet.ResultMetadata(metadata(state.getClientState()));
         ResultSet result = new ResultSet(resultMetadata, rows);
 
-        if (pageSize > 0 && rows.size() == pageSize)
+        if (pageSize.isDefined() && rows.size() == pageSize.getSize())
         {
-            result.metadata.setHasMorePages(getPagingState(offset + pageSize, schemaVersion));
+            result.metadata.setHasMorePages(getPagingState(offset + pageSize.getSize(), schemaVersion));
         }
 
         return new ResultMessage.Rows(result);
diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
index c20baa3ed657..abe0882503b3 100644
--- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
+++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.audit.AuditLogContext;
 import org.apache.cassandra.audit.AuditLogEntryType;
 import org.apache.cassandra.auth.Permission;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.guardrails.Guardrails;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.Schema;
@@ -69,6 +70,8 @@
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+
 import org.apache.commons.lang3.builder.ToStringBuilder;
 import org.apache.commons.lang3.builder.ToStringStyle;
 
@@ -76,6 +79,7 @@
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNotNull;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkNull;
 import static org.apache.cassandra.cql3.statements.RequestValidations.checkTrue;
+import static org.apache.cassandra.db.filter.DataLimits.NO_LIMIT;
 import static org.apache.cassandra.utils.ByteBufferUtil.UNSET_BYTE_BUFFER;
 
 /**
@@ -91,8 +95,6 @@ public class SelectStatement implements CQLStatement
 {
     private static final Logger logger = LoggerFactory.getLogger(SelectStatement.class);
 
-    public static final int DEFAULT_PAGE_SIZE = 10000;
-
     public final VariableSpecifications bindVariables;
     public final TableMetadata table;
     public final Parameters parameters;
@@ -234,6 +236,24 @@ private void validateQueryOptions(QueryState queryState, QueryOptions options)
     {
         if (SchemaConstants.isUserKeyspace(table.keyspace))
             Guardrails.disallowedWriteConsistencies.ensureAllowed(options.getConsistency(), queryState);
+
+        PageSize pageSize = options.getPageSize();
+        if (pageSize != null && options.getPageSize().isDefined() && pageSize.getUnit() == PageSize.PageUnit.BYTES)
+        {
+            Guardrails.pageSize.guard(pageSize.bytes(), "in bytes", false, queryState);
+        }
+    }
+
+    /**
+     * Returns whether the paging can be skipped based on the user limits and the page size - that is, if the user limit
+     * is provided and is lower than the page size, it means that we will only return at most one page and thus paging
+     * is unnecessary in this case. That applies to the page size defined in rows - if the page size is defined in bytes
+     * we cannot say anything about the relation between the user rows limit and the page size.
+     */
+    private boolean canSkipPaging(DataLimits userLimits, PageSize pageSize)
+    {
+        return !pageSize.isDefined() ||
+               pageSize.getUnit() == PageSize.PageUnit.ROWS && !pageSize.isCompleted(userLimits.count(), PageSize.PageUnit.ROWS);
     }
 
     public ResultMessage.Rows execute(QueryState queryState, QueryOptions options, long queryStartNanoTime)
@@ -247,12 +267,15 @@ public ResultMessage.Rows execute(QueryState queryState, QueryOptions options, l
         int nowInSec = options.getNowInSeconds(queryState);
         int userLimit = getLimit(options);
         int userPerPartitionLimit = getPerPartitionLimit(options);
-        int pageSize = options.getPageSize();
+        PageSize pageSize = options.getPageSize();
 
         Selectors selectors = selection.newSelectors(options);
         ReadQuery query = getQuery(queryState, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
 
-        if (aggregationSpec == null && (pageSize <= 0 || (query.limits().count() <= pageSize)))
+        if (query.limits().isGroupByLimit() && pageSize != null && pageSize.isDefined() && pageSize.getUnit() == PageSize.PageUnit.BYTES)
+            throw new InvalidRequestException("Paging in bytes cannot be specified for aggregation queries");
+
+        if (aggregationSpec == null && canSkipPaging(query.limits(), pageSize))
             return execute(query, options, queryState, selectors, nowInSec, userLimit, queryStartNanoTime);
 
         QueryPager pager = getPager(query, options);
@@ -270,11 +293,12 @@ public ReadQuery getQuery(QueryState state, QueryOptions options, int nowInSec)
     {
         Selectors selectors = selection.newSelectors(options);
         return getQuery(state,
-        options,
-        selectors.getColumnFilter(),
-        nowInSec,
-        getLimit(options),
-        getPerPartitionLimit(options), options.getPageSize());
+                        options,
+                        selectors.getColumnFilter(),
+                        nowInSec,
+                        getLimit(options),
+                        getPerPartitionLimit(options),
+                        options.getPageSize());
     }
 
     public ReadQuery getQuery(QueryState queryState,
@@ -283,11 +307,11 @@ public ReadQuery getQuery(QueryState queryState,
                               int nowInSec,
                               int userLimit,
                               int perPartitionLimit,
-                              int pageSize)
+                              PageSize pageSize)
     {
         boolean isPartitionRangeQuery = restrictions.isKeyRange() || restrictions.usesSecondaryIndexing() || restrictions.isDisjunction();
 
-        DataLimits limit = getDataLimits(userLimit, perPartitionLimit, pageSize);
+        DataLimits limit = getDataLimits(queryState, userLimit, perPartitionLimit);
 
         if (isPartitionRangeQuery)
             return getRangeCommand(options, columnFilter, limit, nowInSec, queryState);
@@ -344,7 +368,7 @@ public PagingState state()
             return pager.state();
         }
 
-        public abstract PartitionIterator fetchPage(int pageSize, long queryStartNanoTime);
+        public abstract PartitionIterator fetchPage(PageSize pageSize, long queryStartNanoTime);
 
         public static class NormalPager extends Pager
         {
@@ -358,7 +382,7 @@ private NormalPager(QueryPager pager, ConsistencyLevel consistency, QueryState q
                 this.queryState = queryState;
             }
 
-            public PartitionIterator fetchPage(int pageSize, long queryStartNanoTime)
+            public PartitionIterator fetchPage(PageSize pageSize, long queryStartNanoTime)
             {
                 return pager.fetchPage(pageSize, consistency, queryState, queryStartNanoTime);
             }
@@ -374,7 +398,7 @@ private InternalPager(QueryPager pager, ReadExecutionController executionControl
                 this.executionController = executionController;
             }
 
-            public PartitionIterator fetchPage(int pageSize, long queryStartNanoTime)
+            public PartitionIterator fetchPage(PageSize pageSize, long queryStartNanoTime)
             {
                 return pager.fetchPageInternal(pageSize, executionController);
             }
@@ -384,7 +408,7 @@ public PartitionIterator fetchPage(int pageSize, long queryStartNanoTime)
     private ResultMessage.Rows execute(Pager pager,
                                        QueryOptions options,
                                        Selectors selectors,
-                                       int pageSize,
+                                       PageSize pageSize,
                                        int nowInSec,
                                        int userLimit,
                                        long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
@@ -403,7 +427,7 @@ else if (restrictions.keyIsInRelation())
 
         // We can't properly do post-query ordering if we page (see #6722)
         // For GROUP BY or aggregation queries we always page internally even if the user has turned paging off
-        checkFalse(pageSize > 0 && needsPostQueryOrdering(),
+        checkFalse(pageSize.isDefined() && needsPostQueryOrdering(),
                   "Cannot page queries with both ORDER BY and a IN restriction on the partition key;"
                   + " you must either remove the ORDER BY or the IN and sort client side, or disable paging for this query");
 
@@ -446,14 +470,14 @@ public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options
     {
         int userLimit = getLimit(options);
         int userPerPartitionLimit = getPerPartitionLimit(options);
-        int pageSize = options.getPageSize();
+        PageSize pageSize = options.getPageSize();
 
         Selectors selectors = selection.newSelectors(options);
         ReadQuery query = getQuery(state, options, selectors.getColumnFilter(), nowInSec, userLimit, userPerPartitionLimit, pageSize);
 
         try (ReadExecutionController executionController = query.executionController())
         {
-            if (aggregationSpec == null && (pageSize <= 0 || (query.limits().count() <= pageSize)))
+            if (aggregationSpec == null && canSkipPaging(query.limits(), pageSize))
             {
                 try (PartitionIterator data = query.executeInternal(executionController))
                 {
@@ -473,14 +497,15 @@ public ResultMessage.Rows executeInternal(QueryState state, QueryOptions options
         }
     }
 
-    private QueryPager getPager(ReadQuery query, QueryOptions options)
+    @VisibleForTesting
+    public QueryPager getPager(ReadQuery query, QueryOptions options)
     {
         QueryPager pager = query.getPager(options.getPagingState(), options.getProtocolVersion());
 
         if (aggregationSpec == null || query.isEmpty())
             return pager;
 
-        return new AggregationQueryPager(pager, query.limits());
+        return new AggregationQueryPager(pager, DatabaseDescriptor.getAggregationSubPageSize(), query.limits());
     }
 
     public ResultSet process(PartitionIterator partitions, int nowInSec) throws InvalidRequestException
@@ -682,10 +707,10 @@ public Slices makeSlices(QueryOptions options)
         return builder.build();
     }
 
-    private DataLimits getDataLimits(int userLimit, int perPartitionLimit, int pageSize)
+    private DataLimits getDataLimits(QueryState queryState, int userLimit, int perPartitionLimit)
     {
-        int cqlRowLimit = DataLimits.NO_LIMIT;
-        int cqlPerPartitionLimit = DataLimits.NO_LIMIT;
+        int cqlRowLimit = NO_LIMIT;
+        int cqlPerPartitionLimit = NO_LIMIT;
 
         // If we do post ordering we need to get all the results sorted before we can trim them.
         if (aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING)
@@ -695,28 +720,39 @@ private DataLimits getDataLimits(int userLimit, int perPartitionLimit, int pageS
             cqlPerPartitionLimit = perPartitionLimit;
         }
 
-        // Group by and aggregation queries will always be paged internally to avoid OOM.
-        // If the user provided a pageSize we'll use that to page internally (because why not), otherwise we use our default
-        if (pageSize <= 0)
-            pageSize = DEFAULT_PAGE_SIZE;
+        DataLimits limits = null;
 
         // Aggregation queries work fine on top of the group by paging but to maintain
         // backward compatibility we need to use the old way.
         if (aggregationSpec != null && aggregationSpec != AggregationSpecification.AGGREGATE_EVERYTHING)
         {
             if (parameters.isDistinct)
-                return DataLimits.distinctLimits(cqlRowLimit);
-
-            return DataLimits.groupByLimits(cqlRowLimit,
-                                            cqlPerPartitionLimit,
-                                            pageSize,
-                                            aggregationSpec);
+                limits = DataLimits.distinctLimits(cqlRowLimit);
+            else
+                limits = DataLimits.groupByLimits(cqlRowLimit,
+                                                  cqlPerPartitionLimit,
+                                                  NO_LIMIT,
+                                                  NO_LIMIT,
+                                                  aggregationSpec);
+        }
+        else
+        {
+            if (parameters.isDistinct)
+                limits = cqlRowLimit == NO_LIMIT ? DataLimits.DISTINCT_NONE : DataLimits.distinctLimits(cqlRowLimit);
+            else
+                limits = DataLimits.cqlLimits(cqlRowLimit, cqlPerPartitionLimit);
         }
 
-        if (parameters.isDistinct)
-            return cqlRowLimit == DataLimits.NO_LIMIT ? DataLimits.DISTINCT_NONE : DataLimits.distinctLimits(cqlRowLimit);
+        if (!limits.isGroupByLimit() && Guardrails.pageSize.enabled(queryState))
+        {
+            int bytesLimit = DatabaseDescriptor.getGuardrailsConfig().page_size_failure_threshold_in_kb * 1024;
+            String limitStr = "Applied page size limit of " + FBUtilities.prettyPrintMemory(bytesLimit);
+            ClientWarn.instance.warn(limitStr);
+            logger.trace(limitStr);
+            limits = limits.forPaging(PageSize.inBytes(bytesLimit));
+        }
 
-        return DataLimits.cqlLimits(cqlRowLimit, cqlPerPartitionLimit);
+        return limits;
     }
 
     /**
@@ -745,7 +781,7 @@ public int getPerPartitionLimit(QueryOptions options)
 
     private int getLimit(Term limit, QueryOptions options)
     {
-        int userLimit = DataLimits.NO_LIMIT;
+        int userLimit = NO_LIMIT;
 
         if (limit != null)
         {
diff --git a/src/java/org/apache/cassandra/db/Mutation.java b/src/java/org/apache/cassandra/db/Mutation.java
index ae8cb6cec561..d2d817d02928 100644
--- a/src/java/org/apache/cassandra/db/Mutation.java
+++ b/src/java/org/apache/cassandra/db/Mutation.java
@@ -43,6 +43,7 @@
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
 import static org.apache.cassandra.net.MessagingService.VERSION_40;
+import static org.apache.cassandra.net.MessagingService.VERSION_41;
 import static org.apache.cassandra.utils.MonotonicClock.approxTime;
 
 public class Mutation implements IMutation
@@ -283,6 +284,7 @@ public String toString(boolean shallow)
     private int serializedSize30;
     private int serializedSize3014;
     private int serializedSize40;
+    private int serializedSize41;
     private int serializedSizeSG10;
 
     public int serializedSize(int version)
@@ -301,6 +303,10 @@ public int serializedSize(int version)
                 if (serializedSize40 == 0)
                     serializedSize40 = (int) serializer.serializedSize(this, VERSION_40);
                 return serializedSize40;
+            case VERSION_41:
+                if (serializedSize41 == 0)
+                    serializedSize41 = (int) serializer.serializedSize(this, VERSION_41);
+                return serializedSize41;
             case VERSION_SG_10:
                 if (serializedSizeSG10 == 0)
                     serializedSizeSG10 = (int) serializer.serializedSize(this, VERSION_SG_10);
diff --git a/src/java/org/apache/cassandra/db/SystemKeyspaceMigrator40.java b/src/java/org/apache/cassandra/db/SystemKeyspaceMigrator40.java
index e0a58baf637d..80ee20c8528b 100644
--- a/src/java/org/apache/cassandra/db/SystemKeyspaceMigrator40.java
+++ b/src/java/org/apache/cassandra/db/SystemKeyspaceMigrator40.java
@@ -27,6 +27,7 @@
 import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.cql3.UntypedResultSet;
@@ -55,6 +56,7 @@ public class SystemKeyspaceMigrator40
 
 
     private static final Logger logger = LoggerFactory.getLogger(SystemKeyspaceMigrator40.class);
+    private static final PageSize DEFAULT_PAGE_SIZE = PageSize.inRows(1000);
 
     private SystemKeyspaceMigrator40() {}
 
@@ -95,7 +97,7 @@ private static void migratePeers()
                                       + " values ( ?, ?, ? , ? , ?, ?, ?, ?, ?, ?, ?, ?)",
                                       peersName);
 
-        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, 1000);
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, DEFAULT_PAGE_SIZE);
         int transferred = 0;
         logger.info("Migrating rows from legacy {} to {}", legacyPeersName, peersName);
         for (UntypedResultSet.Row row : rows)
@@ -138,7 +140,7 @@ private static void migratePeerEvents()
                                       + " values ( ?, ?, ? )",
                                       peerEventsName);
 
-        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, 1000);
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, DEFAULT_PAGE_SIZE);
         int transferred = 0;
         for (UntypedResultSet.Row row : rows)
         {
@@ -173,7 +175,7 @@ static void migrateTransferredRanges()
                                       + " values ( ?, ?, ? , ?, ?)",
                                       transferredRangesName);
 
-        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, 1000);
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, DEFAULT_PAGE_SIZE);
         int transferred = 0;
         for (UntypedResultSet.Row row : rows)
         {
@@ -209,7 +211,7 @@ static void migrateAvailableRanges()
                                       + " values ( ?, ?, ? )",
                                       availableRangesName);
 
-        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, 1000);
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(query, DEFAULT_PAGE_SIZE);
         int transferred = 0;
         for (UntypedResultSet.Row row : rows)
         {
diff --git a/src/java/org/apache/cassandra/db/filter/DataLimits.java b/src/java/org/apache/cassandra/db/filter/DataLimits.java
index 845cffd4457d..a32eda5e180a 100644
--- a/src/java/org/apache/cassandra/db/filter/DataLimits.java
+++ b/src/java/org/apache/cassandra/db/filter/DataLimits.java
@@ -19,34 +19,64 @@
 
 import java.io.IOException;
 import java.nio.ByteBuffer;
-
-import org.apache.cassandra.db.*;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.StringJoiner;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.PageSize;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ClusteringComparator;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.TypeSizes;
+import org.apache.cassandra.db.aggregation.AggregationSpecification;
 import org.apache.cassandra.db.aggregation.GroupMaker;
 import org.apache.cassandra.db.aggregation.GroupingState;
-import org.apache.cassandra.db.aggregation.AggregationSpecification;
-import org.apache.cassandra.db.rows.*;
-import org.apache.cassandra.db.partitions.*;
+import org.apache.cassandra.db.partitions.CachedPartition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
+import org.apache.cassandra.db.rows.BaseRowIterator;
+import org.apache.cassandra.db.rows.Row;
+import org.apache.cassandra.db.rows.RowIterator;
+import org.apache.cassandra.db.rows.Rows;
+import org.apache.cassandra.db.rows.UnfilteredRowIterator;
 import org.apache.cassandra.db.transform.BasePartitions;
 import org.apache.cassandra.db.transform.BaseRows;
 import org.apache.cassandra.db.transform.StoppingTransformation;
 import org.apache.cassandra.db.transform.Transformation;
 import org.apache.cassandra.io.util.DataInputPlus;
 import org.apache.cassandra.io.util.DataOutputPlus;
+import org.apache.cassandra.net.MessagingService;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
 /**
- * Object in charge of tracking if we have fetch enough data for a given query.
- *
- * This is more complicated than a single count because we support PER PARTITION
- * limits, but also due to GROUP BY and paging.
+ * Object in charge of tracking if we have fetched enough data for a given query.
+ * <p>
+ * This is more complicated than a single count because we support {@code PER PARTITION}
+ * limits, but also due to {@code GROUP BY} and paging.
+ * </p>
+ * <p>
+ * Tracking happens by row count ({@link #count()}) and bytes ({@link #bytes()}), with the first exhausted limit
+ * taking precedence.
+ * </p>
+ * <p>
+ * When paging is used (see {@code forPaging} methods), the minimum number between the page size and the rows/bytes
+ * limit is enforced, meaning that we'll never return more rows than requested.
+ * </p>
  */
 public abstract class DataLimits
 {
+    private static final Logger logger = LoggerFactory.getLogger(DataLimits.class);
     public static final Serializer serializer = new Serializer();
 
     public static final int NO_LIMIT = Integer.MAX_VALUE;
 
-    public static final DataLimits NONE = new CQLLimits(NO_LIMIT)
+    public static final DataLimits NONE = new CQLLimits(NO_LIMIT, NO_LIMIT, NO_LIMIT, false)
     {
         @Override
         public boolean hasEnoughLiveData(CachedPartition cached, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
@@ -79,7 +109,7 @@ public PartitionIterator filter(PartitionIterator iter, int nowInSec, boolean co
 
     // We currently deal with distinct queries by querying full partitions but limiting the result at 1 row per
     // partition (see SelectStatement.makeFilter). So an "unbounded" distinct is still actually doing some filtering.
-    public static final DataLimits DISTINCT_NONE = new CQLLimits(NO_LIMIT, 1, true);
+    public static final DataLimits DISTINCT_NONE = new CQLLimits(NO_LIMIT, NO_LIMIT, 1, true);
 
     public enum Kind
     {
@@ -93,29 +123,30 @@ public enum Kind
 
     public static DataLimits cqlLimits(int cqlRowLimit)
     {
-        return cqlRowLimit == NO_LIMIT ? NONE : new CQLLimits(cqlRowLimit);
+        return cqlRowLimit == NO_LIMIT ? NONE : new CQLLimits(NO_LIMIT, cqlRowLimit, NO_LIMIT, false);
     }
 
     public static DataLimits cqlLimits(int cqlRowLimit, int perPartitionLimit)
     {
         return cqlRowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT
              ? NONE
-             : new CQLLimits(cqlRowLimit, perPartitionLimit);
+             : new CQLLimits(NO_LIMIT, cqlRowLimit, perPartitionLimit, false);
     }
 
-    private static DataLimits cqlLimits(int cqlRowLimit, int perPartitionLimit, boolean isDistinct)
+    private static DataLimits cqlLimits(int bytesLimit, int cqlRowLimit, int perPartitionLimit, boolean isDistinct)
     {
-        return cqlRowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT && !isDistinct
+        return bytesLimit == NO_LIMIT && cqlRowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT && !isDistinct
              ? NONE
-             : new CQLLimits(cqlRowLimit, perPartitionLimit, isDistinct);
+             : new CQLLimits(bytesLimit, cqlRowLimit, perPartitionLimit, isDistinct);
     }
 
     public static DataLimits groupByLimits(int groupLimit,
                                            int groupPerPartitionLimit,
+                                           int bytesLimit,
                                            int rowLimit,
                                            AggregationSpecification groupBySpec)
     {
-        return new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, rowLimit, groupBySpec);
+        return new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec);
     }
 
     public static DataLimits distinctLimits(int cqlRowLimit)
@@ -133,13 +164,20 @@ public boolean isGroupByLimit()
         return false;
     }
 
-    public boolean isExhausted(Counter counter)
+    /**
+     * Returns true if neither the count limit nor the bytes limit has been reached.
+     *
+     * Note: currently this method's only usage is for paging, where it is checked after processing a page as a quick
+     * signal that the data for the query is complete - if the count limit is not reached at the end of the page, this
+     * must be because there is no more data to return.
+     */
+    public boolean isCounterBelowLimits(Counter counter)
     {
-        return counter.counted() < count();
+        return counter.counted() < count() && counter.bytesCounted() < bytes();
     }
 
-    public abstract DataLimits forPaging(int pageSize);
-    public abstract DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining);
+    public abstract DataLimits forPaging(PageSize pageSize);
+    public abstract DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining);
 
     public abstract DataLimits forShortReadRetry(int toFetch);
 
@@ -177,6 +215,23 @@ public abstract Counter newCounter(int nowInSec,
                                        boolean countPartitionsWithOnlyStaticData,
                                        boolean enforceStrictLiveness);
 
+    /**
+     * The max number of bytes this limits enforces.
+     * <p>
+     * Note that if this value is set, fewer rows might be returned if the size of the current rows exceeds the bytes limit.
+     *
+     * @return the maximum number of bytes this limits enforces.
+     */
+    public abstract int bytes();
+
+    /**
+     * The max number of rows this limits enforces. Note that this means traversed rows, regardless of whether we use grouping or not.
+     * <p>
+     * @return the maximum number of rows this limits enforces.
+     */
+    @VisibleForTesting
+    public abstract int rows();
+
     /**
      * The max number of results this limits enforces.
      * <p>
@@ -195,6 +250,17 @@ public abstract Counter newCounter(int nowInSec,
      */
     public abstract DataLimits withoutState();
 
+    /**
+     * Returns a copy of this DataLimits with updated counted limit whatever it is (either the rows limit
+     * or groups limit depending on the actual implementation)
+     */
+    public abstract DataLimits withCountedLimit(int newCountedLimit);
+
+    /**
+     * Returns a copy of this DataLimits with updated bytes limit.
+     */
+    public abstract DataLimits withBytesLimit(int bytesLimit);
+
     public UnfilteredPartitionIterator filter(UnfilteredPartitionIterator iter,
                                               int nowInSec,
                                               boolean countPartitionsWithOnlyStaticData)
@@ -280,6 +346,12 @@ public RowIterator applyTo(RowIterator partition)
 
         public abstract int countedInCurrentPartition();
 
+        /**
+         * The number of bytes for the counted rows.
+         *
+         * @return the number of bytes counted.
+         */
+        public abstract int bytesCounted();
         /**
          * The number of rows counted.
          *
@@ -339,36 +411,32 @@ public void onClose()
     }
 
     /**
-     * Limits used by CQL; this counts rows.
+     * Limits used by CQL; this counts rows or bytes read. Please note:
+     * <ul>
+     * <li>When paging on rows, the minimum number of rows between the current limit and the page size is used as actual limit.</li>
+     * <li>When paging on bytes, the number of bytes takes precedence over the rows limit.</li>
+     * </ul>
      */
     private static class CQLLimits extends DataLimits
     {
+        protected final int bytesLimit;
         protected final int rowLimit;
         protected final int perPartitionLimit;
 
         // Whether the query is a distinct query or not.
         protected final boolean isDistinct;
 
-        private CQLLimits(int rowLimit)
-        {
-            this(rowLimit, NO_LIMIT);
-        }
-
-        private CQLLimits(int rowLimit, int perPartitionLimit)
-        {
-            this(rowLimit, perPartitionLimit, false);
-        }
-
-        private CQLLimits(int rowLimit, int perPartitionLimit, boolean isDistinct)
+        private CQLLimits(int bytesLimit, int rowsLimit, int perPartitionLimit, boolean isDistinct)
         {
-            this.rowLimit = rowLimit;
+            this.bytesLimit = bytesLimit;
+            this.rowLimit = rowsLimit;
             this.perPartitionLimit = perPartitionLimit;
             this.isDistinct = isDistinct;
         }
 
         private static CQLLimits distinct(int rowLimit)
         {
-            return new CQLLimits(rowLimit, 1, true);
+            return new CQLLimits(NO_LIMIT, rowLimit, 1, true);
         }
 
         public Kind kind()
@@ -378,7 +446,7 @@ public Kind kind()
 
         public boolean isUnlimited()
         {
-            return rowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT;
+            return bytesLimit == NO_LIMIT && rowLimit == NO_LIMIT && perPartitionLimit == NO_LIMIT;
         }
 
         public boolean isDistinct()
@@ -386,19 +454,27 @@ public boolean isDistinct()
             return isDistinct;
         }
 
-        public DataLimits forPaging(int pageSize)
+        public DataLimits forPaging(PageSize pageSize)
         {
-            return new CQLLimits(pageSize, perPartitionLimit, isDistinct);
+            return new CQLLimits(pageSize.minBytesCount(bytesLimit),
+                                 pageSize.minRowsCount(rowLimit),
+                                 perPartitionLimit,
+                                 isDistinct);
         }
 
-        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        public DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
         {
-            return new CQLPagingLimits(pageSize, perPartitionLimit, isDistinct, lastReturnedKey, lastReturnedKeyRemaining);
+            return new CQLPagingLimits(pageSize.minBytesCount(bytesLimit),
+                                       pageSize.minRowsCount(rowLimit),
+                                       perPartitionLimit,
+                                       isDistinct,
+                                       lastReturnedKey,
+                                       lastReturnedKeyRemaining);
         }
 
         public DataLimits forShortReadRetry(int toFetch)
         {
-            return new CQLLimits(toFetch, perPartitionLimit, isDistinct);
+            return new CQLLimits(bytesLimit, toFetch, perPartitionLimit, isDistinct);
         }
 
         public boolean hasEnoughLiveData(CachedPartition cached, int nowInSec, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
@@ -435,6 +511,16 @@ public Counter newCounter(int nowInSec,
             return new CQLCounter(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
         }
 
+        public int bytes()
+        {
+            return bytesLimit;
+        }
+
+        public int rows()
+        {
+            return rowLimit;
+        }
+
         public int count()
         {
             return rowLimit;
@@ -450,6 +536,18 @@ public DataLimits withoutState()
             return this;
         }
 
+        @Override
+        public DataLimits withCountedLimit(int newCountedLimit)
+        {
+            return new CQLLimits(bytesLimit, newCountedLimit, perPartitionLimit, isDistinct);
+        }
+
+        @Override
+        public DataLimits withBytesLimit(int bytesLimit)
+        {
+            return new CQLLimits(bytesLimit, rowLimit, perPartitionLimit, isDistinct);
+        }
+
         public float estimateTotalResults(ColumnFamilyStore cfs)
         {
             // TODO: we should start storing stats on the number of rows (instead of the number of cells, which
@@ -460,10 +558,16 @@ public float estimateTotalResults(ColumnFamilyStore cfs)
 
         protected class CQLCounter extends Counter
         {
+            /**
+             * Bytes and rows counted by this counter.
+             */
+            protected int bytesCounted;
             protected int rowsCounted;
             protected int rowsInCurrentPartition;
             protected final boolean countPartitionsWithOnlyStaticData;
 
+            protected int staticRowBytes;
+
             protected boolean hasLiveStaticRow;
 
             public CQLCounter(int nowInSec,
@@ -480,13 +584,14 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
             {
                 rowsInCurrentPartition = 0;
                 hasLiveStaticRow = !staticRow.isEmpty() && isLive(staticRow);
+                staticRowBytes = hasLiveStaticRow && bytesLimit != NO_LIMIT ? staticRow.dataSize() : 0;
             }
 
             @Override
             public Row applyToRow(Row row)
             {
                 if (isLive(row))
-                    incrementRowCount();
+                    incrementRowCount(bytesLimit != NO_LIMIT ? row.dataSize() : 0);
                 return row;
             }
 
@@ -497,15 +602,18 @@ public void onPartitionClose()
                 // rows in the partition. However, if we only have the static row, it will be returned as one row
                 // so count it.
                 if (countPartitionsWithOnlyStaticData && hasLiveStaticRow && rowsInCurrentPartition == 0)
-                    incrementRowCount();
+                    incrementRowCount(staticRowBytes);
                 super.onPartitionClose();
             }
 
-            protected void incrementRowCount()
+            protected void incrementRowCount(int rowSize)
             {
-                if (++rowsCounted >= rowLimit)
+                bytesCounted += rowSize;
+                rowsCounted++;
+                rowsInCurrentPartition++;
+                if (bytesCounted >= bytesLimit || rowsCounted >= rowLimit)
                     stop();
-                if (++rowsInCurrentPartition >= perPartitionLimit)
+                if (rowsInCurrentPartition >= perPartitionLimit)
                     stopInPartition();
             }
 
@@ -519,6 +627,11 @@ public int countedInCurrentPartition()
                 return rowsInCurrentPartition;
             }
 
+            public int bytesCounted()
+            {
+                return bytesCounted;
+            }
+
             public int rowsCounted()
             {
                 return rowsCounted;
@@ -531,31 +644,35 @@ public int rowsCountedInCurrentPartition()
 
             public boolean isDone()
             {
-                return rowsCounted >= rowLimit;
+                return rowsCounted >= rowLimit || bytesCounted >= bytesLimit || counted() >= count();
             }
 
             public boolean isDoneForPartition()
             {
                 return isDone() || rowsInCurrentPartition >= perPartitionLimit;
             }
+
+            @Override
+            public String toString()
+            {
+                return String.format("%s(bytes=%s/%s, rows=%s/%s, partition-rows=%s/%s)", this.getClass().getName(),
+                                     bytesCounted(), bytesLimit, rowsCounted(), rowLimit, rowsCountedInCurrentPartition(), perPartitionLimit);
+            }
         }
 
         @Override
         public String toString()
         {
-            StringBuilder sb = new StringBuilder();
+            List<String> limits = new ArrayList<>(3);
 
+            if (bytesLimit != NO_LIMIT)
+                limits.add("BYTES LIMIT " + bytesLimit);
             if (rowLimit != NO_LIMIT)
-            {
-                sb.append("LIMIT ").append(rowLimit);
-                if (perPartitionLimit != NO_LIMIT)
-                    sb.append(' ');
-            }
-
+                limits.add("ROWS LIMIT " + rowLimit);
             if (perPartitionLimit != NO_LIMIT)
-                sb.append("PER PARTITION LIMIT ").append(perPartitionLimit);
+                limits.add("PER PARTITION LIMIT " + perPartitionLimit);
 
-            return sb.toString();
+            return String.join(" ", limits);
         }
     }
 
@@ -564,9 +681,9 @@ private static class CQLPagingLimits extends CQLLimits
         private final ByteBuffer lastReturnedKey;
         private final int lastReturnedKeyRemaining;
 
-        public CQLPagingLimits(int rowLimit, int perPartitionLimit, boolean isDistinct, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        public CQLPagingLimits(int bytesLimit, int rowLimit, int perPartitionLimit, boolean isDistinct, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
         {
-            super(rowLimit, perPartitionLimit, isDistinct);
+            super(bytesLimit, rowLimit, perPartitionLimit, isDistinct);
             this.lastReturnedKey = lastReturnedKey;
             this.lastReturnedKeyRemaining = lastReturnedKeyRemaining;
         }
@@ -578,13 +695,13 @@ public Kind kind()
         }
 
         @Override
-        public DataLimits forPaging(int pageSize)
+        public DataLimits forPaging(PageSize pageSize)
         {
             throw new UnsupportedOperationException();
         }
 
         @Override
-        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        public DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
         {
             throw new UnsupportedOperationException();
         }
@@ -592,7 +709,19 @@ public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastRe
         @Override
         public DataLimits withoutState()
         {
-            return new CQLLimits(rowLimit, perPartitionLimit, isDistinct);
+            return new CQLLimits(bytesLimit, rowLimit, perPartitionLimit, isDistinct);
+        }
+
+        @Override
+        public DataLimits withCountedLimit(int newCountedLimit)
+        {
+            return new CQLPagingLimits(bytesLimit, newCountedLimit, perPartitionLimit, isDistinct, lastReturnedKey, lastReturnedKeyRemaining);
+        }
+
+        @Override
+        public DataLimits withBytesLimit(int bytesLimit)
+        {
+            return new CQLPagingLimits(bytesLimit, rowLimit, perPartitionLimit, isDistinct, lastReturnedKey, lastReturnedKeyRemaining);
         }
 
         @Override
@@ -622,6 +751,7 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
                     // if any already, so force hasLiveStaticRow to false so we make sure to not count it
                     // once more.
                     hasLiveStaticRow = false;
+                    staticRowBytes = 0;
                 }
                 else
                 {
@@ -629,6 +759,16 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
                 }
             }
         }
+
+        @Override
+        public String toString()
+        {
+            return new StringJoiner(", ", CQLPagingLimits.class.getSimpleName() + "[", "]")
+                   .add("super=" + super.toString())
+                   .add("lastReturnedKey=" + (lastReturnedKey != null ? ByteBufferUtil.bytesToHex(lastReturnedKey) : null))
+                   .add("lastReturnedKeyRemaining=" + lastReturnedKeyRemaining)
+                   .toString();
+        }
     }
 
     /**
@@ -661,19 +801,21 @@ private static class CQLGroupByLimits extends CQLLimits
 
         public CQLGroupByLimits(int groupLimit,
                                 int groupPerPartitionLimit,
+                                int bytesLimit,
                                 int rowLimit,
                                 AggregationSpecification groupBySpec)
         {
-            this(groupLimit, groupPerPartitionLimit, rowLimit, groupBySpec, GroupingState.EMPTY_STATE);
+            this(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec, GroupingState.EMPTY_STATE);
         }
 
         private CQLGroupByLimits(int groupLimit,
                                  int groupPerPartitionLimit,
+                                 int bytesLimit,
                                  int rowLimit,
                                  AggregationSpecification groupBySpec,
                                  GroupingState state)
         {
-            super(rowLimit, NO_LIMIT, false);
+            super(bytesLimit, rowLimit, NO_LIMIT, false);
             this.groupLimit = groupLimit;
             this.groupPerPartitionLimit = groupPerPartitionLimit;
             this.groupBySpec = groupBySpec;
@@ -694,12 +836,12 @@ public boolean isGroupByLimit()
 
         public boolean isUnlimited()
         {
-            return groupLimit == NO_LIMIT && groupPerPartitionLimit == NO_LIMIT && rowLimit == NO_LIMIT;
+            return groupLimit == NO_LIMIT && groupPerPartitionLimit == NO_LIMIT && super.isUnlimited();
         }
 
         public DataLimits forShortReadRetry(int toFetch)
         {
-            return new CQLLimits(toFetch);
+            return new CQLLimits(NO_LIMIT, toFetch, NO_LIMIT, false);
         }
 
         @Override
@@ -712,21 +854,35 @@ public float estimateTotalResults(ColumnFamilyStore cfs)
         }
 
         @Override
-        public DataLimits forPaging(int pageSize)
+        public DataLimits forPaging(PageSize pageSize)
         {
-            return new CQLGroupByLimits(pageSize,
+            if (logger.isTraceEnabled())
+                logger.trace("{} forPaging({})", hashCode(), pageSize);
+
+            return new CQLGroupByLimits(groupLimit,
                                         groupPerPartitionLimit,
-                                        rowLimit,
+                                        pageSize.minBytesCount(bytesLimit),
+                                        pageSize.minRowsCount(rowLimit),
                                         groupBySpec,
                                         state);
         }
 
         @Override
-        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
-        {
-            return new CQLGroupByPagingLimits(pageSize,
+        public DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        {
+            if (logger.isTraceEnabled())
+                logger.trace("{} forPaging({}, {}, {}) vs state {}/{}",
+                             hashCode(),
+                             pageSize,
+                             lastReturnedKey == null ? "null" : ByteBufferUtil.bytesToHex(lastReturnedKey),
+                             lastReturnedKeyRemaining,
+                             state.partitionKey() == null ? "null" : ByteBufferUtil.bytesToHex(state.partitionKey()),
+                             state.clustering() == null ? "null" : state.clustering().toString());
+
+            return new CQLGroupByPagingLimits(groupLimit,
                                               groupPerPartitionLimit,
-                                              rowLimit,
+                                              pageSize.minBytesCount(bytesLimit),
+                                              pageSize.minRowsCount(rowLimit),
                                               groupBySpec,
                                               state,
                                               lastReturnedKey,
@@ -736,8 +892,9 @@ public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastRe
         @Override
         public DataLimits forGroupByInternalPaging(GroupingState state)
         {
-            return new CQLGroupByLimits(rowLimit,
+            return new CQLGroupByLimits(groupLimit,
                                         groupPerPartitionLimit,
+                                        bytesLimit,
                                         rowLimit,
                                         groupBySpec,
                                         state);
@@ -769,41 +926,44 @@ public DataLimits withoutState()
         {
             return state == GroupingState.EMPTY_STATE
                  ? this
-                 : new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, rowLimit, groupBySpec);
+                 : new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec);
         }
 
+        @Override
+        public DataLimits withCountedLimit(int newCountedLimit)
+        {
+            return new CQLGroupByLimits(newCountedLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec, state);
+        }
+
+        @Override
+        public DataLimits withBytesLimit(int bytesLimit)
+        {
+            return new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec, state);
+        }
+
+
+
         @Override
         public String toString()
         {
-            StringBuilder sb = new StringBuilder();
+            List<String> limits = new ArrayList<>(4);
 
             if (groupLimit != NO_LIMIT)
-            {
-                sb.append("GROUP LIMIT ").append(groupLimit);
-                if (groupPerPartitionLimit != NO_LIMIT || rowLimit != NO_LIMIT)
-                    sb.append(' ');
-            }
-
+                limits.add("GROUP LIMIT " + groupLimit);
             if (groupPerPartitionLimit != NO_LIMIT)
-            {
-                sb.append("GROUP PER PARTITION LIMIT ").append(groupPerPartitionLimit);
-                if (rowLimit != NO_LIMIT)
-                    sb.append(' ');
-            }
-
+                limits.add("GROUP PER PARTITION LIMIT " + groupPerPartitionLimit);
+            if (bytesLimit != NO_LIMIT)
+                limits.add("BYTES LIMIT " + bytesLimit);
             if (rowLimit != NO_LIMIT)
-            {
-                sb.append("LIMIT ").append(rowLimit);
-            }
+                limits.add("ROWS LIMIT " + rowLimit);
 
-            return sb.toString();
+            return String.join(" ", limits);
         }
 
         @Override
-        public boolean isExhausted(Counter counter)
+        public boolean isCounterBelowLimits(Counter counter)
         {
-            return ((GroupByAwareCounter) counter).rowsCounted < rowLimit
-                    && counter.counted() < groupLimit;
+            return counter.rowsCounted() < rowLimit && counter.bytesCounted() < bytesLimit && counter.counted() < groupLimit;
         }
 
         protected class GroupByAwareCounter extends Counter
@@ -817,6 +977,11 @@ protected class GroupByAwareCounter extends Counter
              */
             protected DecoratedKey currentPartitionKey;
 
+            /**
+             * The number of bytes counted so far.
+             */
+            protected int bytesCounted;
+
             /**
              * The number of rows counted so far.
              */
@@ -842,6 +1007,8 @@ protected class GroupByAwareCounter extends Counter
 
             protected boolean hasLiveStaticRow;
 
+            protected int staticRowBytes;
+
             protected boolean hasReturnedRowsFromCurrentPartition;
 
             private GroupByAwareCounter(int nowInSec,
@@ -862,6 +1029,10 @@ private GroupByAwareCounter(int nowInSec,
             @Override
             public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
             {
+                if (logger.isTraceEnabled())
+                    logger.trace("{} - GroupByAwareCounter.newPartition {} with state {}", hashCode(),
+                                 ByteBufferUtil.bytesToHex(partitionKey.getKey()), state.partitionKey() != null ? ByteBufferUtil.bytesToHex(state.partitionKey()) : "null");
+
                 if (partitionKey.getKey().equals(state.partitionKey()))
                 {
                     // The only case were we could have state.partitionKey() equals to the partition key
@@ -871,6 +1042,7 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
                     // the static row if any already, so force hasLiveStaticRow to false so we make sure to not count it
                     // once more.
                     hasLiveStaticRow = false;
+                    staticRowBytes = 0;
                     hasReturnedRowsFromCurrentPartition = true;
                     hasUnfinishedGroup = true;
                 }
@@ -894,6 +1066,7 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
                     }
                     hasReturnedRowsFromCurrentPartition = false;
                     hasLiveStaticRow = !staticRow.isEmpty() && isLive(staticRow);
+                    staticRowBytes = hasLiveStaticRow && bytesLimit != NO_LIMIT ? staticRow.dataSize() : 0; // only size the static row when a bytes limit applies, matching applyToRow
                 }
                 currentPartitionKey = partitionKey;
                 // If we are done we need to preserve the groupInCurrentPartition and rowsCountedInCurrentPartition
@@ -908,12 +1081,19 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
             @Override
             protected Row applyToStatic(Row row)
             {
+                if (logger.isTraceEnabled())
+                    logger.trace("{} - GroupByAwareCounter.applyToStatic {}/{}",
+                                 hashCode(),
+                                 currentPartitionKey != null ? ByteBufferUtil.bytesToHex(currentPartitionKey.getKey()) : "null",
+                                 row == null ? "null" : row.clustering().toString());
+
                 // It's possible that we're "done" if the partition we just started bumped the number of groups (in
                 // applyToPartition() above), in which case Transformation will still call this method. In that case, we
                 // want to ignore the static row, it should (and will) be returned with the next page/group if needs be.
                 if (enforceLimits && isDone())
                 {
                     hasLiveStaticRow = false; // The row has not been returned
+                    staticRowBytes = 0;
                     return Rows.EMPTY_STATIC_ROW;
                 }
                 return row;
@@ -922,6 +1102,12 @@ protected Row applyToStatic(Row row)
             @Override
             public Row applyToRow(Row row)
             {
+                if (logger.isTraceEnabled())
+                    logger.trace("{} - GroupByAwareCounter.applyToRow {}/{}",
+                                 hashCode(),
+                                 ByteBufferUtil.bytesToHex(currentPartitionKey.getKey()),
+                                 row.clustering().toString());
+
                 // We want to check if the row belongs to a new group even if it has been deleted. The goal being
                 // to minimize the chances of having to go through the same data twice if we detect on the next
                 // non deleted row that we have reached the limit.
@@ -946,7 +1132,7 @@ public Row applyToRow(Row row)
                 if (isLive(row))
                 {
                     hasUnfinishedGroup = true;
-                    incrementRowCount();
+                    incrementRowCount(bytesLimit != NO_LIMIT ? row.dataSize() : 0);
                     hasReturnedRowsFromCurrentPartition = true;
                 }
 
@@ -965,6 +1151,12 @@ public int countedInCurrentPartition()
                 return groupInCurrentPartition;
             }
 
+            @Override
+            public int bytesCounted()
+            {
+                return bytesCounted;
+            }
+
             @Override
             public int rowsCounted()
             {
@@ -977,10 +1169,12 @@ public int rowsCountedInCurrentPartition()
                 return rowsCountedInCurrentPartition;
             }
 
-            protected void incrementRowCount()
+            protected void incrementRowCount(int rowSize)
             {
                 rowsCountedInCurrentPartition++;
-                if (++rowsCounted >= rowLimit)
+                rowsCounted++;
+                bytesCounted += rowSize;
+                if (rowsCounted >= rowLimit || bytesCounted >= bytesLimit)
                     stop();
             }
 
@@ -1018,7 +1212,7 @@ public void onPartitionClose()
                 // so count it.
                 if (countPartitionsWithOnlyStaticData && hasLiveStaticRow && !hasReturnedRowsFromCurrentPartition)
                 {
-                    incrementRowCount();
+                    incrementRowCount(staticRowBytes);
                     incrementGroupCount();
                     incrementGroupInCurrentPartitionCount();
                     hasUnfinishedGroup = false;
@@ -1035,7 +1229,7 @@ public void onClose()
                 // 2) the end of the data is reached
                 // We know that the end of the data is reached if the group limit has not been reached
                 // and the number of rows counted is smaller than the internal page size.
-                if (hasUnfinishedGroup && groupCounted < groupLimit && rowsCounted < rowLimit)
+                if (hasUnfinishedGroup && groupCounted < groupLimit && bytesCounted < bytesLimit && rowsCounted < rowLimit)
                 {
                     incrementGroupCount();
                     incrementGroupInCurrentPartitionCount();
@@ -1043,6 +1237,13 @@ public void onClose()
 
                 super.onClose();
             }
+
+            @Override
+            public String toString()
+            {
+                return String.format("%s(bytes=%s/%s, rows=%s/%s, partition-rows=%s/%s, groups=%s/%s, partition-groups=%s/%s)", this.getClass().getName(),
+                                     bytesCounted(), bytesLimit, rowsCounted(), rowLimit, rowsCountedInCurrentPartition(), perPartitionLimit, groupCounted, groupLimit, groupInCurrentPartition, groupPerPartitionLimit);
+            }
         }
     }
 
@@ -1054,6 +1255,7 @@ private static class CQLGroupByPagingLimits extends CQLGroupByLimits
 
         public CQLGroupByPagingLimits(int groupLimit,
                                       int groupPerPartitionLimit,
+                                      int bytesLimit,
                                       int rowLimit,
                                       AggregationSpecification groupBySpec,
                                       GroupingState state,
@@ -1062,6 +1264,7 @@ public CQLGroupByPagingLimits(int groupLimit,
         {
             super(groupLimit,
                   groupPerPartitionLimit,
+                  bytesLimit,
                   rowLimit,
                   groupBySpec,
                   state);
@@ -1077,13 +1280,13 @@ public Kind kind()
         }
 
         @Override
-        public DataLimits forPaging(int pageSize)
+        public DataLimits forPaging(PageSize pageSize)
         {
             throw new UnsupportedOperationException();
         }
 
         @Override
-        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        public DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
         {
             throw new UnsupportedOperationException();
         }
@@ -1104,9 +1307,23 @@ public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPar
         @Override
         public DataLimits withoutState()
         {
-            return new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, rowLimit, groupBySpec);
+            return new CQLGroupByLimits(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec);
         }
 
+        @Override
+        public DataLimits withCountedLimit(int newCountedLimit)
+        {
+            return new CQLGroupByPagingLimits(newCountedLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec, state, lastReturnedKey, lastReturnedKeyRemaining);
+        }
+
+        @Override
+        public DataLimits withBytesLimit(int bytesLimit)
+        {
+            return new CQLGroupByPagingLimits(groupLimit, groupPerPartitionLimit, bytesLimit, rowLimit, groupBySpec, state, lastReturnedKey, lastReturnedKeyRemaining);
+        }
+
+
+
         private class PagingGroupByAwareCounter extends GroupByAwareCounter
         {
             private PagingGroupByAwareCounter(int nowInSec, boolean assumeLiveData, boolean countPartitionsWithOnlyStaticData, boolean enforceStrictLiveness)
@@ -1117,12 +1334,17 @@ private PagingGroupByAwareCounter(int nowInSec, boolean assumeLiveData, boolean
             @Override
             public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
             {
+                if (logger.isTraceEnabled())
+                    logger.trace("{} - CQLGroupByPagingLimits.applyToPartition {}",
+                                 hashCode(), ByteBufferUtil.bytesToHex(partitionKey.getKey()));
+
                 if (partitionKey.getKey().equals(lastReturnedKey))
                 {
                     currentPartitionKey = partitionKey;
                     groupInCurrentPartition = groupPerPartitionLimit - lastReturnedKeyRemaining;
                     hasReturnedRowsFromCurrentPartition = true;
                     hasLiveStaticRow = false;
+                    staticRowBytes = 0;
                     hasUnfinishedGroup = state.hasClustering();
                 }
                 else
@@ -1131,6 +1353,16 @@ public void applyToPartition(DecoratedKey partitionKey, Row staticRow)
                 }
             }
         }
+
+        @Override
+        public String toString()
+        {
+            return new StringJoiner(", ", CQLGroupByPagingLimits.class.getSimpleName() + "[", "]")
+                   .add("super=" + super.toString())
+                   .add("lastReturnedKey=" + (lastReturnedKey != null ? ByteBufferUtil.bytesToHex(lastReturnedKey) : null))
+                   .add("lastReturnedKeyRemaining=" + lastReturnedKeyRemaining)
+                   .toString();
+        }
     }
 
     public static class Serializer
@@ -1145,6 +1377,8 @@ public void serialize(DataLimits limits, DataOutputPlus out, int version, Cluste
                     CQLLimits cqlLimits = (CQLLimits)limits;
                     out.writeUnsignedVInt(cqlLimits.rowLimit);
                     out.writeUnsignedVInt(cqlLimits.perPartitionLimit);
+                    if (version >= MessagingService.VERSION_41)
+                        out.writeUnsignedVInt(cqlLimits.bytesLimit);
                     out.writeBoolean(cqlLimits.isDistinct);
                     if (limits.kind() == Kind.CQL_PAGING_LIMIT)
                     {
@@ -1159,6 +1393,8 @@ public void serialize(DataLimits limits, DataOutputPlus out, int version, Cluste
                     out.writeUnsignedVInt(groupByLimits.groupLimit);
                     out.writeUnsignedVInt(groupByLimits.groupPerPartitionLimit);
                     out.writeUnsignedVInt(groupByLimits.rowLimit);
+                    if (version >= MessagingService.VERSION_41)
+                        out.writeUnsignedVInt(groupByLimits.bytesLimit);
 
                     AggregationSpecification groupBySpec = groupByLimits.groupBySpec;
                     AggregationSpecification.serializer.serialize(groupBySpec, out, version);
@@ -1185,12 +1421,13 @@ public DataLimits deserialize(DataInputPlus in, int version, ClusteringComparato
                 {
                     int rowLimit = (int) in.readUnsignedVInt();
                     int perPartitionLimit = (int) in.readUnsignedVInt();
+                    int bytesLimit = version >= MessagingService.VERSION_41 ? (int) in.readUnsignedVInt() : NO_LIMIT;
                     boolean isDistinct = in.readBoolean();
                     if (kind == Kind.CQL_LIMIT)
-                        return cqlLimits(rowLimit, perPartitionLimit, isDistinct);
+                        return cqlLimits(bytesLimit, rowLimit, perPartitionLimit, isDistinct);
                     ByteBuffer lastKey = ByteBufferUtil.readWithVIntLength(in);
                     int lastRemaining = (int) in.readUnsignedVInt();
-                    return new CQLPagingLimits(rowLimit, perPartitionLimit, isDistinct, lastKey, lastRemaining);
+                    return new CQLPagingLimits(bytesLimit, rowLimit, perPartitionLimit, isDistinct, lastKey, lastRemaining);
                 }
                 case CQL_GROUP_BY_LIMIT:
                 case CQL_GROUP_BY_PAGING_LIMIT:
@@ -1198,6 +1435,7 @@ public DataLimits deserialize(DataInputPlus in, int version, ClusteringComparato
                     int groupLimit = (int) in.readUnsignedVInt();
                     int groupPerPartitionLimit = (int) in.readUnsignedVInt();
                     int rowLimit = (int) in.readUnsignedVInt();
+                    int bytesLimit = version >= MessagingService.VERSION_41 ? (int) in.readUnsignedVInt() : NO_LIMIT;
 
                     AggregationSpecification groupBySpec = AggregationSpecification.serializer.deserialize(in, version, comparator);
 
@@ -1206,6 +1444,7 @@ public DataLimits deserialize(DataInputPlus in, int version, ClusteringComparato
                     if (kind == Kind.CQL_GROUP_BY_LIMIT)
                         return new CQLGroupByLimits(groupLimit,
                                                     groupPerPartitionLimit,
+                                                    bytesLimit,
                                                     rowLimit,
                                                     groupBySpec,
                                                     state);
@@ -1214,6 +1453,7 @@ public DataLimits deserialize(DataInputPlus in, int version, ClusteringComparato
                     int lastRemaining = (int) in.readUnsignedVInt();
                     return new CQLGroupByPagingLimits(groupLimit,
                                                       groupPerPartitionLimit,
+                                                      bytesLimit,
                                                       rowLimit,
                                                       groupBySpec,
                                                       state,
@@ -1234,6 +1474,8 @@ public long serializedSize(DataLimits limits, int version, ClusteringComparator
                     CQLLimits cqlLimits = (CQLLimits) limits;
                     size += TypeSizes.sizeofUnsignedVInt(cqlLimits.rowLimit);
                     size += TypeSizes.sizeofUnsignedVInt(cqlLimits.perPartitionLimit);
+                    if (version >= MessagingService.VERSION_41)
+                        size += TypeSizes.sizeofUnsignedVInt(cqlLimits.bytesLimit);
                     size += TypeSizes.sizeof(cqlLimits.isDistinct);
                     if (limits.kind() == Kind.CQL_PAGING_LIMIT)
                     {
@@ -1248,6 +1490,8 @@ public long serializedSize(DataLimits limits, int version, ClusteringComparator
                     size += TypeSizes.sizeofUnsignedVInt(groupByLimits.groupLimit);
                     size += TypeSizes.sizeofUnsignedVInt(groupByLimits.groupPerPartitionLimit);
                     size += TypeSizes.sizeofUnsignedVInt(groupByLimits.rowLimit);
+                    if (version >= MessagingService.VERSION_41)
+                        size += TypeSizes.sizeofUnsignedVInt(groupByLimits.bytesLimit);
 
                     AggregationSpecification groupBySpec = groupByLimits.groupBySpec;
                     size += AggregationSpecification.serializer.serializedSize(groupBySpec, version);
diff --git a/src/java/org/apache/cassandra/guardrails/Guardrails.java b/src/java/org/apache/cassandra/guardrails/Guardrails.java
index 3dabca591409..e23d90f13aa6 100644
--- a/src/java/org/apache/cassandra/guardrails/Guardrails.java
+++ b/src/java/org/apache/cassandra/guardrails/Guardrails.java
@@ -166,7 +166,6 @@ public abstract class Guardrails
                   (x, what, v, t) -> format("Tables cannot have more than %s materialized views, failed to create materialized view %s",
                                             t, what));
 
-    // TODO Unused until STAR-762 implements paging by bytes and can port pagesize related DB-3208 guardrails
     public static final Threshold pageSize =
     new SizeThreshold("page_size",
                       () -> -1L,
diff --git a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
index fe527ecd5908..7f6a7f4f5056 100644
--- a/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
+++ b/src/java/org/apache/cassandra/index/SecondaryIndexManager.java
@@ -53,6 +53,7 @@
 import org.apache.cassandra.concurrent.NamedThreadFactory;
 import org.apache.cassandra.concurrent.Stage;
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
@@ -940,7 +941,7 @@ public boolean hasIndexes()
     /**
      * When building an index against existing data in sstables, add the given partition to the index
      */
-    public void indexPartition(DecoratedKey key, Set<Index> indexes, int pageSize)
+    public void indexPartition(DecoratedKey key, Set<Index> indexes, PageSize pageSize)
     {
         if (logger.isTraceEnabled())
             logger.trace("Indexing partition {}", baseCfs.metadata().partitionKeyType.getString(key.getKey()));
@@ -1030,39 +1031,12 @@ public void indexPartition(DecoratedKey key, Set<Index> indexes, int pageSize)
     /**
      * Return the page size used when indexing an entire partition
      */
-    public int calculateIndexingPageSize()
+    public PageSize calculateIndexingPageSize()
     {
         if (Boolean.getBoolean("cassandra.force_default_indexing_page_size"))
-            return DEFAULT_PAGE_SIZE;
+            return PageSize.inRows(DEFAULT_PAGE_SIZE);
 
-        double targetPageSizeInBytes = 32 * 1024 * 1024;
-        double meanPartitionSize = baseCfs.getMeanPartitionSize();
-        if (meanPartitionSize <= 0)
-            return DEFAULT_PAGE_SIZE;
-
-        int meanCellsPerPartition = baseCfs.getMeanEstimatedCellPerPartitionCount();
-        if (meanCellsPerPartition <= 0)
-            return DEFAULT_PAGE_SIZE;
-
-        int columnsPerRow = baseCfs.metadata().regularColumns().size();
-        if (columnsPerRow <= 0)
-            return DEFAULT_PAGE_SIZE;
-
-        int meanRowsPerPartition = meanCellsPerPartition / columnsPerRow;
-        double meanRowSize = meanPartitionSize / meanRowsPerPartition;
-
-        int pageSize = (int) Math.max(1, Math.min(DEFAULT_PAGE_SIZE, targetPageSizeInBytes / meanRowSize));
-
-        logger.trace("Calculated page size {} for indexing {}.{} ({}/{}/{}/{})",
-                     pageSize,
-                     baseCfs.metadata.keyspace,
-                     baseCfs.metadata.name,
-                     meanPartitionSize,
-                     meanCellsPerPartition,
-                     meanRowsPerPartition,
-                     meanRowSize);
-
-        return pageSize;
+        return PageSize.inBytes(32 * 1024 * 1024);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java b/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
index b17bb65f9413..5e6af413d899 100644
--- a/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
+++ b/src/java/org/apache/cassandra/index/internal/CollatedViewIndexBuilder.java
@@ -21,6 +21,7 @@
 import java.util.Set;
 import java.util.UUID;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.compaction.CompactionInterruptedException;
@@ -65,7 +66,7 @@ public void build()
     {
         try
         {
-            int pageSize = cfs.indexManager.calculateIndexingPageSize();
+            PageSize pageSize = cfs.indexManager.calculateIndexingPageSize();
             while (iter.hasNext())
             {
                 if (isStopRequested())
diff --git a/src/java/org/apache/cassandra/net/Message.java b/src/java/org/apache/cassandra/net/Message.java
index dc2508672a06..26cbc604c2e4 100644
--- a/src/java/org/apache/cassandra/net/Message.java
+++ b/src/java/org/apache/cassandra/net/Message.java
@@ -48,6 +48,7 @@
 import static org.apache.cassandra.db.TypeSizes.sizeof;
 import static org.apache.cassandra.db.TypeSizes.sizeofUnsignedVInt;
 import static org.apache.cassandra.locator.InetAddressAndPort.Serializer.inetAddressAndPortSerializer;
+import static org.apache.cassandra.net.MessagingService.VERSION_41;
 import static org.apache.cassandra.net.MessagingService.VERSION_SG_10;
 import static org.apache.cassandra.net.MessagingService.VERSION_3014;
 import static org.apache.cassandra.net.MessagingService.VERSION_30;
@@ -1319,6 +1320,7 @@ private static <In,Out> IVersionedAsymmetricSerializer<In, Out> getPayloadSerial
     private int serializedSize30;
     private int serializedSize3014;
     private int serializedSize40;
+    private int serializedSize41;
     private int serializedSizeSG10;
 
     /**
@@ -1340,6 +1342,10 @@ public int serializedSize(int version)
                 if (serializedSize40 == 0)
                     serializedSize40 = serializer.serializedSize(this, VERSION_40);
                 return serializedSize40;
+            case VERSION_41:
+                if (serializedSize41 == 0)
+                    serializedSize41 = serializer.serializedSize(this, VERSION_41);
+                return serializedSize41;
             case VERSION_SG_10:
                 if (serializedSizeSG10 == 0)
                     serializedSizeSG10 = (int) serializer.serializedSize(this, VERSION_SG_10);
@@ -1352,6 +1358,7 @@ public int serializedSize(int version)
     private int payloadSize30   = -1;
     private int payloadSize3014 = -1;
     private int payloadSize40   = -1;
+    private int payloadSize41   = -1;
     private int payloadSizeSG10 = -1;
 
     private int payloadSize(int version)
@@ -1370,6 +1377,10 @@ private int payloadSize(int version)
                 if (payloadSize40 < 0)
                     payloadSize40 = serializer.payloadSize(this, VERSION_40);
                 return payloadSize40;
+            case VERSION_41:
+                if (payloadSize41 < 0)
+                    payloadSize41 = serializer.payloadSize(this, VERSION_41);
+                return payloadSize41;
             case VERSION_SG_10:
                 if (payloadSizeSG10 < 0)
                     payloadSizeSG10 = serializer.payloadSize(this, VERSION_SG_10);
diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java
index 7a92a2dc44f4..2d9e79ea7cea 100644
--- a/src/java/org/apache/cassandra/net/MessagingService.java
+++ b/src/java/org/apache/cassandra/net/MessagingService.java
@@ -202,6 +202,7 @@ public final class MessagingService extends MessagingServiceMBeanImpl
     public static final int VERSION_30 = 10;
     public static final int VERSION_3014 = 11;
     public static final int VERSION_40 = 12;
+    public static final int VERSION_41 = 13;
     // Current Stargazer version while we have serialization differences
     // If differences get merged upstream then we can revert to OS versioning
     public static final int VERSION_SG_10 = 100;
@@ -212,10 +213,11 @@ public final class MessagingService extends MessagingServiceMBeanImpl
 
     public enum Version
     {
-        VERSION_30(10),
-        VERSION_3014(11),
-        VERSION_40(12),
-        STARGAZER_10(100);
+        VERSION_30(MessagingService.VERSION_30),
+        VERSION_3014(MessagingService.VERSION_3014),
+        VERSION_40(MessagingService.VERSION_40),
+        VERSION_41(MessagingService.VERSION_41),
+        STARGAZER_10(MessagingService.VERSION_SG_10);
 
         public final int value;
 
diff --git a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
index fa668cbea362..f60fb38b35be 100644
--- a/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
+++ b/src/java/org/apache/cassandra/repair/consistent/LocalSessions.java
@@ -54,6 +54,7 @@
 import com.google.common.util.concurrent.ListenableFuture;
 import com.google.common.util.concurrent.MoreExecutors;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.compaction.CleanupTask;
 import org.apache.cassandra.db.compaction.OperationType;
 import org.apache.cassandra.db.compaction.RepairFinishedCompactionTask;
@@ -398,7 +399,7 @@ public synchronized void start()
     {
         Preconditions.checkArgument(!started, "LocalSessions.start can only be called once");
         Preconditions.checkArgument(sessions.isEmpty(), "No sessions should be added before start");
-        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(String.format("SELECT * FROM %s.%s", keyspace, table), 1000);
+        UntypedResultSet rows = QueryProcessor.executeInternalWithPaging(String.format("SELECT * FROM %s.%s", keyspace, table), PageSize.inRows(1000));
         Map<UUID, LocalSession> loadedSessions = new HashMap<>();
         for (UntypedResultSet.Row row : rows)
         {
diff --git a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
index 7f0896a445a4..e3b16a76c136 100644
--- a/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AbstractQueryPager.java
@@ -17,6 +17,14 @@
  */
 package org.apache.cassandra.service.pager;
 
+import java.util.StringJoiner;
+
+import javax.annotation.concurrent.NotThreadSafe;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.partitions.*;
@@ -26,23 +34,42 @@
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 
+@NotThreadSafe
 abstract class AbstractQueryPager<T extends ReadQuery> implements QueryPager
 {
+    private static final Logger logger = LoggerFactory.getLogger(AbstractQueryPager.class);
+
     protected final T query;
+
+    // the limits provided as a part of the query
     protected final DataLimits limits;
     protected final ProtocolVersion protocolVersion;
     private final boolean enforceStrictLiveness;
 
-    private int remaining;
+    // This is the counter which was used for the last page we fetched. It can be used to obtain the number of
+    // fetched rows or bytes.
+    private DataLimits.Counter lastCounter;
 
-    // This is the last key we've been reading from (or can still be reading within). This the key for
+    // This is the last key we've been reading from (or can still be reading within). This is the key for
     // which remainingInPartition makes sense: if we're starting another key, we should reset remainingInPartition
     // (and this is done in PagerIterator). This can be null (when we start).
     private DecoratedKey lastKey;
+
+    // The remaining and remainingInPartition are initially set to the user limits provided in the query (via the
+    // LIMIT and PER PARTITION LIMIT clauses). When a page is fetched, iterated and closed, those values are updated
+    // with the number of items counted on that recently fetched page.
+    private int remaining;
     private int remainingInPartition;
 
+    // Whether the pager is exhausted or not - the pager gets exhausted if the recently fetched, iterated and closed
+    // page has fewer items than the requested page size
     private boolean exhausted;
 
+    // The paging transformation used for the most recently requested page. It is set when we request a new page
+    // and cleared when that page is closed. We use it to prevent fetching a new page until the previous one is
+    // closed.
+    private PagerTransformation<?> currentPagerTransformation;
+
     protected AbstractQueryPager(T query, ProtocolVersion protocolVersion)
     {
         this.query = query;
@@ -59,58 +86,77 @@ public ReadExecutionController executionController()
         return query.executionController();
     }
 
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime)
+    public PartitionIterator fetchPage(PageSize pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime)
     {
+        assert currentPagerTransformation == null;
+
         if (isExhausted())
             return EmptyIterators.partition();
 
-        pageSize = Math.min(pageSize, remaining);
-        Pager pager = new RowPager(limits.forPaging(pageSize), query.nowInSec());
-        ReadQuery readQuery = nextPageReadQuery(pageSize);
+        DataLimits updatedQueryLimits = nextPageLimits();
+        RowPagerTransformation pagerTransformation = new RowPagerTransformation(updatedQueryLimits.forPaging(pageSize), query.nowInSec());
+        ReadQuery readQuery = nextPageReadQuery(pageSize, updatedQueryLimits);
         if (readQuery == null)
         {
             exhausted = true;
             return EmptyIterators.partition();
         }
-        return Transformation.apply(readQuery.execute(consistency, queryState, queryStartNanoTime), pager);
+        currentPagerTransformation = pagerTransformation;
+        return Transformation.apply(readQuery.execute(consistency, queryState, queryStartNanoTime), pagerTransformation);
     }
 
-    public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController)
+    @Override
+    public PartitionIterator fetchPageInternal(PageSize pageSize, ReadExecutionController executionController)
     {
+        assert currentPagerTransformation == null;
+
         if (isExhausted())
             return EmptyIterators.partition();
 
-        pageSize = Math.min(pageSize, remaining);
-        RowPager pager = new RowPager(limits.forPaging(pageSize), query.nowInSec());
-        ReadQuery readQuery = nextPageReadQuery(pageSize);
+        DataLimits updatedQueryLimits = nextPageLimits();
+        RowPagerTransformation pagerTransformation = new RowPagerTransformation(updatedQueryLimits.forPaging(pageSize), query.nowInSec());
+        ReadQuery readQuery = nextPageReadQuery(pageSize, updatedQueryLimits);
         if (readQuery == null)
         {
             exhausted = true;
             return EmptyIterators.partition();
         }
-        return Transformation.apply(readQuery.executeInternal(executionController), pager);
+        currentPagerTransformation = pagerTransformation;
+        return Transformation.apply(readQuery.executeInternal(executionController), pagerTransformation);
     }
 
-    public UnfilteredPartitionIterator fetchPageUnfiltered(TableMetadata metadata, int pageSize, ReadExecutionController executionController)
+    public UnfilteredPartitionIterator fetchPageUnfiltered(TableMetadata metadata, PageSize pageSize, ReadExecutionController executionController)
     {
+        assert currentPagerTransformation == null;
+
         if (isExhausted())
             return EmptyIterators.unfilteredPartition(metadata);
 
-        pageSize = Math.min(pageSize, remaining);
-        UnfilteredPager pager = new UnfilteredPager(limits.forPaging(pageSize), query.nowInSec());
-        ReadQuery readQuery = nextPageReadQuery(pageSize);
+        DataLimits updatedQueryLimits = nextPageLimits();
+        UnfilteredPagerTransformation pagerTransformation = new UnfilteredPagerTransformation(updatedQueryLimits.forPaging(pageSize), query.nowInSec());
+        ReadQuery readQuery = nextPageReadQuery(pageSize, updatedQueryLimits);
         if (readQuery == null)
         {
             exhausted = true;
             return EmptyIterators.unfilteredPartition(metadata);
         }
-        return Transformation.apply(readQuery.executeLocally(executionController), pager);
+        currentPagerTransformation = pagerTransformation;
+        return Transformation.apply(readQuery.executeLocally(executionController), pagerTransformation);
     }
 
-    private class UnfilteredPager extends Pager<Unfiltered>
+    /**
+     * For subsequent pages we want to cap the counted limit at the minimum of the limit set in the query and the
+     * number of items still remaining to be returned by this pager. Note that paging itself is applied separately.
+     */
+    protected DataLimits nextPageLimits()
     {
+        return limits.withCountedLimit(Math.min(limits.count(), remaining));
+    }
 
-        private UnfilteredPager(DataLimits pageLimits, int nowInSec)
+    private class UnfilteredPagerTransformation extends PagerTransformation<Unfiltered>
+    {
+
+        private UnfilteredPagerTransformation(DataLimits pageLimits, int nowInSec)
         {
             super(pageLimits, nowInSec);
         }
@@ -121,10 +167,10 @@ protected BaseRowIterator<Unfiltered> apply(BaseRowIterator<Unfiltered> partitio
         }
     }
 
-    private class RowPager extends Pager<Row>
+    private class RowPagerTransformation extends PagerTransformation<Row>
     {
 
-        private RowPager(DataLimits pageLimits, int nowInSec)
+        private RowPagerTransformation(DataLimits pageLimits, int nowInSec)
         {
             super(pageLimits, nowInSec);
         }
@@ -135,7 +181,7 @@ protected BaseRowIterator<Row> apply(BaseRowIterator<Row> partition)
         }
     }
 
-    private abstract class Pager<T extends Unfiltered> extends Transformation<BaseRowIterator<T>>
+    private abstract class PagerTransformation<T extends Unfiltered> extends Transformation<BaseRowIterator<T>>
     {
         private final DataLimits pageLimits;
         protected final DataLimits.Counter counter;
@@ -143,10 +189,20 @@ private abstract class Pager<T extends Unfiltered> extends Transformation<BaseRo
         private Row lastRow;
         private boolean isFirstPartition = true;
 
-        private Pager(DataLimits pageLimits, int nowInSec)
+        private PagerTransformation(DataLimits pageLimits, int nowInSec)
         {
             this.counter = pageLimits.newCounter(nowInSec, true, query.selectsFullPartition(), enforceStrictLiveness);
+            lastCounter = this.counter;
+
+            // Page limits are passed to the transformation for two reasons
+            // - to create a counter,
+            // - to determine if the query pager is exhausted when we reach the end of data (if the end of data
+            //   is reached before we reach the page limit, we conclude that the query pager is exhausted and there
+            //   are no more pages to fetch)
             this.pageLimits = pageLimits;
+
+            if (logger.isTraceEnabled())
+                logger.trace("Fetching new page - created {}", this);
         }
 
         @Override
@@ -177,6 +233,7 @@ public BaseRowIterator<T> applyToPartition(BaseRowIterator<T> partition)
         @Override
         public void onClose()
         {
+            assert lastCounter == counter;
             // In some case like GROUP BY a counter need to know when the processing is completed.
             counter.onClose();
 
@@ -196,7 +253,9 @@ public void onClose()
             {
                 remainingInPartition -= counter.countedInCurrentPartition();
             }
-            exhausted = pageLimits.isExhausted(counter);
+            // if the counter did not count up to the page limits, then the iteration must have reached the end
+            exhausted = pageLimits.isCounterBelowLimits(counter);
+            currentPagerTransformation = null;
         }
 
         public Row applyToStatic(Row row)
@@ -222,6 +281,18 @@ public Row applyToRow(Row row)
             lastRow = row;
             return row;
         }
+
+        @Override
+        public String toString()
+        {
+            return new StringJoiner(", ", PagerTransformation.class.getSimpleName() + "[", "]")
+                   .add("pageLimits=" + pageLimits)
+                   .add("counter=" + counter)
+                   .add("currentKey=" + currentKey)
+                   .add("lastRow=" + lastRow)
+                   .add("isFirstPartition=" + isFirstPartition)
+                   .toString();
+        }
     }
 
     protected void restoreState(DecoratedKey lastKey, int remaining, int remainingInPartition)
@@ -233,7 +304,7 @@ protected void restoreState(DecoratedKey lastKey, int remaining, int remainingIn
 
     public boolean isExhausted()
     {
-        return exhausted || remaining == 0 || ((this instanceof SinglePartitionPager) && remainingInPartition == 0);
+        return exhausted || remaining == 0;
     }
 
     public int maxRemaining()
@@ -246,7 +317,30 @@ protected int remainingInPartition()
         return remainingInPartition;
     }
 
-    protected abstract T nextPageReadQuery(int pageSize);
+    /**
+     * Returns the {@link DataLimits.Counter} of the most recently fetched page, that is, of the last partition
+     * iterator returned by this pager. That iterator must have been traversed and closed before the returned
+     * counter can be relied upon.
+     */
+    public DataLimits.Counter getLastCounter()
+    {
+        return lastCounter;
+    }
+
+    protected abstract T nextPageReadQuery(PageSize pageSize, DataLimits limits);
     protected abstract void recordLast(DecoratedKey key, Row row);
     protected abstract boolean isPreviouslyReturnedPartition(DecoratedKey key);
+
+    @Override
+    public String toString()
+    {
+        return new StringJoiner(", ", AbstractQueryPager.class.getSimpleName() + "[", "]")
+               .add("limits=" + limits)
+               .add("remaining=" + remaining)
+               .add("lastCounter=" + lastCounter)
+               .add("lastKey=" + lastKey)
+               .add("remainingInPartition=" + remainingInPartition)
+               .add("exhausted=" + exhausted)
+               .toString();
+    }
 }
diff --git a/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java b/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
index 5a484aaa8566..414b0800d375 100644
--- a/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/AggregationQueryPager.java
@@ -19,7 +19,13 @@
 
 import java.nio.ByteBuffer;
 import java.util.NoSuchElementException;
+import java.util.StringJoiner;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.cql3.PageSize;
+import org.apache.cassandra.exceptions.OperationExecutionException;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.aggregation.GroupingState;
@@ -34,31 +40,54 @@
  * <p>
  * For aggregation/group by queries, the user page size is in number of groups. But each group could be composed of very
  * many rows so to avoid running into OOMs, this pager will page internal queries into sub-pages. So each call to
- * {@link fetchPage} may (transparently) yield multiple internal queries (sub-pages).
+ * {@link #fetchPage(PageSize, ConsistencyLevel, QueryState, long)} may (transparently) yield multiple internal queries
+ * (sub-pages).
  */
 public final class AggregationQueryPager implements QueryPager
 {
+    private static final Logger logger = LoggerFactory.getLogger(AggregationQueryPager.class);
+
     private final DataLimits limits;
 
+    private final PageSize subPageSize;
+
     // The sub-pager, used to retrieve the next sub-page.
     private QueryPager subPager;
 
-    public AggregationQueryPager(QueryPager subPager, DataLimits limits)
+    public AggregationQueryPager(QueryPager subPager, PageSize subPageSize, DataLimits limits)
     {
         this.subPager = subPager;
         this.limits = limits;
+        this.subPageSize = subPageSize;
     }
 
+    /**
+     * This will return the iterator over the partitions. The iterator is limited by the provided page size and the user
+     * specified limit (in the query). Both the limit and the page size are applied to the number of groups covered by
+     * the returned data.
+     * <p>
+     * The page size, if defined, must be expressed in rows ({@link OperationExecutionException} is thrown
+     * otherwise), for any kind of aggregation query. In case of 'aggregate everything' queries, the provided
+     * page size and the limits are otherwise ignored as we always return a single row.
+     *
+     * @param pageSize    the maximum number of elements to return in the next page (groups)
+     * @param consistency the consistency level to achieve for the query
+     * @param queryState  the {@code QueryState} for the query. In practice, this can be null unless
+     *                    {@code consistency} is a serial consistency
+     */
     @Override
-    public PartitionIterator fetchPage(int pageSize,
+    public PartitionIterator fetchPage(PageSize pageSize,
                                        ConsistencyLevel consistency,
                                        QueryState queryState,
-                                       long queryStartNanoTime)
+                                       long queryStartNanoTime) throws OperationExecutionException
     {
+        if (pageSize.isDefined() && pageSize.getUnit() != PageSize.PageUnit.ROWS)
+            throw new OperationExecutionException("Paging in bytes is not supported for aggregation queries. Please specify the page size in rows.");
+
         if (limits.isGroupByLimit())
-            return new GroupByPartitionIterator(pageSize, consistency, queryState, queryStartNanoTime);
+            return new GroupByPartitionIterator(pageSize, subPageSize, consistency, queryState, queryStartNanoTime);
 
-        return new AggregationPartitionIterator(pageSize, consistency, queryState, queryStartNanoTime);
+        return new AggregationPartitionIterator(subPageSize, consistency, queryState, queryStartNanoTime);
     }
 
     @Override
@@ -67,13 +96,22 @@ public ReadExecutionController executionController()
         return subPager.executionController();
     }
 
+    /**
+     * See {@link #fetchPage(PageSize, ConsistencyLevel, QueryState, long)}.
+     *
+     * @param pageSize the maximum number of elements to return in the next page
+     * @param executionController the {@code ReadExecutionController} protecting the read
+     */
     @Override
-    public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController)
+    public PartitionIterator fetchPageInternal(PageSize pageSize, ReadExecutionController executionController)
     {
+        if (pageSize.isDefined() && pageSize.getUnit() != PageSize.PageUnit.ROWS)
+            throw new OperationExecutionException("Paging in bytes is not supported for aggregation queries. Please specify the page size in rows.");
+
         if (limits.isGroupByLimit())
-            return new GroupByPartitionIterator(pageSize, executionController, System.nanoTime());
+            return new GroupByPartitionIterator(pageSize, subPageSize, executionController, System.nanoTime());
 
-        return new AggregationPartitionIterator(pageSize, executionController, System.nanoTime());
+        return new AggregationPartitionIterator(subPageSize, executionController, System.nanoTime());
     }
 
     @Override
@@ -109,7 +147,12 @@ public class GroupByPartitionIterator implements PartitionIterator
         /**
          * The top-level page size in number of groups.
          */
-        private final int pageSize;
+        private final PageSize groupsPageSize;
+
+        /**
+         * Page size for internal paging
+         */
+        private final PageSize subPageSize;
 
         // For "normal" queries
         private final ConsistencyLevel consistency;
@@ -148,46 +191,44 @@ public class GroupByPartitionIterator implements PartitionIterator
          */
         private Clustering<?> lastClustering;
 
-        /**
-         * The initial amount of row remaining
-         */
-        private int initialMaxRemaining;
-
         private long queryStartNanoTime;
 
-        public GroupByPartitionIterator(int pageSize,
+        protected int initialMaxRemaining;
+
+        public GroupByPartitionIterator(PageSize groupsPageSize,
+                                        PageSize subPageSize,
                                         ConsistencyLevel consistency,
                                         QueryState queryState,
                                         long queryStartNanoTime)
         {
-            this(pageSize, consistency, queryState, null, queryStartNanoTime);
+            this(groupsPageSize, subPageSize, consistency, queryState, null, queryStartNanoTime);
         }
 
-        public GroupByPartitionIterator(int pageSize,
+        public GroupByPartitionIterator(PageSize groupsPageSize,
+                                        PageSize subPageSize,
                                         ReadExecutionController executionController,
                                         long queryStartNanoTime)
        {
-           this(pageSize, null, null, executionController, queryStartNanoTime);
+           this(groupsPageSize, subPageSize, null, null, executionController, queryStartNanoTime);
        }
 
-        private GroupByPartitionIterator(int pageSize,
+        private GroupByPartitionIterator(PageSize groupsPageSize,
+                                         PageSize subPageSize,
                                          ConsistencyLevel consistency,
                                          QueryState queryState,
                                          ReadExecutionController executionController,
                                          long queryStartNanoTime)
         {
-            this.pageSize = handlePagingOff(pageSize);
+            this.groupsPageSize = groupsPageSize;
+            this.subPageSize = subPageSize;
             this.consistency = consistency;
             this.queryState = queryState;
             this.executionController = executionController;
             this.queryStartNanoTime = queryStartNanoTime;
-        }
+            subPager = subPager.withUpdatedLimit(limits.withCountedLimit(groupsPageSize.minRowsCount(maxRemaining())));
 
-        private int handlePagingOff(int pageSize)
-        {
-            // If the paging is off, the pageSize will be <= 0. So we need to replace
-            // it by DataLimits.NO_LIMIT
-            return pageSize <= 0 ? DataLimits.NO_LIMIT : pageSize;
+            if (logger.isTraceEnabled())
+                logger.trace("Fetching a new page - created {}", this);
         }
 
         public final void close()
@@ -213,39 +254,42 @@ public final boolean hasNext()
         }
 
         /**
-         * Loads the next <code>RowIterator</code> to be returned.
+         * Loads the next <code>RowIterator</code> to be returned. The iteration finishes when we reach either the
+         * user groups limit or the groups page size. The user provided limit is initially set in subPager.maxRemaining().
          */
         private void fetchNextRowIterator()
         {
+            // we haven't started yet, fetch the first sub page (partition iterator with sub-page limit)
             if (partitionIterator == null)
             {
                 initialMaxRemaining = subPager.maxRemaining();
-                partitionIterator = fetchSubPage(pageSize);
+                partitionIterator = fetchSubPage(subPageSize);
             }
 
             while (!partitionIterator.hasNext())
             {
                 partitionIterator.close();
 
-                int counted = initialMaxRemaining - subPager.maxRemaining();
-
-                if (isDone(pageSize, counted) || subPager.isExhausted())
+                int remaining = getRemaining();
+                assert remaining >= 0;
+                if (remaining == 0 || subPager.isExhausted())
                 {
                     endOfData = true;
                     closed = true;
                     return;
                 }
 
-                subPager = updatePagerLimit(subPager, limits, lastPartitionKey, lastClustering);
-                partitionIterator = fetchSubPage(computeSubPageSize(pageSize, counted));
+                subPager = updatePagerLimit(subPager, limits.withCountedLimit(remaining), lastPartitionKey, lastClustering);
+                partitionIterator = fetchSubPage(subPageSize);
             }
 
             next = partitionIterator.next();
         }
 
-        protected boolean isDone(int pageSize, int counted)
+        protected int getRemaining()
         {
-            return counted == pageSize;
+            int counted = initialMaxRemaining - subPager.maxRemaining();
+            return groupsPageSize.withDecreasedRows(counted).rows();
         }
 
         /**
@@ -267,25 +311,13 @@ protected QueryPager updatePagerLimit(QueryPager pager,
             return pager.withUpdatedLimit(newLimits);
         }
 
-        /**
-         * Computes the size of the next sub-page to retrieve.
-         *
-         * @param pageSize the top-level page size
-         * @param counted the number of result returned so far by the previous sub-pages
-         * @return the size of the next sub-page to retrieve
-         */
-        protected int computeSubPageSize(int pageSize, int counted)
-        {
-            return pageSize - counted;
-        }
-
         /**
          * Fetchs the next sub-page.
          *
          * @param subPageSize the sub-page size in number of groups
          * @return the next sub-page
          */
-        private final PartitionIterator fetchSubPage(int subPageSize)
+        private final PartitionIterator fetchSubPage(PageSize subPageSize)
         {
             return consistency != null ? subPager.fetchPage(subPageSize, consistency, queryState, queryStartNanoTime)
                                        : subPager.fetchPageInternal(subPageSize, executionController);
@@ -384,11 +416,42 @@ public boolean hasNext()
 
             public Row next()
             {
+                // we need to check this because this.rowIterator may exhaust if the sub-page is done and in such a case
+                // #hasNext switches this.rowIterator to the new one, which is obtained for the next page
+                if (!hasNext())
+                    throw new NoSuchElementException();
+
                 Row row = this.rowIterator.next();
                 lastClustering = row.clustering();
                 return row;
             }
         }
+
+        // Trace-only description. Uses this iterator's own controller: subPager.executionController() would start a new read that must be closed.
+        @Override
+        public String toString()
+        {
+            return new StringJoiner(", ", GroupByPartitionIterator.class.getSimpleName() + "[", "]")
+                   .add("groupsPageSize=" + groupsPageSize)
+                   .add("subPageSize=" + subPageSize)
+                   .add("endOfData=" + endOfData).add("closed=" + closed)
+                   .add("limits=" + limits)
+                   .add("lastPartitionKey=" + lastPartitionKey)
+                   .add("lastClustering=" + ((lastClustering != null && executionController != null) ? lastClustering.toString(executionController.metadata()) : String.valueOf(lastClustering)))
+                   .add("initialMaxRemaining=" + initialMaxRemaining)
+                   .add("sub-pager=" + subPager.toString())
+                   .toString();
+        }
+    }
+
+    @Override
+    public String toString()
+    {
+        return new StringJoiner(", ", AggregationQueryPager.class.getSimpleName() + "[", "]")
+               .add("limits=" + limits)
+               .add("subPageSize=" + subPageSize)
+               .add("subPager=" + subPager)
+               .toString();
     }
 
     /**
@@ -398,19 +461,19 @@ public Row next()
      */
     public final class AggregationPartitionIterator extends GroupByPartitionIterator
     {
-        public AggregationPartitionIterator(int pageSize,
+        public AggregationPartitionIterator(PageSize subPageSize,
                                             ConsistencyLevel consistency,
                                             QueryState queryState,
                                             long queryStartNanoTime)
         {
-            super(pageSize, consistency, queryState, queryStartNanoTime);
+            super(PageSize.NONE, subPageSize, consistency, queryState, queryStartNanoTime);
         }
 
-        public AggregationPartitionIterator(int pageSize,
+        public AggregationPartitionIterator(PageSize subPageSize,
                                             ReadExecutionController executionController,
                                             long queryStartNanoTime)
         {
-            super(pageSize, executionController, queryStartNanoTime);
+            super(PageSize.NONE, subPageSize, executionController, queryStartNanoTime);
         }
 
         @Override
@@ -423,15 +486,9 @@ protected QueryPager updatePagerLimit(QueryPager pager,
         }
 
         @Override
-        protected boolean isDone(int pageSize, int counted)
-        {
-            return false;
-        }
-
-        @Override
-        protected int computeSubPageSize(int pageSize, int counted)
+        protected int getRemaining()
         {
-            return pageSize;
+            return initialMaxRemaining;
         }
     }
 }
diff --git a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
index 83dfbc91d81c..9c236efad21b 100644
--- a/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
+++ b/src/java/org/apache/cassandra/service/pager/MultiPartitionPager.java
@@ -17,11 +17,18 @@
  */
 package org.apache.cassandra.service.pager;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.AbstractIterator;
 
 import java.util.Arrays;
+import java.util.StringJoiner;
+
+import javax.annotation.Nonnull;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.*;
@@ -46,12 +53,24 @@
  */
 public class MultiPartitionPager<T extends SinglePartitionReadQuery> implements QueryPager
 {
+    private static final Logger logger = LoggerFactory.getLogger(MultiPartitionPager.class);
+
+    private static final SinglePartitionPager[] NO_PAGERS = new SinglePartitionPager[0];
+
+    // a pager per queried partition
+    @Nonnull
     private final SinglePartitionPager[] pagers;
+
+    // user limit
     private final DataLimits limit;
 
     private final int nowInSec;
 
+    // the number of rows left to be returned according to the user limits (those provided in query)
+    // when remaining reaches 0, the pager is considered exhausted
     private int remaining;
+
+    // the index of the current single partition pager
     private int current;
 
     public MultiPartitionPager(SinglePartitionReadQuery.Group<T> group, PagingState state, ProtocolVersion protocolVersion)
@@ -69,7 +88,7 @@ public MultiPartitionPager(SinglePartitionReadQuery.Group<T> group, PagingState
 
         if (i >= group.queries.size())
         {
-            pagers = null;
+            pagers = NO_PAGERS;
             return;
         }
 
@@ -122,7 +141,8 @@ public PagingState state()
 
     public boolean isExhausted()
     {
-        if (remaining <= 0 || pagers == null)
+        assert remaining >= 0;
+        if (remaining == 0)
             return true;
 
         while (current < pagers.length)
@@ -148,23 +168,27 @@ public ReadExecutionController executionController()
     }
 
     @SuppressWarnings("resource") // iter closed via countingIter
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
+    public PartitionIterator fetchPage(PageSize pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
     {
-        int toQuery = Math.min(remaining, pageSize);
-        return new PagersIterator(toQuery, consistency, queryState, null, queryStartNanoTime);
+        return new PagersIterator(pageSize, consistency, queryState, null, queryStartNanoTime);
     }
 
     @SuppressWarnings("resource") // iter closed via countingIter
-    public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException
+    public PartitionIterator fetchPageInternal(PageSize pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException
     {
-        int toQuery = Math.min(remaining, pageSize);
-        return new PagersIterator(toQuery, null, null, executionController, System.nanoTime());
+        return new PagersIterator(pageSize, null, null, executionController, System.nanoTime());
     }
 
+    /**
+     * This is an iterator over RowIterators (subsequent partitions). It starts from {@link #pagers} at {@link #current}
+     * and makes sure that the overall amount of data does not exceed the provided {@link PagersIterator#pageSize}.
+     * This means that it can cut the row iteration in the first partition or return multiple partitions and cut the
+     * row iterator in n-th partition. It will update the {@link #current} index and {@link #remaining} as it goes.
+     */
     private class PagersIterator extends AbstractIterator<RowIterator> implements PartitionIterator
     {
-        private final int pageSize;
-        private PartitionIterator result;
+        private final PageSize pageSize;
+        private PartitionIterator partitionIterator;
         private boolean closed;
         private final long queryStartNanoTime;
 
@@ -175,32 +199,48 @@ private class PagersIterator extends AbstractIterator<RowIterator> implements Pa
         // For internal queries
         private final ReadExecutionController executionController;
 
-        private int pagerMaxRemaining;
+        private int countedRows;
+        private int countedBytes;
         private int counted;
 
-        public PagersIterator(int pageSize, ConsistencyLevel consistency, QueryState queryState, ReadExecutionController executionController, long queryStartNanoTime)
+        public PagersIterator(PageSize pageSize, ConsistencyLevel consistency, QueryState queryState, ReadExecutionController executionController, long queryStartNanoTime)
         {
             this.pageSize = pageSize;
             this.consistency = consistency;
             this.queryState = queryState;
             this.executionController = executionController;
             this.queryStartNanoTime = queryStartNanoTime;
+
+            if (logger.isTraceEnabled())
+                logger.trace("Fetching a new page - created {}", this);
         }
 
         protected RowIterator computeNext()
         {
-            while (result == null || !result.hasNext())
+            while (partitionIterator == null || !partitionIterator.hasNext())
             {
-                if (result != null)
+                DataLimits.Counter lastPageCounter = null;
+                if (partitionIterator != null)
                 {
-                    result.close();
-                    counted += pagerMaxRemaining - pagers[current].maxRemaining();
+                    // we've just reached the end of partition,
+                    // let's close the row iterator and update the global counters
+                    partitionIterator.close();
+
+                    lastPageCounter = pagers[current].getLastCounter();
+                    countedRows += lastPageCounter.rowsCounted();
+                    countedBytes += lastPageCounter.bytesCounted();
+                    counted += lastPageCounter.counted();
+                    remaining -= lastPageCounter.counted();
                 }
 
-                // We are done if we have reached the page size or in the case of GROUP BY if the current pager
-                // is not exhausted.
-                boolean isDone = counted >= pageSize
-                        || (result != null && limit.isGroupByLimit() && !pagers[current].isExhausted());
+                // We are done if:
+                // - we have reached the page size,
+                // - or in the case of GROUP BY if the current pager is not exhausted - which means that we read all the rows within the limit before exhausting the pager
+                boolean isDone = pageSize.isCompleted(countedRows, PageSize.PageUnit.ROWS)
+                                 || pageSize.isCompleted(countedBytes, PageSize.PageUnit.BYTES)
+                                 || limit.count() <= counted
+                                 || limit.bytes() <= countedBytes
+                                 || (partitionIterator != null && limit.isGroupByLimit() && !pagers[current].isExhausted());
 
                 // isExhausted() will sets us on the first non-exhausted pager
                 if (isDone || isExhausted())
@@ -209,20 +249,44 @@ protected RowIterator computeNext()
                     return endOfData();
                 }
 
-                pagerMaxRemaining = pagers[current].maxRemaining();
-                int toQuery = pageSize - counted;
-                result = consistency == null
-                       ? pagers[current].fetchPageInternal(toQuery, executionController)
-                       : pagers[current].fetchPage(toQuery, consistency, queryState, queryStartNanoTime);
+                // we will update the limits for the current pager before using it so that we can be sure we don't fetch
+                // more than remaining or more than what was left to be fetched according to the recently set limits
+                // (for example in case of groups paging) - that latter limit is just the limit which was set minus what
+                // we counted so far
+                int newCountedLimit = Math.max(0, Math.min(remaining, limit.count() - counted));
+                // this works exactly the same way as above - it is required for the limits imposed by Guardrails,
+                // which are set on the query
+                int newBytesLimit = Math.max(0, limit.bytes() - countedBytes);
+
+                DataLimits updatedLimit = pagers[current].limits.withCountedLimit(newCountedLimit).withBytesLimit(newBytesLimit);
+                pagers[current] = pagers[current].withUpdatedLimit(updatedLimit);
+
+                PageSize remainingPagePart = pageSize.withDecreasedRows(countedRows)
+                                                     .withDecreasedBytes(countedBytes);
+
+                partitionIterator = consistency == null
+                                    ? pagers[current].fetchPageInternal(remainingPagePart, executionController)
+                                    : pagers[current].fetchPage(remainingPagePart, consistency, queryState, queryStartNanoTime);
             }
-            return result.next();
+            return partitionIterator.next();
         }
 
         public void close()
         {
-            remaining -= counted;
-            if (result != null && !closed)
-                result.close();
+            if (partitionIterator != null && !closed)
+                partitionIterator.close();
+        }
+
+        @Override
+        public String toString()
+        {
+            return new StringJoiner(", ", PagersIterator.class.getSimpleName() + "[", "]")
+                   .add("pageSize=" + pageSize)
+                   .add("closed=" + closed)
+                   .add("countedRows=" + countedRows)
+                   .add("countedBytes=" + countedBytes)
+                   .add("counted=" + counted)
+                   .toString();
         }
     }
 
@@ -230,4 +294,15 @@ public int maxRemaining()
     {
         return remaining;
     }
+
+    @Override
+    public String toString()
+    {
+        return new StringJoiner(", ", MultiPartitionPager.class.getSimpleName() + "[", "]")
+               .add("current=" + current)
+               .add("pagers.length=" + pagers.length)
+               .add("limit=" + limit)
+               .add("remaining=" + remaining)
+               .toString();
+    }
 }
\ No newline at end of file
diff --git a/src/java/org/apache/cassandra/service/pager/PagingState.java b/src/java/org/apache/cassandra/service/pager/PagingState.java
index 4dde93631088..a4dff99ae8b3 100644
--- a/src/java/org/apache/cassandra/service/pager/PagingState.java
+++ b/src/java/org/apache/cassandra/service/pager/PagingState.java
@@ -24,9 +24,11 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.primitives.Ints;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import org.apache.cassandra.db.Clustering;
 import org.apache.cassandra.db.marshal.AbstractType;
-import org.apache.cassandra.db.marshal.ByteArrayAccessor;
 import org.apache.cassandra.db.marshal.ByteBufferAccessor;
 import org.apache.cassandra.db.marshal.BytesType;
 import org.apache.cassandra.db.marshal.CompositeType;
@@ -49,6 +51,8 @@
 @SuppressWarnings("WeakerAccess")
 public class PagingState
 {
+    private static final Logger logger = LoggerFactory.getLogger(PagingState.class);
+
     public final ByteBuffer partitionKey;  // Can be null for single partition queries.
     public final RowMark rowMark;          // Can be null if not needed.
     public final int remaining;
@@ -115,10 +119,14 @@ public static PagingState deserialize(ByteBuffer bytes, ProtocolVersion protocol
         }
         catch (IOException e)
         {
-            throw new ProtocolException("Invalid value for the paging state");
+            String msg =  "Failed to deserialize the paging state with protocol version: " + protocolVersion;
+            logger.trace(msg, e);
+            throw new ProtocolException(msg, protocolVersion);
         }
 
-        throw new ProtocolException("Invalid value for the paging state");
+        String msg =  "The serialized paging state does not match any serialization format for protocol version: " + protocolVersion;
+        logger.trace(msg);
+        throw new ProtocolException(msg, protocolVersion);
     }
 
     /*
diff --git a/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java b/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java
index 4f1e0e76c357..7c29c7d3d5e4 100644
--- a/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/PartitionRangeQueryPager.java
@@ -17,6 +17,9 @@
  */
 package org.apache.cassandra.service.pager;
 
+import java.util.StringJoiner;
+
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.rows.Row;
@@ -74,15 +77,14 @@ public PagingState state()
     }
 
     @Override
-    protected PartitionRangeReadQuery nextPageReadQuery(int pageSize)
+    protected PartitionRangeReadQuery nextPageReadQuery(PageSize pageSize, DataLimits limits)
     {
-        DataLimits limits;
         DataRange fullRange = query.dataRange();
         DataRange pageRange;
         if (lastReturnedKey == null)
         {
             pageRange = fullRange;
-            limits = query.limits().forPaging(pageSize);
+            limits = limits.forPaging(pageSize);
         }
         // if the last key was the one of the end of the range we know that we are done
         else if (lastReturnedKey.equals(fullRange.keyRange().right) && remainingInPartition() == 0 && lastReturnedRow == null)
@@ -97,12 +99,12 @@ else if (lastReturnedKey.equals(fullRange.keyRange().right) && remainingInPartit
             if (includeLastKey)
             {
                 pageRange = fullRange.forPaging(bounds, query.metadata().comparator, lastReturnedRow.clustering(query.metadata()), false);
-                limits = query.limits().forPaging(pageSize, lastReturnedKey.getKey(), remainingInPartition());
+                limits = limits.forPaging(pageSize, lastReturnedKey.getKey(), remainingInPartition());
             }
             else
             {
                 pageRange = fullRange.forSubRange(bounds);
-                limits = query.limits().forPaging(pageSize);
+                limits = limits.forPaging(pageSize);
             }
         }
 
@@ -139,4 +141,14 @@ private AbstractBounds<PartitionPosition> makeKeyBounds(PartitionPosition lastRe
              ? new IncludingExcludingBounds<>(lastReturnedKey, bounds.right)
              : new ExcludingBounds<>(lastReturnedKey, bounds.right);
     }
+
+    @Override
+    public String toString()
+    {
+        return new StringJoiner(", ", PartitionRangeQueryPager.class.getSimpleName() + "[", "]")
+               .add("super=" + super.toString())
+               .add("lastReturnedKey=" + lastReturnedKey)
+               .add("lastReturnedRow=" + (lastReturnedRow != null ? lastReturnedRow.clustering(query.metadata()).toString(query.metadata()) : null))
+               .toString();
+    }
 }
diff --git a/src/java/org/apache/cassandra/service/pager/QueryPager.java b/src/java/org/apache/cassandra/service/pager/QueryPager.java
index e9b0b158e76d..f1e9f23847c6 100644
--- a/src/java/org/apache/cassandra/service/pager/QueryPager.java
+++ b/src/java/org/apache/cassandra/service/pager/QueryPager.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.service.pager;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.ReadExecutionController;
 import org.apache.cassandra.db.filter.DataLimits;
@@ -54,12 +55,12 @@ public ReadExecutionController executionController()
             return ReadExecutionController.empty();
         }
 
-        public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
+        public PartitionIterator fetchPage(PageSize pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException
         {
             return EmptyIterators.partition();
         }
 
-        public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException
+        public PartitionIterator fetchPageInternal(PageSize pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException
         {
             return EmptyIterators.partition();
         }
@@ -94,13 +95,13 @@ public QueryPager withUpdatedLimit(DataLimits newLimits)
      * {@code consistency} is a serial consistency.
      * @return the page of result.
      */
-    public PartitionIterator fetchPage(int pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException;
+    public PartitionIterator fetchPage(PageSize pageSize, ConsistencyLevel consistency, QueryState queryState, long queryStartNanoTime) throws RequestValidationException, RequestExecutionException;
 
     /**
      * Starts a new read operation.
      * <p>
-     * This must be called before {@link fetchPageInternal} and passed to it to protect the read.
-     * The returned object <b>must</b> be closed on all path and it is thus strongly advised to
+     * This must be called before {@link #fetchPageInternal(PageSize, ReadExecutionController)} and passed to it
+     * to protect the read. The returned object <b>must</b> be closed on all paths and it is thus strongly advised to
      * use it in a try-with-ressource construction.
      *
      * @return a newly started order group for this {@code QueryPager}.
@@ -114,7 +115,7 @@ public QueryPager withUpdatedLimit(DataLimits newLimits)
      * @param executionController the {@code ReadExecutionController} protecting the read.
      * @return the page of result.
      */
-    public PartitionIterator fetchPageInternal(int pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException;
+    public PartitionIterator fetchPageInternal(PageSize pageSize, ReadExecutionController executionController) throws RequestValidationException, RequestExecutionException;
 
     /**
      * Whether or not this pager is exhausted, i.e. whether or not a call to
diff --git a/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java b/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java
index 832526e5ce6c..68d355571490 100644
--- a/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java
+++ b/src/java/org/apache/cassandra/service/pager/SinglePartitionPager.java
@@ -18,7 +18,9 @@
 package org.apache.cassandra.service.pager;
 
 import java.nio.ByteBuffer;
+import java.util.StringJoiner;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.rows.*;
 import org.apache.cassandra.db.filter.*;
@@ -26,7 +28,7 @@
 
 /**
  * Common interface to single partition queries (by slice and by name).
- *
+ * <p>
  * For use by MultiPartitionPager.
  */
 public class SinglePartitionPager extends AbstractQueryPager<SinglePartitionReadQuery>
@@ -78,21 +80,27 @@ public DataLimits limits()
     public PagingState state()
     {
         return lastReturned == null
-             ? null
-             : new PagingState(null, lastReturned, maxRemaining(), remainingInPartition());
+               ? null
+               : new PagingState(null, lastReturned, maxRemaining(), remainingInPartition());
     }
 
     @Override
-    protected SinglePartitionReadQuery nextPageReadQuery(int pageSize)
+    protected SinglePartitionReadQuery nextPageReadQuery(PageSize pageSize, DataLimits limits)
     {
         Clustering<?> clustering = lastReturned == null ? null : lastReturned.clustering(query.metadata());
-        DataLimits limits = lastReturned == null
-                          ? limits().forPaging(pageSize)
-                          : limits().forPaging(pageSize, key(), remainingInPartition());
+        limits = lastReturned == null
+                 ? limits.forPaging(pageSize)
+                 : limits.forPaging(pageSize, key(), remainingInPartition());
 
         return query.forPaging(clustering, limits);
     }
 
+    @Override
+    public boolean isExhausted()
+    {
+        return super.isExhausted() || remainingInPartition() == 0;
+    }
+
     protected void recordLast(DecoratedKey key, Row last)
     {
         if (last != null && last.clustering() != Clustering.STATIC_CLUSTERING)
@@ -103,4 +111,13 @@ protected boolean isPreviouslyReturnedPartition(DecoratedKey key)
     {
         return lastReturned != null;
     }
+
+    @Override
+    public String toString()
+    {
+        return new StringJoiner(", ", SinglePartitionPager.class.getSimpleName() + "[", "]")
+               .add("super=" + super.toString())
+               .add("lastReturned=" + (lastReturned != null ? lastReturned.clustering(query.metadata()).toString(query.metadata()) : null))
+               .toString();
+    }
 }
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
index c3ea9e7b535c..38171cc00279 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadPartitionsProtection.java
@@ -134,7 +134,7 @@ public UnfilteredPartitionIterator moreContents()
          * Can only take the short cut if there is no per partition limit set. Otherwise it's possible to hit false
          * positives due to some rows being uncounted for in certain scenarios (see CASSANDRA-13911).
          */
-        if (command.limits().isExhausted(singleResultCounter) && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
+        if (command.limits().isCounterBelowLimits(singleResultCounter) && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
             return null;
 
         /*
diff --git a/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java b/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
index 4ed9e329563b..1c08e0ff293b 100644
--- a/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
+++ b/src/java/org/apache/cassandra/service/reads/ShortReadRowsProtection.java
@@ -92,7 +92,7 @@ public UnfilteredRowIterator moreContents()
          * Can only take the short cut if there is no per partition limit set. Otherwise it's possible to hit false
          * positives due to some rows being uncounted for in certain scenarios (see CASSANDRA-13911).
          */
-        if (command.limits().isExhausted(singleResultCounter) && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
+        if (command.limits().isCounterBelowLimits(singleResultCounter) && command.limits().perPartitionCount() == DataLimits.NO_LIMIT)
             return null;
 
         /*
diff --git a/src/java/org/apache/cassandra/transport/Client.java b/src/java/org/apache/cassandra/transport/Client.java
index 76f710e5ee51..82aa79ebbfcd 100644
--- a/src/java/org/apache/cassandra/transport/Client.java
+++ b/src/java/org/apache/cassandra/transport/Client.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.auth.PasswordAuthenticator;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.config.EncryptionOptions;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.db.ConsistencyLevel;
 import org.apache.cassandra.db.marshal.Int32Type;
@@ -131,14 +132,14 @@ else if (msgType.equals("QUERY"))
             line = line.substring(6);
             // Ugly hack to allow setting a page size, but that's playground code anyway
             String query = line;
-            int pageSize = -1;
+            PageSize pageSize = PageSize.NONE;
             if (line.matches(".+ !\\d+$"))
             {
                 int idx = line.lastIndexOf('!');
                 query = line.substring(0, idx-1);
                 try
                 {
-                    pageSize = Integer.parseInt(line.substring(idx+1, line.length()));
+                    pageSize = PageSize.inRows(Integer.parseInt(line.substring(idx + 1, line.length())));
                 }
                 catch (NumberFormatException e)
                 {
diff --git a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
index 2fc06a69f774..55fb1e6a5fe0 100644
--- a/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/ExecuteMessage.java
@@ -123,7 +123,7 @@ protected Message.Response execute(QueryState state, long queryStartNanoTime, bo
             CQLStatement statement = prepared.statement;
             options.prepare(statement.getBindVariables());
 
-            if (options.getPageSize() == 0)
+            if (options.getPageSize().getSize() == 0)
                 throw new ProtocolException("The page size cannot be 0");
 
             if (traceRequest)
@@ -182,8 +182,11 @@ else if (options.skipMetadata())
     private void traceQuery(QueryState state, QueryHandler.Prepared prepared)
     {
         ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
-        if (options.getPageSize() > 0)
-            builder.put("page_size", Integer.toString(options.getPageSize()));
+        if (options.getPageSize().isDefined())
+        {
+            builder.put("page_size", Integer.toString(options.getPageSize().getSize()));
+            builder.put("page_size_unit", options.getPageSize().getUnit().name());
+        }
         if (options.getConsistency() != null)
             builder.put("consistency_level", options.getConsistency().name());
         if (options.getSerialConsistency(state) != null)
diff --git a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
index cfb653134c4b..29ce76e76c22 100644
--- a/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/OptionsMessage.java
@@ -18,14 +18,17 @@
 package org.apache.cassandra.transport.messages;
 
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 import io.netty.buffer.ByteBuf;
 
 import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.service.QueryState;
 import org.apache.cassandra.transport.Compressor;
@@ -37,6 +40,8 @@
  */
 public class OptionsMessage extends Message.Request
 {
+    private static final List<String> supportedPageUnits = Arrays.stream(PageSize.PageUnit.values()).map(PageSize.PageUnit::name).collect(Collectors.toList());
+
     public static final Message.Codec<OptionsMessage> codec = new Message.Codec<OptionsMessage>()
     {
         public OptionsMessage decode(ByteBuf body, ProtocolVersion version)
@@ -76,6 +81,7 @@ protected Message.Response execute(QueryState state, long queryStartNanoTime, bo
         supported.put(StartupMessage.COMPRESSION, compressions);
         supported.put(StartupMessage.PROTOCOL_VERSIONS, ProtocolVersion.supportedVersions());
         supported.put(StartupMessage.EMULATE_DBAAS_DEFAULTS, Collections.singletonList(String.valueOf(DatabaseDescriptor.isEmulateDbaasDefaults())));
+        supported.put(StartupMessage.PAGE_UNIT, supportedPageUnits);
 
         return new SupportedMessage(supported);
     }
diff --git a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
index b347fc2fb3c4..ca19efd4e265 100644
--- a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java
@@ -95,7 +95,7 @@ protected Message.Response execute(QueryState state, long queryStartNanoTime, bo
         CQLStatement statement = null;
         try
         {
-            if (options.getPageSize() == 0)
+            if (options.getPageSize().getSize() == 0)
                 throw new ProtocolException("The page size cannot be 0");
 
             if (traceRequest)
@@ -127,8 +127,11 @@ private void traceQuery(QueryState state)
     {
         ImmutableMap.Builder<String, String> builder = ImmutableMap.builder();
         builder.put("query", query);
-        if (options.getPageSize() > 0)
-            builder.put("page_size", Integer.toString(options.getPageSize()));
+        if (options.getPageSize().isDefined())
+        {
+            builder.put("page_size", Integer.toString(options.getPageSize().getSize()));
+            builder.put("page_size_unit", options.getPageSize().getUnit().name());
+        }
         if (options.getConsistency() != null)
             builder.put("consistency_level", options.getConsistency().name());
         if (options.getSerialConsistency(state) != null)
@@ -140,6 +143,6 @@ private void traceQuery(QueryState state)
     @Override
     public String toString()
     {
-        return String.format("QUERY %s [pageSize = %d]", query, options.getPageSize());
+        return String.format("QUERY %s [pageSize = %s]", query, options.getPageSize());
     }
 }
diff --git a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
index 1bae7f05f711..26522391e3cb 100644
--- a/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
+++ b/src/java/org/apache/cassandra/transport/messages/StartupMessage.java
@@ -41,6 +41,7 @@ public class StartupMessage extends Message.Request
     public static final String DRIVER_VERSION = "DRIVER_VERSION";
     public static final String THROW_ON_OVERLOAD = "THROW_ON_OVERLOAD";
     public static final String EMULATE_DBAAS_DEFAULTS = "EMULATE_DBAAS_DEFAULTS";
+    public static final String PAGE_UNIT = "PAGE_UNIT";
 
     public static final Message.Codec<StartupMessage> codec = new Message.Codec<StartupMessage>()
     {
diff --git a/test/burn/org/apache/cassandra/transport/BurnTestUtil.java b/test/burn/org/apache/cassandra/transport/BurnTestUtil.java
index 961045514b8c..49f7cc95ed15 100644
--- a/test/burn/org/apache/cassandra/transport/BurnTestUtil.java
+++ b/test/burn/org/apache/cassandra/transport/BurnTestUtil.java
@@ -27,6 +27,7 @@
 import com.datastax.driver.core.SimpleStatement;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.ColumnSpecification;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.ResultSet;
 import org.apache.cassandra.db.ConsistencyLevel;
@@ -75,7 +76,7 @@ public static QueryMessage generateQueryMessage(int idx, SizeCaps sizeCaps, Prot
         QueryOptions queryOptions = QueryOptions.create(ConsistencyLevel.ONE,
                                                         values,
                                                         true,
-                                                        10,
+                                                        PageSize.inRows(10),
                                                         null,
                                                         null,
                                                         version,
diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
index 32ca5a5636b8..8f98f5e9def8 100644
--- a/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
+++ b/test/distributed/org/apache/cassandra/distributed/impl/Coordinator.java
@@ -30,9 +30,9 @@
 import com.google.common.collect.Iterators;
 
 import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.QueryProcessor;
-import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.cql3.statements.SelectStatement;
 import org.apache.cassandra.distributed.api.ConsistencyLevel;
 import org.apache.cassandra.distributed.api.ICoordinator;
@@ -43,11 +43,8 @@
 import org.apache.cassandra.service.ClientState;
 import org.apache.cassandra.service.ClientWarn;
 import org.apache.cassandra.service.QueryState;
-import org.apache.cassandra.service.pager.QueryPager;
-import org.apache.cassandra.transport.ClientStat;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.tracing.Tracing;
-import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.transport.messages.ResultMessage;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
@@ -105,7 +102,7 @@ private SimpleQueryResult executeInternal(String query, ConsistencyLevel consist
                                              QueryOptions.create(toCassandraCL(consistencyLevel),
                                                                  boundBBValues,
                                                                  false,
-                                                                 Integer.MAX_VALUE,
+                                                                 PageSize.NONE,
                                                                  null,
                                                                  null,
                                                                  ProtocolVersion.CURRENT,
@@ -152,7 +149,7 @@ public QueryResult executeWithPagingWithResult(String query, ConsistencyLevel co
             QueryOptions initialOptions = QueryOptions.create(toCassandraCL(consistencyLevel),
                                                               boundBBValues,
                                                               false,
-                                                              pageSize,
+                                                              PageSize.inRows(pageSize),
                                                               null,
                                                               null,
                                                               ProtocolVersion.CURRENT,
@@ -175,7 +172,7 @@ public boolean hasNext()
                     QueryOptions nextOptions = QueryOptions.create(toCassandraCL(consistencyLevel),
                                                                    boundBBValues,
                                                                    true,
-                                                                   pageSize,
+                                                                   PageSize.inRows(pageSize),
                                                                    rows.result.metadata.getPagingState(),
                                                                    null,
                                                                    ProtocolVersion.CURRENT,
diff --git a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
index 5cf68de14710..184239df2ea1 100644
--- a/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
+++ b/test/unit/org/apache/cassandra/config/DatabaseDescriptorRefTest.java
@@ -93,6 +93,7 @@ public class DatabaseDescriptorRefTest
     "org.apache.cassandra.config.YamlConfigurationLoader$CustomConstructor",
     "org.apache.cassandra.config.TransparentDataEncryptionOptions",
     "org.apache.cassandra.config.StorageAttachedIndexOptions",
+    "org.apache.cassandra.cql3.PageSize",
     "org.apache.cassandra.db.ConsistencyLevel",
     "org.apache.cassandra.db.commitlog.CommitLogSegmentManagerFactory",
     "org.apache.cassandra.db.commitlog.DefaultCommitLogSegmentMgrFactory",
diff --git a/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java b/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java
index 983acfaf97b1..e162d30f5766 100644
--- a/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java
+++ b/test/unit/org/apache/cassandra/cql3/CustomNowInSecondsTest.java
@@ -197,7 +197,7 @@ private static QueryOptions queryOptions(int nowInSeconds)
         return QueryOptions.create(ConsistencyLevel.ONE,
                                    Collections.emptyList(),
                                    false,
-                                   Integer.MAX_VALUE,
+                                   PageSize.NONE,
                                    null,
                                    null,
                                    ProtocolVersion.CURRENT,
diff --git a/test/unit/org/apache/cassandra/cql3/PagingQueryTest.java b/test/unit/org/apache/cassandra/cql3/PagingQueryTest.java
index 8f5f2828b482..8b469a60d2fc 100644
--- a/test/unit/org/apache/cassandra/cql3/PagingQueryTest.java
+++ b/test/unit/org/apache/cassandra/cql3/PagingQueryTest.java
@@ -18,20 +18,61 @@
 
 package org.apache.cassandra.cql3;
 
+import java.nio.Buffer;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
 import java.util.concurrent.ThreadLocalRandom;
+import java.util.function.Supplier;
 
 import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
 
-import com.datastax.driver.core.*;
 import com.datastax.driver.core.ResultSet;
+import com.datastax.driver.core.Row;
+import com.datastax.driver.core.Session;
+import com.datastax.driver.core.SimpleStatement;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.statements.SelectStatement;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.ReadQuery;
+import org.apache.cassandra.db.partitions.PartitionIterator;
+import org.apache.cassandra.exceptions.OperationExecutionException;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.service.pager.AggregationQueryPager;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.Pair;
 
+import static org.apache.commons.lang3.ArrayUtils.EMPTY_OBJECT_ARRAY;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
+@RunWith(Parameterized.class)
 public class PagingQueryTest extends CQLTester
 {
+    final int ROW_SIZE = 49; // size of internal representation
+
+    @Parameterized.Parameters(name = "aggregation_sub_page_size={0}")
+    public static Collection<Object[]> generateParameters()
+    {
+        return Arrays.asList(new Object[]{ PageSize.inBytes(1024) }, new Object[]{ PageSize.NONE });
+    }
+
+    public PagingQueryTest(PageSize subPageSize)
+    {
+        DatabaseDescriptor.setAggregationSubPageSize(subPageSize);
+    }
+
     @Test
     public void pagingOnRegularColumn() throws Throwable
     {
@@ -59,53 +100,450 @@ public void pagingOnRegularColumn() throws Throwable
 
         flush();
 
-        try (Session session = sessionNet())
+        Session session = sessionNet();
+        SimpleStatement stmt = new SimpleStatement("SELECT c1, c2, v1, v2 FROM " + KEYSPACE + '.' + currentTable() + " WHERE k1 = 1");
+        stmt.setFetchSize(3);
+        ResultSet rs = session.execute(stmt);
+        Iterator<Row> iter = rs.iterator();
+        for (int c1 = 0; c1 < 100; c1++)
+        {
+            for (int c2 = 0; c2 < 100; c2++)
+            {
+                assertTrue(iter.hasNext());
+                Row row = iter.next();
+                String msg = "On " + c1 + ',' + c2;
+                assertEquals(msg, c1, row.getInt(0));
+                assertEquals(msg, c2, row.getInt(1));
+                assertEquals(msg, Integer.toString(c1), row.getString(2));
+                assertEquals(msg, Integer.toString(c2), row.getString(3));
+            }
+        }
+        assertFalse(iter.hasNext());
+
+        for (int c1 = 0; c1 < 100; c1++)
         {
-            SimpleStatement stmt = new SimpleStatement("SELECT c1, c2, v1, v2 FROM " + KEYSPACE + '.' + currentTable() + " WHERE k1 = 1");
+            stmt = new SimpleStatement("SELECT c1, c2, v1, v2 FROM " + KEYSPACE + '.' + currentTable() + " WHERE k1 = 1 AND c1 = ?", c1);
             stmt.setFetchSize(3);
-            ResultSet rs = session.execute(stmt);
-            Iterator<Row> iter = rs.iterator();
-            for (int c1 = 0; c1 < 100; c1++)
+            rs = session.execute(stmt);
+            iter = rs.iterator();
+            for (int c2 = 0; c2 < 100; c2++)
+            {
+                assertTrue(iter.hasNext());
+                Row row = iter.next();
+                String msg = "Within " + c1 + " on " + c2;
+                assertEquals(msg, c1, row.getInt(0));
+                assertEquals(msg, c2, row.getInt(1));
+                assertEquals(msg, Integer.toString(c1), row.getString(2));
+                assertEquals(msg, Integer.toString(c2), row.getString(3));
+            }
+            assertFalse(iter.hasNext());
+        }
+    }
+
+    // new paging-in-bytes tests
+
+    /**
+     * Returns a lambda that creates a pager for the query
+     */
+    private Supplier<Pair<QueryPager, SelectStatement>> getPager(String query, Object... args)
+    {
+        return () -> {
+            QueryHandler.Prepared prepared = QueryProcessor.prepareInternal(String.format(query, args));
+            SelectStatement select = (SelectStatement) prepared.statement;
+            ReadQuery readQuery = select.getQuery(QueryState.forInternalCalls(), QueryProcessor.makeInternalOptions(prepared.statement, EMPTY_OBJECT_ARRAY), FBUtilities.nowInSeconds());
+            QueryPager pager = select.getPager(readQuery, QueryOptions.forInternalCalls(ConsistencyLevel.LOCAL_ONE, Collections.emptyList()));
+            return Pair.create(pager, select);
+        };
+    }
+
+    /**
+     * Invoke the test by fetching a single page with {@code PageSize.NONE} and check for the expected number of rows
+     */
+    private void assertResults(Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier, int expectedCount)
+    {
+        Pair<QueryPager, SelectStatement> pagerAndStmt = pagerSupplier.get();
+        QueryPager pager = pagerAndStmt.left;
+        SelectStatement select = pagerAndStmt.right;
+
+        List<List<ByteBuffer>> rows;
+
+        int nowInSec = FBUtilities.nowInSeconds();
+        assertThat(pager.isExhausted()).isFalse();
+        try (ReadExecutionController executionController = pager.executionController();
+             PartitionIterator iter = pager.fetchPageInternal(PageSize.NONE, executionController))
+        {
+            rows = select.process(iter, nowInSec).rows;
+        }
+
+        assertThat(rows.size()).isEqualTo(expectedCount);
+        assertThat(pager.isExhausted()).isTrue();
+    }
+
+    /**
+     * Invoke the tests with the provided page size. Firstly we just request the page size in rows as provided by the parameter.
+     * In the second test we convert (by multiplying) the requested number of rows on page to the number of bytes (assuming certain row size).
+     */
+    private void assertResults(Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier, int requestedPageSizeInRows, int expectedCountOnFirstPage, int expectedCount)
+    {
+        assertResults(pagerSupplier, PageSize.inRows(requestedPageSizeInRows), expectedCountOnFirstPage, expectedCount);
+        assertResults(pagerSupplier, PageSize.inBytes(requestedPageSizeInRows * ROW_SIZE), expectedCountOnFirstPage, expectedCount);
+    }
+
+    /**
+     * Invoke the tests with the provided page size: first requested in rows, then converted (by multiplying by the assumed row size) to bytes.
+     * Additionally verifies that the first column of the first row of the last page fetched in the row-based run, read as a long, equals {@code expectedValue}.
+     */
+    private void assertResults(Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier, int requestedPageSizeInRows, int expectedCountOnFirstPage, int expectedCount, int expectedValue)
+    {
+        List<List<ByteBuffer>> rows = assertResults(pagerSupplier, PageSize.inRows(requestedPageSizeInRows), expectedCountOnFirstPage, expectedCount);
+        assertThat(ByteBufferUtil.toLong(rows.get(0).get(0))).isEqualTo((long) expectedValue);
+        assertResults(pagerSupplier, PageSize.inBytes(requestedPageSizeInRows * ROW_SIZE), expectedCountOnFirstPage, expectedCount);
+    }
+
+    /**
+     * Pages through the whole result with the provided page size and expects the exact number of rows on the first page and in total (all pages). Returns the rows of the last fetched page, or null when an aggregation pager rejects paging in bytes with the expected OperationExecutionException; any other OperationExecutionException is rethrown.
+     */
+    private List<List<ByteBuffer>> assertResults(Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier, PageSize requestedPageSize, int expectedCountOnFirstPage, int expectedCount)
+    {
+        Pair<QueryPager, SelectStatement> pagerAndStmt = pagerSupplier.get();
+        QueryPager pager = pagerAndStmt.left;
+        SelectStatement select = pagerAndStmt.right;
+
+        List<List<ByteBuffer>> rows = null;
+
+        int nowInSec = FBUtilities.nowInSeconds();
+
+        int countOnFirstPage = -1;
+        int count = 0;
+
+        logger.info("Assertion on query {} with requested page size {} - expected count on first page = {}, expected count total = {}:", select.toString(), requestedPageSize, expectedCountOnFirstPage, expectedCount);
+
+        try
+        {
+            while (!pager.isExhausted())
             {
-                for (int c2 = 0; c2 < 100; c2++)
+                try (ReadExecutionController executionController = pager.executionController();
+                     PartitionIterator iter = pager.fetchPageInternal(requestedPageSize, executionController))
                 {
-                    assertTrue(iter.hasNext());
-                    Row row = iter.next();
-                    String msg = "On " + c1 + ',' + c2;
-                    assertEquals(msg, c1, row.getInt(0));
-                    assertEquals(msg, c2, row.getInt(1));
-                    assertEquals(msg, Integer.toString(c1), row.getString(2));
-                    assertEquals(msg, Integer.toString(c2), row.getString(3));
+                    rows = select.process(iter, nowInSec).rows;
+                    logger.info("Got page of {} rows with size: {}", rows.size(), rows.stream().mapToInt(cols -> cols.stream().mapToInt(Buffer::remaining).sum()).sum());
                 }
+
+                if (countOnFirstPage < 0)
+                    countOnFirstPage = rows.size();
+                count += rows.size();
             }
-            assertFalse(iter.hasNext());
 
-            for (int c1 = 0; c1 < 100; c1++)
+            assertThat(countOnFirstPage).isEqualTo(expectedCountOnFirstPage);
+            assertThat(count).isEqualTo(expectedCount);
+            assertThat(pager.isExhausted()).isTrue();
+        }
+        catch (OperationExecutionException ex)
+        {
+            if (pager instanceof AggregationQueryPager && requestedPageSize.getUnit() == PageSize.PageUnit.BYTES)
+                return null;
+            throw ex; // only aggregation with paging in bytes is expected to fail this way; never mask other failures
+        }
+        if (pager instanceof AggregationQueryPager && requestedPageSize.getUnit() == PageSize.PageUnit.BYTES)
+            fail("Expected " + OperationExecutionException.class.getSimpleName() + " to be thrown when paging is in bytes");
+
+        return rows;
+    }
+
+    private void testPagingCases(String query, int selPartitions, int selClusterings, int genPartitions, int genClusterings) throws Throwable
+    {
+        testPagingCases(query, selPartitions, selClusterings, genPartitions, genClusterings, 1);
+    }
+
+
+    private void testPagingCases(String query, int selPartitions, int selClusterings, int genPartitions, int genClusterings, int genClusterings2) throws Throwable
+    {
+        String table = generateData(genPartitions, genClusterings, genClusterings2);
+
+        flush(true);
+        Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier;
+        query = String.format(query, KEYSPACE + '.' + table);
+        int selected = selPartitions * selClusterings;
+
+        // when there is a page size
+        pagerSupplier = getPager("%s ALLOW FILTERING", query);
+        assertResults(pagerSupplier, selected / 3, selected / 3, selected);
+
+        // when there is a query limit
+        pagerSupplier = getPager("%s LIMIT %d ALLOW FILTERING", query, selected / 3);
+        assertResults(pagerSupplier, selected / 3);
+
+
+        // when there is a per partition limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d ALLOW FILTERING", query, selClusterings / 2);
+        assertResults(pagerSupplier, selPartitions * (selClusterings / 2));
+
+
+        // when there is a page size and a query limit:
+
+        // - where query limit is == page size
+        pagerSupplier = getPager("%s LIMIT %d ALLOW FILTERING", query, selected / 2);
+        assertResults(pagerSupplier, selected / 2, selected / 2, selected / 2);
+
+        // - where query limit is < page size
+        pagerSupplier = getPager("%s LIMIT %d ALLOW FILTERING", query, selected / 3);
+        assertResults(pagerSupplier, selected / 2, selected / 3, selected / 3);
+
+        // - where query limit is > page size
+        pagerSupplier = getPager("%s LIMIT %d ALLOW FILTERING", query, selected / 2);
+        assertResults(pagerSupplier, selected / 3, selected / 3, selected / 2);
+
+
+        // when there is a per partition limit and a query limit:
+
+        // - where query limit is < per partition limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 2, selClusterings / 3);
+        assertResults(pagerSupplier, selClusterings / 3);
+
+        // - where query limit is > per partition limit (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 3, selClusterings / 2);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 3);
+        else
+            assertResults(pagerSupplier, selClusterings / 2);
+
+        // - where query limit is == per partition limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 2, selClusterings / 2);
+        assertResults(pagerSupplier, selClusterings / 2);
+
+        // when there is a page size and a per partition limit,
+
+        // - where page size is < per partition limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d ALLOW FILTERING", query, selClusterings / 2);
+        assertResults(pagerSupplier, selClusterings / 3, selClusterings / 3, selPartitions * (selClusterings / 2));
+
+        // - where page size is == per partition limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d ALLOW FILTERING", query, selClusterings / 2);
+        assertResults(pagerSupplier, selClusterings / 2, selClusterings / 2, selPartitions * (selClusterings / 2));
+
+        // - where page size is > per partition limit (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d ALLOW FILTERING", query, selClusterings / 3);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 3, selClusterings / 3);
+        else
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 2, selPartitions * (selClusterings / 3));
+
+
+        // when there is a page size, a per partition limit and a query limit
+
+        // - where per partition limit == query limit == page size
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 2, selClusterings / 2);
+        assertResults(pagerSupplier, selClusterings / 2, selClusterings / 2, selClusterings / 2);
+
+        // - where per partition limit > query limit > page size
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 2, selClusterings / 3);
+        assertResults(pagerSupplier, selClusterings / 4, selClusterings / 4, selClusterings / 3);
+
+        // - where per partition limit > page size > query limit
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 2, selClusterings / 4);
+        assertResults(pagerSupplier, selClusterings / 3, selClusterings / 4, selClusterings / 4);
+
+        // - where query limit > per partition limit > page size (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 3, selClusterings / 2);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 4, selClusterings / 4, selClusterings / 3);
+        else
+            assertResults(pagerSupplier, selClusterings / 4, selClusterings / 4, selClusterings / 2);
+
+        // - where per query limit > page size > per partition limit (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 4, selClusterings / 2);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 3, selClusterings / 4, selClusterings / 4);
+        else
+            assertResults(pagerSupplier, selClusterings / 3, selClusterings / 3, selClusterings / 2);
+
+        // - where page size > per partition limit > query limit (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 3, selClusterings / 4);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 4, selClusterings / 4);
+        else
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 4, selClusterings / 4);
+
+        // - where page size > query limit > per partition limit (case for single partition and multiple partitions)
+        pagerSupplier = getPager("%s PER PARTITION LIMIT %d LIMIT %d ALLOW FILTERING", query, selClusterings / 4, selClusterings / 3);
+        if (selPartitions == 1)
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 4, selClusterings / 4);
+        else
+            assertResults(pagerSupplier, selClusterings / 2, selClusterings / 3, selClusterings / 3);
+    }
+
+
+    private void testPagingCasesWithAggregateEverything(String query, int genPartitions, int genClusterings, int genClusterings2, int expectedResult) throws Throwable
+    {
+        String table = generateData(genPartitions, genClusterings, genClusterings2);
+
+        flush(true);
+        Supplier<Pair<QueryPager, SelectStatement>> pagerSupplier;
+        query = String.format(query, KEYSPACE + '.' + table);
+
+        // when there is a page size
+        pagerSupplier = getPager("%s ALLOW FILTERING", query);
+        assertResults(pagerSupplier, 1, 1, 1, expectedResult);
+    }
+
+    private String generateData(int genPartitions, int genClusterings, int genClusterings2) throws Throwable
+    {
+        String table = createTable("CREATE TABLE %s (k INT, c INT, c2 INT, v INT, PRIMARY KEY (k, c, c2))");
+        for (int k = 0; k < genPartitions; k++)
+        {
+            for (int c = 0; c < genClusterings; c++)
             {
-                stmt = new SimpleStatement("SELECT c1, c2, v1, v2 FROM " + KEYSPACE + '.' + currentTable() + " WHERE k1 = 1 AND c1 = ?", c1);
-                stmt.setFetchSize(3);
-                rs = session.execute(stmt);
-                iter = rs.iterator();
-                for (int c2 = 0; c2 < 100; c2++)
+                for (int c2 = 0; c2 < genClusterings2; c2++)
                 {
-                    assertTrue(iter.hasNext());
-                    Row row = iter.next();
-                    String msg = "Within " + c1 + " on " + c2;
-                    assertEquals(msg, c1, row.getInt(0));
-                    assertEquals(msg, c2, row.getInt(1));
-                    assertEquals(msg, Integer.toString(c1), row.getString(2));
-                    assertEquals(msg, Integer.toString(c2), row.getString(3));
+                    execute("INSERT INTO %s (k, c, c2, v) VALUES (?, ?, ?, ?)", k, c, c2, 1);
+                    if ((k * genClusterings + c) % (3 * (genClusterings + genPartitions) / 2) == 0)
+                        flush(true);
                 }
-                assertFalse(iter.hasNext());
             }
         }
+        return table;
+    }
+
+
+    @Test
+    public void testLimitsOnFullScanQuery() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s", 10, 10, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnSliceSelection() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE c > 2 AND c <= 7", 10, 5, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnClusteringsSelection() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE c IN (2, 4, 7, 8)", 10, 4, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnSliceAndKeyRangeSelection() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE c > 2 AND c <= 7 AND TOKEN(k) > TOKEN(0)", 6, 5, 10, 10);
+    }
+
+    @Test
+    public void testLimitsInSinglePartition() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE k = 5", 1, 100, 10, 100);
+    }
+
+    @Test
+    public void testLimitsInMultiplePartitions() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE k IN (5, 7, 9)", 3, 100, 10, 100);
+    }
+
+    @Test
+    public void testLimitsOnSliceInSinglePartition() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE c > 20 AND c <= 70 AND k = 5", 1, 50, 10, 100);
+    }
+
+    @Test
+    public void testLimitsOnClusteringsInSinglePartitionSelection() throws Throwable
+    {
+        testPagingCases("SELECT * FROM %s WHERE c IN (2, 4, 7, 8) AND k = 5", 1, 4, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnFullScanQueryWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s GROUP BY k, c", 10, 10, 10, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnSliceSelectionWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE c > 2 AND c <= 7 GROUP BY k, c", 10, 5, 10, 10, 10);
     }
 
+    @Test
+    public void testLimitsOnClusteringsSelectionWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE c IN (2, 4, 7, 8) GROUP BY k, c", 10, 4, 10, 10, 10);
+    }
+
+    @Test
+    public void testLimitsOnSliceAndKeyRangeSelectionWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE c > 2 AND c <= 7 AND TOKEN(k) > TOKEN(0) GROUP BY k, c", 6, 5, 10, 10, 10);
+    }
+
+    @Test
+    public void testLimitsInSinglePartitionWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE k = 5 GROUP BY k, c", 1, 100, 10, 100, 10);
+    }
+
+    @Test
+    public void testLimitsInMultiplePartitionsWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE k IN (5, 7, 9) GROUP BY k, c", 3, 100, 10, 100, 10);
+    }
+
+    @Test
+    public void testLimitsOnSliceInSinglePartitionWithGrouping() throws Throwable
+    {
+        testPagingCases("SELECT k, c, SUM(v) FROM %s WHERE c > 20 AND c <= 70 AND k = 5 GROUP BY k, c", 1, 50, 10, 100, 10);
+    }
+
+
+    @Test
+    public void testLimitsOnFullScanQueryWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s", 3, 3, 3, 27);
+    }
+
+    @Test
+    public void testLimitsOnSliceSelectionWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE c > 2 AND c <= 7", 10, 10, 10, 500);
+    }
+
+    @Test
+    public void testLimitsOnClusteringsSelectionWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE c IN (2, 4, 7, 8)", 10, 10, 10, 400);
+    }
+
+    @Test
+    public void testLimitsOnSliceAndKeyRangeSelectionWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE c > 2 AND c <= 7 AND TOKEN(k) > TOKEN(0)", 10, 10, 10, 300);
+    }
+
+    @Test
+    public void testLimitsInSinglePartitionWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE k = 5", 10, 10, 10, 100);
+    }
+
+    @Test
+    public void testLimitsInMultiplePartitionsWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE k IN (5, 7, 9)", 10, 10, 10, 300);
+    }
+
+    @Test
+    public void testLimitsOnSliceInSinglePartitionWithAggregateEverything() throws Throwable
+    {
+        testPagingCasesWithAggregateEverything("SELECT COUNT(*) FROM %s WHERE c > 2 AND c <= 7 AND k = 5", 10, 10, 10, 50);
+    }
+
+
     private static String someText()
     {
         char[] arr = new char[1024];
         for (int i = 0; i < arr.length; i++)
-            arr[i] = (char)(32 + ThreadLocalRandom.current().nextInt(95));
+            arr[i] = (char) (32 + ThreadLocalRandom.current().nextInt(95));
         return new String(arr);
     }
 }
diff --git a/test/unit/org/apache/cassandra/cql3/PagingTest.java b/test/unit/org/apache/cassandra/cql3/PagingTest.java
index 50bba0ed32c8..5af813ee8a07 100644
--- a/test/unit/org/apache/cassandra/cql3/PagingTest.java
+++ b/test/unit/org/apache/cassandra/cql3/PagingTest.java
@@ -17,9 +17,7 @@
  */
 package org.apache.cassandra.cql3;
 
-import java.net.InetAddress;
 import java.util.Iterator;
-import java.util.List;
 
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
@@ -32,10 +30,12 @@
 import com.datastax.driver.core.SimpleStatement;
 import com.datastax.driver.core.Statement;
 import org.apache.cassandra.config.DatabaseDescriptor;
-
-import org.apache.cassandra.dht.Murmur3Partitioner;
 import org.apache.cassandra.dht.Murmur3Partitioner.LongToken;
-import org.apache.cassandra.locator.*;
+import org.apache.cassandra.locator.AbstractEndpointSnitch;
+import org.apache.cassandra.locator.IEndpointSnitch;
+import org.apache.cassandra.locator.InetAddressAndPort;
+import org.apache.cassandra.locator.Replica;
+import org.apache.cassandra.locator.ReplicaCollection;
 import org.apache.cassandra.service.EmbeddedCassandraService;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.utils.FBUtilities;
diff --git a/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java b/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
index 6215ca637de9..fe15f5ba1c80 100644
--- a/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
+++ b/test/unit/org/apache/cassandra/db/AbstractReadCommandBuilder.java
@@ -23,6 +23,7 @@
 
 import com.google.common.collect.Sets;
 
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.cql3.ColumnIdentifier;
@@ -39,7 +40,8 @@ public abstract class AbstractReadCommandBuilder
     protected int nowInSeconds;
 
     private int cqlLimit = -1;
-    private int pagingLimit = -1;
+    private PageSize pageSize = PageSize.NONE;
+    private int perPartitionLimit = -1;
     protected boolean reversed = false;
 
     protected Set<ColumnIdentifier> columns;
@@ -114,9 +116,15 @@ public AbstractReadCommandBuilder withLimit(int newLimit)
         return this;
     }
 
-    public AbstractReadCommandBuilder withPagingLimit(int newLimit)
+    public AbstractReadCommandBuilder withPageSize(PageSize pageSize)
     {
-        this.pagingLimit = newLimit;
+        this.pageSize = pageSize;
+        return this;
+    }
+
+    public AbstractReadCommandBuilder withPerPartitionLimit(int perPartitionLimit)
+    {
+        this.perPartitionLimit = perPartitionLimit;
         return this;
     }
 
@@ -212,9 +220,10 @@ protected ClusteringIndexFilter makeFilter()
 
     protected DataLimits makeLimits()
     {
-        DataLimits limits = cqlLimit < 0 ? DataLimits.NONE : DataLimits.cqlLimits(cqlLimit);
-        if (pagingLimit >= 0)
-            limits = limits.forPaging(pagingLimit);
+        DataLimits limits = DataLimits.cqlLimits(cqlLimit < 0 ? DataLimits.NO_LIMIT : cqlLimit,
+                                                 perPartitionLimit < 0 ? DataLimits.NO_LIMIT : perPartitionLimit);
+        if (pageSize.isDefined())
+            limits = limits.forPaging(pageSize);
         return limits;
     }
 
diff --git a/test/unit/org/apache/cassandra/db/ReadResponseTest.java b/test/unit/org/apache/cassandra/db/ReadResponseTest.java
index 6e1a804e5fb6..890e33f2c9e8 100644
--- a/test/unit/org/apache/cassandra/db/ReadResponseTest.java
+++ b/test/unit/org/apache/cassandra/db/ReadResponseTest.java
@@ -23,8 +23,10 @@
 import java.util.Random;
 
 import org.junit.Before;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.filter.ColumnFilter;
 import org.apache.cassandra.db.filter.DataLimits;
 import org.apache.cassandra.db.filter.RowFilter;
@@ -49,6 +51,12 @@ public class ReadResponseTest
     private final Random random = new Random();
     private TableMetadata metadata;
 
+    @BeforeClass
+    public static void setupClass()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
     @Before
     public void setup()
     {
diff --git a/test/unit/org/apache/cassandra/db/filter/DataLimitsTest.java b/test/unit/org/apache/cassandra/db/filter/DataLimitsTest.java
new file mode 100644
index 000000000000..e88cba45f92f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/filter/DataLimitsTest.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.filter;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
+import org.apache.cassandra.db.aggregation.AggregationSpecification;
+import org.apache.cassandra.io.util.DataInputBuffer;
+import org.apache.cassandra.io.util.DataOutputBuffer;
+import org.apache.cassandra.net.MessagingService;
+import org.apache.cassandra.utils.ByteBufferUtil;
+
+import static org.apache.cassandra.db.filter.DataLimits.NO_LIMIT;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.fail;
+
+public class DataLimitsTest
+{
+    private final static Logger logger = LoggerFactory.getLogger(DataLimitsTest.class);
+
+    ByteBuffer lastReturnedKey = ByteBufferUtil.bytes("lastReturnedKey");
+
+    DataLimits cqlLimits = DataLimits.cqlLimits(19, 17);
+    DataLimits cqlLimitsForPagingInRows = cqlLimits.forPaging(PageSize.inRows(13));
+    DataLimits cqlLimitsForPagingInBytes = cqlLimits.forPaging(PageSize.inBytes(13));
+    DataLimits cqlLimitsForPagingInRowsWithLastRow = cqlLimits.forPaging(PageSize.inRows(13), lastReturnedKey, 5);
+    DataLimits cqlLimitsForPagingInBytesWithLastRow = cqlLimits.forPaging(PageSize.inBytes(13), lastReturnedKey, 5);
+    DataLimits groupByLimits = DataLimits.groupByLimits(19, 17, NO_LIMIT, NO_LIMIT, AggregationSpecification.AGGREGATE_EVERYTHING);
+    DataLimits groupByLimitsForPagingInRows = groupByLimits.forPaging(PageSize.inRows(13));
+    DataLimits groupByLimitsForPagingInBytes = groupByLimits.forPaging(PageSize.inBytes(13));
+    DataLimits groupByLimitsForPagingInRowsWithLastRow = groupByLimits.forPaging(PageSize.inRows(13), lastReturnedKey, 5);
+    DataLimits groupByLimitsForPagingInBytesWithLastRow = groupByLimits.forPaging(PageSize.inBytes(13), lastReturnedKey, 5);
+
+    @BeforeClass
+    public static void initClass()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
+
+    @Test
+    public void serializationTest()
+    {
+        for (MessagingService.Version version : MessagingService.Version.values())
+        {
+            checkSerialization(version, cqlLimits, "cql limits");
+            checkSerialization(version, cqlLimitsForPagingInRows, "cql limits for paging in rows");
+            checkSerialization(version, cqlLimitsForPagingInBytes, "cql limits for paging in bytes");
+            checkSerialization(version, cqlLimitsForPagingInRowsWithLastRow, "cql limits for paging in rows with last row");
+            checkSerialization(version, cqlLimitsForPagingInBytesWithLastRow, "cql limits for paging in bytes with last row");
+            checkSerialization(version, groupByLimits, "group by limits");
+            checkSerialization(version, groupByLimitsForPagingInRows, "group by limits for paging in rows");
+            checkSerialization(version, groupByLimitsForPagingInBytes, "group by limits for paging in bytes");
+            checkSerialization(version, groupByLimitsForPagingInRowsWithLastRow, "group by limits for paging in rows with last row");
+            checkSerialization(version, groupByLimitsForPagingInBytesWithLastRow, "group by limits for paging in bytes with last row");
+        }
+    }
+
+    @Test
+    public void toStringTest()
+    {
+        String lastRetKeyStr = String.format("lastReturnedKey=%s", ByteBufferUtil.bytesToHex(lastReturnedKey));
+        String lastRetKeyRemainingStr = "lastReturnedKeyRemaining=5";
+
+        assertThat(cqlLimits.toString()).contains("ROWS LIMIT 19").contains("PER PARTITION LIMIT 17").doesNotContain("BYTES LIMIT");
+        assertThat(cqlLimitsForPagingInRows.toString()).contains("ROWS LIMIT 13").contains("PER PARTITION LIMIT 17").doesNotContain("BYTES LIMIT");
+        assertThat(cqlLimitsForPagingInBytes.toString()).contains("BYTES LIMIT 13").contains("ROWS LIMIT 19").contains("PER PARTITION LIMIT 17");
+        assertThat(cqlLimitsForPagingInRowsWithLastRow.toString()).contains("ROWS LIMIT 13").contains("PER PARTITION LIMIT 17").doesNotContain("BYTES LIMIT").contains(lastRetKeyStr).contains(lastRetKeyRemainingStr);
+        assertThat(cqlLimitsForPagingInBytesWithLastRow.toString()).contains("BYTES LIMIT 13").contains("ROWS LIMIT 19").contains("PER PARTITION LIMIT 17").contains(lastRetKeyStr).contains(lastRetKeyRemainingStr);
+
+        assertThat(groupByLimits.toString()).contains("GROUP LIMIT 19").contains("GROUP PER PARTITION LIMIT 17").doesNotContain("ROWS LIMIT").doesNotContain("BYTES LIMIT");
+        assertThat(groupByLimitsForPagingInRows.toString()).contains("GROUP LIMIT 19").contains("GROUP PER PARTITION LIMIT 17").contains("ROWS LIMIT 13").doesNotContain("BYTES LIMIT");
+        assertThat(groupByLimitsForPagingInBytes.toString()).contains("GROUP LIMIT 19").contains("GROUP PER PARTITION LIMIT 17").doesNotContain("ROWS LIMIT").contains("BYTES LIMIT 13");
+        assertThat(groupByLimitsForPagingInRowsWithLastRow.toString()).contains("GROUP LIMIT 19").contains("GROUP PER PARTITION LIMIT 17").contains("ROWS LIMIT 13").doesNotContain("BYTES LIMIT").contains(lastRetKeyStr).contains(lastRetKeyRemainingStr);
+        assertThat(groupByLimitsForPagingInBytesWithLastRow.toString()).contains("GROUP LIMIT 19").contains("GROUP PER PARTITION LIMIT 17").doesNotContain("ROWS LIMIT").contains("BYTES LIMIT 13").contains(lastRetKeyStr).contains(lastRetKeyRemainingStr);
+    }
+
+    private void checkSerialization(MessagingService.Version version, DataLimits limits, String name)
+    { // round-trips limits through DataLimits.serializer at the given version and compares the observable properties
+        String msg = String.format("serialization of %s for version %s", name, version);
+        int size = (int) DataLimits.serializer.serializedSize(limits, version.value, null);
+        try (DataOutputBuffer out = new DataOutputBuffer(2 * size))
+        {
+            DataLimits.serializer.serialize(limits, out, version.value, null);
+            out.flush();
+            assertThat(out.getLength()).describedAs(msg).isEqualTo(size); // written length must equal serializedSize()
+            try (DataInputBuffer in = new DataInputBuffer(out.getData()))
+            {
+                DataLimits deserializedLimits = DataLimits.serializer.deserialize(in, version.value, null);
+                assertThat(deserializedLimits.count()).describedAs(msg).isEqualTo(limits.count());
+
+                if (version.value >= MessagingService.VERSION_41) // bytes limit is only expected back from VERSION_41 on
+                    assertThat(deserializedLimits.bytes()).describedAs(msg).isEqualTo(limits.bytes());
+                else
+                    assertThat(deserializedLimits.bytes()).describedAs(msg).isEqualTo(NO_LIMIT);
+
+                assertThat(deserializedLimits.rows()).describedAs(msg).isEqualTo(limits.rows());
+                assertThat(deserializedLimits.perPartitionCount()).describedAs(msg).isEqualTo(limits.perPartitionCount());
+                assertThat(deserializedLimits.isDistinct()).describedAs(msg).isEqualTo(limits.isDistinct());
+                assertThat(deserializedLimits.isUnlimited()).describedAs(msg).isEqualTo(limits.isUnlimited());
+                assertThat(deserializedLimits.kind()).describedAs(msg).isEqualTo(limits.kind());
+                assertThat(deserializedLimits.isGroupByLimit()).describedAs(msg).isEqualTo(limits.isGroupByLimit());
+            }
+            catch (IOException e)
+            {
+                logger.error("Failed to deserialize: " + msg, e);
+                fail(msg, e); // keep the IOException as the cause of the test failure
+            }
+        }
+        catch (IOException e)
+        {
+            logger.error("Failed to serialize: " + msg, e);
+            fail(msg, e); // keep the IOException as the cause of the test failure
+        }
+    }
+}
\ No newline at end of file
diff --git a/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java
index 6baccd6af1d1..f0aff8c98c52 100644
--- a/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java
+++ b/test/unit/org/apache/cassandra/db/lifecycle/LogReplicationSetTest.java
@@ -19,10 +19,10 @@
 import java.io.File;
 import java.util.ArrayList;
 
-import org.junit.Assert;
 import org.junit.Test;
 
 import org.apache.cassandra.io.util.FileUtils;
+import org.assertj.core.api.Assertions;
 import org.mockito.Mockito;
 
 public class LogReplicationSetTest
@@ -37,9 +37,8 @@ public void shouldThrowIfAppendFailedToAllReplicas() throws Throwable
         replicas.addReplicas(spyFiles);
         spyFiles.forEach(f -> Mockito.when(f.exists()).thenThrow(new RuntimeException()));
 
-        Assert.assertThrows(RuntimeException.class,
-                            () ->
-                            replicas.append(LogRecord.makeAbort(System.currentTimeMillis())));
+        Assertions.assertThatExceptionOfType(RuntimeException.class)
+                  .isThrownBy(() -> replicas.append(LogRecord.makeAbort(System.currentTimeMillis())));
     }
 
     @Test
diff --git a/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java
index 9a40823db983..e2f85e7e0063 100644
--- a/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java
+++ b/test/unit/org/apache/cassandra/db/rows/UnfilteredRowIteratorsTest.java
@@ -21,10 +21,12 @@
 import java.util.Arrays;
 import java.util.Iterator;
 
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import org.junit.Assert;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.db.BufferDecoratedKey;
 import org.apache.cassandra.db.DecoratedKey;
 import org.apache.cassandra.db.DeletionTime;
@@ -56,6 +58,11 @@ public class UnfilteredRowIteratorsTest
         v2Metadata = metadata.regularAndStaticColumns().columns(false).getSimple(1);
     }
 
+    @BeforeClass
+    public static void setupClass()
+    {
+        DatabaseDescriptor.daemonInitialization();
+    }
 
     @Test
     public void concatTest()
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
index d6678aafca00..5dddac3e1589 100644
--- a/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailConsistencyTest.java
@@ -31,6 +31,7 @@
 
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLStatement;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.cql3.QueryOptions;
 import org.apache.cassandra.cql3.QueryProcessor;
 import org.apache.cassandra.db.ConsistencyLevel;
@@ -88,7 +89,7 @@ private QueryOptions queryOptions(ConsistencyLevel cl, ConsistencyLevel serialCl
         return QueryOptions.create(cl,
                                    Collections.emptyList(),
                                    false,
-                                   1,
+                                   PageSize.inRows(1),
                                    null,
                                    serialCl,
                                    ProtocolVersion.CURRENT,
diff --git a/test/unit/org/apache/cassandra/guardrails/GuardrailPagingTest.java b/test/unit/org/apache/cassandra/guardrails/GuardrailPagingTest.java
new file mode 100644
index 000000000000..505f6aab636f
--- /dev/null
+++ b/test/unit/org/apache/cassandra/guardrails/GuardrailPagingTest.java
@@ -0,0 +1,172 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.guardrails;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.auth.AuthenticatedUser;
+import org.apache.cassandra.config.DatabaseDescriptor;
+import org.apache.cassandra.cql3.PageSize;
+import org.apache.cassandra.cql3.QueryHandler;
+import org.apache.cassandra.cql3.QueryOptions;
+import org.apache.cassandra.cql3.QueryProcessor;
+import org.apache.cassandra.db.ConsistencyLevel;
+import org.apache.cassandra.exceptions.InvalidRequestException;
+import org.apache.cassandra.service.ClientState;
+import org.apache.cassandra.service.QueryState;
+import org.apache.cassandra.transport.ProtocolVersion;
+import org.apache.cassandra.transport.messages.ResultMessage;
+import org.assertj.core.api.Assertions;
+
+import static org.junit.Assert.assertTrue;
+
+@RunWith(Parameterized.class)
+public class GuardrailPagingTest extends GuardrailTester
+{
+    private static final String PARTITION_RANGE_QUERY = "SELECT * FROM %s.%s";
+    private static final String SINGLE_PARTITION_QUERY = "SELECT * FROM %s.%s WHERE k = 5";
+    private static final String MULTI_PARTITION_QUERY = "SELECT * FROM %s.%s WHERE k IN (1, 3, 5)";
+
+    private static int defaultPageSizeThreshold;
+    private static final int pageSizeThresholdInKB = 5;
+
+    private static final int partitionCount = 10;
+    private static final int rowsPerPartition = 100;
+
+    @Parameterized.Parameters(name = "q={0},size={1}")
+    public static Collection<Object[]> parameters()
+    {
+        return Arrays.asList(new Object[]{ PARTITION_RANGE_QUERY, partitionCount * rowsPerPartition },
+                             new Object[]{ SINGLE_PARTITION_QUERY, rowsPerPartition },
+                             new Object[]{ MULTI_PARTITION_QUERY, 3 * rowsPerPartition });
+    }
+
+    @Parameterized.Parameter(0)
+    public String query;
+
+    @Parameterized.Parameter(1)
+    public int limit;
+
+    @BeforeClass
+    public static void setup()
+    {
+        defaultPageSizeThreshold = DatabaseDescriptor.getGuardrailsConfig().page_size_failure_threshold_in_kb;
+    }
+
+    @After
+    public void tearDown()
+    {
+        DatabaseDescriptor.getGuardrailsConfig().page_size_failure_threshold_in_kb = defaultPageSizeThreshold;
+    }
+
+    @Before
+    public void setUp() throws Throwable
+    {
+        DatabaseDescriptor.getGuardrailsConfig().page_size_failure_threshold_in_kb = pageSizeThresholdInKB;
+
+        createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))");
+
+        for (int i = 0; i < partitionCount; i++)
+            for (int j = 0; j < rowsPerPartition; j++)
+                execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", i, j, "long long test message bla bla bla bla bla bla bla bla bla bla bla");
+    }
+
+    @Test
+    public void testConfigValidation()
+    {
+        testValidationOfStrictlyPositiveProperty((c, v) -> c.page_size_failure_threshold_in_kb = v.intValue(),
+                                                 "page_size_failure_threshold_in_kb");
+    }
+
+    private ResultMessage.Rows selectWithPaging(String query, PageSize pageSize, ClientState clientState) throws InvalidRequestException
+    {
+        QueryOptions options = QueryOptions.create(ConsistencyLevel.LOCAL_QUORUM,
+                                                   Collections.emptyList(),
+                                                   false,
+                                                   pageSize,
+                                                   null,
+                                                   ConsistencyLevel.LOCAL_SERIAL,
+                                                   ProtocolVersion.CURRENT,
+                                                   KEYSPACE);
+
+        clientState.setKeyspace(KEYSPACE);
+        QueryState queryState = new QueryState(clientState);
+
+        QueryHandler.Prepared prepared = QueryProcessor.prepareInternal(String.format(query, KEYSPACE, currentTable()));
+        return (ResultMessage.Rows) prepared.statement.execute(queryState, options, System.nanoTime());
+    }
+
+    private ResultMessage.Rows testQueryWithPagedByRows(String query, PageSize pageSize, int rowLimit) throws Throwable
+    {
+        ResultMessage.Rows result = selectWithPaging(query, pageSize, ClientState.forExternalCalls(AuthenticatedUser.ANONYMOUS_USER));
+        Assertions.assertThat(result.result.rows.size()).isLessThan(rowLimit);
+        return result;
+    }
+
+    /**
+     * Test that the number of returned rows per page is silently limited to fit into the guardrail hard limit
+     */
+    @Test
+    public void testPartitionQueryWithPagedByRows() throws Throwable
+    {
+        // ask for more rows per page than can fit with the current guardrail
+        testQueryWithPagedByRows(query, PageSize.inRows(limit), limit);
+    }
+
+    /**
+     * Test that a query throws with page size that is bigger than the guardrail hard limit
+     */
+    @Test(expected = InvalidRequestException.class)
+    public void testQueryWithLargeBytePagesThrows() throws Throwable
+    {
+        testQueryWithPagedByRows(query, PageSize.inBytes(10 * 1024), limit);
+    }
+
+    /**
+     * Test that a query does not throw with page size that is smaller than the guardrail hard limit
+     */
+    @Test
+    public void testQueryWithSmallBytePagesWorks() throws Throwable
+    {
+        int maxPageSize = 2 * 1024;
+        ResultMessage.Rows result = testQueryWithPagedByRows(query, PageSize.inBytes(maxPageSize), limit);
+        // Technically imprecise: this compares the encoded size of the message to be sent to the client against
+        // the requested page size in bytes, because the page itself cannot be inspected at this point.
+        assertTrue(ResultMessage.codec.encodedSize(result, ProtocolVersion.CURRENT) < maxPageSize);
+    }
+
+    /**
+     * Test that superusers and internal queries are excluded from the guardrail.
+     */
+    @Test
+    public void testExcludedUsers()
+    {
+        selectWithPaging(query, PageSize.inBytes(10 * 1024), ClientState.forInternalCalls());
+        selectWithPaging(query, PageSize.inBytes(10 * 1024), ClientState.forExternalCalls(new AuthenticatedUser("cassandra")));
+    }
+}
diff --git a/test/unit/org/apache/cassandra/index/CustomIndexTest.java b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
index 5a222545b9df..4a4ca3675ef2 100644
--- a/test/unit/org/apache/cassandra/index/CustomIndexTest.java
+++ b/test/unit/org/apache/cassandra/index/CustomIndexTest.java
@@ -35,7 +35,7 @@
 
 import com.datastax.driver.core.exceptions.QueryValidationException;
 import org.apache.cassandra.Util;
-import org.apache.cassandra.db.compaction.OperationType;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.filter.RowFilter;
 import org.apache.cassandra.db.lifecycle.LifecycleNewTracker;
 import org.apache.cassandra.db.memtable.Memtable;
@@ -47,7 +47,6 @@
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.ColumnIdentifier;
 import org.apache.cassandra.cql3.restrictions.IndexRestrictions;
-import org.apache.cassandra.cql3.restrictions.StatementRestrictions;
 import org.apache.cassandra.cql3.statements.schema.IndexTarget;
 import org.apache.cassandra.cql3.statements.ModificationStatement;
 import org.apache.cassandra.db.*;
@@ -115,11 +114,11 @@ public void indexControlsIfIncludedInBuildOnNewSSTables() throws Throwable
         flush();
 
         SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
-        IndexIncludedInBuild included = (IndexIncludedInBuild)indexManager.getIndexByName(toInclude);
+        IndexIncludedInBuild included = (IndexIncludedInBuild) indexManager.getIndexByName(toInclude);
         included.reset();
         assertTrue(included.rowsInserted.isEmpty());
 
-        IndexExcludedFromBuild excluded = (IndexExcludedFromBuild)indexManager.getIndexByName(toExclude);
+        IndexExcludedFromBuild excluded = (IndexExcludedFromBuild) indexManager.getIndexByName(toExclude);
         excluded.reset();
         assertTrue(excluded.rowsInserted.isEmpty());
 
@@ -143,7 +142,7 @@ public void indexReceivesWriteTimeDeletionsCorrectly() throws Throwable
         execute("INSERT INTO %s (a, b, c, d) VALUES (?, ?, ?, ?)", 0, 1, 3, 3);
 
         SecondaryIndexManager indexManager = getCurrentColumnFamilyStore().indexManager;
-        StubIndex index = (StubIndex)indexManager.getIndexByName(indexName);
+        StubIndex index = (StubIndex) indexManager.getIndexByName(indexName);
         assertEquals(4, index.rowsInserted.size());
         assertTrue(index.partitionDeletions.isEmpty());
         assertTrue(index.rangeTombstones.isEmpty());
@@ -156,6 +155,7 @@ public void indexReceivesWriteTimeDeletionsCorrectly() throws Throwable
         assertEquals(1, index.partitionDeletions.size());
         assertEquals(1, index.rangeTombstones.size());
     }
+
     @Test
     public void nonCustomIndexesRequireExactlyOneTargetColumn() throws Throwable
     {
@@ -163,7 +163,7 @@ public void nonCustomIndexesRequireExactlyOneTargetColumn() throws Throwable
 
         assertInvalidMessage("Only CUSTOM indexes support multiple columns", "CREATE INDEX multi_idx on %s(v1,v2)");
         assertInvalidMessage("Only CUSTOM indexes can be created without specifying a target column",
-                           "CREATE INDEX no_targets on %s()");
+                             "CREATE INDEX no_targets on %s()");
 
         createIndex(String.format("CREATE CUSTOM INDEX multi_idx ON %%s(v1, v2) USING '%s'", StubIndex.class.getName()));
         assertIndexCreated("multi_idx", "v1", "v2");
@@ -409,7 +409,7 @@ public void customIndexDoesntSupportCustomExpressions() throws Throwable
                                   indexName,
                                   NoCustomExpressionsIndex.class.getName()));
         assertInvalidThrowMessage(Optional.of(ProtocolVersion.CURRENT),
-                                  String.format( IndexRestrictions.CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName),
+                                  String.format(IndexRestrictions.CUSTOM_EXPRESSION_NOT_SUPPORTED, indexName),
                                   QueryValidationException.class,
                                   String.format("SELECT * FROM %%s WHERE expr(%s, 'foo bar baz')", indexName));
     }
@@ -468,9 +468,9 @@ public void indexSelectionPrefersMostSelectiveIndex() throws Throwable
                                   currentTable(),
                                   SettableSelectivityIndex.class.getName()));
         SettableSelectivityIndex moreSelective =
-            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
+        (SettableSelectivityIndex) getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
         SettableSelectivityIndex lessSelective =
-            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
+        (SettableSelectivityIndex) getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
         assertEquals(0, moreSelective.searchersProvided);
         assertEquals(0, lessSelective.searchersProvided);
 
@@ -499,9 +499,9 @@ public void customExpressionForcesIndexSelection() throws Throwable
                                   currentTable(),
                                   SettableSelectivityIndex.class.getName()));
         SettableSelectivityIndex moreSelective =
-            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
+        (SettableSelectivityIndex) getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_more_selective");
         SettableSelectivityIndex lessSelective =
-            (SettableSelectivityIndex)getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
+        (SettableSelectivityIndex) getCurrentColumnFamilyStore().indexManager.getIndexByName(currentTable() + "_less_selective");
         assertEquals(0, moreSelective.searchersProvided);
         assertEquals(0, lessSelective.searchersProvided);
 
@@ -550,7 +550,7 @@ public void reloadIndexMetadataOnBaseCfsReload() throws Throwable
         createIndex(String.format("CREATE CUSTOM INDEX reload_counter ON %%s() USING '%s'",
                                   CountMetadataReloadsIndex.class.getName()));
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
-        CountMetadataReloadsIndex index = (CountMetadataReloadsIndex)cfs.indexManager.getIndexByName("reload_counter");
+        CountMetadataReloadsIndex index = (CountMetadataReloadsIndex) cfs.indexManager.getIndexByName("reload_counter");
         assertEquals(0, index.reloads.get());
 
         // reloading the CFS, even without any metadata changes invokes the index's metadata reload task
@@ -564,7 +564,7 @@ public void notifyIndexersOfPartitionAndRowRemovalDuringCleanup() throws Throwab
         createTable("CREATE TABLE %s (k int, c int, v int, PRIMARY KEY (k,c))");
         createIndex(String.format("CREATE CUSTOM INDEX cleanup_index ON %%s() USING '%s'", StubIndex.class.getName()));
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
-        StubIndex index  = (StubIndex)cfs.indexManager.getIndexByName("cleanup_index");
+        StubIndex index = (StubIndex) cfs.indexManager.getIndexByName("cleanup_index");
 
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 0, 0);
         execute("INSERT INTO %s (k, c, v) VALUES (?, ?, ?)", 0, 1, 1);
@@ -593,7 +593,7 @@ public void notifyIndexersOfExpiredRowsDuringCompaction() throws Throwable
         createTable("CREATE TABLE %s (k int, c int, PRIMARY KEY (k,c))");
         createIndex(String.format("CREATE CUSTOM INDEX row_ttl_test_index ON %%s() USING '%s'", StubIndex.class.getName()));
         ColumnFamilyStore cfs = getCurrentColumnFamilyStore();
-        StubIndex index  = (StubIndex)cfs.indexManager.getIndexByName("row_ttl_test_index");
+        StubIndex index = (StubIndex) cfs.indexManager.getIndexByName("row_ttl_test_index");
 
         execute("INSERT INTO %s (k, c) VALUES (?, ?) USING TTL 1", 0, 0);
         execute("INSERT INTO %s (k, c) VALUES (?, ?)", 0, 1);
@@ -679,7 +679,7 @@ public void indexBuildingPagesLargePartitions() throws Throwable
         // Index the partition with an Indexer which artificially simulates additional concurrent
         // flush activity by periodically issuing barriers on the read & write op groupings
         DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(0));
-        indexManager.indexPartition(targetKey, Collections.singleton(index), totalRows / 10);
+        indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(totalRows / 10));
 
         // When indexing is done check that:
         // * The base table's read ordering at finish was > the one at the start (i.e. that
@@ -738,7 +738,7 @@ public void partitionIndexTest() throws Throwable
         for (int pageSize = 1; pageSize <= 5; pageSize++)
         {
             targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(1));
-            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(pageSize));
             assertEquals(3, index.rowsInserted.size());
             assertEquals(0, index.rangeTombstones.size());
             assertTrue(index.partitionDeletions.get(0).isLive());
@@ -748,7 +748,7 @@ public void partitionIndexTest() throws Throwable
         for (int pageSize = 1; pageSize <= 5; pageSize++)
         {
             targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(2));
-            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(pageSize));
             assertEquals(1, index.rowsInserted.size());
             assertEquals(0, index.rangeTombstones.size());
             assertTrue(index.partitionDeletions.get(0).isLive());
@@ -758,7 +758,7 @@ public void partitionIndexTest() throws Throwable
         for (int pageSize = 1; pageSize <= 5; pageSize++)
         {
             targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(3));
-            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(pageSize));
             assertEquals(1, index.rowsInserted.size());
             assertEquals(2, index.rangeTombstones.size());
             assertTrue(index.partitionDeletions.get(0).isLive());
@@ -768,7 +768,7 @@ public void partitionIndexTest() throws Throwable
         for (int pageSize = 1; pageSize <= 5; pageSize++)
         {
             targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(5));
-            indexManager.indexPartition(targetKey, Collections.singleton(index), pageSize);
+            indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(pageSize));
             assertEquals(1, index.partitionDeletions.size());
             assertFalse(index.partitionDeletions.get(0).isLive());
             index.reset();
@@ -797,7 +797,7 @@ public void partitionIsNotOverIndexed() throws Throwable
 
         // Index the partition
         DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(0));
-        indexManager.indexPartition(targetKey, Collections.singleton(index), totalRows);
+        indexManager.indexPartition(targetKey, Collections.singleton(index), PageSize.inRows(totalRows));
 
         // Assert only one partition is counted
         assertEquals(1, index.beginCalls);
@@ -828,7 +828,7 @@ public void rangeTombstoneTest() throws Throwable
 
         // Index the partition
         DecoratedKey targetKey = getCurrentColumnFamilyStore().decorateKey(ByteBufferUtil.bytes(1));
-        indexManager.indexPartition(targetKey, Sets.newHashSet(index, index2), 1);
+        indexManager.indexPartition(targetKey, Sets.newHashSet(index, index2), PageSize.inRows(1));
 
         // and both indexes should have the same range tombstone
         assertEquals(index.rangeTombstones, index2.rangeTombstones);
@@ -972,8 +972,8 @@ public long getEstimatedResultRows()
 
         public Searcher searcherFor(ReadCommand command)
         {
-                searchersProvided++;
-                return super.searcherFor(command);
+            searchersProvided++;
+            return super.searcherFor(command);
         }
     }
 
@@ -1132,14 +1132,21 @@ public void finish()
                         readOrderingAtFinish = baseCfs.readOrdering.getCurrent();
                 }
 
-                public void partitionDelete(DeletionTime deletionTime) { }
-
-                public void rangeTombstone(RangeTombstone tombstone) { }
+                public void partitionDelete(DeletionTime deletionTime)
+                {
+                }
 
-                public void updateRow(Row oldRowData, Row newRowData) { }
+                public void rangeTombstone(RangeTombstone tombstone)
+                {
+                }
 
-                public void removeRow(Row row) { }
+                public void updateRow(Row oldRowData, Row newRowData)
+                {
+                }
 
+                public void removeRow(Row row)
+                {
+                }
             };
         }
     }
@@ -1224,7 +1231,8 @@ public void reset()
         @Override
         public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNewTracker tracker)
         {
-            return new SSTableFlushObserver() {
+            return new SSTableFlushObserver()
+            {
 
                 @Override
                 public void begin()
@@ -1536,7 +1544,8 @@ public Index.Indexer indexerFor(Predicate<Index> indexSelector,
                                                      .filter(Objects::nonNull)
                                                      .collect(Collectors.toSet());
 
-                return indexers.isEmpty() ? null : new Index.Indexer() {
+                return indexers.isEmpty() ? null : new Index.Indexer()
+                {
 
                     @Override
                     public void begin()
@@ -1604,7 +1613,8 @@ public SSTableFlushObserver getFlushObserver(Descriptor descriptor, LifecycleNew
                                                              .filter(Objects::nonNull)
                                                              .collect(Collectors.toSet());
 
-                return new SSTableFlushObserver() {
+                return new SSTableFlushObserver()
+                {
 
                     @Override
                     public void begin()
diff --git a/test/unit/org/apache/cassandra/service/QueryPagerTest.java b/test/unit/org/apache/cassandra/service/QueryPagerTest.java
index 11d1cb096957..0b1e33970148 100644
--- a/test/unit/org/apache/cassandra/service/QueryPagerTest.java
+++ b/test/unit/org/apache/cassandra/service/QueryPagerTest.java
@@ -1,65 +1,111 @@
 /*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-*
-*    http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*/
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.cassandra.service;
 
 import java.nio.ByteBuffer;
 import java.nio.charset.CharacterCodingException;
-import java.util.*;
-
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedSet;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import org.apache.cassandra.*;
-import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
-import org.apache.cassandra.schema.ColumnMetadata;
-import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.OrderedJUnit4ClassRunner;
+import org.apache.cassandra.SchemaLoader;
+import org.apache.cassandra.Util;
 import org.apache.cassandra.cql3.ColumnIdentifier;
-import org.apache.cassandra.db.*;
+import org.apache.cassandra.cql3.PageSize;
+import org.apache.cassandra.cql3.statements.schema.CreateTableStatement;
+import org.apache.cassandra.db.AbstractReadCommandBuilder;
+import org.apache.cassandra.db.Clustering;
+import org.apache.cassandra.db.ColumnFamilyStore;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.PartitionRangeReadQuery;
+import org.apache.cassandra.db.ReadCommand;
+import org.apache.cassandra.db.ReadExecutionController;
+import org.apache.cassandra.db.ReadQuery;
+import org.apache.cassandra.db.RowUpdateBuilder;
+import org.apache.cassandra.db.SinglePartitionReadCommand;
+import org.apache.cassandra.db.SinglePartitionReadCommand.Group;
+import org.apache.cassandra.db.SinglePartitionReadQuery;
+import org.apache.cassandra.db.Slice;
+import org.apache.cassandra.db.Slices;
+import org.apache.cassandra.db.filter.ClusteringIndexFilter;
+import org.apache.cassandra.db.filter.ClusteringIndexSliceFilter;
+import org.apache.cassandra.db.filter.ColumnFilter;
+import org.apache.cassandra.db.filter.DataLimits;
+import org.apache.cassandra.db.filter.RowFilter;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.partitions.FilteredPartition;
+import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.db.rows.Cell;
 import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.db.rows.RowIterator;
-import org.apache.cassandra.db.filter.*;
-import org.apache.cassandra.db.partitions.FilteredPartition;
-import org.apache.cassandra.db.partitions.PartitionIterator;
 import org.apache.cassandra.exceptions.ConfigurationException;
+import org.apache.cassandra.schema.ColumnMetadata;
 import org.apache.cassandra.schema.KeyspaceParams;
-import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.schema.TableMetadata;
+import org.apache.cassandra.service.pager.AggregationQueryPager;
+import org.apache.cassandra.service.pager.MultiPartitionPager;
 import org.apache.cassandra.service.pager.PagingState;
+import org.apache.cassandra.service.pager.PartitionRangeQueryPager;
+import org.apache.cassandra.service.pager.QueryPager;
+import org.apache.cassandra.service.pager.SinglePartitionPager;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FBUtilities;
+import org.assertj.core.api.Assertions;
 
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.apache.cassandra.utils.ByteBufferUtil.bytes;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 @RunWith(OrderedJUnit4ClassRunner.class)
 public class QueryPagerTest
 {
+    private static final Logger logger = LoggerFactory.getLogger(QueryPagerTest.class); // used to dump fetched pages while the tests run
+
     public static final String KEYSPACE1 = "QueryPagerTest";
     public static final String CF_STANDARD = "Standard1";
     public static final String KEYSPACE_CQL = "cql_keyspace";
     public static final String CF_CQL = "table2";
     public static final String CF_CQL_WITH_STATIC = "with_static";
     public static final int nowInSec = FBUtilities.nowInSeconds();
+    public static List<String> tokenOrderedKeys;
 
     @BeforeClass
     public static void defineSchema() throws ConfigurationException
@@ -78,12 +124,12 @@ public static void defineSchema() throws ConfigurationException
                                                                + "v text,"
                                                                + "PRIMARY KEY (k, c))", KEYSPACE_CQL),
                                     CreateTableStatement.parse("CREATE TABLE " + CF_CQL_WITH_STATIC + " ("
-                                                               + "pk text, "
-                                                               + "ck int, "
+                                                               + "k text, "
+                                                               + "c text, "
                                                                + "st int static, "
                                                                + "v1 int, "
                                                                + "v2 int, "
-                                                               + "PRIMARY KEY(pk, ck))", KEYSPACE_CQL));
+                                                               + "PRIMARY KEY(k, c))", KEYSPACE_CQL));
         addData();
     }
 
@@ -101,11 +147,13 @@ private static String string(ByteBuffer bb)
 
     public static void addData()
     {
-        cfs().clearUnsafe();
+        cfs(KEYSPACE1, CF_STANDARD).clearUnsafe();
 
         int nbKeys = 10;
         int nbCols = 10;
 
+        SortedSet<String> tokens = Sets.newTreeSet(Comparator.comparing(a -> cfs(KEYSPACE1, CF_STANDARD).getPartitioner().decorateKey(bytes(a))));
+
         // *
         // * Creates the following data:
         // *   k1: c1 ... cn
@@ -116,15 +164,18 @@ public static void addData()
         {
             for (int j = 0; j < nbCols; j++)
             {
-                RowUpdateBuilder builder = new RowUpdateBuilder(cfs().metadata(), FBUtilities.timestampMicros(), "k" + i);
+                tokens.add("k" + i);
+                RowUpdateBuilder builder = new RowUpdateBuilder(cfs(KEYSPACE1, CF_STANDARD).metadata(), FBUtilities.timestampMicros(), "k" + i);
                 builder.clustering("c" + j).add("val", "").build().applyUnsafe();
             }
         }
+
+        tokenOrderedKeys = Lists.newArrayList(tokens);
     }
 
-    private static ColumnFamilyStore cfs()
+    private static ColumnFamilyStore cfs(String ks, String cf)
     {
-        return Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD);
+        return Keyspace.open(ks).getColumnFamilyStore(cf); // opens keyspace ks and returns its store for table cf
     }
 
     private static List<FilteredPartition> query(QueryPager pager, int expectedSize)
@@ -138,7 +189,7 @@ private static List<FilteredPartition> query(QueryPager pager, int toQuery, int
         List<FilteredPartition> partitionList = new ArrayList<>();
         int rows = 0;
         try (ReadExecutionController executionController = pager.executionController();
-             PartitionIterator iterator = pager.fetchPageInternal(toQuery, executionController))
+             PartitionIterator iterator = pager.fetchPageInternal(PageSize.inRows(toQuery), executionController))
         {
             while (iterator.hasNext())
             {
@@ -155,51 +206,119 @@ private static List<FilteredPartition> query(QueryPager pager, int toQuery, int
         return partitionList;
     }
 
-    private static ReadCommand namesQuery(String key, String... names)
+    private static Map<DecoratedKey, List<Row>> fetchPage(QueryPager pager, int pageSize, PageSize.PageUnit pageUnit)
     {
-        AbstractReadCommandBuilder builder = Util.cmd(cfs(), key);
+        logger.info("----------------------------------------------------------------"); // separator line logged at the start of every fetch
+        Map<DecoratedKey, List<Row>> ret = Maps.newHashMap(); // rows of the fetched page, keyed by partition key
+        try (ReadExecutionController ec = pager.executionController();
+             PartitionIterator iterator = pager.fetchPageInternal(new PageSize(pageSize, pageUnit), ec))
+        {
+            while (iterator.hasNext())
+            {
+                try (RowIterator partition = iterator.next())
+                {
+                    logger.info("Partition {}", partition.partitionKey());
+                    List<Row> rows = new ArrayList<>();
+                    Row staticRow = partition.staticRow();
+                    if (!partition.hasNext() && !staticRow.isEmpty()) // the static row is recorded only when the partition has no regular rows
+                    {
+                        rows.add(staticRow);
+                        logger.info("\tStatic row {}", staticRow.toString(partition.metadata()));
+                    }
+
+                    while (partition.hasNext())
+                    {
+                        Row row = partition.next();
+                        rows.add(row);
+                        logger.info("\tRow {}", row.toString(partition.metadata()));
+                    }
+
+                    ret.put(partition.partitionKey(), rows);
+                }
+            }
+        }
+        catch (Throwable t)
+        {
+            logger.error("Failed to fetch a page of {} {}", pageSize, pageUnit, t); // report through the class logger (not stderr), then rethrow unchanged
+            throw t;
+        }
+        return ret;
+    }
+
+    private static ReadCommand namesQuery(int count, int partitionCount, PageSize pageSize, ColumnFamilyStore cfs, String key, String... names)
+    {
+        AbstractReadCommandBuilder builder = Util.cmd(cfs, key).withNowInSeconds(nowInSec); // command for one key, pinned to the class-wide nowInSec
         for (String name : names)
             builder.includeRow(name);
-        return builder.withPagingLimit(100).build();
+        if (count > 0) // count <= 0: withLimit is skipped
+            builder.withLimit(count);
+        if (partitionCount > 0) // partitionCount <= 0: withPerPartitionLimit is skipped
+            builder.withPerPartitionLimit(partitionCount);
+        if (pageSize != null && !pageSize.equals(PageSize.NONE)) // null or PageSize.NONE: withPageSize is skipped
+            builder.withPageSize(pageSize);
+
+        return builder.build();
     }
 
-    private static SinglePartitionReadCommand sliceQuery(String key, String start, String end, int count)
+    private static SinglePartitionReadCommand sliceQuery(ColumnFamilyStore cfs, String key, String start, String end)
     {
-        return sliceQuery(key, start, end, false, count);
+        return sliceQuery(-1, -1, PageSize.NONE, cfs, key, start, end, false); // -1, -1, PageSize.NONE: the delegate skips withLimit, withPerPartitionLimit and withPageSize; not reversed
     }
 
-    private static SinglePartitionReadCommand sliceQuery(String key, String start, String end, boolean reversed, int count)
+    private static SinglePartitionReadCommand sliceQuery(ColumnFamilyStore cfs, String key, String start, String end, boolean reversed)
     {
-        ClusteringComparator cmp = cfs().getComparator();
-        TableMetadata metadata = cfs().metadata();
-
-        Slice slice = Slice.make(cmp.make(start), cmp.make(end));
-        ClusteringIndexSliceFilter filter = new ClusteringIndexSliceFilter(Slices.with(cmp, slice), reversed);
+        return sliceQuery(-1, -1, PageSize.NONE, cfs, key, start, end, reversed); // -1, -1, PageSize.NONE: the delegate skips withLimit, withPerPartitionLimit and withPageSize
+    }
 
-        return SinglePartitionReadCommand.create(metadata, nowInSec, ColumnFilter.all(metadata), RowFilter.NONE, DataLimits.NONE, Util.dk(key), filter);
+    private static SinglePartitionReadCommand sliceQuery(int count, int partitionCount, PageSize paging, ColumnFamilyStore cfs, String key, String start, String end, boolean reversed)
+    {
+        AbstractReadCommandBuilder builder = Util.cmd(cfs, key).fromIncl(start).toIncl(end).withNowInSeconds(nowInSec); // command for one key between start and end, pinned to the class-wide nowInSec
+        if (reversed)
+            builder.reverse();
+        if (count > 0) // count <= 0: withLimit is skipped
+            builder.withLimit(count);
+        if (partitionCount > 0) // partitionCount <= 0: withPerPartitionLimit is skipped
+            builder.withPerPartitionLimit(partitionCount);
+        if (paging != null && !paging.equals(PageSize.NONE)) // null or PageSize.NONE: withPageSize is skipped
+            builder.withPageSize(paging);
+
+        return (SinglePartitionReadCommand) builder.build(); // the cast relies on the builder producing a single-partition command for a keyed Util.cmd - TODO confirm
     }
 
-    private static ReadCommand rangeNamesQuery(String keyStart, String keyEnd, int count, String... names)
+    private static ReadCommand rangeNamesQuery(int count, int partitionCount, PageSize paging, ColumnFamilyStore cfs, String keyStart, String keyEnd, String... names)
     {
-        AbstractReadCommandBuilder builder = Util.cmd(cfs())
+        AbstractReadCommandBuilder builder = Util.cmd(cfs)
                                                  .fromKeyExcl(keyStart)
                                                  .toKeyIncl(keyEnd)
-                                                 .withPagingLimit(count);
+                                                 .withNowInSeconds(nowInSec); // pinned to the class-wide nowInSec
         for (String name : names)
             builder.includeRow(name);
+        if (count > 0) // count <= 0: withLimit is skipped
+            builder.withLimit(count);
+        if (partitionCount > 0) // partitionCount <= 0: withPerPartitionLimit is skipped
+            builder.withPerPartitionLimit(partitionCount);
+        if (paging != null && !paging.equals(PageSize.NONE)) // null or PageSize.NONE: withPageSize is skipped
+            builder.withPageSize(paging);
 
         return builder.build();
     }
 
-    private static ReadCommand rangeSliceQuery(String keyStart, String keyEnd, int count, String start, String end)
+    private static ReadCommand rangeSliceQuery(int count, int partitionCount, PageSize paging, ColumnFamilyStore cfs, String keyStart, String keyEnd, String start, String end)
     {
-        return Util.cmd(cfs())
-                   .fromKeyExcl(keyStart)
-                   .toKeyIncl(keyEnd)
-                   .fromIncl(start)
-                   .toIncl(end)
-                   .withPagingLimit(count)
-                   .build();
+        AbstractReadCommandBuilder builder = Util.cmd(cfs)
+                                                 .fromKeyExcl(keyStart)
+                                                 .toKeyIncl(keyEnd)
+                                                 .fromIncl(start)
+                                                 .toIncl(end)
+                                                 .withNowInSeconds(nowInSec); // pinned to the class-wide nowInSec
+        if (count > 0) // count <= 0: withLimit is skipped
+            builder.withLimit(count);
+        if (partitionCount > 0) // partitionCount <= 0: withPerPartitionLimit is skipped
+            builder.withPerPartitionLimit(partitionCount);
+        if (paging != null && !paging.equals(PageSize.NONE)) // null or PageSize.NONE: withPageSize is skipped
+            builder.withPageSize(paging);
+
+        return builder.build();
     }
 
     private static void assertRow(FilteredPartition r, String key, String... names)
@@ -218,7 +337,7 @@ private static void assertRow(FilteredPartition partition, String key, ByteBuffe
         for (Row row : Util.once(partition.iterator()))
         {
             ByteBuffer expected = names[i++];
-            assertEquals("column " + i + " doesn't match "+string(expected)+" vs "+string(row.clustering().bufferAt(0)), expected, row.clustering().bufferAt(0));
+            assertEquals("column " + i + " doesn't match " + string(expected) + " vs " + string(row.clustering().bufferAt(0)), expected, row.clustering().bufferAt(0));
         }
     }
 
@@ -234,13 +353,16 @@ private QueryPager maybeRecreate(QueryPager pager, ReadQuery command, boolean te
     @Test
     public void namesQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED) // run the scenario once per supported protocol version
             namesQueryTest(protocolVersion);
     }
 
     public void namesQueryTest(ProtocolVersion protocolVersion)
     {
-        QueryPager pager = namesQuery("k0", "c1", "c5", "c7", "c8").getPager(null, protocolVersion);
+        QueryPager pager = namesQuery(-1, -1, new PageSize(100, PageSize.PageUnit.ROWS),
+                                      cfs(KEYSPACE1, CF_STANDARD),
+                                      "k0", "c1", "c5", "c7", "c8")
+                           .getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
         List<FilteredPartition> partition = query(pager, 5, 4);
@@ -252,7 +374,7 @@ public void namesQueryTest(ProtocolVersion protocolVersion)
     @Test
     public void sliceQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
         {
             sliceQueryTest(false, protocolVersion);
             sliceQueryTest(true, protocolVersion);
@@ -261,7 +383,7 @@ public void sliceQueryTest()
 
     public void sliceQueryTest(boolean testPagingState, ProtocolVersion protocolVersion)
     {
-        ReadCommand command = sliceQuery("k0", "c1", "c8", 10);
+        ReadCommand command = sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k0", "c1", "c8");
         QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
@@ -283,10 +405,55 @@ public void sliceQueryTest(boolean testPagingState, ProtocolVersion protocolVers
         assertTrue(pager.isExhausted());
     }
 
+    @Test
+    public void sliceQueryWithLimitsTest() throws Exception
+    {
+        boolean testPagingState = true;
+        ProtocolVersion protocolVersion = ProtocolVersion.CURRENT;
+
+        // Test with count < partitionCount: the row limit (1) is the one that caps the result
+
+        int count = 1;
+        int partitionCount = 2;
+
+        ReadCommand command = sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k0", "c1", "c8", false);
+        QueryPager pager = command.getPager(null, protocolVersion);
+        List<FilteredPartition> partition = query(pager, 3, count);
+        assertRow(partition.get(0), "k0", "c1");
+        assertTrue(pager.isExhausted());
+
+        // Test with count > partitionCount: the per-partition limit (1) is the one that caps the result
+
+        count = 2;
+        partitionCount = 1;
+
+        command = sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k0", "c1", "c8", false);
+        pager = command.getPager(null, protocolVersion);
+        partition = query(pager, 3, partitionCount);
+        assertRow(partition.get(0), "k0", "c1");
+        assertTrue(pager.isExhausted());
+
+        // Test with counts spanning multiple pages: 5 rows come back as c1..c3 and then c4..c5
+
+        count = 5;
+        partitionCount = 5;
+
+        command = sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k0", "c1", "c8", false);
+        pager = command.getPager(null, protocolVersion);
+        partition = query(pager, 3, 3);
+        assertRow(partition.get(0), "k0", "c1", "c2", "c3");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        partition = query(pager, 3, 2);
+        assertRow(partition.get(0), "k0", "c4", "c5");
+        assertTrue(pager.isExhausted());
+    }
+
     @Test
     public void reversedSliceQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
         {
             reversedSliceQueryTest(false, protocolVersion);
             reversedSliceQueryTest(true, protocolVersion);
@@ -295,7 +462,7 @@ public void reversedSliceQueryTest()
 
     public void reversedSliceQueryTest(boolean testPagingState, ProtocolVersion protocolVersion)
     {
-        ReadCommand command = sliceQuery("k0", "c1", "c8", true, 10);
+        ReadCommand command = sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k0", "c1", "c8", true);
         QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
@@ -320,7 +487,7 @@ public void reversedSliceQueryTest(boolean testPagingState, ProtocolVersion prot
     @Test
     public void multiQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
         {
             multiQueryTest(false, protocolVersion);
             multiQueryTest(true, protocolVersion);
@@ -329,10 +496,10 @@ public void multiQueryTest()
 
     public void multiQueryTest(boolean testPagingState, ProtocolVersion protocolVersion)
     {
-        ReadQuery command = new SinglePartitionReadCommand.Group(new ArrayList<SinglePartitionReadCommand>()
+        ReadQuery command = new Group(new ArrayList<SinglePartitionReadCommand>()
         {{
-            add(sliceQuery("k1", "c2", "c6", 10));
-            add(sliceQuery("k4", "c3", "c5", 10));
+            add(sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k1", "c2", "c6"));
+            add(sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k4", "c3", "c5"));
         }}, DataLimits.NONE);
         QueryPager pager = command.getPager(null, protocolVersion);
 
@@ -343,7 +510,7 @@ public void multiQueryTest(boolean testPagingState, ProtocolVersion protocolVers
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
-        partition = query(pager , 4);
+        partition = query(pager, 4);
         assertRow(partition.get(0), "k1", "c5", "c6");
         assertRow(partition.get(1), "k4", "c3", "c4");
         assertFalse(pager.isExhausted());
@@ -356,10 +523,155 @@ public void multiQueryTest(boolean testPagingState, ProtocolVersion protocolVers
         assertTrue(pager.isExhausted());
     }
 
+    /**
+     * Test a query with 1 CQL row per partition with various page sizes.
+     * Partitions k1..k4 are each sliced down to the single clustering c1, hence the 4 expected rows.
+     */
+    @Test
+    public void multiPartitionSingleRowQueryTest() throws Exception
+    {
+        int totQueryRows = 4;
+        ReadQuery command = new Group(Arrays.asList(
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k1", "c1", "c1"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k2", "c1", "c1"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k3", "c1", "c1"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k4", "c1", "c1")
+        ), DataLimits.NONE);
+
+        checkRows(command, PageSize.PageUnit.ROWS, totQueryRows, 7, 8, 9, 10, 15, 16, 20);
+    }
+
+    /**
+     * Test a query with 4 CQL rows per partition with various page sizes.
+     * The group-level limit of 8 rows is what bounds the expected total across the 4 partitions.
+     */
+    @Test
+    public void multiPartitionFourRowsQueryTest() throws Exception
+    {
+        int totQueryRows = 8;
+        ReadQuery command = new Group(Arrays.asList(
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k1", "c1", "c4"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k2", "c1", "c4"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k3", "c1", "c4"),
+            sliceQuery(cfs(KEYSPACE1, CF_STANDARD), "k4", "c1", "c4")
+        ), DataLimits.cqlLimits(8));
+
+        checkRows(command, PageSize.PageUnit.ROWS, totQueryRows, 2, 7, 8, 9, 10, 15, 16, 20);
+    }
+
+    /** Test a group of 4 partition slices under a total row limit of 8, with various page sizes. */
+    @Test
+    public void multiPartitionQueryWithRowLimitTest() throws Exception
+    {
+        int count = 8;
+        int partitionCount = DataLimits.NO_LIMIT;
+        int totQueryRows = 8;
+        ReadQuery command = new Group(Arrays.asList(
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k1", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k2", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k3", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k4", "c1", "c4", false)
+        ), DataLimits.cqlLimits(count, partitionCount));
+
+        checkRows(command, PageSize.PageUnit.ROWS, totQueryRows, 2, 7, 8, 9, 10, 15, 16, 20);
+    }
+
+    /** Test a group of 4 partition slices under a per-partition limit of 2 (8 rows in total), with various page sizes. */
+    @Test
+    public void multiPartitionQueryWithPartitionLimitTest() throws Exception
+    {
+        int count = DataLimits.NO_LIMIT;
+        int partitionCount = 2;
+        int totQueryRows = 8;
+        ReadQuery command = new Group(Arrays.asList(
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k1", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k2", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k3", "c1", "c4", false),
+            sliceQuery(count, partitionCount, PageSize.NONE, cfs(KEYSPACE1, CF_STANDARD), "k4", "c1", "c4", false)
+        ), DataLimits.cqlLimits(count, partitionCount));
+
+        checkRows(command, PageSize.PageUnit.ROWS, totQueryRows, 2, 7, 8, 9, 10, 15, 16, 20);
+    }
+
+    /**
+     * Pages through {@code command} once per entry of {@code pages}, checking each page and the total of {@code totQueryRows}.
+     */
+    private void checkRows(ReadQuery command, PageSize.PageUnit pageUnit, int totQueryRows, int... pages)
+    {
+        for (int pageSize : pages)
+        {
+            Map<DecoratedKey, Set<Row>> allRows = Maps.newHashMap();
+            int currentRows = 0;
+            QueryPager pager = command.getPager(null, ProtocolVersion.CURRENT);
+            assertFalse(String.format("Failed due to exhausted pager at page size %s %s", pageSize, pageUnit),
+                        pager.isExhausted());
+
+            logger.info("Testing with page size: {}", pageSize);
+            while (!pager.isExhausted())
+            {
+                Map<DecoratedKey, List<Row>> rows = fetchPage(pager, pageSize, pageUnit);
+
+                if (rows.size() > 0)
+                {
+                    int numRows = rows.values().stream().map(List::size).reduce(0, Integer::sum);
+
+                    for (Map.Entry<DecoratedKey, List<Row>> entry : rows.entrySet()) // fold the page into the per-key row sets
+                        allRows.computeIfAbsent(entry.getKey(), k -> new HashSet<>()).addAll(entry.getValue());
+
+                    if (pageUnit == PageSize.PageUnit.ROWS)
+                    {
+                        int expectedSize = Math.min(pageSize, totQueryRows - currentRows); // the last page may be partial
+                        assertEquals(String.format("Failed after %d rows with rows page size %d and current number of rows %d;\n%s",
+                                                   currentRows, pageSize, numRows, formatRows(allRows)),
+                                     expectedSize, numRows);
+                    }
+                    else
+                    {
+                        int numBytes = rows.values().stream().flatMap(List::stream).reduce(0, (s, r) -> s + r.dataSize(), Integer::sum);
+                        boolean bytesRead = numBytes < (pageSize + (numBytes / numRows)); // may overshoot by less than one average row
+                        assertTrue(String.format("Failed after %d rows with bytes page size %d and current number of rows %d due to bytes read %d;\n%s",
+                                                 currentRows, pageSize, numRows, numBytes, formatRows(allRows)),
+                                   bytesRead);
+                    }
+
+                    currentRows += numRows;
+
+                    if (!pager.isExhausted())
+                        pager = maybeRecreate(pager, command, true, ProtocolVersion.CURRENT);
+                }
+                else
+                    assertTrue(String.format("Failed due to non-exhausted pager at page size %s %s", pageSize, pageUnit),
+                               pager.isExhausted());
+            }
+
+            assertEquals(String.format("Failed with page size %d %s - expected %d rows in total but got:\n%s",
+                                       pageSize, pageUnit, totQueryRows, formatRows(allRows)),
+                         totQueryRows, (long) allRows.values().stream().map(Set::size).reduce(0, Integer::sum));
+        }
+    }
+
+    /**
+     * Builds a printable dump of {@code rows} for assertion failure messages: one line per row,
+     * made of the partition key, a space and the row rendered through the table metadata.
+     * NOTE(review): rows are always rendered with CF_STANDARD's metadata, whatever table they come from - confirm.
+     */
+    private String formatRows(Map<DecoratedKey, Set<Row>> rows)
+    {
+        TableMetadata standardMetadata = cfs(KEYSPACE1, CF_STANDARD).metadata();
+        StringBuilder dump = new StringBuilder();
+
+        rows.forEach((key, keyRows) -> {
+            for (Row row : keyRows)
+                dump.append(key.toString()).append(' ').append(row.toString(standardMetadata)).append('\n');
+        });
+
+        return dump.toString();
+    }
+
     @Test
     public void rangeNamesQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
         {
             rangeNamesQueryTest(false, protocolVersion);
             rangeNamesQueryTest(true, protocolVersion);
@@ -368,20 +680,23 @@ public void rangeNamesQueryTest()
 
     public void rangeNamesQueryTest(boolean testPagingState, ProtocolVersion protocolVersion)
     {
-        ReadCommand command = rangeNamesQuery("k0", "k5", 100, "c1", "c4", "c8");
+        ReadCommand command = rangeNamesQuery(-1, -1, new PageSize(100, PageSize.PageUnit.ROWS),
+                                              cfs(KEYSPACE1, CF_STANDARD),
+                                              tokenOrderedKeys.get(0), tokenOrderedKeys.get(5),
+                                              "c1", "c4", "c8");
         QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
         List<FilteredPartition> partitions = query(pager, 3 * 3);
         for (int i = 1; i <= 3; i++)
-            assertRow(partitions.get(i-1), "k" + i, "c1", "c4", "c8");
+            assertRow(partitions.get(i - 1), tokenOrderedKeys.get(i), "c1", "c4", "c8"); // first page: keys 1..3 in token order
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 3 * 3, 2 * 3);
         for (int i = 4; i <= 5; i++)
-            assertRow(partitions.get(i-4), "k" + i, "c1", "c4", "c8");
+            assertRow(partitions.get(i - 4), tokenOrderedKeys.get(i), "c1", "c4", "c8"); // second page: keys 4..5 in token order
 
         assertTrue(pager.isExhausted());
     }
@@ -389,7 +704,7 @@ public void rangeNamesQueryTest(boolean testPagingState, ProtocolVersion protoco
     @Test
     public void rangeSliceQueryTest()
     {
-        for(ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion protocolVersion : ProtocolVersion.SUPPORTED)
         {
             rangeSliceQueryTest(false, protocolVersion);
             rangeSliceQueryTest(true, protocolVersion);
@@ -398,61 +713,146 @@ public void rangeSliceQueryTest()
 
     public void rangeSliceQueryTest(boolean testPagingState, ProtocolVersion protocolVersion)
     {
-        ReadCommand command = rangeSliceQuery("k1", "k5", 100, "c1", "c7");
+        ReadCommand command = rangeSliceQuery(-1, -1, new PageSize(100, PageSize.PageUnit.ROWS), // -1, -1: neither limit gets set
+                                              cfs(KEYSPACE1, CF_STANDARD),
+                                              tokenOrderedKeys.get(0), tokenOrderedKeys.get(4),
+                                              "c1", "c7");
         QueryPager pager = command.getPager(null, protocolVersion);
 
         assertFalse(pager.isExhausted());
         List<FilteredPartition> partitions = query(pager, 5);
-        assertRow(partitions.get(0), "k2", "c1", "c2", "c3", "c4", "c5");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c1", "c2", "c3", "c4", "c5"); // start key (index 0) is an exclusive bound
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 4);
-        assertRow(partitions.get(0), "k2", "c6", "c7");
-        assertRow(partitions.get(1), "k3", "c1", "c2");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c6", "c7");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(2), "c1", "c2");
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 6);
-        assertRow(partitions.get(0), "k3", "c3", "c4", "c5", "c6", "c7");
-        assertRow(partitions.get(1), "k4", "c1");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(2), "c3", "c4", "c5", "c6", "c7");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(3), "c1");
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 5);
-        assertRow(partitions.get(0), "k4", "c2", "c3", "c4", "c5", "c6");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(3), "c2", "c3", "c4", "c5", "c6");
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 5);
-        assertRow(partitions.get(0), "k4", "c7");
-        assertRow(partitions.get(1), "k5", "c1", "c2", "c3", "c4");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(3), "c7");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(4), "c1", "c2", "c3", "c4");
         assertFalse(pager.isExhausted());
 
         pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
         assertFalse(pager.isExhausted());
         partitions = query(pager, 5, 3);
-        assertRow(partitions.get(0), "k5", "c5", "c6", "c7");
+        assertRow(partitions.get(0), tokenOrderedKeys.get(4), "c5", "c6", "c7"); // last page: only 3 rows are left
+
+        assertTrue(pager.isExhausted());
+    }
+
+    @Test
+    public void rangeSliceQueryWithLimitsTest() throws Exception
+    {
+        boolean testPagingState = true;
+        ProtocolVersion protocolVersion = ProtocolVersion.CURRENT;
+
+        // Test with count < partitionCount: only the first row of the first partition in range comes back
+
+        int count = 1;
+        int partitionCount = 2;
+
+        ReadCommand command = rangeSliceQuery(count, partitionCount, new PageSize(100, PageSize.PageUnit.ROWS),
+                                              cfs(KEYSPACE1, CF_STANDARD),
+                                              tokenOrderedKeys.get(0), tokenOrderedKeys.get(4),
+                                              "c1", "c7");
+
+        QueryPager pager = command.getPager(null, protocolVersion);
+        List<FilteredPartition> partitions = query(pager, 5, count);
+        assertEquals(1, partitions.size());
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c1");
+        assertTrue(pager.isExhausted());
+
+        // Test with count > partitionCount: one row from each of the first two partitions in range
+
+        count = 2;
+        partitionCount = 1;
+
+        command = rangeSliceQuery(count, partitionCount, new PageSize(100, PageSize.PageUnit.ROWS),
+                                  cfs(KEYSPACE1, CF_STANDARD),
+                                  tokenOrderedKeys.get(0), tokenOrderedKeys.get(4),
+                                  "c1", "c7");
+
+        pager = command.getPager(null, protocolVersion);
+        partitions = query(pager, 5, count);
+        assertEquals(2, partitions.size());
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c1");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(2), "c1");
+        assertTrue(pager.isExhausted());
+
+        // Test with count spanning multiple partitions: two rows from each of two partitions
 
+        count = 4;
+        partitionCount = 2;
+
+        command = rangeSliceQuery(count, partitionCount, new PageSize(100, PageSize.PageUnit.ROWS),
+                                  cfs(KEYSPACE1, CF_STANDARD),
+                                  tokenOrderedKeys.get(0), tokenOrderedKeys.get(4),
+                                  "c1", "c7");
+
+        pager = command.getPager(null, protocolVersion);
+        partitions = query(pager, 5, count);
+        assertEquals(2, partitions.size());
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c1", "c2");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(2), "c1", "c2");
+        assertTrue(pager.isExhausted());
+
+        // Test with count spanning multiple pages: 8 rows come back as a page of 5 followed by a page of 3
+
+        count = 8;
+        partitionCount = 2;
+
+        command = rangeSliceQuery(count, partitionCount, new PageSize(100, PageSize.PageUnit.ROWS),
+                                  cfs(KEYSPACE1, CF_STANDARD),
+                                  tokenOrderedKeys.get(0), tokenOrderedKeys.get(4),
+                                  "c1", "c7");
+
+        pager = command.getPager(null, protocolVersion);
+        partitions = query(pager, 5, 5);
+        assertEquals(3, partitions.size());
+        assertRow(partitions.get(0), tokenOrderedKeys.get(1), "c1", "c2");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(2), "c1", "c2");
+        assertRow(partitions.get(2), tokenOrderedKeys.get(3), "c1");
+        assertFalse(pager.isExhausted());
+
+        pager = maybeRecreate(pager, command, testPagingState, protocolVersion);
+        partitions = query(pager, 5, 3);
+        assertEquals(2, partitions.size());
+        assertRow(partitions.get(0), tokenOrderedKeys.get(3), "c2");
+        assertRow(partitions.get(1), tokenOrderedKeys.get(4), "c1", "c2");
         assertTrue(pager.isExhausted());
     }
 
     @Test
     public void SliceQueryWithTombstoneTest()
     {
-        for(ProtocolVersion version : ProtocolVersion.SUPPORTED)
+        for (ProtocolVersion version : ProtocolVersion.SUPPORTED) // run the scenario once per supported protocol version
             SliceQueryWithTombstoneTest(version);
     }
 
     public void SliceQueryWithTombstoneTest(ProtocolVersion protocolVersion)
     {
         // Testing for the bug of #6748
-        String keyspace = "cql_keyspace";
-        String table = "table2";
+        String keyspace = KEYSPACE_CQL;
+        String table = CF_CQL;
         ColumnFamilyStore cfs = Keyspace.open(keyspace).getColumnFamilyStore(table);
 
         // Insert rows but with a tombstone as last cell
@@ -479,8 +879,8 @@ public void pagingReversedQueriesWithStaticColumnsTest()
         // see CASSANDRA-13222
 
         // insert some rows into a single partition
-        for (int i=0; i < 5; i++)
-            executeInternal(String.format("INSERT INTO %s.%s (pk, ck, st, v1, v2) VALUES ('k0', %3$s, %3$s, %3$s, %3$s)",
+        for (int i = 0; i < 5; i++)
+            executeInternal(String.format("INSERT INTO %s.%s (k, c, st, v1, v2) VALUES ('k0', '%3$s', %3$s, %3$s, %3$s)",
                                           KEYSPACE_CQL, CF_CQL_WITH_STATIC, i));
 
         // query the table in reverse with page size = 1 & check that the returned rows contain the correct cells
@@ -497,10 +897,10 @@ private void queryAndVerifyCells(TableMetadata table, boolean reversed, String k
         ColumnMetadata staticColumn = table.staticColumns().getSimple(0);
         assertEquals(staticColumn.name.toCQLString(), "st");
 
-        for (int i=0; i<5; i++)
+        for (int i = 0; i < 5; i++)
         {
             try (ReadExecutionController controller = pager.executionController();
-                 PartitionIterator partitions = pager.fetchPageInternal(1, controller))
+                 PartitionIterator partitions = pager.fetchPageInternal(PageSize.inRows(1), controller))
             {
                 try (RowIterator partition = partitions.next())
                 {
@@ -509,7 +909,7 @@ private void queryAndVerifyCells(TableMetadata table, boolean reversed, String k
                     Row row = partition.next();
                     int cellIndex = !reversed ? i : 4 - i;
 
-                    assertEquals(row.clustering().bufferAt(0), ByteBufferUtil.bytes(cellIndex));
+                    assertEquals(row.clustering().bufferAt(0), ByteBufferUtil.bytes(""+cellIndex));
                     assertCell(row, table.getColumn(new ColumnIdentifier("v1", false)), cellIndex);
                     assertCell(row, table.getColumn(new ColumnIdentifier("v2", false)), cellIndex);
 
@@ -520,8 +920,8 @@ private void queryAndVerifyCells(TableMetadata table, boolean reversed, String k
         }
 
         // After processing the 5 rows there should be no more rows to return
-        try ( ReadExecutionController controller = pager.executionController();
-              PartitionIterator partitions = pager.fetchPageInternal(1, controller))
+        try (ReadExecutionController controller = pager.executionController();
+             PartitionIterator partitions = pager.fetchPageInternal(PageSize.inRows(1), controller))
         {
             assertFalse(partitions.hasNext());
         }
@@ -533,4 +933,201 @@ private void assertCell(Row row, ColumnMetadata column, int value)
         assertNotNull(cell);
         assertEquals(value, ByteBufferUtil.toInt(cell.buffer()));
     }
+
+    @Test
+    public void testSinglePartitionPagingByBytes()
+    {
+        executeInternal(String.format("TRUNCATE TABLE %s.%s", KEYSPACE_CQL, CF_CQL));
+
+        int rows = 10;
+
+        for (int i = 0; i < rows; i++)
+            executeInternal(String.format("INSERT INTO %s.%s(k, c, v) VALUES('k', 'c%s', 'ignored')", KEYSPACE_CQL, CF_CQL, i));
+
+        // Test with rows limit (a count of 0 is expected to return all the rows):
+
+        int maxExpected = rows;
+        for (int count = 0; count <= maxExpected; count++)
+        {
+            SinglePartitionReadCommand q = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k", "c0", "c9", false);
+            checkRows(q, PageSize.PageUnit.BYTES, count > 0 ? count : maxExpected, 1, 128, 256, 1024);
+        }
+
+        // Test with per-partition limit (single partition, so the limit is also the expected total):
+
+        for (int partitionCount = 1; partitionCount <= rows; partitionCount++)
+        {
+            SinglePartitionReadCommand q = sliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k", "c0", "c9", false);
+            checkRows(q, PageSize.PageUnit.BYTES, partitionCount, 1, 128, 256, 1024);
+        }
+    }
+
+    @Test
+    public void testPartitionRangePagingByBytes()
+    {
+        executeInternal(String.format("TRUNCATE TABLE %s.%s", KEYSPACE_CQL, CF_CQL));
+
+        int pks = 10;
+        int cs = 10;
+
+        SortedSet<String> tokens = Sets.newTreeSet(Comparator.comparing(a -> cfs(KEYSPACE_CQL, CF_CQL).getPartitioner().decorateKey(bytes(a))));
+        for (int i = 0; i < pks; i++)
+        {
+            for (int j = 0; j < cs; j++)
+            {
+                executeInternal(String.format("INSERT INTO %s.%s(k, c, v) VALUES('k%s', 'c%s', 'ignored')", KEYSPACE_CQL, CF_CQL, i, j));
+            }
+            tokens.add("k" + i); // despite its name, the set holds partition keys, ordered by their decorated key
+        }
+
+        // Test with rows limit (a count of 0 is expected to return all pks - 1 rows):
+
+        int maxExpected = pks - 1; // rangeSliceQuery passes its start key (tokens.first()) as an exclusive bound
+        for (int count = 0; count <= maxExpected; count++)
+        {
+            ReadCommand q = rangeSliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), tokens.first(), tokens.last(), "c0", "c0");
+            checkRows(q, PageSize.PageUnit.BYTES, count > 0 ? count : maxExpected, 1, 128, 256, 1024);
+        }
+
+        // Test with per-partition limit (applied to each of the pks - 1 partitions in range):
+
+        for (int partitionCount = 1; partitionCount <= cs; partitionCount++)
+        {
+            ReadCommand q = rangeSliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), tokens.first(), tokens.last(), "c0", "c9");
+            checkRows(q, PageSize.PageUnit.BYTES, partitionCount * (pks - 1), 1, 128, 256, 1024);
+        }
+    }
+
+    @Test
+    public void testMultiPartitionPagingByBytes()
+    {
+        executeInternal(String.format("TRUNCATE TABLE %s.%s", KEYSPACE_CQL, CF_CQL));
+
+        int pks = 10;
+        int cs = 10;
+
+        for (int i = 0; i < pks; i++)
+            for (int j = 0; j < cs; j++)
+                executeInternal(String.format("INSERT INTO %s.%s(k, c, v) VALUES('k%s', 'c%s', 'ignored')", KEYSPACE_CQL, CF_CQL, i, j));
+
+        // Test with rows limit (a count of 0 is expected to return every matching row):
+
+        int maxExpected = 22; // rows matched by the command group below: 2 + 4 + 6 + 10
+        for (int count = 0; count <= maxExpected; count++)
+        {
+            SinglePartitionReadCommand q1 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k0", "c0", "c1", false);
+            SinglePartitionReadCommand q2 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k1", "c0", "c3", false);
+            SinglePartitionReadCommand q3 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k2", "c0", "c5", false);
+            SinglePartitionReadCommand q4 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k3", "c0", "c9", false);
+            Group q = new Group(
+            Arrays.asList(q1, q2, q3, q4),
+            count > 0 ? DataLimits.cqlLimits(count) : DataLimits.NONE);
+            checkRows(q, PageSize.PageUnit.BYTES, count > 0 ? count : maxExpected, 1, 128, 256, 1024);
+        }
+
+        // Test with per-partition limit (4 partitions are queried, hence partitionCount * 4 expected rows):
+
+        for (int partitionCount = 1; partitionCount <= cs; partitionCount++)
+        {
+            SinglePartitionReadCommand q1 = sliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k0", "c0", "c9", false);
+            SinglePartitionReadCommand q2 = sliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k1", "c0", "c9", false);
+            SinglePartitionReadCommand q3 = sliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k2", "c0", "c9", false);
+            SinglePartitionReadCommand q4 = sliceQuery(-1, partitionCount, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL), "k3", "c0", "c9", false);
+            Group q = new Group(
+            Arrays.asList(q1, q2, q3, q4),
+            DataLimits.cqlLimits(Integer.MAX_VALUE, partitionCount));
+            checkRows(q, PageSize.PageUnit.BYTES, partitionCount * 4, 1, 128, 256, 1024);
+        }
+    }
+
+    @Test
+    public void testStaticRowsPagingByBytes()
+    {
+        executeInternal(String.format("TRUNCATE TABLE %s.%s", KEYSPACE_CQL, CF_CQL_WITH_STATIC));
+
+        int rows = 10;
+
+        for (int i = 0; i < rows; i++) // one partition per i, each with a single clustering 'c' and a static value
+            executeInternal(String.format("INSERT INTO %s.%s(k, c, st) VALUES('k%s', 'c', 0)", KEYSPACE_CQL, CF_CQL_WITH_STATIC, i));
+
+        int maxExpected = 4; // one row for each of the 4 partitions (k0..k3) queried below
+        for (int count = 0; count <= maxExpected; count++)
+        {
+            SinglePartitionReadCommand q1 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL_WITH_STATIC), "k0", "c", "c", false);
+            SinglePartitionReadCommand q2 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL_WITH_STATIC), "k1", "c", "c", false);
+            SinglePartitionReadCommand q3 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL_WITH_STATIC), "k2", "c", "c", false);
+            SinglePartitionReadCommand q4 = sliceQuery(count, -1, PageSize.NONE, cfs(KEYSPACE_CQL, CF_CQL_WITH_STATIC), "k3", "c", "c", false);
+            Group q = new Group(
+            Arrays.asList(q1, q2, q3, q4),
+            count > 0 ? DataLimits.cqlLimits(count) : DataLimits.NONE);
+            checkRows(q, PageSize.PageUnit.BYTES, count > 0 ? count : maxExpected, 1, 128, 256, 1024);
+        }
+    }
+
+    @Test
+    public void toStringTest()
+    {
+        TableMetadata metadata = TableMetadata.builder("ks", "tab")
+                                              .addPartitionKeyColumn("k", Int32Type.instance)
+                                              .addClusteringColumn("c", Int32Type.instance)
+                                              .addColumn(ColumnMetadata.regularColumn("ks", "tab", "v", Int32Type.instance))
+                                              .build();
+
+        DataLimits limits = DataLimits.cqlLimits(31, 29);
+
+        Clustering clustering = Clustering.make(bytes(11));
+        Row row = mock(Row.class);
+        when(row.clustering()).thenReturn(clustering);
+        when(row.isRow()).thenReturn(true);
+
+        PagingState state = new PagingState(ByteBufferUtil.bytes(1), PagingState.RowMark.create(metadata, row, ProtocolVersion.CURRENT), 19, 17);
+
+        SinglePartitionReadQuery singlePartitionReadQuery = mock(SinglePartitionReadQuery.class);
+        when(singlePartitionReadQuery.metadata()).thenReturn(metadata);
+        when(singlePartitionReadQuery.limits()).thenReturn(limits);
+        when(singlePartitionReadQuery.partitionKey()).thenReturn(metadata.partitioner.decorateKey(ByteBufferUtil.bytes(1)));
+        QueryPager singlePartitionPager = new SinglePartitionPager(singlePartitionReadQuery, state, ProtocolVersion.CURRENT);
+        Assertions.assertThat(singlePartitionPager.toString())
+                  .contains(limits.toString())
+                  .contains("remaining=19")
+                  .contains("remainingInPartition=17")
+                  .contains("lastReturned=c=11")
+                  .contains("lastCounter=null")
+                  .contains("lastKey=DecoratedKey(00000001, 00000001)")
+                  .contains("exhausted=false");
+
+        PartitionRangeReadQuery partitionRangeReadQuery = mock(PartitionRangeReadQuery.class);
+        when(partitionRangeReadQuery.metadata()).thenReturn(metadata);
+        when(partitionRangeReadQuery.limits()).thenReturn(limits);
+        QueryPager partitionRangeQueryPager = new PartitionRangeQueryPager(partitionRangeReadQuery, state, ProtocolVersion.CURRENT);
+        Assertions.assertThat(partitionRangeQueryPager.toString())
+                  .contains(limits.toString())
+                  .contains("remaining=19")
+                  .contains("remainingInPartition=17")
+                  .contains("lastReturnedRow=c=11")
+                  .contains("lastCounter=null")
+                  .contains("lastKey=DecoratedKey(00000001, 00000001)")
+                  .contains("lastReturnedKey=DecoratedKey(00000001, 00000001)")
+                  .contains("exhausted=false");
+
+        Group singlePartitionReadQueryGroup = Group.create(metadata,
+                                                           FBUtilities.nowInSeconds(),
+                                                           ColumnFilter.all(metadata),
+                                                           RowFilter.NONE, limits,
+                                                           Arrays.asList(metadata.partitioner.decorateKey(bytes(1)), metadata.partitioner.decorateKey(bytes(2))),
+                                                           new ClusteringIndexSliceFilter(Slices.ALL, false));
+        QueryPager multiPartitionPager = new MultiPartitionPager<>(singlePartitionReadQueryGroup, state, ProtocolVersion.CURRENT);
+        Assertions.assertThat(multiPartitionPager.toString())
+                  .contains("pagers.length=2")
+                  .contains("limit=" + limits)
+                  .contains("remaining=19")
+                  .contains("current=0");
+
+        AggregationQueryPager aggregationQueryPager = new AggregationQueryPager(singlePartitionPager, PageSize.inBytes(512), limits);
+        Assertions.assertThat(aggregationQueryPager.toString())
+                  .contains("limits=" + limits)
+                  .contains("subPageSize=512 bytes")
+                  .contains("subPager=" + singlePartitionPager)
+                  .contains("lastReturned=c=11");
+    }
 }
diff --git a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
index b3ac4a2507c9..86b139b1ad6e 100644
--- a/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
+++ b/test/unit/org/apache/cassandra/service/reads/range/RangeCommandsTest.java
@@ -28,6 +28,7 @@
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.PageSize;
 import org.apache.cassandra.db.AbstractReadCommandBuilder;
 import org.apache.cassandra.db.ColumnFamilyStore;
 import org.apache.cassandra.db.Keyspace;
@@ -215,13 +216,13 @@ public boolean isDistinct()
         }
 
         @Override
-        public DataLimits forPaging(int pageSize)
+        public DataLimits forPaging(PageSize pageSize)
         {
             return wrapped.forPaging(pageSize);
         }
 
         @Override
-        public DataLimits forPaging(int pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
+        public DataLimits forPaging(PageSize pageSize, ByteBuffer lastReturnedKey, int lastReturnedKeyRemaining)
         {
             return wrapped.forPaging(pageSize, lastReturnedKey, lastReturnedKeyRemaining);
         }
@@ -244,6 +245,18 @@ public Counter newCounter(int nowInSec, boolean assumeLiveData, boolean countPar
             return wrapped.newCounter(nowInSec, assumeLiveData, countPartitionsWithOnlyStaticData, enforceStrictLiveness);
         }
 
+        @Override
+        public int bytes()
+        {
+            return wrapped.bytes();
+        }
+
+        @Override
+        public int rows()
+        {
+            return wrapped.rows();
+        }
+
         @Override
         public int count()
         {
@@ -261,6 +274,18 @@ public DataLimits withoutState()
         {
             return wrapped.withoutState();
         }
+
+        @Override
+        public DataLimits withCountedLimit(int newCountedLimit)
+        {
+            return wrapped.withCountedLimit(newCountedLimit);
+        }
+
+        @Override
+        public DataLimits withBytesLimit(int bytesLimit)
+        {
+            return wrapped.withBytesLimit(bytesLimit);
+        }
     }
 
     public static final class MockedIndex extends StubIndex
diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
index a6de876f6d30..d19ee7573d6b 100644
--- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java
+++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java
@@ -337,7 +337,7 @@ public void queryOptionsSerDeserTest()
                 QueryOptions.create(ConsistencyLevel.ALL,
                                     Collections.singletonList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 })),
                                     false,
-                                    5000,
+                                    PageSize.inRows(5000),
                                     Util.makeSomePagingState(version),
                                     ConsistencyLevel.SERIAL,
                                     version,
@@ -353,7 +353,7 @@ public void queryOptionsSerDeserTest()
                                     Arrays.asList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 }),
                                                   ByteBuffer.wrap(new byte[] { 0x03, 0x04, 0x05, 0x03, 0x04, 0x05 })),
                                     true,
-                                    10,
+                                    PageSize.inRows(10),
                                     Util.makeSomePagingState(version),
                                     ConsistencyLevel.SERIAL,
                                     version,
@@ -369,7 +369,7 @@ public void queryOptionsSerDeserTest()
                                     Arrays.asList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 }),
                                                   ByteBuffer.wrap(new byte[] { 0x03, 0x04, 0x05, 0x03, 0x04, 0x05 })),
                                     true,
-                                    10,
+                                    PageSize.inBytes(10),
                                     Util.makeSomePagingState(version),
                                     ConsistencyLevel.SERIAL,
                                     version,
@@ -430,7 +430,7 @@ private void defaultSerialCLGuardrailsTest(ProtocolVersion version,
         QueryOptions queryOptions = QueryOptions.create(ConsistencyLevel.ALL,
                                                         Collections.singletonList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 })),
                                                         false,
-                                                        5000,
+                                                        PageSize.inRows(5000),
                                                         Util.makeSomePagingState(version),
                                                         null,
                                                         version,
@@ -487,7 +487,7 @@ private void specifiedSerialCLGuardrailsTest(ProtocolVersion version,
         QueryOptions queryOptions = QueryOptions.create(ConsistencyLevel.ALL,
                                                         Collections.singletonList(ByteBuffer.wrap(new byte[] { 0x00, 0x01, 0x02 })),
                                                         false,
-                                                        5000,
+                                                        PageSize.inRows(5000),
                                                         Util.makeSomePagingState(version),
                                                         specifiedSerialConsistency,
                                                         version,

From f00e340447f0be54cbb6ccb72a5b31baf94a14d7 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Tue, 13 Jul 2021 18:08:20 +0300
Subject: [PATCH 122/151] STAR-767: Do not copy to heap for flushing

Flushing without copying to heap (EnsureOnHeap.NOOP) was failing
because off-heap native clustering keys were used in stats metadata
without being copied, referencing memory that could be overwritten.

Also fixes a problem creating retainable/minimized versions of
clustering bounds and boundaries.
---
 .../db/AbstractArrayClusteringPrefix.java     |   2 +-
 .../db/AbstractBufferClusteringPrefix.java    |  12 +-
 .../apache/cassandra/db/ClusteringPrefix.java |   8 +-
 .../apache/cassandra/db/NativeClustering.java |  21 +-
 .../cassandra/db/memtable/TrieMemtable.java   |  10 +-
 .../sstable/metadata/MetadataCollector.java   |  17 +-
 .../cassandra/utils/ByteBufferUtil.java       |   4 +-
 .../cassandra/utils/memory/SlabAllocator.java |   4 +-
 .../cassandra/db/ClusteringBoundTest.java     |  43 ----
 .../cassandra/db/ClusteringPrefixTest.java    | 232 ++++++++++++++++++
 .../io/sstable/SSTableMetadataTest.java       |  31 ++-
 11 files changed, 306 insertions(+), 78 deletions(-)
 delete mode 100644 test/unit/org/apache/cassandra/db/ClusteringBoundTest.java
 create mode 100644 test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java

diff --git a/src/java/org/apache/cassandra/db/AbstractArrayClusteringPrefix.java b/src/java/org/apache/cassandra/db/AbstractArrayClusteringPrefix.java
index 211eeb0460cd..ee3e43e5782e 100644
--- a/src/java/org/apache/cassandra/db/AbstractArrayClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/AbstractArrayClusteringPrefix.java
@@ -49,7 +49,7 @@ public ByteBuffer[] getBufferArray()
         return out;
     }
 
-    public ClusteringPrefix<byte[]> minimize()
+    public ClusteringPrefix<byte[]> retainable()
     {
         return this;
     }
diff --git a/src/java/org/apache/cassandra/db/AbstractBufferClusteringPrefix.java b/src/java/org/apache/cassandra/db/AbstractBufferClusteringPrefix.java
index 457d0c4befa8..f8ffcaee8eec 100644
--- a/src/java/org/apache/cassandra/db/AbstractBufferClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/AbstractBufferClusteringPrefix.java
@@ -42,10 +42,18 @@ public ByteBuffer[] getBufferArray()
         return getRawValues();
     }
 
-    public ClusteringPrefix<ByteBuffer> minimize()
+    public ClusteringPrefix<ByteBuffer> retainable()
     {
         if (!ByteBufferUtil.canMinimize(values))
             return this;
-        return new BufferClustering(ByteBufferUtil.minimizeBuffers(values));
+        // Rebuild with the same kind so that bounds and boundaries are not turned into plain clusterings.
+        ByteBuffer[] minimized = ByteBufferUtil.minimizeBuffers(values);
+        if (kind.isBoundary())
+            return accessor().factory().boundary(kind, minimized);
+        if (kind.isBound())
+            return accessor().factory().bound(kind, minimized);
+
+        assert kind != Kind.STATIC_CLUSTERING;    // not minimizable
+        return accessor().factory().clustering(minimized);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
index 0a22306b6bff..59fd83b8cc6a 100644
--- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
@@ -34,7 +34,6 @@
 import org.apache.cassandra.io.util.DataOutputPlus;
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteArrayUtil;
-import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
@@ -367,10 +366,11 @@ public default String clusteringString(List<AbstractType<?>> types)
     public ByteBuffer[] getBufferArray();
 
     /**
-     * If the prefix contains byte buffers that can be minimized (see {@link ByteBufferUtil#minimalBufferFor(ByteBuffer)}),
-     * this will return a copy of the prefix with minimized values, otherwise it returns itself.
+     * Return the prefix in a form that can be retained for longer-term use. This means extracting values stored in
+     * shared memory (i.e. in memtables) to minimized on-heap versions.
+     * If the prefix is already in minimal on-heap form, it is returned as is.
      */
-    public ClusteringPrefix<V> minimize();
+    public ClusteringPrefix<V> retainable();
 
     public static class Serializer
     {
diff --git a/src/java/org/apache/cassandra/db/NativeClustering.java b/src/java/org/apache/cassandra/db/NativeClustering.java
index 0e4c19db17ef..8ac81520bf69 100644
--- a/src/java/org/apache/cassandra/db/NativeClustering.java
+++ b/src/java/org/apache/cassandra/db/NativeClustering.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.db.marshal.ValueAccessor;
 import org.apache.cassandra.utils.ObjectSizes;
 import org.apache.cassandra.utils.concurrent.OpOrder;
+import org.apache.cassandra.utils.memory.HeapAllocator;
 import org.apache.cassandra.utils.memory.MemoryUtil;
 import org.apache.cassandra.utils.memory.NativeAllocator;
 
@@ -36,11 +37,6 @@ public class NativeClustering implements Clustering<ByteBuffer>
 
     private NativeClustering() { peer = 0; }
 
-    public ClusteringPrefix<ByteBuffer> minimize()
-    {
-        return this;
-    }
-
     public NativeClustering(NativeAllocator allocator, OpOrder.Group writeOp, Clustering<?> clustering)
     {
         int count = clustering.size();
@@ -157,4 +153,19 @@ public final boolean equals(Object o)
     {
         return ClusteringPrefix.equals(this, o);
     }
+
+    public ClusteringPrefix<ByteBuffer> retainable()
+    {
+        assert kind() == Kind.CLUSTERING; // tombstones are never stored natively
+
+        // always copy to heap: the native memory backing this clustering can be overwritten later
+        ByteBuffer[] values = new ByteBuffer[size()];
+        for (int i = 0; i < values.length; ++i)
+        {
+            ByteBuffer value = get(i);
+            values[i] = value != null ? HeapAllocator.instance.clone(value) : null;
+        }
+
+        return accessor().factory().clustering(values);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
index d11eb0dc0649..ed410a42ddb9 100644
--- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
@@ -392,9 +392,9 @@ public long partitionCount()
             public Iterator<MemtablePartition> iterator()
             {
                 return Iterators.transform(toFlush.entryIterator(),
-                                           // TODO: During flushing we shouldn't need to copy partition data on heap because the memtable can't
-                                           // disappear until we are done with the flush. Figure out why EnsureOnHeap.NOOP doesn't work.
-                                           entry -> getPartitionFromTrieEntry(metadata(), allocator.ensureOnHeap(), entry));
+                                           // During flushing we are certain the memtable will remain at least until
+                                           // the flush completes. No copying to heap is necessary.
+                                           entry -> getPartitionFromTrieEntry(metadata(), EnsureOnHeap.NOOP, entry));
             }
 
             public long partitionKeySize()
@@ -660,7 +660,7 @@ public Row lastRow()
         @Override
         public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Slices slices, boolean reversed)
         {
-            return ensureOnHeap.applyToPartition(super.unfilteredIterator(selection, slices, reversed));
+            return unfilteredIterator(holder(), selection, slices, reversed);
         }
 
         @Override
@@ -673,7 +673,7 @@ public UnfilteredRowIterator unfilteredIterator(ColumnFilter selection, Navigabl
         @Override
         public UnfilteredRowIterator unfilteredIterator()
         {
-            return ensureOnHeap.applyToPartition(super.unfilteredIterator());
+            return unfilteredIterator(ColumnFilter.selection(super.columns()), Slices.ALL, false);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
index b15a9d667394..c681af956e19 100755
--- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
+++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java
@@ -287,16 +287,15 @@ public void updateClusteringValues(Clustering<?> clustering)
         if (!clusteringInitialized)
         {
             clusteringInitialized = true;
-            minClustering = clustering.minimize();
-            maxClustering = minClustering;
+            maxClustering = minClustering = clustering;
         }
         else if (comparator.compare((ClusteringPrefix<?>) clustering, (ClusteringPrefix<?>) maxClustering) > 0)
         {
-            maxClustering = clustering.minimize();
+            maxClustering = clustering;
         }
         else if (comparator.compare((ClusteringPrefix<?>) clustering, (ClusteringPrefix<?>) minClustering) < 0)
         {
-            minClustering = clustering.minimize();
+            minClustering = clustering;
         }
     }
 
@@ -312,18 +311,17 @@ public void updateClusteringValuesByBoundOrBoundary(ClusteringBoundOrBoundary<?>
         if (!clusteringInitialized)
         {
             clusteringInitialized = true;
-            minClustering = clusteringBoundOrBoundary.minimize();
-            maxClustering = minClustering;
+            maxClustering = minClustering = clusteringBoundOrBoundary;
         }
         else if (clusteringBoundOrBoundary.kind().isStart())
         {
             if (comparator.compare(clusteringBoundOrBoundary, minClustering) < 0)
-                minClustering = clusteringBoundOrBoundary.minimize();
+                minClustering = clusteringBoundOrBoundary;
         }
         else
         {
             if (comparator.compare(clusteringBoundOrBoundary, maxClustering) > 0)
-                maxClustering = clusteringBoundOrBoundary.minimize();
+                maxClustering = clusteringBoundOrBoundary;
         }
     }
 
@@ -349,7 +347,8 @@ public Map<MetadataType, MetadataComponent> finalizeMetadata(String partitioner,
                                                              estimatedTombstoneDropTime.build(),
                                                              sstableLevel,
                                                              comparator.subtypes(),
-                                                             Slice.make(minClustering, maxClustering),
+                                                             Slice.make(minClustering.retainable(),
+                                                                        maxClustering.retainable()),
                                                              hasLegacyCounterShards,
                                                              hasPartitionLevelDeletions,
                                                              repairedAt,
diff --git a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
index 4567ea128d87..6c5f2cb4a19c 100644
--- a/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
+++ b/src/java/org/apache/cassandra/utils/ByteBufferUtil.java
@@ -718,13 +718,13 @@ public static boolean isPrefix(ByteBuffer prefix, ByteBuffer value)
 
     public static boolean canMinimize(ByteBuffer buf)
     {
-        return buf != null && (buf.capacity() > buf.remaining() || !buf.hasArray());
+        return buf != null && (!buf.hasArray() || buf.array().length > buf.remaining());
     }
 
     /** trims size of bytebuffer to exactly number of bytes in it, to do not hold too much memory */
     public static ByteBuffer minimalBufferFor(ByteBuffer buf)
     {
-        return buf.capacity() > buf.remaining() || !buf.hasArray() ? ByteBuffer.wrap(getArray(buf)) : buf;
+        return !buf.hasArray() || buf.array().length > buf.remaining() ? ByteBuffer.wrap(getArray(buf)) : buf;
     }
 
     public static ByteBuffer[] minimizeBuffers(ByteBuffer[] src)
diff --git a/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
index 538cd3f175f6..6c6c862b5967 100644
--- a/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
+++ b/src/java/org/apache/cassandra/utils/memory/SlabAllocator.java
@@ -29,6 +29,7 @@
 import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.concurrent.OpOrder;
+import com.google.common.annotations.VisibleForTesting;
 
 /**
 + * The SlabAllocator is a bump-the-pointer allocator that allocates
@@ -152,7 +153,8 @@ private Region getRegion()
         }
     }
 
-    protected AbstractAllocator allocator(OpOrder.Group writeOp)
+    @VisibleForTesting
+    public AbstractAllocator allocator(OpOrder.Group writeOp)
     {
         return new ContextAllocator(writeOp, this);
     }
diff --git a/test/unit/org/apache/cassandra/db/ClusteringBoundTest.java b/test/unit/org/apache/cassandra/db/ClusteringBoundTest.java
deleted file mode 100644
index 20fcc2086c05..000000000000
--- a/test/unit/org/apache/cassandra/db/ClusteringBoundTest.java
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-public class ClusteringBoundTest
-{
-    @Test
-    public void arrayTopAndBottom()
-    {
-        Assert.assertTrue(ArrayClusteringBound.BOTTOM.isBottom());
-        Assert.assertFalse(ArrayClusteringBound.BOTTOM.isTop());
-        Assert.assertTrue(ArrayClusteringBound.TOP.isTop());
-        Assert.assertFalse(ArrayClusteringBound.TOP.isBottom());
-    }
-
-    @Test
-    public void bufferTopAndBottom()
-    {
-        Assert.assertTrue(BufferClusteringBound.BOTTOM.isBottom());
-        Assert.assertFalse(BufferClusteringBound.BOTTOM.isTop());
-        Assert.assertTrue(BufferClusteringBound.TOP.isTop());
-        Assert.assertFalse(BufferClusteringBound.TOP.isBottom());
-    }
-}
diff --git a/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java b/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java
new file mode 100644
index 000000000000..04585de14385
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/ClusteringPrefixTest.java
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.ThreadLocalRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+import java.util.function.Function;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.db.marshal.ByteArrayAccessor;
+import org.apache.cassandra.db.marshal.ByteBufferAccessor;
+import org.apache.cassandra.db.marshal.ValueAccessor;
+import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.memory.AbstractAllocator;
+import org.apache.cassandra.utils.memory.MemtablePool;
+import org.apache.cassandra.utils.memory.NativeAllocator;
+import org.apache.cassandra.utils.memory.NativePool;
+import org.apache.cassandra.utils.memory.SlabAllocator;
+import org.apache.cassandra.utils.memory.SlabPool;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class ClusteringPrefixTest
+{
+    @Test
+    public void arrayTopAndBottom()
+    {
+        Assert.assertTrue(ArrayClusteringBound.BOTTOM.isBottom());
+        Assert.assertFalse(ArrayClusteringBound.BOTTOM.isTop());
+        Assert.assertTrue(ArrayClusteringBound.TOP.isTop());
+        Assert.assertFalse(ArrayClusteringBound.TOP.isBottom());
+    }
+
+    @Test
+    public void bufferTopAndBottom()
+    {
+        Assert.assertTrue(BufferClusteringBound.BOTTOM.isBottom());
+        Assert.assertFalse(BufferClusteringBound.BOTTOM.isTop());
+        Assert.assertTrue(BufferClusteringBound.TOP.isTop());
+        Assert.assertFalse(BufferClusteringBound.TOP.isBottom());
+    }
+
+    @Test
+    public void testRetainableArray()
+    {
+        testRetainable(ByteArrayAccessor.instance.factory(), x -> new byte[][] {x.getBytes(StandardCharsets.UTF_8)});
+    }
+
+    @Test
+    public void testRetainableOnHeap()
+    {
+        testRetainable(ByteBufferAccessor.instance.factory(), x -> new ByteBuffer[] {ByteBufferUtil.bytes(x)});
+    }
+
+    @Test
+    public void testRetainableOnHeapSliced()
+    {
+        for (int prepend = 0; prepend < 3; ++prepend)
+            for (int append = 0; append < 3; ++append)
+            {
+                testRetainable(ByteBufferAccessor.instance.factory(),
+                               slicingAllocator(prepend, append));
+            }
+    }
+
+    private Function<String, ByteBuffer[]> slicingAllocator(int prepend, int append)
+    {
+        return x ->
+        {
+            ByteBuffer bytes = ByteBufferUtil.bytes(x);
+            ByteBuffer sliced = ByteBuffer.allocate(bytes.remaining() + prepend + append);
+            for (int i = 0; i < prepend; ++i)
+                sliced.put((byte) ThreadLocalRandom.current().nextInt());
+            sliced.put(bytes);
+            bytes.flip();
+            for (int i = 0; i < append; ++i)
+                sliced.put((byte) ThreadLocalRandom.current().nextInt());
+            sliced.position(prepend).limit(prepend + bytes.remaining());
+            return new ByteBuffer[]{ sliced.slice() };
+        };
+    }
+
+    @Test
+    public void testRetainableOffHeap()
+    {
+        testRetainable(ByteBufferAccessor.instance.factory(), x ->
+        {
+            ByteBuffer h = ByteBufferUtil.bytes(x);
+            ByteBuffer v = ByteBuffer.allocateDirect(h.remaining());
+            v.put(h);
+            v.flip();
+            return new ByteBuffer[] {v};
+        });
+    }
+
+    @Test
+    public void testRetainableOnHeapSlab() throws InterruptedException, TimeoutException
+    {
+        testRetainableSlab(true);
+    }
+
+    @Test
+    public void testRetainableOffHeapSlab() throws InterruptedException, TimeoutException
+    {
+        testRetainableSlab(false);
+    }
+
+    public void testRetainableSlab(boolean onHeap) throws InterruptedException, TimeoutException
+    {
+        MemtablePool pool = new SlabPool(1L << 24, onHeap ? 0 : 1L << 24, 1.0f, () -> CompletableFuture.completedFuture(false));
+        AbstractAllocator allocator = ((SlabAllocator) pool.newAllocator()).allocator(null);
+        try
+        {
+            testRetainable(ByteBufferAccessor.instance.factory(), x ->
+            {
+                ByteBuffer h = ByteBufferUtil.bytes(x);
+                ByteBuffer v = allocator.allocate(h.remaining());
+                v.put(h);
+                v.flip();
+                return new ByteBuffer[] {v};
+            });
+        }
+        finally
+        {
+            pool.shutdownAndWait(10, TimeUnit.SECONDS);
+        }
+    }
+
+    @Test
+    public void testRetainableNative() throws InterruptedException, TimeoutException
+    {
+        MemtablePool pool = new NativePool(1L << 24,1L << 24, 1.0f, () -> CompletableFuture.completedFuture(false));
+        NativeAllocator allocator = (NativeAllocator) pool.newAllocator();
+        try
+        {
+            testRetainable(ByteBufferAccessor.instance.factory(),
+                           x -> new ByteBuffer[] {ByteBufferUtil.bytes(x)},
+                           x -> x.kind() == ClusteringPrefix.Kind.CLUSTERING
+                                ? new NativeClustering(allocator, null, (Clustering) x)
+                                : x);
+        }
+        finally
+        {
+            pool.shutdownAndWait(10, TimeUnit.SECONDS);
+        }
+    }
+
+    public <V> void testRetainable(ValueAccessor.ObjectFactory<V> factory,
+                                   Function<String, V[]> allocator)
+    {
+        testRetainable(factory, allocator, null);
+    }
+
+    public <V> void testRetainable(ValueAccessor.ObjectFactory<V> factory,
+                                   Function<String, V[]> allocator,
+                                   Function<ClusteringPrefix<V>, ClusteringPrefix<V>> mapper)
+    {
+        ClusteringPrefix<V>[] clusterings = new ClusteringPrefix[]
+        {
+            factory.clustering(),
+            factory.staticClustering(),
+            factory.clustering(allocator.apply("test")),
+            factory.bound(ClusteringPrefix.Kind.INCL_START_BOUND, allocator.apply("testA")),
+            factory.bound(ClusteringPrefix.Kind.INCL_END_BOUND, allocator.apply("testB")),
+            factory.bound(ClusteringPrefix.Kind.EXCL_START_BOUND, allocator.apply("testC")),
+            factory.bound(ClusteringPrefix.Kind.EXCL_END_BOUND, allocator.apply("testD")),
+            factory.boundary(ClusteringPrefix.Kind.EXCL_END_INCL_START_BOUNDARY, allocator.apply("testE")),
+            factory.boundary(ClusteringPrefix.Kind.INCL_END_EXCL_START_BOUNDARY, allocator.apply("testF")),
+        };
+
+        if (mapper != null)
+            clusterings = Arrays.stream(clusterings)
+                                .map(mapper)
+                                .toArray(ClusteringPrefix[]::new);
+
+        testRetainable(clusterings);
+    }
+
+    public void testRetainable(ClusteringPrefix[] clusterings)
+    {
+        for (ClusteringPrefix clustering : clusterings)
+        {
+            ClusteringPrefix retainable = clustering.retainable();
+            assertEquals(clustering, retainable);
+            assertClusteringIsRetainable(retainable);
+        }
+    }
+
+    /** Asserts the prefix is array-backed, or that each non-null buffer is a non-direct heap buffer spanning its whole array. */
+    public static void assertClusteringIsRetainable(ClusteringPrefix clustering)
+    {
+        if (clustering instanceof AbstractArrayClusteringPrefix)
+            return; // has to be on-heap and minimized
+
+        assertTrue(clustering instanceof AbstractBufferClusteringPrefix);
+        for (ByteBuffer b : ((AbstractBufferClusteringPrefix) clustering).getBufferArray())
+        {
+            if (b == null)
+                continue; // null components reference no memory, nothing to verify
+            assertFalse(b.isDirect());
+            assertTrue(b.hasArray());
+            assertEquals(b.capacity(), b.remaining());
+            assertEquals(0, b.arrayOffset());
+            assertEquals(b.capacity(), b.array().length);
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
index cde8d089efc2..80aa22d5932a 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableMetadataTest.java
@@ -36,6 +36,7 @@
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.utils.ByteBufferUtil;
 
+import static org.apache.cassandra.db.ClusteringPrefixTest.assertClusteringIsRetainable;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
@@ -218,9 +219,9 @@ public void trackMaxMinColNames() throws CharacterCodingException
         {
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0)), "0col100");
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0)), "7col149");
-            // make sure the clustering values are minimised
-            assertTrue(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0).capacity() < 50);
-            assertTrue(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0).capacity() < 50);
+            // make sure stats don't reference native or off-heap data
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.start());
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.end());
         }
         String key = "row2";
 
@@ -240,9 +241,27 @@ public void trackMaxMinColNames() throws CharacterCodingException
         {
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0)), "0col100");
             assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0)), "9col298");
-            // and make sure the clustering values are still minimised after compaction
-            assertTrue(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0).capacity() < 50);
-            assertTrue(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0).capacity() < 50);
+            // make sure stats don't reference native or off-heap data
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.start());
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.end());
+        }
+
+        key = "row3";
+        new RowUpdateBuilder(store.metadata(), System.currentTimeMillis(), key)
+            .addRangeTombstone("0", "7")
+            .build()
+            .apply();
+
+        store.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS);
+        store.forceMajorCompaction();
+        assertEquals(1, store.getLiveSSTables().size());
+        for (SSTableReader sstable : store.getLiveSSTables())
+        {
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.start().bufferAt(0)), "0");
+            assertEquals(ByteBufferUtil.string(sstable.getSSTableMetadata().coveredClustering.end().bufferAt(0)), "9col298");
+            // make sure stats don't reference native or off-heap data
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.start());
+            assertClusteringIsRetainable(sstable.getSSTableMetadata().coveredClustering.end());
         }
     }
 

From b81104d1af84098978991d0f0481a2268835ad62 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 2 Apr 2021 12:57:19 +0300
Subject: [PATCH 123/151] Add Cursor walks to trie interface

(Singleton, iteration and apply)
---
 .../cassandra/db/tries/CursorFromNode.java    |  91 +++++++++++
 .../cassandra/db/tries/MemtableTrie.java      | 153 +++++++++---------
 .../cassandra/db/tries/SingletonTrie.java     |  52 ++++++
 .../org/apache/cassandra/db/tries/Trie.java   |  76 +++++++++
 .../apache/cassandra/db/tries/TrieDumper.java |   1 +
 .../db/tries/TrieEntriesIterator.java         |  36 +++--
 .../cassandra/db/tries/TrieIterator.java      | 112 -------------
 .../db/tries/TrieIteratorWithKey.java         | 126 ---------------
 .../db/tries/TrieValuesIterator.java          |  35 ++--
 .../apache/cassandra/db/tries/TrieWalker.java |  48 ++----
 10 files changed, 347 insertions(+), 383 deletions(-)
 create mode 100644 src/java/org/apache/cassandra/db/tries/CursorFromNode.java
 delete mode 100644 src/java/org/apache/cassandra/db/tries/TrieIterator.java
 delete mode 100644 src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java

diff --git a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
new file mode 100644
index 000000000000..33419fd35b37
--- /dev/null
+++ b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+public class CursorFromNode<T> implements Trie.Cursor<T> // Trie.Cursor implemented by walking Trie.Node objects obtained from trie.root()
+{
+    static final Trie.Node<Object, Trie.Node> EMPTY_NODE = new Trie.NoChildrenNode<Object, Trie.Node>(null) // stand-in used when a trie's root() is null
+    {
+        public Object content()
+        {
+            return null;
+        }
+    };
+
+    Trie.Node<T, Trie.Node> current; // node the cursor is positioned on; becomes null once advance() has returned -1
+    int level; // depth of current: 0 at the root, -1 once the walk is exhausted
+
+    CursorFromNode(Trie<T> trie)
+    {
+        current = trie.root();
+        if (current == null)
+            current = (Trie.Node<T, Trie.Node>) EMPTY_NODE; // unchecked cast; EMPTY_NODE's content() only ever returns null
+
+        level = 0;
+    }
+
+    public int advance() // descends into the next available child, climbing to ancestors when a node is exhausted; returns the new level, -1 when done
+    {
+        Trie.Remaining has = current.startIteration();
+        Trie.Node<T, Trie.Node> child = null;
+        do
+        {
+            while (has == null) // no transitions left at this node: climb until an ancestor has more
+            {
+                current = current.parentLink;
+                --level;
+                if (current == null) // climbed past the root, the walk is finished
+                {
+                    assert level == -1;
+                    return level;
+                }
+                has = current.advanceIteration();
+            }
+
+            child = current.getCurrentChild(current); // passes the node itself as the link argument (presumably becomes child.parentLink -- confirm)
+            if (child == null) // this transition yields no child, move on to the next one
+                has = current.advanceIteration();
+        }
+        while (child == null);
+        current = child;
+        return ++level;
+    }
+
+    public int level() // current depth, without advancing
+    {
+        return level;
+    }
+
+    public T content() // content of the current node; NOTE(review): current is null after advance() returned -1, so this would throw then
+    {
+        return current.content();
+    }
+
+    public int transitionAtLevel(int level) // currentTransition of the node found by climbing from current to the given level
+    {
+        int nodeLevel = this.level;
+        Trie.Node<T, Trie.Node> node = current;
+        while (nodeLevel > level) // follow parent links until the requested depth is reached
+        {
+            node = node.parentLink;
+            --nodeLevel;
+        }
+        return node.currentTransition;
+    }
+}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index a7d908989548..65a92f31a7be 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -24,6 +24,7 @@
 
 import com.google.common.annotations.VisibleForTesting;
 
+import net.nicoulaj.compilecommand.annotations.DontInline;
 import org.agrona.concurrent.UnsafeBuffer;
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.io.util.FileUtils;
@@ -560,12 +561,9 @@ private int preserveContent(int existingPreContentNode,
     /**
      * State of the walk of the given mutation trie. Passed to mutation nodes in their parentState link.
      */
-    class ApplyState<U>
+    class ApplyState
     {
-        /**
-         * The node from the mutation trie.
-         */
-        final Node<U, ApplyState<U>> mutationNode;
+        final ApplyState parentState;
 
         /**
          * Pointer to the existing node before skipping over content nodes, i.e. this is either the same as
@@ -589,9 +587,14 @@ class ApplyState<U>
          */
         int updatedPostContentNode;
 
-        ApplyState(Node<U, ApplyState<U>> mutationNode, int transition)
+        final int parentTransition;
+        final int contentIndex;
+
+        <U> ApplyState(ApplyState parentState, int transition, U mutationContent, UpsertTransformer<T, U> transformer)
         {
-            ApplyState<U> parentState = mutationNode.parentLink;
+            this.parentState = parentState;
+            this.parentTransition = transition;
+
             if (parentState == null)
                 existingPreContentNode = root;
             else
@@ -601,15 +604,49 @@ class ApplyState<U>
                                          : getChild(parentState.existingPostContentNode, transition);
             }
 
-            existingPostContentNode = followContentTransition(existingPreContentNode);
-            updatedPostContentNode = existingPostContentNode;
+            int existingContentIndex = -1;
+            if (isLeaf(existingPreContentNode))
+            {
+                existingContentIndex = ~existingPreContentNode;
+                existingPostContentNode = NONE;
+            }
+            else if (offset(existingPreContentNode) == PREFIX_OFFSET)
+            {
+                existingContentIndex = getInt(existingPreContentNode + PREFIX_CONTENT_OFFSET);
+                existingPostContentNode = followContentTransition(existingPreContentNode);
+            }
+            else
+                existingPostContentNode = existingPreContentNode;
 
-            this.mutationNode = mutationNode;
+            if (mutationContent != null)
+            {
+                if (existingContentIndex != -1)
+                {
+                    final T existingContent = getContent(existingContentIndex);
+                    T combinedContent = transformer.apply(existingContent, mutationContent);
+                    setContent(existingContentIndex, combinedContent);
+                    if (combinedContent != null)
+                        contentIndex = existingContentIndex;
+                    else
+                        contentIndex = -1;
+                }
+                else
+                {
+                    T combinedContent = transformer.apply(null, mutationContent);
+                    if (combinedContent != null)
+                        contentIndex = addContent(combinedContent);
+                    else
+                        contentIndex = -1;
+                }
+            }
+            else
+                contentIndex = existingContentIndex;
+
+            updatedPostContentNode = existingPostContentNode;
         }
 
-        private void attachChild(int ourChild) throws SpaceExhaustedException
+        private void attachChild(int transition, int ourChild) throws SpaceExhaustedException
         {
-            int transition = mutationNode.currentTransition;
             if (isNull(updatedPostContentNode))
                 updatedPostContentNode = expandOrCreateChainNode(transition, ourChild);
             else
@@ -618,41 +655,8 @@ private void attachChild(int ourChild) throws SpaceExhaustedException
                                                                        ourChild);
         }
 
-        private int applyContent(U mutationContent, UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
+        private int applyContent() throws SpaceExhaustedException
         {
-            // common case, no new content
-            if (mutationContent == null)
-                return preserveContent(existingPreContentNode, existingPostContentNode, updatedPostContentNode);
-
-            int contentIndex = -1;
-            int existingContentIndex = -1;
-
-            if (existingPreContentNode != existingPostContentNode)
-            {
-                // There is pre-existing content which must be merged with the new.
-                if (isLeaf(existingPreContentNode))
-                    existingContentIndex = ~existingPreContentNode;
-                else
-                {
-                    assert offset(existingPreContentNode) == PREFIX_OFFSET;
-                    existingContentIndex = getInt(existingPreContentNode + PREFIX_CONTENT_OFFSET);
-                }
-
-                final T existingContent = getContent(existingContentIndex);
-                T combinedContent = transformer.apply(existingContent, mutationContent);
-                setContent(existingContentIndex, combinedContent);
-                if (combinedContent != null)
-                    contentIndex = existingContentIndex;
-            }
-            else
-            {
-                // No pre-existing content.
-                T combinedContent = transformer.apply(null, mutationContent);
-                if (combinedContent != null)
-                    contentIndex = addContent(combinedContent);
-            }
-
-            // The supplied transformer may return null, e.g. to delete data. In this case we don't have a content index.
             if (contentIndex == -1)
                 return updatedPostContentNode;
 
@@ -662,22 +666,22 @@ private int applyContent(U mutationContent, UpsertTransformer<T, U> transformer)
             // We can't update in-place if there was no preexisting prefix, or if the prefix was embedded and the target
             // node must change.
             if (existingPreContentNode == existingPostContentNode ||
+                isNull(existingPostContentNode) ||
                 isEmbeddedPrefixNode(existingPreContentNode) && updatedPostContentNode != existingPostContentNode)
                 return createContentNode(contentIndex, updatedPostContentNode, isNull(existingPostContentNode));
 
             // Otherwise modify in place
             if (updatedPostContentNode != existingPostContentNode) // to use volatile write but also ensure we don't corrupt embedded nodes
                 putIntVolatile(existingPreContentNode + PREFIX_POINTER_OFFSET, updatedPostContentNode);
-            assert contentIndex == existingContentIndex;
+            assert contentIndex == getInt(existingPreContentNode + PREFIX_CONTENT_OFFSET);
             return existingPreContentNode;
         }
 
-        private ApplyState<U> attachAndMoveToParentState(UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
+        private <U> ApplyState attachAndMoveToParentState() throws SpaceExhaustedException
         {
-            ApplyState<U> parentState = mutationNode.parentLink;
+            ApplyState parentState = this.parentState;
 
-            int updatedPreContentNode = applyContent(mutationNode.content(),
-                                                     transformer);
+            int updatedPreContentNode = applyContent();
 
             if (parentState == null)
             {
@@ -693,7 +697,7 @@ private ApplyState<U> attachAndMoveToParentState(UpsertTransformer<T, U> transfo
             }
 
             if (updatedPreContentNode != existingPreContentNode)
-                parentState.attachChild(updatedPreContentNode);
+                parentState.attachChild(parentTransition, updatedPreContentNode);
 
             return parentState;
         }
@@ -729,41 +733,33 @@ public interface UpsertTransformer<T, U>
      */
     public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transformer) throws SpaceExhaustedException
     {
-        Node<U, ApplyState<U>> current = mutation.root();
-        if (current == null)
+        Cursor<U> mutationCursor = mutation.cursor();
+        if (mutationCursor.level() == -1)
             return;
+        assert mutationCursor.level() == 0;
 
-        ApplyState<U> state = new ApplyState<U>(current, current.parentLink != null ? current.parentLink.mutationNode.currentTransition : -1);
+        ApplyState state = new ApplyState(null, -1, mutationCursor.content(), transformer);
+        int prevLevel = 0;
 
-        Trie.Remaining has = current.startIteration();
         while (true)
         {
-            if (has != null)
-            {
-                // We have a transition, get child to descend into
-                Node<U, ApplyState<U>> child = current.getCurrentChild(state);
-
-                if (child == null)
-                {
-                    // no child, get next
-                    has = current.advanceIteration();
-                }
-                else
-                {
-                    state = new ApplyState<U>(child, current.currentTransition);
-                    current = child;
-                    has = current.startIteration();
-                }
-            }
-            else
+            int level = mutationCursor.advance();
+            while (prevLevel >= level)
             {
                 // There are no more children. Ascend to the parent state to continue walk.
-                state = state.attachAndMoveToParentState(transformer);
+                state = state.attachAndMoveToParentState();
+                --prevLevel;
                 if (state == null)
-                    break;
-                current = state.mutationNode;
-                has = current.advanceIteration();
+                {
+                    assert prevLevel == -1;
+                    return;
+                }
             }
+            assert level == prevLevel + 1;
+
+            // We have a transition, get child to descend into
+            state = new ApplyState(state, mutationCursor.transition(), mutationCursor.content(), transformer);
+            prevLevel = level;
         }
     }
 
@@ -819,6 +815,7 @@ public <R> void putRecursive(ByteComparable key, R value, final UpsertTransforme
             root = newRoot;
     }
 
+    @DontInline
     private <R> int putRecursive(int node, ByteSource key, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
     {
         int transition = key.next();
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
index 8c79d1e17aba..9fca393d7420 100644
--- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -118,4 +118,56 @@ public <L> Node<T, L> root()
     {
         return makeNode(null, key.asComparableBytes(BYTE_COMPARABLE_VERSION));
     }
+
+    public Cursor cursor() // returns a new cursor that reads this trie's key from its first byte
+    {
+        return new Cursor();
+    }
+
+    class Cursor implements Trie.Cursor<T> // cursor over the single key of this trie, one key byte per advance()
+    {
+        ByteSource.Peekable src = ByteSource.peekable(key.asComparableBytes(BYTE_COMPARABLE_VERSION));
+        int currentLevel = 0; // number of key bytes consumed so far; -1 once the key is exhausted
+        int currentTransition = -1; // last value returned by src.next(); -1 before the first advance()
+
+        public int advance() // consumes the next key byte; returns the new level, or -1 at the end of the key
+        {
+            currentTransition = src.next();
+            if (currentTransition != ByteSource.END_OF_STREAM)
+                return ++currentLevel;
+            else
+                return currentLevel = -1;
+        }
+
+        public int level() // current level, without advancing
+        {
+            return currentLevel;
+        }
+
+        public T content() // the value once no key bytes remain to be read, null before that
+        {
+            return src.peek() == ByteSource.END_OF_STREAM ? value : null;
+        }
+
+        public int transition() // the byte consumed by the latest advance()
+        {
+            return currentTransition;
+        }
+
+        public void retrieveKey(byte[] dest) // copies the first currentLevel key bytes into dest, re-reading the key from the start
+        {
+            ByteSource srcCopy = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+            for (int i = 0; i < currentLevel; ++i)
+                dest[i] = (byte) srcCopy.next();
+        }
+
+        public int transitionAtLevel(int level) // key byte at index level, re-read from the start of the key; -1 for a negative level
+        {
+            ByteSource srcCopy = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
+            int next = -1;
+            for (int i = 0; i <= level; ++i)
+                next = srcCopy.next(); // no (byte) cast: it would sign-extend values >= 0x80 and disagree with transition()
+            return next;
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index 2873e0a0704c..fce612464fdd 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -201,6 +201,56 @@ public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver
      */
     protected abstract <L> Node<T, L> root();
 
+    // Cursor-style walks
+    interface Cursor<T> // stateful walk over a trie; the position is reported as a level (depth), with -1 meaning the walk is done
+    {
+        int advance(); // moves to the next node and returns its level (can be prev+1 or <=prev), -1 means done
+        default int advanceMultiple() // advance, descending multiple levels if that does not require extra work (e.g. chain nodes)
+        {
+            return advance(); // default implementation: a plain single-step advance
+        }
+
+        default T advanceToContent() // advances until a position with non-null content is reached and returns that content
+        {
+            while (true)
+            {
+                int level = advanceMultiple();
+                if (level < 0)
+                    return null; // walk exhausted without finding further content
+                T content = content();
+                if (content != null)
+                    return content;
+            }
+        }
+
+//        int advanceTo(int transition); // advance to child with this transition or higher. if none exists, ascend to parent and advance
+//        default int ascend() // ignore the remaining children at this level or below and ascend to parent and advance
+//        {
+//            return advanceTo(Integer.MAX_VALUE);
+//        }
+
+        int level(); // current level, without advancing
+        default int transition() // transition recorded one level above the current position, i.e. the one that led here
+        {
+            return transitionAtLevel(level() - 1);
+        }
+        T content(); // content at the current position; advanceToContent() treats null as "no content here"
+
+        int transitionAtLevel(int level); // transition recorded at the given level of the path to the current position
+
+        default void retrieveKey(byte[] dest) // writes the path to the current position into dest[0..level()); dest must hold at least level() bytes
+        {
+            int level = level();
+            for (int i = 0; i < level; ++i)
+                dest[i] = (byte) transitionAtLevel(i);
+        }
+    }
+
+    protected Cursor<T> cursor() // default cursor: a CursorFromNode built on this trie (it starts from root())
+    {
+        return new CursorFromNode<>(this);
+    }
+
     // Version of the byte comparable conversion to use for all operations
     static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS41;
 
@@ -443,6 +493,32 @@ public <L> Node<Object, L> root()
         {
             return null;
         }
+
+        protected Cursor<Object> cursor() // cursor for this trie (whose root() is null): it is already finished when created
+        {
+            return new Cursor<Object>()
+            {
+                public int advance()
+                {
+                    return -1; // nothing to walk
+                }
+
+                public int level()
+                {
+                    return -1; // reports "done" from the start
+                }
+
+                public Object content()
+                {
+                    return null; // no content anywhere
+                }
+
+                public int transitionAtLevel(int level)
+                {
+                    return 0; // placeholder value; NOTE(review): presumably never consulted since level() is always -1 -- confirm
+                }
+            };
+        }
     };
 
     @SuppressWarnings("unchecked")
diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
index 60f11963000a..d031f6c46554 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieDumper.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
@@ -24,6 +24,7 @@
  */
 class TrieDumper<T> implements TrieWalker<T, String>
 {
+    // TODO: Test then make simpler direct version
     private final Function<T, String> contentToString;
     private final StringBuilder b = new StringBuilder();
     private int depth = -1;
diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
index e0a1d5281fe2..1ade8ddc657f 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
@@ -17,30 +17,40 @@
  */
 package org.apache.cassandra.db.tries;
 
+import java.util.AbstractMap;
+import java.util.Arrays;
 import java.util.Map;
 
+import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
 /**
  * Convertor of trie entries to iterator where each entry is passed through {@link #mapContent} (to be implemented by
  * descendants).
  */
-public abstract class TrieEntriesIterator<T, V> extends TrieIteratorWithKey<T, V>
+public abstract class TrieEntriesIterator<T, V> extends AbstractIterator<V>
 {
+    private final Trie.Cursor<T> cursor;
+
     protected TrieEntriesIterator(Trie<T> trie)
     {
-        super(trie);
+        this.cursor = trie.cursor();
     }
 
-    V contentOf(Trie.Node<T, NodeWithPosition<T>> node)
+    public V computeNext()
     {
-        T content = node.content();
-        if (content == null)
-            return null;
-        return mapContent(content, path, ppos);
+        T value = cursor.advanceToContent();
+        if (value == null)
+            return endOfData();
+
+        int keyLength = cursor.level();
+
+        byte[] array = new byte[keyLength];
+        cursor.retrieveKey(array);
+        return mapContent(value, array);
     }
 
-    protected abstract V mapContent(T content, byte[] bytes, int byteLength);
+    protected abstract V mapContent(T content, byte[] bytes);
 
     /**
      * Iterator representing the content of the trie a sequence of (path, content) pairs.
@@ -53,9 +63,15 @@ public AsEntries(Trie<T> trie)
             super(trie);
         }
 
-        protected Map.Entry<ByteComparable, T> mapContent(T content, byte[] bytes, int byteLength)
+        protected Map.Entry<ByteComparable, T> mapContent(T content, byte[] bytes)
         {
-            return toEntry(content, bytes, byteLength);
+            return toEntry(content, bytes);
         }
     }
+
+    static <T> Map.Entry<ByteComparable, T> toEntry(T content, byte[] bytes)
+    {
+        // Pair the content with a key built from the given path bytes via ByteComparable.fixedLength.
+        return new AbstractMap.SimpleImmutableEntry<>(ByteComparable.fixedLength(bytes), content);
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieIterator.java b/src/java/org/apache/cassandra/db/tries/TrieIterator.java
deleted file mode 100644
index b5591fc67fa1..000000000000
--- a/src/java/org/apache/cassandra/db/tries/TrieIterator.java
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.tries;
-
-import org.apache.cassandra.utils.AbstractIterator;
-
-/**
- * Utility class for performing some walks over the trie that result in an iterator of items.
- * See TrieValuesIterator and TrieEntriesIterator for usage.
- */
-abstract class TrieIterator<T, L, V> extends AbstractIterator<V>
-{
-    private Trie.Node<T, L> current;
-
-    protected TrieIterator(Trie<T> trie)
-    {
-        current = trie.root();
-        if (current == null)
-            endOfData();
-    }
-
-    protected V computeNext()
-    {
-        Trie.Remaining has = startIteration();
-
-        while (true)
-        {
-            if (has != null)
-            {
-                // We have a transition, get child to descend into
-                Trie.Node<T, L> child = getChild(current, has);
-
-                if (child == null)
-                {
-                    // no child, get next
-                    has = advanceIteration();
-                }
-                else
-                {
-                    // Enter node
-                    current = child;
-                    // Check payload
-                    V v = contentOf(child);
-                    if (v != null)
-                        return v; // payload was produced, wait for next()
-
-                    has = startIteration();
-                }
-            }
-            else
-            {
-                // There are no more children. Ascend to the parent state to continue walk.
-                current = exitNodeAndReturnParent(current);
-                if (current == null)
-                {
-                    // We've reached back the root, our walk is finished
-                    return endOfData();
-                }
-                has = advanceIteration();
-            }
-        }
-    }
-
-    /**
-     * Start the iteration on a node. Can be overridden by children (e.g. to skip processing branch).
-     */
-    Trie.Remaining startIteration()
-    {
-        return current.startIteration();
-    }
-
-    /**
-     * Advance the iteration on a node. Can be overridden by children (e.g. to skip processing selected transitions).
-     */
-    Trie.Remaining advanceIteration()
-    {
-        return current.advanceIteration();
-    }
-
-    // The methods below are to be overridden by subclasses.
-
-    /**
-     * Called by advance to descend into a child node.
-     */
-    abstract Trie.Node<T, L> getChild(Trie.Node<T, L> node, Trie.Remaining has);
-
-    /**
-     * Called when a node is exited.
-     * Returns the parent with which to continue the traversal.
-     */
-    abstract Trie.Node<T, L> exitNodeAndReturnParent(Trie.Node<T, L> n);
-
-    /**
-     * Called to retrieve the content to be issued for a given node (e.g. content(), or a (path, content()) pair.
-     */
-    abstract V contentOf(Trie.Node<T, L> n);
-}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java b/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java
deleted file mode 100644
index 97fb8ecbf482..000000000000
--- a/src/java/org/apache/cassandra/db/tries/TrieIteratorWithKey.java
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.tries;
-
-import java.util.AbstractMap;
-import java.util.Arrays;
-
-import org.agrona.concurrent.UnsafeBuffer;
-import org.apache.cassandra.utils.bytecomparable.ByteComparable;
-
-/**
- * Convertor of trie content to flow, with information about he paths used to reach the content node.
- * Descendants need to implement {@link TrieIterator#contentOf(Trie.Node)}; when the method is called the first
- * {@link #ppos} bytes of {@link #path} will be filled with the path used to reach the node.
- */
-public abstract class TrieIteratorWithKey<T, V>
-extends TrieIterator<T, TrieIteratorWithKey.NodeWithPosition<T>, V>
-        implements Trie.TransitionsReceiver
-{
-    byte[] path = new byte[256];
-    int ppos = 0;
-    NodeWithPosition<T> currentParentLink = (NodeWithPosition<T>) NO_LINK;
-
-    static final NodeWithPosition<Object> NO_LINK = new NodeWithPosition<>(-1, null);
-
-    static class NodeWithPosition<T>
-    {
-        final int ppos;
-        final Trie.Node<T, NodeWithPosition<T>> node;
-
-        NodeWithPosition(int ppos, Trie.Node<T, NodeWithPosition<T>> node)
-        {
-            this.ppos = ppos;
-            this.node = node;
-        }
-    }
-
-    protected TrieIteratorWithKey(Trie<T> trie)
-    {
-        super(trie);
-    }
-
-    public void add(int t)
-    {
-        if (ppos == path.length)
-            path = Arrays.copyOf(path, path.length * 2);
-        path[ppos++] = (byte) t;
-    }
-
-    public void add(UnsafeBuffer b, int pos, int count)
-    {
-        if (ppos + count > path.length)
-            path = Arrays.copyOf(path, Math.max(ppos + count + 16, path.length * 2));
-        b.getBytes(pos, path, ppos, count);
-        ppos += count;
-    }
-
-    Trie.Node<T, NodeWithPosition<T>> getChild(Trie.Node<T, NodeWithPosition<T>> node, Trie.Remaining has)
-    {
-        int currentPos = ppos;
-        add(node.currentTransition);
-
-        NodeWithPosition<T> parentLink;
-        if (has == Trie.Remaining.ONE)
-        {
-            // As in TrieValuesIterator, when we are processing the last child of a node we can skip it when backtracking.
-            parentLink = node.parentLink;
-        }
-        else
-        {
-            assert has != null;
-            // Otherwise, we need to be returning to this node. Create a parentLink object if one doesn't yet exist,
-            // saving the byte position in the path.
-            if (currentParentLink.node != node)
-            {
-                assert currentParentLink.ppos < currentPos;
-                currentParentLink = new NodeWithPosition<>(currentPos, node);
-            }
-            parentLink = currentParentLink;
-        }
-
-        Trie.Node<T, NodeWithPosition<T>> child = node.getCurrentChild(parentLink);
-
-        if (child != null)
-            child = child.getUniqueDescendant(parentLink, this);
-
-        if (child == null)
-            ppos = parentLink != null ? parentLink.ppos : 0; // restore state as we won't get an exitNodeAndReturnParent call.
-
-        return child;
-    }
-
-    Trie.Node<T, NodeWithPosition<T>> exitNodeAndReturnParent(Trie.Node<T, NodeWithPosition<T>> n)
-    {
-        NodeWithPosition<T> parentLink = n.parentLink;
-        if (parentLink == null)
-            return null;
-        else
-        {
-            ppos = parentLink.ppos;
-            currentParentLink = parentLink;
-            return parentLink.node;
-        }
-    }
-
-    static <T> java.util.Map.Entry<ByteComparable, T> toEntry(T content, byte[] bytes, int byteLength)
-    {
-        ByteComparable b = ByteComparable.fixedLength(Arrays.copyOf(bytes, byteLength));
-        return new AbstractMap.SimpleImmutableEntry<>(b, content);
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
index 7a473f80bdb0..26ca4968744e 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.db.tries;
 
+import org.apache.cassandra.utils.AbstractIterator;
+
 /**
  * Convertor of trie contents to flow.
  *
@@ -24,34 +26,21 @@
  * Java. Using {@code <>} when instantiating works, but any subclasses will also need to declare this useless type
  * argument.
  */
-class TrieValuesIterator<T, L extends Trie.Node<T, L>> extends TrieIterator<T, L, T>
+class TrieValuesIterator<T> extends AbstractIterator<T>
 {
-    public TrieValuesIterator(Trie<T> trie)
-    {
-        super(trie);
-    }
-
-    Trie.Node<T, L> getChild(Trie.Node<T, L> node, Trie.Remaining has)
-    {
-        // If we know this is the last child for this node, we can just as well skip this node when backtracking,
-        final L parentLink = has == Trie.Remaining.ONE ? node.parentLink : (L) node;
-
-        Trie.Node<T, L> child = node.getCurrentChild(parentLink);
-
-        // and as long as any child has single descendant, we don't need to backtrack to that either.
-        if (child != null)
-            child = child.getUniqueDescendant(parentLink, null);
-
-        return child;
-    }
+    private final Trie.Cursor<T> cursor;
 
-    Trie.Node<T, L> exitNodeAndReturnParent(Trie.Node<T, L> n)
+    protected TrieValuesIterator(Trie<T> trie)
     {
-        return n.parentLink;
+        cursor = trie.cursor();
     }
 
-    T contentOf(Trie.Node<T, L> node)
+    protected T computeNext()
     {
-        return node.content();
+        T value = cursor.advanceToContent();
+        if (value == null)
+            return endOfData();
+        else
+            return value;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieWalker.java b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
index a17e9907bd24..5b435e2a86e8 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieWalker.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
@@ -49,48 +49,28 @@ public interface TrieWalker<T, V>
      */
     V completion();
 
-    public static <T, V, L extends Trie.Node<T, L>> V process(TrieWalker<T, V> walker, Trie<T> trie)
+    public static <T, V> V process(TrieWalker<T, V> walker, Trie<T> trie)
     {
-        Trie.Node<T, L> current = trie.root();
-        if (current == null)
+        Trie.Cursor<T> cursor = trie.cursor();
+        if (cursor.level() == -1)
             return walker.completion();
 
-        walker.onNodeEntry(-1, current.content());
+        walker.onNodeEntry(-1, cursor.content());
 
-        Trie.Remaining has = current.startIteration();
-
-        while (true)
+        int prevLevel = 0;
+        int level = cursor.advance();
+        while (level != -1)
         {
-            if (has != null)
-            {
-                // We have a transition, get child to descend into
-                Trie.Node<T, L> child = current.getCurrentChild((L) current);
-                if (child == null)
-                {
-                    // no child, get next
-                    has = current.advanceIteration();
-                }
-                else
-                {
-                    walker.onNodeEntry(current.currentTransition, child.content());
-
-                    // We have a new child. Move to it
-                    current = child;
-                    has = child.startIteration();
-                }
-            }
-            else
+            while (prevLevel >= level)
             {
-                // There are no more children. Ascend to the parent state to continue walk.
                 walker.onNodeExit();
-                current = current.parentLink;
-                if (current == null)
-                {
-                    // We've reached back the root, our walk is finished
-                    return walker.completion();
-                }
-                has = current.advanceIteration();
+                --prevLevel;
+                assert prevLevel >= 0;
             }
+            walker.onNodeEntry(cursor.transition(), cursor.content());
+            prevLevel = level;
+            level = cursor.advance();
         }
+        return walker.completion();
     }
 }

From 09f5584695bf6e8f4f5707cce9bc31ec9c4066ac Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 2 Apr 2021 14:16:43 +0300
Subject: [PATCH 124/151] Fix write benchmark

---
 .../tries/MemtableTrieWriteBench.java         | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
index 71600812dfac..9a0287ea4505 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
@@ -25,6 +25,7 @@
 import org.apache.cassandra.io.compress.BufferType;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.infra.Blackhole;
 
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.MILLISECONDS)
@@ -47,7 +48,7 @@ public class MemtableTrieWriteBench
     final static MemtableTrie.UpsertTransformer<Byte, Byte> resolver = (x, y) -> y;
 
     @Benchmark
-    public void putSequential() throws MemtableTrie.SpaceExhaustedException
+    public void putSequential(Blackhole bh) throws MemtableTrie.SpaceExhaustedException
     {
         MemtableTrie<Byte> trie = new MemtableTrie(bufferType);
         ByteBuffer buf = ByteBuffer.allocate(keyLength);
@@ -58,10 +59,12 @@ public void putSequential() throws MemtableTrie.SpaceExhaustedException
             buf.putLong(keyLength - 8, l);
             trie.putRecursive(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
         }
+        System.out.println(trie.valuesCount());
+        bh.consume(trie);
     }
 
     @Benchmark
-    public void putRandom() throws MemtableTrie.SpaceExhaustedException
+    public void putRandom(Blackhole bh) throws MemtableTrie.SpaceExhaustedException
     {
         MemtableTrie<Byte> trie = new MemtableTrie(bufferType);
         Random rand = new Random(1);
@@ -70,12 +73,14 @@ public void putRandom() throws MemtableTrie.SpaceExhaustedException
         for (long current = 0; current < count; ++current)
         {
             rand.nextBytes(buf);
-            trie.putRecursive(ByteComparable.fixedLength(buf), buf[0], resolver);
+            trie.putRecursive(ByteComparable.fixedLength(buf), Byte.valueOf(buf[0]), resolver);
         }
+        System.out.println(trie.valuesCount());
+        bh.consume(trie);
     }
 
     @Benchmark
-    public void applySequential() throws MemtableTrie.SpaceExhaustedException
+    public void applySequential(Blackhole bh) throws MemtableTrie.SpaceExhaustedException
     {
         MemtableTrie<Byte> trie = new MemtableTrie(bufferType);
         ByteBuffer buf = ByteBuffer.allocate(keyLength);
@@ -86,10 +91,12 @@ public void applySequential() throws MemtableTrie.SpaceExhaustedException
             buf.putLong(keyLength - 8, l);
             trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
         }
+        System.out.println(trie.valuesCount());
+        bh.consume(trie);
     }
 
     @Benchmark
-    public void applyRandom() throws MemtableTrie.SpaceExhaustedException
+    public void applyRandom(Blackhole bh) throws MemtableTrie.SpaceExhaustedException
     {
         MemtableTrie<Byte> trie = new MemtableTrie(bufferType);
         Random rand = new Random(1);
@@ -100,5 +107,7 @@ public void applyRandom() throws MemtableTrie.SpaceExhaustedException
             rand.nextBytes(buf);
             trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf(buf[0]), resolver);
         }
+        System.out.println(trie.valuesCount());
+        bh.consume(trie);
     }
 }

From 658311525fb5f5f7ce0ded2beeede9bad9ab0bdc Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 2 Apr 2021 14:54:56 +0300
Subject: [PATCH 125/151] Apply state in array

---
 .../cassandra/db/tries/MemtableTrie.java      | 162 ++++++++++++------
 1 file changed, 113 insertions(+), 49 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index 65a92f31a7be..452a15091cb7 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -18,6 +18,7 @@
 package org.apache.cassandra.db.tries;
 
 import java.nio.ByteBuffer;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.NoSuchElementException;
 import java.util.concurrent.atomic.AtomicReferenceArray;
@@ -65,6 +66,7 @@ public class MemtableTrie<T> extends MemtableReadTrie<T>
 
     private int allocatedPos = 0;
     private int contentCount = 0;
+    private int maxDepth = 0;
 
     private final BufferType bufferType;    // on or off heap
 
@@ -561,21 +563,42 @@ private int preserveContent(int existingPreContentNode,
     /**
      * State of the walk of the given mutation trie. Passed to mutation nodes in their parentState link.
      */
-    class ApplyState
+    class ApplyState<U>
     {
-        final ApplyState parentState;
+        final UpsertTransformer<T, U> transformer;
+        int[] data = new int[Math.max(maxDepth + 1, 16) * 5];
+        int currentLevel = -1;
+
+        ApplyState(UpsertTransformer<T, U> transformer)
+        {
+            this.transformer = transformer;
+        }
 
         /**
          * Pointer to the existing node before skipping over content nodes, i.e. this is either the same as
          * existingPostContentNode or a pointer to a prefix or leaf node whose child is existingPostContentNode.
          */
-        final int existingPreContentNode;
+        int existingPreContentNode()
+        {
+            return data[currentLevel * 5 + 0];
+        }
+        void setExistingPreContentNode(int value)
+        {
+            data[currentLevel * 5 + 0] = value;
+        }
 
         /**
          * Pointer to the existing node being updated, after any content nodes have been skipped and before any
          * modification have been applied. Always a non-content node.
          */
-        final int existingPostContentNode;
+        int existingPostContentNode()
+        {
+            return data[currentLevel * 5 + 1];
+        }
+        void setExistingPostContentNode(int value)
+        {
+            data[currentLevel * 5 + 1] = value;
+        }
 
         /**
          * The updated node, i.e. the node to which the relevant modifications are being applied. This will change as
@@ -585,26 +608,57 @@ class ApplyState
          * applied in-place, this will be the same as existingPostContentNode, otherwise a completely different
          * pointer. Always a non-content node.
          */
-        int updatedPostContentNode;
+        int updatedPostContentNode()
+        {
+            return data[currentLevel * 5 + 2];
+        }
 
-        final int parentTransition;
-        final int contentIndex;
+        void setUpdatedPostContentNode(int value)
+        {
+            data[currentLevel * 5 + 2] = value;
+        }
 
-        <U> ApplyState(ApplyState parentState, int transition, U mutationContent, UpsertTransformer<T, U> transformer)
+        int transition()
         {
-            this.parentState = parentState;
-            this.parentTransition = transition;
+            return data[currentLevel * 5 + 3];
+        }
+        void setTransition(int transition)
+        {
+            data[currentLevel * 5 + 3] = transition;
+        }
+        int contentIndex()
+        {
+            return data[currentLevel * 5 + 4];
+        }
+        void setContentIndex(int value)
+        {
+            data[currentLevel * 5 + 4] = value;
+        }
 
-            if (parentState == null)
+        void descend(int transition, U mutationContent)
+        {
+            int existingPreContentNode;
+            if (currentLevel < 0)
                 existingPreContentNode = root;
             else
             {
-                existingPreContentNode = isNull(parentState.existingPostContentNode)
+                setTransition(transition);
+                existingPreContentNode = isNull(existingPostContentNode())
                                          ? NONE
-                                         : getChild(parentState.existingPostContentNode, transition);
+                                         : getChild(existingPostContentNode(), transition);
+            }
+
+            ++currentLevel;
+            if (currentLevel > maxDepth)
+            {
+                maxDepth = currentLevel;
+                if (currentLevel * 5 >= data.length)
+                    data = Arrays.copyOf(data, currentLevel * 5 * 2);
             }
+            setExistingPreContentNode(existingPreContentNode);
 
             int existingContentIndex = -1;
+            int existingPostContentNode;
             if (isLeaf(existingPreContentNode))
             {
                 existingContentIndex = ~existingPreContentNode;
@@ -617,7 +671,15 @@ else if (offset(existingPreContentNode) == PREFIX_OFFSET)
             }
             else
                 existingPostContentNode = existingPreContentNode;
+            setExistingPostContentNode(existingPostContentNode);
+            setUpdatedPostContentNode(existingPostContentNode);
+
+            int contentIndex = updateContentIndex(mutationContent, existingContentIndex);
+            setContentIndex(contentIndex);
+        }
 
+        private int updateContentIndex(U mutationContent, int existingContentIndex)
+        {
             if (mutationContent != null)
             {
                 if (existingContentIndex != -1)
@@ -626,43 +688,47 @@ else if (offset(existingPreContentNode) == PREFIX_OFFSET)
                     T combinedContent = transformer.apply(existingContent, mutationContent);
                     setContent(existingContentIndex, combinedContent);
                     if (combinedContent != null)
-                        contentIndex = existingContentIndex;
+                        return existingContentIndex;
                     else
-                        contentIndex = -1;
+                        return -1;
                 }
                 else
                 {
                     T combinedContent = transformer.apply(null, mutationContent);
                     if (combinedContent != null)
-                        contentIndex = addContent(combinedContent);
+                        return addContent(combinedContent);
                     else
-                        contentIndex = -1;
+                        return -1;
                 }
             }
             else
-                contentIndex = existingContentIndex;
-
-            updatedPostContentNode = existingPostContentNode;
+                return existingContentIndex;
         }
 
         private void attachChild(int transition, int ourChild) throws SpaceExhaustedException
         {
+            int updatedPostContentNode = updatedPostContentNode();
             if (isNull(updatedPostContentNode))
-                updatedPostContentNode = expandOrCreateChainNode(transition, ourChild);
+                setUpdatedPostContentNode(expandOrCreateChainNode(transition, ourChild));
             else
-                updatedPostContentNode = MemtableTrie.this.attachChild(updatedPostContentNode,
-                                                                       transition,
-                                                                       ourChild);
+                setUpdatedPostContentNode(MemtableTrie.this.attachChild(updatedPostContentNode,
+                                                                        transition,
+                                                                        ourChild));
         }
 
         private int applyContent() throws SpaceExhaustedException
         {
+            int contentIndex = contentIndex();
+            int updatedPostContentNode = updatedPostContentNode();
             if (contentIndex == -1)
                 return updatedPostContentNode;
 
             if (isNull(updatedPostContentNode))
                 return ~contentIndex;
 
+            int existingPreContentNode = existingPreContentNode();
+            int existingPostContentNode = existingPostContentNode();
+
             // We can't update in-place if there was no preexisting prefix, or if the prefix was embedded and the target
             // node must change.
             if (existingPreContentNode == existingPostContentNode ||
@@ -677,13 +743,13 @@ private int applyContent() throws SpaceExhaustedException
             return existingPreContentNode;
         }
 
-        private <U> ApplyState attachAndMoveToParentState() throws SpaceExhaustedException
+        // Returns true if there is still work to do, false if completed.
+        private boolean attachAndMoveToParentState() throws SpaceExhaustedException
         {
-            ApplyState parentState = this.parentState;
-
             int updatedPreContentNode = applyContent();
-
-            if (parentState == null)
+            int existingPreContentNode = existingPreContentNode();
+            --currentLevel;
+            if (currentLevel == -1)
             {
                 assert root == existingPreContentNode;
                 if (updatedPreContentNode != existingPreContentNode)
@@ -692,14 +758,11 @@ private <U> ApplyState attachAndMoveToParentState() throws SpaceExhaustedExcepti
                     // we don't want to invalidate the value in other cores' caches unnecessarily).
                     root = updatedPreContentNode;
                 }
-
-                return null;
+                return false;
             }
-
             if (updatedPreContentNode != existingPreContentNode)
-                parentState.attachChild(parentTransition, updatedPreContentNode);
-
-            return parentState;
+                attachChild(transition(), updatedPreContentNode);
+            return true;
         }
     }
 
@@ -737,29 +800,26 @@ public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transforme
         if (mutationCursor.level() == -1)
             return;
         assert mutationCursor.level() == 0;
-
-        ApplyState state = new ApplyState(null, -1, mutationCursor.content(), transformer);
-        int prevLevel = 0;
+        ApplyState state = new ApplyState(transformer);
+        state.descend(-1, mutationCursor.content());
+        assert state.currentLevel == 0;
 
         while (true)
         {
             int level = mutationCursor.advance();
-            while (prevLevel >= level)
+            while (state.currentLevel >= level)
             {
                 // There are no more children. Ascend to the parent state to continue walk.
-                state = state.attachAndMoveToParentState();
-                --prevLevel;
-                if (state == null)
+                if (!state.attachAndMoveToParentState())
                 {
-                    assert prevLevel == -1;
+                    assert level == -1;
                     return;
                 }
             }
-            assert level == prevLevel + 1;
 
             // We have a transition, get child to descend into
-            state = new ApplyState(state, mutationCursor.transition(), mutationCursor.content(), transformer);
-            prevLevel = level;
+            state.descend(mutationCursor.transition(), mutationCursor.content());
+            assert state.currentLevel == level;
         }
     }
 
@@ -810,23 +870,27 @@ public <R> void putSingleton(ByteComparable key,
      */
     public <R> void putRecursive(ByteComparable key, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
     {
-        int newRoot = putRecursive(root, key.asComparableBytes(BYTE_COMPARABLE_VERSION), value, transformer);
+        int newRoot = putRecursive(root, key.asComparableBytes(BYTE_COMPARABLE_VERSION), 0, value, transformer);
         if (newRoot != root)
             root = newRoot;
     }
 
     @DontInline
-    private <R> int putRecursive(int node, ByteSource key, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
+    private <R> int putRecursive(int node, ByteSource key, int depth, R value, final UpsertTransformer<T, R> transformer) throws SpaceExhaustedException
     {
         int transition = key.next();
         if (transition == ByteSource.END_OF_STREAM)
+        {
+            if (depth > maxDepth)
+                maxDepth = depth;
             return applyContent(node, value, transformer);
+        }
 
         int child = NONE;
         if (!isNull(node))
             child = getChild(node, transition);
 
-        int newChild = putRecursive(child, key, value, transformer);
+        int newChild = putRecursive(child, key, depth + 1, value, transformer);
         if (newChild == child)
             return node;
 

From abe473f85d1464a61a82600047ec3c03b771ec31 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 2 Apr 2021 15:08:34 +0300
Subject: [PATCH 126/151] Retained and reused applyState

---
 .../cassandra/db/tries/MemtableTrie.java      | 24 ++++++++++---------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index 452a15091cb7..a25f563faf3e 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -560,18 +560,19 @@ private int preserveContent(int existingPreContentNode,
         return updatePrefixNodeChild(existingPreContentNode, updatedPostContentNode);
     }
 
+    final ApplyState applyState = new ApplyState();
+
     /**
      * State of the walk of the given mutation trie. Passed to mutation nodes in their parentState link.
      */
-    class ApplyState<U>
+    class ApplyState
     {
-        final UpsertTransformer<T, U> transformer;
-        int[] data = new int[Math.max(maxDepth + 1, 16) * 5];
+        int[] data = new int[16 * 5];
         int currentLevel = -1;
 
-        ApplyState(UpsertTransformer<T, U> transformer)
+        void reset()
         {
-            this.transformer = transformer;
+            currentLevel = -1;
         }
 
         /**
@@ -635,7 +636,7 @@ void setContentIndex(int value)
             data[currentLevel * 5 + 4] = value;
         }
 
-        void descend(int transition, U mutationContent)
+        <U> void descend(int transition, U mutationContent, final UpsertTransformer<T, U> transformer)
         {
             int existingPreContentNode;
             if (currentLevel < 0)
@@ -674,11 +675,11 @@ else if (offset(existingPreContentNode) == PREFIX_OFFSET)
             setExistingPostContentNode(existingPostContentNode);
             setUpdatedPostContentNode(existingPostContentNode);
 
-            int contentIndex = updateContentIndex(mutationContent, existingContentIndex);
+            int contentIndex = updateContentIndex(mutationContent, existingContentIndex, transformer);
             setContentIndex(contentIndex);
         }
 
-        private int updateContentIndex(U mutationContent, int existingContentIndex)
+        private <U> int updateContentIndex(U mutationContent, int existingContentIndex, final UpsertTransformer<T, U> transformer)
         {
             if (mutationContent != null)
             {
@@ -800,8 +801,9 @@ public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transforme
         if (mutationCursor.level() == -1)
             return;
         assert mutationCursor.level() == 0;
-        ApplyState state = new ApplyState(transformer);
-        state.descend(-1, mutationCursor.content());
+        ApplyState state = applyState;
+        state.reset();
+        state.descend(-1, mutationCursor.content(), transformer);
         assert state.currentLevel == 0;
 
         while (true)
@@ -818,7 +820,7 @@ public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transforme
             }
 
             // We have a transition, get child to descend into
-            state.descend(mutationCursor.transition(), mutationCursor.content());
+            state.descend(mutationCursor.transition(), mutationCursor.content(), transformer);
             assert state.currentLevel == level;
         }
     }

From c85baf1544dbd1b54eaf16c0fef792afbaa6fc8c Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Sat, 3 Apr 2021 10:07:25 +0300
Subject: [PATCH 127/151] Cursor implementation for MemtableTrie

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 206 +++++++++++++++++-
 .../cassandra/db/tries/MemtableTrie.java      |   1 -
 2 files changed, 199 insertions(+), 8 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 7aad6a7773b4..8f07c0804f46 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -17,6 +17,7 @@
  */
 package org.apache.cassandra.db.tries;
 
+import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicReferenceArray;
 import java.util.function.Function;
 
@@ -181,6 +182,7 @@ Block offsets used to identify node types (by comparing them to the node 'pointe
      */
     static final int NONE = 0;
 
+    int maxDepth = 0;
     volatile int root;
 
     /*
@@ -427,7 +429,7 @@ int getSparseChild(int node, int trans)
     int splitNodeMidIndex(int trans)
     {
         // first 2 bytes of the 2-3-3 split
-        return (trans >> 6) & 0x3;
+        return (trans >> 6);
     }
 
     /** Given a transition, returns the corresponding index (within the mid block) of the pointer to the tail block of
@@ -535,9 +537,7 @@ public Remaining advanceIteration()
 
         Remaining nextValid(int trans)
         {
-            if (trans >= 0x100)
-                return null;
-
+            assert trans >= 0 && trans <= 0x100;
             // Splits the 2-3-3 parts of the transition
             int midIndex = splitNodeMidIndex(trans);
             int tailIdx = splitNodeTailIndex(trans);
@@ -818,15 +818,207 @@ void dump(int indent, StringBuilder b, Function<T, String> contentToString)
         }
     }
 
+    public <L> BaseNode<L> root()
+    {
+        return makeNode(root, null);
+    }
+
+
     /*
-     Direct read methods
+     Cursor implementation
      */
 
-    public <L> BaseNode<L> root()
+    class MemtableCursor implements Cursor<T>
     {
-        return makeNode(root, null);
+        int[] nodes = new int[maxDepth + 1];
+        byte[] transitions = new byte[maxDepth + 1];
+        T content;
+        int level = -1;
+
+        MemtableCursor()
+        {
+            descendInto(root);
+        }
+
+        public int advance()
+        {
+            int transition = -1;
+            while (true)
+            {
+                int child = advanceToNextChild(level, transition);
+                if (child != NONE)
+                    return descendInto(child);
+
+                --level;
+                if (level == -1)
+                    return level;
+                transition = transitions[level] & 0xFF;
+            }
+        }
+
+        private int descendInto(int child)
+        {
+            ++level;
+            maybeGrow(level);
+            content = getNodeContent(child);
+            nodes[level] = followContentTransition(child);
+            return level;
+        }
+
+        private void maybeGrow(int level)
+        {
+            if (level >= nodes.length)
+            {
+                // A concurrent update may have increased the max depth beyond what we saw at initialization.
+                // There's a happens before between us getting here, reading a volatile write, and the writer thread
+                // updating maxDepth, so if we read it now we should get an updated value.
+                int newLength = maxDepth + 1;
+                assert level < newLength;
+                nodes = Arrays.copyOf(nodes, newLength);
+                transitions = Arrays.copyOf(transitions, newLength);
+            }
+        }
+
+        int advanceToNextChild(int level, int transition)
+        {
+            int node = nodes[level];
+            if (isNull(node))
+                return NONE;
+
+            switch (offset(node))
+            {
+            case SPLIT_OFFSET:
+                return nextValidSplitTransition(node, transition + 1);
+            case SPARSE_OFFSET:
+                return nextValidSparseTransition(node, transition + 1);
+            default:
+                return nextValidChainTransition(node, transition + 1);
+            }
+        }
+
+        int nextValidSplitTransition(int node, int trans)
+        {
+            assert trans >= 0 && trans <= 0x100;
+            // Splits the 2-3-3 parts of the transition
+            int midIndex = splitNodeMidIndex(trans);
+            int tailIdx = splitNodeTailIndex(trans);
+            int childIdx = splitNodeChildIndex(trans);
+
+            while (midIndex < 4)
+            {
+                int mid = getInt(node + SPLIT_POINTER_OFFSET + midIndex * 4);
+                if (!isNull(mid))
+                {
+                    while (tailIdx < 8)
+                    {
+                        int tail = getInt(mid + tailIdx * 4);
+                        if (!isNull(tail))
+                        {
+                            while (childIdx < 8)
+                            {
+                                int child = getInt(tail + childIdx * 4);
+                                if (!isNull(child))
+                                {
+                                    transitions[level] = (byte) ((midIndex << 6) | (tailIdx << 3) | childIdx);
+                                    return child;
+                                }
+                                ++childIdx;
+                            }
+                        }
+                        childIdx = 0;
+                        ++tailIdx;
+                    }
+                }
+                tailIdx = 0;
+                ++midIndex;
+            }
+            return NONE;
+        }
+
+        private int nextValidSparseTransition(int node, int transition)
+        {
+            int minValid = Integer.MAX_VALUE;
+            int minChild = NONE;
+
+            for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
+            {
+                int child = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);
+                int t = getByte(node + SPARSE_BYTES_OFFSET + i);
+                if (child != NONE && t >= transition && t < minValid)
+                {
+                    minValid = t;
+                    minChild = child;
+                }
+            }
+            transitions[level] = (byte) minValid;  // don't care if we override with garbage on no match
+            return minChild;
+        }
+
+        private int nextValidChainTransition(int node, int transition)
+        {
+            int chainByte = getByte(node);
+            if (transition <= chainByte)
+            {
+                transitions[level] = (byte) chainByte;
+                int next = node + 1;
+                return offset(next) <= CHAIN_MAX_OFFSET
+                       ? next
+                       : getInt(next);
+            }
+            else
+                return NONE;
+        }
+
+        public int advanceMultiple()
+        {
+            int node = nodes[level];
+            if (isNull(node) || offset(node) > CHAIN_MAX_OFFSET)
+                return advance();
+
+            int pointer = chainBlockChildPointer(node);
+            int length = pointer - node;
+            UnsafeBuffer buffer = getBuffer(node);
+            int ofs = getOffset(node);
+
+            maybeGrow(level + length);
+            buffer.getBytes(ofs, transitions, level, length);
+            Arrays.fill(nodes, level, level + length, NONE);
+
+            level += length - 1; // compensate for increase below
+            return descendInto(getInt(pointer));
+        }
+
+        public int level()
+        {
+            return level;
+        }
+
+        public T content()
+        {
+            return content;
+        }
+
+        public int transitionAtLevel(int level)
+        {
+            return transitions[level];
+        }
+
+        public void retrieveKey(byte[] dest)
+        {
+            System.arraycopy(transitions, 0, dest, 0, level);
+        }
+    }
+
+    public MemtableCursor cursor()
+    {
+        return new MemtableCursor();
     }
 
+
+    /*
+     Direct read methods
+     */
+
     /**
      * Get the content mapped by the specified key.
      * Fast implementation using integer node addresses.
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index a25f563faf3e..7ca5ac9d9caf 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -66,7 +66,6 @@ public class MemtableTrie<T> extends MemtableReadTrie<T>
 
     private int allocatedPos = 0;
     private int contentCount = 0;
-    private int maxDepth = 0;
 
     private final BufferType bufferType;    // on or off heap
 

From 989351f5ed5982dad3d52a0c653853afcaebe91b Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Tue, 6 Apr 2021 11:55:47 +0300
Subject: [PATCH 128/151] Use TransitionsReceiver, only store backtracking
 steps

---
 .../cassandra/db/tries/CursorFromNode.java    |  13 +-
 .../cassandra/db/tries/MemtableReadTrie.java  | 166 +++++++++++-------
 .../cassandra/db/tries/MemtableTrie.java      |   2 +-
 .../cassandra/db/tries/SingletonTrie.java     |   2 +-
 .../org/apache/cassandra/db/tries/Trie.java   |  43 +++--
 .../db/tries/TrieEntriesIterator.java         |  42 +++--
 .../db/tries/TrieValuesIterator.java          |   2 +-
 .../apache/cassandra/db/tries/TrieWalker.java |   2 +-
 .../db/tries/MemtableTrieTestBase.java        |   4 +-
 9 files changed, 162 insertions(+), 114 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
index 33419fd35b37..5ef9a46050f4 100644
--- a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
+++ b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
@@ -77,15 +77,10 @@ public T content()
         return current.content();
     }
 
-    public int transitionAtLevel(int level)
+    public int incomingTransition()
     {
-        int nodeLevel = this.level;
-        Trie.Node<T, Trie.Node> node = current;
-        while (nodeLevel > level)
-        {
-            node = node.parentLink;
-            --nodeLevel;
-        }
-        return node.currentTransition;
+
+        Trie.Node<T, Trie.Node> parent = current.parentLink;
+        return parent != null ? parent.currentTransition : -1;
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 8f07c0804f46..4655929f8b9d 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -830,60 +830,82 @@ public <L> BaseNode<L> root()
 
     class MemtableCursor implements Cursor<T>
     {
-        int[] nodes = new int[maxDepth + 1];
-        byte[] transitions = new byte[maxDepth + 1];
-        T content;
-        int level = -1;
+        private int[] backtrack = new int[48];
+        private int backtrackLevel = 0;
+
+        private int currentNode;
+
+        private int incomingTransition;
+        private T content;
+        private int level = -1;
 
         MemtableCursor()
         {
-            descendInto(root);
+            descendInto(root, -1);
         }
 
+        private int node(int backtrackLevel)
+        {
+            return backtrack[backtrackLevel * 3 + 0];
+        }
+        private int data(int backtrackLevel)
+        {
+            return backtrack[backtrackLevel * 3 + 1];
+        }
+        private int level(int backtrackLevel)
+        {
+            return backtrack[backtrackLevel * 3 + 2];
+        }
+        void addBacktrack(int node, int data, int level)
+        {
+            if (backtrackLevel * 3 >= backtrack.length)
+                backtrack = Arrays.copyOf(backtrack, backtrack.length * 2);
+            backtrack[backtrackLevel * 3 + 0] = node;
+            backtrack[backtrackLevel * 3 + 1] = data;
+            backtrack[backtrackLevel * 3 + 2] = level;
+            ++backtrackLevel;
+        }
+
+        @Override
         public int advance()
         {
             int transition = -1;
             while (true)
             {
-                int child = advanceToNextChild(level, transition);
-                if (child != NONE)
-                    return descendInto(child);
-
-                --level;
-                if (level == -1)
+                if (advanceToNextChild(currentNode, transition))
                     return level;
-                transition = transitions[level] & 0xFF;
+
+                if (--backtrackLevel < 0)
+                    return -1;
+
+                level = level(backtrackLevel);
+                currentNode = node(backtrackLevel);
+                transition = data(backtrackLevel);
             }
         }
 
-        private int descendInto(int child)
+        private int descendInto(int child, int transition)
         {
             ++level;
-            maybeGrow(level);
+            incomingTransition = transition;
             content = getNodeContent(child);
-            nodes[level] = followContentTransition(child);
+            currentNode = followContentTransition(child);
             return level;
         }
 
-        private void maybeGrow(int level)
+        private int descendIntoChain(int child, int transition)
         {
-            if (level >= nodes.length)
-            {
-                // A concurrent update may have increased the max depth beyond what we saw at initialization.
-                // There's a happens before between us getting here, reading a volatile write, and the writer thread
-                // updating maxDepth, so if we read it now we should get an updated value.
-                int newLength = maxDepth + 1;
-                assert level < newLength;
-                nodes = Arrays.copyOf(nodes, newLength);
-                transitions = Arrays.copyOf(transitions, newLength);
-            }
+            ++level;
+            incomingTransition = transition;
+            content = null;
+            currentNode = child;
+            return level;
         }
 
-        int advanceToNextChild(int level, int transition)
+        private boolean advanceToNextChild(int node, int transition)
         {
-            int node = nodes[level];
             if (isNull(node))
-                return NONE;
+                return false;
 
             switch (offset(node))
             {
@@ -892,11 +914,11 @@ int advanceToNextChild(int level, int transition)
             case SPARSE_OFFSET:
                 return nextValidSparseTransition(node, transition + 1);
             default:
-                return nextValidChainTransition(node, transition + 1);
+                return getChainTransition(node);
             }
         }
 
-        int nextValidSplitTransition(int node, int trans)
+        private boolean nextValidSplitTransition(int node, int trans)
         {
             assert trans >= 0 && trans <= 0x100;
             // Splits the 2-3-3 parts of the transition
@@ -919,8 +941,10 @@ int nextValidSplitTransition(int node, int trans)
                                 int child = getInt(tail + childIdx * 4);
                                 if (!isNull(child))
                                 {
-                                    transitions[level] = (byte) ((midIndex << 6) | (tailIdx << 3) | childIdx);
-                                    return child;
+                                    int transition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
+                                    addBacktrack(node, transition, level);
+                                    descendInto(child, transition);
+                                    return true;
                                 }
                                 ++childIdx;
                             }
@@ -932,60 +956,71 @@ int nextValidSplitTransition(int node, int trans)
                 tailIdx = 0;
                 ++midIndex;
             }
-            return NONE;
+            return false;
         }
 
-        private int nextValidSparseTransition(int node, int transition)
+        private boolean nextValidSparseTransition(int node, int transition)
         {
             int minValid = Integer.MAX_VALUE;
             int minChild = NONE;
+            int validCount = 0;
 
             for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
             {
                 int child = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);
+                if (child == NONE)
+                    break;
                 int t = getByte(node + SPARSE_BYTES_OFFSET + i);
-                if (child != NONE && t >= transition && t < minValid)
+                if (t >= transition)
                 {
-                    minValid = t;
-                    minChild = child;
+                    if (t < minValid)
+                    {
+                        minValid = t;
+                        minChild = child;
+                    }
+                    ++validCount;
                 }
             }
-            transitions[level] = (byte) minValid;  // don't care if we override with garbage on no match
-            return minChild;
+            if (validCount == 0)
+                return false;
+
+            if (validCount > 1)
+                addBacktrack(node, minValid, level);
+
+            descendInto(minChild, minValid);
+            return true;
         }
 
-        private int nextValidChainTransition(int node, int transition)
+        private boolean getChainTransition(int node)
         {
-            int chainByte = getByte(node);
-            if (transition <= chainByte)
-            {
-                transitions[level] = (byte) chainByte;
-                int next = node + 1;
-                return offset(next) <= CHAIN_MAX_OFFSET
-                       ? next
-                       : getInt(next);
-            }
+            // no backtracking needed
+            int transition = getByte(node);
+            int next = node + 1;
+            if (offset(next) <= CHAIN_MAX_OFFSET)
+                descendIntoChain(next, transition);
             else
-                return NONE;
+                descendInto(getInt(next), transition);
+            return true;
         }
 
-        public int advanceMultiple()
+        @Override
+        public int advanceMultiple(TransitionsReceiver receiver)
         {
-            int node = nodes[level];
+            int node = currentNode;
             if (isNull(node) || offset(node) > CHAIN_MAX_OFFSET)
-                return advance();
+                return Cursor.advanceMultiple(this, receiver);
 
             int pointer = chainBlockChildPointer(node);
             int length = pointer - node;
-            UnsafeBuffer buffer = getBuffer(node);
-            int ofs = getOffset(node);
-
-            maybeGrow(level + length);
-            buffer.getBytes(ofs, transitions, level, length);
-            Arrays.fill(nodes, level, level + length, NONE);
+            if (receiver != null)
+            {
+                UnsafeBuffer buffer = getBuffer(node);
+                int ofs = getOffset(node);
+                receiver.add(buffer, ofs, length);
+            }
 
             level += length - 1; // compensate for increase below
-            return descendInto(getInt(pointer));
+            return descendInto(getInt(pointer), -1);
         }
 
         public int level()
@@ -998,14 +1033,9 @@ public T content()
             return content;
         }
 
-        public int transitionAtLevel(int level)
-        {
-            return transitions[level];
-        }
-
-        public void retrieveKey(byte[] dest)
+        public int incomingTransition()
         {
-            System.arraycopy(transitions, 0, dest, 0, level);
+            return incomingTransition;
         }
     }
 
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index 7ca5ac9d9caf..4c5e5906c55b 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -819,7 +819,7 @@ public <U> void apply(Trie<U> mutation, final UpsertTransformer<T, U> transforme
             }
 
             // We have a transition, get child to descend into
-            state.descend(mutationCursor.transition(), mutationCursor.content(), transformer);
+            state.descend(mutationCursor.incomingTransition(), mutationCursor.content(), transformer);
             assert state.currentLevel == level;
         }
     }
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
index 9fca393d7420..c87fe48ba527 100644
--- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -149,7 +149,7 @@ public T content()
             return src.peek() == ByteSource.END_OF_STREAM ? value : null;
         }
 
-        public int transition()
+        public int incomingTransition()
         {
             return currentTransition;
         }
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index fce612464fdd..de06eff5748b 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -98,6 +98,8 @@ protected interface TransitionsReceiver
         void add(int t);
         /** Add the count bytes from position pos at the given buffer. */
         void add(UnsafeBuffer b, int pos, int count);
+        /** Delete all bytes beyond the given length. */
+        void reset(int newLength);
     }
 
     /**
@@ -205,16 +207,31 @@ public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver
     interface Cursor<T>
     {
         int advance(); // returns level (can be prev+1 or <=prev), -1 means done
-        default int advanceMultiple() // advance, descending multiple levels if that does not require extra work (e.g. chain nodes)
+        default int advanceMultiple(TransitionsReceiver receiver) // advance, descending multiple levels if that does not require extra work (e.g. chain nodes)
         {
-            return advance();
+            return advanceMultiple(this, receiver);
         }
 
-        default T advanceToContent() // advances all the way (to next content)
+        static int advanceMultiple(Cursor c, TransitionsReceiver receiver)
+        {
+            if (receiver == null)
+                return c.advance();
+
+            int prevLevel = c.level();
+            int level = c.advance();
+            if (level < 0)
+                return level;
+            if (level <= prevLevel)
+                receiver.reset(level - 1);
+            receiver.add(c.incomingTransition());
+            return level;
+        }
+
+        default T advanceToContent(TransitionsReceiver receiver) // advances all the way (to next content)
         {
             while (true)
             {
-                int level = advanceMultiple();
+                int level = advanceMultiple(receiver);
                 if (level < 0)
                     return null;
                 T content = content();
@@ -230,20 +247,8 @@ default T advanceToContent() // advances all the way (to next content)
 //        }
 
         int level(); // return current state
-        default int transition()
-        {
-            return transitionAtLevel(level() - 1);
-        }
+        int incomingTransition(); // not set in advanceMultiple/ToCursor
         T content();
-
-        int transitionAtLevel(int level);
-
-        default void retrieveKey(byte[] dest) // length is the level
-        {
-            int level = level();
-            for (int i = 0; i < level; ++i)
-                dest[i] = (byte) transitionAtLevel(i);
-        }
     }
 
     protected Cursor<T> cursor()
@@ -513,9 +518,9 @@ public Object content()
                     return null;
                 }
 
-                public int transitionAtLevel(int level)
+                public int incomingTransition()
                 {
-                    return 0;
+                    return -1;
                 }
             };
         }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
index 1ade8ddc657f..825bcb7cd5a9 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
@@ -21,6 +21,7 @@
 import java.util.Arrays;
 import java.util.Map;
 
+import org.agrona.concurrent.UnsafeBuffer;
 import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
@@ -28,9 +29,11 @@
  * Convertor of trie entries to iterator where each entry is passed through {@link #mapContent} (to be implemented by
  * descendants).
  */
-public abstract class TrieEntriesIterator<T, V> extends AbstractIterator<V>
+public abstract class TrieEntriesIterator<T, V> extends AbstractIterator<V> implements Trie.TransitionsReceiver
 {
     private final Trie.Cursor<T> cursor;
+    private byte[] keyBytes = new byte[32];
+    private int keyPos = 0;
 
     protected TrieEntriesIterator(Trie<T> trie)
     {
@@ -39,18 +42,35 @@ protected TrieEntriesIterator(Trie<T> trie)
 
     public V computeNext()
     {
-        T value = cursor.advanceToContent();
+        T value = cursor.advanceToContent(this);
         if (value == null)
             return endOfData();
 
-        int keyLength = cursor.level();
+        return mapContent(value, keyBytes, keyPos);
+    }
+
+    public void add(int t)
+    {
+        if (keyPos >= keyBytes.length)
+            keyBytes = Arrays.copyOf(keyBytes, keyPos * 2);
+        keyBytes[keyPos++] = (byte) t;
+    }
 
-        byte[] array = new byte[keyLength];
-        cursor.retrieveKey(array);
-        return mapContent(value, array);
+    public void add(UnsafeBuffer b, int pos, int count)
+    {
+        int newPos = keyPos + count;
+        if (newPos > keyBytes.length)
+            keyBytes = Arrays.copyOf(keyBytes, Math.max(newPos + 16, keyBytes.length * 2));
+        b.getBytes(pos, keyBytes, keyPos, count);
+        keyPos = newPos;
+    }
+
+    public void reset(int newLength)
+    {
+        keyPos = newLength;
     }
 
-    protected abstract V mapContent(T content, byte[] bytes);
+    protected abstract V mapContent(T content, byte[] bytes, int byteLength);
 
     /**
      * Iterator representing the content of the trie a sequence of (path, content) pairs.
@@ -63,15 +83,15 @@ public AsEntries(Trie<T> trie)
             super(trie);
         }
 
-        protected Map.Entry<ByteComparable, T> mapContent(T content, byte[] bytes)
+        protected Map.Entry<ByteComparable, T> mapContent(T content, byte[] bytes, int byteLength)
         {
-            return toEntry(content, bytes);
+            return toEntry(content, bytes, byteLength);
         }
     }
 
-    static <T> java.util.Map.Entry<ByteComparable, T> toEntry(T content, byte[] bytes)
+    static <T> java.util.Map.Entry<ByteComparable, T> toEntry(T content, byte[] bytes, int byteLength)
     {
-        ByteComparable b = ByteComparable.fixedLength(bytes);
+        ByteComparable b = ByteComparable.fixedLength(Arrays.copyOf(bytes, byteLength));
         return new AbstractMap.SimpleImmutableEntry<>(b, content);
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
index 26ca4968744e..c05e0649e0f8 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
@@ -37,7 +37,7 @@ protected TrieValuesIterator(Trie<T> trie)
 
     protected T computeNext()
     {
-        T value = cursor.advanceToContent();
+        T value = cursor.advanceToContent(null);
         if (value == null)
             return endOfData();
         else
diff --git a/src/java/org/apache/cassandra/db/tries/TrieWalker.java b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
index 5b435e2a86e8..37a9fb0c68c3 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieWalker.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
@@ -67,7 +67,7 @@ public static <T, V> V process(TrieWalker<T, V> walker, Trie<T> trie)
                 --prevLevel;
                 assert prevLevel >= 0;
             }
-            walker.onNodeEntry(cursor.transition(), cursor.content());
+            walker.onNodeEntry(cursor.incomingTransition(), cursor.content());
             prevLevel = level;
             level = cursor.advance();
         }
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
index 480cfb2e0e8f..fa2ca8ef0b42 100644
--- a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
@@ -23,14 +23,12 @@
 import java.util.function.Function;
 import java.util.stream.Stream;
 
-import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Throwables;
 import com.google.common.collect.HashMultiset;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
 import com.google.common.collect.Multiset;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import org.apache.cassandra.io.compress.BufferType;
@@ -289,7 +287,7 @@ public void testDirect()
                           trie.sizeOnHeap(), trie.sizeOffHeap(), onh, keysize, ts);
         System.out.format("per entry on heap %.2f off heap %.2f measured %.2f keys %.2f treemap %.2f\n",
                           trie.sizeOnHeap() * 1.0 / COUNT, trie.sizeOffHeap() * 1.0 / COUNT, onh * 1.0 / COUNT, keysize * 1.0 / COUNT, ts * 1.0 / COUNT);
-        // System.out.println("Trie " + trie.dump(ByteBufferUtil::bytesToHex).get());
+        // System.out.println("Trie " + trie.dump(ByteBufferUtil::bytesToHex));
 
         assertSameContent(trie, content);
         checkGet(trie, content);

From 3b1f8da79bc91abbf2eed9ca1bfd0fbe416eab52 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 7 Apr 2021 11:50:01 +0300
Subject: [PATCH 129/151] Intersection works, but RangeTrieSet is very ugly

---
 .../cassandra/db/tries/CursorFromNode.java    |  13 +-
 .../cassandra/db/tries/MemtableReadTrie.java  |  18 +-
 .../cassandra/db/tries/RangeTrieSet.java      | 291 ++++++++++++++++--
 .../db/tries/SetIntersectionTrie.java         | 123 +++++++-
 .../cassandra/db/tries/SingletonTrie.java     |   5 +
 .../org/apache/cassandra/db/tries/Trie.java   |  15 +-
 .../db/tries/TrieEntriesIterator.java         |  30 +-
 .../apache/cassandra/db/tries/TrieSet.java    |  64 +---
 .../db/tries/TrieValuesIterator.java          |  30 +-
 .../utils/bytecomparable/ByteComparable.java  |  33 ++
 .../utils/bytecomparable/ByteSource.java      |  20 ++
 .../db/tries/SetIntersectionTrieTest.java     | 131 +++++++-
 12 files changed, 662 insertions(+), 111 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
index 5ef9a46050f4..54e0d520ef33 100644
--- a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
+++ b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
@@ -42,7 +42,11 @@ public Object content()
 
     public int advance()
     {
-        Trie.Remaining has = current.startIteration();
+        return advance(current.startIteration());
+    }
+
+    private int advance(Trie.Remaining has)
+    {
         Trie.Node<T, Trie.Node> child = null;
         do
         {
@@ -67,6 +71,13 @@ public int advance()
         return ++level;
     }
 
+    public int ascend()
+    {
+        --level;
+        current = current.parentLink;
+        return advance(current.advanceIteration());
+    }
+
     public int level()
     {
         return level;
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 4655929f8b9d..ab9d8e733e5b 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -869,7 +869,11 @@ void addBacktrack(int node, int data, int level)
         @Override
         public int advance()
         {
-            int transition = -1;
+            return advance(-1);
+        }
+
+        private int advance(int transition)
+        {
             while (true)
             {
                 if (advanceToNextChild(currentNode, transition))
@@ -884,6 +888,18 @@ public int advance()
             }
         }
 
+        @Override
+        public int ascend()
+        {
+            if (--backtrackLevel < 0)
+                return -1;
+
+            level = level(backtrackLevel);
+            currentNode = node(backtrackLevel);
+            int transition = data(backtrackLevel);
+            return advance(transition);
+        }
+
         private int descendInto(int child, int transition)
         {
             ++level;
diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index 2dfe7ccd9ca0..229476bab295 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -17,6 +17,8 @@
  */
 package org.apache.cassandra.db.tries;
 
+import java.util.Arrays;
+
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
@@ -47,25 +49,271 @@ public class RangeTrieSet extends TrieSet
         this.includeRight = includeRight;
     }
 
-    public SetNode root()
+    protected Cursor<InSet> cursor()
+    {
+        return new RangeCursor();
+    }
+
+    private class RangeCursor implements Cursor<InSet>
+    {
+        private int[] backlog;
+        int backlogPos;
+        private ByteSource remainingLeftLimit;
+        private ByteSource remainingRightLimit;
+        boolean atLeftLimit;
+        boolean atRightLimit;
+        boolean rightLimitDone;
+        int leftLimitNext;
+        int rightLimitNext;
+        int transitionAtRightLevel;
+        private int incomingTransition;
+        private int level;
+        InSet inSet;
+
+
+        private RangeCursor()
+        {
+            backlog = new int[32];
+            backlogPos = 0;
+            level = 0;
+            atLeftLimit = left != null;
+            if (atLeftLimit)
+            {
+                remainingLeftLimit = left.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                leftLimitNext = remainingLeftLimit.next();
+                inSet = InSet.PREFIX;
+                if (leftLimitNext == ByteSource.END_OF_STREAM)
+                {
+                    atLeftLimit = false;
+                    if (includeLeft)
+                        inSet = InSet.CONTAINED;
+                    transitionAtRightLevel = -1;
+                }
+            }
+            else
+            {
+                inSet = InSet.CONTAINED;
+                transitionAtRightLevel = -1;
+            }
+
+            atRightLimit = right != null;
+            if (atRightLimit)
+            {
+                remainingRightLimit = right.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                rightLimitNext = remainingRightLimit.next();
+                if (rightLimitNext == ByteSource.END_OF_STREAM)
+                {
+                    rightLimitDone = true;
+                    assert !atLeftLimit;
+                    atRightLimit = false;
+                    if (!includeRight)
+                    {
+                        level = -1;
+                        inSet = InSet.PREFIX;
+                        return;
+                    }
+                }
+                else
+                    rightLimitDone = false;
+            }
+            else
+            {
+                // else we exhaust the backlog at level -1 and terminate before any continueAlongRight is called
+                rightLimitNext = 255;
+                rightLimitDone = true;
+            }
+
+            incomingTransition = -1;
+            if (!atLeftLimit && !atRightLimit && rightLimitNext >= 0 && inSet == InSet.CONTAINED)
+                inSet = InSet.BRANCH;
+        }
+
+
+        public int advance()
+        {
+            if (atLeftLimit)
+            {
+                if (atRightLimit)
+                    return descendAlongBoth();
+                else
+                    return descendAlongLeft();
+            }
+
+            if (processBacklog())
+                return level;
+
+            return continueAlongRight();
+        }
+
+        private int descendAlongBoth()
+        {
+            // Follows the prefix shared by both bounds; only reached while both at*Limit flags are set.
+            assert rightLimitNext >= leftLimitNext;
+            int next = leftLimitNext;
+            leftLimitNext = remainingLeftLimit.next();
+            atRightLimit = rightLimitNext == next;
+            // -1 makes continueAlongRight descend; once diverged, siblings from next + 1 up are candidates.
+            transitionAtRightLevel = atRightLimit ? -1 : next + 1;
+            if (atRightLimit)
+                rightLimitNext = remainingRightLimit.next();
+
+            incomingTransition = next;
+            atLeftLimit = leftLimitNext != ByteSource.END_OF_STREAM;
+            if (atLeftLimit)
+                inSet = InSet.PREFIX;
+            else if (!atRightLimit)
+                // Bounds diverged at this byte: the branch below left lies entirely before right.
+                inSet = includeLeft ? InSet.BRANCH : InSet.PREFIX;//: InSet.BRANCH_EXCLUDING;
+            else if (rightLimitNext != ByteSource.END_OF_STREAM)
+                // left is a strict prefix of right: the point may be in, but not the whole branch.
+                inSet = includeLeft ? InSet.CONTAINED : InSet.PREFIX;
+            else
+            {
+                // left == right: the range is at most this one point and nothing follows it.
+                rightLimitDone = true;
+                if (!includeLeft || !includeRight)
+                    return -1;
+                inSet = InSet.CONTAINED;
+            }
+            return ++level;
+        }
+
+        private int descendAlongLeft()
+        {
+            int next = leftLimitNext;
+            leftLimitNext = remainingLeftLimit.next();
+            addBacklog(next + 1);
+
+            incomingTransition = next;
+            if (leftLimitNext != ByteSource.END_OF_STREAM)
+            {
+                inSet = InSet.PREFIX;
+            }
+            else
+            {
+                atLeftLimit = false;
+                inSet = includeLeft ? InSet.BRANCH : InSet.PREFIX;//: InSet.BRANCH_EXCLUDING;
+            }
+            return ++level;
+        }
+
+        private boolean processBacklog()
+        {
+            while (backlogPos > 0)
+            {
+                incomingTransition = backlog[backlogPos - 1]++;
+                if (incomingTransition < 256)
+                {
+                    inSet = InSet.BRANCH;
+                    return true;
+                }
+                --backlogPos;
+                --level;
+            }
+            return false;
+        }
+
+        private int continueAlongRight()
+        {
+            if (transitionAtRightLevel < 0)
+            {
+                transitionAtRightLevel = 0;
+                ++level;
+            }
+            incomingTransition = transitionAtRightLevel++;
+
+            if (incomingTransition < rightLimitNext)
+            {
+                inSet = InSet.BRANCH;
+                return level;
+            }
+            else // (incomingTransition == rightLimitNext)
+            {
+                if (rightLimitDone)
+                    return -1;
+
+                rightLimitNext = remainingRightLimit.next();
+                if (rightLimitNext == ByteSource.END_OF_STREAM)
+                {
+                    rightLimitDone = true;
+                    if (!includeRight)
+                        return -1;
+                }
+                transitionAtRightLevel = -1;
+                inSet = InSet.CONTAINED;
+                return level;
+            }
+        }
+
+        void addBacklog(int transition)
+        {
+            if (backlogPos == backlog.length)
+                backlog = Arrays.copyOf(backlog, backlogPos * 2);
+            backlog[backlogPos++] = transition;
+        }
+
+        public int ascend() // skips whatever lies below the current position
+        {
+            atLeftLimit = false; // stop following the left bound, so later advance() calls no longer descend along it
+            if (processBacklog())
+                return level;
+            if (transitionAtRightLevel < 0) // negative means continueAlongRight would descend a level, which ascend must not do
+                return -1;
+            return continueAlongRight();
+        }
+
+        public int level()
+        {
+            return level;
+        }
+
+        public int incomingTransition()
+        {
+            return incomingTransition;
+        }
+
+        public InSet content()
+        {
+            return inSet;
+        }
+    }
+
+
+    // TODO: Change to start/stop sets when nodes are taken out of the picture
+
+    public <L> Node<InSet, L> root()
     {
         return makeNode(left == null ? null : left.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
                         left != null,
                         right == null ? null : right.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
-                        right != null);
+                        right != null,
+                        null);
     }
 
-    private SetNode makeNode(ByteSource lLimit, boolean atLLimit, ByteSource rLimit, boolean atRLimit)
+    private <L> Node<InSet, L> makeNode(ByteSource lLimit, boolean atLLimit, ByteSource rLimit, boolean atRLimit, L parentLink)
     {
         // We only have a constraint on the branch if we are at one or both boundaries.
         // If the node falls completely between them, the whole branch (at any depth) is in the set.
         if (!atLLimit && !atRLimit)
-            return FULL;
+            return parentLink == null ? (Node<InSet, L>) FULL : new FullNode<>(parentLink);
+
+        return new RangeNode<>(lLimit, atLLimit, rLimit, atRLimit, parentLink);
+    }
 
-        return new RangeNode(lLimit, atLLimit, rLimit, atRLimit);
+    class FullNode<L> extends NoChildrenNode<InSet, L>
+    {
+        FullNode(L parent)
+        {
+            super(parent);
+        }
+
+        public InSet content()
+        {
+            return InSet.BRANCH;
+        }
     }
 
-    class RangeNode implements SetNode
+    class RangeNode<L> extends Node<InSet, L>
     {
         /** Byte at the left boundary, inclusive. */
         final int llimit;
@@ -79,13 +327,11 @@ class RangeNode implements SetNode
         final boolean atRLimit;
 
         /** Whether the current path is in the covered set. */
-        final boolean inSet;
+        final InSet inSet;
 
-        int currentTransition;
-
-
-        RangeNode(ByteSource remainingLLimit, boolean atLLimit, ByteSource remainingRLimit, boolean atRLimit)
+        RangeNode(ByteSource remainingLLimit, boolean atLLimit, ByteSource remainingRLimit, boolean atRLimit, L parentLink)
         {
+            super(parentLink);
             int llimit = 0;
             boolean inSet = true;
             if (atLLimit)
@@ -119,34 +365,31 @@ class RangeNode implements SetNode
             this.remainingRLimit = remainingRLimit;
             this.atLLimit = atLLimit;
             this.atRLimit = atRLimit;
-            this.inSet = inSet;
+            this.inSet = inSet ? InSet.CONTAINED : InSet.PREFIX;
         }
 
-        public SetNode getCurrentChild()
+        public Node<InSet, L> getCurrentChild(L parentLink)
         {
             return makeNode(remainingLLimit, atLLimit && (currentTransition == llimit),
-                            remainingRLimit, atRLimit && (currentTransition == rlimit));
-        }
-
-        public int currentTransition()
-        {
-            return currentTransition;
+                            remainingRLimit, atRLimit && (currentTransition == rlimit),
+                            parentLink);
         }
 
-        public boolean startIteration()
+        public Remaining startIteration()
         {
             currentTransition = llimit;
-            return currentTransition <= rlimit;
+            return currentTransition <= rlimit ? Remaining.MULTIPLE : null;
         }
 
-        public boolean advanceIteration()
+        public Remaining advanceIteration()
         {
-            return ++currentTransition <= rlimit;
+            return ++currentTransition <= rlimit ? Remaining.MULTIPLE : null;
         }
 
-        public boolean inSet()
+        public InSet content()
         {
             return inSet;
         }
     }
+
 }
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 52ee84feb0ea..19d19d981251 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -34,12 +34,12 @@ class SetIntersectionTrie<T> extends Trie<T>
     @Override
     public <L> Node<T, L> root()
     {
-        TrieSet.SetNode sRoot = intersectingSet.root();
+        Node<TrieSet.InSet, Void> sRoot = intersectingSet.root();
         if (sRoot == null)
             return null;
 
         Node<T, L> tRoot = trie.root();
-        if (sRoot == TrieSet.FULL)
+        if (sRoot.content() == TrieSet.InSet.BRANCH)
             return tRoot;
         if (tRoot == null)
             return null;
@@ -50,9 +50,9 @@ public <L> Node<T, L> root()
     static class IntersectionNode<T, L> extends Node<T, L>
     {
         final Node<T, L> tNode;
-        final TrieSet.SetNode sNode;
+        final Node<TrieSet.InSet, Void> sNode;
 
-        public IntersectionNode(Node<T, L> tNode, TrieSet.SetNode sNode)
+        public IntersectionNode(Node<T, L> tNode, Node<TrieSet.InSet, Void> sNode)
         {
             super(tNode.parentLink);
             this.tNode = tNode;
@@ -61,8 +61,8 @@ public IntersectionNode(Node<T, L> tNode, TrieSet.SetNode sNode)
 
         public Remaining startIteration()
         {
-            boolean sHas = sNode.startIteration();
-            if (!sHas)
+            Remaining sHas = sNode.startIteration();
+            if (sHas == null)
                 return null;
 
             return advanceToIntersection(tNode.startIteration());
@@ -70,18 +70,18 @@ public Remaining startIteration()
 
         public Remaining advanceIteration()
         {
-            boolean sHas = sNode.advanceIteration();
-            if (!sHas)
+            Remaining sHas = sNode.advanceIteration();
+            if (sHas == null)
                 return null;
             return advanceToIntersection(tNode.advanceIteration());
         }
 
         public Remaining advanceToIntersection(Remaining tHas)
         {
-            boolean sHas;
+            Remaining sHas;
             if (tHas == null)
                 return null;
-            int sByte = sNode.currentTransition();
+            int sByte = sNode.currentTransition;
             int tByte = tNode.currentTransition;
 
             while (tByte != sByte)
@@ -96,9 +96,9 @@ public Remaining advanceToIntersection(Remaining tHas)
                 else // (tByte > sByte)
                 {
                     sHas = sNode.advanceIteration();
-                    if (!sHas)
+                    if (sHas == null)
                         return null;
-                    sByte = sNode.currentTransition();
+                    sByte = sNode.currentTransition;
                 }
             }
             currentTransition = sByte;
@@ -107,7 +107,7 @@ public Remaining advanceToIntersection(Remaining tHas)
 
         public Node<T, L> getCurrentChild(L parent)
         {
-            TrieSet.SetNode receivedSetNode = sNode.getCurrentChild();
+            Node<TrieSet.InSet, Void> receivedSetNode = sNode.getCurrentChild(null);
 
             if (receivedSetNode == null)
                 return null;    // branch is completely outside the set
@@ -117,7 +117,7 @@ public Node<T, L> getCurrentChild(L parent)
             if (nn == null)
                 return null;
 
-            if (receivedSetNode == TrieSet.FULL)
+            if (receivedSetNode.content() == TrieSet.InSet.BRANCH)
                 return nn;     // Branch is fully covered, we no longer need to augment nodes there.
 
             return new IntersectionNode<>(nn, receivedSetNode);
@@ -125,9 +125,102 @@ public Node<T, L> getCurrentChild(L parent)
 
         public T content()
         {
-            if (sNode.inSet())
+            if (sNode.content().pointIncluded())
                 return tNode.content();
             return null;
         }
     }
+
+    protected Cursor<T> cursor()
+    {
+        return new IntersectionCursor(trie.cursor(), intersectingSet.cursor());
+    }
+
+    private class IntersectionCursor implements Cursor<T>
+    {
+        private final Cursor<T> tCursor;
+        private final Cursor<TrieSet.InSet> sCursor;
+        int branchLevel = Integer.MAX_VALUE;
+
+        public IntersectionCursor(Cursor<T> tCursor,
+                                  Cursor<TrieSet.InSet> sCursor)
+        {
+            this.tCursor = tCursor;
+            this.sCursor = sCursor;
+        }
+
+        public int advance()
+        {
+            int tLevel = tCursor.advance();
+            if (sCursor.content().branchCovered())
+            {
+                if (tLevel > sCursor.level())
+                    return tLevel;
+                // otherwise we have left the intersection's covered branch
+            }
+            int sLevel = sCursor.advance();
+
+            return advanceToIntersection(tLevel, sLevel);
+        }
+
+        public int ascend() // this is not tested ATM
+        {
+            int tLevel = tCursor.ascend();
+            if (sCursor.content().branchCovered())
+            {
+                if (tLevel > sCursor.level())
+                    return tLevel;
+                // otherwise we have left the intersection's covered branch
+            }
+            int sLevel = sCursor.ascend();
+
+            return advanceToIntersection(tLevel, sLevel);
+        }
+
+        private int advanceToIntersection(int tLevel, int sLevel)
+        {
+            while (sLevel != -1 && tLevel != -1)
+            {
+                if (sLevel == tLevel)
+                {
+                    int tIncoming = tCursor.incomingTransition();
+                    int sIncoming = sCursor.incomingTransition();
+                    if (sIncoming == tIncoming)
+                        return tLevel;  // got entry
+                    else if (sIncoming < tIncoming)
+                        sLevel = sCursor.advance();
+                    else // sIncoming > tIncoming
+                        tLevel = tCursor.advance();
+                }
+                else if (sLevel < tLevel)
+                {
+                    if (sCursor.content().branchCovered())
+                        return tLevel;
+                    tLevel = tCursor.ascend();
+                }
+                else // (sLevel > tLevel)
+                    sLevel = sCursor.ascend();
+            }
+            return -1;
+        }
+
+        // TODO: implement advanceMultiple
+
+        public int level()
+        {
+            return tCursor.level();
+        }
+
+        public int incomingTransition()
+        {
+            return tCursor.incomingTransition();
+        }
+
+        public T content()
+        {
+            return sCursor.content().pointIncluded()
+                   ? tCursor.content()
+                   : null;
+        }
+    }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
index c87fe48ba527..40dd20e0c6e5 100644
--- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -139,6 +139,11 @@ public int advance()
                 return currentLevel = -1;
         }
 
+        public int ascend()
+        {
+            return -1;  // no alternatives
+        }
+
         public int level()
         {
             return currentLevel;
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index de06eff5748b..b1fa2e20d28e 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -241,7 +241,10 @@ default T advanceToContent(TransitionsReceiver receiver) // advances all the way
         }
 
 //        int advanceTo(int transition); // advance to child with this transition or higher. if none exists, ascend to parent and advance
-//        default int ascend() // ignore the remaining children at this level or below and ascend to parent and advance
+
+        int ascend();
+
+        //        default int ascend() // ignore the remaining children at this level or below and ascend to parent and advance
 //        {
 //            return advanceTo(Integer.MAX_VALUE);
 //        }
@@ -249,6 +252,7 @@ default T advanceToContent(TransitionsReceiver receiver) // advances all the way
         int level(); // return current state
         int incomingTransition(); // not set in advanceMultiple/ToCursor
         T content();
+
     }
 
     protected Cursor<T> cursor()
@@ -329,8 +333,8 @@ public static <T> Trie<T> singleton(ByteComparable b, T v)
      */
     public Trie<T> subtrie(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight)
     {
-        if (left == null && right == null)
-            return this;
+//        if (left == null && right == null)
+//            return this;
 
         return new SetIntersectionTrie<>(this, TrieSet.range(left, includeLeft, right, includeRight));
     }
@@ -508,6 +512,11 @@ public int advance()
                     return -1;
                 }
 
+                public int ascend()
+                {
+                    return -1;
+                }
+
                 public int level()
                 {
                     return -1;
diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
index 825bcb7cd5a9..56511dc9a895 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
@@ -19,34 +19,48 @@
 
 import java.util.AbstractMap;
 import java.util.Arrays;
+import java.util.Iterator;
 import java.util.Map;
 
 import org.agrona.concurrent.UnsafeBuffer;
-import org.apache.cassandra.utils.AbstractIterator;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
 /**
  * Convertor of trie entries to iterator where each entry is passed through {@link #mapContent} (to be implemented by
  * descendants).
  */
-public abstract class TrieEntriesIterator<T, V> extends AbstractIterator<V> implements Trie.TransitionsReceiver
+public abstract class TrieEntriesIterator<T, V> implements Iterator<V>, Trie.TransitionsReceiver
 {
     private final Trie.Cursor<T> cursor;
     private byte[] keyBytes = new byte[32];
     private int keyPos = 0;
+    T next;
+    boolean gotNext;
 
     protected TrieEntriesIterator(Trie<T> trie)
     {
-        this.cursor = trie.cursor();
+        cursor = trie.cursor();
+        next = cursor.content();
+        gotNext = next != null;
     }
 
-    public V computeNext()
+    public boolean hasNext()
     {
-        T value = cursor.advanceToContent(this);
-        if (value == null)
-            return endOfData();
+        if (!gotNext)
+        {
+            next = cursor.advanceToContent(this);
+            gotNext = true;
+        }
 
-        return mapContent(value, keyBytes, keyPos);
+        return next != null;
+    }
+
+    public V next()
+    {
+        if (!hasNext())     // also fetches the entry when the caller did not call hasNext() first
+            throw new java.util.NoSuchElementException();
+        gotNext = false;    // the following hasNext() must advance the cursor again
+        return mapContent(next, keyBytes, keyPos);
+    }
 
     public void add(int t)
diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java
index c1c0d46a879b..c978b6d731ea 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java
@@ -29,74 +29,44 @@
  * a node (e.g. with asynchronous trie walks), it must enforce a happens-before relationship between calls to the
  * methods of a node.
  */
-public abstract class TrieSet
+public abstract class TrieSet extends Trie<TrieSet.InSet>
 {
-    public abstract SetNode root();
-
-    interface SetNode
-    {
-        boolean startIteration();
-        boolean advanceIteration();
-        int currentTransition();
-        SetNode getCurrentChild();
-
-        /**
-         * Returns true if this specific position is in the set (i.e. if content in the intersected node should be
-         * returned).
-         *
-         * Note: Having a node produced by the trie set does not necessarily mean the relevant key is in the set.
-         * Imagine a singleton set, e.g. {010203}. It will be represented as the following trie:
-         *     root -01-> node1 -02-> node2 -03-> node3
-         * where only node3 will have inSet() == true. Root (corresponding to empty key), node1 (key 01) and node2 (key
-         * 0102) are not in the set and thus their inSet() will be false.
-         */
-        boolean inSet();
-    }
-
-    protected static final SetNode FULL = new SetNode()
+    enum InSet
     {
-        public AssertionError error()
-        {
-            throw new AssertionError("SetNode FULL must be handled explicitly.");
-        }
+        PREFIX, // this is a prefix node, and the specific point is not contained in the set (e.g. points on the left range path)
+        CONTAINED, // this is a prefix node, and the point is contained in the set (e.g. points on the right range path)
+        BRANCH; // the whole branch is contained in the set (e.g. interior nodes for a range)
 
-        public boolean startIteration()
+        boolean pointIncluded()
         {
-            throw error();
+            return this != PREFIX;
         }
 
-        public boolean advanceIteration()
+        boolean branchCovered()
         {
-            throw error();
-        }
-
-        public int currentTransition()
-        {
-            throw error();
-        }
-
-        public SetNode getCurrentChild()
-        {
-            throw error();
+            return this == BRANCH;
         }
+    }
 
-        public boolean inSet()
+    protected static final Node<InSet, Object> FULL = new NoChildrenNode<InSet, Object>(null)
+    {
+        public InSet content()
         {
-            throw error();
+            return InSet.BRANCH;
         }
     };
 
     private static final TrieSet FULL_SET = new TrieSet()
     {
-        public SetNode root()
+        public <L> Node<InSet, L> root()
         {
-            return FULL;
+            return (Node<InSet, L>) FULL;
         }
     };
 
     private static final TrieSet EMPTY_SET = new TrieSet()
     {
-        public SetNode root()
+        public <L> Node<InSet, L> root()
         {
             return null;
         }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
index c05e0649e0f8..7297ee12d1bc 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieValuesIterator.java
@@ -17,7 +17,7 @@
  */
 package org.apache.cassandra.db.tries;
 
-import org.apache.cassandra.utils.AbstractIterator;
+import java.util.Iterator;
 
 /**
  * Convertor of trie contents to flow.
@@ -26,21 +26,35 @@
  * Java. Using {@code <>} when instantiating works, but any subclasses will also need to declare this useless type
  * argument.
  */
-class TrieValuesIterator<T> extends AbstractIterator<T>
+class TrieValuesIterator<T> implements Iterator<T>
 {
     private final Trie.Cursor<T> cursor;
+    T next;
+    boolean gotNext = false;
 
     protected TrieValuesIterator(Trie<T> trie)
     {
         cursor = trie.cursor();
+        next = cursor.content();
+        gotNext = next != null;
     }
 
-    protected T computeNext()
+    public boolean hasNext()
     {
-        T value = cursor.advanceToContent(null);
-        if (value == null)
-            return endOfData();
-        else
-            return value;
+        if (!gotNext)
+        {
+            next = cursor.advanceToContent(null);
+            gotNext = true;
+        }
+
+        return next != null;
+    }
+
+    public T next()
+    {
+        if (!hasNext())     // also fetches the value when the caller did not call hasNext() first
+            throw new java.util.NoSuchElementException();
+        gotNext = false;    // the following hasNext() must advance the cursor again
+        return next;
+    }
 }
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
index a27999566472..ae7e1e2a861c 100644
--- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java
@@ -19,6 +19,9 @@
 package org.apache.cassandra.utils.bytecomparable;
 
 import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 
 /**
  * Interface indicating a value can be represented/identified by a comparable {@link ByteSource}.
@@ -57,6 +60,36 @@ default String byteComparableAsString(Version version)
         return builder.toString();
     }
 
+    /**
+     * Collects the byte-comparable representation of this object into a newly allocated array.
+     * @param version the version passed on to {@link #asComparableBytes}
+     * @return the bytes produced by the source, in order
+     */
+    default byte[] asArray(Version version)
+    {
+        ByteSource source = asComparableBytes(version);
+        final int chunkSize = 232;   // size chosen so that new byte[chunkSize] fits into 256 bytes
+        byte[] chunk = new byte[chunkSize];
+        int filled = source.nextBytes(chunk);
+        if (filled < chunkSize)
+            return Arrays.copyOf(chunk, filled);    // the source ended within the first chunk
+
+        // Longer source: park every completely filled chunk and keep reading into a fresh one.
+        List<byte[]> fullChunks = new ArrayList<>();
+        while (filled == chunkSize)
+        {
+            fullChunks.add(chunk);
+            chunk = new byte[chunkSize];
+            filled = source.nextBytes(chunk);
+        }
+
+        byte[] result = new byte[fullChunks.size() * chunkSize + filled];
+        for (int i = 0; i < fullChunks.size(); ++i)
+            System.arraycopy(fullChunks.get(i), 0, result, i * chunkSize, chunkSize);
+        System.arraycopy(chunk, 0, result, fullChunks.size() * chunkSize, filled);
+        return result;
+    }
+
     // Simple factories used for testing
 
     static ByteComparable of(String s)
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
index dd7c231dfa88..ea4f1d053903 100644
--- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
@@ -34,6 +34,26 @@ public interface ByteSource
     /** Get the next byte, unsigned. Must be between 0 and 255, or END_OF_STREAM if there are no more bytes. */
     int next();
 
+    /**
+     * Fill in the next bytes of the source in the given array.
+     *
+     * @return The number of bytes transferred. If equal to the size of the destination, the source may have further
+     *         bytes to consume. Otherwise the source has been fully consumed and it would be an error to call this
+     *         method (or next()) again.
+     */
+    default int nextBytes(byte[] dest)
+    {
+        int filled = 0;
+        while (filled < dest.length)
+        {
+            int b = next();
+            if (b == END_OF_STREAM)
+                break;              // source drained before the destination filled up
+            dest[filled++] = (byte) b;
+        }
+        return filled;
+    }
+
     /** Value returned if at the end of the stream. */
     int END_OF_STREAM = -1;
 
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
index 9661361fc76d..55cfb7909dae 100644
--- a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -74,6 +74,8 @@ public void testIntersectRange(int count)
             if (!includeLeft && !includeRight && cmp == 0)
                 includeRight = true;
             checkEqualRange(content1, trie1, l, includeLeft, r, includeRight);
+            checkEqualRange(content1, trie1, null, includeLeft, r, includeRight);
+            checkEqualRange(content1, trie1, l, includeLeft, null, includeRight);
         }
     }
 
@@ -152,7 +154,7 @@ public Node<Integer, L> getCurrentChild(L parent)
 
                 public Integer content()
                 {
-                    return null;
+                    return -1;
                 }
 
                 @Override
@@ -196,13 +198,134 @@ private static ByteComparable of(int value)
     }
 
     @Test
-    public void testSimpleIntersection()
+    public void testSimpleIntersectionII()
     {
         Trie<Integer> trie = singleLevelIntTrie(10);
-        assertEquals(asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
 
         Trie<Integer> intersection = trie.subtrie(of(3), true, of(7), true);
-        // Currently returns [4, 5, 6, 7, 8, 9], which "looks" wrong.
         assertEquals(asList(3, 4, 5, 6, 7), toList(intersection));
     }
+
+    @Test
+    public void testSimpleIntersectionEI()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), false, of(7), true);
+        assertEquals(asList(4, 5, 6, 7), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleIntersectionIE()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), true, of(7), false);
+        assertEquals(asList(3, 4, 5, 6), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleIntersectionEE()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), false, of(7), false);
+        assertEquals(asList(4, 5, 6), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleLeftIntersectionE()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), false, null, true);
+        assertEquals(asList(4, 5, 6, 7, 8, 9), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleLeftIntersectionI()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(of(3), true, null, true);
+        assertEquals(asList(3, 4, 5, 6, 7, 8, 9), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleRightIntersectionE()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(null, true, of(7), false);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleRightIntersectionI()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(null, true, of(7), true);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleNoIntersection()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(null, true, null, true);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection));
+    }
+
+    @Test
+    public void testSimpleEmptyIntersectionLeft()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(ByteComparable.EMPTY, true, null, true);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, false, null, true);
+        assertEquals(asList(0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, true, of(5), true);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, false, of(5), true);
+        assertEquals(asList(0, 1, 2, 3, 4, 5), toList(intersection));
+
+    }
+
+    @Test
+    public void testSimpleEmptyIntersectionRight()
+    {
+        Trie<Integer> trie = singleLevelIntTrie(10);
+        assertEquals(asList(-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9), toList(trie));
+
+        Trie<Integer> intersection = trie.subtrie(null, true, ByteComparable.EMPTY, true);
+        assertEquals(asList(-1), toList(intersection));
+
+        intersection = trie.subtrie(null, true, ByteComparable.EMPTY, false);
+        assertEquals(asList(), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, true, ByteComparable.EMPTY, true);
+        assertEquals(asList(-1), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, false, ByteComparable.EMPTY, true);
+        assertEquals(asList(), toList(intersection));
+
+        intersection = trie.subtrie(ByteComparable.EMPTY, false, ByteComparable.EMPTY, false);
+        assertEquals(asList(), toList(intersection));
+    }
 }

From 76f706a3f9407df2e4e7a5fedd1cb67273693462 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 7 Apr 2021 12:07:08 +0300
Subject: [PATCH 130/151] AdvanceMultiple in SetIntersectionTrie

---
 .../db/tries/SetIntersectionTrie.java         | 23 ++++++++++++++++++-
 .../org/apache/cassandra/db/tries/Trie.java   |  4 ++--
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 19d19d981251..6beeb90c0e5a 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -140,7 +140,6 @@ private class IntersectionCursor implements Cursor<T>
     {
         private final Cursor<T> tCursor;
         private final Cursor<TrieSet.InSet> sCursor;
-        int branchLevel = Integer.MAX_VALUE;
 
         public IntersectionCursor(Cursor<T> tCursor,
                                   Cursor<TrieSet.InSet> sCursor)
@@ -149,6 +148,7 @@ public IntersectionCursor(Cursor<T> tCursor,
             this.sCursor = sCursor;
         }
 
+        @Override
         public int advance()
         {
             int tLevel = tCursor.advance();
@@ -163,6 +163,27 @@ public int advance()
             return advanceToIntersection(tLevel, sLevel);
         }
 
+        @Override
+        public int advanceMultiple(TransitionsReceiver transitionsReceiver)
+        {
+            if (!sCursor.content().branchCovered())
+                return Cursor.advanceMultiple(this, transitionsReceiver);
+
+            int tLevel = tCursor.advanceMultiple(transitionsReceiver);  // FIXME this could overwrite a byte
+            if (tLevel > sCursor.level())
+                return tLevel;
+
+            int sLevel = sCursor.advance();
+            int rLevel = advanceToIntersection(tLevel, sLevel);
+            if (transitionsReceiver != null && rLevel >= 0)
+            {
+                transitionsReceiver.reset(rLevel - 1);
+                transitionsReceiver.add(incomingTransition());
+            }
+            return rLevel;
+        }
+
+        @Override
         public int ascend() // this is not tested ATM
         {
             int tLevel = tCursor.ascend();
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index b1fa2e20d28e..c94531cec46d 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -333,8 +333,8 @@ public static <T> Trie<T> singleton(ByteComparable b, T v)
      */
     public Trie<T> subtrie(ByteComparable left, boolean includeLeft, ByteComparable right, boolean includeRight)
     {
-//        if (left == null && right == null)
-//            return this;
+        if (left == null && right == null)
+            return this;
 
         return new SetIntersectionTrie<>(this, TrieSet.range(left, includeLeft, right, includeRight));
     }

From 76c42b4ac2790ecc9e86e0e7ac9cd8b399490275 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 16:11:20 +0300
Subject: [PATCH 131/151] Switch to advanceMultiple not passing last transition
 to receiver

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 19 +++---
 .../cassandra/db/tries/MemtableTrie.java      | 12 +---
 .../db/tries/SetIntersectionTrie.java         | 26 ++++----
 .../cassandra/db/tries/SingletonTrie.java     | 36 ++++++-----
 .../org/apache/cassandra/db/tries/Trie.java   | 59 ++++++++++++-------
 .../db/tries/TrieEntriesIterator.java         |  2 +-
 6 files changed, 84 insertions(+), 70 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index ab9d8e733e5b..41943e5fb7c5 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -182,7 +182,6 @@ Block offsets used to identify node types (by comparing them to the node 'pointe
      */
     static final int NONE = 0;
 
-    int maxDepth = 0;
     volatile int root;
 
     /*
@@ -1023,20 +1022,21 @@ private boolean getChainTransition(int node)
         public int advanceMultiple(TransitionsReceiver receiver)
         {
             int node = currentNode;
-            if (isNull(node) || offset(node) > CHAIN_MAX_OFFSET)
-                return Cursor.advanceMultiple(this, receiver);
+            if (!isChainNode(node))
+                return advance();
 
             int pointer = chainBlockChildPointer(node);
-            int length = pointer - node;
-            if (receiver != null)
+
+            int length = pointer - node - 1;
+            if (receiver != null && length > 0)
             {
                 UnsafeBuffer buffer = getBuffer(node);
                 int ofs = getOffset(node);
                 receiver.add(buffer, ofs, length);
             }
 
-            level += length - 1; // compensate for increase below
-            return descendInto(getInt(pointer), -1);
+            level += length;
+            return descendInto(getInt(pointer), getByte(pointer - 1));
         }
 
         public int level()
@@ -1055,6 +1055,11 @@ public int incomingTransition()
         }
     }
 
+    private boolean isChainNode(int node)
+    {
+        return !isNullOrLeaf(node) && offset(node) <= CHAIN_MAX_OFFSET;
+    }
+
     public MemtableCursor cursor()
     {
         return new MemtableCursor();
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
index 4c5e5906c55b..8a1bfd6b5bd1 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableTrie.java
@@ -649,12 +649,8 @@ <U> void descend(int transition, U mutationContent, final UpsertTransformer<T, U
             }
 
             ++currentLevel;
-            if (currentLevel > maxDepth)
-            {
-                maxDepth = currentLevel;
-                if (currentLevel * 5 >= data.length)
-                    data = Arrays.copyOf(data, currentLevel * 5 * 2);
-            }
+            if (currentLevel * 5 >= data.length)
+                data = Arrays.copyOf(data, currentLevel * 5 * 2);
             setExistingPreContentNode(existingPreContentNode);
 
             int existingContentIndex = -1;
@@ -881,11 +877,7 @@ private <R> int putRecursive(int node, ByteSource key, int depth, R value, final
     {
         int transition = key.next();
         if (transition == ByteSource.END_OF_STREAM)
-        {
-            if (depth > maxDepth)
-                maxDepth = depth;
             return applyContent(node, value, transformer);
-        }
 
         int child = NONE;
         if (!isNull(node))
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 6beeb90c0e5a..563dde37f496 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -136,7 +136,7 @@ protected Cursor<T> cursor()
         return new IntersectionCursor(trie.cursor(), intersectingSet.cursor());
     }
 
-    private class IntersectionCursor implements Cursor<T>
+    private static class IntersectionCursor<T> implements Cursor<T>
     {
         private final Cursor<T> tCursor;
         private final Cursor<TrieSet.InSet> sCursor;
@@ -166,21 +166,19 @@ public int advance()
         @Override
         public int advanceMultiple(TransitionsReceiver transitionsReceiver)
         {
-            if (!sCursor.content().branchCovered())
-                return Cursor.advanceMultiple(this, transitionsReceiver);
-
-            int tLevel = tCursor.advanceMultiple(transitionsReceiver);  // FIXME this could overwrite a byte
-            if (tLevel > sCursor.level())
-                return tLevel;
-
-            int sLevel = sCursor.advance();
-            int rLevel = advanceToIntersection(tLevel, sLevel);
-            if (transitionsReceiver != null && rLevel >= 0)
+            int tLevel;
+            if (sCursor.content().branchCovered())
             {
-                transitionsReceiver.reset(rLevel - 1);
-                transitionsReceiver.add(incomingTransition());
+                tLevel = tCursor.advanceMultiple(transitionsReceiver);
+                if (tLevel > sCursor.level())
+                    return tLevel;
+                // otherwise we have left the intersection's covered branch
             }
-            return rLevel;
+            else
+                tLevel = tCursor.advance();
+
+            int sLevel = sCursor.advance();
+            return advanceToIntersection(tLevel, sLevel);
         }
 
         @Override
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
index 40dd20e0c6e5..e49a6fa7a39d 100644
--- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -139,6 +139,26 @@ public int advance()
                 return currentLevel = -1;
         }
 
+        @Override
+        public int advanceMultiple(TransitionsReceiver receiver)
+        {
+            int current = src.next(); // first transition below the current position, if any
+            int level = currentLevel;
+            if (current == ByteSource.END_OF_STREAM)
+                return currentLevel = -1; // no bytes left in the key: report the cursor as done
+            int next = src.next(); // one-byte lookahead, so the final transition is never passed to the receiver
+            while (next != ByteSource.END_OF_STREAM)
+            {
+                if (receiver != null)
+                    receiver.add(current); // current is followed by another byte, i.e. it is not the last transition
+                current = next;
+                next = src.next();
+                ++level;
+            }
+            currentTransition = current; // last byte read from src; this is what incomingTransition() returns
+            return currentLevel = ++level; // level has now grown by one per byte consumed in this call
+        }
+
         public int ascend()
         {
             return -1;  // no alternatives
@@ -158,21 +178,5 @@ public int incomingTransition()
         {
             return currentTransition;
         }
-
-        public void retrieveKey(byte[] dest)
-        {
-            ByteSource srcCopy = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
-            for (int i = 0; i < currentLevel; ++i)
-                dest[i] = (byte) srcCopy.next();
-        }
-
-        public int transitionAtLevel(int level)
-        {
-            ByteSource srcCopy = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
-            int next = -1;
-            for (int i = 0; i <= level; ++i)
-                next = (byte) srcCopy.next();
-            return next;
-        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index c94531cec46d..8bcfb2d84bbc 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -98,6 +98,10 @@ protected interface TransitionsReceiver
         void add(int t);
         /** Add the count bytes from position pos at the given buffer. */
         void add(UnsafeBuffer b, int pos, int count);
+    }
+
+    interface ResettingTransitionsReceiver extends TransitionsReceiver
+    {
         /** Delete all bytes beyond the given length. */
         void reset(int newLength);
     }
@@ -206,45 +210,56 @@ public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver
     // Cursor-style walks
     interface Cursor<T>
     {
-        int advance(); // returns level (can be prev+1 or <=prev), -1 means done
-        default int advanceMultiple(TransitionsReceiver receiver) // advance, descending multiple levels if that does not require extra work (e.g. chain nodes)
-        {
-            return advanceMultiple(this, receiver);
-        }
+        /**
+         * Advance one position.
+         * This can be either:
+         * - descending one level
+         * - ascending to the closest parent that has remaining children, and then descending one level
+         * @return level (can be prev+1 or <=prev), -1 means done
+         */
+        int advance();
 
-        static int advanceMultiple(Cursor c, TransitionsReceiver receiver)
+        /**
+         * Advance, descending multiple levels if that does not require extra work (e.g. chain nodes)
+         * Receiver will be given all transitions taken except the last; i.e. on an ascend it will not receive any
+         *
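+         * @param receiver receives the transitions taken, except for the last one; may be null
+         * @return the level reached, as for {@link #advance()}; -1 means done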
+         */
+        default int advanceMultiple(TransitionsReceiver receiver)
         {
-            if (receiver == null)
-                return c.advance();
-
-            int prevLevel = c.level();
-            int level = c.advance();
-            if (level < 0)
-                return level;
-            if (level <= prevLevel)
-                receiver.reset(level - 1);
-            receiver.add(c.incomingTransition());
-            return level;
+            return advance();
         }
 
-        default T advanceToContent(TransitionsReceiver receiver) // advances all the way (to next content)
+        default T advanceToContent(ResettingTransitionsReceiver receiver) // advances all the way (to next content)
         {
+            int prevLevel = level();
             while (true)
             {
-                int level = advanceMultiple(receiver);
-                if (level < 0)
+                int currLevel = advanceMultiple(receiver);
+                if (currLevel <= 0)
                     return null;
+                if (receiver != null)
+                {
+                    if (currLevel <= prevLevel)
+                        receiver.reset(currLevel - 1);
+                    receiver.add(incomingTransition());
+                }
                 T content = content();
                 if (content != null)
                     return content;
+                prevLevel = currLevel;
             }
         }
 
 //        int advanceTo(int transition); // advance to child with this transition or higher. if none exists, ascend to parent and advance
 
-        int ascend();
+        /**
+         * Ignore the remaining children at this level or below, ascend to the parent and advance.
+         */
+        int ascend();
 
-        //        default int ascend() // ignore the remaining children at this level or below and ascend to parent and advance
+        //        default int ascend()
 //        {
 //            return advanceTo(Integer.MAX_VALUE);
 //        }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
index 56511dc9a895..24a84a353882 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieEntriesIterator.java
@@ -29,7 +29,7 @@
  * Convertor of trie entries to iterator where each entry is passed through {@link #mapContent} (to be implemented by
  * descendants).
  */
-public abstract class TrieEntriesIterator<T, V> implements Iterator<V>, Trie.TransitionsReceiver
+public abstract class TrieEntriesIterator<T, V> implements Iterator<V>, Trie.ResettingTransitionsReceiver
 {
     private final Trie.Cursor<T> cursor;
     private byte[] keyBytes = new byte[32];

From 9bb32debbffca087329bdef8c3767a259678efe7 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 16:11:35 +0300
Subject: [PATCH 132/151] Implement MergeTrie cursor

---
 .../apache/cassandra/db/tries/MergeTrie.java  | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
index 4aba992848dd..547de7f72ef0 100644
--- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -47,6 +47,12 @@ public <L> Node<T, L> root()
         return makeNode(resolver, t1.root(), t2.root());
     }
 
+    @Override
+    protected Cursor<T> cursor()
+    {
+        return new MergeCursor<>(resolver, t1, t2);
+    }
+
     private static <T, L> Node<T, L> makeNode(MergeResolver<T> resolver, Node<T, L> child1, Node<T, L> child2)
     {
         if (child1 != null && child2 != null)
@@ -160,6 +166,110 @@ else if (nc == null)
         }
     }
 
+    static class MergeCursor<T> implements Cursor<T>
+    {
+        private final MergeResolver<T> resolver;
+        private final Cursor<T> c1;
+        private final Cursor<T> c2;
+
+        boolean atC1;
+        boolean atC2;
+
+        MergeCursor(MergeResolver<T> resolver, Trie<T> t1, Trie<T> t2)
+        {
+            this.resolver = resolver;
+            this.c1 = t1.cursor();
+            this.c2 = t2.cursor();
+            atC1 = atC2 = true;
+        }
+
+        @Override
+        public int advance()
+        {
+            return checkOrder(atC1 ? c1.advance() : c1.level(),
+                              atC2 ? c2.advance() : c2.level());
+        }
+
+        @Override
+        public int ascend()
+        {
+            return checkOrder(atC1 ? c1.ascend() : c1.level(),
+                              atC2 ? c2.ascend() : c2.level());
+        }
+
+        @Override
+        public int advanceMultiple(TransitionsReceiver receiver)
+        {
+            if (atC1 & atC2)
+                return advance();
+
+            if (atC1)
+            {
+                int c2level = c2.level();
+                int c1level = c1.advanceMultiple(receiver);
+                if (c1level <= c2level)
+                    return checkOrder(c1level, c2level);
+                else
+                    return c1level;   // atC1 stays true, atC2 false, c2 remains where it is
+            }
+            else // atC2
+            {
+                int c1level = c1.level();
+                int c2level = c2.advanceMultiple(receiver);
+                if (c2level <= c1level)
+                    return checkOrder(c1level, c2level);
+                else
+                    return c2level;   // atC2 stays true, atC1 false, c1 remains where it is
+            }
+        }
+
+        private int checkOrder(int c1level, int c2level)
+        {
+            if (c1level > c2level)
+            {
+                atC1 = true;
+                atC2 = false;
+                return c1level;
+            }
+            if (c1level < c2level)
+            {
+                atC1 = false;
+                atC2 = true;
+                return c2level;
+            }
+            int c1trans = c1.incomingTransition();
+            int c2trans = c2.incomingTransition();
+            atC1 = c1trans <= c2trans;
+            atC2 = c1trans >= c2trans;
+            assert atC1 | atC2;
+            return c1level;
+        }
+
+        @Override
+        public int level()
+        {
+            return atC1 ? c1.level() : c2.level();
+        }
+
+        @Override
+        public int incomingTransition()
+        {
+            return atC1 ? c1.incomingTransition() : c2.incomingTransition();
+        }
+
+        public T content()
+        {
+            T mc = atC2 ? c2.content() : null;
+            T nc = atC1 ? c1.content() : null;
+            if (mc == null)
+                return nc;
+            else if (nc == null)
+                return mc;
+            else
+                return resolver.resolve(nc, mc);
+        }
+    }
+
     /**
      * Special instance for sources that are guaranteed (by the caller) distinct. The main difference is that we can
      * form unordered value list by concatenating sources.

From 0b0609af3afb9b90bd841733e3f9c2f0e5fb7b9e Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 16:32:43 +0300
Subject: [PATCH 133/151] Precalculate incoming, level and content for
 MergeTrie

---
 .../apache/cassandra/db/tries/MergeTrie.java  | 72 ++++++++++++++-----
 1 file changed, 54 insertions(+), 18 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
index 547de7f72ef0..ff0c5141dc65 100644
--- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -174,6 +174,9 @@ static class MergeCursor<T> implements Cursor<T>
 
         boolean atC1;
         boolean atC2;
+        int incomingTransition;
+        int level;
+        T content;
 
         MergeCursor(MergeResolver<T> resolver, Trie<T> t1, Trie<T> t2)
         {
@@ -181,6 +184,9 @@ static class MergeCursor<T> implements Cursor<T>
             this.c1 = t1.cursor();
             this.c2 = t2.cursor();
             atC1 = atC2 = true;
+            level = 0;
+            incomingTransition = -1;
+            content = null;
         }
 
         @Override
@@ -210,7 +216,11 @@ public int advanceMultiple(TransitionsReceiver receiver)
                 if (c1level <= c2level)
                     return checkOrder(c1level, c2level);
                 else
-                    return c1level;   // atC1 stays true, atC2 false, c2 remains where it is
+                {
+                    incomingTransition = c1.incomingTransition();
+                    content = c1.content();
+                    return level = c1level;   // atC1 stays true, atC2 false, c2 remains where it is
+                }
             }
             else // atC2
             {
@@ -219,7 +229,11 @@ public int advanceMultiple(TransitionsReceiver receiver)
                 if (c2level <= c1level)
                     return checkOrder(c1level, c2level);
                 else
-                    return c2level;   // atC2 stays true, atC1 false, c1 remains where it is
+                {
+                    incomingTransition = c2.incomingTransition();
+                    content = c2.content();
+                    return level = c2level;   // atC2 stays true, atC1 false, c1 remains where it is
+                }
             }
         }
 
@@ -229,44 +243,66 @@ private int checkOrder(int c1level, int c2level)
             {
                 atC1 = true;
                 atC2 = false;
-                return c1level;
+                incomingTransition = c1.incomingTransition();
+                content = c1.content();
+                return level = c1level;
             }
             if (c1level < c2level)
             {
                 atC1 = false;
                 atC2 = true;
-                return c2level;
+                incomingTransition = c2.incomingTransition();
+                content = c2.content();
+                return level = c2level;
             }
+
             int c1trans = c1.incomingTransition();
             int c2trans = c2.incomingTransition();
-            atC1 = c1trans <= c2trans;
-            atC2 = c1trans >= c2trans;
-            assert atC1 | atC2;
-            return c1level;
+            if (c1trans < c2trans)
+            {
+                atC1 = true;
+                atC2 = false;
+                incomingTransition = c1trans;
+                content = c1.content();
+                return level = c1level;
+            }
+            if (c1trans > c2trans)
+            {
+                atC1 = false;
+                atC2 = true;
+                incomingTransition = c2trans;
+                content = c2.content();
+                return level = c2level;
+            }
+
+            atC1 = atC2 = true;
+            incomingTransition = c1trans;
+            T c1content = c1.content();
+            T c2content = c2.content();
+            content = c1content == null
+                      ? c2content
+                      : c2content == null
+                        ? c1content
+                        : resolver.resolve(c1content, c2content);
+            return level = c1level;
         }
 
         @Override
         public int level()
         {
-            return atC1 ? c1.level() : c2.level();
+            return level;
         }
 
         @Override
         public int incomingTransition()
         {
-            return atC1 ? c1.incomingTransition() : c2.incomingTransition();
+            return incomingTransition;
         }
 
+        @Override
         public T content()
         {
-            T mc = atC2 ? c2.content() : null;
-            T nc = atC1 ? c1.content() : null;
-            if (mc == null)
-                return nc;
-            else if (nc == null)
-                return mc;
-            else
-                return resolver.resolve(nc, mc);
+            return content;
         }
     }
 

From bd51eb1548392fc17341625485b4b82a2cbf384d Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 16:33:56 +0300
Subject: [PATCH 134/151] Revert "Precalculate incoming, level and content for
 MergeTrie" No performance benefit and for uglier code

---
 .../apache/cassandra/db/tries/MergeTrie.java  | 72 +++++--------------
 1 file changed, 18 insertions(+), 54 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
index ff0c5141dc65..547de7f72ef0 100644
--- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -174,9 +174,6 @@ static class MergeCursor<T> implements Cursor<T>
 
         boolean atC1;
         boolean atC2;
-        int incomingTransition;
-        int level;
-        T content;
 
         MergeCursor(MergeResolver<T> resolver, Trie<T> t1, Trie<T> t2)
         {
@@ -184,9 +181,6 @@ static class MergeCursor<T> implements Cursor<T>
             this.c1 = t1.cursor();
             this.c2 = t2.cursor();
             atC1 = atC2 = true;
-            level = 0;
-            incomingTransition = -1;
-            content = null;
         }
 
         @Override
@@ -216,11 +210,7 @@ public int advanceMultiple(TransitionsReceiver receiver)
                 if (c1level <= c2level)
                     return checkOrder(c1level, c2level);
                 else
-                {
-                    incomingTransition = c1.incomingTransition();
-                    content = c1.content();
-                    return level = c1level;   // atC1 stays true, atC2 false, c2 remains where it is
-                }
+                    return c1level;   // atC1 stays true, atC2 false, c2 remains where it is
             }
             else // atC2
             {
@@ -229,11 +219,7 @@ public int advanceMultiple(TransitionsReceiver receiver)
                 if (c2level <= c1level)
                     return checkOrder(c1level, c2level);
                 else
-                {
-                    incomingTransition = c2.incomingTransition();
-                    content = c2.content();
-                    return level = c2level;   // atC2 stays true, atC1 false, c1 remains where it is
-                }
+                    return c2level;   // atC2 stays true, atC1 false, c1 remains where it is
             }
         }
 
@@ -243,66 +229,44 @@ private int checkOrder(int c1level, int c2level)
             {
                 atC1 = true;
                 atC2 = false;
-                incomingTransition = c1.incomingTransition();
-                content = c1.content();
-                return level = c1level;
+                return c1level;
             }
             if (c1level < c2level)
             {
                 atC1 = false;
                 atC2 = true;
-                incomingTransition = c2.incomingTransition();
-                content = c2.content();
-                return level = c2level;
+                return c2level;
             }
-
             int c1trans = c1.incomingTransition();
             int c2trans = c2.incomingTransition();
-            if (c1trans < c2trans)
-            {
-                atC1 = true;
-                atC2 = false;
-                incomingTransition = c1trans;
-                content = c1.content();
-                return level = c1level;
-            }
-            if (c1trans > c2trans)
-            {
-                atC1 = false;
-                atC2 = true;
-                incomingTransition = c2trans;
-                content = c2.content();
-                return level = c2level;
-            }
-
-            atC1 = atC2 = true;
-            incomingTransition = c1trans;
-            T c1content = c1.content();
-            T c2content = c2.content();
-            content = c1content == null
-                      ? c2content
-                      : c2content == null
-                        ? c1content
-                        : resolver.resolve(c1content, c2content);
-            return level = c1level;
+            atC1 = c1trans <= c2trans;
+            atC2 = c1trans >= c2trans;
+            assert atC1 | atC2;
+            return c1level;
         }
 
         @Override
         public int level()
         {
-            return level;
+            return atC1 ? c1.level() : c2.level();
         }
 
         @Override
         public int incomingTransition()
         {
-            return incomingTransition;
+            return atC1 ? c1.incomingTransition() : c2.incomingTransition();
         }
 
-        @Override
         public T content()
         {
-            return content;
+            T mc = atC2 ? c2.content() : null;
+            T nc = atC1 ? c1.content() : null;
+            if (mc == null)
+                return nc;
+            else if (nc == null)
+                return mc;
+            else
+                return resolver.resolve(nc, mc);
         }
     }
 

From 847f1a737874e67fd31f74cf116a2d1c528c34c8 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 17:52:38 +0300
Subject: [PATCH 135/151] Fix MergeTrie.ascend

---
 src/java/org/apache/cassandra/db/tries/MergeTrie.java | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
index 547de7f72ef0..36a8fa7f905f 100644
--- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -193,8 +193,11 @@ public int advance()
         @Override
         public int ascend()
         {
-            return checkOrder(atC1 ? c1.ascend() : c1.level(),
-                              atC2 ? c2.ascend() : c2.level());
+            int c1level = c1.level();
+            int c2level = c2.level();
+            int level = Math.max(c1level, c2level);
+            return checkOrder(c1level == level ? c1.ascend() : c1level,
+                              c2level == level ? c2.ascend() : c2level);
         }
 
         @Override

From f61144e53f5eafc07e4d4b263962eaf43b7d4e14 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 17:53:09 +0300
Subject: [PATCH 136/151] First CollectionMergeTrie implementations

---
 .../db/tries/CollectionMergeTrie.java         | 181 ++++++++++++++++++
 .../org/apache/cassandra/db/tries/Trie.java   |   6 +-
 2 files changed, 184 insertions(+), 3 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
index c6e311bea3a9..3b04537b8e5f 100644
--- a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
@@ -19,6 +19,7 @@
 
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Comparator;
 import java.util.List;
 
 import com.google.common.collect.Iterables;
@@ -51,6 +52,12 @@ public <L> Node<T, L> root()
         return makeMerge(resolver, nodes);
     }
 
+    @Override
+    protected Cursor<T> cursor()
+    {
+        return new CollectionMergeCursor<>(resolver, inputs);
+    }
+
     private static <T, L> Node<T, L> makeMerge(CollectionMergeResolver<T> resolver, List<Node<T, L>> nodes)
     {
         switch (nodes.size())
@@ -293,6 +300,180 @@ public T content()
         }
     }
 
+    static <T> int compareCursors(Cursor<T> c1, Cursor<T> c2)
+    {
+        int c1level = c1.level();
+        int c2level = c2.level();
+        if (c1level != c2level)
+            return -Integer.compare(c1level, c2level);
+        return Integer.compare(c1.incomingTransition(), c2.incomingTransition());
+    }
+
+    static class CollectionMergeCursor<T> implements Cursor<T>
+    {
+        private final CollectionMergeResolver<T> resolver;
+        private final Cursor[] heap;
+        private final List<T> contents;
+
+        public CollectionMergeCursor(CollectionMergeResolver<T> resolver, Collection<? extends Trie<T>> inputs)
+        {
+            this.resolver = resolver;
+            int count = inputs.size();
+            heap = new Cursor[count];
+            contents = new ArrayList<>(count);
+            int i = 0;
+            for (Trie<T> trie : inputs)
+            {
+                Cursor<T> cursor = trie.cursor();
+                if (cursor.level() < 0)
+                    heap[--count] = cursor;    // empty trie / no root, put it at the end
+                else
+                    heap[i++] = cursor;
+            }
+            // head is now well-formed, count many cursors are equal and need advancing
+        }
+
+        @Override
+        public int advance()
+        {
+            advance(0);
+            return level();
+        }
+
+        /**
+         * Advance the state of the input at the given index and any of its descendants that are at the same
+         * transition byte and restore the heap invariant for the subtree rooted at the given index.
+         * Calls itself recursively and used by advanceState with index = 0 and transition = state[0].transition
+         * to advance the state of the merge.
+         */
+        private void advance(int index)
+        {
+            Cursor<T> item = heap[index];
+
+            // If the children are at the same transition byte, they also need advancing and their subheap
+            // invariant to be restored.
+            int next = index * 2 + 1;
+            if (next < heap.length && compareCursors(item, heap[next]) == 0)
+                advance(next);
+            ++next;
+            if (next < heap.length && compareCursors(item, heap[next]) == 0)
+                advance(next);
+
+            item.advance();
+
+            // At this point the heaps at both children are advanced and well-formed. Place current node in its
+            // proper position.
+            heapifyDown(item, index);
+            // The heap rooted at index is now advanced and well-formed.
+        }
+
+        /**
+         * Push the given state down in the heap from the given index until it finds its proper place among
+         * the subheap rooted at that position.
+         */
+        private void heapifyDown(Cursor<T> item, int index)
+        {
+            while (true)
+            {
+                int next = index * 2 + 1;
+                if (next >= heap.length)
+                    break;
+                // Select the smaller of the two children to push down to.
+                if (next + 1 < heap.length && compareCursors(heap[next], heap[next + 1]) > 0)
+                    ++next;
+                // If the child is greater, the invariant has been restored.
+                if (compareCursors(heap[next], item) >= 0)
+                    break;
+                heap[index] = heap[next];
+                index = next;
+            }
+            heap[index] = item;
+        }
+
+        // TODO: implement advanceMultiple
+        // TODO: add single-source optimization
+
+        @Override
+        public int ascend()
+        {
+            ascend(0, level());
+            return level();
+        }
+
+        /**
+         * Advance the state of the input at the given index and any of its descendants that are at the same
+         * transition byte and restore the heap invariant for the subtree rooted at the given index.
+         * Calls itself recursively and used by advanceState with index = 0 and transition = state[0].transition
+         * to advance the state of the merge.
+         */
+        private void ascend(int index, int level)
+        {
+            Cursor<T> item = heap[index];
+
+            // If the children are at the same transition byte, they also need advancing and their subheap
+            // invariant to be restored.
+            int next = index * 2 + 1;
+            if (next < heap.length && heap[next].level() == level)
+                ascend(next, level);
+            ++next;
+            if (next < heap.length && heap[next].level() == level)
+                ascend(next, level);
+
+            item.ascend();
+
+            // At this point the heaps at both children are advanced and well-formed. Place current node in its
+            // proper position.
+            heapifyDown(item, index);
+            // The heap rooted at index is now advanced and well-formed.
+        }
+
+        @Override
+        public int level()
+        {
+            return heap[0].level();  // okay also if count == 0
+        }
+
+        @Override
+        public int incomingTransition()
+        {
+            return heap[0].incomingTransition();
+        }
+
+        @Override
+        public T content()
+        {
+            contents.clear();   // this is preferably done immediately after
+            collectContent(0);
+            switch (contents.size())
+            {
+                case 0:
+                    return null;
+                case 1:
+                    return contents.get(0);
+                default:
+                    return resolver.resolve(contents);
+            }
+        }
+
+        private void collectContent(int index)
+        {
+            Cursor<T> item = heap[index];
+
+            T itemContent = item.content();
+            if (itemContent != null)
+                contents.add(itemContent);
+
+            // If the children are at the same transition byte, they also need advancing and their subheap
+            // invariant to be restored.
+            int next = index * 2 + 1;
+            if (next < heap.length && compareCursors(item, heap[next]) == 0)
+                collectContent(next);
+            ++next;
+            if (next < heap.length && compareCursors(item, heap[next]) == 0)
+                collectContent(next);
+        }
+    }
+
     /**
      * Special instance for sources that are guaranteed distinct. The main difference is that we can form unordered
      * value list by concatenating sources.
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index 8bcfb2d84bbc..577d0cf32ab7 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -264,9 +264,9 @@ default T advanceToContent(ResettingTransitionsReceiver receiver) // advances al
 //            return advanceTo(Integer.MAX_VALUE);
 //        }
 
-        int level(); // return current state
-        int incomingTransition(); // not set in advanceMultiple/ToCursor
-        T content();
+        int level(); // return current state; if just starting / on root, return 0
+        int incomingTransition(); // return the last transition taken; if just starting / on root, return -1
+        T content(); // return content -- may be non-null on root
 
     }
 

From 4e64584c57c555b22d1c3dca5ba8d954f8159677 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Fri, 23 Apr 2021 21:01:59 +0300
Subject: [PATCH 137/151] CollectionMergeTrie with advanceMultiple

---
 .../db/tries/CollectionMergeTrie.java         | 121 ++++++++++--------
 1 file changed, 69 insertions(+), 52 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
index 3b04537b8e5f..61b0b282ea0a 100644
--- a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
@@ -312,52 +312,57 @@ static <T> int compareCursors(Cursor<T> c1, Cursor<T> c2)
     static class CollectionMergeCursor<T> implements Cursor<T>
     {
         private final CollectionMergeResolver<T> resolver;
-        private final Cursor[] heap;
+        private final Cursor<T>[] heap;
+        private Cursor<T> head;
         private final List<T> contents;
 
         public CollectionMergeCursor(CollectionMergeResolver<T> resolver, Collection<? extends Trie<T>> inputs)
         {
             this.resolver = resolver;
             int count = inputs.size();
-            heap = new Cursor[count];
+            heap = new Cursor[count - 1];
             contents = new ArrayList<>(count);
-            int i = 0;
+            int i = -1;
+            --count;
             for (Trie<T> trie : inputs)
             {
                 Cursor<T> cursor = trie.cursor();
-                if (cursor.level() < 0)
+                if (cursor.level() < 0 && count > 0)
                     heap[--count] = cursor;    // empty trie / no root, put it at the end
-                else
+                else if (i >= 0)
                     heap[i++] = cursor;
+                else
+                {
+                    head = cursor;
+                    ++i;
+                }
             }
-            // head is now well-formed, count many cursors are equal and need advancing
         }
 
         @Override
         public int advance()
         {
             advance(0);
-            return level();
+            return maybeSwapHead(head.advance());
         }
 
         /**
          * Advance the state of the input at the given index and any of its descendants that are at the same
          * transition byte and restore the heap invariant for the subtree rooted at the given index.
-         * Calls itself recursively and used by advanceState with index = 0 and transition = state[0].transition
-         * to advance the state of the merge.
+         * Calls itself recursively and used by advance with index = 0 to advance the state of the merge.
          */
         private void advance(int index)
         {
+            if (index >= heap.length)
+                return;
             Cursor<T> item = heap[index];
+            if (head.level() != item.level() || head.incomingTransition() != item.incomingTransition())
+                return;
 
             // If the children are at the same transition byte, they also need advancing and their subheap
             // invariant to be restored.
-            int next = index * 2 + 1;
-            if (next < heap.length && compareCursors(item, heap[next]) == 0)
-                advance(next);
-            ++next;
-            if (next < heap.length && compareCursors(item, heap[next]) == 0)
-                advance(next);
+            advance(index * 2 + 1);
+            advance(index * 2 + 2);
 
             item.advance();
 
@@ -390,87 +395,99 @@ private void heapifyDown(Cursor<T> item, int index)
             heap[index] = item;
         }
 
-        // TODO: implement advanceMultiple
-        // TODO: add single-source optimization
+        private int maybeSwapHead(int headLevel)
+        {
+            int heap0Level = heap[0].level();
+            if (headLevel > heap0Level ||
+                (headLevel == heap0Level && head.incomingTransition() <= heap[0].incomingTransition()))
+                return headLevel;
+            // otherwise we need to swap heap and heap[0]
+            Cursor<T> newHeap0 = head;
+            head = heap[0];
+            heapifyDown(newHeap0, 0);
+            return heap0Level;
+        }
+
+        @Override
+        public int advanceMultiple(TransitionsReceiver receiver)
+        {
+            Cursor<T> heap0 = heap[0];
+            if (head.level() == heap0.level() && head.incomingTransition() == heap0.incomingTransition())
+                return advance();   // more than one source at current position, can't do multiple.
+
+            return maybeSwapHead(head.advanceMultiple(receiver));
+        }
 
         @Override
         public int ascend()
         {
-            ascend(0, level());
-            return level();
+            ascend(0);
+            return maybeSwapHead(head.ascend());
         }
 
-        /**
-         * Advance the state of the input at the given index and any of its descendants that are at the same
-         * transition byte and restore the heap invariant for the subtree rooted at the given index.
-         * Calls itself recursively and used by advanceState with index = 0 and transition = state[0].transition
-         * to advance the state of the merge.
-         */
-        private void ascend(int index, int level)
+        private void ascend(int index)
         {
+            if (index >= heap.length)
+                return;
             Cursor<T> item = heap[index];
+            if (head.level() != item.level())
+                return;
 
-            // If the children are at the same transition byte, they also need advancing and their subheap
-            // invariant to be restored.
-            int next = index * 2 + 1;
-            if (next < heap.length && heap[next].level() == level)
-                ascend(next, level);
-            ++next;
-            if (next < heap.length && heap[next].level() == level)
-                ascend(next, level);
+            ascend(index * 2 + 1);
+            ascend(index * 2 + 2);
 
             item.ascend();
-
-            // At this point the heaps at both children are advanced and well-formed. Place current node in its
-            // proper position.
             heapifyDown(item, index);
-            // The heap rooted at index is now advanced and well-formed.
         }
 
         @Override
         public int level()
         {
-            return heap[0].level();  // okay also if count == 0
+            return head.level();
         }
 
         @Override
         public int incomingTransition()
         {
-            return heap[0].incomingTransition();
+            return head.incomingTransition();
         }
 
         @Override
         public T content()
         {
-            contents.clear();   // this is preferably done immediately after
+            T itemContent = head.content();
+            if (itemContent != null)
+                contents.add(itemContent);
+
             collectContent(0);
+            T toReturn;
             switch (contents.size())
             {
                 case 0:
-                    return null;
+                    toReturn = null;
                 case 1:
-                    return contents.get(0);
+                    toReturn = contents.get(0);
                 default:
-                    return resolver.resolve(contents);
+                    toReturn = resolver.resolve(contents);
             }
+            contents.clear();
+            return toReturn;
         }
 
         private void collectContent(int index)
         {
+            if (index >= heap.length)
+                return;
             Cursor<T> item = heap[index];
+            if (head.level() != item.level() || head.incomingTransition() != item.incomingTransition())
+                return;
 
             T itemContent = item.content();
             if (itemContent != null)
                 contents.add(itemContent);
 
-            // If the children are at the same transition byte, they also need advancing and their subheap
-            // invariant to be restored.
-            int next = index * 2 + 1;
-            if (next < heap.length && compareCursors(item, heap[next]) == 0)
-                collectContent(next);
-            ++next;
-            if (next < heap.length && compareCursors(item, heap[next]) == 0)
-                collectContent(next);
+            collectContent(index * 2 + 1);
+            collectContent(index * 2 + 2);
         }
     }
 

From 5106e4a841c074630dbca07060edfc2d008315fc Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Sat, 24 Apr 2021 10:06:27 +0300
Subject: [PATCH 138/151] More elaborate MemtableReadTrie.advanceMultiple

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 47 +++++++++++++++----
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 41943e5fb7c5..9a122a2fefa0 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -1018,6 +1018,10 @@ private boolean getChainTransition(int node)
             return true;
         }
 
+        // TODO: don't redo buffer/offset calculations
+        // TODO: use sparse order word
+        // TODO: reexamine backtracking
+
         @Override
         public int advanceMultiple(TransitionsReceiver receiver)
         {
@@ -1025,18 +1029,41 @@ public int advanceMultiple(TransitionsReceiver receiver)
             if (!isChainNode(node))
                 return advance();
 
-            int pointer = chainBlockChildPointer(node);
-
-            int length = pointer - node - 1;
-            if (receiver != null && length > 0)
+            while (true)
             {
-                UnsafeBuffer buffer = getBuffer(node);
-                int ofs = getOffset(node);
-                receiver.add(buffer, ofs, length);
-            }
+                int pointer = chainBlockChildPointer(node);
+                int child = getInt(pointer);
+                if (isNullOrLeaf(child) || offset(child) == PREFIX_OFFSET)
+                {
+                    int length = pointer - node - 1;
+                    if (receiver != null && length > 0)
+                    {
+                        UnsafeBuffer buffer = getBuffer(node);
+                        int ofs = getOffset(node);
+                        receiver.add(buffer, ofs, length);
+                    }
+
+                    level += length;
+                    return descendInto(getInt(pointer), getByte(pointer - 1));
+                }
 
-            level += length;
-            return descendInto(getInt(pointer), getByte(pointer - 1));
+                int length = pointer - node;
+                if (receiver != null)
+                {
+                    UnsafeBuffer buffer = getBuffer(node);
+                    int ofs = getOffset(node);
+                    receiver.add(buffer, ofs, length);
+                }
+
+                level += length;
+                if (!isChainNode(child))
+                {
+                    boolean success = advanceToNextChild(child, -1);
+                    assert success;
+                    return level;
+                }
+                node = child;
+            }
         }
 
         public int level()

From 87cefe047c60bca582695a041ac09e6c07231f1a Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Sat, 24 Apr 2021 10:07:02 +0300
Subject: [PATCH 139/151] greaterCursor and equalCursor

---
 .../db/tries/CollectionMergeTrie.java         | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
index 61b0b282ea0a..340457cc497d 100644
--- a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
@@ -300,13 +300,18 @@ public T content()
         }
     }
 
-    static <T> int compareCursors(Cursor<T> c1, Cursor<T> c2)
+    static <T> boolean greaterCursor(Cursor<T> c1, Cursor<T> c2)
     {
         int c1level = c1.level();
         int c2level = c2.level();
         if (c1level != c2level)
-            return -Integer.compare(c1level, c2level);
-        return Integer.compare(c1.incomingTransition(), c2.incomingTransition());
+            return c1level < c2level;
+        return c1.incomingTransition() > c2.incomingTransition();
+    }
+
+    static <T> boolean equalCursor(Cursor<T> c1, Cursor<T> c2)
+    {
+        return c1.level() == c2.level() && c1.incomingTransition() == c2.incomingTransition();
     }
 
     static class CollectionMergeCursor<T> implements Cursor<T>
@@ -356,7 +361,7 @@ private void advance(int index)
             if (index >= heap.length)
                 return;
             Cursor<T> item = heap[index];
-            if (head.level() != item.level() || head.incomingTransition() != item.incomingTransition())
+            if (!equalCursor(item, head))
                 return;
 
             // If the children are at the same transition byte, they also need advancing and their subheap
@@ -384,10 +389,10 @@ private void heapifyDown(Cursor<T> item, int index)
                 if (next >= heap.length)
                     break;
                 // Select the smaller of the two children to push down to.
-                if (next + 1 < heap.length && compareCursors(heap[next], heap[next + 1]) > 0)
+                if (next + 1 < heap.length && greaterCursor(heap[next], heap[next + 1]))
                     ++next;
-                // If the child is greater, the invariant has been restored.
-                if (compareCursors(heap[next], item) >= 0)
+                // If the child is greater or equal, the invariant has been restored.
+                if (!greaterCursor(item, heap[next]))
                     break;
                 heap[index] = heap[next];
                 index = next;
@@ -411,8 +416,7 @@ private int maybeSwapHead(int headLevel)
         @Override
         public int advanceMultiple(TransitionsReceiver receiver)
         {
-            Cursor<T> heap0 = heap[0];
-            if (head.level() == heap0.level() && head.incomingTransition() == heap0.incomingTransition())
+            if (equalCursor(heap[0], head))
                 return advance();   // more than one source at current position, can't do multiple.
 
             return maybeSwapHead(head.advanceMultiple(receiver));
@@ -465,10 +469,13 @@ public T content()
             {
                 case 0:
                     toReturn = null;
+                    break;
                 case 1:
                     toReturn = contents.get(0);
+                    break;
                 default:
                     toReturn = resolver.resolve(contents);
+                    break;
             }
             contents.clear();
             return toReturn;
@@ -479,7 +486,7 @@ private void collectContent(int index)
             if (index >= heap.length)
                 return;
             Cursor<T> item = heap[index];
-            if (head.level() != item.level() || head.incomingTransition() != item.incomingTransition())
+            if (!equalCursor(item, head))
                 return;
 
             T itemContent = item.content();

From 7e62a7e6ee17b08406415b133268a85a37118e8f Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Sat, 24 Apr 2021 10:32:24 +0300
Subject: [PATCH 140/151] some don't redo buffer/offset calculations

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 34 ++++++++-----------
 .../tries/MemtableTrieWriteBench.java         |  8 ++---
 2 files changed, 19 insertions(+), 23 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 9a122a2fefa0..e970cbb192dd 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -979,13 +979,15 @@ private boolean nextValidSparseTransition(int node, int transition)
             int minValid = Integer.MAX_VALUE;
             int minChild = NONE;
             int validCount = 0;
+            UnsafeBuffer buffer = getBuffer(node);
+            int ofs = getOffset(node);
 
             for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
             {
-                int child = getInt(node + SPARSE_CHILDREN_OFFSET + i * 4);
+                int child = buffer.getInt(ofs + SPARSE_CHILDREN_OFFSET + i * 4);
                 if (child == NONE)
                     break;
-                int t = getByte(node + SPARSE_BYTES_OFFSET + i);
+                int t = buffer.getByte(ofs + SPARSE_BYTES_OFFSET + i) & 0xFF;
                 if (t >= transition)
                 {
                     if (t < minValid)
@@ -1019,8 +1021,8 @@ private boolean getChainTransition(int node)
         }
 
         // TODO: don't redo buffer/offset calculations
-        // TODO: use sparse order word
-        // TODO: reexamine backtracking
+        // TODO: maybe use sparse order word
+        // TODO: reexamine backtracking, separate backtrack positions for dense sub-levels
 
         @Override
         public int advanceMultiple(TransitionsReceiver receiver)
@@ -1031,31 +1033,25 @@ public int advanceMultiple(TransitionsReceiver receiver)
 
             while (true)
             {
-                int pointer = chainBlockChildPointer(node);
-                int child = getInt(pointer);
+                UnsafeBuffer buffer = getBuffer(node);
+                int ofs = getOffset(node);
+                int pointer = chainBlockChildPointer(ofs);
+                int child = buffer.getInt(pointer);
+                int length = pointer - ofs;
                 if (isNullOrLeaf(child) || offset(child) == PREFIX_OFFSET)
                 {
-                    int length = pointer - node - 1;
+                    --length;   // leave the last byte for incomingTransition
                     if (receiver != null && length > 0)
-                    {
-                        UnsafeBuffer buffer = getBuffer(node);
-                        int ofs = getOffset(node);
                         receiver.add(buffer, ofs, length);
-                    }
-
                     level += length;
-                    return descendInto(getInt(pointer), getByte(pointer - 1));
+
+                    return descendInto(child, buffer.getByte(pointer - 1) & 0xFF);
                 }
 
-                int length = pointer - node;
                 if (receiver != null)
-                {
-                    UnsafeBuffer buffer = getBuffer(node);
-                    int ofs = getOffset(node);
                     receiver.add(buffer, ofs, length);
-                }
-
                 level += length;
+
                 if (!isChainNode(child))
                 {
                     boolean success = advanceToNextChild(child, -1);
diff --git a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
index 9a0287ea4505..36a9b012489b 100644
--- a/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
+++ b/test/microbench/org/apache/cassandra/test/microbench/tries/MemtableTrieWriteBench.java
@@ -59,7 +59,7 @@ public void putSequential(Blackhole bh) throws MemtableTrie.SpaceExhaustedExcept
             buf.putLong(keyLength - 8, l);
             trie.putRecursive(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
         }
-        System.out.println(trie.valuesCount());
+//        System.out.println(trie.valuesCount());
         bh.consume(trie);
     }
 
@@ -75,7 +75,7 @@ public void putRandom(Blackhole bh) throws MemtableTrie.SpaceExhaustedException
             rand.nextBytes(buf);
             trie.putRecursive(ByteComparable.fixedLength(buf), Byte.valueOf(buf[0]), resolver);
         }
-        System.out.println(trie.valuesCount());
+//        System.out.println(trie.valuesCount());
         bh.consume(trie);
     }
 
@@ -91,7 +91,7 @@ public void applySequential(Blackhole bh) throws MemtableTrie.SpaceExhaustedExce
             buf.putLong(keyLength - 8, l);
             trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf((byte) (l >> 56)), resolver);
         }
-        System.out.println(trie.valuesCount());
+//        System.out.println(trie.valuesCount());
         bh.consume(trie);
     }
 
@@ -107,7 +107,7 @@ public void applyRandom(Blackhole bh) throws MemtableTrie.SpaceExhaustedExceptio
             rand.nextBytes(buf);
             trie.putSingleton(ByteComparable.fixedLength(buf), Byte.valueOf(buf[0]), resolver);
         }
-        System.out.println(trie.valuesCount());
+//        System.out.println(trie.valuesCount());
         bh.consume(trie);
     }
 }

From 30eca9e83990dc23fdf40610cce1d9b900fd0c37 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Tue, 10 Aug 2021 01:16:01 +0300
Subject: [PATCH 141/151] Direct TrieDumper

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 82 +++++++++++--------
 .../org/apache/cassandra/db/tries/Trie.java   |  7 +-
 .../apache/cassandra/db/tries/TrieDumper.java | 79 +++++++++---------
 .../apache/cassandra/db/tries/TrieWalker.java | 76 -----------------
 4 files changed, 91 insertions(+), 153 deletions(-)
 delete mode 100644 src/java/org/apache/cassandra/db/tries/TrieWalker.java

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index e970cbb192dd..f8e71f976db1 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -248,7 +248,9 @@ int getOffset(int pos)
     }
 
 
-    /** Pointer offset for a node pointer. */
+    /**
+     * Pointer offset for a node pointer.
+     */
     int offset(int pos)
     {
         return pos & (BLOCK_SIZE - 1);
@@ -264,7 +266,10 @@ final int getShort(int pos)
         return getBuffer(pos).getShort(getOffset(pos)) & 0xFFFF;
     }
 
-    final int getInt(int pos) { return getBuffer(pos).getInt(getOffset(pos)); }
+    final int getInt(int pos)
+    {
+        return getBuffer(pos).getInt(getOffset(pos));
+    }
 
     T getContent(int index)
     {
@@ -302,7 +307,9 @@ private int chainBlockChildPointer(int node)
         return (node & -BLOCK_SIZE) | LAST_POINTER_OFFSET;
     }
 
-    /** Create a trie node for the given pointer */
+    /**
+     * Create a trie node for the given pointer
+     */
     <L> BaseNode<L> makeNode(int node, L parent)
     {
         if (isNull(node))
@@ -324,7 +331,9 @@ <L> BaseNode<L> makeNode(int node, L parent)
         }
     }
 
-    /** Get a node's child for the given transition character */
+    /**
+     * Get a node's child for the given transition character
+     */
     int getChild(int node, int trans)
     {
         if (isNullOrLeaf(node))
@@ -369,7 +378,7 @@ protected int followContentTransition(int node)
 
     /**
      * Advance as long as the cell pointed to by the given pointer will let you.
-     *
+     * <p>
      * This is the same as getChild(node, first), except for chain nodes where it would walk the fill chain as long as
      * the input source matches.
      */
@@ -403,7 +412,9 @@ int advance(int node, int first, ByteSource rest)
         }
     }
 
-    /** Get the child for the given transition character, knowing that the node is sparse */
+    /**
+     * Get the child for the given transition character, knowing that the node is sparse
+     */
     int getSparseChild(int node, int trans)
     {
         for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
@@ -423,31 +434,39 @@ int getSparseChild(int node, int trans)
         return NONE;
     }
 
-    /** Given a transition, returns the corresponding index (within the node block) of the pointer to the mid block of
-     * a split node. */
+    /**
+     * Given a transition, returns the corresponding index (within the node block) of the pointer to the mid block of
+     * a split node.
+     */
     int splitNodeMidIndex(int trans)
     {
         // first 2 bytes of the 2-3-3 split
         return (trans >> 6);
     }
 
-    /** Given a transition, returns the corresponding index (within the mid block) of the pointer to the tail block of
-     * a split node. */
+    /**
+     * Given a transition, returns the corresponding index (within the mid block) of the pointer to the tail block of
+     * a split node.
+     */
     int splitNodeTailIndex(int trans)
     {
         // second 3 bytes of the 2-3-3 split
         return (trans >> 3) & 0x7;
     }
 
-    /** Given a transition, returns the corresponding index (within the tail block) of the pointer to the child of
-     * a split node. */
+    /**
+     * Given a transition, returns the corresponding index (within the tail block) of the pointer to the child of
+     * a split node.
+     */
     int splitNodeChildIndex(int trans)
     {
         // third 3 bytes of the 2-3-3 split
         return trans & 0x7;
     }
 
-    /** Get the child for the given transition character, knowing that the node is split */
+    /**
+     * Get the child for the given transition character, knowing that the node is split
+     */
     int getSplitChild(int node, int trans)
     {
         int mid = getInt(node + SPLIT_POINTER_OFFSET + splitNodeMidIndex(trans) * 4);
@@ -460,7 +479,9 @@ int getSplitChild(int node, int trans)
         return getInt(tail + splitNodeChildIndex(trans) * 4);
     }
 
-    /** Get the content for a given node */
+    /**
+     * Get the content for a given node
+     */
     T getNodeContent(int node)
     {
         if (isLeaf(node))
@@ -577,7 +598,7 @@ Remaining nextValid(int trans)
         void dump(int indent, StringBuilder b, Function<T, String> contentToString)
         {
             indent++;
-            b.append(" -> Split\n");
+            b.append(" -> [Split]\n");
             for (int idx = 0; idx < 256; ++idx)
             {
                 BaseNode<L> child = makeNode(getChild(idx), null);
@@ -632,7 +653,7 @@ public Remaining advanceIteration()
         void dump(int indent, StringBuilder b, Function<T, String> contentToString)
         {
             indent++;
-            b.append(" -> Sparse\n");
+            b.append(" -> [Sparse]\n");
             for (int idx = 0; idx < SPARSE_CHILD_COUNT; ++idx)
             {
                 BaseNode<L> child = makeNode(getInt(node + SPARSE_CHILDREN_OFFSET + idx * 4), null);
@@ -699,13 +720,13 @@ public BaseNode<L> getUniqueDescendant(L parentLink, TransitionsReceiver receive
         @Override
         void dump(int indent, StringBuilder b, Function<T, String> contentToString)
         {
-            b.append(" -> Chain\n");
+            b.append(" -> [Chain]\n");
             for (int i = 0; i < indent + 1; ++i)
                 b.append("  ");
             int limit = chainBlockChildPointer(node);
             for (int p = node; p < limit; ++p)
             {
-                indent ++;
+                indent++;
                 b.append(String.format("%02x", getByte(p)));
             }
             makeNode(getInt(limit), null).dump(indent, b, contentToString);
@@ -847,14 +868,17 @@ private int node(int backtrackLevel)
         {
             return backtrack[backtrackLevel * 3 + 0];
         }
+
         private int data(int backtrackLevel)
         {
             return backtrack[backtrackLevel * 3 + 1];
         }
+
         private int level(int backtrackLevel)
         {
             return backtrack[backtrackLevel * 3 + 2];
         }
+
         void addBacktrack(int node, int data, int level)
         {
             if (backtrackLevel * 3 >= backtrack.length)
@@ -924,12 +948,12 @@ private boolean advanceToNextChild(int node, int transition)
 
             switch (offset(node))
             {
-            case SPLIT_OFFSET:
-                return nextValidSplitTransition(node, transition + 1);
-            case SPARSE_OFFSET:
-                return nextValidSparseTransition(node, transition + 1);
-            default:
-                return getChainTransition(node);
+                case SPLIT_OFFSET:
+                    return nextValidSplitTransition(node, transition + 1);
+                case SPARSE_OFFSET:
+                    return nextValidSparseTransition(node, transition + 1);
+                default:
+                    return getChainTransition(node);
             }
         }
 
@@ -1088,7 +1112,6 @@ public MemtableCursor cursor()
         return new MemtableCursor();
     }
 
-
     /*
      Direct read methods
      */
@@ -1132,13 +1155,4 @@ public String dump(Function<T, String> contentToString)
             b.append("empty");
         return b.toString();
     }
-
-    /**
-     * Override as non-throwing.
-     */
-    @Override
-    public String dump()
-    {
-        return dump(Object::toString);
-    }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index 577d0cf32ab7..e953d61873e8 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -309,11 +309,6 @@ public Node<T, L> getCurrentChild(L parent)
         }
     }
 
-    public <V> V walk(TrieWalker<T, V> walker)
-    {
-        return TrieWalker.process(walker, this);
-    }
-
     public String dump()
     {
         return dump(Object::toString);
@@ -321,7 +316,7 @@ public String dump()
 
     public String dump(Function<T, String> contentToString)
     {
-        return walk(new TrieDumper<>(contentToString));
+        return TrieDumper.process(contentToString, this);
     }
 
     /**
diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
index d031f6c46554..01cd63b2bb0c 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieDumper.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
@@ -19,59 +19,64 @@
 
 import java.util.function.Function;
 
+import org.agrona.concurrent.UnsafeBuffer;
+
 /**
  * Simple utility class for dumping the structure of a trie to string.
  */
-class TrieDumper<T> implements TrieWalker<T, String>
+class TrieDumper<T>
 {
-    // TODO: Test then make simpler direct version
-    private final Function<T, String> contentToString;
-    private final StringBuilder b = new StringBuilder();
-    private int depth = -1;
-    private boolean indented = true;
-
-    TrieDumper(Function<T, String> contentToString)
+    public static <T> String process(Function<T, String> contentToString, Trie<T> trie)
     {
-        this.contentToString = contentToString;
+        StringBuilder sb = new StringBuilder();
+        Trie.ResettingTransitionsReceiver receiver = new TransitionsDumper(sb);
+        Trie.Cursor<T> cursor = trie.cursor();
+        while (true)
+        {
+            T content = cursor.advanceToContent(receiver);
+            if (content == null)
+                return sb.toString();
+            sb.append(" -> ");
+            sb.append(contentToString.apply(content));
+            receiver.reset(cursor.level());
+        }
     }
 
-    public void onNodeEntry(int incomingTransition, T content)
+    private static class TransitionsDumper implements Trie.ResettingTransitionsReceiver
     {
-        if (!indented)
+        private final StringBuilder b;
+        boolean justReset = true;
+
+        public TransitionsDumper(StringBuilder b)
         {
-            for (int i = 0; i < depth; ++i)
-                b.append("  ");
-            indented = true;
+            this.b = b;
         }
 
-        ++depth;
-        if (incomingTransition != -1)
-            b.append(String.format("%02x", incomingTransition));
-
-        if (content != null)
+        @Override
+        public void reset(int newLength)
         {
-            // Only go to a new line once a payload is reached
-            indented = false;
-            b.append(" -> ");
-            b.append(contentToString.apply(content));
-            b.append('\n');
+            if (!justReset)
+            {
+                b.append('\n');
+                for (int i = 0; i < newLength; ++i)
+                    b.append("  ");
+                justReset = true;
+            }
         }
-    }
 
-    public void onNodeExit()
-    {
-        if (indented)
+        @Override
+        public void add(int incomingTransition)
         {
-            // We are backtracking without having printed content or meta. Although unexpected, this can legally happen
-            // (e.g. if an intersection has resulted in an empty node).
-            indented = false;
-            b.append('\n');
+            b.append(String.format("%02x", incomingTransition));
+            justReset = false;
         }
-        --depth;
-    }
 
-    public String completion()
-    {
-        return b.toString();
+        @Override
+        public void add(UnsafeBuffer buf, int pos, int count)
+        {
+            for (int i = 0; i < count; ++i)
+                b.append(String.format("%02x", buf.getByte(pos + i) & 0xFF));
+            justReset = false;
+        }
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieWalker.java b/src/java/org/apache/cassandra/db/tries/TrieWalker.java
deleted file mode 100644
index 37a9fb0c68c3..000000000000
--- a/src/java/org/apache/cassandra/db/tries/TrieWalker.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.cassandra.db.tries;
-
-/**
- * Utility class for performing some types of walks over the trie, where the result can be used as a
- * CompletableFuture.
- * See TrieDumper for sample usage.
- */
-public interface TrieWalker<T, V>
-{
-    /**
-     * Called when entering a node of the trie.
-     *
-     * @param incomingTransition the transition that led here, -1 if this is the root.
-     */
-    void onNodeEntry(int incomingTransition, T content);
-
-    /**
-     * Called when leaving a node of the trie, that is after having exited its last children.
-     */
-    void onNodeExit();
-
-    /**
-     * The final value of the trie walk.
-     * <p>
-     * This is called on completion of the walk (after calling {@link #onNodeExit} on the root node) to obtain the
-     * final outcome of the walk.
-     * <p>
-     * Note: the type parameter L must be equal to {@code Trie.Node<T, L>}. There is no way to specify such recursive
-     * types in Java, but it does get inferred correctly in calls to this method.
-     *
-     * @return the final outcome of the walk.
-     */
-    V completion();
-
-    public static <T, V> V process(TrieWalker<T, V> walker, Trie<T> trie)
-    {
-        Trie.Cursor<T> cursor = trie.cursor();
-        if (cursor.level() == -1)
-            return walker.completion();
-
-        walker.onNodeEntry(-1, cursor.content());
-
-        int prevLevel = 0;
-        int level = cursor.advance();
-        while (level != -1)
-        {
-            while (prevLevel >= level)
-            {
-                walker.onNodeExit();
-                --prevLevel;
-                assert prevLevel >= 0;
-            }
-            walker.onNodeEntry(cursor.incomingTransition(), cursor.content());
-            prevLevel = level;
-            level = cursor.advance();
-        }
-        return walker.completion();
-    }
-}

From 0bea6e327d6e11008414e6e577255e4d094322f4 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Tue, 10 Aug 2021 01:31:18 +0300
Subject: [PATCH 142/151] Typed cursor dump

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 94 +++++++++++++++++--
 .../apache/cassandra/db/tries/TrieDumper.java | 23 +++--
 2 files changed, 104 insertions(+), 13 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index f8e71f976db1..cce5233d1979 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -1086,6 +1086,25 @@ public int advanceMultiple(TransitionsReceiver receiver)
             }
         }
 
+        int advanceChainPath(TransitionsReceiver receiver)
+        {
+            int node = currentNode;
+            if (!isChainNode(node))
+                return advance();
+
+            UnsafeBuffer buffer = getBuffer(node);
+            int ofs = getOffset(node);
+            int pointer = chainBlockChildPointer(ofs);
+            int child = buffer.getInt(pointer);
+            int length = pointer - ofs;
+            --length;   // leave the last byte for incomingTransition
+            if (receiver != null && length > 0)
+                receiver.add(buffer, ofs, length);
+            level += length;
+
+            return descendInto(child, buffer.getByte(pointer - 1) & 0xFF);
+        }
+
         public int level()
         {
             return level;
@@ -1148,11 +1167,74 @@ public boolean isEmpty()
     @Override
     public String dump(Function<T, String> contentToString)
     {
-        StringBuilder b = new StringBuilder();
-        if (!isNull(root))
-            root().dump(0, b, contentToString);
-        else
-            b.append("empty");
-        return b.toString();
+        MemtableCursor source = cursor();
+        class TypedNodesCursor implements Cursor<String>
+        {
+            @Override
+            public int advance()
+            {
+                return source.advance();
+            }
+
+
+            @Override
+            public int advanceMultiple(TransitionsReceiver receiver)
+            {
+                return source.advanceChainPath(receiver);
+            }
+
+            @Override
+            public int ascend()
+            {
+                return source.ascend();
+            }
+
+            @Override
+            public int level()
+            {
+                return source.level();
+            }
+
+            @Override
+            public int incomingTransition()
+            {
+                return source.incomingTransition();
+            }
+
+            @Override
+            public String content()
+            {
+                String type = null;
+                int node = source.currentNode;
+                if (!isNullOrLeaf(node))
+                {
+                    switch (offset(node))
+                    {
+                        case SPARSE_OFFSET:
+                            type = "[SPARSE]";
+                            break;
+                        case SPLIT_OFFSET:
+                            type = "[SPLIT]";
+                            break;
+                        case PREFIX_OFFSET:
+                            throw new AssertionError("Unexpected prefix as cursor currentNode.");
+                        default:
+                            type = "[CHAIN]";
+                            break;
+                    }
+                }
+                T content = source.content();
+                if (content != null)
+                {
+                    if (type != null)
+                        return contentToString.apply(content) + " -> " + type;
+                    else
+                        return contentToString.apply(content);
+                }
+                else
+                    return type;
+            }
+        }
+        return TrieDumper.dump(Object::toString, new TypedNodesCursor());
     }
 }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
index 01cd63b2bb0c..3ed1c3a2851a 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieDumper.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
@@ -27,10 +27,14 @@
 class TrieDumper<T>
 {
     public static <T> String process(Function<T, String> contentToString, Trie<T> trie)
+    {
+        return dump(contentToString, trie.cursor());
+    }
+
+    static <T> String dump(Function<T, String> contentToString, Trie.Cursor<T> cursor)
     {
         StringBuilder sb = new StringBuilder();
         Trie.ResettingTransitionsReceiver receiver = new TransitionsDumper(sb);
-        Trie.Cursor<T> cursor = trie.cursor();
         while (true)
         {
             T content = cursor.advanceToContent(receiver);
@@ -45,7 +49,7 @@ public static <T> String process(Function<T, String> contentToString, Trie<T> tr
     private static class TransitionsDumper implements Trie.ResettingTransitionsReceiver
     {
         private final StringBuilder b;
-        boolean justReset = true;
+        int needsIndent = -1;
 
         public TransitionsDumper(StringBuilder b)
         {
@@ -55,28 +59,33 @@ public TransitionsDumper(StringBuilder b)
         @Override
         public void reset(int newLength)
         {
-            if (!justReset)
+            needsIndent = newLength;
+        }
+
+        private void maybeIndent()
+        {
+            if (needsIndent >= 0)
             {
                 b.append('\n');
-                for (int i = 0; i < newLength; ++i)
+                for (int i = 0; i < needsIndent; ++i)
                     b.append("  ");
-                justReset = true;
+                needsIndent = -1;
             }
         }
 
         @Override
         public void add(int incomingTransition)
         {
+            maybeIndent();
             b.append(String.format("%02x", incomingTransition));
-            justReset = false;
         }
 
         @Override
         public void add(UnsafeBuffer buf, int pos, int count)
         {
+            maybeIndent();
             for (int i = 0; i < count; ++i)
                 b.append(String.format("%02x", buf.getByte(pos + i) & 0xFF));
-            justReset = false;
         }
     }
 }

From 4f161ddc4ac68def1a33e76205659d5f2007527a Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Tue, 10 Aug 2021 03:40:23 +0300
Subject: [PATCH 143/151] Remove Node representation of Tries.

---
 .../db/tries/CollectionMergeTrie.java         | 254 ------------
 .../cassandra/db/tries/CursorFromNode.java    |  97 -----
 .../cassandra/db/tries/MemtableReadTrie.java  | 373 ------------------
 .../apache/cassandra/db/tries/MergeTrie.java  | 119 ------
 .../cassandra/db/tries/RangeTrieSet.java      | 112 ------
 .../db/tries/SetIntersectionTrie.java         | 100 -----
 .../cassandra/db/tries/SingletonTrie.java     |  84 ----
 .../org/apache/cassandra/db/tries/Trie.java   | 154 +-------
 .../apache/cassandra/db/tries/TrieSet.java    |  57 +--
 .../db/tries/MemtableTrieTestBase.java        | 126 +++---
 .../db/tries/SetIntersectionTrieTest.java     |  65 ++-
 11 files changed, 134 insertions(+), 1407 deletions(-)
 delete mode 100644 src/java/org/apache/cassandra/db/tries/CursorFromNode.java

diff --git a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
index 340457cc497d..31b65c24f72d 100644
--- a/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/CollectionMergeTrie.java
@@ -40,266 +40,12 @@ class CollectionMergeTrie<T> extends Trie<T>
         this.inputs = inputs;
     }
 
-    public <L> Node<T, L> root()
-    {
-        List<Node<T, L>> nodes = new ArrayList<>(inputs.size());
-        for (Trie<T> input : inputs)
-        {
-            Node<T, L> root = input.root();
-            if (root != null)
-                nodes.add(root);
-        }
-        return makeMerge(resolver, nodes);
-    }
-
     @Override
     protected Cursor<T> cursor()
     {
         return new CollectionMergeCursor<>(resolver, inputs);
     }
 
-    private static <T, L> Node<T, L> makeMerge(CollectionMergeResolver<T> resolver, List<Node<T, L>> nodes)
-    {
-        switch (nodes.size())
-        {
-        case 0:
-            return null;
-        case 1:
-            return nodes.get(0);
-        case 2:
-            return new MergeTrie.MergeNode<>(resolver, nodes.get(0), nodes.get(1));
-        default:
-            return new MergeNode<>(resolver, nodes);
-        }
-    }
-
-    static class MergeNode<T, L> extends Node<T, L>
-    {
-        private final CollectionMergeResolver<T> resolver;  // only called on more than one input
-        final List<Node<T, L>> nodes;
-        T content;
-        volatile boolean contentMerged = false;
-
-        MergeNode(CollectionMergeResolver<T> resolver, List<Node<T, L>> nodes)
-        {
-            // All children necessarily use the same parent link (given during getCurrentChild). Make that ours.
-            super(nodes.get(0).parentLink);
-            this.resolver = resolver;
-            this.nodes = nodes;
-        }
-
-        /*
-         * The merge node is effectively a merge iterator of children.
-         *
-         * The most straightforward way to implement merging of iterators is to use a {@code PriorityQueue},
-         * {@code poll} it to find the next item to consume, then {@code add} the iterator back after advancing.
-         * This is not very efficient as {@code poll} and {@code add} in all cases require at least
-         * {@code log(size)} comparisons and swaps (usually more than {@code 2*log(size)}) per consumed item, even
-         * if the input is suitable for fast iteration.
-         *
-         * The implementation below makes use of the fact that replacing the top element in a binary heap can be
-         * done much more efficiently than separately removing it and placing it back, especially in the cases where
-         * the top iterator is to be used again very soon (e.g. when there are large sections of the output where
-         * only a limited number of input iterators overlap, which is normally the case in many practically useful
-         * situations, e.g. levelled compaction).
-         *
-         * The implementation builds and maintains a binary heap of sources (stored in an array), where we do not
-         * add items after the initial construction. Instead we advance the smallest element (which is at the top
-         * of the heap) and push it down to find its place for its new transition character. Should this source
-         * be exhausted, we swap it with the last source in the heap and proceed by pushing that down in the
-         * heap.
-         *
-         * In the case where we have multiple sources with matching transition characters, the merging algorithm
-         * must be able to merge all equal values. To achieve this {@code getCurrentChild} walks the heap to
-         * find all equal items without advancing the sources, and separately {@code advanceIteration} advances
-         * all equal sources and restores the heap structure.
-         *
-         * The latter is done equivalently to the process of building the initial heap in {@code startIteration}
-         * using back-to-front heapification as done in the classic heapsort algorithm. It only needs to heapify
-         * subheaps whose top item is advanced (i.e. one whose transition character matches the current),
-         * and we can do that recursively from bottom to top. Should a source be exhausted when advancing, it can
-         * be thrown away by swapping in the last source in the heap (note: we must be careful to advance that
-         * source too if required).
-         *
-         * Note: This is a simplification of the MergeIterator code from CASSANDRA-8915, without the leading ordered
-         * section and equalParent flag since comparisons of transition characters are cheap.
-         */
-
-        public Remaining startIteration()
-        {
-            int count = nodes.size();
-            // Get every input's initial state and move nodes with no children at the end.
-            for (int i = 0; i < count; ++i)
-            {
-                Node<T, L> ni = nodes.get(i);
-                boolean sHas = ni.startIteration() != null;
-                if (!sHas)
-                {
-                    --count;
-                    // put last one at its place (will do nothing if count now equals i)
-                    nodes.set(i, nodes.get(count));
-                    nodes.remove(count);
-                    // make sure the moved input is processed
-                    --i;
-                }
-            }
-            // We now create a heap from the input states we got. This process has linear complexity in the number
-            // of input states (see heapsort algorithm).
-            while (--count >= 0)
-                heapifyDown(count);
-
-            if (nodes.isEmpty())
-                return null;
-            currentTransition = nodes.get(0).currentTransition;
-            return Remaining.MULTIPLE;
-        }
-
-        public Remaining advanceIteration()
-        {
-            int current = currentTransition;
-            advance(current, 0);
-
-            if (nodes.isEmpty())
-                return null;
-            currentTransition = nodes.get(0).currentTransition;
-            return Remaining.MULTIPLE;
-        }
-
-        public Node<T, L> getCurrentChild(L parent)
-        {
-            int current = currentTransition;
-            List<Node<T, L>> children = new ArrayList<>(nodes.size());
-            collectEqual(0, current, parent, children);
-            return makeMerge(resolver, children);
-        }
-
-        /**
-         * Gets the child for every input in the heap rooted at the given index that is at the given transition.
-         * Calls itself recursively and used by getCurrentChild with index = 0 and transition = state[0].transition
-         * to get the child for all inputs that are positioned at the current minimal transition.
-         */
-        void collectEqual(int index, int transition, L parent, List<Node<T, L>> list)
-        {
-            Node<T, L> child = nodes.get(index).getCurrentChild(parent);
-            if (child != null)
-                list.add(child);
-
-            // Check if any of the children in the heap are at the same transition.
-            // If so, collect children recursively.
-            int next = index * 2 + 1;
-            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
-                collectEqual(next, transition, parent, list);
-            ++next;
-            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
-                collectEqual(next, transition, parent, list);
-        }
-
-        /**
-         * Advance the state of the input at the given index and any of its descendants that are at the same
-         * transition byte and restore the heap invariant for the subtree rooted at the given index.
-         * Calls itself recursively and used by advanceState with index = 0 and transition = state[0].transition
-         * to advance the state of the merge.
-         */
-        private void advance(int transition, int index)
-        {
-            Node<T, L> n = nodes.get(index);
-            // Advance current node and remove it from active heap if it has no further children.
-            while (n.advanceIteration() == null)
-            {
-                // n has no further children, it needs to be removed from the active heap.
-                // Move the last to index'th position and continue processing with that node.
-                int nodeCount = nodes.size() - 1;
-                n = nodes.remove(nodeCount);
-                if (nodeCount == index)
-                    return; // done, n was at the end of the heap so the subheap to advance is now empty thus
-                            // the invariant is trivially true
-
-                nodes.set(index, n);
-                // The node we swapped in may also need advancing. If so, repeat the procedure above.
-                if (n.currentTransition > transition)
-                    break;
-            }
-
-            // If the children are at the same transition byte, they also need advancing and their subheap
-            // invariant to be restored.
-            int next = index * 2 + 1;
-            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
-                advance(transition, next);
-            ++next;
-            if (next < nodes.size() && nodes.get(next).currentTransition == transition)
-                advance(transition, next);
-
-            // At this point the heaps at both children are advanced and well-formed. Place current node in its
-            // proper position.
-            heapifyDown(index);
-            // The heap rooted at index is now advanced and well-formed.
-        }
-
-        /**
-         * Push the given state down in the heap from the given index until it finds its proper place among
-         * the subheap rooted at that position.
-         */
-        private void heapifyDown(int index)
-        {
-            Node<T, L> node = nodes.get(index);
-
-            int transition = node.currentTransition;
-            while (true)
-            {
-                int next = index * 2 + 1;
-                if (next >= nodes.size())
-                    break;
-                // Select the smaller of the two children to push down to.
-                if (next + 1 < nodes.size() && nodes.get(next).currentTransition > nodes.get(next + 1).currentTransition)
-                    ++next;
-                // If the child is greater, the invariant has been restored.
-                if (nodes.get(next).currentTransition >= transition)
-                    break;
-                nodes.set(index, nodes.get(next));
-                index = next;
-            }
-            nodes.set(index, node);
-        }
-
-        public T content()
-        {
-            if (!contentMerged)
-            {
-                // If we only have input from zero or one source, we will keep it here, avoiding the allocation
-                // of the list until necessary.
-                T v = null;
-                Collection<T> values = null;
-                for (Node<T, L> n : nodes)
-                {
-                    T c = n.content();
-                    if (c == null)
-                        continue;
-                    if (v == null)
-                    {
-                        v = c;  // one element
-                        continue;
-                    }
-                    if (values == null)
-                    {
-                        // more than one
-                        values = new ArrayList<>();
-                        values.add(v);
-                    }
-                    values.add(c);
-                }
-
-                if (values == null)
-                    content = v;
-                else
-                    content = resolver.resolve(values);
-
-                // Save content to avoid doing this costly operation again.
-                contentMerged = true;
-            }
-            return content;
-        }
-    }
-
     static <T> boolean greaterCursor(Cursor<T> c1, Cursor<T> c2)
     {
         int c1level = c1.level();
diff --git a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java b/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
deleted file mode 100644
index 54e0d520ef33..000000000000
--- a/src/java/org/apache/cassandra/db/tries/CursorFromNode.java
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.cassandra.db.tries;
-
-public class CursorFromNode<T> implements Trie.Cursor<T>
-{
-    static final Trie.Node<Object, Trie.Node> EMPTY_NODE = new Trie.NoChildrenNode<Object, Trie.Node>(null)
-    {
-        public Object content()
-        {
-            return null;
-        }
-    };
-
-    Trie.Node<T, Trie.Node> current;
-    int level;
-
-    CursorFromNode(Trie<T> trie)
-    {
-        current = trie.root();
-        if (current == null)
-            current = (Trie.Node<T, Trie.Node>) EMPTY_NODE;
-
-        level = 0;
-    }
-
-    public int advance()
-    {
-        return advance(current.startIteration());
-    }
-
-    private int advance(Trie.Remaining has)
-    {
-        Trie.Node<T, Trie.Node> child = null;
-        do
-        {
-            while (has == null)
-            {
-                current = current.parentLink;
-                --level;
-                if (current == null)
-                {
-                    assert level == -1;
-                    return level;
-                }
-                has = current.advanceIteration();
-            }
-
-            child = current.getCurrentChild(current);
-            if (child == null)
-                has = current.advanceIteration();
-        }
-        while (child == null);
-        current = child;
-        return ++level;
-    }
-
-    public int ascend()
-    {
-        --level;
-        current = current.parentLink;
-        return advance(current.advanceIteration());
-    }
-
-    public int level()
-    {
-        return level;
-    }
-
-    public T content()
-    {
-        return current.content();
-    }
-
-    public int incomingTransition()
-    {
-
-        Trie.Node<T, Trie.Node> parent = current.parentLink;
-        return parent != null ? parent.currentTransition : -1;
-    }
-}
diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index cce5233d1979..00d39fc44eca 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -307,30 +307,6 @@ private int chainBlockChildPointer(int node)
         return (node & -BLOCK_SIZE) | LAST_POINTER_OFFSET;
     }
 
-    /**
-     * Create a trie node for the given pointer
-     */
-    <L> BaseNode<L> makeNode(int node, L parent)
-    {
-        if (isNull(node))
-            return null;
-
-        if (isLeaf(node))
-            return new LeafNode<>(node, parent);
-
-        switch (offset(node))
-        {
-            case SPARSE_OFFSET:
-                return new SparseNode<>(node, parent);
-            case SPLIT_OFFSET:
-                return new SplitNode<>(node, parent);
-            case PREFIX_OFFSET:
-                return new PrefixNode<>(node, parent);
-            default:
-                return new ChainNode<>(node, parent);
-        }
-    }
-
     /**
      * Get a node's child for the given transition character
      */
@@ -496,354 +472,6 @@ T getNodeContent(int node)
                : null;
     }
 
-    /*
-     Trie.Node implementations
-     */
-
-    abstract class BaseNode<L> extends Node<T, L>
-    {
-        final int node;
-
-        BaseNode(int node, L parent)
-        {
-            super(parent);
-            this.node = node;
-        }
-
-        // MemtableTrie nodes don't throw and always return MemtableTrie nodes.
-        @Override
-        public abstract BaseNode<L> getCurrentChild(L parent);
-
-        @Override
-        public T content()
-        {
-            return null;
-        }
-
-        abstract void dump(int indent, StringBuilder b, Function<T, String> contentToString);
-    }
-
-    class SplitNode<L> extends BaseNode<L>
-    {
-        SplitNode(int node, L parent)
-        {
-            super(node, parent);
-            assert offset(node) == SPLIT_OFFSET;
-        }
-
-        @Override
-        public BaseNode<L> getCurrentChild(L parent)
-        {
-            int child = getChild(currentTransition);
-            return makeNode(child, parent);
-        }
-
-        int getChild(int idx)
-        {
-            return getSplitChild(node, idx);
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            return nextValid(0);
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            return nextValid(currentTransition + 1);
-        }
-
-        Remaining nextValid(int trans)
-        {
-            assert trans >= 0 && trans <= 0x100;
-            // Splits the 2-3-3 parts of the transition
-            int midIndex = splitNodeMidIndex(trans);
-            int tailIdx = splitNodeTailIndex(trans);
-            int childIdx = splitNodeChildIndex(trans);
-
-            while (midIndex < 4)
-            {
-                int mid = getInt(node + SPLIT_POINTER_OFFSET + midIndex * 4);
-                if (!isNull(mid))
-                {
-                    while (tailIdx < 8)
-                    {
-                        int tail = getInt(mid + tailIdx * 4);
-                        if (!isNull(tail))
-                        {
-                            while (childIdx < 8)
-                            {
-                                int child = getInt(tail + childIdx * 4);
-                                if (!isNull(child))
-                                {
-                                    currentTransition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
-                                    return Remaining.MULTIPLE;  // no need to be precise on the count
-                                }
-                                ++childIdx;
-                            }
-                        }
-                        childIdx = 0;
-                        ++tailIdx;
-                    }
-                }
-                tailIdx = 0;
-                ++midIndex;
-            }
-            return null;
-        }
-
-        @Override
-        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
-        {
-            indent++;
-            b.append(" -> [Split]\n");
-            for (int idx = 0; idx < 256; ++idx)
-            {
-                BaseNode<L> child = makeNode(getChild(idx), null);
-                if (child != null)
-                {
-                    for (int i = 0; i < indent; ++i)
-                        b.append("  ");
-                    b.append(String.format("%02x", idx));
-                    child.dump(indent, b, contentToString);
-                }
-            }
-        }
-    }
-
-    class SparseNode<L> extends BaseNode<L>
-    {
-        int iterationState;
-
-        SparseNode(int node, L parent)
-        {
-            super(node, parent);
-            assert offset(node) == SPARSE_OFFSET;
-        }
-
-        @Override
-        public BaseNode<L> getCurrentChild(L parent)
-        {
-            int child = getInt(node + SPARSE_CHILDREN_OFFSET + 4 * (iterationState % SPARSE_CHILD_COUNT));
-            return makeNode(child, parent);
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            iterationState = getShort(node + SPARSE_ORDER_OFFSET);
-            currentTransition = getByte(node + SPARSE_BYTES_OFFSET + iterationState % SPARSE_CHILD_COUNT);
-            return Remaining.MULTIPLE;
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            iterationState /= SPARSE_CHILD_COUNT;
-            // the last item is never in position 0
-            if (iterationState == 0)
-                return null;
-            currentTransition = getByte(node + SPARSE_BYTES_OFFSET + iterationState % SPARSE_CHILD_COUNT);
-            return iterationState >= SPARSE_CHILD_COUNT ? Remaining.MULTIPLE : Remaining.ONE;
-        }
-
-        @Override
-        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
-        {
-            indent++;
-            b.append(" -> [Sparse]\n");
-            for (int idx = 0; idx < SPARSE_CHILD_COUNT; ++idx)
-            {
-                BaseNode<L> child = makeNode(getInt(node + SPARSE_CHILDREN_OFFSET + idx * 4), null);
-                if (child != null)
-                {
-                    for (int i = 0; i < indent; ++i)
-                        b.append("  ");
-                    b.append(String.format("%02x", getByte(node + SPARSE_BYTES_OFFSET + idx)));
-                    child.dump(indent, b, contentToString);
-                }
-            }
-        }
-    }
-
-    class ChainNode<L> extends BaseNode<L>
-    {
-        // This node's pos points to the exact character of the next transition. The number of characters left is what
-        // needs to be added to that position to be one int away from the end of the node.
-        ChainNode(int node, L parent)
-        {
-            super(node, parent);
-            assert offset(node) >= CHAIN_MIN_OFFSET && offset(node) <= CHAIN_MAX_OFFSET;
-            currentTransition = getByte(node);
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            return Remaining.ONE;
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            return null;
-        }
-
-        @Override
-        public BaseNode<L> getCurrentChild(L parent)
-        {
-            if (offset(node + 1) == LAST_POINTER_OFFSET)
-                return makeNode(getInt(node + 1), parent);
-            return new ChainNode<>(node + 1, parent);
-        }
-
-        @Override
-        public BaseNode<L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
-        {
-            int child = node;
-            do
-            {
-                final int pointerPos = chainBlockChildPointer(child);
-                if (receiver != null)
-                    receiver.add(getBuffer(child), getOffset(child), pointerPos - child);
-                // jump directly to the child at the end of the chain
-                child = getInt(pointerPos);
-                // and continue jumping as long as the resulting node is a chain
-            }
-            while (child > 0 && offset(child) <= CHAIN_MAX_OFFSET);
-
-            return makeNode(child, parentLink);
-        }
-
-        @Override
-        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
-        {
-            b.append(" -> [Chain]\n");
-            for (int i = 0; i < indent + 1; ++i)
-                b.append("  ");
-            int limit = chainBlockChildPointer(node);
-            for (int p = node; p < limit; ++p)
-            {
-                indent++;
-                b.append(String.format("%02x", getByte(p)));
-            }
-            makeNode(getInt(limit), null).dump(indent, b, contentToString);
-        }
-    }
-
-    class PrefixNode<L> extends BaseNode<L>
-    {
-        /**
-         * The augmented node. Prefix nodes are not presented as separate nodes, but instead only add content to
-         * another type of node. To prevent having separate instances for prefix-augmented split/sparse/chain, we
-         * instantiate and wrap a node of that type and only change what content() and getUniqueDescendant() do.
-         */
-        final BaseNode<L> augmentedNode;
-
-        PrefixNode(int node, L parent)
-        {
-            super(node, parent);
-            assert offset(node) == PREFIX_OFFSET;
-            this.augmentedNode = makeNode(followContentTransition(node), parent);
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            Remaining result = augmentedNode.startIteration();
-            currentTransition = augmentedNode.currentTransition;
-            return result;
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            Remaining result = augmentedNode.advanceIteration();
-            currentTransition = augmentedNode.currentTransition;
-            return result;
-        }
-
-        @Override
-        public T content()
-        {
-            return getNodeContent(node);
-        }
-
-        @Override
-        public BaseNode<L> getCurrentChild(L parent)
-        {
-            return augmentedNode.getCurrentChild(parent);
-        }
-
-        // Note: we do not map getUniqueDescendant to the augmented node's method as we want consumers to pay
-        // attention to this node.
-
-        @Override
-        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
-        {
-            T content = content();
-            b.append(" -> ");
-            b.append(contentToString.apply(content));
-            b.append('\n');
-            for (int i = 0; i < indent + 1; ++i)
-                b.append("  ");
-            augmentedNode.dump(indent, b, contentToString);
-        }
-    }
-
-    class LeafNode<L> extends BaseNode<L>
-    {
-        LeafNode(int node, L parent)
-        {
-            super(node, parent);
-            assert node < NONE;
-        }
-
-        IllegalStateException error()
-        {
-            return new IllegalStateException("Node has no children.");
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            return null;
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            throw error();
-        }
-
-        @Override
-        public BaseNode<L> getCurrentChild(L parent)
-        {
-            throw error();
-        }
-
-        @Override
-        public T content()
-        {
-            return getContent(~node);
-        }
-
-        void dump(int indent, StringBuilder b, Function<T, String> contentToString)
-        {
-            b.append(" -> ");
-            b.append(contentToString.apply(content()));
-            b.append("\n");
-        }
-    }
-
-    public <L> BaseNode<L> root()
-    {
-        return makeNode(root, null);
-    }
-
-
     /*
      Cursor implementation
      */
@@ -1160,7 +788,6 @@ public boolean isEmpty()
         return isNull(root);
     }
 
-
     /**
      * Override of dump to provide more detailed printout that includes the type of each node in the trie.
      */
diff --git a/src/java/org/apache/cassandra/db/tries/MergeTrie.java b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
index 36a8fa7f905f..b8f7f512f670 100644
--- a/src/java/org/apache/cassandra/db/tries/MergeTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MergeTrie.java
@@ -41,131 +41,12 @@ class MergeTrie<T> extends Trie<T>
         this.t2 = t2;
     }
 
-    @Override
-    public <L> Node<T, L> root()
-    {
-        return makeNode(resolver, t1.root(), t2.root());
-    }
-
     @Override
     protected Cursor<T> cursor()
     {
         return new MergeCursor<>(resolver, t1, t2);
     }
 
-    private static <T, L> Node<T, L> makeNode(MergeResolver<T> resolver, Node<T, L> child1, Node<T, L> child2)
-    {
-        if (child1 != null && child2 != null)
-            return new MergeNode<>(resolver, child1, child2);
-
-        if (child1 != null)
-            return child1;
-
-        if (child2 != null)
-            return child2;
-
-        return null;
-    }
-
-    static class MergeNode<T, L> extends Node<T, L>
-    {
-        private final MergeResolver<T> resolver;
-        final Node<T, L> n1;
-        final Node<T, L> n2;
-        int b1;
-        int b2;
-
-        MergeNode(MergeResolver<T> resolver, Node<T, L> n1, Node<T, L> n2)
-        {
-            // Both children have the same parent link (passed during getCurrentChild). Use either as ours.
-            super(n1.parentLink);
-            assert n2.parentLink == n1.parentLink;
-            this.resolver = resolver;
-            this.n1 = n1;
-            this.n2 = n2;
-        }
-
-        private Remaining makeState(Remaining has1, Remaining has2)
-        {
-            Remaining result;
-            if (has1 != null)
-            {
-                b1 = n1.currentTransition;
-                result = Remaining.MULTIPLE;
-            }
-            else
-            {
-                b1 = NOT_PRESENT;
-                result = has2;
-            }
-            currentTransition = b1;
-            if (has2 != null)
-            {
-                b2 = n2.currentTransition;
-                if (b2 < b1)
-                    currentTransition = b2;
-                else if (b1 == b2 && has1 == Remaining.ONE && has2 == Remaining.ONE)
-                    result = Remaining.ONE;
-            }
-            else
-            {
-                b2 = NOT_PRESENT;
-                result = has1;
-            }
-            return result;
-        }
-
-        public Remaining startIteration()
-        {
-            return makeState(n1.startIteration(), n2.startIteration());
-        }
-
-        public Remaining advanceIteration()
-        {
-            int prevb1 = b1;
-            int prevb2 = b2;
-            // We must advance the state of the source with the smaller transition byte.
-            // If their transition bytes are equal, we advance both.
-            if (prevb1 <= prevb2)
-            {
-                boolean has = n1.advanceIteration() != null;
-                b1 = has ? n1.currentTransition : NOT_PRESENT;
-            }
-            if (prevb1 >= prevb2)
-            {
-                boolean has = n2.advanceIteration() != null;
-                b2 = has ? n2.currentTransition : NOT_PRESENT;
-            }
-            currentTransition = Math.min(b1, b2);
-            return b1 < NOT_PRESENT || b2 < NOT_PRESENT ? Remaining.MULTIPLE : null;
-        }
-
-        public Node<T, L> getCurrentChild(L parent)
-        {
-            Node<T, L> child1 = null;
-            Node<T, L> child2 = null;
-
-            if (b1 <= b2)
-                child1 = n1.getCurrentChild(parent);
-            if (b1 >= b2)
-                child2 = n2.getCurrentChild(parent);
-
-            return makeNode(resolver, child1, child2);
-        }
-
-        public T content()
-        {
-            T mc = n2.content();
-            T nc = n1.content();
-            if (mc == null)
-                return nc;
-            else if (nc == null)
-                return mc;
-            else
-                return resolver.resolve(nc, mc);
-        }
-    }
-
     static class MergeCursor<T> implements Cursor<T>
     {
         private final MergeResolver<T> resolver;
diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index 229476bab295..f24d663a38e8 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -280,116 +280,4 @@ public InSet content()
 
 
     // TODO: Change to start/stop sets when nodes are taken out of the picture
-
-    public <L> Node<InSet, L> root()
-    {
-        return makeNode(left == null ? null : left.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
-                        left != null,
-                        right == null ? null : right.asComparableBytes(Trie.BYTE_COMPARABLE_VERSION),
-                        right != null,
-                        null);
-    }
-
-    private <L> Node<InSet, L> makeNode(ByteSource lLimit, boolean atLLimit, ByteSource rLimit, boolean atRLimit, L parentLink)
-    {
-        // We only have a constraint on the branch if we are at one or both boundaries.
-        // If the node falls completely between them, the whole branch (at any depth) is in the set.
-        if (!atLLimit && !atRLimit)
-            return parentLink == null ? (Node<InSet, L>) FULL : new FullNode<>(parentLink);
-
-        return new RangeNode<>(lLimit, atLLimit, rLimit, atRLimit, parentLink);
-    }
-
-    class FullNode<L> extends NoChildrenNode<InSet, L>
-    {
-        FullNode(L parent)
-        {
-            super(parent);
-        }
-
-        public InSet content()
-        {
-            return InSet.BRANCH;
-        }
-    }
-
-    class RangeNode<L> extends Node<InSet, L>
-    {
-        /** Byte at the left boundary, inclusive. */
-        final int llimit;
-        final ByteSource remainingLLimit;
-        /** Byte at the right boundary, inclusive. */
-        final int rlimit;
-        final ByteSource remainingRLimit;
-        /** Whether or not we are descending along the left boundary. */
-        final boolean atLLimit;
-        /** Whether or not we are descending along the right boundary. */
-        final boolean atRLimit;
-
-        /** Whether the current path is in the covered set. */
-        final InSet inSet;
-
-        RangeNode(ByteSource remainingLLimit, boolean atLLimit, ByteSource remainingRLimit, boolean atRLimit, L parentLink)
-        {
-            super(parentLink);
-            int llimit = 0;
-            boolean inSet = true;
-            if (atLLimit)
-            {
-                llimit = remainingLLimit.next();
-                if (llimit == ByteSource.END_OF_STREAM)
-                {
-                    atLLimit = false;
-                    llimit = 0;
-                    inSet &= includeLeft; // The current path matches left boundary
-                }
-                else
-                    inSet = false;  // The current path is a prefix of the left boundary, ie. smaller.
-            }
-            int rlimit = 255;
-            if (atRLimit)
-            {
-                rlimit = remainingRLimit.next();
-                if (rlimit == ByteSource.END_OF_STREAM)
-                {
-                    atRLimit = false;
-                    rlimit = -1;    // no op, added for clarity. Node should have no children.
-                    inSet &= includeRight; // The current path matches right boundary
-                }
-            }
-            assert llimit <= rlimit || rlimit == -1 : "Bound " + left + " not <= " + right + " in range " + llimit + " vs " + rlimit;
-
-            this.llimit = llimit;
-            this.remainingLLimit = remainingLLimit;
-            this.rlimit = rlimit;
-            this.remainingRLimit = remainingRLimit;
-            this.atLLimit = atLLimit;
-            this.atRLimit = atRLimit;
-            this.inSet = inSet ? InSet.CONTAINED : InSet.PREFIX;
-        }
-
-        public Node<InSet, L> getCurrentChild(L parentLink)
-        {
-            return makeNode(remainingLLimit, atLLimit && (currentTransition == llimit),
-                            remainingRLimit, atRLimit && (currentTransition == rlimit),
-                            parentLink);
-        }
-
-        public Remaining startIteration()
-        {
-            currentTransition = llimit;
-            return currentTransition <= rlimit ? Remaining.MULTIPLE : null;
-        }
-
-        public Remaining advanceIteration()
-        {
-            return ++currentTransition <= rlimit ? Remaining.MULTIPLE : null;
-        }
-
-        public InSet content()
-        {
-            return inSet;
-        }
-    }
-
 }
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 563dde37f496..6291a7d3e37c 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -31,106 +31,6 @@ class SetIntersectionTrie<T> extends Trie<T>
         this.intersectingSet = intersectingSet;
     }
 
-    @Override
-    public <L> Node<T, L> root()
-    {
-        Node<TrieSet.InSet, Void> sRoot = intersectingSet.root();
-        if (sRoot == null)
-            return null;
-
-        Node<T, L> tRoot = trie.root();
-        if (sRoot.content() == TrieSet.InSet.BRANCH)
-            return tRoot;
-        if (tRoot == null)
-            return null;
-
-        return new IntersectionNode<>(tRoot, sRoot);
-    }
-
-    static class IntersectionNode<T, L> extends Node<T, L>
-    {
-        final Node<T, L> tNode;
-        final Node<TrieSet.InSet, Void> sNode;
-
-        public IntersectionNode(Node<T, L> tNode, Node<TrieSet.InSet, Void> sNode)
-        {
-            super(tNode.parentLink);
-            this.tNode = tNode;
-            this.sNode = sNode;
-        }
-
-        public Remaining startIteration()
-        {
-            Remaining sHas = sNode.startIteration();
-            if (sHas == null)
-                return null;
-
-            return advanceToIntersection(tNode.startIteration());
-        }
-
-        public Remaining advanceIteration()
-        {
-            Remaining sHas = sNode.advanceIteration();
-            if (sHas == null)
-                return null;
-            return advanceToIntersection(tNode.advanceIteration());
-        }
-
-        public Remaining advanceToIntersection(Remaining tHas)
-        {
-            Remaining sHas;
-            if (tHas == null)
-                return null;
-            int sByte = sNode.currentTransition;
-            int tByte = tNode.currentTransition;
-
-            while (tByte != sByte)
-            {
-                if (tByte < sByte)
-                {
-                    tHas = tNode.advanceIteration();
-                    if (tHas == null)
-                        return null;
-                    tByte = tNode.currentTransition;
-                }
-                else // (tByte > sByte)
-                {
-                    sHas = sNode.advanceIteration();
-                    if (sHas == null)
-                        return null;
-                    sByte = sNode.currentTransition;
-                }
-            }
-            currentTransition = sByte;
-            return tHas;    // ONE or MULTIPLE
-        }
-
-        public Node<T, L> getCurrentChild(L parent)
-        {
-            Node<TrieSet.InSet, Void> receivedSetNode = sNode.getCurrentChild(null);
-
-            if (receivedSetNode == null)
-                return null;    // branch is completely outside the set
-
-            Node<T, L> nn = tNode.getCurrentChild(parent);
-
-            if (nn == null)
-                return null;
-
-            if (receivedSetNode.content() == TrieSet.InSet.BRANCH)
-                return nn;     // Branch is fully covered, we no longer need to augment nodes there.
-
-            return new IntersectionNode<>(nn, receivedSetNode);
-        }
-
-        public T content()
-        {
-            if (sNode.content().pointIncluded())
-                return tNode.content();
-            return null;
-        }
-    }
-
     protected Cursor<T> cursor()
     {
         return new IntersectionCursor(trie.cursor(), intersectingSet.cursor());
diff --git a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
index e49a6fa7a39d..7445c030820b 100644
--- a/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SingletonTrie.java
@@ -35,90 +35,6 @@ class SingletonTrie<T> extends Trie<T>
         this.value = value;
     }
 
-    private class ENode<L> extends NoChildrenNode<T, L>
-    {
-        ENode(L parent)
-        {
-            super(parent);
-        }
-
-        @Override
-        public T content()
-        {
-            return value;
-        }
-    }
-
-    private class SNode<L> extends Node<T, L>
-    {
-        private final ByteSource source;
-        boolean requested = false;
-
-        SNode(int trans, L parent, ByteSource source)
-        {
-            super(parent);
-            this.currentTransition = trans;
-            this.source = source;
-        }
-
-        @Override
-        public Node<T, L> getCurrentChild(L parent)
-        {
-            // Requesting more than once will screw up the iteration of source.
-            assert !requested : "getCurrentChild can only be called once for a given transition.";
-            requested = true;
-            return makeNode(parent, source);
-        }
-
-        @Override
-        public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
-        {
-            if (receiver != null)
-            {
-                receiver.add(currentTransition);
-                int next;
-                while ((next = source.next()) != ByteSource.END_OF_STREAM)
-                {
-                    receiver.add(next);
-                }
-            }
-
-            return new ENode<>(parentLink);
-        }
-
-        @Override
-        public Remaining startIteration()
-        {
-            return Remaining.ONE;
-        }
-
-        @Override
-        public Remaining advanceIteration()
-        {
-            return null;
-        }
-
-        @Override
-        public T content()
-        {
-            return null;
-        }
-    }
-
-    private <L> Node<T, L> makeNode(L parent, ByteSource source)
-    {
-        int next = source.next();
-        if (next == ByteSource.END_OF_STREAM)
-            return new ENode<>(parent);
-        else
-            return new SNode<>(next, parent, source);
-    }
-
-    public <L> Node<T, L> root()
-    {
-        return makeNode(null, key.asComparableBytes(BYTE_COMPARABLE_VERSION));
-    }
-
     public Cursor cursor()
     {
         return new Cursor();
diff --git a/src/java/org/apache/cassandra/db/tries/Trie.java b/src/java/org/apache/cassandra/db/tries/Trie.java
index e953d61873e8..c9f64c60dec0 100644
--- a/src/java/org/apache/cassandra/db/tries/Trie.java
+++ b/src/java/org/apache/cassandra/db/tries/Trie.java
@@ -90,7 +90,7 @@ protected enum Remaining
     }
 
     /**
-     * Used by {@link Node#getUniqueDescendant} to feed the transitions taken.
+     * Used by {@link Cursor#advanceMultiple} to feed the transitions taken.
      */
     protected interface TransitionsReceiver
     {
@@ -100,113 +100,15 @@ protected interface TransitionsReceiver
         void add(UnsafeBuffer b, int pos, int count);
     }
 
+    /**
+     * Used by {@link Cursor#advanceToContent} to track the transitions and backtracking taken.
+     */
     interface ResettingTransitionsReceiver extends TransitionsReceiver
     {
         /** Delete all bytes beyond the given length. */
         void reset(int newLength);
     }
 
-    /**
-     * A trie node. Provides methods for listing the transition bytes and children of the node, as well as its content.
-     * Once a node is made available, all its methods, except the ones retrieving children, must proceed without
-     * blocking or throwing exceptions.
-     *
-     * To enable efficient traversals the node effectively stores a call stack, a back link to the state that
-     * was used to obtain the node. This data is used to resume walks along the items in a trie.
-     *
-     * A node is a stateful non-thread-safe object. It is okay to access it from different threads, provided such
-     * accesses are not concurrent, i.e. there is a happens-before relationship between calling each of a node's
-     * methods.
-     */
-    protected abstract static class Node<T, L>
-    {
-        /**
-         * Parent state, as set when {@link #getCurrentChild} or {@link #getUniqueDescendant} is called, or
-         * {@code null} if this is a root node.
-         * Often a node (which also holds its iteration state), but it does not need to be. Users/subscribers of the
-         * trie interface can choose what this link needs to contain, e.g. a merge node with a list of source nodes
-         * or a pair of a parent node with a byte array containing the key that leads to it.
-         */
-        public final L parentLink;
-
-        /** Current transition byte, set after each call to {@link #startIteration} and {@link #advanceIteration}. */
-        protected int currentTransition = -1;
-
-        protected Node(L parentLink)
-        {
-            this.parentLink = parentLink;
-        }
-
-        /**
-         * Sets up the node for forward iteration, positions it on the first child and sets {@link #currentTransition}.
-         * Note: It is expected that the node will be traversed only once, more precisely that no consumer will ask
-         * twice for the same child. Some implementations (e.g. singleton, subtrie) may fail if this is violated.
-         *
-         * @return null if the node has no children, otherwise {@link Remaining#MULTIPLE} or {@link Remaining#ONE} (if
-         * it knows this is the only transition).
-         */
-        public abstract Remaining startIteration();
-
-        /**
-         * Advances the node state to the next transition of the node and sets {@link #currentTransition}.
-         * <p>
-         * This can only be called after an iteration has been started by {@link #startIteration}.
-         *
-         * @return null if the node has no more children, otherwise {@link Remaining#MULTIPLE} or {@link Remaining#ONE}
-         * (if it knows this is the last transition).
-         *
-         * @throws IllegalStateException if no iteration has been started (with {@link #startIteration}), or if the
-         * preceding call to {@link #startIteration} or this method returned {@code null}. (Note: Implementations
-         * should permit this to be called after {@link Remaining#ONE}, which is redundant but easier to work with.)
-         */
-        public abstract Remaining advanceIteration();
-
-        /**
-         * Gets the child of this node corresponding to the current transition and with the given parent link.
-         * The current transition must have been set using {@link #startIteration} or {@link #advanceIteration},
-         * and it's an error to call this after either has returned {@code null}. This should only be
-         * called once for a given transition/child.
-         *
-         * The method may return null if the child turns out to not be present (e.g. in a dense node where it could be
-         * better to leave the check for the request call, or if a concurrent write has prepared the transition but not
-         * yet made it active by writing the child).
-         *
-         * @param parentLink the parent state to use to set {@link Node#parentLink} in the node provided as result to
-         * this request.
-         * @return the child corresponding to the current transition or null if the child does not exist
-         * (even though {@link #startIteration}/{@link #advanceIteration} thought it did).
-         */
-        public abstract Node<T, L> getCurrentChild(L parentLink);
-
-        /**
-         * If the node has exactly one child and no content, go to that child and continue descending while this is
-         * the case.
-         * This is done so that iteration over the content of the trie does not need to remember the parts of the path
-         * that are not branching points and thus don't need to be revisited while backtracking up the trie.
-         * Overridden by chain nodes (MemtableTrie.ChainNode); see TrieValuesIterator for usage.
-         * The receiver argument can be null if the caller does not need a record of the transitions taken.
-         */
-        public Node<T, L> getUniqueDescendant(L parentLink, TransitionsReceiver receiver)
-        {
-            return this;
-        }
-
-        /**
-         * The content of this node, if any.
-         *
-         * @return the content of this node, or {@code null} if it has no attached content.
-         */
-        public abstract T content();
-    }
-
-    /**
-     * Returns an instantiation of the root node with null parent link.
-     * This is the only method that needs to be implemented in children.
-     *
-     * @param <L> The type of parent link that will be used in the traversal.
-     */
-    protected abstract <L> Node<T, L> root();
-
     // Cursor-style walks
     interface Cursor<T>
     {
@@ -252,63 +154,22 @@ default T advanceToContent(ResettingTransitionsReceiver receiver) // advances al
             }
         }
 
-//        int advanceTo(int transition); // advance to child with this transition or higher. if none exists, ascend to parent and advance
-
         /**
          * ignore the remaining children at this level or below and ascend to parent and advance
          */
         int ascend(); // ignore the remaining children at this level or below and ascend to parent and advance
 
-        //        default int ascend()
-//        {
-//            return advanceTo(Integer.MAX_VALUE);
-//        }
-
         int level(); // return current state; if just starting / on root, return 0
         int incomingTransition(); // return the last transition taken; if just starting / on root, return -1
         T content(); // return content -- may be non-null on root
 
     }
 
-    protected Cursor<T> cursor()
-    {
-        return new CursorFromNode<>(this);
-    }
+    protected abstract Cursor<T> cursor();
 
     // Version of the byte comparable conversion to use for all operations
     static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS41;
 
-    /**
-     * Base helper class to write node having no childen.
-     */
-    protected abstract static class NoChildrenNode<T, L> extends Node<T, L>
-    {
-        NoChildrenNode(L parent)
-        {
-            super(parent);
-        }
-
-        public IllegalStateException error()
-        {
-            return new IllegalStateException("Node has no children.");
-        }
-
-        public Remaining startIteration()
-        {
-            return null;
-        }
-
-        public Remaining advanceIteration()
-        {
-            throw error();
-        }
-
-        public Node<T, L> getCurrentChild(L parent)
-        {
-            throw error();
-        }
-    }
-
     public String dump()
     {
         return dump(Object::toString);
@@ -508,11 +369,6 @@ public static <T> Trie<T> mergeDistinct(Collection<? extends Trie<T>> sources)
 
     private static final Trie<Object> EMPTY = new Trie<Object>()
     {
-        public <L> Node<Object, L> root()
-        {
-            return null;
-        }
-
         protected Cursor<Object> cursor()
         {
             return new Cursor<Object>()
diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java
index c978b6d731ea..65e6bb73581d 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java
@@ -48,27 +48,45 @@ boolean branchCovered()
         }
     }
 
-    protected static final Node<InSet, Object> FULL = new NoChildrenNode<InSet, Object>(null)
-    {
-        public InSet content()
-        {
-            return InSet.BRANCH;
-        }
-    };
-
     private static final TrieSet FULL_SET = new TrieSet()
     {
-        public <L> Node<InSet, L> root()
+        @Override
+        protected Cursor<InSet> cursor()
         {
-            return (Node<InSet, L>) FULL;
-        }
-    };
+            return new Cursor<InSet>()
+            {
+                int level = 0;
 
-    private static final TrieSet EMPTY_SET = new TrieSet()
-    {
-        public <L> Node<InSet, L> root()
-        {
-            return null;
+                @Override
+                public int advance()
+                {
+                    return level = -1;
+                }
+
+                @Override
+                public int ascend()
+                {
+                    return level = -1;
+                }
+
+                @Override
+                public int level()
+                {
+                    return level;
+                }
+
+                @Override
+                public int incomingTransition()
+                {
+                    return -1;
+                }
+
+                @Override
+                public InSet content()
+                {
+                    return InSet.BRANCH;
+                }
+            };
         }
     };
 
@@ -88,9 +106,4 @@ public static TrieSet full()
     {
         return FULL_SET;
     }
-
-    public static TrieSet empty()
-    {
-        return EMPTY_SET;
-    }
 }
diff --git a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
index fa2ca8ef0b42..2612df657a8d 100644
--- a/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
+++ b/test/unit/org/apache/cassandra/db/tries/MemtableTrieTestBase.java
@@ -136,90 +136,108 @@ public void testUpdateContent()
         }
     }
 
-    static class SpecifiedChildrenNode<L> extends Trie.Node<ByteBuffer, L>
+    static class SpecLevel
     {
-        final Object[] children;
+        Object[] children;
+        int curChild;
+        Object content;
+        SpecLevel parent;
 
-        SpecifiedChildrenNode(L parent, Object[] children)
+        public SpecLevel(Object[] spec, Object content, SpecLevel parent)
         {
-            super(parent);
-            this.children = children;
+            this.children = spec;
+            this.content = content;
+            this.parent = parent;
+            this.curChild = -1;
         }
+    }
+
+    public static class CursorFromSpec implements Trie.Cursor<ByteBuffer>
+    {
+        SpecLevel stack;
+        int level;
 
-        public Trie.Remaining startIteration()
+        CursorFromSpec(Object[] spec)
         {
-            currentTransition = 0x30;
-            return remaining();
+            stack = new SpecLevel(spec, null, null);
+            level = 0;
         }
 
-        private Trie.Remaining remaining()
+        public int advance()
         {
-            final int left = children.length - (currentTransition - 0x30);
-            return left > 1
-                   ? Trie.Remaining.MULTIPLE
-                   : left == 1
-                     ? Trie.Remaining.ONE
-                     : null;
+            SpecLevel current = stack;
+            while (current != null && ++current.curChild >= current.children.length)
+            {
+                current = current.parent;
+                --level;
+            }
+            if (current == null)
+            {
+                assert level == -1;
+                return level;
+            }
+
+            Object child = current.children[current.curChild];
+            if (child instanceof Object[])
+                stack = new SpecLevel((Object[]) child, null, current);
+            else
+                stack = new SpecLevel(new Object[0], child, current);
+
+            return ++level;
         }
 
-        public Trie.Remaining advanceIteration()
+        public int advanceMultiple()
         {
-            ++currentTransition;
-            return remaining();
+            if (++stack.curChild >= stack.children.length)
+                return ascend();
+
+            Object child = stack.children[stack.curChild];
+            while (child instanceof Object[])
+            {
+                stack = new SpecLevel((Object[]) child, null, stack);
+                ++level;
+                if (stack.children.length == 0)
+                    return level;
+                stack.curChild = 0; // record the descent, so incomingTransition() and the next advance() see child 0 as taken
+                child = stack.children[0];
+            }
+            stack = new SpecLevel(new Object[0], child, stack);
+
+            return ++level;
         }
 
-        public Trie.Node<ByteBuffer, L> getCurrentChild(L parentLink)
+        public int ascend()
         {
-            return makeSpecifiedChildrenNode(parentLink, children[currentTransition - 0x30]);
+            --level;
+            stack = stack.parent;
+            return advance();
         }
 
-        public Trie.Node<ByteBuffer, L> getUniqueDescendant(L parentLink, Trie.TransitionsReceiver receiver)
+        public int level()
         {
-            if (children.length != 1)
-                return this;
-
-            if (receiver != null)
-                receiver.add(0x30);
-
-            Object child;
-            for (child = children[0];
-                 child instanceof Object[] && ((Object[]) child).length == 1;
-                 child = ((Object[]) child)[0])
-                if (receiver != null)
-                    receiver.add(0x30);
-
-            return makeSpecifiedChildrenNode(parentLink, child);
+            return level;
         }
 
         public ByteBuffer content()
         {
-            return null;
+            return (ByteBuffer) stack.content;
         }
-    }
 
-    static <L> Trie.Node<ByteBuffer, L> makeSpecifiedChildrenNode(L parent, Object nodeDef)
-    {
-        if (nodeDef == null)
-            return null;
-        else if (nodeDef instanceof Object[])
-            return new SpecifiedChildrenNode<>(parent, (Object[]) nodeDef);
-        else
-            return new Trie.NoChildrenNode<ByteBuffer, L>(parent)
-            {
-                public ByteBuffer content()
-                {
-                    return (ByteBuffer) nodeDef;
-                }
-            };
+        public int incomingTransition()
+        {
+            SpecLevel parent = stack.parent;
+            return parent != null ? parent.curChild + 0x30 : -1;
+        }
     }
 
-    static Trie<ByteBuffer> specifiedTrie(Object nodeDef)
+    static Trie<ByteBuffer> specifiedTrie(Object[] nodeDef)
     {
         return new Trie<ByteBuffer>()
         {
-            protected <L> Node<ByteBuffer, L> root()
+            @Override
+            protected Cursor<ByteBuffer> cursor()
             {
-                return makeSpecifiedChildrenNode(null, nodeDef);
+                return new CursorFromSpec(nodeDef);
             }
         };
     }
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
index 55cfb7909dae..fbc1ab1d7ec4 100644
--- a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -121,73 +121,52 @@ private static Trie<Integer> singleLevelIntTrie(int childs)
     {
         return new Trie<Integer>()
         {
-            protected <L> Node<Integer, L> root()
+            @Override
+            protected Cursor<Integer> cursor()
             {
-                return new RootNode<>();
+                return new SingleLevelCursor();
             }
 
-            /** Root node of the trie: has {@code childs} transition, each leading to a {@link LeafNode} whose content
-             * is the value of the transition. */
-            class RootNode<L> extends Node<Integer, L>
+            class SingleLevelCursor implements Cursor<Integer>
             {
-                RootNode()
-                {
-                    super(null);
-                    currentTransition = 0;
-                }
-
-                public Remaining startIteration()
-                {
-                    currentTransition = 0;
-                    return childs == 0 ? null : (childs == 1 ? Remaining.ONE : Remaining.MULTIPLE);
-                }
+                int current = -1;
 
-                public Remaining advanceIteration()
+                @Override
+                public int advance()
                 {
-                    return ++currentTransition >= childs ? null : Remaining.MULTIPLE;
+                    ++current;
+                    return level();
                 }
 
-                public Node<Integer, L> getCurrentChild(L parent)
+                @Override
+                public int ascend()
                 {
-                    return new LeafNode<>(parent, currentTransition);
+                    return advance();
                 }
 
-                public Integer content()
+                @Override
+                public int level()
                 {
+                    if (current == -1)
+                        return 0;
+                    if (current < childs)
+                        return 1;
                     return -1;
                 }
 
                 @Override
-                public String toString()
+                public int incomingTransition()
                 {
-                    return String.format("ROOT(t=%s, parent=%s)", currentTransition, parentLink);
-                }
-            }
-
-            /** Leaf nodes: no children but a content corresponding ot the transition leading to them */
-            class LeafNode<L> extends NoChildrenNode<Integer, L>
-            {
-                private final int value;
-
-                LeafNode(L parent, int value)
-                {
-                    super(parent);
-                    this.value = value;
-                }
-
-                public Integer content()
-                {
-                    return value;
+                    return current;
                 }
 
                 @Override
-                public String toString()
+                public Integer content()
                 {
-                    return String.format("LEAF(%d, parent=%s)", value, parentLink);
+                    return current;
                 }
             }
         };
-
     }
 
     /** Creates a single byte {@link ByteComparable} with the provide value */

From aa615c8381d26f460e370b4178e338b1dca52722 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 11 Aug 2021 09:44:26 +0300
Subject: [PATCH 144/151] Fix problems in SetIntersectionTrie and RangeTrieSet

---
 src/java/org/apache/cassandra/db/tries/RangeTrieSet.java      | 3 +--
 .../org/apache/cassandra/db/tries/SetIntersectionTrie.java    | 4 ++--
 .../apache/cassandra/db/tries/SetIntersectionTrieTest.java    | 3 ++-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index f24d663a38e8..cb6a71e148d8 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -76,6 +76,7 @@ private RangeCursor()
             backlog = new int[32];
             backlogPos = 0;
             level = 0;
+            transitionAtRightLevel = -1;
             atLeftLimit = left != null;
             if (atLeftLimit)
             {
@@ -87,13 +88,11 @@ private RangeCursor()
                     atLeftLimit = false;
                     if (includeLeft)
                         inSet = InSet.CONTAINED;
-                    transitionAtRightLevel = -1;
                 }
             }
             else
             {
                 inSet = InSet.CONTAINED;
-                transitionAtRightLevel = -1;
             }
 
             atRightLimit = right != null;
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 6291a7d3e37c..20d10a61f0ac 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -107,9 +107,9 @@ private int advanceToIntersection(int tLevel, int sLevel)
                     if (sIncoming == tIncoming)
                         return tLevel;  // got entry
                     else if (sIncoming < tIncoming)
-                        sLevel = sCursor.advance();
+                        sLevel = sCursor.ascend();
                     else // sIncoming > tIncoming
-                        tLevel = tCursor.advance();
+                        tLevel = tCursor.ascend();
                 }
                 else if (sLevel < tLevel)
                 {
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
index fbc1ab1d7ec4..e7ab6e52e9c2 100644
--- a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -28,6 +28,7 @@
 import org.junit.Test;
 
 import com.googlecode.concurrenttrees.common.Iterables;
+import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.bytecomparable.ByteComparable;
 
 import static org.apache.cassandra.db.tries.MemtableTrieTestBase.asString;
@@ -86,7 +87,7 @@ public void checkEqualRange(NavigableMap<ByteComparable, ByteBuffer> content1,
                                 ByteComparable r,
                                 boolean includeRight)
     {
-        System.out.format("Intersection with %s%s:%s%s\n", includeLeft ? "[" : "(", asString(l), asString(r), includeRight ? "]" : ")");
+        System.out.println(String.format("Intersection with %s%s:%s%s", includeLeft ? "[" : "(", asString(l), asString(r), includeRight ? "]" : ")"));
         SortedMap<ByteComparable, ByteBuffer> imap = l == null
                                                      ? r == null
                                                        ? content1

From a171b5f6dc4d75577f460537acf1adfd5386b8b6 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 11 Aug 2021 19:02:09 +0300
Subject: [PATCH 145/151] Handle inclusiveness with nextKey, add RangeTrieTest

---
 .../cassandra/db/tries/RangeTrieSet.java      |  89 ++++----
 .../apache/cassandra/db/tries/TrieSet.java    |   2 +-
 .../utils/bytecomparable/ByteSource.java      |  26 +++
 .../cassandra/db/tries/RangeTrieTest.java     | 190 ++++++++++++++++++
 .../db/tries/SetIntersectionTrieTest.java     |   4 +-
 5 files changed, 262 insertions(+), 49 deletions(-)
 create mode 100644 test/unit/org/apache/cassandra/db/tries/RangeTrieTest.java

diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index cb6a71e148d8..93323fc2eff6 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -77,53 +77,43 @@ private RangeCursor()
             backlogPos = 0;
             level = 0;
             transitionAtRightLevel = -1;
-            atLeftLimit = left != null;
-            if (atLeftLimit)
+            if (left != null)
             {
                 remainingLeftLimit = left.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                if (!includeLeft)
+                    remainingLeftLimit = ByteSource.nextKey(remainingLeftLimit);
                 leftLimitNext = remainingLeftLimit.next();
-                inSet = InSet.PREFIX;
-                if (leftLimitNext == ByteSource.END_OF_STREAM)
-                {
-                    atLeftLimit = false;
-                    if (includeLeft)
-                        inSet = InSet.CONTAINED;
-                }
+                atLeftLimit = leftLimitNext != ByteSource.END_OF_STREAM;
             }
             else
-            {
-                inSet = InSet.CONTAINED;
-            }
+                atLeftLimit = false;
+
+            inSet = atLeftLimit ? InSet.PREFIX : InSet.CONTAINED;
 
             atRightLimit = right != null;
             if (atRightLimit)
             {
                 remainingRightLimit = right.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                if (includeRight)
+                    remainingRightLimit = ByteSource.nextKey(remainingRightLimit);
                 rightLimitNext = remainingRightLimit.next();
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
                 {
-                    rightLimitDone = true;
-                    assert !atLeftLimit;
-                    atRightLimit = false;
-                    if (!includeRight)
-                    {
-                        level = -1;
-                        inSet = InSet.PREFIX;
-                        return;
-                    }
+                    level = -1;
+                    inSet = InSet.PREFIX;
+                    return;
                 }
-                else
-                    rightLimitDone = false;
+                rightLimitDone = false;
             }
             else
             {
                 // else we exhaust the backlog at level -1 and terminate before any continueAlongRight is called
-                rightLimitNext = 255;
+                rightLimitNext = 256;
                 rightLimitDone = true;
             }
 
             incomingTransition = -1;
-            if (!atLeftLimit && !atRightLimit && rightLimitNext >= 0 && inSet == InSet.CONTAINED)
+            if (!atLeftLimit && !atRightLimit)
                 inSet = InSet.BRANCH;
         }
 
@@ -146,33 +136,42 @@ public int advance()
 
         private int descendAlongBoth()
         {
-            assert rightLimitNext >= leftLimitNext;
-            int next = leftLimitNext;
-            leftLimitNext = remainingLeftLimit.next();
-            if (rightLimitNext == next)
-                rightLimitNext = remainingRightLimit.next();
-            else
+            if (rightLimitNext > leftLimitNext)
             {
-                transitionAtRightLevel = next + 1;
+                transitionAtRightLevel = leftLimitNext + 1;
                 atRightLimit = false;
+                int next = leftLimitNext;
+                leftLimitNext = remainingLeftLimit.next();
+
+                incomingTransition = next;
+                if (leftLimitNext != ByteSource.END_OF_STREAM)
+                {
+                    inSet = InSet.PREFIX;
+                }
+                else
+                {
+                    atLeftLimit = false;
+                    inSet = InSet.BRANCH;
+                }
+                return ++level;
             }
+            assert rightLimitNext == leftLimitNext;
 
-            incomingTransition = next;
+            incomingTransition = leftLimitNext;
+            rightLimitNext = remainingRightLimit.next();
+            leftLimitNext = remainingLeftLimit.next();
             if (leftLimitNext != ByteSource.END_OF_STREAM)
             {
                 inSet = InSet.PREFIX;
+                assert rightLimitNext != ByteSource.END_OF_STREAM;
             }
             else
             {
                 atLeftLimit = false;
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
-                {
-                    if (includeLeft && includeRight)
-                        inSet = InSet.CONTAINED;
-                    else
-                        return -1;
-                }
-                inSet = includeLeft ? InSet.BRANCH : InSet.PREFIX;//: InSet.BRANCH_EXCLUDING;
+                    return -1;
+
+                inSet = InSet.CONTAINED;
             }
             return ++level;
         }
@@ -191,7 +190,7 @@ private int descendAlongLeft()
             else
             {
                 atLeftLimit = false;
-                inSet = includeLeft ? InSet.BRANCH : InSet.PREFIX;//: InSet.BRANCH_EXCLUDING;
+                inSet = InSet.BRANCH;
             }
             return ++level;
         }
@@ -226,18 +225,14 @@ private int continueAlongRight()
                 inSet = InSet.BRANCH;
                 return level;
             }
-            else // (incomingTransition == rightLimitNext)
+            else
             {
                 if (rightLimitDone)
                     return -1;
 
                 rightLimitNext = remainingRightLimit.next();
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
-                {
-                    rightLimitDone = true;
-                    if (!includeRight)
-                        return -1;
-                }
+                    return -1;
                 transitionAtRightLevel = -1;
                 inSet = InSet.CONTAINED;
                 return level;
diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java
index 65e6bb73581d..3dc8fc4503de 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java
@@ -33,7 +33,7 @@ public abstract class TrieSet extends Trie<TrieSet.InSet>
 {
     enum InSet
     {
-        PREFIX, // this is a prefix node, and the specific point is not conained in the set (e.g. points on the left range path)
+        PREFIX, // this is a prefix node, and the specific point is not contained in the set (e.g. points on the left range path)
         CONTAINED, // this is a prefix node, and the point is contained in the set (e.g. points on the right range path)
         BRANCH; // the whole branch is contained in the set (e.g. interior nodes for a range)
 
diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
index ea4f1d053903..b2cf92475f2b 100644
--- a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
+++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java
@@ -669,6 +669,32 @@ public int next()
         };
     }
 
+    /**
+     * Returns the key that is immediately after src in the ordering: the same bytes plus one trailing 00 byte.
+     * @param src the source to extend; it is read lazily, one byte per call, by the returned source
+     * @return src with added 00 byte at the end
+     */
+    public static ByteSource nextKey(ByteSource src)
+    {
+        return new ByteSource()
+        {
+            boolean done = false; // becomes true once the trailing 00 byte has been handed out
+
+            @Override
+            public int next()
+            {
+                if (done)
+                    return END_OF_STREAM; // src is not consulted again after the extra byte
+                int n = src.next();
+                if (n != END_OF_STREAM)
+                    return n; // still inside the original key
+
+                done = true;
+                return 0; // src is exhausted: emit the extra 00 byte exactly once
+            }
+        };
+    }
+
     public class Peekable implements ByteSource
     {
         static final int NONE = Integer.MIN_VALUE;
diff --git a/test/unit/org/apache/cassandra/db/tries/RangeTrieTest.java b/test/unit/org/apache/cassandra/db/tries/RangeTrieTest.java
new file mode 100644
index 000000000000..ed534de03f63
--- /dev/null
+++ b/test/unit/org/apache/cassandra/db/tries/RangeTrieTest.java
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.db.tries;
+
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+
+import com.google.common.collect.Iterables;
+import org.junit.Assert;
+import org.junit.Test;
+
+import org.apache.cassandra.utils.bytecomparable.ByteComparable;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+public class RangeTrieTest // checks RangeTrieSet membership against direct ByteComparable comparisons
+{
+    @Test
+    public void testSpecified() // prints one sample range, then runs the fixed key/boundary matrix below
+    {
+        ByteComparable l = ByteComparable.fixedLength(new byte[] {0});
+        boolean includeLeft = false;
+        ByteComparable r = ByteComparable.fixedLength(new byte[] {0, 0, 0});
+        boolean includeRight = true;
+
+        TrieSet set = new RangeTrieSet(l, includeLeft, r, includeRight); // only dumped below; nothing is asserted on this sample
+        System.out.println(String.format("Range %s%s,%s%s",
+                                         includeLeft ? "[" : "(",
+                                         l != null ? l.byteComparableAsString(ByteComparable.Version.OSS41) : null,
+                                         r != null ? r.byteComparableAsString(ByteComparable.Version.OSS41) : null,
+                                         includeRight ? "]" : ")"
+        ));
+        System.out.println(set.dump());
+
+
+//        testSpecifiedRanges(new String[]{
+//                            "\000\000"
+//                            },
+//                            new String[]{
+//                            "\000",
+////                            "test12",
+//                            });
+        testSpecifiedRanges(new String[]{ // first array: keys to look up; second array: candidate range bounds
+                            "test1",
+                            "test2",
+                            "test55",
+                            "test123",
+                            "test124",
+                            "test12",
+                            "test21",
+                            "tease",
+                            "sort",
+                            "sorting",
+                            "square",
+                            "\777\000", // NOTE(review): Java octal escapes stop at \377, so "\777" is '?' followed by '7', not a 0xFF byte -- confirm intent
+                            "\000\777",
+                            "\000\000",
+                            "\000\000\000",
+                            "\000\000\777",
+                            "\777\777"
+                            },
+                            new String[]{
+                            "test1",
+                            "test11",
+                            "test12",
+                            "test13",
+                            "test2",
+                            "test21",
+                            "te",
+                            "s",
+                            "q",
+                            "\000",
+                            "\777",
+                            "\777\000",
+                            "\000\777",
+                            "\000\000",
+                            "\000\000\000",
+                            "\000\000\777",
+                            "\777\777"
+                            });
+    }
+
+    public void testSpecifiedRanges(String[] keys, String[] boundaries) // String convenience overload
+    {
+        testSpecifiedRanges(toByteComparable(keys),
+                            toByteComparable(boundaries));
+    }
+
+    private ByteComparable[] toByteComparable(String[] keys) // each string becomes a fixed-length ByteComparable over its UTF-8 bytes
+    {
+        return Arrays.stream(keys)
+                     .map(x -> ByteComparable.fixedLength(x.getBytes(StandardCharsets.UTF_8)))
+                     .toArray(ByteComparable[]::new);
+    }
+
+    public void testSpecifiedRanges(ByteComparable[] keys, ByteComparable[] boundaries) // tries every ordered pair of bounds against every key
+    {
+        Arrays.sort(boundaries, (a, b) -> ByteComparable.compare(a, b, ByteComparable.Version.OSS41)); // sorted in place so li <= ri implies l <= r
+        for (int li = -1; li < boundaries.length; ++li) // li == -1 stands for "no left bound"
+        {
+            ByteComparable l = li < 0 ? null : boundaries[li];
+            for (int ri = Math.max(0, li); ri <= boundaries.length; ++ri) // ri == boundaries.length stands for "no right bound"
+            {
+                ByteComparable r = ri == boundaries.length ? null : boundaries[ri];
+
+                for (int i = li == ri ? 3 : 0; i < 4; ++i) // bit 0 = includeLeft, bit 1 = includeRight; equal bounds only try the fully inclusive case
+                {
+                    boolean includeLeft = (i & 1) != 0;
+                    boolean includeRight = (i & 2) != 0;
+                    TrieSet set = new RangeTrieSet(l, includeLeft, r, includeRight);
+
+                    verifySetProperties(set);
+
+                    for (ByteComparable key : keys)
+                    {
+                        int cmp1 = l != null ? ByteComparable.compare(key, l, ByteComparable.Version.OSS41) : 1; // a missing bound counts as strictly inside
+                        int cmp2 = r != null ? ByteComparable.compare(r, key, ByteComparable.Version.OSS41) : 1;
+                        Trie<Boolean> ix = new SetIntersectionTrie<Boolean>(Trie.singleton(key, true),
+                                                                            set);
+                        boolean expected = true;
+                        if (cmp1 < 0 || cmp1 == 0 && !includeLeft) // key below the left bound, or on an excluded left bound
+                            expected = false;
+                        if (cmp2 < 0 || cmp2 == 0 && !includeRight) // key above the right bound, or on an excluded right bound
+                            expected = false;
+                        boolean actual = Iterables.getFirst(ix.values(), false); // no value in the intersection means the key was filtered out
+                        if (expected != actual)
+                        {
+                            System.err.println("Range trie");
+                            System.err.println(set.dump());
+                            System.err.println("Intersection");
+                            System.err.println(ix.dump());
+                            Assert.fail(String.format("Failed on range %s%s,%s%s key %s expected %s got %s\n",
+                                                      includeLeft ? "[" : "(",
+                                                      l != null ? l.byteComparableAsString(ByteComparable.Version.OSS41) : null,
+                                                      r != null ? r.byteComparableAsString(ByteComparable.Version.OSS41) : null,
+                                                      includeRight ? "]" : ")",
+                                                      key.byteComparableAsString(ByteComparable.Version.OSS41),
+                                                      expected,
+                                                      actual));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    private void verifySetProperties(TrieSet set) // walks the set's cursor and checks that BRANCH nodes are never descended into
+    {
+        try
+        {
+            Trie.Cursor<TrieSet.InSet> cursor = set.cursor();
+            int level = cursor.advance();
+            while (level > 0) // keep walking while advance() reports a positive level
+            {
+                assertEquals(level, cursor.level()); // advance() result and level() must agree
+                TrieSet.InSet inSet = cursor.content();
+                int prevLevel = level;
+                level = cursor.advance();
+//                if (inSet == null)
+//                    assertTrue("non-included nodes presented by a set must have children", level > prevLevel);
+//                else
+                if (inSet == TrieSet.InSet.BRANCH)
+                    assertFalse("fully included branches presented by a set must not have children", level > prevLevel);
+            }
+        }
+        catch (AssertionError e)
+        {
+            System.err.println(set.dump()); // print the offending set before rethrowing
+            throw e;
+        }
+    }
+}
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
index e7ab6e52e9c2..51cfbc8d4a0f 100644
--- a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -305,7 +305,9 @@ public void testSimpleEmptyIntersectionRight()
         intersection = trie.subtrie(ByteComparable.EMPTY, false, ByteComparable.EMPTY, true);
         assertEquals(asList(), toList(intersection));
 
-        intersection = trie.subtrie(ByteComparable.EMPTY, false, ByteComparable.EMPTY, false);
+        intersection = trie.subtrie(ByteComparable.EMPTY, true, ByteComparable.EMPTY, false);
         assertEquals(asList(), toList(intersection));
+
+        // (empty, empty) is an invalid call as the "(empty" is greater than "empty)"
     }
 }

From 310a505b55254d495f090b42993373e79066f13f Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 11 Aug 2021 21:08:13 +0300
Subject: [PATCH 146/151] Clean-up

---
 .../cassandra/db/tries/RangeTrieSet.java      | 56 +++++++------------
 .../apache/cassandra/db/tries/TrieDumper.java | 10 ++--
 2 files changed, 25 insertions(+), 41 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index 93323fc2eff6..76a9ab4be3f9 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -51,10 +51,10 @@ public class RangeTrieSet extends TrieSet
 
     protected Cursor<InSet> cursor()
     {
-        return new RangeCursor();
+        return new RangeCursor(this);
     }
 
-    private class RangeCursor implements Cursor<InSet>
+    private static class RangeCursor implements Cursor<InSet>
     {
         private int[] backlog;
         int backlogPos;
@@ -62,7 +62,6 @@ private class RangeCursor implements Cursor<InSet>
         private ByteSource remainingRightLimit;
         boolean atLeftLimit;
         boolean atRightLimit;
-        boolean rightLimitDone;
         int leftLimitNext;
         int rightLimitNext;
         int transitionAtRightLevel;
@@ -71,16 +70,16 @@ private class RangeCursor implements Cursor<InSet>
         InSet inSet;
 
 
-        private RangeCursor()
+        private RangeCursor(RangeTrieSet set)
         {
             backlog = new int[32];
             backlogPos = 0;
             level = 0;
             transitionAtRightLevel = -1;
-            if (left != null)
+            if (set.left != null)
             {
-                remainingLeftLimit = left.asComparableBytes(BYTE_COMPARABLE_VERSION);
-                if (!includeLeft)
+                remainingLeftLimit = set.left.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                if (!set.includeLeft)
                     remainingLeftLimit = ByteSource.nextKey(remainingLeftLimit);
                 leftLimitNext = remainingLeftLimit.next();
                 atLeftLimit = leftLimitNext != ByteSource.END_OF_STREAM;
@@ -88,13 +87,11 @@ private RangeCursor()
             else
                 atLeftLimit = false;
 
-            inSet = atLeftLimit ? InSet.PREFIX : InSet.CONTAINED;
-
-            atRightLimit = right != null;
+            atRightLimit = set.right != null;
             if (atRightLimit)
             {
-                remainingRightLimit = right.asComparableBytes(BYTE_COMPARABLE_VERSION);
-                if (includeRight)
+                remainingRightLimit = set.right.asComparableBytes(BYTE_COMPARABLE_VERSION);
+                if (set.includeRight)
                     remainingRightLimit = ByteSource.nextKey(remainingRightLimit);
                 rightLimitNext = remainingRightLimit.next();
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
@@ -103,18 +100,14 @@ private RangeCursor()
                     inSet = InSet.PREFIX;
                     return;
                 }
-                rightLimitDone = false;
             }
             else
-            {
-                // else we exhaust the backlog at level -1 and terminate before any continueAlongRight is called
                 rightLimitNext = 256;
-                rightLimitDone = true;
-            }
 
             incomingTransition = -1;
-            if (!atLeftLimit && !atRightLimit)
-                inSet = InSet.BRANCH;
+            inSet = atLeftLimit ? InSet.PREFIX
+                                : atRightLimit ? InSet.CONTAINED
+                                               : InSet.BRANCH;
         }
 
 
@@ -125,7 +118,10 @@ public int advance()
                 if (atRightLimit)
                     return descendAlongBoth();
                 else
+                {
+                    addBacklog(leftLimitNext + 1);
                     return descendAlongLeft();
+                }
             }
 
             if (processBacklog())
@@ -138,25 +134,12 @@ private int descendAlongBoth()
         {
             if (rightLimitNext > leftLimitNext)
             {
-                transitionAtRightLevel = leftLimitNext + 1;
                 atRightLimit = false;
-                int next = leftLimitNext;
-                leftLimitNext = remainingLeftLimit.next();
-
-                incomingTransition = next;
-                if (leftLimitNext != ByteSource.END_OF_STREAM)
-                {
-                    inSet = InSet.PREFIX;
-                }
-                else
-                {
-                    atLeftLimit = false;
-                    inSet = InSet.BRANCH;
-                }
-                return ++level;
+                transitionAtRightLevel = leftLimitNext + 1;
+                return descendAlongLeft();
             }
-            assert rightLimitNext == leftLimitNext;
 
+            assert rightLimitNext == leftLimitNext;
             incomingTransition = leftLimitNext;
             rightLimitNext = remainingRightLimit.next();
             leftLimitNext = remainingLeftLimit.next();
@@ -180,7 +163,6 @@ private int descendAlongLeft()
         {
             int next = leftLimitNext;
             leftLimitNext = remainingLeftLimit.next();
-            addBacklog(next + 1);
 
             incomingTransition = next;
             if (leftLimitNext != ByteSource.END_OF_STREAM)
@@ -227,7 +209,7 @@ private int continueAlongRight()
             }
             else
             {
-                if (rightLimitDone)
+                if (incomingTransition >= 256)  // the no-right-limit case
                     return -1;
 
                 rightLimitNext = remainingRightLimit.next();
diff --git a/src/java/org/apache/cassandra/db/tries/TrieDumper.java b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
index 3ed1c3a2851a..d1f06a1c951d 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieDumper.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieDumper.java
@@ -35,15 +35,17 @@ static <T> String dump(Function<T, String> contentToString, Trie.Cursor<T> curso
     {
         StringBuilder sb = new StringBuilder();
         Trie.ResettingTransitionsReceiver receiver = new TransitionsDumper(sb);
-        while (true)
+        T content = cursor.content();
+        if (content == null)
+            content = cursor.advanceToContent(receiver);
+        while (content != null)
         {
-            T content = cursor.advanceToContent(receiver);
-            if (content == null)
-                return sb.toString();
             sb.append(" -> ");
             sb.append(contentToString.apply(content));
             receiver.reset(cursor.level());
+            content = cursor.advanceToContent(receiver);
         }
+        return sb.toString();
     }
 
     private static class TransitionsDumper implements Trie.ResettingTransitionsReceiver

From 1c675a3beae4a60cb48fe04bf39d446e25de167f Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Wed, 28 Apr 2021 11:33:29 +0300
Subject: [PATCH 147/151] Further cleanup

---
 .../cassandra/db/tries/RangeTrieSet.java      |  14 +--
 .../db/tries/SetIntersectionTrie.java         |  10 +-
 .../apache/cassandra/db/tries/TrieSet.java    |  14 +--
 .../db/tries/SetIntersectionTrieTest.java     | 106 +++++++++++++++---
 4 files changed, 104 insertions(+), 40 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
index 76a9ab4be3f9..166779c64d0d 100644
--- a/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/RangeTrieSet.java
@@ -97,7 +97,7 @@ private RangeCursor(RangeTrieSet set)
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
                 {
                     level = -1;
-                    inSet = InSet.PREFIX;
+                    inSet = null;
                     return;
                 }
             }
@@ -105,8 +105,8 @@ private RangeCursor(RangeTrieSet set)
                 rightLimitNext = 256;
 
             incomingTransition = -1;
-            inSet = atLeftLimit ? InSet.PREFIX
-                                : atRightLimit ? InSet.CONTAINED
+            inSet = atLeftLimit ? null
+                                : atRightLimit ? InSet.INCLUDED
                                                : InSet.BRANCH;
         }
 
@@ -145,7 +145,7 @@ private int descendAlongBoth()
             leftLimitNext = remainingLeftLimit.next();
             if (leftLimitNext != ByteSource.END_OF_STREAM)
             {
-                inSet = InSet.PREFIX;
+                inSet = null;
                 assert rightLimitNext != ByteSource.END_OF_STREAM;
             }
             else
@@ -154,7 +154,7 @@ private int descendAlongBoth()
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
                     return -1;
 
-                inSet = InSet.CONTAINED;
+                inSet = InSet.INCLUDED;
             }
             return ++level;
         }
@@ -167,7 +167,7 @@ private int descendAlongLeft()
             incomingTransition = next;
             if (leftLimitNext != ByteSource.END_OF_STREAM)
             {
-                inSet = InSet.PREFIX;
+                inSet = null;
             }
             else
             {
@@ -216,7 +216,7 @@ private int continueAlongRight()
                 if (rightLimitNext == ByteSource.END_OF_STREAM)
                     return -1;
                 transitionAtRightLevel = -1;
-                inSet = InSet.CONTAINED;
+                inSet = InSet.INCLUDED;
                 return level;
             }
         }
diff --git a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
index 20d10a61f0ac..ab10e41a516c 100644
--- a/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/SetIntersectionTrie.java
@@ -52,7 +52,7 @@ public IntersectionCursor(Cursor<T> tCursor,
         public int advance()
         {
             int tLevel = tCursor.advance();
-            if (sCursor.content().branchCovered())
+            if (sCursor.content() == TrieSet.InSet.BRANCH)
             {
                 if (tLevel > sCursor.level())
                     return tLevel;
@@ -67,7 +67,7 @@ public int advance()
         public int advanceMultiple(TransitionsReceiver transitionsReceiver)
         {
             int tLevel;
-            if (sCursor.content().branchCovered())
+            if (sCursor.content() == TrieSet.InSet.BRANCH)
             {
                 tLevel = tCursor.advanceMultiple(transitionsReceiver);
                 if (tLevel > sCursor.level())
@@ -85,7 +85,7 @@ public int advanceMultiple(TransitionsReceiver transitionsReceiver)
         public int ascend() // this is not tested ATM
         {
             int tLevel = tCursor.ascend();
-            if (sCursor.content().branchCovered())
+            if (sCursor.content() == TrieSet.InSet.BRANCH)
             {
                 if (tLevel > sCursor.level())
                     return tLevel;
@@ -113,7 +113,7 @@ else if (sIncoming < tIncoming)
                 }
                 else if (sLevel < tLevel)
                 {
-                    if (sCursor.content().branchCovered())
+                    if (sCursor.content() == TrieSet.InSet.BRANCH)
                         return tLevel;
                     tLevel = tCursor.ascend();
                 }
@@ -137,7 +137,7 @@ public int incomingTransition()
 
         public T content()
         {
-            return sCursor.content().pointIncluded()
+            return sCursor.content() != null
                    ? tCursor.content()
                    : null;
         }
diff --git a/src/java/org/apache/cassandra/db/tries/TrieSet.java b/src/java/org/apache/cassandra/db/tries/TrieSet.java
index 3dc8fc4503de..2a7fce45c138 100644
--- a/src/java/org/apache/cassandra/db/tries/TrieSet.java
+++ b/src/java/org/apache/cassandra/db/tries/TrieSet.java
@@ -33,19 +33,9 @@ public abstract class TrieSet extends Trie<TrieSet.InSet>
 {
     enum InSet
     {
-        PREFIX, // this is a prefix node, and the specific point is not contained in the set (e.g. points on the left range path)
-        CONTAINED, // this is a prefix node, and the point is contained in the set (e.g. points on the right range path)
+        // null content value specifies that the specific point is not contained in the set (e.g. points on the left range path)
+        INCLUDED, // this point is contained in the set (e.g. points on the right range path)
         BRANCH; // the whole branch is contained in the set (e.g. interior nodes for a range)
-
-        boolean pointIncluded()
-        {
-            return this != PREFIX;
-        }
-
-        boolean branchCovered()
-        {
-            return this == BRANCH;
-        }
     }
 
     private static final TrieSet FULL_SET = new TrieSet()
diff --git a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
index 51cfbc8d4a0f..f5a34e0a4932 100644
--- a/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
+++ b/test/unit/org/apache/cassandra/db/tries/SetIntersectionTrieTest.java
@@ -19,6 +19,9 @@
 package org.apache.cassandra.db.tries;
 
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Comparator;
 import java.util.List;
 import java.util.NavigableMap;
 import java.util.Random;
@@ -41,7 +44,7 @@
 
 public class SetIntersectionTrieTest
 {
-    private static final int COUNT = 15000;
+    private static final int COUNT = 1000;
     Random rand = new Random();
 
     @Test
@@ -58,25 +61,96 @@ public void testIntersectRange(int count)
         MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(src1, content1, true);
 
         checkEqualRange(content1, trie1, null, true, null, true);
-        checkEqualRange(content1, trie1, MemtableTrieTestBase.generateKey(rand), true, null, true);
-        checkEqualRange(content1, trie1, null, true, MemtableTrieTestBase.generateKey(rand), true);
-        for (int i = 0; i < 4; ++i)
+        for (int loop = 0; loop < 100; ++loop)
         {
-            ByteComparable l = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
-            ByteComparable r = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
-            int cmp = ByteComparable.compare(l, r, Trie.BYTE_COMPARABLE_VERSION);
-            if (cmp > 0)
+            checkEqualRange(content1, trie1, MemtableTrieTestBase.generateKey(rand), true, null, true);
+            checkEqualRange(content1, trie1, null, true, MemtableTrieTestBase.generateKey(rand), true);
+            for (int i = 0; i < 4; ++i)
             {
-                ByteComparable t = l;l = r;r = t; // swap
+                ByteComparable l = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
+                ByteComparable r = rand.nextBoolean() ? MemtableTrieTestBase.generateKey(rand) : src1[rand.nextInt(src1.length)];
+                int cmp = ByteComparable.compare(l, r, Trie.BYTE_COMPARABLE_VERSION);
+                if (cmp > 0)
+                {
+                    ByteComparable t = l;
+                    l = r;
+                    r = t; // swap
+                }
+
+                boolean includeLeft = (i & 1) != 0 || cmp == 0;
+                boolean includeRight = (i & 2) != 0 || cmp == 0;
+                checkEqualRange(content1, trie1, l, includeLeft, r, includeRight);
+                checkEqualRange(content1, trie1, null, includeLeft, r, includeRight);
+                checkEqualRange(content1, trie1, l, includeLeft, null, includeRight);
             }
+        }
+    }
+
+    @Test
+    public void testSpecified() // runs testSpecifiedIntersections on a small hand-picked key and boundary set
+    {
+        testSpecifiedIntersections(new String[]{ // first array: trie keys; second array: candidate range bounds
+                                       "test1",
+                                       "test2",
+                                       "test55",
+                                       "test123",
+                                       "test124",
+                                       "test12",
+                                       "test21",
+                                       "tease",
+                                       "sort",
+                                       "sorting",
+                                       "square"
+                                   },
+                                   new String[]{
+                                       "test1",
+                                       "test11",
+                                       "test12",
+                                       "test13",
+                                       "test2",
+                                       "test21",
+                                       "te",
+                                       "s",
+                                       "q",
+                                       "" // the empty key is used as a bound too
+                                   });
+    }
+
+    public void testSpecifiedIntersections(String[] keys, String[] boundaries) // String convenience overload
+    {
+        testSpecifiedIntersections(toByteComparable(keys),
+                                   toByteComparable(boundaries));
+    }
+
+    private ByteComparable[] toByteComparable(String[] keys) // each string becomes a fixed-length ByteComparable over its UTF-8 bytes
+    {
+        return Arrays.stream(keys)
+                     .map(x -> ByteComparable.fixedLength(x.getBytes(StandardCharsets.UTF_8)))
+                     .toArray(ByteComparable[]::new);
+    }
+
+    public void testSpecifiedIntersections(ByteComparable[] keys, ByteComparable[] boundaries) // checks every ordered pair of bounds via checkEqualRange
+    {
+        Comparator<ByteComparable> byteComparableComparator = (bytes1, bytes2) -> ByteComparable.compare(bytes1, bytes2, Trie.BYTE_COMPARABLE_VERSION);
+        Arrays.sort(boundaries, byteComparableComparator); // sorted in place so li <= ri implies l <= r
+        NavigableMap<ByteComparable, ByteBuffer> content1 = new TreeMap<>(byteComparableComparator);
+        MemtableTrie<ByteBuffer> trie1 = makeMemtableTrie(keys, content1, true);
 
-            boolean includeLeft = (i & 1) != 0;
-            boolean includeRight = (i & 2) != 0;
-            if (!includeLeft && !includeRight && cmp == 0)
-                includeRight = true;
-            checkEqualRange(content1, trie1, l, includeLeft, r, includeRight);
-            checkEqualRange(content1, trie1, null, includeLeft, r, includeRight);
-            checkEqualRange(content1, trie1, l, includeLeft, null, includeRight);
+        for (int li = -1; li < boundaries.length; ++li) // li == -1 stands for "no left bound"
+        {
+            ByteComparable l = li < 0 ? null : boundaries[li];
+            for (int ri = Math.max(0, li); ri <= boundaries.length; ++ri) // ri == boundaries.length stands for "no right bound"
+            {
+                ByteComparable r = ri == boundaries.length ? null : boundaries[ri];
+                for (int i = 0; i < 4; ++i) // bit 0 = includeLeft, bit 1 = includeRight
+                {
+                    boolean includeLeft = (i & 1) != 0;
+                    boolean includeRight = (i & 2) != 0;
+                    if ((!includeLeft || !includeRight) && li == ri)
+                        continue; // with equal bounds only the fully inclusive range is checked
+                    checkEqualRange(content1, trie1, l, includeLeft, r, includeRight);
+                }
+            }
         }
     }
 

From fc85ad216b615333d9045559e5fa341179883c77 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Sat, 24 Apr 2021 10:32:24 +0300
Subject: [PATCH 148/151] Don't redo buffer/offset calculations

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 00d39fc44eca..3424c1a7c66d 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -593,19 +593,25 @@ private boolean nextValidSplitTransition(int node, int trans)
             int tailIdx = splitNodeTailIndex(trans);
             int childIdx = splitNodeChildIndex(trans);
 
+            UnsafeBuffer nodeBuffer = getBuffer(node);
+            int nodeOfs = getOffset(node);
             while (midIndex < 4)
             {
-                int mid = getInt(node + SPLIT_POINTER_OFFSET + midIndex * 4);
+                int mid = nodeBuffer.getInt(nodeOfs + SPLIT_POINTER_OFFSET + midIndex * 4);
+                UnsafeBuffer midBuffer = getBuffer(mid);
+                int midOfs = getOffset(mid);
                 if (!isNull(mid))
                 {
                     while (tailIdx < 8)
                     {
-                        int tail = getInt(mid + tailIdx * 4);
+                        int tail = midBuffer.getInt(midOfs + tailIdx * 4);
+                        UnsafeBuffer tailBuffer = getBuffer(tail);
+                        int tailOfs = getOffset(tail);
                         if (!isNull(tail))
                         {
                             while (childIdx < 8)
                             {
-                                int child = getInt(tail + childIdx * 4);
+                                int child = tailBuffer.getInt(tailOfs + childIdx * 4);
                                 if (!isNull(child))
                                 {
                                     int transition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
@@ -631,15 +637,15 @@ private boolean nextValidSparseTransition(int node, int transition)
             int minValid = Integer.MAX_VALUE;
             int minChild = NONE;
             int validCount = 0;
-            UnsafeBuffer buffer = getBuffer(node);
-            int ofs = getOffset(node);
+            UnsafeBuffer nodeBuffer = getBuffer(node);
+            int nodeOfs = getOffset(node);
 
             for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
             {
-                int child = buffer.getInt(ofs + SPARSE_CHILDREN_OFFSET + i * 4);
+                int child = nodeBuffer.getInt(nodeOfs + SPARSE_CHILDREN_OFFSET + i * 4);
                 if (child == NONE)
                     break;
-                int t = buffer.getByte(ofs + SPARSE_BYTES_OFFSET + i) & 0xFF;
+                int t = nodeBuffer.getByte(nodeOfs + SPARSE_BYTES_OFFSET + i) & 0xFF;
                 if (t >= transition)
                 {
                     if (t < minValid)
@@ -663,18 +669,21 @@ private boolean nextValidSparseTransition(int node, int transition)
         private boolean getChainTransition(int node)
         {
             // no backtracking needed
-            int transition = getByte(node);
+            UnsafeBuffer nodeBuffer = getBuffer(node); // resolve buffer and in-buffer offset once, reused by both reads below
+            int nodeOfs = getOffset(node);
+            int transition = nodeBuffer.getByte(nodeOfs) & 0xFF; // mask so the transition byte is read as unsigned (0..255)
             int next = node + 1;
             if (offset(next) <= CHAIN_MAX_OFFSET)
                 descendIntoChain(next, transition);
             else
-                descendInto(getInt(next), transition);
+                descendInto(nodeBuffer.getInt(nodeOfs + 1), transition); // NOTE(review): assumes node + 1 lives in the same buffer as node - confirm
             return true;
         }
 
         // TODO: don't redo buffer/offset calculations
         // TODO: maybe use sparse order word
-        // TODO: reexamine backtracking, separate backtrack positions for dense sub-levels
+        // TODO: reexamine backtracking
+        // TODO: maybe separate backtrack positions for dense sub-levels
 
         @Override
         public int advanceMultiple(TransitionsReceiver receiver)

From 934b912f7fdf4c0a9e1dc53af4d1e8ba5c871072 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Mon, 26 Apr 2021 10:04:56 +0300
Subject: [PATCH 149/151] Use sparse order word

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 67 +++++++------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index 3424c1a7c66d..af8bae7a238e 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -520,22 +520,22 @@ void addBacktrack(int node, int data, int level)
         @Override
         public int advance()
         {
-            return advance(-1);
+            return advance(0); // 0 = no saved resume position: start from the current node's first child
         }
 
-        private int advance(int transition)
+        private int advance(int data)
         {
             while (true)
             {
-                if (advanceToNextChild(currentNode, transition))
+                if (advanceToNextChild(currentNode, data)) // data is the node-type-specific resume position (0 on a first visit)
                     return level;
 
                 if (--backtrackLevel < 0)
-                    return -1;
+                    return level = -1; // backtrack stack exhausted: store -1 in `level` as well as returning it
 
                 level = level(backtrackLevel);
                 currentNode = node(backtrackLevel);
-                transition = data(backtrackLevel);
+                data = data(backtrackLevel); // resume value recorded for this entry via addBacktrack
             }
         }
 
@@ -543,12 +543,12 @@ private int advance(int transition)
         public int ascend()
         {
             if (--backtrackLevel < 0)
-                return -1;
+                return level = -1; // nothing left to backtrack to: store -1 in `level` as well as returning it
 
             level = level(backtrackLevel);
             currentNode = node(backtrackLevel);
-            int transition = data(backtrackLevel);
-            return advance(transition);
+            int data = data(backtrackLevel); // saved resume position for the node we are returning to
+            return advance(data); // continue the walk from that saved position
         }
 
         private int descendInto(int child, int transition)
@@ -569,7 +569,7 @@ private int descendIntoChain(int child, int transition)
             return level;
         }
 
-        private boolean advanceToNextChild(int node, int transition)
+        private boolean advanceToNextChild(int node, int data)
         {
             if (isNull(node))
                 return false;
@@ -577,9 +577,9 @@ private boolean advanceToNextChild(int node, int transition)
             switch (offset(node))
             {
                 case SPLIT_OFFSET:
-                    return nextValidSplitTransition(node, transition + 1);
+                    return nextValidSplitTransition(node, data);
                 case SPARSE_OFFSET:
-                    return nextValidSparseTransition(node, transition + 1);
+                    return nextValidSparseTransition(node, data);
                 default:
                     return getChainTransition(node);
             }
@@ -615,7 +615,8 @@ private boolean nextValidSplitTransition(int node, int trans)
                                 if (!isNull(child))
                                 {
                                     int transition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
-                                    addBacktrack(node, transition, level);
+                                    if (transition < 0xFF)
+                                        addBacktrack(node, transition + 1, level);
                                     descendInto(child, transition);
                                     return true;
                                 }
@@ -632,37 +633,19 @@ private boolean nextValidSplitTransition(int node, int trans)
             return false;
         }
 
-        private boolean nextValidSparseTransition(int node, int transition)
+        private boolean nextValidSparseTransition(int node, int data)
         {
-            int minValid = Integer.MAX_VALUE;
-            int minChild = NONE;
-            int validCount = 0;
             UnsafeBuffer nodeBuffer = getBuffer(node);
             int nodeOfs = getOffset(node);
-
-            for (int i = 0; i < SPARSE_CHILD_COUNT; ++i)
-            {
-                int child = nodeBuffer.getInt(nodeOfs + SPARSE_CHILDREN_OFFSET + i * 4);
-                if (child == NONE)
-                    break;
-                int t = nodeBuffer.getByte(nodeOfs + SPARSE_BYTES_OFFSET + i) & 0xFF;
-                if (t >= transition)
-                {
-                    if (t < minValid)
-                    {
-                        minValid = t;
-                        minChild = child;
-                    }
-                    ++validCount;
-                }
-            }
-            if (validCount == 0)
-                return false;
-
-            if (validCount > 1)
-                addBacktrack(node, minValid, level);
-
-            descendInto(minChild, minValid);
+            if (data <= 0) // no resume position supplied: this is the first visit to the node
+                data = nodeBuffer.getShort(nodeOfs + SPARSE_ORDER_OFFSET) & 0xFFFF; // load the node's order word as an unsigned 16-bit value
+            int index = data % SPARSE_CHILD_COUNT; // lowest base-SPARSE_CHILD_COUNT digit is the child slot to visit now
+            data = data / SPARSE_CHILD_COUNT; // remaining digits are what is left to visit afterwards
+            if (data > 0) // NOTE(review): a remainder of 0 is read as "no more children"; presumably the encoding guarantees that - confirm
+                addBacktrack(node, data, level);
+            int child = nodeBuffer.getInt(nodeOfs + SPARSE_CHILDREN_OFFSET + index * 4);
+            int transition = nodeBuffer.getByte(nodeOfs + SPARSE_BYTES_OFFSET + index) & 0xFF; // transition byte read as unsigned
+            descendInto(child, transition);
             return true;
         }
 
@@ -680,8 +663,6 @@ private boolean getChainTransition(int node)
             return true;
         }
 
-        // TODO: don't redo buffer/offset calculations
-        // TODO: maybe use sparse order word
         // TODO: reexamine backtracking
         // TODO: maybe separate backtrack positions for dense sub-levels
 
@@ -715,7 +696,7 @@ public int advanceMultiple(TransitionsReceiver receiver)
 
                 if (!isChainNode(child))
                 {
-                    boolean success = advanceToNextChild(child, -1);
+                    boolean success = advanceToNextChild(child, 0);
                     assert success;
                     return level;
                 }

From 1415f8852c8aa94548da71eb5a3a9cd9a6254731 Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Mon, 26 Apr 2021 11:41:37 +0300
Subject: [PATCH 150/151] Backtracking into split sub-nodes

---
 .../cassandra/db/tries/MemtableReadTrie.java  | 92 +++++++++++--------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
index af8bae7a238e..1a0a65eb59b1 100644
--- a/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
+++ b/src/java/org/apache/cassandra/db/tries/MemtableReadTrie.java
@@ -587,50 +587,65 @@ private boolean advanceToNextChild(int node, int data)
 
         private boolean nextValidSplitTransition(int node, int trans)
         {
-            assert trans >= 0 && trans <= 0x100;
-            // Splits the 2-3-3 parts of the transition
-            int midIndex = splitNodeMidIndex(trans);
-            int tailIdx = splitNodeTailIndex(trans);
-            int childIdx = splitNodeChildIndex(trans);
+            assert trans >= 0 && trans <= 0xFF;
+            // To avoid repeatedly following the top transitions, we put backtracking entries for each level of the
+            // split sub-trie and use the bits of `trans` to understand which level the backtracking info points to.
 
-            UnsafeBuffer nodeBuffer = getBuffer(node);
-            int nodeOfs = getOffset(node);
-            while (midIndex < 4)
+            int childIndex = splitNodeChildIndex(trans);
+            if (childIndex == 0) // child bits clear: entering at the top or mid level rather than resuming inside a tail cell
             {
-                int mid = nodeBuffer.getInt(nodeOfs + SPLIT_POINTER_OFFSET + midIndex * 4);
-                UnsafeBuffer midBuffer = getBuffer(mid);
-                int midOfs = getOffset(mid);
-                if (!isNull(mid))
+                int tailIndex = splitNodeTailIndex(trans);
+                if (tailIndex == 0) // tail bits clear too: `node` is the split node itself, so scan its mid pointers
                 {
-                    while (tailIdx < 8)
+                    int midIndex = splitNodeMidIndex(trans);
+                    int mid;
+                    while (true)
                     {
-                        int tail = midBuffer.getInt(midOfs + tailIdx * 4);
-                        UnsafeBuffer tailBuffer = getBuffer(tail);
-                        int tailOfs = getOffset(tail);
-                        if (!isNull(tail))
-                        {
-                            while (childIdx < 8)
-                            {
-                                int child = tailBuffer.getInt(tailOfs + childIdx * 4);
-                                if (!isNull(child))
-                                {
-                                    int transition = ((midIndex << 6) | (tailIdx << 3) | childIdx);
-                                    if (transition < 0xFF)
-                                        addBacktrack(node, transition + 1, level);
-                                    descendInto(child, transition);
-                                    return true;
-                                }
-                                ++childIdx;
-                            }
-                        }
-                        childIdx = 0;
-                        ++tailIdx;
+                        mid = getInt(node + SPLIT_POINTER_OFFSET + midIndex * 4);
+                        if (!isNull(mid))
+                            break;
+                        if (++midIndex == 4) // all four mid slots checked without finding a non-null pointer
+                            return false;
                     }
+                    if (midIndex + 1 < 4) // no backtrack entry needed when this was the last mid slot
+                        addBacktrack(node, (midIndex + 1) << 6, level); // Store backtracking pos for the top sub-node
+                    trans = midIndex << 6; // rebuild the transition prefix from the slot actually found
+                    node = mid + SPLIT_OFFSET;  // Adjust sub-node pointer so that backtracking can bring us here
                 }
+                else
+                    trans = trans & -(1 << 6); // resuming in a mid cell: keep only the mid-index bits (clear the low 6)
+
+                int tail;
+                while (true)
                 {
+                    tail = getInt(node - SPLIT_OFFSET + tailIndex * 4); // undo the SPLIT_OFFSET adjustment to address the cell's slots
+                    if (!isNull(tail))
+                        break;
+                    if (++tailIndex == 8) // all eight tail slots checked without finding a non-null pointer
+                        return false;
                 }
-                tailIdx = 0;
-                ++midIndex;
+                if (tailIndex + 1 < 8) // no backtrack entry needed when this was the last tail slot
+                    addBacktrack(node, (tailIndex + 1) << 3 | trans, level); // Store backtracking pos for the mid sub-node
+                trans = tailIndex << 3 | trans; // transition prefix now holds the mid and tail bits
+                node = tail + SPLIT_OFFSET; // same pointer adjustment as applied to the mid cell
             }
-            return false;
+            else
+                trans = trans & -(1 << 3); // resuming in a tail cell: keep mid and tail bits (clear the low 3)
+
+            int child;
+            while (true)
+            {
+                child = getInt(node - SPLIT_OFFSET + childIndex * 4);
+                if (!isNull(child))
+                    break;
+                if (++childIndex == 8) // all eight child slots checked without finding a non-null pointer
+                    return false;
+            }
+            if (childIndex + 1 < 8) // no backtrack entry needed when this was the last child slot
+                addBacktrack(node, (childIndex + 1) | trans, level); // Store backtracking pos for the tail sub-node
+            trans = childIndex | trans; // full transition byte: mid (2 bits) | tail (3 bits) | child (3 bits)
+            descendInto(child, trans);
+            return true;
         }
 
         private boolean nextValidSparseTransition(int node, int data)
@@ -663,9 +678,6 @@ private boolean getChainTransition(int node)
             return true;
         }
 
-        // TODO: reexamine backtracking
-        // TODO: maybe separate backtrack positions for dense sub-levels
-
         @Override
         public int advanceMultiple(TransitionsReceiver receiver)
         {

From bbcb869e3ff6cdcd0d3da6d91e7f53e5f6085afb Mon Sep 17 00:00:00 2001
From: Branimir Lambov <branimir.lambov@datastax.com>
Date: Mon, 17 May 2021 15:38:44 +0300
Subject: [PATCH 151/151] Shard on lowest hash bits

---
 .../cassandra/db/memtable/TrieMemtable.java    | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
index ed410a42ddb9..8520d15c7daf 100644
--- a/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
+++ b/src/java/org/apache/cassandra/db/memtable/TrieMemtable.java
@@ -114,12 +114,6 @@ public class TrieMemtable extends AbstractAllocatorMemtable
     private AtomicBoolean switchRequested = new AtomicBoolean(false);
 
 
-    // The boundaries for the keyspace as they were calculated when the memtable is created.
-    // The boundaries will be NONE for system keyspaces or if StorageService is not yet initialized.
-    // The fact this is fixed for the duration of the memtable lifetime, guarantees we'll always pick the same core
-    // for the a given key, even if we race with the StorageService initialization or with topology changes.
-    private final ShardBoundaries boundaries;
-
     /**
      * Core-specific memtable regions. All writes must go through the specific core. The data structures used
      * are concurrent-read safe, thus reads can be carried out from any thread.
@@ -143,9 +137,8 @@ public class TrieMemtable extends AbstractAllocatorMemtable
     TrieMemtable(AtomicReference<CommitLogPosition> commitLogLowerBound, TableMetadataRef metadataRef, Owner owner)
     {
         super(commitLogLowerBound, metadataRef, owner);
-        this.boundaries = owner.localRangeSplits(SHARD_COUNT);
         this.metrics = TrieMemtableMetricsView.getOrCreate(metadataRef.keyspace, metadataRef.name);
-        this.shards = generatePartitionShards(boundaries.shardCount(), metadataRef, metrics);
+        this.shards = generatePartitionShards(SHARD_COUNT, metadataRef, metrics); // shard count now comes straight from SHARD_COUNT, not from owner.localRangeSplits
         this.mergedTrie = makeMergedTrie(shards);
         logger.debug("Created memtable with {} shards", this.shards.length);
     }
@@ -212,6 +205,11 @@ public void discard()
         }
     }
 
+    int getShardForKey(DecoratedKey key)
+    {
+        return Math.floorMod(key.filterHashLowerBits(), SHARD_COUNT); // floorMod (not %) keeps the index in [0, SHARD_COUNT) for a positive SHARD_COUNT even if the hash bits are negative
+    }
+
     /**
      * Should only be called by ColumnFamilyStore.apply via Keyspace.apply, which supplies the appropriate
      * OpOrdering.
@@ -221,7 +219,7 @@ public void discard()
     public long put(PartitionUpdate update, UpdateTransaction indexer, OpOrder.Group opGroup)
     {
         DecoratedKey key = update.partitionKey();
-        MemtableShard shard = shards[boundaries.getShardForKey(key)];
+        MemtableShard shard = shards[getShardForKey(key)];
         long colUpdateTimeDelta = shard.put(key, update, indexer, opGroup);
 
         if (shard.data.reachedAllocatedSizeThreshold() && !switchRequested.getAndSet(true))
@@ -328,7 +326,7 @@ public MemtableUnfilteredPartitionIterator makePartitionIterator(final ColumnFil
 
     public Partition getPartition(DecoratedKey key)
     {
-        int shardIndex = boundaries.getShardForKey(key);
+        int shardIndex = getShardForKey(key);
         BTreePartitionData data = shards[shardIndex].data.get(key);
         if (data != null)
             return createPartition(metadata(), allocator.ensureOnHeap(), key, data);